README.txt

it could be:

define i32 @test__(i32 %a, i32 %b) nounwind readnone ssp {
entry:
  %0 = icmp sle i32 %a, %b
  %retval = zext i1 %0 to i32
  ret i32 %retval
}

//===---------------------------------------------------------------------===//

This code can be seen in viterbi:

  %64 = call noalias i8* @malloc(i64 %62) nounwind
...
  %67 = call i64 @llvm.objectsize.i64(i8* %64, i1 false) nounwind
  %68 = call i8* @__memset_chk(i8* %64, i32 0, i64 %62, i64 %67) nounwind

llvm.objectsize.i64 should be taught about malloc/calloc, allowing it to
fold to %62.  This is a security win (overflows of malloc will get caught)
and also a performance win by exposing more memsets to the optimizer.

This occurs several times in viterbi.

Note that this would change the semantics of @llvm.objectsize which by its
current definition always folds to a constant. We also should make sure that
we remove checking in code like

  char *p = malloc(strlen(s)+1);
  __strcpy_chk(p, s, __builtin_objectsize(p, 0));

//===---------------------------------------------------------------------===//

This code (from Benchmarks/Dhrystone/dry.c):

define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
entry:
  %sext = shl i32 %0, 24
  %conv = ashr i32 %sext, 24
  %sext6 = shl i32 %1, 24
  %conv4 = ashr i32 %sext6, 24
  %cmp = icmp eq i32 %conv, %conv4
  %. = select i1 %cmp, i32 10000, i32 0
  ret i32 %.
}

Should be simplified into something like:

define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
entry:
  %sext = shl i32 %0, 24
  %conv = and i32 %sext, 0xFF000000
  %sext6 = shl i32 %1, 24
  %conv4 = and i32 %sext6, 0xFF000000
  %cmp = icmp eq i32 %conv, %conv4
  %. = select i1 %cmp, i32 10000, i32 0
  ret i32 %.
}

and then to:

define i32 @Func1(i32, i32) nounwind readnone optsize ssp {
entry:
  %conv = and i32 %0, 0xFF
  %conv4 = and i32 %1, 0xFF
  %cmp = icmp eq i32 %conv, %conv4
  %. = select i1 %cmp, i32 10000, i32 0
  ret i32 %.
}
//===---------------------------------------------------------------------===//

clang -O3 currently compiles this code

int g(unsigned int a) {
  unsigned int c[100];
  c[10] = a;
  c[11] = a;
  unsigned int b = c[10] + c[11];
  if(b > a*2) a = 4;
  else a = 8;
  return a + 7;
}

into

define i32 @g(i32 a) nounwind readnone {
  %add = shl i32 %a, 1
  %mul = shl i32 %a, 1
  %cmp = icmp ugt i32 %add, %mul
  %a.addr.0 = select i1 %cmp, i32 11, i32 15
  ret i32 %a.addr.0
}

The icmp should fold to false. This CSE opportunity is only available
after GVN and InstCombine have run.

//===---------------------------------------------------------------------===//

memcpyopt should turn this:

define i8* @test10(i32 %x) {
  %alloc = call noalias i8* @malloc(i32 %x) nounwind
  call void @llvm.memset.p0i8.i32(i8* %alloc, i8 0, i32 %x, i32 1, i1 false)
  ret i8* %alloc
}

into a call to calloc.  We should make sure that we analyze calloc as
aggressively as malloc though.

//===---------------------------------------------------------------------===//

clang -03 doesn't optimize this:

void f1(int* begin, int* end) {
  std::fill(begin, end, 0);
}

into a memset.  This is PR8942.

//===---------------------------------------------------------------------===//

clang -O3 -fno-exceptions currently compiles this code:

void f(int N) {
  std::vector<int> v(N);

  extern void sink(void*); sink(&v);
}

into

define void @_Z1fi(i32 %N) nounwind {
entry:
  %v2 = alloca [3 x i32*], align 8
  %v2.sub = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 0
  %tmpcast = bitcast [3 x i32*]* %v2 to %"class.std::vector"*
  %conv = sext i32 %N to i64
  store i32* null, i32** %v2.sub, align 8, !tbaa !0
  %tmp3.i.i.i.i.i = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 1
  store i32* null, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
  %tmp4.i.i.i.i.i = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 2
  store i32* null, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
  %cmp.i.i.i.i = icmp eq i32 %N, 0
  br i1 %cmp.i.i.i.i, label %_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.thread.i.i, label %cond.true.i.i.i.i

_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.thread.i.i: ; preds = %entry
  store i32* null, i32** %v2.sub, align 8, !tbaa !0
  store i32* null, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
  %add.ptr.i5.i.i = getelementptr inbounds i32* null, i64 %conv
  store i32* %add.ptr.i5.i.i, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
  br label %_ZNSt6vectorIiSaIiEEC1EmRKiRKS0_.exit

cond.true.i.i.i.i:                                ; preds = %entry
  %cmp.i.i.i.i.i = icmp slt i32 %N, 0
  br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.i.i

if.then.i.i.i.i.i:                                ; preds = %cond.true.i.i.i.i
  call void @_ZSt17__throw_bad_allocv() noreturn nounwind
  unreachable

_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.i.i:    ; preds = %cond.true.i.i.i.i
  %mul.i.i.i.i.i = shl i64 %conv, 2
  %call3.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul.i.i.i.i.i) nounwind
  %0 = bitcast i8* %call3.i.i.i.i.i to i32*
  store i32* %0, i32** %v2.sub, align 8, !tbaa !0
  store i32* %0, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
  %add.ptr.i.i.i = getelementptr inbounds i32* %0, i64 %conv
  store i32* %add.ptr.i.i.i, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
  call void @llvm.memset.p0i8.i64(i8* %call3.i.i.i.i.i, i8 0, i64 %mul.i.i.i.i.i, i32 4, i1 false)
  br label %_ZNSt6vectorIiSaIiEEC1EmRKiRKS0_.exit

This is just the handling the construction of the vector. Most surprising here
is the fact that all three null stores in %entry are dead, but not eliminated.
Also surprising is that %conv isn't simplified to 0 in %....exit.thread.i.i.

//===---------------------------------------------------------------------===//

clang -O3 -fno-exceptions currently compiles this code:

void f(int N) {
  std::vector<int> v(N);
  for (int k = 0; k < N; ++k)
    v[k] = 0;

  extern void sink(void*); sink(&v);
}

into almost the same as the previous note, but replace its final BB with:

for.body.lr.ph:                                   ; preds = %cond.true.i.i.i.i
  %mul.i.i.i.i.i = shl i64 %conv, 2
  %call3.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul.i.i.i.i.i) nounwind
  %0 = bitcast i8* %call3.i.i.i.i.i to i32*
  store i32* %0, i32** %v8.sub, align 8, !tbaa !0
  %add.ptr.i.i.i = getelementptr inbounds i32* %0, i64 %conv
  store i32* %add.ptr.i.i.i, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
  call void @llvm.memset.p0i8.i64(i8* %call3.i.i.i.i.i, i8 0, i64 %mul.i.i.i.i.i, i32 4, i1 false)
  store i32* %add.ptr.i.i.i, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
  %tmp18 = add i32 %N, -1
  %tmp19 = zext i32 %tmp18 to i64
  %tmp20 = shl i64 %tmp19, 2
  %tmp21 = add i64 %tmp20, 4
  call void @llvm.memset.p0i8.i64(i8* %call3.i.i.i.i.i, i8 0, i64 %tmp21, i32 4, i1 false)
  br label %for.end

First off, why (((zext %N - 1) << 2) + 4) instead of the ((sext %N) << 2) done
previously? (or better yet, re-use that one?)

Then, the really painful one is the second memset, of the same memory, to the
same value.

//===---------------------------------------------------------------------===//

clang -O3 -fno-exceptions currently compiles this code:

struct S {
  unsigned short m1, m2;
  unsigned char m3, m4;
};

void f(int N) {
  std::vector<S> v(N);
  extern void sink(void*); sink(&v);
}

into poor code for zero-initializing 'v' when N is >0. The problem is that
S is only 6 bytes, but each element is 8 byte-aligned. We generate a loop and
4 stores on each iteration. If the struct were 8 bytes, this gets turned into
a memset.

//===---------------------------------------------------------------------===//

clang -O3 currently compiles this code:

extern const int magic;
double f() { return 0.0 * magic; }

into

@magic = external constant i32

define double @_Z1fv() nounwind readnone {
entry:
  %tmp = load i32* @magic, align 4, !tbaa !0
  %conv = sitofp i32 %tmp to double
  %mul = fmul double %conv, 0.000000e+00
  ret double %mul
}

We should be able to fold away this fmul to 0.0.  More generally, fmul(x,0.0)
can be folded to 0.0 if we can prove that the LHS is not -0.0, not a NaN, and
not an INF.  The CannotBeNegativeZero predicate in value tracking should be
extended to support general "fpclassify" operations that can return 
yes/no/unknown for each of these predicates.

In this predicate, we know that [us]itofp is trivially never NaN or -0.0, and
we know that it isn't +/-Inf if the floating point type has enough exponent bits
to represent the largest integer value as < inf.

//===---------------------------------------------------------------------===//

clang -O3 currently compiles this code:

#include <emmintrin.h>
int f(double x) { return _mm_cvtsd_si32(_mm_set_sd(x)); }
int g(double x) { return _mm_cvttsd_si32(_mm_set_sd(x)); }

into

define i32 @_Z1fd(double %x) nounwind readnone {
entry:
  %vecinit.i = insertelement <2 x double> undef, double %x, i32 0
  %vecinit1.i = insertelement <2 x double> %vecinit.i, double 0.000000e+00, i32 1
  %0 = tail call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %vecinit1.i) nounwind
  ret i32 %0
}

define i32 @_Z1gd(double %x) nounwind readnone {
entry:
  %conv.i = fptosi double %x to i32
  ret i32 %conv.i
}

This difference carries over to the assmebly produced, resulting in:

_Z1fd:                                  # @_Z1fd
# BB#0:                                 # %entry
        pushq   %rbp
        movq    %rsp, %rbp
        xorps   %xmm1, %xmm1
        movsd   %xmm0, %xmm1
        cvtsd2sil       %xmm1, %eax
        popq    %rbp
        ret

_Z1gd:                                  # @_Z1gd
# BB#0:                                 # %entry
        pushq   %rbp
        movq    %rsp, %rbp
        cvttsd2si       %xmm0, %eax
        popq    %rbp
        ret

The problem is that we can't see through the intrinsic call used for cvtsd2si,
and fold away the unnecessary manipulation of the function parameter. When
these functions are inlined, it forms a barrier preventing many further
optimizations. LLVM IR doesn't have a good way to model the logic of
'cvtsd2si', its only FP -> int conversion path forces truncation. We should add
a rounding flag onto fptosi so that it can represent this type of rounding
naturally in the IR rather than using intrinsics. We might need to use a
'system_rounding_mode' flag to encode that the semantics of the rounding mode
can be changed by the program, but ideally we could just say that isn't
supported, and hard code the rounding.

//===---------------------------------------------------------------------===//