Newer
Older
else a = 8;
return a + 7;
}
into
define i32 @g(i32 a) nounwind readnone {
%add = shl i32 %a, 1
%mul = shl i32 %a, 1
%cmp = icmp ugt i32 %add, %mul
%a.addr.0 = select i1 %cmp, i32 11, i32 15
ret i32 %a.addr.0
}
The icmp should fold to false. This CSE opportunity is only available
after GVN and InstCombine have run.
//===---------------------------------------------------------------------===//
Chris Lattner
committed
memcpyopt should turn this:
define i8* @test10(i32 %x) {
%alloc = call noalias i8* @malloc(i32 %x) nounwind
call void @llvm.memset.p0i8.i32(i8* %alloc, i8 0, i32 %x, i32 1, i1 false)
ret i8* %alloc
}
into a call to calloc. We should make sure that we analyze calloc as
aggressively as malloc though.
//===---------------------------------------------------------------------===//
clang -O3 doesn't optimize this:
void f1(int* begin, int* end) {
std::fill(begin, end, 0);
}
//===---------------------------------------------------------------------===//
clang -O3 -fno-exceptions currently compiles this code:
void f(int N) {
std::vector<int> v(N);
extern void sink(void*); sink(&v);
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
}
into
define void @_Z1fi(i32 %N) nounwind {
entry:
%v2 = alloca [3 x i32*], align 8
%v2.sub = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 0
%tmpcast = bitcast [3 x i32*]* %v2 to %"class.std::vector"*
%conv = sext i32 %N to i64
store i32* null, i32** %v2.sub, align 8, !tbaa !0
%tmp3.i.i.i.i.i = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 1
store i32* null, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
%tmp4.i.i.i.i.i = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 2
store i32* null, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
%cmp.i.i.i.i = icmp eq i32 %N, 0
br i1 %cmp.i.i.i.i, label %_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.thread.i.i, label %cond.true.i.i.i.i
_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.thread.i.i: ; preds = %entry
store i32* null, i32** %v2.sub, align 8, !tbaa !0
store i32* null, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
%add.ptr.i5.i.i = getelementptr inbounds i32* null, i64 %conv
store i32* %add.ptr.i5.i.i, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
br label %_ZNSt6vectorIiSaIiEEC1EmRKiRKS0_.exit
cond.true.i.i.i.i: ; preds = %entry
%cmp.i.i.i.i.i = icmp slt i32 %N, 0
br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.i.i
if.then.i.i.i.i.i: ; preds = %cond.true.i.i.i.i
call void @_ZSt17__throw_bad_allocv() noreturn nounwind
unreachable
_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.i.i: ; preds = %cond.true.i.i.i.i
%mul.i.i.i.i.i = shl i64 %conv, 2
%call3.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul.i.i.i.i.i) nounwind
%0 = bitcast i8* %call3.i.i.i.i.i to i32*
store i32* %0, i32** %v2.sub, align 8, !tbaa !0
store i32* %0, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
%add.ptr.i.i.i = getelementptr inbounds i32* %0, i64 %conv
store i32* %add.ptr.i.i.i, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
call void @llvm.memset.p0i8.i64(i8* %call3.i.i.i.i.i, i8 0, i64 %mul.i.i.i.i.i, i32 4, i1 false)
br label %_ZNSt6vectorIiSaIiEEC1EmRKiRKS0_.exit
This is just the handling the construction of the vector. Most surprising here
is the fact that all three null stores in %entry are dead (because we do no
cross-block DSE).
Also surprising is that %conv isn't simplified to 0 in %....exit.thread.i.i.
This is a because the client of LazyValueInfo doesn't simplify all instruction
operands, just selected ones.
//===---------------------------------------------------------------------===//
clang -O3 -fno-exceptions currently compiles this code:
void f(char* a, int n) {
__builtin_memset(a, 0, n);
for (int i = 0; i < n; ++i)
a[i] = 0;
into:
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
define void @_Z1fPci(i8* nocapture %a, i32 %n) nounwind {
entry:
%conv = sext i32 %n to i64
tail call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 %conv, i32 1, i1 false)
%cmp8 = icmp sgt i32 %n, 0
br i1 %cmp8, label %for.body.lr.ph, label %for.end
for.body.lr.ph: ; preds = %entry
%tmp10 = add i32 %n, -1
%tmp11 = zext i32 %tmp10 to i64
%tmp12 = add i64 %tmp11, 1
call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 %tmp12, i32 1, i1 false)
ret void
for.end: ; preds = %entry
ret void
}
This shouldn't need the ((zext (%n - 1)) + 1) game, and it should ideally fold
the two memset's together. The issue with %n seems to stem from poor handling
of the original loop.
To simplify this, we need SCEV to know that "n != 0" because of the dominating
conditional. That would turn the second memset into a simple memset of 'n'.
//===---------------------------------------------------------------------===//
clang -O3 -fno-exceptions currently compiles this code:
struct S {
unsigned short m1, m2;
unsigned char m3, m4;
};
void f(int N) {
std::vector<S> v(N);
extern void sink(void*); sink(&v);
}
into poor code for zero-initializing 'v' when N is >0. The problem is that
S is only 6 bytes, but each element is 8 byte-aligned. We generate a loop and
4 stores on each iteration. If the struct were 8 bytes, this gets turned into
a memset.
In order to handle this we have to:
A) Teach clang to generate metadata for memsets of structs that have holes in
them.
B) Teach clang to use such a memset for zero init of this struct (since it has
a hole), instead of doing elementwise zeroing.
//===---------------------------------------------------------------------===//
clang -O3 currently compiles this code:
extern const int magic;
double f() { return 0.0 * magic; }
into
@magic = external constant i32
define double @_Z1fv() nounwind readnone {
entry:
%tmp = load i32* @magic, align 4, !tbaa !0
%conv = sitofp i32 %tmp to double
%mul = fmul double %conv, 0.000000e+00
ret double %mul
}
We should be able to fold away this fmul to 0.0. More generally, fmul(x,0.0)
can be folded to 0.0 if we can prove that the LHS is not -0.0, not a NaN, and
not an INF. The CannotBeNegativeZero predicate in value tracking should be
extended to support general "fpclassify" operations that can return
yes/no/unknown for each of these predicates.
In this predicate, we know that uitofp is trivially never NaN or -0.0, and
we know that it isn't +/-Inf if the floating point type has enough exponent bits
to represent the largest integer value as < inf.
//===---------------------------------------------------------------------===//
When optimizing a transformation that can change the sign of 0.0 (such as the
0.0*val -> 0.0 transformation above), it might be provable that the sign of the
expression doesn't matter. For example, by the above rules, we can't transform
fmul(sitofp(x), 0.0) into 0.0, because x might be -1 and the result of the
expression is defined to be -0.0.
If we look at the uses of the fmul for example, we might be able to prove that
all uses don't care about the sign of zero. For example, if we have:
fadd(fmul(sitofp(x), 0.0), 2.0)
Since we know that x+2.0 doesn't care about the sign of any zeros in X, we can
transform the fmul to 0.0, and then the fadd to 2.0.
//===---------------------------------------------------------------------===//
We should enhance memcpy/memcpy/memset to allow a metadata node on them
indicating that some bytes of the transfer are undefined. This is useful for
frontends like clang when lowering struct copies, when some elements of the
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
struct are undefined. Consider something like this:
struct x {
char a;
int b[4];
};
void foo(struct x*P);
struct x testfunc() {
struct x V1, V2;
foo(&V1);
V2 = V1;
return V2;
}
We currently compile this to:
$ clang t.c -S -o - -O0 -emit-llvm | opt -scalarrepl -S
%struct.x = type { i8, [4 x i32] }
define void @testfunc(%struct.x* sret %agg.result) nounwind ssp {
entry:
%V1 = alloca %struct.x, align 4
call void @foo(%struct.x* %V1)
%tmp1 = bitcast %struct.x* %V1 to i8*
%0 = bitcast %struct.x* %V1 to i160*
%srcval1 = load i160* %0, align 4
%tmp2 = bitcast %struct.x* %agg.result to i8*
%1 = bitcast %struct.x* %agg.result to i160*
store i160 %srcval1, i160* %1, align 4
ret void
}
This happens because SRoA sees that the temp alloca has is being memcpy'd into
and out of and it has holes and it has to be conservative. If we knew about the
holes, then this could be much much better.
Having information about these holes would also improve memcpy (etc) lowering at
llc time when it gets inlined, because we can use smaller transfers. This also
avoids partial register stalls in some important cases.
//===---------------------------------------------------------------------===//