std::fill(begin, end, 0);
}
//===---------------------------------------------------------------------===//
clang -O3 -fno-exceptions currently compiles this code:
#include <vector>

void f(int N) {
  std::vector<int> v(N);

  extern void sink(void*); sink(&v);
}
into
define void @_Z1fi(i32 %N) nounwind {
entry:
  %v2 = alloca [3 x i32*], align 8
  %v2.sub = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 0
  %tmpcast = bitcast [3 x i32*]* %v2 to %"class.std::vector"*
  %conv = sext i32 %N to i64
  store i32* null, i32** %v2.sub, align 8, !tbaa !0
  %tmp3.i.i.i.i.i = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 1
  store i32* null, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
  %tmp4.i.i.i.i.i = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 2
  store i32* null, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
  %cmp.i.i.i.i = icmp eq i32 %N, 0
  br i1 %cmp.i.i.i.i, label %_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.thread.i.i, label %cond.true.i.i.i.i

_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.thread.i.i:  ; preds = %entry
  store i32* null, i32** %v2.sub, align 8, !tbaa !0
  store i32* null, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
  %add.ptr.i5.i.i = getelementptr inbounds i32* null, i64 %conv
  store i32* %add.ptr.i5.i.i, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
  br label %_ZNSt6vectorIiSaIiEEC1EmRKiRKS0_.exit

cond.true.i.i.i.i:  ; preds = %entry
  %cmp.i.i.i.i.i = icmp slt i32 %N, 0
  br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.i.i

if.then.i.i.i.i.i:  ; preds = %cond.true.i.i.i.i
  call void @_ZSt17__throw_bad_allocv() noreturn nounwind
  unreachable

_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.i.i:  ; preds = %cond.true.i.i.i.i
  %mul.i.i.i.i.i = shl i64 %conv, 2
  %call3.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul.i.i.i.i.i) nounwind
  %0 = bitcast i8* %call3.i.i.i.i.i to i32*
  store i32* %0, i32** %v2.sub, align 8, !tbaa !0
  store i32* %0, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
  %add.ptr.i.i.i = getelementptr inbounds i32* %0, i64 %conv
  store i32* %add.ptr.i.i.i, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
  call void @llvm.memset.p0i8.i64(i8* %call3.i.i.i.i.i, i8 0, i64 %mul.i.i.i.i.i, i32 4, i1 false)
  br label %_ZNSt6vectorIiSaIiEEC1EmRKiRKS0_.exit
This is just the handling of the construction of the vector. Most surprising
here is the fact that all three null stores in %entry are dead (because we do
no cross-block DSE).

Also surprising is that %conv isn't simplified to 0 in %....exit.thread.i.i.
This is because the client of LazyValueInfo doesn't simplify all instruction
operands, just selected ones.
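
A minimal illustration of the missing cross-block DSE (a hypothetical example,
separate from the IR above): the first store is dead because every successor
overwrites it, but a DSE that only scans within a single basic block cannot
prove that.

void g(int *p, bool c) {
  *p = 0;        // dead: overwritten on every path below, but proving
                 // it requires looking across basic blocks
  if (c)
    *p = 1;
  else
    *p = 2;
}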
//===---------------------------------------------------------------------===//
clang -O3 -fno-exceptions currently compiles this code:
void f(char* a, int n) {
  __builtin_memset(a, 0, n);
  for (int i = 0; i < n; ++i)
    a[i] = 0;
}

into:
define void @_Z1fPci(i8* nocapture %a, i32 %n) nounwind {
entry:
  %conv = sext i32 %n to i64
  tail call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 %conv, i32 1, i1 false)
  %cmp8 = icmp sgt i32 %n, 0
  br i1 %cmp8, label %for.body.lr.ph, label %for.end

for.body.lr.ph:  ; preds = %entry
  %tmp10 = add i32 %n, -1
  %tmp11 = zext i32 %tmp10 to i64
  %tmp12 = add i64 %tmp11, 1
  call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 %tmp12, i32 1, i1 false)
  ret void

for.end:  ; preds = %entry
  ret void
}
This shouldn't need the ((zext (%n - 1)) + 1) game, and it should ideally fold
the two memsets together. The issue with %n seems to stem from poor handling
of the original loop.
To simplify this, we need SCEV to know that "n != 0" because of the dominating
conditional. That would turn the second memset into a simple memset of 'n'.
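
Assuming both issues were fixed, the two memsets would fold into one call of
length n, and in source terms the whole function should reduce to something
like this sketch:

void f(char* a, int n) {
  // Everything the loop writes is already covered by this call, so a
  // single memset of length n should be all that remains.
  __builtin_memset(a, 0, n);
}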
//===---------------------------------------------------------------------===//
clang -O3 -fno-exceptions currently compiles this code:
#include <vector>

struct S {
  unsigned short m1, m2;
  unsigned char m3, m4;
};

void f(int N) {
  std::vector<S> v(N);

  extern void sink(void*); sink(&v);
}
into poor code for zero-initializing 'v' when N is >0. The problem is that
S is only 6 bytes, but each element is 8 byte-aligned. We generate a loop with
4 stores on each iteration. If the struct were 8 bytes, this would be turned
into a memset.
In order to handle this we have to:
A) Teach clang to generate metadata for memsets of structs that have holes in
them.
B) Teach clang to use such a memset for zero init of this struct (since it has
a hole), instead of doing elementwise zeroing.
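
Once (A) and (B) are in place, the intended effect is that zero-initialization
writes the padding bytes too and collapses into one wide memset. A rough
sketch of that lowering (hypothetical; zero_init is an invented helper, not
actual clang output):

#include <cstddef>
#include <cstring>

struct S {
  unsigned short m1, m2;
  unsigned char m3, m4;
};

// Write zeros over the whole buffer, padding included. The padding
// bytes are undefined anyway, so zeroing them is harmless and lets the
// elementwise stores become a single memset.
void zero_init(S *p, int N) {
  std::memset(p, 0, sizeof(S) * static_cast<std::size_t>(N));
}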
//===---------------------------------------------------------------------===//
clang -O3 currently compiles this code:
extern const int magic;
double f() { return 0.0 * magic; }
into
@magic = external constant i32

define double @_Z1fv() nounwind readnone {
entry:
  %tmp = load i32* @magic, align 4, !tbaa !0
  %conv = sitofp i32 %tmp to double
  %mul = fmul double %conv, 0.000000e+00
  ret double %mul
}
We should be able to fold away this fmul to 0.0. More generally, fmul(x,0.0)
can be folded to 0.0 if we can prove that the LHS is not -0.0, not a NaN, and
not an INF. The CannotBeNegativeZero predicate in value tracking should be
extended to support general "fpclassify" operations that can return
yes/no/unknown for each of these predicates.
In this predicate, we know that uitofp is trivially never NaN or -0.0, and
we know that it isn't +/-Inf if the floating point type has enough exponent
bits to represent the largest integer value as a finite value.
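
One possible shape for such a predicate (a hypothetical sketch; Tristate,
FPClass, and classifyUIToFP are invented names for illustration, not the
actual value-tracking API):

enum class Tristate { Yes, No, Unknown };

struct FPClass {
  Tristate isNaN, isInf, isNegZero;
};

// Classify the result of a uitofp: it can never be NaN or -0.0, and it
// cannot be +/-Inf when the destination type has enough exponent range
// to represent the largest possible source integer.
FPClass classifyUIToFP(bool destTypeCoversSourceRange) {
  return {Tristate::No,
          destTypeCoversSourceRange ? Tristate::No : Tristate::Unknown,
          Tristate::No};
}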
//===---------------------------------------------------------------------===//
When optimizing a transformation that can change the sign of 0.0 (such as the
0.0*val -> 0.0 transformation above), it might be provable that the sign of the
expression doesn't matter. For example, by the above rules, we can't transform
fmul(sitofp(x), 0.0) into 0.0, because x might be -1 and the result of the
expression is defined to be -0.0.
If we look at the uses of the fmul, we might be able to prove that all of
them don't care about the sign of zero. For example, if we have:

  fadd(fmul(sitofp(x), 0.0), 2.0)

Since we know that x+2.0 doesn't care about the sign of any zeros in x, we can
transform the fmul to 0.0, and then the fadd to 2.0.
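
To see concretely why the sign of the zero is unobservable in this expression
(a small standalone check, not from the original notes):

#include <cstdio>

int main() {
  // Both signs of zero give exactly 2.0 after the add, so rewriting
  // fmul(sitofp(x), 0.0) to 0.0 is safe under this particular use.
  std::printf("%g %g\n", 2.0 + 0.0, 2.0 + (-0.0));
  return 0;
}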
//===---------------------------------------------------------------------===//
We should enhance memcpy/memmove/memset to allow a metadata node on them
indicating that some bytes of the transfer are undefined. This is useful for
frontends like clang when lowering struct copies, when some elements of the
struct are undefined. Consider something like this:
struct x {
  char a;
  int b[4];
};
void foo(struct x *P);

struct x testfunc() {
  struct x V1, V2;
  foo(&V1);
  V2 = V1;
  return V2;
}
We currently compile this to:
$ clang t.c -S -o - -O0 -emit-llvm | opt -scalarrepl -S
%struct.x = type { i8, [4 x i32] }

define void @testfunc(%struct.x* sret %agg.result) nounwind ssp {
entry:
  %V1 = alloca %struct.x, align 4
  call void @foo(%struct.x* %V1)
  %tmp1 = bitcast %struct.x* %V1 to i8*
  %0 = bitcast %struct.x* %V1 to i160*
  %srcval1 = load i160* %0, align 4
  %tmp2 = bitcast %struct.x* %agg.result to i8*
  %1 = bitcast %struct.x* %agg.result to i160*
  store i160 %srcval1, i160* %1, align 4
  ret void
}
This happens because SRoA sees that the temp alloca is being memcpy'd into
and out of, that it has holes, and that it therefore has to be conservative.
If we knew about the holes, then this could be much, much better.
Having information about these holes would also improve memcpy (etc) lowering at
llc time when it gets inlined, because we can use smaller transfers. This also
avoids partial register stalls in some important cases.
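
For reference, the hole in question can be seen directly (a standalone check;
the exact offsets depend on the target ABI):

#include <cstddef>
#include <cstdio>

struct x {
  char a;
  int b[4];
};

int main() {
  // On typical targets 'a' is at offset 0 and 'b' at offset 4, leaving
  // a 3-byte padding hole whose contents are undefined after a struct
  // copy; the proposed metadata would let transfers skip those bytes.
  std::printf("sizeof = %zu, offsetof(b) = %zu\n",
              sizeof(struct x), offsetof(struct x, b));
  return 0;
}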
//===---------------------------------------------------------------------===//