store i32* null, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
%tmp4.i.i.i.i.i = getelementptr inbounds [3 x i32*]* %v2, i64 0, i64 2
store i32* null, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
%cmp.i.i.i.i = icmp eq i32 %N, 0
br i1 %cmp.i.i.i.i, label %_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.thread.i.i, label %cond.true.i.i.i.i
_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.thread.i.i: ; preds = %entry
store i32* null, i32** %v2.sub, align 8, !tbaa !0
store i32* null, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
%add.ptr.i5.i.i = getelementptr inbounds i32* null, i64 %conv
store i32* %add.ptr.i5.i.i, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
br label %_ZNSt6vectorIiSaIiEEC1EmRKiRKS0_.exit
cond.true.i.i.i.i: ; preds = %entry
%cmp.i.i.i.i.i = icmp slt i32 %N, 0
br i1 %cmp.i.i.i.i.i, label %if.then.i.i.i.i.i, label %_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.i.i
if.then.i.i.i.i.i: ; preds = %cond.true.i.i.i.i
call void @_ZSt17__throw_bad_allocv() noreturn nounwind
unreachable
_ZNSt12_Vector_baseIiSaIiEEC2EmRKS0_.exit.i.i: ; preds = %cond.true.i.i.i.i
%mul.i.i.i.i.i = shl i64 %conv, 2
%call3.i.i.i.i.i = call noalias i8* @_Znwm(i64 %mul.i.i.i.i.i) nounwind
%0 = bitcast i8* %call3.i.i.i.i.i to i32*
store i32* %0, i32** %v2.sub, align 8, !tbaa !0
store i32* %0, i32** %tmp3.i.i.i.i.i, align 8, !tbaa !0
%add.ptr.i.i.i = getelementptr inbounds i32* %0, i64 %conv
store i32* %add.ptr.i.i.i, i32** %tmp4.i.i.i.i.i, align 8, !tbaa !0
call void @llvm.memset.p0i8.i64(i8* %call3.i.i.i.i.i, i8 0, i64 %mul.i.i.i.i.i, i32 4, i1 false)
br label %_ZNSt6vectorIiSaIiEEC1EmRKiRKS0_.exit
This is just the handling of the construction of the vector. Most surprising
here is the fact that all three null stores in %entry are dead (because we do
no cross-block DSE).
Also surprising is that %conv isn't simplified to 0 in %....exit.thread.i.i.
This is because the client of LazyValueInfo doesn't simplify all instruction
operands, just selected ones.
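A reduced C++ illustration (made-up example, not from the testcase above) of
the kind of store that only cross-block DSE can remove:

// The store of 0 in the entry block is dead: both successors overwrite *p
// before any read, but proving that requires looking across block boundaries.
void g(int *p, bool c, int a, int b) {
  *p = 0;            // dead on every path
  if (c)
    *p = a;
  else
    *p = b;
}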
//===---------------------------------------------------------------------===//
clang -O3 -fno-exceptions currently compiles this code:
void f(char* a, int n) {
  __builtin_memset(a, 0, n);
  for (int i = 0; i < n; ++i)
    a[i] = 0;
}
into:
define void @_Z1fPci(i8* nocapture %a, i32 %n) nounwind {
entry:
%conv = sext i32 %n to i64
tail call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 %conv, i32 1, i1 false)
%cmp8 = icmp sgt i32 %n, 0
br i1 %cmp8, label %for.body.lr.ph, label %for.end
for.body.lr.ph: ; preds = %entry
%tmp10 = add i32 %n, -1
%tmp11 = zext i32 %tmp10 to i64
%tmp12 = add i64 %tmp11, 1
call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 %tmp12, i32 1, i1 false)
ret void
for.end: ; preds = %entry
ret void
}
This shouldn't need the ((zext (%n - 1)) + 1) game, and it should ideally fold
the two memset's together.
The issue with the addition only occurs in 64-bit mode, and appears to be at
least partially caused by Scalar Evolution not keeping its cache updated: it
returns the "wrong" result immediately after indvars runs, but figures out the
expected result if it is run from scratch on IR resulting from running indvars.
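For what it's worth, the equivalence being missed is easy to state: whenever
the loop would run (n > 0), ((zext (%n - 1)) + 1) is just %conv. A small
sketch with made-up helper names:

#include <cassert>
#include <cstdint>

// For any n > 0, the length computed in for.body.lr.ph equals the
// sign-extended length used by the first memset, so the two memsets cover
// exactly the same bytes.
void check(int32_t n) {
  assert(n > 0);
  uint64_t via_zext = uint64_t(uint32_t(n - 1)) + 1;  // %tmp12
  uint64_t via_sext = uint64_t(int64_t(n));           // %conv
  assert(via_zext == via_sext);
}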
//===---------------------------------------------------------------------===//
clang -O3 -fno-exceptions currently compiles this code:
struct S {
  unsigned short m1, m2;
  unsigned char m3, m4;
};
void f(int N) {
  std::vector<S> v(N);
  extern void sink(void*); sink(&v);
}
into poor code for zero-initializing 'v' when N > 0. The problem is that S is
only 6 bytes, but each element is 8 byte-aligned. We generate a loop and
4 stores on each iteration. If the struct were 8 bytes, this would be turned
into a memset.
In order to handle this we have to:
A) Teach clang to generate metadata for memsets of structs that have holes in
them.
B) Teach clang to use such a memset for zero init of this struct (since it has
a hole), instead of doing elementwise zeroing.
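For illustration only (hypothetical helpers, not clang's actual lowering),
roughly the difference between the elementwise zeroing we emit now and the
single memset that A + B would allow:

#include <cstddef>
#include <cstring>

struct S {
  unsigned short m1, m2;
  unsigned char m3, m4;
};

// zero_elementwise is roughly today's lowering (a loop with 4 stores per
// element); zero_memset is the single call that item A's metadata would let
// clang use for item B, even though it also writes any padding bytes.
void zero_elementwise(S *p, std::size_t n) {
  for (std::size_t i = 0; i != n; ++i) {
    p[i].m1 = 0; p[i].m2 = 0; p[i].m3 = 0; p[i].m4 = 0;
  }
}
void zero_memset(S *p, std::size_t n) {
  std::memset(p, 0, n * sizeof(S));  // also zeroes any padding in S
}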
//===---------------------------------------------------------------------===//
clang -O3 currently compiles this code:
extern const int magic;
double f() { return 0.0 * magic; }
into
@magic = external constant i32
define double @_Z1fv() nounwind readnone {
entry:
%tmp = load i32* @magic, align 4, !tbaa !0
%conv = sitofp i32 %tmp to double
%mul = fmul double %conv, 0.000000e+00
ret double %mul
}
We should be able to fold away this fmul to 0.0. More generally, fmul(x,0.0)
can be folded to 0.0 if we can prove that the LHS is not -0.0, not a NaN, and
not an INF. The CannotBeNegativeZero predicate in value tracking should be
extended to support general "fpclassify" operations that can return
yes/no/unknown for each of these predicates.
In this predicate, we know that uitofp is trivially never NaN or -0.0, and
we know that it isn't +/-Inf if the floating point type has enough exponent bits
to represent the largest integer value as < inf.
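A quick reminder of why all three facts are needed; each of the first three
products below is something other than +0.0:

#include <cmath>
#include <cstdio>

int main() {
  std::printf("%g\n", NAN * 0.0);       // NaN, not 0.0
  std::printf("%g\n", INFINITY * 0.0);  // NaN, not 0.0
  std::printf("%g\n", -0.0 * 0.0);      // -0.0, not +0.0
  std::printf("%g\n", 42.0 * 0.0);      // +0.0: the fold is fine here
}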
//===---------------------------------------------------------------------===//
When optimizing a transformation that can change the sign of 0.0 (such as the
0.0*val -> 0.0 transformation above), it might be provable that the sign of the
expression doesn't matter. For example, by the above rules, we can't transform
fmul(sitofp(x), 0.0) into 0.0, because x might be -1 and the result of the
expression is defined to be -0.0.
If we look at the uses of the fmul for example, we might be able to prove that
all uses don't care about the sign of zero. For example, if we have:
fadd(fmul(sitofp(x), 0.0), 2.0)
Since we know that x+2.0 doesn't care about the sign of any zeros in X, we can
transform the fmul to 0.0, and then the fadd to 2.0.
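A tiny numeric check of that claim (made-up example): the fadd user erases the
sign of zero, so folding the whole expression is safe even though folding the
fmul alone is not:

#include <cstdio>

int main() {
  int x = -1;
  double prod = double(x) * 0.0;    // -0.0, so the fmul alone can't be 0.0
  std::printf("%g\n", prod + 2.0);  // 2.0
  std::printf("%g\n", 0.0 + 2.0);   // 2.0 as well: the sign is unobservable
}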
//===---------------------------------------------------------------------===//
We should enhance memcpy/memmove/memset to allow a metadata node on them
indicating that some bytes of the transfer are undefined. This is useful for
frontends like clang when lowering struct copies, when some elements of the
struct are undefined. Consider something like this:
struct x {
  char a;
  int b[4];
};
void foo(struct x*P);
struct x testfunc() {
  struct x V1, V2;
  foo(&V1);
  V2 = V1;
  return V2;
}
We currently compile this to:
$ clang t.c -S -o - -O0 -emit-llvm | opt -scalarrepl -S
%struct.x = type { i8, [4 x i32] }
define void @testfunc(%struct.x* sret %agg.result) nounwind ssp {
entry:
%V1 = alloca %struct.x, align 4
call void @foo(%struct.x* %V1)
%tmp1 = bitcast %struct.x* %V1 to i8*
%0 = bitcast %struct.x* %V1 to i160*
%srcval1 = load i160* %0, align 4
%tmp2 = bitcast %struct.x* %agg.result to i8*
%1 = bitcast %struct.x* %agg.result to i160*
store i160 %srcval1, i160* %1, align 4
ret void
}
This happens because SRoA sees that the temp alloca is being memcpy'd into and
out of, and that it has holes, so it has to be conservative. If we knew about
the holes, then this could be much, much better.
Having information about these holes would also improve memcpy (etc) lowering at
llc time when it gets inlined, because we can use smaller transfers. This also
avoids partial register stalls in some important cases.
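For concreteness, assuming the usual 4-byte int alignment (consistent with the
20-byte i160 copy above), the hole is the 3 padding bytes between 'a' and 'b':

#include <cstddef>

struct x {
  char a;
  int b[4];
};

// Assumption: 4-byte-aligned int. Bytes 1..3 are padding with undefined
// contents, so metadata on the copy could tell SRoA and the backend that only
// 17 of the 20 bytes carry meaningful data.
static_assert(offsetof(x, b) == 4, "3 padding bytes after 'a'");
static_assert(sizeof(x) == 20, "matches the i160 (20 byte) block copy");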
//===---------------------------------------------------------------------===//
We don't fold (icmp (add) (add)) unless the two adds only have a single use.
There are a lot of cases that we're refusing to fold in 256.bzip2, for
example:
%indvar.next90 = add i64 %indvar89, 1 ;; Has 2 uses
%tmp96 = add i64 %tmp95, 1 ;; Has 1 use
%exitcond97 = icmp eq i64 %indvar.next90, %tmp96
We don't fold this because we don't want to introduce an overlapped live range
of the ivar. However, we can make this more aggressive without causing
performance issues in two ways:
1. If *either* the LHS or RHS has a single use, we can definitely do the
transformation. In the overlapping live range case we're trading one register
use for one fewer operation, which is a reasonable trade. Before doing this
we should verify that the llc output actually shrinks for some benchmarks.
2. If both ops have multiple uses, we can still fold it if the operations are
both sinkable to *after* the icmp (e.g. in a subsequent block), which doesn't
increase register pressure.
There are a ton of icmp's we aren't simplifying because of the reg pressure
concern. Care is warranted here though because many of these are induction
variables and other cases that matter a lot to performance, like the above.
Here's a blob of code that you can drop into the bottom of visitICmp to see some
missed cases:
{ Value *A, *B, *C, *D;
  if (match(Op0, m_Add(m_Value(A), m_Value(B))) &&
      match(Op1, m_Add(m_Value(C), m_Value(D))) &&
      (A == C || A == D || B == C || B == D)) {
    errs() << "OP0 = " << *Op0 << " U=" << Op0->getNumUses() << "\n";
    errs() << "OP1 = " << *Op1 << " U=" << Op1->getNumUses() << "\n";
    errs() << "CMP = " << I << "\n\n";
  }
}
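For reference, the fold itself is always sound for equality compares; the only
question is the register-pressure tradeoff above. A made-up C reduction:

// For an equality compare, (a + x) == (a + y) can always be folded to
// x == y (adding a fixed value is a bijection under unsigned wraparound),
// matching the icmp eq of the two add-by-1 values above.
bool before(unsigned a, unsigned x, unsigned y) { return (a + x) == (a + y); }
bool after (unsigned,   unsigned x, unsigned y) { return x == y; }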
//===---------------------------------------------------------------------===//
define i1 @test1(i32 %x) nounwind {
%and = and i32 %x, 3
%cmp = icmp ult i32 %and, 2
ret i1 %cmp
}
Can be folded to (x & 2) == 0.
define i1 @test2(i32 %x) nounwind {
%and = and i32 %x, 3
%cmp = icmp ugt i32 %and, 1
ret i1 %cmp
}
Can be folded to (x & 2) != 0.
SimplifyDemandedBits shrinks the "and" constant to 2 but instcombine misses the
icmp transform.
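Both folds are easy to verify by enumerating the low two bits (made-up
harness):

#include <cassert>

// (x & 3) < 2 holds exactly when bit 1 is clear, i.e. (x & 2) == 0; the four
// values of the low two bits cover all inputs.
int main() {
  for (unsigned low = 0; low < 4; ++low) {
    assert(((low & 3) < 2u) == ((low & 2) == 0));   // test1
    assert(((low & 3) > 1u) == ((low & 2) != 0));   // test2
  }
}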
//===---------------------------------------------------------------------===//
This code:
typedef struct {
  int f1:1;
  int f2:1;
  int f3:1;
  int f4:29;
} t1;
typedef struct {
  int f1:1;
  int f2:1;
  int f3:30;
} t2;
t1 s1;
t2 s2;
void func1(void)
{
  s1.f1 = s2.f1;
  s1.f2 = s2.f2;
}
Compiles into this IR (on x86-64 at least):
%struct.t1 = type { i8, [3 x i8] }
@s2 = global %struct.t1 zeroinitializer, align 4
@s1 = global %struct.t1 zeroinitializer, align 4
define void @func1() nounwind ssp noredzone {
entry:
%0 = load i32* bitcast (%struct.t1* @s2 to i32*), align 4
%bf.val.sext5 = and i32 %0, 1
%1 = load i32* bitcast (%struct.t1* @s1 to i32*), align 4
%2 = and i32 %1, -4
%3 = or i32 %2, %bf.val.sext5
%bf.val.sext26 = and i32 %0, 2
%4 = or i32 %3, %bf.val.sext26
store i32 %4, i32* bitcast (%struct.t1* @s1 to i32*), align 4
ret void
}
The two or/and's should be merged into one each.
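The identity being relied on, checked exhaustively over the relevant bits
(made-up harness):

#include <cassert>
#include <cstdint>

// The IR above computes ((s1 & ~3u) | (s2 & 1)) | (s2 & 2); merging gives
// (s1 & ~3u) | (s2 & 3), which is one 'and' and one 'or' fewer.
int main() {
  for (uint32_t s1 = 0; s1 < 8; ++s1)
    for (uint32_t s2 = 0; s2 < 8; ++s2)
      assert((((s1 & ~3u) | (s2 & 1u)) | (s2 & 2u)) ==
             ((s1 & ~3u) | (s2 & 3u)));
}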
//===---------------------------------------------------------------------===//
Machine level code hoisting can be useful in some cases. For example, PR9408
is about:
typedef union {
  void (*f1)(int);
  void (*f2)(long);
} funcs;
void foo(funcs f, int which) {
  int a = 5;
  if (which) {
    f.f1(a);
  } else {
    f.f2(a);
  }
}
which we compile to:
foo: # @foo
# BB#0: # %entry
pushq %rbp
movq %rsp, %rbp
testl %esi, %esi
movq %rdi, %rax
je .LBB0_2
# BB#1: # %if.then
movl $5, %edi
callq *%rax
popq %rbp
ret
.LBB0_2: # %if.else
movl $5, %edi
callq *%rax
popq %rbp
ret
Note that the two tail blocks (%if.then and %if.else) are identical. This
doesn't happen at the IR level because one call is passing an i32 and the
other is passing an i64.
//===---------------------------------------------------------------------===//
I see this sort of pattern in 176.gcc in a few places (e.g. the start of
store_bit_field). The rem should be replaced with a multiply and subtract:
%3 = sdiv i32 %A, %B
%4 = srem i32 %A, %B
Similarly for udiv/urem. Note that this shouldn't be done on X86 or ARM,
which can do this in a single operation (instruction or libcall). It is
probably best to do this in the code generator.
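The intended rewrite, sketched in C terms (assuming the sdiv result is already
needed, as in the pattern above):

// When the quotient is already computed, the remainder can reuse it:
//   A % B  ==  A - (A / B) * B
// trading a second divide for a multiply and a subtract on targets without a
// combined divrem operation.
int rem_via_div(int A, int B) {
  int q = A / B;       // %3 = sdiv i32 %A, %B  (already present)
  return A - q * B;    // replaces %4 = srem i32 %A, %B
}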
//===---------------------------------------------------------------------===//