"git@repo.hca.bsc.es:rferrer/llvm-epi-0.8.git" did not exist on "03071ab74c4f9efec0dab003eec5ec9c604a868c"
Newer
Older
continue;
}
}
// Ignore icmp instructions which are already being analyzed.
if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
unsigned OtherIdx = !UI.getOperandNo();
Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
if (SE.getSCEV(OtherOp)->hasComputableLoopEvolution(L))
continue;
}
LSRFixup &LF = getNewFixup();
LF.UserInst = const_cast<Instruction *>(UserInst);
LF.OperandValToReplace = UI.getUse();
std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, 0);
LF.LUIdx = P.first;
LF.Offset = P.second;
LSRUse &LU = Uses[LF.LUIdx];
LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
InsertSupplementalFormula(U, LU, LF.LUIdx);
CountRegisters(LU.Formulae.back(), Uses.size() - 1);
break;
}
}
}
}
/// CollectSubexprs - Split S into subexpressions which can be pulled out into
/// separate registers. If C is non-null, multiply each subexpression by C.
static void CollectSubexprs(const SCEV *S, const SCEVConstant *C,
SmallVectorImpl<const SCEV *> &Ops,
ScalarEvolution &SE) {
if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
// Break out add operands.
for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
I != E; ++I)
CollectSubexprs(*I, C, Ops, SE);
return;
} else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
// Split a non-zero base out of an addrec.
if (!AR->getStart()->isZero()) {
CollectSubexprs(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
AR->getStepRecurrence(SE),
AR->getLoop()), C, Ops, SE);
CollectSubexprs(AR->getStart(), C, Ops, SE);
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
return;
}
} else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
// Break (C * (a + b + c)) into C*a + C*b + C*c.
if (Mul->getNumOperands() == 2)
if (const SCEVConstant *Op0 =
dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
CollectSubexprs(Mul->getOperand(1),
C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0,
Ops, SE);
return;
}
}
// Otherwise use the value itself.
Ops.push_back(C ? SE.getMulExpr(C, S) : S);
}
/// GenerateReassociations - Split out subexpressions from adds and the bases of
/// addrecs.
void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
Formula Base,
unsigned Depth) {
// Arbitrarily cap recursion to protect compile time.
if (Depth >= 3) return;
for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
const SCEV *BaseReg = Base.BaseRegs[i];
SmallVector<const SCEV *, 8> AddOps;
CollectSubexprs(BaseReg, 0, AddOps, SE);
if (AddOps.size() == 1) continue;
for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
JE = AddOps.end(); J != JE; ++J) {
// Don't pull a constant into a register if the constant could be folded
// into an immediate field.
if (isAlwaysFoldable(*J, LU.MinOffset, LU.MaxOffset,
Base.getNumRegs() > 1,
LU.Kind, LU.AccessTy, TLI, SE))
continue;
// Collect all operands except *J.
SmallVector<const SCEV *, 8> InnerAddOps;
for (SmallVectorImpl<const SCEV *>::const_iterator K = AddOps.begin(),
KE = AddOps.end(); K != KE; ++K)
if (K != J)
InnerAddOps.push_back(*K);
// Don't leave just a constant behind in a register if the constant could
// be folded into an immediate field.
if (InnerAddOps.size() == 1 &&
isAlwaysFoldable(InnerAddOps[0], LU.MinOffset, LU.MaxOffset,
Base.getNumRegs() > 1,
LU.Kind, LU.AccessTy, TLI, SE))
continue;
const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
if (InnerSum->isZero())
continue;
Formula F = Base;
F.BaseRegs[i] = InnerSum;
F.BaseRegs.push_back(*J);
if (InsertFormula(LU, LUIdx, F))
// If that formula hadn't been seen before, recurse to find more like
// it.
GenerateReassociations(LU, LUIdx, LU.Formulae.back(), Depth+1);
}
}
}
/// GenerateCombinations - Generate a formula consisting of all of the
/// loop-dominating registers added into a single register.
void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
// This method is only interesting on a plurality of registers.
if (Base.BaseRegs.size() <= 1) return;
Formula F = Base;
F.BaseRegs.clear();
SmallVector<const SCEV *, 4> Ops;
for (SmallVectorImpl<const SCEV *>::const_iterator
I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) {
const SCEV *BaseReg = *I;
if (BaseReg->properlyDominates(L->getHeader(), &DT) &&
!BaseReg->hasComputableLoopEvolution(L))
Ops.push_back(BaseReg);
else
F.BaseRegs.push_back(BaseReg);
}
if (Ops.size() > 1) {
const SCEV *Sum = SE.getAddExpr(Ops);
// TODO: If Sum is zero, it probably means ScalarEvolution missed an
// opportunity to fold something. For now, just ignore such cases
if (!Sum->isZero()) {
F.BaseRegs.push_back(Sum);
(void)InsertFormula(LU, LUIdx, F);
}
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
}
}
/// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets.
void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
Formula Base) {
// We can't add a symbolic offset if the address already contains one.
if (Base.AM.BaseGV) return;
for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
const SCEV *G = Base.BaseRegs[i];
GlobalValue *GV = ExtractSymbol(G, SE);
if (G->isZero() || !GV)
continue;
Formula F = Base;
F.AM.BaseGV = GV;
if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
LU.Kind, LU.AccessTy, TLI))
continue;
F.BaseRegs[i] = G;
(void)InsertFormula(LU, LUIdx, F);
}
}
/// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets.
void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
Formula Base) {
// TODO: For now, just add the min and max offset, because it usually isn't
// worthwhile looking at everything inbetween.
SmallVector<int64_t, 4> Worklist;
Worklist.push_back(LU.MinOffset);
if (LU.MaxOffset != LU.MinOffset)
Worklist.push_back(LU.MaxOffset);
for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
const SCEV *G = Base.BaseRegs[i];
for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(),
E = Worklist.end(); I != E; ++I) {
Formula F = Base;
F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs - *I;
if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I,
LU.Kind, LU.AccessTy, TLI)) {
F.BaseRegs[i] = SE.getAddExpr(G, SE.getConstant(G->getType(), *I));
2189
2190
2191
2192
2193
2194
2195
2196
2197
2198
2199
2200
2201
2202
2203
2204
2205
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
(void)InsertFormula(LU, LUIdx, F);
}
}
int64_t Imm = ExtractImmediate(G, SE);
if (G->isZero() || Imm == 0)
continue;
Formula F = Base;
F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Imm;
if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
LU.Kind, LU.AccessTy, TLI))
continue;
F.BaseRegs[i] = G;
(void)InsertFormula(LU, LUIdx, F);
}
}
/// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up
/// the comparison. For example, x == y -> x*c == y*c.
void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
Formula Base) {
if (LU.Kind != LSRUse::ICmpZero) return;
// Determine the integer type for the base formula.
const Type *IntTy = Base.getType();
if (!IntTy) return;
if (SE.getTypeSizeInBits(IntTy) > 64) return;
// Don't do this if there is more than one offset.
if (LU.MinOffset != LU.MaxOffset) return;
assert(!Base.AM.BaseGV && "ICmpZero use is not legal!");
// Check each interesting stride.
for (SmallSetVector<int64_t, 8>::const_iterator
I = Factors.begin(), E = Factors.end(); I != E; ++I) {
int64_t Factor = *I;
Formula F = Base;
// Check that the multiplication doesn't overflow.
if (F.AM.BaseOffs == INT64_MIN && Factor == -1)
continue;
F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs * Factor;
if (F.AM.BaseOffs / Factor != Base.AM.BaseOffs)
continue;
// Check that multiplying with the use offset doesn't overflow.
int64_t Offset = LU.MinOffset;
if (Offset == INT64_MIN && Factor == -1)
continue;
Offset = (uint64_t)Offset * Factor;
continue;
// Check that this scale is legal.
if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, TLI))
continue;
// Compensate for the use having MinOffset built into it.
F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Offset - LU.MinOffset;
const SCEV *FactorS = SE.getConstant(IntTy, Factor);
// Check that multiplying with each base register doesn't overflow.
for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
goto next;
}
// Check that multiplying with the scaled register doesn't overflow.
if (F.ScaledReg) {
F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
continue;
}
// If we make it here and it's legal, add it.
(void)InsertFormula(LU, LUIdx, F);
next:;
}
}
/// GenerateScales - Generate stride factor reuse formulae by making use of
/// scaled-offset address modes, for example.
void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx,
Formula Base) {
// Determine the integer type for the base formula.
const Type *IntTy = Base.getType();
if (!IntTy) return;
// If this Formula already has a scaled register, we can't add another one.
if (Base.AM.Scale != 0) return;
// Check each interesting stride.
for (SmallSetVector<int64_t, 8>::const_iterator
I = Factors.begin(), E = Factors.end(); I != E; ++I) {
int64_t Factor = *I;
Base.AM.Scale = Factor;
Base.AM.HasBaseReg = Base.BaseRegs.size() > 1;
// Check whether this scale is going to be legal.
if (!isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
LU.Kind, LU.AccessTy, TLI)) {
// As a special-case, handle special out-of-loop Basic users specially.
// TODO: Reconsider this special case.
if (LU.Kind == LSRUse::Basic &&
isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
LSRUse::Special, LU.AccessTy, TLI) &&
LU.AllFixupsOutsideLoop)
LU.Kind = LSRUse::Special;
else
continue;
}
// For an ICmpZero, negating a solitary base register won't lead to
// new solutions.
if (LU.Kind == LSRUse::ICmpZero &&
!Base.AM.HasBaseReg && Base.AM.BaseOffs == 0 && !Base.AM.BaseGV)
continue;
// For each addrec base reg, apply the scale, if possible.
for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
if (const SCEVAddRecExpr *AR =
dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
const SCEV *FactorS = SE.getConstant(IntTy, Factor);
if (FactorS->isZero())
continue;
// Divide out the factor, ignoring high bits, since we'll be
// scaling the value back up in the end.
if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
// TODO: This could be optimized to avoid all the copying.
Formula F = Base;
F.ScaledReg = Quotient;
std::swap(F.BaseRegs[i], F.BaseRegs.back());
F.BaseRegs.pop_back();
(void)InsertFormula(LU, LUIdx, F);
}
}
}
}
/// GenerateTruncates - Generate reuse formulae from different IV types.
void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx,
Formula Base) {
// This requires TargetLowering to tell us which truncates are free.
if (!TLI) return;
// Don't bother truncating symbolic values.
if (Base.AM.BaseGV) return;
// Determine the integer type for the base formula.
const Type *DstTy = Base.getType();
if (!DstTy) return;
DstTy = SE.getEffectiveSCEVType(DstTy);
for (SmallSetVector<const Type *, 4>::const_iterator
I = Types.begin(), E = Types.end(); I != E; ++I) {
const Type *SrcTy = *I;
if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) {
Formula F = Base;
if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I);
for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(),
JE = F.BaseRegs.end(); J != JE; ++J)
*J = SE.getAnyExtendExpr(*J, SrcTy);
// TODO: This assumes we've done basic processing on all uses and
// have an idea what the register usage is.
if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
continue;
(void)InsertFormula(LU, LUIdx, F);
}
}
}
namespace {
/// WorkItem - Helper class for GenerateCrossUseConstantOffsets. It's used to
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402
2403
2404
2405
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
/// defer modifications so that the search phase doesn't have to worry about
/// the data structures moving underneath it.
struct WorkItem {
size_t LUIdx;
int64_t Imm;
const SCEV *OrigReg;
WorkItem(size_t LI, int64_t I, const SCEV *R)
: LUIdx(LI), Imm(I), OrigReg(R) {}
void print(raw_ostream &OS) const;
void dump() const;
};
}
void WorkItem::print(raw_ostream &OS) const {
OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
<< " , add offset " << Imm;
}
void WorkItem::dump() const {
print(errs()); errs() << '\n';
}
/// GenerateCrossUseConstantOffsets - Look for registers which are a constant
/// distance apart and try to form reuse opportunities between them.
void LSRInstance::GenerateCrossUseConstantOffsets() {
// Group the registers by their value without any added constant offset.
typedef std::map<int64_t, const SCEV *> ImmMapTy;
typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy;
RegMapTy Map;
DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
SmallVector<const SCEV *, 8> Sequence;
for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
I != E; ++I) {
const SCEV *Reg = *I;
int64_t Imm = ExtractImmediate(Reg, SE);
std::pair<RegMapTy::iterator, bool> Pair =
Map.insert(std::make_pair(Reg, ImmMapTy()));
if (Pair.second)
Sequence.push_back(Reg);
Pair.first->second.insert(std::make_pair(Imm, *I));
UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(*I);
}
// Now examine each set of registers with the same base value. Build up
// a list of work to do and do the work in a separate step so that we're
// not adding formulae and register counts while we're searching.
SmallVector<WorkItem, 32> WorkItems;
SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(),
E = Sequence.end(); I != E; ++I) {
const SCEV *Reg = *I;
const ImmMapTy &Imms = Map.find(Reg)->second;
// It's not worthwhile looking for reuse if there's only one offset.
if (Imms.size() == 1)
continue;
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
2453
2454
2455
DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
J != JE; ++J)
dbgs() << ' ' << J->first;
dbgs() << '\n');
// Examine each offset.
for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
J != JE; ++J) {
const SCEV *OrigReg = J->second;
int64_t JImm = J->first;
const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
if (!isa<SCEVConstant>(OrigReg) &&
UsedByIndicesMap[Reg].count() == 1) {
DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
continue;
}
// Conservatively examine offsets between this orig reg a few selected
// other orig regs.
ImmMapTy::const_iterator OtherImms[] = {
Imms.begin(), prior(Imms.end()),
Imms.upper_bound((Imms.begin()->first + prior(Imms.end())->first) / 2)
};
for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
ImmMapTy::const_iterator M = OtherImms[i];
if (M == J || M == JE) continue;
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
// Compute the difference between the two.
int64_t Imm = (uint64_t)JImm - M->first;
for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
LUIdx = UsedByIndices.find_next(LUIdx))
// Make a memo of this use, offset, and register tuple.
if (UniqueItems.insert(std::make_pair(LUIdx, Imm)))
WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
}
}
}
Map.clear();
Sequence.clear();
UsedByIndicesMap.clear();
UniqueItems.clear();
// Now iterate through the worklist and add new formulae.
for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(),
E = WorkItems.end(); I != E; ++I) {
const WorkItem &WI = *I;
size_t LUIdx = WI.LUIdx;
LSRUse &LU = Uses[LUIdx];
int64_t Imm = WI.Imm;
const SCEV *OrigReg = WI.OrigReg;
const Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
Formula F = LU.Formulae[L];
// Use the immediate in the scaled register.
if (F.ScaledReg == OrigReg) {
int64_t Offs = (uint64_t)F.AM.BaseOffs +
Imm * (uint64_t)F.AM.Scale;
// Don't create 50 + reg(-50).
if (F.referencesReg(SE.getSCEV(
ConstantInt::get(IntTy, -(uint64_t)Offs))))
continue;
Formula NewF = F;
NewF.AM.BaseOffs = Offs;
if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
LU.Kind, LU.AccessTy, TLI))
continue;
NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
// If the new scale is a constant in a register, and adding the constant
// value to the immediate would produce a value closer to zero than the
// immediate itself, then the formula isn't worthwhile.
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
if (C->getValue()->getValue().isNegative() !=
(NewF.AM.BaseOffs < 0) &&
(C->getValue()->getValue().abs() * APInt(BitWidth, F.AM.Scale))
.ule(abs64(NewF.AM.BaseOffs)))
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
continue;
// OK, looks good.
(void)InsertFormula(LU, LUIdx, NewF);
} else {
// Use the immediate in a base register.
for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
const SCEV *BaseReg = F.BaseRegs[N];
if (BaseReg != OrigReg)
continue;
Formula NewF = F;
NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm;
if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
LU.Kind, LU.AccessTy, TLI))
continue;
NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
// If the new formula has a constant in a register, and adding the
// constant value to the immediate would produce a value closer to
// zero than the immediate itself, then the formula isn't worthwhile.
for (SmallVectorImpl<const SCEV *>::const_iterator
J = NewF.BaseRegs.begin(), JE = NewF.BaseRegs.end();
J != JE; ++J)
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J))
if (C->getValue()->getValue().isNegative() !=
(NewF.AM.BaseOffs < 0) &&
C->getValue()->getValue().abs()
.ule(abs64(NewF.AM.BaseOffs)))
goto skip_formula;
// Ok, looks good.
(void)InsertFormula(LU, LUIdx, NewF);
break;
skip_formula:;
}
}
}
}
}
/// GenerateAllReuseFormulae - Generate formulae for each use.
void
LSRInstance::GenerateAllReuseFormulae() {
// This is split into multiple loops so that hasRegsUsedByUsesOtherThan
// queries are more precise.
for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
LSRUse &LU = Uses[LUIdx];
for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
}
for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
LSRUse &LU = Uses[LUIdx];
for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
GenerateScales(LU, LUIdx, LU.Formulae[i]);
}
for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
LSRUse &LU = Uses[LUIdx];
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
}
GenerateCrossUseConstantOffsets();
}
/// If their are multiple formulae with the same set of registers used
/// by other uses, pick the best one and delete the others.
void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
#ifndef NDEBUG
bool Changed = false;
#endif
// Collect the best formula for each unique set of shared registers. This
// is reset for each use.
typedef DenseMap<SmallVector<const SCEV *, 2>, size_t, UniquifierDenseMapInfo>
BestFormulaeTy;
BestFormulaeTy BestFormulae;
for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
LSRUse &LU = Uses[LUIdx];
FormulaSorter Sorter(L, LU, SE, DT);
DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << "\n");
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
for (size_t FIdx = 0, NumForms = LU.Formulae.size();
FIdx != NumForms; ++FIdx) {
Formula &F = LU.Formulae[FIdx];
SmallVector<const SCEV *, 2> Key;
for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(),
JE = F.BaseRegs.end(); J != JE; ++J) {
const SCEV *Reg = *J;
if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
Key.push_back(Reg);
}
if (F.ScaledReg &&
RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
Key.push_back(F.ScaledReg);
// Unstable sort by host order ok, because this is only used for
// uniquifying.
std::sort(Key.begin(), Key.end());
std::pair<BestFormulaeTy::const_iterator, bool> P =
BestFormulae.insert(std::make_pair(Key, FIdx));
if (!P.second) {
Formula &Best = LU.Formulae[P.first->second];
if (Sorter.operator()(F, Best))
std::swap(F, Best);
DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
dbgs() << "\n"
" in favor of formula "; Best.print(dbgs());
dbgs() << '\n');
#ifndef NDEBUG
Changed = true;
#endif
std::swap(F, LU.Formulae.back());
LU.Formulae.pop_back();
--FIdx;
--NumForms;
continue;
}
}
// Now that we've filtered out some formulae, recompute the Regs set.
LU.Regs.clear();
for (size_t FIdx = 0, NumForms = LU.Formulae.size();
FIdx != NumForms; ++FIdx) {
Formula &F = LU.Formulae[FIdx];
if (F.ScaledReg) LU.Regs.insert(F.ScaledReg);
LU.Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
}
// Reset this to prepare for the next use.
BestFormulae.clear();
}
DEBUG(if (Changed) {
dbgs() << "\n"
"After filtering out undesirable candidates:\n";
print_uses(dbgs());
});
}
/// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of
/// formulae to choose from, use some rough heuristics to prune down the number
/// of formulae. This keeps the main solver from taking an extraordinary amount
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
/// of time in some worst-case scenarios.
void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
// This is a rough guess that seems to work fairly well.
const size_t Limit = UINT16_MAX;
SmallPtrSet<const SCEV *, 4> Taken;
for (;;) {
// Estimate the worst-case number of solutions we might consider. We almost
// never consider this many solutions because we prune the search space,
// but the pruning isn't always sufficient.
uint32_t Power = 1;
for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
E = Uses.end(); I != E; ++I) {
size_t FSize = I->Formulae.size();
if (FSize >= Limit) {
Power = Limit;
break;
}
Power *= FSize;
if (Power >= Limit)
break;
}
if (Power < Limit)
break;
// Ok, we have too many of formulae on our hands to conveniently handle.
// Use a rough heuristic to thin out the list.
// Pick the register which is used by the most LSRUses, which is likely
// to be a good reuse register candidate.
const SCEV *Best = 0;
unsigned BestNum = 0;
for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
I != E; ++I) {
const SCEV *Reg = *I;
if (Taken.count(Reg))
continue;
if (!Best)
Best = Reg;
else {
unsigned Count = RegUses.getUsedByIndices(Reg).count();
if (Count > BestNum) {
Best = Reg;
BestNum = Count;
}
}
}
DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
Taken.insert(Best);
// In any use with formulae which references this register, delete formulae
// which don't reference it.
for (SmallVectorImpl<LSRUse>::iterator I = Uses.begin(),
E = Uses.end(); I != E; ++I) {
LSRUse &LU = *I;
if (!LU.Regs.count(Best)) continue;
// Clear out the set of used regs; it will be recomputed.
LU.Regs.clear();
for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
Formula &F = LU.Formulae[i];
if (!F.referencesReg(Best)) {
DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
std::swap(LU.Formulae.back(), F);
LU.Formulae.pop_back();
--e;
--i;
assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
continue;
}
if (F.ScaledReg) LU.Regs.insert(F.ScaledReg);
LU.Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
}
}
DEBUG(dbgs() << "After pre-selection:\n";
print_uses(dbgs()));
}
}
/// SolveRecurse - This is the recursive solver.
void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
Cost &SolutionCost,
SmallVectorImpl<const Formula *> &Workspace,
const Cost &CurCost,
const SmallPtrSet<const SCEV *, 16> &CurRegs,
DenseSet<const SCEV *> &VisitedRegs) const {
// Some ideas:
// - prune more:
// - use more aggressive filtering
// - sort the formula so that the most profitable solutions are found first
// - sort the uses too
// - search faster:
// - don't compute a cost, and then compare. compare while computing a cost
// and bail early.
// - track register sets with SmallBitVector
const LSRUse &LU = Uses[Workspace.size()];
// If this use references any register that's already a part of the
// in-progress solution, consider it a requirement that a formula must
// reference that register in order to be considered. This prunes out
// unprofitable searching.
SmallSetVector<const SCEV *, 4> ReqRegs;
for (SmallPtrSet<const SCEV *, 16>::const_iterator I = CurRegs.begin(),
E = CurRegs.end(); I != E; ++I)
if (LU.Regs.count(*I))
ReqRegs.insert(*I);
bool AnySatisfiedReqRegs = false;
SmallPtrSet<const SCEV *, 16> NewRegs;
Cost NewCost;
retry:
for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
E = LU.Formulae.end(); I != E; ++I) {
const Formula &F = *I;
// Ignore formulae which do not use any of the required registers.
for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(),
JE = ReqRegs.end(); J != JE; ++J) {
const SCEV *Reg = *J;
if ((!F.ScaledReg || F.ScaledReg != Reg) &&
std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) ==
F.BaseRegs.end())
goto skip;
}
AnySatisfiedReqRegs = true;
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
// Evaluate the cost of the current formula. If it's already worse than
// the current best, prune the search at that point.
NewCost = CurCost;
NewRegs = CurRegs;
NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT);
if (NewCost < SolutionCost) {
Workspace.push_back(&F);
if (Workspace.size() != Uses.size()) {
SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
NewRegs, VisitedRegs);
if (F.getNumRegs() == 1 && Workspace.size() == 1)
VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
} else {
DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
dbgs() << ". Regs:";
for (SmallPtrSet<const SCEV *, 16>::const_iterator
I = NewRegs.begin(), E = NewRegs.end(); I != E; ++I)
dbgs() << ' ' << **I;
dbgs() << '\n');
SolutionCost = NewCost;
Solution = Workspace;
}
Workspace.pop_back();
}
skip:;
}
// If none of the formulae had all of the required registers, relax the
// constraint so that we don't exclude all formulae.
if (!AnySatisfiedReqRegs) {
assert(!ReqRegs.empty() && "Solver failed even without required registers");
ReqRegs.clear();
goto retry;
}
2832
2833
2834
2835
2836
2837
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850
2851
2852
2853
2854
2855
2856
2857
2858
2859
2860
2861
2862
2863
2864
2865
2866
2867
2868
2869
2870
}
void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
SmallVector<const Formula *, 8> Workspace;
Cost SolutionCost;
SolutionCost.Loose();
Cost CurCost;
SmallPtrSet<const SCEV *, 16> CurRegs;
DenseSet<const SCEV *> VisitedRegs;
Workspace.reserve(Uses.size());
SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
CurRegs, VisitedRegs);
// Ok, we've now made all our decisions.
DEBUG(dbgs() << "\n"
"The chosen solution requires "; SolutionCost.print(dbgs());
dbgs() << ":\n";
for (size_t i = 0, e = Uses.size(); i != e; ++i) {
dbgs() << " ";
Uses[i].print(dbgs());
dbgs() << "\n"
" ";
Solution[i]->print(dbgs());
dbgs() << '\n';
});
}
/// getImmediateDominator - A handy utility for the specific DominatorTree
/// query that we need here.
///
static BasicBlock *getImmediateDominator(BasicBlock *BB, DominatorTree &DT) {
DomTreeNode *Node = DT.getNode(BB);
if (!Node) return 0;
Node = Node->getIDom();
if (!Node) return 0;
return Node->getBlock();
}
2871
2872
2873
2874
2875
2876
2877
2878
2879
2880
2881
2882
2883
2884
2885
2886
2887
2888
2889
2890
2891
2892
2893
2894
2895
2896
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
/// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up
/// the dominator tree far as we can go while still being dominated by the
/// input positions. This helps canonicalize the insert position, which
/// encourages sharing.
BasicBlock::iterator
LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
const SmallVectorImpl<Instruction *> &Inputs)
const {
for (;;) {
const Loop *IPLoop = LI.getLoopFor(IP->getParent());
unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
BasicBlock *IDom;
for (BasicBlock *Rung = IP->getParent(); ; Rung = IDom) {
IDom = getImmediateDominator(Rung, DT);
if (!IDom) return IP;
// Don't climb into a loop though.
const Loop *IDomLoop = LI.getLoopFor(IDom);
unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
if (IDomDepth <= IPLoopDepth &&
(IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
break;
}
bool AllDominate = true;
Instruction *BetterPos = 0;
Instruction *Tentative = IDom->getTerminator();
for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(),
E = Inputs.end(); I != E; ++I) {
Instruction *Inst = *I;
if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
AllDominate = false;
break;
}
// Attempt to find an insert position in the middle of the block,
// instead of at the end, so that it can be used for other expansions.
if (IDom == Inst->getParent() &&
(!BetterPos || DT.dominates(BetterPos, Inst)))
BetterPos = llvm::next(BasicBlock::iterator(Inst));
}
if (!AllDominate)
break;
if (BetterPos)
IP = BetterPos;
else
IP = Tentative;
}
return IP;
}
/// AdjustInsertPositionForExpand - Determine an input position which will be
/// dominated by the operands and which will dominate the result.
BasicBlock::iterator
LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator IP,
const LSRFixup &LF,
const LSRUse &LU) const {
// Collect some instructions which must be dominated by the
// expanding replacement. These must be dominated by any operands that
// will be required in the expansion.
SmallVector<Instruction *, 4> Inputs;
if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
Inputs.push_back(I);
if (LU.Kind == LSRUse::ICmpZero)
if (Instruction *I =
dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
Inputs.push_back(I);
if (LF.PostIncLoops.count(L)) {
if (LF.isUseFullyOutsideLoop(L))
Inputs.push_back(L->getLoopLatch()->getTerminator());
else
Inputs.push_back(IVIncInsertPos);
}
// The expansion must also be dominated by the increment positions of any
// loops it for which it is using post-inc mode.
for (PostIncLoopSet::const_iterator I = LF.PostIncLoops.begin(),
E = LF.PostIncLoops.end(); I != E; ++I) {
const Loop *PIL = *I;
if (PIL == L) continue;
// Be dominated by the loop exit.
SmallVector<BasicBlock *, 4> ExitingBlocks;
PIL->getExitingBlocks(ExitingBlocks);
if (!ExitingBlocks.empty()) {
BasicBlock *BB = ExitingBlocks[0];
for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
Inputs.push_back(BB->getTerminator());
}
}
// Then, climb up the immediate dominator tree as far as we can go while
// still being dominated by the input positions.
IP = HoistInsertPosition(IP, Inputs);
// Don't insert instructions before PHI nodes.
while (isa<PHINode>(IP)) ++IP;
// Ignore debug intrinsics.
while (isa<DbgInfoIntrinsic>(IP)) ++IP;
return IP;
}
Value *LSRInstance::Expand(const LSRFixup &LF,
const Formula &F,
BasicBlock::iterator IP,
SCEVExpander &Rewriter,
SmallVectorImpl<WeakVH> &DeadInsts) const {
const LSRUse &LU = Uses[LF.LUIdx];
// Determine an input position which will be dominated by the operands and
// which will dominate the result.
IP = AdjustInsertPositionForExpand(IP, LF, LU);
// Inform the Rewriter if we have a post-increment use, so that it can
// perform an advantageous expansion.
Rewriter.setPostInc(LF.PostIncLoops);
// This is the type that the user actually needs.
const Type *OpTy = LF.OperandValToReplace->getType();
// This will be the type that we'll initially expand to.
const Type *Ty = F.getType();
if (!Ty)
// No type known; just expand directly to the ultimate type.
Ty = OpTy;
else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
// Expand directly to the ultimate type if it's the right size.
Ty = OpTy;