// NOTE(review): stray navigation artifacts ("Newer"/"Older") from a web
// scrape were removed here; they were never part of the source.
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
if (PN->getIncomingValue(i) == OperandValToReplace) {
BasicBlock *BB = PN->getIncomingBlock(i);
// If this is a critical edge, split the edge so that we do not insert
// the code on all predecessor/successor paths. We do this unless this
// is the canonical backedge for this loop, which complicates post-inc
// users.
if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
!isa<IndirectBrInst>(BB->getTerminator()) &&
(PN->getParent() != L->getHeader() || !L->contains(BB))) {
// Split the critical edge.
BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P);
// If PN is outside of the loop and BB is in the loop, we want to
// move the block to be immediately before the PHI block, not
// immediately after BB.
if (L->contains(BB) && !L->contains(PN))
NewBB->moveBefore(PN->getParent());
// Splitting the edge can reduce the number of PHI entries we have.
e = PN->getNumIncomingValues();
BB = NewBB;
i = PN->getBasicBlockIndex(BB);
std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
Inserted.insert(std::make_pair(BB, static_cast<Value *>(0)));
if (!Pair.second)
PN->setIncomingValue(i, Pair.first->second);
else {
Value *FullV = Expand(BB->getTerminator(),
L, Rewriter, DeadInsts, SE, DT);
// If this is reuse-by-noop-cast, insert the noop cast.
if (FullV->getType() != OpTy)
FullV =
CastInst::Create(CastInst::getCastOpcode(FullV, false,
OpTy, false),
FullV, OperandValToReplace->getType(),
"tmp", BB->getTerminator());
PN->setIncomingValue(i, FullV);
Pair.first->second = FullV;
}
} else {
Value *FullV = Expand(UserInst, L, Rewriter, DeadInsts, SE, DT);
// If this is reuse-by-noop-cast, insert the noop cast.
if (FullV->getType() != OpTy) {
Instruction *Cast =
CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
FullV, OpTy, "tmp", UserInst);
FullV = Cast;
}
// Update the user.
UserInst->replaceUsesOfWith(OperandValToReplace, FullV);
}
DeadInsts.push_back(OperandValToReplace);
}
/// print - Print this use to the given stream, for debugging.
void LSRUse::print(raw_ostream &OS) const {
  OS << "LSR Use: Kind=";
  switch (Kind) {
  case Basic: OS << "Basic"; break;
  case Special: OS << "Special"; break;
  case ICmpZero: OS << "ICmpZero"; break;
  case Address:
    OS << "Address of ";
    if (isa<PointerType>(AccessTy))
      OS << "pointer"; // the full pointer type could be really verbose
    else
      OS << *AccessTy;
  }

  OS << ", UserInst=";
  // Store is common and interesting enough to be worth special-casing.
  if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
    OS << "store ";
    WriteAsOperand(OS, Store->getOperand(0), /*PrintType=*/false);
  } else if (UserInst->getType()->isVoidTy())
    OS << UserInst->getOpcodeName();
  else
    WriteAsOperand(OS, UserInst, /*PrintType=*/false);

  OS << ", OperandValToReplace=";
  WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false);

  if (PostIncLoop) {
    OS << ", PostIncLoop=";
    WriteAsOperand(OS, PostIncLoop->getHeader(), /*PrintType=*/false);
  }
}
/// dump - Print this use to stderr, for debugging.
void LSRUse::dump() const {
  print(errs()); errs() << '\n';
}
namespace {
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
/// Score - This class is used to measure and compare candidate formulae.
class Score {
unsigned NumRegs;
unsigned NumPhis;
unsigned NumIVMuls;
unsigned NumBaseAdds;
unsigned NumImms;
public:
Score()
: NumRegs(0), NumPhis(0), NumIVMuls(0), NumBaseAdds(0), NumImms(0) {}
void RateInitial(SmallVector<LSRUse, 16> const &Uses, const Loop *L,
ScalarEvolution &SE);
void Rate(const SCEV *Reg, const SmallBitVector &Bits,
const SmallVector<LSRUse, 16> &Uses, const Loop *L,
ScalarEvolution &SE);
bool operator<(const Score &Other) const;
void print_details(raw_ostream &OS, const SCEV *Reg,
const SmallPtrSet<const SCEV *, 8> &Regs) const;
void print(raw_ostream &OS) const;
void dump() const;
private:
void RateRegister(const SCEV *Reg, SmallPtrSet<const SCEV *, 8> &Regs,
const Loop *L);
void RateFormula(const Formula &F, SmallPtrSet<const SCEV *, 8> &Regs,
const Loop *L);
void Loose();
};
}
/// RateRegister - Tally up interesting quantities from the given register.
void Score::RateRegister(const SCEV *Reg,
                         SmallPtrSet<const SCEV *, 8> &Regs,
                         const Loop *L) {
  // Only rate each distinct register once; Regs.insert returns false on a
  // repeat.
  if (Regs.insert(Reg))
    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
      // An addrec for the current loop will become a PHI.
      NumPhis += AR->getLoop() == L;

      // Add the step value register, if it needs one.
      if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1)))
        RateRegister(AR->getOperand(1), Regs, L);
    }
}
void Score::RateFormula(const Formula &F,
SmallPtrSet<const SCEV *, 8> &Regs,
const Loop *L) {
// Tally up the registers.
if (F.ScaledReg)
RateRegister(F.ScaledReg, Regs, L);
for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
E = F.BaseRegs.end(); I != E; ++I) {
const SCEV *BaseReg = *I;
RateRegister(BaseReg, Regs, L);
NumIVMuls += isa<SCEVMulExpr>(BaseReg) &&
BaseReg->hasComputableLoopEvolution(L);
}
if (F.BaseRegs.size() > 1)
NumBaseAdds += F.BaseRegs.size() - 1;
Evan Cheng
committed
// Tally up the non-zero immediates.
if (F.AM.BaseGV || F.AM.BaseOffs != 0)
++NumImms;
}
Evan Cheng
committed
/// Loose - Set this score to a losing value: saturate every metric so that
/// any real score compares as better via operator<.
void Score::Loose() {
  NumRegs = ~0u;
  NumPhis = ~0u;
  NumIVMuls = ~0u;
  NumBaseAdds = ~0u;
  NumImms = ~0u;
}
/// RateInitial - Compute a score for the initial "fully reduced" solution.
void Score::RateInitial(SmallVector<LSRUse, 16> const &Uses, const Loop *L,
                        ScalarEvolution &SE) {
  SmallPtrSet<const SCEV *, 8> Regs;
  // Rate the leading formula of every use; the Regs set de-duplicates
  // registers shared between uses.
  for (size_t Idx = 0, NumUses = Uses.size(); Idx != NumUses; ++Idx)
    RateFormula(Uses[Idx].Formulae.front(), Regs, L);
  NumRegs += Regs.size();

  DEBUG(print_details(dbgs(), 0, Regs));
}
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
/// Rate - Compute a score for the solution where the reuse associated with
/// putting Reg in a register is selected.
void Score::Rate(const SCEV *Reg, const SmallBitVector &Bits,
const SmallVector<LSRUse, 16> &Uses, const Loop *L,
ScalarEvolution &SE) {
SmallPtrSet<const SCEV *, 8> Regs;
for (size_t i = 0, e = Uses.size(); i != e; ++i) {
const LSRUse &LU = Uses[i];
const Formula *BestFormula = 0;
if (i >= Bits.size() || !Bits.test(i))
// This use doesn't use the current register. Just go with the current
// leading candidate formula.
BestFormula = &LU.Formulae.front();
else
// Find the best formula for this use that uses the current register.
for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
E = LU.Formulae.end(); I != E; ++I) {
const Formula &F = *I;
if (F.referencesReg(Reg) &&
(!BestFormula || ComplexitySorter()(F, *BestFormula)))
BestFormula = &F;
}
// If we didn't find *any* forumlae, because earlier we eliminated some
// in greedy fashion, skip the current register's reuse opportunity.
if (!BestFormula) {
DEBUG(dbgs() << "Reuse with reg " << *Reg
<< " wouldn't help any users.\n");
Loose();
return;
}
// For an in-loop post-inc user, don't allow multiple base registers,
// because that would require an awkward in-loop add after the increment.
if (LU.PostIncLoop && LU.PostIncLoop->contains(LU.UserInst) &&
BestFormula->BaseRegs.size() > 1) {
DEBUG(dbgs() << "Reuse with reg " << *Reg
<< " would require an in-loop post-inc add: ";
BestFormula->dump());
Loose();
return;
}
RateFormula(*BestFormula, Regs, L);
}
NumRegs += Regs.size();
DEBUG(print_details(dbgs(), Reg, Regs));
}
/// operator< - Choose the better score: compare the metrics
/// lexicographically, most significant first.
bool Score::operator<(const Score &Other) const {
  if (NumRegs < Other.NumRegs) return true;
  if (NumRegs > Other.NumRegs) return false;
  if (NumPhis < Other.NumPhis) return true;
  if (NumPhis > Other.NumPhis) return false;
  if (NumIVMuls < Other.NumIVMuls) return true;
  if (NumIVMuls > Other.NumIVMuls) return false;
  if (NumBaseAdds < Other.NumBaseAdds) return true;
  if (NumBaseAdds > Other.NumBaseAdds) return false;
  return NumImms < Other.NumImms;
}
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
void Score::print_details(raw_ostream &OS,
const SCEV *Reg,
const SmallPtrSet<const SCEV *, 8> &Regs) const {
if (Reg) OS << "Reuse with reg " << *Reg << " would require ";
else OS << "The initial solution would require ";
print(OS);
OS << ". Regs:";
for (SmallPtrSet<const SCEV *, 8>::const_iterator I = Regs.begin(),
E = Regs.end(); I != E; ++I)
OS << ' ' << **I;
OS << '\n';
}
/// print - Print this score to the given stream, pluralizing each metric's
/// label as needed and omitting metrics that are zero.
void Score::print(raw_ostream &OS) const {
  const char *RegSuffix = (NumRegs == 1) ? "" : "s";
  OS << NumRegs << " reg" << RegSuffix;
  if (NumPhis != 0) {
    const char *PhiSuffix = (NumPhis == 1) ? "" : "s";
    OS << ", including " << NumPhis << " PHI" << PhiSuffix;
  }
  if (NumIVMuls != 0) {
    const char *MulSuffix = (NumIVMuls == 1) ? "" : "s";
    OS << ", plus " << NumIVMuls << " IV mul" << MulSuffix;
  }
  if (NumBaseAdds != 0) {
    const char *AddSuffix = (NumBaseAdds == 1) ? "" : "s";
    OS << ", plus " << NumBaseAdds << " base add" << AddSuffix;
  }
  if (NumImms != 0) {
    const char *ImmSuffix = (NumImms == 1) ? "" : "s";
    OS << ", plus " << NumImms << " imm" << ImmSuffix;
  }
}
/// dump - Print this score to stderr, for debugging.
void Score::dump() const {
  print(errs());
  errs() << '\n';
}
/// isAddressUse - Returns true if the specified instruction is using the
/// specified value as an address.
static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
  // Loads always use their operand as an address.
  if (isa<LoadInst>(Inst))
    return true;

  // Stores use operand 1 (the pointer) as an address.
  if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
    return SI->getOperand(1) == OperandVal;

  // Addressing modes can also be folded into prefetches and a variety
  // of intrinsics.
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
    switch (II->getIntrinsicID()) {
    default: break;
    case Intrinsic::prefetch:
    case Intrinsic::x86_sse2_loadu_dq:
    case Intrinsic::x86_sse2_loadu_pd:
    case Intrinsic::x86_sse_loadu_ps:
    case Intrinsic::x86_sse_storeu_ps:
    case Intrinsic::x86_sse2_storeu_pd:
    case Intrinsic::x86_sse2_storeu_dq:
    case Intrinsic::x86_sse2_storel_dq:
      return II->getOperand(1) == OperandVal;
    }

  return false;
}
/// getAccessType - Return the type of the memory being accessed.
static const Type *getAccessType(const Instruction *Inst) {
  // For loads (and anything else), the instruction's own type is the
  // accessed type.
  const Type *AccessTy = Inst->getType();
  if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
    AccessTy = SI->getOperand(0)->getType();
  else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
    // Addressing modes can also be folded into prefetches and a variety
    // of intrinsics.
    switch (II->getIntrinsicID()) {
    default: break;
    case Intrinsic::x86_sse_storeu_ps:
    case Intrinsic::x86_sse2_storeu_pd:
    case Intrinsic::x86_sse2_storeu_dq:
    case Intrinsic::x86_sse2_storel_dq:
      AccessTy = II->getOperand(1)->getType();
      break;
    }
  }
  return AccessTy;
}
/// DeleteTriviallyDeadInstructions - If any of the instructions in the
/// specified set are trivially dead, delete them and see if this makes any of
/// their operands subsequently dead. Returns true if anything was deleted.
static bool
DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
  bool Changed = false;
  while (!DeadInsts.empty()) {
    // Entries may have been nulled out by earlier deletions (WeakVH), so
    // skip anything that is no longer a live instruction.
    Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
    if (I == 0 || !isInstructionTriviallyDead(I))
      continue;
    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
      if (Instruction *U = dyn_cast<Instruction>(*OI)) {
        // Null the operand first so U's use count reflects the pending
        // deletion of I; only then can use_empty() detect new dead values.
        *OI = 0;
        if (U->use_empty())
          DeadInsts.push_back(U);
      }
    I->eraseFromParent();
    Changed = true;
  }
  return Changed;
}
namespace {
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
/// LSRInstance - This class holds state for the main loop strength
/// reduction logic.
class LSRInstance {
IVUsers &IU;
ScalarEvolution &SE;
DominatorTree &DT;
const TargetLowering *const TLI;
Loop *const L;
bool Changed;
/// CurrentArbitraryRegIndex - To ensure a deterministic ordering, assign an
/// arbitrary index value to each register as a sort tie breaker.
unsigned CurrentArbitraryRegIndex;
/// Factors - Interesting factors between use strides.
SmallSetVector<int64_t, 4> Factors;
/// Types - Interesting use types, to facilitate truncation reuse.
SmallSetVector<const Type *, 4> Types;
/// Uses - The list of interesting uses.
SmallVector<LSRUse, 16> Uses;
// TODO: Reorganize these data structures.
typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;
RegUsesTy RegUses;
SmallVector<const SCEV *, 16> RegSequence;
void OptimizeShadowIV();
bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse,
const SCEV* &CondStride);
ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
bool StrideMightBeShared(const SCEV* Stride);
bool OptimizeLoopTermCond(Instruction *&IVIncInsertPos);
LSRUse &getNewUse() {
Uses.push_back(LSRUse());
return Uses.back();
}
void CountRegister(const SCEV *Reg, uint32_t Complexity, size_t LUIdx);
void CountRegisters(const Formula &F, size_t LUIdx);
void GenerateSymbolicOffsetReuse(LSRUse &LU, unsigned LUIdx,
Formula Base);
void GenerateICmpZeroScaledReuse(LSRUse &LU, unsigned LUIdx,
Formula Base);
void GenerateFormulaeFromReplacedBaseReg(LSRUse &LU,
unsigned LUIdx,
const Formula &Base, unsigned i,
const SmallVectorImpl<const SCEV *>
&AddOps);
void GenerateReassociationReuse(LSRUse &LU, unsigned LUIdx,
Formula Base);
void GenerateCombinationReuse(LSRUse &LU, unsigned LUIdx,
Formula Base);
void GenerateScaledReuse(LSRUse &LU, unsigned LUIdx,
Formula Base);
void GenerateTruncateReuse(LSRUse &LU, unsigned LUIdx,
Formula Base);
void GenerateConstantOffsetReuse();
void GenerateAllReuseFormulae();
void GenerateLoopInvariantRegisterUses();
public:
LSRInstance(const TargetLowering *tli, Loop *l, Pass *P);
bool getChanged() const { return Changed; }
void print(raw_ostream &OS) const;
void dump() const;
};
}
Devang Patel
committed
/// OptimizeShadowIV - If IV is used in a int-to-float cast
/// inside the loop then try to eliminate the cast opeation.
void LSRInstance::OptimizeShadowIV() {
const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
Devang Patel
committed
return;
for (size_t StrideIdx = 0, e = IU.StrideOrder.size();
StrideIdx != e; ++StrideIdx) {
std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
IU.IVUsesByStride.find(IU.StrideOrder[StrideIdx]);
assert(SI != IU.IVUsesByStride.end() && "Stride doesn't exist!");
Devang Patel
committed
if (!isa<SCEVConstant>(SI->first))
continue;
for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(),
E = SI->second->Users.end(); UI != E; /* empty */) {
ilist<IVStrideUse>::iterator CandidateUI = UI;
++UI;
Instruction *ShadowUse = CandidateUI->getUser();
Devang Patel
committed
const Type *DestTy = NULL;
/* If shadow use is a int->float cast then insert a second IV
to eliminate this cast.
Devang Patel
committed
Devang Patel
committed
foo((double)i);
is transformed into
Devang Patel
committed
double d = 0.0;
Devang Patel
committed
foo(d);
*/
if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser()))
Devang Patel
committed
DestTy = UCast->getDestTy();
else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser()))
Devang Patel
committed
DestTy = SCast->getDestTy();
Devang Patel
committed
if (!DestTy) continue;
if (TLI) {
Evan Cheng
committed
// If target does not support DestTy natively then do not apply
// this transformation.
Owen Anderson
committed
EVT DVT = TLI->getValueType(DestTy);
Devang Patel
committed
if (!TLI->isTypeLegal(DVT)) continue;
}
Devang Patel
committed
PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
if (!PH) continue;
if (PH->getNumIncomingValues() != 2) continue;
const Type *SrcTy = PH->getType();
int Mantissa = DestTy->getFPMantissaWidth();
if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
Devang Patel
committed
continue;
unsigned Entry, Latch;
if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
Entry = 0;
Latch = 1;
} else {
Entry = 1;
Latch = 0;
}
Devang Patel
committed
ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
if (!Init) continue;
Constant *NewInit = ConstantFP::get(DestTy, Init->getZExtValue());
Devang Patel
committed
Devang Patel
committed
dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
if (!Incr) continue;
if (Incr->getOpcode() != Instruction::Add
&& Incr->getOpcode() != Instruction::Sub)
continue;
/* Initialize new IV, double d = 0.0 in above example. */
ConstantInt *C = NULL;
if (Incr->getOperand(0) == PH)
C = dyn_cast<ConstantInt>(Incr->getOperand(1));
else if (Incr->getOperand(1) == PH)
C = dyn_cast<ConstantInt>(Incr->getOperand(0));
else
continue;
if (!C) continue;
// Ignore negative constants, as the code below doesn't handle them
// correctly. TODO: Remove this restriction.
if (!C->getValue().isStrictlyPositive()) continue;
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
/* Add new PHINode. */
PHINode *NewPH = PHINode::Create(DestTy, "IV.S.", PH);
/* create new increment. '++d' in above example. */
Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
BinaryOperator *NewIncr =
BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
Instruction::FAdd : Instruction::FSub,
NewPH, CFP, "IV.S.next.", Incr);
NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
/* Remove cast operation */
ShadowUse->replaceAllUsesWith(NewPH);
ShadowUse->eraseFromParent();
break;
}
}
}
/// FindIVUserForCond - If Cond has an operand that is an expression of an IV,
/// set the IV user and stride information and return true, otherwise return
/// false.
bool LSRInstance::FindIVUserForCond(ICmpInst *Cond,
                                    IVStrideUse *&CondUse,
                                    const SCEV* &CondStride) {
  // Walk each known stride and scan its users for Cond.
  for (unsigned StrideIdx = 0, NumStrides = IU.StrideOrder.size();
       StrideIdx != NumStrides && !CondUse; ++StrideIdx) {
    std::map<const SCEV *, IVUsersOfOneStride *>::iterator It =
      IU.IVUsesByStride.find(IU.StrideOrder[StrideIdx]);
    assert(It != IU.IVUsesByStride.end() && "Stride doesn't exist!");

    for (ilist<IVStrideUse>::iterator U = It->second->Users.begin(),
         UEnd = It->second->Users.end(); U != UEnd; ++U)
      if (U->getUser() == Cond) {
        // NOTE: we could handle setcc instructions with multiple uses here, but
        // InstCombine does it as well for simple uses, it's not clear that it
        // occurs enough in real life to handle.
        CondUse = U;
        CondStride = It->first;
        return true;
      }
  }
  return false;
}
/// OptimizeMax - Rewrite the loop's terminating condition if it uses
/// a max computation.
///
/// This is a narrow solution to a specific, but acute, problem. For loops
/// like this:
///
///   i = 0;
///   do {
///     p[i] = 0.0;
///   } while (++i < n);
///
/// the trip count isn't just 'n', because 'n' might not be positive. And
/// unfortunately this can come up even for loops where the user didn't use
/// a C do-while loop. For example, seemingly well-behaved top-test loops
/// will commonly be lowered like this:
//
///   if (n > 0) {
///     i = 0;
///     do {
///       p[i] = 0.0;
///     } while (++i < n);
///   }
///
/// and then it's possible for subsequent optimization to obscure the if
/// test in such a way that indvars can't find it.
///
/// When indvars can't find the if test in loops like this, it creates a
/// max expression, which allows it to give the loop a canonical
/// induction variable:
///
///   i = 0;
///   max = n < 1 ? 1 : n;
///   do {
///     p[i] = 0.0;
///   } while (++i != max);
///
/// Canonical induction variables are necessary because the loop passes
/// are designed around them. The most obvious example of this is the
/// LoopInfo analysis, which doesn't remember trip count values. It
/// expects to be able to rediscover the trip count each time it is
/// needed, and it does this using a simple analysis that only succeeds if
/// the loop has a canonical induction variable.
///
/// However, when it comes time to generate code, the maximum operation
/// can be quite costly, especially if it's inside of an outer loop.
///
/// This function solves this problem by detecting this type of loop and
/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
/// the instructions for the maximum computation.
///
ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
  // Check that the loop matches the pattern we're looking for.
  if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
      Cond->getPredicate() != CmpInst::ICMP_NE)
    return Cond;

  SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
  if (!Sel || !Sel->hasOneUse()) return Cond;

  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
    return Cond;
  const SCEV *One = SE.getIntegerSCEV(1, BackedgeTakenCount->getType());

  // Add one to the backedge-taken count to get the trip count.
  const SCEV *IterationCount = SE.getAddExpr(BackedgeTakenCount, One);

  // Check for a max calculation that matches the pattern.
  if (!isa<SCEVSMaxExpr>(IterationCount) && !isa<SCEVUMaxExpr>(IterationCount))
    return Cond;
  const SCEVNAryExpr *Max = cast<SCEVNAryExpr>(IterationCount);
  if (Max != SE.getSCEV(Sel)) return Cond;

  // To handle a max with more than two operands, this optimization would
  // require additional checking and setup.
  if (Max->getNumOperands() != 2)
    return Cond;

  const SCEV *MaxLHS = Max->getOperand(0);
  const SCEV *MaxRHS = Max->getOperand(1);
  if (!MaxLHS || MaxLHS != One) return Cond;

  // Check the relevant induction variable for conformance to
  // the pattern.
  const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
  if (!AR || !AR->isAffine() ||
      AR->getStart() != One ||
      AR->getStepRecurrence(SE) != One)
    return Cond;

  assert(AR->getLoop() == L &&
         "Loop condition operand is an addrec in a different loop!");

  // Check the right operand of the select, and remember it, as it will
  // be used in the new comparison instruction.
  Value *NewRHS = 0;
  if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
    NewRHS = Sel->getOperand(1);
  else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
    NewRHS = Sel->getOperand(2);
  if (!NewRHS) return Cond;

  // Determine the new comparison opcode. It may be signed or unsigned,
  // and the original comparison may be either equality or inequality.
  CmpInst::Predicate Pred =
    isa<SCEVSMaxExpr>(Max) ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
  if (Cond->getPredicate() == CmpInst::ICMP_EQ)
    Pred = CmpInst::getInversePredicate(Pred);

  // Ok, everything looks ok to change the condition into an SLT or SGE and
  // delete the max calculation.
  ICmpInst *NewCond =
    new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");

  // Delete the max calculation instructions.
  Cond->replaceAllUsesWith(NewCond);
  CondUse->setUser(NewCond);
  Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
  Cond->eraseFromParent();
  Sel->eraseFromParent();
  if (Cmp->use_empty())
    Cmp->eraseFromParent();
  return NewCond;
}
bool LSRInstance::StrideMightBeShared(const SCEV* Stride) {
Evan Cheng
committed
int64_t SInt = cast<SCEVConstant>(Stride)->getValue()->getSExtValue();
for (unsigned i = 0, e = IU.StrideOrder.size(); i != e; ++i) {
Evan Cheng
committed
std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
IU.IVUsesByStride.find(IU.StrideOrder[i]);
Evan Cheng
committed
const SCEV *Share = SI->first;
if (!isa<SCEVConstant>(SI->first) || Share == Stride)
continue;
int64_t SSInt = cast<SCEVConstant>(Share)->getValue()->getSExtValue();
if (SSInt == SInt)
return true; // This can definitely be reused.
if (unsigned(abs64(SSInt)) < SInt || (SSInt % SInt) != 0)
continue;
int64_t Scale = SSInt / SInt;
// This AM will be used for conservative queries. At this point in the
// process we don't know which users will have a base reg, immediate,
// etc., so we conservatively assume that it may not, making more
// strides valid, thus erring on the side of assuming that there
// might be sharing.
TargetLowering::AddrMode AM;
AM.Scale = Scale;
// Any pre-inc iv use?
IVUsersOfOneStride &StrideUses = *IU.IVUsesByStride[Share];
for (ilist<IVStrideUse>::iterator I = StrideUses.Users.begin(),
E = StrideUses.Users.end(); I != E; ++I) {
bool isAddress = isAddressUse(I->getUser(), I->getOperandValToReplace());
if (!I->isUseOfPostIncrementedValue() &&
isLegalUse(AM, isAddress ? LSRUse::Address : LSRUse::Basic,
isAddress ? getAccessType(I->getUser()) : 0,
TLI))
Evan Cheng
committed
return true;
}
}
return false;
}
/// OptimizeLoopTermCond - Change loop terminating condition to use the
/// postinc iv when possible.
bool
LSRInstance::OptimizeLoopTermCond(Instruction *&IVIncInsertPos) {
  SmallPtrSet<Instruction *, 4> PostIncs;

  BasicBlock *LatchBlock = L->getLoopLatch();

  SmallVector<BasicBlock*, 8> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);

  for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
    BasicBlock *ExitingBlock = ExitingBlocks[i];

    // Get the terminating condition for the loop if possible. If we
    // can, we want to change it to use a post-incremented version of its
    // induction variable, to allow coalescing the live ranges for the IV into
    // one register value.

    BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
    if (!TermBr)
      continue;
    // FIXME: Overly conservative, termination condition could be an 'or' etc..
    if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
      continue;

    // Search IVUsesByStride to find Cond's IVUse if there is one.
    IVStrideUse *CondUse = 0;
    const SCEV *CondStride = 0;
    ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
    if (!FindIVUserForCond(Cond, CondUse, CondStride))
      continue;

    // If the trip count is computed in terms of a max (due to ScalarEvolution
    // being unable to find a sufficient guard, for example), change the loop
    // comparison to use SLT or ULT instead of NE.
    // One consequence of doing this now is that it disrupts the count-down
    // optimization. That's not always a bad thing though, because in such
    // cases it may still be worthwhile to avoid a max.
    Cond = OptimizeMax(Cond, CondUse);

    // If this exiting block is the latch block, and the condition has only
    // one use inside the loop (the branch), use the post-incremented value
    // of the induction variable
    if (ExitingBlock != LatchBlock) {
      // If this exiting block dominates the latch block, it may also use
      // the post-inc value if it won't be shared with other uses.
      // Check for dominance.
      if (!DT.dominates(ExitingBlock, LatchBlock))
        continue;
      // Check for sharing within the same stride.
      bool SameStrideSharing = false;
      IVUsersOfOneStride &StrideUses = *IU.IVUsesByStride[CondStride];
      for (ilist<IVStrideUse>::iterator I = StrideUses.Users.begin(),
           E = StrideUses.Users.end(); I != E; ++I) {
        if (I->getUser() == Cond)
          continue;
        if (!I->isUseOfPostIncrementedValue()) {
          SameStrideSharing = true;
          break;
        }
      }
      if (SameStrideSharing)
        continue;
      // Check for sharing from a different stride.
      if (isa<SCEVConstant>(CondStride) && StrideMightBeShared(CondStride))
        continue;
    }
    if (!Cond->hasOneUse()) {
      bool HasOneUseInLoop = true;
      for (Value::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
           UI != UE; ++UI) {
        Instruction *U = cast<Instruction>(*UI);
        if (U == TermBr)
          continue;
        if (L->contains(U)) {
          HasOneUseInLoop = false;
          break;
        }
      }
      if (!HasOneUseInLoop)
        continue;
    }

    // NOTE(review): the first line of this DEBUG statement was lost in a
    // corrupted scrape; message text reconstructed — confirm against upstream.
    DEBUG(dbgs() << "  Change loop exiting icmp to use postinc iv: "
                 << *Cond << '\n');

    // It's possible for the setcc instruction to be anywhere in the loop, and
    // possible for it to have multiple users. If it is not immediately before
    // the exiting block branch, move it.
    if (&*++BasicBlock::iterator(Cond) != TermBr) {
      if (Cond->hasOneUse()) { // Condition has a single use, just move it.
        Cond->moveBefore(TermBr);
      } else {
        // Otherwise, clone the terminating condition and insert into the
        // loopend.
        ICmpInst *OldCond = Cond;
        Cond = cast<ICmpInst>(Cond->clone());
        Cond->setName(L->getHeader()->getName() + ".termcond");
        ExitingBlock->getInstList().insert(TermBr, Cond);

        // Clone the IVUse, as the old use still exists!
        IU.IVUsesByStride[CondStride]->addUser(CondUse->getOffset(), Cond,
                                          CondUse->getOperandValToReplace());
        CondUse = &IU.IVUsesByStride[CondStride]->Users.back();
        TermBr->replaceUsesOfWith(OldCond, Cond);
      }
    }

    // If we get to here, we know that we can transform the setcc instruction to
    // use the post-incremented version of the IV, allowing us to coalesce the
    // live ranges for the IV correctly.
    CondUse->setOffset(SE.getMinusSCEV(CondUse->getOffset(), CondStride));
    CondUse->setIsUseOfPostIncrementedValue(true);
    Changed = true;

    PostIncs.insert(Cond);
  }

  // Determine an insertion point for the loop induction variable increment. It
  // must dominate all the post-inc comparisons we just set up, and it must
  // dominate the loop latch edge.
  IVIncInsertPos = L->getLoopLatch()->getTerminator();
  for (SmallPtrSet<Instruction *, 4>::iterator I = PostIncs.begin(),
       E = PostIncs.end(); I != E; ++I) {
    BasicBlock *BB =
      DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
                                    (*I)->getParent());
    if (BB == (*I)->getParent())
      IVIncInsertPos = *I;
    else if (BB != IVIncInsertPos->getParent())
      IVIncInsertPos = BB->getTerminator();
  }

  return Changed;
}
/// CountRegister - Note the given register, recording its sort index and
/// maximum formula complexity, and marking it as used by use number LUIdx.
void LSRInstance::CountRegister(const SCEV *Reg, uint32_t Complexity,
                                size_t LUIdx) {
  std::pair<RegUsesTy::iterator, bool> Pair =
    RegUses.insert(std::make_pair(Reg, RegSortData()));
  RegSortData &BV = Pair.first->second;
  if (Pair.second) {
    // First time we've seen this register; give it a deterministic
    // tie-breaking index and remember it in sequence order.
    BV.Index = CurrentArbitraryRegIndex++;
    BV.MaxComplexity = Complexity;
    RegSequence.push_back(Reg);
  } else {
    BV.MaxComplexity = std::max(BV.MaxComplexity, Complexity);
    BV.Bits.resize(std::max(BV.Bits.size(), LUIdx + 1));
    BV.Bits.set(LUIdx);
  }
}
/// CountRegisters - Note which registers are used by the given formula,
/// updating RegUses.
void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
uint32_t Complexity = F.getComplexity();
if (F.ScaledReg)
CountRegister(F.ScaledReg, Complexity, LUIdx);
for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
E = F.BaseRegs.end(); I != E; ++I)
CountRegister(*I, Complexity, LUIdx);
}
/// GenerateSymbolicOffsetReuse - Generate reuse formulae using symbolic
/// offsets.
void LSRInstance::GenerateSymbolicOffsetReuse(LSRUse &LU, unsigned LUIdx,
                                              Formula Base) {
  // We can't add a symbolic offset if the address already contains one.
  if (Base.AM.BaseGV) return;

  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
    // ExtractSymbol strips the GlobalValue from G, leaving the remainder.
    const SCEV *G = Base.BaseRegs[i];
    GlobalValue *GV = ExtractSymbol(G, SE);
    if (G->isZero())
      continue;
    Formula F = Base;
    F.AM.BaseGV = GV;
    if (!isLegalUse(F.AM, LU.Kind, LU.AccessTy, TLI))
      continue;
    F.BaseRegs[i] = G;
    if (LU.InsertFormula(F))
      CountRegisters(LU.Formulae.back(), LUIdx);
  }
}
/// GenerateICmpZeroScaledReuse - For ICmpZero, check to see if we can scale up
/// the comparison. For example, x == y -> x*c == y*c.
void LSRInstance::GenerateICmpZeroScaledReuse(LSRUse &LU, unsigned LUIdx,
                                              Formula Base) {
  if (LU.Kind != LSRUse::ICmpZero) return;

  // Determine the integer type for the base formula.
  const Type *IntTy = Base.getType();
  if (!IntTy) return;
  if (SE.getTypeSizeInBits(IntTy) > 64) return;
  IntTy = SE.getEffectiveSCEVType(IntTy);

  assert(!Base.AM.BaseGV && "ICmpZero use is not legal!");

  // Check each interesting stride.
  for (SmallSetVector<int64_t, 4>::const_iterator
       I = Factors.begin(), E = Factors.end(); I != E; ++I) {
    int64_t Factor = *I;
    Formula F = Base;

    // Check that the multiplication doesn't overflow.
    F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs * Factor;
    if ((int64_t)F.AM.BaseOffs / Factor != F.AM.BaseOffs)
      continue;

    // Check that this scale is legal.
    if (!isLegalUse(F.AM, LU.Kind, LU.AccessTy, TLI))
      continue;

    const SCEV *FactorS = SE.getSCEV(ConstantInt::get(IntTy, Factor));

    // Check that multiplying with each base register doesn't overflow.
    for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
      F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
      if (getSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
        goto next;
    }

    // Check that multiplying with the scaled register doesn't overflow.
    if (F.ScaledReg) {
      F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
      if (getSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
        continue;
    }

    // If we make it here and it's legal, add it.
    if (LU.InsertFormula(F))
      CountRegisters(LU.Formulae.back(), LUIdx);
  next:;
  }
}
/// GenerateFormulaeFromReplacedBaseReg - If removing base register with
/// index i from the BaseRegs list and adding the registers in AddOps
/// to the address forms an interesting formula, pursue it.
void
LSRInstance::GenerateFormulaeFromReplacedBaseReg(
LSRUse &LU,
unsigned LUIdx,
const Formula &Base, unsigned i,
const SmallVectorImpl<const SCEV *>
&AddOps) {
if (AddOps.empty()) return;