Newer
Older
ScalarEvolution &SE, DominatorTree &DT,
Pass *P) const {
const Type *OpTy = OperandValToReplace->getType();
// First, find an insertion point that dominates UserInst. For PHI nodes,
// find the nearest block which dominates all the relevant uses.
if (PHINode *PN = dyn_cast<PHINode>(UserInst)) {
DenseMap<BasicBlock *, Value *> Inserted;
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
if (PN->getIncomingValue(i) == OperandValToReplace) {
BasicBlock *BB = PN->getIncomingBlock(i);
// If this is a critical edge, split the edge so that we do not insert
// the code on all predecessor/successor paths. We do this unless this
// is the canonical backedge for this loop, which complicates post-inc
// users.
if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
!isa<IndirectBrInst>(BB->getTerminator()) &&
(PN->getParent() != L->getHeader() || !L->contains(BB))) {
// Split the critical edge.
BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P);
// If PN is outside of the loop and BB is in the loop, we want to
// move the block to be immediately before the PHI block, not
// immediately after BB.
if (L->contains(BB) && !L->contains(PN))
NewBB->moveBefore(PN->getParent());
// Splitting the edge can reduce the number of PHI entries we have.
e = PN->getNumIncomingValues();
BB = NewBB;
i = PN->getBasicBlockIndex(BB);
std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
Inserted.insert(std::make_pair(BB, static_cast<Value *>(0)));
if (!Pair.second)
PN->setIncomingValue(i, Pair.first->second);
else {
Value *FullV = Expand(BB->getTerminator(), L, IVIncInsertPos,
Rewriter, DeadInsts, SE, DT);
// If this is reuse-by-noop-cast, insert the noop cast.
if (FullV->getType() != OpTy)
FullV =
CastInst::Create(CastInst::getCastOpcode(FullV, false,
OpTy, false),
FullV, OperandValToReplace->getType(),
"tmp", BB->getTerminator());
PN->setIncomingValue(i, FullV);
Pair.first->second = FullV;
}
} else {
Value *FullV = Expand(UserInst, L, IVIncInsertPos,
Rewriter, DeadInsts, SE, DT);
// If this is reuse-by-noop-cast, insert the noop cast.
if (FullV->getType() != OpTy) {
Instruction *Cast =
CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
FullV, OpTy, "tmp", UserInst);
FullV = Cast;
}
// Update the user.
UserInst->replaceUsesOfWith(OperandValToReplace, FullV);
}
DeadInsts.push_back(OperandValToReplace);
}
void LSRUse::print(raw_ostream &OS) const {
OS << "LSR Use: Kind=";
switch (Kind) {
case Basic: OS << "Basic"; break;
case Special: OS << "Special"; break;
case ICmpZero: OS << "ICmpZero"; break;
case Address:
OS << "Address of ";
if (isa<PointerType>(AccessTy))
OS << "pointer"; // the full pointer type could be really verbose
else
OS << *AccessTy;
Evan Cheng
committed
}
OS << ", UserInst=";
// Store is common and interesting enough to be worth special-casing.
if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
OS << "store ";
WriteAsOperand(OS, Store->getOperand(0), /*PrintType=*/false);
} else if (UserInst->getType()->isVoidTy())
OS << UserInst->getOpcodeName();
else
WriteAsOperand(OS, UserInst, /*PrintType=*/false);
OS << ", OperandValToReplace=";
WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false);
if (PostIncLoop) {
OS << ", PostIncLoop=";
WriteAsOperand(OS, PostIncLoop->getHeader(), /*PrintType=*/false);
}
void LSRUse::dump() const {
print(errs()); errs() << '\n';
namespace {
/// Score - This class is used to measure and compare candidate formulae.
class Score {
unsigned NumRegs;
unsigned NumPhis;
unsigned NumIVMuls;
unsigned NumBaseAdds;
unsigned NumImms;
public:
Score()
: NumRegs(0), NumPhis(0), NumIVMuls(0), NumBaseAdds(0), NumImms(0) {}
void RateInitial(SmallVector<LSRUse, 16> const &Uses, const Loop *L,
ScalarEvolution &SE);
void Rate(const SCEV *Reg, const SmallBitVector &Bits,
const SmallVector<LSRUse, 16> &Uses, const Loop *L,
ScalarEvolution &SE);
unsigned getNumRegs() const { return NumRegs; }
bool operator<(const Score &Other) const;
void print_details(raw_ostream &OS, const SCEV *Reg,
const SmallPtrSet<const SCEV *, 8> &Regs) const;
void print(raw_ostream &OS) const;
void dump() const;
private:
void RateRegister(const SCEV *Reg, SmallPtrSet<const SCEV *, 8> &Regs,
const Loop *L);
void RateFormula(const Formula &F, SmallPtrSet<const SCEV *, 8> &Regs,
const Loop *L);
void Loose();
};
}
/// RateRegister - Tally up interesting quantities from the given register.
void Score::RateRegister(const SCEV *Reg,
SmallPtrSet<const SCEV *, 8> &Regs,
const Loop *L) {
if (Regs.insert(Reg))
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
NumPhis += AR->getLoop() == L;
// Add the step value register, if it needs one.
if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1)))
RateRegister(AR->getOperand(1), Regs, L);
Evan Cheng
committed
}
}
void Score::RateFormula(const Formula &F,
SmallPtrSet<const SCEV *, 8> &Regs,
const Loop *L) {
// Tally up the registers.
if (F.ScaledReg)
RateRegister(F.ScaledReg, Regs, L);
for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
E = F.BaseRegs.end(); I != E; ++I) {
const SCEV *BaseReg = *I;
RateRegister(BaseReg, Regs, L);
NumIVMuls += isa<SCEVMulExpr>(BaseReg) &&
BaseReg->hasComputableLoopEvolution(L);
}
if (F.BaseRegs.size() > 1)
NumBaseAdds += F.BaseRegs.size() - 1;
Evan Cheng
committed
// Tally up the non-zero immediates.
if (F.AM.BaseGV || F.AM.BaseOffs != 0)
++NumImms;
}
Evan Cheng
committed
/// Loose - Set this score to a loosing value.
void Score::Loose() {
NumRegs = ~0u;
NumPhis = ~0u;
NumIVMuls = ~0u;
NumBaseAdds = ~0u;
NumImms = ~0u;
}
/// RateInitial - Compute a score for the initial "fully reduced" solution.
void Score::RateInitial(SmallVector<LSRUse, 16> const &Uses, const Loop *L,
ScalarEvolution &SE) {
SmallPtrSet<const SCEV *, 8> Regs;
for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
E = Uses.end(); I != E; ++I)
RateFormula(I->Formulae.front(), Regs, L);
NumRegs += Regs.size();
DEBUG(print_details(dbgs(), 0, Regs));
}
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
/// Rate - Compute a score for the solution where the reuse associated with
/// putting Reg in a register is selected.
void Score::Rate(const SCEV *Reg, const SmallBitVector &Bits,
const SmallVector<LSRUse, 16> &Uses, const Loop *L,
ScalarEvolution &SE) {
SmallPtrSet<const SCEV *, 8> Regs;
for (size_t i = 0, e = Uses.size(); i != e; ++i) {
const LSRUse &LU = Uses[i];
const Formula *BestFormula = 0;
if (i >= Bits.size() || !Bits.test(i))
// This use doesn't use the current register. Just go with the current
// leading candidate formula.
BestFormula = &LU.Formulae.front();
else
// Find the best formula for this use that uses the current register.
for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
E = LU.Formulae.end(); I != E; ++I) {
const Formula &F = *I;
if (F.referencesReg(Reg) &&
(!BestFormula || ComplexitySorter()(F, *BestFormula)))
BestFormula = &F;
}
// If we didn't find *any* forumlae, because earlier we eliminated some
// in greedy fashion, skip the current register's reuse opportunity.
if (!BestFormula) {
DEBUG(dbgs() << "Reuse with reg " << *Reg
<< " wouldn't help any users.\n");
Loose();
return;
}
// For an in-loop post-inc user, don't allow multiple base registers,
// because that would require an awkward in-loop add after the increment.
if (LU.PostIncLoop && LU.PostIncLoop->contains(LU.UserInst) &&
BestFormula->BaseRegs.size() > 1) {
DEBUG(dbgs() << "Reuse with reg " << *Reg
<< " would require an in-loop post-inc add: ";
BestFormula->dump());
Loose();
return;
}
RateFormula(*BestFormula, Regs, L);
}
NumRegs += Regs.size();
DEBUG(print_details(dbgs(), Reg, Regs));
}
/// operator< - Choose the better score.
bool Score::operator<(const Score &Other) const {
if (NumRegs != Other.NumRegs)
return NumRegs < Other.NumRegs;
if (NumPhis != Other.NumPhis)
return NumPhis < Other.NumPhis;
if (NumIVMuls != Other.NumIVMuls)
return NumIVMuls < Other.NumIVMuls;
if (NumBaseAdds != Other.NumBaseAdds)
return NumBaseAdds < Other.NumBaseAdds;
return NumImms < Other.NumImms;
}
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
void Score::print_details(raw_ostream &OS,
const SCEV *Reg,
const SmallPtrSet<const SCEV *, 8> &Regs) const {
if (Reg) OS << "Reuse with reg " << *Reg << " would require ";
else OS << "The initial solution would require ";
print(OS);
OS << ". Regs:";
for (SmallPtrSet<const SCEV *, 8>::const_iterator I = Regs.begin(),
E = Regs.end(); I != E; ++I)
OS << ' ' << **I;
OS << '\n';
}
void Score::print(raw_ostream &OS) const {
OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
if (NumPhis != 0)
OS << ", including " << NumPhis << " PHI" << (NumPhis == 1 ? "" : "s");
if (NumIVMuls != 0)
OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
if (NumBaseAdds != 0)
OS << ", plus " << NumBaseAdds << " base add"
<< (NumBaseAdds == 1 ? "" : "s");
if (NumImms != 0)
OS << ", plus " << NumImms << " imm" << (NumImms == 1 ? "" : "s");
}
void Score::dump() const {
print(errs()); errs() << '\n';
}
/// isAddressUse - Returns true if the specified instruction is using the
/// specified value as an address.
static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
bool isAddress = isa<LoadInst>(Inst);
if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
if (SI->getOperand(1) == OperandVal)
isAddress = true;
} else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
// Addressing modes can also be folded into prefetches and a variety
// of intrinsics.
switch (II->getIntrinsicID()) {
default: break;
case Intrinsic::prefetch:
case Intrinsic::x86_sse2_loadu_dq:
case Intrinsic::x86_sse2_loadu_pd:
case Intrinsic::x86_sse_loadu_ps:
case Intrinsic::x86_sse_storeu_ps:
case Intrinsic::x86_sse2_storeu_pd:
case Intrinsic::x86_sse2_storeu_dq:
case Intrinsic::x86_sse2_storel_dq:
if (II->getOperand(1) == OperandVal)
isAddress = true;
break;
}
}
return isAddress;
}
/// getAccessType - Return the type of the memory being accessed.
static const Type *getAccessType(const Instruction *Inst) {
const Type *AccessTy = Inst->getType();
if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
AccessTy = SI->getOperand(0)->getType();
else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
// Addressing modes can also be folded into prefetches and a variety
// of intrinsics.
switch (II->getIntrinsicID()) {
default: break;
case Intrinsic::x86_sse_storeu_ps:
case Intrinsic::x86_sse2_storeu_pd:
case Intrinsic::x86_sse2_storeu_dq:
case Intrinsic::x86_sse2_storel_dq:
AccessTy = II->getOperand(1)->getType();
break;
}
return AccessTy;
}
/// DeleteTriviallyDeadInstructions - If any of the instructions is the
/// specified set are trivially dead, delete them and see if this makes any of
/// their operands subsequently dead.
static bool
DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
bool Changed = false;
while (!DeadInsts.empty()) {
Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
if (I == 0 || !isInstructionTriviallyDead(I))
continue;
for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
if (Instruction *U = dyn_cast<Instruction>(*OI)) {
*OI = 0;
if (U->use_empty())
DeadInsts.push_back(U);
}
I->eraseFromParent();
Changed = true;
}
return Changed;
}
namespace {
/// LSRInstance - This class holds state for the main loop strength
/// reduction logic.
class LSRInstance {
IVUsers &IU;
ScalarEvolution &SE;
DominatorTree &DT;
const TargetLowering *const TLI;
Loop *const L;
bool Changed;
/// IVIncInsertPos - This is the insert position that the current loop's
/// induction variable increment should be placed. In simple loops, this is
/// the latch block's terminator. But in more complicated cases, this is
/// a position which will dominate all the in-loop post-increment users.
Instruction *IVIncInsertPos;
/// CurrentArbitraryRegIndex - To ensure a deterministic ordering, assign an
/// arbitrary index value to each register as a sort tie breaker.
unsigned CurrentArbitraryRegIndex;
/// MaxNumRegs - To help prune the search for solutions, identify the number
/// of registers needed by the initial solution. No formula should require
/// more than this.
unsigned MaxNumRegs;
/// Factors - Interesting factors between use strides.
SmallSetVector<int64_t, 4> Factors;
/// Types - Interesting use types, to facilitate truncation reuse.
SmallSetVector<const Type *, 4> Types;
/// Uses - The list of interesting uses.
SmallVector<LSRUse, 16> Uses;
// TODO: Reorganize these data structures.
typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;
RegUsesTy RegUses;
SmallVector<const SCEV *, 16> RegSequence;
void OptimizeShadowIV();
bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse,
const SCEV* &CondStride);
ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
bool StrideMightBeShared(const SCEV* Stride);
bool OptimizeLoopTermCond();
LSRUse &getNewUse() {
Uses.push_back(LSRUse());
return Uses.back();
}
void CountRegister(const SCEV *Reg, uint32_t Complexity, size_t LUIdx);
void CountRegisters(const Formula &F, size_t LUIdx);
bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
void GenerateSymbolicOffsetReuse(LSRUse &LU, unsigned LUIdx,
Formula Base);
void GenerateICmpZeroScaledReuse(LSRUse &LU, unsigned LUIdx,
Formula Base);
void GenerateFormulaeFromReplacedBaseReg(LSRUse &LU,
unsigned LUIdx,
const Formula &Base, unsigned i,
const SmallVectorImpl<const SCEV *>
&AddOps);
void GenerateReassociationReuse(LSRUse &LU, unsigned LUIdx,
Formula Base);
void GenerateCombinationReuse(LSRUse &LU, unsigned LUIdx,
Formula Base);
void GenerateScaledReuse(LSRUse &LU, unsigned LUIdx,
Formula Base);
void GenerateTruncateReuse(LSRUse &LU, unsigned LUIdx,
Formula Base);
void GenerateConstantOffsetReuse();
void GenerateAllReuseFormulae();
void GenerateLoopInvariantRegisterUses();
public:
LSRInstance(const TargetLowering *tli, Loop *l, Pass *P);
bool getChanged() const { return Changed; }
void print(raw_ostream &OS) const;
void dump() const;
};
}
Devang Patel
committed
/// OptimizeShadowIV - If IV is used in a int-to-float cast
/// inside the loop then try to eliminate the cast opeation.
void LSRInstance::OptimizeShadowIV() {
const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
Devang Patel
committed
return;
for (size_t StrideIdx = 0, e = IU.StrideOrder.size();
StrideIdx != e; ++StrideIdx) {
std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
IU.IVUsesByStride.find(IU.StrideOrder[StrideIdx]);
assert(SI != IU.IVUsesByStride.end() && "Stride doesn't exist!");
Devang Patel
committed
if (!isa<SCEVConstant>(SI->first))
continue;
for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(),
E = SI->second->Users.end(); UI != E; /* empty */) {
ilist<IVStrideUse>::iterator CandidateUI = UI;
++UI;
Instruction *ShadowUse = CandidateUI->getUser();
Devang Patel
committed
const Type *DestTy = NULL;
/* If shadow use is a int->float cast then insert a second IV
to eliminate this cast.
Devang Patel
committed
Devang Patel
committed
foo((double)i);
is transformed into
Devang Patel
committed
double d = 0.0;
Devang Patel
committed
foo(d);
*/
if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser()))
Devang Patel
committed
DestTy = UCast->getDestTy();
else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser()))
Devang Patel
committed
DestTy = SCast->getDestTy();
Devang Patel
committed
if (!DestTy) continue;
if (TLI) {
Evan Cheng
committed
// If target does not support DestTy natively then do not apply
// this transformation.
Owen Anderson
committed
EVT DVT = TLI->getValueType(DestTy);
Devang Patel
committed
if (!TLI->isTypeLegal(DVT)) continue;
}
Devang Patel
committed
PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
if (!PH) continue;
if (PH->getNumIncomingValues() != 2) continue;
const Type *SrcTy = PH->getType();
int Mantissa = DestTy->getFPMantissaWidth();
if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
Devang Patel
committed
continue;
unsigned Entry, Latch;
if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
Entry = 0;
Latch = 1;
} else {
Entry = 1;
Latch = 0;
}
Devang Patel
committed
ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
if (!Init) continue;
Constant *NewInit = ConstantFP::get(DestTy, Init->getZExtValue());
Devang Patel
committed
Devang Patel
committed
dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
if (!Incr) continue;
if (Incr->getOpcode() != Instruction::Add
&& Incr->getOpcode() != Instruction::Sub)
continue;
/* Initialize new IV, double d = 0.0 in above example. */
ConstantInt *C = NULL;
if (Incr->getOperand(0) == PH)
C = dyn_cast<ConstantInt>(Incr->getOperand(1));
else if (Incr->getOperand(1) == PH)
C = dyn_cast<ConstantInt>(Incr->getOperand(0));
else
continue;
if (!C) continue;
// Ignore negative constants, as the code below doesn't handle them
// correctly. TODO: Remove this restriction.
if (!C->getValue().isStrictlyPositive()) continue;
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
/* Add new PHINode. */
PHINode *NewPH = PHINode::Create(DestTy, "IV.S.", PH);
/* create new increment. '++d' in above example. */
Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
BinaryOperator *NewIncr =
BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
Instruction::FAdd : Instruction::FSub,
NewPH, CFP, "IV.S.next.", Incr);
NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
/* Remove cast operation */
ShadowUse->replaceAllUsesWith(NewPH);
ShadowUse->eraseFromParent();
break;
}
}
}
/// FindIVUserForCond - If Cond has an operand that is an expression of an IV,
/// set the IV user and stride information and return true, otherwise return
/// false.
bool LSRInstance::FindIVUserForCond(ICmpInst *Cond,
IVStrideUse *&CondUse,
const SCEV* &CondStride) {
for (unsigned StrideIdx = 0, e = IU.StrideOrder.size();
StrideIdx != e && !CondUse; ++StrideIdx) {
std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
IU.IVUsesByStride.find(IU.StrideOrder[StrideIdx]);
assert(SI != IU.IVUsesByStride.end() && "Stride doesn't exist!");
for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(),
E = SI->second->Users.end(); UI != E; ++UI)
if (UI->getUser() == Cond) {
// NOTE: we could handle setcc instructions with multiple uses here, but
// InstCombine does it as well for simple uses, it's not clear that it
// occurs enough in real life to handle.
CondUse = UI;
CondStride = SI->first;
return true;
}
}
return false;
}
/// OptimizeMax - Rewrite the loop's terminating condition if it uses
/// a max computation.
///
/// This is a narrow solution to a specific, but acute, problem. For loops
/// like this:
///
/// i = 0;
/// do {
/// p[i] = 0.0;
/// } while (++i < n);
///
/// the trip count isn't just 'n', because 'n' might not be positive. And
/// unfortunately this can come up even for loops where the user didn't use
/// a C do-while loop. For example, seemingly well-behaved top-test loops
/// will commonly be lowered like this:
//
/// if (n > 0) {
/// i = 0;
/// do {
/// p[i] = 0.0;
/// } while (++i < n);
/// }
///
/// and then it's possible for subsequent optimization to obscure the if
/// test in such a way that indvars can't find it.
///
/// When indvars can't find the if test in loops like this, it creates a
/// max expression, which allows it to give the loop a canonical
/// induction variable:
///
/// i = 0;
/// max = n < 1 ? 1 : n;
/// do {
/// p[i] = 0.0;
/// } while (++i != max);
///
/// Canonical induction variables are necessary because the loop passes
/// are designed around them. The most obvious example of this is the
/// LoopInfo analysis, which doesn't remember trip count values. It
/// expects to be able to rediscover the trip count each time it is
/// needed, and it does this using a simple analysis that only succeeds if
/// the loop has a canonical induction variable.
///
/// However, when it comes time to generate code, the maximum operation
/// can be quite costly, especially if it's inside of an outer loop.
///
/// This function solves this problem by detecting this type of loop and
/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
/// the instructions for the maximum computation.
///
ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
// Check that the loop matches the pattern we're looking for.
if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
Cond->getPredicate() != CmpInst::ICMP_NE)
return Cond;
SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
if (!Sel || !Sel->hasOneUse()) return Cond;
const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
return Cond;
const SCEV *One = SE.getIntegerSCEV(1, BackedgeTakenCount->getType());
// Add one to the backedge-taken count to get the trip count.
const SCEV *IterationCount = SE.getAddExpr(BackedgeTakenCount, One);
// Check for a max calculation that matches the pattern.
if (!isa<SCEVSMaxExpr>(IterationCount) && !isa<SCEVUMaxExpr>(IterationCount))
return Cond;
const SCEVNAryExpr *Max = cast<SCEVNAryExpr>(IterationCount);
if (Max != SE.getSCEV(Sel)) return Cond;
// To handle a max with more than two operands, this optimization would
// require additional checking and setup.
if (Max->getNumOperands() != 2)
return Cond;
Devang Patel
committed
const SCEV *MaxLHS = Max->getOperand(0);
const SCEV *MaxRHS = Max->getOperand(1);
if (!MaxLHS || MaxLHS != One) return Cond;
// Check the relevant induction variable for conformance to
// the pattern.
const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
if (!AR || !AR->isAffine() ||
AR->getStart() != One ||
AR->getStepRecurrence(SE) != One)
return Cond;
Devang Patel
committed
assert(AR->getLoop() == L &&
"Loop condition operand is an addrec in a different loop!");
Devang Patel
committed
// Check the right operand of the select, and remember it, as it will
// be used in the new comparison instruction.
Value *NewRHS = 0;
if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
NewRHS = Sel->getOperand(1);
else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
NewRHS = Sel->getOperand(2);
if (!NewRHS) return Cond;
// Determine the new comparison opcode. It may be signed or unsigned,
// and the original comparison may be either equality or inequality.
CmpInst::Predicate Pred =
isa<SCEVSMaxExpr>(Max) ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
if (Cond->getPredicate() == CmpInst::ICMP_EQ)
Pred = CmpInst::getInversePredicate(Pred);
Devang Patel
committed
// Ok, everything looks ok to change the condition into an SLT or SGE and
// delete the max calculation.
ICmpInst *NewCond =
new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
// Delete the max calculation instructions.
Cond->replaceAllUsesWith(NewCond);
CondUse->setUser(NewCond);
Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
Cond->eraseFromParent();
Sel->eraseFromParent();
if (Cmp->use_empty())
Cmp->eraseFromParent();
return NewCond;
bool LSRInstance::StrideMightBeShared(const SCEV* Stride) {
Evan Cheng
committed
int64_t SInt = cast<SCEVConstant>(Stride)->getValue()->getSExtValue();
for (unsigned i = 0, e = IU.StrideOrder.size(); i != e; ++i) {
Evan Cheng
committed
std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
IU.IVUsesByStride.find(IU.StrideOrder[i]);
Evan Cheng
committed
const SCEV *Share = SI->first;
if (!isa<SCEVConstant>(SI->first) || Share == Stride)
continue;
int64_t SSInt = cast<SCEVConstant>(Share)->getValue()->getSExtValue();
if (SSInt == SInt)
return true; // This can definitely be reused.
if (unsigned(abs64(SSInt)) < SInt || (SSInt % SInt) != 0)
continue;
int64_t Scale = SSInt / SInt;
// This AM will be used for conservative queries. At this point in the
// process we don't know which users will have a base reg, immediate,
// etc., so we conservatively assume that it may not, making more
// strides valid, thus erring on the side of assuming that there
// might be sharing.
TargetLowering::AddrMode AM;
AM.Scale = Scale;
// Any pre-inc iv use?
IVUsersOfOneStride &StrideUses = *IU.IVUsesByStride[Share];
for (ilist<IVStrideUse>::iterator I = StrideUses.Users.begin(),
E = StrideUses.Users.end(); I != E; ++I) {
bool isAddress = isAddressUse(I->getUser(), I->getOperandValToReplace());
if (!I->isUseOfPostIncrementedValue() &&
isLegalUse(AM, isAddress ? LSRUse::Address : LSRUse::Basic,
isAddress ? getAccessType(I->getUser()) : 0,
TLI))
Evan Cheng
committed
return true;
}
}
return false;
}
/// OptimizeLoopTermCond - Change loop terminating condition to use the
/// postinc iv when possible.
bool
LSRInstance::OptimizeLoopTermCond() {
SmallPtrSet<Instruction *, 4> PostIncs;
Evan Cheng
committed
BasicBlock *LatchBlock = L->getLoopLatch();
Evan Cheng
committed
SmallVector<BasicBlock*, 8> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
Evan Cheng
committed
for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
BasicBlock *ExitingBlock = ExitingBlocks[i];
// Get the terminating condition for the loop if possible. If we
Evan Cheng
committed
// can, we want to change it to use a post-incremented version of its
// induction variable, to allow coalescing the live ranges for the IV into
// one register value.
Evan Cheng
committed
BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
if (!TermBr)
continue;
// FIXME: Overly conservative, termination condition could be an 'or' etc..
if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
continue;
// Search IVUsesByStride to find Cond's IVUse if there is one.
IVStrideUse *CondUse = 0;
const SCEV *CondStride = 0;
ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
if (!FindIVUserForCond(Cond, CondUse, CondStride))
continue;
// If the trip count is computed in terms of a max (due to ScalarEvolution
// being unable to find a sufficient guard, for example), change the loop
// comparison to use SLT or ULT instead of NE.
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
// One consequence of doing this now is that it disrupts the count-down
// optimization. That's not always a bad thing though, because in such
// cases it may still be worthwhile to avoid a max.
Cond = OptimizeMax(Cond, CondUse);
// If this exiting block is the latch block, and the condition has only
// one use inside the loop (the branch), use the post-incremented value
// of the induction variable
if (ExitingBlock != LatchBlock) {
// If this exiting block dominates the latch block, it may also use
// the post-inc value if it won't be shared with other uses.
// Check for dominance.
if (!DT.dominates(ExitingBlock, LatchBlock))
continue;
// Check for sharing within the same stride.
bool SameStrideSharing = false;
IVUsersOfOneStride &StrideUses = *IU.IVUsesByStride[CondStride];
for (ilist<IVStrideUse>::iterator I = StrideUses.Users.begin(),
E = StrideUses.Users.end(); I != E; ++I) {
if (I->getUser() == Cond)
continue;
if (!I->isUseOfPostIncrementedValue()) {
SameStrideSharing = true;
break;
}
}
if (SameStrideSharing)
continue;
// Check for sharing from a different stride.
if (isa<SCEVConstant>(CondStride) && StrideMightBeShared(CondStride))
continue;
}
if (!Cond->hasOneUse()) {
bool HasOneUseInLoop = true;
for (Value::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
UI != UE; ++UI) {
Instruction *U = cast<Instruction>(*UI);
if (U == TermBr)
continue;
if (L->contains(U)) {
HasOneUseInLoop = false;
break;
}
}
if (!HasOneUseInLoop)
continue;
Evan Cheng
committed
}
<< *Cond << '\n');
Evan Cheng
committed
// It's possible for the setcc instruction to be anywhere in the loop, and
// possible for it to have multiple users. If it is not immediately before
// the exiting block branch, move it.
if (&*++BasicBlock::iterator(Cond) != TermBr) {
Evan Cheng
committed
if (Cond->hasOneUse()) { // Condition has a single use, just move it.
Cond->moveBefore(TermBr);
} else {
// Otherwise, clone the terminating condition and insert into the
// loopend.
ICmpInst *OldCond = Cond;
Evan Cheng
committed
Cond = cast<ICmpInst>(Cond->clone());
Cond->setName(L->getHeader()->getName() + ".termcond");
ExitingBlock->getInstList().insert(TermBr, Cond);
// Clone the IVUse, as the old use still exists!
IU.IVUsesByStride[CondStride]->addUser(CondUse->getOffset(), Cond,
CondUse->getOperandValToReplace());
CondUse = &IU.IVUsesByStride[CondStride]->Users.back();
TermBr->replaceUsesOfWith(OldCond, Cond);
Evan Cheng
committed
}
}
Evan Cheng
committed
// If we get to here, we know that we can transform the setcc instruction to
// use the post-incremented version of the IV, allowing us to coalesce the
// live ranges for the IV correctly.
CondUse->setOffset(SE.getMinusSCEV(CondUse->getOffset(), CondStride));
Evan Cheng
committed
CondUse->setIsUseOfPostIncrementedValue(true);
Changed = true;
Evan Cheng
committed
PostIncs.insert(Cond);
Evan Cheng
committed
}
// Determine an insertion point for the loop induction variable increment. It
// must dominate all the post-inc comparisons we just set up, and it must
// dominate the loop latch edge.
IVIncInsertPos = L->getLoopLatch()->getTerminator();
for (SmallPtrSet<Instruction *, 4>::iterator I = PostIncs.begin(),
E = PostIncs.end(); I != E; ++I) {
BasicBlock *BB =
DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
(*I)->getParent());
if (BB == (*I)->getParent())
IVIncInsertPos = *I;
else if (BB != IVIncInsertPos->getParent())
IVIncInsertPos = BB->getTerminator();
}
return Changed;
}
/// CountRegisters - Note the given register.
void LSRInstance::CountRegister(const SCEV *Reg, uint32_t Complexity,
size_t LUIdx) {
std::pair<RegUsesTy::iterator, bool> Pair =
RegUses.insert(std::make_pair(Reg, RegSortData()));
RegSortData &BV = Pair.first->second;
if (Pair.second) {
BV.Index = CurrentArbitraryRegIndex++;
BV.MaxComplexity = Complexity;
RegSequence.push_back(Reg);
} else {
BV.MaxComplexity = std::max(BV.MaxComplexity, Complexity);
BV.Bits.resize(std::max(BV.Bits.size(), LUIdx + 1));
BV.Bits.set(LUIdx);
}
/// CountRegisters - Note which registers are used by the given formula,
/// updating RegUses.
void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
uint32_t Complexity = F.getComplexity();
if (F.ScaledReg)
CountRegister(F.ScaledReg, Complexity, LUIdx);
for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
E = F.BaseRegs.end(); I != E; ++I)
CountRegister(*I, Complexity, LUIdx);
}
/// InsertFormula - If the given formula has not yet been inserted, add it
/// to the list, and return true. Return false otherwise.
bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
// If a formula by itself would require more registers than the base solution,
// discard it and stop searching from it, as it won't be profitable. This is
// actually more permissive than it could be, because it doesn't include
// registers used by non-constant strides in F.
if (F.getNumRegs() > MaxNumRegs)
return false;
if (!LU.InsertFormula(F))
return false;
CountRegisters(LU.Formulae.back(), LUIdx);
return true;
}
/// GenerateSymbolicOffsetReuse - Generate reuse formulae using symbolic
/// offsets.
void LSRInstance::GenerateSymbolicOffsetReuse(LSRUse &LU, unsigned LUIdx,
Formula Base) {
// We can't add a symbolic offset if the address already contains one.
if (Base.AM.BaseGV) return;
for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
const SCEV *G = Base.BaseRegs[i];
GlobalValue *GV = ExtractSymbol(G, SE);
if (G->isZero())
continue;
Formula F = Base;
F.AM.BaseGV = GV;
if (!isLegalUse(F.AM, LU.Kind, LU.AccessTy, TLI))
continue;
F.BaseRegs[i] = G;
(void)InsertFormula(LU, LUIdx, F);
Evan Cheng
committed
}
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
}
/// GenerateICmpZeroScaledReuse - For ICmpZero, check to see if we can scale up
/// the comparison. For example, x == y -> x*c == y*c.
void LSRInstance::GenerateICmpZeroScaledReuse(LSRUse &LU, unsigned LUIdx,
Formula Base) {
if (LU.Kind != LSRUse::ICmpZero) return;
// Determine the integer type for the base formula.
const Type *IntTy = Base.getType();
if (!IntTy) return;
if (SE.getTypeSizeInBits(IntTy) > 64) return;
IntTy = SE.getEffectiveSCEVType(IntTy);
assert(!Base.AM.BaseGV && "ICmpZero use is not legal!");
// Check each interesting stride.
for (SmallSetVector<int64_t, 4>::const_iterator
I = Factors.begin(), E = Factors.end(); I != E; ++I) {
int64_t Factor = *I;
Formula F = Base;
// Check that the multiplication doesn't overflow.
F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs * Factor;
if ((int64_t)F.AM.BaseOffs / Factor != F.AM.BaseOffs)