LoopStrengthReduce.cpp

    // fold a GV into an ICmp.
    if (AM.BaseGV)
      return false;

    // ICmp only has two operands; don't allow more than two non-trivial parts.
    if (AM.Scale != 0 && AM.HasBaseReg && AM.BaseOffs != 0)
      return false;

    // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
    // putting the scaled register in the other operand of the icmp.
    if (AM.Scale != 0 && AM.Scale != -1)
      return false;

    // If we have low-level target information, ask the target if it can fold an
    // integer immediate on an icmp.
    if (AM.BaseOffs != 0) {
      if (TLI) return TLI->isLegalICmpImmediate(-AM.BaseOffs);
      return false;
    }

    return true;

  case LSRUse::Basic:
    // Only handle single-register values.
    return !AM.BaseGV && AM.Scale == 0 && AM.BaseOffs == 0;

  case LSRUse::Special:
    // Only handle -1 scales, or no scale.
    return AM.Scale == 0 || AM.Scale == -1;
  }

  return false;
}

static bool isLegalUse(TargetLowering::AddrMode AM,
                       int64_t MinOffset, int64_t MaxOffset,
                       LSRUse::KindType Kind, const Type *AccessTy,
                       const TargetLowering *TLI) {
  // Check for overflow.
  if (((int64_t)((uint64_t)AM.BaseOffs + MinOffset) > AM.BaseOffs) !=
      (MinOffset > 0))
    return false;
  AM.BaseOffs = (uint64_t)AM.BaseOffs + MinOffset;
  if (isLegalUse(AM, Kind, AccessTy, TLI)) {
    AM.BaseOffs = (uint64_t)AM.BaseOffs - MinOffset;
    // Check for overflow.
    if (((int64_t)((uint64_t)AM.BaseOffs + MaxOffset) > AM.BaseOffs) !=
        (MaxOffset > 0))
      return false;
    AM.BaseOffs = (uint64_t)AM.BaseOffs + MaxOffset;
    return isLegalUse(AM, Kind, AccessTy, TLI);
  }
  return false;
}

static bool isAlwaysFoldable(int64_t BaseOffs,
                             GlobalValue *BaseGV,
                             bool HasBaseReg,
                             LSRUse::KindType Kind, const Type *AccessTy,
                             const TargetLowering *TLI) {
  // Fast-path: zero is always foldable.
  if (BaseOffs == 0 && !BaseGV) return true;

  // Conservatively, create an address with an immediate and a
  // base and a scale.
  TargetLowering::AddrMode AM;
  AM.BaseOffs = BaseOffs;
  AM.BaseGV = BaseGV;
  AM.HasBaseReg = HasBaseReg;
  AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;

  return isLegalUse(AM, Kind, AccessTy, TLI);
}

static bool isAlwaysFoldable(const SCEV *S,
                             int64_t MinOffset, int64_t MaxOffset,
                             bool HasBaseReg,
                             LSRUse::KindType Kind, const Type *AccessTy,
                             const TargetLowering *TLI,
                             ScalarEvolution &SE) {
  // Fast-path: zero is always foldable.
  if (S->isZero()) return true;

  // Conservatively, create an address with an immediate and a
  // base and a scale.
  int64_t BaseOffs = ExtractImmediate(S, SE);
  GlobalValue *BaseGV = ExtractSymbol(S, SE);

  // If there's anything else involved, it's not foldable.
  if (!S->isZero()) return false;

  // Fast-path: zero is always foldable.
  if (BaseOffs == 0 && !BaseGV) return true;

  // Conservatively, create an address with an immediate and a
  // base and a scale.
  TargetLowering::AddrMode AM;
  AM.BaseOffs = BaseOffs;
  AM.BaseGV = BaseGV;
  AM.HasBaseReg = HasBaseReg;
  AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;

  return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, TLI);
}

/// FormulaSorter - This class implements an ordering for formulae which sorts
/// the by their standalone cost.
class FormulaSorter {
  /// These two sets are kept empty, so that we compute standalone costs.
  DenseSet<const SCEV *> VisitedRegs;
  SmallPtrSet<const SCEV *, 16> Regs;
  Loop *L;
  LSRUse *LU;
  ScalarEvolution &SE;
  DominatorTree &DT;

public:
  FormulaSorter(Loop *l, LSRUse &lu, ScalarEvolution &se, DominatorTree &dt)
    : L(l), LU(&lu), SE(se), DT(dt) {}

  bool operator()(const Formula &A, const Formula &B) {
    Cost CostA;
    CostA.RateFormula(A, Regs, VisitedRegs, L, LU->Offsets, SE, DT);
    Regs.clear();
    Cost CostB;
    CostB.RateFormula(B, Regs, VisitedRegs, L, LU->Offsets, SE, DT);
    Regs.clear();
    return CostA < CostB;
  }
};

/// LSRInstance - This class holds state for the main loop strength reduction
/// logic.
class LSRInstance {
  IVUsers &IU;
  ScalarEvolution &SE;
  DominatorTree &DT;
  const TargetLowering *const TLI;
  Loop *const L;
  bool Changed;

  /// IVIncInsertPos - This is the insert position that the current loop's
  /// induction variable increment should be placed. In simple loops, this is
  /// the latch block's terminator. But in more complicated cases, this is a
  /// position which will dominate all the in-loop post-increment users.
  Instruction *IVIncInsertPos;

  /// Factors - Interesting factors between use strides.
  SmallSetVector<int64_t, 8> Factors;

  /// Types - Interesting use types, to facilitate truncation reuse.
  SmallSetVector<const Type *, 4> Types;

  /// Fixups - The list of operands which are to be replaced.
  SmallVector<LSRFixup, 16> Fixups;

  /// Uses - The list of interesting uses.
  SmallVector<LSRUse, 16> Uses;

  /// RegUses - Track which uses use which register candidates.
  RegUseTracker RegUses;

  void OptimizeShadowIV();
  bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
  ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
  bool OptimizeLoopTermCond();

  void CollectInterestingTypesAndFactors();
  void CollectFixupsAndInitialFormulae();

  LSRFixup &getNewFixup() {
    Fixups.push_back(LSRFixup());
    return Fixups.back();
  }

  // Support for sharing of LSRUses between LSRFixups.
  typedef DenseMap<const SCEV *, size_t> UseMapTy;
  UseMapTy UseMap;

  bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
                          LSRUse::KindType Kind, const Type *AccessTy);

  std::pair<size_t, int64_t> getUse(const SCEV *&Expr,
                                    LSRUse::KindType Kind,
                                    const Type *AccessTy);

public:
  void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
  void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
  void CountRegisters(const Formula &F, size_t LUIdx);
  bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);

  void CollectLoopInvariantFixupsAndFormulae();

  void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
                              unsigned Depth = 0);
  void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateCrossUseConstantOffsets();
  void GenerateAllReuseFormulae();

  void FilterOutUndesirableDedicatedRegisters();
  void NarrowSearchSpaceUsingHeuristics();

  void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
                    Cost &SolutionCost,
                    SmallVectorImpl<const Formula *> &Workspace,
                    const Cost &CurCost,
                    const SmallPtrSet<const SCEV *, 16> &CurRegs,
                    DenseSet<const SCEV *> &VisitedRegs) const;
  void Solve(SmallVectorImpl<const Formula *> &Solution) const;

  Value *Expand(const LSRFixup &LF,
                const Formula &F,
                BasicBlock::iterator IP,
                SCEVExpander &Rewriter,
                SmallVectorImpl<WeakVH> &DeadInsts) const;
  void RewriteForPHI(PHINode *PN, const LSRFixup &LF,
                     const Formula &F,
                     SCEVExpander &Rewriter,
                     SmallVectorImpl<WeakVH> &DeadInsts,
                     Pass *P) const;
  void Rewrite(const LSRFixup &LF,
               const Formula &F,
               SCEVExpander &Rewriter,
               SmallVectorImpl<WeakVH> &DeadInsts,
               Pass *P) const;
  void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
                         Pass *P);

  LSRInstance(const TargetLowering *tli, Loop *l, Pass *P);

  bool getChanged() const { return Changed; }

  void print_factors_and_types(raw_ostream &OS) const;
  void print_fixups(raw_ostream &OS) const;
  void print_uses(raw_ostream &OS) const;
  void print(raw_ostream &OS) const;
  void dump() const;
};

}

/// OptimizeShadowIV - If IV is used in a int-to-float cast
/// inside the loop then try to eliminate the cast operation.
void LSRInstance::OptimizeShadowIV() {
  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
    return;

  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
       UI != E; /* empty */) {
    IVUsers::const_iterator CandidateUI = UI;
    ++UI;
    Instruction *ShadowUse = CandidateUI->getUser();
    const Type *DestTy = NULL;

    /* If shadow use is a int->float cast then insert a second IV
       to eliminate this cast.

         for (unsigned i = 0; i < n; ++i)
           foo((double)i);

       is transformed into

         double d = 0.0;
         for (unsigned i = 0; i < n; ++i, ++d)
           foo(d);
    */
    if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser()))
      DestTy = UCast->getDestTy();
    else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser()))
      DestTy = SCast->getDestTy();
    if (!DestTy) continue;

    if (TLI) {
      // If target does not support DestTy natively then do not apply
      // this transformation.
      EVT DVT = TLI->getValueType(DestTy);
      if (!TLI->isTypeLegal(DVT)) continue;
    }

    PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
    if (!PH) continue;
    if (PH->getNumIncomingValues() != 2) continue;

    const Type *SrcTy = PH->getType();
    int Mantissa = DestTy->getFPMantissaWidth();
    if (Mantissa == -1) continue;
    if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
      continue;

    unsigned Entry, Latch;
    if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
      Entry = 0;
      Latch = 1;
    } else {
      Entry = 1;
      Latch = 0;
    }

    ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
    if (!Init) continue;
    Constant *NewInit = ConstantFP::get(DestTy, Init->getZExtValue());

    BinaryOperator *Incr =
      dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
    if (!Incr) continue;
    if (Incr->getOpcode() != Instruction::Add
        && Incr->getOpcode() != Instruction::Sub)
      continue;

    /* Initialize new IV, double d = 0.0 in above example. */
    ConstantInt *C = NULL;
    if (Incr->getOperand(0) == PH)
      C = dyn_cast<ConstantInt>(Incr->getOperand(1));
    else if (Incr->getOperand(1) == PH)
      C = dyn_cast<ConstantInt>(Incr->getOperand(0));
    else
      continue;

    if (!C) continue;

    // Ignore negative constants, as the code below doesn't handle them
    // correctly. TODO: Remove this restriction.
    if (!C->getValue().isStrictlyPositive()) continue;

    /* Add new PHINode. */
    PHINode *NewPH = PHINode::Create(DestTy, "IV.S.", PH);

    /* create new increment. '++d' in above example. */
    Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
    BinaryOperator *NewIncr =
      BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
                               Instruction::FAdd : Instruction::FSub,
                             NewPH, CFP, "IV.S.next.", Incr);

    NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
    NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));

    /* Remove cast operation */
    ShadowUse->replaceAllUsesWith(NewPH);
    ShadowUse->eraseFromParent();
    break;
  }
}

/// FindIVUserForCond - If Cond has an operand that is an expression of an IV,
/// set the IV user and stride information and return true, otherwise return
/// false.
bool LSRInstance::FindIVUserForCond(ICmpInst *Cond,
                                    IVStrideUse *&CondUse) {
  for (IVUsers::iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
    if (UI->getUser() == Cond) {
      // NOTE: we could handle setcc instructions with multiple uses here, but
      // InstCombine does it as well for simple uses, it's not clear that it
      // occurs enough in real life to handle.
      CondUse = UI;
      return true;
    }
  return false;
}

/// OptimizeMax - Rewrite the loop's terminating condition if it uses
/// a max computation.
///
/// This is a narrow solution to a specific, but acute, problem. For loops
/// like this:
///
///   i = 0;
///   do {
///     p[i] = 0.0;
///   } while (++i < n);
///
/// the trip count isn't just 'n', because 'n' might not be positive. And
/// unfortunately this can come up even for loops where the user didn't use
/// a C do-while loop. For example, seemingly well-behaved top-test loops
/// will commonly be lowered like this:
//
///   if (n > 0) {
///     i = 0;
///     do {
///       p[i] = 0.0;
///     } while (++i < n);
///   }
///
/// and then it's possible for subsequent optimization to obscure the if
/// test in such a way that indvars can't find it.
///
/// When indvars can't find the if test in loops like this, it creates a
/// max expression, which allows it to give the loop a canonical
/// induction variable:
///
///   i = 0;
///   max = n < 1 ? 1 : n;
///   do {
///     p[i] = 0.0;
///   } while (++i != max);
///
/// Canonical induction variables are necessary because the loop passes
/// are designed around them. The most obvious example of this is the
/// LoopInfo analysis, which doesn't remember trip count values. It
/// expects to be able to rediscover the trip count each time it is
/// needed, and it does this using a simple analysis that only succeeds if
/// the loop has a canonical induction variable.
///
/// However, when it comes time to generate code, the maximum operation
/// can be quite costly, especially if it's inside of an outer loop.
///
/// This function solves this problem by detecting this type of loop and
/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
/// the instructions for the maximum computation.
///
ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
  // Check that the loop matches the pattern we're looking for.
  if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
      Cond->getPredicate() != CmpInst::ICMP_NE)
    return Cond;

  SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
  if (!Sel || !Sel->hasOneUse()) return Cond;

  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
    return Cond;
  const SCEV *One = SE.getIntegerSCEV(1, BackedgeTakenCount->getType());

  // Add one to the backedge-taken count to get the trip count.
  const SCEV *IterationCount = SE.getAddExpr(BackedgeTakenCount, One);

  // Check for a max calculation that matches the pattern.
  if (!isa<SCEVSMaxExpr>(IterationCount) && !isa<SCEVUMaxExpr>(IterationCount))
    return Cond;
  const SCEVNAryExpr *Max = cast<SCEVNAryExpr>(IterationCount);
  if (Max != SE.getSCEV(Sel)) return Cond;

  // To handle a max with more than two operands, this optimization would
  // require additional checking and setup.
  if (Max->getNumOperands() != 2)
    return Cond;

  const SCEV *MaxLHS = Max->getOperand(0);
  const SCEV *MaxRHS = Max->getOperand(1);
  if (!MaxLHS || MaxLHS != One) return Cond;
  // Check the relevant induction variable for conformance to
  // the pattern.
  const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
  if (!AR || !AR->isAffine() ||
      AR->getStart() != One ||
      AR->getStepRecurrence(SE) != One)
    return Cond;

  assert(AR->getLoop() == L &&
         "Loop condition operand is an addrec in a different loop!");

  // Check the right operand of the select, and remember it, as it will
  // be used in the new comparison instruction.
  Value *NewRHS = 0;
  if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
    NewRHS = Sel->getOperand(1);
  else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
    NewRHS = Sel->getOperand(2);
  if (!NewRHS) return Cond;

  // Determine the new comparison opcode. It may be signed or unsigned,
  // and the original comparison may be either equality or inequality.
  CmpInst::Predicate Pred =
    isa<SCEVSMaxExpr>(Max) ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
  if (Cond->getPredicate() == CmpInst::ICMP_EQ)
    Pred = CmpInst::getInversePredicate(Pred);

  // Ok, everything looks ok to change the condition into an SLT or SGE and
  // delete the max calculation.
  ICmpInst *NewCond =
    new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");

  // Delete the max calculation instructions.
  Cond->replaceAllUsesWith(NewCond);
  CondUse->setUser(NewCond);
  Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
  Cond->eraseFromParent();
  Sel->eraseFromParent();
  if (Cmp->use_empty())
    Cmp->eraseFromParent();
  return NewCond;
}

/// OptimizeLoopTermCond - Change loop terminating condition to use the
/// postinc iv when possible.
bool
LSRInstance::OptimizeLoopTermCond() {
  SmallPtrSet<Instruction *, 4> PostIncs;

  BasicBlock *LatchBlock = L->getLoopLatch();
  SmallVector<BasicBlock*, 8> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);

  for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
    BasicBlock *ExitingBlock = ExitingBlocks[i];

    // Get the terminating condition for the loop if possible.  If we
    // can, we want to change it to use a post-incremented version of its
    // induction variable, to allow coalescing the live ranges for the IV into
    // one register value.

    BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
    if (!TermBr)
      continue;
    // FIXME: Overly conservative, termination condition could be an 'or' etc..
    if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
      continue;

    // Search IVUsesByStride to find Cond's IVUse if there is one.
    IVStrideUse *CondUse = 0;
    ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
    if (!FindIVUserForCond(Cond, CondUse))
      continue;

    // If the trip count is computed in terms of a max (due to ScalarEvolution
    // being unable to find a sufficient guard, for example), change the loop
    // comparison to use SLT or ULT instead of NE.
    // One consequence of doing this now is that it disrupts the count-down
    // optimization. That's not always a bad thing though, because in such
    // cases it may still be worthwhile to avoid a max.
    Cond = OptimizeMax(Cond, CondUse);

    // If this exiting block dominates the latch block, it may also use
    // the post-inc value if it won't be shared with other uses.
    // Check for dominance.
    if (!DT.dominates(ExitingBlock, LatchBlock))
      continue;

    // Conservatively avoid trying to use the post-inc value in non-latch
    // exits if there may be pre-inc users in intervening blocks.
    if (LatchBlock != ExitingBlock)
      for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
        // Test if the use is reachable from the exiting block. This dominator
        // query is a conservative approximation of reachability.
        if (&*UI != CondUse &&
            !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
          // Conservatively assume there may be reuse if the quotient of their
          // strides could be a legal scale.
          const SCEV *A = CondUse->getStride();
          const SCEV *B = UI->getStride();
          if (SE.getTypeSizeInBits(A->getType()) !=
              SE.getTypeSizeInBits(B->getType())) {
            if (SE.getTypeSizeInBits(A->getType()) >
                SE.getTypeSizeInBits(B->getType()))
              B = SE.getSignExtendExpr(B, A->getType());
            else
              A = SE.getSignExtendExpr(A, B->getType());
          }
          if (const SCEVConstant *D =
                dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
            // Stride of one or negative one can have reuse with non-addresses.
            if (D->getValue()->isOne() ||
                D->getValue()->isAllOnesValue())
              goto decline_post_inc;
            // Avoid weird situations.
            if (D->getValue()->getValue().getMinSignedBits() >= 64 ||
                D->getValue()->getValue().isMinSignedValue())
              goto decline_post_inc;
            // Without TLI, assume that any stride might be valid, and so any
            // use might be shared.
            if (!TLI)
              goto decline_post_inc;
            // Check for possible scaled-address reuse.
            const Type *AccessTy = getAccessType(UI->getUser());
            TargetLowering::AddrMode AM;
            AM.Scale = D->getValue()->getSExtValue();
            if (TLI->isLegalAddressingMode(AM, AccessTy))
              goto decline_post_inc;
            AM.Scale = -AM.Scale;
            if (TLI->isLegalAddressingMode(AM, AccessTy))
              goto decline_post_inc;
          }
        }

    DEBUG(dbgs() << "  Change loop exiting icmp to use postinc iv: "
                 << *Cond << '\n');

    // It's possible for the setcc instruction to be anywhere in the loop, and
    // possible for it to have multiple users.  If it is not immediately before
    // the exiting block branch, move it.
    if (&*++BasicBlock::iterator(Cond) != TermBr) {
      if (Cond->hasOneUse()) {
        Cond->moveBefore(TermBr);
      } else {
        // Clone the terminating condition and insert into the loopend.
        ICmpInst *OldCond = Cond;
        Cond = cast<ICmpInst>(Cond->clone());
        Cond->setName(L->getHeader()->getName() + ".termcond");
        ExitingBlock->getInstList().insert(TermBr, Cond);

        // Clone the IVUse, as the old use still exists!
        CondUse = &IU.AddUser(CondUse->getStride(), CondUse->getOffset(),
                              Cond, CondUse->getOperandValToReplace());
        TermBr->replaceUsesOfWith(OldCond, Cond);
      }
    }

    // If we get to here, we know that we can transform the setcc instruction to
    // use the post-incremented version of the IV, allowing us to coalesce the
    // live ranges for the IV correctly.
    CondUse->setOffset(SE.getMinusSCEV(CondUse->getOffset(),
                                       CondUse->getStride()));
    CondUse->setIsUseOfPostIncrementedValue(true);
    Changed = true;

    PostIncs.insert(Cond);
  decline_post_inc:;
  }

  // Determine an insertion point for the loop induction variable increment. It
  // must dominate all the post-inc comparisons we just set up, and it must
  // dominate the loop latch edge.
  IVIncInsertPos = L->getLoopLatch()->getTerminator();
  for (SmallPtrSet<Instruction *, 4>::const_iterator I = PostIncs.begin(),
       E = PostIncs.end(); I != E; ++I) {
    BasicBlock *BB =
      DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
                                    (*I)->getParent());
    if (BB == (*I)->getParent())
      IVIncInsertPos = *I;
    else if (BB != IVIncInsertPos->getParent())
      IVIncInsertPos = BB->getTerminator();
  }

  return Changed;
}

bool
LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
                                LSRUse::KindType Kind, const Type *AccessTy) {
  int64_t NewMinOffset = LU.MinOffset;
  int64_t NewMaxOffset = LU.MaxOffset;
  const Type *NewAccessTy = AccessTy;

  // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
  // something conservative, however this can pessimize in the case that one of
  // the uses will have all its uses outside the loop, for example.
  if (LU.Kind != Kind)
    return false;
  // Conservatively assume HasBaseReg is true for now.
  if (NewOffset < LU.MinOffset) {
    if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, /*HasBaseReg=*/true,
                          Kind, AccessTy, TLI))
      return false;
    NewMinOffset = NewOffset;
  } else if (NewOffset > LU.MaxOffset) {
    if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, /*HasBaseReg=*/true,
                          Kind, AccessTy, TLI))
      return false;
    NewMaxOffset = NewOffset;
  }
  // Check for a mismatched access type, and fall back conservatively as needed.
  if (Kind == LSRUse::Address && AccessTy != LU.AccessTy)
    NewAccessTy = Type::getVoidTy(AccessTy->getContext());

  // Update the use.
  LU.MinOffset = NewMinOffset;
  LU.MaxOffset = NewMaxOffset;
  LU.AccessTy = NewAccessTy;
  if (NewOffset != LU.Offsets.back())
    LU.Offsets.push_back(NewOffset);
  return true;
}

/// getUse - Return an LSRUse index and an offset value for a fixup which
/// needs the given expression, with the given kind and optional access type.
/// Either reuse an existing use or create a new one, as needed.
std::pair<size_t, int64_t>
LSRInstance::getUse(const SCEV *&Expr,
                    LSRUse::KindType Kind, const Type *AccessTy) {
  const SCEV *Copy = Expr;
  int64_t Offset = ExtractImmediate(Expr, SE);

  // Basic uses can't accept any offset, for example.
  if (!isAlwaysFoldable(Offset, 0, /*HasBaseReg=*/true, Kind, AccessTy, TLI)) {
    Expr = Copy;
    Offset = 0;
  }

  std::pair<UseMapTy::iterator, bool> P =
    UseMap.insert(std::make_pair(Expr, 0));
  if (!P.second) {
    // A use already existed with this base.
    size_t LUIdx = P.first->second;
    LSRUse &LU = Uses[LUIdx];
    if (reconcileNewOffset(LU, Offset, Kind, AccessTy))
      // Reuse this use.
      return std::make_pair(LUIdx, Offset);
  }

  // Create a new use.
  size_t LUIdx = Uses.size();
  P.first->second = LUIdx;
  Uses.push_back(LSRUse(Kind, AccessTy));
  LSRUse &LU = Uses[LUIdx];

  // We don't need to track redundant offsets, but we don't need to go out
  // of our way here to avoid them.
  if (LU.Offsets.empty() || Offset != LU.Offsets.back())
    LU.Offsets.push_back(Offset);

  LU.MinOffset = Offset;
  LU.MaxOffset = Offset;
  return std::make_pair(LUIdx, Offset);
}

void LSRInstance::CollectInterestingTypesAndFactors() {
  SmallSetVector<const SCEV *, 4> Strides;

  // Collect interesting types and strides.
  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
    const SCEV *Stride = UI->getStride();

    // Collect interesting types.
    Types.insert(SE.getEffectiveSCEVType(Stride->getType()));

    // Add the stride for this loop.
    Strides.insert(Stride);

    // Add strides for other mentioned loops.
    for (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(UI->getOffset());
         AR; AR = dyn_cast<SCEVAddRecExpr>(AR->getStart()))
      Strides.insert(AR->getStepRecurrence(SE));
  }

  // Compute interesting factors from the set of interesting strides.
  for (SmallSetVector<const SCEV *, 4>::const_iterator
       I = Strides.begin(), E = Strides.end(); I != E; ++I)
    for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
         next(I); NewStrideIter != E; ++NewStrideIter) {
      const SCEV *OldStride = *I;
      const SCEV *NewStride = *NewStrideIter;

      if (SE.getTypeSizeInBits(OldStride->getType()) !=
          SE.getTypeSizeInBits(NewStride->getType())) {
        if (SE.getTypeSizeInBits(OldStride->getType()) >
            SE.getTypeSizeInBits(NewStride->getType()))
          NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
        else
          OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
      }
      if (const SCEVConstant *Factor =
            dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
                                                        SE, true))) {
        if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
          Factors.insert(Factor->getValue()->getValue().getSExtValue());
      } else if (const SCEVConstant *Factor =
                   dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
                                                               NewStride,
                                                               SE, true))) {
        if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
          Factors.insert(Factor->getValue()->getValue().getSExtValue());
      }
    }

  // If all uses use the same type, don't bother looking for truncation-based
  // reuse.
  if (Types.size() == 1)
    Types.clear();

  DEBUG(print_factors_and_types(dbgs()));
}

void LSRInstance::CollectFixupsAndInitialFormulae() {
  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI) {
    // Record the uses.
    LSRFixup &LF = getNewFixup();
    LF.UserInst = UI->getUser();
    LF.OperandValToReplace = UI->getOperandValToReplace();
    if (UI->isUseOfPostIncrementedValue())
      LF.PostIncLoop = L;

    LSRUse::KindType Kind = LSRUse::Basic;
    const Type *AccessTy = 0;
    if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) {
      Kind = LSRUse::Address;
      AccessTy = getAccessType(LF.UserInst);
    }

    const SCEV *S = IU.getCanonicalExpr(*UI);

    // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
    // (N - i == 0), and this allows (N - i) to be the expression that we work
    // with rather than just N or i, so we can consider the register
    // requirements for both N and i at the same time. Limiting this code to
    // equality icmps is not a problem because all interesting loops use
    // equality icmps, thanks to IndVarSimplify.
    if (ICmpInst *CI = dyn_cast<ICmpInst>(LF.UserInst))
      if (CI->isEquality()) {
        // Swap the operands if needed to put the OperandValToReplace on the
        // left, for consistency.
        Value *NV = CI->getOperand(1);
        if (NV == LF.OperandValToReplace) {
          CI->setOperand(1, CI->getOperand(0));
          CI->setOperand(0, NV);
        }

        // x == y  -->  x - y == 0
        const SCEV *N = SE.getSCEV(NV);
        if (N->isLoopInvariant(L)) {
          Kind = LSRUse::ICmpZero;
          S = SE.getMinusSCEV(N, S);
        }

        // -1 and the negations of all interesting strides (except the negation
        // of -1) are now also interesting.
        for (size_t i = 0, e = Factors.size(); i != e; ++i)
          if (Factors[i] != -1)
            Factors.insert(-(uint64_t)Factors[i]);
        Factors.insert(-1);
      }

    // Set up the initial formula for this use.
    std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
    LF.LUIdx = P.first;
    LF.Offset = P.second;
    LSRUse &LU = Uses[LF.LUIdx];
    LU.AllFixupsOutsideLoop &= !L->contains(LF.UserInst);

    // If this is the first use of this LSRUse, give it a formula.
    if (LU.Formulae.empty()) {
      InsertInitialFormula(S, LU, LF.LUIdx);
      CountRegisters(LU.Formulae.back(), LF.LUIdx);
    }
  }

  DEBUG(print_fixups(dbgs()));
}

void
LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
  Formula F;
  F.InitialMatch(S, L, SE, DT);
  bool Inserted = InsertFormula(LU, LUIdx, F);
  assert(Inserted && "Initial formula already exists!"); (void)Inserted;
}

void
LSRInstance::InsertSupplementalFormula(const SCEV *S,
                                       LSRUse &LU, size_t LUIdx) {
  Formula F;
  F.BaseRegs.push_back(S);
  F.AM.HasBaseReg = true;
  bool Inserted = InsertFormula(LU, LUIdx, F);
  assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
}

/// CountRegisters - Note which registers are used by the given formula,
/// updating RegUses.
void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
  if (F.ScaledReg)
    RegUses.CountRegister(F.ScaledReg, LUIdx);
  for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
       E = F.BaseRegs.end(); I != E; ++I)
    RegUses.CountRegister(*I, LUIdx);
}

/// InsertFormula - If the given formula has not yet been inserted, add it to
/// the list, and return true. Return false otherwise.
bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
  if (!LU.InsertFormula(F))
    return false;

  CountRegisters(F, LUIdx);
  return true;
}

/// CollectLoopInvariantFixupsAndFormulae - Check for other uses of
/// loop-invariant values which we're tracking. These other uses will pin these
/// values in registers, making them less profitable for elimination.
/// TODO: This currently misses non-constant addrec step registers.
/// TODO: Should this give more weight to users inside the loop?
void
LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
  SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
  SmallPtrSet<const SCEV *, 8> Inserted;

  while (!Worklist.empty()) {
    const SCEV *S = Worklist.pop_back_val();

    if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
      Worklist.insert(Worklist.end(), N->op_begin(), N->op_end());
    else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
      Worklist.push_back(C->getOperand());
    else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
      Worklist.push_back(D->getLHS());
      Worklist.push_back(D->getRHS());
    } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
      if (!Inserted.insert(U)) continue;
      const Value *V = U->getValue();
      if (const Instruction *Inst = dyn_cast<Instruction>(V))
        if (L->contains(Inst)) continue;
      for (Value::use_const_iterator UI = V->use_begin(), UE = V->use_end();
           UI != UE; ++UI) {
        const Instruction *UserInst = dyn_cast<Instruction>(*UI);
        // Ignore non-instructions.
        if (!UserInst)
          continue;
        // Ignore instructions in other functions (as can happen with
        // Constants).
        if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
          continue;
        // Ignore instructions not dominated by the loop.
        const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
          UserInst->getParent() :
          cast<PHINode>(UserInst)->getIncomingBlock(
            PHINode::getIncomingValueNumForOperand(UI.getOperandNo()));
        if (!DT.dominates(L->getHeader(), UseBB))
          continue;
        // Ignore uses which are part of other SCEV expressions, to avoid
        // analyzing them multiple times.
        if (SE.isSCEVable(UserInst->getType()) &&
            !isa<SCEVUnknown>(SE.getSCEV(const_cast<Instruction *>(UserInst))))
          continue;
        // Ignore icmp instructions which are already being analyzed.
        if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
          unsigned OtherIdx = !UI.getOperandNo();
          Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
          if (SE.getSCEV(OtherOp)->hasComputableLoopEvolution(L))
            continue;
        }

        LSRFixup &LF = getNewFixup();
        LF.UserInst = const_cast<Instruction *>(UserInst);
        LF.OperandValToReplace = UI.getUse();
        std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, 0);
        LF.LUIdx = P.first;
        LF.Offset = P.second;
        LSRUse &LU = Uses[LF.LUIdx];
        LU.AllFixupsOutsideLoop &= L->contains(LF.UserInst);
        InsertSupplementalFormula(U, LU, LF.LUIdx);
        CountRegisters(LU.Formulae.back(), Uses.size() - 1);
        break;
      }
    }
  }
}

/// CollectSubexprs - Split S into subexpressions which can be pulled out into
/// separate registers. If C is non-null, multiply each subexpression by C.
static void CollectSubexprs(const SCEV *S, const SCEVConstant *C,
                            SmallVectorImpl<const SCEV *> &Ops,
                            ScalarEvolution &SE) {
  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
    // Break out add operands.
    for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
         I != E; ++I)
      CollectSubexprs(*I, C, Ops, SE);
    return;
  } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
    // Split a non-zero base out of an addrec.
    if (!AR->getStart()->isZero()) {
      CollectSubexprs(SE.getAddRecExpr(SE.getIntegerSCEV(0, AR->getType()),
                                       AR->getStepRecurrence(SE),
                                       AR->getLoop()), C, Ops, SE);
      CollectSubexprs(AR->getStart(), C, Ops, SE);
      return;
    }
  } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
    // Break (C * (a + b + c)) into C*a + C*b + C*c.
    if (Mul->getNumOperands() == 2)
      if (const SCEVConstant *Op0 =
            dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
        CollectSubexprs(Mul->getOperand(1),
                        C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0,
                        Ops, SE);
        return;
      }
  }

  // Otherwise use the value itself.
  Ops.push_back(C ? SE.getMulExpr(C, S) : S);
}

/// GenerateReassociations - Split out subexpressions from adds and the bases of
/// addrecs.
void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
                                         Formula Base,
                                         unsigned Depth) {
  // Arbitrarily cap recursion to protect compile time.
  if (Depth >= 3) return;

  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
    const SCEV *BaseReg = Base.BaseRegs[i];

    SmallVector<const SCEV *, 8> AddOps;
    CollectSubexprs(BaseReg, 0, AddOps, SE);
    if (AddOps.size() == 1) continue;

    for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
         JE = AddOps.end(); J != JE; ++J) {