LoopStrengthReduce.cpp

    return Result;
  }

  static bool isEqual(const SmallVector<const SCEV *, 2> &LHS,
                      const SmallVector<const SCEV *, 2> &RHS) {
    return LHS == RHS;
  }
};

/// LSRUse - This class holds the state that LSR keeps for each use in
/// IVUsers, as well as uses invented by LSR itself. It includes information
/// about what kinds of things can be folded into the user, information about
/// the user itself, and information about how the use may be satisfied.
/// TODO: Represent multiple users of the same expression in common?
class LSRUse {
  DenseSet<SmallVector<const SCEV *, 2>, UniquifierDenseMapInfo> Uniquifier;

public:
  /// KindType - An enum for a kind of use, indicating what types of
  /// scaled and immediate operands it might support.
  enum KindType {
    Basic,   ///< A normal use, with no folding.
    Special, ///< A special case of basic, allowing -1 scales.
    Address, ///< An address use; folding according to TargetLowering
    ICmpZero ///< An equality icmp with both operands folded into one.
    // TODO: Add a generic icmp too?
  };

  KindType Kind;
  Type *AccessTy;

  SmallVector<int64_t, 8> Offsets;
  int64_t MinOffset;
  int64_t MaxOffset;

  /// AllFixupsOutsideLoop - This records whether all of the fixups using this
  /// LSRUse are outside of the loop, in which case some special-case heuristics
  /// may be used.
  bool AllFixupsOutsideLoop;

  /// WidestFixupType - This records the widest use type for any fixup using
  /// this LSRUse. FindUseWithSimilarFormula can't consider uses with different
  /// max fixup widths to be equivalent, because the narrower one may be relying
  /// on the implicit truncation to truncate away bogus bits.
  Type *WidestFixupType;

  /// Formulae - A list of ways to build a value that can satisfy this user.
  /// After the list is populated, one of these is selected heuristically and
  /// used to formulate a replacement for OperandValToReplace in UserInst.
  SmallVector<Formula, 12> Formulae;

  /// Regs - The set of register candidates used by all formulae in this LSRUse.
  SmallPtrSet<const SCEV *, 4> Regs;

  LSRUse(KindType K, Type *T) : Kind(K), AccessTy(T),
                                      MinOffset(INT64_MAX),
                                      MaxOffset(INT64_MIN),
                                      AllFixupsOutsideLoop(true),
                                      WidestFixupType(0) {}

  bool HasFormulaWithSameRegs(const Formula &F) const;
  bool InsertFormula(const Formula &F);
  void DeleteFormula(Formula &F);
  void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);

  void print(raw_ostream &OS) const;
  void dump() const;
};

}

/// HasFormula - Test whether this use as a formula which has the same
/// registers as the given formula.
bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
  SmallVector<const SCEV *, 2> Key = F.BaseRegs;
  if (F.ScaledReg) Key.push_back(F.ScaledReg);
  // Unstable sort by host order ok, because this is only used for uniquifying.
  std::sort(Key.begin(), Key.end());
  return Uniquifier.count(Key);
}

/// InsertFormula - If the given formula has not yet been inserted, add it to
/// the list, and return true. Return false otherwise.
bool LSRUse::InsertFormula(const Formula &F) {
  SmallVector<const SCEV *, 2> Key = F.BaseRegs;
  if (F.ScaledReg) Key.push_back(F.ScaledReg);
  // Unstable sort by host order ok, because this is only used for uniquifying.
  std::sort(Key.begin(), Key.end());

  if (!Uniquifier.insert(Key).second)
    return false;

  // Using a register to hold the value of 0 is not profitable.
  assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
         "Zero allocated in a scaled register!");
#ifndef NDEBUG
  for (SmallVectorImpl<const SCEV *>::const_iterator I =
       F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I)
    assert(!(*I)->isZero() && "Zero allocated in a base register!");
#endif

  // Add the formula to the list.
  Formulae.push_back(F);

  // Record registers now being used by this use.
  if (F.ScaledReg) Regs.insert(F.ScaledReg);
  Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());

  return true;
}

/// DeleteFormula - Remove the given formula from this use's list.
void LSRUse::DeleteFormula(Formula &F) {
  if (&F != &Formulae.back())
    std::swap(F, Formulae.back());
  Formulae.pop_back();
  assert(!Formulae.empty() && "LSRUse has no formulae left!");
}

/// RecomputeRegs - Recompute the Regs field, and update RegUses.
void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
  // Now that we've filtered out some formulae, recompute the Regs set.
  SmallPtrSet<const SCEV *, 4> OldRegs = Regs;
  Regs.clear();
  for (SmallVectorImpl<Formula>::const_iterator I = Formulae.begin(),
       E = Formulae.end(); I != E; ++I) {
    const Formula &F = *I;
    if (F.ScaledReg) Regs.insert(F.ScaledReg);
    Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
  }

  // Update the RegTracker.
  for (SmallPtrSet<const SCEV *, 4>::iterator I = OldRegs.begin(),
       E = OldRegs.end(); I != E; ++I)
    if (!Regs.count(*I))
      RegUses.DropRegister(*I, LUIdx);
}

void LSRUse::print(raw_ostream &OS) const {
  OS << "LSR Use: Kind=";
  switch (Kind) {
  case Basic:    OS << "Basic"; break;
  case Special:  OS << "Special"; break;
  case ICmpZero: OS << "ICmpZero"; break;
  case Address:
    OS << "Address of ";
    if (AccessTy->isPointerTy())
      OS << "pointer"; // the full pointer type could be really verbose
    else
      OS << *AccessTy;
  }

  OS << ", Offsets={";
  for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
       E = Offsets.end(); I != E; ++I) {
    OS << *I;
    if (llvm::next(I) != E)
      OS << ',';
  }
  OS << '}';

  if (AllFixupsOutsideLoop)
    OS << ", all-fixups-outside-loop";

  if (WidestFixupType)
    OS << ", widest fixup type: " << *WidestFixupType;
}

void LSRUse::dump() const {
  print(errs()); errs() << '\n';
}

/// isLegalUse - Test whether the use described by AM is "legal", meaning it can
/// be completely folded into the user instruction at isel time. This includes
/// address-mode folding and special icmp tricks.
static bool isLegalUse(const TargetLowering::AddrMode &AM,
                       LSRUse::KindType Kind, Type *AccessTy,
                       const TargetLowering *TLI) {
  switch (Kind) {
  case LSRUse::Address:
    // If we have low-level target information, ask the target if it can
    // completely fold this address.
    if (TLI) return TLI->isLegalAddressingMode(AM, AccessTy);

    // Otherwise, just guess that reg+reg addressing is legal.
    return !AM.BaseGV && AM.BaseOffs == 0 && AM.Scale <= 1;

  case LSRUse::ICmpZero:
    // There's not even a target hook for querying whether it would be legal to
    // fold a GV into an ICmp.
    if (AM.BaseGV)
      return false;

    // ICmp only has two operands; don't allow more than two non-trivial parts.
    if (AM.Scale != 0 && AM.HasBaseReg && AM.BaseOffs != 0)
      return false;

    // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
    // putting the scaled register in the other operand of the icmp.
    if (AM.Scale != 0 && AM.Scale != -1)
      return false;

    // If we have low-level target information, ask the target if it can fold an
    // integer immediate on an icmp.
    if (AM.BaseOffs != 0) {
      if (TLI) return TLI->isLegalICmpImmediate(-(uint64_t)AM.BaseOffs);
      return false;
    }

    return true;

  case LSRUse::Basic:
    // Only handle single-register values.
    return !AM.BaseGV && AM.Scale == 0 && AM.BaseOffs == 0;

  case LSRUse::Special:
    // Only handle -1 scales, or no scale.
    return AM.Scale == 0 || AM.Scale == -1;
  }

  return false;
}

static bool isLegalUse(TargetLowering::AddrMode AM,
                       int64_t MinOffset, int64_t MaxOffset,
                       LSRUse::KindType Kind, Type *AccessTy,
                       const TargetLowering *TLI) {
  // Check for overflow.
  if (((int64_t)((uint64_t)AM.BaseOffs + MinOffset) > AM.BaseOffs) !=
      (MinOffset > 0))
    return false;
  AM.BaseOffs = (uint64_t)AM.BaseOffs + MinOffset;
  if (isLegalUse(AM, Kind, AccessTy, TLI)) {
    AM.BaseOffs = (uint64_t)AM.BaseOffs - MinOffset;
    // Check for overflow.
    if (((int64_t)((uint64_t)AM.BaseOffs + MaxOffset) > AM.BaseOffs) !=
        (MaxOffset > 0))
      return false;
    AM.BaseOffs = (uint64_t)AM.BaseOffs + MaxOffset;
    return isLegalUse(AM, Kind, AccessTy, TLI);
  }
  return false;
}

static bool isAlwaysFoldable(int64_t BaseOffs,
                             GlobalValue *BaseGV,
                             bool HasBaseReg,
                             LSRUse::KindType Kind, Type *AccessTy,
                             const TargetLowering *TLI) {
  // Fast-path: zero is always foldable.
  if (BaseOffs == 0 && !BaseGV) return true;

  // Conservatively, create an address with an immediate and a
  // base and a scale.
  TargetLowering::AddrMode AM;
  AM.BaseOffs = BaseOffs;
  AM.BaseGV = BaseGV;
  AM.HasBaseReg = HasBaseReg;
  AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;

  // Canonicalize a scale of 1 to a base register if the formula doesn't
  // already have a base register.
  if (!AM.HasBaseReg && AM.Scale == 1) {
    AM.Scale = 0;
    AM.HasBaseReg = true;
  }

  return isLegalUse(AM, Kind, AccessTy, TLI);
}

static bool isAlwaysFoldable(const SCEV *S,
                             int64_t MinOffset, int64_t MaxOffset,
                             bool HasBaseReg,
                             LSRUse::KindType Kind, Type *AccessTy,
                             const TargetLowering *TLI,
                             ScalarEvolution &SE) {
  // Fast-path: zero is always foldable.
  if (S->isZero()) return true;

  // Conservatively, create an address with an immediate and a
  // base and a scale.
  int64_t BaseOffs = ExtractImmediate(S, SE);
  GlobalValue *BaseGV = ExtractSymbol(S, SE);

  // If there's anything else involved, it's not foldable.
  if (!S->isZero()) return false;

  // Fast-path: zero is always foldable.
  if (BaseOffs == 0 && !BaseGV) return true;

  // Conservatively, create an address with an immediate and a
  // base and a scale.
  TargetLowering::AddrMode AM;
  AM.BaseOffs = BaseOffs;
  AM.BaseGV = BaseGV;
  AM.HasBaseReg = HasBaseReg;
  AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1;

  return isLegalUse(AM, MinOffset, MaxOffset, Kind, AccessTy, TLI);
}

namespace {

/// UseMapDenseMapInfo - A DenseMapInfo implementation for holding
/// DenseMaps and DenseSets of pairs of const SCEV* and LSRUse::Kind.
struct UseMapDenseMapInfo {
  static std::pair<const SCEV *, LSRUse::KindType> getEmptyKey() {
    return std::make_pair(reinterpret_cast<const SCEV *>(-1), LSRUse::Basic);
  }

  static std::pair<const SCEV *, LSRUse::KindType> getTombstoneKey() {
    return std::make_pair(reinterpret_cast<const SCEV *>(-2), LSRUse::Basic);
  }

  static unsigned
  getHashValue(const std::pair<const SCEV *, LSRUse::KindType> &V) {
    unsigned Result = DenseMapInfo<const SCEV *>::getHashValue(V.first);
    Result ^= DenseMapInfo<unsigned>::getHashValue(unsigned(V.second));
    return Result;
  }

  static bool isEqual(const std::pair<const SCEV *, LSRUse::KindType> &LHS,
                      const std::pair<const SCEV *, LSRUse::KindType> &RHS) {
    return LHS == RHS;
  }
};

/// LSRInstance - This class holds state for the main loop strength reduction
/// logic.
class LSRInstance {
  IVUsers &IU;
  ScalarEvolution &SE;
  DominatorTree &DT;
  LoopInfo &LI;
  const TargetLowering *const TLI;
  Loop *const L;
  bool Changed;

  /// IVIncInsertPos - This is the insert position that the current loop's
  /// induction variable increment should be placed. In simple loops, this is
  /// the latch block's terminator. But in more complicated cases, this is a
  /// position which will dominate all the in-loop post-increment users.
  Instruction *IVIncInsertPos;

  /// Factors - Interesting factors between use strides.
  SmallSetVector<int64_t, 8> Factors;

  /// Types - Interesting use types, to facilitate truncation reuse.
  SmallSetVector<Type *, 4> Types;

  /// Fixups - The list of operands which are to be replaced.
  SmallVector<LSRFixup, 16> Fixups;

  /// Uses - The list of interesting uses.
  SmallVector<LSRUse, 16> Uses;

  /// RegUses - Track which uses use which register candidates.
  RegUseTracker RegUses;

  void OptimizeShadowIV();
  bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
  ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
  void OptimizeLoopTermCond();

  void CollectInterestingTypesAndFactors();
  void CollectFixupsAndInitialFormulae();

  LSRFixup &getNewFixup() {
    Fixups.push_back(LSRFixup());
    return Fixups.back();
  }

  // Support for sharing of LSRUses between LSRFixups.
  typedef DenseMap<std::pair<const SCEV *, LSRUse::KindType>,
                   size_t,
                   UseMapDenseMapInfo> UseMapTy;
  UseMapTy UseMap;

  bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
                          LSRUse::KindType Kind, Type *AccessTy);

  std::pair<size_t, int64_t> getUse(const SCEV *&Expr,
                                    LSRUse::KindType Kind,
                                    Type *AccessTy);

  void DeleteUse(LSRUse &LU, size_t LUIdx);

  LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);

public:
  void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
  void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
  void CountRegisters(const Formula &F, size_t LUIdx);
  bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);

  void CollectLoopInvariantFixupsAndFormulae();

  void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
                              unsigned Depth = 0);
  void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
  void GenerateCrossUseConstantOffsets();
  void GenerateAllReuseFormulae();

  void FilterOutUndesirableDedicatedRegisters();

  size_t EstimateSearchSpaceComplexity() const;
  void NarrowSearchSpaceByDetectingSupersets();
  void NarrowSearchSpaceByCollapsingUnrolledCode();
  void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
  void NarrowSearchSpaceByPickingWinnerRegs();
  void NarrowSearchSpaceUsingHeuristics();

  void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
                    Cost &SolutionCost,
                    SmallVectorImpl<const Formula *> &Workspace,
                    const Cost &CurCost,
                    const SmallPtrSet<const SCEV *, 16> &CurRegs,
                    DenseSet<const SCEV *> &VisitedRegs) const;
  void Solve(SmallVectorImpl<const Formula *> &Solution) const;

  BasicBlock::iterator
    HoistInsertPosition(BasicBlock::iterator IP,
                        const SmallVectorImpl<Instruction *> &Inputs) const;
  BasicBlock::iterator AdjustInsertPositionForExpand(BasicBlock::iterator IP,
                                                     const LSRFixup &LF,
                                                     const LSRUse &LU) const;

  Value *Expand(const LSRFixup &LF,
                const Formula &F,
                BasicBlock::iterator IP,
                SCEVExpander &Rewriter,
                SmallVectorImpl<WeakVH> &DeadInsts) const;
  void RewriteForPHI(PHINode *PN, const LSRFixup &LF,
                     const Formula &F,
                     SCEVExpander &Rewriter,
                     SmallVectorImpl<WeakVH> &DeadInsts,
                     Pass *P) const;
  void Rewrite(const LSRFixup &LF,
               const Formula &F,
               SCEVExpander &Rewriter,
               SmallVectorImpl<WeakVH> &DeadInsts,
               Pass *P) const;
  void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
                         Pass *P);

  LSRInstance(const TargetLowering *tli, Loop *l, Pass *P);

  bool getChanged() const { return Changed; }

  void print_factors_and_types(raw_ostream &OS) const;
  void print_fixups(raw_ostream &OS) const;
  void print_uses(raw_ostream &OS) const;
  void print(raw_ostream &OS) const;
  void dump() const;
};

}

/// OptimizeShadowIV - If IV is used in a int-to-float cast
/// inside the loop then try to eliminate the cast operation.
void LSRInstance::OptimizeShadowIV() {
  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
    return;

  for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
       UI != E; /* empty */) {
    IVUsers::const_iterator CandidateUI = UI;
    ++UI;
    Instruction *ShadowUse = CandidateUI->getUser();
    Type *DestTy = NULL;
    bool IsSigned = false;

    /* If shadow use is a int->float cast then insert a second IV
       to eliminate this cast.

         for (unsigned i = 0; i < n; ++i)
           foo((double)i);

       is transformed into

         double d = 0.0;
         for (unsigned i = 0; i < n; ++i, ++d)
           foo(d);
    */
    if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
      IsSigned = false;
      DestTy = UCast->getDestTy();
    }
    else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
      IsSigned = true;
      DestTy = SCast->getDestTy();
    }
    if (!DestTy) continue;

    if (TLI) {
      // If target does not support DestTy natively then do not apply
      // this transformation.
      EVT DVT = TLI->getValueType(DestTy);
      if (!TLI->isTypeLegal(DVT)) continue;
    }

    PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
    if (!PH) continue;
    if (PH->getNumIncomingValues() != 2) continue;

    Type *SrcTy = PH->getType();
    int Mantissa = DestTy->getFPMantissaWidth();
    if (Mantissa == -1) continue;
    if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
      continue;

    unsigned Entry, Latch;
    if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
      Entry = 0;
      Latch = 1;
    } else {
      Entry = 1;
      Latch = 0;
    }

    ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
    if (!Init) continue;
    Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
                                        (double)Init->getSExtValue() :
                                        (double)Init->getZExtValue());

    BinaryOperator *Incr =
      dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
    if (!Incr) continue;
    if (Incr->getOpcode() != Instruction::Add
        && Incr->getOpcode() != Instruction::Sub)
      continue;

    /* Initialize new IV, double d = 0.0 in above example. */
    ConstantInt *C = NULL;
    if (Incr->getOperand(0) == PH)
      C = dyn_cast<ConstantInt>(Incr->getOperand(1));
    else if (Incr->getOperand(1) == PH)
      C = dyn_cast<ConstantInt>(Incr->getOperand(0));
    else
      continue;

    if (!C) continue;

    // Ignore negative constants, as the code below doesn't handle them
    // correctly. TODO: Remove this restriction.
    if (!C->getValue().isStrictlyPositive()) continue;

    /* Add new PHINode. */
    PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH);

    /* create new increment. '++d' in above example. */
    Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
    BinaryOperator *NewIncr =
      BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
                               Instruction::FAdd : Instruction::FSub,
                             NewPH, CFP, "IV.S.next.", Incr);

    NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
    NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));

    /* Remove cast operation */
    ShadowUse->replaceAllUsesWith(NewPH);
    ShadowUse->eraseFromParent();
    Changed = true;
    break;
  }
}

/// FindIVUserForCond - If Cond has an operand that is an expression of an IV,
/// set the IV user and stride information and return true, otherwise return
/// false.
bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
  for (IVUsers::iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
    if (UI->getUser() == Cond) {
      // NOTE: we could handle setcc instructions with multiple uses here, but
      // InstCombine does it as well for simple uses, it's not clear that it
      // occurs enough in real life to handle.
      CondUse = UI;
      return true;
    }
  return false;
}

/// OptimizeMax - Rewrite the loop's terminating condition if it uses
/// a max computation.
///
/// This is a narrow solution to a specific, but acute, problem. For loops
/// like this:
///
///   i = 0;
///   do {
///     p[i] = 0.0;
///   } while (++i < n);
///
/// the trip count isn't just 'n', because 'n' might not be positive. And
/// unfortunately this can come up even for loops where the user didn't use
/// a C do-while loop. For example, seemingly well-behaved top-test loops
/// will commonly be lowered like this:
//
///   if (n > 0) {
///     i = 0;
///     do {
///       p[i] = 0.0;
///     } while (++i < n);
///   }
///
/// and then it's possible for subsequent optimization to obscure the if
/// test in such a way that indvars can't find it.
///
/// When indvars can't find the if test in loops like this, it creates a
/// max expression, which allows it to give the loop a canonical
/// induction variable:
///
///   i = 0;
///   max = n < 1 ? 1 : n;
///   do {
///     p[i] = 0.0;
///   } while (++i != max);
///
/// Canonical induction variables are necessary because the loop passes
/// are designed around them. The most obvious example of this is the
/// LoopInfo analysis, which doesn't remember trip count values. It
/// expects to be able to rediscover the trip count each time it is
/// needed, and it does this using a simple analysis that only succeeds if
/// the loop has a canonical induction variable.
///
/// However, when it comes time to generate code, the maximum operation
/// can be quite costly, especially if it's inside of an outer loop.
///
/// This function solves this problem by detecting this type of loop and
/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
/// the instructions for the maximum computation.
///
ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
  // Check that the loop matches the pattern we're looking for.
  if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
      Cond->getPredicate() != CmpInst::ICMP_NE)
    return Cond;

  SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
  if (!Sel || !Sel->hasOneUse()) return Cond;

  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
    return Cond;
  const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);

  // Add one to the backedge-taken count to get the trip count.
  const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
  if (IterationCount != SE.getSCEV(Sel)) return Cond;

  // Check for a max calculation that matches the pattern. There's no check
  // for ICMP_ULE here because the comparison would be with zero, which
  // isn't interesting.
  CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
  const SCEVNAryExpr *Max = 0;
  if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
    Pred = ICmpInst::ICMP_SLE;
    Max = S;
  } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
    Pred = ICmpInst::ICMP_SLT;
    Max = S;
  } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
    Pred = ICmpInst::ICMP_ULT;
    Max = U;
  } else {
    // No match; bail.
    return Cond;
  }

  // To handle a max with more than two operands, this optimization would
  // require additional checking and setup.
  if (Max->getNumOperands() != 2)
    return Cond;

  const SCEV *MaxLHS = Max->getOperand(0);
  const SCEV *MaxRHS = Max->getOperand(1);

  // ScalarEvolution canonicalizes constants to the left. For < and >, look
  // for a comparison with 1. For <= and >=, a comparison with zero.
  if (!MaxLHS ||
      (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
    return Cond;

  // Check the relevant induction variable for conformance to
  // the pattern.
  const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
  if (!AR || !AR->isAffine() ||
      AR->getStart() != One ||
      AR->getStepRecurrence(SE) != One)
    return Cond;

  assert(AR->getLoop() == L &&
         "Loop condition operand is an addrec in a different loop!");

  // Check the right operand of the select, and remember it, as it will
  // be used in the new comparison instruction.
  Value *NewRHS = 0;
  if (ICmpInst::isTrueWhenEqual(Pred)) {
    // Look for n+1, and grab n.
    if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
      if (isa<ConstantInt>(BO->getOperand(1)) &&
          cast<ConstantInt>(BO->getOperand(1))->isOne() &&
          SE.getSCEV(BO->getOperand(0)) == MaxRHS)
        NewRHS = BO->getOperand(0);
    if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
      if (isa<ConstantInt>(BO->getOperand(1)) &&
          cast<ConstantInt>(BO->getOperand(1))->isOne() &&
          SE.getSCEV(BO->getOperand(0)) == MaxRHS)
        NewRHS = BO->getOperand(0);
    if (!NewRHS)
      return Cond;
  } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
    NewRHS = Sel->getOperand(1);
  else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
    NewRHS = Sel->getOperand(2);
  else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
    NewRHS = SU->getValue();
  else
    // Max doesn't match expected pattern.
    return Cond;

  // Determine the new comparison opcode. It may be signed or unsigned,
  // and the original comparison may be either equality or inequality.
  if (Cond->getPredicate() == CmpInst::ICMP_EQ)
    Pred = CmpInst::getInversePredicate(Pred);

  // Ok, everything looks ok to change the condition into an SLT or SGE and
  // delete the max calculation.
  ICmpInst *NewCond =
    new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");

  // Delete the max calculation instructions.
  Cond->replaceAllUsesWith(NewCond);
  CondUse->setUser(NewCond);
  Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
  Cond->eraseFromParent();
  Sel->eraseFromParent();
  if (Cmp->use_empty())
    Cmp->eraseFromParent();
  return NewCond;
}

/// OptimizeLoopTermCond - Change loop terminating condition to use the
/// postinc iv when possible.
void
LSRInstance::OptimizeLoopTermCond() {
  SmallPtrSet<Instruction *, 4> PostIncs;

  BasicBlock *LatchBlock = L->getLoopLatch();
  SmallVector<BasicBlock*, 8> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);

  for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
    BasicBlock *ExitingBlock = ExitingBlocks[i];

    // Get the terminating condition for the loop if possible.  If we
    // can, we want to change it to use a post-incremented version of its
    // induction variable, to allow coalescing the live ranges for the IV into
    // one register value.

    BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
    if (!TermBr)
      continue;
    // FIXME: Overly conservative, termination condition could be an 'or' etc..
    if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
      continue;

    // Search IVUsesByStride to find Cond's IVUse if there is one.
    IVStrideUse *CondUse = 0;
    ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
    if (!FindIVUserForCond(Cond, CondUse))
      continue;

    // If the trip count is computed in terms of a max (due to ScalarEvolution
    // being unable to find a sufficient guard, for example), change the loop
    // comparison to use SLT or ULT instead of NE.
    // One consequence of doing this now is that it disrupts the count-down
    // optimization. That's not always a bad thing though, because in such
    // cases it may still be worthwhile to avoid a max.
    Cond = OptimizeMax(Cond, CondUse);

    // If this exiting block dominates the latch block, it may also use
    // the post-inc value if it won't be shared with other uses.
    // Check for dominance.
    if (!DT.dominates(ExitingBlock, LatchBlock))
      continue;

    // Conservatively avoid trying to use the post-inc value in non-latch
    // exits if there may be pre-inc users in intervening blocks.
    if (LatchBlock != ExitingBlock)
      for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
        // Test if the use is reachable from the exiting block. This dominator
        // query is a conservative approximation of reachability.
        if (&*UI != CondUse &&
            !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
          // Conservatively assume there may be reuse if the quotient of their
          // strides could be a legal scale.
          const SCEV *A = IU.getStride(*CondUse, L);
          const SCEV *B = IU.getStride(*UI, L);
          if (!A || !B) continue;
          if (SE.getTypeSizeInBits(A->getType()) !=
              SE.getTypeSizeInBits(B->getType())) {
            if (SE.getTypeSizeInBits(A->getType()) >
                SE.getTypeSizeInBits(B->getType()))
              B = SE.getSignExtendExpr(B, A->getType());
            else
              A = SE.getSignExtendExpr(A, B->getType());
          }
          if (const SCEVConstant *D =
                dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
            const ConstantInt *C = D->getValue();
            // Stride of one or negative one can have reuse with non-addresses.
            if (C->isOne() || C->isAllOnesValue())
              goto decline_post_inc;
            // Avoid weird situations.
            if (C->getValue().getMinSignedBits() >= 64 ||
                C->getValue().isMinSignedValue())
              goto decline_post_inc;
            // Without TLI, assume that any stride might be valid, and so any
            // use might be shared.
            if (!TLI)
              goto decline_post_inc;
            // Check for possible scaled-address reuse.
            Type *AccessTy = getAccessType(UI->getUser());
            TargetLowering::AddrMode AM;
            AM.Scale = C->getSExtValue();
            if (TLI->isLegalAddressingMode(AM, AccessTy))
              goto decline_post_inc;
            AM.Scale = -AM.Scale;
            if (TLI->isLegalAddressingMode(AM, AccessTy))
              goto decline_post_inc;
          }
        }

    DEBUG(dbgs() << "  Change loop exiting icmp to use postinc iv: "
                 << *Cond << '\n');

    // It's possible for the setcc instruction to be anywhere in the loop, and
    // possible for it to have multiple users.  If it is not immediately before
    // the exiting block branch, move it.
    if (&*++BasicBlock::iterator(Cond) != TermBr) {
      if (Cond->hasOneUse()) {
        Cond->moveBefore(TermBr);
      } else {
        // Clone the terminating condition and insert into the loopend.
        ICmpInst *OldCond = Cond;
        Cond = cast<ICmpInst>(Cond->clone());
        Cond->setName(L->getHeader()->getName() + ".termcond");
        ExitingBlock->getInstList().insert(TermBr, Cond);

        // Clone the IVUse, as the old use still exists!
        CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
        TermBr->replaceUsesOfWith(OldCond, Cond);
      }
    }

    // If we get to here, we know that we can transform the setcc instruction to
    // use the post-incremented version of the IV, allowing us to coalesce the
    // live ranges for the IV correctly.
    CondUse->transformToPostInc(L);
    Changed = true;

    PostIncs.insert(Cond);
  decline_post_inc:;
  }

  // Determine an insertion point for the loop induction variable increment. It
  // must dominate all the post-inc comparisons we just set up, and it must
  // dominate the loop latch edge.
  IVIncInsertPos = L->getLoopLatch()->getTerminator();
  for (SmallPtrSet<Instruction *, 4>::const_iterator I = PostIncs.begin(),
       E = PostIncs.end(); I != E; ++I) {
    BasicBlock *BB =
      DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
                                    (*I)->getParent());
    if (BB == (*I)->getParent())
      IVIncInsertPos = *I;
    else if (BB != IVIncInsertPos->getParent())
      IVIncInsertPos = BB->getTerminator();
  }
}

/// reconcileNewOffset - Determine if the given use can accommodate a fixup
/// at the given offset and other details. If so, update the use and
/// return true.
bool
LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
                                LSRUse::KindType Kind, Type *AccessTy) {
  int64_t NewMinOffset = LU.MinOffset;
  int64_t NewMaxOffset = LU.MaxOffset;
  Type *NewAccessTy = AccessTy;

  // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
  // something conservative, however this can pessimize in the case that one of
  // the uses will have all its uses outside the loop, for example.
  if (LU.Kind != Kind)
    return false;
  // Conservatively assume HasBaseReg is true for now.
  if (NewOffset < LU.MinOffset) {
    if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, HasBaseReg,
                          Kind, AccessTy, TLI))
      return false;
    NewMinOffset = NewOffset;
  } else if (NewOffset > LU.MaxOffset) {
    if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, HasBaseReg,
                          Kind, AccessTy, TLI))
      return false;
    NewMaxOffset = NewOffset;
  }
  // Check for a mismatched access type, and fall back conservatively as needed.
  // TODO: Be less conservative when the type is similar and can use the same
  // addressing modes.
  if (Kind == LSRUse::Address && AccessTy != LU.AccessTy)
    NewAccessTy = Type::getVoidTy(AccessTy->getContext());

  // Update the use.
  LU.MinOffset = NewMinOffset;
  LU.MaxOffset = NewMaxOffset;
  LU.AccessTy = NewAccessTy;
  if (NewOffset != LU.Offsets.back())
    LU.Offsets.push_back(NewOffset);
  return true;
}

/// getUse - Return an LSRUse index and an offset value for a fixup which
/// needs the given expression, with the given kind and optional access type.
/// Either reuse an existing use or create a new one, as needed.
std::pair<size_t, int64_t>
LSRInstance::getUse(const SCEV *&Expr,
                    LSRUse::KindType Kind, Type *AccessTy) {
  const SCEV *Copy = Expr;
  int64_t Offset = ExtractImmediate(Expr, SE);

  // Basic uses can't accept any offset, for example.
  if (!isAlwaysFoldable(Offset, 0, /*HasBaseReg=*/true, Kind, AccessTy, TLI)) {
    Expr = Copy;
    Offset = 0;
  }

  std::pair<UseMapTy::iterator, bool> P =
    UseMap.insert(std::make_pair(std::make_pair(Expr, Kind), 0));
  if (!P.second) {
    // A use already existed with this base.
    size_t LUIdx = P.first->second;
    LSRUse &LU = Uses[LUIdx];
    if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
      // Reuse this use.
      return std::make_pair(LUIdx, Offset);
  }

  // Create a new use.
  size_t LUIdx = Uses.size();
  P.first->second = LUIdx;
  Uses.push_back(LSRUse(Kind, AccessTy));
  LSRUse &LU = Uses[LUIdx];

  // We don't need to track redundant offsets, but we don't need to go out
  // of our way here to avoid them.
  if (LU.Offsets.empty() || Offset != LU.Offsets.back())
    LU.Offsets.push_back(Offset);

  LU.MinOffset = Offset;
  LU.MaxOffset = Offset;
  return std::make_pair(LUIdx, Offset);
}

/// DeleteUse - Delete the given use from the Uses list.
void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
  if (&LU != &Uses.back())
    std::swap(LU, Uses.back());
  Uses.pop_back();

  // Update RegUses.
  RegUses.SwapAndDropUse(LUIdx, Uses.size());
}

/// FindUseWithFormula - Look for a use distinct from OrigLU which is has
/// a formula that has the same registers as the given formula.
LSRUse *
LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
                                       const LSRUse &OrigLU) {
  // Search all uses for the formula. This could be more clever.
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    // Check whether this use is close enough to OrigLU, to see whether it's
    // worthwhile looking through its formulae.
    // Ignore ICmpZero uses because they may contain formulae generated by
    // GenerateICmpZeroScales, in which case adding fixup offsets may
    // be invalid.
    if (&LU != &OrigLU &&
        LU.Kind != LSRUse::ICmpZero &&