LoopStrengthReduce.cpp

      continue;

    // Check that this scale is legal.
    if (!isLegalUse(F.AM, LU.Kind, LU.AccessTy, TLI))
      continue;

    const SCEV *FactorS = SE.getSCEV(ConstantInt::get(IntTy, Factor));

    // Check that multiplying with each base register doesn't overflow.
    for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
      F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
      if (getSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
        goto next;
    }

    // Check that multiplying with the scaled register doesn't overflow.
    if (F.ScaledReg) {
      F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
      if (getSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
        continue;
    }

    // If we make it here and it's legal, add it.
    (void)InsertFormula(LU, LUIdx, F);
  next:;
  }
}

/// GenerateFormulaeFromReplacedBaseReg - If removing base register with
/// index i from the BaseRegs list and adding the registers in AddOps
/// to the address forms an interesting formula, pursue it.
void
LSRInstance::GenerateFormulaeFromReplacedBaseReg(
                                             LSRUse &LU,
                                             unsigned LUIdx,
                                             const Formula &Base, unsigned i,
                                             const SmallVectorImpl<const SCEV *>
                                               &AddOps) {
  if (AddOps.empty()) return;

  Formula F = Base;
  std::swap(F.BaseRegs[i], F.BaseRegs.back());
  F.BaseRegs.pop_back();
  for (SmallVectorImpl<const SCEV *>::const_iterator I = AddOps.begin(),
       E = AddOps.end(); I != E; ++I)
    F.BaseRegs.push_back(*I);
  F.AM.HasBaseReg = !F.BaseRegs.empty();
  if (InsertFormula(LU, LUIdx, F))
    // If that formula hadn't been seen before, recurse to find more like it.
    GenerateReassociationReuse(LU, LUIdx, LU.Formulae.back());
}

/// GenerateReassociationReuse - Split out subexpressions from adds and
/// the bases of addrecs.
void LSRInstance::GenerateReassociationReuse(LSRUse &LU, unsigned LUIdx,
                                             Formula Base) {
  SmallVector<const SCEV *, 8> AddOps;
  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
    const SCEV *BaseReg = Base.BaseRegs[i];
    if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(BaseReg)) {
      for (SCEVAddExpr::op_iterator J = Add->op_begin(), JE = Add->op_end();
           J != JE; ++J) {
        // Don't pull a constant into a register if the constant could be
        // folded into an immediate field.
        if (isAlwaysFoldable(*J, true, LU.Kind, LU.AccessTy, TLI, SE)) continue;
        SmallVector<const SCEV *, 8> InnerAddOps;
        for (SCEVAddExpr::op_iterator K = Add->op_begin(); K != JE; ++K)
          if (K != J)
            InnerAddOps.push_back(*K);
        // Splitting a 2-operand add both ways is redundant. Pruning this
        // now saves compile time.
        if (InnerAddOps.size() < 2 && next(J) == JE)
          continue;
        AddOps.push_back(*J);
        const SCEV *InnerAdd = SE.getAddExpr(InnerAddOps);
        AddOps.push_back(InnerAdd);
        GenerateFormulaeFromReplacedBaseReg(LU, LUIdx, Base, i, AddOps);
        AddOps.clear();
      }
    } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(BaseReg)) {
      if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(AR->getStart())) {
        for (SCEVAddExpr::op_iterator J = Add->op_begin(), JE = Add->op_end();
             J != JE; ++J) {
          // Don't pull a constant into a register if the constant could be
          // folded into an immediate field.
          if (isAlwaysFoldable(*J, true, LU.Kind, LU.AccessTy, TLI, SE))
            continue;
          SmallVector<const SCEV *, 8> InnerAddOps;
          for (SCEVAddExpr::op_iterator K = Add->op_begin(); K != JE; ++K)
            if (K != J)
              InnerAddOps.push_back(*K);
          AddOps.push_back(*J);
          const SCEV *InnerAdd = SE.getAddExpr(InnerAddOps);
          AddOps.push_back(SE.getAddRecExpr(InnerAdd,
                                            AR->getStepRecurrence(SE),
                                            AR->getLoop()));
          GenerateFormulaeFromReplacedBaseReg(LU, LUIdx, Base, i, AddOps);
          AddOps.clear();
        }
      } else if (!isAlwaysFoldable(AR->getStart(), Base.BaseRegs.size() > 1,
                                   LU.Kind, LU.AccessTy,
                                   TLI, SE)) {
        AddOps.push_back(AR->getStart());
        AddOps.push_back(SE.getAddRecExpr(SE.getIntegerSCEV(0,
                                                            BaseReg->getType()),
                                          AR->getStepRecurrence(SE),
                                          AR->getLoop()));
        GenerateFormulaeFromReplacedBaseReg(LU, LUIdx, Base, i, AddOps);
        AddOps.clear();
      }
    }
  }
}

/// GenerateCombinationReuse - Generate a formula consisting of all of the
/// loop-dominating registers added into a single register.
void LSRInstance::GenerateCombinationReuse(LSRUse &LU, unsigned LUIdx,
                                           Formula Base) {
  // This method is only intersting on a plurality of registers.
  if (Base.BaseRegs.size() <= 1) return;

  Formula F = Base;
  F.BaseRegs.clear();
  SmallVector<const SCEV *, 4> Ops;
  for (SmallVectorImpl<const SCEV *>::const_iterator
       I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) {
    const SCEV *BaseReg = *I;
    if (BaseReg->properlyDominates(L->getHeader(), &DT) &&
        !BaseReg->hasComputableLoopEvolution(L))
      Ops.push_back(BaseReg);
    else
      F.BaseRegs.push_back(BaseReg);
  }
  if (Ops.size() > 1) {
    F.BaseRegs.push_back(SE.getAddExpr(Ops));
    (void)InsertFormula(LU, LUIdx, F);
  }
}

/// GenerateScaledReuse - Generate stride factor reuse formulae by making
/// use of scaled-offset address modes, for example.
void LSRInstance::GenerateScaledReuse(LSRUse &LU, unsigned LUIdx,
                                      Formula Base) {
  // Determine the integer type for the base formula.
  const Type *IntTy = Base.getType();
  if (!IntTy) return;
  IntTy = SE.getEffectiveSCEVType(IntTy);

  // Check each interesting stride.
  for (SmallSetVector<int64_t, 4>::const_iterator
       I = Factors.begin(), E = Factors.end(); I != E; ++I) {
    int64_t Factor = *I;

    // If this Formula already has a scaled register, we can't add another one.
    if (Base.AM.Scale != 0)
      continue;
    Formula F = Base;
    F.AM.Scale = Factor;
    // Check whether this scale is going to be legal.
    if (!isLegalUse(F.AM, LU.Kind, LU.AccessTy, TLI)) {
      // As a special-case, handle special out-of-loop Basic users specially.
      // TODO: Reconsider this special case.
      if (LU.Kind == LSRUse::Basic &&
          isLegalUse(F.AM, LSRUse::Special, LU.AccessTy, TLI) &&
          !L->contains(LU.UserInst))
        LU.Kind = LSRUse::Special;
      else
        continue;
    }
    // For each addrec base reg, apply the scale, if possible.
    for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
      if (const SCEVAddRecExpr *AR =
            dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
        const SCEV *FactorS = SE.getSCEV(ConstantInt::get(IntTy, Factor));
        // Divide out the factor, ignoring high bits, since we'll be
        // scaling the value back up in the end.
        if (const SCEV *Quotient = getSDiv(AR, FactorS, SE, true)) {
          // TODO: This could be optimized to avoid all the copying.
          Formula NewF = F;
          NewF.ScaledReg = Quotient;
          std::swap(NewF.BaseRegs[i], NewF.BaseRegs.back());
          NewF.BaseRegs.pop_back();
          NewF.AM.HasBaseReg = !NewF.BaseRegs.empty();
          (void)InsertFormula(LU, LUIdx, NewF);
        }
      }
  }
}

/// GenerateTruncateReuse - Generate reuse formulae from different IV types.
void LSRInstance::GenerateTruncateReuse(LSRUse &LU, unsigned LUIdx,
                                        Formula Base) {
  // This requires TargetLowering to tell us which truncates are free.
  if (!TLI) return;

  // Don't attempt to truncate symbolic values.
  if (Base.AM.BaseGV) return;

  // Determine the integer type for the base formula.
  const Type *DstTy = Base.getType();
  if (!DstTy) return;
  DstTy = SE.getEffectiveSCEVType(DstTy);

  for (SmallSetVector<const Type *, 4>::const_iterator
       I = Types.begin(), E = Types.end(); I != E; ++I) {
    const Type *SrcTy = *I;
    if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) {
      Formula F = Base;
      if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I);
      for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(),
           JE = F.BaseRegs.end(); J != JE; ++J)
        *J = SE.getAnyExtendExpr(*J, SrcTy);
      (void)InsertFormula(LU, LUIdx, F);
    }
  }
}

namespace {

/// WorkItem - Helper class for GenerateConstantOffsetReuse. It's used to
/// defer modifications so that the search phase doesn't have to worry about
/// the data structures moving underneath it.
struct WorkItem {
  LSRUse *LU;
  size_t LUIdx;
  int64_t Imm;
  const SCEV *OrigReg;

  WorkItem(LSRUse *U, size_t LI, int64_t I, const SCEV *R)
    : LU(U), LUIdx(LI), Imm(I), OrigReg(R) {}

  void print(raw_ostream &OS) const;
  void dump() const;
};

void WorkItem::print(raw_ostream &OS) const {
  OS << "in use ";
  LU->print(OS);
  OS << " (at index " << LUIdx << "), add offset " << Imm
     << " and compensate by adjusting refences to " << *OrigReg << "\n";
}

void WorkItem::dump() const {
  print(errs()); errs() << '\n';
}

}

/// GenerateConstantOffsetReuse - Look for registers which are a constant
/// distance apart and try to form reuse opportunities between them.
void LSRInstance::GenerateConstantOffsetReuse() {
  // Group the registers by their value without any added constant offset.
  typedef std::map<int64_t, const SCEV *> ImmMapTy;
  typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy;
  RegMapTy Map;
  SmallVector<const SCEV *, 8> Sequence;
  for (SmallVectorImpl<const SCEV *>::iterator I = RegSequence.begin(),
       E = RegSequence.end(); I != E; ++I) {
    const SCEV *Reg = *I;
    int64_t Imm = ExtractImmediate(Reg, SE);
    std::pair<RegMapTy::iterator, bool> Pair =
      Map.insert(std::make_pair(Reg, ImmMapTy()));
    if (Pair.second)
      Sequence.push_back(Reg);
    Pair.first->second.insert(std::make_pair(Imm, *I));
  }

  // Insert an artificial expression at offset 0 (if there isn't one already),
  // as this may lead to more reuse opportunities.
  for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(),
       E = Sequence.end(); I != E; ++I)
    Map.find(*I)->second.insert(ImmMapTy::value_type(0, 0));

  // Now examine each set of registers with the same base value. Build up
  // a list of work to do and do the work in a separate step so that we're
  // not adding formulae and register counts while we're searching.
  SmallVector<WorkItem, 32> WorkItems;
  for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(),
       E = Sequence.end(); I != E; ++I) {
    const SCEV *Reg = *I;
    const ImmMapTy &Imms = Map.find(Reg)->second;
    // Examine each offset.
    for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
         J != JE; ++J) {
      const SCEV *OrigReg = J->second;
      // Skip the artifical register at offset 0.
      if (!OrigReg) continue;

      int64_t JImm = J->first;
      const SmallBitVector &Bits = RegUses.find(OrigReg)->second.Bits;

      // Examine each other offset associated with the same register. This is
      // quadradic in the number of registers with the same base, but it's
      // uncommon for this to be a large number.
      for (ImmMapTy::const_iterator M = Imms.begin(); M != JE; ++M) {
        if (M == J) continue;

        // Compute the difference between the two.
        int64_t Imm = (uint64_t)JImm - M->first;
        for (int LUIdx = Bits.find_first(); LUIdx != -1;
             LUIdx = Bits.find_next(LUIdx))
          // Make a memo of this use, offset, and register tuple.
          WorkItems.push_back(WorkItem(&Uses[LUIdx], LUIdx, Imm, OrigReg));
      }
    }
  }

  // Now iterate through the worklist and add new formulae.
  for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(),
       E = WorkItems.end(); I != E; ++I) {
    const WorkItem &WI = *I;
    LSRUse &LU = *WI.LU;
    size_t LUIdx = WI.LUIdx;
    int64_t Imm = WI.Imm;
    const SCEV *OrigReg = WI.OrigReg;

    const Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
    const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy,
                                                      -(uint64_t)Imm));

    for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
      Formula F = LU.Formulae[L];
      // Use the immediate in the scaled register.
      if (F.ScaledReg == OrigReg) {
        int64_t Offs = (uint64_t)F.AM.BaseOffs +
                       Imm * (uint64_t)F.AM.Scale;
        // Don't create 50 + reg(-50).
        if (F.referencesReg(SE.getSCEV(
                   ConstantInt::get(IntTy, -(uint64_t)Offs))))
          continue;
        Formula NewF = F;
        NewF.AM.BaseOffs = Offs;
        if (!isLegalUse(NewF.AM, LU.Kind, LU.AccessTy, TLI))
          continue;
        const SCEV *Diff = SE.getAddExpr(NegImmS, NewF.ScaledReg);
        if (Diff->isZero()) continue;
        NewF.ScaledReg = Diff;
        (void)InsertFormula(LU, LUIdx, NewF);
      }
      // Use the immediate in a base register.
      for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
        const SCEV *BaseReg = F.BaseRegs[N];
        if (BaseReg != OrigReg)
          continue;
        Formula NewF = F;
        NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm;
        if (!isLegalUse(NewF.AM, LU.Kind, LU.AccessTy, TLI))
          continue;
        const SCEV *Diff = SE.getAddExpr(NegImmS, BaseReg);
        if (Diff->isZero()) continue;
        // Don't create 50 + reg(-50).
        if (Diff ==
            SE.getSCEV(ConstantInt::get(IntTy,
                                        -(uint64_t)NewF.AM.BaseOffs)))
          continue;
        NewF.BaseRegs[N] = Diff;
        (void)InsertFormula(LU, LUIdx, NewF);
      }
    }
  }
}

/// GenerateAllReuseFormulae - Generate formulae for each use.
void
LSRInstance::GenerateAllReuseFormulae() {
  SmallVector<Formula, 12> Save;
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];

    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateSymbolicOffsetReuse(LU, LUIdx, LU.Formulae[i]);
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateICmpZeroScaledReuse(LU, LUIdx, LU.Formulae[i]);
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateReassociationReuse(LU, LUIdx, LU.Formulae[i]);
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateCombinationReuse(LU, LUIdx, LU.Formulae[i]);
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateScaledReuse(LU, LUIdx, LU.Formulae[i]);
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateTruncateReuse(LU, LUIdx, LU.Formulae[i]);
  }

  GenerateConstantOffsetReuse();
}

/// GenerateLoopInvariantRegisterUses - Check for other uses of loop-invariant
/// values which we're tracking. These other uses will pin these values in
/// registers, making them less profitable for elimination.
/// TODO: This currently misses non-constant addrec step registers.
/// TODO: Should this give more weight to users inside the loop?
void
LSRInstance::GenerateLoopInvariantRegisterUses() {
  SmallVector<const SCEV *, 8> Worklist(RegSequence.begin(), RegSequence.end());

  while (!Worklist.empty()) {
    const SCEV *S = Worklist.pop_back_val();

    if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
      Worklist.insert(Worklist.end(), N->op_begin(), N->op_end());
    else if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S))
      Worklist.push_back(C->getOperand());
    else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
      Worklist.push_back(D->getLHS());
      Worklist.push_back(D->getRHS());
    } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
      const Value *V = U->getValue();
      if (const Instruction *Inst = dyn_cast<Instruction>(V))
        if (L->contains(Inst)) continue;
      for (Value::use_const_iterator UI = V->use_begin(), UE = V->use_end();
           UI != UE; ++UI) {
        const Instruction *UserInst = dyn_cast<Instruction>(*UI);
        // Ignore non-instructions.
        if (!UserInst)
          continue;
        // Ignore instructions in other functions (as can happen with
        // Constants).
        if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
          continue;
        // Ignore instructions not dominated by the loop.
        const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
          UserInst->getParent() :
          cast<PHINode>(UserInst)->getIncomingBlock(
            PHINode::getIncomingValueNumForOperand(UI.getOperandNo()));
        if (!DT.dominates(L->getHeader(), UseBB))
          continue;
        // Ignore uses which are part of other SCEV expressions, to avoid
        // analyzing them multiple times.
        if (SE.isSCEVable(UserInst->getType()) &&
            !isa<SCEVUnknown>(SE.getSCEV(const_cast<Instruction *>(UserInst))))
          continue;
        // Ignore icmp instructions which are already being analyzed.
        if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
          unsigned OtherIdx = !UI.getOperandNo();
          Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
          if (SE.getSCEV(OtherOp)->hasComputableLoopEvolution(L))
            continue;
        }

        LSRUse &LU = getNewUse();
        LU.UserInst = const_cast<Instruction *>(UserInst);
        LU.OperandValToReplace = UI.getUse();
        LU.InsertSupplementalFormula(U);
        CountRegisters(LU.Formulae.back(), Uses.size() - 1);
      }
    }
  }
}

#ifndef NDEBUG

static void debug_winner(SmallVector<LSRUse, 16> const &Uses) {
  dbgs() << "LSR has selected formulae for each use:\n";
  for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
       E = Uses.end(); I != E; ++I) {
    const LSRUse &LU = *I;
    dbgs() << "  ";
    LU.print(dbgs());
    dbgs() << '\n';
    dbgs() << "    ";
    LU.Formulae.front().print(dbgs());
    dbgs() << "\n";
  }
}

#endif

LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
  : IU(P->getAnalysis<IVUsers>()),
    SE(P->getAnalysis<ScalarEvolution>()),
    DT(P->getAnalysis<DominatorTree>()),
    TLI(tli), L(l), Changed(false), IVIncInsertPos(0),
    CurrentArbitraryRegIndex(0), MaxNumRegs(0) {

  // If LoopSimplify form is not available, stay out of trouble.
  if (!L->isLoopSimplifyForm()) return;

  // If there's no interesting work to be done, bail early.
  if (IU.IVUsesByStride.empty()) return;

  DEBUG(dbgs() << "\nLSR on loop ";
        WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false);
        dbgs() << ":\n");

  // Sort the StrideOrder so we process larger strides first.
  std::stable_sort(IU.StrideOrder.begin(), IU.StrideOrder.end(),
                   StrideCompare(SE));

  /// OptimizeShadowIV - If IV is used in a int-to-float cast
  /// inside the loop then try to eliminate the cast opeation.
  OptimizeShadowIV();

  // Change loop terminating condition to use the postinc iv when possible.
  Changed |= OptimizeLoopTermCond();

  for (SmallVectorImpl<const SCEV *>::const_iterator SIter =
       IU.StrideOrder.begin(), SEnd = IU.StrideOrder.end();
       SIter != SEnd; ++SIter) {
    const SCEV *Stride = *SIter;

    // Collect interesting types.
    Types.insert(SE.getEffectiveSCEVType(Stride->getType()));

    // Collect interesting factors.
    for (SmallVectorImpl<const SCEV *>::const_iterator NewStrideIter =
         SIter + 1; NewStrideIter != SEnd; ++NewStrideIter) {
      const SCEV *OldStride = Stride;
      const SCEV *NewStride = *NewStrideIter;

      if (SE.getTypeSizeInBits(OldStride->getType()) !=
          SE.getTypeSizeInBits(NewStride->getType())) {
        if (SE.getTypeSizeInBits(OldStride->getType()) >
            SE.getTypeSizeInBits(NewStride->getType()))
          NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
        else
          OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
      }
      if (const SCEVConstant *Factor =
            dyn_cast_or_null<SCEVConstant>(getSDiv(NewStride, OldStride,
                                                   SE, true)))
        if (Factor->getValue()->getValue().getMinSignedBits() <= 64)
          Factors.insert(Factor->getValue()->getValue().getSExtValue());
    }

    std::map<const SCEV *, IVUsersOfOneStride *>::const_iterator SI =
      IU.IVUsesByStride.find(Stride);
    assert(SI != IU.IVUsesByStride.end() && "Stride doesn't exist!");
    for (ilist<IVStrideUse>::const_iterator UI = SI->second->Users.begin(),
         E = SI->second->Users.end(); UI != E; ++UI) {
      // Record the uses.
      LSRUse &LU = getNewUse();
      LU.UserInst = UI->getUser();
      LU.OperandValToReplace = UI->getOperandValToReplace();
      if (isAddressUse(LU.UserInst, LU.OperandValToReplace)) {
        LU.Kind = LSRUse::Address;
        LU.AccessTy = getAccessType(LU.UserInst);
      }
      if (UI->isUseOfPostIncrementedValue())
        LU.PostIncLoop = L;

      const SCEV *S = IU.getCanonicalExpr(*UI);

      // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
      // (N - i == 0), and this allows (N - i) to be the expression that we
      // work with rather than just N or i, so we can consider the register
      // requirements for both N and i at the same time. Limiting this code
      // to equality icmps is not a problem because all interesting loops
      // use equality icmps, thanks to IndVarSimplify.
      if (ICmpInst *CI = dyn_cast<ICmpInst>(LU.UserInst))
        if (CI->isEquality()) {
          // Swap the operands if needed to put the OperandValToReplace on
          // the left, for consistency.
          Value *NV = CI->getOperand(1);
          if (NV == LU.OperandValToReplace) {
            CI->setOperand(1, CI->getOperand(0));
            CI->setOperand(0, NV);
          }

          // x == y  -->  x - y == 0
          const SCEV *N = SE.getSCEV(NV);
          if (N->isLoopInvariant(L)) {
            LU.Kind = LSRUse::ICmpZero;
            S = SE.getMinusSCEV(N, S);
          }

          // -1 and the negations of all interesting strides (except the
          // negation of -1) are now also interesting.
          for (size_t i = 0, e = Factors.size(); i != e; ++i)
            if (Factors[i] != -1)
              Factors.insert(-(uint64_t)Factors[i]);
          Factors.insert(-1);
        }

      // Set up the initial formula for this use.
      LU.InsertInitialFormula(S, L, SE, DT);
      CountRegisters(LU.Formulae.back(), Uses.size() - 1);
    }
  }

  // If all uses use the same type, don't bother looking for truncation-based
  // reuse.
  if (Types.size() == 1)
    Types.clear();

  // If there are any uses of registers that we're tracking that have escaped
  // IVUsers' attention, add trivial uses for them, so that the register
  // voting process takes the into consideration.
  GenerateLoopInvariantRegisterUses();

  // Start by assuming we'll assign each use its own register. This is
  // sometimes called "full" strength reduction, or "superhero" mode.
  // Sometimes this is the best solution, but if there are opportunities for
  // reuse we may find a better solution.
  Score CurScore;
  CurScore.RateInitial(Uses, L, SE);

  MaxNumRegs = CurScore.getNumRegs();

  // Now use the reuse data to generate a bunch of interesting ways
  // to formulate the values needed for the uses.
  GenerateAllReuseFormulae();

  // Sort the formulae. TODO: This is redundantly sorted below.
  for (SmallVectorImpl<LSRUse>::iterator I = Uses.begin(), E = Uses.end();
       I != E; ++I) {
    LSRUse &LU = *I;
    std::stable_sort(LU.Formulae.begin(), LU.Formulae.end(),
                     ComplexitySorter());
  }

  // Ok, we've now collected all the uses and noted their register uses. The
  // next step is to start looking at register reuse possibilities.
  DEBUG(print(dbgs()); dbgs() << '\n'; IU.dump());

  // Create a sorted list of registers with those with the most uses appearing
  // earlier in the list. We'll visit them first, as they're the most likely
  // to represent profitable reuse opportunities.
  SmallVector<RegCount, 8> RegOrder;
  for (SmallVectorImpl<const SCEV *>::const_iterator I =
       RegSequence.begin(), E = RegSequence.end(); I != E; ++I)
    RegOrder.push_back(RegCount(*I, RegUses.find(*I)->second));
  std::stable_sort(RegOrder.begin(), RegOrder.end());

  // Visit each register. Determine which ones represent profitable reuse
  // opportunities and remember them.
  // TODO: Extract this code into a function.
  for (SmallVectorImpl<RegCount>::const_iterator I = RegOrder.begin(),
       E = RegOrder.end(); I != E; ++I) {
    const SCEV *Reg = I->Reg;
    const SmallBitVector &Bits = I->Sort.Bits;

    // Registers with only one use don't represent reuse opportunities, so
    // when we get there, we're done.
    if (Bits.count() <= 1) break;

    DEBUG(dbgs() << "Reg " << *Reg << ": ";
          I->Sort.print(dbgs());
          dbgs() << '\n');

    // Determine the total number of registers will be needed if we make use
    // of the reuse opportunity represented by the current register.
    Score NewScore;
    NewScore.Rate(Reg, Bits, Uses, L, SE);

    // Now decide whether this register's reuse opportunity is an overall win.
    // Currently the decision is heavily skewed for register pressure.
    if (!(NewScore < CurScore)) {
      continue;
    }

    // Ok, use this opportunity.
    DEBUG(dbgs() << "This candidate has been accepted.\n");
    CurScore = NewScore;

    // Now that we've selected a new reuse opportunity, delete formulae that
    // do not participate in that opportunity.
    for (int j = Bits.find_first(); j != -1; j = Bits.find_next(j)) {
      LSRUse &LU = Uses[j];
      for (unsigned k = 0, h = LU.Formulae.size(); k != h; ++k) {
        Formula &F = LU.Formulae[k];
        if (!F.referencesReg(Reg)) {
          std::swap(LU.Formulae[k], LU.Formulae.back());
          LU.Formulae.pop_back();
          --k; --h;
        }
      }
      // Also re-sort the list to put the formulae with the fewest registers
      // at the front.
      // TODO: Do this earlier, we don't need it each time.
      std::stable_sort(LU.Formulae.begin(), LU.Formulae.end(),
                       ComplexitySorter());
    }
  }

  // Ok, we've now made all our decisions. The first formula for each use
  // will be used.
  DEBUG(dbgs() << "Concluding, we need "; CurScore.print(dbgs());
        dbgs() << ".\n";
        debug_winner(Uses));

  // Free memory no longer needed.
  RegOrder.clear();
  Factors.clear();
  Types.clear();
  RegUses.clear();
  RegSequence.clear();

  // Keep track of instructions we may have made dead, so that
  // we can remove them after we are done working.
  SmallVector<WeakVH, 16> DeadInsts;

  SCEVExpander Rewriter(SE);
  Rewriter.disableCanonicalMode();
  Rewriter.setIVIncInsertPos(L, IVIncInsertPos);

  // Expand the new value definitions and update the users.
  for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
       E = Uses.end(); I != E; ++I) {
    // Formulae should be legal.
    DEBUG(for (SmallVectorImpl<Formula>::const_iterator J = I->Formulae.begin(),
               JE = I->Formulae.end(); J != JE; ++J)
            assert(isLegalUse(J->AM, I->Kind, I->AccessTy, TLI) &&
                   "Illegal formula generated!"));

    // Expand the new code and update the user.
    I->Rewrite(L, IVIncInsertPos, Rewriter, DeadInsts, SE, DT, P);
    Changed = true;
  }

  // Clean up after ourselves. This must be done before deleting any
  // instructions.
  Rewriter.clear();

  Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
}

void LSRInstance::print(raw_ostream &OS) const {
  if (MaxNumRegs != 0)
    OS << "LSR is considering " << MaxNumRegs << " to be the maximum "
          "number of registers needed.\n";

  OS << "LSR has identified the following interesting factors and types: ";
  bool First = true;

  for (SmallSetVector<int64_t, 4>::const_iterator
       I = Factors.begin(), E = Factors.end(); I != E; ++I) {
    if (!First) OS << ", ";
    First = false;
    OS << '*' << *I;
  }

  for (SmallSetVector<const Type *, 4>::const_iterator
       I = Types.begin(), E = Types.end(); I != E; ++I) {
    if (!First) OS << ", ";
    First = false;
    OS << '(' << **I << ')';
  }
  OS << '\n';

  OS << "LSR is examining the following uses, and candidate formulae:\n";
  for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
       E = Uses.end(); I != E; ++I) {
    const LSRUse &LU = *I;
    dbgs() << "  ";
    LU.print(OS);
    OS << '\n';
    for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
         JE = LU.Formulae.end(); J != JE; ++J) {
      OS << "    ";
      J->print(OS);
      OS << "\n";
    }
  }
}

void LSRInstance::dump() const {
  print(errs()); errs() << '\n';
}

namespace {

class LoopStrengthReduce : public LoopPass {
  /// TLI - Keep a pointer of a TargetLowering to consult for determining
  /// transformation profitability.
  const TargetLowering *const TLI;

public:
  static char ID; // Pass ID, replacement for typeid
  explicit LoopStrengthReduce(const TargetLowering *tli = NULL);

private:
  bool runOnLoop(Loop *L, LPPassManager &LPM);
  void getAnalysisUsage(AnalysisUsage &AU) const;
};

}

char LoopStrengthReduce::ID = 0;
static RegisterPass<LoopStrengthReduce>
X("loop-reduce", "Loop Strength Reduction");

Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
  return new LoopStrengthReduce(TLI);
}

LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli)
  : LoopPass(&ID), TLI(tli) {}

void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
  // We split critical edges, so we change the CFG.  However, we do update
  // many analyses if they are around.
  AU.addPreservedID(LoopSimplifyID);
  AU.addPreserved<LoopInfo>();
  AU.addPreserved("domfrontier");

  AU.addRequiredID(LoopSimplifyID);
  AU.addRequired<DominatorTree>();
  AU.addPreserved<DominatorTree>();
  AU.addRequired<ScalarEvolution>();
  AU.addPreserved<ScalarEvolution>();
  AU.addRequired<IVUsers>();
  AU.addPreserved<IVUsers>();
}

bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
  bool Changed = false;

  // Run the main LSR transformation.
  Changed |= LSRInstance(TLI, L, this).getChanged();

  // At this point, it is worth checking to see if any recurrence PHIs are also
  // dead, so that we can remove them as well.
  Changed |= DeleteDeadPHIs(L->getHeader());

  return Changed;
}