LoopStrengthReduce.cpp

       E = Uses.end(); I != E; ++I) {
    size_t FSize = I->Formulae.size();
    if (FSize >= ComplexityLimit) {
      Power = ComplexityLimit;
      break;
    }
    Power *= FSize;
    if (Power >= ComplexityLimit)
      break;
  }
  return Power;
}

/// NarrowSearchSpaceByDetectingSupersets - When one formula uses a superset
/// of the registers of another formula, it won't help reduce register
/// pressure (though it may not necessarily hurt register pressure); remove
/// it to simplify the system.
void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
    DEBUG(dbgs() << "The search space is too complex.\n");

    DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
                    "which use a superset of registers used by other "
                    "formulae.\n");

    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
      LSRUse &LU = Uses[LUIdx];
      bool Any = false;
      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
        Formula &F = LU.Formulae[i];
        // Look for a formula with a constant or GV in a register. If the use
        // also has a formula with that same value in an immediate field,
        // delete the one that uses a register.
        for (SmallVectorImpl<const SCEV *>::const_iterator
             I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
          if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
            Formula NewF = F;
            NewF.AM.BaseOffs += C->getValue()->getSExtValue();
            NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
                                (I - F.BaseRegs.begin()));
            if (LU.HasFormulaWithSameRegs(NewF)) {
              DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
              LU.DeleteFormula(F);
              --i;
              --e;
              Any = true;
              break;
            }
          } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
            if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
              if (!F.AM.BaseGV) {
                Formula NewF = F;
                NewF.AM.BaseGV = GV;
                NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
                                    (I - F.BaseRegs.begin()));
                if (LU.HasFormulaWithSameRegs(NewF)) {
                  DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
                        dbgs() << '\n');
                  LU.DeleteFormula(F);
                  --i;
                  --e;
                  Any = true;
                  break;
                }
              }
          }
        }
      }
      if (Any)
        LU.RecomputeRegs(LUIdx, RegUses);
    }

    DEBUG(dbgs() << "After pre-selection:\n";
          print_uses(dbgs()));
  }
}

/// NarrowSearchSpaceByCollapsingUnrolledCode - When there are many registers
/// for expressions like A, A+1, A+2, etc., allocate a single register for
/// them.
void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
    DEBUG(dbgs() << "The search space is too complex.\n");

    DEBUG(dbgs() << "Narrowing the search space by assuming that uses "
                    "separated by a constant offset will use the same "
                    "registers.\n");

    // This is especially useful for unrolled loops.

    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
      LSRUse &LU = Uses[LUIdx];
      for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
           E = LU.Formulae.end(); I != E; ++I) {
        const Formula &F = *I;
        if (F.AM.BaseOffs != 0 && F.AM.Scale == 0) {
          if (LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU)) {
            if (reconcileNewOffset(*LUThatHas, F.AM.BaseOffs,
                                   /*HasBaseReg=*/false,
                                   LU.Kind, LU.AccessTy)) {
              DEBUG(dbgs() << "  Deleting use "; LU.print(dbgs());
                    dbgs() << '\n');

              LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;

              // Update the relocs to reference the new use.
              for (SmallVectorImpl<LSRFixup>::iterator I = Fixups.begin(),
                   E = Fixups.end(); I != E; ++I) {
                LSRFixup &Fixup = *I;
                if (Fixup.LUIdx == LUIdx) {
                  Fixup.LUIdx = LUThatHas - &Uses.front();
                  Fixup.Offset += F.AM.BaseOffs;
                  // Add the new offset to LUThatHas' offset list.
                  if (LUThatHas->Offsets.back() != Fixup.Offset) {
                    LUThatHas->Offsets.push_back(Fixup.Offset);
                    if (Fixup.Offset > LUThatHas->MaxOffset)
                      LUThatHas->MaxOffset = Fixup.Offset;
                    if (Fixup.Offset < LUThatHas->MinOffset)
                      LUThatHas->MinOffset = Fixup.Offset;
                  }
                  DEBUG(dbgs() << "New fixup has offset "
                               << Fixup.Offset << '\n');
                }
                if (Fixup.LUIdx == NumUses-1)
                  Fixup.LUIdx = LUIdx;
              }

              // Delete formulae from the new use which are no longer legal.
              bool Any = false;
              for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
                Formula &F = LUThatHas->Formulae[i];
                if (!isLegalUse(F.AM,
                                LUThatHas->MinOffset, LUThatHas->MaxOffset,
                                LUThatHas->Kind, LUThatHas->AccessTy, TLI)) {
                  DEBUG(dbgs() << "  Deleting "; F.print(dbgs());
                        dbgs() << '\n');
                  LUThatHas->DeleteFormula(F);
                  --i;
                  --e;
                  Any = true;
                }
              }
              if (Any)
                LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);

              // Delete the old use.
              DeleteUse(LU, LUIdx);
              --LUIdx;
              --NumUses;
              break;
            }
          }
        }
      }
    }

    DEBUG(dbgs() << "After pre-selection:\n";
          print_uses(dbgs()));
  }
}

/// NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters - Call
/// FilterOutUndesirableDedicatedRegisters again, if necessary, now that
/// we've done more filtering, as it may be able to find more formulae to
/// eliminate.
void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
  if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
    DEBUG(dbgs() << "The search space is too complex.\n");

    DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
                    "undesirable dedicated registers.\n");

    FilterOutUndesirableDedicatedRegisters();

    DEBUG(dbgs() << "After pre-selection:\n";
          print_uses(dbgs()));
  }
}

/// NarrowSearchSpaceByPickingWinnerRegs - Pick a register which seems likely
/// to be profitable, and then in any use which has any reference to that
/// register, delete all formulae which do not reference that register.
void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
  // With all other options exhausted, loop until the system is simple
  // enough to handle.
  SmallPtrSet<const SCEV *, 4> Taken;
  while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
    // Ok, we have too many of formulae on our hands to conveniently handle.
    // Use a rough heuristic to thin out the list.
    DEBUG(dbgs() << "The search space is too complex.\n");

    // Pick the register which is used by the most LSRUses, which is likely
    // to be a good reuse register candidate.
    const SCEV *Best = 0;
    unsigned BestNum = 0;
    for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
         I != E; ++I) {
      const SCEV *Reg = *I;
      if (Taken.count(Reg))
        continue;
      if (!Best)
        Best = Reg;
      else {
        unsigned Count = RegUses.getUsedByIndices(Reg).count();
        if (Count > BestNum) {
          Best = Reg;
          BestNum = Count;
        }
      }
    }

    DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
                 << " will yield profitable reuse.\n");
    Taken.insert(Best);

    // In any use with formulae which references this register, delete formulae
    // which don't reference it.
    for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
      LSRUse &LU = Uses[LUIdx];
      if (!LU.Regs.count(Best)) continue;

      bool Any = false;
      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
        Formula &F = LU.Formulae[i];
        if (!F.referencesReg(Best)) {
          DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
          LU.DeleteFormula(F);
          --e;
          --i;
          Any = true;
          assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
          continue;
        }
      }

      if (Any)
        LU.RecomputeRegs(LUIdx, RegUses);
    }

    DEBUG(dbgs() << "After pre-selection:\n";
          print_uses(dbgs()));
  }
}

/// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of
/// formulae to choose from, use some rough heuristics to prune down the number
/// of formulae. This keeps the main solver from taking an extraordinary amount
/// of time in some worst-case scenarios.
void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
  NarrowSearchSpaceByDetectingSupersets();
  NarrowSearchSpaceByCollapsingUnrolledCode();
  NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
  NarrowSearchSpaceByPickingWinnerRegs();
}

/// SolveRecurse - This is the recursive solver.
void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
                               Cost &SolutionCost,
                               SmallVectorImpl<const Formula *> &Workspace,
                               const Cost &CurCost,
                               const SmallPtrSet<const SCEV *, 16> &CurRegs,
                               DenseSet<const SCEV *> &VisitedRegs) const {
  // Some ideas:
  //  - prune more:
  //    - use more aggressive filtering
  //    - sort the formula so that the most profitable solutions are found first
  //    - sort the uses too
  //  - search faster:
  //    - don't compute a cost, and then compare. compare while computing a cost
  //      and bail early.
  //    - track register sets with SmallBitVector

  const LSRUse &LU = Uses[Workspace.size()];

  // If this use references any register that's already a part of the
  // in-progress solution, consider it a requirement that a formula must
  // reference that register in order to be considered. This prunes out
  // unprofitable searching.
  SmallSetVector<const SCEV *, 4> ReqRegs;
  for (SmallPtrSet<const SCEV *, 16>::const_iterator I = CurRegs.begin(),
       E = CurRegs.end(); I != E; ++I)
    if (LU.Regs.count(*I))
      ReqRegs.insert(*I);

  bool AnySatisfiedReqRegs = false;
  SmallPtrSet<const SCEV *, 16> NewRegs;
  Cost NewCost;
retry:
  for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
       E = LU.Formulae.end(); I != E; ++I) {
    const Formula &F = *I;

    // Ignore formulae which do not use any of the required registers.
    for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(),
         JE = ReqRegs.end(); J != JE; ++J) {
      const SCEV *Reg = *J;
      if ((!F.ScaledReg || F.ScaledReg != Reg) &&
          std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) ==
          F.BaseRegs.end())
        goto skip;
    }
    AnySatisfiedReqRegs = true;

    // Evaluate the cost of the current formula. If it's already worse than
    // the current best, prune the search at that point.
    NewCost = CurCost;
    NewRegs = CurRegs;
    NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT);
    if (NewCost < SolutionCost) {
      Workspace.push_back(&F);
      if (Workspace.size() != Uses.size()) {
        SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
                     NewRegs, VisitedRegs);
        if (F.getNumRegs() == 1 && Workspace.size() == 1)
          VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
      } else {
        DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
              dbgs() << ". Regs:";
              for (SmallPtrSet<const SCEV *, 16>::const_iterator
                   I = NewRegs.begin(), E = NewRegs.end(); I != E; ++I)
                dbgs() << ' ' << **I;
              dbgs() << '\n');

        SolutionCost = NewCost;
        Solution = Workspace;
      }
      Workspace.pop_back();
    }
  skip:;
  }

  if (!EnableRetry && !AnySatisfiedReqRegs)
    return;

  // If none of the formulae had all of the required registers, relax the
  // constraint so that we don't exclude all formulae.
  if (!AnySatisfiedReqRegs) {
    assert(!ReqRegs.empty() && "Solver failed even without required registers");
    ReqRegs.clear();
    goto retry;
  }
}

/// Solve - Choose one formula from each use. Return the results in the given
/// Solution vector.
void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
  SmallVector<const Formula *, 8> Workspace;
  Cost SolutionCost;
  SolutionCost.Loose();
  Cost CurCost;
  SmallPtrSet<const SCEV *, 16> CurRegs;
  DenseSet<const SCEV *> VisitedRegs;
  Workspace.reserve(Uses.size());

  // SolveRecurse does all the work.
  SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
               CurRegs, VisitedRegs);
  if (Solution.empty()) {
    DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
    return;
  }

  // Ok, we've now made all our decisions.
  DEBUG(dbgs() << "\n"
                  "The chosen solution requires "; SolutionCost.print(dbgs());
        dbgs() << ":\n";
        for (size_t i = 0, e = Uses.size(); i != e; ++i) {
          dbgs() << "  ";
          Uses[i].print(dbgs());
          dbgs() << "\n"
                    "    ";
          Solution[i]->print(dbgs());
          dbgs() << '\n';
        });

  assert(Solution.size() == Uses.size() && "Malformed solution!");
}

/// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up
/// the dominator tree far as we can go while still being dominated by the
/// input positions. This helps canonicalize the insert position, which
/// encourages sharing.
BasicBlock::iterator
LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
                                 const SmallVectorImpl<Instruction *> &Inputs)
                                                                         const {
  for (;;) {
    const Loop *IPLoop = LI.getLoopFor(IP->getParent());
    unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;

    BasicBlock *IDom;
    for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
      if (!Rung) return IP;
      Rung = Rung->getIDom();
      if (!Rung) return IP;
      IDom = Rung->getBlock();

      // Don't climb into a loop though.
      const Loop *IDomLoop = LI.getLoopFor(IDom);
      unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
      if (IDomDepth <= IPLoopDepth &&
          (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
        break;
    }

    bool AllDominate = true;
    Instruction *BetterPos = 0;
    Instruction *Tentative = IDom->getTerminator();
    for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(),
         E = Inputs.end(); I != E; ++I) {
      Instruction *Inst = *I;
      if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
        AllDominate = false;
        break;
      }
      // Attempt to find an insert position in the middle of the block,
      // instead of at the end, so that it can be used for other expansions.
      if (IDom == Inst->getParent() &&
          (!BetterPos || DT.dominates(BetterPos, Inst)))
        BetterPos = llvm::next(BasicBlock::iterator(Inst));
    }
    if (!AllDominate)
      break;
    if (BetterPos)
      IP = BetterPos;
    else
      IP = Tentative;
  }

  return IP;
}

/// AdjustInsertPositionForExpand - Determine an input position which will be
/// dominated by the operands and which will dominate the result.
BasicBlock::iterator
LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator IP,
                                           const LSRFixup &LF,
                                           const LSRUse &LU) const {
  // Collect some instructions which must be dominated by the
  // expanding replacement. These must be dominated by any operands that
  // will be required in the expansion.
  SmallVector<Instruction *, 4> Inputs;
  if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
    Inputs.push_back(I);
  if (LU.Kind == LSRUse::ICmpZero)
    if (Instruction *I =
          dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
      Inputs.push_back(I);
  if (LF.PostIncLoops.count(L)) {
    if (LF.isUseFullyOutsideLoop(L))
      Inputs.push_back(L->getLoopLatch()->getTerminator());
    else
      Inputs.push_back(IVIncInsertPos);
  }
  // The expansion must also be dominated by the increment positions of any
  // loops it for which it is using post-inc mode.
  for (PostIncLoopSet::const_iterator I = LF.PostIncLoops.begin(),
       E = LF.PostIncLoops.end(); I != E; ++I) {
    const Loop *PIL = *I;
    if (PIL == L) continue;

    // Be dominated by the loop exit.
    SmallVector<BasicBlock *, 4> ExitingBlocks;
    PIL->getExitingBlocks(ExitingBlocks);
    if (!ExitingBlocks.empty()) {
      BasicBlock *BB = ExitingBlocks[0];
      for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
        BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
      Inputs.push_back(BB->getTerminator());
    }
  }

  // Then, climb up the immediate dominator tree as far as we can go while
  // still being dominated by the input positions.
  IP = HoistInsertPosition(IP, Inputs);

  // Don't insert instructions before PHI nodes.
  while (isa<PHINode>(IP)) ++IP;

  // Ignore landingpad instructions.
  while (isa<LandingPadInst>(IP)) ++IP;

  // Ignore debug intrinsics.
  while (isa<DbgInfoIntrinsic>(IP)) ++IP;

  return IP;
}

/// Expand - Emit instructions for the leading candidate expression for this
/// LSRUse (this is called "expanding").
Value *LSRInstance::Expand(const LSRFixup &LF,
                           const Formula &F,
                           BasicBlock::iterator IP,
                           SCEVExpander &Rewriter,
                           SmallVectorImpl<WeakVH> &DeadInsts) const {
  const LSRUse &LU = Uses[LF.LUIdx];

  // Determine an input position which will be dominated by the operands and
  // which will dominate the result.
  IP = AdjustInsertPositionForExpand(IP, LF, LU);

  // Inform the Rewriter if we have a post-increment use, so that it can
  // perform an advantageous expansion.
  Rewriter.setPostInc(LF.PostIncLoops);

  // This is the type that the user actually needs.
  Type *OpTy = LF.OperandValToReplace->getType();
  // This will be the type that we'll initially expand to.
  Type *Ty = F.getType();
  if (!Ty)
    // No type known; just expand directly to the ultimate type.
    Ty = OpTy;
  else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
    // Expand directly to the ultimate type if it's the right size.
    Ty = OpTy;
  // This is the type to do integer arithmetic in.
  Type *IntTy = SE.getEffectiveSCEVType(Ty);

  // Build up a list of operands to add together to form the full base.
  SmallVector<const SCEV *, 8> Ops;

  // Expand the BaseRegs portion.
  for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
       E = F.BaseRegs.end(); I != E; ++I) {
    const SCEV *Reg = *I;
    assert(!Reg->isZero() && "Zero allocated in a base register!");

    // If we're expanding for a post-inc user, make the post-inc adjustment.
    PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
    Reg = TransformForPostIncUse(Denormalize, Reg,
                                 LF.UserInst, LF.OperandValToReplace,
                                 Loops, SE, DT);

    Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP)));
  }

  // Flush the operand list to suppress SCEVExpander hoisting.
  if (!Ops.empty()) {
    Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
    Ops.clear();
    Ops.push_back(SE.getUnknown(FullV));
  }

  // Expand the ScaledReg portion.
  Value *ICmpScaledV = 0;
  if (F.AM.Scale != 0) {
    const SCEV *ScaledS = F.ScaledReg;

    // If we're expanding for a post-inc user, make the post-inc adjustment.
    PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
    ScaledS = TransformForPostIncUse(Denormalize, ScaledS,
                                     LF.UserInst, LF.OperandValToReplace,
                                     Loops, SE, DT);

    if (LU.Kind == LSRUse::ICmpZero) {
      // An interesting way of "folding" with an icmp is to use a negated
      // scale, which we'll implement by inserting it into the other operand
      // of the icmp.
      assert(F.AM.Scale == -1 &&
             "The only scale supported by ICmpZero uses is -1!");
      ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP);
    } else {
      // Otherwise just expand the scaled register and an explicit scale,
      // which is expected to be matched as part of the address.
      ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP));
      ScaledS = SE.getMulExpr(ScaledS,
                              SE.getConstant(ScaledS->getType(), F.AM.Scale));
      Ops.push_back(ScaledS);

      // Flush the operand list to suppress SCEVExpander hoisting.
      Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
      Ops.clear();
      Ops.push_back(SE.getUnknown(FullV));
    }
  }

  // Expand the GV portion.
  if (F.AM.BaseGV) {
    Ops.push_back(SE.getUnknown(F.AM.BaseGV));

    // Flush the operand list to suppress SCEVExpander hoisting.
    Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
    Ops.clear();
    Ops.push_back(SE.getUnknown(FullV));
  }

  // Expand the immediate portion.
  int64_t Offset = (uint64_t)F.AM.BaseOffs + LF.Offset;
  if (Offset != 0) {
    if (LU.Kind == LSRUse::ICmpZero) {
      // The other interesting way of "folding" with an ICmpZero is to use a
      // negated immediate.
      if (!ICmpScaledV)
        ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
      else {
        Ops.push_back(SE.getUnknown(ICmpScaledV));
        ICmpScaledV = ConstantInt::get(IntTy, Offset);
      }
    } else {
      // Just add the immediate values. These again are expected to be matched
      // as part of the address.
      Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
    }
  }

  // Expand the unfolded offset portion.
  int64_t UnfoldedOffset = F.UnfoldedOffset;
  if (UnfoldedOffset != 0) {
    // Just add the immediate values.
    Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
                                                       UnfoldedOffset)));
  }

  // Emit instructions summing all the operands.
  const SCEV *FullS = Ops.empty() ?
                      SE.getConstant(IntTy, 0) :
                      SE.getAddExpr(Ops);
  Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP);

  // We're done expanding now, so reset the rewriter.
  Rewriter.clearPostInc();

  // An ICmpZero Formula represents an ICmp which we're handling as a
  // comparison against zero. Now that we've expanded an expression for that
  // form, update the ICmp's other operand.
  if (LU.Kind == LSRUse::ICmpZero) {
    ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
    DeadInsts.push_back(CI->getOperand(1));
    assert(!F.AM.BaseGV && "ICmp does not support folding a global value and "
                           "a scale at the same time!");
    if (F.AM.Scale == -1) {
      if (ICmpScaledV->getType() != OpTy) {
        Instruction *Cast =
          CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
                                                   OpTy, false),
                           ICmpScaledV, OpTy, "tmp", CI);
        ICmpScaledV = Cast;
      }
      CI->setOperand(1, ICmpScaledV);
    } else {
      assert(F.AM.Scale == 0 &&
             "ICmp does not support folding a global value and "
             "a scale at the same time!");
      Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
                                           -(uint64_t)Offset);
      if (C->getType() != OpTy)
        C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
                                                          OpTy, false),
                                  C, OpTy);

      CI->setOperand(1, C);
    }
  }

  return FullV;
}

/// RewriteForPHI - Helper for Rewrite. PHI nodes are special because the use
/// of their operands effectively happens in their predecessor blocks, so the
/// expression may need to be expanded in multiple places.
void LSRInstance::RewriteForPHI(PHINode *PN,
                                const LSRFixup &LF,
                                const Formula &F,
                                SCEVExpander &Rewriter,
                                SmallVectorImpl<WeakVH> &DeadInsts,
                                Pass *P) const {
  DenseMap<BasicBlock *, Value *> Inserted;
  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
    if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
      BasicBlock *BB = PN->getIncomingBlock(i);

      // If this is a critical edge, split the edge so that we do not insert
      // the code on all predecessor/successor paths.  We do this unless this
      // is the canonical backedge for this loop, which complicates post-inc
      // users.
      if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
          !isa<IndirectBrInst>(BB->getTerminator())) {
        BasicBlock *Parent = PN->getParent();
        Loop *PNLoop = LI.getLoopFor(Parent);
        if (!PNLoop || Parent != PNLoop->getHeader()) {
          // Split the critical edge.
          BasicBlock *NewBB = 0;
          if (!Parent->isLandingPad()) {
            NewBB = SplitCriticalEdge(BB, Parent, P,
                                      /*MergeIdenticalEdges=*/true,
                                      /*DontDeleteUselessPhis=*/true);
          } else {
            SmallVector<BasicBlock*, 2> NewBBs;
            SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs);
            NewBB = NewBBs[0];
          }

          // If PN is outside of the loop and BB is in the loop, we want to
          // move the block to be immediately before the PHI block, not
          // immediately after BB.
          if (L->contains(BB) && !L->contains(PN))
            NewBB->moveBefore(PN->getParent());

          // Splitting the edge can reduce the number of PHI entries we have.
          e = PN->getNumIncomingValues();
          BB = NewBB;
          i = PN->getBasicBlockIndex(BB);
        }
      }

      std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
        Inserted.insert(std::make_pair(BB, static_cast<Value *>(0)));
      if (!Pair.second)
        PN->setIncomingValue(i, Pair.first->second);
      else {
        Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts);

        // If this is reuse-by-noop-cast, insert the noop cast.
        Type *OpTy = LF.OperandValToReplace->getType();
        if (FullV->getType() != OpTy)
          FullV =
            CastInst::Create(CastInst::getCastOpcode(FullV, false,
                                                     OpTy, false),
                             FullV, LF.OperandValToReplace->getType(),
                             "tmp", BB->getTerminator());

        PN->setIncomingValue(i, FullV);
        Pair.first->second = FullV;
      }
    }
}

/// Rewrite - Emit instructions for the leading candidate expression for this
/// LSRUse (this is called "expanding"), and update the UserInst to reference
/// the newly expanded value.
void LSRInstance::Rewrite(const LSRFixup &LF,
                          const Formula &F,
                          SCEVExpander &Rewriter,
                          SmallVectorImpl<WeakVH> &DeadInsts,
                          Pass *P) const {
  // First, find an insertion point that dominates UserInst. For PHI nodes,
  // find the nearest block which dominates all the relevant uses.
  if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
    RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P);
  } else {
    Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts);

    // If this is reuse-by-noop-cast, insert the noop cast.
    Type *OpTy = LF.OperandValToReplace->getType();
    if (FullV->getType() != OpTy) {
      Instruction *Cast =
        CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
                         FullV, OpTy, "tmp", LF.UserInst);
      FullV = Cast;
    }

    // Update the user. ICmpZero is handled specially here (for now) because
    // Expand may have updated one of the operands of the icmp already, and
    // its new value may happen to be equal to LF.OperandValToReplace, in
    // which case doing replaceUsesOfWith leads to replacing both operands
    // with the same value. TODO: Reorganize this.
    if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero)
      LF.UserInst->setOperand(0, FullV);
    else
      LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
  }

  DeadInsts.push_back(LF.OperandValToReplace);
}

/// ImplementSolution - Rewrite all the fixup locations with new values,
/// following the chosen solution.
void
LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
                               Pass *P) {
  // Keep track of instructions we may have made dead, so that
  // we can remove them after we are done working.
  SmallVector<WeakVH, 16> DeadInsts;

  SCEVExpander Rewriter(SE, "lsr");
  Rewriter.disableCanonicalMode();
  Rewriter.enableLSRMode();
  Rewriter.setIVIncInsertPos(L, IVIncInsertPos);

  // Expand the new value definitions and update the users.
  for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
       E = Fixups.end(); I != E; ++I) {
    const LSRFixup &Fixup = *I;

    Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts, P);

    Changed = true;
  }

  // Clean up after ourselves. This must be done before deleting any
  // instructions.
  Rewriter.clear();

  Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
}

LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
  : IU(P->getAnalysis<IVUsers>()),
    SE(P->getAnalysis<ScalarEvolution>()),
    DT(P->getAnalysis<DominatorTree>()),
    LI(P->getAnalysis<LoopInfo>()),
    TLI(tli), L(l), Changed(false), IVIncInsertPos(0) {

  // If LoopSimplify form is not available, stay out of trouble.
  if (!L->isLoopSimplifyForm()) return;

  // If there's no interesting work to be done, bail early.
  if (IU.empty()) return;

  DEBUG(dbgs() << "\nLSR on loop ";
        WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false);
        dbgs() << ":\n");

  // First, perform some low-level loop optimizations.
  OptimizeShadowIV();
  OptimizeLoopTermCond();

  // If loop preparation eliminates all interesting IV users, bail.
  if (IU.empty()) return;

  // Skip nested loops until we can model them better with formulae.
  if (!EnableNested && !L->empty()) {

    if (EnablePhiElim) {
      // Remove any extra phis created by processing inner loops.
      SmallVector<WeakVH, 16> DeadInsts;
      SCEVExpander Rewriter(SE, "lsr");
      Changed |= (bool)Rewriter.replaceCongruentIVs(L, &DT, DeadInsts);
      Changed |= (bool)DeleteTriviallyDeadInstructions(DeadInsts);
    }
    DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
    return;
  }

  // Start collecting data and preparing for the solver.
  CollectInterestingTypesAndFactors();
  CollectFixupsAndInitialFormulae();
  CollectLoopInvariantFixupsAndFormulae();

  DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
        print_uses(dbgs()));

  // Now use the reuse data to generate a bunch of interesting ways
  // to formulate the values needed for the uses.
  GenerateAllReuseFormulae();

  FilterOutUndesirableDedicatedRegisters();
  NarrowSearchSpaceUsingHeuristics();

  SmallVector<const Formula *, 8> Solution;
  Solve(Solution);

  // Release memory that is no longer needed.
  Factors.clear();
  Types.clear();
  RegUses.clear();

  if (Solution.empty())
    return;

#ifndef NDEBUG
  // Formulae should be legal.
  for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
       E = Uses.end(); I != E; ++I) {
     const LSRUse &LU = *I;
     for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
          JE = LU.Formulae.end(); J != JE; ++J)
        assert(isLegalUse(J->AM, LU.MinOffset, LU.MaxOffset,
                          LU.Kind, LU.AccessTy, TLI) &&
               "Illegal formula generated!");
  };
#endif

  // Now that we've decided what we want, make it so.
  ImplementSolution(Solution, P);

  if (EnablePhiElim) {
    // Remove any extra phis created by processing inner loops.
    SmallVector<WeakVH, 16> DeadInsts;
    SCEVExpander Rewriter(SE, "lsr");
    Changed |= (bool)Rewriter.replaceCongruentIVs(L, &DT, DeadInsts);
    Changed |= (bool)DeleteTriviallyDeadInstructions(DeadInsts);
  }
}

void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
  if (Factors.empty() && Types.empty()) return;

  OS << "LSR has identified the following interesting factors and types: ";
  bool First = true;

  for (SmallSetVector<int64_t, 8>::const_iterator
       I = Factors.begin(), E = Factors.end(); I != E; ++I) {
    if (!First) OS << ", ";
    First = false;
    OS << '*' << *I;
  }

  for (SmallSetVector<Type *, 4>::const_iterator
       I = Types.begin(), E = Types.end(); I != E; ++I) {
    if (!First) OS << ", ";
    First = false;
    OS << '(' << **I << ')';
  }
  OS << '\n';
}

void LSRInstance::print_fixups(raw_ostream &OS) const {
  OS << "LSR is examining the following fixup sites:\n";
  for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
       E = Fixups.end(); I != E; ++I) {
    dbgs() << "  ";
    I->print(OS);
    OS << '\n';
  }
}

void LSRInstance::print_uses(raw_ostream &OS) const {
  OS << "LSR is examining the following uses:\n";
  for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
       E = Uses.end(); I != E; ++I) {
    const LSRUse &LU = *I;
    dbgs() << "  ";
    LU.print(OS);
    OS << '\n';
    for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
         JE = LU.Formulae.end(); J != JE; ++J) {
      OS << "    ";
      J->print(OS);
      OS << '\n';
    }
  }
}

void LSRInstance::print(raw_ostream &OS) const {
  print_factors_and_types(OS);
  print_fixups(OS);
  print_uses(OS);
}

void LSRInstance::dump() const {
  print(errs()); errs() << '\n';
}

namespace {

class LoopStrengthReduce : public LoopPass {
  /// TLI - Keep a pointer of a TargetLowering to consult for determining
  /// transformation profitability.
  const TargetLowering *const TLI;

public:
  static char ID; // Pass ID, replacement for typeid
  explicit LoopStrengthReduce(const TargetLowering *tli = 0);

private:
  bool runOnLoop(Loop *L, LPPassManager &LPM);
  void getAnalysisUsage(AnalysisUsage &AU) const;
};

}

char LoopStrengthReduce::ID = 0;
INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
                "Loop Strength Reduction", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTree)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
INITIALIZE_PASS_DEPENDENCY(IVUsers)
INITIALIZE_PASS_DEPENDENCY(LoopInfo)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
                "Loop Strength Reduction", false, false)


Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
  return new LoopStrengthReduce(TLI);
}

LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli)
  : LoopPass(ID), TLI(tli) {
    initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
  }

void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
  // We split critical edges, so we change the CFG.  However, we do update
  // many analyses if they are around.
  AU.addPreservedID(LoopSimplifyID);

  AU.addRequired<LoopInfo>();
  AU.addPreserved<LoopInfo>();
  AU.addRequiredID(LoopSimplifyID);
  AU.addRequired<DominatorTree>();
  AU.addPreserved<DominatorTree>();
  AU.addRequired<ScalarEvolution>();
  AU.addPreserved<ScalarEvolution>();
  // Requiring LoopSimplify a second time here prevents IVUsers from running
  // twice, since LoopSimplify was invalidated by running ScalarEvolution.
  AU.addRequiredID(LoopSimplifyID);
  AU.addRequired<IVUsers>();
  AU.addPreserved<IVUsers>();