Skip to content
LoopStrengthReduce.cpp 114 KiB
Newer Older

  Formula F = Base;
  F.BaseRegs.clear();
  SmallVector<const SCEV *, 4> Ops;
  for (SmallVectorImpl<const SCEV *>::const_iterator
       I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) {
    const SCEV *BaseReg = *I;
    if (BaseReg->properlyDominates(L->getHeader(), &DT) &&
        !BaseReg->hasComputableLoopEvolution(L))
      Ops.push_back(BaseReg);
    else
      F.BaseRegs.push_back(BaseReg);
  }
  if (Ops.size() > 1) {
    const SCEV *Sum = SE.getAddExpr(Ops);
    // TODO: If Sum is zero, it probably means ScalarEvolution missed an
    // opportunity to fold something. For now, just ignore such cases
    // rather than procede with zero in a register.
    if (!Sum->isZero()) {
      F.BaseRegs.push_back(Sum);
      (void)InsertFormula(LU, LUIdx, F);
    }
  }
}

/// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets.
void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
                                          Formula Base) {
  // We can't add a symbolic offset if the address already contains one.
  if (Base.AM.BaseGV) return;

  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
    const SCEV *G = Base.BaseRegs[i];
    GlobalValue *GV = ExtractSymbol(G, SE);
    if (G->isZero() || !GV)
      continue;
    Formula F = Base;
    F.AM.BaseGV = GV;
    if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
                    LU.Kind, LU.AccessTy, TLI))
      continue;
    F.BaseRegs[i] = G;
    (void)InsertFormula(LU, LUIdx, F);
  }
}

/// GenerateConstantOffsets - Generate reuse formulae using symbolic offsets.
void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
                                          Formula Base) {
  // TODO: For now, just add the min and max offset, because it usually isn't
  // worthwhile looking at everything inbetween.
  SmallVector<int64_t, 4> Worklist;
  Worklist.push_back(LU.MinOffset);
  if (LU.MaxOffset != LU.MinOffset)
    Worklist.push_back(LU.MaxOffset);

  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
    const SCEV *G = Base.BaseRegs[i];

    for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(),
         E = Worklist.end(); I != E; ++I) {
      Formula F = Base;
      F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs - *I;
      if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I,
                     LU.Kind, LU.AccessTy, TLI)) {
        F.BaseRegs[i] = SE.getAddExpr(G, SE.getIntegerSCEV(*I, G->getType()));

        (void)InsertFormula(LU, LUIdx, F);
      }
    }

    int64_t Imm = ExtractImmediate(G, SE);
    if (G->isZero() || Imm == 0)
      continue;
    Formula F = Base;
    F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Imm;
    if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
                    LU.Kind, LU.AccessTy, TLI))
      continue;
    F.BaseRegs[i] = G;
    (void)InsertFormula(LU, LUIdx, F);
  }
}

/// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up
/// the comparison. For example, x == y -> x*c == y*c.
void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
                                         Formula Base) {
  if (LU.Kind != LSRUse::ICmpZero) return;

  // Determine the integer type for the base formula.
  const Type *IntTy = Base.getType();
  if (!IntTy) return;
  if (SE.getTypeSizeInBits(IntTy) > 64) return;

  // Don't do this if there is more than one offset.
  if (LU.MinOffset != LU.MaxOffset) return;

  assert(!Base.AM.BaseGV && "ICmpZero use is not legal!");

  // Check each interesting stride.
  for (SmallSetVector<int64_t, 8>::const_iterator
       I = Factors.begin(), E = Factors.end(); I != E; ++I) {
    int64_t Factor = *I;
    Formula F = Base;

    // Check that the multiplication doesn't overflow.
    F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs * Factor;
    if ((int64_t)F.AM.BaseOffs / Factor != Base.AM.BaseOffs)
      continue;

    // Check that multiplying with the use offset doesn't overflow.
    int64_t Offset = LU.MinOffset;
    Offset = (uint64_t)Offset * Factor;
    if ((int64_t)Offset / Factor != LU.MinOffset)
      continue;

    // Check that this scale is legal.
    if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, TLI))
      continue;

    // Compensate for the use having MinOffset built into it.
    F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Offset - LU.MinOffset;

    const SCEV *FactorS = SE.getIntegerSCEV(Factor, IntTy);

    // Check that multiplying with each base register doesn't overflow.
    for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
      F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
      if (getSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
        goto next;
    }

    // Check that multiplying with the scaled register doesn't overflow.
    if (F.ScaledReg) {
      F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
      if (getSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
        continue;
    }

    // If we make it here and it's legal, add it.
    (void)InsertFormula(LU, LUIdx, F);
  next:;
  }
}

/// GenerateScales - Generate stride factor reuse formulae by making use of
/// scaled-offset address modes, for example.
void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx,
                                 Formula Base) {
  // Determine the integer type for the base formula.
  const Type *IntTy = Base.getType();
  if (!IntTy) return;

  // If this Formula already has a scaled register, we can't add another one.
  if (Base.AM.Scale != 0) return;

  // Check each interesting stride.
  for (SmallSetVector<int64_t, 8>::const_iterator
       I = Factors.begin(), E = Factors.end(); I != E; ++I) {
    int64_t Factor = *I;

    Base.AM.Scale = Factor;
    Base.AM.HasBaseReg = Base.BaseRegs.size() > 1;
    // Check whether this scale is going to be legal.
    if (!isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
                    LU.Kind, LU.AccessTy, TLI)) {
      // As a special-case, handle special out-of-loop Basic users specially.
      // TODO: Reconsider this special case.
      if (LU.Kind == LSRUse::Basic &&
          isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
                     LSRUse::Special, LU.AccessTy, TLI) &&
          LU.AllFixupsOutsideLoop)
        LU.Kind = LSRUse::Special;
      else
        continue;
    }
    // For an ICmpZero, negating a solitary base register won't lead to
    // new solutions.
    if (LU.Kind == LSRUse::ICmpZero &&
        !Base.AM.HasBaseReg && Base.AM.BaseOffs == 0 && !Base.AM.BaseGV)
      continue;
    // For each addrec base reg, apply the scale, if possible.
    for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
      if (const SCEVAddRecExpr *AR =
            dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
        const SCEV *FactorS = SE.getIntegerSCEV(Factor, IntTy);
        if (FactorS->isZero())
          continue;
        // Divide out the factor, ignoring high bits, since we'll be
        // scaling the value back up in the end.
        if (const SCEV *Quotient = getSDiv(AR, FactorS, SE, true)) {
          // TODO: This could be optimized to avoid all the copying.
          Formula F = Base;
          F.ScaledReg = Quotient;
          std::swap(F.BaseRegs[i], F.BaseRegs.back());
          F.BaseRegs.pop_back();
          (void)InsertFormula(LU, LUIdx, F);
        }
      }
  }
}

/// GenerateTruncates - Generate reuse formulae from different IV types.
void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx,
                                    Formula Base) {
  // This requires TargetLowering to tell us which truncates are free.
  if (!TLI) return;

  // Don't bother truncating symbolic values.
  if (Base.AM.BaseGV) return;

  // Determine the integer type for the base formula.
  const Type *DstTy = Base.getType();
  if (!DstTy) return;
  DstTy = SE.getEffectiveSCEVType(DstTy);

  for (SmallSetVector<const Type *, 4>::const_iterator
       I = Types.begin(), E = Types.end(); I != E; ++I) {
    const Type *SrcTy = *I;
    if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) {
      Formula F = Base;

      if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, *I);
      for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(),
           JE = F.BaseRegs.end(); J != JE; ++J)
        *J = SE.getAnyExtendExpr(*J, SrcTy);

      // TODO: This assumes we've done basic processing on all uses and
      // have an idea what the register usage is.
      if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
        continue;

      (void)InsertFormula(LU, LUIdx, F);
    }
  }
}

namespace {

/// WorkItem - Helper class for GenerateConstantOffsetReuse. It's used to
/// defer modifications so that the search phase doesn't have to worry about
/// the data structures moving underneath it.
struct WorkItem {
  size_t LUIdx;
  int64_t Imm;
  const SCEV *OrigReg;

  WorkItem(size_t LI, int64_t I, const SCEV *R)
    : LUIdx(LI), Imm(I), OrigReg(R) {}

  void print(raw_ostream &OS) const;
  void dump() const;
};

}

void WorkItem::print(raw_ostream &OS) const {
  OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
     << " , add offset " << Imm;
}

void WorkItem::dump() const {
  print(errs()); errs() << '\n';
}

/// GenerateCrossUseConstantOffsets - Look for registers which are a constant
/// distance apart and try to form reuse opportunities between them.
void LSRInstance::GenerateCrossUseConstantOffsets() {
  // Group the registers by their value without any added constant offset.
  typedef std::map<int64_t, const SCEV *> ImmMapTy;
  typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy;
  RegMapTy Map;
  DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
  SmallVector<const SCEV *, 8> Sequence;
  for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
       I != E; ++I) {
    const SCEV *Reg = *I;
    int64_t Imm = ExtractImmediate(Reg, SE);
    std::pair<RegMapTy::iterator, bool> Pair =
      Map.insert(std::make_pair(Reg, ImmMapTy()));
    if (Pair.second)
      Sequence.push_back(Reg);
    Pair.first->second.insert(std::make_pair(Imm, *I));
    UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(*I);
  }

  // Now examine each set of registers with the same base value. Build up
  // a list of work to do and do the work in a separate step so that we're
  // not adding formulae and register counts while we're searching.
  SmallVector<WorkItem, 32> WorkItems;
  SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
  for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(),
       E = Sequence.end(); I != E; ++I) {
    const SCEV *Reg = *I;
    const ImmMapTy &Imms = Map.find(Reg)->second;

    // It's not worthwhile looking for reuse if there's only one offset.
    if (Imms.size() == 1)
      continue;

    DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
          for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
               J != JE; ++J)
            dbgs() << ' ' << J->first;
          dbgs() << '\n');

    // Examine each offset.
    for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
         J != JE; ++J) {
      const SCEV *OrigReg = J->second;

      int64_t JImm = J->first;
      const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);

      if (!isa<SCEVConstant>(OrigReg) &&
          UsedByIndicesMap[Reg].count() == 1) {
        DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
        continue;
      }

      // Conservatively examine offsets between this orig reg a few selected
      // other orig regs.
      ImmMapTy::const_iterator OtherImms[] = {
        Imms.begin(), prior(Imms.end()),
        Imms.upper_bound((Imms.begin()->first + prior(Imms.end())->first) / 2)
      };
      for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
        ImmMapTy::const_iterator M = OtherImms[i];
        if (M == J || M == JE) continue;

        // Compute the difference between the two.
        int64_t Imm = (uint64_t)JImm - M->first;
        for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
             LUIdx = UsedByIndices.find_next(LUIdx))
          // Make a memo of this use, offset, and register tuple.
          if (UniqueItems.insert(std::make_pair(LUIdx, Imm)))
            WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
      }
    }
  }

  Map.clear();
  Sequence.clear();
  UsedByIndicesMap.clear();
  UniqueItems.clear();

  // Now iterate through the worklist and add new formulae.
  for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(),
       E = WorkItems.end(); I != E; ++I) {
    const WorkItem &WI = *I;
    size_t LUIdx = WI.LUIdx;
    LSRUse &LU = Uses[LUIdx];
    int64_t Imm = WI.Imm;
    const SCEV *OrigReg = WI.OrigReg;

    const Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
    const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
    unsigned BitWidth = SE.getTypeSizeInBits(IntTy);

    // TODO: Use a more targetted data structure.
    for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
      Formula F = LU.Formulae[L];
      // Use the immediate in the scaled register.
      if (F.ScaledReg == OrigReg) {
        int64_t Offs = (uint64_t)F.AM.BaseOffs +
                       Imm * (uint64_t)F.AM.Scale;
        // Don't create 50 + reg(-50).
        if (F.referencesReg(SE.getSCEV(
                   ConstantInt::get(IntTy, -(uint64_t)Offs))))
          continue;
        Formula NewF = F;
        NewF.AM.BaseOffs = Offs;
        if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
                        LU.Kind, LU.AccessTy, TLI))
          continue;
        NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);

        // If the new scale is a constant in a register, and adding the constant
        // value to the immediate would produce a value closer to zero than the
        // immediate itself, then the formula isn't worthwhile.
        if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
          if (C->getValue()->getValue().isNegative() !=
                (NewF.AM.BaseOffs < 0) &&
              (C->getValue()->getValue().abs() * APInt(BitWidth, F.AM.Scale))
                .ule(APInt(BitWidth, NewF.AM.BaseOffs).abs()))
            continue;

        // OK, looks good.
        (void)InsertFormula(LU, LUIdx, NewF);
      } else {
        // Use the immediate in a base register.
        for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
          const SCEV *BaseReg = F.BaseRegs[N];
          if (BaseReg != OrigReg)
            continue;
          Formula NewF = F;
          NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm;
          if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
                          LU.Kind, LU.AccessTy, TLI))
            continue;
          NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);

          // If the new formula has a constant in a register, and adding the
          // constant value to the immediate would produce a value closer to
          // zero than the immediate itself, then the formula isn't worthwhile.
          for (SmallVectorImpl<const SCEV *>::const_iterator
               J = NewF.BaseRegs.begin(), JE = NewF.BaseRegs.end();
               J != JE; ++J)
            if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J))
              if (C->getValue()->getValue().isNegative() !=
                    (NewF.AM.BaseOffs < 0) &&
                  C->getValue()->getValue().abs()
                    .ule(APInt(BitWidth, NewF.AM.BaseOffs).abs()))
                goto skip_formula;

          // Ok, looks good.
          (void)InsertFormula(LU, LUIdx, NewF);
          break;
        skip_formula:;
        }
      }
    }
  }
}

/// GenerateAllReuseFormulae - Generate formulae for each use.
void
LSRInstance::GenerateAllReuseFormulae() {
  // This is split into two loops so that hasRegsUsedByUsesOtherThan
  // queries are more precise.
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
  }
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateScales(LU, LUIdx, LU.Formulae[i]);
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
  }

  GenerateCrossUseConstantOffsets();
}

/// If their are multiple formulae with the same set of registers used
/// by other uses, pick the best one and delete the others.
void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
#ifndef NDEBUG
  bool Changed = false;
#endif

  // Collect the best formula for each unique set of shared registers. This
  // is reset for each use.
  typedef DenseMap<SmallVector<const SCEV *, 2>, size_t, UniquifierDenseMapInfo>
    BestFormulaeTy;
  BestFormulaeTy BestFormulae;

  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    FormulaSorter Sorter(L, LU, SE, DT);

    // Clear out the set of used regs; it will be recomputed.
    LU.Regs.clear();

    for (size_t FIdx = 0, NumForms = LU.Formulae.size();
         FIdx != NumForms; ++FIdx) {
      Formula &F = LU.Formulae[FIdx];

      SmallVector<const SCEV *, 2> Key;
      for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(),
           JE = F.BaseRegs.end(); J != JE; ++J) {
        const SCEV *Reg = *J;
        if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
          Key.push_back(Reg);
      }
      if (F.ScaledReg &&
          RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
        Key.push_back(F.ScaledReg);
      // Unstable sort by host order ok, because this is only used for
      // uniquifying.
      std::sort(Key.begin(), Key.end());

      std::pair<BestFormulaeTy::const_iterator, bool> P =
        BestFormulae.insert(std::make_pair(Key, FIdx));
      if (!P.second) {
        Formula &Best = LU.Formulae[P.first->second];
        if (Sorter.operator()(F, Best))
          std::swap(F, Best);
        DEBUG(dbgs() << "Filtering out "; F.print(dbgs());
              dbgs() << "\n"
                        "  in favor of "; Best.print(dbgs());
              dbgs() << '\n');
#ifndef NDEBUG
        Changed = true;
#endif
        std::swap(F, LU.Formulae.back());
        LU.Formulae.pop_back();
        --FIdx;
        --NumForms;
        continue;
      }
      if (F.ScaledReg) LU.Regs.insert(F.ScaledReg);
      LU.Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
    }
    BestFormulae.clear();
  }

  DEBUG(if (Changed) {
          dbgs() << "\n"
                    "After filtering out undesirable candidates:\n";
          print_uses(dbgs());
        });
}

/// NarrowSearchSpaceUsingHeuristics - If there are an extrordinary number of
/// formulae to choose from, use some rough heuristics to prune down the number
/// of formulae. This keeps the main solver from taking an extrordinary amount
/// of time in some worst-case scenarios.
void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
  // This is a rough guess that seems to work fairly well.
  const size_t Limit = UINT16_MAX;

  SmallPtrSet<const SCEV *, 4> Taken;
  for (;;) {
    // Estimate the worst-case number of solutions we might consider. We almost
    // never consider this many solutions because we prune the search space,
    // but the pruning isn't always sufficient.
    uint32_t Power = 1;
    for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
         E = Uses.end(); I != E; ++I) {
      size_t FSize = I->Formulae.size();
      if (FSize >= Limit) {
        Power = Limit;
        break;
      }
      Power *= FSize;
      if (Power >= Limit)
        break;
    }
    if (Power < Limit)
      break;

    // Ok, we have too many of formulae on our hands to conveniently handle.
    // Use a rough heuristic to thin out the list.

    // Pick the register which is used by the most LSRUses, which is likely
    // to be a good reuse register candidate.
    const SCEV *Best = 0;
    unsigned BestNum = 0;
    for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
         I != E; ++I) {
      const SCEV *Reg = *I;
      if (Taken.count(Reg))
        continue;
      if (!Best)
        Best = Reg;
      else {
        unsigned Count = RegUses.getUsedByIndices(Reg).count();
        if (Count > BestNum) {
          Best = Reg;
          BestNum = Count;
        }
      }
    }

    DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
                 << " will yeild profitable reuse.\n");
    Taken.insert(Best);

    // In any use with formulae which references this register, delete formulae
    // which don't reference it.
    for (SmallVectorImpl<LSRUse>::iterator I = Uses.begin(),
         E = Uses.end(); I != E; ++I) {
      LSRUse &LU = *I;
      if (!LU.Regs.count(Best)) continue;

      // Clear out the set of used regs; it will be recomputed.
      LU.Regs.clear();

      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
        Formula &F = LU.Formulae[i];
        if (!F.referencesReg(Best)) {
          DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
          std::swap(LU.Formulae.back(), F);
          LU.Formulae.pop_back();
          --e;
          --i;
          continue;
        }

        if (F.ScaledReg) LU.Regs.insert(F.ScaledReg);
        LU.Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
      }
    }

    DEBUG(dbgs() << "After pre-selection:\n";
          print_uses(dbgs()));
  }
}

/// SolveRecurse - This is the recursive solver.
void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
                               Cost &SolutionCost,
                               SmallVectorImpl<const Formula *> &Workspace,
                               const Cost &CurCost,
                               const SmallPtrSet<const SCEV *, 16> &CurRegs,
                               DenseSet<const SCEV *> &VisitedRegs) const {
  // Some ideas:
  //  - prune more:
  //    - use more aggressive filtering
  //    - sort the formula so that the most profitable solutions are found first
  //    - sort the uses too
  //  - search faster:
  //    - dont compute a cost, and then compare. compare while computing a cost
  //      and bail early.
  //    - track register sets with SmallBitVector

  const LSRUse &LU = Uses[Workspace.size()];

  // If this use references any register that's already a part of the
  // in-progress solution, consider it a requirement that a formula must
  // reference that register in order to be considered. This prunes out
  // unprofitable searching.
  SmallSetVector<const SCEV *, 4> ReqRegs;
  for (SmallPtrSet<const SCEV *, 16>::const_iterator I = CurRegs.begin(),
       E = CurRegs.end(); I != E; ++I)
  bool AnySatisfiedReqRegs = false;
  SmallPtrSet<const SCEV *, 16> NewRegs;
  Cost NewCost;
  for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
       E = LU.Formulae.end(); I != E; ++I) {
    const Formula &F = *I;

    // Ignore formulae which do not use any of the required registers.
    for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(),
         JE = ReqRegs.end(); J != JE; ++J) {
      const SCEV *Reg = *J;
      if ((!F.ScaledReg || F.ScaledReg != Reg) &&
          std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) ==
          F.BaseRegs.end())
        goto skip;
    }

    // Evaluate the cost of the current formula. If it's already worse than
    // the current best, prune the search at that point.
    NewCost = CurCost;
    NewRegs = CurRegs;
    NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT);
    if (NewCost < SolutionCost) {
      Workspace.push_back(&F);
      if (Workspace.size() != Uses.size()) {
        SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
                     NewRegs, VisitedRegs);
        if (F.getNumRegs() == 1 && Workspace.size() == 1)
          VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
      } else {
        DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
              dbgs() << ". Regs:";
              for (SmallPtrSet<const SCEV *, 16>::const_iterator
                   I = NewRegs.begin(), E = NewRegs.end(); I != E; ++I)
                dbgs() << ' ' << **I;
              dbgs() << '\n');

        SolutionCost = NewCost;
        Solution = Workspace;
      }
      Workspace.pop_back();
    }
  skip:;
  }

  // If none of the formulae had all of the required registers, relax the
  // constraint so that we don't exclude all formulae.
  if (!AnySatisfiedReqRegs) {
    ReqRegs.clear();
    goto retry;
  }
2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000
}

void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
  SmallVector<const Formula *, 8> Workspace;
  Cost SolutionCost;
  SolutionCost.Loose();
  Cost CurCost;
  SmallPtrSet<const SCEV *, 16> CurRegs;
  DenseSet<const SCEV *> VisitedRegs;
  Workspace.reserve(Uses.size());

  SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
               CurRegs, VisitedRegs);

  // Ok, we've now made all our decisions.
  DEBUG(dbgs() << "\n"
                  "The chosen solution requires "; SolutionCost.print(dbgs());
        dbgs() << ":\n";
        for (size_t i = 0, e = Uses.size(); i != e; ++i) {
          dbgs() << "  ";
          Uses[i].print(dbgs());
          dbgs() << "\n"
                    "    ";
          Solution[i]->print(dbgs());
          dbgs() << '\n';
        });
}

/// getImmediateDominator - A handy utility for the specific DominatorTree
/// query that we need here.
///
static BasicBlock *getImmediateDominator(BasicBlock *BB, DominatorTree &DT) {
  DomTreeNode *Node = DT.getNode(BB);
  if (!Node) return 0;
  Node = Node->getIDom();
  if (!Node) return 0;
  return Node->getBlock();
}

Value *LSRInstance::Expand(const LSRFixup &LF,
                           const Formula &F,
                           BasicBlock::iterator IP,
                           Loop *L, Instruction *IVIncInsertPos,
                           SCEVExpander &Rewriter,
                           SmallVectorImpl<WeakVH> &DeadInsts,
                           ScalarEvolution &SE, DominatorTree &DT) const {
  const LSRUse &LU = Uses[LF.LUIdx];

  // Then, collect some instructions which we will remain dominated by when
  // expanding the replacement. These must be dominated by any operands that
  // will be required in the expansion.
  SmallVector<Instruction *, 4> Inputs;
  if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
    Inputs.push_back(I);
  if (LU.Kind == LSRUse::ICmpZero)
    if (Instruction *I =
          dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
      Inputs.push_back(I);
  if (LF.PostIncLoop && !L->contains(LF.UserInst))
    Inputs.push_back(L->getLoopLatch()->getTerminator());

  // Then, climb up the immediate dominator tree as far as we can go while
  // still being dominated by the input positions.
  for (;;) {
    bool AllDominate = true;
    Instruction *BetterPos = 0;
    BasicBlock *IDom = getImmediateDominator(IP->getParent(), DT);
    if (!IDom) break;
    Instruction *Tentative = IDom->getTerminator();
    for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(),
         E = Inputs.end(); I != E; ++I) {
      Instruction *Inst = *I;
      if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
        AllDominate = false;
        break;
      }
      if (IDom == Inst->getParent() &&
          (!BetterPos || DT.dominates(BetterPos, Inst)))
        BetterPos = next(BasicBlock::iterator(Inst));
    }
    if (!AllDominate)
      break;
    if (BetterPos)
      IP = BetterPos;
    else
      IP = Tentative;
  }
  while (isa<PHINode>(IP)) ++IP;

  // Inform the Rewriter if we have a post-increment use, so that it can
  // perform an advantageous expansion.
  Rewriter.setPostInc(LF.PostIncLoop);

  // This is the type that the user actually needs.
  const Type *OpTy = LF.OperandValToReplace->getType();
  // This will be the type that we'll initially expand to.
  const Type *Ty = F.getType();
  if (!Ty)
    // No type known; just expand directly to the ultimate type.
    Ty = OpTy;
  else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
    // Expand directly to the ultimate type if it's the right size.
    Ty = OpTy;
  // This is the type to do integer arithmetic in.
  const Type *IntTy = SE.getEffectiveSCEVType(Ty);

  // Build up a list of operands to add together to form the full base.
  SmallVector<const SCEV *, 8> Ops;

  // Expand the BaseRegs portion.
  for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
       E = F.BaseRegs.end(); I != E; ++I) {
    const SCEV *Reg = *I;
    assert(!Reg->isZero() && "Zero allocated in a base register!");

    // If we're expanding for a post-inc user for the add-rec's loop, make the
    // post-inc adjustment.
    const SCEV *Start = Reg;
    while (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Start)) {
      if (AR->getLoop() == LF.PostIncLoop) {
        Reg = SE.getAddExpr(Reg, AR->getStepRecurrence(SE));
        // If the user is inside the loop, insert the code after the increment
        // so that it is dominated by its operand.
        if (L->contains(LF.UserInst))
          IP = IVIncInsertPos;
        break;
      }
      Start = AR->getStart();
    }

    Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP)));
  }

  // Expand the ScaledReg portion.
  Value *ICmpScaledV = 0;
  if (F.AM.Scale != 0) {
    const SCEV *ScaledS = F.ScaledReg;

    // If we're expanding for a post-inc user for the add-rec's loop, make the
    // post-inc adjustment.
    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(ScaledS))
      if (AR->getLoop() == LF.PostIncLoop)
        ScaledS = SE.getAddExpr(ScaledS, AR->getStepRecurrence(SE));

    if (LU.Kind == LSRUse::ICmpZero) {
      // An interesting way of "folding" with an icmp is to use a negated
      // scale, which we'll implement by inserting it into the other operand
      // of the icmp.
      assert(F.AM.Scale == -1 &&
             "The only scale supported by ICmpZero uses is -1!");
      ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP);
    } else {
      // Otherwise just expand the scaled register and an explicit scale,
      // which is expected to be matched as part of the address.
      ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP));
      ScaledS = SE.getMulExpr(ScaledS,
                              SE.getIntegerSCEV(F.AM.Scale,
                                                ScaledS->getType()));
      Ops.push_back(ScaledS);
    }
  }

  // Expand the immediate portions.
  if (F.AM.BaseGV)
    Ops.push_back(SE.getSCEV(F.AM.BaseGV));
  int64_t Offset = (uint64_t)F.AM.BaseOffs + LF.Offset;
  if (Offset != 0) {
    if (LU.Kind == LSRUse::ICmpZero) {
      // The other interesting way of "folding" with an ICmpZero is to use a
      // negated immediate.
      if (!ICmpScaledV)
        ICmpScaledV = ConstantInt::get(IntTy, -Offset);
      else {
        Ops.push_back(SE.getUnknown(ICmpScaledV));
        ICmpScaledV = ConstantInt::get(IntTy, Offset);
      }
    } else {
      // Just add the immediate values. These again are expected to be matched
      // as part of the address.
      Ops.push_back(SE.getIntegerSCEV(Offset, IntTy));
    }
  }

  // Emit instructions summing all the operands.
  const SCEV *FullS = Ops.empty() ?
                      SE.getIntegerSCEV(0, IntTy) :
                      SE.getAddExpr(Ops);
  Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP);

  // We're done expanding now, so reset the rewriter.
  Rewriter.setPostInc(0);

  // An ICmpZero Formula represents an ICmp which we're handling as a
  // comparison against zero. Now that we've expanded an expression for that
  // form, update the ICmp's other operand.
  if (LU.Kind == LSRUse::ICmpZero) {
    ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
    DeadInsts.push_back(CI->getOperand(1));
    assert(!F.AM.BaseGV && "ICmp does not support folding a global value and "
                           "a scale at the same time!");
    if (F.AM.Scale == -1) {
      if (ICmpScaledV->getType() != OpTy) {
        Instruction *Cast =
          CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
                                                   OpTy, false),
                           ICmpScaledV, OpTy, "tmp", CI);
        ICmpScaledV = Cast;
      }
      CI->setOperand(1, ICmpScaledV);
    } else {
      assert(F.AM.Scale == 0 &&
             "ICmp does not support folding a global value and "
             "a scale at the same time!");
      Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
                                           -(uint64_t)Offset);
      if (C->getType() != OpTy)
        C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
                                                          OpTy, false),
                                  C, OpTy);

      CI->setOperand(1, C);
    }
  }

  return FullV;
}

/// Rewrite - Emit instructions for the leading candidate expression for this
/// LSRUse (this is called "expanding"), and update the UserInst to reference
/// the newly expanded value.
void LSRInstance::Rewrite(const LSRFixup &LF,
                          const Formula &F,
                          Loop *L, Instruction *IVIncInsertPos,
                          SCEVExpander &Rewriter,
                          SmallVectorImpl<WeakVH> &DeadInsts,
                          ScalarEvolution &SE, DominatorTree &DT,
                          Pass *P) const {
  const Type *OpTy = LF.OperandValToReplace->getType();

  // First, find an insertion point that dominates UserInst. For PHI nodes,
  // find the nearest block which dominates all the relevant uses.
  if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
    DenseMap<BasicBlock *, Value *> Inserted;
    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
      if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
        BasicBlock *BB = PN->getIncomingBlock(i);

        // If this is a critical edge, split the edge so that we do not insert
        // the code on all predecessor/successor paths.  We do this unless this
        // is the canonical backedge for this loop, which complicates post-inc
        // users.
        if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
            !isa<IndirectBrInst>(BB->getTerminator()) &&
            (PN->getParent() != L->getHeader() || !L->contains(BB))) {
          // Split the critical edge.
          BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P);

          // If PN is outside of the loop and BB is in the loop, we want to
          // move the block to be immediately before the PHI block, not
          // immediately after BB.
          if (L->contains(BB) && !L->contains(PN))
            NewBB->moveBefore(PN->getParent());

          // Splitting the edge can reduce the number of PHI entries we have.
          e = PN->getNumIncomingValues();
          BB = NewBB;
          i = PN->getBasicBlockIndex(BB);
        }

        std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
          Inserted.insert(std::make_pair(BB, static_cast<Value *>(0)));
        if (!Pair.second)
          PN->setIncomingValue(i, Pair.first->second);
        else {
          Value *FullV = Expand(LF, F, BB->getTerminator(), L, IVIncInsertPos,
                                Rewriter, DeadInsts, SE, DT);

          // If this is reuse-by-noop-cast, insert the noop cast.
          if (FullV->getType() != OpTy)
            FullV =
              CastInst::Create(CastInst::getCastOpcode(FullV, false,
                                                       OpTy, false),
                               FullV, LF.OperandValToReplace->getType(),
                               "tmp", BB->getTerminator());

          PN->setIncomingValue(i, FullV);
          Pair.first->second = FullV;
        }
      }
  } else {
    Value *FullV = Expand(LF, F, LF.UserInst, L, IVIncInsertPos,
                          Rewriter, DeadInsts, SE, DT);

    // If this is reuse-by-noop-cast, insert the noop cast.
    if (FullV->getType() != OpTy) {
      Instruction *Cast =
        CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
                         FullV, OpTy, "tmp", LF.UserInst);
      FullV = Cast;
    }

    // Update the user. ICmpZero is handled specially here (for now) because
    // Expand may have updated one of the operands of the icmp already, and
    // its new value may happen to be equal to LF.OperandValToReplace, in
    // which case doing replaceUsesOfWith leads to replacing both operands
    // with the same value. TODO: Reorganize this.
    if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero)