LoopStrengthReduce.cpp

                     ScalarEvolution &SE, DominatorTree &DT,
                     Pass *P) const {
  const Type *OpTy = OperandValToReplace->getType();

  // First, find an insertion point that dominates UserInst. For PHI nodes,
  // find the nearest block which dominates all the relevant uses.
  if (PHINode *PN = dyn_cast<PHINode>(UserInst)) {
    DenseMap<BasicBlock *, Value *> Inserted;
    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
      if (PN->getIncomingValue(i) == OperandValToReplace) {
        BasicBlock *BB = PN->getIncomingBlock(i);

        // If this is a critical edge, split the edge so that we do not insert
        // the code on all predecessor/successor paths.  We do this unless this
        // is the canonical backedge for this loop, which complicates post-inc
        // users.
        if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
            !isa<IndirectBrInst>(BB->getTerminator()) &&
            (PN->getParent() != L->getHeader() || !L->contains(BB))) {
          // Split the critical edge.
          BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P);

          // If PN is outside of the loop and BB is in the loop, we want to
          // move the block to be immediately before the PHI block, not
          // immediately after BB.
          if (L->contains(BB) && !L->contains(PN))
            NewBB->moveBefore(PN->getParent());

          // Splitting the edge can reduce the number of PHI entries we have.
          e = PN->getNumIncomingValues();
          BB = NewBB;
          i = PN->getBasicBlockIndex(BB);
        }

        std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
          Inserted.insert(std::make_pair(BB, static_cast<Value *>(0)));
        if (!Pair.second)
          PN->setIncomingValue(i, Pair.first->second);
        else {
          Value *FullV = Expand(BB->getTerminator(), L, IVIncInsertPos,
                                Rewriter, DeadInsts, SE, DT);

          // If this is reuse-by-noop-cast, insert the noop cast.
          if (FullV->getType() != OpTy)
            FullV =
              CastInst::Create(CastInst::getCastOpcode(FullV, false,
                                                       OpTy, false),
                               FullV, OperandValToReplace->getType(),
                               "tmp", BB->getTerminator());

          PN->setIncomingValue(i, FullV);
          Pair.first->second = FullV;
        }
      }
  } else {
    Value *FullV = Expand(UserInst, L, IVIncInsertPos,
                          Rewriter, DeadInsts, SE, DT);

    // If this is reuse-by-noop-cast, insert the noop cast.
    if (FullV->getType() != OpTy) {
      Instruction *Cast =
        CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
                         FullV, OpTy, "tmp", UserInst);
      FullV = Cast;
    }

    // Update the user.
    UserInst->replaceUsesOfWith(OperandValToReplace, FullV);
  }

  DeadInsts.push_back(OperandValToReplace);
}

void LSRUse::print(raw_ostream &OS) const {
  OS << "LSR Use: Kind=";
  switch (Kind) {
  case Basic:    OS << "Basic"; break;
  case Special:  OS << "Special"; break;
  case ICmpZero: OS << "ICmpZero"; break;
  case Address:
    OS << "Address of ";
    if (isa<PointerType>(AccessTy))
      OS << "pointer"; // the full pointer type could be really verbose
    else
      OS << *AccessTy;
  }

  OS << ", UserInst=";
  // Store is common and interesting enough to be worth special-casing.
  if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
    OS << "store ";
    WriteAsOperand(OS, Store->getOperand(0), /*PrintType=*/false);
  } else if (UserInst->getType()->isVoidTy())
    OS << UserInst->getOpcodeName();
  else
    WriteAsOperand(OS, UserInst, /*PrintType=*/false);

  OS << ", OperandValToReplace=";
  WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false);

  if (PostIncLoop) {
    OS << ", PostIncLoop=";
    WriteAsOperand(OS, PostIncLoop->getHeader(), /*PrintType=*/false);
  }
}

void LSRUse::dump() const {
  print(errs()); errs() << '\n';
}

namespace {

/// Score - This class is used to measure and compare candidate formulae.
class Score {
  unsigned NumRegs;
  unsigned NumPhis;
  unsigned NumIVMuls;
  unsigned NumBaseAdds;
  unsigned NumImms;

public:
  Score()
    : NumRegs(0), NumPhis(0), NumIVMuls(0), NumBaseAdds(0), NumImms(0) {}

  void RateInitial(SmallVector<LSRUse, 16> const &Uses, const Loop *L,
                   ScalarEvolution &SE);

  void Rate(const SCEV *Reg, const SmallBitVector &Bits,
            const SmallVector<LSRUse, 16> &Uses, const Loop *L,
            ScalarEvolution &SE);

  unsigned getNumRegs() const { return NumRegs; }

  bool operator<(const Score &Other) const;

  void print_details(raw_ostream &OS, const SCEV *Reg,
                     const SmallPtrSet<const SCEV *, 8> &Regs) const;

  void print(raw_ostream &OS) const;
  void dump() const;

private:
  void RateRegister(const SCEV *Reg, SmallPtrSet<const SCEV *, 8> &Regs,
                    const Loop *L);
  void RateFormula(const Formula &F, SmallPtrSet<const SCEV *, 8> &Regs,
                   const Loop *L);

  void Loose();
};

}

/// RateRegister - Tally up interesting quantities from the given register.
void Score::RateRegister(const SCEV *Reg,
                         SmallPtrSet<const SCEV *, 8> &Regs,
                         const Loop *L) {
  if (Regs.insert(Reg))
    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
      NumPhis += AR->getLoop() == L;

      // Add the step value register, if it needs one.
      if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1)))
        RateRegister(AR->getOperand(1), Regs, L);
    }
}

void Score::RateFormula(const Formula &F,
                        SmallPtrSet<const SCEV *, 8> &Regs,
                        const Loop *L) {
  // Tally up the registers.
  if (F.ScaledReg)
    RateRegister(F.ScaledReg, Regs, L);
  for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
       E = F.BaseRegs.end(); I != E; ++I) {
    const SCEV *BaseReg = *I;
    RateRegister(BaseReg, Regs, L);

    NumIVMuls += isa<SCEVMulExpr>(BaseReg) &&
                 BaseReg->hasComputableLoopEvolution(L);
  }

  if (F.BaseRegs.size() > 1)
    NumBaseAdds += F.BaseRegs.size() - 1;

  // Tally up the non-zero immediates.
  if (F.AM.BaseGV || F.AM.BaseOffs != 0)
    ++NumImms;
}

/// Loose - Set this score to a loosing value.
void Score::Loose() {
  NumRegs = ~0u;
  NumPhis = ~0u;
  NumIVMuls = ~0u;
  NumBaseAdds = ~0u;
  NumImms = ~0u;
}

/// RateInitial - Compute a score for the initial "fully reduced" solution.
void Score::RateInitial(SmallVector<LSRUse, 16> const &Uses, const Loop *L,
                        ScalarEvolution &SE) {
  SmallPtrSet<const SCEV *, 8> Regs;
  for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
       E = Uses.end(); I != E; ++I)
    RateFormula(I->Formulae.front(), Regs, L);
  NumRegs += Regs.size();

  DEBUG(print_details(dbgs(), 0, Regs));
}

/// Rate - Compute a score for the solution where the reuse associated with
/// putting Reg in a register is selected.
void Score::Rate(const SCEV *Reg, const SmallBitVector &Bits,
                 const SmallVector<LSRUse, 16> &Uses, const Loop *L,
                 ScalarEvolution &SE) {
  SmallPtrSet<const SCEV *, 8> Regs;
  for (size_t i = 0, e = Uses.size(); i != e; ++i) {
    const LSRUse &LU = Uses[i];

    const Formula *BestFormula = 0;
    if (i >= Bits.size() || !Bits.test(i))
      // This use doesn't use the current register. Just go with the current
      // leading candidate formula.
      BestFormula = &LU.Formulae.front();
    else
      // Find the best formula for this use that uses the current register.
      for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
           E = LU.Formulae.end(); I != E; ++I) {
        const Formula &F = *I;
        if (F.referencesReg(Reg) &&
            (!BestFormula || ComplexitySorter()(F, *BestFormula)))
          BestFormula = &F;
      }

    // If we didn't find *any* forumlae, because earlier we eliminated some
    // in greedy fashion, skip the current register's reuse opportunity.
    if (!BestFormula) {
      DEBUG(dbgs() << "Reuse with reg " << *Reg
                   << " wouldn't help any users.\n");
      Loose();
      return;
    }

    // For an in-loop post-inc user, don't allow multiple base registers,
    // because that would require an awkward in-loop add after the increment.
    if (LU.PostIncLoop && LU.PostIncLoop->contains(LU.UserInst) &&
        BestFormula->BaseRegs.size() > 1) {
      DEBUG(dbgs() << "Reuse with reg " << *Reg
                   << " would require an in-loop post-inc add: ";
            BestFormula->dump());
      Loose();
      return;
    }

    RateFormula(*BestFormula, Regs, L);
  }
  NumRegs += Regs.size();

  DEBUG(print_details(dbgs(), Reg, Regs));
}

/// operator< - Choose the better score.
bool Score::operator<(const Score &Other) const {
  if (NumRegs != Other.NumRegs)
    return NumRegs < Other.NumRegs;
  if (NumPhis != Other.NumPhis)
    return NumPhis < Other.NumPhis;
  if (NumIVMuls != Other.NumIVMuls)
    return NumIVMuls < Other.NumIVMuls;
  if (NumBaseAdds != Other.NumBaseAdds)
    return NumBaseAdds < Other.NumBaseAdds;
  return NumImms < Other.NumImms;
}

void Score::print_details(raw_ostream &OS,
                          const SCEV *Reg,
                          const SmallPtrSet<const SCEV *, 8> &Regs) const {
  if (Reg) OS << "Reuse with reg " << *Reg << " would require ";
  else     OS << "The initial solution would require ";
  print(OS);
  OS << ". Regs:";
  for (SmallPtrSet<const SCEV *, 8>::const_iterator I = Regs.begin(),
       E = Regs.end(); I != E; ++I)
    OS << ' ' << **I;
  OS << '\n';
}

void Score::print(raw_ostream &OS) const {
  OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
  if (NumPhis != 0)
    OS << ", including " << NumPhis << " PHI" << (NumPhis == 1 ? "" : "s");
  if (NumIVMuls != 0)
    OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
  if (NumBaseAdds != 0)
    OS << ", plus " << NumBaseAdds << " base add"
       << (NumBaseAdds == 1 ? "" : "s");
  if (NumImms != 0)
    OS << ", plus " << NumImms << " imm" << (NumImms == 1 ? "" : "s");
}

void Score::dump() const {
  print(errs()); errs() << '\n';
}

/// isAddressUse - Returns true if the specified instruction is using the
/// specified value as an address.
static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
  bool isAddress = isa<LoadInst>(Inst);
  if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
    if (SI->getOperand(1) == OperandVal)
      isAddress = true;
  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
    // Addressing modes can also be folded into prefetches and a variety
    // of intrinsics.
    switch (II->getIntrinsicID()) {
      default: break;
      case Intrinsic::prefetch:
      case Intrinsic::x86_sse2_loadu_dq:
      case Intrinsic::x86_sse2_loadu_pd:
      case Intrinsic::x86_sse_loadu_ps:
      case Intrinsic::x86_sse_storeu_ps:
      case Intrinsic::x86_sse2_storeu_pd:
      case Intrinsic::x86_sse2_storeu_dq:
      case Intrinsic::x86_sse2_storel_dq:
        if (II->getOperand(1) == OperandVal)
          isAddress = true;
        break;
    }
  }
  return isAddress;
}

/// getAccessType - Return the type of the memory being accessed.
static const Type *getAccessType(const Instruction *Inst) {
  const Type *AccessTy = Inst->getType();
  if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
    AccessTy = SI->getOperand(0)->getType();
  else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
    // Addressing modes can also be folded into prefetches and a variety
    // of intrinsics.
    switch (II->getIntrinsicID()) {
    default: break;
    case Intrinsic::x86_sse_storeu_ps:
    case Intrinsic::x86_sse2_storeu_pd:
    case Intrinsic::x86_sse2_storeu_dq:
    case Intrinsic::x86_sse2_storel_dq:
      AccessTy = II->getOperand(1)->getType();
      break;
    }
  }
  return AccessTy;
}

/// DeleteTriviallyDeadInstructions - If any of the instructions is the
/// specified set are trivially dead, delete them and see if this makes any of
/// their operands subsequently dead.
static bool
DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) {
  bool Changed = false;

  while (!DeadInsts.empty()) {
    Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());

    if (I == 0 || !isInstructionTriviallyDead(I))
      continue;

    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
      if (Instruction *U = dyn_cast<Instruction>(*OI)) {
        *OI = 0;
        if (U->use_empty())
          DeadInsts.push_back(U);
      }

    I->eraseFromParent();
    Changed = true;
  }

  return Changed;
}

namespace {

/// LSRInstance - This class holds state for the main loop strength
/// reduction logic.
class LSRInstance {
  IVUsers &IU;
  ScalarEvolution &SE;
  DominatorTree &DT;
  const TargetLowering *const TLI;
  Loop *const L;
  bool Changed;

  /// IVIncInsertPos - This is the insert position that the current loop's
  /// induction variable increment should be placed. In simple loops, this is
  /// the latch block's terminator. But in more complicated cases, this is
  /// a position which will dominate all the in-loop post-increment users.
  Instruction *IVIncInsertPos;

  /// CurrentArbitraryRegIndex - To ensure a deterministic ordering, assign an
  /// arbitrary index value to each register as a sort tie breaker.
  unsigned CurrentArbitraryRegIndex;

  /// MaxNumRegs - To help prune the search for solutions, identify the number
  /// of registers needed by the initial solution. No formula should require
  /// more than this.
  unsigned MaxNumRegs;

  /// Factors - Interesting factors between use strides.
  SmallSetVector<int64_t, 4> Factors;

  /// Types - Interesting use types, to facilitate truncation reuse.
  SmallSetVector<const Type *, 4> Types;

  /// Uses - The list of interesting uses.
  SmallVector<LSRUse, 16> Uses;

  // TODO: Reorganize these data structures.
  typedef DenseMap<const SCEV *, RegSortData> RegUsesTy;
  RegUsesTy RegUses;
  SmallVector<const SCEV *, 16> RegSequence;

  void OptimizeShadowIV();
  bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse,
                         const SCEV* &CondStride);
  ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
  bool StrideMightBeShared(const SCEV* Stride);
  bool OptimizeLoopTermCond();

  LSRUse &getNewUse() {
    Uses.push_back(LSRUse());
    return Uses.back();
  }

  void CountRegister(const SCEV *Reg, uint32_t Complexity, size_t LUIdx);
  void CountRegisters(const Formula &F, size_t LUIdx);

  bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);

  void GenerateSymbolicOffsetReuse(LSRUse &LU, unsigned LUIdx,
                                   Formula Base);
  void GenerateICmpZeroScaledReuse(LSRUse &LU, unsigned LUIdx,
                                   Formula Base);
  void GenerateFormulaeFromReplacedBaseReg(LSRUse &LU,
                                           unsigned LUIdx,
                                           const Formula &Base, unsigned i,
                                           const SmallVectorImpl<const SCEV *>
                                             &AddOps);
  void GenerateReassociationReuse(LSRUse &LU, unsigned LUIdx,
                                  Formula Base);
  void GenerateCombinationReuse(LSRUse &LU, unsigned LUIdx,
                                Formula Base);
  void GenerateScaledReuse(LSRUse &LU, unsigned LUIdx,
                           Formula Base);
  void GenerateTruncateReuse(LSRUse &LU, unsigned LUIdx,
                             Formula Base);

  void GenerateConstantOffsetReuse();

  void GenerateAllReuseFormulae();

  void GenerateLoopInvariantRegisterUses();

public:
  LSRInstance(const TargetLowering *tli, Loop *l, Pass *P);

  bool getChanged() const { return Changed; }

  void print(raw_ostream &OS) const;
  void dump() const;
};

}

/// OptimizeShadowIV - If IV is used in a int-to-float cast
/// inside the loop then try to eliminate the cast opeation.
void LSRInstance::OptimizeShadowIV() {
  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
    return;

  for (size_t StrideIdx = 0, e = IU.StrideOrder.size();
       StrideIdx != e; ++StrideIdx) {
    std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
      IU.IVUsesByStride.find(IU.StrideOrder[StrideIdx]);
    assert(SI != IU.IVUsesByStride.end() && "Stride doesn't exist!");
    if (!isa<SCEVConstant>(SI->first))
      continue;

    for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(),
           E = SI->second->Users.end(); UI != E; /* empty */) {
      ilist<IVStrideUse>::iterator CandidateUI = UI;
      ++UI;
      Instruction *ShadowUse = CandidateUI->getUser();
      const Type *DestTy = NULL;

      /* If shadow use is a int->float cast then insert a second IV
         to eliminate this cast.

           for (unsigned i = 0; i < n; ++i)
             foo((double)i);

         is transformed into

           double d = 0.0;
           for (unsigned i = 0; i < n; ++i, ++d)
             foo(d);
      */
      if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser()))
        DestTy = UCast->getDestTy();
      else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser()))
        DestTy = SCast->getDestTy();
      if (!DestTy) continue;

      if (TLI) {
        // If target does not support DestTy natively then do not apply
        // this transformation.
        EVT DVT = TLI->getValueType(DestTy);
        if (!TLI->isTypeLegal(DVT)) continue;
      }

      PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
      if (!PH) continue;
      if (PH->getNumIncomingValues() != 2) continue;

      const Type *SrcTy = PH->getType();
      int Mantissa = DestTy->getFPMantissaWidth();
      if (Mantissa == -1) continue;
      if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
        continue;

      unsigned Entry, Latch;
      if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
        Entry = 0;
        Latch = 1;
      } else {
        Entry = 1;
        Latch = 0;
      }

      ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
      if (!Init) continue;
      Constant *NewInit = ConstantFP::get(DestTy, Init->getZExtValue());

      BinaryOperator *Incr =
        dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
      if (!Incr) continue;
      if (Incr->getOpcode() != Instruction::Add
          && Incr->getOpcode() != Instruction::Sub)
        continue;

      /* Initialize new IV, double d = 0.0 in above example. */
      ConstantInt *C = NULL;
      if (Incr->getOperand(0) == PH)
        C = dyn_cast<ConstantInt>(Incr->getOperand(1));
      else if (Incr->getOperand(1) == PH)
        C = dyn_cast<ConstantInt>(Incr->getOperand(0));
      else
        continue;

      if (!C) continue;

      // Ignore negative constants, as the code below doesn't handle them
      // correctly. TODO: Remove this restriction.
      if (!C->getValue().isStrictlyPositive()) continue;

      /* Add new PHINode. */
      PHINode *NewPH = PHINode::Create(DestTy, "IV.S.", PH);

      /* create new increment. '++d' in above example. */
      Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
      BinaryOperator *NewIncr =
        BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
                                 Instruction::FAdd : Instruction::FSub,
                               NewPH, CFP, "IV.S.next.", Incr);

      NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
      NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));

      /* Remove cast operation */
      ShadowUse->replaceAllUsesWith(NewPH);
      ShadowUse->eraseFromParent();
      break;
    }
  }
}

/// FindIVUserForCond - If Cond has an operand that is an expression of an IV,
/// set the IV user and stride information and return true, otherwise return
/// false.
bool LSRInstance::FindIVUserForCond(ICmpInst *Cond,
                                    IVStrideUse *&CondUse,
                                    const SCEV* &CondStride) {
  for (unsigned StrideIdx = 0, e = IU.StrideOrder.size();
       StrideIdx != e && !CondUse; ++StrideIdx) {
    std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
      IU.IVUsesByStride.find(IU.StrideOrder[StrideIdx]);
    assert(SI != IU.IVUsesByStride.end() && "Stride doesn't exist!");

    for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(),
         E = SI->second->Users.end(); UI != E; ++UI)
      if (UI->getUser() == Cond) {
        // NOTE: we could handle setcc instructions with multiple uses here, but
        // InstCombine does it as well for simple uses, it's not clear that it
        // occurs enough in real life to handle.
        CondUse = UI;
        CondStride = SI->first;
        return true;
      }
  }
  return false;
}

/// OptimizeMax - Rewrite the loop's terminating condition if it uses
/// a max computation.
///
/// This is a narrow solution to a specific, but acute, problem. For loops
/// like this:
///
///   i = 0;
///   do {
///     p[i] = 0.0;
///   } while (++i < n);
///
/// the trip count isn't just 'n', because 'n' might not be positive. And
/// unfortunately this can come up even for loops where the user didn't use
/// a C do-while loop. For example, seemingly well-behaved top-test loops
/// will commonly be lowered like this:
//
///   if (n > 0) {
///     i = 0;
///     do {
///       p[i] = 0.0;
///     } while (++i < n);
///   }
///
/// and then it's possible for subsequent optimization to obscure the if
/// test in such a way that indvars can't find it.
///
/// When indvars can't find the if test in loops like this, it creates a
/// max expression, which allows it to give the loop a canonical
/// induction variable:
///
///   i = 0;
///   max = n < 1 ? 1 : n;
///   do {
///     p[i] = 0.0;
///   } while (++i != max);
///
/// Canonical induction variables are necessary because the loop passes
/// are designed around them. The most obvious example of this is the
/// LoopInfo analysis, which doesn't remember trip count values. It
/// expects to be able to rediscover the trip count each time it is
/// needed, and it does this using a simple analysis that only succeeds if
/// the loop has a canonical induction variable.
///
/// However, when it comes time to generate code, the maximum operation
/// can be quite costly, especially if it's inside of an outer loop.
///
/// This function solves this problem by detecting this type of loop and
/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
/// the instructions for the maximum computation.
///
ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
  // Check that the loop matches the pattern we're looking for.
  if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
      Cond->getPredicate() != CmpInst::ICMP_NE)
    return Cond;

  SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
  if (!Sel || !Sel->hasOneUse()) return Cond;

  const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
  if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
    return Cond;
  const SCEV *One = SE.getIntegerSCEV(1, BackedgeTakenCount->getType());

  // Add one to the backedge-taken count to get the trip count.
  const SCEV *IterationCount = SE.getAddExpr(BackedgeTakenCount, One);

  // Check for a max calculation that matches the pattern.
  if (!isa<SCEVSMaxExpr>(IterationCount) && !isa<SCEVUMaxExpr>(IterationCount))
    return Cond;
  const SCEVNAryExpr *Max = cast<SCEVNAryExpr>(IterationCount);
  if (Max != SE.getSCEV(Sel)) return Cond;

  // To handle a max with more than two operands, this optimization would
  // require additional checking and setup.
  if (Max->getNumOperands() != 2)
    return Cond;

  const SCEV *MaxLHS = Max->getOperand(0);
  const SCEV *MaxRHS = Max->getOperand(1);
  if (!MaxLHS || MaxLHS != One) return Cond;
  // Check the relevant induction variable for conformance to
  // the pattern.
  const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
  if (!AR || !AR->isAffine() ||
      AR->getStart() != One ||
      AR->getStepRecurrence(SE) != One)
    return Cond;

  assert(AR->getLoop() == L &&
         "Loop condition operand is an addrec in a different loop!");

  // Check the right operand of the select, and remember it, as it will
  // be used in the new comparison instruction.
  Value *NewRHS = 0;
  if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
    NewRHS = Sel->getOperand(1);
  else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
    NewRHS = Sel->getOperand(2);
  if (!NewRHS) return Cond;

  // Determine the new comparison opcode. It may be signed or unsigned,
  // and the original comparison may be either equality or inequality.
  CmpInst::Predicate Pred =
    isa<SCEVSMaxExpr>(Max) ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
  if (Cond->getPredicate() == CmpInst::ICMP_EQ)
    Pred = CmpInst::getInversePredicate(Pred);

  // Ok, everything looks ok to change the condition into an SLT or SGE and
  // delete the max calculation.
  ICmpInst *NewCond =
    new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");

  // Delete the max calculation instructions.
  Cond->replaceAllUsesWith(NewCond);
  CondUse->setUser(NewCond);
  Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
  Cond->eraseFromParent();
  Sel->eraseFromParent();
  if (Cmp->use_empty())
    Cmp->eraseFromParent();
  return NewCond;
}

bool LSRInstance::StrideMightBeShared(const SCEV* Stride) {
  int64_t SInt = cast<SCEVConstant>(Stride)->getValue()->getSExtValue();
  for (unsigned i = 0, e = IU.StrideOrder.size(); i != e; ++i) {
    std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI =
      IU.IVUsesByStride.find(IU.StrideOrder[i]);
    const SCEV *Share = SI->first;
    if (!isa<SCEVConstant>(SI->first) || Share == Stride)
      continue;
    int64_t SSInt = cast<SCEVConstant>(Share)->getValue()->getSExtValue();
    if (SSInt == SInt)
      return true; // This can definitely be reused.
    if (unsigned(abs64(SSInt)) < SInt || (SSInt % SInt) != 0)
      continue;
    int64_t Scale = SSInt / SInt;

    // This AM will be used for conservative queries. At this point in the
    // process we don't know which users will have a base reg, immediate,
    // etc., so we conservatively assume that it may not, making more
    // strides valid, thus erring on the side of assuming that there
    // might be sharing.
    TargetLowering::AddrMode AM;
    AM.Scale = Scale;

    // Any pre-inc iv use?
    IVUsersOfOneStride &StrideUses = *IU.IVUsesByStride[Share];
    for (ilist<IVStrideUse>::iterator I = StrideUses.Users.begin(),
           E = StrideUses.Users.end(); I != E; ++I) {
      bool isAddress = isAddressUse(I->getUser(), I->getOperandValToReplace());
      if (!I->isUseOfPostIncrementedValue() &&
          isLegalUse(AM, isAddress ? LSRUse::Address : LSRUse::Basic,
                     isAddress ? getAccessType(I->getUser()) : 0,
                     TLI))
        return true;
    }
  }
  return false;
}

/// OptimizeLoopTermCond - Change loop terminating condition to use the
/// postinc iv when possible.
bool
LSRInstance::OptimizeLoopTermCond() {
  SmallPtrSet<Instruction *, 4> PostIncs;

  BasicBlock *LatchBlock = L->getLoopLatch();
  SmallVector<BasicBlock*, 8> ExitingBlocks;
  L->getExitingBlocks(ExitingBlocks);

  for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
    BasicBlock *ExitingBlock = ExitingBlocks[i];

    // Get the terminating condition for the loop if possible.  If we
    // can, we want to change it to use a post-incremented version of its
    // induction variable, to allow coalescing the live ranges for the IV into
    // one register value.

    BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
    if (!TermBr)
      continue;
    // FIXME: Overly conservative, termination condition could be an 'or' etc..
    if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
      continue;

    // Search IVUsesByStride to find Cond's IVUse if there is one.
    IVStrideUse *CondUse = 0;
    const SCEV *CondStride = 0;
    ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
    if (!FindIVUserForCond(Cond, CondUse, CondStride))
      continue;

    // If the trip count is computed in terms of a max (due to ScalarEvolution
    // being unable to find a sufficient guard, for example), change the loop
    // comparison to use SLT or ULT instead of NE.
    // One consequence of doing this now is that it disrupts the count-down
    // optimization. That's not always a bad thing though, because in such
    // cases it may still be worthwhile to avoid a max.
    Cond = OptimizeMax(Cond, CondUse);

    // If this exiting block is the latch block, and the condition has only
    // one use inside the loop (the branch), use the post-incremented value
    // of the induction variable
    if (ExitingBlock != LatchBlock) {
      // If this exiting block dominates the latch block, it may also use
      // the post-inc value if it won't be shared with other uses.
      // Check for dominance.
      if (!DT.dominates(ExitingBlock, LatchBlock))
        continue;
      // Check for sharing within the same stride.
      bool SameStrideSharing = false;
      IVUsersOfOneStride &StrideUses = *IU.IVUsesByStride[CondStride];
      for (ilist<IVStrideUse>::iterator I = StrideUses.Users.begin(),
             E = StrideUses.Users.end(); I != E; ++I) {
        if (I->getUser() == Cond)
          continue;
        if (!I->isUseOfPostIncrementedValue()) {
          SameStrideSharing = true;
          break;
        }
      }
      if (SameStrideSharing)
        continue;
      // Check for sharing from a different stride.
      if (isa<SCEVConstant>(CondStride) && StrideMightBeShared(CondStride))
        continue;
    }
    if (!Cond->hasOneUse()) {
      bool HasOneUseInLoop = true;
      for (Value::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
           UI != UE; ++UI) {
        Instruction *U = cast<Instruction>(*UI);
        if (U == TermBr)
          continue;
        if (L->contains(U)) {
          HasOneUseInLoop = false;
          break;
        }
      }
      if (!HasOneUseInLoop)
        continue;
    }

    DEBUG(dbgs() << "  Change loop exiting icmp to use postinc iv: "
                 << *Cond << '\n');

    // It's possible for the setcc instruction to be anywhere in the loop, and
    // possible for it to have multiple users.  If it is not immediately before
    // the exiting block branch, move it.
    if (&*++BasicBlock::iterator(Cond) != TermBr) {
      if (Cond->hasOneUse()) {   // Condition has a single use, just move it.
        Cond->moveBefore(TermBr);
      } else {
        // Otherwise, clone the terminating condition and insert into the
        // loopend.
        ICmpInst *OldCond = Cond;
        Cond = cast<ICmpInst>(Cond->clone());
        Cond->setName(L->getHeader()->getName() + ".termcond");
        ExitingBlock->getInstList().insert(TermBr, Cond);

        // Clone the IVUse, as the old use still exists!
        IU.IVUsesByStride[CondStride]->addUser(CondUse->getOffset(), Cond,
                                             CondUse->getOperandValToReplace());
        CondUse = &IU.IVUsesByStride[CondStride]->Users.back();
        TermBr->replaceUsesOfWith(OldCond, Cond);
      }
    }

    // If we get to here, we know that we can transform the setcc instruction to
    // use the post-incremented version of the IV, allowing us to coalesce the
    // live ranges for the IV correctly.
    CondUse->setOffset(SE.getMinusSCEV(CondUse->getOffset(), CondStride));
    CondUse->setIsUseOfPostIncrementedValue(true);
    Changed = true;

    PostIncs.insert(Cond);
  }

  // Determine an insertion point for the loop induction variable increment. It
  // must dominate all the post-inc comparisons we just set up, and it must
  // dominate the loop latch edge.
  IVIncInsertPos = L->getLoopLatch()->getTerminator();
  for (SmallPtrSet<Instruction *, 4>::iterator I = PostIncs.begin(),
       E = PostIncs.end(); I != E; ++I) {
    BasicBlock *BB =
      DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
                                    (*I)->getParent());
    if (BB == (*I)->getParent())
      IVIncInsertPos = *I;
    else if (BB != IVIncInsertPos->getParent())
      IVIncInsertPos = BB->getTerminator();
  }

  return Changed;
}

/// CountRegisters - Note the given register.
void LSRInstance::CountRegister(const SCEV *Reg, uint32_t Complexity,
                                size_t LUIdx) {
  std::pair<RegUsesTy::iterator, bool> Pair =
    RegUses.insert(std::make_pair(Reg, RegSortData()));
  RegSortData &BV = Pair.first->second;
  if (Pair.second) {
    BV.Index = CurrentArbitraryRegIndex++;
    BV.MaxComplexity = Complexity;
    RegSequence.push_back(Reg);
  } else {
    BV.MaxComplexity = std::max(BV.MaxComplexity, Complexity);
  }
  BV.Bits.resize(std::max(BV.Bits.size(), LUIdx + 1));
  BV.Bits.set(LUIdx);
}

/// CountRegisters - Note which registers are used by the given formula,
/// updating RegUses.
void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
  uint32_t Complexity = F.getComplexity();
  if (F.ScaledReg)
    CountRegister(F.ScaledReg, Complexity, LUIdx);
  for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
       E = F.BaseRegs.end(); I != E; ++I)
    CountRegister(*I, Complexity, LUIdx);
}

/// InsertFormula - If the given formula has not yet been inserted, add it
/// to the list, and return true. Return false otherwise.
bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
  // If a formula by itself would require more registers than the base solution,
  // discard it and stop searching from it, as it won't be profitable. This is
  // actually more permissive than it could be, because it doesn't include
  // registers used by non-constant strides in F.
  if (F.getNumRegs() > MaxNumRegs)
    return false;

  if (!LU.InsertFormula(F))
    return false;

  CountRegisters(LU.Formulae.back(), LUIdx);
  return true;
}

/// GenerateSymbolicOffsetReuse - Generate reuse formulae using symbolic
/// offsets.
void LSRInstance::GenerateSymbolicOffsetReuse(LSRUse &LU, unsigned LUIdx,
                                              Formula Base) {
  // We can't add a symbolic offset if the address already contains one.
  if (Base.AM.BaseGV) return;

  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
    const SCEV *G = Base.BaseRegs[i];
    GlobalValue *GV = ExtractSymbol(G, SE);
    if (G->isZero())
      continue;
    Formula F = Base;
    F.AM.BaseGV = GV;
    if (!isLegalUse(F.AM, LU.Kind, LU.AccessTy, TLI))
      continue;
    F.BaseRegs[i] = G;
    (void)InsertFormula(LU, LUIdx, F);
  }
}

/// GenerateICmpZeroScaledReuse - For ICmpZero, check to see if we can scale up
/// the comparison. For example, x == y -> x*c == y*c.
void LSRInstance::GenerateICmpZeroScaledReuse(LSRUse &LU, unsigned LUIdx,
                                              Formula Base) {
  if (LU.Kind != LSRUse::ICmpZero) return;

  // Determine the integer type for the base formula.
  const Type *IntTy = Base.getType();
  if (!IntTy) return;
  if (SE.getTypeSizeInBits(IntTy) > 64) return;
  IntTy = SE.getEffectiveSCEVType(IntTy);

  assert(!Base.AM.BaseGV && "ICmpZero use is not legal!");

  // Check each interesting stride.
  for (SmallSetVector<int64_t, 4>::const_iterator
       I = Factors.begin(), E = Factors.end(); I != E; ++I) {
    int64_t Factor = *I;
    Formula F = Base;

    // Check that the multiplication doesn't overflow.
    F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs * Factor;
    if ((int64_t)F.AM.BaseOffs / Factor != F.AM.BaseOffs)