LoopStrengthReduce.cpp

       E = F.BaseRegs.end(); I != E; ++I) {
    const SCEV *Reg = *I;
    assert(!Reg->isZero() && "Zero allocated in a base register!");

    // If we're expanding for a post-inc user, make the post-inc adjustment.
    PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
    Reg = TransformForPostIncUse(Denormalize, Reg,
                                 LF.UserInst, LF.OperandValToReplace,
                                 Loops, SE, DT);

    Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP)));
  }

  // Flush the operand list to suppress SCEVExpander hoisting.
  if (!Ops.empty()) {
    Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
    Ops.clear();
    Ops.push_back(SE.getUnknown(FullV));
  }

  // Expand the ScaledReg portion.
  Value *ICmpScaledV = 0;
  if (F.AM.Scale != 0) {
    const SCEV *ScaledS = F.ScaledReg;

    // If we're expanding for a post-inc user, make the post-inc adjustment.
    PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
    ScaledS = TransformForPostIncUse(Denormalize, ScaledS,
                                     LF.UserInst, LF.OperandValToReplace,
                                     Loops, SE, DT);

    if (LU.Kind == LSRUse::ICmpZero) {
      // An interesting way of "folding" with an icmp is to use a negated
      // scale, which we'll implement by inserting it into the other operand
      // of the icmp.
      assert(F.AM.Scale == -1 &&
             "The only scale supported by ICmpZero uses is -1!");
      ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP);
    } else {
      // Otherwise just expand the scaled register and an explicit scale,
      // which is expected to be matched as part of the address.
      ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP));
      ScaledS = SE.getMulExpr(ScaledS,
                              SE.getConstant(ScaledS->getType(), F.AM.Scale));
      Ops.push_back(ScaledS);

      // Flush the operand list to suppress SCEVExpander hoisting.
      Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
      Ops.clear();
      Ops.push_back(SE.getUnknown(FullV));
    }
  }

  // Expand the GV portion.
  if (F.AM.BaseGV) {
    Ops.push_back(SE.getUnknown(F.AM.BaseGV));

    // Flush the operand list to suppress SCEVExpander hoisting.
    Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP);
    Ops.clear();
    Ops.push_back(SE.getUnknown(FullV));
  }

  // Expand the immediate portion.
  int64_t Offset = (uint64_t)F.AM.BaseOffs + LF.Offset;
  if (Offset != 0) {
    if (LU.Kind == LSRUse::ICmpZero) {
      // The other interesting way of "folding" with an ICmpZero is to use a
      // negated immediate.
      if (!ICmpScaledV)
        ICmpScaledV = ConstantInt::get(IntTy, -Offset);
      else {
        Ops.push_back(SE.getUnknown(ICmpScaledV));
        ICmpScaledV = ConstantInt::get(IntTy, Offset);
      }
    } else {
      // Just add the immediate values. These again are expected to be matched
      // as part of the address.
      Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
    }
  }

  // Emit instructions summing all the operands.
  const SCEV *FullS = Ops.empty() ?
                      SE.getConstant(IntTy, 0) :
                      SE.getAddExpr(Ops);
  Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP);

  // We're done expanding now, so reset the rewriter.
  Rewriter.clearPostInc();

  // An ICmpZero Formula represents an ICmp which we're handling as a
  // comparison against zero. Now that we've expanded an expression for that
  // form, update the ICmp's other operand.
  if (LU.Kind == LSRUse::ICmpZero) {
    ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
    DeadInsts.push_back(CI->getOperand(1));
    assert(!F.AM.BaseGV && "ICmp does not support folding a global value and "
                           "a scale at the same time!");
    if (F.AM.Scale == -1) {
      if (ICmpScaledV->getType() != OpTy) {
        Instruction *Cast =
          CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
                                                   OpTy, false),
                           ICmpScaledV, OpTy, "tmp", CI);
        ICmpScaledV = Cast;
      }
      CI->setOperand(1, ICmpScaledV);
    } else {
      assert(F.AM.Scale == 0 &&
             "ICmp does not support folding a global value and "
             "a scale at the same time!");
      Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
                                           -(uint64_t)Offset);
      if (C->getType() != OpTy)
        C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
                                                          OpTy, false),
                                  C, OpTy);

      CI->setOperand(1, C);
    }
  }

  return FullV;
}

/// RewriteForPHI - Helper for Rewrite. PHI nodes are special because the use
/// of their operands effectively happens in their predecessor blocks, so the
/// expression may need to be expanded in multiple places.
void LSRInstance::RewriteForPHI(PHINode *PN,
                                const LSRFixup &LF,
                                const Formula &F,
                                SCEVExpander &Rewriter,
                                SmallVectorImpl<WeakVH> &DeadInsts,
                                Pass *P) const {
  DenseMap<BasicBlock *, Value *> Inserted;
  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
    if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
      BasicBlock *BB = PN->getIncomingBlock(i);

      // If this is a critical edge, split the edge so that we do not insert
      // the code on all predecessor/successor paths.  We do this unless this
      // is the canonical backedge for this loop, which complicates post-inc
      // users.
      if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
          !isa<IndirectBrInst>(BB->getTerminator()) &&
          (PN->getParent() != L->getHeader() || !L->contains(BB))) {
        // Split the critical edge.
        BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P);

        // If PN is outside of the loop and BB is in the loop, we want to
        // move the block to be immediately before the PHI block, not
        // immediately after BB.
        if (L->contains(BB) && !L->contains(PN))
          NewBB->moveBefore(PN->getParent());

        // Splitting the edge can reduce the number of PHI entries we have.
        e = PN->getNumIncomingValues();
        BB = NewBB;
        i = PN->getBasicBlockIndex(BB);
      }

      std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
        Inserted.insert(std::make_pair(BB, static_cast<Value *>(0)));
      if (!Pair.second)
        PN->setIncomingValue(i, Pair.first->second);
      else {
        Value *FullV = Expand(LF, F, BB->getTerminator(), Rewriter, DeadInsts);

        // If this is reuse-by-noop-cast, insert the noop cast.
        const Type *OpTy = LF.OperandValToReplace->getType();
        if (FullV->getType() != OpTy)
          FullV =
            CastInst::Create(CastInst::getCastOpcode(FullV, false,
                                                     OpTy, false),
                             FullV, LF.OperandValToReplace->getType(),
                             "tmp", BB->getTerminator());

        PN->setIncomingValue(i, FullV);
        Pair.first->second = FullV;
      }
    }
}

/// Rewrite - Emit instructions for the leading candidate expression for this
/// LSRUse (this is called "expanding"), and update the UserInst to reference
/// the newly expanded value.
void LSRInstance::Rewrite(const LSRFixup &LF,
                          const Formula &F,
                          SCEVExpander &Rewriter,
                          SmallVectorImpl<WeakVH> &DeadInsts,
                          Pass *P) const {
  // First, find an insertion point that dominates UserInst. For PHI nodes,
  // find the nearest block which dominates all the relevant uses.
  if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
    RewriteForPHI(PN, LF, F, Rewriter, DeadInsts, P);
  } else {
    Value *FullV = Expand(LF, F, LF.UserInst, Rewriter, DeadInsts);

    // If this is reuse-by-noop-cast, insert the noop cast.
    const Type *OpTy = LF.OperandValToReplace->getType();
    if (FullV->getType() != OpTy) {
      Instruction *Cast =
        CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
                         FullV, OpTy, "tmp", LF.UserInst);
      FullV = Cast;
    }

    // Update the user. ICmpZero is handled specially here (for now) because
    // Expand may have updated one of the operands of the icmp already, and
    // its new value may happen to be equal to LF.OperandValToReplace, in
    // which case doing replaceUsesOfWith leads to replacing both operands
    // with the same value. TODO: Reorganize this.
    if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero)
      LF.UserInst->setOperand(0, FullV);
    else
      LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
  }

  DeadInsts.push_back(LF.OperandValToReplace);
}

void
LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
                               Pass *P) {
  // Keep track of instructions we may have made dead, so that
  // we can remove them after we are done working.
  SmallVector<WeakVH, 16> DeadInsts;

  SCEVExpander Rewriter(SE);
  Rewriter.disableCanonicalMode();
  Rewriter.setIVIncInsertPos(L, IVIncInsertPos);

  // Expand the new value definitions and update the users.
  for (size_t i = 0, e = Fixups.size(); i != e; ++i) {
    size_t LUIdx = Fixups[i].LUIdx;

    Rewrite(Fixups[i], *Solution[LUIdx], Rewriter, DeadInsts, P);

    Changed = true;
  }

  // Clean up after ourselves. This must be done before deleting any
  // instructions.
  Rewriter.clear();

  Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
}

LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P)
  : IU(P->getAnalysis<IVUsers>()),
    SE(P->getAnalysis<ScalarEvolution>()),
    DT(P->getAnalysis<DominatorTree>()),
    LI(P->getAnalysis<LoopInfo>()),
    TLI(tli), L(l), Changed(false), IVIncInsertPos(0) {

  // If LoopSimplify form is not available, stay out of trouble.
  if (!L->isLoopSimplifyForm()) return;

  // If there's no interesting work to be done, bail early.
  if (IU.empty()) return;

  DEBUG(dbgs() << "\nLSR on loop ";
        WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false);
        dbgs() << ":\n");

  /// OptimizeShadowIV - If IV is used in a int-to-float cast
  /// inside the loop then try to eliminate the cast operation.
  OptimizeShadowIV();

  // Change loop terminating condition to use the postinc iv when possible.
  Changed |= OptimizeLoopTermCond();

  CollectInterestingTypesAndFactors();
  CollectFixupsAndInitialFormulae();
  CollectLoopInvariantFixupsAndFormulae();

  DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
        print_uses(dbgs()));

  // Now use the reuse data to generate a bunch of interesting ways
  // to formulate the values needed for the uses.
  GenerateAllReuseFormulae();

  DEBUG(dbgs() << "\n"
                  "After generating reuse formulae:\n";
        print_uses(dbgs()));

  FilterOutUndesirableDedicatedRegisters();
  NarrowSearchSpaceUsingHeuristics();

  SmallVector<const Formula *, 8> Solution;
  Solve(Solution);
  assert(Solution.size() == Uses.size() && "Malformed solution!");

  // Release memory that is no longer needed.
  Factors.clear();
  Types.clear();
  RegUses.clear();

#ifndef NDEBUG
  // Formulae should be legal.
  for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
       E = Uses.end(); I != E; ++I) {
     const LSRUse &LU = *I;
     for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
          JE = LU.Formulae.end(); J != JE; ++J)
        assert(isLegalUse(J->AM, LU.MinOffset, LU.MaxOffset,
                          LU.Kind, LU.AccessTy, TLI) &&
               "Illegal formula generated!");
  };
#endif

  // Now that we've decided what we want, make it so.
  ImplementSolution(Solution, P);
}

void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
  if (Factors.empty() && Types.empty()) return;

  OS << "LSR has identified the following interesting factors and types: ";
  bool First = true;

  for (SmallSetVector<int64_t, 8>::const_iterator
       I = Factors.begin(), E = Factors.end(); I != E; ++I) {
    if (!First) OS << ", ";
    First = false;
    OS << '*' << *I;
  }

  for (SmallSetVector<const Type *, 4>::const_iterator
       I = Types.begin(), E = Types.end(); I != E; ++I) {
    if (!First) OS << ", ";
    First = false;
    OS << '(' << **I << ')';
  }
  OS << '\n';
}

void LSRInstance::print_fixups(raw_ostream &OS) const {
  OS << "LSR is examining the following fixup sites:\n";
  for (SmallVectorImpl<LSRFixup>::const_iterator I = Fixups.begin(),
       E = Fixups.end(); I != E; ++I) {
    const LSRFixup &LF = *I;
    dbgs() << "  ";
    LF.print(OS);
    OS << '\n';
  }
}

void LSRInstance::print_uses(raw_ostream &OS) const {
  OS << "LSR is examining the following uses:\n";
  for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
       E = Uses.end(); I != E; ++I) {
    const LSRUse &LU = *I;
    dbgs() << "  ";
    LU.print(OS);
    OS << '\n';
    for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(),
         JE = LU.Formulae.end(); J != JE; ++J) {
      OS << "    ";
      J->print(OS);
      OS << '\n';
    }
  }
}

void LSRInstance::print(raw_ostream &OS) const {
  print_factors_and_types(OS);
  print_fixups(OS);
  print_uses(OS);
}

void LSRInstance::dump() const {
  print(errs()); errs() << '\n';
}

namespace {

class LoopStrengthReduce : public LoopPass {
  /// TLI - Keep a pointer of a TargetLowering to consult for determining
  /// transformation profitability.
  const TargetLowering *const TLI;

public:
  static char ID; // Pass ID, replacement for typeid
  explicit LoopStrengthReduce(const TargetLowering *tli = 0);

private:
  bool runOnLoop(Loop *L, LPPassManager &LPM);
  void getAnalysisUsage(AnalysisUsage &AU) const;
};

}

char LoopStrengthReduce::ID = 0;
static RegisterPass<LoopStrengthReduce>
X("loop-reduce", "Loop Strength Reduction");

Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
  return new LoopStrengthReduce(TLI);
}

LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli)
  : LoopPass(&ID), TLI(tli) {}

void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
  // We split critical edges, so we change the CFG.  However, we do update
  // many analyses if they are around.
  AU.addPreservedID(LoopSimplifyID);
  AU.addPreserved("domfrontier");

  AU.addRequired<LoopInfo>();
  AU.addPreserved<LoopInfo>();
  AU.addRequiredID(LoopSimplifyID);
  AU.addRequired<DominatorTree>();
  AU.addPreserved<DominatorTree>();
  AU.addRequired<ScalarEvolution>();
  AU.addPreserved<ScalarEvolution>();
  AU.addRequired<IVUsers>();
  AU.addPreserved<IVUsers>();
}

bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
  bool Changed = false;

  // Run the main LSR transformation.
  Changed |= LSRInstance(TLI, L, this).getChanged();

  // At this point, it is worth checking to see if any recurrence PHIs are also
  // dead, so that we can remove them as well.
  Changed |= DeleteDeadPHIs(L->getHeader());

  return Changed;
}