LoopStrengthReduce.cpp

//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This transformation analyzes and transforms the induction variables (and
// computations derived from them) into forms suitable for efficient execution
// on the target.
//
// This pass performs a strength reduction on array references inside loops that
// have as one or more of their components the loop induction variable, it
// rewrites expressions to take advantage of scaled-index addressing modes
// available on the target, and it performs a variety of other optimizations
// related to loop induction variables.
//
// Terminology note: this code has a lot of handling for "post-increment" or
// "post-inc" users. This is not talking about post-increment addressing modes;
// it is instead talking about code like this:
//
//   %i = phi [ 0, %entry ], [ %i.next, %latch ]
//   ...
//   %i.next = add %i, 1
//   %c = icmp eq %i.next, %n
//
// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
// it's useful to think about these as the same register, with some uses using
// the value of the register before the add and some using // it after. In this
// example, the icmp is a post-increment user, since it uses %i.next, which is
// the value of the induction variable after the increment. The other common
// case of post-increment users is users outside the loop.
//
// TODO: More sophistication in the way Formulae are generated.
//
// TODO: Handle multiple loops at a time.
//
// TODO: test/CodeGen/X86/full-lsr.ll should get full lsr. The problem is
//       that {0,+,1}<%bb> is getting picked first because all 7 uses can
//       use it, and while it's a pretty good solution, it means that LSR
//       doesn't look further to find an even better solution.
//
// TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr
//       instead of a GlobalValue?
//
// TODO: When truncation is free, truncate ICmp users' operands to make it a
//       smaller encoding (on x86 at least).
//
// TODO: When a negated register is used by an add (such as in a list of
//       multiple base registers, or as the increment expression in an addrec),
//       we may not actually need both reg and (-1 * reg) in registers; the
//       negation can be implemented by using a sub instead of an add. The
//       lack of support for taking this into consideration when making
//       register pressure decisions is partly worked around by the "Special"
//       use kind.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "loop-reduce"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Constants.h"
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Analysis/IVUsers.h"
#include "llvm/Analysis/Dominators.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ValueHandle.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetLowering.h"
#include <algorithm>
using namespace llvm;

namespace {

// Constant strides come first which in turns are sorted by their absolute
// values. If absolute values are the same, then positive strides comes first.
// e.g.
// 4, -1, X, 1, 2 ==> 1, -1, 2, 4, X
struct StrideCompare {
  const ScalarEvolution &SE;
  explicit StrideCompare(const ScalarEvolution &se) : SE(se) {}

  bool operator()(const SCEV *const &LHS, const SCEV *const &RHS) const {
    const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS);
    const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS);
    if (LHSC && RHSC) {
      unsigned BitWidth = std::max(SE.getTypeSizeInBits(LHS->getType()),
                                   SE.getTypeSizeInBits(RHS->getType()));
      APInt  LV = LHSC->getValue()->getValue();
      APInt  RV = RHSC->getValue()->getValue();
      LV.sextOrTrunc(BitWidth);
      RV.sextOrTrunc(BitWidth);
      APInt ALV = LV.abs();
      APInt ARV = RV.abs();
      if (ALV == ARV) {
        if (LV != RV)
          return LV.sgt(RV);
      } else {
        return ALV.ult(ARV);
      }

      // If it's the same value but different type, sort by bit width so
      // that we emit larger induction variables before smaller
      // ones, letting the smaller be re-written in terms of larger ones.
      return SE.getTypeSizeInBits(RHS->getType()) <
             SE.getTypeSizeInBits(LHS->getType());
    }
    return LHSC && !RHSC;
  }
};

/// RegSortData - This class holds data which is used to order reuse
/// candidates.
class RegSortData {
public:
  /// Bits - This represents the set of LSRUses (by index) which reference a
  /// particular register.
  SmallBitVector Bits;

  /// MaxComplexity - This represents the greatest complexity (see the comments
  /// on Formula::getComplexity) seen with a particular register.
  uint32_t MaxComplexity;

  /// Index - This holds an arbitrary value used as a last-resort tie breaker
  /// to ensure deterministic behavior.
  unsigned Index;

  RegSortData() {}

  void print(raw_ostream &OS) const;
  void dump() const;
};

}

void RegSortData::print(raw_ostream &OS) const {
  OS << "[NumUses=" << Bits.count()
     << ", MaxComplexity=";
  OS.write_hex(MaxComplexity);
  OS << ", Index=" << Index << ']';
}

void RegSortData::dump() const {
  print(errs()); errs() << '\n';
}

namespace {

/// RegCount - This is a helper class to sort a given set of registers
/// according to associated RegSortData values.
class RegCount {
public:
  const SCEV *Reg;
  RegSortData Sort;

  RegCount(const SCEV *R, const RegSortData &RSD)
    : Reg(R), Sort(RSD) {}

  // Sort by count. Returning true means the register is preferred.
  bool operator<(const RegCount &Other) const {
    // Sort by the number of unique uses of this register.
    unsigned A = Sort.Bits.count();
    unsigned B = Other.Sort.Bits.count();
    if (A != B) return A > B;

    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
      const SCEVAddRecExpr *BR = dyn_cast<SCEVAddRecExpr>(Other.Reg);
      // AddRecs have higher priority than other things.
      if (!BR) return true;

      // Prefer affine values.
      if (AR->isAffine() != BR->isAffine())
        return AR->isAffine();

      const Loop *AL = AR->getLoop();
      const Loop *BL = BR->getLoop();
      if (AL != BL) {
        unsigned ADepth = AL->getLoopDepth();
        unsigned BDepth = BL->getLoopDepth();
        // Prefer a less deeply nested addrec.
        if (ADepth != BDepth)
          return ADepth < BDepth;

        // Different loops at the same depth; do something arbitrary.
        BasicBlock *AH = AL->getHeader();
        BasicBlock *BH = BL->getHeader();
        for (Function::iterator I = AH, E = AH->getParent()->end(); I != E; ++I)
          if (&*I == BH) return true;
        return false;
      }