/// GenerateSymbolicOffsets - Generate reuse formulae using symbolic offsets.
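/// For example, if a base register computes &G + x for some global G, the &G
/// portion can be hoisted into the addressing mode's BaseGV field, leaving
/// just reg(x) behind.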
void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
                                          Formula Base) {
  // We can't add a symbolic offset if the address already contains one.
  if (Base.AM.BaseGV) return;

  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
    const SCEV *G = Base.BaseRegs[i];
    GlobalValue *GV = ExtractSymbol(G, SE);
    if (G->isZero() || !GV)
      continue;
    Formula F = Base;
    F.AM.BaseGV = GV;
    if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
                    LU.Kind, LU.AccessTy, TLI))
      continue;
    F.BaseRegs[i] = G;
    (void)InsertFormula(LU, LUIdx, F);
  }
}

/// GenerateConstantOffsets - Generate reuse formulae using constant offsets.
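/// For example, reg(x + 4) can be rewritten as 4 + reg(x), folding the
/// constant into the addressing mode's immediate offset field, and conversely
/// a fixup offset can be folded out of the immediate and into the register.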
void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
                                          Formula Base) {
  // TODO: For now, just add the min and max offset, because it usually isn't
  // worthwhile looking at everything in between.
  SmallVector<int64_t, 4> Worklist;
  Worklist.push_back(LU.MinOffset);
  if (LU.MaxOffset != LU.MinOffset)
    Worklist.push_back(LU.MaxOffset);

  for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
    const SCEV *G = Base.BaseRegs[i];

    for (SmallVectorImpl<int64_t>::const_iterator I = Worklist.begin(),
         E = Worklist.end(); I != E; ++I) {
      Formula F = Base;
      F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs - *I;
      if (isLegalUse(F.AM, LU.MinOffset - *I, LU.MaxOffset - *I,
                     LU.Kind, LU.AccessTy, TLI)) {
        F.BaseRegs[i] = SE.getAddExpr(G, SE.getIntegerSCEV(*I, G->getType()));

        (void)InsertFormula(LU, LUIdx, F);
      }
    }

    int64_t Imm = ExtractImmediate(G, SE);
    if (G->isZero() || Imm == 0)
      continue;
    Formula F = Base;
    F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Imm;
    if (!isLegalUse(F.AM, LU.MinOffset, LU.MaxOffset,
                    LU.Kind, LU.AccessTy, TLI))
      continue;
    F.BaseRegs[i] = G;
    (void)InsertFormula(LU, LUIdx, F);
  }
}

/// GenerateICmpZeroScales - For ICmpZero, check to see if we can scale up
/// the comparison. For example, x == y -> x*c == y*c.
void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
                                         Formula Base) {
  if (LU.Kind != LSRUse::ICmpZero) return;

  // Determine the integer type for the base formula.
  const Type *IntTy = Base.getType();
  if (!IntTy) return;
  if (SE.getTypeSizeInBits(IntTy) > 64) return;

  // Don't do this if there is more than one offset.
  if (LU.MinOffset != LU.MaxOffset) return;

  assert(!Base.AM.BaseGV && "ICmpZero use is not legal!");

  // Check each interesting stride.
  for (SmallSetVector<int64_t, 8>::const_iterator
       I = Factors.begin(), E = Factors.end(); I != E; ++I) {
    int64_t Factor = *I;
    Formula F = Base;

    // Check that the multiplication doesn't overflow.
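    // (Multiply and then divide back; if the round trip doesn't reproduce the
    // original value, the multiplication must have wrapped.)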
    F.AM.BaseOffs = (uint64_t)Base.AM.BaseOffs * Factor;
    if ((int64_t)F.AM.BaseOffs / Factor != Base.AM.BaseOffs)
      continue;

    // Check that multiplying with the use offset doesn't overflow.
    int64_t Offset = LU.MinOffset;
    Offset = (uint64_t)Offset * Factor;
    if ((int64_t)Offset / Factor != LU.MinOffset)
      continue;

    // Check that this scale is legal.
    if (!isLegalUse(F.AM, Offset, Offset, LU.Kind, LU.AccessTy, TLI))
      continue;

    // Compensate for the use having MinOffset built into it.
    F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs + Offset - LU.MinOffset;

    const SCEV *FactorS = SE.getIntegerSCEV(Factor, IntTy);

    // Check that multiplying with each base register doesn't overflow.
    for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
      F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
      if (getSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
        goto next;
    }

    // Check that multiplying with the scaled register doesn't overflow.
    if (F.ScaledReg) {
      F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
      if (getSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
        continue;
    }

    // If we make it here and it's legal, add it.
    (void)InsertFormula(LU, LUIdx, F);
  next:;
  }
}

/// GenerateScales - Generate stride factor reuse formulae by making use of
/// scaled-offset address modes, for example.
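/// For example, with a factor of 4, reg({0,+,4}) can be rewritten as
/// 4 * reg({0,+,1}), exposing the stride as a scale in the addressing mode.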
void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx,
                                 Formula Base) {
  // Determine the integer type for the base formula.
  const Type *IntTy = Base.getType();
  if (!IntTy) return;

  // If this Formula already has a scaled register, we can't add another one.
  if (Base.AM.Scale != 0) return;

  // Check each interesting stride.
  for (SmallSetVector<int64_t, 8>::const_iterator
       I = Factors.begin(), E = Factors.end(); I != E; ++I) {
    int64_t Factor = *I;

    Base.AM.Scale = Factor;
    Base.AM.HasBaseReg = Base.BaseRegs.size() > 1;
    // Check whether this scale is going to be legal.
    if (!isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
                    LU.Kind, LU.AccessTy, TLI)) {
      // As a special case, treat out-of-loop Basic users as Special users.
      // TODO: Reconsider this special case.
      if (LU.Kind == LSRUse::Basic &&
          isLegalUse(Base.AM, LU.MinOffset, LU.MaxOffset,
                     LSRUse::Special, LU.AccessTy, TLI) &&
          LU.AllFixupsOutsideLoop)
        LU.Kind = LSRUse::Special;
      else
        continue;
    }
    // For an ICmpZero, negating a solitary base register won't lead to
    // new solutions.
    if (LU.Kind == LSRUse::ICmpZero &&
        !Base.AM.HasBaseReg && Base.AM.BaseOffs == 0 && !Base.AM.BaseGV)
      continue;
    // For each addrec base reg, apply the scale, if possible.
    for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
      if (const SCEVAddRecExpr *AR =
            dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
        const SCEV *FactorS = SE.getIntegerSCEV(Factor, IntTy);
        if (FactorS->isZero())
          continue;
        // Divide out the factor, ignoring high bits, since we'll be
        // scaling the value back up in the end.
        if (const SCEV *Quotient = getSDiv(AR, FactorS, SE, true)) {
          // TODO: This could be optimized to avoid all the copying.
          Formula F = Base;
          F.ScaledReg = Quotient;
          std::swap(F.BaseRegs[i], F.BaseRegs.back());
          F.BaseRegs.pop_back();
          (void)InsertFormula(LU, LUIdx, F);
        }
      }
  }
}

/// GenerateTruncates - Generate reuse formulae from different IV types.
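/// For example, a use of an i32 value can be rewritten in terms of i64
/// registers when the target considers the truncate from i64 to i32 free.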
void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx,
                                    Formula Base) {
  // This requires TargetLowering to tell us which truncates are free.
  if (!TLI) return;

  // Don't bother truncating symbolic values.
  if (Base.AM.BaseGV) return;

  // Determine the integer type for the base formula.
  const Type *DstTy = Base.getType();
  if (!DstTy) return;
  DstTy = SE.getEffectiveSCEVType(DstTy);

  for (SmallSetVector<const Type *, 4>::const_iterator
       I = Types.begin(), E = Types.end(); I != E; ++I) {
    const Type *SrcTy = *I;
    if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) {
      Formula F = Base;

      if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy);
      for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(),
           JE = F.BaseRegs.end(); J != JE; ++J)
        *J = SE.getAnyExtendExpr(*J, SrcTy);

      // TODO: This assumes we've done basic processing on all uses and
      // have an idea what the register usage is.
      if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
        continue;

      (void)InsertFormula(LU, LUIdx, F);
    }
  }
}

namespace {

/// WorkItem - Helper class for GenerateCrossUseConstantOffsets. It's used to
/// defer modifications so that the search phase doesn't have to worry about
/// the data structures moving underneath it.
struct WorkItem {
  size_t LUIdx;
  int64_t Imm;
  const SCEV *OrigReg;

  WorkItem(size_t LI, int64_t I, const SCEV *R)
    : LUIdx(LI), Imm(I), OrigReg(R) {}

  void print(raw_ostream &OS) const;
  void dump() const;
};

}

void WorkItem::print(raw_ostream &OS) const {
  OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
     << ", add offset " << Imm;
}

void WorkItem::dump() const {
  print(errs()); errs() << '\n';
}

/// GenerateCrossUseConstantOffsets - Look for registers which are a constant
/// distance apart and try to form reuse opportunities between them.
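/// For example, if one use has reg(x) and another has reg(x + 8), the latter
/// can be rewritten as 8 + reg(x), allowing both uses to share one register.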
void LSRInstance::GenerateCrossUseConstantOffsets() {
  // Group the registers by their value without any added constant offset.
  typedef std::map<int64_t, const SCEV *> ImmMapTy;
  typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy;
  RegMapTy Map;
  DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
  SmallVector<const SCEV *, 8> Sequence;
  for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
       I != E; ++I) {
    const SCEV *Reg = *I;
    int64_t Imm = ExtractImmediate(Reg, SE);
    std::pair<RegMapTy::iterator, bool> Pair =
      Map.insert(std::make_pair(Reg, ImmMapTy()));
    if (Pair.second)
      Sequence.push_back(Reg);
    Pair.first->second.insert(std::make_pair(Imm, *I));
    UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(*I);
  }

  // Now examine each set of registers with the same base value. Build up
  // a list of work to do and do the work in a separate step so that we're
  // not adding formulae and register counts while we're searching.
  SmallVector<WorkItem, 32> WorkItems;
  SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
  for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(),
       E = Sequence.end(); I != E; ++I) {
    const SCEV *Reg = *I;
    const ImmMapTy &Imms = Map.find(Reg)->second;

    // It's not worthwhile looking for reuse if there's only one offset.
    if (Imms.size() == 1)
      continue;

    DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
          for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
               J != JE; ++J)
            dbgs() << ' ' << J->first;
          dbgs() << '\n');

    // Examine each offset.
    for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
         J != JE; ++J) {
      const SCEV *OrigReg = J->second;

      int64_t JImm = J->first;
      const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);

      if (!isa<SCEVConstant>(OrigReg) &&
          UsedByIndicesMap[Reg].count() == 1) {
        DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg << '\n');
        continue;
      }

      // Conservatively examine offsets between this orig reg and a few
      // selected other orig regs: the smallest, the largest, and one from
      // near the middle of the range.
      ImmMapTy::const_iterator OtherImms[] = {
        Imms.begin(), prior(Imms.end()),
        Imms.upper_bound((Imms.begin()->first + prior(Imms.end())->first) / 2)
      };
      for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
        ImmMapTy::const_iterator M = OtherImms[i];
        if (M == J || M == JE) continue;

        // Compute the difference between the two.
        int64_t Imm = (uint64_t)JImm - M->first;
        for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1;
             LUIdx = UsedByIndices.find_next(LUIdx))
          // Make a memo of this use, offset, and register tuple.
          if (UniqueItems.insert(std::make_pair(LUIdx, Imm)))
            WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
      }
    }
  }

  Map.clear();
  Sequence.clear();
  UsedByIndicesMap.clear();
  UniqueItems.clear();

  // Now iterate through the worklist and add new formulae.
  for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(),
       E = WorkItems.end(); I != E; ++I) {
    const WorkItem &WI = *I;
    size_t LUIdx = WI.LUIdx;
    LSRUse &LU = Uses[LUIdx];
    int64_t Imm = WI.Imm;
    const SCEV *OrigReg = WI.OrigReg;

    const Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
    const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
    unsigned BitWidth = SE.getTypeSizeInBits(IntTy);

    // TODO: Use a more targeted data structure.
    for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
      Formula F = LU.Formulae[L];
      // Use the immediate in the scaled register.
      if (F.ScaledReg == OrigReg) {
        int64_t Offs = (uint64_t)F.AM.BaseOffs +
                       Imm * (uint64_t)F.AM.Scale;
        // Don't create 50 + reg(-50).
        if (F.referencesReg(SE.getSCEV(
                   ConstantInt::get(IntTy, -(uint64_t)Offs))))
          continue;
        Formula NewF = F;
        NewF.AM.BaseOffs = Offs;
        if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
                        LU.Kind, LU.AccessTy, TLI))
          continue;
        NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);

        // If the new scale is a constant in a register, and adding the constant
        // value to the immediate would produce a value closer to zero than the
        // immediate itself, then the formula isn't worthwhile.
        if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
          if (C->getValue()->getValue().isNegative() !=
                (NewF.AM.BaseOffs < 0) &&
              (C->getValue()->getValue().abs() * APInt(BitWidth, F.AM.Scale))
                .ule(APInt(BitWidth, NewF.AM.BaseOffs).abs()))
            continue;

        // OK, looks good.
        (void)InsertFormula(LU, LUIdx, NewF);
      } else {
        // Use the immediate in a base register.
        for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
          const SCEV *BaseReg = F.BaseRegs[N];
          if (BaseReg != OrigReg)
            continue;
          Formula NewF = F;
          NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm;
          if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
                          LU.Kind, LU.AccessTy, TLI))
            continue;
          NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);

          // If the new formula has a constant in a register, and adding the
          // constant value to the immediate would produce a value closer to
          // zero than the immediate itself, then the formula isn't worthwhile.
          for (SmallVectorImpl<const SCEV *>::const_iterator
               J = NewF.BaseRegs.begin(), JE = NewF.BaseRegs.end();
               J != JE; ++J)
            if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*J))
              if (C->getValue()->getValue().isNegative() !=
                    (NewF.AM.BaseOffs < 0) &&
                  C->getValue()->getValue().abs()
                    .ule(APInt(BitWidth, NewF.AM.BaseOffs).abs()))
                goto skip_formula;

          // Ok, looks good.
          (void)InsertFormula(LU, LUIdx, NewF);
          break;
        skip_formula:;
        }
      }
    }
  }
}

/// GenerateAllReuseFormulae - Generate formulae for each use.
void
LSRInstance::GenerateAllReuseFormulae() {
  // This is split into two loops so that hasRegsUsedByUsesOtherThan
  // queries are more precise.
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
  }
  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateScales(LU, LUIdx, LU.Formulae[i]);
    for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
      GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
  }

  GenerateCrossUseConstantOffsets();
}

/// FilterOutUndesirableDedicatedRegisters - If there are multiple formulae
/// with the same set of registers used by other uses, pick the best one and
/// delete the others.
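/// Formulae are grouped by the set of their registers that are also used by
/// other uses; within each group, only the formula preferred by FormulaSorter
/// is kept.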
void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
#ifndef NDEBUG
  bool Changed = false;
#endif

  // Collect the best formula for each unique set of shared registers. This
  // is reset for each use.
  typedef DenseMap<SmallVector<const SCEV *, 2>, size_t, UniquifierDenseMapInfo>
    BestFormulaeTy;
  BestFormulaeTy BestFormulae;

  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
    LSRUse &LU = Uses[LUIdx];
    FormulaSorter Sorter(L, LU, SE, DT);

    // Clear out the set of used regs; it will be recomputed.
    LU.Regs.clear();

    for (size_t FIdx = 0, NumForms = LU.Formulae.size();
         FIdx != NumForms; ++FIdx) {
      Formula &F = LU.Formulae[FIdx];

      SmallVector<const SCEV *, 2> Key;
      for (SmallVectorImpl<const SCEV *>::const_iterator J = F.BaseRegs.begin(),
           JE = F.BaseRegs.end(); J != JE; ++J) {
        const SCEV *Reg = *J;
        if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
          Key.push_back(Reg);
      }
      if (F.ScaledReg &&
          RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
        Key.push_back(F.ScaledReg);
      // Unstable sort by host order ok, because this is only used for
      // uniquifying.
      std::sort(Key.begin(), Key.end());

      std::pair<BestFormulaeTy::const_iterator, bool> P =
        BestFormulae.insert(std::make_pair(Key, FIdx));
      if (!P.second) {
        Formula &Best = LU.Formulae[P.first->second];
        if (Sorter.operator()(F, Best))
          std::swap(F, Best);
        DEBUG(dbgs() << "Filtering out "; F.print(dbgs());
              dbgs() << "\n"
                        "  in favor of "; Best.print(dbgs());
              dbgs() << '\n');
#ifndef NDEBUG
        Changed = true;
#endif
        std::swap(F, LU.Formulae.back());
        LU.Formulae.pop_back();
        --FIdx;
        --NumForms;
        continue;
      }
      if (F.ScaledReg) LU.Regs.insert(F.ScaledReg);
      LU.Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
    }
    BestFormulae.clear();
  }

  DEBUG(if (Changed) {
          dbgs() << "After filtering out undesirable candidates:\n";
          print_uses(dbgs());
        });
}

/// NarrowSearchSpaceUsingHeuristics - If there are an extraordinary number of
/// formulae to choose from, use some rough heuristics to prune down the number
/// of formulae. This keeps the main solver from taking an extraordinary amount
/// of time in some worst-case scenarios.
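/// The heuristic repeatedly picks the register shared by the most uses,
/// assumes it will be part of the solution, and deletes formulae in those
/// uses which don't reference it, until the estimated search space becomes
/// tractable.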
void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
  // This is a rough guess that seems to work fairly well.
  const size_t Limit = UINT16_MAX;

  SmallPtrSet<const SCEV *, 4> Taken;
  for (;;) {
    // Estimate the worst-case number of solutions we might consider. We almost
    // never consider this many solutions because we prune the search space,
    // but the pruning isn't always sufficient.
    uint32_t Power = 1;
    for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(),
         E = Uses.end(); I != E; ++I) {
      size_t FSize = I->Formulae.size();
      if (FSize >= Limit) {
        Power = Limit;
        break;
      }
      Power *= FSize;
      if (Power >= Limit)
        break;
    }
    if (Power < Limit)
      break;

    // Ok, we have too many formulae on our hands to conveniently handle.
    // Use a rough heuristic to thin out the list.

    // Pick the register which is used by the most LSRUses, which is likely
    // to be a good reuse register candidate.
    const SCEV *Best = 0;
    unsigned BestNum = 0;
    for (RegUseTracker::const_iterator I = RegUses.begin(), E = RegUses.end();
         I != E; ++I) {
      const SCEV *Reg = *I;
      if (Taken.count(Reg))
        continue;
      // Record the count for the first candidate too, so that later
      // comparisons against BestNum are meaningful.
      unsigned Count = RegUses.getUsedByIndices(Reg).count();
      if (!Best || Count > BestNum) {
        Best = Reg;
        BestNum = Count;
      }
    }

    DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
                 << " will yield profitable reuse.\n");
    Taken.insert(Best);

    // In any use whose formulae reference this register, delete the formulae
    // which don't reference it.
    for (SmallVectorImpl<LSRUse>::iterator I = Uses.begin(),
         E = Uses.end(); I != E; ++I) {
      LSRUse &LU = *I;
      if (!LU.Regs.count(Best)) continue;

      // Clear out the set of used regs; it will be recomputed.
      LU.Regs.clear();

      for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
        Formula &F = LU.Formulae[i];
        if (!F.referencesReg(Best)) {
          DEBUG(dbgs() << "  Deleting "; F.print(dbgs()); dbgs() << '\n');
          std::swap(LU.Formulae.back(), F);
          LU.Formulae.pop_back();
          --e;
          --i;
          continue;
        }

        if (F.ScaledReg) LU.Regs.insert(F.ScaledReg);
        LU.Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
      }
    }

    DEBUG(dbgs() << "After pre-selection:\n";
          print_uses(dbgs()));
  }
}

/// SolveRecurse - This is the recursive solver.
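/// Each invocation chooses a formula for one use (Uses[Workspace.size()]),
/// accumulates its cost and register set, and recurses to handle the
/// remaining uses, pruning any branch whose partial cost already exceeds the
/// best complete solution found so far.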
void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
                               Cost &SolutionCost,
                               SmallVectorImpl<const Formula *> &Workspace,
                               const Cost &CurCost,
                               const SmallPtrSet<const SCEV *, 16> &CurRegs,
                               DenseSet<const SCEV *> &VisitedRegs) const {
  // Some ideas:
  //  - prune more:
  //    - use more aggressive filtering
  //    - sort the formulae so that the most profitable solutions are found first
  //    - sort the uses too
  //  - search faster:
  //    - don't compute a cost and then compare; compare while computing a cost
  //      and bail early.
  //    - track register sets with SmallBitVector

  const LSRUse &LU = Uses[Workspace.size()];

  // If this use references any register that's already a part of the
  // in-progress solution, consider it a requirement that a formula must
  // reference that register in order to be considered. This prunes out
  // unprofitable searching.
  SmallSetVector<const SCEV *, 4> ReqRegs;
  for (SmallPtrSet<const SCEV *, 16>::const_iterator I = CurRegs.begin(),
       E = CurRegs.end(); I != E; ++I)
    if (LU.Regs.count(*I)) {
      ReqRegs.insert(*I);
      break;
    }

  SmallPtrSet<const SCEV *, 16> NewRegs;
  Cost NewCost;
  for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
       E = LU.Formulae.end(); I != E; ++I) {
    const Formula &F = *I;

    // Ignore formulae which do not use any of the required registers.
    for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(),
         JE = ReqRegs.end(); J != JE; ++J) {
      const SCEV *Reg = *J;
      if ((!F.ScaledReg || F.ScaledReg != Reg) &&
          std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) ==
          F.BaseRegs.end())
        goto skip;
    }

    // Evaluate the cost of the current formula. If it's already worse than
    // the current best, prune the search at that point.
    NewCost = CurCost;
    NewRegs = CurRegs;
    NewCost.RateFormula(F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT);
    if (NewCost < SolutionCost) {
      Workspace.push_back(&F);
      if (Workspace.size() != Uses.size()) {
        SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
                     NewRegs, VisitedRegs);
        if (F.getNumRegs() == 1 && Workspace.size() == 1)
          VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
      } else {
        DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
              dbgs() << ". Regs:";
              for (SmallPtrSet<const SCEV *, 16>::const_iterator
                   I = NewRegs.begin(), E = NewRegs.end(); I != E; ++I)
                dbgs() << ' ' << **I;
              dbgs() << '\n');

        SolutionCost = NewCost;
        Solution = Workspace;
      }
      Workspace.pop_back();
    }
  skip:;
  }
}

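/// Solve - Choose one formula from each use. Returns the results in the given
/// Solution vector.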
void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
  SmallVector<const Formula *, 8> Workspace;
  Cost SolutionCost;
  SolutionCost.Loose();
  Cost CurCost;
  SmallPtrSet<const SCEV *, 16> CurRegs;
  DenseSet<const SCEV *> VisitedRegs;
  Workspace.reserve(Uses.size());

  SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
               CurRegs, VisitedRegs);

  // Ok, we've now made all our decisions.
  DEBUG(dbgs() << "\n"
                  "The chosen solution requires "; SolutionCost.print(dbgs());
        dbgs() << ":\n";
        for (size_t i = 0, e = Uses.size(); i != e; ++i) {
          dbgs() << "  ";
          Uses[i].print(dbgs());
          dbgs() << "\n"
                    "    ";
          Solution[i]->print(dbgs());
          dbgs() << '\n';
        });
}

/// getImmediateDominator - A handy utility for the specific DominatorTree
/// query that we need here.
///
static BasicBlock *getImmediateDominator(BasicBlock *BB, DominatorTree &DT) {
  DomTreeNode *Node = DT.getNode(BB);
  if (!Node) return 0;
  Node = Node->getIDom();
  if (!Node) return 0;
  return Node->getBlock();
}

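/// Expand - Emit instructions computing the value of the given formula at the
/// chosen insertion point, and return the computed value.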
Value *LSRInstance::Expand(const LSRFixup &LF,
                           const Formula &F,
                           BasicBlock::iterator IP,
                           Loop *L, Instruction *IVIncInsertPos,
                           SCEVExpander &Rewriter,
                           SmallVectorImpl<WeakVH> &DeadInsts,
                           ScalarEvolution &SE, DominatorTree &DT) const {
  const LSRUse &LU = Uses[LF.LUIdx];

  // First, collect the instructions which will be needed as inputs to the
  // expansion. The insertion point must remain dominated by all of them.
  SmallVector<Instruction *, 4> Inputs;
  if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
    Inputs.push_back(I);
  if (LU.Kind == LSRUse::ICmpZero)
    if (Instruction *I =
          dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
      Inputs.push_back(I);
  if (LF.PostIncLoop && !L->contains(LF.UserInst))
    Inputs.push_back(L->getLoopLatch()->getTerminator());

  // Then, climb up the immediate dominator tree as far as we can go while
  // still being dominated by the input positions.
  for (;;) {
    bool AllDominate = true;
    Instruction *BetterPos = 0;
    BasicBlock *IDom = getImmediateDominator(IP->getParent(), DT);
    if (!IDom) break;
    Instruction *Tentative = IDom->getTerminator();
    for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(),
         E = Inputs.end(); I != E; ++I) {
      Instruction *Inst = *I;
      if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
        AllDominate = false;
        break;
      }
      if (IDom == Inst->getParent() &&
          (!BetterPos || DT.dominates(BetterPos, Inst)))
        BetterPos = next(BasicBlock::iterator(Inst));
    }
    if (!AllDominate)
      break;
    if (BetterPos)
      IP = BetterPos;
    else
      IP = Tentative;
  }
  while (isa<PHINode>(IP)) ++IP;

  // Inform the Rewriter if we have a post-increment use, so that it can
  // perform an advantageous expansion.
  Rewriter.setPostInc(LF.PostIncLoop);

  // This is the type that the user actually needs.
  const Type *OpTy = LF.OperandValToReplace->getType();
  // This will be the type that we'll initially expand to.
  const Type *Ty = F.getType();
  if (!Ty)
    // No type known; just expand directly to the ultimate type.
    Ty = OpTy;
  else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
    // Expand directly to the ultimate type if it's the right size.
    Ty = OpTy;
  // This is the type to do integer arithmetic in.
  const Type *IntTy = SE.getEffectiveSCEVType(Ty);

  // Build up a list of operands to add together to form the full base.
  SmallVector<const SCEV *, 8> Ops;

  // Expand the BaseRegs portion.
  for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
       E = F.BaseRegs.end(); I != E; ++I) {
    const SCEV *Reg = *I;
    assert(!Reg->isZero() && "Zero allocated in a base register!");

    // If we're expanding for a post-inc user of the add-rec's loop, make the
    // post-inc adjustment.
    const SCEV *Start = Reg;
    while (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Start)) {
      if (AR->getLoop() == LF.PostIncLoop) {
        Reg = SE.getAddExpr(Reg, AR->getStepRecurrence(SE));
        // If the user is inside the loop, insert the code after the increment
        // so that it is dominated by its operand.
        if (L->contains(LF.UserInst))
          IP = IVIncInsertPos;
        break;
      }
      Start = AR->getStart();
    }

    Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP)));
  }

  // Expand the ScaledReg portion.
  Value *ICmpScaledV = 0;
  if (F.AM.Scale != 0) {
    const SCEV *ScaledS = F.ScaledReg;

    // If we're expanding for a post-inc user of the add-rec's loop, make the
    // post-inc adjustment.
    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(ScaledS))
      if (AR->getLoop() == LF.PostIncLoop)
        ScaledS = SE.getAddExpr(ScaledS, AR->getStepRecurrence(SE));

    if (LU.Kind == LSRUse::ICmpZero) {
      // An interesting way of "folding" with an icmp is to use a negated
      // scale, which we'll implement by inserting it into the other operand
      // of the icmp.
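      // For example, a formula x + -1*y compared against zero is emitted as
      // the comparison x == y rather than a separate subtraction.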
      assert(F.AM.Scale == -1 &&
             "The only scale supported by ICmpZero uses is -1!");
      ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP);
    } else {
      // Otherwise just expand the scaled register and an explicit scale,
      // which is expected to be matched as part of the address.
      ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP));
      ScaledS = SE.getMulExpr(ScaledS,
                              SE.getIntegerSCEV(F.AM.Scale,
                                                ScaledS->getType()));
      Ops.push_back(ScaledS);
    }
  }

  // Expand the immediate portions.
  if (F.AM.BaseGV)
    Ops.push_back(SE.getSCEV(F.AM.BaseGV));
  int64_t Offset = (uint64_t)F.AM.BaseOffs + LF.Offset;
  if (Offset != 0) {
    if (LU.Kind == LSRUse::ICmpZero) {
      // The other interesting way of "folding" with an ICmpZero is to use a
      // negated immediate.
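      // For example, (x + 4) == 0 can be emitted as the comparison x == -4.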
      if (!ICmpScaledV)
        ICmpScaledV = ConstantInt::get(IntTy, -Offset);
      else {
        Ops.push_back(SE.getUnknown(ICmpScaledV));
        ICmpScaledV = ConstantInt::get(IntTy, Offset);
      }
    } else {
      // Just add the immediate values. These again are expected to be matched
      // as part of the address.
      Ops.push_back(SE.getIntegerSCEV(Offset, IntTy));
    }
  }

  // Emit instructions summing all the operands.
  const SCEV *FullS = Ops.empty() ?
                      SE.getIntegerSCEV(0, IntTy) :
                      SE.getAddExpr(Ops);
  Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP);

  // We're done expanding now, so reset the rewriter.
  Rewriter.setPostInc(0);

  // An ICmpZero Formula represents an ICmp which we're handling as a
  // comparison against zero. Now that we've expanded an expression for that
  // form, update the ICmp's other operand.
  if (LU.Kind == LSRUse::ICmpZero) {
    ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
    DeadInsts.push_back(CI->getOperand(1));
    assert(!F.AM.BaseGV && "ICmp does not support folding a global value and "
                           "a scale at the same time!");
    if (F.AM.Scale == -1) {
      if (ICmpScaledV->getType() != OpTy) {
        Instruction *Cast =
          CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
                                                   OpTy, false),
                           ICmpScaledV, OpTy, "tmp", CI);
        ICmpScaledV = Cast;
      }
      CI->setOperand(1, ICmpScaledV);
    } else {
      assert(F.AM.Scale == 0 &&
             "ICmpZero uses only support scales of 0 or -1!");
      Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
                                           -(uint64_t)Offset);
      if (C->getType() != OpTy)
        C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
                                                          OpTy, false),
                                  C, OpTy);

      CI->setOperand(1, C);
    }
  }

  return FullV;
}

/// Rewrite - Emit instructions for the leading candidate expression for this
/// LSRUse (this is called "expanding"), and update the UserInst to reference
/// the newly expanded value.
void LSRInstance::Rewrite(const LSRFixup &LF,
                          const Formula &F,
                          Loop *L, Instruction *IVIncInsertPos,
                          SCEVExpander &Rewriter,
                          SmallVectorImpl<WeakVH> &DeadInsts,
                          ScalarEvolution &SE, DominatorTree &DT,
                          Pass *P) const {
  const Type *OpTy = LF.OperandValToReplace->getType();

  // First, find an insertion point that dominates UserInst. For PHI nodes,
  // find the nearest block which dominates all the relevant uses.
  if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
    DenseMap<BasicBlock *, Value *> Inserted;
    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
      if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
        BasicBlock *BB = PN->getIncomingBlock(i);

        // If this is a critical edge, split the edge so that we do not insert
        // the code on all predecessor/successor paths.  We do this unless this
        // is the canonical backedge for this loop, which complicates post-inc
        // users.
        if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
            !isa<IndirectBrInst>(BB->getTerminator()) &&
            (PN->getParent() != L->getHeader() || !L->contains(BB))) {
          // Split the critical edge.
          BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P);

          // If PN is outside of the loop and BB is in the loop, we want to
          // move the block to be immediately before the PHI block, not
          // immediately after BB.
          if (L->contains(BB) && !L->contains(PN))
            NewBB->moveBefore(PN->getParent());

          // Splitting the edge can reduce the number of PHI entries we have.
          e = PN->getNumIncomingValues();
          BB = NewBB;
          i = PN->getBasicBlockIndex(BB);
        }

        std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
          Inserted.insert(std::make_pair(BB, static_cast<Value *>(0)));
        if (!Pair.second)
          PN->setIncomingValue(i, Pair.first->second);
        else {
          Value *FullV = Expand(LF, F, BB->getTerminator(), L, IVIncInsertPos,
                                Rewriter, DeadInsts, SE, DT);

          // If this is reuse-by-noop-cast, insert the noop cast.
          if (FullV->getType() != OpTy)
            FullV =
              CastInst::Create(CastInst::getCastOpcode(FullV, false,
                                                       OpTy, false),
                               FullV, OpTy, "tmp", BB->getTerminator());

          PN->setIncomingValue(i, FullV);
          Pair.first->second = FullV;
        }
      }
  } else {
    Value *FullV = Expand(LF, F, LF.UserInst, L, IVIncInsertPos,
                          Rewriter, DeadInsts, SE, DT);

    // If this is reuse-by-noop-cast, insert the noop cast.
    if (FullV->getType() != OpTy) {
      Instruction *Cast =
        CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
                         FullV, OpTy, "tmp", LF.UserInst);
      FullV = Cast;
    }

    // Update the user. ICmpZero is handled specially here (for now) because
    // Expand may have updated one of the operands of the icmp already, and
    // its new value may happen to be equal to LF.OperandValToReplace, in
    // which case doing replaceUsesOfWith leads to replacing both operands
    // with the same value. TODO: Reorganize this.
    if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero)
      LF.UserInst->setOperand(0, FullV);
    else
      LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
  }

  DeadInsts.push_back(LF.OperandValToReplace);
}

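/// ImplementSolution - Rewrite all the fixup locations with new values,
/// following the chosen solution.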
void
LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution,
                               Pass *P) {
  // Keep track of instructions we may have made dead, so that
  // we can remove them after we are done working.
  SmallVector<WeakVH, 16> DeadInsts;

  SCEVExpander Rewriter(SE);
  Rewriter.disableCanonicalMode();
  Rewriter.setIVIncInsertPos(L, IVIncInsertPos);

  // Expand the new value definitions and update the users.
  for (size_t i = 0, e = Fixups.size(); i != e; ++i) {
    size_t LUIdx = Fixups[i].LUIdx;

    Rewrite(Fixups[i], *Solution[LUIdx], L, IVIncInsertPos, Rewriter,
            DeadInsts, SE, DT, P);

    Changed = true;
  }

  // Clean up after ourselves. This must be done before deleting any
  // instructions.
  Rewriter.clear();