Commit 582a5237 authored by Stanislav Mekhanoshin

[AMDGPU] Revert failed scheduling

This patch reverts a region's schedule to the original untouched state
in case scheduling has decreased occupancy.

In addition, it switches to the TargetRegisterInfo occupancy callback
for the pressure limits, instead of gradually increasing limits that
were simply being passed. Since we now keep the best schedule, we no
longer need to tolerate worsened scheduling.

Differential Revision: https://reviews.llvm.org/D29971

llvm-svn: 295206
parent 0a6913bc
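Before the diff, a hedged sketch of the control flow the message describes: schedule a region, recompute the achievable occupancy from the resulting peak register pressure, and restore the original instruction order if occupancy dropped. Everything below is a simplified, self-contained illustration; Region, runScheduler, and the occupancy formula are invented placeholders, not the LLVM/AMDGPU implementation (the real logic is in GCNScheduleDAGMILive::schedule() further down in the diff).

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

// Simplified stand-ins; not LLVM types.
struct Instr { int id; };

struct Region {
  std::vector<Instr> instrs;
  unsigned maxSGPRs = 0;   // peak SGPR pressure of the current order
  unsigned maxVGPRs = 0;   // peak VGPR pressure of the current order
};

// Rough stand-in for the AMDGPU occupancy calculation: waves per SIMD as a
// decreasing function of peak register pressure (made-up budgets).
static unsigned maxWavesForPressure(unsigned sgprs, unsigned vgprs) {
  unsigned bySGPR = sgprs ? std::min(10u, 800u / sgprs) : 10u;
  unsigned byVGPR = vgprs ? std::min(10u, 256u / vgprs) : 10u;
  return std::min(bySGPR, byVGPR);
}

// Stand-in for the real list scheduler: here it just reverses the region and
// pretends the reordering raised VGPR pressure.
static void runScheduler(Region &r) {
  std::reverse(r.instrs.begin(), r.instrs.end());
  r.maxVGPRs += 40;
}

static void scheduleRegion(Region &r) {
  // Remember the untouched order and the occupancy it achieves.
  std::vector<Instr> original = r.instrs;
  unsigned sgprBefore = r.maxSGPRs, vgprBefore = r.maxVGPRs;
  unsigned wavesBefore = maxWavesForPressure(sgprBefore, vgprBefore);

  runScheduler(r);

  unsigned wavesAfter = maxWavesForPressure(r.maxSGPRs, r.maxVGPRs);
  if (wavesAfter >= wavesBefore)
    return;  // the new schedule is at least as good; keep it

  // Occupancy regressed: revert to the original, untouched order.
  std::printf("reverting schedule: %u -> %u waves\n", wavesBefore, wavesAfter);
  r.instrs = std::move(original);
  r.maxSGPRs = sgprBefore;
  r.maxVGPRs = vgprBefore;
}

int main() {
  Region r;
  r.instrs = {{0}, {1}, {2}};
  r.maxSGPRs = 40;
  r.maxVGPRs = 24;
  scheduleRegion(r);
  std::printf("first instruction afterwards: %d\n", r.instrs.front().id);
  return 0;
}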
@@ -136,8 +136,7 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
ScheduleDAGMILive *DAG =
new ScheduleDAGMILive(C,
llvm::make_unique<GCNMaxOccupancySchedStrategy>(C));
new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
return DAG;
@@ -39,15 +39,30 @@ static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs,
*MF.getFunction()));
}
void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
GenericScheduler::initialize(DAG);
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
// FIXME: This is also necessary, because some passes that run after
// scheduling and before regalloc increase register pressure.
const int ErrorMargin = 3;
SGPRExcessLimit = Context->RegClassInfo
->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass) - ErrorMargin;
VGPRExcessLimit = Context->RegClassInfo
->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass) - ErrorMargin;
SGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF,
SRI->getSGPRPressureSet()) - ErrorMargin;
VGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF,
SRI->getVGPRPressureSet()) - ErrorMargin;
}
void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
bool AtTop, const RegPressureTracker &RPTracker,
const SIRegisterInfo *SRI,
int SGPRPressure,
int VGPRPressure,
int SGPRExcessLimit,
int VGPRExcessLimit,
int SGPRCriticalLimit,
int VGPRCriticalLimit) {
unsigned SGPRPressure,
unsigned VGPRPressure) {
Cand.SU = SU;
Cand.AtTop = AtTop;
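A side note on the new initialize() in the hunk above: the excess limits come from the number of allocatable registers, the critical limits from the per-function pressure-set limit, and both are reduced by a small ErrorMargin because passes running after scheduling can still raise pressure. Below is a hedged, standalone sketch of how such limits are meant to be consulted; all names and numbers are illustrative placeholders, not LLVM API.

#include <cstdio>

struct Limits { unsigned excess, critical; };

// Both limits are shrunk by a small margin, since passes that run between
// scheduling and register allocation can still increase pressure.
static Limits withMargin(unsigned excess, unsigned critical) {
  const unsigned errorMargin = 3;  // same margin the patch uses
  return {excess - errorMargin, critical - errorMargin};
}

// Classify the pressure an instruction would create: above "excess" means
// more registers than are allocatable, above "critical" means the desired
// occupancy can no longer be reached.
static const char *classify(unsigned newPressure, Limits l) {
  if (newPressure >= l.excess)   return "excess";
  if (newPressure >= l.critical) return "critical";
  return "ok";
}

int main() {
  // Hypothetical VGPR numbers: 256 allocatable, a 24-register budget at the
  // desired occupancy.
  Limits vgpr = withMargin(/*excess=*/256, /*critical=*/24);
  for (unsigned p : {16u, 32u, 256u})
    std::printf("pressure %3u -> %s\n", p, classify(p, vgpr));
  return 0;
}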
@@ -67,8 +82,8 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
}
int NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()];
int NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()];
unsigned NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()];
unsigned NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()];
// If two instructions increase the pressure of different register sets
// by the same amount, the generic scheduler will prefer to schedule the
@@ -78,7 +93,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
// only for VGPRs or only for SGPRs.
// FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
const int MaxVGPRPressureInc = 16;
const unsigned MaxVGPRPressureInc = 16;
bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
@@ -87,11 +102,6 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
// to increase the likelihood we don't go over the limits. We should improve
// the analysis to look through dependencies to find the path with the least
// register pressure.
// FIXME: This is also necessary, because some passes that run after
// scheduling and before regalloc increase register pressure.
const int ErrorMargin = 3;
VGPRExcessLimit -= ErrorMargin;
SGPRExcessLimit -= ErrorMargin;
// We only need to update the RPDelta for instructions that increase
// register pressure. Instructions that decrease or keep reg pressure
@@ -112,9 +122,6 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
// register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
// has the same cost, so we don't need to prefer one over the other.
VGPRCriticalLimit -= ErrorMargin;
SGPRCriticalLimit -= ErrorMargin;
int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
@@ -135,27 +142,16 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
SchedCandidate &Cand) {
const SISubtarget &ST = DAG->MF.getSubtarget<SISubtarget>();
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
unsigned SGPRPressure = Pressure[SRI->getSGPRPressureSet()];
unsigned VGPRPressure = Pressure[SRI->getVGPRPressureSet()];
unsigned SGPRExcessLimit =
Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
unsigned VGPRExcessLimit =
Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
unsigned MaxWaves = getMaxWaves(SGPRPressure, VGPRPressure, DAG->MF);
unsigned SGPRCriticalLimit = ST.getMaxNumSGPRs(MaxWaves, true);
unsigned VGPRCriticalLimit = ST.getMaxNumVGPRs(MaxWaves);
ReadyQueue &Q = Zone.Available;
for (SUnit *SU : Q) {
SchedCandidate TryCand(ZonePolicy);
initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI,
SGPRPressure, VGPRPressure,
SGPRExcessLimit, VGPRExcessLimit,
SGPRCriticalLimit, VGPRCriticalLimit);
SGPRPressure, VGPRPressure);
// Pass SchedBoundary only when comparing nodes from the same boundary.
SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg);
@@ -311,3 +307,66 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
return SU;
}
void GCNScheduleDAGMILive::schedule() {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
std::vector<MachineInstr*> Unsched;
Unsched.reserve(NumRegionInstrs);
for (auto &I : *this)
Unsched.push_back(&I);
ScheduleDAGMILive::schedule();
// Check the results of scheduling.
GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
std::vector<unsigned> UnschedPressure = getRegPressure().MaxSetPressure;
unsigned MaxSGPRs = std::max(
getTopRPTracker().getPressure().MaxSetPressure[SRI->getSGPRPressureSet()],
getBotRPTracker().getPressure().MaxSetPressure[SRI->getSGPRPressureSet()]);
unsigned MaxVGPRs = std::max(
getTopRPTracker().getPressure().MaxSetPressure[SRI->getVGPRPressureSet()],
getBotRPTracker().getPressure().MaxSetPressure[SRI->getVGPRPressureSet()]);
DEBUG(dbgs() << "Pressure after scheduling:\nSGPR = " << MaxSGPRs
<< "\nVGPR = " << MaxVGPRs << '\n');
if (MaxSGPRs <= S.SGPRCriticalLimit &&
MaxVGPRs <= S.VGPRCriticalLimit) {
DEBUG(dbgs() << "Pressure in desired limits, done.\n");
return;
}
unsigned WavesAfter = getMaxWaves(MaxSGPRs, MaxVGPRs, MF);
unsigned WavesUnsched = getMaxWaves(UnschedPressure[SRI->getSGPRPressureSet()],
UnschedPressure[SRI->getVGPRPressureSet()], MF);
DEBUG(dbgs() << "Occupancy before scheduling: " << WavesUnsched <<
", after " << WavesAfter << ".\n");
if (WavesAfter >= WavesUnsched)
return;
DEBUG(dbgs() << "Attempting to revert scheduling.\n");
RegionEnd = RegionBegin;
for (MachineInstr *MI : Unsched) {
if (MI->getIterator() != RegionEnd) {
BB->remove(MI);
BB->insert(RegionEnd, MI);
if (LIS) {
LIS->handleMove(*MI, true);
RegisterOperands RegOpers;
RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
if (ShouldTrackLaneMasks) {
// Adjust liveness and add missing dead+read-undef flags.
SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
} else {
// Adjust for missing dead-def flags.
RegOpers.detectDeadDefs(*MI, *LIS);
}
}
}
RegionEnd = MI->getIterator();
++RegionEnd;
DEBUG(dbgs() << "Scheduling " << *MI);
}
RegionBegin = Unsched.front()->getIterator();
placeDebugValues();
}
@@ -25,6 +25,7 @@ class SIRegisterInfo;
/// heuristics to determine excess/critical pressure sets. Its goal is to
/// maximize kernel occupancy (i.e. maximum number of waves per simd).
class GCNMaxOccupancySchedStrategy : public GenericScheduler {
friend class GCNScheduleDAGMILive;
SUnit *pickNodeBidirectional(bool &IsTopNode);
@@ -35,18 +36,28 @@ class GCNMaxOccupancySchedStrategy : public GenericScheduler {
void initCandidate(SchedCandidate &Cand, SUnit *SU,
bool AtTop, const RegPressureTracker &RPTracker,
const SIRegisterInfo *SRI,
int SGPRPressure, int VGPRPressure,
int SGPRExcessLimit, int VGPRExcessLimit,
int SGPRCriticalLimit, int VGPRCriticalLimit);
unsigned SGPRPressure, unsigned VGPRPressure);
void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
SchedBoundary *Zone, const SIRegisterInfo *SRI,
unsigned SGPRPressure, unsigned VGPRPressure);
unsigned SGPRExcessLimit;
unsigned VGPRExcessLimit;
unsigned SGPRCriticalLimit;
unsigned VGPRCriticalLimit;
public:
GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
SUnit *pickNode(bool &IsTopNode) override;
void initialize(ScheduleDAGMI *DAG) override;
};
class GCNScheduleDAGMILive : public ScheduleDAGMILive {
public:
GCNScheduleDAGMILive(MachineSchedContext *C,
std::unique_ptr<MachineSchedStrategy> S) :
ScheduleDAGMILive(C, std::move(S)) {}
void schedule() override;
};
} // End namespace llvm
@@ -13,11 +13,13 @@ define void @max_9_sgprs(i32 addrspace(1)* %out1,
i32 addrspace(1)* %out2,
i32 addrspace(1)* %out3,
i32 addrspace(1)* %out4,
i32 %one, i32 %two, i32 %three, i32 %four) #0 {
i32 addrspace(1)* %out5,
i32 %one, i32 %two, i32 %three, i32 %four, i32 %five) #0 {
store i32 %one, i32 addrspace(1)* %out1
store i32 %two, i32 addrspace(1)* %out2
store i32 %three, i32 addrspace(1)* %out3
store i32 %four, i32 addrspace(1)* %out4
store i32 %five, i32 addrspace(1)* %out5
ret void
}
@@ -52,18 +54,21 @@ define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
i32 addrspace(1)* %out3,
i32 addrspace(1)* %out4,
i32 %one, i32 %two, i32 %three, i32 %four) #2 {
store volatile i32 0, i32* undef
%x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
store volatile i32 %x.0, i32 addrspace(1)* undef
%x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
store volatile i32 %x.0, i32 addrspace(1)* undef
%x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
store volatile i32 %x.0, i32 addrspace(1)* undef
%x.3 = call i64 @llvm.amdgcn.dispatch.id()
store volatile i64 %x.3, i64 addrspace(1)* undef
%x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
%x.5 = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
store volatile i32 0, i32* undef
br label %stores
stores:
store volatile i32 %x.0, i32 addrspace(1)* undef
store volatile i32 %x.0, i32 addrspace(1)* undef
store volatile i32 %x.0, i32 addrspace(1)* undef
store volatile i64 %x.3, i64 addrspace(1)* undef
store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
store volatile i8 addrspace(2)* %x.5, i8 addrspace(2)* addrspace(1)* undef
store i32 %one, i32 addrspace(1)* %out1
@@ -234,23 +234,23 @@ define void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 {
; SI: {{buffer|flat}}_load_dwordx4
; SI: {{buffer|flat}}_load_dwordx4
; SI: v_mul_lo_i32
; SI: v_mul_hi_u32
; SI: v_mul_hi_u32
; SI: v_mul_lo_i32
; SI: v_mul_hi_u32
; SI: v_mul_hi_u32
; SI: v_mul_lo_i32
; SI: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32
; SI: v_add_i32_e32
; SI: v_mul_hi_u32
; SI: v_mul_lo_i32
; SI: v_mul_hi_u32
; SI: v_mul_lo_i32
; SI: v_mul_lo_i32
; SI: v_mul_lo_i32
; SI: v_mul_lo_i32
; SI: v_mul_lo_i32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32
; SI: {{buffer|flat}}_store_dwordx4
define void @v_mul_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %aptr, i128 addrspace(1)* %bptr) #0 {
(The diff for one additional file is collapsed and not shown here.)
@@ -12,17 +12,16 @@
; GCN-LABEL: {{^}}main:
; GCN-DAG: s_mov_b32 s11, s12
; GCN-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-DAG: s_mov_b32 s14, -1
; SI-DAG: s_mov_b32 s15, 0xe8f000
; VI-DAG: s_mov_b32 s15, 0xe80000
; s11 is offset system SGPR
; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill
; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Reload
; GCN-DAG: s_mov_b32 s[[OFFREG:[0-9]+]], s12
; GCN-DAG: s_mov_b32 s[[DESC0:[0-9]+]], SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, -1
; SI-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe8f000
; VI-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe80000
; OFFREG is offset system SGPR
; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s[[OFFREG]] offset:{{[0-9]+}} ; 4-byte Folded Spill
; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s[[OFFREG]] offset:{{[0-9]+}} ; 4-byte Folded Reload
; GCN: NumVgprs: 256
; GCN: ScratchSize: 1024