Commit 582a5237 authored by Stanislav Mekhanoshin

[AMDGPU] Revert failed scheduling

This patch reverts a region's schedule to the original untouched state
in case scheduling has decreased occupancy.

In addition, it switches to the TargetRegisterInfo occupancy callback
for the pressure limits, instead of gradually increasing limits that
were simply being passed. Since we now keep the best schedule, we no
longer need to tolerate worsened scheduling.

Differential Revision: https://reviews.llvm.org/D29971

llvm-svn: 295206
parent 0a6913bc
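Before the diff, a hedged sketch of the control flow the message describes: schedule a region, recompute the achievable occupancy from the resulting peak register pressure, and restore the original instruction order if occupancy dropped. Everything below is a simplified, self-contained illustration; Region, runScheduler, and the occupancy formula are invented placeholders, not the LLVM/AMDGPU implementation (the real logic is in GCNScheduleDAGMILive::schedule() further down in the diff).

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

// Simplified stand-ins; not LLVM types.
struct Instr { int id; };

struct Region {
  std::vector<Instr> instrs;
  unsigned maxSGPRs = 0;   // peak SGPR pressure of the current order
  unsigned maxVGPRs = 0;   // peak VGPR pressure of the current order
};

// Rough stand-in for the AMDGPU occupancy calculation: waves per SIMD as a
// decreasing function of peak register pressure (made-up budgets).
static unsigned maxWavesForPressure(unsigned sgprs, unsigned vgprs) {
  unsigned bySGPR = sgprs ? std::min(10u, 800u / sgprs) : 10u;
  unsigned byVGPR = vgprs ? std::min(10u, 256u / vgprs) : 10u;
  return std::min(bySGPR, byVGPR);
}

// Stand-in for the real list scheduler: here it just reverses the region and
// pretends the reordering raised VGPR pressure.
static void runScheduler(Region &r) {
  std::reverse(r.instrs.begin(), r.instrs.end());
  r.maxVGPRs += 40;
}

static void scheduleRegion(Region &r) {
  // Remember the untouched order and the occupancy it achieves.
  std::vector<Instr> original = r.instrs;
  unsigned sgprBefore = r.maxSGPRs, vgprBefore = r.maxVGPRs;
  unsigned wavesBefore = maxWavesForPressure(sgprBefore, vgprBefore);

  runScheduler(r);

  unsigned wavesAfter = maxWavesForPressure(r.maxSGPRs, r.maxVGPRs);
  if (wavesAfter >= wavesBefore)
    return;  // the new schedule is at least as good; keep it

  // Occupancy regressed: revert to the original, untouched order.
  std::printf("reverting schedule: %u -> %u waves\n", wavesBefore, wavesAfter);
  r.instrs = std::move(original);
  r.maxSGPRs = sgprBefore;
  r.maxVGPRs = vgprBefore;
}

int main() {
  Region r;
  r.instrs = {{0}, {1}, {2}};
  r.maxSGPRs = 40;
  r.maxVGPRs = 24;
  scheduleRegion(r);
  std::printf("first instruction afterwards: %d\n", r.instrs.front().id);
  return 0;
}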
@@ -136,8 +136,7 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
ScheduleDAGMILive *DAG =
new ScheduleDAGMILive(C,
llvm::make_unique<GCNMaxOccupancySchedStrategy>(C));
new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
return DAG;
@@ -39,15 +39,30 @@ static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs,
*MF.getFunction()));
}
void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
GenericScheduler::initialize(DAG);
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
// FIXME: This is also necessary, because some passes that run after
// scheduling and before regalloc increase register pressure.
const int ErrorMargin = 3;
SGPRExcessLimit = Context->RegClassInfo
->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass) - ErrorMargin;
VGPRExcessLimit = Context->RegClassInfo
->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass) - ErrorMargin;
SGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF,
SRI->getSGPRPressureSet()) - ErrorMargin;
VGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF,
SRI->getVGPRPressureSet()) - ErrorMargin;
}
void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
bool AtTop, const RegPressureTracker &RPTracker,
const SIRegisterInfo *SRI,
int SGPRPressure,
int VGPRPressure,
int SGPRExcessLimit,
int VGPRExcessLimit,
int SGPRCriticalLimit,
int VGPRCriticalLimit) {
unsigned SGPRPressure,
unsigned VGPRPressure) {
Cand.SU = SU;
Cand.AtTop = AtTop;
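A side note on the new initialize() in the hunk above: the excess limits come from the number of allocatable registers, the critical limits from the per-function pressure-set limit, and both are reduced by a small ErrorMargin because passes running after scheduling can still raise pressure. Below is a hedged, standalone sketch of how such limits are meant to be consulted; all names and numbers are illustrative placeholders, not LLVM API.

#include <cstdio>

struct Limits { unsigned excess, critical; };

// Both limits are shrunk by a small margin, since passes that run between
// scheduling and register allocation can still increase pressure.
static Limits withMargin(unsigned excess, unsigned critical) {
  const unsigned errorMargin = 3;  // same margin the patch uses
  return {excess - errorMargin, critical - errorMargin};
}

// Classify the pressure an instruction would create: above "excess" means
// more registers than are allocatable, above "critical" means the desired
// occupancy can no longer be reached.
static const char *classify(unsigned newPressure, Limits l) {
  if (newPressure >= l.excess)   return "excess";
  if (newPressure >= l.critical) return "critical";
  return "ok";
}

int main() {
  // Hypothetical VGPR numbers: 256 allocatable, a 24-register budget at the
  // desired occupancy.
  Limits vgpr = withMargin(/*excess=*/256, /*critical=*/24);
  for (unsigned p : {16u, 32u, 256u})
    std::printf("pressure %3u -> %s\n", p, classify(p, vgpr));
  return 0;
}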
@@ -67,8 +82,8 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
}
int NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()];
int NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()];
unsigned NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()];
unsigned NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()];
// If two instructions increase the pressure of different register sets
// by the same amount, the generic scheduler will prefer to schedule the
@@ -78,7 +93,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
// only for VGPRs or only for SGPRs.
// FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
const int MaxVGPRPressureInc = 16;
const unsigned MaxVGPRPressureInc = 16;
bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
@@ -87,11 +102,6 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
// to increase the likelihood we don't go over the limits. We should improve
// the analysis to look through dependencies to find the path with the least
// register pressure.
// FIXME: This is also necessary, because some passes that run after
// scheduling and before regalloc increase register pressure.
const int ErrorMargin = 3;
VGPRExcessLimit -= ErrorMargin;
SGPRExcessLimit -= ErrorMargin;
// We only need to update the RPDelta for instructions that increase
// register pressure. Instructions that decrease or keep reg pressure
@@ -112,9 +122,6 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
// register pressure is 'CRITICAL', increasing SGPR and VGPR pressure both
// has the same cost, so we don't need to prefer one over the other.
VGPRCriticalLimit -= ErrorMargin;
SGPRCriticalLimit -= ErrorMargin;
int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
@@ -135,27 +142,16 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
const CandPolicy &ZonePolicy,
const RegPressureTracker &RPTracker,
SchedCandidate &Cand) {
const SISubtarget &ST = DAG->MF.getSubtarget<SISubtarget>();
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
unsigned SGPRPressure = Pressure[SRI->getSGPRPressureSet()];
unsigned VGPRPressure = Pressure[SRI->getVGPRPressureSet()];
unsigned SGPRExcessLimit =
Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
unsigned VGPRExcessLimit =
Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
unsigned MaxWaves = getMaxWaves(SGPRPressure, VGPRPressure, DAG->MF);
unsigned SGPRCriticalLimit = ST.getMaxNumSGPRs(MaxWaves, true);
unsigned VGPRCriticalLimit = ST.getMaxNumVGPRs(MaxWaves);
ReadyQueue &Q = Zone.Available;
for (SUnit *SU : Q) {
SchedCandidate TryCand(ZonePolicy);
initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI,
SGPRPressure, VGPRPressure,
SGPRExcessLimit, VGPRExcessLimit,
SGPRCriticalLimit, VGPRCriticalLimit);
SGPRPressure, VGPRPressure);
// Pass SchedBoundary only when comparing nodes from the same boundary.
SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg);
@@ -311,3 +307,66 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
return SU;
}
void GCNScheduleDAGMILive::schedule() {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
std::vector<MachineInstr*> Unsched;
Unsched.reserve(NumRegionInstrs);
for (auto &I : *this)
Unsched.push_back(&I);
ScheduleDAGMILive::schedule();
// Check the results of scheduling.
GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
std::vector<unsigned> UnschedPressure = getRegPressure().MaxSetPressure;
unsigned MaxSGPRs = std::max(
getTopRPTracker().getPressure().MaxSetPressure[SRI->getSGPRPressureSet()],
getBotRPTracker().getPressure().MaxSetPressure[SRI->getSGPRPressureSet()]);
unsigned MaxVGPRs = std::max(
getTopRPTracker().getPressure().MaxSetPressure[SRI->getVGPRPressureSet()],
getBotRPTracker().getPressure().MaxSetPressure[SRI->getVGPRPressureSet()]);
DEBUG(dbgs() << "Pressure after scheduling:\nSGPR = " << MaxSGPRs
<< "\nVGPR = " << MaxVGPRs << '\n');
if (MaxSGPRs <= S.SGPRCriticalLimit &&
MaxVGPRs <= S.VGPRCriticalLimit) {
DEBUG(dbgs() << "Pressure in desired limits, done.\n");
return;
}
unsigned WavesAfter = getMaxWaves(MaxSGPRs, MaxVGPRs, MF);
unsigned WavesUnsched = getMaxWaves(UnschedPressure[SRI->getSGPRPressureSet()],
UnschedPressure[SRI->getVGPRPressureSet()], MF);
DEBUG(dbgs() << "Occupancy before scheduling: " << WavesUnsched <<
", after " << WavesAfter << ".\n");
if (WavesAfter >= WavesUnsched)
return;
DEBUG(dbgs() << "Attempting to revert scheduling.\n");
RegionEnd = RegionBegin;
for (MachineInstr *MI : Unsched) {
if (MI->getIterator() != RegionEnd) {
BB->remove(MI);
BB->insert(RegionEnd, MI);
if (LIS) {
LIS->handleMove(*MI, true);
RegisterOperands RegOpers;
RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
if (ShouldTrackLaneMasks) {
// Adjust liveness and add missing dead+read-undef flags.
SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
} else {
// Adjust for missing dead-def flags.
RegOpers.detectDeadDefs(*MI, *LIS);
}
}
}
RegionEnd = MI->getIterator();
++RegionEnd;
DEBUG(dbgs() << "Scheduling " << *MI);
}
RegionBegin = Unsched.front()->getIterator();
placeDebugValues();
}
@@ -25,6 +25,7 @@ class SIRegisterInfo;
/// heuristics to determine excess/critical pressure sets. Its goal is to
/// maximize kernel occupancy (i.e. maximum number of waves per simd).
class GCNMaxOccupancySchedStrategy : public GenericScheduler {
friend class GCNScheduleDAGMILive;
SUnit *pickNodeBidirectional(bool &IsTopNode);
@@ -35,18 +36,28 @@ class GCNMaxOccupancySchedStrategy : public GenericScheduler {
void initCandidate(SchedCandidate &Cand, SUnit *SU,
bool AtTop, const RegPressureTracker &RPTracker,
const SIRegisterInfo *SRI,
int SGPRPressure, int VGPRPressure,
int SGPRExcessLimit, int VGPRExcessLimit,
int SGPRCriticalLimit, int VGPRCriticalLimit);
unsigned SGPRPressure, unsigned VGPRPressure);
void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
SchedBoundary *Zone, const SIRegisterInfo *SRI,
unsigned SGPRPressure, unsigned VGPRPressure);
unsigned SGPRExcessLimit;
unsigned VGPRExcessLimit;
unsigned SGPRCriticalLimit;
unsigned VGPRCriticalLimit;
public:
GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
SUnit *pickNode(bool &IsTopNode) override;
void initialize(ScheduleDAGMI *DAG) override;
};
class GCNScheduleDAGMILive : public ScheduleDAGMILive {
public:
GCNScheduleDAGMILive(MachineSchedContext *C,
std::unique_ptr<MachineSchedStrategy> S) :
ScheduleDAGMILive(C, std::move(S)) {}
void schedule() override;
};
} // End namespace llvm
@@ -13,11 +13,13 @@ define void @max_9_sgprs(i32 addrspace(1)* %out1,
i32 addrspace(1)* %out2,
i32 addrspace(1)* %out3,
i32 addrspace(1)* %out4,
i32 %one, i32 %two, i32 %three, i32 %four) #0 {
i32 addrspace(1)* %out5,
i32 %one, i32 %two, i32 %three, i32 %four, i32 %five) #0 {
store i32 %one, i32 addrspace(1)* %out1
store i32 %two, i32 addrspace(1)* %out2
store i32 %three, i32 addrspace(1)* %out3
store i32 %four, i32 addrspace(1)* %out4
store i32 %five, i32 addrspace(1)* %out5
ret void
}
@@ -52,18 +54,21 @@ define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
i32 addrspace(1)* %out3,
i32 addrspace(1)* %out4,
i32 %one, i32 %two, i32 %three, i32 %four) #2 {
store volatile i32 0, i32* undef
%x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
store volatile i32 %x.0, i32 addrspace(1)* undef
%x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
store volatile i32 %x.0, i32 addrspace(1)* undef
%x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
store volatile i32 %x.0, i32 addrspace(1)* undef
%x.3 = call i64 @llvm.amdgcn.dispatch.id()
store volatile i64 %x.3, i64 addrspace(1)* undef
%x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
%x.5 = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
store volatile i32 0, i32* undef
br label %stores
stores:
store volatile i32 %x.0, i32 addrspace(1)* undef
store volatile i32 %x.0, i32 addrspace(1)* undef
store volatile i32 %x.0, i32 addrspace(1)* undef
store volatile i64 %x.3, i64 addrspace(1)* undef
store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
store volatile i8 addrspace(2)* %x.5, i8 addrspace(2)* addrspace(1)* undef
store i32 %one, i32 addrspace(1)* %out1
@@ -234,23 +234,23 @@ define void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 {
; SI: {{buffer|flat}}_load_dwordx4
; SI: {{buffer|flat}}_load_dwordx4
; SI: v_mul_lo_i32
; SI: v_mul_hi_u32
; SI: v_mul_hi_u32
; SI: v_mul_lo_i32
; SI: v_mul_hi_u32
; SI: v_mul_hi_u32
; SI: v_mul_lo_i32
; SI: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32
; SI: v_add_i32_e32
; SI: v_mul_hi_u32
; SI: v_mul_lo_i32
; SI: v_mul_hi_u32
; SI: v_mul_lo_i32
; SI: v_mul_lo_i32
; SI: v_mul_lo_i32
; SI: v_mul_lo_i32
; SI: v_mul_lo_i32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_hi_u32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_lo_i32
; SI: {{buffer|flat}}_store_dwordx4
define void @v_mul_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %aptr, i128 addrspace(1)* %bptr) #0 {
(The diff for one additional file is collapsed and not shown here.)
@@ -12,17 +12,16 @@
; GCN-LABEL: {{^}}main:
; GCN-DAG: s_mov_b32 s11, s12
; GCN-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
; GCN-DAG: s_mov_b32 s14, -1
; SI-DAG: s_mov_b32 s15, 0xe8f000
; VI-DAG: s_mov_b32 s15, 0xe80000
; s11 is offset system SGPR
; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill
; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Reload
; GCN-DAG: s_mov_b32 s[[OFFREG:[0-9]+]], s12
; GCN-DAG: s_mov_b32 s[[DESC0:[0-9]+]], SCRATCH_RSRC_DWORD0
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
; GCN-DAG: s_mov_b32 s{{[0-9]+}}, -1
; SI-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe8f000
; VI-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe80000
; OFFREG is offset system SGPR
; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s[[OFFREG]] offset:{{[0-9]+}} ; 4-byte Folded Spill
; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s[[OFFREG]] offset:{{[0-9]+}} ; 4-byte Folded Reload
; GCN: NumVgprs: 256
; GCN: ScratchSize: 1024