llvm.org GIT mirror llvm / e0ef509
Increased the register pressure limit on x86_64 from 8 to 12 regs. This is the only change in this checkin that may affect the default scheduler. With better register tracking and heuristics, it doesn't make sense to artificially lower the register limit so much.

Added -sched-high-latency-cycles and X86InstrInfo::isHighLatencyDef to give the scheduler a way to account for div and sqrt on targets that don't have an itinerary. It currently defaults to 10 (the actual number doesn't matter much), but only takes effect on non-default schedulers: list-hybrid and list-ilp.

Added several heuristics that can be individually disabled for the non-default sched=list-ilp mode. This helps us determine how much better we can do on a given benchmark than the default scheduler. Certain compute-intensive loops run much faster in this mode with the right set of heuristics, and it doesn't seem to have much negative impact elsewhere. Not all of the heuristics are needed, but we still need to experiment to decide which should be disabled by default for sched=list-ilp.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@127067 91177308-0d34-0410-b5e6-96231b3b80d8 Andrew Trick 8 years ago
6 changed file(s) with 173 addition(s) and 30 deletion(s).
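To experiment with the new mode, the heuristic flags introduced below can be combined with the non-default scheduler selection on the llc command line; a hypothetical invocation (foo.bc is a stand-in input module) might be `llc -pre-RA-sched=list-ilp -disable-sched-live-uses -max-sched-reorder=6 foo.bc`. Each disable-sched-* flag switches off exactly one comparison in the list-ilp priority function, so the contribution of each heuristic can be measured in isolation on a given benchmark.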
639639
640640 virtual int getInstrLatency(const InstrItineraryData *ItinData,
641641 SDNode *Node) const;
642
643 /// isHighLatencyDef - Return true if this opcode has high latency to its
644 /// result.
645 virtual bool isHighLatencyDef(int opc) const { return false; }
642646
643647 /// hasHighOperandLatency - Compute operand latency between a def of 'Reg'
644648 /// and an use in the current loop, return true if the target considered
6969 "disable-sched-cycles", cl::Hidden, cl::init(false),
7070 cl::desc("Disable cycle-level precision during preRA scheduling"));
7171
72 // Temporary sched=list-ilp flags until the heuristics are robust.
73 static cl::opt<bool> DisableSchedRegPressure(
74 "disable-sched-reg-pressure", cl::Hidden, cl::init(false),
75 cl::desc("Disable regpressure priority in sched=list-ilp"));
76 static cl::opt<bool> DisableSchedLiveUses(
77 "disable-sched-live-uses", cl::Hidden, cl::init(false),
78 cl::desc("Disable live use priority in sched=list-ilp"));
79 static cl::opt<bool> DisableSchedStalls(
80 "disable-sched-stalls", cl::Hidden, cl::init(false),
81 cl::desc("Disable no-stall priority in sched=list-ilp"));
82 static cl::opt<bool> DisableSchedCriticalPath(
83 "disable-sched-critical-path", cl::Hidden, cl::init(false),
84 cl::desc("Disable critical path priority in sched=list-ilp"));
85 static cl::opt<bool> DisableSchedHeight(
86 "disable-sched-height", cl::Hidden, cl::init(false),
87 cl::desc("Disable scheduled-height priority in sched=list-ilp"));
88
89 static cl::opt<int> MaxReorderWindow(
90 "max-sched-reorder", cl::Hidden, cl::init(6),
91 cl::desc("Number of instructions to allow ahead of the critical path "
92 "in sched=list-ilp"));
93
94 static cl::opt<unsigned> AvgIPC(
95 "sched-avg-ipc", cl::Hidden, cl::init(1),
96 cl::desc("Average inst/cycle when no target itinerary exists."));
97
98 #ifndef NDEBUG
99 namespace {
100 // For sched=list-ilp, count the number of times each factor comes into play.
101 enum { FactPressureDiff, FactRegUses, FactHeight, FactDepth, FactUllman,
102 NumFactors };
103 }
104 static const char *FactorName[NumFactors] =
105 {"PressureDiff", "RegUses", "Height", "Depth","Ullman"};
106 static int FactorCount[NumFactors];
107 #endif //!NDEBUG
108
72109 namespace {
73110 //===----------------------------------------------------------------------===//
74111 /// ScheduleDAGRRList - The actual register reduction list scheduler
101138
102139 /// MinAvailableCycle - Cycle of the soonest available instruction.
103140 unsigned MinAvailableCycle;
141
142 /// IssueCount - Count instructions issued in this cycle.
143 /// Currently valid only for bottom-up scheduling.
144 unsigned IssueCount;
104145
105146 /// LiveRegDefs - A set of physical registers and their definition
106147 /// that are "live". These nodes must be scheduled before any other nodes that
233274 DEBUG(dbgs()
234275 << "********** List Scheduling BB#" << BB->getNumber()
235276 << " '" << BB->getName() << "' **********\n");
277 #ifndef NDEBUG
278 for (int i = 0; i < NumFactors; ++i) {
279 FactorCount[i] = 0;
280 }
281 #endif //!NDEBUG
236282
237283 CurCycle = 0;
284 IssueCount = 0;
238285 MinAvailableCycle = DisableSchedCycles ? 0 : UINT_MAX;
239286 NumLiveRegs = 0;
240287 LiveRegDefs.resize(TRI->getNumRegs(), NULL);
257304 else
258305 ListScheduleTopDown();
259306
307 #ifndef NDEBUG
308 for (int i = 0; i < NumFactors; ++i) {
309 DEBUG(dbgs() << FactorName[i] << "\t" << FactorCount[i] << "\n");
310 }
311 #endif // !NDEBUG
260312 AvailableQueue->releaseState();
261313 }
262314
382434 if (NextCycle <= CurCycle)
383435 return;
384436
437 IssueCount = 0;
385438 AvailableQueue->setCurCycle(NextCycle);
386439 if (!HazardRec->isEnabled()) {
387440 // Bypass lots of virtual calls in case of long latency.
501554
502555 AvailableQueue->ScheduledNode(SU);
503556
504 // If HazardRec is disabled, count each inst as one cycle.
505 // Advance CurCycle before ReleasePredecessors to avoid useles pushed to
557 // If HazardRec is disabled, and each inst counts as one cycle, then
558 // advance CurCycle before ReleasePredecessors to avoid useless pushes to
506559 // PendingQueue for schedulers that implement HasReadyFilter.
507 if (!HazardRec->isEnabled())
560 if (!HazardRec->isEnabled() && AvgIPC < 2)
508561 AdvanceToCycle(CurCycle + 1);
509562
510563 // Update liveness of predecessors before successors to avoid treating a
532585 // If HazardRec is disabled, the cycle was advanced earlier.
533586 //
534587 // Check AvailableQueue after ReleasePredecessors in case of zero latency.
588 ++IssueCount;
535589 if ((HazardRec->isEnabled() && HazardRec->atIssueLimit())
590 || (!HazardRec->isEnabled() && AvgIPC > 1 && IssueCount == AvgIPC)
536591 || AvailableQueue->empty())
537592 AdvanceToCycle(CurCycle + 1);
538593 }
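Read together, the IssueCount and AvgIPC changes above emulate a machine that issues -sched-avg-ipc instructions per cycle when no hazard recognizer is enabled. A condensed sketch of the resulting control flow (a paraphrase of the two hunks, not verbatim patch code):

    // AvgIPC == 1 (the default) keeps the old advance-after-every-inst behavior.
    if (!HazardRec->isEnabled() && AvgIPC < 2)
      AdvanceToCycle(CurCycle + 1);   // before releasing predecessors

    // AvgIPC > 1: defer the advance until a full issue group has been filled.
    ++IssueCount;
    if ((HazardRec->isEnabled() && HazardRec->atIssueLimit())
        || (!HazardRec->isEnabled() && AvgIPC > 1 && IssueCount == AvgIPC)
        || AvailableQueue->empty())
      AdvanceToCycle(CurCycle + 1);   // AdvanceToCycle resets IssueCount to 0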
14571512
14581513 bool HighRegPressure(const SUnit *SU) const;
14591514
1460 bool MayReduceRegPressure(SUnit *SU);
1515 bool MayReduceRegPressure(SUnit *SU) const;
1516
1517 int RegPressureDiff(SUnit *SU, unsigned &LiveUses) const;
14611518
14621519 void ScheduledNode(SUnit *SU);
14631520
16771734 return false;
16781735 }
16791736
1680 bool RegReductionPQBase::MayReduceRegPressure(SUnit *SU) {
1737 bool RegReductionPQBase::MayReduceRegPressure(SUnit *SU) const {
16811738 const SDNode *N = SU->getNode();
16821739
16831740 if (!N->isMachineOpcode() || !SU->NumSuccs)
16931750 return true;
16941751 }
16951752 return false;
1753 }
1754
1755 // Compute the register pressure contribution by this instruction by counting
1756 // up for uses that are not live and down for defs. Only count register classes
1757 // that are already under high pressure. As a side effect, compute the number of
1758 // uses of registers that are already live.
1759 //
1760 // FIXME: This encompasses the logic in HighRegPressure and MayReduceRegPressure
1761 // so could probably be factored.
1762 int RegReductionPQBase::RegPressureDiff(SUnit *SU, unsigned &LiveUses) const {
1763 LiveUses = 0;
1764 int PDiff = 0;
1765 for (SUnit::const_pred_iterator I = SU->Preds.begin(),E = SU->Preds.end();
1766 I != E; ++I) {
1767 if (I->isCtrl())
1768 continue;
1769 SUnit *PredSU = I->getSUnit();
1770 // NumRegDefsLeft is zero when enough uses of this node have been scheduled
1771 // to cover the number of registers defined (they are all live).
1772 if (PredSU->NumRegDefsLeft == 0) {
1773 if (PredSU->getNode()->isMachineOpcode())
1774 ++LiveUses;
1775 continue;
1776 }
1777 for (ScheduleDAGSDNodes::RegDefIter RegDefPos(PredSU, scheduleDAG);
1778 RegDefPos.IsValid(); RegDefPos.Advance()) {
1779 EVT VT = RegDefPos.GetValue();
1780 unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
1781 if (RegPressure[RCId] >= RegLimit[RCId])
1782 ++PDiff;
1783 }
1784 }
1785 const SDNode *N = SU->getNode();
1786
1787 if (!N->isMachineOpcode() || !SU->NumSuccs)
1788 return PDiff;
1789
1790 unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs();
1791 for (unsigned i = 0; i != NumDefs; ++i) {
1792 EVT VT = N->getValueType(i);
1793 if (!N->hasAnyUseOfValue(i))
1794 continue;
1795 unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
1796 if (RegPressure[RCId] >= RegLimit[RCId])
1797 --PDiff;
1798 }
1799 return PDiff;
16961800 }
16971801
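As a worked illustration of the sign convention (hypothetical operand counts, not from the patch): if a node reads two values that are not yet live, both in register classes already at their pressure limit, and defines one used value in such a class, then PDiff = +1 +1 -1 = +1. LiveUses, meanwhile, counts only the operands whose defining nodes already have every def live (NumRegDefsLeft == 0).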
16981802 void RegReductionPQBase::ScheduledNode(SUnit *SU) {
19972101 static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
19982102 unsigned LPriority = SPQ->getNodePriority(left);
19992103 unsigned RPriority = SPQ->getNodePriority(right);
2000 if (LPriority != RPriority)
2104 if (LPriority != RPriority) {
2105 DEBUG(++FactorCount[FactUllman]);
20012106 return LPriority > RPriority;
2002
2107 }
20032108 // Try schedule def + use closer when Sethi-Ullman numbers are the same.
20042109 // e.g.
20052110 // t1 = op t2, c1
21272232 // No way to compute latency of calls.
21282233 return BURRSort(left, right, SPQ);
21292234
2130 bool LHigh = SPQ->HighRegPressure(left);
2131 bool RHigh = SPQ->HighRegPressure(right);
2132 // Avoid causing spills. If register pressure is high, schedule for
2133 // register pressure reduction.
2134 if (LHigh && !RHigh)
2135 return true;
2136 else if (!LHigh && RHigh)
2137 return false;
2138 else if (!LHigh && !RHigh) {
2139 // Low register pressure situation, schedule to maximize instruction level
2140 // parallelism.
2141 if (left->NumPreds > right->NumPreds)
2142 return false;
2143 else if (left->NumPreds < right->NumPreds)
2144 return true;
2235 unsigned LLiveUses, RLiveUses;
2236 int LPDiff = SPQ->RegPressureDiff(left, LLiveUses);
2237 int RPDiff = SPQ->RegPressureDiff(right, RLiveUses);
2238 if (!DisableSchedRegPressure && LPDiff != RPDiff) {
2239 DEBUG(++FactorCount[FactPressureDiff]);
2240 return LPDiff > RPDiff;
2241 }
2242
2243 if (!DisableSchedLiveUses && LLiveUses != RLiveUses) {
2244 DEBUG(dbgs() << "Live uses " << left->NodeNum << " = " << LLiveUses
2245 << " != " << right->NodeNum << " = " << RLiveUses << "\n");
2246 DEBUG(++FactorCount[FactRegUses]);
2247 return LLiveUses < RLiveUses;
2248 }
2249
2250 bool LStall = BUHasStall(left, left->getHeight(), SPQ);
2251 bool RStall = BUHasStall(right, right->getHeight(), SPQ);
2252 if (!DisableSchedStalls && LStall != RStall) {
2253 DEBUG(++FactorCount[FactHeight]);
2254 return left->getHeight() > right->getHeight();
2255 }
2256
2257 if (!DisableSchedCriticalPath
2258 && abs((long)left->getDepth() - right->getDepth()) > MaxReorderWindow) {
2259 DEBUG(++FactorCount[FactDepth]);
2260 return left->getDepth() < right->getDepth();
2261 }
2262
2263 if (!DisableSchedHeight && left->getHeight() != right->getHeight()) {
2264 DEBUG(++FactorCount[FactHeight]);
2265 return left->getHeight() > right->getHeight();
21452266 }
21462267
21472268 return BURRSort(left, right, SPQ);
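In summary, the list-ilp comparator above tries the following keys in order, each one skippable with its flag (a paraphrase of the hunk, not patch code):

  1. Register pressure difference via RegPressureDiff (-disable-sched-reg-pressure).
  2. Fewer uses of already-live registers wins (-disable-sched-live-uses).
  3. Stall avoidance via BUHasStall, preferring greater height (-disable-sched-stalls).
  4. Critical path: depths more than -max-sched-reorder apart, preferring smaller depth (-disable-sched-critical-path).
  5. Greater scheduled height (-disable-sched-height).
  6. Otherwise fall back to BURRSort, i.e. Sethi-Ullman numbers and the standard tie-breakers.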
2626 #include "llvm/ADT/SmallSet.h"
2727 #include "llvm/ADT/SmallVector.h"
2828 #include "llvm/ADT/Statistic.h"
29 #include "llvm/Support/CommandLine.h"
2930 #include "llvm/Support/Debug.h"
3031 #include "llvm/Support/raw_ostream.h"
3132 using namespace llvm;
3233
3334 STATISTIC(LoadsClustered, "Number of loads clustered together");
35
36 // This allows the latency-based scheduler to notice high latency instructions
37 // without a target itinerary. The choice of number here has more to do with
38 // balancing scheduler heuristics than with the actual machine latency.
39 static cl::opt<int> HighLatencyCycles(
40 "sched-high-latency-cycles", cl::Hidden, cl::init(10),
41 cl::desc("Roughly estimate the number of cycles that 'long latency' "
42 "instructions take for targets with no itinerary"));
3443
3544 ScheduleDAGSDNodes::ScheduleDAGSDNodes(MachineFunction &mf)
3645 : ScheduleDAG(mf),
505514 }
506515
507516 if (!InstrItins || InstrItins->isEmpty()) {
508 SU->Latency = 1;
517 if (SU->getNode() && TII->isHighLatencyDef(SU->getNode()->getOpcode()))
518 SU->Latency = HighLatencyCycles;
519 else
520 SU->Latency = 1;
509521 return;
510522 }
511523
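The net effect on targets without an itinerary: a node whose opcode the target reports via isHighLatencyDef (e.g. X86::DIVSDrm in the X86 hunk below) is assigned Latency = HighLatencyCycles, 10 by default, instead of 1, giving the latency-aware list-hybrid and list-ilp schedulers something to react to.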
12831283 case X86::GR32RegClassID:
12841284 return 4 - FPDiff;
12851285 case X86::GR64RegClassID:
1286 return 8 - FPDiff;
1286 return 12 - FPDiff;
12871287 case X86::VR128RegClassID:
12881288 return Subtarget->is64Bit() ? 10 : 4;
12891289 case X86::VR64RegClassID:
30843084 NopInst.setOpcode(X86::NOOP);
30853085 }
30863086
3087 bool X86InstrInfo::
3088 hasHighOperandLatency(const InstrItineraryData *ItinData,
3089 const MachineRegisterInfo *MRI,
3090 const MachineInstr *DefMI, unsigned DefIdx,
3091 const MachineInstr *UseMI, unsigned UseIdx) const {
3092 switch (DefMI->getOpcode()) {
3087 bool X86InstrInfo::isHighLatencyDef(int opc) const {
3088 switch (opc) {
30933089 default: return false;
30943090 case X86::DIVSDrm:
30953091 case X86::DIVSDrm_Int:
31193115 }
31203116 }
31213117
3118 bool X86InstrInfo::
3119 hasHighOperandLatency(const InstrItineraryData *ItinData,
3120 const MachineRegisterInfo *MRI,
3121 const MachineInstr *DefMI, unsigned DefIdx,
3122 const MachineInstr *UseMI, unsigned UseIdx) const {
3123 return isHighLatencyDef(DefMI->getOpcode());
3124 }
3125
31223126 namespace {
31233127 /// CGBR - Create Global Base Reg pass. This initializes the PIC
31243128 /// global base register for x86-32.
857857 const SmallVectorImpl<MachineOperand> &MOs,
858858 unsigned Size, unsigned Alignment) const;
859859
860 bool isHighLatencyDef(int opc) const;
861
860862 bool hasHighOperandLatency(const InstrItineraryData *ItinData,
861863 const MachineRegisterInfo *MRI,
862864 const MachineInstr *DefMI, unsigned DefIdx,