llvm.org GIT mirror llvm / 9e76e1d
Add a PostMachineScheduler pass with generic implementation.

PostGenericScheduler uses either the new machine model or the hazard checker for top-down scheduling. Most of the infrastructure for PreRA machine scheduling is reused.

With some tuning, this should allow MachineScheduler to become the default for all ARM targets, including cortex-A9, using the new machine model. Likewise, with additional tuning, it should be able to replace PostRAScheduler for all targets.

The PostMachineScheduler pass does not currently run the AntiDepBreaker. There is less need for it on targets that are already running the preRA MachineScheduler, and I want to prove it's necessary before committing to the maintenance burden.

The PostMachineScheduler also currently removes kill flags and adds them all back later. This is a bit ridiculous; I'd prefer passes to use a liveness utility directly rather than rely on flags.

A test case that enables this scheduler will be included in a subsequent checkin that updates the A9 model.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@198122 91177308-0d34-0410-b5e6-96231b3b80d8

Andrew Trick, 5 years ago
1 changed file with 733 additions and 495 deletions.
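The strategy/DAG split is what makes the post-RA reuse cheap: a MachineSchedStrategy only decides which node to issue next, while ScheduleDAGMI (constructed with IsPostRA=true, as createGenericSchedPostRA does near the bottom of this diff) and the pass driver handle DAG building, region iteration, and the kill-flag fixup. As a rough sketch of that contract, and not part of this patch, the trivial strategy below issues nodes top-down in source order; the MachineSchedStrategy interface is inferred from the overrides visible in this diff, so treat the exact signatures as assumptions.

namespace {
// Sketch only: a minimal top-down, source-order strategy that the reused
// infrastructure could drive post-RA. All of its "heuristics" live in
// pickNode(); everything else is provided by ScheduleDAGMI.
class TrivialPostRAStrategy : public MachineSchedStrategy {
  std::vector<SUnit*> Ready; // nodes whose predecessors have all issued
public:
  virtual void initialize(ScheduleDAGMI *DAG) { Ready.clear(); }

  virtual SUnit *pickNode(bool &IsTopNode) {
    IsTopNode = true;
    if (Ready.empty())
      return NULL; // this region is fully scheduled
    // Lowest NodeNum == original instruction order.
    std::vector<SUnit*>::iterator Best = Ready.begin();
    for (std::vector<SUnit*>::iterator I = Ready.begin(), E = Ready.end();
         I != E; ++I) {
      if ((*I)->NodeNum < (*Best)->NodeNum)
        Best = I;
    }
    SUnit *SU = *Best;
    Ready.erase(Best);
    return SU;
  }

  // No per-zone state to update; the DAG tracks cycles for us.
  virtual void schedNode(SUnit *SU, bool IsTopNode) {}

  virtual void releaseTopNode(SUnit *SU) { Ready.push_back(SU); }
  virtual void releaseBottomNode(SUnit *SU) {} // bottom-up roots unused
};
} // namespace

// Pairing it with the non-liveness DAG mirrors createGenericSchedPostRA():
//   new ScheduleDAGMI(C, new TrivialPostRAStrategy(), /*IsPostRA=*/true);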
203203
204204 /// Forward declare the standard machine scheduler. This will be used as the
205205 /// default scheduler if the target does not set a default.
206 static ScheduleDAGInstrs *createGenericSched(MachineSchedContext *C);
207 static ScheduleDAGInstrs *createRawGenericSched(MachineSchedContext *C);
206 static ScheduleDAGInstrs *createGenericSchedLive(MachineSchedContext *C);
207 static ScheduleDAGInstrs *createGenericSchedPostRA(MachineSchedContext *C);
208208
209209 /// Decrement this iterator until reaching the top or a non-debug instr.
210210 static MachineBasicBlock::const_iterator
263263 return Scheduler;
264264
265265 // Default to GenericScheduler.
266 return createGenericSched(this);
266 return createGenericSchedLive(this);
267267 }
268268
269269 /// Instantiate a ScheduleDAGInstrs for PostRA scheduling that will be owned by
276276 return Scheduler;
277277
278278 // Default to GenericScheduler.
279 // return createRawGenericSched(this);
280 return NULL;
279 return createGenericSchedPostRA(this);
281280 }
282281
283282 /// Top-level MachineScheduler pass driver.
345344 return true;
346345 }
347346
347 /// Return true if the given instruction should not be included in a scheduling
348 /// region.
349 ///
350 /// MachineScheduler does not currently support scheduling across calls. To
351 /// handle calls, the DAG builder needs to be modified to create register
352 /// anti/output dependencies on the registers clobbered by the call's regmask
353 /// operand. In PreRA scheduling, the stack pointer adjustment already prevents
354 /// scheduling across calls. In PostRA scheduling, we need the isCall check to enforce
355 /// the boundary, but there would be no benefit to postRA scheduling across
356 /// calls this late anyway.
357 static bool isSchedBoundary(MachineBasicBlock::iterator MI,
358 MachineBasicBlock *MBB,
359 MachineFunction *MF,
360 const TargetInstrInfo *TII,
361 bool IsPostRA) {
362 return MI->isCall() || TII->isSchedulingBoundary(MI, MBB, *MF);
363 }
364
348365 /// Main driver for both MachineScheduler and PostMachineScheduler.
349366 void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler) {
350367 const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
368 bool IsPostRA = Scheduler.isPostRA();
351369
352370 // Visit all machine basic blocks.
353371 //
368386 // The Scheduler may insert instructions during either schedule() or
369387 // exitRegion(), even for empty regions. So the local iterators 'I' and
370388 // 'RegionEnd' are invalid across these calls.
371 unsigned RemainingInstrs = MBB->size();
389 //
390 // MBB::size() uses instr_iterator to count. Here we need a bundle to count
391 // as a single instruction.
392 unsigned RemainingInstrs = std::distance(MBB->begin(), MBB->end());
372393 for(MachineBasicBlock::iterator RegionEnd = MBB->end();
373394 RegionEnd != MBB->begin(); RegionEnd = Scheduler.begin()) {
374395
375396 // Avoid decrementing RegionEnd for blocks with no terminator.
376397 if (RegionEnd != MBB->end()
377 || TII->isSchedulingBoundary(llvm::prior(RegionEnd), MBB, *MF)) {
398 || isSchedBoundary(llvm::prior(RegionEnd), MBB, MF, TII, IsPostRA)) {
378399 --RegionEnd;
379400 // Count the boundary instruction.
380401 --RemainingInstrs;
385406 unsigned NumRegionInstrs = 0;
386407 MachineBasicBlock::iterator I = RegionEnd;
387408 for(;I != MBB->begin(); --I, --RemainingInstrs, ++NumRegionInstrs) {
388 if (TII->isSchedulingBoundary(llvm::prior(I), MBB, *MF))
409 if (isSchedBoundary(llvm::prior(I), MBB, MF, TII, IsPostRA))
389410 break;
390411 }
391412 // Notify the scheduler of the region, even if we may skip scheduling
399420 Scheduler.exitRegion();
400421 continue;
401422 }
402 DEBUG(dbgs() << "********** MI Scheduling **********\n");
423 DEBUG(dbgs() << "********** " << ((Scheduler.isPostRA()) ? "PostRA " : "")
424 << "MI Scheduling **********\n");
403425 DEBUG(dbgs() << MF->getName()
404426 << ":BB#" << MBB->getNumber() << " " << MBB->getName()
405427 << "\n From: " << *I << " To: ";
421443 }
422444 assert(RemainingInstrs == 0 && "Instruction count mismatch!");
423445 Scheduler.finishBlock();
446 if (Scheduler.isPostRA()) {
447 // FIXME: Ideally, no further passes should rely on kill flags. However,
448 // thumb2 size reduction is currently an exception.
449 Scheduler.fixupKills(MBB);
450 }
424451 }
425452 Scheduler.finalizeSchedule();
426453 }
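A side note on the RemainingInstrs change above: the two counts only differ when the block contains bundles. The small sketch below (function name hypothetical, assumes LLVM headers; purely illustrative) spells out the distinction the comment is making.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include <iterator>

// MBB.size() walks instr_iterator and counts every MI inside a bundle,
// while std::distance over the top-level iterators counts each bundle once,
// which matches how the region loop in scheduleRegions() visits instructions.
static void countSchedulableUnits(const llvm::MachineBasicBlock &MBB,
                                  unsigned &EveryMI, unsigned &TopLevel) {
  EveryMI = MBB.size();
  TopLevel = static_cast<unsigned>(std::distance(MBB.begin(), MBB.end()));
}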
15011528 //===----------------------------------------------------------------------===//
15021529 // MachineSchedStrategy helpers used by GenericScheduler, GenericPostScheduler
15031530 // and possibly other custom schedulers.
1504 // ===----------------------------------------------------------------------===/
1531 //===----------------------------------------------------------------------===//
15051532
15061533 static const unsigned InvalidCycle = ~0U;
15071534
15301557 IsResourceLimited = false;
15311558 ReservedCycles.clear();
15321559 #ifndef NDEBUG
1560 // Track the maximum number of stall cycles that could arise either from the
1561 // latency of a DAG edge or the number of cycles that a processor resource is
1562 // reserved (SchedBoundary::ReservedCycles).
15331563 MaxObservedLatency = 0;
15341564 #endif
15351565 // Reserve a zero-count for invalid CritResIdx.
16151645 ///
16161646 /// TODO: Also check whether the SU must start a new group.
16171647 bool SchedBoundary::checkHazard(SUnit *SU) {
1618 if (HazardRec->isEnabled())
1619 return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;
1620
1648 if (HazardRec->isEnabled()
1649 && HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard) {
1650 return true;
1651 }
16211652 unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
16221653 if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) {
16231654 DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops="
19031934 PI = SchedModel->getWriteProcResBegin(SC),
19041935 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
19051936 unsigned PIdx = PI->ProcResourceIdx;
1906 if (SchedModel->getProcResource(PIdx)->BufferSize == 0)
1937 if (SchedModel->getProcResource(PIdx)->BufferSize == 0) {
19071938 ReservedCycles[PIdx] = isTop() ? NextCycle + PI->Cycles : NextCycle;
1939 #ifndef NDEBUG
1940 MaxObservedLatency = std::max(PI->Cycles, MaxObservedLatency);
1941 #endif
1942 }
19081943 }
19091944 }
19101945 }
19391974 // bump the cycle to avoid uselessly checking everything in the readyQ.
19401975 CurrMOps += IncMOps;
19411976 while (CurrMOps >= SchedModel->getIssueWidth()) {
1942 bumpCycle(++NextCycle);
19431977 DEBUG(dbgs() << " *** Max MOps " << CurrMOps
19441978 << " at cycle " << CurrCycle << '\n');
1979 bumpCycle(++NextCycle);
19451980 }
19461981 DEBUG(dumpScheduledState());
19471982 }
20442079 #endif
20452080
20462081 //===----------------------------------------------------------------------===//
2047 // GenericScheduler - Implementation of the generic MachineSchedStrategy.
2082 // GenericScheduler - Generic implementation of MachineSchedStrategy.
20482083 //===----------------------------------------------------------------------===//
20492084
20502085 namespace {
2051 /// GenericScheduler shrinks the unscheduled zone using heuristics to balance
2052 /// the schedule.
2053 class GenericScheduler : public MachineSchedStrategy {
2086 /// Base class for GenericScheduler. This class maintains information about
2087 /// scheduling candidates based on TargetSchedModel, making it easy to implement
2088 /// heuristics for either preRA or postRA scheduling.
2089 class GenericSchedulerBase : public MachineSchedStrategy {
20542090 public:
20552091 /// Represent the type of SchedCandidate found within a single queue.
20562092 /// pickNodeBidirectional depends on these listed by decreasing priority.
20602096 TopDepthReduce, TopPathReduce, NextDefUse, NodeOrder};
20612097
20622098 #ifndef NDEBUG
2063 static const char *getReasonStr(GenericScheduler::CandReason Reason);
2099 static const char *getReasonStr(GenericSchedulerBase::CandReason Reason);
20642100 #endif
20652101
20662102 /// Policy for scheduling the next instruction in the candidate's zone.
21282164 bool isRepeat(CandReason R) { return RepeatReasonSet & (1 << R); }
21292165 void setRepeat(CandReason R) { RepeatReasonSet |= (1 << R); }
21302166
2131 void initResourceDelta(const ScheduleDAGMILive *DAG,
2167 void initResourceDelta(const ScheduleDAGMI *DAG,
21322168 const TargetSchedModel *SchedModel);
21332169 };
21342170
2135 private:
2171 protected:
21362172 const MachineSchedContext *Context;
2137 ScheduleDAGMILive *DAG;
21382173 const TargetSchedModel *SchedModel;
21392174 const TargetRegisterInfo *TRI;
21402175
2141 // State of the top and bottom scheduled instruction boundaries.
21422176 SchedRemainder Rem;
2143 SchedBoundary Top;
2144 SchedBoundary Bot;
2145
2146 MachineSchedPolicy RegionPolicy;
2147 public:
2148 GenericScheduler(const MachineSchedContext *C):
2149 Context(C), DAG(0), SchedModel(0), TRI(0),
2150 Top(SchedBoundary::TopQID, "TopQ"), Bot(SchedBoundary::BotQID, "BotQ") {}
2151
2152 virtual void initPolicy(MachineBasicBlock::iterator Begin,
2153 MachineBasicBlock::iterator End,
2154 unsigned NumRegionInstrs);
2155
2156 bool shouldTrackPressure() const { return RegionPolicy.ShouldTrackPressure; }
2157
2158 virtual void initialize(ScheduleDAGMI *dag);
2159
2160 virtual SUnit *pickNode(bool &IsTopNode);
2161
2162 virtual void schedNode(SUnit *SU, bool IsTopNode);
2163
2164 virtual void releaseTopNode(SUnit *SU) { Top.releaseTopNode(SU); }
2165
2166 virtual void releaseBottomNode(SUnit *SU) { Bot.releaseBottomNode(SU); }
2167
2168 virtual void registerRoots();
2169
21702177 protected:
2171 void checkAcyclicLatency();
2172
2173 void setPolicy(CandPolicy &Policy, SchedBoundary &CurrZone,
2174 SchedBoundary &OtherZone);
2175
2176 void tryCandidate(SchedCandidate &Cand,
2177 SchedCandidate &TryCand,
2178 SchedBoundary &Zone,
2179 const RegPressureTracker &RPTracker,
2180 RegPressureTracker &TempTracker);
2181
2182 SUnit *pickNodeBidirectional(bool &IsTopNode);
2183
2184 void pickNodeFromQueue(SchedBoundary &Zone,
2185 const RegPressureTracker &RPTracker,
2186 SchedCandidate &Candidate);
2187
2188 void reschedulePhysRegCopies(SUnit *SU, bool isTop);
2178 GenericSchedulerBase(const MachineSchedContext *C):
2179 Context(C), SchedModel(0), TRI(0) {}
2180
2181 void setPolicy(CandPolicy &Policy, bool IsPostRA, SchedBoundary &CurrZone,
2182 SchedBoundary *OtherZone);
21892183
21902184 #ifndef NDEBUG
21912185 void traceCandidate(const SchedCandidate &Cand);
21932187 };
21942188 } // namespace
21952189
2196 void GenericScheduler::initialize(ScheduleDAGMI *dag) {
2197 assert(dag->hasVRegLiveness() &&
2198 "(PreRA)GenericScheduler needs vreg liveness");
2199 DAG = static_cast<ScheduleDAGMILive*>(dag);
2200 SchedModel = DAG->getSchedModel();
2201 TRI = DAG->TRI;
2202
2203 Rem.init(DAG, SchedModel);
2204 Top.init(DAG, SchedModel, &Rem);
2205 Bot.init(DAG, SchedModel, &Rem);
2206
2207 // Initialize resource counts.
2208
2209 // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
2210 // are disabled, then these HazardRecs will be disabled.
2211 const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
2212 const TargetMachine &TM = DAG->MF.getTarget();
2213 if (!Top.HazardRec) {
2214 Top.HazardRec =
2215 TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
2216 }
2217 if (!Bot.HazardRec) {
2218 Bot.HazardRec =
2219 TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
2220 }
2221 }
2222
2223 /// Initialize the per-region scheduling policy.
2224 void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
2225 MachineBasicBlock::iterator End,
2226 unsigned NumRegionInstrs) {
2227 const TargetMachine &TM = Context->MF->getTarget();
2228
2229 // Avoid setting up the register pressure tracker for small regions to save
2230 // compile time. As a rough heuristic, only track pressure when the number of
2231 // schedulable instructions exceeds half the integer register file.
2232 unsigned NIntRegs = Context->RegClassInfo->getNumAllocatableRegs(
2233 TM.getTargetLowering()->getRegClassFor(MVT::i32));
2234
2235 RegionPolicy.ShouldTrackPressure = NumRegionInstrs > (NIntRegs / 2);
2236
2237 // For generic targets, we default to bottom-up, because it's simpler and more
2238 // compile-time optimizations have been implemented in that direction.
2239 RegionPolicy.OnlyBottomUp = true;
2240
2241 // Allow the subtarget to override default policy.
2242 const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
2243 ST.overrideSchedPolicy(RegionPolicy, Begin, End, NumRegionInstrs);
2244
2245 // After subtarget overrides, apply command line options.
2246 if (!EnableRegPressure)
2247 RegionPolicy.ShouldTrackPressure = false;
2248
2249 // Check -misched-topdown/bottomup can force or unforce scheduling direction.
2250 // e.g. -misched-bottomup=false allows scheduling in both directions.
2251 assert((!ForceTopDown || !ForceBottomUp) &&
2252 "-misched-topdown incompatible with -misched-bottomup");
2253 if (ForceBottomUp.getNumOccurrences() > 0) {
2254 RegionPolicy.OnlyBottomUp = ForceBottomUp;
2255 if (RegionPolicy.OnlyBottomUp)
2256 RegionPolicy.OnlyTopDown = false;
2257 }
2258 if (ForceTopDown.getNumOccurrences() > 0) {
2259 RegionPolicy.OnlyTopDown = ForceTopDown;
2260 if (RegionPolicy.OnlyTopDown)
2261 RegionPolicy.OnlyBottomUp = false;
2262 }
2263 }
2264
2265 /// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
2266 /// critical path by more cycles than it takes to drain the instruction buffer.
2267 /// We estimate an upper bounds on in-flight instructions as:
2268 ///
2269 /// CyclesPerIteration = max( CyclicPath, Loop-Resource-Height )
2270 /// InFlightIterations = AcyclicPath / CyclesPerIteration
2271 /// InFlightResources = InFlightIterations * LoopResources
2272 ///
2273 /// TODO: Check execution resources in addition to IssueCount.
2274 void GenericScheduler::checkAcyclicLatency() {
2275 if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath)
2276 return;
2277
2278 // Scaled number of cycles per loop iteration.
2279 unsigned IterCount =
2280 std::max(Rem.CyclicCritPath * SchedModel->getLatencyFactor(),
2281 Rem.RemIssueCount);
2282 // Scaled acyclic critical path.
2283 unsigned AcyclicCount = Rem.CriticalPath * SchedModel->getLatencyFactor();
2284 // InFlightCount = (AcyclicPath / IterCycles) * InstrPerLoop
2285 unsigned InFlightCount =
2286 (AcyclicCount * Rem.RemIssueCount + IterCount-1) / IterCount;
2287 unsigned BufferLimit =
2288 SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor();
2289
2290 Rem.IsAcyclicLatencyLimited = InFlightCount > BufferLimit;
2291
2292 DEBUG(dbgs() << "IssueCycles="
2293 << Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c "
2294 << "IterCycles=" << IterCount / SchedModel->getLatencyFactor()
2295 << "c NumIters=" << (AcyclicCount + IterCount-1) / IterCount
2296 << " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor()
2297 << "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n";
2298 if (Rem.IsAcyclicLatencyLimited)
2299 dbgs() << " ACYCLIC LATENCY LIMIT\n");
2300 }
2301
2302 void GenericScheduler::registerRoots() {
2303 Rem.CriticalPath = DAG->ExitSU.getDepth();
2304
2305 // Some roots may not feed into ExitSU. Check all of them in case.
2306 for (std::vector<SUnit*>::const_iterator
2307 I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) {
2308 if ((*I)->getDepth() > Rem.CriticalPath)
2309 Rem.CriticalPath = (*I)->getDepth();
2310 }
2311 DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n');
2312
2313 if (EnableCyclicPath) {
2314 Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
2315 checkAcyclicLatency();
2190 void GenericSchedulerBase::SchedCandidate::
2191 initResourceDelta(const ScheduleDAGMI *DAG,
2192 const TargetSchedModel *SchedModel) {
2193 if (!Policy.ReduceResIdx && !Policy.DemandResIdx)
2194 return;
2195
2196 const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
2197 for (TargetSchedModel::ProcResIter
2198 PI = SchedModel->getWriteProcResBegin(SC),
2199 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
2200 if (PI->ProcResourceIdx == Policy.ReduceResIdx)
2201 ResDelta.CritResources += PI->Cycles;
2202 if (PI->ProcResourceIdx == Policy.DemandResIdx)
2203 ResDelta.DemandedResources += PI->Cycles;
23162204 }
23172205 }
23182206
23192207 /// Set the CandPolicy for a scheduling zone given the current resources and
23202208 /// latencies inside and outside the zone.
2321 void GenericScheduler::setPolicy(CandPolicy &Policy, SchedBoundary &CurrZone,
2322 SchedBoundary &OtherZone) {
2209 void GenericSchedulerBase::setPolicy(CandPolicy &Policy,
2210 bool IsPostRA,
2211 SchedBoundary &CurrZone,
2212 SchedBoundary *OtherZone) {
23232213 // Apply preemptive heuristics based on the total latency and resources
23242214 // inside and outside this zone. Potential stalls should be considered before
23252215 // following this policy.
23452235
23462236 // Compute the critical resource outside the zone.
23472237 unsigned OtherCritIdx;
2348 unsigned OtherCount = OtherZone.getOtherResourceCount(OtherCritIdx);
2238 unsigned OtherCount =
2239 OtherZone ? OtherZone->getOtherResourceCount(OtherCritIdx) : 0;
23492240
23502241 bool OtherResLimited = false;
23512242 if (SchedModel->hasInstrSchedModel()) {
23522243 unsigned LFactor = SchedModel->getLatencyFactor();
23532244 OtherResLimited = (int)(OtherCount - (RemLatency * LFactor)) > (int)LFactor;
23542245 }
2355 if (!OtherResLimited
2356 && (RemLatency + CurrZone.getCurrCycle() > Rem.CriticalPath)) {
2357 Policy.ReduceLatency |= true;
2358 DEBUG(dbgs() << " " << CurrZone.Available.getName() << " RemainingLatency "
2359 << RemLatency << " + " << CurrZone.getCurrCycle() << "c > CritPath "
2360 << Rem.CriticalPath << "\n");
2246 // Schedule aggressively for latency in PostRA mode. We don't check for
2247 // acyclic latency during PostRA, and highly out-of-order processors will
2248 // skip PostRA scheduling.
2249 if (!OtherResLimited) {
2250 if (IsPostRA || (RemLatency + CurrZone.getCurrCycle() > Rem.CriticalPath)) {
2251 Policy.ReduceLatency |= true;
2252 DEBUG(dbgs() << " " << CurrZone.Available.getName()
2253 << " RemainingLatency " << RemLatency << " + "
2254 << CurrZone.getCurrCycle() << "c > CritPath "
2255 << Rem.CriticalPath << "\n");
2256 }
23612257 }
23622258 // If the same resource is limiting inside and outside the zone, do nothing.
23632259 if (CurrZone.getZoneCritResIdx() == OtherCritIdx)
23822278 Policy.DemandResIdx = OtherCritIdx;
23832279 }
23842280
2385 void GenericScheduler::SchedCandidate::
2386 initResourceDelta(const ScheduleDAGMILive *DAG,
2387 const TargetSchedModel *SchedModel) {
2388 if (!Policy.ReduceResIdx && !Policy.DemandResIdx)
2389 return;
2390
2391 const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
2392 for (TargetSchedModel::ProcResIter
2393 PI = SchedModel->getWriteProcResBegin(SC),
2394 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
2395 if (PI->ProcResourceIdx == Policy.ReduceResIdx)
2396 ResDelta.CritResources += PI->Cycles;
2397 if (PI->ProcResourceIdx == Policy.DemandResIdx)
2398 ResDelta.DemandedResources += PI->Cycles;
2399 }
2400 }
2401
2402 /// Return true if this heuristic determines order.
2403 static bool tryLess(int TryVal, int CandVal,
2404 GenericScheduler::SchedCandidate &TryCand,
2405 GenericScheduler::SchedCandidate &Cand,
2406 GenericScheduler::CandReason Reason) {
2407 if (TryVal < CandVal) {
2408 TryCand.Reason = Reason;
2409 return true;
2410 }
2411 if (TryVal > CandVal) {
2412 if (Cand.Reason > Reason)
2413 Cand.Reason = Reason;
2414 return true;
2415 }
2416 Cand.setRepeat(Reason);
2417 return false;
2418 }
2419
2420 static bool tryGreater(int TryVal, int CandVal,
2421 GenericScheduler::SchedCandidate &TryCand,
2422 GenericScheduler::SchedCandidate &Cand,
2423 GenericScheduler::CandReason Reason) {
2424 if (TryVal > CandVal) {
2425 TryCand.Reason = Reason;
2426 return true;
2427 }
2428 if (TryVal < CandVal) {
2429 if (Cand.Reason > Reason)
2430 Cand.Reason = Reason;
2431 return true;
2432 }
2433 Cand.setRepeat(Reason);
2434 return false;
2435 }
2436
2437 static bool tryPressure(const PressureChange &TryP,
2438 const PressureChange &CandP,
2439 GenericScheduler::SchedCandidate &TryCand,
2440 GenericScheduler::SchedCandidate &Cand,
2441 GenericScheduler::CandReason Reason) {
2442 int TryRank = TryP.getPSetOrMax();
2443 int CandRank = CandP.getPSetOrMax();
2444 // If both candidates affect the same set, go with the smallest increase.
2445 if (TryRank == CandRank) {
2446 return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand,
2447 Reason);
2448 }
2449 // If one candidate decreases and the other increases, go with it.
2450 // Invalid candidates have UnitInc==0.
2451 if (tryLess(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand,
2452 Reason)) {
2453 return true;
2454 }
2455 // If the candidates are decreasing pressure, reverse priority.
2456 if (TryP.getUnitInc() < 0)
2457 std::swap(TryRank, CandRank);
2458 return tryGreater(TryRank, CandRank, TryCand, Cand, Reason);
2459 }
2460
2461 static unsigned getWeakLeft(const SUnit *SU, bool isTop) {
2462 return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft;
2463 }
2464
2465 /// Minimize physical register live ranges. Regalloc wants them adjacent to
2466 /// their physreg def/use.
2467 ///
2468 /// FIXME: This is an unnecessary check on the critical path. Most are root/leaf
2469 /// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled
2470 /// with the operation that produces or consumes the physreg. We'll do this when
2471 /// regalloc has support for parallel copies.
2472 static int biasPhysRegCopy(const SUnit *SU, bool isTop) {
2473 const MachineInstr *MI = SU->getInstr();
2474 if (!MI->isCopy())
2475 return 0;
2476
2477 unsigned ScheduledOper = isTop ? 1 : 0;
2478 unsigned UnscheduledOper = isTop ? 0 : 1;
2479 // If we have already scheduled the physreg produce/consumer, immediately
2480 // schedule the copy.
2481 if (TargetRegisterInfo::isPhysicalRegister(
2482 MI->getOperand(ScheduledOper).getReg()))
2483 return 1;
2484 // If the physreg is at the boundary, defer it. Otherwise schedule it
2485 // immediately to free the dependent. We can hoist the copy later.
2486 bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft;
2487 if (TargetRegisterInfo::isPhysicalRegister(
2488 MI->getOperand(UnscheduledOper).getReg()))
2489 return AtBoundary ? -1 : 1;
2490 return 0;
2491 }
2492
2493 static bool tryLatency(GenericScheduler::SchedCandidate &TryCand,
2494 GenericScheduler::SchedCandidate &Cand,
2495 SchedBoundary &Zone) {
2496 if (Zone.isTop()) {
2497 if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
2498 if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
2499 TryCand, Cand, GenericScheduler::TopDepthReduce))
2500 return true;
2501 }
2502 if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
2503 TryCand, Cand, GenericScheduler::TopPathReduce))
2504 return true;
2505 }
2506 else {
2507 if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
2508 if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
2509 TryCand, Cand, GenericScheduler::BotHeightReduce))
2510 return true;
2511 }
2512 if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
2513 TryCand, Cand, GenericScheduler::BotPathReduce))
2514 return true;
2515 }
2516 return false;
2517 }
2518
2519 /// Apply a set of heursitics to a new candidate. Heuristics are currently
2520 /// hierarchical. This may be more efficient than a graduated cost model because
2521 /// we don't need to evaluate all aspects of the model for each node in the
2522 /// queue. But it's really done to make the heuristics easier to debug and
2523 /// statistically analyze.
2524 ///
2525 /// \param Cand provides the policy and current best candidate.
2526 /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
2527 /// \param Zone describes the scheduled zone that we are extending.
2528 /// \param RPTracker describes reg pressure within the scheduled zone.
2529 /// \param TempTracker is a scratch pressure tracker to reuse in queries.
2530 void GenericScheduler::tryCandidate(SchedCandidate &Cand,
2531 SchedCandidate &TryCand,
2532 SchedBoundary &Zone,
2533 const RegPressureTracker &RPTracker,
2534 RegPressureTracker &TempTracker) {
2535
2536 if (DAG->isTrackingPressure()) {
2537 // Always initialize TryCand's RPDelta.
2538 if (Zone.isTop()) {
2539 TempTracker.getMaxDownwardPressureDelta(
2540 TryCand.SU->getInstr(),
2541 TryCand.RPDelta,
2542 DAG->getRegionCriticalPSets(),
2543 DAG->getRegPressure().MaxSetPressure);
2544 }
2545 else {
2546 if (VerifyScheduling) {
2547 TempTracker.getMaxUpwardPressureDelta(
2548 TryCand.SU->getInstr(),
2549 &DAG->getPressureDiff(TryCand.SU),
2550 TryCand.RPDelta,
2551 DAG->getRegionCriticalPSets(),
2552 DAG->getRegPressure().MaxSetPressure);
2553 }
2554 else {
2555 RPTracker.getUpwardPressureDelta(
2556 TryCand.SU->getInstr(),
2557 DAG->getPressureDiff(TryCand.SU),
2558 TryCand.RPDelta,
2559 DAG->getRegionCriticalPSets(),
2560 DAG->getRegPressure().MaxSetPressure);
2561 }
2562 }
2563 }
2564 DEBUG(if (TryCand.RPDelta.Excess.isValid())
2565 dbgs() << " SU(" << TryCand.SU->NodeNum << ") "
2566 << TRI->getRegPressureSetName(TryCand.RPDelta.Excess.getPSet())
2567 << ":" << TryCand.RPDelta.Excess.getUnitInc() << "\n");
2568
2569 // Initialize the candidate if needed.
2570 if (!Cand.isValid()) {
2571 TryCand.Reason = NodeOrder;
2572 return;
2573 }
2574
2575 if (tryGreater(biasPhysRegCopy(TryCand.SU, Zone.isTop()),
2576 biasPhysRegCopy(Cand.SU, Zone.isTop()),
2577 TryCand, Cand, PhysRegCopy))
2578 return;
2579
2580 // Avoid exceeding the target's limit. If signed PSetID is negative, it is
2581 // invalid; convert it to INT_MAX to give it lowest priority.
2582 if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess,
2583 Cand.RPDelta.Excess,
2584 TryCand, Cand, RegExcess))
2585 return;
2586
2587 // Avoid increasing the max critical pressure in the scheduled region.
2588 if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax,
2589 Cand.RPDelta.CriticalMax,
2590 TryCand, Cand, RegCritical))
2591 return;
2592
2593 // For loops that are acyclic path limited, aggressively schedule for latency.
2594 // This can result in very long dependence chains scheduled in sequence, so
2595 // once every cycle (when CurrMOps == 0), switch to normal heuristics.
2596 if (Rem.IsAcyclicLatencyLimited && !Zone.getCurrMOps()
2597 && tryLatency(TryCand, Cand, Zone))
2598 return;
2599
2600 // Prioritize instructions that read unbuffered resources by stall cycles.
2601 if (tryLess(Zone.getLatencyStallCycles(TryCand.SU),
2602 Zone.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
2603 return;
2604
2605 // Keep clustered nodes together to encourage downstream peephole
2606 // optimizations which may reduce resource requirements.
2607 //
2608 // This is a best effort to set things up for a post-RA pass. Optimizations
2609 // like generating loads of multiple registers should ideally be done within
2610 // the scheduler pass by combining the loads during DAG postprocessing.
2611 const SUnit *NextClusterSU =
2612 Zone.isTop() ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
2613 if (tryGreater(TryCand.SU == NextClusterSU, Cand.SU == NextClusterSU,
2614 TryCand, Cand, Cluster))
2615 return;
2616
2617 // Weak edges are for clustering and other constraints.
2618 if (tryLess(getWeakLeft(TryCand.SU, Zone.isTop()),
2619 getWeakLeft(Cand.SU, Zone.isTop()),
2620 TryCand, Cand, Weak)) {
2621 return;
2622 }
2623 // Avoid increasing the max pressure of the entire region.
2624 if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax,
2625 Cand.RPDelta.CurrentMax,
2626 TryCand, Cand, RegMax))
2627 return;
2628
2629 // Avoid critical resource consumption and balance the schedule.
2630 TryCand.initResourceDelta(DAG, SchedModel);
2631 if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
2632 TryCand, Cand, ResourceReduce))
2633 return;
2634 if (tryGreater(TryCand.ResDelta.DemandedResources,
2635 Cand.ResDelta.DemandedResources,
2636 TryCand, Cand, ResourceDemand))
2637 return;
2638
2639 // Avoid serializing long latency dependence chains.
2640 // For acyclic path limited loops, latency was already checked above.
2641 if (Cand.Policy.ReduceLatency && !Rem.IsAcyclicLatencyLimited
2642 && tryLatency(TryCand, Cand, Zone)) {
2643 return;
2644 }
2645
2646 // Prefer immediate defs/users of the last scheduled instruction. This is a
2647 // local pressure avoidance strategy that also makes the machine code
2648 // readable.
2649 if (tryGreater(Zone.isNextSU(TryCand.SU), Zone.isNextSU(Cand.SU),
2650 TryCand, Cand, NextDefUse))
2651 return;
2652
2653 // Fall through to original instruction order.
2654 if ((Zone.isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum)
2655 || (!Zone.isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
2656 TryCand.Reason = NodeOrder;
2657 }
2658 }
2659
26602281 #ifndef NDEBUG
2661 const char *GenericScheduler::getReasonStr(
2662 GenericScheduler::CandReason Reason) {
2282 const char *GenericSchedulerBase::getReasonStr(
2283 GenericSchedulerBase::CandReason Reason) {
26632284 switch (Reason) {
26642285 case NoCand: return "NOCAND ";
26652286 case PhysRegCopy: return "PREG-COPY";
26812302 llvm_unreachable("Unknown reason!");
26822303 }
26832304
2684 void GenericScheduler::traceCandidate(const SchedCandidate &Cand) {
2305 void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) {
26852306 PressureChange P;
26862307 unsigned ResIdx = 0;
26872308 unsigned Latency = 0;
27342355 }
27352356 #endif
27362357
2358 /// Return true if this heuristic determines order.
2359 static bool tryLess(int TryVal, int CandVal,
2360 GenericSchedulerBase::SchedCandidate &TryCand,
2361 GenericSchedulerBase::SchedCandidate &Cand,
2362 GenericSchedulerBase::CandReason Reason) {
2363 if (TryVal < CandVal) {
2364 TryCand.Reason = Reason;
2365 return true;
2366 }
2367 if (TryVal > CandVal) {
2368 if (Cand.Reason > Reason)
2369 Cand.Reason = Reason;
2370 return true;
2371 }
2372 Cand.setRepeat(Reason);
2373 return false;
2374 }
2375
2376 static bool tryGreater(int TryVal, int CandVal,
2377 GenericSchedulerBase::SchedCandidate &TryCand,
2378 GenericSchedulerBase::SchedCandidate &Cand,
2379 GenericSchedulerBase::CandReason Reason) {
2380 if (TryVal > CandVal) {
2381 TryCand.Reason = Reason;
2382 return true;
2383 }
2384 if (TryVal < CandVal) {
2385 if (Cand.Reason > Reason)
2386 Cand.Reason = Reason;
2387 return true;
2388 }
2389 Cand.setRepeat(Reason);
2390 return false;
2391 }
2392
2393 static bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
2394 GenericSchedulerBase::SchedCandidate &Cand,
2395 SchedBoundary &Zone) {
2396 if (Zone.isTop()) {
2397 if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
2398 if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
2399 TryCand, Cand, GenericSchedulerBase::TopDepthReduce))
2400 return true;
2401 }
2402 if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
2403 TryCand, Cand, GenericSchedulerBase::TopPathReduce))
2404 return true;
2405 }
2406 else {
2407 if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
2408 if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
2409 TryCand, Cand, GenericSchedulerBase::BotHeightReduce))
2410 return true;
2411 }
2412 if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
2413 TryCand, Cand, GenericSchedulerBase::BotPathReduce))
2414 return true;
2415 }
2416 return false;
2417 }
2418
2419 static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand,
2420 bool IsTop) {
2421 DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ")
2422 << GenericSchedulerBase::getReasonStr(Cand.Reason) << '\n');
2423 }
2424
2425 namespace {
2426 /// GenericScheduler shrinks the unscheduled zone using heuristics to balance
2427 /// the schedule.
2428 class GenericScheduler : public GenericSchedulerBase {
2429 ScheduleDAGMILive *DAG;
2430
2431 // State of the top and bottom scheduled instruction boundaries.
2432 SchedBoundary Top;
2433 SchedBoundary Bot;
2434
2435 MachineSchedPolicy RegionPolicy;
2436 public:
2437 GenericScheduler(const MachineSchedContext *C):
2438 GenericSchedulerBase(C), DAG(0), Top(SchedBoundary::TopQID, "TopQ"),
2439 Bot(SchedBoundary::BotQID, "BotQ") {}
2440
2441 virtual void initPolicy(MachineBasicBlock::iterator Begin,
2442 MachineBasicBlock::iterator End,
2443 unsigned NumRegionInstrs) LLVM_OVERRIDE;
2444
2445 virtual bool shouldTrackPressure() const LLVM_OVERRIDE {
2446 return RegionPolicy.ShouldTrackPressure;
2447 }
2448
2449 virtual void initialize(ScheduleDAGMI *dag) LLVM_OVERRIDE;
2450
2451 virtual SUnit *pickNode(bool &IsTopNode) LLVM_OVERRIDE;
2452
2453 virtual void schedNode(SUnit *SU, bool IsTopNode) LLVM_OVERRIDE;
2454
2455 virtual void releaseTopNode(SUnit *SU) LLVM_OVERRIDE {
2456 Top.releaseTopNode(SU);
2457 }
2458
2459 virtual void releaseBottomNode(SUnit *SU) LLVM_OVERRIDE {
2460 Bot.releaseBottomNode(SU);
2461 }
2462
2463 virtual void registerRoots() LLVM_OVERRIDE;
2464
2465 protected:
2466 void checkAcyclicLatency();
2467
2468 void tryCandidate(SchedCandidate &Cand,
2469 SchedCandidate &TryCand,
2470 SchedBoundary &Zone,
2471 const RegPressureTracker &RPTracker,
2472 RegPressureTracker &TempTracker);
2473
2474 SUnit *pickNodeBidirectional(bool &IsTopNode);
2475
2476 void pickNodeFromQueue(SchedBoundary &Zone,
2477 const RegPressureTracker &RPTracker,
2478 SchedCandidate &Candidate);
2479
2480 void reschedulePhysRegCopies(SUnit *SU, bool isTop);
2481 };
2482 } // namespace
2483
2484 void GenericScheduler::initialize(ScheduleDAGMI *dag) {
2485 assert(dag->hasVRegLiveness() &&
2486 "(PreRA)GenericScheduler needs vreg liveness");
2487 DAG = static_cast<ScheduleDAGMILive*>(dag);
2488 SchedModel = DAG->getSchedModel();
2489 TRI = DAG->TRI;
2490
2491 Rem.init(DAG, SchedModel);
2492 Top.init(DAG, SchedModel, &Rem);
2493 Bot.init(DAG, SchedModel, &Rem);
2494
2495 // Initialize resource counts.
2496
2497 // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
2498 // are disabled, then these HazardRecs will be disabled.
2499 const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
2500 const TargetMachine &TM = DAG->MF.getTarget();
2501 if (!Top.HazardRec) {
2502 Top.HazardRec =
2503 TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
2504 }
2505 if (!Bot.HazardRec) {
2506 Bot.HazardRec =
2507 TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
2508 }
2509 }
2510
2511 /// Initialize the per-region scheduling policy.
2512 void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
2513 MachineBasicBlock::iterator End,
2514 unsigned NumRegionInstrs) {
2515 const TargetMachine &TM = Context->MF->getTarget();
2516
2517 // Avoid setting up the register pressure tracker for small regions to save
2518 // compile time. As a rough heuristic, only track pressure when the number of
2519 // schedulable instructions exceeds half the integer register file.
2520 unsigned NIntRegs = Context->RegClassInfo->getNumAllocatableRegs(
2521 TM.getTargetLowering()->getRegClassFor(MVT::i32));
2522
2523 RegionPolicy.ShouldTrackPressure = NumRegionInstrs > (NIntRegs / 2);
2524
2525 // For generic targets, we default to bottom-up, because it's simpler and because
2526 // more compile-time optimizations have been implemented in that direction.
2527 RegionPolicy.OnlyBottomUp = true;
2528
2529 // Allow the subtarget to override default policy.
2530 const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
2531 ST.overrideSchedPolicy(RegionPolicy, Begin, End, NumRegionInstrs);
2532
2533 // After subtarget overrides, apply command line options.
2534 if (!EnableRegPressure)
2535 RegionPolicy.ShouldTrackPressure = false;
2536
2537 // Check -misched-topdown/bottomup can force or unforce scheduling direction.
2538 // e.g. -misched-bottomup=false allows scheduling in both directions.
2539 assert((!ForceTopDown || !ForceBottomUp) &&
2540 "-misched-topdown incompatible with -misched-bottomup");
2541 if (ForceBottomUp.getNumOccurrences() > 0) {
2542 RegionPolicy.OnlyBottomUp = ForceBottomUp;
2543 if (RegionPolicy.OnlyBottomUp)
2544 RegionPolicy.OnlyTopDown = false;
2545 }
2546 if (ForceTopDown.getNumOccurrences() > 0) {
2547 RegionPolicy.OnlyTopDown = ForceTopDown;
2548 if (RegionPolicy.OnlyTopDown)
2549 RegionPolicy.OnlyBottomUp = false;
2550 }
2551 }
2552
2553 /// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
2554 /// critical path by more cycles than it takes to drain the instruction buffer.
2555 /// We estimate an upper bound on in-flight instructions as:
2556 ///
2557 /// CyclesPerIteration = max( CyclicPath, Loop-Resource-Height )
2558 /// InFlightIterations = AcyclicPath / CyclesPerIteration
2559 /// InFlightResources = InFlightIterations * LoopResources
2560 ///
2561 /// TODO: Check execution resources in addition to IssueCount.
2562 void GenericScheduler::checkAcyclicLatency() {
2563 if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath)
2564 return;
2565
2566 // Scaled number of cycles per loop iteration.
2567 unsigned IterCount =
2568 std::max(Rem.CyclicCritPath * SchedModel->getLatencyFactor(),
2569 Rem.RemIssueCount);
2570 // Scaled acyclic critical path.
2571 unsigned AcyclicCount = Rem.CriticalPath * SchedModel->getLatencyFactor();
2572 // InFlightCount = (AcyclicPath / IterCycles) * InstrPerLoop
2573 unsigned InFlightCount =
2574 (AcyclicCount * Rem.RemIssueCount + IterCount-1) / IterCount;
2575 unsigned BufferLimit =
2576 SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor();
2577
2578 Rem.IsAcyclicLatencyLimited = InFlightCount > BufferLimit;
2579
2580 DEBUG(dbgs() << "IssueCycles="
2581 << Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c "
2582 << "IterCycles=" << IterCount / SchedModel->getLatencyFactor()
2583 << "c NumIters=" << (AcyclicCount + IterCount-1) / IterCount
2584 << " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor()
2585 << "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n";
2586 if (Rem.IsAcyclicLatencyLimited)
2587 dbgs() << " ACYCLIC LATENCY LIMIT\n");
2588 }
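To make the in-flight estimate above concrete, here is a small standalone worked example (illustrative numbers, not taken from any real machine model) that mirrors the arithmetic in checkAcyclicLatency():

#include <algorithm>
#include <cassert>

int main() {
  // Scaling factors of 1 keep the units readable.
  unsigned LatencyFactor = 1, MicroOpFactor = 1;
  unsigned CyclicCritPath = 6;     // cycles per loop iteration (cyclic path)
  unsigned CriticalPath = 30;      // acyclic critical path, in cycles
  unsigned RemIssueCount = 10;     // scaled micro-op count of the loop body
  unsigned MicroOpBufferSize = 40; // in-flight micro-op capacity

  unsigned IterCount = std::max(CyclicCritPath * LatencyFactor, RemIssueCount);
  unsigned AcyclicCount = CriticalPath * LatencyFactor;
  unsigned InFlightCount =
      (AcyclicCount * RemIssueCount + IterCount - 1) / IterCount;
  unsigned BufferLimit = MicroOpBufferSize * MicroOpFactor;

  // (30 * 10 + 9) / 10 == 30 in-flight micro-ops vs. a 40-entry buffer:
  // the buffer absorbs the acyclic path, so the flag stays false. With
  // CriticalPath = 50 the estimate becomes 50 > 40 and the flag is set.
  assert(InFlightCount == 30 && BufferLimit == 40);
  bool IsAcyclicLatencyLimited = InFlightCount > BufferLimit;
  assert(!IsAcyclicLatencyLimited);
  return 0;
}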
2589
2590 void GenericScheduler::registerRoots() {
2591 Rem.CriticalPath = DAG->ExitSU.getDepth();
2592
2593 // Some roots may not feed into ExitSU. Check all of them in case.
2594 for (std::vector<SUnit*>::const_iterator
2595 I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) {
2596 if ((*I)->getDepth() > Rem.CriticalPath)
2597 Rem.CriticalPath = (*I)->getDepth();
2598 }
2599 DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n');
2600
2601 if (EnableCyclicPath) {
2602 Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
2603 checkAcyclicLatency();
2604 }
2605 }
2606
2607 static bool tryPressure(const PressureChange &TryP,
2608 const PressureChange &CandP,
2609 GenericSchedulerBase::SchedCandidate &TryCand,
2610 GenericSchedulerBase::SchedCandidate &Cand,
2611 GenericSchedulerBase::CandReason Reason) {
2612 int TryRank = TryP.getPSetOrMax();
2613 int CandRank = CandP.getPSetOrMax();
2614 // If both candidates affect the same set, go with the smallest increase.
2615 if (TryRank == CandRank) {
2616 return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand,
2617 Reason);
2618 }
2619 // If one candidate decreases and the other increases, go with it.
2620 // Invalid candidates have UnitInc==0.
2621 if (tryLess(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand,
2622 Reason)) {
2623 return true;
2624 }
2625 // If the candidates are decreasing pressure, reverse priority.
2626 if (TryP.getUnitInc() < 0)
2627 std::swap(TryRank, CandRank);
2628 return tryGreater(TryRank, CandRank, TryCand, Cand, Reason);
2629 }
2630
2631 static unsigned getWeakLeft(const SUnit *SU, bool isTop) {
2632 return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft;
2633 }
2634
2635 /// Minimize physical register live ranges. Regalloc wants them adjacent to
2636 /// their physreg def/use.
2637 ///
2638 /// FIXME: This is an unnecessary check on the critical path. Most are root/leaf
2639 /// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled
2640 /// with the operation that produces or consumes the physreg. We'll do this when
2641 /// regalloc has support for parallel copies.
2642 static int biasPhysRegCopy(const SUnit *SU, bool isTop) {
2643 const MachineInstr *MI = SU->getInstr();
2644 if (!MI->isCopy())
2645 return 0;
2646
2647 unsigned ScheduledOper = isTop ? 1 : 0;
2648 unsigned UnscheduledOper = isTop ? 0 : 1;
2649 // If we have already scheduled the physreg producer/consumer, immediately
2650 // schedule the copy.
2651 if (TargetRegisterInfo::isPhysicalRegister(
2652 MI->getOperand(ScheduledOper).getReg()))
2653 return 1;
2654 // If the physreg is at the boundary, defer it. Otherwise schedule it
2655 // immediately to free the dependent. We can hoist the copy later.
2656 bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft;
2657 if (TargetRegisterInfo::isPhysicalRegister(
2658 MI->getOperand(UnscheduledOper).getReg()))
2659 return AtBoundary ? -1 : 1;
2660 return 0;
2661 }
2662
2663 /// Apply a set of heuristics to a new candidate. Heuristics are currently
2664 /// hierarchical. This may be more efficient than a graduated cost model because
2665 /// we don't need to evaluate all aspects of the model for each node in the
2666 /// queue. But it's really done to make the heuristics easier to debug and
2667 /// statistically analyze.
2668 ///
2669 /// \param Cand provides the policy and current best candidate.
2670 /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
2671 /// \param Zone describes the scheduled zone that we are extending.
2672 /// \param RPTracker describes reg pressure within the scheduled zone.
2673 /// \param TempTracker is a scratch pressure tracker to reuse in queries.
2674 void GenericScheduler::tryCandidate(SchedCandidate &Cand,
2675 SchedCandidate &TryCand,
2676 SchedBoundary &Zone,
2677 const RegPressureTracker &RPTracker,
2678 RegPressureTracker &TempTracker) {
2679
2680 if (DAG->isTrackingPressure()) {
2681 // Always initialize TryCand's RPDelta.
2682 if (Zone.isTop()) {
2683 TempTracker.getMaxDownwardPressureDelta(
2684 TryCand.SU->getInstr(),
2685 TryCand.RPDelta,
2686 DAG->getRegionCriticalPSets(),
2687 DAG->getRegPressure().MaxSetPressure);
2688 }
2689 else {
2690 if (VerifyScheduling) {
2691 TempTracker.getMaxUpwardPressureDelta(
2692 TryCand.SU->getInstr(),
2693 &DAG->getPressureDiff(TryCand.SU),
2694 TryCand.RPDelta,
2695 DAG->getRegionCriticalPSets(),
2696 DAG->getRegPressure().MaxSetPressure);
2697 }
2698 else {
2699 RPTracker.getUpwardPressureDelta(
2700 TryCand.SU->getInstr(),
2701 DAG->getPressureDiff(TryCand.SU),
2702 TryCand.RPDelta,
2703 DAG->getRegionCriticalPSets(),
2704 DAG->getRegPressure().MaxSetPressure);
2705 }
2706 }
2707 }
2708 DEBUG(if (TryCand.RPDelta.Excess.isValid())
2709 dbgs() << " SU(" << TryCand.SU->NodeNum << ") "
2710 << TRI->getRegPressureSetName(TryCand.RPDelta.Excess.getPSet())
2711 << ":" << TryCand.RPDelta.Excess.getUnitInc() << "\n");
2712
2713 // Initialize the candidate if needed.
2714 if (!Cand.isValid()) {
2715 TryCand.Reason = NodeOrder;
2716 return;
2717 }
2718
2719 if (tryGreater(biasPhysRegCopy(TryCand.SU, Zone.isTop()),
2720 biasPhysRegCopy(Cand.SU, Zone.isTop()),
2721 TryCand, Cand, PhysRegCopy))
2722 return;
2723
2724 // Avoid exceeding the target's limit. If signed PSetID is negative, it is
2725 // invalid; convert it to INT_MAX to give it lowest priority.
2726 if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess,
2727 Cand.RPDelta.Excess,
2728 TryCand, Cand, RegExcess))
2729 return;
2730
2731 // Avoid increasing the max critical pressure in the scheduled region.
2732 if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax,
2733 Cand.RPDelta.CriticalMax,
2734 TryCand, Cand, RegCritical))
2735 return;
2736
2737 // For loops that are acyclic path limited, aggressively schedule for latency.
2738 // This can result in very long dependence chains scheduled in sequence, so
2739 // once every cycle (when CurrMOps == 0), switch to normal heuristics.
2740 if (Rem.IsAcyclicLatencyLimited && !Zone.getCurrMOps()
2741 && tryLatency(TryCand, Cand, Zone))
2742 return;
2743
2744 // Prioritize instructions that read unbuffered resources by stall cycles.
2745 if (tryLess(Zone.getLatencyStallCycles(TryCand.SU),
2746 Zone.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
2747 return;
2748
2749 // Keep clustered nodes together to encourage downstream peephole
2750 // optimizations which may reduce resource requirements.
2751 //
2752 // This is a best effort to set things up for a post-RA pass. Optimizations
2753 // like generating loads of multiple registers should ideally be done within
2754 // the scheduler pass by combining the loads during DAG postprocessing.
2755 const SUnit *NextClusterSU =
2756 Zone.isTop() ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
2757 if (tryGreater(TryCand.SU == NextClusterSU, Cand.SU == NextClusterSU,
2758 TryCand, Cand, Cluster))
2759 return;
2760
2761 // Weak edges are for clustering and other constraints.
2762 if (tryLess(getWeakLeft(TryCand.SU, Zone.isTop()),
2763 getWeakLeft(Cand.SU, Zone.isTop()),
2764 TryCand, Cand, Weak)) {
2765 return;
2766 }
2767 // Avoid increasing the max pressure of the entire region.
2768 if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax,
2769 Cand.RPDelta.CurrentMax,
2770 TryCand, Cand, RegMax))
2771 return;
2772
2773 // Avoid critical resource consumption and balance the schedule.
2774 TryCand.initResourceDelta(DAG, SchedModel);
2775 if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
2776 TryCand, Cand, ResourceReduce))
2777 return;
2778 if (tryGreater(TryCand.ResDelta.DemandedResources,
2779 Cand.ResDelta.DemandedResources,
2780 TryCand, Cand, ResourceDemand))
2781 return;
2782
2783 // Avoid serializing long latency dependence chains.
2784 // For acyclic path limited loops, latency was already checked above.
2785 if (Cand.Policy.ReduceLatency && !Rem.IsAcyclicLatencyLimited
2786 && tryLatency(TryCand, Cand, Zone)) {
2787 return;
2788 }
2789
2790 // Prefer immediate defs/users of the last scheduled instruction. This is a
2791 // local pressure avoidance strategy that also makes the machine code
2792 // readable.
2793 if (tryGreater(Zone.isNextSU(TryCand.SU), Zone.isNextSU(Cand.SU),
2794 TryCand, Cand, NextDefUse))
2795 return;
2796
2797 // Fall through to original instruction order.
2798 if ((Zone.isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum)
2799 || (!Zone.isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
2800 TryCand.Reason = NodeOrder;
2801 }
2802 }
2803
27372804 /// Pick the best candidate from the queue.
27382805 ///
27392806 /// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
27642831 }
27652832 }
27662833
2767 static void tracePick(const GenericScheduler::SchedCandidate &Cand,
2768 bool IsTop) {
2769 DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ")
2770 << GenericScheduler::getReasonStr(Cand.Reason) << '\n');
2771 }
2772
27732834 /// Pick the best candidate node from either the top or bottom queue.
27742835 SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {
27752836 // Schedule as far as possible in the direction of no choice. This is most
27892850 SchedCandidate TopCand(NoPolicy);
27902851 // Set the bottom-up policy based on the state of the current bottom zone and
27912852 // the instructions outside the zone, including the top zone.
2792 setPolicy(BotCand.Policy, Bot, Top);
2853 setPolicy(BotCand.Policy, /*IsPostRA=*/false, Bot, &Top);
27932854 // Set the top-down policy based on the state of the current top zone and
27942855 // the instructions outside the zone, including the bottom zone.
2795 setPolicy(TopCand.Policy, Top, Bot);
2856 setPolicy(TopCand.Policy, /*IsPostRA=*/false, Top, &Bot);
27962857
27972858 // Prefer bottom scheduling when heuristics are silent.
27982859 pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
29022963 }
29032964
29042965 /// Update the scheduler's state after scheduling a node. This is the same node
2905 /// that was just returned by pickNode(). However, ScheduleDAGMILive needs to update
2906 /// it's state based on the current cycle before MachineSchedStrategy does.
2966 /// that was just returned by pickNode(). However, ScheduleDAGMILive needs to
2967 /// update its state based on the current cycle before MachineSchedStrategy
2968 /// does.
29072969 ///
29082970 /// FIXME: Eventually, we may bundle physreg copies rather than rescheduling
29092971 /// them here. See comments in biasPhysRegCopy.
29222984 }
29232985 }
29242986
2925 /// Create a generic scheduler with no DAG mutation passes.
2926 static ScheduleDAGInstrs *createRawGenericSched(MachineSchedContext *C) {
2927 return new ScheduleDAGMILive(C, new GenericScheduler(C));
2928 }
2929
29302987 /// Create the standard converging machine scheduler. This will be used as the
29312988 /// default scheduler if the target does not set a default.
2932 static ScheduleDAGInstrs *createGenericSched(MachineSchedContext *C) {
2933 ScheduleDAGMILive *DAG =
2934 static_cast<ScheduleDAGMILive*>(createRawGenericSched(C));
2989 static ScheduleDAGInstrs *createGenericSchedLive(MachineSchedContext *C) {
2990 ScheduleDAGMILive *DAG = new ScheduleDAGMILive(C, new GenericScheduler(C));
29352991 // Register DAG post-processors.
29362992 //
29372993 // FIXME: extend the mutation API to allow earlier mutations to instantiate
29443000 DAG->addMutation(new MacroFusion(DAG->TII));
29453001 return DAG;
29463002 }
3003
29473004 static MachineSchedRegistry
29483005 GenericSchedRegistry("converge", "Standard converging scheduler.",
2949 createGenericSched);
3006 createGenericSchedLive);
3007
3008 //===----------------------------------------------------------------------===//
3009 // PostGenericScheduler - Generic PostRA implementation of MachineSchedStrategy.
3010 //===----------------------------------------------------------------------===//
3011
3012 namespace {
3013 /// PostGenericScheduler - Interface to the scheduling algorithm used by
3014 /// ScheduleDAGMI.
3015 ///
3016 /// Callbacks from ScheduleDAGMI:
3017 /// initPolicy -> initialize(DAG) -> registerRoots -> pickNode ...
3018 class PostGenericScheduler : public GenericSchedulerBase {
3019 ScheduleDAGMI *DAG;
3020 SchedBoundary Top;
3021 SmallVector<SUnit*, 8> BotRoots;
3022 public:
3023 PostGenericScheduler(const MachineSchedContext *C):
3024 GenericSchedulerBase(C), Top(SchedBoundary::TopQID, "TopQ") {}
3025
3026 virtual ~PostGenericScheduler() {}
3027
3028 virtual void initPolicy(MachineBasicBlock::iterator Begin,
3029 MachineBasicBlock::iterator End,
3030 unsigned NumRegionInstrs) LLVM_OVERRIDE {
3031 /* no configurable policy */
3032 };
3033
3034 /// PostRA scheduling does not track pressure.
3035 virtual bool shouldTrackPressure() const LLVM_OVERRIDE { return false; }
3036
3037 virtual void initialize(ScheduleDAGMI *Dag) LLVM_OVERRIDE {
3038 DAG = Dag;
3039 SchedModel = DAG->getSchedModel();
3040 TRI = DAG->TRI;
3041
3042 Rem.init(DAG, SchedModel);
3043 Top.init(DAG, SchedModel, &Rem);
3044 BotRoots.clear();
3045
3046 // Initialize the HazardRecognizers. If itineraries don't exist, are empty,
3047 // or are disabled, then these HazardRecs will be disabled.
3048 const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
3049 const TargetMachine &TM = DAG->MF.getTarget();
3050 if (!Top.HazardRec) {
3051 Top.HazardRec =
3052 TM.getInstrInfo()->CreateTargetMIHazardRecognizer(Itin, DAG);
3053 }
3054 }
3055
3056 virtual void registerRoots() LLVM_OVERRIDE;
3057
3058 virtual SUnit *pickNode(bool &IsTopNode) LLVM_OVERRIDE;
3059
3060 virtual void scheduleTree(unsigned SubtreeID) LLVM_OVERRIDE {
3061 llvm_unreachable("PostRA scheduler does not support subtree analysis.");
3062 }
3063
3064 virtual void schedNode(SUnit *SU, bool IsTopNode) LLVM_OVERRIDE;
3065
3066 virtual void releaseTopNode(SUnit *SU) LLVM_OVERRIDE {
3067 Top.releaseTopNode(SU);
3068 }
3069
3070 // Only called for roots.
3071 virtual void releaseBottomNode(SUnit *SU) LLVM_OVERRIDE {
3072 BotRoots.push_back(SU);
3073 }
3074
3075 protected:
3076 void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand);
3077
3078 void pickNodeFromQueue(SchedCandidate &Cand);
3079 };
3080 } // namespace
3081
3082 void PostGenericScheduler::registerRoots() {
3083 Rem.CriticalPath = DAG->ExitSU.getDepth();
3084
3085 // Some roots may not feed into ExitSU. Check all of them in case.
3086 for (SmallVectorImpl<SUnit*>::const_iterator
3087 I = BotRoots.begin(), E = BotRoots.end(); I != E; ++I) {
3088 if ((*I)->getDepth() > Rem.CriticalPath)
3089 Rem.CriticalPath = (*I)->getDepth();
3090 }
3091 DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n');
3092 }
3093
3094 /// Apply a set of heuristics to a new candidate for PostRA scheduling.
3095 ///
3096 /// \param Cand provides the policy and current best candidate.
3097 /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
3098 void PostGenericScheduler::tryCandidate(SchedCandidate &Cand,
3099 SchedCandidate &TryCand) {
3100
3101 // Initialize the candidate if needed.
3102 if (!Cand.isValid()) {
3103 TryCand.Reason = NodeOrder;
3104 return;
3105 }
3106
3107 // Prioritize instructions that read unbuffered resources by stall cycles.
3108 if (tryLess(Top.getLatencyStallCycles(TryCand.SU),
3109 Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
3110 return;
3111
3112 // Avoid critical resource consumption and balance the schedule.
3113 if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
3114 TryCand, Cand, ResourceReduce))
3115 return;
3116 if (tryGreater(TryCand.ResDelta.DemandedResources,
3117 Cand.ResDelta.DemandedResources,
3118 TryCand, Cand, ResourceDemand))
3119 return;
3120
3121 // Avoid serializing long latency dependence chains.
3122 if (Cand.Policy.ReduceLatency && tryLatency(TryCand, Cand, Top)) {
3123 return;
3124 }
3125
3126 // Fall through to original instruction order.
3127 if (TryCand.SU->NodeNum < Cand.SU->NodeNum)
3128 TryCand.Reason = NodeOrder;
3129 }
3130
3131 void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) {
3132 ReadyQueue &Q = Top.Available;
3133
3134 DEBUG(Q.dump());
3135
3136 for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
3137 SchedCandidate TryCand(Cand.Policy);
3138 TryCand.SU = *I;
3139 TryCand.initResourceDelta(DAG, SchedModel);
3140 tryCandidate(Cand, TryCand);
3141 if (TryCand.Reason != NoCand) {
3142 Cand.setBest(TryCand);
3143 DEBUG(traceCandidate(Cand));
3144 }
3145 }
3146 }
3147
3148 /// Pick the next node to schedule.
3149 SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) {
3150 if (DAG->top() == DAG->bottom()) {
3151 assert(Top.Available.empty() && Top.Pending.empty() && "ReadyQ garbage");
3152 return NULL;
3153 }
3154 SUnit *SU;
3155 do {
3156 SU = Top.pickOnlyChoice();
3157 if (!SU) {
3158 CandPolicy NoPolicy;
3159 SchedCandidate TopCand(NoPolicy);
3160 // Set the top-down policy based on the state of the current top zone and
3161 // the instructions outside the zone, including the bottom zone.
3162 setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, NULL);
3163 pickNodeFromQueue(TopCand);
3164 assert(TopCand.Reason != NoCand && "failed to find a candidate");
3165 tracePick(TopCand, true);
3166 SU = TopCand.SU;
3167 }
3168 } while (SU->isScheduled);
3169
3170 IsTopNode = true;
3171 Top.removeReady(SU);
3172
3173 DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
3174 return SU;
3175 }
3176
3177 /// Called after ScheduleDAGMI has scheduled an instruction and updated
3178 /// scheduled/remaining flags in the DAG nodes.
3179 void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
3180 SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
3181 Top.bumpNode(SU);
3182 }
3183
3184 /// Create a generic scheduler with no vreg liveness or DAG mutation passes.
3185 static ScheduleDAGInstrs *createGenericSchedPostRA(MachineSchedContext *C) {
3186 return new ScheduleDAGMI(C, new PostGenericScheduler(C), /*IsPostRA=*/true);
3187 }
29503188
29513189 //===----------------------------------------------------------------------===//
29523190 // ILP Scheduler. Currently for experimental analysis of heuristics.