llvm.org GIT mirror llvm / 3b87f62
misched: Heuristics based on the machine model.

misched is disabled by default. With -enable-misched, these heuristics balance the schedule to simultaneously avoid saturating processor resources, expose ILP, and minimize register pressure. I've been analyzing the performance of these heuristics on everything in the llvm test suite in addition to a few other benchmarks.

I would like each heuristic check to be verified by a unit test, but I'm still trying to figure out the best way to do that. The heuristics are still in considerable flux, but as they are refined we should be rigorous about unit testing the improvements.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167527 91177308-0d34-0410-b5e6-96231b3b80d8
Andrew Trick, 7 years ago
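For reference, a hypothetical invocation that exercises the new code paths, modeled on the RUN line of the test added in this commit (input.ll and the explicit -ilp-window value are illustrative; 10 is the default):

llc < input.ll -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched -ilp-window=10 -verify-machineinstrs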
3 changed file(s) with 1008 addition(s) and 155 deletion(s).
153153
154154 bool empty() const { return Queue.empty(); }
155155
156 void clear() { Queue.clear(); }
157
156158 unsigned size() const { return Queue.size(); }
157159
158160 typedef std::vector<SUnit*>::iterator iterator;
170172 SU->NodeQueueId |= ID;
171173 }
172174
173 void remove(iterator I) {
175 iterator remove(iterator I) {
174176 (*I)->NodeQueueId &= ~ID;
175177 *I = Queue.back();
178 unsigned idx = I - Queue.begin();
176179 Queue.pop_back();
180 return Queue.begin() + idx;
177181 }
178182
179183 #ifndef NDEBUG
305309 /// Reinsert debug_values recorded in ScheduleDAGInstrs::DbgValues.
306310 void placeDebugValues();
307311
312 /// \brief dump the scheduled Sequence.
313 void dumpSchedule() const;
314
308315 // Lesser helpers...
309316
310317 void initRegPressure();
4747 #else
4848 static bool ViewMISchedDAGs = false;
4949 #endif // NDEBUG
50
51 // Threshold to very roughly model an out-of-order processor's instruction
52 // buffers. If the actual value of this threshold matters much in practice, then
53 // it can be specified by the machine model. For now, it's an experimental
54 // tuning knob to determine when and if it matters.
55 static cl::opt<unsigned> ILPWindow("ilp-window", cl::Hidden,
56 cl::desc("Allow expected latency to exceed the critical path by N cycles "
57 "before attempting to balance ILP"),
58 cl::init(10U));
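
A minimal sketch of how this window is intended to be applied; it mirrors the check added to releaseNode later in this patch, but the standalone helper, its name, and the default argument are illustrative rather than part of the change:

// Returns true once a node's remaining latency would stretch the zone's
// expected latency more than Window cycles beyond the region's critical path.
static bool latencyExceedsWindow(unsigned ExpectedLatency,
                                 unsigned RemainingLatency,
                                 unsigned CriticalPath,
                                 unsigned Window = 10) {
  // CriticalPath stays zero until registerRoots() computes it.
  return CriticalPath &&
         ExpectedLatency + RemainingLatency > CriticalPath + Window;
}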
5059
5160 //===----------------------------------------------------------------------===//
5261 // Machine Instruction Scheduling Pass and Registry
486495 assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
487496
488497 placeDebugValues();
498
499 DEBUG({
500 unsigned BBNum = top()->getParent()->getNumber();
501 dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
502 dumpSchedule();
503 dbgs() << '\n';
504 });
489505 }
490506
491507 /// Build the DAG and setup three register pressure trackers.
626642 FirstDbgValue = NULL;
627643 }
628644
645 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
646 void ScheduleDAGMI::dumpSchedule() const {
647 for (MachineBasicBlock::iterator MI = begin(), ME = end(); MI != ME; ++MI) {
648 if (SUnit *SU = getSUnit(&(*MI)))
649 SU->dump(this);
650 else
651 dbgs() << "Missing SUnit\n";
652 }
653 }
654 #endif
655
629656 //===----------------------------------------------------------------------===//
630657 // ConvergingScheduler - Implementation of the standard MachineSchedStrategy.
631658 //===----------------------------------------------------------------------===//
634661 /// ConvergingScheduler shrinks the unscheduled zone using heuristics to balance
635662 /// the schedule.
636663 class ConvergingScheduler : public MachineSchedStrategy {
664 public:
665 /// Represent the type of SchedCandidate found within a single queue.
666 /// pickNodeBidirectional depends on these listed by decreasing priority.
667 enum CandReason {
668 NoCand, SingleExcess, SingleCritical, ResourceReduce, ResourceDemand,
669 BotHeightReduce, BotPathReduce, TopDepthReduce, TopPathReduce,
670 SingleMax, MultiPressure, NextDefUse, NodeOrder};
671
672 #ifndef NDEBUG
673 static const char *getReasonStr(ConvergingScheduler::CandReason Reason);
674 #endif
675
676 /// Policy for scheduling the next instruction in the candidate's zone.
677 struct CandPolicy {
678 bool ReduceLatency;
679 unsigned ReduceResIdx;
680 unsigned DemandResIdx;
681
682 CandPolicy(): ReduceLatency(false), ReduceResIdx(0), DemandResIdx(0) {}
683 };
684
685 /// Status of an instruction's critical resource consumption.
686 struct SchedResourceDelta {
687 // Count critical resources in the scheduled region required by SU.
688 unsigned CritResources;
689
690 // Count critical resources from another region consumed by SU.
691 unsigned DemandedResources;
692
693 SchedResourceDelta(): CritResources(0), DemandedResources(0) {}
694
695 bool operator==(const SchedResourceDelta &RHS) const {
696 return CritResources == RHS.CritResources
697 && DemandedResources == RHS.DemandedResources;
698 }
699 bool operator!=(const SchedResourceDelta &RHS) const {
700 return !operator==(RHS);
701 }
702 };
637703
638704 /// Store the state used by ConvergingScheduler heuristics, required for the
639705 /// lifetime of one invocation of pickNode().
640706 struct SchedCandidate {
707 CandPolicy Policy;
708
641709 // The best SUnit candidate.
642710 SUnit *SU;
643711
712 // The reason for this candidate.
713 CandReason Reason;
714
644715 // Register pressure values for the best candidate.
645716 RegPressureDelta RPDelta;
646717
647 SchedCandidate(): SU(NULL) {}
718 // Critical resource consumption of the best candidate.
719 SchedResourceDelta ResDelta;
720
721 SchedCandidate(const CandPolicy &policy)
722 : Policy(policy), SU(NULL), Reason(NoCand) {}
723
724 bool isValid() const { return SU; }
725
726 // Copy the status of another candidate without changing policy.
727 void setBest(SchedCandidate &Best) {
728 assert(Best.Reason != NoCand && "uninitialized Sched candidate");
729 SU = Best.SU;
730 Reason = Best.Reason;
731 RPDelta = Best.RPDelta;
732 ResDelta = Best.ResDelta;
733 }
734
735 void initResourceDelta(const ScheduleDAGMI *DAG,
736 const TargetSchedModel *SchedModel);
648737 };
649 /// Represent the type of SchedCandidate found within a single queue.
650 enum CandResult {
651 NoCand, NodeOrder, SingleExcess, SingleCritical, SingleMax, MultiPressure };
738
739 /// Summarize the unscheduled region.
740 struct SchedRemainder {
741 // Critical path through the DAG in expected latency.
742 unsigned CriticalPath;
743
744 // Unscheduled resources
745 SmallVector<unsigned, 16> RemainingCounts;
746 // Critical resource for the unscheduled zone.
747 unsigned CritResIdx;
748 // Number of micro-ops left to schedule.
749 unsigned RemainingMicroOps;
750 // Is the unscheduled zone resource limited.
751 bool IsResourceLimited;
752
753 unsigned MaxRemainingCount;
754
755 void reset() {
756 CriticalPath = 0;
757 RemainingCounts.clear();
758 CritResIdx = 0;
759 RemainingMicroOps = 0;
760 IsResourceLimited = false;
761 MaxRemainingCount = 0;
762 }
763
764 SchedRemainder() { reset(); }
765
766 void init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel);
767 };
652768
653769 /// Each Scheduling boundary is associated with ready queues. It tracks the
654 /// current cycle in whichever direction at has moved, and maintains the state
770 /// current cycle in the direction of movement, and maintains the state
655771 /// of "hazards" and other interlocks at the current cycle.
656772 struct SchedBoundary {
657773 ScheduleDAGMI *DAG;
658774 const TargetSchedModel *SchedModel;
775 SchedRemainder *Rem;
659776
660777 ReadyQueue Available;
661778 ReadyQueue Pending;
662779 bool CheckPending;
663780
781 // For heuristics, keep a list of the nodes that immediately depend on the
782 // most recently scheduled node.
783 SmallPtrSet<const SUnit*, 8> NextSUs;
784
664785 ScheduleHazardRecognizer *HazardRec;
665786
666787 unsigned CurrCycle;
669790 /// MinReadyCycle - Cycle of the soonest available instruction.
670791 unsigned MinReadyCycle;
671792
793 // The expected latency of the critical path in this scheduled zone.
794 unsigned ExpectedLatency;
795
796 // Resources used in the scheduled zone beyond this boundary.
797 SmallVector<unsigned, 16> ResourceCounts;
798
799 // Cache the critical resources ID in this scheduled zone.
800 unsigned CritResIdx;
801
802 // Is the scheduled region resource limited vs. latency limited.
803 bool IsResourceLimited;
804
805 unsigned ExpectedCount;
806
807 // Policy flag: attempt to find ILP until expected latency is covered.
808 bool ShouldIncreaseILP;
809
810 #ifndef NDEBUG
672811 // Remember the greatest min operand latency.
673812 unsigned MaxMinLatency;
813 #endif
814
815 void reset() {
816 Available.clear();
817 Pending.clear();
818 CheckPending = false;
819 NextSUs.clear();
820 HazardRec = 0;
821 CurrCycle = 0;
822 IssueCount = 0;
823 MinReadyCycle = UINT_MAX;
824 ExpectedLatency = 0;
825 ResourceCounts.resize(1);
826 assert(!ResourceCounts[0] && "nonzero count for bad resource");
827 CritResIdx = 0;
828 IsResourceLimited = false;
829 ExpectedCount = 0;
830 ShouldIncreaseILP = false;
831 #ifndef NDEBUG
832 MaxMinLatency = 0;
833 #endif
834 // Reserve a zero-count for invalid CritResIdx.
835 ResourceCounts.resize(1);
836 }
674837
675838 /// Pending queues extend the ready queues with the same ID and the
676839 /// PendingFlag set.
677840 SchedBoundary(unsigned ID, const Twine &Name):
678 DAG(0), SchedModel(0), Available(ID, Name+".A"),
679 Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P"),
680 CheckPending(false), HazardRec(0), CurrCycle(0), IssueCount(0),
681 MinReadyCycle(UINT_MAX), MaxMinLatency(0) {}
841 DAG(0), SchedModel(0), Rem(0), Available(ID, Name+".A"),
842 Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P") {
843 reset();
844 }
682845
683846 ~SchedBoundary() { delete HazardRec; }
684847
685 void init(ScheduleDAGMI *dag, const TargetSchedModel *smodel) {
686 DAG = dag;
687 SchedModel = smodel;
688 }
848 void init(ScheduleDAGMI *dag, const TargetSchedModel *smodel,
849 SchedRemainder *rem);
689850
690851 bool isTop() const {
691852 return Available.getID() == ConvergingScheduler::TopQID;
692853 }
693854
855 unsigned getUnscheduledLatency(SUnit *SU) const {
856 if (isTop())
857 return SU->getHeight();
858 return SU->getDepth();
859 }
860
861 unsigned getCriticalCount() const {
862 return ResourceCounts[CritResIdx];
863 }
864
694865 bool checkHazard(SUnit *SU);
695866
867 void checkILPPolicy();
868
696869 void releaseNode(SUnit *SU, unsigned ReadyCycle);
697870
698871 void bumpCycle();
872
873 void countResource(unsigned PIdx, unsigned Cycles);
699874
700875 void bumpNode(SUnit *SU);
701876
706881 SUnit *pickOnlyChoice();
707882 };
708883
884 private:
709885 ScheduleDAGMI *DAG;
710886 const TargetSchedModel *SchedModel;
711887 const TargetRegisterInfo *TRI;
712888
713889 // State of the top and bottom scheduled instruction boundaries.
890 SchedRemainder Rem;
714891 SchedBoundary Top;
715892 SchedBoundary Bot;
716893
735912
736913 virtual void releaseBottomNode(SUnit *SU);
737914
915 virtual void registerRoots();
916
738917 protected:
739 SUnit *pickNodeBidrectional(bool &IsTopNode);
740
741 CandResult pickNodeFromQueue(ReadyQueue &Q,
742 const RegPressureTracker &RPTracker,
743 SchedCandidate &Candidate);
918 void balanceZones(
919 ConvergingScheduler::SchedBoundary &CriticalZone,
920 ConvergingScheduler::SchedCandidate &CriticalCand,
921 ConvergingScheduler::SchedBoundary &OppositeZone,
922 ConvergingScheduler::SchedCandidate &OppositeCand);
923
924 void checkResourceLimits(ConvergingScheduler::SchedCandidate &TopCand,
925 ConvergingScheduler::SchedCandidate &BotCand);
926
927 void tryCandidate(SchedCandidate &Cand,
928 SchedCandidate &TryCand,
929 SchedBoundary &Zone,
930 const RegPressureTracker &RPTracker,
931 RegPressureTracker &TempTracker);
932
933 SUnit *pickNodeBidirectional(bool &IsTopNode);
934
935 void pickNodeFromQueue(SchedBoundary &Zone,
936 const RegPressureTracker &RPTracker,
937 SchedCandidate &Candidate);
938
744939 #ifndef NDEBUG
745 void traceCandidate(const char *Label, const ReadyQueue &Q, SUnit *SU,
746 PressureElement P = PressureElement());
940 void traceCandidate(const SchedCandidate &Cand, const SchedBoundary &Zone);
747941 #endif
748942 };
749943 } // namespace
944
945 void ConvergingScheduler::SchedRemainder::
946 init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) {
947 reset();
948 if (!SchedModel->hasInstrSchedModel())
949 return;
950 RemainingCounts.resize(SchedModel->getNumProcResourceKinds());
951 for (std::vector<SUnit>::iterator
952 I = DAG->SUnits.begin(), E = DAG->SUnits.end(); I != E; ++I) {
953 const MCSchedClassDesc *SC = DAG->getSchedClass(&*I);
954 RemainingMicroOps += SchedModel->getNumMicroOps(I->getInstr(), SC);
955 for (TargetSchedModel::ProcResIter
956 PI = SchedModel->getWriteProcResBegin(SC),
957 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
958 unsigned PIdx = PI->ProcResourceIdx;
959 unsigned Factor = SchedModel->getResourceFactor(PIdx);
960 RemainingCounts[PIdx] += (Factor * PI->Cycles);
961 }
962 }
963 }
964
965 void ConvergingScheduler::SchedBoundary::
966 init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) {
967 reset();
968 DAG = dag;
969 SchedModel = smodel;
970 Rem = rem;
971 if (SchedModel->hasInstrSchedModel())
972 ResourceCounts.resize(SchedModel->getNumProcResourceKinds());
973 }
750974
751975 void ConvergingScheduler::initialize(ScheduleDAGMI *dag) {
752976 DAG = dag;
753977 SchedModel = DAG->getSchedModel();
754978 TRI = DAG->TRI;
755 Top.init(DAG, SchedModel);
756 Bot.init(DAG, SchedModel);
979 Rem.init(DAG, SchedModel);
980 Top.init(DAG, SchedModel, &Rem);
981 Bot.init(DAG, SchedModel, &Rem);
982
983 // Initialize resource counts.
757984
758985 // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
759986 // are disabled, then these HazardRecs will be disabled.
8001027 SU->BotReadyCycle = SuccReadyCycle + MinLatency;
8011028 }
8021029 Bot.releaseNode(SU, SU->BotReadyCycle);
1030 }
1031
1032 void ConvergingScheduler::registerRoots() {
1033 Rem.CriticalPath = DAG->ExitSU.getDepth();
1034 // Some roots may not feed into ExitSU. Check all of them in case.
1035 for (std::vector<SUnit*>::const_iterator
1036 I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) {
1037 if ((*I)->getDepth() > Rem.CriticalPath)
1038 Rem.CriticalPath = (*I)->getDepth();
1039 }
1040 DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n');
8031041 }
8041042
8051043 /// Does this SU have a hazard within the current instruction group.
8201058 return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;
8211059
8221060 unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
823 if (IssueCount + uops > SchedModel->getIssueWidth())
1061 if ((IssueCount > 0) && (IssueCount + uops > SchedModel->getIssueWidth())) {
1062 DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops="
1063 << SchedModel->getNumMicroOps(SU->getInstr()) << '\n');
8241064 return true;
825
1065 }
8261066 return false;
1067 }
1068
1069 /// If expected latency is covered, disable ILP policy.
1070 void ConvergingScheduler::SchedBoundary::checkILPPolicy() {
1071 if (ShouldIncreaseILP
1072 && (IsResourceLimited || ExpectedLatency <= CurrCycle)) {
1073 ShouldIncreaseILP = false;
1074 DEBUG(dbgs() << "Disable ILP: " << Available.getName() << '\n');
1075 }
8271076 }
8281077
8291078 void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU,
8301079 unsigned ReadyCycle) {
1080
8311081 if (ReadyCycle < MinReadyCycle)
8321082 MinReadyCycle = ReadyCycle;
8331083
8371087 Pending.push(SU);
8381088 else
8391089 Available.push(SU);
1090
1091 // Record this node as an immediate dependent of the scheduled node.
1092 NextSUs.insert(SU);
1093
1094 // If CriticalPath has been computed, then check if the unscheduled nodes
1095 // exceed the ILP window. Before registerRoots, CriticalPath==0.
1096 if (Rem->CriticalPath && (ExpectedLatency + getUnscheduledLatency(SU)
1097 > Rem->CriticalPath + ILPWindow)) {
1098 ShouldIncreaseILP = true;
1099 DEBUG(dbgs() << "Increase ILP: " << Available.getName() << " "
1100 << ExpectedLatency << " + " << getUnscheduledLatency(SU) << '\n');
1101 }
8401102 }
8411103
8421104 /// Move the boundary of scheduled code by one cycle.
8441106 unsigned Width = SchedModel->getIssueWidth();
8451107 IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width;
8461108
1109 unsigned NextCycle = CurrCycle + 1;
8471110 assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
848 unsigned NextCycle = std::max(CurrCycle + 1, MinReadyCycle);
1111 if (MinReadyCycle > NextCycle) {
1112 IssueCount = 0;
1113 NextCycle = MinReadyCycle;
1114 }
8491115
8501116 if (!HazardRec->isEnabled()) {
8511117 // Bypass HazardRec virtual calls.
8611127 }
8621128 }
8631129 CheckPending = true;
864
865 DEBUG(dbgs() << "*** " << Available.getName() << " cycle "
1130 IsResourceLimited = getCriticalCount() > std::max(ExpectedLatency, CurrCycle);
1131
1132 DEBUG(dbgs() << " *** " << Available.getName() << " cycle "
8661133 << CurrCycle << '\n');
1134 }
1135
1136 /// Add the given processor resource to this scheduled zone.
1137 void ConvergingScheduler::SchedBoundary::countResource(unsigned PIdx,
1138 unsigned Cycles) {
1139 unsigned Factor = SchedModel->getResourceFactor(PIdx);
1140 DEBUG(dbgs() << " " << SchedModel->getProcResource(PIdx)->Name
1141 << " +(" << Cycles << "x" << Factor
1142 << ") / " << SchedModel->getLatencyFactor() << '\n');
1143
1144 unsigned Count = Factor * Cycles;
1145 ResourceCounts[PIdx] += Count;
1146 assert(Rem->RemainingCounts[PIdx] >= Count && "resource double counted");
1147 Rem->RemainingCounts[PIdx] -= Count;
1148
1149 // Reset MaxRemainingCount for sanity.
1150 Rem->MaxRemainingCount = 0;
1151
1152 // Check if this resource exceeds the current critical resource by a full
1153 // cycle. If so, it becomes the critical resource.
1154 if ((int)(ResourceCounts[PIdx] - ResourceCounts[CritResIdx])
1155 >= (int)SchedModel->getLatencyFactor()) {
1156 CritResIdx = PIdx;
1157 DEBUG(dbgs() << " *** Critical resource "
1158 << SchedModel->getProcResource(PIdx)->Name << " x"
1159 << ResourceCounts[PIdx] << '\n');
1160 }
8671161 }
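
To make the normalized counting above concrete, here is a small self-contained sketch in the same spirit; the two resource kinds, the factor values, and the instruction mix are hypothetical, whereas in the real code they come from the TargetSchedModel:

// Illustrative only: scaled resource accounting in the spirit of
// countResource(). Factors put every resource on a common per-cycle scale.
#include <cassert>

int main() {
  const unsigned LatencyFactor = 2;   // hypothetical: 2 units == one cycle
  const unsigned Factor[2] = {1, 2};  // kind 0: 2 ALU pipes, kind 1: 1 LSU pipe
  unsigned ResourceCounts[2] = {0, 0};
  unsigned CritResIdx = 0;            // start by treating the ALU as critical

  // Schedule three ALU ops and three loads, each busying its pipe one cycle.
  const unsigned Ops[6][2] = {{0,1},{0,1},{0,1},{1,1},{1,1},{1,1}};
  for (const unsigned *Op : Ops) {
    unsigned PIdx = Op[0], Cycles = Op[1];
    ResourceCounts[PIdx] += Factor[PIdx] * Cycles;
    // A resource becomes critical once it leads the current critical
    // resource by at least a full cycle's worth of units.
    if ((int)(ResourceCounts[PIdx] - ResourceCounts[CritResIdx]) >=
        (int)LatencyFactor)
      CritResIdx = PIdx;
  }
  assert(CritResIdx == 1 && "the single LSU saturates before the two ALUs");
  return 0;
}

In this made-up trace the third load tips the balance: its scaled count (3 x 2 = 6) leads the ALU count (3 x 1 = 3) by more than one cycle's worth (2), so the LSU becomes the critical resource, just as countResource switches CritResIdx above.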
8681162
8691163 /// Move the boundary of scheduled code by one SUnit.
8771171 }
8781172 HazardRec->EmitInstruction(SU);
8791173 }
1174 // Update resource counts and critical resource.
1175 if (SchedModel->hasInstrSchedModel()) {
1176 const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
1177 Rem->RemainingMicroOps -= SchedModel->getNumMicroOps(SU->getInstr(), SC);
1178 for (TargetSchedModel::ProcResIter
1179 PI = SchedModel->getWriteProcResBegin(SC),
1180 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
1181 countResource(PI->ProcResourceIdx, PI->Cycles);
1182 }
1183 }
1184 if (isTop()) {
1185 if (SU->getDepth() > ExpectedLatency)
1186 ExpectedLatency = SU->getDepth();
1187 }
1188 else {
1189 if (SU->getHeight() > ExpectedLatency)
1190 ExpectedLatency = SU->getHeight();
1191 }
1192
1193 IsResourceLimited = getCriticalCount() > std::max(ExpectedLatency, CurrCycle);
1194
8801195 // Check the instruction group dispatch limit.
8811196 // TODO: Check if this SU must end a dispatch group.
8821197 IssueCount += SchedModel->getNumMicroOps(SU->getInstr());
1198
1199 // checkHazard prevents scheduling multiple instructions per cycle that exceed
1200 // issue width. However, we commonly reach the maximum. In this case
1201 // opportunistically bump the cycle to avoid uselessly checking everything in
1202 // the readyQ. Furthermore, a single instruction may produce more than one
1203 // cycle's worth of micro-ops.
8831204 if (IssueCount >= SchedModel->getIssueWidth()) {
884 DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n');
1205 DEBUG(dbgs() << " *** Max instrs at cycle " << CurrCycle << '\n');
8851206 bumpCycle();
8861207 }
8871208 }
9121233 Pending.remove(Pending.begin()+i);
9131234 --i; --e;
9141235 }
1236 DEBUG(if (!Pending.empty()) Pending.dump());
9151237 CheckPending = false;
9161238 }
9171239
9261248 }
9271249
9281250 /// If this queue only has one ready candidate, return it. As a side effect,
929 /// advance the cycle until at least one node is ready. If multiple instructions
930 /// are ready, return NULL.
1251 /// defer any nodes that now hit a hazard, and advance the cycle until at least
1252 /// one node is ready. If multiple instructions are ready, return NULL.
9311253 SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() {
9321254 if (CheckPending)
9331255 releasePending();
9341256
1257 if (IssueCount > 0) {
1258 // Defer any ready instrs that now have a hazard.
1259 for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) {
1260 if (checkHazard(*I)) {
1261 Pending.push(*I);
1262 I = Available.remove(I);
1263 continue;
1264 }
1265 ++I;
1266 }
1267 }
9351268 for (unsigned i = 0; Available.empty(); ++i) {
9361269 assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) &&
9371270 "permanent hazard"); (void)i;
9431276 return NULL;
9441277 }
9451278
946 #ifndef NDEBUG
947 void ConvergingScheduler::traceCandidate(const char *Label, const ReadyQueue &Q,
948 SUnit *SU, PressureElement P) {
949 dbgs() << Label << " " << Q.getName() << " ";
950 if (P.isValid())
951 dbgs() << TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease
952 << " ";
953 else
954 dbgs() << " ";
955 SU->dump(DAG);
956 }
957 #endif
1279 /// Record the candidate policy for opposite zones with different critical
1280 /// resources.
1281 ///
1282 /// If the CriticalZone is latency limited, don't force a policy for the
1283 /// candidates here. Instead, when releasing each candidate, releaseNode
1284 /// compares the region's critical path to the candidate's height or depth and
1285 /// the scheduled zone's expected latency then sets ShouldIncreaseILP.
1286 void ConvergingScheduler::balanceZones(
1287 ConvergingScheduler::SchedBoundary &CriticalZone,
1288 ConvergingScheduler::SchedCandidate &CriticalCand,
1289 ConvergingScheduler::SchedBoundary &OppositeZone,
1290 ConvergingScheduler::SchedCandidate &OppositeCand) {
1291
1292 if (!CriticalZone.IsResourceLimited)
1293 return;
1294
1295 SchedRemainder *Rem = CriticalZone.Rem;
1296
1297 // If the critical zone is overconsuming a resource relative to the
1298 // remainder, try to reduce it.
1299 unsigned RemainingCritCount =
1300 Rem->RemainingCounts[CriticalZone.CritResIdx];
1301 if ((int)(Rem->MaxRemainingCount - RemainingCritCount)
1302 > (int)SchedModel->getLatencyFactor()) {
1303 CriticalCand.Policy.ReduceResIdx = CriticalZone.CritResIdx;
1304 DEBUG(dbgs() << "Balance " << CriticalZone.Available.getName() << " reduce "
1305 << SchedModel->getProcResource(CriticalZone.CritResIdx)->Name
1306 << '\n');
1307 }
1308 // If the other zone is underconsuming a resource relative to the full zone,
1309 // try to increase it.
1310 unsigned OppositeCount =
1311 OppositeZone.ResourceCounts[CriticalZone.CritResIdx];
1312 if ((int)(OppositeZone.ExpectedCount - OppositeCount)
1313 > (int)SchedModel->getLatencyFactor()) {
1314 OppositeCand.Policy.DemandResIdx = CriticalZone.CritResIdx;
1315 DEBUG(dbgs() << "Balance " << OppositeZone.Available.getName() << " demand "
1316 << SchedModel->getProcResource(OppositeZone.CritResIdx)->Name
1317 << '\n');
1318 }
1319 }
1320
1321 /// Determine if the scheduled zones exceed resource limits or critical path and
1322 /// set each candidate's ReduceHeight policy accordingly.
1323 void ConvergingScheduler::checkResourceLimits(
1324 ConvergingScheduler::SchedCandidate &TopCand,
1325 ConvergingScheduler::SchedCandidate &BotCand) {
1326
1327 Bot.checkILPPolicy();
1328 Top.checkILPPolicy();
1329 if (Bot.ShouldIncreaseILP)
1330 BotCand.Policy.ReduceLatency = true;
1331 if (Top.ShouldIncreaseILP)
1332 TopCand.Policy.ReduceLatency = true;
1333
1334 // Handle resource-limited regions.
1335 if (Top.IsResourceLimited && Bot.IsResourceLimited
1336 && Top.CritResIdx == Bot.CritResIdx) {
1337 // If the scheduled critical resource in both zones is no longer the
1338 // critical remaining resource, attempt to reduce resource height both ways.
1339 if (Top.CritResIdx != Rem.CritResIdx) {
1340 TopCand.Policy.ReduceResIdx = Top.CritResIdx;
1341 BotCand.Policy.ReduceResIdx = Bot.CritResIdx;
1342 DEBUG(dbgs() << "Reduce scheduled "
1343 << SchedModel->getProcResource(Top.CritResIdx)->Name << '\n');
1344 }
1345 return;
1346 }
1347 // Handle latency-limited regions.
1348 if (!Top.IsResourceLimited && !Bot.IsResourceLimited) {
1349 // If the total scheduled expected latency exceeds the region's critical
1350 // path then reduce latency both ways.
1351 //
1352 // Just because a zone is not resource limited does not mean it is latency
1353 // limited. Unbuffered resources, such as max micro-ops, may cause CurrCycle
1354 // to exceed expected latency.
1355 if ((Top.ExpectedLatency + Bot.ExpectedLatency >= Rem.CriticalPath)
1356 && (Rem.CriticalPath > Top.CurrCycle + Bot.CurrCycle)) {
1357 TopCand.Policy.ReduceLatency = true;
1358 BotCand.Policy.ReduceLatency = true;
1359 DEBUG(dbgs() << "Reduce scheduled latency " << Top.ExpectedLatency
1360 << " + " << Bot.ExpectedLatency << '\n');
1361 }
1362 return;
1363 }
1364 // The critical resource is different in each zone, so request balancing.
1365
1366 // Compute the cost of each zone.
1367 Rem.MaxRemainingCount = std::max(
1368 Rem.RemainingMicroOps * SchedModel->getMicroOpFactor(),
1369 Rem.RemainingCounts[Rem.CritResIdx]);
1370 Top.ExpectedCount = std::max(Top.ExpectedLatency, Top.CurrCycle);
1371 Top.ExpectedCount = std::max(
1372 Top.getCriticalCount(),
1373 Top.ExpectedCount * SchedModel->getLatencyFactor());
1374 Bot.ExpectedCount = std::max(Bot.ExpectedLatency, Bot.CurrCycle);
1375 Bot.ExpectedCount = std::max(
1376 Bot.getCriticalCount(),
1377 Bot.ExpectedCount * SchedModel->getLatencyFactor());
1378
1379 balanceZones(Top, TopCand, Bot, BotCand);
1380 balanceZones(Bot, BotCand, Top, TopCand);
1381 }
1382
1383 void ConvergingScheduler::SchedCandidate::
1384 initResourceDelta(const ScheduleDAGMI *DAG,
1385 const TargetSchedModel *SchedModel) {
1386 if (!Policy.ReduceResIdx && !Policy.DemandResIdx)
1387 return;
1388
1389 const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
1390 for (TargetSchedModel::ProcResIter
1391 PI = SchedModel->getWriteProcResBegin(SC),
1392 PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
1393 if (PI->ProcResourceIdx == Policy.ReduceResIdx)
1394 ResDelta.CritResources += PI->Cycles;
1395 if (PI->ProcResourceIdx == Policy.DemandResIdx)
1396 ResDelta.DemandedResources += PI->Cycles;
1397 }
1398 }
1399
1400 /// Return true if this heuristic determines order.
1401 static bool tryLess(unsigned TryVal, unsigned CandVal,
1402 ConvergingScheduler::SchedCandidate &TryCand,
1403 ConvergingScheduler::SchedCandidate &Cand,
1404 ConvergingScheduler::CandReason Reason) {
1405 if (TryVal < CandVal) {
1406 TryCand.Reason = Reason;
1407 return true;
1408 }
1409 if (TryVal > CandVal) {
1410 if (Cand.Reason > Reason)
1411 Cand.Reason = Reason;
1412 return true;
1413 }
1414 return false;
1415 }
1416 static bool tryGreater(unsigned TryVal, unsigned CandVal,
1417 ConvergingScheduler::SchedCandidate &TryCand,
1418 ConvergingScheduler::SchedCandidate &Cand,
1419 ConvergingScheduler::CandReason Reason) {
1420 if (TryVal > CandVal) {
1421 TryCand.Reason = Reason;
1422 return true;
1423 }
1424 if (TryVal < CandVal) {
1425 if (Cand.Reason > Reason)
1426 Cand.Reason = Reason;
1427 return true;
1428 }
1429 return false;
1430 }
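
A hypothetical walk-through of the convention these helpers encode (the pressure numbers are made up): returning true means the heuristic settled the comparison, either by promoting the new candidate or by capping the reason recorded for the incumbent:

// Suppose the incumbent Cand currently wins only on node order (NodeOrder),
// and TryCand would raise excess pressure by 0 units versus Cand's 2:
//   tryLess(/*TryVal=*/0, /*CandVal=*/2, TryCand, Cand, SingleExcess)
//     sets TryCand.Reason = SingleExcess and returns true, so tryCandidate
//     stops here and setBest() installs TryCand.
// If instead TryCand would raise excess pressure by 3 versus Cand's 2:
//   tryLess(/*TryVal=*/3, /*CandVal=*/2, TryCand, Cand, SingleExcess)
//     leaves TryCand alone but lowers Cand.Reason from NodeOrder to
//     SingleExcess (a higher priority in the CandReason enum) and returns
//     true, so Cand survives on a pressure reason and later heuristics are
//     skipped for this pair.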
1431
1432 /// Apply a set of heuristics to a new candidate. Heuristics are currently
1433 /// hierarchical. This may be more efficient than a graduated cost model because
1434 /// we don't need to evaluate all aspects of the model for each node in the
1435 /// queue. But it's really done to make the heuristics easier to debug and
1436 /// statistically analyze.
1437 ///
1438 /// \param Cand provides the policy and current best candidate.
1439 /// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
1440 /// \param Zone describes the scheduled zone that we are extending.
1441 /// \param RPTracker describes reg pressure within the scheduled zone.
1442 /// \param TempTracker is a scratch pressure tracker to reuse in queries.
1443 void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
1444 SchedCandidate &TryCand,
1445 SchedBoundary &Zone,
1446 const RegPressureTracker &RPTracker,
1447 RegPressureTracker &TempTracker) {
1448
1449 // Always initialize TryCand's RPDelta.
1450 TempTracker.getMaxPressureDelta(TryCand.SU->getInstr(), TryCand.RPDelta,
1451 DAG->getRegionCriticalPSets(),
1452 DAG->getRegPressure().MaxSetPressure);
1453
1454 // Initialize the candidate if needed.
1455 if (!Cand.isValid()) {
1456 TryCand.Reason = NodeOrder;
1457 return;
1458 }
1459 // Avoid exceeding the target's limit.
1460 if (tryLess(TryCand.RPDelta.Excess.UnitIncrease,
1461 Cand.RPDelta.Excess.UnitIncrease, TryCand, Cand, SingleExcess))
1462 return;
1463 if (Cand.Reason == SingleExcess)
1464 Cand.Reason = MultiPressure;
1465
1466 // Avoid increasing the max critical pressure in the scheduled region.
1467 if (tryLess(TryCand.RPDelta.CriticalMax.UnitIncrease,
1468 Cand.RPDelta.CriticalMax.UnitIncrease,
1469 TryCand, Cand, SingleCritical))
1470 return;
1471 if (Cand.Reason == SingleCritical)
1472 Cand.Reason = MultiPressure;
1473
1474 // Avoid critical resource consumption and balance the schedule.
1475 TryCand.initResourceDelta(DAG, SchedModel);
1476 if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
1477 TryCand, Cand, ResourceReduce))
1478 return;
1479 if (tryGreater(TryCand.ResDelta.DemandedResources,
1480 Cand.ResDelta.DemandedResources,
1481 TryCand, Cand, ResourceDemand))
1482 return;
1483
1484 // Avoid serializing long latency dependence chains.
1485 if (Cand.Policy.ReduceLatency) {
1486 if (Zone.isTop()) {
1487 if (Cand.SU->getDepth() * SchedModel->getLatencyFactor()
1488 > Zone.ExpectedCount) {
1489 if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
1490 TryCand, Cand, TopDepthReduce))
1491 return;
1492 }
1493 if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
1494 TryCand, Cand, TopPathReduce))
1495 return;
1496 }
1497 else {
1498 if (Cand.SU->getHeight() * SchedModel->getLatencyFactor()
1499 > Zone.ExpectedCount) {
1500 if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
1501 TryCand, Cand, BotHeightReduce))
1502 return;
1503 }
1504 if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
1505 TryCand, Cand, BotPathReduce))
1506 return;
1507 }
1508 }
1509
1510 // Avoid increasing the max pressure of the entire region.
1511 if (tryLess(TryCand.RPDelta.CurrentMax.UnitIncrease,
1512 Cand.RPDelta.CurrentMax.UnitIncrease, TryCand, Cand, SingleMax))
1513 return;
1514 if (Cand.Reason == SingleMax)
1515 Cand.Reason = MultiPressure;
1516
1517 // Prefer immediate defs/users of the last scheduled instruction. This is a
1518 // nice pressure avoidance strategy that also conserves the processor's
1519 // register renaming resources and keeps the machine code readable.
1520 if (Zone.NextSUs.count(TryCand.SU) && !Zone.NextSUs.count(Cand.SU)) {
1521 TryCand.Reason = NextDefUse;
1522 return;
1523 }
1524 if (!Zone.NextSUs.count(TryCand.SU) && Zone.NextSUs.count(Cand.SU)) {
1525 if (Cand.Reason > NextDefUse)
1526 Cand.Reason = NextDefUse;
1527 return;
1528 }
1529 // Fall through to original instruction order.
1530 if ((Zone.isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum)
1531 || (!Zone.isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
1532 TryCand.Reason = NodeOrder;
1533 }
1534 }
9581535
9591536 /// pickNodeFromQueue helper that returns true if the LHS reg pressure effect is
9601537 /// more desirable than RHS from scheduling standpoint.
9651542 // have UnitIncrease==0, so are neutral.
9661543
9671544 // Avoid increasing the max critical pressure in the scheduled region.
968 if (LHS.Excess.UnitIncrease != RHS.Excess.UnitIncrease)
1545 if (LHS.Excess.UnitIncrease != RHS.Excess.UnitIncrease) {
1546 DEBUG(dbgs() << "RP excess top - bot: "
1547 << (LHS.Excess.UnitIncrease - RHS.Excess.UnitIncrease) << '\n');
9691548 return LHS.Excess.UnitIncrease < RHS.Excess.UnitIncrease;
970
1549 }
9711550 // Avoid increasing the max critical pressure in the scheduled region.
972 if (LHS.CriticalMax.UnitIncrease != RHS.CriticalMax.UnitIncrease)
1551 if (LHS.CriticalMax.UnitIncrease != RHS.CriticalMax.UnitIncrease) {
1552 DEBUG(dbgs() << "RP critical top - bot: "
1553 << (LHS.CriticalMax.UnitIncrease - RHS.CriticalMax.UnitIncrease)
1554 << '\n');
9731555 return LHS.CriticalMax.UnitIncrease < RHS.CriticalMax.UnitIncrease;
974
1556 }
9751557 // Avoid increasing the max pressure of the entire region.
976 if (LHS.CurrentMax.UnitIncrease != RHS.CurrentMax.UnitIncrease)
1558 if (LHS.CurrentMax.UnitIncrease != RHS.CurrentMax.UnitIncrease) {
1559 DEBUG(dbgs() << "RP current top - bot: "
1560 << (LHS.CurrentMax.UnitIncrease - RHS.CurrentMax.UnitIncrease)
1561 << '\n');
9771562 return LHS.CurrentMax.UnitIncrease < RHS.CurrentMax.UnitIncrease;
978
1563 }
9791564 return false;
9801565 }
1566
1567 #ifndef NDEBUG
1568 const char *ConvergingScheduler::getReasonStr(
1569 ConvergingScheduler::CandReason Reason) {
1570 switch (Reason) {
1571 case NoCand: return "NOCAND ";
1572 case SingleExcess: return "REG-EXCESS";
1573 case SingleCritical: return "REG-CRIT ";
1574 case SingleMax: return "REG-MAX ";
1575 case MultiPressure: return "REG-MULTI ";
1576 case ResourceReduce: return "RES-REDUCE";
1577 case ResourceDemand: return "RES-DEMAND";
1578 case TopDepthReduce: return "TOP-DEPTH ";
1579 case TopPathReduce: return "TOP-PATH ";
1580 case BotHeightReduce:return "BOT-HEIGHT";
1581 case BotPathReduce: return "BOT-PATH ";
1582 case NextDefUse: return "DEF-USE ";
1583 case NodeOrder: return "ORDER ";
1584 };
1585 }
1586
1587 void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand,
1588 const SchedBoundary &Zone) {
1589 const char *Label = getReasonStr(Cand.Reason);
1590 PressureElement P;
1591 unsigned ResIdx = 0;
1592 unsigned Latency = 0;
1593 switch (Cand.Reason) {
1594 default:
1595 break;
1596 case SingleExcess:
1597 P = Cand.RPDelta.Excess;
1598 break;
1599 case SingleCritical:
1600 P = Cand.RPDelta.CriticalMax;
1601 break;
1602 case SingleMax:
1603 P = Cand.RPDelta.CurrentMax;
1604 break;
1605 case ResourceReduce:
1606 ResIdx = Cand.Policy.ReduceResIdx;
1607 break;
1608 case ResourceDemand:
1609 ResIdx = Cand.Policy.DemandResIdx;
1610 break;
1611 case TopDepthReduce:
1612 Latency = Cand.SU->getDepth();
1613 break;
1614 case TopPathReduce:
1615 Latency = Cand.SU->getHeight();
1616 break;
1617 case BotHeightReduce:
1618 Latency = Cand.SU->getHeight();
1619 break;
1620 case BotPathReduce:
1621 Latency = Cand.SU->getDepth();
1622 break;
1623 }
1624 dbgs() << Label << " " << Zone.Available.getName() << " ";
1625 if (P.isValid())
1626 dbgs() << TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease
1627 << " ";
1628 else
1629 dbgs() << " ";
1630 if (ResIdx)
1631 dbgs() << SchedModel->getProcResource(ResIdx)->Name << " ";
1632 else
1633 dbgs() << " ";
1634 if (Latency)
1635 dbgs() << Latency << " cycles ";
1636 else
1637 dbgs() << " ";
1638 Cand.SU->dump(DAG);
1639 }
1640 #endif
9811641
9821642 /// Pick the best candidate from the top queue.
9831643 ///
9841644 /// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
9851645 /// DAG building. To adjust for the current scheduling location we need to
9861646 /// maintain the number of vreg uses remaining to be top-scheduled.
987 ConvergingScheduler::CandResult ConvergingScheduler::
988 pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
989 SchedCandidate &Candidate) {
1647 void ConvergingScheduler::pickNodeFromQueue(SchedBoundary &Zone,
1648 const RegPressureTracker &RPTracker,
1649 SchedCandidate &Cand) {
1650 ReadyQueue &Q = Zone.Available;
1651
9901652 DEBUG(Q.dump());
9911653
9921654 // getMaxPressureDelta temporarily modifies the tracker.
9931655 RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
9941656
995 // BestSU remains NULL if no top candidates beat the best existing candidate.
996 CandResult FoundCandidate = NoCand;
9971657 for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
998 RegPressureDelta RPDelta;
999 TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta,
1000 DAG->getRegionCriticalPSets(),
1001 DAG->getRegPressure().MaxSetPressure);
1002
1003 // Initialize the candidate if needed.
1004 if (!Candidate.SU) {
1005 Candidate.SU = *I;
1006 Candidate.RPDelta = RPDelta;
1007 FoundCandidate = NodeOrder;
1008 continue;
1009 }
1010 // Avoid exceeding the target's limit.
1011 if (RPDelta.Excess.UnitIncrease < Candidate.RPDelta.Excess.UnitIncrease) {
1012 DEBUG(traceCandidate("ECAND", Q, *I, RPDelta.Excess));
1013 Candidate.SU = *I;
1014 Candidate.RPDelta = RPDelta;
1015 FoundCandidate = SingleExcess;
1016 continue;
1017 }
1018 if (RPDelta.Excess.UnitIncrease > Candidate.RPDelta.Excess.UnitIncrease)
1019 continue;
1020 if (FoundCandidate == SingleExcess)
1021 FoundCandidate = MultiPressure;
1022
1023 // Avoid increasing the max critical pressure in the scheduled region.
1024 if (RPDelta.CriticalMax.UnitIncrease
1025 < Candidate.RPDelta.CriticalMax.UnitIncrease) {
1026 DEBUG(traceCandidate("PCAND", Q, *I, RPDelta.CriticalMax));
1027 Candidate.SU = *I;
1028 Candidate.RPDelta = RPDelta;
1029 FoundCandidate = SingleCritical;
1030 continue;
1031 }
1032 if (RPDelta.CriticalMax.UnitIncrease
1033 > Candidate.RPDelta.CriticalMax.UnitIncrease)
1034 continue;
1035 if (FoundCandidate == SingleCritical)
1036 FoundCandidate = MultiPressure;
1037
1038 // Avoid increasing the max pressure of the entire region.
1039 if (RPDelta.CurrentMax.UnitIncrease
1040 < Candidate.RPDelta.CurrentMax.UnitIncrease) {
1041 DEBUG(traceCandidate("MCAND", Q, *I, RPDelta.CurrentMax));
1042 Candidate.SU = *I;
1043 Candidate.RPDelta = RPDelta;
1044 FoundCandidate = SingleMax;
1045 continue;
1046 }
1047 if (RPDelta.CurrentMax.UnitIncrease
1048 > Candidate.RPDelta.CurrentMax.UnitIncrease)
1049 continue;
1050 if (FoundCandidate == SingleMax)
1051 FoundCandidate = MultiPressure;
1052
1053 // Fall through to original instruction order.
1054 // Only consider node order if Candidate was chosen from this Q.
1055 if (FoundCandidate == NoCand)
1056 continue;
1057
1058 if ((Q.getID() == TopQID && (*I)->NodeNum < Candidate.SU->NodeNum)
1059 || (Q.getID() == BotQID && (*I)->NodeNum > Candidate.SU->NodeNum)) {
1060 DEBUG(traceCandidate("NCAND", Q, *I));
1061 Candidate.SU = *I;
1062 Candidate.RPDelta = RPDelta;
1063 FoundCandidate = NodeOrder;
1064 }
1065 }
1066 return FoundCandidate;
1658
1659 SchedCandidate TryCand(Cand.Policy);
1660 TryCand.SU = *I;
1661 tryCandidate(Cand, TryCand, Zone, RPTracker, TempTracker);
1662 if (TryCand.Reason != NoCand) {
1663 // Initialize resource delta if needed in case future heuristics query it.
1664 if (TryCand.ResDelta == SchedResourceDelta())
1665 TryCand.initResourceDelta(DAG, SchedModel);
1666 Cand.setBest(TryCand);
1667 DEBUG(traceCandidate(Cand, Zone));
1668 }
1669 TryCand.SU = *I;
1670 }
1671 }
1672
1673 static void tracePick(const ConvergingScheduler::SchedCandidate &Cand,
1674 bool IsTop) {
1675 DEBUG(dbgs() << "Pick " << (IsTop ? "top" : "bot")
1676 << " SU(" << Cand.SU->NodeNum << ") "
1677 << ConvergingScheduler::getReasonStr(Cand.Reason) << '\n');
10671678 }
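
When the scheduler's debug output is enabled, this prints one line per pick; a hypothetical excerpt (the SU numbers and reasons here are invented) would look like:

Pick bot SU(7) REG-CRIT
Pick top SU(3) ORDER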
10681679
10691680 /// Pick the best candidate node from either the top or bottom queue.
1070 SUnit *ConvergingScheduler::pickNodeBidrectional(bool &IsTopNode) {
1681 SUnit *ConvergingScheduler::pickNodeBidirectional(bool &IsTopNode) {
10711682 // Schedule as far as possible in the direction of no choice. This is most
10721683 // efficient, but also provides the best heuristics for CriticalPSets.
10731684 if (SUnit *SU = Bot.pickOnlyChoice()) {
10781689 IsTopNode = true;
10791690 return SU;
10801691 }
1081 SchedCandidate BotCand;
1692 CandPolicy NoPolicy;
1693 SchedCandidate BotCand(NoPolicy);
1694 SchedCandidate TopCand(NoPolicy);
1695 checkResourceLimits(TopCand, BotCand);
1696
10821697 // Prefer bottom scheduling when heuristics are silent.
1083 CandResult BotResult = pickNodeFromQueue(Bot.Available,
1084 DAG->getBotRPTracker(), BotCand);
1085 assert(BotResult != NoCand && "failed to find the first candidate");
1698 pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
1699 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
10861700
10871701 // If either Q has a single candidate that provides the least increase in
10881702 // Excess pressure, we can immediately schedule from that Q.
10911705 // affects picking from either Q. If scheduling in one direction must
10921706 // increase pressure for one of the excess PSets, then schedule in that
10931707 // direction first to provide more freedom in the other direction.
1094 if (BotResult == SingleExcess || BotResult == SingleCritical) {
1708 if (BotCand.Reason == SingleExcess || BotCand.Reason == SingleCritical) {
10951709 IsTopNode = false;
1710 tracePick(BotCand, IsTopNode);
10961711 return BotCand.SU;
10971712 }
10981713 // Check if the top Q has a better candidate.
1099 SchedCandidate TopCand;
1100 CandResult TopResult = pickNodeFromQueue(Top.Available,
1101 DAG->getTopRPTracker(), TopCand);
1102 assert(TopResult != NoCand && "failed to find the first candidate");
1103
1104 if (TopResult == SingleExcess || TopResult == SingleCritical) {
1105 IsTopNode = true;
1106 return TopCand.SU;
1107 }
1714 pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
1715 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
1716
11081717 // If either Q has a single candidate that minimizes pressure above the
11091718 // original region's pressure pick it.
1110 if (BotResult == SingleMax) {
1719 if (TopCand.Reason <= SingleMax || BotCand.Reason <= SingleMax) {
1720 if (TopCand.Reason < BotCand.Reason) {
1721 IsTopNode = true;
1722 tracePick(TopCand, IsTopNode);
1723 return TopCand.SU;
1724 }
11111725 IsTopNode = false;
1726 tracePick(BotCand, IsTopNode);
11121727 return BotCand.SU;
1113 }
1114 if (TopResult == SingleMax) {
1115 IsTopNode = true;
1116 return TopCand.SU;
11171728 }
11181729 // Check for a salient pressure difference and pick the best from either side.
11191730 if (compareRPDelta(TopCand.RPDelta, BotCand.RPDelta)) {
11201731 IsTopNode = true;
1732 tracePick(TopCand, IsTopNode);
11211733 return TopCand.SU;
11221734 }
1123 // Otherwise prefer the bottom candidate in node order.
1735 // Otherwise prefer the bottom candidate, in node order if all else failed.
1736 if (TopCand.Reason < BotCand.Reason) {
1737 IsTopNode = true;
1738 tracePick(TopCand, IsTopNode);
1739 return TopCand.SU;
1740 }
11241741 IsTopNode = false;
1742 tracePick(BotCand, IsTopNode);
11251743 return BotCand.SU;
11261744 }
11271745
11371755 if (ForceTopDown) {
11381756 SU = Top.pickOnlyChoice();
11391757 if (!SU) {
1140 SchedCandidate TopCand;
1141 CandResult TopResult =
1142 pickNodeFromQueue(Top.Available, DAG->getTopRPTracker(), TopCand);
1143 assert(TopResult != NoCand && "failed to find the first candidate");
1144 (void)TopResult;
1758 CandPolicy NoPolicy;
1759 SchedCandidate TopCand(NoPolicy);
1760 pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
1761 assert(TopCand.Reason != NoCand && "failed to find the first candidate");
11451762 SU = TopCand.SU;
11461763 }
11471764 IsTopNode = true;
11491766 else if (ForceBottomUp) {
11501767 SU = Bot.pickOnlyChoice();
11511768 if (!SU) {
1152 SchedCandidate BotCand;
1153 CandResult BotResult =
1154 pickNodeFromQueue(Bot.Available, DAG->getBotRPTracker(), BotCand);
1155 assert(BotResult != NoCand && "failed to find the first candidate");
1156 (void)BotResult;
1769 CandPolicy NoPolicy;
1770 SchedCandidate BotCand(NoPolicy);
1771 pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
1772 assert(BotCand.Reason != NoCand && "failed to find the first candidate");
11571773 SU = BotCand.SU;
11581774 }
11591775 IsTopNode = false;
11601776 }
11611777 else {
1162 SU = pickNodeBidrectional(IsTopNode);
1778 SU = pickNodeBidirectional(IsTopNode);
11631779 }
11641780 } while (SU->isScheduled);
11651781
0 ; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \
1 ; RUN: -verify-machineinstrs | FileCheck %s
2 ;
3 ; Verify that misched resource/latency balancing heuristics are sane.
4
5 define void @unrolled_mmult1(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94,
6 i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99,
7 i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104)
8 nounwind uwtable ssp {
9 entry:
10 br label %for.body
11
12 ; imull folded loads should be in order and interleaved with addl, never
13 ; adjacent. Also check that we have no spilling.
14 ;
15 ; Since mmult1 IR is already in good order, this effectively ensures
16 ; the scheduler maintains source order.
17 ;
18 ; CHECK: %for.body
19 ; CHECK-NOT: %rsp
20 ; CHECK: imull 4
21 ; CHECK-NOT: {{imull|rsp}}
22 ; CHECK: addl
23 ; CHECK: imull 8
24 ; CHECK-NOT: {{imull|rsp}}
25 ; CHECK: addl
26 ; CHECK: imull 12
27 ; CHECK-NOT: {{imull|rsp}}
28 ; CHECK: addl
29 ; CHECK: imull 16
30 ; CHECK-NOT: {{imull|rsp}}
31 ; CHECK: addl
32 ; CHECK: imull 20
33 ; CHECK-NOT: {{imull|rsp}}
34 ; CHECK: addl
35 ; CHECK: imull 24
36 ; CHECK-NOT: {{imull|rsp}}
37 ; CHECK: addl
38 ; CHECK: imull 28
39 ; CHECK-NOT: {{imull|rsp}}
40 ; CHECK: addl
41 ; CHECK: imull 32
42 ; CHECK-NOT: {{imull|rsp}}
43 ; CHECK: addl
44 ; CHECK: imull 36
45 ; CHECK-NOT: {{imull|rsp}}
46 ; CHECK: addl
47 ; CHECK-NOT: {{imull|rsp}}
48 ; CHECK: %end
49 for.body:
50 %indvars.iv42.i = phi i64 [ %indvars.iv.next43.i, %for.body ], [ 0, %entry ]
51 %tmp57 = load i32* %tmp56, align 4
52 %arrayidx12.us.i61 = getelementptr inbounds i32* %pre, i64 %indvars.iv42.i
53 %tmp58 = load i32* %arrayidx12.us.i61, align 4
54 %mul.us.i = mul nsw i32 %tmp58, %tmp57
55 %arrayidx8.us.i.1 = getelementptr inbounds i32* %tmp56, i64 1
56 %tmp59 = load i32* %arrayidx8.us.i.1, align 4
57 %arrayidx12.us.i61.1 = getelementptr inbounds i32* %pre94, i64 %indvars.iv42.i
58 %tmp60 = load i32* %arrayidx12.us.i61.1, align 4
59 %mul.us.i.1 = mul nsw i32 %tmp60, %tmp59
60 %add.us.i.1 = add nsw i32 %mul.us.i.1, %mul.us.i
61 %arrayidx8.us.i.2 = getelementptr inbounds i32* %tmp56, i64 2
62 %tmp61 = load i32* %arrayidx8.us.i.2, align 4
63 %arrayidx12.us.i61.2 = getelementptr inbounds i32* %pre95, i64 %indvars.iv42.i
64 %tmp62 = load i32* %arrayidx12.us.i61.2, align 4
65 %mul.us.i.2 = mul nsw i32 %tmp62, %tmp61
66 %add.us.i.2 = add nsw i32 %mul.us.i.2, %add.us.i.1
67 %arrayidx8.us.i.3 = getelementptr inbounds i32* %tmp56, i64 3
68 %tmp63 = load i32* %arrayidx8.us.i.3, align 4
69 %arrayidx12.us.i61.3 = getelementptr inbounds i32* %pre96, i64 %indvars.iv42.i
70 %tmp64 = load i32* %arrayidx12.us.i61.3, align 4
71 %mul.us.i.3 = mul nsw i32 %tmp64, %tmp63
72 %add.us.i.3 = add nsw i32 %mul.us.i.3, %add.us.i.2
73 %arrayidx8.us.i.4 = getelementptr inbounds i32* %tmp56, i64 4
74 %tmp65 = load i32* %arrayidx8.us.i.4, align 4
75 %arrayidx12.us.i61.4 = getelementptr inbounds i32* %pre97, i64 %indvars.iv42.i
76 %tmp66 = load i32* %arrayidx12.us.i61.4, align 4
77 %mul.us.i.4 = mul nsw i32 %tmp66, %tmp65
78 %add.us.i.4 = add nsw i32 %mul.us.i.4, %add.us.i.3
79 %arrayidx8.us.i.5 = getelementptr inbounds i32* %tmp56, i64 5
80 %tmp67 = load i32* %arrayidx8.us.i.5, align 4
81 %arrayidx12.us.i61.5 = getelementptr inbounds i32* %pre98, i64 %indvars.iv42.i
82 %tmp68 = load i32* %arrayidx12.us.i61.5, align 4
83 %mul.us.i.5 = mul nsw i32 %tmp68, %tmp67
84 %add.us.i.5 = add nsw i32 %mul.us.i.5, %add.us.i.4
85 %arrayidx8.us.i.6 = getelementptr inbounds i32* %tmp56, i64 6
86 %tmp69 = load i32* %arrayidx8.us.i.6, align 4
87 %arrayidx12.us.i61.6 = getelementptr inbounds i32* %pre99, i64 %indvars.iv42.i
88 %tmp70 = load i32* %arrayidx12.us.i61.6, align 4
89 %mul.us.i.6 = mul nsw i32 %tmp70, %tmp69
90 %add.us.i.6 = add nsw i32 %mul.us.i.6, %add.us.i.5
91 %arrayidx8.us.i.7 = getelementptr inbounds i32* %tmp56, i64 7
92 %tmp71 = load i32* %arrayidx8.us.i.7, align 4
93 %arrayidx12.us.i61.7 = getelementptr inbounds i32* %pre100, i64 %indvars.iv42.i
94 %tmp72 = load i32* %arrayidx12.us.i61.7, align 4
95 %mul.us.i.7 = mul nsw i32 %tmp72, %tmp71
96 %add.us.i.7 = add nsw i32 %mul.us.i.7, %add.us.i.6
97 %arrayidx8.us.i.8 = getelementptr inbounds i32* %tmp56, i64 8
98 %tmp73 = load i32* %arrayidx8.us.i.8, align 4
99 %arrayidx12.us.i61.8 = getelementptr inbounds i32* %pre101, i64 %indvars.iv42.i
100 %tmp74 = load i32* %arrayidx12.us.i61.8, align 4
101 %mul.us.i.8 = mul nsw i32 %tmp74, %tmp73
102 %add.us.i.8 = add nsw i32 %mul.us.i.8, %add.us.i.7
103 %arrayidx8.us.i.9 = getelementptr inbounds i32* %tmp56, i64 9
104 %tmp75 = load i32* %arrayidx8.us.i.9, align 4
105 %arrayidx12.us.i61.9 = getelementptr inbounds i32* %pre102, i64 %indvars.iv42.i
106 %tmp76 = load i32* %arrayidx12.us.i61.9, align 4
107 %mul.us.i.9 = mul nsw i32 %tmp76, %tmp75
108 %add.us.i.9 = add nsw i32 %mul.us.i.9, %add.us.i.8
109 %arrayidx16.us.i = getelementptr inbounds i32* %tmp55, i64 %indvars.iv42.i
110 store i32 %add.us.i.9, i32* %arrayidx16.us.i, align 4
111 %indvars.iv.next43.i = add i64 %indvars.iv42.i, 1
112 %lftr.wideiv = trunc i64 %indvars.iv.next43.i to i32
113 %exitcond = icmp eq i32 %lftr.wideiv, 10
114 br i1 %exitcond, label %end, label %for.body
115
116 end:
117 ret void
118 }
119
120 ; Unlike the above loop, this IR starts out bad and must be
121 ; rescheduled.
122 ;
123 ; CHECK: %for.body
124 ; CHECK-NOT: %rsp
125 ; CHECK: imull 4
126 ; CHECK-NOT: {{imull|rsp}}
127 ; CHECK: addl
128 ; CHECK: imull 8
129 ; CHECK-NOT: {{imull|rsp}}
130 ; CHECK: addl
131 ; CHECK: imull 12
132 ; CHECK-NOT: {{imull|rsp}}
133 ; CHECK: addl
134 ; CHECK: imull 16
135 ; CHECK-NOT: {{imull|rsp}}
136 ; CHECK: addl
137 ; CHECK: imull 20
138 ; CHECK-NOT: {{imull|rsp}}
139 ; CHECK: addl
140 ; CHECK: imull 24
141 ; CHECK-NOT: {{imull|rsp}}
142 ; CHECK: addl
143 ; CHECK: imull 28
144 ; CHECK-NOT: {{imull|rsp}}
145 ; CHECK: addl
146 ; CHECK: imull 32
147 ; CHECK-NOT: {{imull|rsp}}
148 ; CHECK: addl
149 ; CHECK: imull 36
150 ; CHECK-NOT: {{imull|rsp}}
151 ; CHECK: addl
152 ; CHECK-NOT: {{imull|rsp}}
153 ; CHECK: %end
154 define void @unrolled_mmult2(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94,
155 i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99,
156 i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104)
157 nounwind uwtable ssp {
158 entry:
159 br label %for.body
160 for.body:
161 %indvars.iv42.i = phi i64 [ %indvars.iv.next43.i, %for.body ], [ 0, %entry ]
162 %tmp57 = load i32* %tmp56, align 4
163 %arrayidx12.us.i61 = getelementptr inbounds i32* %pre, i64 %indvars.iv42.i
164 %tmp58 = load i32* %arrayidx12.us.i61, align 4
165 %arrayidx8.us.i.1 = getelementptr inbounds i32* %tmp56, i64 1
166 %tmp59 = load i32* %arrayidx8.us.i.1, align 4
167 %arrayidx12.us.i61.1 = getelementptr inbounds i32* %pre94, i64 %indvars.iv42.i
168 %tmp60 = load i32* %arrayidx12.us.i61.1, align 4
169 %arrayidx8.us.i.2 = getelementptr inbounds i32* %tmp56, i64 2
170 %tmp61 = load i32* %arrayidx8.us.i.2, align 4
171 %arrayidx12.us.i61.2 = getelementptr inbounds i32* %pre95, i64 %indvars.iv42.i
172 %tmp62 = load i32* %arrayidx12.us.i61.2, align 4
173 %arrayidx8.us.i.3 = getelementptr inbounds i32* %tmp56, i64 3
174 %tmp63 = load i32* %arrayidx8.us.i.3, align 4
175 %arrayidx12.us.i61.3 = getelementptr inbounds i32* %pre96, i64 %indvars.iv42.i
176 %tmp64 = load i32* %arrayidx12.us.i61.3, align 4
177 %arrayidx8.us.i.4 = getelementptr inbounds i32* %tmp56, i64 4
178 %tmp65 = load i32* %arrayidx8.us.i.4, align 4
179 %arrayidx12.us.i61.4 = getelementptr inbounds i32* %pre97, i64 %indvars.iv42.i
180 %tmp66 = load i32* %arrayidx12.us.i61.4, align 4
181 %arrayidx8.us.i.5 = getelementptr inbounds i32* %tmp56, i64 5
182 %tmp67 = load i32* %arrayidx8.us.i.5, align 4
183 %arrayidx12.us.i61.5 = getelementptr inbounds i32* %pre98, i64 %indvars.iv42.i
184 %tmp68 = load i32* %arrayidx12.us.i61.5, align 4
185 %arrayidx8.us.i.6 = getelementptr inbounds i32* %tmp56, i64 6
186 %tmp69 = load i32* %arrayidx8.us.i.6, align 4
187 %arrayidx12.us.i61.6 = getelementptr inbounds i32* %pre99, i64 %indvars.iv42.i
188 %tmp70 = load i32* %arrayidx12.us.i61.6, align 4
189 %mul.us.i = mul nsw i32 %tmp58, %tmp57
190 %arrayidx8.us.i.7 = getelementptr inbounds i32* %tmp56, i64 7
191 %tmp71 = load i32* %arrayidx8.us.i.7, align 4
192 %arrayidx12.us.i61.7 = getelementptr inbounds i32* %pre100, i64 %indvars.iv42.i
193 %tmp72 = load i32* %arrayidx12.us.i61.7, align 4
194 %arrayidx8.us.i.8 = getelementptr inbounds i32* %tmp56, i64 8
195 %tmp73 = load i32* %arrayidx8.us.i.8, align 4
196 %arrayidx12.us.i61.8 = getelementptr inbounds i32* %pre101, i64 %indvars.iv42.i
197 %tmp74 = load i32* %arrayidx12.us.i61.8, align 4
198 %arrayidx8.us.i.9 = getelementptr inbounds i32* %tmp56, i64 9
199 %tmp75 = load i32* %arrayidx8.us.i.9, align 4
200 %arrayidx12.us.i61.9 = getelementptr inbounds i32* %pre102, i64 %indvars.iv42.i
201 %tmp76 = load i32* %arrayidx12.us.i61.9, align 4
202 %mul.us.i.1 = mul nsw i32 %tmp60, %tmp59
203 %add.us.i.1 = add nsw i32 %mul.us.i.1, %mul.us.i
204 %mul.us.i.2 = mul nsw i32 %tmp62, %tmp61
205 %add.us.i.2 = add nsw i32 %mul.us.i.2, %add.us.i.1
206 %mul.us.i.3 = mul nsw i32 %tmp64, %tmp63
207 %add.us.i.3 = add nsw i32 %mul.us.i.3, %add.us.i.2
208 %mul.us.i.4 = mul nsw i32 %tmp66, %tmp65
209 %add.us.i.4 = add nsw i32 %mul.us.i.4, %add.us.i.3
210 %mul.us.i.5 = mul nsw i32 %tmp68, %tmp67
211 %add.us.i.5 = add nsw i32 %mul.us.i.5, %add.us.i.4
212 %mul.us.i.6 = mul nsw i32 %tmp70, %tmp69
213 %add.us.i.6 = add nsw i32 %mul.us.i.6, %add.us.i.5
214 %mul.us.i.7 = mul nsw i32 %tmp72, %tmp71
215 %add.us.i.7 = add nsw i32 %mul.us.i.7, %add.us.i.6
216 %mul.us.i.8 = mul nsw i32 %tmp74, %tmp73
217 %add.us.i.8 = add nsw i32 %mul.us.i.8, %add.us.i.7
218 %mul.us.i.9 = mul nsw i32 %tmp76, %tmp75
219 %add.us.i.9 = add nsw i32 %mul.us.i.9, %add.us.i.8
220 %arrayidx16.us.i = getelementptr inbounds i32* %tmp55, i64 %indvars.iv42.i
221 store i32 %add.us.i.9, i32* %arrayidx16.us.i, align 4
222 %indvars.iv.next43.i = add i64 %indvars.iv42.i, 1
223 %lftr.wideiv = trunc i64 %indvars.iv.next43.i to i32
224 %exitcond = icmp eq i32 %lftr.wideiv, 10
225 br i1 %exitcond, label %end, label %for.body
226
227 end:
228 ret void
229 }