llvm.org GIT mirror llvm / c1359c9
MachinePipeliner pass that implements Swing Modulo Scheduling. Software pipelining is an optimization for improving ILP by overlapping loop iterations. Swing Modulo Scheduling (SMS) is an implementation of software pipelining that attempts to reduce register pressure and generate efficient pipelines with a low compile-time cost. This implementation of SMS is a target-independent back-end pass. When enabled, the pass should run just prior to the register allocation pass, while the machine IR is in SSA form. If the pass is successful, then the original loop is replaced by the optimized loop. The optimized loop contains one or more prolog blocks, the pipelined kernel, and one or more epilog blocks. This pass is enabled for Hexagon only. To enable for other targets, a couple of target specific hooks must be implemented, and the pass needs to be called from the target's TargetMachine implementation. Differential Review: http://reviews.llvm.org/D16829 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@277169 91177308-0d34-0410-b5e6-96231b3b80d8 Brendon Cahoon 3 years ago
21 changed file(s) with 4583 addition(s) and 7 deletion(s). Raw diff Collapse all Expand all
373373 /// and propagates register usage information of callee to caller
374374 /// if available with PhysicalRegisterUsageInfo pass.
375375 FunctionPass *createRegUsageInfoPropPass();
376
377 /// This pass performs software pipelining on machine instructions.
378 extern char &MachinePipelinerID;
376379 } // End llvm namespace
377380
378381 /// Target machine pass initializer for passes with dependencies. Use with
222222 void initializeMachineLICMPass(PassRegistry&);
223223 void initializeMachineLoopInfoPass(PassRegistry&);
224224 void initializeMachineModuleInfoPass(PassRegistry&);
225 void initializeMachinePipelinerPass(PassRegistry&);
225226 void initializeMachinePostDominatorTreePass(PassRegistry&);
226227 void initializeMachineRegionInfoPassPass(PassRegistry&);
227228 void initializeMachineSchedulerPass(PassRegistry&);
1717 #include "llvm/ADT/SmallSet.h"
1818 #include "llvm/CodeGen/MachineCombinerPattern.h"
1919 #include "llvm/CodeGen/MachineFunction.h"
20 #include "llvm/CodeGen/MachineLoopInfo.h"
2021 #include "llvm/MC/MCInstrInfo.h"
2122 #include "llvm/Support/BranchProbability.h"
2223 #include "llvm/Target/TargetRegisterInfo.h"
550551 llvm_unreachable("Target didn't implement TargetInstrInfo::InsertBranch!");
551552 }
552553
554 /// Analyze the loop code, return true if it cannot be understood. Upon
555 /// success, this function returns false and returns information about the
556 /// induction variable and compare instruction used at the end.
557 virtual bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
558 MachineInstr *&CmpInst) const {
559 return true;
560 }
561
562 /// Generate code to reduce the loop iteration by one and check if the loop is
563 /// finished. Return the value/register of the new loop count. We need
564 /// this function when peeling off one or more iterations of a loop. This
565 /// function assumes the nth iteration is peeled first.
566 virtual unsigned reduceLoopCount(MachineBasicBlock &MBB,
567 MachineInstr *IndVar, MachineInstr *Cmp,
568 SmallVectorImpl &Cond,
569 SmallVectorImpl &PrevInsts,
570 unsigned Iter, unsigned MaxIter) const {
571 llvm_unreachable("Target didn't implement ReduceLoopCount");
572 }
573
553574 /// Delete the instruction OldInst and everything after it, replacing it with
554575 /// an unconditional branch to NewDest. This is used by the tail merging pass.
555576 virtual void ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
10081029 return false;
10091030 }
10101031
1032 /// Return true if the instruction contains a base register and offset. If
1033 /// true, the function also sets the operand position in the instruction
1034 /// for the base register and offset.
1035 virtual bool getBaseAndOffsetPosition(const MachineInstr *MI,
1036 unsigned &BasePos,
1037 unsigned &OffsetPos) const {
1038 return false;
1039 }
1040
1041 /// If the instruction is an increment of a constant value, return the amount.
1042 virtual bool getIncrementValue(const MachineInstr *MI, int &Value) const {
1043 return false;
1044 }
1045
10111046 virtual bool enableClusterLoads() const { return false; }
10121047
10131048 virtual bool enableClusterStores() const { return false; }
10401075 /// Return the noop instruction to use for a noop.
10411076 virtual void getNoopForMachoTarget(MCInst &NopInst) const;
10421077
1078 /// Return true for post-incremented instructions.
1079 virtual bool isPostIncrement(const MachineInstr* MI) const {
1080 return false;
1081 }
10431082
10441083 /// Returns true if the instruction is already predicated.
10451084 virtual bool isPredicated(const MachineInstr &MI) const {
6969 MachineModuleInfo.cpp
7070 MachineModuleInfoImpls.cpp
7171 MachinePassRegistry.cpp
72 MachinePipeliner.cpp
7273 MachinePostDominators.cpp
7374 MachineRegionInfo.cpp
7475 MachineRegisterInfo.cpp
5252 initializeMachineLICMPass(Registry);
5353 initializeMachineLoopInfoPass(Registry);
5454 initializeMachineModuleInfoPass(Registry);
55 initializeMachinePipelinerPass(Registry);
5556 initializeMachinePostDominatorTreePass(Registry);
5657 initializeMachineSchedulerPass(Registry);
5758 initializeMachineSinkingPass(Registry);
0 //===-- MachinePipeliner.cpp - Machine Software Pipeliner Pass ------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10 //
11 // Software pipelining (SWP) is an instruction scheduling technique for loops
12 // that overlaps loop iterations and exploits ILP via a compiler transformation.
13 //
14 // Swing Modulo Scheduling is an implementation of software pipelining
15 // that generates schedules that are near optimal in terms of initiation
16 // interval, register requirements, and stage count. See the papers:
17 //
18 // "Swing Modulo Scheduling: A Lifetime-Sensitive Approach", by J. Llosa,
19 // A. Gonzalez, E. Ayguade, and M. Valero. In PACT '96 Proceedings of the 1996
20 // Conference on Parallel Architectures and Compilation Techniques.
21 //
22 // "Lifetime-Sensitive Modulo Scheduling in a Production Environment", by J.
23 // Llosa, E. Ayguade, A. Gonzalez, M. Valero, and J. Eckhardt. In IEEE
24 // Transactions on Computers, Vol. 50, No. 3, 2001.
25 //
26 // "An Implementation of Swing Modulo Scheduling With Extensions for
27 // Superblocks", by T. Lattner, Master's Thesis, University of Illinois at
28 // Urbana-Champaign, 2005.
29 //
30 //
31 // The SMS algorithm consists of three main steps after computing the minimal
32 // initiation interval (MII).
33 // 1) Analyze the dependence graph and compute information about each
34 // instruction in the graph.
35 // 2) Order the nodes (instructions) by priority based upon the heuristics
36 // described in the algorithm.
37 // 3) Attempt to schedule the nodes in the specified order using the MII.
38 //
39 // This SMS implementation is a target-independent back-end pass. When enabled,
40 // the pass runs just prior to the register allocation pass, while the machine
41 // IR is in SSA form. If software pipelining is successful, then the original
42 // loop is replaced by the optimized loop. The optimized loop contains one or
43 // more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
44 // the instructions cannot be scheduled in a given MII, we increase the MII by
45 // one and try again.
46 //
47 // The SMS implementation is an extension of the ScheduleDAGInstrs class. We
48 // represent loop carried dependences in the DAG as order edges to the Phi
49 // nodes. We also perform several passes over the DAG to eliminate unnecessary
50 // edges that inhibit the ability to pipeline. The implementation uses the
51 // DFAPacketizer class to compute the minimum initiation interval and to check
52 // where an instruction may be inserted in the pipelined schedule.
53 //
54 // In order for the SMS pass to work, several target specific hooks need to be
55 // implemented to get information about the loop structure and to rewrite
56 // instructions.
57 //
58 //===----------------------------------------------------------------------===//
59
60 #include "llvm/ADT/DenseMap.h"
61 #include "llvm/ADT/MapVector.h"
62 #include "llvm/ADT/PriorityQueue.h"
63 #include "llvm/ADT/SetVector.h"
64 #include "llvm/ADT/SmallPtrSet.h"
65 #include "llvm/ADT/SmallSet.h"
66 #include "llvm/ADT/Statistic.h"
67 #include "llvm/Analysis/AliasAnalysis.h"
68 #include "llvm/Analysis/ValueTracking.h"
69 #include "llvm/CodeGen/DFAPacketizer.h"
70 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
71 #include "llvm/CodeGen/MachineBasicBlock.h"
72 #include "llvm/CodeGen/MachineDominators.h"
73 #include "llvm/CodeGen/MachineInstrBuilder.h"
74 #include "llvm/CodeGen/MachineLoopInfo.h"
75 #include "llvm/CodeGen/MachineRegisterInfo.h"
76 #include "llvm/CodeGen/Passes.h"
77 #include "llvm/CodeGen/RegisterClassInfo.h"
78 #include "llvm/CodeGen/RegisterPressure.h"
79 #include "llvm/CodeGen/ScheduleDAGInstrs.h"
80 #include "llvm/MC/MCInstrItineraries.h"
81 #include "llvm/Support/CommandLine.h"
82 #include "llvm/Support/Debug.h"
83 #include "llvm/Support/raw_ostream.h"
84 #include "llvm/Target/TargetInstrInfo.h"
85 #include "llvm/Target/TargetMachine.h"
86 #include "llvm/Target/TargetRegisterInfo.h"
87 #include "llvm/Target/TargetSubtargetInfo.h"
88 #include
89 #include
90 #include
91
92 using namespace llvm;
93
94 #define DEBUG_TYPE "pipeliner"
95
96 STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
97 STATISTIC(NumPipelined, "Number of loops software pipelined");
98
99 /// A command line option to turn software pipelining on or off.
100 cl::opt EnableSWP("enable-pipeliner", cl::Hidden, cl::init(true),
101 cl::ZeroOrMore, cl::desc("Enable Software Pipelining"));
102
103 /// A command line option to enable SWP at -Os.
104 static cl::opt EnableSWPOptSize("enable-pipeliner-opt-size",
105 cl::desc("Enable SWP at Os."), cl::Hidden,
106 cl::init(false));
107
108 /// A command line argument to limit minimum initial interval for pipelining.
109 static cl::opt SwpMaxMii("pipeliner-max-mii",
110 cl::desc("Size limit for the the MII."),
111 cl::Hidden, cl::init(27));
112
113 /// A command line argument to limit the number of stages in the pipeline.
114 static cl::opt
115 SwpMaxStages("pipeliner-max-stages",
116 cl::desc("Maximum stages allowed in the generated scheduled."),
117 cl::Hidden, cl::init(3));
118
119 /// A command line option to disable the pruning of chain dependences due to
120 /// an unrelated Phi.
121 static cl::opt
122 SwpPruneDeps("pipeliner-prune-deps",
123 cl::desc("Prune dependences between unrelated Phi nodes."),
124 cl::Hidden, cl::init(true));
125
126 /// A command line option to disable the pruning of loop carried order
127 /// dependences.
128 static cl::opt
129 SwpPruneLoopCarried("pipeliner-prune-loop-carried",
130 cl::desc("Prune loop carried order dependences."),
131 cl::Hidden, cl::init(true));
132
133 #ifndef NDEBUG
134 static cl::opt SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-1));
135 #endif
136
137 static cl::opt SwpIgnoreRecMII("pipeliner-ignore-recmii",
138 cl::ReallyHidden, cl::init(false),
139 cl::ZeroOrMore, cl::desc("Ignore RecMII"));
140
141 namespace {
142
143 class NodeSet;
144 class SMSchedule;
145 class SwingSchedulerDAG;
146
147 /// The main class in the implementation of the target independent
148 /// software pipeliner pass.
149 class MachinePipeliner : public MachineFunctionPass {
150 public:
151 MachineFunction *MF = nullptr;
152 const MachineLoopInfo *MLI = nullptr;
153 const MachineDominatorTree *MDT = nullptr;
154 const InstrItineraryData *InstrItins;
155 const TargetInstrInfo *TII = nullptr;
156 RegisterClassInfo RegClassInfo;
157
158 #ifndef NDEBUG
159 static int NumTries;
160 #endif
161 /// Cache the target analysis information about the loop.
162 struct LoopInfo {
163 MachineBasicBlock *TBB = nullptr;
164 MachineBasicBlock *FBB = nullptr;
165 SmallVector BrCond;
166 MachineInstr *LoopInductionVar = nullptr;
167 MachineInstr *LoopCompare = nullptr;
168 };
169 LoopInfo LI;
170
171 static char ID;
172 MachinePipeliner() : MachineFunctionPass(ID) {
173 initializeMachinePipelinerPass(*PassRegistry::getPassRegistry());
174 }
175
176 virtual bool runOnMachineFunction(MachineFunction &MF);
177
178 virtual void getAnalysisUsage(AnalysisUsage &AU) const {
179 AU.addRequired();
180 AU.addPreserved();
181 AU.addRequired();
182 AU.addRequired();
183 AU.addRequired();
184 MachineFunctionPass::getAnalysisUsage(AU);
185 }
186
187 private:
188 bool canPipelineLoop(MachineLoop &L);
189 bool scheduleLoop(MachineLoop &L);
190 bool swingModuloScheduler(MachineLoop &L);
191 };
192
193 /// This class builds the dependence graph for the instructions in a loop,
194 /// and attempts to schedule the instructions using the SMS algorithm.
195 class SwingSchedulerDAG : public ScheduleDAGInstrs {
196 MachinePipeliner &Pass;
197 /// The minimum initiation interval between iterations for this schedule.
198 unsigned MII;
199 /// Set to true if a valid pipelined schedule is found for the loop.
200 bool Scheduled;
201 MachineLoop &Loop;
202 LiveIntervals &LIS;
203 const RegisterClassInfo &RegClassInfo;
204
205 /// A toplogical ordering of the SUnits, which is needed for changing
206 /// dependences and iterating over the SUnits.
207 ScheduleDAGTopologicalSort Topo;
208
209 struct NodeInfo {
210 int ASAP;
211 int ALAP;
212 NodeInfo() : ASAP(0), ALAP(0) {}
213 };
214 /// Computed properties for each node in the graph.
215 std::vector ScheduleInfo;
216
217 enum OrderKind { BottomUp = 0, TopDown = 1 };
218 /// Computed node ordering for scheduling.
219 SetVector NodeOrder;
220
221 typedef SmallVector NodeSetType;
222 typedef DenseMap ValueMapTy;
223 typedef SmallVectorImpl MBBVectorTy;
224 typedef DenseMap InstrMapTy;
225
226 /// Instructions to change when emitting the final schedule.
227 DenseMap> InstrChanges;
228
229 /// We may create a new instruction, so remember it because it
230 /// must be deleted when the pass is finished.
231 SmallPtrSet NewMIs;
232
233 /// Helper class to implement Johnson's circuit finding algorithm.
234 class Circuits {
235 std::vector &SUnits;
236 SetVector Stack;
237 BitVector Blocked;
238 SmallVector, 10> B;
239 SmallVector, 16> AdjK;
240 unsigned NumPaths;
241 static unsigned MaxPaths;
242
243 public:
244 Circuits(std::vector &SUs)
245 : SUnits(SUs), Stack(), Blocked(SUs.size()), B(SUs.size()),
246 AdjK(SUs.size()) {}
247 /// Reset the data structures used in the circuit algorithm.
248 void reset() {
249 Stack.clear();
250 Blocked.reset();
251 B.assign(SUnits.size(), SmallPtrSet());
252 NumPaths = 0;
253 }
254 void createAdjacencyStructure(SwingSchedulerDAG *DAG);
255 bool circuit(int V, int S, NodeSetType &NodeSets, bool HasBackedge = false);
256 void unblock(int U);
257 };
258
259 public:
260 SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
261 const RegisterClassInfo &rci)
262 : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), MII(0),
263 Scheduled(false), Loop(L), LIS(lis), RegClassInfo(rci),
264 Topo(SUnits, &ExitSU) {}
265
266 void schedule();
267 void finishBlock();
268
269 /// Return true if the loop kernel has been scheduled.
270 bool hasNewSchedule() { return Scheduled; }
271
272 /// Return the earliest time an instruction may be scheduled.
273 int getASAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ASAP; }
274
275 /// Return the latest time an instruction my be scheduled.
276 int getALAP(SUnit *Node) { return ScheduleInfo[Node->NodeNum].ALAP; }
277
278 /// The mobility function, which the the number of slots in which
279 /// an instruction may be scheduled.
280 int getMOV(SUnit *Node) { return getALAP(Node) - getASAP(Node); }
281
282 /// The depth, in the dependence graph, for a node.
283 int getDepth(SUnit *Node) { return Node->getDepth(); }
284
285 /// The height, in the dependence graph, for a node.
286 int getHeight(SUnit *Node) { return Node->getHeight(); }
287
288 /// Return true if the dependence is a back-edge in the data dependence graph.
289 /// Since the DAG doesn't contain cycles, we represent a cycle in the graph
290 /// using an anti dependence from a Phi to an instruction.
291 bool isBackedge(SUnit *Source, const SDep &Dep) {
292 if (Dep.getKind() != SDep::Anti)
293 return false;
294 return Source->getInstr()->isPHI() || Dep.getSUnit()->getInstr()->isPHI();
295 }
296
297 /// Return true if the dependence is an order dependence between non-Phis.
298 static bool isOrder(SUnit *Source, const SDep &Dep) {
299 if (Dep.getKind() != SDep::Order)
300 return false;
301 return (!Source->getInstr()->isPHI() &&
302 !Dep.getSUnit()->getInstr()->isPHI());
303 }
304
305 bool isLoopCarriedOrder(SUnit *Source, const SDep &Dep, bool isSucc = true);
306
307 /// The latency of the dependence.
308 unsigned getLatency(SUnit *Source, const SDep &Dep) {
309 // Anti dependences represent recurrences, so use the latency of the
310 // instruction on the back-edge.
311 if (Dep.getKind() == SDep::Anti) {
312 if (Source->getInstr()->isPHI())
313 return Dep.getSUnit()->Latency;
314 if (Dep.getSUnit()->getInstr()->isPHI())
315 return Source->Latency;
316 return Dep.getLatency();
317 }
318 return Dep.getLatency();
319 }
320
321 /// The distance function, which indicates that operation V of iteration I
322 /// depends on operations U of iteration I-distance.
323 unsigned getDistance(SUnit *U, SUnit *V, const SDep &Dep) {
324 // Instructions that feed a Phi have a distance of 1. Computing larger
325 // values for arrays requires data dependence information.
326 if (V->getInstr()->isPHI() && Dep.getKind() == SDep::Anti)
327 return 1;
328 return 0;
329 }
330
331 /// Set the Minimum Initiation Interval for this schedule attempt.
332 void setMII(unsigned mii) { MII = mii; }
333
334 MachineInstr *applyInstrChange(MachineInstr *MI, SMSchedule &Schedule,
335 bool UpdateDAG = false);
336
337 /// Return the new base register that was stored away for the changed
338 /// instruction.
339 unsigned getInstrBaseReg(SUnit *SU) {
340 DenseMap>::iterator It =
341 InstrChanges.find(SU);
342 if (It != InstrChanges.end())
343 return It->second.first;
344 return 0;
345 }
346
347 private:
348 void addLoopCarriedDependences(AliasAnalysis *AA);
349 void updatePhiDependences();
350 void changeDependences();
351 unsigned calculateResMII();
352 unsigned calculateRecMII(NodeSetType &RecNodeSets);
353 void findCircuits(NodeSetType &NodeSets);
354 void fuseRecs(NodeSetType &NodeSets);
355 void removeDuplicateNodes(NodeSetType &NodeSets);
356 void computeNodeFunctions(NodeSetType &NodeSets);
357 void registerPressureFilter(NodeSetType &NodeSets);
358 void colocateNodeSets(NodeSetType &NodeSets);
359 void checkNodeSets(NodeSetType &NodeSets);
360 void groupRemainingNodes(NodeSetType &NodeSets);
361 void addConnectedNodes(SUnit *SU, NodeSet &NewSet,
362 SetVector &NodesAdded);
363 void computeNodeOrder(NodeSetType &NodeSets);
364 bool schedulePipeline(SMSchedule &Schedule);
365 void generatePipelinedLoop(SMSchedule &Schedule);
366 void generateProlog(SMSchedule &Schedule, unsigned LastStage,
367 MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
368 MBBVectorTy &PrologBBs);
369 void generateEpilog(SMSchedule &Schedule, unsigned LastStage,
370 MachineBasicBlock *KernelBB, ValueMapTy *VRMap,
371 MBBVectorTy &EpilogBBs, MBBVectorTy &PrologBBs);
372 void generateExistingPhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
373 MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
374 SMSchedule &Schedule, ValueMapTy *VRMap,
375 InstrMapTy &InstrMap, unsigned LastStageNum,
376 unsigned CurStageNum, bool IsLast);
377 void generatePhis(MachineBasicBlock *NewBB, MachineBasicBlock *BB1,
378 MachineBasicBlock *BB2, MachineBasicBlock *KernelBB,
379 SMSchedule &Schedule, ValueMapTy *VRMap,
380 InstrMapTy &InstrMap, unsigned LastStageNum,
381 unsigned CurStageNum, bool IsLast);
382 void removeDeadInstructions(MachineBasicBlock *KernelBB,
383 MBBVectorTy &EpilogBBs);
384 void splitLifetimes(MachineBasicBlock *KernelBB, MBBVectorTy &EpilogBBs,
385 SMSchedule &Schedule);
386 void addBranches(MBBVectorTy &PrologBBs, MachineBasicBlock *KernelBB,
387 MBBVectorTy &EpilogBBs, SMSchedule &Schedule,
388 ValueMapTy *VRMap);
389 bool computeDelta(MachineInstr &MI, unsigned &Delta);
390 void updateMemOperands(MachineInstr &NewMI, MachineInstr &OldMI,
391 unsigned Num);
392 MachineInstr *cloneInstr(MachineInstr *OldMI, unsigned CurStageNum,
393 unsigned InstStageNum);
394 MachineInstr *cloneAndChangeInstr(MachineInstr *OldMI, unsigned CurStageNum,
395 unsigned InstStageNum,
396 SMSchedule &Schedule);
397 void updateInstruction(MachineInstr *NewMI, bool LastDef,
398 unsigned CurStageNum, unsigned InstStageNum,
399 SMSchedule &Schedule, ValueMapTy *VRMap);
400 MachineInstr *findDefInLoop(unsigned Reg);
401 unsigned getPrevMapVal(unsigned StageNum, unsigned PhiStage, unsigned LoopVal,
402 unsigned LoopStage, ValueMapTy *VRMap,
403 MachineBasicBlock *BB);
404 void rewritePhiValues(MachineBasicBlock *NewBB, unsigned StageNum,
405 SMSchedule &Schedule, ValueMapTy *VRMap,
406 InstrMapTy &InstrMap);
407 void rewriteScheduledInstr(MachineBasicBlock *BB, SMSchedule &Schedule,
408 InstrMapTy &InstrMap, unsigned CurStageNum,
409 unsigned PhiNum, MachineInstr *Phi,
410 unsigned OldReg, unsigned NewReg,
411 unsigned PrevReg = 0);
412 bool canUseLastOffsetValue(MachineInstr *MI, unsigned &BasePos,
413 unsigned &OffsetPos, unsigned &NewBase,
414 int64_t &NewOffset);
415 };
416
417 /// A NodeSet contains a set of SUnit DAG nodes with additional information
418 /// that assigns a priority to the set.
419 class NodeSet {
420 SetVector Nodes;
421 bool HasRecurrence;
422 unsigned RecMII = 0;
423 int MaxMOV = 0;
424 int MaxDepth = 0;
425 unsigned Colocate = 0;
426 SUnit *ExceedPressure = nullptr;
427
428 public:
429 typedef SetVector::const_iterator iterator;
430
431 NodeSet() : Nodes(), HasRecurrence(false) {}
432
433 NodeSet(iterator S, iterator E) : Nodes(S, E), HasRecurrence(true) {}
434
435 bool insert(SUnit *SU) { return Nodes.insert(SU); }
436
437 void insert(iterator S, iterator E) { Nodes.insert(S, E); }
438
439 template bool remove_if(UnaryPredicate P) {
440 return Nodes.remove_if(P);
441 }
442
443 unsigned count(SUnit *SU) const { return Nodes.count(SU); }
444
445 bool hasRecurrence() { return HasRecurrence; };
446
447 unsigned size() const { return Nodes.size(); }
448
449 bool empty() const { return Nodes.empty(); }
450
451 SUnit *getNode(unsigned i) const { return Nodes[i]; };
452
453 void setRecMII(unsigned mii) { RecMII = mii; };
454
455 void setColocate(unsigned c) { Colocate = c; };
456
457 void setExceedPressure(SUnit *SU) { ExceedPressure = SU; }
458
459 bool isExceedSU(SUnit *SU) { return ExceedPressure == SU; }
460
461 int compareRecMII(NodeSet &RHS) { return RecMII - RHS.RecMII; }
462
463 int getRecMII() { return RecMII; }
464
465 /// Summarize node functions for the entire node set.
466 void computeNodeSetInfo(SwingSchedulerDAG *SSD) {
467 for (SUnit *SU : *this) {
468 MaxMOV = std::max(MaxMOV, SSD->getMOV(SU));
469 MaxDepth = std::max(MaxDepth, SSD->getDepth(SU));
470 }
471 }
472
473 void clear() {
474 Nodes.clear();
475 RecMII = 0;
476 HasRecurrence = false;
477 MaxMOV = 0;
478 MaxDepth = 0;
479 Colocate = 0;
480 ExceedPressure = nullptr;
481 }
482
483 operator SetVector &() { return Nodes; }
484
485 /// Sort the node sets by importance. First, rank them by recurrence MII,
486 /// then by mobility (least mobile done first), and finally by depth.
487 /// Each node set may contain a colocate value which is used as the first
488 /// tie breaker, if it's set.
489 bool operator>(const NodeSet &RHS) const {
490 if (RecMII == RHS.RecMII) {
491 if (Colocate != 0 && RHS.Colocate != 0 && Colocate != RHS.Colocate)
492 return Colocate < RHS.Colocate;
493 if (MaxMOV == RHS.MaxMOV)
494 return MaxDepth > RHS.MaxDepth;
495 return MaxMOV < RHS.MaxMOV;
496 }
497 return RecMII > RHS.RecMII;
498 }
499
500 bool operator==(const NodeSet &RHS) const {
501 return RecMII == RHS.RecMII && MaxMOV == RHS.MaxMOV &&
502 MaxDepth == RHS.MaxDepth;
503 }
504
505 bool operator!=(const NodeSet &RHS) const { return !operator==(RHS); }
506
507 iterator begin() { return Nodes.begin(); }
508 iterator end() { return Nodes.end(); }
509
510 void print(raw_ostream &os) const {
511 os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
512 << " depth " << MaxDepth << " col " << Colocate << "\n";
513 for (const auto &I : Nodes)
514 os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
515 os << "\n";
516 }
517
518 void dump() const { print(dbgs()); }
519 };
520
521 /// This class repesents the scheduled code. The main data structure is a
522 /// map from scheduled cycle to instructions. During scheduling, the
523 /// data structure explicitly represents all stages/iterations. When
524 /// the algorithm finshes, the schedule is collapsed into a single stage,
525 /// which represents instructions from different loop iterations.
526 ///
527 /// The SMS algorithm allows negative values for cycles, so the first cycle
528 /// in the schedule is the smallest cycle value.
529 class SMSchedule {
530 private:
531 /// Map from execution cycle to instructions.
532 DenseMap> ScheduledInstrs;
533
534 /// Map from instruction to execution cycle.
535 std::map InstrToCycle;
536
537 /// Map for each register and the max difference between its uses and def.
538 /// The first element in the pair is the max difference in stages. The
539 /// second is true if the register defines a Phi value and loop value is
540 /// scheduled before the Phi.
541 std::map> RegToStageDiff;
542
543 /// Keep track of the first cycle value in the schedule. It starts
544 /// as zero, but the algorithm allows negative values.
545 int FirstCycle;
546
547 /// Keep track of the last cycle value in the schedule.
548 int LastCycle;
549
550 /// The initiation interval (II) for the schedule.
551 int InitiationInterval;
552
553 /// Target machine information.
554 const TargetSubtargetInfo &ST;
555
556 /// Virtual register information.
557 MachineRegisterInfo &MRI;
558
559 DFAPacketizer *Resources;
560
561 public:
562 SMSchedule(MachineFunction *mf)
563 : ST(mf->getSubtarget()), MRI(mf->getRegInfo()),
564 Resources(ST.getInstrInfo()->CreateTargetScheduleState(ST)) {
565 FirstCycle = 0;
566 LastCycle = 0;
567 InitiationInterval = 0;
568 }
569
570 ~SMSchedule() {
571 ScheduledInstrs.clear();
572 InstrToCycle.clear();
573 RegToStageDiff.clear();
574 delete Resources;
575 }
576
577 void reset() {
578 ScheduledInstrs.clear();
579 InstrToCycle.clear();
580 RegToStageDiff.clear();
581 FirstCycle = 0;
582 LastCycle = 0;
583 InitiationInterval = 0;
584 }
585
586 /// Set the initiation interval for this schedule.
587 void setInitiationInterval(int ii) { InitiationInterval = ii; }
588
589 /// Return the first cycle in the completed schedule. This
590 /// can be a negative value.
591 int getFirstCycle() const { return FirstCycle; }
592
593 /// Return the last cycle in the finalized schedule.
594 int getFinalCycle() const { return FirstCycle + InitiationInterval - 1; }
595
596 /// Return the cycle of the earliest scheduled instruction in the dependence
597 /// chain.
598 int earliestCycleInChain(const SDep &Dep);
599
600 /// Return the cycle of the latest scheduled instruction in the dependence
601 /// chain.
602 int latestCycleInChain(const SDep &Dep);
603
604 void computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
605 int *MinEnd, int *MaxStart, int II, SwingSchedulerDAG *DAG);
606 bool insert(SUnit *SU, int StartCycle, int EndCycle, int II);
607
608 /// Iterators for the cycle to instruction map.
609 typedef DenseMap>::iterator sched_iterator;
610 typedef DenseMap>::const_iterator
611 const_sched_iterator;
612
613 /// Return true if the instruction is scheduled at the specified stage.
614 bool isScheduledAtStage(SUnit *SU, unsigned StageNum) {
615 return (stageScheduled(SU) == (int)StageNum);
616 }
617
618 /// Return the stage for a scheduled instruction. Return -1 if
619 /// the instruction has not been scheduled.
620 int stageScheduled(SUnit *SU) const {
621 std::map::const_iterator it = InstrToCycle.find(SU);
622 if (it == InstrToCycle.end())
623 return -1;
624 return (it->second - FirstCycle) / InitiationInterval;
625 }
626
627 /// Return the cycle for a scheduled instruction. This function normalizes
628 /// the first cycle to be 0.
629 unsigned cycleScheduled(SUnit *SU) const {
630 std::map::const_iterator it = InstrToCycle.find(SU);
631 assert(it != InstrToCycle.end() && "Instruction hasn't been scheduled.");
632 return (it->second - FirstCycle) % InitiationInterval;
633 }
634
635 /// Return the maximum stage count needed for this schedule.
636 unsigned getMaxStageCount() {
637 return (LastCycle - FirstCycle) / InitiationInterval;
638 }
639
640 /// Return the max. number of stages/iterations that can occur between a
641 /// register definition and its uses.
642 unsigned getStagesForReg(int Reg, unsigned CurStage) {
643 std::pair Stages = RegToStageDiff[Reg];
644 if (CurStage > getMaxStageCount() && Stages.first == 0 && Stages.second)
645 return 1;
646 return Stages.first;
647 }
648
649 /// The number of stages for a Phi is a little different than other
650 /// instructions. The minimum value computed in RegToStageDiff is 1
651 /// because we assume the Phi is needed for at least 1 iteration.
652 /// This is not the case if the loop value is scheduled prior to the
653 /// Phi in the same stage. This function returns the number of stages
654 /// or iterations needed between the Phi definition and any uses.
655 unsigned getStagesForPhi(int Reg) {
656 std::pair Stages = RegToStageDiff[Reg];
657 if (Stages.second)
658 return Stages.first;
659 return Stages.first - 1;
660 }
661
662 /// Return the instructions that are scheduled at the specified cycle.
663 std::deque &getInstructions(int cycle) {
664 return ScheduledInstrs[cycle];
665 }
666
667 bool isValidSchedule(SwingSchedulerDAG *SSD);
668 void finalizeSchedule(SwingSchedulerDAG *SSD);
669 bool orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
670 std::deque &Insts);
671 bool isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi);
672 bool isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, MachineInstr *Inst,
673 MachineOperand &MO);
674 void print(raw_ostream &os) const;
675 void dump() const;
676 };
677
678 } // end anonymous namespace
679
680 unsigned SwingSchedulerDAG::Circuits::MaxPaths = 5;
681 char MachinePipeliner::ID = 0;
682 #ifndef NDEBUG
683 int MachinePipeliner::NumTries = 0;
684 #endif
685 char &llvm::MachinePipelinerID = MachinePipeliner::ID;
686 INITIALIZE_PASS_BEGIN(MachinePipeliner, "pipeliner",
687 "Modulo Software Pipelining", false, false)
688 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
689 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
690 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
691 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
692 INITIALIZE_PASS_END(MachinePipeliner, "pipeliner",
693 "Modulo Software Pipelining", false, false)
694
695 /// The "main" function for implementing Swing Modulo Scheduling.
696 bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
697 if (skipFunction(*mf.getFunction()))
698 return false;
699
700 if (!EnableSWP)
701 return false;
702
703 if (mf.getFunction()->getAttributes().hasAttribute(
704 AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
705 !EnableSWPOptSize.getPosition())
706 return false;
707
708 MF = &mf;
709 MLI = &getAnalysis();
710 MDT = &getAnalysis();
711 TII = MF->getSubtarget().getInstrInfo();
712 RegClassInfo.runOnMachineFunction(*MF);
713
714 for (auto &L : *MLI)
715 scheduleLoop(*L);
716
717 return false;
718 }
719
720 /// Attempt to perform the SMS algorithm on the specified loop. This function is
721 /// the main entry point for the algorithm. The function identifies candidate
722 /// loops, calculates the minimum initiation interval, and attempts to schedule
723 /// the loop.
724 bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
725 bool Changed = false;
726 for (auto &InnerLoop : L)
727 Changed |= scheduleLoop(*InnerLoop);
728
729 #ifndef NDEBUG
730 // Stop trying after reaching the limit (if any).
731 int Limit = SwpLoopLimit;
732 if (Limit >= 0) {
733 if (NumTries >= SwpLoopLimit)
734 return Changed;
735 NumTries++;
736 }
737 #endif
738
739 if (!canPipelineLoop(L))
740 return Changed;
741
742 ++NumTrytoPipeline;
743
744 Changed = swingModuloScheduler(L);
745
746 return Changed;
747 }
748
749 /// Return true if the loop can be software pipelined. The algorithm is
750 /// restricted to loops with a single basic block. Make sure that the
751 /// branch in the loop can be analyzed.
752 bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
753 if (L.getNumBlocks() != 1)
754 return false;
755
756 // Check if the branch can't be understood because we can't do pipelining
757 // if that's the case.
758 LI.TBB = nullptr;
759 LI.FBB = nullptr;
760 LI.BrCond.clear();
761 if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond))
762 return false;
763
764 LI.LoopInductionVar = nullptr;
765 LI.LoopCompare = nullptr;
766 if (TII->analyzeLoop(L, LI.LoopInductionVar, LI.LoopCompare))
767 return false;
768
769 if (!L.getLoopPreheader())
770 return false;
771
772 // If any of the Phis contain subregs, then we can't pipeline
773 // because we don't know how to maintain subreg information in the
774 // VMap structure.
775 MachineBasicBlock *MBB = L.getHeader();
776 for (MachineBasicBlock::iterator BBI = MBB->instr_begin(),
777 BBE = MBB->getFirstNonPHI();
778 BBI != BBE; ++BBI)
779 for (unsigned i = 1; i != BBI->getNumOperands(); i += 2)
780 if (BBI->getOperand(i).getSubReg() != 0)
781 return false;
782
783 return true;
784 }
785
786 /// The SMS algorithm consists of the following main steps:
787 /// 1. Computation and analysis of the dependence graph.
788 /// 2. Ordering of the nodes (instructions).
789 /// 3. Attempt to Schedule the loop.
790 bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
791 assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
792
793 SwingSchedulerDAG SMS(*this, L, getAnalysis(), RegClassInfo);
794
795 MachineBasicBlock *MBB = L.getHeader();
796 // The kernel should not include any terminator instructions. These
797 // will be added back later.
798 SMS.startBlock(MBB);
799
800 // Compute the number of 'real' instructions in the basic block by
801 // ignoring terminators.
802 unsigned size = MBB->size();
803 for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
804 E = MBB->instr_end();
805 I != E; ++I, --size)
806 ;
807
808 SMS.enterRegion(MBB, MBB->begin(), MBB->getFirstTerminator(), size);
809 SMS.schedule();
810 SMS.exitRegion();
811
812 SMS.finishBlock();
813 return SMS.hasNewSchedule();
814 }
815
816 /// We override the schedule function in ScheduleDAGInstrs to implement the
817 /// scheduling part of the Swing Modulo Scheduling algorithm.
818 void SwingSchedulerDAG::schedule() {
819 AliasAnalysis *AA = &Pass.getAnalysis().getAAResults();
820 buildSchedGraph(AA);
821 addLoopCarriedDependences(AA);
822 updatePhiDependences();
823 Topo.InitDAGTopologicalSorting();
824 changeDependences();
825 DEBUG({
826 for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
827 SUnits[su].dumpAll(this);
828 });
829
830 NodeSetType NodeSets;
831 findCircuits(NodeSets);
832
833 // Calculate the MII.
834 unsigned ResMII = calculateResMII();
835 unsigned RecMII = calculateRecMII(NodeSets);
836
837 fuseRecs(NodeSets);
838
839 // This flag is used for testing and can cause correctness problems.
840 if (SwpIgnoreRecMII)
841 RecMII = 0;
842
843 MII = std::max(ResMII, RecMII);
844 DEBUG(dbgs() << "MII = " << MII << " (rec=" << RecMII << ", res=" << ResMII
845 << ")\n");
846
847 // Can't schedule a loop without a valid MII.
848 if (MII == 0)
849 return;
850
851 // Don't pipeline large loops.
852 if (SwpMaxMii != -1 && (int)MII > SwpMaxMii)
853 return;
854
855 computeNodeFunctions(NodeSets);
856
857 registerPressureFilter(NodeSets);
858
859 colocateNodeSets(NodeSets);
860
861 checkNodeSets(NodeSets);
862
863 DEBUG({
864 for (auto &I : NodeSets) {
865 dbgs() << " Rec NodeSet ";
866 I.dump();
867 }
868 });
869
870 std::sort(NodeSets.begin(), NodeSets.end(), std::greater());
871
872 groupRemainingNodes(NodeSets);
873
874 removeDuplicateNodes(NodeSets);
875
876 DEBUG({
877 for (auto &I : NodeSets) {
878 dbgs() << " NodeSet ";
879 I.dump();
880 }
881 });
882
883 computeNodeOrder(NodeSets);
884
885 SMSchedule Schedule(Pass.MF);
886 Scheduled = schedulePipeline(Schedule);
887
888 if (!Scheduled)
889 return;
890
891 unsigned numStages = Schedule.getMaxStageCount();
892 // No need to generate pipeline if there are no overlapped iterations.
893 if (numStages == 0)
894 return;
895
896 // Check that the maximum stage count is less than user-defined limit.
897 if (SwpMaxStages > -1 && (int)numStages > SwpMaxStages)
898 return;
899
900 generatePipelinedLoop(Schedule);
901 ++NumPipelined;
902 }
903
904 /// Clean up after the software pipeliner runs.
905 void SwingSchedulerDAG::finishBlock() {
906 for (MachineInstr *I : NewMIs)
907 MF.DeleteMachineInstr(I);
908 NewMIs.clear();
909
910 // Call the superclass.
911 ScheduleDAGInstrs::finishBlock();
912 }
913
914 /// Return the register values for the operands of a Phi instruction.
915 /// This function assume the instruction is a Phi.
916 static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
917 unsigned &InitVal, unsigned &LoopVal) {
918 assert(Phi.isPHI() && "Expecting a Phi.");
919
920 InitVal = 0;
921 LoopVal = 0;
922 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
923 if (Phi.getOperand(i + 1).getMBB() != Loop)
924 InitVal = Phi.getOperand(i).getReg();
925 else if (Phi.getOperand(i + 1).getMBB() == Loop)
926 LoopVal = Phi.getOperand(i).getReg();
927
928 assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
929 }
930
931 /// Return the Phi register value that comes from the incoming block.
932 static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
933 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
934 if (Phi.getOperand(i + 1).getMBB() != LoopBB)
935 return Phi.getOperand(i).getReg();
936 return 0;
937 }
938
939 /// Return the Phi register value that comes the the loop block.
940 static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
941 for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
942 if (Phi.getOperand(i + 1).getMBB() == LoopBB)
943 return Phi.getOperand(i).getReg();
944 return 0;
945 }
946
947 /// Return true if SUb can be reached from SUa following the chain edges.
948 static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
949 SmallPtrSet Visited;
950 SmallVector Worklist;
951 Worklist.push_back(SUa);
952 while (!Worklist.empty()) {
953 const SUnit *SU = Worklist.pop_back_val();
954 for (auto &SI : SU->Succs) {
955 SUnit *SuccSU = SI.getSUnit();
956 if (SI.getKind() == SDep::Order) {
957 if (Visited.count(SuccSU))
958 continue;
959 if (SuccSU == SUb)
960 return true;
961 Worklist.push_back(SuccSU);
962 Visited.insert(SuccSU);
963 }
964 }
965 }
966 return false;
967 }
968
969 /// Return true if the instruction causes a chain between memory
970 /// references before and after it.
971 static bool isDependenceBarrier(MachineInstr &MI, AliasAnalysis *AA) {
972 return MI.isCall() || MI.hasUnmodeledSideEffects() ||
973 (MI.hasOrderedMemoryRef() &&
974 (!MI.mayLoad() || !MI.isInvariantLoad(AA)));
975 }
976
977 /// Return the underlying objects for the memory references of an instruction.
978 /// This function calls the code in ValueTracking, but first checks that the
979 /// instruction has a memory operand.
980 static void getUnderlyingObjects(MachineInstr *MI,
981 SmallVectorImpl &Objs,
982 const DataLayout &DL) {
983 if (!MI->hasOneMemOperand())
984 return;
985 MachineMemOperand *MM = *MI->memoperands_begin();
986 if (!MM->getValue())
987 return;
988 GetUnderlyingObjects(const_cast(MM->getValue()), Objs, DL);
989 }
990
991 /// Add a chain edge between a load and store if the store can be an
992 /// alias of the load on a subsequent iteration, i.e., a loop carried
993 /// dependence. This code is very similar to the code in ScheduleDAGInstrs
994 /// but that code doesn't create loop carried dependences.
995 void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
996 MapVector> PendingLoads;
997 for (auto &SU : SUnits) {
998 MachineInstr &MI = *SU.getInstr();
999 if (isDependenceBarrier(MI, AA))
1000 PendingLoads.clear();
1001 else if (MI.mayLoad()) {
1002 SmallVector Objs;
1003 getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
1004 for (auto V : Objs) {
1005 SmallVector &SUs = PendingLoads[V];
1006 SUs.push_back(&SU);
1007 }
1008 } else if (MI.mayStore()) {
1009 SmallVector Objs;
1010 getUnderlyingObjects(&MI, Objs, MF.getDataLayout());
1011 for (auto V : Objs) {
1012 MapVector>::iterator I =
1013 PendingLoads.find(V);
1014 if (I == PendingLoads.end())
1015 continue;
1016 for (auto Load : I->second) {
1017 if (isSuccOrder(Load, &SU))
1018 continue;
1019 MachineInstr &LdMI = *Load->getInstr();
1020 // First, perform the cheaper check that compares the base register.
1021 // If they are the same and the load offset is less than the store
1022 // offset, then mark the dependence as loop carried potentially.
1023 unsigned BaseReg1, BaseReg2;
1024 int64_t Offset1, Offset2;
1025 if (!TII->getMemOpBaseRegImmOfs(LdMI, BaseReg1, Offset1, TRI) ||
1026 !TII->getMemOpBaseRegImmOfs(MI, BaseReg2, Offset2, TRI)) {
1027 SU.addPred(SDep(Load, SDep::Barrier));
1028 continue;
1029 }
1030 if (BaseReg1 == BaseReg2 && (int)Offset1 < (int)Offset2) {
1031 assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI, AA) &&
1032 "What happened to the chain edge?");
1033 SU.addPred(SDep(Load, SDep::Barrier));
1034 continue;
1035 }
1036 // Second, the more expensive check that uses alias analysis on the
1037 // base registers. If they alias, and the load offset is less than
1038 // the store offset, the mark the dependence as loop carried.
1039 if (!AA) {
1040 SU.addPred(SDep(Load, SDep::Barrier));
1041 continue;
1042 }
1043 MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
1044 MachineMemOperand *MMO2 = *MI.memoperands_begin();
1045 if (!MMO1->getValue() || !MMO2->getValue()) {
1046 SU.addPred(SDep(Load, SDep::Barrier));
1047 continue;
1048 }
1049 if (MMO1->getValue() == MMO2->getValue() &&
1050 MMO1->getOffset() <= MMO2->getOffset()) {
1051 SU.addPred(SDep(Load, SDep::Barrier));
1052 continue;
1053 }
1054 AliasResult AAResult = AA->alias(
1055 MemoryLocation(MMO1->getValue(), MemoryLocation::UnknownSize,
1056 MMO1->getAAInfo()),
1057 MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize,
1058 MMO2->getAAInfo()));
1059
1060 if (AAResult != NoAlias)
1061 SU.addPred(SDep(Load, SDep::Barrier));
1062 }
1063 }
1064 }
1065 }
1066 }
1067
1068 /// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
1069 /// processes dependences for PHIs. This function adds true dependences
1070 /// from a PHI to a use, and a loop carried dependence from the use to the
1071 /// PHI. The loop carried dependence is represented as an anti dependence
1072 /// edge. This function also removes chain dependences between unrelated
1073 /// PHIs.
1074 void SwingSchedulerDAG::updatePhiDependences() {
1075 SmallVector RemoveDeps;
1076 const TargetSubtargetInfo &ST = MF.getSubtarget();
1077
1078 // Iterate over each DAG node.
1079 for (SUnit &I : SUnits) {
1080 RemoveDeps.clear();
1081 // Set to true if the instruction has an operand defined by a Phi.
1082 unsigned HasPhiUse = 0;
1083 unsigned HasPhiDef = 0;
1084 MachineInstr *MI = I.getInstr();
1085 // Iterate over each operand, and we process the definitions.
1086 for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
1087 MOE = MI->operands_end();
1088 MOI != MOE; ++MOI) {
1089 if (!MOI->isReg())
1090 continue;
1091 unsigned Reg = MOI->getReg();
1092 if (MOI->isDef()) {
1093 // If the register is used by a Phi, then create an anti dependence.
1094 for (MachineRegisterInfo::use_instr_iterator
1095 UI = MRI.use_instr_begin(Reg),
1096 UE = MRI.use_instr_end();
1097 UI != UE; ++UI) {
1098 MachineInstr *UseMI = &*UI;
1099 SUnit *SU = getSUnit(UseMI);
1100 if (SU != 0 && UseMI->isPHI()) {
1101 if (!MI->isPHI()) {
1102 SDep Dep(SU, SDep::Anti, Reg);
1103 I.addPred(Dep);
1104 } else {
1105 HasPhiDef = Reg;
1106 // Add a chain edge to a dependent Phi that isn't an existing
1107 // predecessor.
1108 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
1109 I.addPred(SDep(SU, SDep::Barrier));
1110 }
1111 }
1112 }
1113 } else if (MOI->isUse()) {
1114 // If the register is defined by a Phi, then create a true dependence.
1115 MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
1116 if (DefMI == 0)
1117 continue;
1118 SUnit *SU = getSUnit(DefMI);
1119 if (SU != 0 && DefMI->isPHI()) {
1120 if (!MI->isPHI()) {
1121 SDep Dep(SU, SDep::Data, Reg);
1122 Dep.setLatency(0);
1123 ST.adjustSchedDependency(SU, &I, Dep);
1124 I.addPred(Dep);
1125 } else {
1126 HasPhiUse = Reg;
1127 // Add a chain edge to a dependent Phi that isn't an existing
1128 // predecessor.
1129 if (SU->NodeNum < I.NodeNum && !I.isPred(SU))
1130 I.addPred(SDep(SU, SDep::Barrier));
1131 }
1132 }
1133 }
1134 }
1135 // Remove order dependences from an unrelated Phi.
1136 if (!SwpPruneDeps)
1137 continue;
1138 for (auto &PI : I.Preds) {
1139 MachineInstr *PMI = PI.getSUnit()->getInstr();
1140 if (PMI->isPHI() && PI.getKind() == SDep::Order) {
1141 if (I.getInstr()->isPHI()) {
1142 if (PMI->getOperand(0).getReg() == HasPhiUse)
1143 continue;
1144 if (getLoopPhiReg(*PMI, PMI->getParent()) == HasPhiDef)
1145 continue;
1146 }
1147 RemoveDeps.push_back(PI);
1148 }
1149 }
1150 for (int i = 0, e = RemoveDeps.size(); i != e; ++i)
1151 I.removePred(RemoveDeps[i]);
1152 }
1153 }
1154
1155 /// Iterate over each DAG node and see if we can change any dependences
1156 /// in order to reduce the recurrence MII.
1157 void SwingSchedulerDAG::changeDependences() {
1158 // See if an instruction can use a value from the previous iteration.
1159 // If so, we update the base and offset of the instruction and change
1160 // the dependences.
1161 for (SUnit &I : SUnits) {
1162 unsigned BasePos = 0, OffsetPos = 0, NewBase = 0;
1163 int64_t NewOffset = 0;
1164 if (!canUseLastOffsetValue(I.getInstr(), BasePos, OffsetPos, NewBase,
1165 NewOffset))
1166 continue;
1167
1168 // Get the MI and SUnit for the instruction that defines the original base.
1169 unsigned OrigBase = I.getInstr()->getOperand(BasePos).getReg();
1170 MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
1171 if (!DefMI)
1172 continue;
1173 SUnit *DefSU = getSUnit(DefMI);
1174 if (!DefSU)
1175 continue;
1176 // Get the MI and SUnit for the instruction that defins the new base.
1177 MachineInstr *LastMI = MRI.getUniqueVRegDef(NewBase);
1178 if (!LastMI)
1179 continue;
1180 SUnit *LastSU = getSUnit(LastMI);
1181 if (!LastSU)
1182 continue;
1183
1184 if (Topo.IsReachable(&I, LastSU))
1185 continue;
1186
1187 // Remove the dependence. The value now depends on a prior iteration.
1188 SmallVector Deps;
1189 for (SUnit::pred_iterator P = I.Preds.begin(), E = I.Preds.end(); P != E;
1190 ++P)
1191 if (P->getSUnit() == DefSU)
1192 Deps.push_back(*P);
1193 for (int i = 0, e = Deps.size(); i != e; i++) {
1194 Topo.RemovePred(&I, Deps[i].getSUnit());
1195 I.removePred(Deps[i]);
1196 }
1197 // Remove the chain dependence between the instructions.
1198 Deps.clear();
1199 for (auto &P : LastSU->Preds)
1200 if (P.getSUnit() == &I && P.getKind() == SDep::Order)
1201 Deps.push_back(P);
1202 for (int i = 0, e = Deps.size(); i != e; i++) {
1203 Topo.RemovePred(LastSU, Deps[i].getSUnit());
1204 LastSU->removePred(Deps[i]);
1205 }
1206
1207 // Add a dependence between the new instruction and the instruction
1208 // that defines the new base.
1209 SDep Dep(&I, SDep::Anti, NewBase);
1210 LastSU->addPred(Dep);
1211
1212 // Remember the base and offset information so that we can update the
1213 // instruction during code generation.
1214 InstrChanges[&I] = std::make_pair(NewBase, NewOffset);
1215 }
1216 }
1217
1218 namespace {
1219 // FuncUnitSorter - Comparison operator used to sort instructions by
1220 // the number of functional unit choices.
1221 struct FuncUnitSorter {
1222 const InstrItineraryData *InstrItins;
1223 DenseMap Resources;
1224
1225 // Compute the number of functional unit alternatives needed
1226 // at each stage, and take the minimum value. We prioritize the
1227 // instructions by the least number of choices first.
1228 unsigned minFuncUnits(const MachineInstr *Inst, unsigned &F) const {
1229 unsigned schedClass = Inst->getDesc().getSchedClass();
1230 unsigned min = UINT_MAX;
1231 for (const InstrStage *IS = InstrItins->beginStage(schedClass),
1232 *IE = InstrItins->endStage(schedClass);
1233 IS != IE; ++IS) {
1234 unsigned funcUnits = IS->getUnits();
1235 unsigned numAlternatives = countPopulation(funcUnits);
1236 if (numAlternatives < min) {
1237 min = numAlternatives;
1238 F = funcUnits;
1239 }
1240 }
1241 return min;
1242 }
1243
1244 // Compute the critical resources needed by the instruction. This
1245 // function records the functional units needed by instructions that
1246 // must use only one functional unit. We use this as a tie breaker
1247 // for computing the resource MII. The instrutions that require
1248 // the same, highly used, functional unit have high priority.
1249 void calcCriticalResources(MachineInstr &MI) {
1250 unsigned SchedClass = MI.getDesc().getSchedClass();
1251 for (const InstrStage *IS = InstrItins->beginStage(SchedClass),
1252 *IE = InstrItins->endStage(SchedClass);
1253 IS != IE; ++IS) {
1254 unsigned FuncUnits = IS->getUnits();
1255 if (countPopulation(FuncUnits) == 1)
1256 Resources[FuncUnits]++;
1257 }
1258 }
1259
1260 FuncUnitSorter(const InstrItineraryData *IID) : InstrItins(IID) {}
1261 /// Return true if IS1 has less priority than IS2.
1262 bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
1263 unsigned F1 = 0, F2 = 0;
1264 unsigned MFUs1 = minFuncUnits(IS1, F1);
1265 unsigned MFUs2 = minFuncUnits(IS2, F2);
1266 if (MFUs1 == 1 && MFUs2 == 1)
1267 return Resources.lookup(F1) < Resources.lookup(F2);
1268 return MFUs1 > MFUs2;
1269 }
1270 };
1271 }
1272
1273 /// Calculate the resource constrained minimum initiation interval for the
1274 /// specified loop. We use the DFA to model the resources needed for
1275 /// each instruction, and we ignore dependences. A different DFA is created
1276 /// for each cycle that is required. When adding a new instruction, we attempt
1277 /// to add it to each existing DFA, until a legal space is found. If the
1278 /// instruction cannot be reserved in an existing DFA, we create a new one.
1279 unsigned SwingSchedulerDAG::calculateResMII() {
1280 SmallVector Resources;
1281 MachineBasicBlock *MBB = Loop.getHeader();
1282 Resources.push_back(TII->CreateTargetScheduleState(MF.getSubtarget()));
1283
1284 // Sort the instructions by the number of available choices for scheduling,
1285 // least to most. Use the number of critical resources as the tie breaker.
1286 FuncUnitSorter FUS =
1287 FuncUnitSorter(MF.getSubtarget().getInstrItineraryData());
1288 for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
1289 E = MBB->getFirstTerminator();
1290 I != E; ++I)
1291 FUS.calcCriticalResources(*I);
1292 PriorityQueue, FuncUnitSorter>
1293 FuncUnitOrder(FUS);
1294
1295 for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
1296 E = MBB->getFirstTerminator();
1297 I != E; ++I)
1298 FuncUnitOrder.push(&*I);
1299
1300 while (!FuncUnitOrder.empty()) {
1301 MachineInstr *MI = FuncUnitOrder.top();
1302 FuncUnitOrder.pop();
1303 if (TII->isZeroCost(MI->getOpcode()))
1304 continue;
1305 // Attempt to reserve the instruction in an existing DFA. At least one
1306 // DFA is needed for each cycle.
1307 unsigned NumCycles = getSUnit(MI)->Latency;
1308 unsigned ReservedCycles = 0;
1309 SmallVectorImpl::iterator RI = Resources.begin();
1310 SmallVectorImpl::iterator RE = Resources.end();
1311 for (unsigned C = 0; C < NumCycles; ++C)
1312 while (RI != RE) {
1313 if ((*RI++)->canReserveResources(*MI)) {
1314 ++ReservedCycles;
1315 break;
1316 }
1317 }
1318 // Start reserving resources using existing DFAs.
1319 for (unsigned C = 0; C < ReservedCycles; ++C) {
1320 --RI;
1321 (*RI)->reserveResources(*MI);
1322 }
1323 // Add new DFAs, if needed, to reserve resources.
1324 for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
1325 DFAPacketizer *NewResource =
1326 TII->CreateTargetScheduleState(MF.getSubtarget());
1327 assert(NewResource->canReserveResources(*MI) && "Reserve error.");
1328 NewResource->reserveResources(*MI);
1329 Resources.push_back(NewResource);
1330 }
1331 }
1332 int Resmii = Resources.size();
1333 // Delete the memory for each of the DFAs that were created earlier.
1334 for (DFAPacketizer *RI : Resources) {
1335 DFAPacketizer *D = RI;
1336 delete D;
1337 }
1338 Resources.clear();
1339 return Resmii;
1340 }
1341
1342 /// Calculate the recurrence-constrainted minimum initiation interval.
1343 /// Iterate over each circuit. Compute the delay(c) and distance(c)
1344 /// for each circuit. The II needs to satisfy the inequality
1345 /// delay(c) - II*distance(c) <= 0. For each circuit, choose the smallest
1346 /// II that satistifies the inequality, and the RecMII is the maximum
1347 /// of those values.
1348 unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1349 unsigned RecMII = 0;
1350
1351 for (NodeSet &Nodes : NodeSets) {
1352 if (Nodes.size() == 0)
1353 continue;
1354
1355 unsigned Delay = Nodes.size() - 1;
1356 unsigned Distance = 1;
1357
1358 // ii = ceil(delay / distance)
1359 unsigned CurMII = (Delay + Distance - 1) / Distance;
1360 Nodes.setRecMII(CurMII);
1361 if (CurMII > RecMII)
1362 RecMII = CurMII;
1363 }
1364
1365 return RecMII;
1366 }
1367
1368 /// Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1369 /// but we do this to find the circuits, and then change them back.
1370 static void swapAntiDependences(std::vector &SUnits) {
1371 SmallVector, 8> DepsAdded;
1372 for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
1373 SUnit *SU = &SUnits[i];
1374 for (SUnit::pred_iterator IP = SU->Preds.begin(), EP = SU->Preds.end();
1375 IP != EP; ++IP) {
1376 if (IP->getKind() != SDep::Anti)
1377 continue;
1378 DepsAdded.push_back(std::make_pair(SU, *IP));
1379 }
1380 }
1381 for (SmallVector, 8>::iterator I = DepsAdded.begin(),
1382 E = DepsAdded.end();
1383 I != E; ++I) {
1384 // Remove this anti dependency and add one in the reverse direction.
1385 SUnit *SU = I->first;
1386 SDep &D = I->second;
1387 SUnit *TargetSU = D.getSUnit();
1388 unsigned Reg = D.getReg();
1389 unsigned Lat = D.getLatency();
1390 SU->removePred(D);
1391 SDep Dep(SU, SDep::Anti, Reg);
1392 Dep.setLatency(Lat);
1393 TargetSU->addPred(Dep);
1394 }
1395 }
1396
1397 /// Create the adjacency structure of the nodes in the graph.
1398 void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1399 SwingSchedulerDAG *DAG) {
1400 BitVector Added(SUnits.size());
1401 for (int i = 0, e = SUnits.size(); i != e; ++i) {
1402 Added.reset();
1403 // Add any successor to the adjacency matrix and exclude duplicates.
1404 for (auto &SI : SUnits[i].Succs) {
1405 // Do not process a boundary node and a back-edge is processed only
1406 // if it goes to a Phi.
1407 if (SI.getSUnit()->isBoundaryNode() ||
1408 (SI.getKind() == SDep::Anti && !SI.getSUnit()->getInstr()->isPHI()))
1409 continue;
1410 int N = SI.getSUnit()->NodeNum;
1411 if (!Added.test(N)) {
1412 AdjK[i].push_back(N);
1413 Added.set(N);
1414 }
1415 }
1416 // A chain edge between a store and a load is treated as a back-edge in the
1417 // adjacency matrix.
1418 for (auto &PI : SUnits[i].Preds) {
1419 if (!SUnits[i].getInstr()->mayStore() ||
1420 !DAG->isLoopCarriedOrder(&SUnits[i], PI, false))
1421 continue;
1422 if (PI.getKind() == SDep::Order && PI.getSUnit()->getInstr()->mayLoad()) {
1423 int N = PI.getSUnit()->NodeNum;
1424 if (!Added.test(N)) {
1425 AdjK[i].push_back(N);
1426 Added.set(N);
1427 }
1428 }
1429 }
1430 }
1431 }
1432
1433 /// Identify an elementary circuit in the dependence graph starting at the
1434 /// specified node.
1435 bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1436 bool HasBackedge) {
1437 SUnit *SV = &SUnits[V];
1438 bool F = false;
1439 Stack.insert(SV);
1440 Blocked.set(V);
1441
1442 for (auto W : AdjK[V]) {
1443 if (NumPaths > MaxPaths)
1444 break;
1445 if (W < S)
1446 continue;
1447 if (W == S) {
1448 if (!HasBackedge)
1449 NodeSets.push_back(NodeSet(Stack.begin(), Stack.end()));
1450 F = true;
1451 ++NumPaths;
1452 break;
1453 } else if (!Blocked.test(W)) {
1454 if (circuit(W, S, NodeSets, W < V ? true : HasBackedge))
1455 F = true;
1456 }
1457 }
1458
1459 if (F)
1460 unblock(V);
1461 else {
1462 for (auto W : AdjK[V]) {
1463 if (W < S)
1464 continue;
1465 if (B[W].count(SV) == 0)
1466 B[W].insert(SV);
1467 }
1468 }
1469 Stack.pop_back();
1470 return F;
1471 }
1472
1473 /// Unblock a node in the circuit finding algorithm.
1474 void SwingSchedulerDAG::Circuits::unblock(int U) {
1475 Blocked.reset(U);
1476 SmallPtrSet &BU = B[U];
1477 while (!BU.empty()) {
1478 SmallPtrSet::iterator SI = BU.begin();
1479 assert(SI != BU.end() && "Invalid B set.");
1480 SUnit *W = *SI;
1481 BU.erase(W);
1482 if (Blocked.test(W->NodeNum))
1483 unblock(W->NodeNum);
1484 }
1485 }
1486
1487 /// Identify all the elementary circuits in the dependence graph using
1488 /// Johnson's circuit algorithm.
1489 void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1490 // Swap all the anti dependences in the DAG. That means it is no longer a DAG,
1491 // but we do this to find the circuits, and then change them back.
1492 swapAntiDependences(SUnits);
1493
1494 Circuits Cir(SUnits);
1495 // Create the adjacency structure.
1496 Cir.createAdjacencyStructure(this);
1497 for (int i = 0, e = SUnits.size(); i != e; ++i) {
1498 Cir.reset();
1499 Cir.circuit(i, i, NodeSets);
1500 }
1501
1502 // Change the dependences back so that we've created a DAG again.
1503 swapAntiDependences(SUnits);
1504 }
1505
1506 /// Return true for DAG nodes that we ignore when computing the cost functions.
1507 /// We ignore the back-edge recurrence in order to avoid unbounded recurison
1508 /// in the calculation of the ASAP, ALAP, etc functions.
1509 static bool ignoreDependence(const SDep &D, bool isPred) {
1510 if (D.isArtificial())
1511 return true;
1512 return D.getKind() == SDep::Anti && isPred;
1513 }
1514
1515 /// Compute several functions need to order the nodes for scheduling.
1516 /// ASAP - Earliest time to schedule a node.
1517 /// ALAP - Latest time to schedule a node.
1518 /// MOV - Mobility function, difference between ALAP and ASAP.
1519 /// D - Depth of each node.
1520 /// H - Height of each node.
1521 void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
1522
1523 ScheduleInfo.resize(SUnits.size());
1524
1525 DEBUG({
1526 for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
1527 E = Topo.end();
1528 I != E; ++I) {
1529 SUnit *SU = &SUnits[*I];
1530 SU->dump(this);
1531 }
1532 });
1533
1534 int maxASAP = 0;
1535 // Compute ASAP.
1536 for (ScheduleDAGTopologicalSort::const_iterator I = Topo.begin(),
1537 E = Topo.end();
1538 I != E; ++I) {
1539 int asap = 0;
1540 SUnit *SU = &SUnits[*I];
1541 for (SUnit::const_pred_iterator IP = SU->Preds.begin(),
1542 EP = SU->Preds.end();
1543 IP != EP; ++IP) {
1544 if (ignoreDependence(*IP, true))
1545 continue;
1546 SUnit *pred = IP->getSUnit();
1547 asap = std::max(asap, (int)(getASAP(pred) + getLatency(SU, *IP) -
1548 getDistance(pred, SU, *IP) * MII));
1549 }
1550 maxASAP = std::max(maxASAP, asap);
1551 ScheduleInfo[*I].ASAP = asap;
1552 }
1553
1554 // Compute ALAP and MOV.
1555 for (ScheduleDAGTopologicalSort::const_reverse_iterator I = Topo.rbegin(),
1556 E = Topo.rend();
1557 I != E; ++I) {
1558 int alap = maxASAP;
1559 SUnit *SU = &SUnits[*I];
1560 for (SUnit::const_succ_iterator IS = SU->Succs.begin(),
1561 ES = SU->Succs.end();
1562 IS != ES; ++IS) {
1563 if (ignoreDependence(*IS, true))
1564 continue;
1565 SUnit *succ = IS->getSUnit();
1566 alap = std::min(alap, (int)(getALAP(succ) - getLatency(SU, *IS) +
1567 getDistance(SU, succ, *IS) * MII));
1568 }
1569
1570 ScheduleInfo[*I].ALAP = alap;
1571 }
1572
1573 // After computing the node functions, compute the summary for each node set.
1574 for (NodeSet &I : NodeSets)
1575 I.computeNodeSetInfo(this);
1576
1577 DEBUG({
1578 for (unsigned i = 0; i < SUnits.size(); i++) {
1579 dbgs() << "\tNode " << i << ":\n";
1580 dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
1581 dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
1582 dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
1583 dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
1584 dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
1585 }
1586 });
1587 }
1588
1589 /// Compute the Pred_L(O) set, as defined in the paper. The set is defined
1590 /// as the predecessors of the elements of NodeOrder that are not also in
1591 /// NodeOrder.
1592 static bool pred_L(SetVector &NodeOrder,
1593 SmallSetVector &Preds,
1594 const NodeSet *S = nullptr) {
1595 Preds.clear();
1596 for (SetVector::iterator I = NodeOrder.begin(), E = NodeOrder.end();
1597 I != E; ++I) {
1598 for (SUnit::pred_iterator PI = (*I)->Preds.begin(), PE = (*I)->Preds.end();
1599 PI != PE; ++PI) {
1600 if (S && S->count(PI->getSUnit()) == 0)
1601 continue;
1602 if (ignoreDependence(*PI, true))
1603 continue;
1604 if (NodeOrder.count(PI->getSUnit()) == 0)
1605 Preds.insert(PI->getSUnit());
1606 }
1607 // Back-edges are predecessors with an anti-dependence.
1608 for (SUnit::const_succ_iterator IS = (*I)->Succs.begin(),
1609 ES = (*I)->Succs.end();
1610 IS != ES; ++IS) {
1611 if (IS->getKind() != SDep::Anti)
1612 continue;
1613 if (S && S->count(IS->getSUnit()) == 0)
1614 continue;
1615 if (NodeOrder.count(IS->getSUnit()) == 0)
1616 Preds.insert(IS->getSUnit());
1617 }
1618 }
1619 return Preds.size() > 0;
1620 }
1621
1622 /// Compute the Succ_L(O) set, as defined in the paper. The set is defined
1623 /// as the successors of the elements of NodeOrder that are not also in
1624 /// NodeOrder.
1625 static bool succ_L(SetVector &NodeOrder,
1626 SmallSetVector &Succs,
1627 const NodeSet *S = nullptr) {
1628 Succs.clear();
1629 for (SetVector::iterator I = NodeOrder.begin(), E = NodeOrder.end();
1630 I != E; ++I) {
1631 for (SUnit::succ_iterator SI = (*I)->Succs.begin(), SE = (*I)->Succs.end();
1632 SI != SE; ++SI) {
1633 if (S && S->count(SI->getSUnit()) == 0)
1634 continue;
1635 if (ignoreDependence(*SI, false))
1636 continue;
1637 if (NodeOrder.count(SI->getSUnit()) == 0)
1638 Succs.insert(SI->getSUnit());
1639 }
1640 for (SUnit::const_pred_iterator PI = (*I)->Preds.begin(),
1641 PE = (*I)->Preds.end();
1642 PI != PE; ++PI) {
1643 if (PI->getKind() != SDep::Anti)
1644 continue;
1645 if (S && S->count(PI->getSUnit()) == 0)
1646 continue;
1647 if (NodeOrder.count(PI->getSUnit()) == 0)
1648 Succs.insert(PI->getSUnit());
1649 }
1650 }
1651 return Succs.size() > 0;
1652 }
1653
1654 /// Return true if there is a path from the specified node to any of the nodes
1655 /// in DestNodes. Keep track and return the nodes in any path.
1656 static bool computePath(SUnit *Cur, SetVector &Path,
1657 SetVector &DestNodes,
1658 SetVector &Exclude,
1659 SmallPtrSet &Visited) {
1660 if (Cur->isBoundaryNode())
1661 return false;
1662 if (Exclude.count(Cur) != 0)
1663 return false;
1664 if (DestNodes.count(Cur) != 0)
1665 return true;
1666 if (!Visited.insert(Cur).second)
1667 return Path.count(Cur) != 0;
1668 bool FoundPath = false;
1669 for (auto &SI : Cur->Succs)
1670 FoundPath |= computePath(SI.getSUnit(), Path, DestNodes, Exclude, Visited);
1671 for (auto &PI : Cur->Preds)
1672 if (PI.getKind() == SDep::Anti)
1673 FoundPath |=
1674 computePath(PI.getSUnit(), Path, DestNodes, Exclude, Visited);
1675 if (FoundPath)
1676 Path.insert(Cur);
1677 return FoundPath;
1678 }
1679
/// Return true if Set1 is a subset of Set2, i.e. every element of Set1 is
/// also counted by Set2. An empty Set1 is a subset of any Set2.
///
/// Set1 must be iterable with a range-based for loop; Set2 must provide a
/// count() member accepting Set1's element type.
template <class S1Ty, class S2Ty>
static bool isSubset(S1Ty &Set1, S2Ty &Set2) {
  for (const auto &Elem : Set1)
    if (Set2.count(Elem) == 0)
      return false;
  return true;
}
1687
1688 /// Compute the live-out registers for the instructions in a node-set.
1689 /// The live-out registers are those that are defined in the node-set,
1690 /// but not used. Except for use operands of Phis.
1691 static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
1692 NodeSet &NS) {
1693 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
1694 MachineRegisterInfo &MRI = MF.getRegInfo();
1695 SmallVector LiveOutRegs;
1696 SmallSet Uses;
1697 for (SUnit *SU : NS) {
1698 const MachineInstr *MI = SU->getInstr();
1699 if (MI->isPHI())
1700 continue;
1701 for (ConstMIOperands MO(*MI); MO.isValid(); ++MO)
1702 if (MO->isReg() && MO->isUse()) {
1703 unsigned Reg = MO->getReg();
1704 if (TargetRegisterInfo::isVirtualRegister(Reg))
1705 Uses.insert(Reg);
1706 else if (MRI.isAllocatable(Reg))
1707 for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
1708 Uses.insert(*Units);
1709 }
1710 }
1711 for (SUnit *SU : NS)
1712 for (ConstMIOperands MO(*SU->getInstr()); MO.isValid(); ++MO)
1713 if (MO->isReg() && MO->isDef() && !MO->isDead()) {
1714 unsigned Reg = MO->getReg();
1715 if (TargetRegisterInfo::isVirtualRegister(Reg)) {
1716 if (!Uses.count(Reg))
1717 LiveOutRegs.push_back(RegisterMaskPair(Reg, 0));
1718 } else if (MRI.isAllocatable(Reg)) {
1719 for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
1720 if (!Uses.count(*Units))
1721 LiveOutRegs.push_back(RegisterMaskPair(*Units, 0));
1722 }
1723 }
1724 RPTracker.addLiveRegs(LiveOutRegs);
1725 }
1726
1727 /// A heuristic to filter nodes in recurrent node-sets if the register
1728 /// pressure of a set is too high.
///
/// For each node-set with more than two nodes, a fresh pressure tracker is
/// seeded with the set's live-out registers (computeLiveOuts) and the set's
/// instructions are visited in decreasing NodeNum order. The first instruction
/// whose upward pressure delta reports a valid excess is recorded on the
/// node-set with setExceedPressure(), and the walk over that set stops.
1729 void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
1730 for (auto &NS : NodeSets) {
1731 // Skip small node-sets since they won't cause register pressure problems.
1732 if (NS.size() <= 2)
1733 continue;
// Each node-set gets its own tracker, positioned at the end of the block.
1734 IntervalPressure RecRegPressure;
1735 RegPressureTracker RecRPTracker(RecRegPressure);
1736 RecRPTracker.init(&MF, &RegClassInfo, &LIS, BB, BB->end(), false, true);
1737 computeLiveOuts(MF, RecRPTracker, NS);
1738 RecRPTracker.closeBottom();
1739
// Copy the node-set and order it from the highest NodeNum to the lowest.
1740 std::vector SUnits(NS.begin(), NS.end());
1741 std::sort(SUnits.begin(), SUnits.end(), [](const SUnit *A, const SUnit *B) {
1742 return A->NodeNum > B->NodeNum;
1743 });
1744
1745 for (auto &SU : SUnits) {
1746 // Since we're computing the register pressure for a subset of the
1747 // instructions in a block, we need to set the tracker for each
1748 // instruction in the node-set. The tracker is set to the instruction
1749 // just after the one we're interested in.
1750 MachineBasicBlock::const_iterator CurInstI = SU->getInstr();
1751 RecRPTracker.setPos(std::next(CurInstI));
1752
1753 RegPressureDelta RPDelta;
1754 ArrayRef CriticalPSets;
1755 RecRPTracker.getMaxUpwardPressureDelta(SU->getInstr(), nullptr, RPDelta,
1756 CriticalPSets,
1757 RecRegPressure.MaxSetPressure);
// Remember the first instruction that exceeds the pressure limit and stop
// examining this node-set.
1758 if (RPDelta.Excess.isValid()) {
1759 DEBUG(dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
1760 << TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
1761 << ":" << RPDelta.Excess.getUnitInc());
1762 NS.setExceedPressure(SU);
1763 break;
1764 }
// No excess: move the tracker up past this instruction.
1765 RecRPTracker.recede();
1766 }
1767 }
1768 }
1769
1770 /// A heuristic to colocate node sets that have the same set of
1771 /// successors.
1772 void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
1773 unsigned Colocate = 0;
1774 for (int i = 0, e = NodeSets.size(); i < e; ++i) {
1775 NodeSet &N1 = NodeSets[i];
1776 SmallSetVector S1;
1777 if (N1.empty() || !succ_L(N1, S1))
1778 continue;
1779 for (int j = i + 1; j < e; ++j) {
1780 NodeSet &N2 = NodeSets[j];
1781 if (N1.compareRecMII(N2) != 0)
1782 continue;
1783 SmallSetVector S2;
1784 if (N2.empty() || !succ_L(N2, S2))
1785 continue;
1786 if (isSubset(S1, S2) && S1.size() == S2.size()) {
1787 N1.setColocate(++Colocate);
1788 N2.setColocate(Colocate);
1789 break;
1790 }
1791 }
1792 }
1793 }
1794
1795 /// Check if the existing node-sets are profitable. If not, then ignore the
1796 /// recurrent node-sets, and attempt to schedule all nodes together. This is
1797 /// a heuristic. If the MII is large and there is a non-recurrent node with
1798 /// a large depth compared to the MII, then it's best to try and schedule
1799 /// all instruction together instead of starting with the recurrent node-sets.
1800 void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
1801 // Look for loops with a large MII.
1802 if (MII <= 20)
1803 return;
1804 // Check if the node-set contains only a simple add recurrence.
1805 for (auto &NS : NodeSets)
1806 if (NS.size() > 2)
1807 return;
1808 // If the depth of any instruction is significantly larger than the MII, then
1809 // ignore the recurrent node-sets and treat all instructions equally.
1810 for (auto &SU : SUnits)
1811 if (SU.getDepth() > MII * 1.5) {
1812 NodeSets.clear();
1813 DEBUG(dbgs() << "Clear recurrence node-sets\n");
1814 return;
1815 }
1816 }
1817
1818 /// Add the nodes that do not belong to a recurrence set into groups
1819 /// based upon connected components.
///
/// The work is done in four steps:
///  1. Each existing node-set is extended with the nodes found by
///     computePath() between it and the node-sets processed before it.
///  2. One new node-set is built from the nodes connected to the Succ_L set of
///     everything added so far.
///  3. One new node-set is built from the nodes connected to the Pred_L set of
///     everything added so far.
///  4. Every SUnit that is still not in any set starts a new node-set holding
///     its connected nodes.
/// New node-sets are appended to \p NodeSets only when they are non-empty.
1820 void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
// NodesAdded accumulates every node that has been placed in some node-set.
1821 SetVector NodesAdded;
1822 SmallPtrSet Visited;
1823 // Add the nodes that are on a path between the previous node sets and
1824 // the current node set.
1825 for (NodeSet &I : NodeSets) {
1826 SmallSetVector N;
1827 // Add the nodes from the current node set to the previous node set.
1828 if (succ_L(I, N)) {
1829 SetVector Path;
1830 for (SUnit *NI : N) {
// Visited is reset for every start node; Path accumulates across them.
1831 Visited.clear();
1832 computePath(NI, Path, NodesAdded, I, Visited);
1833 }
1834 if (Path.size() > 0)
1835 I.insert(Path.begin(), Path.end());
1836 }
1837 // Add the nodes from the previous node set to the current node set.
1838 N.clear();
1839 if (succ_L(NodesAdded, N)) {
1840 SetVector Path;
1841 for (SUnit *NI : N) {
1842 Visited.clear();
1843 computePath(NI, Path, I, NodesAdded, Visited);
1844 }
1845 if (Path.size() > 0)
1846 I.insert(Path.begin(), Path.end());
1847 }
1848 NodesAdded.insert(I.begin(), I.end());
1849 }
1850
1851 // Create a new node set with the connected nodes of any successor of a node
1852 // in a recurrent set.
1853 NodeSet NewSet;
1854 SmallSetVector N;
1855 if (succ_L(NodesAdded, N))
1856 for (SUnit *I : N)
1857 addConnectedNodes(I, NewSet, NodesAdded);
1858 if (NewSet.size() > 0)
1859 NodeSets.push_back(NewSet);
1860
1861 // Create a new node set with the connected nodes of any predecessor of a node
1862 // in a recurrent set.
1863 NewSet.clear();
1864 if (pred_L(NodesAdded, N))
1865 for (SUnit *I : N)
1866 addConnectedNodes(I, NewSet, NodesAdded);
1867 if (NewSet.size() > 0)
1868 NodeSets.push_back(NewSet);
1869
1870 // Create new node sets from the connected nodes of any remaining node that
1871 // has not been added to a set yet.
1872 for (unsigned i = 0; i < SUnits.size(); ++i) {
1873 SUnit *SU = &SUnits[i];
1874 if (NodesAdded.count(SU) == 0) {
1875 NewSet.clear();
1876 addConnectedNodes(SU, NewSet, NodesAdded);
1877 if (NewSet.size() > 0)
1878 NodeSets.push_back(NewSet);
1879 }
1880 }
1881 }
1882
1883 /// Add the node to the set, and add all is its connected nodes to the set.
1884 void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
1885 SetVector &NodesAdded) {
1886 NewSet.insert(SU);
1887 NodesAdded.insert(SU);
1888 for (auto &SI : SU->Succs) {
1889 SUnit *Successor = SI.getSUnit();
1890 if (!SI.isArtificial() && NodesAdded.count(Successor) == 0)
1891 addConnectedNodes(Successor, NewSet, NodesAdded);
1892 }
1893 for (auto &PI : SU->Preds) {
1894 SUnit *Predecessor = PI.getSUnit();
1895 if (!PI.isArtificial() && NodesAdded.count(Predecessor) == 0)
1896 addConnectedNodes(Predecessor, NewSet, NodesAdded);
1897 }
1898 }
1899
1900 /// Return true if Set1 contains elements in Set2. The elements in common
1901 /// are returned in a different container.
1902 static bool isIntersect(SmallSetVector &Set1, const NodeSet &Set2,
1903 SmallSetVector &Result) {
1904 Result.clear();
1905 for (unsigned i = 0, e = Set1.size(); i != e; ++i) {
1906 SUnit *SU = Set1[i];
1907 if (Set2.count(SU) != 0)
1908 Result.insert(SU);
1909 }
1910 return !Result.empty();
1911 }
1912
1913 /// Merge the recurrence node sets that have the same initial node.
1914 void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
1915 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
1916 ++I) {
1917 NodeSet &NI = *I;
1918 for (NodeSetType::iterator J = I + 1; J != E;) {
1919 NodeSet &NJ = *J;
1920 if (NI.getNode(0)->NodeNum == NJ.getNode(0)->NodeNum) {
1921 if (NJ.compareRecMII(NI) > 0)
1922 NI.setRecMII(NJ.getRecMII());
1923 for (NodeSet::iterator NII = J->begin(), ENI = J->end(); NII != ENI;
1924 ++NII)
1925 I->insert(*NII);
1926 NodeSets.erase(J);
1927 E = NodeSets.end();
1928 } else {
1929 ++J;
1930 }
1931 }
1932 }
1933 }
1934
1935 /// Remove nodes that have been scheduled in previous NodeSets.
1936 void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
1937 for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
1938 ++I)
1939 for (NodeSetType::iterator J = I + 1; J != E;) {
1940 J->remove_if([&](SUnit *SUJ) { return I->count(SUJ); });
1941
1942 if (J->size() == 0) {
1943 NodeSets.erase(J);
1944 E = NodeSets.end();
1945 } else {
1946 ++J;
1947 }
1948 }
1949 }
1950
1951 /// Return true if Inst1 defines a value that is used in Inst2.
1952 static bool hasDataDependence(SUnit *Inst1, SUnit *Inst2) {
1953 for (auto &SI : Inst1->Succs)
1954 if (SI.getSUnit() == Inst2 && SI.getKind() == SDep::Data)
1955 return true;
1956 return false;
1957 }
1958
1959 /// Compute an ordered list of the dependence graph nodes, which
1960 /// indicates the order that the nodes will be scheduled. This is a
1961 /// two-level algorithm. First, a partial order is created, which
1962 /// consists of a list of sets ordered from highest to lowest priority.
/// Second, the nodes of each set are appended to NodeOrder by alternating
/// top-down and bottom-up sweeps over a work list (R) until the work list
/// stays empty. NodeOrder is cleared on entry.
1963 void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
// R is the work list of nodes that are ready to be placed in NodeOrder.
1964 SmallSetVector R;
1965 NodeOrder.clear();
1966
1967 for (auto &Nodes : NodeSets) {
1968 DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
// Pick the initial direction and seed R for this node-set.
1969 OrderKind Order;
1970 SmallSetVector N;
1971 if (pred_L(NodeOrder, N) && isSubset(N, Nodes)) {
1972 R.insert(N.begin(), N.end());
1973 Order = BottomUp;
1974 DEBUG(dbgs() << " Bottom up (preds) ");
1975 } else if (succ_L(NodeOrder, N) && isSubset(N, Nodes)) {
1976 R.insert(N.begin(), N.end());
1977 Order = TopDown;
1978 DEBUG(dbgs() << " Top down (succs) ");
1979 } else if (isIntersect(N, Nodes, R)) {
1980 // If some of the successors are in the existing node-set, then use the
1981 // top-down ordering.
1982 Order = TopDown;
1983 DEBUG(dbgs() << " Top down (intersect) ");
1984 } else if (NodeSets.size() == 1) {
// With a single node-set, start from every node that has no successors.
1985 for (auto &N : Nodes)
1986 if (N->Succs.size() == 0)
1987 R.insert(N);
1988 Order = BottomUp;
1989 DEBUG(dbgs() << " Bottom up (all) ");
1990 } else {
1991 // Find the node with the highest ASAP.
// On a tie the node visited last wins, because the comparison is >=.
1992 SUnit *maxASAP = nullptr;
1993 for (SUnit *SU : Nodes) {
1994 if (maxASAP == nullptr || getASAP(SU) >= getASAP(maxASAP))
1995 maxASAP = SU;
1996 }
1997 R.insert(maxASAP);
1998 Order = BottomUp;
1999 DEBUG(dbgs() << " Bottom up (default) ");
2000 }
2001
// Alternate between the two directions until a direction switch leaves R
// empty.
2002 while (!R.empty()) {
2003 if (Order == TopDown) {
2004 // Choose the node with the maximum height. If more than one, choose
2005 // the node with the lowest MOV. If still more than one, check if there
2006 // is a dependence between the instructions.
2007 while (!R.empty()) {
2008 SUnit *maxHeight = nullptr;
2009 for (SUnit *I : R) {
2010 if (maxHeight == 0 || getHeight(I) > getHeight(maxHeight))
2011 maxHeight = I;
2012 else if (getHeight(I) == getHeight(maxHeight) &&
2013 getMOV(I) < getMOV(maxHeight) &&
2014 !hasDataDependence(maxHeight, I))
2015 maxHeight = I;
2016 else if (hasDataDependence(I, maxHeight))
2017 maxHeight = I;
2018 }
2019 NodeOrder.insert(maxHeight);
2020 DEBUG(dbgs() << maxHeight->NodeNum << " ");
2021 R.remove(maxHeight);
// Queue the not-yet-ordered successors that belong to this node-set.
2022 for (const auto &I : maxHeight->Succs) {
2023 if (Nodes.count(I.getSUnit()) == 0)
2024 continue;
2025 if (NodeOrder.count(I.getSUnit()) != 0)
2026 continue;
2027 if (ignoreDependence(I, false))
2028 continue;
2029 R.insert(I.getSUnit());
2030 }
2031 // Back-edges are predecessors with an anti-dependence.
2032 for (const auto &I : maxHeight->Preds) {
2033 if (I.getKind() != SDep::Anti)
2034 continue;
2035 if (Nodes.count(I.getSUnit()) == 0)
2036 continue;
2037 if (NodeOrder.count(I.getSUnit()) != 0)
2038 continue;
2039 R.insert(I.getSUnit());
2040 }
2041 }
// The top-down sweep is exhausted; reseed R with the predecessors of the
// ordered nodes that lie in this node-set.
2042 Order = BottomUp;
2043 DEBUG(dbgs() << "\n Switching order to bottom up ");
2044 SmallSetVector N;
2045 if (pred_L(NodeOrder, N, &Nodes))
2046 R.insert(N.begin(), N.end());
2047 } else {
2048 // Choose the node with the maximum depth. If more than one, choose
2049 // the node with the lowest MOV. If there is still more than one, check
2050 // for a dependence between the instructions.
2051 while (!R.empty()) {
2052 SUnit *maxDepth = nullptr;
2053 for (SUnit *I : R) {
2054 if (maxDepth == 0 || getDepth(I) > getDepth(maxDepth))
2055 maxDepth = I;
2056 else if (getDepth(I) == getDepth(maxDepth) &&
2057 getMOV(I) < getMOV(maxDepth) &&
2058 !hasDataDependence(I, maxDepth))
2059 maxDepth = I;
2060 else if (hasDataDependence(maxDepth, I))
2061 maxDepth = I;
2062 }
2063 NodeOrder.insert(maxDepth);
2064 DEBUG(dbgs() << maxDepth->NodeNum << " ");
2065 R.remove(maxDepth);
// If this node is the one flagged as exceeding register pressure, restart
// top-down from the first node of the node-set.
2066 if (Nodes.isExceedSU(maxDepth)) {
2067 Order = TopDown;
2068 R.clear();
2069 R.insert(Nodes.getNode(0));
2070 break;
2071 }
// Queue the not-yet-ordered predecessors that belong to this node-set,
// skipping anti-dependences.
2072 for (const auto &I : maxDepth->Preds) {
2073 if (Nodes.count(I.getSUnit()) == 0)
2074 continue;
2075 if (NodeOrder.count(I.getSUnit()) != 0)
2076 continue;
2077 if (I.getKind() == SDep::Anti)
2078 continue;
2079 R.insert(I.getSUnit());
2080 }
2081 // Back-edges: anti-dependence successors are queued like predecessors.
2082 for (const auto &I : maxDepth->Succs) {
2083 if (I.getKind() != SDep::Anti)
2084 continue;
2085 if (Nodes.count(I.getSUnit()) == 0)
2086 continue;
2087 if (NodeOrder.count(I.getSUnit()) != 0)
2088 continue;
2089 R.insert(I.getSUnit());
2090 }
2091 }
// Switch to top-down and reseed R with the successors of the ordered nodes
// that lie in this node-set. This also runs after the early break above.
2092 Order = TopDown;
2093 DEBUG(dbgs() << "\n Switching order to top down ");
2094 SmallSetVector N;
2095 if (succ_L(NodeOrder, N, &Nodes))
2096 R.insert(N.begin(), N.end());
2097 }
2098 }
2099 DEBUG(dbgs() << "\nDone with Nodeset\n");
2100 }
2101
2102 DEBUG({
2103 dbgs() << "Node order: ";
2104 for (SUnit *I : NodeOrder)
2105 dbgs() << " " << I->NodeNum << " ";
2106 dbgs() << "\n";
2107 });
2108 }
2109
2110 /// Process the nodes in the computed order and create the pipelined schedule
2111 /// of the instructions, if possible. Return true if a schedule is found.
///
/// Initiation intervals from MII up to MII + 9 are tried in turn. For each
/// one, the nodes of NodeOrder are inserted into \p Schedule one at a time;
/// the attempt is abandoned as soon as one node cannot be placed. On success
/// the schedule is finalized, otherwise it is reset. The function returns true
/// only when a schedule was found and its maximum stage count is greater than
/// zero. It returns false immediately if NodeOrder is empty.
2112 bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2113
2114 if (NodeOrder.size() == 0)
2115 return false;
2116
2117 bool scheduleFound = false;
2118 // Keep increasing II until a valid schedule is found.
2119 for (unsigned II = MII; II < MII + 10 && !scheduleFound; ++II) {
2120 Schedule.reset();
2121 Schedule.setInitiationInterval(II);
2122 DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2123
2124 SetVector::iterator NI = NodeOrder.begin();
2125 SetVector::iterator NE = NodeOrder.end();
// NodeOrder is known to be non-empty here, so the do-while may dereference
// NI on its first iteration.
2126 do {
2127 SUnit *SU = *NI;
2128
2129 // Compute the schedule time for the instruction, which is based
2130 // upon the scheduled time for any predecessors/successors.
// INT_MIN / INT_MAX act as "no constraint" sentinels in the checks below.
2131 int EarlyStart = INT_MIN;
2132 int LateStart = INT_MAX;
2133 // These values are set when the size of the schedule window is limited
2134 // due to chain dependences.
2135 int SchedEnd = INT_MAX;
2136 int SchedStart = INT_MIN;
2137 Schedule.computeStart(SU, &EarlyStart, &LateStart, &SchedEnd, &SchedStart,
2138 II, this);
2139 DEBUG({
2140 dbgs() << "Inst (" << SU->NodeNum << ") ";
2141 SU->getInstr()->dump();
2142 dbgs() << "\n";
2143 });
2144 DEBUG({
2145 dbgs() << "\tes: " << EarlyStart << " ls: " << LateStart
2146 << " me: " << SchedEnd << " ms: " << SchedStart << "\n";
2147 });
2148
// An empty window means the node cannot be placed at this II.
2149 if (EarlyStart > LateStart || SchedEnd < EarlyStart ||
2150 SchedStart > LateStart)
2151 scheduleFound = false;
// Only an early bound: search forward from EarlyStart over at most II cycles.
2152 else if (EarlyStart != INT_MIN && LateStart == INT_MAX) {
2153 SchedEnd = std::min(SchedEnd, EarlyStart + (int)II - 1);
2154 scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
// Only a late bound: search backward from LateStart over at most II cycles.
2155 } else if (EarlyStart == INT_MIN && LateStart != INT_MAX) {
2156 SchedStart = std::max(SchedStart, LateStart - (int)II + 1);
2157 scheduleFound = Schedule.insert(SU, LateStart, SchedStart, II);
// Both bounds: the window is clipped to LateStart and to II cycles.
2158 } else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2159 SchedEnd =
2160 std::min(SchedEnd, std::min(LateStart, EarlyStart + (int)II - 1));
2161 // When scheduling a Phi it is better to start at the late cycle and go
2162 // backwards. The default order may insert the Phi too far away from
2163 // its first dependence.
2164 if (SU->getInstr()->isPHI())
2165 scheduleFound = Schedule.insert(SU, SchedEnd, EarlyStart, II);
2166 else
2167 scheduleFound = Schedule.insert(SU, EarlyStart, SchedEnd, II);
// No bound at all: use a window of II cycles starting at the schedule's
// first cycle plus the node's ASAP value.
2168 } else {
2169 int FirstCycle = Schedule.getFirstCycle();
2170 scheduleFound = Schedule.insert(SU, FirstCycle + getASAP(SU),
2171 FirstCycle + getASAP(SU) + II - 1, II);
2172 }
2173 // Even if we find a schedule, make sure the schedule doesn't exceed the
2174 // allowable number of stages. We keep trying if this happens.
2175 if (scheduleFound)
2176 if (SwpMaxStages > -1 &&
2177 Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2178 scheduleFound = false;
2179
2180 DEBUG({
2181 if (!scheduleFound)
2182 dbgs() << "\tCan't schedule\n";
2183 });
2184 } while (++NI != NE && scheduleFound);
2185
2186 // If a schedule is found, check if it is a valid schedule too.
2187 if (scheduleFound)
2188 scheduleFound = Schedule.isValidSchedule(this);
2189 }
2190
2191 DEBUG(dbgs() << "Schedule Found? " << scheduleFound << "\n");
2192
2193 if (scheduleFound)
2194 Schedule.finalizeSchedule(this);
2195 else
2196 Schedule.reset();
2197
2198 return scheduleFound && Schedule.getMaxStageCount() > 0;
2199 }
2200
2201 /// Given a schedule for the loop, generate a new version of the loop,
2202 /// and replace the old version. This function generates a prolog
2203 /// that contains the initial iterations in the pipeline, and kernel
2204 /// loop, and the epilogue that contains the code for the final
2205 /// iterations.
///
/// The steps are, in order: create the kernel block, generate the prolog
/// blocks, clone the scheduled non-Phi instructions and the terminators into
/// the kernel, rewire the kernel's successors, generate the kernel Phis,
/// generate the epilog blocks, split lifetimes, remove dead instructions, add
/// the branches between the blocks, and finally erase the original loop block.
2206 void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {
2207 // Create a new basic block for the kernel and add it to the CFG.
2208 MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
2209
2210 unsigned MaxStageCount = Schedule.getMaxStageCount();
2211
2212 // Remember the registers that are used in different stages. The index is
2213 // the iteration, or stage, that the instruction is scheduled in. This is
2214 // a map between register names in the original block and the names created
2215 // in each stage of the pipelined loop.
// The array holds (MaxStageCount + 1) * 2 maps and is released with delete[]
// at the end of this function.
2216 ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
2217 InstrMapTy InstrMap;
2218
2219 SmallVector PrologBBs;
2220 // Generate the prolog instructions that set up the pipeline.
2221 generateProlog(Schedule, MaxStageCount, KernelBB, VRMap, PrologBBs);
2222 MF.insert(BB->getIterator(), KernelBB);
2223
2224 // Rearrange the instructions to generate the new, pipelined loop,
2225 // and update register names as needed.
2226 for (int Cycle = Schedule.getFirstCycle(),
2227 LastCycle = Schedule.getFinalCycle();
2228 Cycle <= LastCycle; ++Cycle) {
2229 std::deque &CycleInstrs = Schedule.getInstructions(Cycle);
2230 // This inner loop schedules each instruction in the cycle.
2231 for (SUnit *CI : CycleInstrs) {
// Phis are not cloned here; the kernel Phis are generated further below.
2232 if (CI->getInstr()->isPHI())
2233 continue;
2234 unsigned StageNum = Schedule.stageScheduled(getSUnit(CI->getInstr()));
2235 MachineInstr *NewMI = cloneInstr(CI->getInstr(), MaxStageCount, StageNum);
2236 updateInstruction(NewMI, false, MaxStageCount, StageNum, Schedule, VRMap);
2237 KernelBB->push_back(NewMI);
// Remember which original instruction each clone came from.
2238 InstrMap[NewMI] = CI->getInstr();
2239 }
2240 }
2241
2242 // Copy any terminator instructions to the new kernel, and update
2243 // names as needed.
2244 for (MachineBasicBlock::iterator I = BB->getFirstTerminator(),
2245 E = BB->instr_end();
2246 I != E; ++I) {
2247 MachineInstr *NewMI = MF.CloneMachineInstr(&*I);
2248 updateInstruction(NewMI, false, MaxStageCount, 0, Schedule, VRMap);
2249 KernelBB->push_back(NewMI);
2250 InstrMap[NewMI] = &*I;
2251 }
2252
// The kernel takes over the original block's successors; the edge that
// pointed back at the original block is redirected to the kernel itself.
2253 KernelBB->transferSuccessors(BB);
2254 KernelBB->replaceSuccessor(BB, KernelBB);
2255
// NOTE(review): PrologBBs.back() requires at least one prolog block, i.e.
// MaxStageCount > 0 - confirm that callers guarantee this.
2256 generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule,
2257 VRMap, InstrMap, MaxStageCount, MaxStageCount, false);
2258 generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule, VRMap,
2259 InstrMap, MaxStageCount, MaxStageCount, false);
2260
2261 DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
2262
2263 SmallVector EpilogBBs;
2264 // Generate the epilog instructions to complete the pipeline.
2265 generateEpilog(Schedule, MaxStageCount, KernelBB, VRMap, EpilogBBs,
2266 PrologBBs);
2267
2268 // We need this step because the register allocation doesn't handle some
2269 // situations well, so we insert copies to help out.
2270 splitLifetimes(KernelBB, EpilogBBs, Schedule);
2271
2272 // Remove dead instructions due to loop induction variables.
2273 removeDeadInstructions(KernelBB, EpilogBBs);
2274
2275 // Add branches between prolog and epilog blocks.
2276 addBranches(PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);
2277
2278 // Remove the original loop since it's no longer referenced.
2279 BB->clear();
2280 BB->eraseFromParent();
2281
2282 delete[] VRMap;
2283 }
2284
2285 /// Generate the pipeline prolog code.
///
/// One prolog block is created per stage in [0, LastStage) and appended to
/// \p PrologBBs. The blocks are chained after the loop preheader, and the last
/// block in the chain gets \p KernelBB as successor in place of the original
/// loop block. Block i receives clones of the non-Phi instructions scheduled
/// in stages i down to 0. \p VRMap is passed through to updateInstruction()
/// and rewritePhiValues().
2286 void SwingSchedulerDAG::generateProlog(SMSchedule &Schedule, unsigned LastStage,
2287 MachineBasicBlock *KernelBB,
2288 ValueMapTy *VRMap,
2289 MBBVectorTy &PrologBBs) {
2290 MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader();
2291 assert(PreheaderBB != NULL &&
2292 "Need to add code to handle loops w/o preheader");
2293 MachineBasicBlock *PredBB = PreheaderBB;
// Maps each cloned instruction back to the original it was cloned from.
2294 InstrMapTy InstrMap;
2295
2296 // Generate a basic block for each stage, not including the last stage,
2297 // which will be generated in the kernel. Each basic block may contain
2298 // instructions from multiple stages/iterations.
2299 for (unsigned i = 0; i < LastStage; ++i) {
2300 // Create and insert the prolog basic block prior to the original loop
2301 // basic block. The original loop is removed later.
2302 MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
2303 PrologBBs.push_back(NewBB);
2304 MF.insert(BB->getIterator(), NewBB);
// Splice the new block between PredBB and PredBB's former successors.
2305 NewBB->transferSuccessors(PredBB);
2306 PredBB->addSuccessor(NewBB);
2307 PredBB = NewBB;
2308
2309 // Generate instructions for each appropriate stage. Process instructions
2310 // in original program order.
2311 for (int StageNum = i; StageNum >= 0; --StageNum) {
2312 for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
2313 BBE = BB->getFirstTerminator();
2314 BBI != BBE; ++BBI) {
2315 if (Schedule.isScheduledAtStage(getSUnit(&*BBI), (unsigned)StageNum)) {
// Phis are not cloned into the prolog.
2316 if (BBI->isPHI())
2317 continue;
2318 MachineInstr *NewMI =
2319 cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum, Schedule);
2320 updateInstruction(NewMI, false, i, (unsigned)StageNum, Schedule,
2321 VRMap);
2322 NewBB->push_back(NewMI);
2323 InstrMap[NewMI] = &*BBI;
2324 }
2325 }
2326 }
2327 rewritePhiValues(NewBB, i, Schedule, VRMap, InstrMap);
2328 DEBUG({
2329 dbgs() << "prolog:\n";
2330 NewBB->dump();
2331 });
2332 }
2333
// The last block of the chain now leads to the kernel instead of the
// original loop block.
2334 PredBB->replaceSuccessor(BB, KernelBB);
2335
2336 // Check if we need to remove the branch from the preheader to the original
2337 // loop, and replace it with a branch to the new loop.
// NOTE(review): PrologBBs[0] requires LastStage > 0 - confirm that callers
// guarantee this.
2338 unsigned numBranches = TII->RemoveBranch(*PreheaderBB);
2339 if (numBranches) {
2340 SmallVector Cond;
2341 TII->InsertBranch(*PreheaderBB, PrologBBs[0], 0, Cond, DebugLoc());
2342 }
2343 }
2344
2345 /// Generate the pipeline epilog code. The epilog code finishes the iterations
2346 /// that were started in either the prolog or the kernel. We create a basic
2347 /// block for each stage that needs to complete.
///
/// The new blocks are appended to \p EpilogBBs and chained between
/// \p KernelBB and the loop exit block. Phis in the loop exit block that
/// referred to the original loop block are redirected to the last block of
/// the chain, and the kernel's branch is rewritten to target the kernel and
/// the first epilog block. Nothing is generated if the kernel's branch cannot
/// be analyzed.
2348 void SwingSchedulerDAG::generateEpilog(SMSchedule &Schedule, unsigned LastStage,
2349 MachineBasicBlock *KernelBB,
2350 ValueMapTy *VRMap,
2351 MBBVectorTy &EpilogBBs,
2352 MBBVectorTy &PrologBBs) {
2353 // We need to change the branch from the kernel to the first epilog block, so
2354 // this call to analyze branch uses the kernel rather than the original BB.
2355 MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
2356 SmallVector Cond;
2357 bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond);
2358 assert(!checkBranch && "generateEpilog must be able to analyze the branch");
// With assertions disabled, bail out instead of continuing.
2359 if (checkBranch)
2360 return;
2361
// The loop exit is the first kernel successor that is not the kernel itself.
2362 MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
2363 if (*LoopExitI == KernelBB)
2364 ++LoopExitI;
2365 assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
2366 MachineBasicBlock *LoopExitBB = *LoopExitI;
2367
2368 MachineBasicBlock *PredBB = KernelBB;
// EpilogStart stays at LoopExitBB when no epilog block is created.
2369 MachineBasicBlock *EpilogStart = LoopExitBB;
// Maps each cloned instruction back to the original it was cloned from.
2370 InstrMapTy InstrMap;
2371
2372 // Generate a basic block for each stage, not including the last stage,
2373 // which was generated for the kernel. Each basic block may contain
2374 // instructions from multiple stages/iterations.
2375 int EpilogStage = LastStage + 1;
2376 for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) {
2377 MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
2378 EpilogBBs.push_back(NewBB);
2379 MF.insert(BB->getIterator(), NewBB);
2380
// Insert the new block on the edge from PredBB to the loop exit.
2381 PredBB->replaceSuccessor(LoopExitBB, NewBB);
2382 NewBB->addSuccessor(LoopExitBB);
2383
// Remember the first epilog block created.
2384 if (EpilogStart == LoopExitBB)
2385 EpilogStart = NewBB;
2386
2387 // Add instructions to the epilog depending on the current block.
2388 // Process instructions in original program order.
2389 for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) {
2390 for (auto &BBI : *BB) {
2391 if (BBI.isPHI())
2392 continue;
2393 MachineInstr *In = &BBI;
2394 if (Schedule.isScheduledAtStage(getSUnit(In), StageNum)) {
2395 MachineInstr *NewMI = cloneInstr(In, EpilogStage - LastStage, 0);
2396 updateInstruction(NewMI, i == 1, EpilogStage, 0, Schedule, VRMap);
2397 NewBB->push_back(NewMI);
2398 InstrMap[NewMI] = In;
2399 }
2400 }
2401 }
// i == 1 marks the last epilog block that this loop creates.
2402 generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule,
2403 VRMap, InstrMap, LastStage, EpilogStage, i == 1);
2404 generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule, VRMap,
2405 InstrMap, LastStage, EpilogStage, i == 1);
2406 PredBB = NewBB;
2407
2408 DEBUG({
2409 dbgs() << "epilog:\n";
2410 NewBB->dump();
2411 });
2412 }
2413
2414 // Fix any Phi nodes in the loop exit block.
2415 for (MachineInstr &MI : *LoopExitBB) {
2416 if (!MI.isPHI())
2417 break;
// Visit operands 2, 4, ... which getMBB() is applied to; entries naming the
// original loop block are pointed at the last block generated above.
2418 for (unsigned i = 2, e = MI.getNumOperands() + 1; i != e; i += 2) {
2419 MachineOperand &MO = MI.getOperand(i);
2420 if (MO.getMBB() == BB)
2421 MO.setMBB(PredBB);
2422 }
2423 }
2424
2425 // Create a branch to the new epilog from the kernel.
2426 // Remove the original branch and add a new branch to the epilog.
2427 TII->RemoveBranch(*KernelBB);
2428 TII->InsertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
2429 // Add a branch to the loop exit.
2430 if (EpilogBBs.size() > 0) {
2431 MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
2432 SmallVector Cond1;
2433 TII->InsertBranch(*LastEpilogBB, LoopExitBB, 0, Cond1, DebugLoc());
2434 }
2435 }
2436
2437 /// Replace all uses of FromReg that appear outside the specified
2438 /// basic block with ToReg.
2439 static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
2440 MachineBasicBlock *MBB,
2441 MachineRegisterInfo &MRI,
2442 LiveIntervals &LIS) {
2443 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(FromReg),
2444 E = MRI.use_end();
2445 I != E;) {
2446 MachineOperand &O = *I;
2447 ++I;
2448 if (O.getParent()->getParent() != MBB)
2449 O.setReg(ToReg);
2450 }
2451 if (!LIS.hasInterval(ToReg))
2452 LIS.createEmptyInterval(ToReg);
2453 }
2454
2455 /// Return true if the register has a use that occurs outside the
2456 /// specified loop.
2457 static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB,
2458 MachineRegisterInfo &MRI) {
2459 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg),
2460 E = MRI.use_end();
2461 I != E; ++I)
2462 if (I->getParent()->getParent() != BB)
2463 return true;
2464 return false;
2465 }
2466
2467 /// Generate Phis for the specific block in the generated pipelined code.
2468 /// This function looks at the Phis from the original code to guide the
2469 /// creation of new Phis.
2470 void SwingSchedulerDAG::generateExistingPhis(
2471 MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
2472 MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
2473 InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
2474 bool IsLast) {
2475 // Compute the stage number for the inital value of the Phi, which
2476 // comes from the prolog. The prolog to use depends on to which kernel/
2477 // epilog that we're adding the Phi.
2478 unsigned PrologStage = 0;
2479 unsigned PrevStage = 0;
2480 bool InKernel = (LastStageNum == CurStageNum);
2481 if (InKernel) {
2482 PrologStage = LastStageNum - 1;
2483 PrevStage = CurStageNum;
2484 } else {
2485 PrologStage = LastStageNum - (CurStageNum - LastStageNum);
2486 PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1;
2487 }
2488
2489 for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
2490 BBE = BB->getFirstNonPHI();
2491 BBI != BBE; ++BBI) {
2492 unsigned Def = BBI->getOperand(0).getReg();
2493
2494 unsigned InitVal = 0;
2495 unsigned LoopVal = 0;
2496 getPhiRegs(*BBI, BB, InitVal, LoopVal);
2497
2498 unsigned PhiOp1 = 0;
2499 // The Phi value from the loop body typically is defined in the loop, but
2500 // not always. So, we need to check if the value is defined in the loop.
2501 unsigned PhiOp2 = LoopVal;
2502 if (VRMap[LastStageNum].count(LoopVal))
2503 PhiOp2 = VRMap[LastStageNum][LoopVal];
2504
2505 int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
2506 int LoopValStage =
2507 Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
2508 unsigned NumStages = Schedule.getStagesForReg(Def, CurStageNum);
2509 if (NumStages == 0) {
2510 // We don't need to generate a Phi anymore, but we need to rename any uses
2511 // of the Phi value.
2512 unsigned NewReg = VRMap[PrevStage][LoopVal];
2513 rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, 0, &*BBI,
2514 Def, NewReg);
2515 if (VRMap[CurStageNum].count(LoopVal))
2516 VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal];
2517 }
2518 // Adjust the number of Phis needed depending on the number of prologs left,
2519 // and the distance from where the Phi is first scheduled.
2520 unsigned NumPhis = NumStages;
2521 if (!InKernel && (int)PrologStage < LoopValStage)
2522 // The NumPhis is the maximum number of new Phis needed during the steady
2523 // state. If the Phi has not been scheduled in current prolog, then we
2524 // need to generate less Phis.
2525 NumPhis = std::max((int)NumPhis - (int)(LoopValStage - PrologStage), 1);
2526 // The number of Phis cannot exceed the number of prolog stages. Each
2527 // stage can potentially define two values.
2528 NumPhis = std::min(NumPhis, PrologStage + 2);
2529
2530 unsigned NewReg = 0;
2531
2532 unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
2533 // In the epilog, we may need to look back one stage to get the correct
2534 // Phi name because the epilog and prolog blocks execute the same stage.
2535 // The correct name is from the previous block only when the Phi has
2536 // been completely scheduled prior to the epilog, and Phi value is not
2537 // needed in multiple stages.
2538 int StageDiff = 0;
2539 if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 &&
2540 NumPhis == 1)
2541 StageDiff = 1;
2542 // Adjust the computations below when the phi and the loop definition
2543 // are scheduled in different stages.
2544 if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage)
2545 StageDiff = StageScheduled - LoopValStage;
2546 for (unsigned np = 0; np < NumPhis; ++np) {
2547 // If the Phi hasn't been scheduled, then use the initial Phi operand
2548 // value. Otherwise, use the scheduled version of the instruction. This
2549 // is a little complicated when a Phi references another Phi.
2550 if (np > PrologStage || StageScheduled >= (int)LastStageNum)
2551 PhiOp1 = InitVal;
2552 // Check if the Phi has already been scheduled in a prolog stage.
2553 else if (PrologStage >= AccessStage + StageDiff + np &&
2554 VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0)
2555 PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal];
2556 // Check if the Phi has already been scheduled, but the loop intruction
2557 // is either another Phi, or doesn't occur in the loop.
2558 else if (PrologStage >= AccessStage + StageDiff + np) {
2559 // If the Phi references another Phi, we need to examine the other
2560 // Phi to get the correct value.
2561 PhiOp1 = LoopVal;
2562 MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1);
2563 int Indirects = 1;
2564 while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) {
2565 int PhiStage = Schedule.stageScheduled(getSUnit(InstOp1));
2566 if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects)
2567 PhiOp1 = getInitPhiReg(*InstOp1, BB);
2568 else
2569 PhiOp1 = getLoopPhiReg(*InstOp1, BB);
2570 InstOp1 = MRI.getVRegDef(PhiOp1);
2571 int PhiOpStage = Schedule.stageScheduled(getSUnit(InstOp1));
2572 int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0);
2573 if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np &&
2574 VRMap[PrologStage - StageAdj - Indirects - np].count(PhiOp1)) {
2575 PhiOp1 = VRMap[PrologStage - StageAdj - Indirects - np][PhiOp1];
2576 break;
2577 }
2578 ++Indirects;
2579 }
2580 } else
2581 PhiOp1 = InitVal;
2582 // If this references a generated Phi in the kernel, get the Phi operand
2583 // from the incoming block.
2584 if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1))
2585 if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
2586 PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
2587
2588 MachineInstr *PhiInst = MRI.getVRegDef(LoopVal);
2589 bool LoopDefIsPhi = PhiInst && PhiInst->isPHI();
2590 // In the epilog, a map lookup is needed to get the value from the kernel,
2591 // or previous epilog block. How is does this depends on if the
2592 // instruction is scheduled in the previous block.
2593 if (!InKernel) {
2594 int StageDiffAdj = 0;
2595 if (LoopValStage != -1 && StageScheduled > LoopValStage)
2596 StageDiffAdj = StageScheduled - LoopValStage;
2597 // Use the loop value defined in the kernel, unless the kernel
2598 // contains the last definition of the Phi.
2599 if (np == 0 && PrevStage == LastStageNum &&
2600 (StageScheduled != 0 || LoopValStage != 0) &&
2601 VRMap[PrevStage - StageDiffAdj].count(LoopVal))
2602 PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal];
2603 // Use the value defined by the Phi. We add one because we switch
2604 // from looking at the loop value to the Phi definition.
2605 else if (np > 0 && PrevStage == LastStageNum &&
2606 VRMap[PrevStage - np + 1].count(Def))
2607 PhiOp2 = VRMap[PrevStage - np + 1][Def];
2608 // Use the loop value defined in the kernel.
2609 else if ((unsigned)LoopValStage + StageDiffAdj > PrologStage + 1 &&
2610 VRMap[PrevStage - StageDiffAdj - np].count(LoopVal))
2611 PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal];
2612 // Use the value defined by the Phi, unless we're generating the first
2613 // epilog and the Phi refers to a Phi in a different stage.
2614 else if (VRMap[PrevStage - np].count(Def) &&
2615 (!LoopDefIsPhi || PrevStage != LastStageNum))
2616 PhiOp2 = VRMap[PrevStage - np][Def];
2617 }
2618
2619 // Check if we can reuse an existing Phi. This occurs when a Phi
2620 // references another Phi, and the other Phi is scheduled in an
2621 // earlier stage. We can try to reuse an existing Phi up until the last
2622 // stage of the current Phi.
2623 if (LoopDefIsPhi && VRMap[CurStageNum].count(LoopVal) &&
2624 LoopValStage >= (int)(CurStageNum - LastStageNum)) {
2625 int LVNumStages = Schedule.getStagesForPhi(LoopVal);
2626 int StageDiff = (StageScheduled - LoopValStage);
2627 LVNumStages -= StageDiff;
2628 if (LVNumStages > (int)np) {
2629 NewReg = PhiOp2;
2630 unsigned ReuseStage = CurStageNum;
2631 if (Schedule.isLoopCarried(this, *PhiInst))
2632 ReuseStage -= LVNumStages;
2633 // Check if the Phi to reuse has been generated yet. If not, then
2634 // there is nothing to reuse.
2635 if (VRMap[ReuseStage].count(LoopVal)) {
2636 NewReg = VRMap[ReuseStage][LoopVal];
2637
2638 rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2639 &*BBI, Def, NewReg);
2640 // Update the map with the new Phi name.
2641 VRMap[CurStageNum - np][Def] = NewReg;
2642 PhiOp2 = NewReg;
2643 if (VRMap[LastStageNum - np - 1].count(LoopVal))
2644 PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal];
2645
2646 if (IsLast && np == NumPhis - 1)
2647 replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
2648 continue;
2649 }
2650 } else if (StageDiff > 0 &&
2651 VRMap[CurStageNum - StageDiff - np].count(LoopVal))
2652 PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal];
2653 }
2654
2655 const TargetRegisterClass *RC = MRI.getRegClass(Def);
2656 NewReg = MRI.createVirtualRegister(RC);
2657
2658 MachineInstrBuilder NewPhi =
2659 BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
2660 TII->get(TargetOpcode::PHI), NewReg);
2661 NewPhi.addReg(PhiOp1).addMBB(BB1);
2662 NewPhi.addReg(PhiOp2).addMBB(BB2);
2663 if (np == 0)
2664 InstrMap[NewPhi] = &*BBI;
2665
2666 // We define the Phis after creating the new pipelined code, so
2667 // we need to rename the Phi values in scheduled instructions.
2668
2669 unsigned PrevReg = 0;
2670 if (InKernel && VRMap[PrevStage - np].count(LoopVal))
2671 PrevReg = VRMap[PrevStage - np][LoopVal];
2672 rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
2673 Def, NewReg, PrevReg);
2674 // If the Phi has been scheduled, use the new name for rewriting.
2675 if (VRMap[CurStageNum - np].count(Def)) {
2676 unsigned R = VRMap[CurStageNum - np][Def];
2677 rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
2678 R, NewReg);
2679 }
2680
2681 // Check if we need to rename any uses that occurs after the loop. The
2682 // register to replace depends on whether the Phi is scheduled in the
2683 // epilog.
2684 if (IsLast && np == NumPhis - 1)
2685 replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
2686
2687 // In the kernel, a dependent Phi uses the value from this Phi.
2688 if (InKernel)
2689 PhiOp2 = NewReg;
2690
2691 // Update the map with the new Phi name.
2692 VRMap[CurStageNum - np][Def] = NewReg;
2693 }
2694
2695 while (NumPhis++ < NumStages) {
2696 rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, NumPhis,
2697 &*BBI, Def, NewReg, 0);
2698 }
2699
2700 // Check if we need to rename a Phi that has been eliminated due to
2701 // scheduling.
2702 if (NumStages == 0 && IsLast && VRMap[CurStageNum].count(LoopVal))
2703 replaceRegUsesAfterLoop(Def, VRMap[CurStageNum][LoopVal], BB, MRI, LIS);
2704 }
2705 }
2706
2707 /// Generate Phis for the specified block in the generated pipelined code.
2708 /// These are new Phis needed because the definition is scheduled after the
2709 /// use in the pipelened sequence.
2710 void SwingSchedulerDAG::generatePhis(
2711 MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
2712 MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
2713 InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
2714 bool IsLast) {
2715 // Compute the stage number that contains the initial Phi value, and
2716 // the Phi from the previous stage.
2717 unsigned PrologStage = 0;
2718 unsigned PrevStage = 0;
2719 unsigned StageDiff = CurStageNum - LastStageNum;
2720 bool InKernel = (StageDiff == 0);
2721 if (InKernel) {
2722 PrologStage = LastStageNum - 1;
2723 PrevStage = CurStageNum;
2724 } else {
2725 PrologStage = LastStageNum - StageDiff;
2726 PrevStage = LastStageNum + StageDiff - 1;
2727 }
2728
2729 for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
2730 BBE = BB->instr_end();
2731 BBI != BBE; ++BBI) {
2732 for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) {
2733 MachineOperand &MO = BBI->getOperand(i);
2734 if (!MO.isReg() || !MO.isDef() ||
2735 !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
2736 continue;
2737
2738 int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
2739 assert(StageScheduled != -1 && "Expecting scheduled instruction.");
2740 unsigned Def = MO.getReg();
2741 unsigned NumPhis = Schedule.getStagesForReg(Def, CurStageNum);
2742 // An instruction scheduled in stage 0 and is used after the loop
2743 // requires a phi in the epilog for the last definition from either
2744 // the kernel or prolog.
2745 if (!InKernel && NumPhis == 0 && StageScheduled == 0 &&
2746 hasUseAfterLoop(Def, BB, MRI))
2747 NumPhis = 1;
2748 if (!InKernel && (unsigned)StageScheduled > PrologStage)
2749 continue;
2750
2751 unsigned PhiOp2 = VRMap[PrevStage][Def];
2752 if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2))
2753 if (InstOp2->isPHI() && InstOp2->getParent() == NewBB)
2754 PhiOp2 = getLoopPhiReg(*InstOp2, BB2);
2755 // The number of Phis can't exceed the number of prolog stages. The
2756 // prolog stage number is zero based.
2757 if (NumPhis > PrologStage + 1 - StageScheduled)
2758 NumPhis = PrologStage + 1 - StageScheduled;
2759 for (unsigned np = 0; np < NumPhis; ++np) {
2760 unsigned PhiOp1 = VRMap[PrologStage][Def];
2761 if (np <= PrologStage)
2762 PhiOp1 = VRMap[PrologStage - np][Def];
2763 if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1)) {
2764 if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
2765 PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
2766 if (InstOp1->isPHI() && InstOp1->getParent() == NewBB)
2767 PhiOp1 = getInitPhiReg(*InstOp1, NewBB);
2768 }
2769 if (!InKernel)
2770 PhiOp2 = VRMap[PrevStage - np][Def];
2771
2772 const TargetRegisterClass *RC = MRI.getRegClass(Def);
2773 unsigned NewReg = MRI.createVirtualRegister(RC);
2774
2775 MachineInstrBuilder NewPhi =
2776 BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
2777 TII->get(TargetOpcode::PHI), NewReg);
2778 NewPhi.addReg(PhiOp1).addMBB(BB1);
2779 NewPhi.addReg(PhiOp2).addMBB(BB2);
2780 if (np == 0)
2781 InstrMap[NewPhi] = &*BBI;
2782
2783 // Rewrite uses and update the map. The actions depend upon whether
2784 // we generating code for the kernel or epilog blocks.
2785 if (InKernel) {
2786 rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2787 &*BBI, PhiOp1, NewReg);
2788 rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2789 &*BBI, PhiOp2, NewReg);
2790
2791 PhiOp2 = NewReg;
2792 VRMap[PrevStage - np - 1][Def] = NewReg;
2793 } else {
2794 VRMap[CurStageNum - np][Def] = NewReg;
2795 if (np == NumPhis - 1)
2796 rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
2797 &*BBI, Def, NewReg);
2798 }
2799 if (IsLast && np == NumPhis - 1)
2800 replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
2801 }
2802 }
2803 }
2804 }
2805
2806 /// Remove instructions that generate values with no uses.
2807 /// Typically, these are induction variable operations that generate values
2808 /// used in the loop itself. A dead instruction has a definition with
2809 /// no uses, or uses that occur in the original loop only.
2810 void SwingSchedulerDAG::removeDeadInstructions(MachineBasicBlock *KernelBB,
2811 MBBVectorTy &EpilogBBs) {
2812 // For each epilog block, check that the value defined by each instruction
2813 // is used. If not, delete it.
2814 for (MBBVectorTy::reverse_iterator MBB = EpilogBBs.rbegin(),
2815 MBE = EpilogBBs.rend();
2816 MBB != MBE; ++MBB)
2817 for (MachineBasicBlock::reverse_instr_iterator MI = (*MBB)->instr_rbegin(),
2818 ME = (*MBB)->instr_rend();
2819 MI != ME;) {
2820 // From DeadMachineInstructionElem. Don't delete inline assembly.
2821 if (MI->isInlineAsm()) {
2822 ++MI;
2823 continue;
2824 }
2825 bool SawStore = false;
2826 // Check if it's safe to remove the instruction due to side effects.
2827 // We can, and want to, remove Phis here.
2828 if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) {
2829 ++MI;
2830 continue;
2831 }
2832 bool used = true;
2833 for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
2834 MOE = MI->operands_end();
2835 MOI != MOE; ++MOI) {
2836 if (!MOI->isReg() || !MOI->isDef())
2837 continue;
2838 unsigned reg = MOI->getReg();
2839 unsigned realUses = 0;
2840 for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(reg),
2841 EI = MRI.use_end();
2842 UI != EI; ++UI) {
2843 // Check if there are any uses that occur only in the original
2844 // loop. If so, that's not a real use.
2845 if (UI->getParent()->getParent() != BB) {
2846 realUses++;
2847 used = true;
2848 break;
2849 }
2850 }
2851 if (realUses > 0)
2852 break;
2853 used = false;
2854 }
2855 if (!used) {
2856 MI->eraseFromParent();
2857 ME = (*MBB)->instr_rend();
2858 continue;
2859 }
2860 ++MI;
2861 }
2862 // In the kernel block, check if we can remove a Phi that generates a value
2863 // used in an instruction removed in the epilog block.
2864 for (MachineBasicBlock::iterator BBI = KernelBB->instr_begin(),
2865 BBE = KernelBB->getFirstNonPHI();
2866 BBI != BBE;) {
2867 MachineInstr *MI = &*BBI;
2868 ++BBI;
2869 unsigned reg = MI->getOperand(0).getReg();
2870 if (MRI.use_begin(reg) == MRI.use_end()) {
2871 MI->eraseFromParent();
2872 }
2873 }
2874 }
2875
2876 /// For loop carried definitions, we split the lifetime of a virtual register
2877 /// that has uses past the definition in the next iteration. A copy with a new
2878 /// virtual register is inserted before the definition, which helps with
2879 /// generating a better register assignment.
2880 ///
2881 /// v1 = phi(a, v2) v1 = phi(a, v2)
2882 /// v2 = phi(b, v3) v2 = phi(b, v3)
2883 /// v3 = .. v4 = copy v1
2884 /// .. = V1 v3 = ..
2885 /// .. = v4
2886 void SwingSchedulerDAG::splitLifetimes(MachineBasicBlock *KernelBB,
2887 MBBVectorTy &EpilogBBs,
2888 SMSchedule &Schedule) {
2889 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2890 for (MachineBasicBlock::iterator BBI = KernelBB->instr_begin(),
2891 BBF = KernelBB->getFirstNonPHI();
2892 BBI != BBF; ++BBI) {
2893 unsigned Def = BBI->getOperand(0).getReg();
2894 // Check for any Phi definition that used as an operand of another Phi
2895 // in the same block.
2896 for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def),
2897 E = MRI.use_instr_end();
2898 I != E; ++I) {
2899 if (I->isPHI() && I->getParent() == KernelBB) {
2900 // Get the loop carried definition.
2901 unsigned LCDef = getLoopPhiReg(*BBI, KernelBB);
2902 if (!LCDef)
2903 continue;
2904 MachineInstr *MI = MRI.getVRegDef(LCDef);
2905 if (!MI || MI->getParent() != KernelBB || MI->isPHI())
2906 continue;
2907 // Search through the rest of the block looking for uses of the Phi
2908 // definition. If one occurs, then split the lifetime.
2909 unsigned SplitReg = 0;
2910 for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI),
2911 KernelBB->instr_end()))
2912 if (BBJ.readsRegister(Def)) {
2913 // We split the lifetime when we find the first use.
2914 if (SplitReg == 0) {
2915 SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
2916 BuildMI(*KernelBB, MI, MI->getDebugLoc(),
2917 TII->get(TargetOpcode::COPY), SplitReg)
2918 .addReg(Def);
2919 }
2920 BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
2921 }
2922 if (!SplitReg)
2923 continue;
2924 // Search through each of the epilog blocks for any uses to be renamed.
2925 for (auto &Epilog : EpilogBBs)
2926 for (auto &I : *Epilog)
2927 if (I.readsRegister(Def))
2928 I.substituteRegister(Def, SplitReg, 0, *TRI);
2929 break;
2930 }
2931 }
2932 }
2933 }
2934
2935 /// Remove the incoming block from the Phis in a basic block.
2936 static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
2937 for (MachineInstr &MI : *BB) {
2938 if (!MI.isPHI())
2939 break;
2940 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2)
2941 if (MI.getOperand(i + 1).getMBB() == Incoming) {
2942 MI.RemoveOperand(i + 1);
2943 MI.RemoveOperand(i);
2944 break;
2945 }
2946 }
2947 }
2948
2949 /// Create branches from each prolog basic block to the appropriate epilog
2950 /// block. These edges are needed if the loop ends before reaching the
2951 /// kernel.
2952 void SwingSchedulerDAG::addBranches(MBBVectorTy &PrologBBs,
2953 MachineBasicBlock *KernelBB,
2954 MBBVectorTy &EpilogBBs,
2955 SMSchedule &Schedule, ValueMapTy *VRMap) {
2956 assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
2957 MachineInstr *IndVar = Pass.LI.LoopInductionVar;
2958 MachineInstr *Cmp = Pass.LI.LoopCompare;
2959 MachineBasicBlock *LastPro = KernelBB;
2960 MachineBasicBlock *LastEpi = KernelBB;
2961
2962 // Start from the blocks connected to the kernel and work "out"
2963 // to the first prolog and the last epilog blocks.
2964 SmallVector PrevInsts;
2965 unsigned MaxIter = PrologBBs.size() - 1;
2966 unsigned LC = UINT_MAX;
2967 unsigned LCMin = UINT_MAX;
2968 for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) {
2969 // Add branches to the prolog that go to the corresponding
2970 // epilog, and the fall-thru prolog/kernel block.
2971 MachineBasicBlock *Prolog = PrologBBs[j];
2972 MachineBasicBlock *Epilog = EpilogBBs[i];
2973 // We've executed one iteration, so decrement the loop count and check for
2974 // the loop end.
2975 SmallVector Cond;
2976 // Check if the LOOP0 has already been removed. If so, then there is no need
2977 // to reduce the trip count.
2978 if (LC != 0)
2979 LC = TII->reduceLoopCount(*Prolog, IndVar, Cmp, Cond, PrevInsts, j,
2980 MaxIter);
2981
2982 // Record the value of the first trip count, which is used to determine if
2983 // branches and blocks can be removed for constant trip counts.
2984 if (LCMin == UINT_MAX)
2985 LCMin = LC;
2986
2987 unsigned numAdded = 0;
2988 if (TargetRegisterInfo::isVirtualRegister(LC)) {
2989 Prolog->addSuccessor(Epilog);
2990 numAdded = TII->InsertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc());
2991 } else if (j >= LCMin) {
2992 Prolog->addSuccessor(Epilog);
2993 Prolog->removeSuccessor(LastPro);
2994 LastEpi->removeSuccessor(Epilog);
2995 numAdded = TII->InsertBranch(*Prolog, Epilog, 0, Cond, DebugLoc());
2996 removePhis(Epilog, LastEpi);
2997 // Remove the blocks that are no longer referenced.
2998 if (LastPro != LastEpi) {
2999 LastEpi->clear();
3000 LastEpi->eraseFromParent();
3001 }
3002 LastPro->clear();
3003 LastPro->eraseFromParent();
3004 } else {
3005 numAdded = TII->InsertBranch(*Prolog, LastPro, 0, Cond, DebugLoc());
3006 removePhis(Epilog, Prolog);
3007 }
3008 LastPro = Prolog;
3009 LastEpi = Epilog;
3010 for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(),
3011 E = Prolog->instr_rend();
3012 I != E && numAdded > 0; ++I, --numAdded)
3013 updateInstruction(&*I, false, j, 0, Schedule, VRMap);
3014 }
3015 }
3016
3017 /// Return true if we can compute the amount the instruction changes
3018 /// during each iteration. Set Delta to the amount of the change.
3019 bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
3020 const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3021 unsigned BaseReg;
3022 int64_t Offset;
3023 if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI))
3024 return false;
3025
3026 MachineRegisterInfo &MRI = MF.getRegInfo();
3027 // Check if there is a Phi. If so, get the definition in the loop.
3028 MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
3029 if (BaseDef && BaseDef->isPHI()) {
3030 BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
3031 BaseDef = MRI.getVRegDef(BaseReg);
3032 }
3033 if (!BaseDef)
3034 return false;
3035
3036 int D = 0;
3037 if (!TII->getIncrementValue(BaseDef, D) && D >= 0)
3038 return false;
3039
3040 Delta = D;
3041 return true;
3042 }
3043
3044 /// Update the memory operand with a new offset when the pipeliner
3045 /// generate a new copy of the instruction that refers to a
3046 /// different memory location.
3047 void SwingSchedulerDAG::updateMemOperands(MachineInstr &NewMI,
3048 MachineInstr &OldMI, unsigned Num) {
3049 if (Num == 0)
3050 return;
3051 // If the instruction has memory operands, then adjust the offset
3052 // when the instruction appears in different stages.
3053 unsigned NumRefs = NewMI.memoperands_end() - NewMI.memoperands_begin();
3054 if (NumRefs == 0)
3055 return;
3056 MachineInstr::mmo_iterator NewMemRefs = MF.allocateMemRefsArray(NumRefs);
3057 unsigned Refs = 0;
3058 for (MachineInstr::mmo_iterator I = NewMI.memoperands_begin(),
3059 E = NewMI.memoperands_end();
3060 I != E; ++I) {
3061 if ((*I)->isVolatile() || (*I)->isInvariant() || (!(*I)->getValue())) {
3062 NewMemRefs[Refs++] = *I;
3063 continue;
3064 }
3065 unsigned Delta;
3066 if (computeDelta(OldMI, Delta)) {
3067 int64_t AdjOffset = Delta * Num;
3068 NewMemRefs[Refs++] =
3069 MF.getMachineMemOperand(*I, AdjOffset, (*I)->getSize());
3070 } else
3071 NewMemRefs[Refs++] = MF.getMachineMemOperand(*I, 0, UINT64_MAX);
3072 }
3073 NewMI.setMemRefs(NewMemRefs, NewMemRefs + NumRefs);
3074 }
3075
3076 /// Clone the instruction for the new pipelined loop and update the
3077 /// memory operands, if needed.
3078 MachineInstr *SwingSchedulerDAG::cloneInstr(MachineInstr *OldMI,
3079 unsigned CurStageNum,
3080 unsigned InstStageNum) {
3081 MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
3082 // Check for tied operands in inline asm instructions. This should be handled
3083 // elsewhere, but I'm not sure of the best solution.
3084 if (OldMI->isInlineAsm())
3085 for (unsigned i = 0, e = OldMI->getNumOperands(); i != e; ++i) {
3086 const auto &MO = OldMI->getOperand(i);
3087 if (MO.isReg() && MO.isUse())
3088 break;
3089 unsigned UseIdx;
3090 if (OldMI->isRegTiedToUseOperand(i, &UseIdx))
3091 NewMI->tieOperands(i, UseIdx);
3092 }
3093 updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
3094 return NewMI;
3095 }
3096
3097 /// Clone the instruction for the new pipelined loop. If needed, this
3098 /// function updates the instruction using the values saved in the
3099 /// InstrChanges structure.
3100 MachineInstr *SwingSchedulerDAG::cloneAndChangeInstr(MachineInstr *OldMI,
3101 unsigned CurStageNum,
3102 unsigned InstStageNum,
3103 SMSchedule &Schedule) {
3104 MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
3105 DenseMap>::iterator It =
3106 InstrChanges.find(getSUnit(OldMI));
3107 if (It != InstrChanges.end()) {
3108 std::pair RegAndOffset = It->second;
3109 unsigned BasePos, OffsetPos;
3110 if (!TII->getBaseAndOffsetPosition(OldMI, BasePos, OffsetPos))
3111 return nullptr;
3112 int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm();
3113 MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first);
3114 if (Schedule.stageScheduled(getSUnit(LoopDef)) > (signed)InstStageNum)
3115 NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
3116 NewMI->getOperand(OffsetPos).setImm(NewOffset);
3117 }
3118 updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
3119 return NewMI;
3120 }
3121
3122 /// Update the machine instruction with new virtual registers. This
3123 /// function may change the defintions and/or uses.
3124 void SwingSchedulerDAG::updateInstruction(MachineInstr *NewMI, bool LastDef,
3125 unsigned CurStageNum,
3126 unsigned InstrStageNum,
3127 SMSchedule &Schedule,
3128 ValueMapTy *VRMap) {
3129 for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
3130 MachineOperand &MO = NewMI->getOperand(i);
3131 if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
3132 continue;
3133 unsigned reg = MO.getReg();
3134 if (MO.isDef()) {
3135 // Create a new virtual register for the definition.
3136 const TargetRegisterClass *RC = MRI.getRegClass(reg);
3137 unsigned NewReg = MRI.createVirtualRegister(RC);
3138 MO.setReg(NewReg);
3139 VRMap[CurStageNum][reg] = NewReg;
3140 if (LastDef)
3141 replaceRegUsesAfterLoop(reg, NewReg, BB, MRI, LIS);
3142 } else if (MO.isUse()) {
3143 MachineInstr *Def = MRI.getVRegDef(reg);
3144 // Compute the stage that contains the last definition for instruction.
3145 int DefStageNum = Schedule.stageScheduled(getSUnit(Def));
3146 unsigned StageNum = CurStageNum;
3147 if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) {
3148 // Compute the difference in stages between the defintion and the use.
3149 unsigned StageDiff = (InstrStageNum - DefStageNum);
3150 // Make an adjustment to get the last definition.
3151 StageNum -= StageDiff;
3152 }
3153 if (VRMap[StageNum].count(reg))
3154 MO.setReg(VRMap[StageNum][reg]);
3155 }
3156 }
3157 }
3158
3159 /// Return the instruction in the loop that defines the register.
3160 /// If the definition is a Phi, then follow the Phi operand to
3161 /// the instruction in the loop.
3162 MachineInstr *SwingSchedulerDAG::findDefInLoop(unsigned Reg) {
3163 SmallPtrSet Visited;
3164 MachineInstr *Def = MRI.getVRegDef(Reg);
3165 while (Def->isPHI()) {
3166 if (!Visited.insert(Def).second)
3167 break;
3168 for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
3169 if (Def->getOperand(i + 1).getMBB() == BB) {
3170 Def = MRI.getVRegDef(Def->getOperand(i).getReg());
3171 break;
3172 }
3173 }
3174 return Def;
3175 }
3176
3177 /// Return the new name for the value from the previous stage.
3178 unsigned SwingSchedulerDAG::getPrevMapVal(unsigned StageNum, unsigned PhiStage,
3179 unsigned LoopVal, unsigned LoopStage,
3180 ValueMapTy *VRMap,
3181 MachineBasicBlock *BB) {
3182 unsigned PrevVal = 0;
3183 if (StageNum > PhiStage) {
3184 MachineInstr *LoopInst = MRI.getVRegDef(LoopVal);
3185 if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal))
3186 // The name is defined in the previous stage.
3187 PrevVal = VRMap[StageNum - 1][LoopVal];
3188 else if (VRMap[StageNum].count(LoopVal))
3189 // The previous name is defined in the current stage when the instruction
3190 // order is swapped.
3191 PrevVal = VRMap[StageNum][LoopVal];
3192 else if (!LoopInst->isPHI())
3193 // The loop value hasn't yet been scheduled.
3194 PrevVal = LoopVal;
3195 else if (StageNum == PhiStage + 1)
3196 // The loop value is another phi, which has not been scheduled.
3197 PrevVal = getInitPhiReg(*LoopInst, BB);
3198 else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB)
3199 // The loop value is another phi, which has been scheduled.
3200 PrevVal =
3201 getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB),
3202 LoopStage, VRMap, BB);
3203 }
3204 return PrevVal;
3205 }
3206
3207 /// Rewrite the Phi values in the specified block to use the mappings
3208 /// from the initial operand. Once the Phi is scheduled, we switch
3209 /// to using the loop value instead of the Phi value, so those names
3210 /// do not need to be rewritten.
3211 void SwingSchedulerDAG::rewritePhiValues(MachineBasicBlock *NewBB,
3212 unsigned StageNum,
3213 SMSchedule &Schedule,
3214 ValueMapTy *VRMap,
3215 InstrMapTy &InstrMap) {
3216 for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
3217 BBE = BB->getFirstNonPHI();
3218 BBI != BBE; ++BBI) {
3219 unsigned InitVal = 0;
3220 unsigned LoopVal = 0;
3221 getPhiRegs(*BBI, BB, InitVal, LoopVal);
3222 unsigned PhiDef = BBI->getOperand(0).getReg();
3223
3224 unsigned PhiStage =
3225 (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(PhiDef)));
3226 unsigned LoopStage =
3227 (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
3228 unsigned NumPhis = Schedule.getStagesForPhi(PhiDef);
3229 if (NumPhis > StageNum)
3230 NumPhis = StageNum;
3231 for (unsigned np = 0; np <= NumPhis; ++np) {
3232 unsigned NewVal =
3233 getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
3234 if (!NewVal)
3235 NewVal = InitVal;
3236 rewriteScheduledInstr(NewBB, Schedule, InstrMap, StageNum - np, np, &*BBI,
3237 PhiDef, NewVal);
3238 }
3239 }
3240 }
3241
3242 /// Rewrite a previously scheduled instruction to use the register value
3243 /// from the new instruction. Make sure the instruction occurs in the
3244 /// basic block, and we don't change the uses in the new instruction.
3245 void SwingSchedulerDAG::rewriteScheduledInstr(
3246 MachineBasicBlock *BB, SMSchedule &Schedule, InstrMapTy &InstrMap,
3247 unsigned CurStageNum, unsigned PhiNum, MachineInstr *Phi, unsigned OldReg,
3248 unsigned NewReg, unsigned PrevReg) {
3249 bool InProlog = (CurStageNum < Schedule.getMaxStageCount());
3250 int StagePhi = Schedule.stageScheduled(getSUnit(Phi)) + PhiNum;
3251 // Rewrite uses that have been scheduled already to use the new
3252 // Phi register.
3253 for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(OldReg),
3254 EI = MRI.use_end();
3255 UI != EI;) {
3256 MachineOperand &UseOp = *UI;
3257 MachineInstr *UseMI = UseOp.getParent();
3258 ++UI;
3259 if (UseMI->getParent() != BB)
3260 continue;
3261 if (UseMI->isPHI()) {
3262 if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg)
3263 continue;
3264 if (getLoopPhiReg(*UseMI, BB) != OldReg)
3265 continue;
3266 }
3267 InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI);
3268 assert(OrigInstr != InstrMap.end() && "Instruction not scheduled.");
3269 SUnit *OrigMISU = getSUnit(OrigInstr->second);
3270 int StageSched = Schedule.stageScheduled(OrigMISU);
3271 int CycleSched = Schedule.cycleScheduled(OrigMISU);
3272 unsigned ReplaceReg = 0;
3273 // This is the stage for the scheduled instruction.
3274 if (StagePhi == StageSched && Phi->isPHI()) {
3275 int CyclePhi = Schedule.cycleScheduled(getSUnit(Phi));
3276 if (PrevReg && InProlog)
3277 ReplaceReg = PrevReg;
3278 else if (PrevReg && !Schedule.isLoopCarried(this, *Phi) &&
3279 (CyclePhi <= CycleSched || OrigMISU->getInstr()->isPHI()))
3280 ReplaceReg = PrevReg;
3281 else
3282 ReplaceReg = NewReg;
3283 }
3284 // The scheduled instruction occurs before the scheduled Phi, and the
3285 // Phi is not loop carried.
3286 if (!InProlog && StagePhi + 1 == StageSched &&
3287 !Schedule.isLoopCarried(this, *Phi))
3288 ReplaceReg = NewReg;
3289 if (StagePhi > StageSched && Phi->isPHI())
3290 ReplaceReg = NewReg;
3291 if (!InProlog && !Phi->isPHI() && StagePhi < StageSched)
3292 ReplaceReg = NewReg;
3293 if (ReplaceReg) {
3294 MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg));
3295 UseOp.setReg(ReplaceReg);
3296 }
3297 }
3298 }
3299
3300 /// Check if we can change the instruction to use an offset value from the
3301 /// previous iteration. If so, return true and set the base and offset values
3302 /// so that we can rewrite the load, if necessary.
3303 /// v1 = Phi(v0, v3)
3304 /// v2 = load v1, 0
3305 /// v3 = post_store v1, 4, x
3306 /// This function enables the load to be rewritten as v2 = load v3, 4.
3307 bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
3308 unsigned &BasePos,
3309 unsigned &OffsetPos,
3310 unsigned &NewBase,
3311 int64_t &Offset) {
3312 // Get the load instruction.
3313 if (TII->isPostIncrement(MI))
3314 return false;
3315 unsigned BasePosLd, OffsetPosLd;
3316 if (!TII->getBaseAndOffsetPosition(MI, BasePosLd, OffsetPosLd))
3317 return false;
3318 unsigned BaseReg = MI->getOperand(BasePosLd).getReg();
3319
3320 // Look for the Phi instruction.
3321 MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
3322 MachineInstr *Phi = MRI.getVRegDef(BaseReg);
3323 if (!Phi || !Phi->isPHI())
3324 return false;
3325 // Get the register defined in the loop block.
3326 unsigned PrevReg = getLoopPhiReg(*Phi, MI->getParent());
3327 if (!PrevReg)
3328 return false;
3329
3330 // Check for the post-increment load/store instruction.
3331 MachineInstr *PrevDef = MRI.getVRegDef(PrevReg);
3332 if (!PrevDef || PrevDef == MI)
3333 return false;
3334
3335 if (!TII->isPostIncrement(PrevDef))
3336