llvm.org GIT mirror llvm / 2a3b42c
[ExecutionDepsFix] Improve clearance calculation for loops Summary: In revision rL278321, ExecutionDepsFix learned how to pick a better register for undef register reads, e.g. for instructions such as `vcvtsi2sdq`. While this revision improved performance on a good number of our benchmarks, it unfortunately also caused significant regressions (up to 3x) on others. This regression turned out to be caused by loops such as: PH -> A -> B (xmm<Undef> -> xmm<Def>) -> C -> D -> EXIT ^ | +----------------------------------+ In the previous version of the clearance calculation, we would visit the blocks in order, remembering for each whether there were any incoming backedges from blocks that we hadn't processed yet and if so queuing up the block to be re-processed. However, for loop structures such as the above, this is clearly insufficient, since the block B does not have any unknown backedges, so we do not see the false dependency from the previous iteration's Def of xmm registers in B. To fix this, we need to consider all blocks that are part of the loop and reprocess them once the correct clearance values are known. As an optimization, we also want to avoid reprocessing any later blocks that are not part of the loop. In summary, the iteration order is as follows: Before: PH A B C D A' Corrected (Naive): PH A B C D A' B' C' D' Corrected (w/ optimization): PH A B C A' B' C' D To facilitate this optimization we introduce two new counters for each basic block. The first counts how many of its predecessors have completed primary processing. The second counts how many of its predecessors have completed all processing (we will call such a block *done*). Now, the criteria to reprocess a block are as follows: - All Predecessors have completed primary processing - For x the number of predecessors that have completed primary processing *at the time of primary processing of this block*, the number of predecessors that are done has reached x.
The intuition behind this criterion is as follows: We need to perform primary processing on all predecessors in order to find out any direct defs in those predecessors. When predecessors are done, we also know that we have information about indirect defs (e.g. in block B though that were inherited through B->C->A->B). However, we can't wait for all predecessors to be done, since that would cause cyclic dependencies. However, it is guaranteed that all those predecessors that are prior to us in reverse postorder will be done before us. Since we iterate over the basic blocks in reverse postorder, the number x above is precisely the count of the number of predecessors prior to us in reverse postorder. Reviewers: myatsina Differential Revision: https://reviews.llvm.org/D28759 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@293571 91177308-0d34-0410-b5e6-96231b3b80d8 Keno Fischer 3 years ago
2 changed file(s) with 242 addition(s) and 89 deletion(s). Raw diff Collapse all Expand all
141141 std::vector> AliasMap;
142142 const unsigned NumRegs;
143143 LiveReg *LiveRegs;
144 typedef DenseMap LiveOutMap;
145 LiveOutMap LiveOuts;
144 struct MBBInfo {
145 // Keeps clearance and domain information for all registers. Note that this
146 // is different from the usual definition notion of liveness. The CPU
147 // doesn't care whether or not we consider a register killed.
148 LiveReg *OutRegs;
149
150 // Whether we have gotten to this block in primary processing yet.
151 bool PrimaryCompleted;
152
153 // The number of predecessors for which primary processing has completed
154 unsigned IncomingProcessed;
155
156 // The value of `IncomingProcessed` at the start of primary processing
157 unsigned PrimaryIncoming;
158
159 // The number of predecessors for which all processing steps are done.
160 unsigned IncomingCompleted;
161
162 MBBInfo()
163 : OutRegs(nullptr), PrimaryCompleted(false), IncomingProcessed(0),
164 PrimaryIncoming(0), IncomingCompleted(0) {}
165 };
166 typedef DenseMap MBBInfoMap;
167 MBBInfoMap MBBInfos;
146168
147169 /// List of undefined register reads in this block in forward order.
148170 std::vector > UndefReads;
153175 /// Current instruction number.
154176 /// The first instruction in each basic block is 0.
155177 int CurInstr;
156
157 /// True when the current block has a predecessor that hasn't been visited
158 /// yet.
159 bool SeenUnknownBackEdge;
160
161178 public:
162179 ExeDepsFix(const TargetRegisterClass *rc)
163180 : MachineFunctionPass(ID), RC(rc), NumRegs(RC->getNumRegs()) {}
179196 private:
180197 iterator_range::const_iterator>
181198 regIndices(unsigned Reg) const;
182
183199 // DomainValue allocation.
184200 DomainValue *alloc(int domain = -1);
185201 DomainValue *retain(DomainValue *DV) {
198214
199215 void enterBasicBlock(MachineBasicBlock*);
200216 void leaveBasicBlock(MachineBasicBlock*);
201 void visitInstr(MachineInstr*);
202 void processDefs(MachineInstr*, bool Kill);
217 bool isBlockDone(MachineBasicBlock *);
218 void processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass);
219 void updateSuccessors(MachineBasicBlock *MBB, bool PrimaryPass);
220 bool visitInstr(MachineInstr *);
221 void processDefs(MachineInstr *, bool breakDependency, bool Kill);
203222 void visitSoftInstr(MachineInstr*, unsigned mask);
204223 void visitHardInstr(MachineInstr*, unsigned domain);
205224 void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
359378
360379 /// Set up LiveRegs by merging predecessor live-out values.
361380 void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {
362 // Detect back-edges from predecessors we haven't processed yet.
363 SeenUnknownBackEdge = false;
364
365381 // Reset instruction counter in each basic block.
366382 CurInstr = 0;
367383
396412 // Try to coalesce live-out registers from predecessors.
397413 for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(),
398414 pe = MBB->pred_end(); pi != pe; ++pi) {
399 LiveOutMap::const_iterator fi = LiveOuts.find(*pi);
400 if (fi == LiveOuts.end()) {
401 SeenUnknownBackEdge = true;
402 continue;
403 }
404 assert(fi->second && "Can't have NULL entries");
415 auto fi = MBBInfos.find(*pi);
416 assert(fi != MBBInfos.end() &&
417 "Should have pre-allocated MBBInfos for all MBBs");
418 LiveReg *Incoming = fi->second.OutRegs;
419 // Incoming is null if this is a backedge from a BB
420 // we haven't processed yet
421 if (Incoming == nullptr) {
422 continue;
423 }
405424
406425 for (unsigned rx = 0; rx != NumRegs; ++rx) {
407426 // Use the most recent predecessor def for each register.
408 LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, fi->second[rx].Def);
409
410 DomainValue *pdv = resolve(fi->second[rx].Value);
427 LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, Incoming[rx].Def);
428
429 DomainValue *pdv = resolve(Incoming[rx].Value);
411430 if (!pdv)
412431 continue;
413432 if (!LiveRegs[rx].Value) {
431450 force(rx, pdv->getFirstDomain());
432451 }
433452 }
434 DEBUG(dbgs() << "BB#" << MBB->getNumber()
435 << (SeenUnknownBackEdge ? ": incomplete\n" : ": all preds known\n"));
453 DEBUG(
454 dbgs() << "BB#" << MBB->getNumber()
455 << (!isBlockDone(MBB) ? ": incomplete\n" : ": all preds known\n"));
436456 }
437457
438458 void ExeDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) {
439459 assert(LiveRegs && "Must enter basic block first.");
440 // Save live registers at end of MBB - used by enterBasicBlock().
441 // Also use LiveOuts as a visited set to detect back-edges.
442 bool First = LiveOuts.insert(std::make_pair(MBB, LiveRegs)).second;
443
444 if (First) {
445 // LiveRegs was inserted in LiveOuts. Adjust all defs to be relative to
446 // the end of this block instead of the beginning.
447 for (unsigned i = 0, e = NumRegs; i != e; ++i)
448 LiveRegs[i].Def -= CurInstr;
449 } else {
450 // Insertion failed, this must be the second pass.
460 LiveReg *OldOutRegs = MBBInfos[MBB].OutRegs;
461 // Save register clearances at end of MBB - used by enterBasicBlock().
462 MBBInfos[MBB].OutRegs = LiveRegs;
463
464 // While processing the basic block, we kept `Def` relative to the start
465 // of the basic block for convenience. However, future use of this information
466 // only cares about the clearance from the end of the block, so adjust
467 // everything to be relative to the end of the basic block.
468 for (unsigned i = 0, e = NumRegs; i != e; ++i)
469 LiveRegs[i].Def -= CurInstr;
470 if (OldOutRegs) {
471 // This must be the second pass.
451472 // Release all the DomainValues instead of keeping them.
452473 for (unsigned i = 0, e = NumRegs; i != e; ++i)
453 release(LiveRegs[i].Value);
454 delete[] LiveRegs;
474 release(OldOutRegs[i].Value);
475 delete[] OldOutRegs;
455476 }
456477 LiveRegs = nullptr;
457478 }
458479
459 void ExeDepsFix::visitInstr(MachineInstr *MI) {
460 if (MI->isDebugValue())
461 return;
462
480 bool ExeDepsFix::visitInstr(MachineInstr *MI) {
463481 // Update instructions with explicit execution domains.
464482 std::pair DomP = TII->getExecutionDomain(*MI);
465483 if (DomP.first) {
469487 visitHardInstr(MI, DomP.first);
470488 }
471489
472 // Process defs to track register ages, and kill values clobbered by generic
473 // instructions.
474 processDefs(MI, !DomP.first);
490 return !DomP.first;
475491 }
476492
477493 /// \brief Helps avoid false dependencies on undef registers by updating the
541557 DEBUG(dbgs() << ": Break dependency.\n");
542558 continue;
543559 }
544 // The current clearance seems OK, but we may be ignoring a def from a
545 // back-edge.
546 if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) {
547 DEBUG(dbgs() << ": OK .\n");
548 return false;
549 }
550 // A def from an unprocessed back-edge may make us break this dependency.
551 DEBUG(dbgs() << ": Wait for back-edge to resolve.\n");
560 DEBUG(dbgs() << ": OK .\n");
552561 return false;
553562 }
554563 return true;
558567 // If Kill is set, also kill off DomainValues clobbered by the defs.
559568 //
560569 // Also break dependencies on partial defs and undef uses.
561 void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
570 void ExeDepsFix::processDefs(MachineInstr *MI, bool breakDependency,
571 bool Kill) {
562572 assert(!MI->isDebugValue() && "Won't process debug values");
563573
564574 // Break dependence on undef uses. Do this before updating LiveRegs below.
565575 unsigned OpNum;
566 unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI);
567 if (Pref) {
568 pickBestRegisterForUndef(MI, OpNum, Pref);
569 if (shouldBreakDependence(MI, OpNum, Pref))
570 UndefReads.push_back(std::make_pair(MI, OpNum));
576 if (breakDependency) {
577 unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI);
578 if (Pref) {
579 pickBestRegisterForUndef(MI, OpNum, Pref);
580 if (shouldBreakDependence(MI, OpNum, Pref))
581 UndefReads.push_back(std::make_pair(MI, OpNum));
582 }
571583 }
572584 const MCInstrDesc &MCID = MI->getDesc();
573585 for (unsigned i = 0,
583595 DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr
584596 << '\t' << *MI);
585597
586 // Check clearance before partial register updates.
587 // Call breakDependence before setting LiveRegs[rx].Def.
588 unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI);
589 if (Pref && shouldBreakDependence(MI, i, Pref))
590 TII->breakPartialRegDependency(*MI, i, TRI);
598 if (breakDependency) {
599 // Check clearance before partial register updates.
600 // Call breakDependence before setting LiveRegs[rx].Def.
601 unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI);
602 if (Pref && shouldBreakDependence(MI, i, Pref))
603 TII->breakPartialRegDependency(*MI, i, TRI);
604 }
591605
592606 // How many instructions since rx was last written?
593607 LiveRegs[rx].Def = CurInstr;
779793 }
780794 }
781795
796 void ExeDepsFix::processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass) {
797 enterBasicBlock(MBB);
798 // If this block is not done, it makes little sense to make any decisions
799 // based on clearance information. We need to make a second pass anyway,
800 // and by then we'll have better information, so we can avoid doing the work
801 // to try and break dependencies now.
802 bool breakDependency = isBlockDone(MBB);
803 for (MachineInstr &MI : *MBB) {
804 if (!MI.isDebugValue()) {
805 bool Kill = false;
806 if (PrimaryPass)
807 Kill = visitInstr(&MI);
808 processDefs(&MI, breakDependency, Kill);
809 }
810 }
811 if (breakDependency)
812 processUndefReads(MBB);
813 leaveBasicBlock(MBB);
814 }
815
816 bool ExeDepsFix::isBlockDone(MachineBasicBlock *MBB) {
817 return MBBInfos[MBB].PrimaryCompleted &&
818 MBBInfos[MBB].IncomingCompleted == MBBInfos[MBB].PrimaryIncoming &&
819 MBBInfos[MBB].IncomingProcessed == MBB->pred_size();
820 }
821
822 void ExeDepsFix::updateSuccessors(MachineBasicBlock *MBB, bool Primary) {
823 bool Done = isBlockDone(MBB);
824 for (auto *Succ : MBB->successors()) {
825 if (!isBlockDone(Succ)) {
826 if (Primary) {
827 MBBInfos[Succ].IncomingProcessed++;
828 }
829 if (Done) {
830 MBBInfos[Succ].IncomingCompleted++;
831 }
832 if (isBlockDone(Succ)) {
833 // Perform secondary processing for this successor. See the big comment
834 // in runOnMachineFunction, for an explanation of the iteration order.
835 processBasicBlock(Succ, false);
836 updateSuccessors(Succ, false);
837 }
838 }
839 }
840 }
841
782842 bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
783843 if (skipFunction(*mf.getFunction()))
784844 return false;
815875 AliasMap[*AI].push_back(i);
816876 }
817877
878 // Initialize the MBBInfos
879 for (auto &MBB : mf) {
880 MBBInfo InitialInfo;
881 MBBInfos.insert(std::make_pair(&MBB, InitialInfo));
882 }
883
884 /*
885 * We want to visit every instruction in every basic block in order to update
886 * its execution domain or break any false dependencies. However, for the
887 * dependency breaking, we need to know clearances from all predecessors
888 * (including any backedges). One way to do so would be to do two complete
889 * passes over all basic blocks/instructions, the first for recording
890 * clearances, the second to break the dependencies. However, for functions
891 * without backedges, or functions with a lot of straight-line code, and
892 * a small loop, that would be a lot of unnecessary work (since only the
893 * BBs that are part of the loop require two passes). As an example,
894 * consider the following loop.
895 *
896 *
897 * PH -> A -> B (xmm<Undef> -> xmm<Def>) -> C -> D -> EXIT
898 * ^ |
899 * +----------------------------------+
900 *
901 * The iteration order is as follows:
902 * Naive: PH A B C D A' B' C' D'
903 * Optimized: PH A B C A' B' C' D
904 *
905 * Note that we avoid processing D twice, because we can entirely process
906 * the predecessors before getting to D. We call a block that is ready
907 * for its second round of processing `done` (isBlockDone). Once we finish
908 * processing some block, we update the counters in MBBInfos and re-process
909 * any successors that are now done.
910 */
911
818912 MachineBasicBlock *Entry = &*MF->begin();
819913 ReversePostOrderTraversal RPOT(Entry);
820 SmallVector Loops;
821914 for (ReversePostOrderTraversal::rpo_iterator
822915 MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) {
823916 MachineBasicBlock *MBB = *MBBI;
824 enterBasicBlock(MBB);
825 if (SeenUnknownBackEdge)
826 Loops.push_back(MBB);
827 for (MachineInstr &MI : *MBB)
828 visitInstr(&MI);
829 processUndefReads(MBB);
830 leaveBasicBlock(MBB);
831 }
832
833 // Visit all the loop blocks again in order to merge DomainValues from
834 // back-edges.
835 for (MachineBasicBlock *MBB : Loops) {
836 enterBasicBlock(MBB);
837 for (MachineInstr &MI : *MBB)
838 if (!MI.isDebugValue())
839 processDefs(&MI, false);
840 processUndefReads(MBB);
841 leaveBasicBlock(MBB);
917 // N.B: IncomingProcessed and IncomingCompleted were already updated while
918 // processing this block's predecessors.
919 MBBInfos[MBB].PrimaryCompleted = true;
920 MBBInfos[MBB].PrimaryIncoming = MBBInfos[MBB].IncomingProcessed;
921 processBasicBlock(MBB, true);
922 updateSuccessors(MBB, true);
923 }
924
925 // We need to go through again and finalize any blocks that are not done yet.
926 // This is possible if blocks have dead predecessors, so we didn't visit them
927 // above.
928 for (ReversePostOrderTraversal::rpo_iterator
929 MBBI = RPOT.begin(),
930 MBBE = RPOT.end();
931 MBBI != MBBE; ++MBBI) {
932 MachineBasicBlock *MBB = *MBBI;
933 if (!isBlockDone(MBB)) {
934 processBasicBlock(MBB, false);
935 // Don't update successors here. We'll get to them anyway through this
936 // loop.
937 }
842938 }
843939
844940 // Clear the LiveOuts vectors and collapse any remaining DomainValues.
845941 for (ReversePostOrderTraversal::rpo_iterator
846942 MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) {
847 LiveOutMap::const_iterator FI = LiveOuts.find(*MBBI);
848 if (FI == LiveOuts.end() || !FI->second)
943 auto FI = MBBInfos.find(*MBBI);
944 if (FI == MBBInfos.end() || !FI->second.OutRegs)
849945 continue;
850946 for (unsigned i = 0, e = NumRegs; i != e; ++i)
851 if (FI->second[i].Value)
852 release(FI->second[i].Value);
853 delete[] FI->second;
854 }
855 LiveOuts.clear();
947 if (FI->second.OutRegs[i].Value)
948 release(FI->second.OutRegs[i].Value);
949 delete[] FI->second.OutRegs;
950 }
951 MBBInfos.clear();
856952 UndefReads.clear();
857953 Avail.clear();
858954 Allocator.DestroyAll();
276276 ;AVX: vcvtsi2sdq {{.*}}, [[XMM4_7:%xmm[4-7]]], {{%xmm[0-9]+}}
277277 ;AVX-NOT: [[XMM4_7]]
278278 }
279
280 ; Make sure we are making a smart choice regarding undef registers even for more
281 ; complicated loop structures. This example is the inner loop from
282 ; julia> a = falses(10000); a[1:4:end] = true
283 ; julia> linspace(1.0,2.0,10000)[a]
284 define void @loopclearance2(double* nocapture %y, i64* %x, double %c1, double %c2, double %c3, double %c4, i64 %size) {
285 entry:
286 tail call void asm sideeffect "", "~{xmm7},~{dirflag},~{fpsr},~{flags}"()
287 tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
288 tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
289 br label %loop
290
291 loop:
292 %phi_i = phi i64 [ 1, %entry ], [ %nexti, %loop_end ]
293 %phi_j = phi i64 [ 1, %entry ], [ %nextj, %loop_end ]
294 %phi_k = phi i64 [ 0, %entry ], [ %nextk, %loop_end ]
295 br label %inner_loop
296
297 inner_loop:
298 %phi = phi i64 [ %phi_k, %loop ], [ %nextk, %inner_loop ]
299 %idx = lshr i64 %phi, 6
300 %inputptr = getelementptr i64, i64* %x, i64 %idx
301 %input = load i64, i64* %inputptr, align 8
302 %masked = and i64 %phi, 63
303 %shiftedmasked = shl i64 1, %masked
304 %maskedinput = and i64 %input, %shiftedmasked
305 %cmp = icmp eq i64 %maskedinput, 0
306 %nextk = add i64 %phi, 1
307 br i1 %cmp, label %inner_loop, label %loop_end
308
309 loop_end:
310 %nexti = add i64 %phi_i, 1
311 %nextj = add i64 %phi_j, 1
312 ; Register use, plus us clobbering 7-15 above, basically forces xmm6 here as
313 ; the only reasonable choice. The primary thing we care about is that it's
314 ; not one of the registers used in the loop (e.g. not the output reg here)
315 ;AVX-NOT: %xmm6
316 ;AVX: vcvtsi2sdq {{.*}}, %xmm6, {{%xmm[0-9]+}}
317 ;AVX-NOT: %xmm6
318 %nexti_f = sitofp i64 %nexti to double
319 %sub = fsub double %c1, %nexti_f
320 %mul = fmul double %sub, %c2
321 ;AVX: vcvtsi2sdq {{.*}}, %xmm6, {{%xmm[0-9]+}}
322 ;AVX-NOT: %xmm6
323 %phi_f = sitofp i64 %phi to double
324 %mul2 = fmul double %phi_f, %c3
325 %add2 = fadd double %mul, %mul2
326 %div = fdiv double %add2, %c4
327 %prev_j = add i64 %phi_j, -1
328 %outptr = getelementptr double, double* %y, i64 %prev_j
329 store double %div, double* %outptr, align 8
330 %done = icmp slt i64 %size, %nexti
331 br i1 %done, label %loopdone, label %loop
332
333 loopdone:
334 ret void
335 }