llvm.org GIT mirror llvm / 5818a51
CodeGen: Allow small copyable blocks to "break" the CFG. When choosing the best successor for a block, ordinarily we would have preferred a block that preserves the CFG unless there is a strong probability in the other direction. For small blocks that can be duplicated we now skip that requirement as well, subject to some simple frequency calculations. Differential Revision: https://reviews.llvm.org/D28583 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@293716 91177308-0d34-0410-b5e6-96231b3b80d8 Kyle Butt 3 years ago
28 changed file(s) with 598 addition(s) and 194 deletion(s). Raw diff Collapse all Expand all
498498 }
499499
500500 void BranchFolder::MBFIWrapper::view(bool isSimple) { MBFI.view(isSimple); } // Forwards to the wrapped MBFI's view().
501
502 uint64_t
503 BranchFolder::MBFIWrapper::getEntryFreq() const {
504 return MBFI.getEntryFreq(); // Forward the entry frequency reported by the wrapped MachineBlockFrequencyInfo.
505 }
501506
502507 /// CountTerminators - Count the number of terminators in the given
503508 /// block and set I to the position of the first non-terminator, if there
122122 raw_ostream &printBlockFreq(raw_ostream &OS,
123123 const BlockFrequency Freq) const;
124124 void view(bool isSimple = true);
125 uint64_t getEntryFreq() const;
125126
126127 private:
127128 const MachineBlockFrequencyInfo &MBFI;
4040 #include "llvm/CodeGen/MachineFunctionPass.h"
4141 #include "llvm/CodeGen/MachineLoopInfo.h"
4242 #include "llvm/CodeGen/MachineModuleInfo.h"
43 #include "llvm/CodeGen/MachinePostDominators.h"
4344 #include "llvm/CodeGen/TailDuplicator.h"
4445 #include "llvm/Support/Allocator.h"
4546 #include "llvm/Support/CommandLine.h"
4950 #include "llvm/Target/TargetLowering.h"
5051 #include "llvm/Target/TargetSubtargetInfo.h"
5152 #include
53 #include
54 #include
5255 using namespace llvm;
5356
5457 #define DEBUG_TYPE "block-placement"
136139 cl::init(true), cl::Hidden);
137140
138141 // Heuristic for tail duplication.
139 static cl::opt TailDuplicatePlacementThreshold(
142 static cl::opt TailDupPlacementThreshold(
140143 "tail-dup-placement-threshold",
141144 cl::desc("Instruction cutoff for tail duplication during layout. "
142145 "Tail merging during layout is forced to have a threshold "
143146 "that won't conflict."), cl::init(2),
147 cl::Hidden);
148
149 // Heuristic for tail duplication.
150 static cl::opt TailDupPlacementPenalty(
151 "tail-dup-placement-penalty",
152 cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
153 "Copying can increase fallthrough, but it also increases icache "
154 "pressure. This parameter controls the penalty to account for that. "
155 "Percent as integer."),
156 cl::init(2),
144157 cl::Hidden);
145158
146159 extern cl::opt StaticLikelyProb;
271284 /// \brief A typedef for a block filter set.
272285 typedef SmallSetVector BlockFilterSet;
273286
287 /// Pair struct containing basic block and taildup profitability
288 struct BlockAndTailDupResult {
289 MachineBasicBlock * BB; // Selected successor block; null when no successor was chosen.
290 bool ShouldTailDup; // True when BB was selected as a tail-duplication candidate.
291 };
292
274293 /// \brief work lists of blocks that are ready to be laid out
275294 SmallVector BlockWorkList;
276295 SmallVector EHPadWorkList;
298317 /// \brief A handle to the target's lowering info.
299318 const TargetLoweringBase *TLI;
300319
320 /// \brief A handle to the dominator tree.
321 MachineDominatorTree *MDT;
322
301323 /// \brief A handle to the post dominator tree.
302 MachineDominatorTree *MDT;
324 MachinePostDominatorTree *MPDT;
303325
304326 /// \brief Duplicator used to duplicate tails during placement.
305327 ///
373395 BlockChain &SuccChain, BranchProbability SuccProb,
374396 BranchProbability RealSuccProb, BlockChain &Chain,
375397 const BlockFilterSet *BlockFilter);
376 MachineBasicBlock *selectBestSuccessor(MachineBasicBlock *BB,
377 BlockChain &Chain,
378 const BlockFilterSet *BlockFilter);
398 BlockAndTailDupResult selectBestSuccessor(MachineBasicBlock *BB,
399 BlockChain &Chain,
400 const BlockFilterSet *BlockFilter);
379401 MachineBasicBlock *
380402 selectBestCandidateBlock(BlockChain &Chain,
381403 SmallVectorImpl &WorkList);
408430 void buildCFGChains();
409431 void optimizeBranches();
410432 void alignBlocks();
433 bool shouldTailDuplicate(MachineBasicBlock *BB);
434 /// Check the edge frequencies to see if tail duplication will increase
435 /// fallthroughs.
436 bool isProfitableToTailDup(
437 MachineBasicBlock *BB, MachineBasicBlock *Succ,
438 BranchProbability AdjustedSumProb,
439 BlockChain &Chain, const BlockFilterSet *BlockFilter);
440 /// Returns true if a block can tail duplicate into all unplaced
441 /// predecessors. Filters based on loop.
442 bool canTailDuplicateUnplacedPreds(
443 MachineBasicBlock *BB, MachineBasicBlock *Succ,
444 BlockChain &Chain, const BlockFilterSet *BlockFilter);
411445
412446 public:
413447 static char ID; // Pass identification, replacement for typeid
421455 AU.addRequired();
422456 AU.addRequired();
423457 AU.addRequired();
458 if (TailDupPlacement)
459 AU.addRequired();
424460 AU.addRequired();
425461 AU.addRequired();
426462 MachineFunctionPass::getAnalysisUsage(AU);
435471 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
436472 INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
437473 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
474 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
438475 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
439476 INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement",
440477 "Branch Probability Basic Block Placement", false, false)
566603 return SuccProb;
567604 }
568605
606 /// Check if a block should be tail duplicated.
607 /// \p BB Block to check. \returns false when \p BB has exactly one successor; otherwise the answer from TailDup.shouldTailDuplicate.
608 bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
609 // Blocks with single successors don't create additional fallthrough
610 // opportunities. Don't duplicate them. TODO: When conditional exits are
611 // analyzable, allow them to be duplicated.
612 bool IsSimple = TailDup.isSimpleBB(BB);
613
614 if (BB->succ_size() == 1)
615 return false;
616 return TailDup.shouldTailDuplicate(IsSimple, *BB);
617 }
618
619 /// Compare two BlockFrequency values with a small penalty for \p A.
620 /// In order to be conservative, we apply an X% penalty (X = TailDupPlacementPenalty) to account for
621 /// increased icache pressure and static heuristics. For small frequencies
622 /// we use only the numerators to improve accuracy. For simplicity, we assume the
623 /// penalty is less than 100%.
624 /// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
625 static bool greaterWithBias(BlockFrequency A, BlockFrequency B,
626 uint64_t EntryFreq) {
627 BranchProbability ThresholdProb(TailDupPlacementPenalty, 100); // The penalty option is a percentage, hence the denominator of 100.
628 BlockFrequency Gain = A - B;
629 return (Gain / ThresholdProb).getFrequency() >= EntryFreq; // Gain divided by the penalty fraction must reach \p EntryFreq.
630 }
631
632 /// Check the edge frequencies to see if tail duplication will increase
633 /// fallthroughs. It only makes sense to call this function when
634 /// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
635 /// always locally profitable if we would have picked \p Succ without
636 /// considering duplication. \p QProb is the probability of the competing edge out of \p BB (labelled Qout below); the caller in selectBestSuccessor passes BestProb.
637 bool MachineBlockPlacement::isProfitableToTailDup(
638 MachineBasicBlock *BB, MachineBasicBlock *Succ,
639 BranchProbability QProb,
640 BlockChain &Chain, const BlockFilterSet *BlockFilter) {
641 // We need to do a probability calculation to make sure this is profitable.
642 // First: does succ have a successor that post-dominates? This affects the
643 // calculation. The 2 relevant cases are:
644 // BB BB
645 // | \Qout | \Qout
646 // P| C |P C
647 // = C' = C'
648 // | /Qin | /Qin
649 // | / | /
650 // Succ Succ
651 // / \ | \ V
652 // U/ =V |U \
653 // / \ = D
654 // D E | /
655 // | /
656 // |/
657 // PDom
658 // '=' : Branch taken for that CFG edge
659 // In the second case, placing Succ while duplicating it into C prevents the
660 // fallthrough of Succ into either D or PDom, because they now have C as an
661 // unplaced predecessor.
662
663 // Start by figuring out which case we fall into
664 MachineBasicBlock *PDom = nullptr;
665 SmallVector SuccSuccs;
666 // Only scan the relevant successors
667 auto AdjustedSuccSumProb =
668 collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
669 BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
670 auto BBFreq = MBFI->getBlockFreq(BB);
671 auto SuccFreq = MBFI->getBlockFreq(Succ);
672 BlockFrequency P = BBFreq * PProb;
673 BlockFrequency Qout = BBFreq * QProb;
674 uint64_t EntryFreq = MBFI->getEntryFreq();
675 // If there are no more successors, copying strictly increases fallthrough,
676 // so the only requirement is that P beats Qout by the configured bias.
677 if (SuccSuccs.size() == 0)
678 return greaterWithBias(P, Qout, EntryFreq);
679
680 auto BestSuccSucc = BranchProbability::getZero();
681 // Find the PDom or the best Succ if no PDom exists.
682 for (MachineBasicBlock *SuccSucc : SuccSuccs) {
683 auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
684 if (Prob > BestSuccSucc)
685 BestSuccSucc = Prob;
686 if (PDom == nullptr)
687 if (MPDT->dominates(SuccSucc, Succ)) {
688 PDom = SuccSucc;
689 break;
690 }
691 }
692 // For the comparisons, we need to know Succ's best incoming edge that isn't
693 // from BB.
694 auto SuccBestPred = BlockFrequency(0);
695 for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
696 if (SuccPred == Succ || SuccPred == BB
697 || BlockToChain[SuccPred] == &Chain
698 || (BlockFilter && !BlockFilter->count(SuccPred)))
699 continue;
700 auto Freq = MBFI->getBlockFreq(SuccPred)
701 * MBPI->getEdgeProbability(SuccPred, Succ);
702 if (Freq > SuccBestPred)
703 SuccBestPred = Freq;
704 }
705 // Qin is Succ's best unplaced incoming edge that isn't BB
706 BlockFrequency Qin = SuccBestPred;
707 // If it doesn't have a post-dominating successor, here is the calculation:
708 // BB BB
709 // | \Qout | \
710 // P| C | =
711 // = C' | C
712 // | /Qin | |
713 // | / | C' (+Succ)
714 // Succ Succ /|
715 // / \ | \/ |
716 // U/ =V = /= =
717 // / \ | / \|
718 // D E D E
719 // '=' : Branch taken for that CFG edge
720 // Cost in the first case is: P + V
721 // For this calculation, we always assume P > Qout. If Qout > P,
722 // the result of this function will be ignored at the caller.
723 // Cost in the second case is: Qout + Qin * V + P * U + P * V
724 // TODO(iteratee): If we lay out D after Succ, the P * U term
725 // goes away. This logic is coming in D28522.
726
727 if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
728 BranchProbability UProb = BestSuccSucc;
729 BranchProbability VProb = AdjustedSuccSumProb - UProb;
730 BlockFrequency V = SuccFreq * VProb;
731 BlockFrequency QinV = Qin * VProb;
732 BlockFrequency BaseCost = P + V;
733 BlockFrequency DupCost = Qout + QinV + P * AdjustedSuccSumProb;
734 return greaterWithBias(BaseCost, DupCost, EntryFreq);
735 }
736 BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
737 BranchProbability VProb = AdjustedSuccSumProb - UProb;
738 BlockFrequency U = SuccFreq * UProb;
739 BlockFrequency V = SuccFreq * VProb;
740 // If there is a post-dominating successor, here is the calculation:
741 // BB BB BB BB
742 // | \Qout | \ | \Qout | \
743 // |P C | = |P C | =
744 // = C' |P C = C' |P C
745 // | /Qin | | | /Qin | |
746 // | / | C' (+Succ) | / | C' (+Succ)
747 // Succ Succ /| Succ Succ /|
748 // | \ V | \/ | | \ V | \/ |
749 // |U \ |U /\ | |U = |U /\ |
750 // = D = = =| | D | = =|
751 // | / |/ D | / |/ D
752 // | / | / | = | /
753 // |/ | / |/ | =
754 // Dom Dom Dom Dom
755 // '=' : Branch taken for that CFG edge
756 // The cost for taken branches in the first case is P + U
757 // The cost in the second case (assuming independence), given the layout:
758 // BB, Succ, (C+Succ), D, Dom
759 // is Qout + P * V + Qin * U
760 // compare P + U vs Qout + P + Qin * U.
761 //
762 // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
763 //
764 // For the 3rd case, the cost is P + 2 * V
765 // For the 4th case, the cost is Qout + Qin * U + P * V + V
766 // We choose 4 over 3 when (P + V) > Qout + Qin * U + P * V
767 if (UProb > AdjustedSuccSumProb / 2
768 && !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom],
769 UProb, UProb, Chain, BlockFilter)) {
770 // Cases 3 & 4
771 return greaterWithBias((P + V), (Qout + Qin * UProb + P * VProb),
772 EntryFreq);
773 }
774 // Cases 1 & 2
775 return greaterWithBias(
776 (P + U), (Qout + Qin * UProb + P * AdjustedSuccSumProb), EntryFreq);
777 }
778
779
780 /// When the option TailDupPlacement is on, this method checks if the
781 /// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
782 /// into all of its unplaced, unfiltered predecessors other than \p BB. \returns false if \p Succ fails shouldTailDuplicate or any such predecessor fails TailDup.canTailDuplicate.
783 bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
784 MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain,
785 const BlockFilterSet *BlockFilter) {
786 if (!shouldTailDuplicate(Succ))
787 return false;
788
789 for (MachineBasicBlock *Pred : Succ->predecessors()) {
790 // Make sure all unplaced and unfiltered predecessors can be
791 // tail-duplicated into.
792 if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
793 || BlockToChain[Pred] == &Chain)
794 continue; // BB itself, blocks outside the filter, and blocks already in Chain are exempt.
795 if (!TailDup.canTailDuplicate(Succ, Pred))
796 return false;
797 }
798 return true;
799 }
800
569801 /// When the option OutlineOptionalBranches is on, this method
570802 /// checks if the fallthrough candidate block \p Succ (of block
571803 /// \p BB) also has other unscheduled predecessor blocks which
614846 if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
615847 /* See case 1 below for the cost analysis. For BB->Succ to
616848 * be taken with smaller cost, the following needs to hold:
617 * Prob(BB->Succ) > 2* Prob(BB->Pred)
618 * So the threshold T
619 * T = 2 * (1-Prob(BB->Pred). Since T + Prob(BB->Pred) == 1,
620 * We have T + T/2 = 1, i.e. T = 2/3. Also adding user specified
621 * branch bias, we have
849 * Prob(BB->Succ) > 2 * Prob(BB->Pred)
850 * So the threshold T in the calculation below
851 * (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
852 * So T / (1 - T) = 2, Yielding T = 2/3
853 * Also adding user specified branch bias, we have
622854 * T = (2/3)*(ProfileLikelyProb/50)
623855 * = (2*ProfileLikelyProb)/150)
624856 */
630862
631863 /// Checks to see if the layout candidate block \p Succ has a better layout
632864 /// predecessor than \c BB. If yes, returns true.
865 /// \p SuccProb: The probability adjusted for only remaining blocks.
866 /// Only used for logging
867 /// \p RealSuccProb: The un-adjusted probability.
868 /// \p Chain: The chain that BB belongs to and Succ is being considered for.
869 /// \p BlockFilter: if non-null, the set of blocks that make up the loop being
870 /// considered
633871 bool MachineBlockPlacement::hasBetterLayoutPredecessor(
634872 MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &SuccChain,
635873 BranchProbability SuccProb, BranchProbability RealSuccProb,
761999 for (MachineBasicBlock *Pred : Succ->predecessors()) {
7621000 if (Pred == Succ || BlockToChain[Pred] == &SuccChain ||
7631001 (BlockFilter && !BlockFilter->count(Pred)) ||
764 BlockToChain[Pred] == &Chain)
1002 BlockToChain[Pred] == &Chain ||
1003 // This check is redundant except for look ahead. This function is
1004 // called for lookahead by isProfitableToTailDup when BB hasn't been
1005 // placed yet.
1006 (Pred == BB))
7651007 continue;
7661008 // Do backward checking.
7671009 // For all cases above, we need a backward checking to filter out edges that
768 // are not 'strongly' biased. With profile data available, the check is
769 // mostly redundant for case 2 (when threshold prob is set at 50%) unless S
770 // has more than two successors.
1010 // are not 'strongly' biased.
7711011 // BB Pred
7721012 // \ /
7731013 // Succ
8031043 /// breaking CFG structure, but cave and break such structures in the case of
8041044 /// very hot successor edges.
8051045 ///
806 /// \returns The best successor block found, or null if none are viable.
807 MachineBasicBlock *
1046 /// \returns The best successor block found, or null if none are viable, along
1047 /// with a boolean indicating if tail duplication is necessary.
1048 MachineBlockPlacement::BlockAndTailDupResult
8081049 MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
8091050 BlockChain &Chain,
8101051 const BlockFilterSet *BlockFilter) {
8111052 const BranchProbability HotProb(StaticLikelyProb, 100);
8121053
813 MachineBasicBlock *BestSucc = nullptr;
1054 BlockAndTailDupResult BestSucc = { nullptr, false };
8141055 auto BestProb = BranchProbability::getZero();
8151056
8161057 SmallVector Successors;
8181059 collectViableSuccessors(BB, Chain, BlockFilter, Successors);
8191060
8201061 DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB) << "\n");
1062
1063 // For blocks with CFG violations, we may be able to lay them out anyway with
1064 // tail-duplication. We keep this vector so we can perform the probability
1065 // calculations the minimum number of times.
1066 SmallVector, 4>
1067 DupCandidates;
8211068 for (MachineBasicBlock *Succ : Successors) {
8221069 auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
8231070 BranchProbability SuccProb =
8251072
8261073 // This heuristic is off by default.
8271074 if (shouldPredBlockBeOutlined(BB, Succ, Chain, BlockFilter, SuccProb,
828 HotProb))
829 return Succ;
1075 HotProb)) {
1076 BestSucc.BB = Succ;
1077 return BestSucc;
1078 }
8301079
8311080 BlockChain &SuccChain = *BlockToChain[Succ];
8321081 // Skip the edge \c BB->Succ if block \c Succ has a better layout
8331082 // predecessor that yields lower global cost.
8341083 if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
835 Chain, BlockFilter))
1084 Chain, BlockFilter)) {
1085 // If tail duplication would make Succ profitable, place it.
1086 if (TailDupPlacement && shouldTailDuplicate(Succ))
1087 DupCandidates.push_back(std::make_tuple(SuccProb, Succ));
8361088 continue;
1089 }
8371090
8381091 DEBUG(
8391092 dbgs() << " Candidate: " << getBlockName(Succ) << ", probability: "
8411094 << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
8421095 << "\n");
8431096
844 if (BestSucc && BestProb >= SuccProb) {
1097 if (BestSucc.BB && BestProb >= SuccProb) {
8451098 DEBUG(dbgs() << " Not the best candidate, continuing\n");
8461099 continue;
8471100 }
8481101
8491102 DEBUG(dbgs() << " Setting it as best candidate\n");
850 BestSucc = Succ;
1103 BestSucc.BB = Succ;
8511104 BestProb = SuccProb;
8521105 }
853 if (BestSucc)
854 DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc) << "\n");
1106 // Handle the tail duplication candidates in order of decreasing probability.
1107 // Stop at the first one that is profitable. Also stop if they are less
1108 // profitable than BestSucc. Position is important because we preserve it and
1109 // prefer first best match. Here we aren't comparing in order, so we capture
1110 // the position instead.
1111 if (DupCandidates.size() != 0) {
1112 auto cmp =
1113 [](const std::tuple &a,
1114 const std::tuple &b) {
1115 return std::get<0>(a) > std::get<0>(b);
1116 };
1117 std::stable_sort(DupCandidates.begin(), DupCandidates.end(), cmp);
1118 }
1119 for(auto &Tup : DupCandidates) {
1120 BranchProbability DupProb;
1121 MachineBasicBlock *Succ;
1122 std::tie(DupProb, Succ) = Tup;
1123 if (DupProb < BestProb)
1124 break;
1125 if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
1126 // If tail duplication gives us fallthrough when we otherwise wouldn't
1127 // have it, that is a strict gain.
1128 && (BestSucc.BB == nullptr
1129 || isProfitableToTailDup(BB, Succ, BestProb, Chain,
1130 BlockFilter))) {
1131 DEBUG(
1132 dbgs() << " Candidate: " << getBlockName(Succ) << ", probability: "
1133 << DupProb
1134 << " (Tail Duplicate)\n");
1135 BestSucc.BB = Succ;
1136 BestSucc.ShouldTailDup = true;
1137 break;
1138 }
1139 }
1140
1141 if (BestSucc.BB)
1142 DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc.BB) << "\n");
8551143
8561144 return BestSucc;
8571145 }
10001288
10011289 // Look for the best viable successor if there is one to place immediately
10021290 // after this block.
1003 MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter);
1291 auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
1292 MachineBasicBlock* BestSucc = Result.BB;
1293 bool ShouldTailDup = Result.ShouldTailDup;
1294 if (TailDupPlacement)
1295 ShouldTailDup |= (BestSucc && shouldTailDuplicate(BestSucc));
10041296
10051297 // If an immediate successor isn't available, look for the best viable
10061298 // block among those we've identified as not violating the loop's CFG at
10211313
10221314 // Placement may have changed tail duplication opportunities.
10231315 // Check for that now.
1024 if (TailDupPlacement && BestSucc) {
1316 if (TailDupPlacement && BestSucc && ShouldTailDup) {
10251317 // If the chosen successor was duplicated into all its predecessors,
10261318 // don't bother laying it out, just go round the loop again with BB as
10271319 // the chain end.
19132205 DuplicatedToLPred = false;
19142206 DEBUG(dbgs() << "Redoing tail duplication for Succ#"
19152207 << BB->getNumber() << "\n");
1916 bool IsSimple = TailDup.isSimpleBB(BB);
1917 // Blocks with single successors don't create additional fallthrough
1918 // opportunities. Don't duplicate them. TODO: When conditional exits are
1919 // analyzable, allow them to be duplicated.
1920 if (!IsSimple && BB->succ_size() == 1)
1921 return false;
1922 if (!TailDup.shouldTailDuplicate(IsSimple, *BB))
2208
2209 if (!shouldTailDuplicate(BB))
19232210 return false;
19242211 // This has to be a callback because none of it can be done after
19252212 // BB is deleted.
19722259 llvm::function_ref(RemovalCallback);
19732260
19742261 SmallVector DuplicatedPreds;
2262 bool IsSimple = TailDup.isSimpleBB(BB);
19752263 TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred,
19762264 &DuplicatedPreds, &RemovalCallbackRef);
19772265
20122300 TII = MF.getSubtarget().getInstrInfo();
20132301 TLI = MF.getSubtarget().getTargetLowering();
20142302 MDT = &getAnalysis();
2303 MPDT = nullptr;
20152304
20162305 // Initialize PreferredLoopExit to nullptr here since it may never be set if
20172306 // there are no MachineLoops.
20182307 PreferredLoopExit = nullptr;
20192308
20202309 if (TailDupPlacement) {
2021 unsigned TailDupSize = TailDuplicatePlacementThreshold;
2310 MPDT = &getAnalysis();
2311 unsigned TailDupSize = TailDupPlacementThreshold;
20222312 if (MF.getFunction()->optForSize())
20232313 TailDupSize = 1;
20242314 TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize);
20372327 BranchFoldPlacement;
20382328 // No tail merging opportunities if the block number is less than four.
20392329 if (MF.size() > 3 && EnableTailMerge) {
2040 unsigned TailMergeSize = TailDuplicatePlacementThreshold + 1;
2330 unsigned TailMergeSize = TailDupPlacementThreshold + 1;
20412331 BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
20422332 *MBPI, TailMergeSize);
20432333
20482338 BlockToChain.clear();
20492339 // Must redo the dominator tree if blocks were changed.
20502340 MDT->runOnMachineFunction(MF);
2341 if (MPDT)
2342 MPDT->runOnMachineFunction(MF);
20512343 ChainAllocator.DestroyAll();
20522344 buildCFGChains();
20532345 }
88 ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]]
99 ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]]
1010 ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]]
11 ; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]]
11 ; CHECK-NEXT: ret
1212 ; CHECK-NEXT: [[FAILBB]]:
1313 ; CHECK-NEXT: clrex
14 ; CHECK-NEXT: [[EXITBB]]:
14 ; CHECK-NEXT: ret
1515 %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire
1616 %val = extractvalue { i32, i1 } %pair, 0
1717 ret i32 %val
2626 ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]]
2727 ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], [[NEW]], [x0]
2828 ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]]
29 ; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]]
29 ; CHECK-NEXT: mov x0, x[[ADDR]]
30 ; CHECK-NEXT: ret
3031 ; CHECK-NEXT: [[FAILBB]]:
3132 ; CHECK-NEXT: clrex
32 ; CHECK-NEXT: [[EXITBB]]:
33 ; CHECK-NEXT: mov x0, x[[ADDR]]
34 ; CHECK-NEXT: ret
3335 %new = load i32, i32* %pnew
3436 %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire
3537 %val = extractvalue { i32, i1 } %pair, 0
4042 ; CHECK-LABEL: val_compare_and_swap_rel:
4143 ; CHECK-NEXT: mov x[[ADDR:[0-9]+]], x0
4244 ; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]:
43 ; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]]
45 ; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]]]
4446 ; CHECK-NEXT: cmp [[RESULT]], w1
4547 ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]]
46 ; CHECK-NEXT: stlxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]
48 ; CHECK-NEXT: stlxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]]
4749 ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]]
48 ; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]]
50 ; CHECK-NEXT: ret
4951 ; CHECK-NEXT: [[FAILBB]]:
5052 ; CHECK-NEXT: clrex
51 ; CHECK-NEXT: [[EXITBB]]:
53 ; CHECK-NEXT: ret
5254 %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acq_rel monotonic
5355 %val = extractvalue { i32, i1 } %pair, 0
5456 ret i32 %val
6365 ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]]
6466 ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], x2, [x[[ADDR]]]
6567 ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]]
66 ; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]]
68 ; CHECK-NEXT: ret
6769 ; CHECK-NEXT: [[FAILBB]]:
6870 ; CHECK-NEXT: clrex
69 ; CHECK-NEXT: [[EXITBB]]:
71 ; CHECK-NEXT: ret
7072 %pair = cmpxchg i64* %p, i64 %cmp, i64 %new monotonic monotonic
7173 %val = extractvalue { i64, i1 } %pair, 0
7274 ret i64 %val
345345 ; CHECK-NEXT: sub w1, w1, #1
346346 ; CHECK-NEXT: add [[SUM]], [[SUM]], [[VA_VAL]]
347347 ; CHECK-NEXT: cbnz w1, [[LOOP_LABEL]]
348 ; DISABLE-NEXT: b [[IFEND_LABEL]]
349 ;
350 ; DISABLE: [[ELSE_LABEL]]: ; %if.else
351 ; DISABLE: lsl w0, w1, #1
352 ;
353 ; CHECK: [[IFEND_LABEL]]:
348 ; CHECK-NEXT: [[IFEND_LABEL]]:
354349 ; Epilogue code.
355350 ; CHECK: add sp, sp, #16
356351 ; CHECK-NEXT: ret
357352 ;
358 ; ENABLE: [[ELSE_LABEL]]: ; %if.else
359 ; ENABLE-NEXT: lsl w0, w1, #1
360 ; ENABLE_NEXT: ret
353 ; CHECK: [[ELSE_LABEL]]: ; %if.else
354 ; CHECK-NEXT: lsl w0, w1, #1
355 ; DISABLE-NEXT: add sp, sp, #16
356 ; CHECK-NEXT: ret
361357 define i32 @variadicFunc(i32 %cond, i32 %count, ...) #0 {
362358 entry:
363359 %ap = alloca i8*, align 8
+0
-69
test/CodeGen/AArch64/tail-dup-repeat-worklist.ll less more
None ; RUN: llc -O3 -o - -verify-machineinstrs %s | FileCheck %s
1 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
2 target triple = "aarch64-unknown-linux-gnu"
3
4 %struct.s1 = type { %struct.s3*, %struct.s1* }
5 %struct.s2 = type opaque
6 %struct.s3 = type { i32 }
7
8 ; Function Attrs: nounwind
9 define internal fastcc i32 @repeated_dup_worklist(%struct.s1** %pp1, %struct.s2* %p2, i32 %state, i1 %i1_1, i32 %i32_1) unnamed_addr #0 {
10 entry:
11 br label %while.cond.outer
12
13 ; The loop gets laid out:
14 ; %while.cond.outer
15 ; %(null)
16 ; %(null)
17 ; %dup2
18 ; and then %dup1 gets chosen as the next block.
19 ; when dup2 is duplicated into dup1, %worklist could erroneously be placed on
20 ; the worklist, because all of its current predecessors are now scheduled.
21 ; However, after dup2 is tail-duplicated, %worklist can't be on the worklist
22 ; because it now has unscheduled predecessors.
23 ; CHECK-LABEL: repeated_dup_worklist
24 ; CHECK: // %entry
25 ; CHECK: // %while.cond.outer
26 ; first %(null) block
27 ; CHECK: // in Loop:
28 ; CHECK: ldr
29 ; CHECK-NEXT: tbnz
30 ; second %(null) block
31 ; CHECK: // in Loop:
32 ; CHECK: // %dup2
33 ; CHECK: // %worklist
34 ; CHECK: // %if.then96.i
35 while.cond.outer: ; preds = %dup1, %entry
36 %progress.0.ph = phi i32 [ 0, %entry ], [ %progress.1, %dup1 ]
37 %inc77 = add nsw i32 %progress.0.ph, 1
38 %cmp = icmp slt i32 %progress.0.ph, %i32_1
39 br i1 %cmp, label %dup2, label %dup1
40
41 dup2: ; preds = %if.then96.i, %worklist, %while.cond.outer
42 %progress.1.ph = phi i32 [ 0, %while.cond.outer ], [ %progress.1, %if.then96.i ], [ %progress.1, %worklist ]
43 %.pr = load %struct.s1*, %struct.s1** %pp1, align 8
44 br label %dup1
45
46 dup1: ; preds = %dup2, %while.cond.outer
47 %0 = phi %struct.s1* [ %.pr, %dup2 ], [ undef, %while.cond.outer ]
48 %progress.1 = phi i32 [ %progress.1.ph, %dup2 ], [ %inc77, %while.cond.outer ]
49 br i1 %i1_1, label %while.cond.outer, label %worklist
50
51 worklist: ; preds = %dup1
52 %snode94 = getelementptr inbounds %struct.s1, %struct.s1* %0, i64 0, i32 0
53 %1 = load %struct.s3*, %struct.s3** %snode94, align 8
54 %2 = getelementptr inbounds %struct.s3, %struct.s3* %1, i32 0, i32 0
55 %3 = load i32, i32* %2, align 4
56 %tobool95.i = icmp eq i32 %3, 0
57 br i1 %tobool95.i, label %if.then96.i, label %dup2
58
59 if.then96.i: ; preds = %worklist
60 call fastcc void @free_s3(%struct.s2* %p2, %struct.s3* %1) #1
61 br label %dup2
62 }
63
64 ; Function Attrs: nounwind
65 declare fastcc void @free_s3(%struct.s2*, %struct.s3*) unnamed_addr #0
66
67 attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
68 attributes #1 = { nounwind }
99 br i1 %cmp, label %if.then, label %if.end
1010
1111 ; CHECK: sub [[CMP:w[0-9]+]], w0, #12
12 ; CHECK: tbz [[CMP]], #31
12 ; CHECK: tbnz [[CMP]], #31
1313
1414 if.then:
1515 call void @t()
2727 br i1 %cmp, label %if.then, label %if.end
2828
2929 ; CHECK: sub [[CMP:x[0-9]+]], x0, #12
30 ; CHECK: tbz [[CMP]], #63
30 ; CHECK: tbnz [[CMP]], #63
3131
3232 if.then:
3333 call void @t()
117117 br i1 %cmp, label %if.then, label %if.end
118118
119119 ; CHECK: sub [[CMP:w[0-9]+]], w0, #12
120 ; CHECK: tbz [[CMP]], #31
120 ; CHECK: tbnz [[CMP]], #31
121121
122122 if.then:
123123 call void @t()
177177 br i1 %tst, label %if.then, label %if.end
178178
179179 ; CHECK-NOT: cmp
180 ; CHECK: tbz x0, #63
180 ; CHECK: tbnz x0, #63
181181
182182 if.then:
183183 call void @t()
193193 br i1 %tst, label %if.then, label %if.end
194194
195195 ; CHECK-NOT: cmp
196 ; CHECK: tbz x0, #63
196 ; CHECK: tbnz x0, #63
197197
198198 if.then:
199199 call void @t()
208208
209209 ; CHECK: ldr [[CMP:x[0-9]+]], [x1]
210210 ; CHECK-NOT: cmp
211 ; CHECK: tbz [[CMP]], #63
211 ; CHECK: tbnz [[CMP]], #63
212212
213213 %val = load i64, i64* %ptr
214214 %tst = icmp slt i64 %val, 0
228228 br i1 %tst, label %if.then, label %if.end
229229
230230 ; CHECK-NOT: cmp
231 ; CHECK: tbz x0, #63
231 ; CHECK: tbnz x0, #63
232232
233233 if.then:
234234 call void @t()
246246
247247 ; CHECK: orr [[CMP:x[0-9]+]], x0, x1
248248 ; CHECK-NOT: cmp
249 ; CHECK: tbz [[CMP]], #63
249 ; CHECK: tbnz [[CMP]], #63
250250
251251 if.then:
252252 call void @t()
334334 ; GCN-NEXT: ;;#ASMEND
335335
336336 ; GCN-NEXT: [[BB3]]: ; %bb3
337 ; GCN-NEXT: ;;#ASMSTART
338 ; GCN-NEXT: v_nop_e64
339 ; GCN-NEXT: ;;#ASMEND
340 ; GCN-NEXT: ;;#ASMSTART
341 ; GCN-NEXT: v_nop_e64
342 ; GCN-NEXT: ;;#ASMEND
337343 ; GCN-NEXT: s_endpgm
338344 define void @expand_requires_expand(i32 %cond0) #0 {
339345 bb0:
355361 br label %bb3
356362
357363 bb3:
364 ; These NOPs prevent tail-duplication-based outlining
365 ; from firing, which defeats the need to expand the branches and this test.
366 call void asm sideeffect
367 "v_nop_e64", ""() #0
368 call void asm sideeffect
369 "v_nop_e64", ""() #0
358370 ret void
359371 }
360372
384396
385397 ; GCN-NEXT: [[ENDIF]]: ; %endif
386398 ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
399 ; GCN-NEXT: s_sleep 5
387400 ; GCN-NEXT: s_endpgm
388401 define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 {
389402 entry:
401414 br label %endif
402415
403416 endif:
417 ; layout can remove the split branch if it can copy the return block.
418 ; This call makes the return block long enough that it doesn't get copied.
419 call void @llvm.amdgcn.s.sleep(i32 5);
404420 ret void
405421 }
406422
251251 ; GCN: s_cmp_lt_i32 [[COND]], 1
252252 ; GCN: s_cbranch_scc1 [[EXIT:[A-Za-z0-9_]+]]
253253 ; GCN: v_cmp_gt_i32_e64 vcc, [[COND]], 0{{$}}
254 ; GCN: s_cbranch_vccnz [[EXIT]]
254 ; GCN: s_cbranch_vccz [[BODY:[A-Za-z0-9_]+]]
255 ; GCN: {{^}}[[EXIT]]:
256 ; GCN: s_endpgm
257 ; GCN: {{^}}[[BODY]]:
255258 ; GCN: buffer_store
256 ; GCN: {{^}}[[EXIT]]:
257259 ; GCN: s_endpgm
258260 define void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) {
259261 bb:
301303 ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
302304 ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
303305 ; GCN: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
304 ; GCN: s_cbranch_execz [[ENDIF_LABEL:[0-9_A-Za-z]+]]
305306 ; GCN: s_cmp_lg_u32 {{s[0-9]+}}, 0
306 ; GCN: s_cbranch_scc1 [[ENDIF_LABEL]]
307 ; GCN: s_cbranch_scc0 [[IF_UNIFORM_LABEL:[A-Z0-9_a-z]+]]
308 ; GCN: s_endpgm
309 ; GCN: {{^}}[[IF_UNIFORM_LABEL]]:
307310 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
308311 ; GCN: buffer_store_dword [[ONE]]
309312 define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) {
327330
328331 ; GCN-LABEL: {{^}}divergent_inside_uniform:
329332 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
330 ; GCN: s_cbranch_scc1 [[ENDIF_LABEL:[0-9_A-Za-z]+]]
333 ; GCN: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]]
334 ; GCN: [[IF_LABEL]]:
331335 ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
332336 ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
333337 ; GCN: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
334338 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
335339 ; GCN: buffer_store_dword [[ONE]]
336 ; GCN: [[ENDIF_LABEL]]:
337 ; GCN: s_endpgm
338340 define void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) {
339341 entry:
340342 %u_cmp = icmp eq i32 %cond, 0
362364 ; GCN: buffer_store_dword [[ONE]]
363365 ; GCN: s_or_b64 exec, exec, [[MASK]]
364366 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
365 ; GCN: s_cbranch_scc1 [[EXIT:[A-Z0-9_]+]]
367 ; GCN: s_cbranch_scc0 [[IF_UNIFORM:[A-Z0-9_]+]]
368 ; GCN: s_endpgm
369 ; GCN: [[IF_UNIFORM]]:
366370 ; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
367371 ; GCN: buffer_store_dword [[TWO]]
368 ; GCN: [[EXIT]]:
369 ; GCN: s_endpgm
370372 define void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) {
371373 entry:
372374 %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
4848 ; V8-NEXT: beq
4949 ; V8-NEXT: %tailrecurse.switch
5050 ; V8: cmp
51 ; V8-NEXT: bne
52 ; V8-NEXT: b
53 ; The trailing space in the last line checks that the branch is unconditional
51 ; V8-NEXT: beq
52 ; V8-NEXT: %sw.epilog
53 ; V8-NEXT: bx lr
5454 switch i32 %and, label %sw.epilog [
5555 i32 1, label %sw.bb
5656 i32 3, label %sw.bb6
319319 ; CHECK: strex [[SUCCESS:r[0-9]+]], r2, [r[[ADDR]]]
320320 ; CHECK: cmp [[SUCCESS]], #0
321321 ; CHECK: bne [[LOOP_BB]]
322 ; CHECK: b [[END_BB:\.?LBB[0-9]+_[0-9]+]]
322 ; CHECK: dmb ish
323 ; CHECK: bx lr
323324 ; CHECK: [[FAIL_BB]]:
324325 ; CHECK-NEXT: clrex
325 ; CHECK-NEXT: [[END_BB]]:
326326 ; CHECK: dmb ish
327327 ; CHECK: bx lr
328328
10441044 ; function there.
10451045 ; CHECK-ARM-NEXT: cmp r[[OLD]], r0
10461046 ; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]]
1047 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
1047 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_4
10481048 ; CHECK-NEXT: BB#2:
10491049 ; As above, r1 is a reasonable guess.
10501050 ; CHECK: strexb [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
10511051 ; CHECK-NEXT: cmp [[STATUS]], #0
10521052 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
1053 ; CHECK-NEXT: b .LBB{{[0-9]+}}_4
1054 ; CHECK-NEXT: .LBB{{[0-9]+}}_3:
1053 ; CHECK-ARM: mov r0, r[[OLD]]
1054 ; CHECK: bx lr
1055 ; CHECK-NEXT: .LBB{{[0-9]+}}_4:
10551056 ; CHECK-NEXT: clrex
1056 ; CHECK-NEXT: .LBB{{[0-9]+}}_4:
10571057 ; CHECK-NOT: dmb
10581058 ; CHECK-NOT: mcr
10591059
10601060 ; CHECK-ARM: mov r0, r[[OLD]]
1061 ; CHECK-ARM-NEXT: bx lr
10611062 ret i8 %old
10621063 }
10631064
10771078 ; function there.
10781079 ; CHECK-ARM-NEXT: cmp r[[OLD]], r0
10791080 ; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]]
1080 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
1081 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_4
10811082 ; CHECK-NEXT: BB#2:
10821083 ; As above, r1 is a reasonable guess.
10831084 ; CHECK: stlexh [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
10841085 ; CHECK-NEXT: cmp [[STATUS]], #0
10851086 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
1086 ; CHECK-NEXT: b .LBB{{[0-9]+}}_4
1087 ; CHECK-NEXT: .LBB{{[0-9]+}}_3:
1087 ; CHECK-ARM: mov r0, r[[OLD]]
1088 ; CHECK: bx lr
1089 ; CHECK-NEXT: .LBB{{[0-9]+}}_4:
10881090 ; CHECK-NEXT: clrex
1089 ; CHECK-NEXT: .LBB{{[0-9]+}}_4:
10901091 ; CHECK-NOT: dmb
10911092 ; CHECK-NOT: mcr
10921093
10931094 ; CHECK-ARM: mov r0, r[[OLD]]
1095 ; CHECK-ARM-NEXT: bx lr
10941096 ret i16 %old
10951097 }
10961098
11091111 ; r0 below is a reasonable guess but could change: it certainly comes into the
11101112 ; function there.
11111113 ; CHECK-NEXT: cmp r[[OLD]], r0
1112 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
1114 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_4
11131115 ; CHECK-NEXT: BB#2:
11141116 ; As above, r1 is a reasonable guess.
11151117 ; CHECK: stlex [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
11161118 ; CHECK-NEXT: cmp [[STATUS]], #0
11171119 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
1118 ; CHECK-NEXT: b .LBB{{[0-9]+}}_4
1119 ; CHECK-NEXT: .LBB{{[0-9]+}}_3:
1120 ; CHECK: str{{(.w)?}} r[[OLD]],
1121 ; CHECK-NEXT: bx lr
1122 ; CHECK-NEXT: .LBB{{[0-9]+}}_4:
11201123 ; CHECK-NEXT: clrex
1121 ; CHECK-NEXT: .LBB{{[0-9]+}}_4:
11221124 ; CHECK-NOT: dmb
11231125 ; CHECK-NOT: mcr
11241126
11251127 ; CHECK: str{{(.w)?}} r[[OLD]],
1128 ; CHECK-ARM-NEXT: bx lr
11261129 ret void
11271130 }
11281131
11471150 ; CHECK-BE-DAG: eor{{(\.w)?}} [[MISMATCH_LO:r[0-9]+|lr]], [[OLD1]], r0
11481151 ; CHECK-ARM-BE: orrs{{(\.w)?}} {{r[0-9]+}}, [[MISMATCH_HI]], [[MISMATCH_LO]]
11491152 ; CHECK-THUMB-BE: orrs{{(\.w)?}} {{(r[0-9]+, )?}}[[MISMATCH_LO]], [[MISMATCH_HI]]
1150 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
1153 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_4
11511154 ; CHECK-NEXT: BB#2:
11521155 ; As above, r2, r3 is a reasonable guess.
11531156 ; CHECK: strexd [[STATUS:r[0-9]+]], r2, r3, [r[[ADDR]]]
11541157 ; CHECK-NEXT: cmp [[STATUS]], #0
11551158 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
1156 ; CHECK-NEXT: b .LBB{{[0-9]+}}_4
1157 ; CHECK-NEXT: .LBB{{[0-9]+}}_3:
1159 ; CHECK: strd [[OLD1]], [[OLD2]], [r[[ADDR]]]
1160 ; CHECK-NEXT: pop
1161 ; CHECK-NEXT: .LBB{{[0-9]+}}_4:
11581162 ; CHECK-NEXT: clrex
1159 ; CHECK-NEXT: .LBB{{[0-9]+}}_4:
11601163 ; CHECK-NOT: dmb
11611164 ; CHECK-NOT: mcr
11621165
1212 ; CHECK-NEXT: dmb ish
1313 ; CHECK-NEXT: strex [[SUCCESS:r[0-9]+]], r2, [r0]
1414 ; CHECK-NEXT: cmp [[SUCCESS]], #0
15 ; CHECK-NEXT: bne [[FAILBB:LBB[0-9]+_[0-9]+]]
15 ; CHECK-NEXT: beq [[SUCCESSBB:LBB[0-9]+_[0-9]+]]
1616 ; CHECK-NEXT: BB#2:
17 ; CHECK-NEXT: dmb ish
1817 ; CHECK-NEXT: str r3, [r0]
1918 ; CHECK-NEXT: bx lr
2019 ; CHECK-NEXT: [[LDFAILBB]]:
2120 ; CHECK-NEXT: clrex
22 ; CHECK-NEXT: [[FAILBB]]:
21 ; CHECK-NEXT: str r3, [r0]
22 ; CHECK-NEXT: bx lr
23 ; CHECK-NEXT: [[SUCCESSBB]]:
24 ; CHECK-NEXT: dmb ish
2325 ; CHECK-NEXT: str r3, [r0]
2426 ; CHECK-NEXT: bx lr
2527
66 entry:
77 %0 = load i32, i32* @j, align 4
88 %cmp = icmp eq i32 %0, 0
9 br i1 %cmp, label %if.then, label %if.end
9 br i1 %cmp, label %if.then, label %if.end, !prof !1
1010
1111 ; 16: bnez ${{[0-9]+}}, $[[LABEL:[0-9A-Ba-b_]+]]
1212 ; 16: lw ${{[0-9]+}}, %got(result)(${{[0-9]+}})
2020 ret void
2121 }
2222
23
23 !1 = !{!"branch_weights", i32 2, i32 1}
55 %x = alloca i32, align 4
66 %0 = load i32, i32* %x, align 4
77 %cmp = icmp eq i32 %0, 0
8 br i1 %cmp, label %if.then, label %if.end
8 br i1 %cmp, label %if.then, label %if.end, !prof !1
99
1010 if.then:
1111 store i32 10, i32* %x, align 4
1616 }
1717
1818 ; CHECK: bnezc
19 !1 = !{!"branch_weights", i32 2, i32 1}
1616 %sum1 = add i32 %sumin, 1
1717 %val1 = load i32, i32* %ptr
1818 %p = icmp eq i32 %sumin, 0
19 br i1 %p, label %true, label %end
19 br i1 %p, label %true, label %end, !prof !1
2020 true:
2121 %sum2 = add i32 %sum1, 1
2222 %ptr2 = getelementptr i32, i32* %ptr, i32 1
5252 ret i32 %valmerge
5353 }
5454 declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind
55
56 !1 = !{!"branch_weights", i32 2, i32 1}
0 ; RUN: llc -O2 -o - %s | FileCheck %s
1 target datalayout = "e-m:e-i64:64-n32:64"
2 target triple = "powerpc64le-grtev4-linux-gnu"
3
4 ; Intended layout:
5 ; The code for tail-duplication during layout will produce the layout:
6 ; test1
7 ; test2
8 ; body1 (with copy of test2)
9 ; body2
10 ; exit
11
12 ;CHECK-LABEL: tail_dup_break_cfg:
13 ;CHECK: mr [[TAGREG:[0-9]+]], 3
14 ;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
15 ;CHECK-NEXT: bc 12, 1, [[BODY1LABEL:[._0-9A-Za-z]+]]
16 ;CHECK-NEXT: # %test2
17 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
18 ;CHECK-NEXT: beq 0, [[EXITLABEL:[._0-9A-Za-z]+]]
19 ;CHECK-NEXT: b [[BODY2LABEL:[._0-9A-Za-z]+]]
20 ;CHECK-NEXT: [[BODY1LABEL]]
21 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
22 ;CHECK-NEXT: beq 0, [[EXITLABEL]]
23 ;CHECK-NEXT: [[BODY2LABEL]]
24 ;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
25 ;CHECK: blr
26 define void @tail_dup_break_cfg(i32 %tag) { ; tests bit 0 then bit 1 of %tag; each set bit runs a block of four calls
27 entry:
28 br label %test1
29 test1:
30 %tagbit1 = and i32 %tag, 1 ; mask bit 0 of %tag
31 %tagbit1eq0 = icmp eq i32 %tagbit1, 0
32 br i1 %tagbit1eq0, label %test2, label %body1, !prof !1 ; %test2 more likely (!1 weights 5:3)
33 body1:
34 call void @a()
35 call void @a()
36 call void @a()
37 call void @a()
38 br label %test2 ; body1 always rejoins at %test2
39 test2:
40 %tagbit2 = and i32 %tag, 2 ; mask bit 1 of %tag
41 %tagbit2eq0 = icmp eq i32 %tagbit2, 0
42 br i1 %tagbit2eq0, label %exit, label %body2, !prof !1 ; %exit more likely (!1 weights 5:3)
43 body2:
44 call void @b()
45 call void @b()
46 call void @b()
47 call void @b()
48 br label %exit
49 exit:
50 ret void
51 }
52
53 ; The branch weights here hint that we shouldn't tail duplicate in this case.
54 ;CHECK-LABEL: tail_dup_dont_break_cfg:
55 ;CHECK: mr [[TAGREG:[0-9]+]], 3
56 ;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
57 ;CHECK-NEXT: bc 4, 1, [[TEST2LABEL:[._0-9A-Za-z]+]]
58 ;CHECK-NEXT: # %body1
59 ;CHECK: [[TEST2LABEL]]: # %test2
60 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
61 ;CHECK-NEXT: beq 0, [[EXITLABEL:[._0-9A-Za-z]+]]
62 ;CHECK-NEXT: # %body2
63 ;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
64 ;CHECK: blr
65 define void @tail_dup_dont_break_cfg(i32 %tag) { ; same CFG shape as tail_dup_break_cfg, but the second branch favors %body2 over %exit
66 entry:
67 br label %test1
68 test1:
69 %tagbit1 = and i32 %tag, 1 ; mask bit 0 of %tag
70 %tagbit1eq0 = icmp eq i32 %tagbit1, 0
71 br i1 %tagbit1eq0, label %test2, label %body1, !prof !1 ; %test2 more likely (!1 weights 5:3)
72 body1:
73 call void @a()
74 call void @a()
75 call void @a()
76 call void @a()
77 br label %test2 ; body1 always rejoins at %test2
78 test2:
79 %tagbit2 = and i32 %tag, 2 ; mask bit 1 of %tag
80 %tagbit2eq0 = icmp ne i32 %tagbit2, 0 ; NOTE: named "eq0" but compares ne, so it is true when bit 1 is set
81 br i1 %tagbit2eq0, label %body2, label %exit, !prof !1 ; %body2 more likely (!1 weights 5:3)
82 body2:
83 call void @b()
84 call void @b()
85 call void @b()
86 call void @b()
87 br label %exit
88 exit:
89 ret void
90 }
91 declare void @a()
92 declare void @b()
93 declare void @c()
94 declare void @d()
95
96 ; This function arranges for the successors of %succ to have already been laid
97 ; out. When we consider whether to lay out succ after bb and to tail-duplicate
98 ; it, v and ret have already been placed, so we tail-duplicate as it removes a
99 ; branch and strictly increases fallthrough.
100 ; CHECK-LABEL: tail_dup_no_succ
101 ; CHECK: # %entry
102 ; CHECK: # %v
103 ; CHECK: # %ret
104 ; CHECK: # %bb
105 ; CHECK: # %succ
106 ; CHECK: # %c
107 ; CHECK: bl c
108 ; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29
109 ; CHECK: beq
110 ; CHECK: b
111 define void @tail_dup_no_succ(i32 %tag) { ; branches on bits 0, 1 and 2 of %tag in turn
112 entry:
113 %tagbit1 = and i32 %tag, 1 ; mask bit 0 of %tag
114 %tagbit1eq0 = icmp eq i32 %tagbit1, 0
115 br i1 %tagbit1eq0, label %v, label %bb, !prof !2 ; %v very much more likely (!2 weights 95:5)
116 bb:
117 %tagbit2 = and i32 %tag, 2 ; mask bit 1 of %tag
118 %tagbit2eq0 = icmp eq i32 %tagbit2, 0
119 br i1 %tagbit2eq0, label %succ, label %c, !prof !3 ; %succ more likely (!3 weights 7:3)
120 c:
121 call void @c()
122 call void @c()
123 br label %succ
124 succ:
125 %tagbit3 = and i32 %tag, 4 ; mask bit 2 of %tag
126 %tagbit3eq0 = icmp eq i32 %tagbit3, 0
127 br i1 %tagbit3eq0, label %ret, label %v, !prof !1 ; %ret more likely (!1 weights 5:3)
128 v:
129 call void @d()
130 call void @d()
131 br label %ret
132 ret:
133 ret void
134 }
135
136
137 !1 = !{!"branch_weights", i32 5, i32 3}
138 !2 = !{!"branch_weights", i32 95, i32 5}
139 !3 = !{!"branch_weights", i32 7, i32 3}
6565 ; CHECK: ba .LBB1_1
6666 ; CHECK: nop
6767 ; CHECK:.LBB1_1: ! %entry
68 ; CHECK: ba .LBB1_3
6968 ; CHECK: mov %g0, %i0
69 ; CHECK: cmp %i0, 0
70 ; CHECK: bne .LBB1_4
71 ; CHECK: ba .LBB1_5
7072 ; CHECK:.LBB1_2: ! Block address taken
7173 ; CHECK: mov 1, %i0
72 ; CHECK:.LBB1_3: ! %entry
73 ; CHECK: cmp %i0, 0
7474 ; CHECK: be .LBB1_5
75 ; CHECK: nop
75 ; CHECK:.LBB1_4:
76 ; CHECK: ba .LBB1_6
7677 }
7778 declare i8* @llvm.frameaddress(i32) #2
7879
472472 %xor = xor i32 %val, 1
473473 %add = add i32 %xor, 1000000
474474 call void @foo()
475 %cmp = icmp ne i32 %add, 0
476 br i1 %cmp, label %exit, label %store
475 %cmp = icmp eq i32 %add, 0
476 br i1 %cmp, label %store, label %exit, !prof !1
477477
478478 store:
479479 store i32 %add, i32 *%ptr
887887 exit:
888888 ret i64 %res
889889 }
890
891 !1 = !{!"branch_weights", i32 2, i32 1}
None ; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumb-macho \
0 ; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumb-macho \
11 ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE --check-prefix=ENABLE-V4T
2 ; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumbv5-macho \
2 ; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumbv5-macho \
33 ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE --check-prefix=ENABLE-V5T
4 ; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumb-macho \
4 ; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumb-macho \
55 ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE --check-prefix=DISABLE-V4T
6 ; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumbv5-macho \
6 ; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumbv5-macho \
77 ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE --check-prefix=DISABLE-V5T
8
89 ;
910 ; Note: Lots of tests use inline asm instead of regular calls.
1011 ; This allows to have a better control on what the allocation will do.
1415 ; edges.
1516 ; Also disable the late if-converter as it makes harder to reason on
1617 ; the diffs.
18 ; Disable tail-duplication during placement, as v4t vs v5t get different
19 ; results due to branches not being analyzable under v5
1720
1821 ; Initial motivating example: Simple diamond with a call just on one side.
1922 ; CHECK-LABEL: foo:
2525 call void @x()
2626 call void @x()
2727 call void @x()
28 ; CHECK: cbnz
28 ; CHECK: cbz
2929 %q = icmp eq i32 %y, 0
3030 br i1 %q, label %t2, label %f
3131
33
44 define void @f0(i32 %x) optsize {
55 ; CHECK-LABEL: f0:
6 ; CHECK: cbnz
6 ; CHECK: cbz
77 %p = icmp eq i32 %x, 0
88 br i1 %p, label %t, label %f
99
1111
1212 define weak arm_aapcs_vfpcc i32 @_ZNKSs7compareERKSs(%"struct.std::basic_string,std::allocator >"* %this, %"struct.std::basic_string,std::allocator >"* %__str) {
1313 ; CHECK-LABEL: _ZNKSs7compareERKSs:
14 ; CHECK: cbnz r0,
14 ; CHECK: cbz r0,
15 ; CHECK-NEXT: %bb1
16 ; CHECK-NEXT: pop.w
1517 ; CHECK-NEXT: %bb
1618 ; CHECK-NEXT: sub{{(.w)?}} r0, r{{[0-9]+}}, r{{[0-9]+}}
17 ; CHECK-NEXT: %bb1
1819 ; CHECK-NEXT: pop.w
1920 entry:
2021 %0 = tail call arm_aapcs_vfpcc i32 @_ZNKSs4sizeEv(%"struct.std::basic_string,std::allocator >"* %this) ; [#uses=3]
77 ; Basic phi triangle.
88
99 ; CHECK-LABEL: test0:
10 ; CHECK: div_s $[[NUM0:[0-9]+]]=, $0, $pop[[NUM1:[0-9]+]]{{$}}
11 ; CHECK: return $[[NUM0]]{{$}}
10 ; CHECK: return $0
11 ; CHECK: div_s $push[[NUM0:[0-9]+]]=, $0, $pop[[NUM1:[0-9]+]]{{$}}
12 ; CHECK: return $pop[[NUM0]]{{$}}
1213 define i32 @test0(i32 %p) {
1314 entry:
1415 %t = icmp slt i32 %p, 0
6868 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
6969 ; ALL-NEXT: vucomiss %xmm1, %xmm0
7070 ; ALL-NEXT: jne LBB3_1
71 ; ALL-NEXT: jnp LBB3_2
71 ; ALL-NEXT: jp LBB3_1
72 ; ALL-NEXT: ## BB#2: ## %return
73 ; ALL-NEXT: retq
7274 ; ALL-NEXT: LBB3_1: ## %if.end
7375 ; ALL-NEXT: seta %al
7476 ; ALL-NEXT: movzbl %al, %eax
7577 ; ALL-NEXT: leaq {{.*}}(%rip), %rcx
7678 ; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
77 ; ALL-NEXT: LBB3_2: ## %return
7879 ; ALL-NEXT: retq
7980 entry:
8081 %cmp = fcmp oeq float %p, 0.000000e+00
4242 ; CHECK-LABEL: test2b:
4343 ; CHECK: # BB#0: # %entry
4444 ; CHECK-NEXT: btl %esi, %edi
45 ; CHECK-NEXT: jb .LBB1_2
45 ; CHECK-NEXT: jae .LBB1_1
4646 ;
4747 entry:
4848 %tmp29 = lshr i32 %x, %n
8282 ; CHECK-LABEL: atest2b:
8383 ; CHECK: # BB#0: # %entry
8484 ; CHECK-NEXT: btl %esi, %edi
85 ; CHECK-NEXT: jb .LBB3_2
85 ; CHECK-NEXT: jae .LBB3_1
8686 ;
8787 entry:
8888 %tmp29 = ashr i32 %x, %n
102102 ; CHECK-LABEL: test3:
103103 ; CHECK: # BB#0: # %entry
104104 ; CHECK-NEXT: btl %esi, %edi
105 ; CHECK-NEXT: jb .LBB4_2
105 ; CHECK-NEXT: jae .LBB4_1
106106 ;
107107 entry:
108108 %tmp29 = shl i32 1, %n
122122 ; CHECK-LABEL: test3b:
123123 ; CHECK: # BB#0: # %entry
124124 ; CHECK-NEXT: btl %esi, %edi
125 ; CHECK-NEXT: jb .LBB5_2
125 ; CHECK-NEXT: jae .LBB5_1
126126 ;
127127 entry:
128128 %tmp29 = shl i32 1, %n
3535
3636 entry:
3737 %mul = fmul double %x, %y
38 %cmp = fcmp une double %mul, 0.000000e+00
39 br i1 %cmp, label %bb2, label %bb1
38 %cmp = fcmp oeq double %mul, 0.000000e+00
39 br i1 %cmp, label %bb1, label %bb2
4040
4141 bb1:
4242 %add = fadd double %mul, -1.000000e+00
55 ; CHECK: jns
66 %tmp1 = add i32 %X, 1 ; [#uses=1]
77 %tmp = icmp slt i32 %tmp1, 0 ; [#uses=1]
8 br i1 %tmp, label %cond_true, label %cond_next
8 br i1 %tmp, label %cond_true, label %cond_next, !prof !1
99
1010 cond_true: ; preds = %entry
1111 %tmp2 = tail call i32 (...) @bar( ) ; [#uses=0]
302302 if.end:
303303 ret i32 undef
304304 }
305
306 !1 = !{!"branch_weights", i32 2, i32 1}