llvm.org GIT mirror llvm / a466b36
Codegen: Make chains from trellis-shaped CFGs Lay out trellis-shaped CFGs optimally. A trellis of the shape below: A B |\ /| | \ / | | X | | / \ | |/ \| C D would be laid out A; B->C ; D by the current layout algorithm. Now we identify trellises and lay them out either A->C; B->D or A->D; B->C. This scales with an increasing number of predecessors. A trellis is a group of 2 or more predecessor blocks that all have the same successors. Because of this, we can tail duplicate to extend existing trellises. As an example consider the following CFG: B D F H / \ / \ / \ / \ A---C---E---G---Ret Where A,C,E,G are all small (Currently 2 instructions). The CFG preserving layout is then A,B,C,D,E,F,G,H,Ret. The current code will copy C into B, E into D and G into F and yield the layout A,C,B(C),E,D(E),F(G),G,H,ret define void @straight_test(i32 %tag) { entry: br label %test1 test1: ; A %tagbit1 = and i32 %tag, 1 %tagbit1eq0 = icmp eq i32 %tagbit1, 0 br i1 %tagbit1eq0, label %test2, label %optional1 optional1: ; B call void @a() br label %test2 test2: ; C %tagbit2 = and i32 %tag, 2 %tagbit2eq0 = icmp eq i32 %tagbit2, 0 br i1 %tagbit2eq0, label %test3, label %optional2 optional2: ; D call void @b() br label %test3 test3: ; E %tagbit3 = and i32 %tag, 4 %tagbit3eq0 = icmp eq i32 %tagbit3, 0 br i1 %tagbit3eq0, label %test4, label %optional3 optional3: ; F call void @c() br label %test4 test4: ; G %tagbit4 = and i32 %tag, 8 %tagbit4eq0 = icmp eq i32 %tagbit4, 0 br i1 %tagbit4eq0, label %exit, label %optional4 optional4: ; H call void @d() br label %exit exit: ret void } Here is the layout after D27742: straight_test: # @straight_test ; ... Prologue elided ; BB#0: # %entry ; A (merged with test1) ; ... More prologue elided mr 30, 3 andi. 3, 30, 1 bc 12, 1, .LBB0_2 ; BB#1: # %test2 ; C rlwinm. 3, 30, 0, 30, 30 beq 0, .LBB0_3 b .LBB0_4 .LBB0_2: # %optional1 ; B (copy of C) bl a nop rlwinm. 3, 30, 0, 30, 30 bne 0, .LBB0_4 .LBB0_3: # %test3 ; E rlwinm. 
3, 30, 0, 29, 29 beq 0, .LBB0_5 b .LBB0_6 .LBB0_4: # %optional2 ; D (copy of E) bl b nop rlwinm. 3, 30, 0, 29, 29 bne 0, .LBB0_6 .LBB0_5: # %test4 ; G rlwinm. 3, 30, 0, 28, 28 beq 0, .LBB0_8 b .LBB0_7 .LBB0_6: # %optional3 ; F (copy of G) bl c nop rlwinm. 3, 30, 0, 28, 28 beq 0, .LBB0_8 .LBB0_7: # %optional4 ; H bl d nop .LBB0_8: # %exit ; Ret ld 30, 96(1) # 8-byte Folded Reload addi 1, 1, 112 ld 0, 16(1) mtlr 0 blr The tail-duplication has produced some benefit, but it has also produced a trellis which is not laid out optimally. With this patch, we improve the layouts of such trellises, and decrease the cost calculation for tail-duplication accordingly. This patch produces the layout A,C,E,G,B,D,F,H,Ret. This layout does have back edges, which is a negative, but it has a bigger compensating positive, which is that it handles the case where there are long strings of skipped blocks much better than the original layout. Both layouts handle runs of executed blocks equally well. Branch prediction also improves if there is any correlation between subsequent optional blocks. Here is the resulting concrete layout: straight_test: # @straight_test ; BB#0: # %entry ; A (merged with test1) mr 30, 3 andi. 3, 30, 1 bc 12, 1, .LBB0_4 ; BB#1: # %test2 ; C rlwinm. 3, 30, 0, 30, 30 bne 0, .LBB0_5 .LBB0_2: # %test3 ; E rlwinm. 3, 30, 0, 29, 29 bne 0, .LBB0_6 .LBB0_3: # %test4 ; G rlwinm. 3, 30, 0, 28, 28 bne 0, .LBB0_7 b .LBB0_8 .LBB0_4: # %optional1 ; B (Copy of C) bl a nop rlwinm. 3, 30, 0, 30, 30 beq 0, .LBB0_2 .LBB0_5: # %optional2 ; D (Copy of E) bl b nop rlwinm. 3, 30, 0, 29, 29 beq 0, .LBB0_3 .LBB0_6: # %optional3 ; F (Copy of G) bl c nop rlwinm. 3, 30, 0, 28, 28 beq 0, .LBB0_8 .LBB0_7: # %optional4 ; H bl d nop .LBB0_8: # %exit Differential Revision: https://reviews.llvm.org/D28522 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@295223 91177308-0d34-0410-b5e6-96231b3b80d8 Kyle Butt 3 years ago
24 changed file(s) with 746 addition(s) and 148 deletion(s). Raw diff Collapse all Expand all
293293
294294 /// Pair struct containing basic block and taildup profitability
295295 struct BlockAndTailDupResult {
296 MachineBasicBlock * BB;
296 MachineBasicBlock *BB;
297297 bool ShouldTailDup;
298 };
299
300 /// Triple struct containing edge weight and the edge.
301 struct WeightedEdge {
302 BlockFrequency Weight;
303 MachineBasicBlock *Src;
304 MachineBasicBlock *Dest;
298305 };
299306
300307 /// \brief work lists of blocks that are ready to be laid out
301308 SmallVector BlockWorkList;
302309 SmallVector EHPadWorkList;
310
311 /// Edges that have already been computed as optimal by the trellis code.
312 DenseMap ComputedTrellisEdges;
303313
304314 /// \brief Machine Function
305315 MachineFunction *F;
439449 void buildCFGChains();
440450 void optimizeBranches();
441451 void alignBlocks();
452 /// Returns true if a block should be tail-duplicated to increase fallthrough
453 /// opportunities.
442454 bool shouldTailDuplicate(MachineBasicBlock *BB);
443455 /// Check the edge frequencies to see if tail duplication will increase
444456 /// fallthroughs.
446458 const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
447459 BranchProbability AdjustedSumProb,
448460 const BlockChain &Chain, const BlockFilterSet *BlockFilter);
461 /// Check for a trellis layout.
462 bool isTrellis(const MachineBasicBlock *BB,
463 const SmallVectorImpl &ViableSuccs,
464 const BlockChain &Chain, const BlockFilterSet *BlockFilter);
465 /// Get the best successor given a trellis layout.
466 BlockAndTailDupResult getBestTrellisSuccessor(
467 const MachineBasicBlock *BB,
468 const SmallVectorImpl &ViableSuccs,
469 BranchProbability AdjustedSumProb, const BlockChain &Chain,
470 const BlockFilterSet *BlockFilter);
471 /// Get the best pair of non-conflicting edges.
472 static std::pair getBestNonConflictingEdges(
473 const MachineBasicBlock *BB,
474 SmallVector, 2> &Edges);
449475 /// Returns true if a block can tail duplicate into all unplaced
450476 /// predecessors. Filters based on loop.
451477 bool canTailDuplicateUnplacedPreds(
613639 return SuccProb;
614640 }
615641
616 /// Check if a block should be tail duplicated.
642 /// Check if \p BB has exactly the successors in \p Successors.
643 static bool
644 hasSameSuccessors(MachineBasicBlock &BB,
645 SmallPtrSetImpl &Successors) {
646 if (BB.succ_size() != Successors.size())
647 return false;
648 // We don't want to count self-loops
649 if (Successors.count(&BB))
650 return false;
651 for (MachineBasicBlock *Succ : BB.successors())
652 if (!Successors.count(Succ))
653 return false;
654 return true;
655 }
656
657 /// Check if a block should be tail duplicated to increase fallthrough
658 /// opportunities.
617659 /// \p BB Block to check.
618660 bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
619661 // Blocks with single successors don't create additional fallthrough
723765 // | / | C' (+Succ)
724766 // Succ Succ /|
725767 // / \ | \/ |
726 // U/ =V = /= =
768 // U/ =V | == |
727769 // / \ | / \|
728770 // D E D E
729771 // '=' : Branch taken for that CFG edge
730772 // Cost in the first case is: P + V
731773 // For this calculation, we always assume P > Qout. If Qout > P
732774 // The result of this function will be ignored at the caller.
733 // Cost in the second case is: Qout + Qin * V + P * U + P * V
734 // TODO(iteratee): If we lay out D after Succ, the P * U term
735 // goes away. This logic is coming in D28522.
775 // Cost in the second case is: Qout + Qin * U + P * V
736776
737777 if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
738778 BranchProbability UProb = BestSuccSucc;
739779 BranchProbability VProb = AdjustedSuccSumProb - UProb;
740780 BlockFrequency V = SuccFreq * VProb;
741 BlockFrequency QinV = Qin * VProb;
781 BlockFrequency QinU = Qin * UProb;
742782 BlockFrequency BaseCost = P + V;
743 BlockFrequency DupCost = Qout + QinV + P * AdjustedSuccSumProb;
783 BlockFrequency DupCost = Qout + QinU + P * VProb;
744784 return greaterWithBias(BaseCost, DupCost, EntryFreq);
745785 }
746786 BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
757797 // Succ Succ /| Succ Succ /|
758798 // | \ V | \/ | | \ V | \/ |
759799 // |U \ |U /\ | |U = |U /\ |
760 // = D = = =| | D | = =|
800 // = D = = \= | D | = =|
761801 // | / |/ D | / |/ D
762802 // | / | / | = | /
763803 // |/ | / |/ | =
767807 // The cost in the second case (assuming independence), given the layout:
768808 // BB, Succ, (C+Succ), D, Dom
769809 // is Qout + P * V + Qin * U
770 // compare P + U vs Qout + P + Qin * U.
810 // compare P + U vs Qout + P * U + Qin.
771811 //
772812 // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
773813 //
774814 // For the 3rd case, the cost is P + 2 * V
775815 // For the 4th case, the cost is Qout + Qin * U + P * V + V
776816 // We choose 4 over 3 when (P + V) > Qout + Qin * U + P * V
777 if (UProb > AdjustedSuccSumProb / 2
778 && !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom],
779 UProb, UProb, Chain, BlockFilter)) {
817 if (UProb > AdjustedSuccSumProb / 2 &&
818 !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
819 Chain, BlockFilter))
780820 // Cases 3 & 4
781821 return greaterWithBias((P + V), (Qout + Qin * UProb + P * VProb),
782822 EntryFreq);
783 }
784823 // Cases 1 & 2
785824 return greaterWithBias(
786 (P + U), (Qout + Qin * UProb + P * AdjustedSuccSumProb), EntryFreq);
787 }
788
825 (P + U), (Qout + Qin * AdjustedSuccSumProb + P * UProb), EntryFreq);
826 }
827
828 /// Check for a trellis layout. \p BB is the upper part of a trellis if its
829 /// successors form the lower part of a trellis. A successor set S forms the
830 /// lower part of a trellis if all of the predecessors of S are either in S or
831 /// have all of S as successors. We ignore trellises where BB doesn't have 2
832 /// successors because for fewer than 2, it's trivial, and for 3 or greater they
833 /// are very uncommon and complex to compute optimally. Allowing edges within S
834 /// is not strictly a trellis, but the same algorithm works, so we allow it.
835 bool MachineBlockPlacement::isTrellis(
836 const MachineBasicBlock *BB,
837 const SmallVectorImpl &ViableSuccs,
838 const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
839 // Technically BB could form a trellis with branching factor higher than 2.
840 // But that's extremely uncommon.
841 if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
842 return false;
843
844 SmallPtrSet Successors(BB->succ_begin(),
845 BB->succ_end());
846 // To avoid reviewing the same predecessors twice.
847 SmallPtrSet SeenPreds;
848
849 for (MachineBasicBlock *Succ : ViableSuccs) {
850 int PredCount = 0;
851 for (auto SuccPred : Succ->predecessors()) {
852 // Allow triangle successors, but don't count them.
853 if (Successors.count(SuccPred))
854 continue;
855 const BlockChain *PredChain = BlockToChain[SuccPred];
856 if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
857 PredChain == &Chain || PredChain == BlockToChain[Succ])
858 continue;
859 ++PredCount;
860 // Perform the successor check only once.
861 if (!SeenPreds.insert(SuccPred).second)
862 continue;
863 if (!hasSameSuccessors(*SuccPred, Successors))
864 return false;
865 }
866 // If one of the successors has only BB as a predecessor, it is not a
867 // trellis.
868 if (PredCount < 1)
869 return false;
870 }
871 return true;
872 }
873
874 /// Pick the highest total weight pair of edges that can both be laid out.
875 /// The edges in \p Edges[0] are assumed to have a different destination than
876 /// the edges in \p Edges[1]. Simple counting shows that the best pair is either
877 /// the individual highest weight edges to the 2 different destinations, or in
878 /// case of a conflict, one of them should be replaced with a 2nd best edge.
879 std::pair
880 MachineBlockPlacement::WeightedEdge>
881 MachineBlockPlacement::getBestNonConflictingEdges(
882 const MachineBasicBlock *BB,
883 SmallVector, 2>
884 &Edges) {
885 // Sort the edges, and then for each successor, find the best incoming
886 // predecessor. If the best incoming predecessors aren't the same,
887 // then that is clearly the best layout. If there is a conflict, one of the
888 // successors will have to fallthrough from the second best predecessor. We
889 // compare which combination is better overall.
890
891 // Sort for highest frequency.
892 auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
893
894 std::stable_sort(Edges[0].begin(), Edges[0].end(), Cmp);
895 std::stable_sort(Edges[1].begin(), Edges[1].end(), Cmp);
896 auto BestA = Edges[0].begin();
897 auto BestB = Edges[1].begin();
898 // Arrange for the correct answer to be in BestA and BestB
899 // If the 2 best edges don't conflict, the answer is already there.
900 if (BestA->Src == BestB->Src) {
901 // Compare the total fallthrough of (Best + Second Best) for both pairs
902 auto SecondBestA = std::next(BestA);
903 auto SecondBestB = std::next(BestB);
904 BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
905 BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
906 if (BestAScore < BestBScore)
907 BestA = SecondBestA;
908 else
909 BestB = SecondBestB;
910 }
911 // Arrange for the BB edge to be in BestA if it exists.
912 if (BestB->Src == BB)
913 std::swap(BestA, BestB);
914 return std::make_pair(*BestA, *BestB);
915 }
916
917 /// Get the best successor from \p BB based on \p BB being part of a trellis.
918 /// We only handle trellises with 2 successors, so the algorithm is
919 /// straightforward: Find the best pair of edges that don't conflict. We find
920 /// the best incoming edge for each successor in the trellis. If those conflict,
921 /// we consider which of them should be replaced with the second best.
922 /// Returns the chosen successor and whether it should be tail-duplicated; the
923 /// returned block is null when neither of the best edges comes from \p BB.
924 MachineBlockPlacement::BlockAndTailDupResult
925 MachineBlockPlacement::getBestTrellisSuccessor(
926 const MachineBasicBlock *BB,
927 const SmallVectorImpl &ViableSuccs,
928 BranchProbability AdjustedSumProb, const BlockChain &Chain,
929 const BlockFilterSet *BlockFilter) {
930
931 BlockAndTailDupResult Result = {nullptr, false};
932 SmallPtrSet Successors(BB->succ_begin(),
933 BB->succ_end());
934
935 // We assume size 2 because it's common. For general n, we would have to do
936 // the Hungarian algorithm, but it's not worth the complexity because more
937 // than 2 successors is fairly uncommon, and a trellis even more so.
938 if (Successors.size() != 2 || ViableSuccs.size() != 2)
939 return Result;
940
941 // Collect the edge frequencies of all edges that form the trellis.
942 SmallVector, 2> Edges(2);
943 int SuccIndex = 0;
944 for (auto Succ : ViableSuccs) {
945 for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
946 // Skip any placed predecessors that are not BB
947 if (SuccPred != BB)
948 if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
949 BlockToChain[SuccPred] == &Chain ||
950 BlockToChain[SuccPred] == BlockToChain[Succ])
951 continue;
952 BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
953 MBPI->getEdgeProbability(SuccPred, Succ);
954 Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
955 }
956 ++SuccIndex;
957 }
958
959 // Pick the best combination of 2 edges from all the edges in the trellis.
960 WeightedEdge BestA, BestB;
961 std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
962
963 if (BestA.Src != BB) {
964 // If we have a trellis, and BB doesn't have the best fallthrough edges,
965 // we shouldn't choose any successor. We've already looked and there's a
966 // better fallthrough edge for all the successors.
967 DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
968 return Result;
969 }
970
971 // Did we pick the triangle edge? If tail-duplication is profitable, do
972 // that instead. Otherwise merge the triangle edge now while we know it is
973 // optimal.
974 if (BestA.Dest == BestB.Src) {
975 // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
976 // would be better.
977 MachineBasicBlock *Succ1 = BestA.Dest;
978 MachineBasicBlock *Succ2 = BestB.Dest;
979 // Check to see if tail-duplication would be profitable.
980 if (TailDupPlacement && shouldTailDuplicate(Succ2) &&
981 canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
982 isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
983 Chain, BlockFilter)) {
984 DEBUG(BranchProbability Succ2Prob = getAdjustedProbability(
985 MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
986 dbgs() << " Selected: " << getBlockName(Succ2)
987 << ", probability: " << Succ2Prob << " (Tail Duplicate)\n");
988 Result.BB = Succ2;
989 Result.ShouldTailDup = true;
990 return Result;
991 }
992 }
993 // We have already computed the optimal edge for the other side of the
994 // trellis.
995 ComputedTrellisEdges[BestB.Src] = BestB.Dest;
996
997 auto TrellisSucc = BestA.Dest;
998 DEBUG(BranchProbability SuccProb = getAdjustedProbability(
999 MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
1000 dbgs() << " Selected: " << getBlockName(TrellisSucc)
1001 << ", probability: " << SuccProb << " (Trellis)\n");
1002 Result.BB = TrellisSucc;
1003 return Result;
1004 }
7891005
7901006 /// When the option TailDupPlacement is on, this method checks if the
7911007 /// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
7961012 if (!shouldTailDuplicate(Succ))
7971013 return false;
7981014
1015 // For CFG checking.
1016 SmallPtrSet Successors(BB->succ_begin(),
1017 BB->succ_end());
7991018 for (MachineBasicBlock *Pred : Succ->predecessors()) {
8001019 // Make sure all unplaced and unfiltered predecessors can be
8011020 // tail-duplicated into.
8031022 if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
8041023 || BlockToChain[Pred] == &Chain)
8051024 continue;
806 if (!TailDup.canTailDuplicate(Succ, Pred))
1025 if (!TailDup.canTailDuplicate(Succ, Pred)) {
1026 if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
1027 // This will result in a trellis after tail duplication, so we don't
1028 // need to copy Succ into this predecessor. In the presence
1029 // of a trellis tail duplication can continue to be profitable.
1030 // For example:
1031 // A A
1032 // |\ |\
1033 // | \ | \
1034 // | C | C+BB
1035 // | / | |
1036 // |/ | |
1037 // BB => BB |
1038 // |\ |\/|
1039 // | \ |/\|
1040 // | D | D
1041 // | / | /
1042 // |/ |/
1043 // Succ Succ
1044 //
1045 // After BB was duplicated into C, the layout looks like the one on the
1046 // right. BB and C now have the same successors. When considering
1047 // whether Succ can be duplicated into all its unplaced predecessors, we
1048 // ignore C.
1049 // We can do this because C already has a profitable fallthrough, namely
1050 // D. TODO(iteratee): ignore sufficiently cold predecessors for
1051 // duplication and for this test.
1052 //
1053 // This allows trellises to be laid out in 2 separate chains
1054 // (A,B,Succ,...) and later (C,D,...) This is a reasonable heuristic
1055 // because it allows the creation of 2 fallthrough paths with links
1056 // between them, and we correctly identify the best layout for these
1057 // CFGs. We want to extend trellises that the user created in addition
1058 // to trellises created by tail-duplication, so we just look for the
1059 // CFG.
1060 continue;
8071061 return false;
1062 }
8081063 }
8091064 return true;
8101065 }
9891244 // | Pred----| | S1----
9901245 // | | | |
9911246 // --(S1 or S2) ---Pred--
1247 // |
1248 // S2
9921249 //
9931250 // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
9941251 // + min(freq(Pred->S1), freq(Pred->S2))
9951252 // Non-topo-order cost:
996 // In the worst case, S2 will not get laid out after Pred.
9971253 // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
9981254 // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
9991255 // is 0. Then the non topo layout is better when
10711327 collectViableSuccessors(BB, Chain, BlockFilter, Successors);
10721328
10731329 DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB) << "\n");
1330
1331 // if we already precomputed the best successor for BB as part of a trellis we
1332 // saw earlier, return that if still applicable.
1333 auto FoundEdge = ComputedTrellisEdges.find(BB);
1334 if (FoundEdge != ComputedTrellisEdges.end()) {
1335 MachineBasicBlock *Succ = FoundEdge->second;
1336 ComputedTrellisEdges.erase(FoundEdge);
1337 BlockChain *SuccChain = BlockToChain[Succ];
1338 if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
1339 SuccChain != &Chain && Succ == *SuccChain->begin()) {
1340 BestSucc.BB = Succ;
1341 return BestSucc;
1342 }
1343 }
1344
1345 // If BB is part of a trellis, use the trellis to determine the optimal
1346 // fallthrough edges
1347 if (isTrellis(BB, Successors, Chain, BlockFilter))
1348 return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
1349 BlockFilter);
10741350
10751351 // For blocks with CFG violations, we may be able to lay them out anyway with
10761352 // tail-duplication. We keep this vector so we can perform the probability
55
66 ; CHECK-NEXT: ; BB#1: ; %b3
77 ; CHECK: ldr [[LOAD:w[0-9]+]]
8 ; CHECK: cbz [[LOAD]], [[SKIP_LONG_B:LBB[0-9]+_[0-9]+]]
9 ; CHECK-NEXT: b [[B8:LBB[0-9]+_[0-9]+]]
8 ; CHECK: cbnz [[LOAD]], [[B8:LBB[0-9]+_[0-9]+]]
9 ; CHECK-NEXT: b [[B7:LBB[0-9]+_[0-9]+]]
1010
11 ; CHECK-NEXT: [[SKIP_LONG_B]]:
12 ; CHECK-NEXT: b [[B7:LBB[0-9]+_[0-9]+]]
11 ; CHECK-NEXT: [[B8]]: ; %b8
12 ; CHECK-NEXT: ret
1313
1414 ; CHECK-NEXT: [[B2]]: ; %b2
1515 ; CHECK: mov w{{[0-9]+}}, #93
1616 ; CHECK: bl _extfunc
1717 ; CHECK: cbz w{{[0-9]+}}, [[B7]]
18
19 ; CHECK-NEXT: [[B8]]: ; %b8
20 ; CHECK-NEXT: ret
18 ; CHECK-NEXT: b [[B8]]
2119
2220 ; CHECK-NEXT: [[B7]]: ; %b7
2321 ; CHECK: mov w{{[0-9]+}}, #13
2422 ; CHECK: b _extfunc
23
2524 define void @split_block_no_fallthrough(i64 %val) #0 {
2625 bb:
2726 %c0 = icmp sgt i64 %val, -5
263263 define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 {
264264 ; CHECK-LABEL: do_nothing_if_resultant_opcodes_would_differ
265265 ; CHECK: cmn
266 ; CHECK: b.gt
266 ; CHECK: b.le
267267 ; CHECK: cmp
268268 ; CHECK: b.gt
269269 entry:
1010 ;
1111 ; CHECK-LABEL: func
1212 ; CHECK-NOT: and
13 ; CHECK: tbnz
13 ; CHECK: tbz
1414 define void @func() {
1515 %c0 = icmp sgt i64 0, 0
1616 br i1 %c0, label %b1, label %b6
77 ; GCNNOOPT: v_writelane_b32
88 ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
99
10
11 ; GCN: ; BB#1
1210 ; GCNNOOPT: v_readlane_b32
1311 ; GCNNOOPT: v_readlane_b32
1412 ; GCN: buffer_store_dword
15 ; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)
16 ; TODO: This waitcnt can be eliminated
13 ; GCNNOOPT: s_endpgm
1714
1815 ; GCN: {{^}}[[END]]:
1916 ; GCN: s_endpgm
490490
491491 ; GCN-LABEL: {{^}}long_branch_hang:
492492 ; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6
493 ; GCN-NEXT: s_cbranch_scc0 [[LONG_BR_0:BB[0-9]+_[0-9]+]]
493 ; GCN-NEXT: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}}
494 ; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]]
494495 ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:
495496
496497 ; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
11 ; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s
22
33 ; GCN-LABEL: {{^}}test_loop:
4 ; GCN: [[LABEL:BB[0-9+]_[0-9]+]]:
4 ; GCN: [[LABEL:BB[0-9+]_[0-9]+]]: ; %for.body{{$}}
55 ; GCN: ds_read_b32
66 ; GCN: ds_write_b32
77 ; GCN: s_branch [[LABEL]]
2828 ; GCN: v_cmp_ne_u32_e64
2929
3030 ; GCN: BB{{[0-9]+_[0-9]+}}:
31
3132 define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
3233 bb:
3334 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
438438 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
439439 ; GCN-NOHSA: buffer_store_dword [[ONE]]
440440 ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]]
441 ; GCN; {{^}}[[EXIT]]:
441 ; GCN: {{^}}[[EXIT]]:
442442 ; GCN: s_endpgm
443443 define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
444444 bb3: ; preds = %bb2
1111 ; CHECK: bl _quux
1212 ; CHECK-NOT: bl _quux
1313
14 ; NOMERGE: bl _baz
15 ; NOMERGE: bl _baz
14 ; NOMERGE-DAG: bl _baz
15 ; NOMERGE-DAG: bl _baz
1616
17 ; NOMERGE: bl _quux
18 ; NOMERGE: bl _quux
17 ; NOMERGE-DAG: bl _quux
18 ; NOMERGE-DAG: bl _quux
1919
2020 ; ModuleID = 'tail.c'
2121 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
6565 ; CHECK-ARMV7-NEXT: [[HEAD:.LBB[0-9_]+]]:
6666 ; CHECK-ARMV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0]
6767 ; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], #0
68 ; CHECK-ARMV7-NEXT: moveq [[RES:r[0-9]+]], #1
68 ; CHECK-ARMV7-NEXT: moveq r0, #1
6969 ; CHECK-ARMV7-NEXT: bxeq lr
7070 ; CHECK-ARMV7-NEXT: [[TRY]]:
71 ; CHECK-ARMV7-NEXT: ldrexb [[LD:r[0-9]+]], [r0]
72 ; CHECK-ARMV7-NEXT: cmp [[LD]], [[DESIRED]]
71 ; CHECK-ARMV7-NEXT: ldrexb [[SUCCESS]], [r0]
72 ; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], r1
7373 ; CHECK-ARMV7-NEXT: beq [[HEAD]]
7474 ; CHECK-ARMV7-NEXT: clrex
75 ; CHECK-ARMV7-NEXT: mov [[RES]], #0
75 ; CHECK-ARMV7-NEXT: mov r0, #0
7676 ; CHECK-ARMV7-NEXT: bx lr
7777
7878 ; CHECK-THUMBV7-LABEL: test_cmpxchg_res_i8:
134134
135135 ; Important to check for beginning of basic block, because if it gets
136136 ; if-converted the test is probably no longer checking what it should.
137 ; CHECK: {{LBB[0-9]+_2}}:
137 ; CHECK: %end
138138 ; CHECK-NEXT: vpop {d7, d8}
139139 ; CHECK-NEXT: pop {r4, pc}
140140
1515 ;CHECK-NEXT: bc 12, 1, [[BODY1LABEL:[._0-9A-Za-z]+]]
1616 ;CHECK-NEXT: # %test2
1717 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
18 ;CHECK-NEXT: beq 0, [[EXITLABEL:[._0-9A-Za-z]+]]
19 ;CHECK-NEXT: b [[BODY2LABEL:[._0-9A-Za-z]+]]
18 ;CHECK-NEXT: bne 0, [[BODY2LABEL:[._0-9A-Za-z]+]]
19 ;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
20 ;CHECK: blr
2021 ;CHECK-NEXT: [[BODY1LABEL]]
2122 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
2223 ;CHECK-NEXT: beq 0, [[EXITLABEL]]
23 ;CHECK-NEXT: [[BODY2LABEL]]
24 ;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
25 ;CHECK: blr
24 ;CHECK-NEXT: [[BODY2LABEL:[._0-9A-Za-z]+]]:
25 ;CHECK: b [[EXITLABEL]]
2626 define void @tail_dup_break_cfg(i32 %tag) {
2727 entry:
2828 br label %test1
7878 test2:
7979 %tagbit2 = and i32 %tag, 2
8080 %tagbit2eq0 = icmp ne i32 %tagbit2, 0
81 br i1 %tagbit2eq0, label %body2, label %exit, !prof !1 ; %body2 more likely
81 br i1 %tagbit2eq0, label %body2, label %exit, !prof !3 ; %body2 more likely
8282 body2:
8383 call void @b()
8484 call void @b()
136136
137137 !1 = !{!"branch_weights", i32 5, i32 3}
138138 !2 = !{!"branch_weights", i32 95, i32 5}
139 !3 = !{!"branch_weights", i32 7, i32 3}
139 !3 = !{!"branch_weights", i32 8, i32 3}
None ; RUN: llc -outline-optional-branches -O2 < %s | FileCheck %s
0 ; RUN: llc -O2 < %s | FileCheck %s
11 target datalayout = "e-m:e-i64:64-n32:64"
22 target triple = "powerpc64le-grtev4-linux-gnu"
33
44 ; Intended layout:
5 ; The outlining flag produces the layout
5 ; The chain-based outlining produces the layout
66 ; test1
77 ; test2
88 ; test3
99 ; test4
10 ; exit
1110 ; optional1
1211 ; optional2
1312 ; optional3
1413 ; optional4
14 ; exit
1515 ; Tail duplication puts test n+1 at the end of optional n
1616 ; so optional1 includes a copy of test2 at the end, and branches
1717 ; to test3 (at the top) or falls through to optional 2.
18 ; The CHECK statements check for the whole string of tests and exit block,
18 ; The CHECK statements check for the whole string of tests
1919 ; and then check that the correct test has been duplicated into the end of
2020 ; the optional blocks and that the optional blocks are in the correct order.
21 ;CHECK-LABEL: f:
21 ;CHECK-LABEL: straight_test:
2222 ; test1 may have been merged with entry
2323 ;CHECK: mr [[TAGREG:[0-9]+]], 3
2424 ;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
25 ;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]]
26 ;CHECK-NEXT: [[TEST2LABEL:[._0-9A-Za-z]+]]: # %test2
25 ;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[_0-9A-Za-z]+]]
26 ;CHECK-NEXT: # %test2
2727 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
28 ;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]]
29 ;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3
28 ;CHECK-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]]
29 ;CHECK-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3
3030 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
31 ;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]]
32 ;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4
31 ;CHECK-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]]
32 ;CHECK-NEXT: .[[TEST4LABEL:[_0-9A-Za-z]+]]: # %test4
3333 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
34 ;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]]
35 ;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
34 ;CHECK-NEXT: bne 0, .[[OPT4LABEL:[_0-9A-Za-z]+]]
35 ;CHECK-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit
3636 ;CHECK: blr
37 ;CHECK-NEXT: [[OPT1LABEL]]
37 ;CHECK-NEXT: .[[OPT1LABEL]]:
3838 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
39 ;CHECK-NEXT: beq 0, [[TEST3LABEL]]
40 ;CHECK-NEXT: [[OPT2LABEL]]
39 ;CHECK-NEXT: beq 0, .[[TEST3LABEL]]
40 ;CHECK-NEXT: .[[OPT2LABEL]]:
4141 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
42 ;CHECK-NEXT: beq 0, [[TEST4LABEL]]
43 ;CHECK-NEXT: [[OPT3LABEL]]
42 ;CHECK-NEXT: beq 0, .[[TEST4LABEL]]
43 ;CHECK-NEXT: .[[OPT3LABEL]]:
4444 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
45 ;CHECK-NEXT: beq 0, [[EXITLABEL]]
46 ;CHECK-NEXT: [[OPT4LABEL]]
47 ;CHECK: b [[EXITLABEL]]
48
49 define void @f(i32 %tag) {
45 ;CHECK-NEXT: beq 0, .[[EXITLABEL]]
46 ;CHECK-NEXT: .[[OPT4LABEL]]:
47 ;CHECK: b .[[EXITLABEL]]
48
49 define void @straight_test(i32 %tag) {
5050 entry:
5151 br label %test1
5252 test1:
5353 %tagbit1 = and i32 %tag, 1
5454 %tagbit1eq0 = icmp eq i32 %tagbit1, 0
55 br i1 %tagbit1eq0, label %test2, label %optional1
55 br i1 %tagbit1eq0, label %test2, label %optional1, !prof !1
5656 optional1:
5757 call void @a()
5858 call void @a()
6262 test2:
6363 %tagbit2 = and i32 %tag, 2
6464 %tagbit2eq0 = icmp eq i32 %tagbit2, 0
65 br i1 %tagbit2eq0, label %test3, label %optional2
65 br i1 %tagbit2eq0, label %test3, label %optional2, !prof !1
6666 optional2:
6767 call void @b()
6868 call void @b()
7272 test3:
7373 %tagbit3 = and i32 %tag, 4
7474 %tagbit3eq0 = icmp eq i32 %tagbit3, 0
75 br i1 %tagbit3eq0, label %test4, label %optional3
75 br i1 %tagbit3eq0, label %test4, label %optional3, !prof !1
7676 optional3:
7777 call void @c()
7878 call void @c()
8282 test4:
8383 %tagbit4 = and i32 %tag, 8
8484 %tagbit4eq0 = icmp eq i32 %tagbit4, 0
85 br i1 %tagbit4eq0, label %exit, label %optional4
85 br i1 %tagbit4eq0, label %exit, label %optional4, !prof !1
8686 optional4:
8787 call void @d()
8888 call void @d()
9090 call void @d()
9191 br label %exit
9292 exit:
93 ret void
94 }
95
96 ; Intended layout:
97 ; The chain-based outlining produces the layout
98 ; entry
99 ; --- Begin loop ---
100 ; for.latch
101 ; for.check
102 ; test1
103 ; test2
104 ; test3
105 ; test4
106 ; optional1
107 ; optional2
108 ; optional3
109 ; optional4
110 ; --- End loop ---
111 ; exit
112 ; The CHECK statements check for the whole string of tests and exit block,
113 ; and then check that the correct test has been duplicated into the end of
114 ; the optional blocks and that the optional blocks are in the correct order.
115 ;CHECK-LABEL: loop_test:
116 ;CHECK: add [[TAGPTRREG:[0-9]+]], 3, 4
117 ;CHECK: .[[LATCHLABEL:[._0-9A-Za-z]+]]: # %for.latch
118 ;CHECK: addi
119 ;CHECK: .[[CHECKLABEL:[._0-9A-Za-z]+]]: # %for.check
120 ;CHECK: lwz [[TAGREG:[0-9]+]], 0([[TAGPTRREG]])
121 ;CHECK: # %test1
122 ;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
123 ;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[._0-9A-Za-z]+]]
124 ;CHECK-NEXT: # %test2
125 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
126 ;CHECK-NEXT: bne 0, .[[OPT2LABEL:[._0-9A-Za-z]+]]
127 ;CHECK-NEXT: .[[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3
128 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
129 ;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]]
130 ;CHECK-NEXT: .[[TEST4LABEL:[._0-9A-Za-z]+]]: # %{{(test4|optional3)}}
131 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
132 ;CHECK-NEXT: beq 0, .[[LATCHLABEL]]
133 ;CHECK-NEXT: b .[[OPT4LABEL:[._0-9A-Za-z]+]]
134 ;CHECK: [[OPT1LABEL]]
135 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
136 ;CHECK-NEXT: beq 0, .[[TEST3LABEL]]
137 ;CHECK-NEXT: .[[OPT2LABEL]]
138 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
139 ;CHECK-NEXT: beq 0, .[[TEST4LABEL]]
140 ;CHECK-NEXT: .[[OPT3LABEL]]
141 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
142 ;CHECK-NEXT: beq 0, .[[LATCHLABEL]]
143 ;CHECK: [[OPT4LABEL]]:
144 ;CHECK: b .[[LATCHLABEL]]
145 define void @loop_test(i32* %tags, i32 %count) {
146 entry:
147 br label %for.check
148 for.check:
149 %count.loop = phi i32 [%count, %entry], [%count.sub, %for.latch]
150 %done.count = icmp ugt i32 %count.loop, 0
151 %tag_ptr = getelementptr inbounds i32, i32* %tags, i32 %count
152 %tag = load i32, i32* %tag_ptr
153 %done.tag = icmp eq i32 %tag, 0
154 %done = and i1 %done.count, %done.tag
155 br i1 %done, label %test1, label %exit, !prof !1
156 test1:
157 %tagbit1 = and i32 %tag, 1
158 %tagbit1eq0 = icmp eq i32 %tagbit1, 0
159 br i1 %tagbit1eq0, label %test2, label %optional1, !prof !1
160 optional1:
161 call void @a()
162 call void @a()
163 call void @a()
164 call void @a()
165 br label %test2
166 test2:
167 %tagbit2 = and i32 %tag, 2
168 %tagbit2eq0 = icmp eq i32 %tagbit2, 0
169 br i1 %tagbit2eq0, label %test3, label %optional2, !prof !1
170 optional2:
171 call void @b()
172 call void @b()
173 call void @b()
174 call void @b()
175 br label %test3
176 test3:
177 %tagbit3 = and i32 %tag, 4
178 %tagbit3eq0 = icmp eq i32 %tagbit3, 0
179 br i1 %tagbit3eq0, label %test4, label %optional3, !prof !1
180 optional3:
181 call void @c()
182 call void @c()
183 call void @c()
184 call void @c()
185 br label %test4
186 test4:
187 %tagbit4 = and i32 %tag, 8
188 %tagbit4eq0 = icmp eq i32 %tagbit4, 0
189 br i1 %tagbit4eq0, label %for.latch, label %optional4, !prof !1
190 optional4:
191 call void @d()
192 call void @d()
193 call void @d()
194 call void @d()
195 br label %for.latch
196 for.latch:
197 %count.sub = sub i32 %count.loop, 1
198 br label %for.check
199 exit:
200 ret void
201 }
202
203 ; The block then2 is not unavoidable, meaning it does not dominate the exit.
204 ; But since it can be tail-duplicated, it should be placed as a fallthrough from
205 ; test2 and copied. The purpose here is to make sure that the tail-duplication
206 ; code is independent of the outlining code, which works by choosing the
207 ; "unavoidable" blocks.
208 ; CHECK-LABEL: avoidable_test:
209 ; CHECK: # %entry
210 ; CHECK: andi.
211 ; CHECK: # %test2
212 ; Make sure then2 falls through from test2
213 ; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}}
214 ; CHECK: # %then2
215 ; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29
216 ; CHECK: # %else1
217 ; CHECK: bl a
218 ; CHECK: bl a
219 ; Make sure then2 was copied into else1
220 ; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29
221 ; CHECK: # %end1
222 ; CHECK: bl d
223 ; CHECK: # %else2
224 ; CHECK: bl c
225 ; CHECK: # %end2
226 define void @avoidable_test(i32 %tag) {
227 entry:
228 br label %test1
229 test1:
230 %tagbit1 = and i32 %tag, 1
231 %tagbit1eq0 = icmp eq i32 %tagbit1, 0
232 br i1 %tagbit1eq0, label %test2, label %else1, !prof !1 ; %test2 more likely
233 else1:
234 call void @a()
235 call void @a()
236 br label %then2
237 test2:
238 %tagbit2 = and i32 %tag, 2
239 %tagbit2eq0 = icmp eq i32 %tagbit2, 0
240 br i1 %tagbit2eq0, label %then2, label %else2, !prof !1 ; %then2 more likely
241 then2:
242 %tagbit3 = and i32 %tag, 4
243 %tagbit3eq0 = icmp eq i32 %tagbit3, 0
244 br i1 %tagbit3eq0, label %end2, label %end1, !prof !1 ; %end2 more likely
245 else2:
246 call void @c()
247 br label %end2
248 end2:
249 ret void
250 end1:
251 call void @d()
252 ret void
253 }
254
255 ; CHECK-LABEL: trellis_test
256 ; The number in the block labels is the expected block frequency given the
257 ; probabilities annotated. There is a conflict in the b;c->d;e trellis that
258 ; should be resolved as c->e;b->d.
259 ; The d;e->f;g trellis should be resolved as e->g;d->f.
260 ; The f;g->h;i trellis should be resolved as f->i;g->h.
261 ; The h;i->j;ret trellis contains a triangle edge, and should be resolved as
262 ; h->j->ret
263 ; CHECK: # %entry
264 ; CHECK: # %c10
265 ; CHECK: # %e9
266 ; CHECK: # %g10
267 ; CHECK: # %h10
268 ; CHECK: # %j8
269 ; CHECK: # %ret
270 ; CHECK: # %b6
271 ; CHECK: # %d7
272 ; CHECK: # %f6
273 ; CHECK: # %i6
274 define void @trellis_test(i32 %tag) {
275 entry:
276 br label %a16
277 a16:
278 call void @a()
279 call void @a()
280 %tagbits.a = and i32 %tag, 3
281 %tagbits.a.eq0 = icmp eq i32 %tagbits.a, 0
282 br i1 %tagbits.a.eq0, label %c10, label %b6, !prof !1 ; 10 to 6
283 c10:
284 call void @c()
285 call void @c()
286 %tagbits.c = and i32 %tag, 12
287 %tagbits.c.eq0 = icmp eq i32 %tagbits.c, 0
288 ; Both of these edges should be hotter than the other incoming edge
289 ; for e9 or d7
290 br i1 %tagbits.c.eq0, label %e9, label %d7, !prof !3 ; 6 to 4
291 e9:
292 call void @e()
293 call void @e()
294 %tagbits.e = and i32 %tag, 48
295 %tagbits.e.eq0 = icmp eq i32 %tagbits.e, 0
296 br i1 %tagbits.e.eq0, label %g10, label %f6, !prof !4 ; 7 to 2
297 g10:
298 call void @g()
299 call void @g()
300 %tagbits.g = and i32 %tag, 192
301 %tagbits.g.eq0 = icmp eq i32 %tagbits.g, 0
302 br i1 %tagbits.g.eq0, label %i6, label %h10, !prof !5 ; 2 to 8
303 i6:
304 call void @i()
305 call void @i()
306 %tagbits.i = and i32 %tag, 768
307 %tagbits.i.eq0 = icmp eq i32 %tagbits.i, 0
308 br i1 %tagbits.i.eq0, label %ret, label %j8, !prof !2 ; balanced (3 to 3)
309 b6:
310 call void @b()
311 call void @b()
312 %tagbits.b = and i32 %tag, 12
313 %tagbits.b.eq1 = icmp eq i32 %tagbits.b, 8
314 br i1 %tagbits.b.eq1, label %e9, label %d7, !prof !2 ; balanced (3 to 3)
315 d7:
316 call void @d()
317 call void @d()
318 %tagbits.d = and i32 %tag, 48
319 %tagbits.d.eq1 = icmp eq i32 %tagbits.d, 32
320 br i1 %tagbits.d.eq1, label %g10, label %f6, !prof !6 ; 3 to 4
321 f6:
322 call void @f()
323 call void @f()
324 %tagbits.f = and i32 %tag, 192
325 %tagbits.f.eq1 = icmp eq i32 %tagbits.f, 128
326 br i1 %tagbits.f.eq1, label %i6, label %h10, !prof !7 ; 4 to 2
327 h10:
328 call void @h()
329 call void @h()
330 %tagbits.h = and i32 %tag, 768
331 %tagbits.h.eq1 = icmp eq i32 %tagbits.h, 512
332 br i1 %tagbits.h.eq1, label %ret, label %j8, !prof !2 ; balanced (5 to 5)
333 j8:
334 call void @j()
335 call void @j()
336 br label %ret
337 ret:
338 ret void
339 }
340
341 ; Verify that we still consider tail-duplication opportunities if we find a
342 ; triangle trellis. Here D->F->G is the triangle, and D;E are both predecessors
343 ; of both F and G. The basic trellis algorithm picks the F->G edge, but after
344 ; checking, it's profitable to duplicate G into F. The weights here are not
345 ; really important. They are there to help make the test stable.
346 ; CHECK-LABEL: trellis_then_dup_test
347 ; CHECK: # %entry
348 ; CHECK: # %b
349 ; CHECK: # %d
350 ; CHECK: # %g
351 ; CHECK: # %ret1
352 ; CHECK: # %c
353 ; CHECK: # %e
354 ; CHECK: # %f
355 ; CHECK: # %ret2
356 ; CHECK: # %ret
357 define void @trellis_then_dup_test(i32 %tag) {
358 entry:
359 br label %a
360 a:
361 call void @a()
362 call void @a()
363 %tagbits.a = and i32 %tag, 3
364 %tagbits.a.eq0 = icmp eq i32 %tagbits.a, 0
365 br i1 %tagbits.a.eq0, label %b, label %c, !prof !1 ; 5 to 3
366 b:
367 call void @b()
368 call void @b()
369 %tagbits.b = and i32 %tag, 12
370 %tagbits.b.eq1 = icmp eq i32 %tagbits.b, 8
371 br i1 %tagbits.b.eq1, label %d, label %e, !prof !1 ; 5 to 3
372 d:
373 call void @d()
374 call void @d()
375 %tagbits.d = and i32 %tag, 48
376 %tagbits.d.eq1 = icmp eq i32 %tagbits.d, 32
377 br i1 %tagbits.d.eq1, label %g, label %f, !prof !1 ; 5 to 3
378 f:
379 call void @f()
380 call void @f()
381 br label %g
382 g:
383 %tagbits.g = and i32 %tag, 192
384 %tagbits.g.eq0 = icmp eq i32 %tagbits.g, 0
385 br i1 %tagbits.g.eq0, label %ret1, label %ret2, !prof !2 ; balanced
386 c:
387 call void @c()
388 call void @c()
389 %tagbits.c = and i32 %tag, 12
390 %tagbits.c.eq0 = icmp eq i32 %tagbits.c, 0
391 br i1 %tagbits.c.eq0, label %d, label %e, !prof !1 ; 5 to 3
392 e:
393 call void @e()
394 call void @e()
395 %tagbits.e = and i32 %tag, 48
396 %tagbits.e.eq0 = icmp eq i32 %tagbits.e, 0
397 br i1 %tagbits.e.eq0, label %g, label %f, !prof !1 ; 5 to 3
398 ret1:
399 call void @a()
400 br label %ret
401 ret2:
402 call void @b()
403 br label %ret
404 ret:
93405 ret void
94406 }
95407
97409 declare void @b()
98410 declare void @c()
99411 declare void @d()
412 declare void @e()
413 declare void @f()
414 declare void @g()
415 declare void @h()
416 declare void @i()
417 declare void @j()
418
419 !1 = !{!"branch_weights", i32 5, i32 3}
420 !2 = !{!"branch_weights", i32 50, i32 50}
421 !3 = !{!"branch_weights", i32 6, i32 4}
422 !4 = !{!"branch_weights", i32 7, i32 2}
423 !5 = !{!"branch_weights", i32 2, i32 8}
424 !6 = !{!"branch_weights", i32 3, i32 4}
425 !7 = !{!"branch_weights", i32 4, i32 2}
6666 ; CHECK: nop
6767 ; CHECK:.LBB1_1: ! %entry
6868 ; CHECK: mov %g0, %i0
69 ; CHECK: ! %entry
70 ; CHECK: cmp %i0, 0
71 ; CHECK: be .LBB1_5
72 ; CHECK: nop
73 ; CHECK:.LBB1_4:
74 ; CHECK: mov 1, %i0
75 ; CHECK: ba .LBB1_6
76 ; CHECK:.LBB1_2: ! Block address taken
77 ; CHECK: mov 1, %i0
6978 ; CHECK: cmp %i0, 0
7079 ; CHECK: bne .LBB1_4
71 ; CHECK: ba .LBB1_5
72 ; CHECK:.LBB1_2: ! Block address taken
73 ; CHECK: mov 1, %i0
74 ; CHECK: be .LBB1_5
75 ; CHECK:.LBB1_4:
76 ; CHECK: ba .LBB1_6
80 ; CHECK: nop
7781 }
7882 declare i8* @llvm.frameaddress(i32) #2
7983
None ; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0| FileCheck %s
0 ; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 | FileCheck %s
11
22 ; Test memcpy, memmove, and memset intrinsics.
33
313313 define void @unnatural_cfg1() {
314314 ; Test that we can handle a loop with an inner unnatural loop at the end of
315315 ; a function. This is a gross CFG reduced out of the single source GCC.
316 ; CHECK: unnatural_cfg1
316 ; CHECK-LABEL: unnatural_cfg1
317317 ; CHECK: %entry
318318 ; CHECK: %loop.body1
319319 ; CHECK: %loop.body2
351351 ; Test that we can handle a loop with a nested natural loop *and* an unnatural
352352 ; loop. This was reduced from a crash on block placement when run over
353353 ; single-source GCC.
354 ; CHECK: unnatural_cfg2
354 ; CHECK-LABEL: unnatural_cfg2
355355 ; CHECK: %entry
356356 ; CHECK: %loop.body1
357357 ; CHECK: %loop.body2
358 ; CHECK: %loop.body4
359 ; CHECK: %loop.inner2.begin
360 ; CHECK: %loop.inner2.begin
358361 ; CHECK: %loop.body3
359362 ; CHECK: %loop.inner1.begin
360 ; The end block is folded with %loop.body3...
361 ; CHECK-NOT: %loop.inner1.end
362 ; CHECK: %loop.body4
363 ; CHECK: %loop.inner2.begin
364 ; The loop.inner2.end block is folded
365363 ; CHECK: %loop.header
366364 ; CHECK: %bail
367365
558556 ; didn't correctly locate the fallthrough successor, assuming blindly that the
559557 ; first one was the fallthrough successor. As a result, we would add an
560558 ; erroneous jump to the landing pad thinking *that* was the default successor.
561 ; CHECK: test_eh_lpad_successor
559 ; CHECK-LABEL: test_eh_lpad_successor
562560 ; CHECK: %entry
563561 ; CHECK-NOT: jmp
564562 ; CHECK: %loop
586584 ; fallthrough simply won't occur. Make sure we don't crash trying to update
587585 ; terminators for such constructs.
588586 ;
589 ; CHECK: test_eh_throw
587 ; CHECK-LABEL: test_eh_throw
590588 ; CHECK: %entry
591589 ; CHECK: %cleanup
592590
608606 ; attempt to merge onto the wrong end of the inner loop just because we find it
609607 ; first. This was reduced from a crasher in GCC's single source.
610608 ;
611 ; CHECK: test_unnatural_cfg_backwards_inner_loop
609 ; CHECK-LABEL: test_unnatural_cfg_backwards_inner_loop
612610 ; CHECK: %entry
613611 ; CHECK: %loop2b
614612 ; CHECK: %loop1
648646 ; fallthrough because that happens to always produce unanalyzable branches on
649647 ; x86.
650648 ;
651 ; CHECK: unanalyzable_branch_to_loop_header
649 ; CHECK-LABEL: unanalyzable_branch_to_loop_header
652650 ; CHECK: %entry
653651 ; CHECK: %loop
654652 ; CHECK: %exit
672670 ; This branch is now analyzable and hence the destination block becomes the
673671 ; hotter one. The right order is entry->bar->exit->foo.
674672 ;
675 ; CHECK: unanalyzable_branch_to_best_succ
673 ; CHECK-LABEL: unanalyzable_branch_to_best_succ
676674 ; CHECK: %entry
677675 ; CHECK: %bar
678676 ; CHECK: %exit
698696 ; Ensure that we can handle unanalyzable branches where the destination block
699697 ; gets selected as the best free block in the CFG.
700698 ;
701 ; CHECK: unanalyzable_branch_to_free_block
699 ; CHECK-LABEL: unanalyzable_branch_to_free_block
702700 ; CHECK: %entry
703701 ; CHECK: %a
704702 ; CHECK: %b
728726 ; Ensure that we don't crash as we're building up many unanalyzable branches,
729727 ; blocks, and loops.
730728 ;
731 ; CHECK: many_unanalyzable_branches
729 ; CHECK-LABEL: many_unanalyzable_branches
732730 ; CHECK: %entry
733731 ; CHECK: %exit
734732
947945 ; strange layouts that are significantly less efficient, often times making
948946 ; it discontiguous.
949947 ;
950 ; CHECK: @benchmark_heapsort
948 ; CHECK-LABEL: @benchmark_heapsort
951949 ; CHECK: %entry
952950 ; First rotated loop top.
953951 ; CHECK: .p2align
9494 ; CHECK-NEXT: idivl %ebx
9595 ; CHECK-NEXT: movl %eax, %esi
9696 ; CHECK-NEXT: testl $-256, %edi
97 ; CHECK-NEXT: jne .LBB3_5
98 ; CHECK-NEXT: jmp .LBB3_4
97 ; CHECK-NEXT: je .LBB3_4
98 ; CHECK-NEXT: .LBB3_5:
99 ; CHECK-NEXT: xorl %edx, %edx
100 ; CHECK-NEXT: movl %ecx, %eax
101 ; CHECK-NEXT: divl %ebx
102 ; CHECK-NEXT: jmp .LBB3_6
99103 ; CHECK-NEXT: .LBB3_1:
100104 ; CHECK-NEXT: movzbl %cl, %eax
101105 ; CHECK-NEXT: # kill: %EAX %EAX %AX
102106 ; CHECK-NEXT: divb %bl
103107 ; CHECK-NEXT: movzbl %al, %esi
104108 ; CHECK-NEXT: testl $-256, %edi
105 ; CHECK-NEXT: je .LBB3_4
106 ; CHECK-NEXT: .LBB3_5:
107 ; CHECK-NEXT: xorl %edx, %edx
108 ; CHECK-NEXT: movl %ecx, %eax
109 ; CHECK-NEXT: divl %ebx
110 ; CHECK-NEXT: jmp .LBB3_6
109 ; CHECK-NEXT: jne .LBB3_5
111110 ; CHECK-NEXT: .LBB3_4:
112111 ; CHECK-NEXT: movzbl %cl, %eax
113112 ; CHECK-NEXT: # kill: %EAX %EAX %AX
5959 ; X32-NEXT: xorps %xmm1, %xmm1
6060 ; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
6161 ; X32-NEXT: jne .LBB1_5
62 ; X32-NEXT: jmp .LBB1_4
62 ; X32-NEXT: .LBB1_4:
63 ; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
64 ; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
65 ; X32-NEXT: jne .LBB1_8
66 ; X32-NEXT: .LBB1_7:
67 ; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
68 ; X32-NEXT: jmp .LBB1_9
6369 ; X32-NEXT: .LBB1_1:
6470 ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
6571 ; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
6773 ; X32-NEXT: .LBB1_5: # %entry
6874 ; X32-NEXT: xorps %xmm2, %xmm2
6975 ; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
70 ; X32-NEXT: jne .LBB1_8
71 ; X32-NEXT: jmp .LBB1_7
72 ; X32-NEXT: .LBB1_4:
73 ; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
74 ; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
7576 ; X32-NEXT: je .LBB1_7
7677 ; X32-NEXT: .LBB1_8: # %entry
7778 ; X32-NEXT: xorps %xmm3, %xmm3
78 ; X32-NEXT: jmp .LBB1_9
79 ; X32-NEXT: .LBB1_7:
80 ; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
8179 ; X32-NEXT: .LBB1_9: # %entry
8280 ; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
8381 ; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
9896 ; X64-NEXT: xorps %xmm1, %xmm1
9997 ; X64-NEXT: testl %edx, %edx
10098 ; X64-NEXT: jne .LBB1_5
101 ; X64-NEXT: jmp .LBB1_4
99 ; X64-NEXT: .LBB1_4:
100 ; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
101 ; X64-NEXT: testl %r8d, %r8d
102 ; X64-NEXT: jne .LBB1_8
103 ; X64-NEXT: .LBB1_7:
104 ; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
105 ; X64-NEXT: jmp .LBB1_9
102106 ; X64-NEXT: .LBB1_1:
103107 ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
104108 ; X64-NEXT: testl %edx, %edx
106110 ; X64-NEXT: .LBB1_5: # %entry
107111 ; X64-NEXT: xorps %xmm2, %xmm2
108112 ; X64-NEXT: testl %r8d, %r8d
109 ; X64-NEXT: jne .LBB1_8
110 ; X64-NEXT: jmp .LBB1_7
111 ; X64-NEXT: .LBB1_4:
112 ; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
113 ; X64-NEXT: testl %r8d, %r8d
114113 ; X64-NEXT: je .LBB1_7
115114 ; X64-NEXT: .LBB1_8: # %entry
116115 ; X64-NEXT: xorps %xmm3, %xmm3
117 ; X64-NEXT: jmp .LBB1_9
118 ; X64-NEXT: .LBB1_7:
119 ; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
120116 ; X64-NEXT: .LBB1_9: # %entry
121117 ; X64-NEXT: testl %esi, %esi
122118 ; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
214210 ret <4 x i32> %zext
215211 }
216212
217 ; Fragile test warning - we need to induce the generation of a vselect
213 ; Fragile test warning - we need to induce the generation of a vselect
218214 ; post-legalization to cause the crash seen in:
219215 ; https://llvm.org/bugs/show_bug.cgi?id=31672
220216 ; Is there a way to do that without an unsafe/fast sqrt intrinsic call?
55 ; CHECK-LABEL: tail_dup_merge_loops
66 ; CHECK: # %entry
77 ; CHECK-NOT: # %{{[a-zA-Z_]+}}
8 ; CHECK: # %exit
9 ; CHECK-NOT: # %{{[a-zA-Z_]+}}
810 ; CHECK: # %inner_loop_exit
911 ; CHECK-NOT: # %{{[a-zA-Z_]+}}
1012 ; CHECK: # %inner_loop_latch
1113 ; CHECK-NOT: # %{{[a-zA-Z_]+}}
1214 ; CHECK: # %inner_loop_test
13 ; CHECK-NOT: # %{{[a-zA-Z_]+}}
14 ; CHECK: # %exit
1515 define void @tail_dup_merge_loops(i32 %a, i8* %b, i8* %c) local_unnamed_addr #0 {
1616 entry:
1717 %notlhs674.i = icmp eq i32 %a, 0
None ; RUN: llc -O2 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s
0 ; RUN: llc -O3 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s
11 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
22 target triple = "x86_64-unknown-linux-gnu"
33
112112 ; CHECK-NEXT: jbe .LBB2_3
113113 ; CHECK-NEXT: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
114114 ; CHECK-NEXT: ja .LBB2_4
115 ; CHECK-NEXT: jmp .LBB2_2
115 ; CHECK-NEXT: .LBB2_2:
116 ; CHECK-NEXT: movb $1, %al
117 ; CHECK-NEXT: ret
116118 ; CHECK-NEXT: .LBB2_3:
117119 ; CHECK-NEXT: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
118120 ; CHECK-NEXT: jbe .LBB2_2
119121 ; CHECK-NEXT: .LBB2_4:
120122 ; CHECK-NEXT: xorl %eax, %eax
121 ; CHECK-NEXT: ret
122 ; CHECK-NEXT: .LBB2_2:
123 ; CHECK-NEXT: movb $1, %al
124123 ; CHECK-NEXT: ret
125124
126125 define i1 @dont_merge_oddly(float* %result) nounwind {
1818
1919 ; Check that only one mov will be generated in the kernel loop.
2020 ; CHECK-LABEL: foo:
21 ; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body
21 ; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}}
2222 ; CHECK-NOT: mov
2323 ; CHECK: movl {{.*}}, [[REG1:%[a-z0-9]+]]
2424 ; CHECK-NOT: mov
5555
5656 ; Check that only two movs will be generated in the kernel loop.
5757 ; CHECK-LABEL: goo:
58 ; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body
58 ; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}}
5959 ; CHECK-NOT: mov
6060 ; CHECK: movl {{.*}}, [[REG2:%[a-z0-9]+]]
6161 ; CHECK-NOT: mov
114114 ; Test that the blocks are analyzed in the correct order.
115115 ; CHECK-LABEL: cfg:
116116 entry:
117 br i1 %x, label %bb1, label %bb2
117 br i1 %x, label %bb1, label %bb3
118118
119119 bb1:
120120 %p1 = alloca %struct.S
121121 ; CHECK: pushl %eax
122122 ; CHECK: subl $1020, %esp
123 br label %bb3
123 br label %bb4
124
124125 bb2:
126 %p5 = alloca %struct.T
127 ; CHECK: pushl %eax
128 ; CHECK: subl $2996, %esp
129 call void @g(%struct.T* %p5)
130 ret void
131
132 bb3:
125133 %p2 = alloca %struct.T
126134 ; CHECK: pushl %eax
127135 ; CHECK: subl $2996, %esp
128 br label %bb3
129
130 bb3:
131 br i1 %y, label %bb4, label %bb5
136 br label %bb4
132137
133138 bb4:
139 br i1 %y, label %bb5, label %bb2
140
141 bb5:
134142 %p4 = alloca %struct.S
135143 ; CHECK: subl $1024, %esp
136144 call void @f(%struct.S* %p4)
137145 ret void
138146
139 bb5:
140 %p5 = alloca %struct.T
141 ; CHECK: pushl %eax
142 ; CHECK: subl $2996, %esp
143 call void @g(%struct.T* %p5)
144 ret void
145147 }
146148
147149