llvm.org GIT mirror llvm / 2e1b1fa
[SimpleLoopUnswitch] adding cost multiplier to cap exponential unswitch with We need to control exponential behavior of loop-unswitch so we do not get run-away compilation. Suggested solution is to introduce a multiplier for an unswitch cost that makes cost prohibitive as soon as there are too many candidates and too many sibling loops (meaning we have already started duplicating loops by unswitching). It does solve the currently known problem with compile-time degradation (PR 39544). Tests are built on top of a recently implemented CHECK-COUNT-<num> FileCheck directives. Reviewed By: chandlerc, mkazantsev Differential Revision: https://reviews.llvm.org/D54223 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@347097 91177308-0d34-0410-b5e6-96231b3b80d8 Fedor Sergeev 10 months ago
6 changed file(s) with 658 addition(s) and 2 deletion(s). Raw diff Collapse all Expand all
6161 STATISTIC(NumSwitches, "Number of switches unswitched");
6262 STATISTIC(NumGuards, "Number of guards turned into branches for unswitching");
6363 STATISTIC(NumTrivial, "Number of unswitches that are trivial");
// Bumped by calculateUnswitchCostMultiplier() each time it takes its early
// "return 1" path, i.e. the candidate is left unscaled.
64 STATISTIC(
65 NumCostMultiplierSkipped,
66 "Number of unswitch candidates that had their cost multiplier skipped");
6467
6568 static cl::opt EnableNonTrivialUnswitch(
6669 "enable-nontrivial-unswitch", cl::init(false), cl::Hidden,
7174 UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden,
7275 cl::desc("The cost threshold for unswitching a loop."));
7376
// Master switch (default: true). When off, unswitchBestCondition() uses the
// raw candidate cost and never calls calculateUnswitchCostMultiplier().
77 static cl::opt EnableUnswitchCostMultiplier(
78 "enable-unswitch-cost-multiplier", cl::init(true), cl::Hidden,
79 cl::desc("Enable unswitch cost multiplier that prohibits exponential "
80 "explosion in nontrivial unswitch."));
// Divisor (default: 2) applied to the sibling count of top-level loops only;
// nested loops use their sibling count undivided.
81 static cl::opt UnswitchSiblingsToplevelDiv(
82 "unswitch-siblings-toplevel-div", cl::init(2), cl::Hidden,
83 cl::desc("Toplevel siblings divisor for cost multiplier."));
// Number of estimated clones (default: 8) subtracted before the power-of-two
// scaling is applied, so a small number of candidates stays unscaled.
84 static cl::opt UnswitchNumInitialUnscaledCandidates(
85 "unswitch-num-initial-unscaled-candidates", cl::init(8), cl::Hidden,
86 cl::desc("Number of unswitch candidates that are ignored when calculating "
87 "cost multiplier."));
7488 static cl::opt UnswitchGuards(
7589 "simple-loop-unswitch-guards", cl::init(true), cl::Hidden,
7690 cl::desc("If enabled, simple loop unswitching will also consider "
22592273 return CheckBI;
22602274 }
22612275
2276 /// Cost multiplier is a way to limit potentially exponential behavior
2277 /// of loop-unswitch. Cost is multiplied in proportion of 2^number of unswitch
2278 /// candidates available. Also accounting for the number of "sibling" loops with
2279 /// the idea to account for previous unswitches that already happened on this
2280 /// cluster of loops. There was an attempt to keep this formula simple,
2281 /// just enough to limit the worst case behavior. Even if it is not that simple
2282 /// now it is still not an attempt to provide a detailed heuristic size
2283 /// prediction.
2284 ///
2285 /// TODO: Make a proper accounting of "explosion" effect for all kinds of
2286 /// unswitch candidates, making adequate predictions instead of wild guesses.
2287 /// That requires knowing not just the number of "remaining" candidates but
2288 /// also costs of unswitching for each of these candidates.
///
/// \param TI The candidate instruction whose cost is being scaled.
/// \param L The loop that contains \p TI.
/// \param LI Loop info; used to count top-level loops when \p L has no parent.
/// \param DT Dominator tree; used to test whether a candidate's block
///        dominates the loop latch.
/// \param UnswitchCandidates All candidates collected for \p L; only the
///        instruction in each pair's `first` member is inspected here.
/// \returns 1 when \p TI is exempt from scaling, otherwise
///          siblings * 2^clones saturated at UnswitchThreshold.
2289 static int calculateUnswitchCostMultiplier(
2290 Instruction &TI, Loop &L, LoopInfo &LI, DominatorTree &DT,
2291 ArrayRef>>
2292 UnswitchCandidates) {
2293
2294 // Guards and other exiting conditions do not contribute to exponential
2295 // explosion as soon as they dominate the latch (otherwise there might be
2296 // another path to the latch remaining that does not allow to eliminate the
2297 // loop copy on unswitch).
2298 BasicBlock *Latch = L.getLoopLatch();
2299 BasicBlock *CondBlock = TI.getParent();
2300 if (DT.dominates(CondBlock, Latch) &&
2301 (isGuard(&TI) ||
2302 llvm::count_if(successors(&TI), [&L](BasicBlock *SuccBB) {
2303 return L.contains(SuccBB);
2304 }) <= 1)) {
2305 NumCostMultiplierSkipped++;
2306 return 1;
2307 }
2308
// Siblings are the sub-loops of the parent loop, or all top-level loops in
// LI when L itself is top-level. The count includes L.
2309 auto *ParentL = L.getParentLoop();
2310 int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size()
2311 : std::distance(LI.begin(), LI.end()));
2312 // Count amount of clones that all the candidates might cause during
2313 // unswitching. Branch/guard counts as 1, switch counts as log2 of its cases.
2314 int UnswitchedClones = 0;
// NOTE(review): iterating by value copies every candidate pair; a const
// reference would avoid that. The inner CondBlock also shadows the outer one.
2315 for (auto Candidate : UnswitchCandidates) {
2316 Instruction *CI = Candidate.first;
2317 BasicBlock *CondBlock = CI->getParent();
2318 bool SkipExitingSuccessors = DT.dominates(CondBlock, Latch);
2319 if (isGuard(CI)) {
2320 if (!SkipExitingSuccessors)
2321 UnswitchedClones++;
2322 continue;
2323 }
// When the candidate dominates the latch only in-loop successors are
// counted; otherwise every successor is counted.
2324 int NonExitingSuccessors = llvm::count_if(
2325 successors(CondBlock), [SkipExitingSuccessors, &L](BasicBlock *SuccBB) {
2326 return !SkipExitingSuccessors || L.contains(SuccBB);
2327 });
2328 UnswitchedClones += Log2_32(NonExitingSuccessors);
2329 }
2330
2331 // Ignore up to the "unscaled candidates" number of unswitch candidates
2332 // when calculating the power-of-two scaling of the cost. The main idea
2333 // with this control is to allow a small number of unswitches to happen
2334 // and rely more on siblings multiplier (see below) when the number
2335 // of candidates is small.
2336 unsigned ClonesPower =
2337 std::max(UnswitchedClones - (int)UnswitchNumInitialUnscaledCandidates, 0);
2338
2339 // Allowing top-level loops to spread a bit more than nested ones.
// NOTE(review): nothing visible here rejects
// -unswitch-siblings-toplevel-div=0, which would divide by zero for a
// top-level loop.
2340 int SiblingsMultiplier =
2341 std::max((ParentL ? SiblingsCount
2342 : SiblingsCount / (int)UnswitchSiblingsToplevelDiv),
2343 1);
2344 // Compute the cost multiplier in a way that won't overflow by saturating
2345 // at an upper bound.
2346 int CostMultiplier;
2347 if (ClonesPower > Log2_32(UnswitchThreshold) ||
2348 SiblingsMultiplier > UnswitchThreshold)
2349 CostMultiplier = UnswitchThreshold;
2350 else
2351 CostMultiplier = std::min(SiblingsMultiplier * (1 << ClonesPower),
2352 (int)UnswitchThreshold);
2353
// NOTE(review): the debug print below evaluates (1 << ClonesPower) even on
// the saturated path above. ClonesPower is not bounded there, so with many
// candidates the shift can reach or exceed the width of int (debug output
// only) - consider guarding it.
2354 LLVM_DEBUG(dbgs() << " Computed multiplier " << CostMultiplier
2355 << " (siblings " << SiblingsMultiplier << " * clones "
2356 << (1 << ClonesPower) << ")"
2357 << " for unswitch candidate: " << TI << "\n");
2358 return CostMultiplier;
2359 }
2360
22622361 static bool
22632362 unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
22642363 AssumptionCache &AC, TargetTransformInfo &TTI,
24722571 int CandidateCost = ComputeUnswitchedCost(
24732572 TI, /*FullUnswitch*/ !BI || (Invariants.size() == 1 &&
24742573 Invariants[0] == BI->getCondition()));
2475 LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost
2476 << " for unswitch candidate: " << TI << "\n");
2574 // Calculate cost multiplier which is a tool to limit potentially
2575 // exponential behavior of loop-unswitch.
2576 if (EnableUnswitchCostMultiplier) {
2577 int CostMultiplier =
2578 calculateUnswitchCostMultiplier(TI, L, LI, DT, UnswitchCandidates);
2579 assert(
2580 (CostMultiplier > 0 && CostMultiplier <= UnswitchThreshold) &&
2581 "cost multiplier needs to be in the range of 1..UnswitchThreshold");
2582 CandidateCost *= CostMultiplier;
2583 LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost
2584 << " (multiplier: " << CostMultiplier << ")"
2585 << " for unswitch candidate: " << TI << "\n");
2586 } else {
2587 LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost
2588 << " for unswitch candidate: " << TI << "\n");
2589 }
2590
24772591 if (!BestUnswitchTI || CandidateCost < BestUnswitchCost) {
24782592 BestUnswitchTI = &TI;
24792593 BestUnswitchCost = CandidateCost;
0 ;
1 ; There should be just a single copy of each loop when strictest multiplier
2 ; candidates formula (unscaled candidates == 0) is enforced:
3
4 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
5 ; RUN: -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=1 \
6 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
7 ;
8 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
9 ; RUN: -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=16 \
10 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
11 ;
12 ;
13 ; When we relax the candidates part of a multiplier formula
14 ; (unscaled candidates == 4) we start getting some unswitches,
15 ; which leads to siblings multiplier kicking in.
16 ;
17 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
18 ; RUN: -unswitch-num-initial-unscaled-candidates=4 -unswitch-siblings-toplevel-div=1 \
19 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | \
20 ; RUN: sort -b | FileCheck %s --check-prefixes=LOOP-UNSCALE4-DIV1
21 ;
22 ; NB: sort -b is essential here and below, otherwise blanks might lead to different
23 ; order depending on locale.
24 ;
25 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
26 ; RUN: -unswitch-num-initial-unscaled-candidates=4 -unswitch-siblings-toplevel-div=2 \
27 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | \
28 ; RUN: sort -b | FileCheck %s --check-prefixes=LOOP-UNSCALE4-DIV2
29 ;
30 ;
31 ; Get
32 ; 2^(num conds) == 2^5 = 32
33 ; loop nests when cost multiplier is disabled:
34 ;
35 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=false \
36 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | \
37 ; RUN: sort -b | FileCheck %s --check-prefixes=LOOP32
38 ;
39 ; Single loop nest, not unswitched
40 ; LOOP1: Loop at depth 1 containing:
41 ; LOOP1: Loop at depth 2 containing:
42 ; LOOP1: Loop at depth 3 containing:
43 ; LOOP1-NOT: Loop at depth {{[0-9]+}} containing:
44 ;
45 ; Half unswitched loop nests, with unscaled4 and div1 it gets less depth1 loops unswitched
46 ; since they have more cost.
47 ; LOOP-UNSCALE4-DIV1-COUNT-6: Loop at depth 1 containing:
48 ; LOOP-UNSCALE4-DIV1-COUNT-19: Loop at depth 2 containing:
49 ; LOOP-UNSCALE4-DIV1-COUNT-29: Loop at depth 3 containing:
50 ; LOOP-UNSCALE4-DIV1-NOT: Loop at depth {{[0-9]+}} containing:
51 ;
52 ; Half unswitched loop nests, with unscaled4 and div2 it gets more depth1 loops unswitched
53 ; as div2 kicks in.
54 ; LOOP-UNSCALE4-DIV2-COUNT-11: Loop at depth 1 containing:
55 ; LOOP-UNSCALE4-DIV2-COUNT-22: Loop at depth 2 containing:
56 ; LOOP-UNSCALE4-DIV2-COUNT-29: Loop at depth 3 containing:
57 ; LOOP-UNSCALE4-DIV2-NOT: Loop at depth {{[0-9]+}} containing:
58 ;
59 ; 32 loop nests, fully unswitched
60 ; LOOP32-COUNT-32: Loop at depth 1 containing:
61 ; LOOP32-COUNT-32: Loop at depth 2 containing:
62 ; LOOP32-COUNT-32: Loop at depth 3 containing:
63 ; LOOP32-NOT: Loop at depth {{[0-9]+}} containing:
64
65 declare void @bar()
66
;; Three-deep loop nest (outer -> middle -> loop). The innermost loop contains
;; five diamonds, each branching on one of the loop-invariant arguments
;; %c1..%c5; both arms of every diamond rejoin inside the innermost loop.
67 define void @loop_nested3_conds5(i32* %addr, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5) {
68 entry:
69 %addr1 = getelementptr i32, i32* %addr, i64 0
70 %addr2 = getelementptr i32, i32* %addr, i64 1
71 %addr3 = getelementptr i32, i32* %addr, i64 2
72 br label %outer
73 outer:
74 %iv1 = phi i32 [0, %entry], [%iv1.next, %outer_latch]
75 %iv1.next = add i32 %iv1, 1
76 ;; skip nontrivial unswitch
77 call void @bar()
78 br label %middle
79 middle:
80 %iv2 = phi i32 [0, %outer], [%iv2.next, %middle_latch]
81 %iv2.next = add i32 %iv2, 1
82 ;; skip nontrivial unswitch
83 call void @bar()
84 br label %loop
85 loop:
86 %iv3 = phi i32 [0, %middle], [%iv3.next, %loop_latch]
87 %iv3.next = add i32 %iv3, 1
88 ;; skip nontrivial unswitch
89 call void @bar()
90 br i1 %c1, label %loop_next1_left, label %loop_next1_right
91 loop_next1_left:
92 br label %loop_next1
93 loop_next1_right:
94 br label %loop_next1
95
96 loop_next1:
97 br i1 %c2, label %loop_next2_left, label %loop_next2_right
98 loop_next2_left:
99 br label %loop_next2
100 loop_next2_right:
101 br label %loop_next2
102
103 loop_next2:
104 br i1 %c3, label %loop_next3_left, label %loop_next3_right
105 loop_next3_left:
106 br label %loop_next3
107 loop_next3_right:
108 br label %loop_next3
109
110 loop_next3:
111 br i1 %c4, label %loop_next4_left, label %loop_next4_right
112 loop_next4_left:
113 br label %loop_next4
114 loop_next4_right:
115 br label %loop_next4
116
117 loop_next4:
118 br i1 %c5, label %loop_latch_left, label %loop_latch_right
119 loop_latch_left:
120 br label %loop_latch
121 loop_latch_right:
122 br label %loop_latch
123
;; Each latch does a volatile store and loops while its own counter is < 50.
124 loop_latch:
125 store volatile i32 0, i32* %addr1
126 %test_loop = icmp slt i32 %iv3, 50
127 br i1 %test_loop, label %loop, label %middle_latch
128 middle_latch:
129 store volatile i32 0, i32* %addr2
130 %test_middle = icmp slt i32 %iv2, 50
131 br i1 %test_middle, label %middle, label %outer_latch
132 outer_latch:
133 store volatile i32 0, i32* %addr3
134 %test_outer = icmp slt i32 %iv1, 50
135 br i1 %test_outer, label %outer, label %exit
136 exit:
137 ret void
138 }
0 ;
1 ; Here all the branches we unswitch are exiting from the inner loop.
2 ; That means we should not be getting exponential behavior on inner-loop
3 ; unswitch. In fact there should be just a single version of inner-loop,
4 ; with possibly some outer loop copies.
5 ;
6 ; There should be just a single copy of each loop when strictest multiplier
7 ; candidates formula (unscaled candidates == 0) is enforced:
8
9 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
10 ; RUN: -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=1 \
11 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
12 ;
13 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
14 ; RUN: -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=16 \
15 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
16 ;
17 ;
18 ; When we relax the candidates part of a multiplier formula
19 ; (unscaled candidates == 3) we start getting some unswitches in outer loops,
20 ; which leads to siblings multiplier kicking in.
21 ;
22 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
23 ; RUN: -unswitch-num-initial-unscaled-candidates=3 -unswitch-siblings-toplevel-div=1 \
24 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | \
25 ; RUN: sort -b | FileCheck %s --check-prefixes=LOOP-UNSCALE3-DIV1
26 ;
27 ; NB: sort -b is essential here and below, otherwise blanks might lead to different
28 ; order depending on locale.
29 ;
30 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
31 ; RUN: -unswitch-num-initial-unscaled-candidates=3 -unswitch-siblings-toplevel-div=2 \
32 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | \
33 ; RUN: sort -b | FileCheck %s --check-prefixes=LOOP-UNSCALE3-DIV2
34 ;
35 ; With disabled cost-multiplier we get maximal possible amount of unswitches.
36 ;
37 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=false \
38 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | \
39 ; RUN: sort -b | FileCheck %s --check-prefixes=LOOP-MAX
40 ;
41 ; Single loop nest, not unswitched
42 ; LOOP1: Loop at depth 1 containing:
43 ; LOOP1-NOT: Loop at depth 1 containing:
44 ; LOOP1: Loop at depth 2 containing:
45 ; LOOP1-NOT: Loop at depth 2 containing:
46 ; LOOP1: Loop at depth 3 containing:
47 ; LOOP1-NOT: Loop at depth 3 containing:
48 ;
49 ; Half unswitched loop nests, with unscaled3 and div1 it gets less depth1 loops unswitched
50 ; since they have more cost.
51 ; LOOP-UNSCALE3-DIV1-COUNT-4: Loop at depth 1 containing:
52 ; LOOP-UNSCALE3-DIV1-NOT: Loop at depth 1 containing:
53 ; LOOP-UNSCALE3-DIV1-COUNT-1: Loop at depth 2 containing:
54 ; LOOP-UNSCALE3-DIV1-NOT: Loop at depth 2 containing:
55 ; LOOP-UNSCALE3-DIV1-COUNT-1: Loop at depth 3 containing:
56 ; LOOP-UNSCALE3-DIV1-NOT: Loop at depth 3 containing:
57 ;
58 ; Half unswitched loop nests, with unscaled3 and div2 it gets more depth1 loops unswitched
59 ; as div2 kicks in.
60 ; LOOP-UNSCALE3-DIV2-COUNT-6: Loop at depth 1 containing:
61 ; LOOP-UNSCALE3-DIV2-NOT: Loop at depth 1 containing:
62 ; LOOP-UNSCALE3-DIV2-COUNT-1: Loop at depth 2 containing:
63 ; LOOP-UNSCALE3-DIV2-NOT: Loop at depth 2 containing:
64 ; LOOP-UNSCALE3-DIV2-COUNT-1: Loop at depth 3 containing:
65 ; LOOP-UNSCALE3-DIV2-NOT: Loop at depth 3 containing:
66 ;
67 ; Maximally unswitched (copy of the outer loop per each condition)
68 ; LOOP-MAX-COUNT-6: Loop at depth 1 containing:
69 ; LOOP-MAX-NOT: Loop at depth 1 containing:
70 ; LOOP-MAX-COUNT-1: Loop at depth 2 containing:
71 ; LOOP-MAX-NOT: Loop at depth 2 containing:
72 ; LOOP-MAX-COUNT-1: Loop at depth 3 containing:
73 ; LOOP-MAX-NOT: Loop at depth 3 containing:
74
75 declare void @bar()
76
;; Three-deep loop nest (outer -> middle -> loop). Each of the five branches on
;; %c1..%c5 in the innermost loop sends its false edge to %outer_latch, i.e.
;; out of both inner loops. The *_right blocks are never branched to in this
;; function, so they have no predecessors.
77 define void @loop_nested3_conds5(i32* %addr, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5) {
78 entry:
79 %addr1 = getelementptr i32, i32* %addr, i64 0
80 %addr2 = getelementptr i32, i32* %addr, i64 1
81 %addr3 = getelementptr i32, i32* %addr, i64 2
82 br label %outer
83 outer:
84 %iv1 = phi i32 [0, %entry], [%iv1.next, %outer_latch]
85 %iv1.next = add i32 %iv1, 1
86 ;; skip nontrivial unswitch
87 call void @bar()
88 br label %middle
89 middle:
90 %iv2 = phi i32 [0, %outer], [%iv2.next, %middle_latch]
91 %iv2.next = add i32 %iv2, 1
92 ;; skip nontrivial unswitch
93 call void @bar()
94 br label %loop
95 loop:
96 %iv3 = phi i32 [0, %middle], [%iv3.next, %loop_latch]
97 %iv3.next = add i32 %iv3, 1
98 ;; skip nontrivial unswitch
99 call void @bar()
100 br i1 %c1, label %loop_next1_left, label %outer_latch
101 loop_next1_left:
102 br label %loop_next1
103 loop_next1_right:
104 br label %loop_next1
105
106 loop_next1:
107 br i1 %c2, label %loop_next2_left, label %outer_latch
108 loop_next2_left:
109 br label %loop_next2
110 loop_next2_right:
111 br label %loop_next2
112
113 loop_next2:
114 br i1 %c3, label %loop_next3_left, label %outer_latch
115 loop_next3_left:
116 br label %loop_next3
117 loop_next3_right:
118 br label %loop_next3
119
120 loop_next3:
121 br i1 %c4, label %loop_next4_left, label %outer_latch
122 loop_next4_left:
123 br label %loop_next4
124 loop_next4_right:
125 br label %loop_next4
126
127 loop_next4:
128 br i1 %c5, label %loop_latch_left, label %outer_latch
129 loop_latch_left:
130 br label %loop_latch
131 loop_latch_right:
132 br label %loop_latch
133
;; Each latch does a volatile store and loops while its own counter is < 50.
134 loop_latch:
135 store volatile i32 0, i32* %addr1
136 %test_loop = icmp slt i32 %iv3, 50
137 br i1 %test_loop, label %loop, label %middle_latch
138 middle_latch:
139 store volatile i32 0, i32* %addr2
140 %test_middle = icmp slt i32 %iv2, 50
141 br i1 %test_middle, label %middle, label %outer_latch
142 outer_latch:
143 store volatile i32 0, i32* %addr3
144 %test_outer = icmp slt i32 %iv1, 50
145 br i1 %test_outer, label %outer, label %exit
146 exit:
147 ret void
148 }
0 ;
1 ; There should be just a single copy of loop when strictest multiplier candidates
2 ; formula (unscaled candidates == 0) is enforced:
3 ;
4 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
5 ; RUN: -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=1 \
6 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
7 ;
8 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
9 ; RUN: -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=8 \
10 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
11 ;
12 ; With relaxed candidates multiplier (unscaled candidates == 8) we should allow
13 ; some unswitches to happen until siblings multiplier starts kicking in:
14 ;
15 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
16 ; RUN: -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=1 \
17 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP5
18 ;
19 ; With relaxed candidates multiplier (unscaled candidates == 8) and with relaxed
20 ; siblings multiplier for top-level loops (toplevel-div == 8) we should get
21 ; 2^(num conds) == 2^5 == 32
22 ; copies of the loop:
23 ;
24 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
25 ; RUN: -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=8 \
26 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP32
27 ;
28 ; Similarly get
29 ; 2^(num conds) == 2^5 == 32
30 ; copies of the loop when cost multiplier is disabled:
31 ;
32 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=false \
33 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP32
34 ;
35 ;
36 ; Single loop, not unswitched
37 ; LOOP1: Loop at depth 1 containing:
38 ; LOOP1-NOT: Loop at depth 1 containing:
39
40 ; 5 loops, unswitched 4 times
41 ; LOOP5-COUNT-5: Loop at depth 1 containing:
42 ; LOOP5-NOT: Loop at depth 1 containing:
43
44 ; 32 loops, fully unswitched
45 ; LOOP32-COUNT-32: Loop at depth 1 containing:
46 ; LOOP32-NOT: Loop at depth 1 containing:
47
;; Single loop with five branches on the loop-invariant arguments %c1..%c5.
;; The true edge of each branch goes straight to the next block; the false edge
;; passes through a *_right block that rejoins it. No branch on %c1..%c5 leaves
;; the loop.
48 define void @loop_simple5(i32* %addr, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5) {
49 entry:
50 br label %loop
51 loop:
52 %iv = phi i32 [0, %entry], [%iv.next, %loop_latch]
53 %iv.next = add i32 %iv, 1
54 br i1 %c1, label %loop_next1, label %loop_next1_right
55 loop_next1_right:
56 br label %loop_next1
57 loop_next1:
58 br i1 %c2, label %loop_next2, label %loop_next2_right
59 loop_next2_right:
60 br label %loop_next2
61 loop_next2:
62 br i1 %c3, label %loop_next3, label %loop_next3_right
63 loop_next3_right:
64 br label %loop_next3
65 loop_next3:
66 br i1 %c4, label %loop_next4, label %loop_next4_right
67 loop_next4_right:
68 br label %loop_next4
69 loop_next4:
70 br i1 %c5, label %loop_latch, label %loop_latch_right
71 loop_latch_right:
72 br label %loop_latch
;; Volatile store, then loop while the counter is < 50.
73 loop_latch:
74 store volatile i32 0, i32* %addr
75 %test_loop = icmp slt i32 %iv, 50
76 br i1 %test_loop, label %loop, label %exit
77 exit:
78 ret void
79 }
0 ;
1 ; Here all the branches are exiting ones. Checking that we don't have
2 ; exponential behavior with any kind of controlling heuristics here.
3 ;
4 ; There we should have just a single loop.
5 ;
6 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
7 ; RUN: -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=1 \
8 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
9 ;
10 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
11 ; RUN: -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=8 \
12 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
13 ;
14 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
15 ; RUN: -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=1 \
16 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
17 ;
18 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
19 ; RUN: -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=8 \
20 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
21 ;
22 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=false \
23 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
24 ;
25 ;
26 ; Single loop, not unswitched
27 ; LOOP1: Loop at depth 1 containing:
28 ; LOOP1-NOT: Loop at depth 1 containing:
29
30 declare void @bar()
31
;; Single loop in which every branch on the loop-invariant arguments %c1..%c5
;; has its false edge going directly to %exit, i.e. each one is a loop exit.
32 define void @loop_simple5(i32* %addr, i1 %c1, i1 %c2, i1 %c3, i1 %c4, i1 %c5) {
33 entry:
34 br label %loop
35 loop:
36 %iv = phi i32 [0, %entry], [%iv.next, %loop_latch]
37 %iv.next = add i32 %iv, 1
38 ;; disabling trivial unswitch
39 call void @bar()
40 br i1 %c1, label %loop_next1, label %exit
41 loop_next1:
42 br i1 %c2, label %loop_next2, label %exit
43 loop_next2:
44 br i1 %c3, label %loop_next3, label %exit
45 loop_next3:
46 br i1 %c4, label %loop_next4, label %exit
47 loop_next4:
48 br i1 %c5, label %loop_latch, label %exit
;; Volatile store, then loop while the counter is < 50.
49 loop_latch:
50 store volatile i32 0, i32* %addr
51 %test_loop = icmp slt i32 %iv, 50
52 br i1 %test_loop, label %loop, label %exit
53 exit:
54 ret void
55 }
0 ;
1 ; Here we have 5-way unswitchable switch with each successor also having an unswitchable
2 ; exiting branch in it. If we start unswitching those branches we start duplicating the
3 ; whole switch. This can easily lead to exponential behavior w/o proper control.
4 ; On a real-life testcase there was 16-way switch and that took forever to compile w/o
5 ; a cost control.
6 ;
7 ;
8 ; When we use the strictest multiplier candidates formula (unscaled candidates == 0)
9 ; we should be getting just a single loop.
10 ;
11 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
12 ; RUN: -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=1 \
13 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
14 ;
15 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
16 ; RUN: -unswitch-num-initial-unscaled-candidates=0 -unswitch-siblings-toplevel-div=16 \
17 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | FileCheck %s --check-prefixes=LOOP1
18 ;
19 ;
20 ; With relaxed candidates multiplier (unscaled candidates == 8) we should allow
21 ; some unswitches to happen until siblings multiplier starts kicking in:
22 ;
23 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
24 ; RUN: -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=1 \
25 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | \
26 ; RUN: sort -b | FileCheck %s --check-prefixes=LOOP-RELAX
27 ;
28 ; With relaxed candidates multiplier (unscaled candidates == 8) and with relaxed
29 ; siblings multiplier for top-level loops (toplevel-div == 8) we should get
30 ; considerably more copies of the loop (especially top-level ones).
31 ;
32 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=true \
33 ; RUN: -unswitch-num-initial-unscaled-candidates=8 -unswitch-siblings-toplevel-div=8 \
34 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | \
35 ; RUN: sort -b | FileCheck %s --check-prefixes=LOOP-RELAX2
36 ;
37 ; We get hundreds of copies of the loop when cost multiplier is disabled:
38 ;
39 ; RUN: opt < %s -enable-nontrivial-unswitch -enable-unswitch-cost-multiplier=false \
40 ; RUN: -passes='loop(unswitch),print' -disable-output 2>&1 | \
41 ; RUN: sort -b | FileCheck %s --check-prefixes=LOOP-MAX
42 ;
43
44 ; Single loop nest, not unswitched
45 ; LOOP1: Loop at depth 1 containing:
46 ; LOOP1-NOT: Loop at depth 1 containing:
47 ; LOOP1: Loop at depth 2 containing:
48 ; LOOP1-NOT: Loop at depth 2 containing:
49 ;
50 ; Somewhat relaxed restrictions on candidates:
51 ; LOOP-RELAX-COUNT-5: Loop at depth 1 containing:
52 ; LOOP-RELAX-NOT: Loop at depth 1 containing:
53 ; LOOP-RELAX-COUNT-32: Loop at depth 2 containing:
54 ; LOOP-RELAX-NOT: Loop at depth 2 containing:
55 ;
56 ; Even more relaxed restrictions on candidates and siblings.
57 ; LOOP-RELAX2-COUNT-11: Loop at depth 1 containing:
58 ; LOOP-RELAX2-NOT: Loop at depth 1 containing:
59 ; LOOP-RELAX2-COUNT-40: Loop at depth 2 containing:
60 ; LOOP-RELAX2-NOT: Loop at depth 2 containing:
61 ;
62 ; Unswitched as much as it could (with multiplier disabled).
63 ; LOOP-MAX-COUNT-56: Loop at depth 1 containing:
64 ; LOOP-MAX-NOT: Loop at depth 1 containing:
65 ; LOOP-MAX-COUNT-111: Loop at depth 2 containing:
66 ; LOOP-MAX-NOT: Loop at depth 2 containing:
67
;; Two-deep loop nest. The inner loop switches on the loop-invariant %c1 with
;; five cases plus a default that goes to %inner_latch. Every case block then
;; branches on one of %check0..%check4 (comparisons of %c2 computed once in
;; %entry) either to %exit or to %inner_latch.
68 define i32 @loop_switch(i32* %addr, i32 %c1, i32 %c2) {
69 entry:
70 %addr1 = getelementptr i32, i32* %addr, i64 0
71 %addr2 = getelementptr i32, i32* %addr, i64 1
72 %check0 = icmp eq i32 %c2, 0
73 %check1 = icmp eq i32 %c2, 31
74 %check2 = icmp eq i32 %c2, 32
75 %check3 = icmp eq i32 %c2, 33
76 %check4 = icmp eq i32 %c2, 34
77 br label %outer_loop
78
79 outer_loop:
80 %iv1 = phi i32 [0, %entry], [%iv1.next, %outer_latch]
81 %iv1.next = add i32 %iv1, 1
82 br label %inner_loop
83 inner_loop:
84 %iv2 = phi i32 [0, %outer_loop], [%iv2.next, %inner_latch]
85 %iv2.next = add i32 %iv2, 1
86 switch i32 %c1, label %inner_latch [
87 i32 0, label %case0
88 i32 1, label %case1
89 i32 2, label %case2
90 i32 3, label %case3
91 i32 4, label %case4
92 ]
93
94 case4:
95 br i1 %check4, label %exit, label %inner_latch
96 case3:
97 br i1 %check3, label %exit, label %inner_latch
98 case2:
99 br i1 %check2, label %exit, label %inner_latch
100 case1:
101 br i1 %check1, label %exit, label %inner_latch
102 case0:
103 br i1 %check0, label %exit, label %inner_latch
104
105 inner_latch:
106 store volatile i32 0, i32* %addr1
107 %test_inner = icmp slt i32 %iv2, 50
108 br i1 %test_inner, label %inner_loop, label %outer_latch
109
110 outer_latch:
111 store volatile i32 0, i32* %addr2
112 %test_outer = icmp slt i32 %iv1, 50
113 br i1 %test_outer, label %outer_loop, label %exit
114
115 exit: ; reached from %case0..%case4 and %outer_latch
116 ret i32 1
117 }