llvm.org GIT mirror llvm / 6efdd0f
Add a new pass to speculate around PHI nodes with constant (integer) operands when profitable. The core idea is to (re-)introduce some redundancies where their cost is hidden by the cost of materializing immediates for constant operands of PHI nodes. When the cost of the redundancies is covered by this, avoiding materializing the immediate has numerous benefits: 1) Less register pressure 2) Potential for further folding / combining 3) Potential for more efficient instructions due to immediate operand As a motivating example, consider the remarkably different cost on x86 of a SHL instruction with an immediate operand versus a register operand. This pattern turns up surprisingly frequently, but is somewhat rarely obvious as a significant performance problem. The pass is entirely target independent, but it does rely on the target cost model in TTI to decide when to speculate things around the PHI node. I've included x86-focused tests, but any target that sets up its immediate cost model should benefit from this pass. There is probably more that can be done in this space, but the pass as-is is enough to get some important performance on our internal benchmarks, and should be generally performance neutral, but help with more extensive benchmarking is always welcome. One awkward part is that this pass has to be scheduled after *everything* that can eliminate these kinds of redundancies. This includes SimplifyCFG, GVN, etc. I'm open to suggestions about better places to put this. We could in theory make it part of the codegen pass pipeline, but there doesn't really seem to be a good reason for that -- it isn't "lowering" in any sense and only relies on pretty standard cost model based TTI queries, so it seems to fit well with the "optimization" pipeline model. Still, further thoughts on the pipeline position are welcome. I've also only implemented this in the new pass manager. 
If folks are very interested, I can try to add it to the old PM as well, but I didn't really see much point (my use case is already switched over to the new PM). I've tested this pretty heavily without issue. A wide range of benchmarks internally show no change outside the noise, and I don't see any significant changes in SPEC either. However, the size class computation in tcmalloc is substantially improved by this, which turns into a 2% to 4% win on the hottest path through tcmalloc for us, so there are definitely important cases where this is going to make a substantial difference. Differential revision: https://reviews.llvm.org/D37467 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319164 91177308-0d34-0410-b5e6-96231b3b80d8 Chandler Carruth 1 year, 9 months ago
8 changed file(s) with 1527 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
0 //===- SpeculateAroundPHIs.h - Speculate around PHIs ------------*- C++ -*-===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8
9 #ifndef LLVM_TRANSFORMS_SCALAR_SPECULATEAROUNDPHIS_H
10 #define LLVM_TRANSFORMS_SCALAR_SPECULATEAROUNDPHIS_H
11
12 #include "llvm/ADT/SetVector.h"
13 #include "llvm/Analysis/AssumptionCache.h"
14 #include "llvm/IR/Dominators.h"
15 #include "llvm/IR/Function.h"
16 #include "llvm/IR/PassManager.h"
17 #include "llvm/Support/Compiler.h"
18 #include
19
20 namespace llvm {
21
22 /// This pass handles simple speculating of instructions around PHIs when
23 /// doing so is profitable for a particular target despite duplicated
24 /// instructions.
25 ///
26 /// The motivating examples are PHIs of constants which will require
27 /// materializing the constants along each edge. If the PHI is used by an
28 /// instruction where the target can materialize the constant as part of the
29 /// instruction, it is profitable to speculate those instructions around the
30 /// PHI node. This can reduce dynamic instruction count as well as decrease
31 /// register pressure.
32 ///
33 /// Consider this IR for example:
34 /// ```
35 /// entry:
36 /// br i1 %flag, label %a, label %b
37 ///
38 /// a:
39 /// br label %exit
40 ///
41 /// b:
42 /// br label %exit
43 ///
44 /// exit:
45 /// %p = phi i32 [ 7, %a ], [ 11, %b ]
46 /// %sum = add i32 %arg, %p
47 /// ret i32 %sum
48 /// ```
49 /// To materialize the inputs to this PHI node may require an explicit
50 /// instruction. For example, on x86 this would turn into something like
51 /// ```
52 /// testq %eax, %eax
53 /// movl $7, %rNN
54 /// jne .L
55 /// movl $11, %rNN
56 /// .L:
57 /// addl %edi, %rNN
58 /// movl %rNN, %eax
59 /// retq
60 /// ```
61 /// When these constants can be folded directly into another instruction, it
62 /// would be preferable to avoid the potential for register pressure (above we
63 /// can easily avoid it, but that isn't always true) and simply duplicate the
64 /// instruction using the PHI:
65 /// ```
66 /// entry:
67 /// br i1 %flag, label %a, label %b
68 ///
69 /// a:
70 /// %sum.1 = add i32 %arg, 7
71 /// br label %exit
72 ///
73 /// b:
74 /// %sum.2 = add i32 %arg, 11
75 /// br label %exit
76 ///
77 /// exit:
78 /// %p = phi i32 [ %sum.1, %a ], [ %sum.2, %b ]
79 /// ret i32 %p
80 /// ```
81 /// Which will generate something like the following on x86:
82 /// ```
83 /// testq %eax, %eax
84 /// addl $7, %edi
85 /// jne .L
86 /// addl $11, %edi
87 /// .L:
88 /// movl %edi, %eax
89 /// retq
90 /// ```
91 ///
92 /// It is important to note that this pass is never intended to handle more
93 /// complex cases where speculating around PHIs allows simplifications of the
94 /// IR itself or other subsequent optimizations. Those can and should already
95 /// be handled before this pass is ever run by a more powerful analysis that
96 /// can reason about equivalences and common subexpressions. Classically, those
97 /// cases would be handled by a GVN-powered PRE or similar transform. This
98 /// pass, in contrast, is *only* interested in cases where despite no
99 /// simplifications to the IR itself, speculation is *faster* to execute. The
100 /// result of this is that the cost models which are appropriate to consider
101 /// here are relatively simple ones around execution and codesize cost, without
102 /// any need to consider simplifications or other transformations.
103 struct SpeculateAroundPHIsPass : PassInfoMixin {
// NOTE(review): `PassInfoMixin` appears here without a template argument
// list; presumably it was dropped when this listing was extracted -- confirm
// against the real header before relying on this declaration.
104 /// \brief Run the pass over the function.
///
/// \param F the function whose PHI nodes are considered for speculation.
/// \param AM the function analysis manager. NOTE(review): presumably the
/// source of the dominator tree and target cost information the
/// implementation consumes -- confirm against the definition of run().
/// \returns a PreservedAnalyses value describing which analyses remain
/// valid after the pass has run.
105 PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
106 };
107
108 } // end namespace llvm
109
110 #endif // LLVM_TRANSFORMS_SCALAR_SPECULATEAROUNDPHIS_H
131131 #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
132132 #include "llvm/Transforms/Scalar/SimplifyCFG.h"
133133 #include "llvm/Transforms/Scalar/Sink.h"
134 #include "llvm/Transforms/Scalar/SpeculateAroundPHIs.h"
134135 #include "llvm/Transforms/Scalar/SpeculativeExecution.h"
135136 #include "llvm/Transforms/Scalar/TailRecursionElimination.h"
136137 #include "llvm/Transforms/Utils/AddDiscriminators.h"
797798 // LoopSink (and other loop passes since the last simplifyCFG) might have
798799 // resulted in single-entry-single-exit or empty blocks. Clean up the CFG.
799800 OptimizePM.addPass(SimplifyCFGPass());
801
802 // Optimize PHIs by speculating around them when profitable. Note that this
803 // pass needs to be run after any PRE or similar pass as it is essentially
804 // inserting redundancies into the program. This even includes SimplifyCFG.
805 OptimizePM.addPass(SpeculateAroundPHIsPass());
800806
801807 // Add the core optimizing pipeline.
802808 MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM)));
198198 FUNCTION_PASS("sink", SinkingPass())
199199 FUNCTION_PASS("slp-vectorizer", SLPVectorizerPass())
200200 FUNCTION_PASS("speculative-execution", SpeculativeExecutionPass())
201 FUNCTION_PASS("spec-phis", SpeculateAroundPHIsPass())
201202 FUNCTION_PASS("sroa", SROA())
202203 FUNCTION_PASS("tailcallelim", TailCallElimPass())
203204 FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass())
6161 SimplifyCFGPass.cpp
6262 Sink.cpp
6363 SpeculativeExecution.cpp
64 SpeculateAroundPHIs.cpp
6465 StraightLineStrengthReduce.cpp
6566 StructurizeCFG.cpp
6667 TailRecursionElimination.cpp
0 //===- SpeculateAroundPHIs.cpp --------------------------------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "llvm/Transforms/Scalar/SpeculateAroundPHIs.h"
10 #include "llvm/ADT/PostOrderIterator.h"
11 #include "llvm/ADT/Sequence.h"
12 #include "llvm/ADT/SetVector.h"
13 #include "llvm/ADT/Statistic.h"
14 #include "llvm/Analysis/TargetTransformInfo.h"
15 #include "llvm/Analysis/ValueTracking.h"
16 #include "llvm/IR/BasicBlock.h"
17 #include "llvm/IR/IRBuilder.h"
18 #include "llvm/IR/Instructions.h"
19 #include "llvm/IR/IntrinsicInst.h"
20 #include "llvm/Support/Debug.h"
21 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
22
23 using namespace llvm;
24
25 #define DEBUG_TYPE "spec-phis"
26
27 STATISTIC(NumPHIsSpeculated, "Number of PHI nodes we speculated around");
28 STATISTIC(NumEdgesSplit,
29 "Number of critical edges which were split for speculation");
30 STATISTIC(NumSpeculatedInstructions,
31 "Number of instructions we speculated around the PHI nodes");
32 STATISTIC(NumNewRedundantInstructions,
33 "Number of new, redundant instructions inserted");
34
35 /// Check whether speculating the users of a PHI node around the PHI
36 /// will be safe.
37 ///
38 /// This checks both that all of the users are safe and also that all of their
39 /// operands are either recursively safe or already available along an incoming
40 /// edge to the PHI.
41 ///
42 /// This routine caches both all the safe nodes explored in `PotentialSpecSet`
43 /// and the chain of nodes that definitively reach any unsafe node in
44 /// `UnsafeSet`. By preserving these between repeated calls to this routine for
45 /// PHIs in the same basic block, the exploration here can be reused. However,
46 /// these caches must not be reused for PHIs in a different basic block as they
47 /// reflect what is available along incoming edges.
48 static bool
49 isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
50 SmallPtrSetImpl &PotentialSpecSet,
51 SmallPtrSetImpl &UnsafeSet) {
52 auto *PhiBB = PN.getParent();
// `Visited` records every instruction this call has started exploring.
// `DFSStack` holds (instruction, next operand to examine) pairs that drive
// the explicit, non-recursive depth-first walk below.
53 SmallPtrSet Visited;
54 SmallVector, 16> DFSStack;
55
56 // Walk each user of the PHI node.
57 for (Use &U : PN.uses()) {
58 auto *UI = cast(U.getUser());
59
60 // Ensure the use post-dominates the PHI node. This ensures that, in the
61 // absence of unwinding, the use will actually be reached.
62 // FIXME: We use a blunt hammer of requiring them to be in the same basic
63 // block. We should consider using actual post-dominance here in the
64 // future.
65 if (UI->getParent() != PhiBB) {
66 DEBUG(dbgs() << " Unsafe: use in a different BB: " << *UI << "\n");
67 return false;
68 }
69
70 // FIXME: This check is much too conservative. We're not going to move these
71 // instructions onto new dynamic paths through the program unless there is
72 // a call instruction between the use and the PHI node. And memory isn't
73 // changing unless there is a store in that same sequence. We should
74 // probably change this to do at least a limited scan of the intervening
75 // instructions and allow handling stores in easily proven safe cases.
76 if (mayBeMemoryDependent(*UI)) {
77 DEBUG(dbgs() << " Unsafe: can't speculate use: " << *UI << "\n");
78 return false;
79 }
80
81 // Now do a depth-first search of everything these users depend on to make
82 // sure they are transitively safe. This is a depth-first search, but we
83 // check nodes in preorder to minimize the amount of checking.
84 Visited.insert(UI);
85 DFSStack.push_back({UI, UI->value_op_begin()});
86 do {
87 User::value_op_iterator OpIt;
// Resume the node on top of the stack at the operand where its scan was
// suspended (or at its first operand for a freshly pushed root).
88 std::tie(UI, OpIt) = DFSStack.pop_back_val();
89
90 while (OpIt != UI->value_op_end()) {
91 auto *OpI = dyn_cast(*OpIt);
92 // Increment to the next operand for whenever we continue.
93 ++OpIt;
94 // No need to visit non-instructions, which can't form dependencies.
95 if (!OpI)
96 continue;
97
98 // Now do the main pre-order checks that this operand is a viable
99 // dependency of something we want to speculate.
100
101 // First do a few checks for instructions that won't require
102 // speculation at all because they are trivially available on the
103 // incoming edge (either through dominance or through an incoming value
104 // to a PHI).
105 //
106 // The cases in the current block will be trivially dominated by the
107 // edge.
108 auto *ParentBB = OpI->getParent();
109 if (ParentBB == PhiBB) {
110 if (isa(OpI)) {
111 // We can trivially map through phi nodes in the same block.
112 continue;
113 }
114 } else if (DT.dominates(ParentBB, PhiBB)) {
115 // Instructions from dominating blocks are already available.
116 continue;
117 }
118
119 // Once we know that we're considering speculating the operand, check
120 // if we've already explored this subgraph and found it to be safe.
121 if (PotentialSpecSet.count(OpI))
122 continue;
123
124 // If we've already explored this subgraph and found it unsafe, bail.
125 // If when we directly test whether this is safe it fails, bail.
// Reaching this point with `ParentBB != PhiBB` means the operand lives in
// a block that does not dominate `PhiBB` (the dominating case continued
// above), so it is treated as unsafe.
126 if (UnsafeSet.count(OpI) || ParentBB != PhiBB ||
127 mayBeMemoryDependent(*OpI)) {
128 DEBUG(dbgs() << " Unsafe: can't speculate transitive use: " << *OpI
129 << "\n");
130 // Record the stack of instructions which reach this node as unsafe
131 // so we prune subsequent searches.
// Only entries still on `DFSStack` are recorded; the node currently being
// scanned (`UI`) was popped above and is therefore not added here.
132 UnsafeSet.insert(OpI);
133 for (auto &StackPair : DFSStack) {
134 Instruction *I = StackPair.first;
135 UnsafeSet.insert(I);
136 }
137 return false;
138 }
139
140 // Skip any operands we're already recursively checking.
141 if (!Visited.insert(OpI).second)
142 continue;
143
144 // Push onto the stack and descend. We can directly continue this
145 // loop when ascending.
// `OpIt` was already advanced, so the saved pair resumes the parent at its
// next operand once the child has been fully explored.
146 DFSStack.push_back({UI, OpIt});
147 UI = OpI;
148 OpIt = OpI->value_op_begin();
149 }
150
151 // This node and all its operands are safe. Go ahead and cache that for
152 // reuse later.
153 PotentialSpecSet.insert(UI);
154
155 // Continue with the next node on the stack.
156 } while (!DFSStack.empty());
157 }
158
159 #ifndef NDEBUG
160 // Every visited operand should have been marked as safe for speculation at
161 // this point. Verify this and return success.
162 for (auto *I : Visited)
163 assert(PotentialSpecSet.count(I) &&
164 "Failed to mark a visited instruction as safe!");
165 #endif
166 return true;
167 }
168
169 /// Check whether, in isolation, a given PHI node is both safe and profitable
170 /// to speculate users around.
171 ///
172 /// This handles checking whether there are any constant operands to a PHI
173 /// which could represent a useful speculation candidate, whether the users of
174 /// the PHI are safe to speculate including all their transitive dependencies,
175 /// and whether after speculation there will be some cost savings (profit) to
176 /// folding the operands into the users of the PHI node. Returns true if both
177 /// safe and profitable with relevant cost savings updated in the map and with
178 /// an update to the `PotentialSpecSet`. Returns false if either safety or
179 /// profitability are absent. Some new entries may be made to the
180 /// `PotentialSpecSet` even when this routine returns false, but they remain
181 /// conservatively correct.
182 ///
183 /// The profitability check here is a local one, but it checks this in an
184 /// interesting way. Beyond checking that the total cost of materializing the
185 /// constants will be less than the cost of folding them into their users, it
186 /// also checks that no one incoming constant will have a higher cost when
187 /// folded into its users rather than materialized. This higher cost could
188 /// result in a dynamic *path* that is more expensive even when the total cost
189 /// is lower. Currently, all of the interesting cases where this optimization
190 /// should fire are ones where it is a no-loss operation in this sense. If we
191 /// ever want to be more aggressive here, we would need to balance the
192 /// different incoming edges' cost by looking at their respective
193 /// probabilities.
194 static bool isSafeAndProfitableToSpeculateAroundPHI(
195 PHINode &PN, SmallDenseMap &CostSavingsMap,
196 SmallPtrSetImpl &PotentialSpecSet,
197 SmallPtrSetImpl &UnsafeSet, DominatorTree &DT,
198 TargetTransformInfo &TTI) {
199 // First see whether there is any cost savings to speculating around this
200 // PHI, and build up a map of the constant inputs to how many times they
201 // occur.
202 bool NonFreeMat = false;
// Per-constant bookkeeping: the cost of materializing the constant, the
// accumulated cost of folding it into every user of the PHI, and the number
// of distinct incoming blocks that supply it.
203 struct CostsAndCount {
204 int MatCost = TargetTransformInfo::TCC_Free;
205 int FoldedCost = TargetTransformInfo::TCC_Free;
206 int Count = 0;
207 };
208 SmallDenseMap CostsAndCounts;
209 SmallPtrSet IncomingConstantBlocks;
210 for (int i : llvm::seq(0, PN.getNumIncomingValues())) {
// Incoming values that fail the dyn_cast are skipped. NOTE(review): the cast
// target type is not visible in this listing; the later `getValue()` calls
// and the `ConstantInt *` binding below suggest ConstantInt -- confirm.
211 auto *IncomingC = dyn_cast(PN.getIncomingValue(i));
212 if (!IncomingC)
213 continue;
214
215 // Only visit each incoming edge with a constant input once.
216 if (!IncomingConstantBlocks.insert(PN.getIncomingBlock(i)).second)
217 continue;
218
219 auto InsertResult = CostsAndCounts.insert({IncomingC, {}});
220 // Count how many edges share a given incoming constant.
221 ++InsertResult.first->second.Count;
222 // Only compute the cost the first time we see a particular constant.
223 if (!InsertResult.second)
224 continue;
225
226 int &MatCost = InsertResult.first->second.MatCost;
227 MatCost = TTI.getIntImmCost(IncomingC->getValue(), IncomingC->getType());
228 NonFreeMat |= MatCost != TTI.TCC_Free;
229 }
// If no constant was seen at all, `NonFreeMat` is still false and we bail
// here as well.
230 if (!NonFreeMat) {
231 DEBUG(dbgs() << " Free: " << PN << "\n");
232 // No profit in free materialization.
233 return false;
234 }
235
236 // Now check that the uses of this PHI can actually be speculated,
237 // otherwise we'll still have to materialize the PHI value.
238 if (!isSafeToSpeculatePHIUsers(PN, DT, PotentialSpecSet, UnsafeSet)) {
239 DEBUG(dbgs() << " Unsafe PHI: " << PN << "\n");
240 return false;
241 }
242
243 // Compute how much (if any) savings are available by speculating around this
244 // PHI.
245 for (Use &U : PN.uses()) {
246 auto *UserI = cast(U.getUser());
247 // Now check whether there is any savings to folding the incoming constants
248 // into this use.
249 unsigned Idx = U.getOperandNo();
250
251 // If we have a binary operator that is commutative, an actual constant
252 // operand would end up on the RHS, so pretend the use of the PHI is on the
253 // RHS.
254 //
255 // Technically, this is a bit weird if *both* operands are PHIs we're
256 // speculating. But if that is the case, giving an "optimistic" cost isn't
257 // a bad thing because after speculation it will constant fold. And
258 // moreover, such cases should likely have been constant folded already by
259 // some other pass, so we shouldn't worry about "modeling" them terribly
260 // accurately here. Similarly, if the other operand is a constant, it still
261 // seems fine to be "optimistic" in our cost modeling, because when the
262 // incoming operand from the PHI node is also a constant, we will end up
263 // constant folding.
264 if (UserI->isBinaryOp() && UserI->isCommutative() && Idx != 1)
265 // Assume we will commute the constant to the RHS to be canonical.
266 Idx = 1;
267
268 // Get the intrinsic ID if this user is an intrinsic.
269 Intrinsic::ID IID = Intrinsic::not_intrinsic;
270 if (auto *UserII = dyn_cast(UserI))
271 IID = UserII->getIntrinsicID();
272
// Charge this use against every distinct incoming constant: the folded cost
// is a running sum across all uses of the PHI, whereas the materialization
// cost was computed once per constant above.
273 for (auto &IncomingConstantAndCostsAndCount : CostsAndCounts) {
274 ConstantInt *IncomingC = IncomingConstantAndCostsAndCount.first;
275 int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
276 int &FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
277 if (IID)
278 FoldedCost += TTI.getIntImmCost(IID, Idx, IncomingC->getValue(),
279 IncomingC->getType());
280 else
281 FoldedCost +=
282 TTI.getIntImmCost(UserI->getOpcode(), Idx, IncomingC->getValue(),
283 IncomingC->getType());
284
285 // If we accumulate more folded cost for this incoming constant than
286 // materialized cost, then we'll regress any edge with this constant so
287 // just bail. We're only interested in cases where folding the incoming
288 // constants is at least break-even on all paths.
289 if (FoldedCost > MatCost) {
290 DEBUG(dbgs() << " Not profitable to fold imm: " << *IncomingC << "\n"
291 " Materializing cost: " << MatCost << "\n"
292 " Accumulated folded cost: " << FoldedCost << "\n");
293 return false;
294 }
295 }
296 }
297
298 // Compute the total cost savings afforded by this PHI node.
// Each constant's costs are weighted by the number of incoming edges that
// carry it. NOTE(review): this loop iterates by value, copying each map
// entry; only reads are performed on the copy.
299 int TotalMatCost = TTI.TCC_Free, TotalFoldedCost = TTI.TCC_Free;
300 for (auto IncomingConstantAndCostsAndCount : CostsAndCounts) {
301 int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
302 int FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
303 int Count = IncomingConstantAndCostsAndCount.second.Count;
304
305 TotalMatCost += MatCost * Count;
306 TotalFoldedCost += FoldedCost * Count;
307 }
308 assert(TotalFoldedCost <= TotalMatCost && "If each constant's folded cost is "
309 "less that its materialized cost, "
310 "the sum must be as well.");
311
312 DEBUG(dbgs() << " Cost savings " << (TotalMatCost - TotalFoldedCost)
313 << ": " << PN << "\n");
// Publish the savings for this PHI; the entry is only written on the
// success path.
314 CostSavingsMap[&PN] = TotalMatCost - TotalFoldedCost;
315 return true;
316 }
317
318 /// Simple helper to walk all the users of a list of phis depth first, and call
319 /// a visit function on each one in post-order.
320 ///
321 /// All of the PHIs should be in the same basic block, and this is primarily
322 /// used to make a single depth-first walk across their collective users
323 /// without revisiting any subgraphs. Callers should provide a fast, idempotent
324 /// callable to test whether a node has been visited and the more important
325 /// callable to actually visit a particular node.
326 ///
327 /// Depth-first and postorder here refer to the *operand* graph -- we start
328 /// from a collection of users of PHI nodes and walk "up" the operands
329 /// depth-first.
330 template
// NOTE(review): the template parameter list is missing from this listing;
// the signature below uses `IsVisitedT` and `VisitT`, so presumably those are
// the two type parameters -- confirm against the original source.
331 static void visitPHIUsersAndDepsInPostOrder(ArrayRef PNs,
332 IsVisitedT IsVisited,
333 VisitT Visit) {
// Explicit DFS stack of (instruction, next operand to examine) pairs; it is
// empty again each time the inner do/while finishes a root.
334 SmallVector, 16> DFSStack;
335 for (auto *PN : PNs)
336 for (Use &U : PN->uses()) {
337 auto *UI = cast(U.getUser());
338 if (IsVisited(UI))
339 // Already visited this user, continue across the roots.
340 continue;
341
342 // Otherwise, walk the operand graph depth-first and visit each
343 // dependency in postorder.
344 DFSStack.push_back({UI, UI->value_op_begin()});
345 do {
346 User::value_op_iterator OpIt;
// Resume the node on top of the stack at the operand where it left off.
347 std::tie(UI, OpIt) = DFSStack.pop_back_val();
348 while (OpIt != UI->value_op_end()) {
349 auto *OpI = dyn_cast(*OpIt);
350 // Increment to the next operand for whenever we continue.
351 ++OpIt;
352 // No need to visit non-instructions, which can't form dependencies,
353 // or instructions outside of our potential dependency set that we
354 // were given. Finally, if we've already visited the node, continue
355 // to the next.
// Filtering out instructions outside the dependency set is left entirely to
// the caller-supplied `IsVisited` predicate.
356 if (!OpI || IsVisited(OpI))
357 continue;
358
359 // Push onto the stack and descend. We can directly continue this
360 // loop when ascending.
361 DFSStack.push_back({UI, OpIt});
362 UI = OpI;
363 OpIt = OpI->value_op_begin();
364 }
365
366 // Finished visiting children, visit this node.
// The assert relies on `Visit` being what makes `IsVisited` return true for
// a node; nothing in this helper tracks visited nodes on its own.
367 assert(!IsVisited(UI) && "Should not have already visited a node!");
368 Visit(UI);
369 } while (!DFSStack.empty());
370 }
371 }
372
373 /// Find profitable PHIs to speculate.
374 ///
375 /// For a PHI node to be profitable, we need the cost of speculating its users
376 /// (and their dependencies) to not exceed the savings of folding the PHI's
377 /// constant operands into the speculated users.
378 ///
379 /// Computing this is surprisingly challenging. Because users of two different
380 /// PHI nodes can depend on each other or on common other instructions, it may
381 /// be profitable to speculate two PHI nodes together even though neither one
382 /// in isolation is profitable. The straightforward way to find all the
383 /// profitable PHIs would be to check each combination of PHIs' cost, but this
384 /// is exponential in complexity.
385 ///
386 /// Even if we assume that we only care about cases where we can consider each
387 /// PHI node in isolation (rather than considering cases where none are
388 /// profitable in isolation but some subset are profitable as a set), we still
389 /// have a challenge. The obvious way to find all individually profitable PHIs
390 /// is to iterate until reaching a fixed point, but this will be quadratic in
391 /// complexity. =/
392 ///
393 /// This code currently uses a linear-to-compute order for a greedy approach.
394 /// It won't find cases where a set of PHIs must be considered together, but it
395 /// handles most cases of order dependence without quadratic iteration. The
396 /// specific order used is the post-order across the operand DAG. When the last
397 /// user of a PHI is visited in this postorder walk, we check it for
398 /// profitability.
399 ///
400 /// There is an orthogonal extra complexity to all of this: computing the cost
401 /// itself can easily become a linear computation making everything again (at
402 /// best) quadratic. Using a postorder over the operand graph makes it
403 /// particularly easy to avoid this through dynamic programming. As we do the
404 /// postorder walk, we build the transitive cost of that subgraph. It is also
405 /// straightforward to then update these costs when we mark a PHI for
406 /// speculation so that subsequent PHIs don't re-pay the cost of already
407 /// speculated instructions.
408 static SmallVector
409 findProfitablePHIs(ArrayRef PNs,
410 const SmallDenseMap &CostSavingsMap,
411 const SmallPtrSetImpl &PotentialSpecSet,
412 int NumPreds, DominatorTree &DT, TargetTransformInfo &TTI) {
// NOTE(review): `DT` is not referenced anywhere in this function body.
// The result: PHIs selected for speculation, in the order they were chosen.
413 SmallVector SpecPNs;
414
415 // First, establish a reverse mapping from immediate users of the PHI nodes
416 // to the nodes themselves, and count how many users each PHI node has in
417 // a way we can update while processing them.
418 SmallDenseMap, 16> UserToPNMap;
419 SmallDenseMap PNUserCountMap;
420 SmallPtrSet UserSet;
421 for (auto *PN : PNs) {
422 assert(UserSet.empty() && "Must start with an empty user set!");
// Collect users through a set so that an instruction using the PHI in more
// than one operand is counted only once.
423 for (Use &U : PN->uses())
424 UserSet.insert(cast(U.getUser()));
425 PNUserCountMap[PN] = UserSet.size();
426 for (auto *UI : UserSet)
427 UserToPNMap.insert({UI, {}}).first->second.push_back(PN);
428 UserSet.clear();
429 }
430
431 // Now do a DFS across the operand graph of the users, computing cost as we
432 // go and when all costs for a given PHI are known, checking that PHI for
433 // profitability.
// Maps each costed instruction to the transitive cost of speculating it and
// the not-yet-speculated operands it depends on.
434 SmallDenseMap SpecCostMap;
435 visitPHIUsersAndDepsInPostOrder(
436 PNs,
437 /*IsVisited*/
438 [&](Instruction *I) {
439 // We consider anything that isn't potentially speculated to be
440 // "visited" as it is already handled. Similarly, anything that *is*
441 // potentially speculated but for which we have an entry in our cost
442 // map, we're done.
443 return !PotentialSpecSet.count(I) || SpecCostMap.count(I);
444 },
445 /*Visit*/
446 [&](Instruction *I) {
447 // We've fully visited the operands, so sum their cost with this node
448 // and update the cost map.
// Operands without an entry in `SpecCostMap` contribute nothing to the sum.
449 int Cost = TTI.TCC_Free;
450 for (Value *OpV : I->operand_values())
451 if (auto *OpI = dyn_cast(OpV)) {
452 auto CostMapIt = SpecCostMap.find(OpI);
453 if (CostMapIt != SpecCostMap.end())
454 Cost += CostMapIt->second;
455 }
456 Cost += TTI.getUserCost(I);
457 bool Inserted = SpecCostMap.insert({I, Cost}).second;
458 (void)Inserted;
459 assert(Inserted && "Must not re-insert a cost during the DFS!");
460
461 // Now check if this node had a corresponding PHI node using it. If so,
462 // we need to decrement the outstanding user count for it.
463 auto UserPNsIt = UserToPNMap.find(I);
464 if (UserPNsIt == UserToPNMap.end())
465 return;
466 auto &UserPNs = UserPNsIt->second;
// Decrement the outstanding-user count of every PHI that `I` uses. PHIs that
// still have users left to cost are moved to the front; the tail range
// [UserPNsSplitIt, end) holds the PHIs whose users have now all been costed.
467 auto UserPNsSplitIt = std::stable_partition(
468 UserPNs.begin(), UserPNs.end(), [&](PHINode *UserPN) {
469 int &PNUserCount = PNUserCountMap.find(UserPN)->second;
470 assert(
471 PNUserCount > 0 &&
472 "Should never re-visit a PN after its user count hits zero!");
473 --PNUserCount;
474 return PNUserCount != 0;
475 });
476
477 // FIXME: Rather than one at a time, we should sum the savings as the
478 // cost will be completely shared.
479 SmallVector SpecWorklist;
480 for (auto *PN : llvm::make_range(UserPNsSplitIt, UserPNs.end())) {
// Sum the memoized cost of each use of the PHI. This walks uses, not
// deduplicated users, so a user with several uses of the PHI is added once
// per use.
481 int SpecCost = TTI.TCC_Free;
482 for (Use &U : PN->uses())
483 SpecCost +=
484 SpecCostMap.find(cast(U.getUser()))->second;
// NOTE(review): presumably scaled by (NumPreds - 1) because one copy stands
// in for the original instruction and only the remaining copies are new
// redundancy -- confirm with the caller's definition of `NumPreds`.
485 SpecCost *= (NumPreds - 1);
486 // When the user count of a PHI node hits zero, we should check its
487 // profitability. If profitable, we should mark it for speculation
488 // and zero out the cost of everything it depends on.
// The lookup result is dereferenced without an end() check, so every PHI in
// `PNs` must already have an entry in `CostSavingsMap`.
489 int CostSavings = CostSavingsMap.find(PN)->second;
490 if (SpecCost > CostSavings) {
491 DEBUG(dbgs() << " Not profitable, speculation cost: " << *PN << "\n"
492 " Cost savings: " << CostSavings << "\n"
493 " Speculation cost: " << SpecCost << "\n");
494 continue;
495 }
496
497 // We're going to speculate this user-associated PHI. Copy it out and
498 // add its users to the worklist to update their cost.
499 SpecPNs.push_back(PN);
500 for (Use &U : PN->uses()) {
501 auto *UI = cast(U.getUser());
502 auto CostMapIt = SpecCostMap.find(UI);
503 if (CostMapIt->second == 0)
504 continue;
505 // Zero out this cost entry to avoid duplicates.
506 CostMapIt->second = 0;
507 SpecWorklist.push_back(UI);
508 }
509 }
510
511 // Now walk all the operands of the users in the worklist transitively
512 // to zero out all the memoized costs.
// A zero cost doubles as the "already processed" marker, which keeps each
// instruction from being pushed onto the worklist more than once.
513 while (!SpecWorklist.empty()) {
514 Instruction *SpecI = SpecWorklist.pop_back_val();
515 assert(SpecCostMap.find(SpecI)->second == 0 &&
516 "Didn't zero out a cost!");
517
518 // Walk the operands recursively to zero out their cost as well.
519 for (auto *OpV : SpecI->operand_values()) {
520 auto *OpI = dyn_cast(OpV);
521 if (!OpI)
522 continue;
523 auto CostMapIt = SpecCostMap.find(OpI);
524 if (CostMapIt == SpecCostMap.end() || CostMapIt->second == 0)
525 continue;
526 CostMapIt->second = 0;
527 SpecWorklist.push_back(OpI);
528 }
529 }
530 });
531
532 return SpecPNs;
533 }
534
535 /// Speculate users around a set of PHI nodes.
536 ///
537 /// This routine does the actual speculation around a set of PHI nodes where we
538 /// have determined this to be both safe and profitable.
539 ///
540 /// This routine handles any splitting of critical edges necessary to create
541 /// a safe block to speculate into as well as cloning the instructions and
542 /// rewriting all uses.
543 static void speculatePHIs(ArrayRef SpecPNs,
544 SmallPtrSetImpl &PotentialSpecSet,
545 SmallSetVector &PredSet,
546 DominatorTree &DT) {
547 DEBUG(dbgs() << " Speculating around " << SpecPNs.size() << " PHIs!\n");
548 NumPHIsSpeculated += SpecPNs.size();
549
550 // Split any critical edges so that we have a block to hoist into.
551 auto *ParentBB = SpecPNs[0]->getParent();
552 SmallVector SpecPreds;
553 SpecPreds.reserve(PredSet.size());
554 for (auto *PredBB : PredSet) {
555 auto *NewPredBB = SplitCriticalEdge(
556 PredBB, ParentBB,
557 CriticalEdgeSplittingOptions(&DT).setMergeIdenticalEdges());
558 if (NewPredBB) {
559 ++NumEdgesSplit;
560 DEBUG(dbgs() << " Split critical edge from: " << PredBB->getName()
561 << "\n");
562 SpecPreds.push_back(NewPredBB);
563 } else {
564 assert(PredBB->getSingleSuccessor() == ParentBB &&
565 "We need a non-critical predecessor to speculate into.");
566 assert(!isa(PredBB->getTerminator()) &&
567 "Cannot have a non-critical invoke!");
568
569 // Already non-critical, use existing pred.
570 SpecPreds.push_back(PredBB);
571 }
572 }
573
574 SmallPtrSet SpecSet;
575 SmallVector SpecList;
576 visitPHIUsersAndDepsInPostOrder(SpecPNs,
577 /*IsVisited*/
578 [&](Instruction *I) {
579 // This is visited if we don't need to
580 // speculate it or we already have
581 // speculated it.
582 return !PotentialSpecSet.count(I) ||
583 SpecSet.count(I);
584 },
585 /*Visit*/
586 [&](Instruction *I) {
587 // All operands scheduled, schedule this
588 // node.
589 SpecSet.insert(I);
590 SpecList.push_back(I);
591 });
592
593 int NumSpecInsts = SpecList.size() * SpecPreds.size();
594 int NumRedundantInsts = NumSpecInsts - SpecList.size();
595 DEBUG(dbgs() << " Inserting " << NumSpecInsts << " speculated instructions, "
596 << NumRedundantInsts << " redundancies\n");
597 NumSpeculatedInstructions += NumSpecInsts;
598 NumNewRedundantInstructions += NumRedundantInsts;
599
600 // Each predecessor is numbered by its index in `SpecPreds`, so for each
601 // instruction we speculate, the speculated instruction is stored in that
602 // index of the vector asosciated with the original instruction. We also
603 // store the incoming values for each predecessor from any PHIs used.
604 SmallDenseMap, 16> SpeculatedValueMap;
605
606 // Inject the synthetic mappings to rewrite PHIs to the appropriate incoming
607 // value. This handles both the PHIs we are speculating around and any other
608 // PHIs that happen to be used.
609 for (auto *OrigI : SpecList)
610 for (auto *OpV : OrigI->operand_values()) {
611 auto *OpPN = dyn_cast(OpV);
612 if (!OpPN || OpPN->getParent() != ParentBB)
613 continue;
614
615 auto InsertResult = SpeculatedValueMap.insert({OpPN, {}});
616 if (!InsertResult.second)
617 continue;
618
619 auto &SpeculatedVals = InsertResult.first->second;
620
621 // Populating our structure for mapping is particularly annoying because
622 // finding an incoming value for a particular predecessor block in a PHI
623 // node is a linear time operation! To avoid quadratic behavior, we build
624 // a map for this PHI node's incoming values and then translate it into
625 // the more compact representation used below.
626 SmallDenseMap IncomingValueMap;
627 for (int i : llvm::seq(0, OpPN->getNumIncomingValues()))
628 IncomingValueMap[OpPN->getIncomingBlock(i)] = OpPN->getIncomingValue(i);
629
630 for (auto *PredBB : SpecPreds)
631 SpeculatedVals.push_back(IncomingValueMap.find(PredBB)->second);
632 }
633
634 // Speculate into each predecessor.
635 for (int PredIdx : llvm::seq(0, SpecPreds.size())) {
636 auto *PredBB = SpecPreds[PredIdx];
637 assert(PredBB->getSingleSuccessor() == ParentBB &&
638 "We need a non-critical predecessor to speculate into.");
639
640 for (auto *OrigI : SpecList) {
641 auto *NewI = OrigI->clone();
642 NewI->setName(Twine(OrigI->getName()) + "." + Twine(PredIdx));
643 NewI->insertBefore(PredBB->getTerminator());
644
645 // Rewrite all the operands to the previously speculated instructions.
646 // Because we're walking in-order, the defs must precede the uses and we
647 // should already have these mappings.
648 for (Use &U : NewI->operands()) {
649 auto *OpI = dyn_cast(U.get());
650 if (!OpI)
651 continue;
652 auto MapIt = SpeculatedValueMap.find(OpI);
653 if (MapIt == SpeculatedValueMap.end())
654 continue;
655 const auto &SpeculatedVals = MapIt->second;
656 assert(SpeculatedVals[PredIdx] &&
657 "Must have a speculated value for this predecessor!");
658 assert(SpeculatedVals[PredIdx]->getType() == OpI->getType() &&
659 "Speculated value has the wrong type!");
660
661 // Rewrite the use to this predecessor's speculated instruction.
662 U.set(SpeculatedVals[PredIdx]);
663 }
664
665 // Commute instructions which now have a constant in the LHS but not the
666 // RHS.
667 if (NewI->isBinaryOp() && NewI->isCommutative() &&
668 isa(NewI->getOperand(0)) &&
669 !isa(NewI->getOperand(1)))
670 NewI->getOperandUse(0).swap(NewI->getOperandUse(1));
671
672 SpeculatedValueMap[OrigI].push_back(NewI);
673 assert(SpeculatedValueMap[OrigI][PredIdx] == NewI &&
674 "Mismatched speculated instruction index!");
675 }
676 }
677
678 // Walk the speculated instruction list and if they have uses, insert a PHI
679 // for them from the speculated versions, and replace the uses with the PHI.
680 // Then erase the instructions as they have been fully speculated. The walk
681 // needs to be in reverse so that we don't think there are users when we'll
682 // actually eventually remove them later.
683 IRBuilder<> IRB(SpecPNs[0]);
684 for (auto *OrigI : llvm::reverse(SpecList)) {
685 // Check if we need a PHI for any remaining users and if so, insert it.
686 if (!OrigI->use_empty()) {
687 auto *SpecIPN = IRB.CreatePHI(OrigI->getType(), SpecPreds.size(),
688 Twine(OrigI->getName()) + ".phi");
689 // Add the incoming values we speculated.
690 auto &SpeculatedVals = SpeculatedValueMap.find(OrigI)->second;
691 for (int PredIdx : llvm::seq(0, SpecPreds.size()))
692 SpecIPN->addIncoming(SpeculatedVals[PredIdx], SpecPreds[PredIdx]);
693
694 // And replace the uses with the PHI node.
695 OrigI->replaceAllUsesWith(SpecIPN);
696 }
697
698 // It is important to immediately erase this so that it stops using other
699 // instructions. This avoids inserting needless PHIs of them.
700 OrigI->eraseFromParent();
701 }
702
703 // All of the uses of the speculated phi nodes should be removed at this
704 // point, so erase them.
705 for (auto *SpecPN : SpecPNs) {
706 assert(SpecPN->use_empty() && "All users should have been speculated!");
707 SpecPN->eraseFromParent();
708 }
709 }
710
711 /// Try to speculate around a series of PHIs from a single basic block.
712 ///
713 /// This routine checks whether any of these PHIs are profitable to speculate
714 /// users around. If safe and profitable, it does the speculation. It returns
715 /// true when at least some speculation occurs.
///
/// \param PNs the PHI nodes to consider. Modified in place: every PHI that is
///   not safe and profitable to speculate around is removed from the vector.
/// \param DT forwarded to the safety/profitability queries and to the actual
///   speculation step.
/// \param TTI forwarded to the safety/profitability queries.
716 static bool tryToSpeculatePHIs(SmallVectorImpl &PNs,
717 DominatorTree &DT, TargetTransformInfo &TTI) {
718 DEBUG(dbgs() << "Evaluating phi nodes for speculation:\n");
719
720 // Savings in cost from speculating around a PHI node.
721 SmallDenseMap CostSavingsMap;
722
723 // Remember the set of instructions that are candidates for speculation so
724 // that we can quickly walk things within that space. This prunes out
725 // instructions already available along edges, etc.
726 SmallPtrSet PotentialSpecSet;
727
728 // Remember the set of instructions that are (transitively) unsafe to
729 // speculate into the incoming edges of this basic block. This avoids
730 // recomputing them for each PHI node we check. This set is specific to this
731 // block though as things are pruned out of it based on what is available
732 // along incoming edges.
733 SmallPtrSet UnsafeSet;
734
735 // For each PHI node in this block, check whether there are immediate folding
736 // opportunities from speculation, and whether that speculation will be
737 // valid. This determines the set of safe PHIs to speculate.
738 PNs.erase(llvm::remove_if(PNs,
739 [&](PHINode *PN) {
740 return !isSafeAndProfitableToSpeculateAroundPHI(
741 *PN, CostSavingsMap, PotentialSpecSet,
742 UnsafeSet, DT, TTI);
743 }),
744 PNs.end());
745 // If no PHIs were profitable, skip.
746 if (PNs.empty()) {
747 DEBUG(dbgs() << " No safe and profitable PHIs found!\n");
748 return false;
749 }
750
751 // We need to know how much speculation will cost which is determined by how
752 // many incoming edges will need a copy of each speculated instruction.
// NOTE(review): only the incoming blocks of the first remaining PHI are
// inspected; presumably all PHIs of one block share the same predecessors.
753 SmallSetVector PredSet;
754 for (auto *PredBB : PNs[0]->blocks()) {
// A block may be listed more than once by the PHI; only examine it the
// first time it is inserted.
755 if (!PredSet.insert(PredBB))
756 continue;
757
758 // We cannot speculate when a predecessor is an indirect branch.
759 // FIXME: We also can't reliably create a non-critical edge block for
760 // speculation if the predecessor is an invoke. This doesn't seem
761 // fundamental and we should probably be splitting critical edges
762 // differently.
763 if (isa(PredBB->getTerminator()) ||
764 isa(PredBB->getTerminator())) {
765 DEBUG(dbgs() << " Invalid: predecessor terminator: " << PredBB->getName()
766 << "\n");
767 return false;
768 }
769 }
770 if (PredSet.size() < 2) {
771 DEBUG(dbgs() << " Unimportant: phi with only one predecessor\n");
772 return false;
773 }
774
// Narrow the safe candidates down to the PHIs that are worth speculating
// given the number of distinct predecessors.
775 SmallVector SpecPNs = findProfitablePHIs(
776 PNs, CostSavingsMap, PotentialSpecSet, PredSet.size(), DT, TTI);
777 if (SpecPNs.empty())
778 // Nothing to do.
779 return false;
780
781 speculatePHIs(SpecPNs, PotentialSpecSet, PredSet, DT);
782 return true;
783 }
784
// Pass entry point: visits every block of F in reverse post-order, gathers the
// PHI nodes at the start of the block and attempts to speculate around them.
// Returns PreservedAnalyses::all() when nothing was changed; otherwise returns
// a default-constructed PreservedAnalyses with nothing explicitly marked as
// preserved.
785 PreservedAnalyses SpeculateAroundPHIsPass::run(Function &F,
786 FunctionAnalysisManager &AM) {
// NOTE(review): the analysis types requested from AM are not visible in this
// view; judging by the variable names they are the dominator tree and the
// target transform info -- confirm.
787 auto &DT = AM.getResult(F);
788 auto &TTI = AM.getResult(F);
789
790 bool Changed = false;
791 for (auto *BB : ReversePostOrderTraversal(&F)) {
792 SmallVector PNs;
// Collect the leading run of PHI nodes. The loop stops at the first
// instruction for which the cast fails; it never checks for the end of the
// block, so it presumably relies on every block ending in a non-PHI
// instruction (TODO confirm).
793 auto BBI = BB->begin();
794 while (auto *PN = dyn_cast(&*BBI)) {
795 PNs.push_back(PN);
796 ++BBI;
797 }
798
799 if (PNs.empty())
800 continue;
801
802 Changed |= tryToSpeculatePHIs(PNs, DT, TTI);
803 }
804
805 if (!Changed)
806 return PreservedAnalyses::all();
807
808 PreservedAnalyses PA;
809 return PA;
810 }
209209 ; CHECK-O-NEXT: Running pass: InstSimplifierPass
210210 ; CHECK-O-NEXT: Running pass: DivRemPairsPass
211211 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
212 ; CHECK-O-NEXT: Running pass: SpeculateAroundPHIsPass
212213 ; CHECK-O-NEXT: Finished llvm::Function pass manager run.
213214 ; CHECK-O-NEXT: Running pass: GlobalDCEPass
214215 ; CHECK-O-NEXT: Running pass: ConstantMergePass
197197 ; CHECK-POSTLINK-O-NEXT: Running pass: InstSimplifierPass
198198 ; CHECK-POSTLINK-O-NEXT: Running pass: DivRemPairsPass
199199 ; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass
200 ; CHECK-POSTLINK-O-NEXT: Running pass: SpeculateAroundPHIsPass
200201 ; CHECK-POSTLINK-O-NEXT: Finished llvm::Function pass manager run.
201202 ; CHECK-POSTLINK-O-NEXT: Running pass: GlobalDCEPass
202203 ; CHECK-POSTLINK-O-NEXT: Running pass: ConstantMergePass
0 ; Test the basic functionality of speculating around PHI nodes based on reduced
1 ; cost of the constant operands to the PHI nodes using the x86 cost model.
2 ;
3 ; REQUIRES: x86-registered-target
4 ; RUN: opt -S -passes=spec-phis < %s | FileCheck %s
5
6 target triple = "x86_64-unknown-unknown"
7
; Simplest case: a PHI of two constants feeds a single add in %exit. The add is
; expected to be cloned into each predecessor with the constant as an immediate
; operand, leaving only a PHI of the two sums in %exit.
8 define i32 @test_basic(i1 %flag, i32 %arg) {
9 ; CHECK-LABEL: define i32 @test_basic(
10 entry:
11 br i1 %flag, label %a, label %b
12 ; CHECK: br i1 %flag, label %a, label %b
13
14 a:
15 br label %exit
16 ; CHECK: a:
17 ; CHECK-NEXT: %[[SUM_A:.*]] = add i32 %arg, 7
18 ; CHECK-NEXT: br label %exit
19
20 b:
21 br label %exit
22 ; CHECK: b:
23 ; CHECK-NEXT: %[[SUM_B:.*]] = add i32 %arg, 11
24 ; CHECK-NEXT: br label %exit
25
26 exit:
27 %p = phi i32 [ 7, %a ], [ 11, %b ]
28 %sum = add i32 %arg, %p
29 ret i32 %sum
30 ; CHECK: exit:
31 ; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ %[[SUM_A]], %a ], [ %[[SUM_B]], %b ]
32 ; CHECK-NEXT: ret i32 %[[PHI]]
33 }
34
35 ; Check that we handle commuted operands and get the constant onto the RHS.
; The input add has the PHI as its first operand; the speculated adds are
; expected to carry the constant as their second operand.
36 define i32 @test_commuted(i1 %flag, i32 %arg) {
37 ; CHECK-LABEL: define i32 @test_commuted(
38 entry:
39 br i1 %flag, label %a, label %b
40 ; CHECK: br i1 %flag, label %a, label %b
41
42 a:
43 br label %exit
44 ; CHECK: a:
45 ; CHECK-NEXT: %[[SUM_A:.*]] = add i32 %arg, 7
46 ; CHECK-NEXT: br label %exit
47
48 b:
49 br label %exit
50 ; CHECK: b:
51 ; CHECK-NEXT: %[[SUM_B:.*]] = add i32 %arg, 11
52 ; CHECK-NEXT: br label %exit
53
54 exit:
55 %p = phi i32 [ 7, %a ], [ 11, %b ]
56 %sum = add i32 %p, %arg
57 ret i32 %sum
58 ; CHECK: exit:
59 ; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ %[[SUM_A]], %a ], [ %[[SUM_B]], %b ]
60 ; CHECK-NEXT: ret i32 %[[PHI]]
61 }
62
; %entry branches to both %exit and %a, and %exit has two predecessors, so the
; edge from %entry to %exit is critical. The pass is expected to insert a new
; block on that edge and speculate the add into it.
63 define i32 @test_split_crit_edge(i1 %flag, i32 %arg) {
64 ; CHECK-LABEL: define i32 @test_split_crit_edge(
65 entry:
66 br i1 %flag, label %exit, label %a
67 ; CHECK: entry:
68 ; CHECK-NEXT: br i1 %flag, label %[[ENTRY_SPLIT:.*]], label %a
69 ;
70 ; CHECK: [[ENTRY_SPLIT]]:
71 ; CHECK-NEXT: %[[SUM_ENTRY_SPLIT:.*]] = add i32 %arg, 7
72 ; CHECK-NEXT: br label %exit
73
74 a:
75 br label %exit
76 ; CHECK: a:
77 ; CHECK-NEXT: %[[SUM_A:.*]] = add i32 %arg, 11
78 ; CHECK-NEXT: br label %exit
79
80 exit:
81 %p = phi i32 [ 7, %entry ], [ 11, %a ]
82 %sum = add i32 %arg, %p
83 ret i32 %sum
84 ; CHECK: exit:
85 ; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ %[[SUM_ENTRY_SPLIT]], %[[ENTRY_SPLIT]] ], [ %[[SUM_A]], %a ]
86 ; CHECK-NEXT: ret i32 %[[PHI]]
87 }
88
; The non-PHI operand of the add is a load in %entry, which is already
; available in both predecessors. The load is expected to stay where it is;
; only the add is cloned into %a and %b.
89 define i32 @test_no_spec_dominating_inst(i1 %flag, i32* %ptr) {
90 ; CHECK-LABEL: define i32 @test_no_spec_dominating_inst(
91 entry:
92 %load = load i32, i32* %ptr
93 br i1 %flag, label %a, label %b
94 ; CHECK: %[[LOAD:.*]] = load i32, i32* %ptr
95 ; CHECK-NEXT: br i1 %flag, label %a, label %b
96
97 a:
98 br label %exit
99 ; CHECK: a:
100 ; CHECK-NEXT: %[[SUM_A:.*]] = add i32 %[[LOAD]], 7
101 ; CHECK-NEXT: br label %exit
102
103 b:
104 br label %exit
105 ; CHECK: b:
106 ; CHECK-NEXT: %[[SUM_B:.*]] = add i32 %[[LOAD]], 11
107 ; CHECK-NEXT: br label %exit
108
109 exit:
110 %p = phi i32 [ 7, %a ], [ 11, %b ]
111 %sum = add i32 %load, %p
112 ret i32 %sum
113 ; CHECK: exit:
114 ; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ %[[SUM_A]], %a ], [ %[[SUM_B]], %b ]
115 ; CHECK-NEXT: ret i32 %[[PHI]]
116 }
117
118 ; We have special logic handling PHI nodes, make sure it doesn't get confused
119 ; by a dominating PHI.
; Here %xy.phi lives in %merge, not in %exit, so it must be used as-is by the
; speculated adds rather than being replaced by one of its incoming values.
120 define i32 @test_no_spec_dominating_phi(i1 %flag1, i1 %flag2, i32 %x, i32 %y) {
121 ; CHECK-LABEL: define i32 @test_no_spec_dominating_phi(
122 entry:
123 br i1 %flag1, label %x.block, label %y.block
124 ; CHECK: entry:
125 ; CHECK-NEXT: br i1 %flag1, label %x.block, label %y.block
126
127 x.block:
128 br label %merge
129 ; CHECK: x.block:
130 ; CHECK-NEXT: br label %merge
131
132 y.block:
133 br label %merge
134 ; CHECK: y.block:
135 ; CHECK-NEXT: br label %merge
136
137 merge:
138 %xy.phi = phi i32 [ %x, %x.block ], [ %y, %y.block ]
139 br i1 %flag2, label %a, label %b
140 ; CHECK: merge:
141 ; CHECK-NEXT: %[[XY_PHI:.*]] = phi i32 [ %x, %x.block ], [ %y, %y.block ]
142 ; CHECK-NEXT: br i1 %flag2, label %a, label %b
143
144 a:
145 br label %exit
146 ; CHECK: a:
147 ; CHECK-NEXT: %[[SUM_A:.*]] = add i32 %[[XY_PHI]], 7
148 ; CHECK-NEXT: br label %exit
149
150 b:
151 br label %exit
152 ; CHECK: b:
153 ; CHECK-NEXT: %[[SUM_B:.*]] = add i32 %[[XY_PHI]], 11
154 ; CHECK-NEXT: br label %exit
155
156 exit:
157 %p = phi i32 [ 7, %a ], [ 11, %b ]
158 %sum = add i32 %xy.phi, %p
159 ret i32 %sum
160 ; CHECK: exit:
161 ; CHECK-NEXT: %[[SUM_PHI:.*]] = phi i32 [ %[[SUM_A]], %a ], [ %[[SUM_B]], %b ]
162 ; CHECK-NEXT: ret i32 %[[SUM_PHI]]
163 }
164
165 ; Ensure that we will speculate some number of "free" instructions on the given
166 ; architecture even though they are unrelated to the PHI itself.
; The two truncs are operands of the PHI's user rather than users of the PHI;
; they are expected to be cloned into each predecessor together with the add.
167 define i32 @test_speculate_free_insts(i1 %flag, i64 %arg) {
168 ; CHECK-LABEL: define i32 @test_speculate_free_insts(
169 entry:
170 br i1 %flag, label %a, label %b
171 ; CHECK: br i1 %flag, label %a, label %b
172
173 a:
174 br label %exit
175 ; CHECK: a:
176 ; CHECK-NEXT: %[[T1_A:.*]] = trunc i64 %arg to i48
177 ; CHECK-NEXT: %[[T2_A:.*]] = trunc i48 %[[T1_A]] to i32
178 ; CHECK-NEXT: %[[SUM_A:.*]] = add i32 %[[T2_A]], 7
179 ; CHECK-NEXT: br label %exit
180
181 b:
182 br label %exit
183 ; CHECK: b:
184 ; CHECK-NEXT: %[[T1_B:.*]] = trunc i64 %arg to i48
185 ; CHECK-NEXT: %[[T2_B:.*]] = trunc i48 %[[T1_B]] to i32
186 ; CHECK-NEXT: %[[SUM_B:.*]] = add i32 %[[T2_B]], 11
187 ; CHECK-NEXT: br label %exit
188
189 exit:
190 %p = phi i32 [ 7, %a ], [ 11, %b ]
191 %t1 = trunc i64 %arg to i48
192 %t2 = trunc i48 %t1 to i32
193 %sum = add i32 %t2, %p
194 ret i32 %sum
195 ; CHECK: exit:
196 ; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ %[[SUM_A]], %a ], [ %[[SUM_B]], %b ]
197 ; CHECK-NEXT: ret i32 %[[PHI]]
198 }
199
; The add uses a second PHI (%p2) from the same block whose incoming values are
; not constants. Each speculated add is expected to use %p2's incoming value
; for that predecessor directly (%arg1 in %a, %arg2 in %b).
200 define i32 @test_speculate_free_phis(i1 %flag, i32 %arg1, i32 %arg2) {
201 ; CHECK-LABEL: define i32 @test_speculate_free_phis(
202 entry:
203 br i1 %flag, label %a, label %b
204 ; CHECK: br i1 %flag, label %a, label %b
205
206 a:
207 br label %exit
208 ; CHECK: a:
209 ; CHECK-NEXT: %[[SUM_A:.*]] = add i32 %arg1, 7
210 ; CHECK-NEXT: br label %exit
211
212 b:
213 br label %exit
214 ; CHECK: b:
215 ; CHECK-NEXT: %[[SUM_B:.*]] = add i32 %arg2, 11
216 ; CHECK-NEXT: br label %exit
217
218 exit:
219 %p1 = phi i32 [ 7, %a ], [ 11, %b ]
220 %p2 = phi i32 [ %arg1, %a ], [ %arg2, %b ]
221 %sum = add i32 %p2, %p1
222 ret i32 %sum
223 ; CHECK: exit:
224 ; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ %[[SUM_A]], %a ], [ %[[SUM_B]], %b ]
225 ; We don't DCE the now unused PHI node...
226 ; CHECK-NEXT: %{{.*}} = phi i32 [ %arg1, %a ], [ %arg2, %b ]
227 ; CHECK-NEXT: ret i32 %[[PHI]]
228 }
229
230 ; We shouldn't speculate multiple uses even if each individually looks
231 ; profitable because of the total cost.
; %p has three direct users here; the expected output leaves both the
; predecessors and %exit exactly as written.
232 define i32 @test_no_spec_multi_uses(i1 %flag, i32 %arg1, i32 %arg2, i32 %arg3) {
233 ; CHECK-LABEL: define i32 @test_no_spec_multi_uses(
234 entry:
235 br i1 %flag, label %a, label %b
236 ; CHECK: br i1 %flag, label %a, label %b
237
238 a:
239 br label %exit
240 ; CHECK: a:
241 ; CHECK-NEXT: br label %exit
242
243 b:
244 br label %exit
245 ; CHECK: b:
246 ; CHECK-NEXT: br label %exit
247
248 exit:
249 %p = phi i32 [ 7, %a ], [ 11, %b ]
250 %add1 = add i32 %arg1, %p
251 %add2 = add i32 %arg2, %p
252 %add3 = add i32 %arg3, %p
253 %sum1 = add i32 %add1, %add2
254 %sum2 = add i32 %sum1, %add3
255 ret i32 %sum2
256 ; CHECK: exit:
257 ; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ 7, %a ], [ 11, %b ]
258 ; CHECK-NEXT: %[[ADD1:.*]] = add i32 %arg1, %[[PHI]]
259 ; CHECK-NEXT: %[[ADD2:.*]] = add i32 %arg2, %[[PHI]]
260 ; CHECK-NEXT: %[[ADD3:.*]] = add i32 %arg3, %[[PHI]]
261 ; CHECK-NEXT: %[[SUM1:.*]] = add i32 %[[ADD1]], %[[ADD2]]
262 ; CHECK-NEXT: %[[SUM2:.*]] = add i32 %[[SUM1]], %[[ADD3]]
263 ; CHECK-NEXT: ret i32 %[[SUM2]]
264 }
265
; Three constant PHIs feed a chain of three adds. The whole chain is expected
; to be speculated into each predecessor, with each add taking that
; predecessor's constant, leaving one PHI of the final sums in %exit.
266 define i32 @test_multi_phis1(i1 %flag, i32 %arg) {
267 ; CHECK-LABEL: define i32 @test_multi_phis1(
268 entry:
269 br i1 %flag, label %a, label %b
270 ; CHECK: br i1 %flag, label %a, label %b
271
272 a:
273 br label %exit
274 ; CHECK: a:
275 ; CHECK-NEXT: %[[SUM_A1:.*]] = add i32 %arg, 1
276 ; CHECK-NEXT: %[[SUM_A2:.*]] = add i32 %[[SUM_A1]], 3
277 ; CHECK-NEXT: %[[SUM_A3:.*]] = add i32 %[[SUM_A2]], 5
278 ; CHECK-NEXT: br label %exit
279
280 b:
281 br label %exit
282 ; CHECK: b:
283 ; CHECK-NEXT: %[[SUM_B1:.*]] = add i32 %arg, 2
284 ; CHECK-NEXT: %[[SUM_B2:.*]] = add i32 %[[SUM_B1]], 4
285 ; CHECK-NEXT: %[[SUM_B3:.*]] = add i32 %[[SUM_B2]], 6
286 ; CHECK-NEXT: br label %exit
287
288 exit:
289 %p1 = phi i32 [ 1, %a ], [ 2, %b ]
290 %p2 = phi i32 [ 3, %a ], [ 4, %b ]
291 %p3 = phi i32 [ 5, %a ], [ 6, %b ]
292 %sum1 = add i32 %arg, %p1
293 %sum2 = add i32 %sum1, %p2
294 %sum3 = add i32 %sum2, %p3
295 ret i32 %sum3
296 ; CHECK: exit:
297 ; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ %[[SUM_A3]], %a ], [ %[[SUM_B3]], %b ]
298 ; CHECK-NEXT: ret i32 %[[PHI]]
299 }
300
301 ; Check that the order of the PHIs doesn't impact the behavior.
; Same add chain as test_multi_phis1, but the three PHIs are declared in the
; reverse order; the expected output is identical.
302 define i32 @test_multi_phis2(i1 %flag, i32 %arg) {
303 ; CHECK-LABEL: define i32 @test_multi_phis2(
304 entry:
305 br i1 %flag, label %a, label %b
306 ; CHECK: br i1 %flag, label %a, label %b
307
308 a:
309 br label %exit
310 ; CHECK: a:
311 ; CHECK-NEXT: %[[SUM_A1:.*]] = add i32 %arg, 1
312 ; CHECK-NEXT: %[[SUM_A2:.*]] = add i32 %[[SUM_A1]], 3
313 ; CHECK-NEXT: %[[SUM_A3:.*]] = add i32 %[[SUM_A2]], 5
314 ; CHECK-NEXT: br label %exit
315
316 b:
317 br label %exit
318 ; CHECK: b:
319 ; CHECK-NEXT: %[[SUM_B1:.*]] = add i32 %arg, 2
320 ; CHECK-NEXT: %[[SUM_B2:.*]] = add i32 %[[SUM_B1]], 4
321 ; CHECK-NEXT: %[[SUM_B3:.*]] = add i32 %[[SUM_B2]], 6
322 ; CHECK-NEXT: br label %exit
323
324 exit:
325 %p3 = phi i32 [ 5, %a ], [ 6, %b ]
326 %p2 = phi i32 [ 3, %a ], [ 4, %b ]
327 %p1 = phi i32 [ 1, %a ], [ 2, %b ]
328 %sum1 = add i32 %arg, %p1
329 %sum2 = add i32 %sum1, %p2
330 %sum3 = add i32 %sum2, %p3
331 ret i32 %sum3
332 ; CHECK: exit:
333 ; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ %[[SUM_A3]], %a ], [ %[[SUM_B3]], %b ]
334 ; CHECK-NEXT: ret i32 %[[PHI]]
335 }
336
; Both predecessors of %exit end in an indirectbr. No speculation is expected:
; the predecessors and the PHI/add pair in %exit must be left unchanged.
337 define i32 @test_no_spec_indirectbr(i1 %flag, i32 %arg) {
338 ; CHECK-LABEL: define i32 @test_no_spec_indirectbr(
339 entry:
340 br i1 %flag, label %a, label %b
341 ; CHECK: entry:
342 ; CHECK-NEXT: br i1 %flag, label %a, label %b
343
344 a:
345 indirectbr i8* undef, [label %exit]
346 ; CHECK: a:
347 ; CHECK-NEXT: indirectbr i8* undef, [label %exit]
348
349 b:
350 indirectbr i8* undef, [label %exit]
351 ; CHECK: b:
352 ; CHECK-NEXT: indirectbr i8* undef, [label %exit]
353
354 exit:
355 %p = phi i32 [ 7, %a ], [ 11, %b ]
356 %sum = add i32 %arg, %p
357 ret i32 %sum
358 ; CHECK: exit:
359 ; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ 7, %a ], [ 11, %b ]
360 ; CHECK-NEXT: %[[SUM:.*]] = add i32 %arg, %[[PHI]]
361 ; CHECK-NEXT: ret i32 %[[SUM]]
362 }
363
364 declare void @g()
365
366 declare i32 @__gxx_personality_v0(...)
367
368 ; FIXME: We should be able to handle this case -- only the exceptional edge is
369 ; impossible to split.
; Both predecessors reach %exit through the normal destination of an invoke.
; For now the expected output leaves the PHI and the add in %exit untouched.
370 define i32 @test_no_spec_invoke_continue(i1 %flag, i32 %arg) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
371 ; CHECK-LABEL: define i32 @test_no_spec_invoke_continue(
372 entry:
373 br i1 %flag, label %a, label %b
374 ; CHECK: entry:
375 ; CHECK-NEXT: br i1 %flag, label %a, label %b
376
377 a:
378 invoke void @g()
379 to label %exit unwind label %lpad
380 ; CHECK: a:
381 ; CHECK-NEXT: invoke void @g()
382 ; CHECK-NEXT: to label %exit unwind label %lpad
383
384 b:
385 invoke void @g()
386 to label %exit unwind label %lpad
387 ; CHECK: b:
388 ; CHECK-NEXT: invoke void @g()
389 ; CHECK-NEXT: to label %exit unwind label %lpad
390
391 exit:
392 %p = phi i32 [ 7, %a ], [ 11, %b ]
393 %sum = add i32 %arg, %p
394 ret i32 %sum
395 ; CHECK: exit:
396 ; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ 7, %a ], [ 11, %b ]
397 ; CHECK-NEXT: %[[SUM:.*]] = add i32 %arg, %[[PHI]]
398 ; CHECK-NEXT: ret i32 %[[SUM]]
399
400 lpad:
401 %lp = landingpad { i8*, i32 }
402 cleanup
403 resume { i8*, i32 } undef
404 }
405
; The constant PHI sits in a landing pad block that is reached through the
; unwind edges of two invokes. The test only pins that the PHI keeps its
; constant incoming values, i.e. that nothing was speculated around it.
406 define i32 @test_no_spec_landingpad(i32 %arg, i32* %ptr) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
407 ; CHECK-LABEL: define i32 @test_no_spec_landingpad(
408 entry:
409 invoke void @g()
410 to label %invoke.cont unwind label %lpad
411 ; CHECK: entry:
412 ; CHECK-NEXT: invoke void @g()
413 ; CHECK-NEXT: to label %invoke.cont unwind label %lpad
414
415 invoke.cont:
416 invoke void @g()
417 to label %exit unwind label %lpad
418 ; CHECK: invoke.cont:
419 ; CHECK-NEXT: invoke void @g()
420 ; CHECK-NEXT: to label %exit unwind label %lpad
421
422 lpad:
423 %p = phi i32 [ 7, %entry ], [ 11, %invoke.cont ]
424 %lp = landingpad { i8*, i32 }
425 cleanup
426 %sum = add i32 %arg, %p
427 store i32 %sum, i32* %ptr
428 resume { i8*, i32 } undef
429 ; CHECK: lpad:
430 ; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ 7, %entry ], [ 11, %invoke.cont ]
431
432 exit:
433 ret i32 0
434 }
435
436 declare i32 @__CxxFrameHandler3(...)
437
; Same shape as test_no_spec_landingpad, but the unwind destination starts with
; a cleanuppad instead of a landingpad. Again the test only pins that the PHI
; keeps its constant incoming values.
438 define i32 @test_no_spec_cleanuppad(i32 %arg, i32* %ptr) personality i32 (...)* @__CxxFrameHandler3 {
439 ; CHECK-LABEL: define i32 @test_no_spec_cleanuppad(
440 entry:
441 invoke void @g()
442 to label %invoke.cont unwind label %lpad
443 ; CHECK: entry:
444 ; CHECK-NEXT: invoke void @g()
445 ; CHECK-NEXT: to label %invoke.cont unwind label %lpad
446
447 invoke.cont:
448 invoke void @g()
449 to label %exit unwind label %lpad
450 ; CHECK: invoke.cont:
451 ; CHECK-NEXT: invoke void @g()
452 ; CHECK-NEXT: to label %exit unwind label %lpad
453
454 lpad:
455 %p = phi i32 [ 7, %entry ], [ 11, %invoke.cont ]
456 %cp = cleanuppad within none []
457 %sum = add i32 %arg, %p
458 store i32 %sum, i32* %ptr
459 cleanupret from %cp unwind to caller
460 ; CHECK: lpad:
461 ; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ 7, %entry ], [ 11, %invoke.cont ]
462
463 exit:
464 ret i32 0
465 }
466
467 ; Check that we don't fall over when confronted with seemingly reasonable code
468 ; for us to handle but in an unreachable region and with non-PHI use-def
469 ; cycles.
; %entry returns immediately, so %a, %b and %exit are never reached. Inside
; %exit, %sum -> %zext -> %trunc -> %sum forms a use-def cycle that contains no
; PHI. The expected output leaves all of these blocks unchanged.
470 define i32 @test_unreachable_non_phi_cycles(i1 %flag, i32 %arg) {
471 ; CHECK-LABEL: define i32 @test_unreachable_non_phi_cycles(
472 entry:
473 ret i32 42
474 ; CHECK: entry:
475 ; CHECK-NEXT: ret i32 42
476
477 a:
478 br label %exit
479 ; CHECK: a:
480 ; CHECK-NEXT: br label %exit
481
482 b:
483 br label %exit
484 ; CHECK: b:
485 ; CHECK-NEXT: br label %exit
486
487 exit:
488 %p = phi i32 [ 7, %a ], [ 11, %b ]
489 %zext = zext i32 %sum to i64
490 %trunc = trunc i64 %zext to i32
491 %sum = add i32 %trunc, %p
492 br i1 %flag, label %a, label %b
493 ; CHECK: exit:
494 ; CHECK-NEXT: %[[PHI:.*]] = phi i32 [ 7, %a ], [ 11, %b ]
495 ; CHECK-NEXT: %[[ZEXT:.*]] = zext i32 %[[SUM:.*]] to i64
496 ; CHECK-NEXT: %[[TRUNC:.*]] = trunc i64 %[[ZEXT]] to i32
497 ; CHECK-NEXT: %[[SUM]] = add i32 %[[TRUNC]], %[[PHI]]
498 ; CHECK-NEXT: br i1 %flag, label %a, label %b
499 }
500
501 ; Check that we don't speculate in the face of an expensive immediate. There
502 ; are two reasons this should never speculate. First, even a local analysis
503 ; should fail because it makes some paths (%a) potentially more expensive due
504 ; to multiple uses of the immediate. Additionally, when we go to speculate the
505 ; instructions, their cost will also be too high.
506 ; FIXME: The goal is really to test the first property, but there doesn't
507 ; happen to be any way to use free-to-speculate instructions here so that it
508 ; would be the only interesting property.
; The incoming value from %a is 4294967296 (2^32), which does not fit in 32
; bits; the other three incoming values are 1. The PHI is used twice in %exit.
509 define i64 @test_expensive_imm(i32 %flag, i64 %arg) {
510 ; CHECK-LABEL: define i64 @test_expensive_imm(
511 entry:
512 switch i32 %flag, label %a [
513 i32 1, label %b
514 i32 2, label %c
515 i32 3, label %d
516 ]
517 ; CHECK: switch i32 %flag, label %a [
518 ; CHECK-NEXT: i32 1, label %b
519 ; CHECK-NEXT: i32 2, label %c
520 ; CHECK-NEXT: i32 3, label %d
521 ; CHECK-NEXT: ]
522
523 a:
524 br label %exit
525 ; CHECK: a:
526 ; CHECK-NEXT: br label %exit
527
528 b:
529 br label %exit
530 ; CHECK: b:
531 ; CHECK-NEXT: br label %exit
532
533 c:
534 br label %exit
535 ; CHECK: c:
536 ; CHECK-NEXT: br label %exit
537
538 d:
539 br label %exit
540 ; CHECK: d:
541 ; CHECK-NEXT: br label %exit
542
543 exit:
544 %p = phi i64 [ 4294967296, %a ], [ 1, %b ], [ 1, %c ], [ 1, %d ]
545 %sum1 = add i64 %arg, %p
546 %sum2 = add i64 %sum1, %p
547 ret i64 %sum2
548 ; CHECK: exit:
549 ; CHECK-NEXT: %[[PHI:.*]] = phi i64 [ {{[0-9]+}}, %a ], [ 1, %b ], [ 1, %c ], [ 1, %d ]
550 ; CHECK-NEXT: %[[SUM1:.*]] = add i64 %arg, %[[PHI]]
551 ; CHECK-NEXT: %[[SUM2:.*]] = add i64 %[[SUM1]], %[[PHI]]
552 ; CHECK-NEXT: ret i64 %[[SUM2]]
553 }
554
; Two constant PHIs in %merge. The user of %p1 lives in %merge itself and is
; expected to be speculated into %a and %b. The only user of %p2 lives in
; %exit2, which is reached on just one of the two paths out of %merge; %p2 and
; that add are expected to be left alone.
555 define i32 @test_no_spec_non_postdominating_uses(i1 %flag1, i1 %flag2, i32 %arg) {
556 ; CHECK-LABEL: define i32 @test_no_spec_non_postdominating_uses(
557 entry:
558 br i1 %flag1, label %a, label %b
559 ; CHECK: br i1 %flag1, label %a, label %b
560
561 a:
562 br label %merge
563 ; CHECK: a:
564 ; CHECK-NEXT: %[[SUM_A:.*]] = add i32 %arg, 7
565 ; CHECK-NEXT: br label %merge
566
567 b:
568 br label %merge
569 ; CHECK: b:
570 ; CHECK-NEXT: %[[SUM_B:.*]] = add i32 %arg, 11
571 ; CHECK-NEXT: br label %merge
572
573 merge:
574 %p1 = phi i32 [ 7, %a ], [ 11, %b ]
575 %p2 = phi i32 [ 13, %a ], [ 42, %b ]
576 %sum1 = add i32 %arg, %p1
577 br i1 %flag2, label %exit1, label %exit2
578 ; CHECK: merge:
579 ; CHECK-NEXT: %[[PHI1:.*]] = phi i32 [ %[[SUM_A]], %a ], [ %[[SUM_B]], %b ]
580 ; CHECK-NEXT: %[[PHI2:.*]] = phi i32 [ 13, %a ], [ 42, %b ]
581 ; CHECK-NEXT: br i1 %flag2, label %exit1, label %exit2
582
583 exit1:
584 ret i32 %sum1
585 ; CHECK: exit1:
586 ; CHECK-NEXT: ret i32 %[[PHI1]]
587
588 exit2:
589 %sum2 = add i32 %arg, %p2
590 ret i32 %sum2
591 ; CHECK: exit2:
592 ; CHECK-NEXT: %[[SUM2:.*]] = add i32 %arg, %[[PHI2]]
593 ; CHECK-NEXT: ret i32 %[[SUM2]]
594 }