LLVM 19.0.0git
SelectOptimize.cpp
1//===--- SelectOptimize.cpp - Convert select to branches if profitable ---===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass converts selects to conditional jumps when profitable.
10//
11//===----------------------------------------------------------------------===//
12
13#include "llvm/CodeGen/SelectOptimize.h"
14#include "llvm/ADT/SmallVector.h"
15#include "llvm/ADT/Statistic.h"
16#include "llvm/Analysis/BlockFrequencyInfo.h"
17#include "llvm/Analysis/BranchProbabilityInfo.h"
18#include "llvm/Analysis/LoopInfo.h"
19#include "llvm/Analysis/OptimizationRemarkEmitter.h"
20#include "llvm/Analysis/ProfileSummaryInfo.h"
21#include "llvm/Analysis/TargetTransformInfo.h"
22#include "llvm/CodeGen/Passes.h"
23#include "llvm/CodeGen/TargetLowering.h"
24#include "llvm/CodeGen/TargetPassConfig.h"
25#include "llvm/CodeGen/TargetSchedule.h"
26#include "llvm/CodeGen/TargetSubtargetInfo.h"
27#include "llvm/IR/BasicBlock.h"
28#include "llvm/IR/Dominators.h"
29#include "llvm/IR/Function.h"
30#include "llvm/IR/IRBuilder.h"
31#include "llvm/IR/Instruction.h"
32#include "llvm/IR/PatternMatch.h"
33#include "llvm/IR/ProfDataUtils.h"
34#include "llvm/InitializePasses.h"
35#include "llvm/Pass.h"
36#include "llvm/Support/ScaledNumber.h"
37#include "llvm/Target/TargetMachine.h"
38#include "llvm/Transforms/Utils/SizeOpts.h"
39#include <algorithm>
40#include <memory>
41#include <queue>
42#include <stack>
43
44using namespace llvm;
45using namespace llvm::PatternMatch;
46
47#define DEBUG_TYPE "select-optimize"
48
49STATISTIC(NumSelectOptAnalyzed,
50 "Number of select groups considered for conversion to branch");
51STATISTIC(NumSelectConvertedExpColdOperand,
52 "Number of select groups converted due to expensive cold operand");
53STATISTIC(NumSelectConvertedHighPred,
54 "Number of select groups converted due to high-predictability");
55STATISTIC(NumSelectUnPred,
56 "Number of select groups not converted due to unpredictability");
57STATISTIC(NumSelectColdBB,
58 "Number of select groups not converted due to cold basic block");
59STATISTIC(NumSelectConvertedLoop,
60 "Number of select groups converted due to loop-level analysis");
61STATISTIC(NumSelectsConverted, "Number of selects converted");
62
63static cl::opt<unsigned> ColdOperandThreshold(
64 "cold-operand-threshold",
65 cl::desc("Maximum frequency of path for an operand to be considered cold."),
66 cl::init(20), cl::Hidden);
67
68static cl::opt<unsigned> ColdOperandMaxCostMultiplier(
69 "cold-operand-max-cost-multiplier",
70 cl::desc("Maximum cost multiplier of TCC_expensive for the dependence "
71 "slice of a cold operand to be considered inexpensive."),
72 cl::init(1), cl::Hidden);
73
74static cl::opt<unsigned>
75 GainGradientThreshold("select-opti-loop-gradient-gain-threshold",
76 cl::desc("Gradient gain threshold (%)."),
77 cl::init(25), cl::Hidden);
78
79static cl::opt<unsigned>
80 GainCycleThreshold("select-opti-loop-cycle-gain-threshold",
81 cl::desc("Minimum gain per loop (in cycles) threshold."),
82 cl::init(4), cl::Hidden);
83
84static cl::opt<unsigned> GainRelativeThreshold(
85 "select-opti-loop-relative-gain-threshold",
86 cl::desc(
87 "Minimum relative gain per loop threshold (1/X). Defaults to 12.5%"),
88 cl::init(8), cl::Hidden);
89
90static cl::opt<unsigned> MispredictDefaultRate(
91 "mispredict-default-rate", cl::Hidden, cl::init(25),
92 cl::desc("Default mispredict rate (initialized to 25%)."));
93
94static cl::opt<bool>
95 DisableLoopLevelHeuristics("disable-loop-level-heuristics", cl::Hidden,
96 cl::init(false),
97 cl::desc("Disable loop-level heuristics."));
98
99namespace {
100
101class SelectOptimizeImpl {
102 const TargetMachine *TM = nullptr;
103 const TargetSubtargetInfo *TSI = nullptr;
104 const TargetLowering *TLI = nullptr;
105 const TargetTransformInfo *TTI = nullptr;
106 const LoopInfo *LI = nullptr;
107 BlockFrequencyInfo *BFI;
108 ProfileSummaryInfo *PSI = nullptr;
109 OptimizationRemarkEmitter *ORE = nullptr;
110 TargetSchedModel TSchedModel;
111
112public:
113 SelectOptimizeImpl() = default;
114 SelectOptimizeImpl(const TargetMachine *TM) : TM(TM){};
115 PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
116 bool runOnFunction(Function &F, Pass &P);
117
118 using Scaled64 = ScaledNumber<uint64_t>;
119
120 struct CostInfo {
121 /// Predicated cost (with selects as conditional moves).
122 Scaled64 PredCost;
123 /// Non-predicated cost (with selects converted to branches).
124 Scaled64 NonPredCost;
125 };
126
127 /// SelectLike is an abstraction over SelectInst and other operations that can
128 /// act like selects. For example Or(Zext(icmp), X) can be treated like
129 /// select(icmp, X|1, X).
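/// For instance, the IR sequence
///   %c  = icmp eq i32 %a, 0
///   %z  = zext i1 %c to i32
///   %or = or i32 %z, %x
/// yields %x when %c is false and %x|1 when %c is true, so the pass can
/// treat %or as select(%c, %x|1, %x).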
130 class SelectLike {
131 SelectLike(Instruction *I) : I(I) {}
132
133 /// The select (/or) instruction.
134 Instruction *I;
135 /// Whether this select is inverted, "not(cond), FalseVal, TrueVal", as
136 /// opposed to the original condition.
137 bool Inverted = false;
138
139 public:
140 /// Match a select or select-like instruction, returning a SelectLike.
141 static SelectLike match(Instruction *I) {
142 // Select instructions are what we are usually looking for.
143 if (isa<SelectInst>(I))
144 return SelectLike(I);
145
146 // An Or(zext(i1 X), Y) can also be treated like a select, with condition
147 // X and values Y|1 and Y.
148 Value *X;
149 if (PatternMatch::match(
150 I, m_c_Or(m_OneUse(m_ZExt(m_Value(X))), m_Value())) &&
151 X->getType()->isIntegerTy(1))
152 return SelectLike(I);
153
154 return SelectLike(nullptr);
155 }
156
157 bool isValid() { return I; }
158 operator bool() { return isValid(); }
159
160 /// Invert the select by inverting the condition and switching the operands.
161 void setInverted() {
162 assert(!Inverted && "Trying to invert an inverted SelectLike");
163 assert(isa<Instruction>(getCondition()) &&
164 cast<Instruction>(getCondition())->getOpcode() ==
165 Instruction::Xor);
166 Inverted = true;
167 }
168 bool isInverted() const { return Inverted; }
169
170 Instruction *getI() { return I; }
171 const Instruction *getI() const { return I; }
172
173 Type *getType() const { return I->getType(); }
174
175 Value *getNonInvertedCondition() const {
176 if (auto *Sel = dyn_cast<SelectInst>(I))
177 return Sel->getCondition();
178 // Or(zext) case
179 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
180 Value *X;
181 if (PatternMatch::match(BO->getOperand(0),
182 m_OneUse(m_ZExt(m_Value(X)))))
183 return X;
184 if (PatternMatch::match(BO->getOperand(1),
185 m_OneUse(m_ZExt(m_Value(X)))))
186 return X;
187 }
188
189 llvm_unreachable("Unhandled case in getCondition");
190 }
191
192 /// Return the condition for the SelectLike instruction. For example the
193 /// condition of a select or c in `or(zext(c), x)`
194 Value *getCondition() const {
195 Value *CC = getNonInvertedCondition();
196 // For inverted conditions the CC is checked when created to be a not
197 // (xor) instruction.
198 if (Inverted)
199 return cast<Instruction>(CC)->getOperand(0);
200 return CC;
201 }
202
203 /// Return the true value for the SelectLike instruction. Note this may not
204 /// exist for all SelectLike instructions. For example, for `or(zext(c), x)`
205 /// the true value would be `or(x,1)`. As this value does not exist, nullptr
206 /// is returned.
207 Value *getTrueValue(bool HonorInverts = true) const {
208 if (Inverted && HonorInverts)
209 return getFalseValue(/*HonorInverts=*/false);
210 if (auto *Sel = dyn_cast<SelectInst>(I))
211 return Sel->getTrueValue();
212 // Or(zext) case - The true value is Or(X), so return nullptr as the value
213 // does not yet exist.
214 if (isa<BinaryOperator>(I))
215 return nullptr;
216
217 llvm_unreachable("Unhandled case in getTrueValue");
218 }
219
220 /// Return the false value for the SelectLike instruction. For example the
221 /// getFalseValue of a select or `x` in `or(zext(c), x)` (which is
222 /// `select(c, x|1, x)`)
223 Value *getFalseValue(bool HonorInverts = true) const {
224 if (Inverted && HonorInverts)
225 return getTrueValue(/*HonorInverts=*/false);
226 if (auto *Sel = dyn_cast<SelectInst>(I))
227 return Sel->getFalseValue();
228 // Or(zext) case - return the operand which is not the zext.
229 if (auto *BO = dyn_cast<BinaryOperator>(I)) {
230 Value *X;
231 if (PatternMatch::match(BO->getOperand(0),
232 m_OneUse(m_ZExt(m_Value(X)))))
233 return BO->getOperand(1);
234 if (PatternMatch::match(BO->getOperand(1),
235 m_OneUse(m_ZExt(m_Value(X)))))
236 return BO->getOperand(0);
237 }
238
239 llvm_unreachable("Unhandled case in getFalseValue");
240 }
241
242 /// Return the NonPredCost cost of the true op, given the costs in
243 /// InstCostMap. This may need to be generated for select-like instructions.
244 Scaled64 getTrueOpCost(DenseMap<const Instruction *, CostInfo> &InstCostMap,
245 const TargetTransformInfo *TTI) {
246 if (isa<SelectInst>(I))
247 if (auto *I = dyn_cast<Instruction>(getTrueValue()))
248 return InstCostMap.contains(I) ? InstCostMap[I].NonPredCost
249 : Scaled64::getZero();
250
251 // Or case - add the cost of an extra Or to the cost of the False case.
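 // The extra Or models the `or x, 1` that getTrueOrFalseValue later
 // materializes for the true side when such a select-like Or is converted
 // to a branch.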
252 if (isa<BinaryOperator>(I))
253 if (auto I = dyn_cast<Instruction>(getFalseValue()))
254 if (InstCostMap.contains(I)) {
255 InstructionCost OrCost = TTI->getArithmeticInstrCost(
256 Instruction::Or, I->getType(), TargetTransformInfo::TCK_Latency,
257 {TargetTransformInfo::OK_AnyValue,
258 TargetTransformInfo::OP_None},
259 {TTI::OK_UniformConstantValue, TTI::OP_PowerOf2});
260 return InstCostMap[I].NonPredCost +
261 Scaled64::get(*OrCost.getValue());
262 }
263
264 return Scaled64::getZero();
265 }
266
267 /// Return the NonPredCost cost of the false op, given the costs in
268 /// InstCostMap. This may need to be generated for select-like instructions.
269 Scaled64
270 getFalseOpCost(DenseMap<const Instruction *, CostInfo> &InstCostMap,
271 const TargetTransformInfo *TTI) {
272 if (isa<SelectInst>(I))
273 if (auto *I = dyn_cast<Instruction>(getFalseValue()))
274 return InstCostMap.contains(I) ? InstCostMap[I].NonPredCost
275 : Scaled64::getZero();
276
277 // Or case - return the cost of the false case
278 if (isa<BinaryOperator>(I))
279 if (auto I = dyn_cast<Instruction>(getFalseValue()))
280 if (InstCostMap.contains(I))
281 return InstCostMap[I].NonPredCost;
282
283 return Scaled64::getZero();
284 }
285 };
286
287private:
288 // Select groups consist of consecutive select instructions with the same
289 // condition.
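  // For example, the pair
  //   %x = select i1 %cond, i32 %a, i32 %b
  //   %y = select i1 %cond, i32 %c, i32 %d
  // forms one group: both selects can be lowered with a single branch on %cond.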
290 using SelectGroup = SmallVector<SelectLike, 2>;
291 using SelectGroups = SmallVector<SelectGroup, 2>;
292
293 // Converts select instructions of a function to conditional jumps when deemed
294 // profitable. Returns true if at least one select was converted.
295 bool optimizeSelects(Function &F);
296
297 // Heuristics for determining which select instructions can be profitably
298 // converted to branches. Separate heuristics for selects in inner-most loops
299 // and the rest of code regions (base heuristics for non-inner-most loop
300 // regions).
301 void optimizeSelectsBase(Function &F, SelectGroups &ProfSIGroups);
302 void optimizeSelectsInnerLoops(Function &F, SelectGroups &ProfSIGroups);
303
304 // Converts to branches the select groups that were deemed
305 // profitable-to-convert.
306 void convertProfitableSIGroups(SelectGroups &ProfSIGroups);
307
308 // Splits selects of a given basic block into select groups.
309 void collectSelectGroups(BasicBlock &BB, SelectGroups &SIGroups);
310
311 // Determines for which select groups it is profitable to convert to branches
312 // (base and inner-most-loop heuristics).
313 void findProfitableSIGroupsBase(SelectGroups &SIGroups,
314 SelectGroups &ProfSIGroups);
315 void findProfitableSIGroupsInnerLoops(const Loop *L, SelectGroups &SIGroups,
316 SelectGroups &ProfSIGroups);
317
318 // Determines if a select group should be converted to a branch (base
319 // heuristics).
320 bool isConvertToBranchProfitableBase(const SelectGroup &ASI);
321
322 // Returns true if there are expensive instructions in the cold value
323 // operand's (if any) dependence slice of any of the selects of the given
324 // group.
325 bool hasExpensiveColdOperand(const SelectGroup &ASI);
326
327 // For a given source instruction, collect its backwards dependence slice
328 // consisting of instructions exclusively computed for producing the operands
329 // of the source instruction.
330 void getExclBackwardsSlice(Instruction *I, std::stack<Instruction *> &Slice,
331 Instruction *SI, bool ForSinking = false);
332
333 // Returns true if the condition of the select is highly predictable.
334 bool isSelectHighlyPredictable(const SelectLike SI);
335
336 // Loop-level checks to determine if a non-predicated version (with branches)
337 // of the given loop is more profitable than its predicated version.
338 bool checkLoopHeuristics(const Loop *L, const CostInfo LoopDepth[2]);
339
340 // Computes instruction and loop-critical-path costs for both the predicated
341 // and non-predicated version of the given loop.
342 bool computeLoopCosts(const Loop *L, const SelectGroups &SIGroups,
343 DenseMap<const Instruction *, CostInfo> &InstCostMap,
344 CostInfo *LoopCost);
345
346 // Returns a set of all the select instructions in the given select groups.
347 SmallDenseMap<const Instruction *, SelectLike, 2>
348 getSImap(const SelectGroups &SIGroups);
349
350 // Returns the latency cost of a given instruction.
351 std::optional<uint64_t> computeInstCost(const Instruction *I);
352
353 // Returns the misprediction cost of a given select when converted to branch.
354 Scaled64 getMispredictionCost(const SelectLike SI, const Scaled64 CondCost);
355
356 // Returns the cost of a branch when the prediction is correct.
357 Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
358 const SelectLike SI);
359
360 // Returns true if the target architecture supports lowering a given select.
361 bool isSelectKindSupported(const SelectLike SI);
362};
363
364class SelectOptimize : public FunctionPass {
365 SelectOptimizeImpl Impl;
366
367public:
368 static char ID;
369
370 SelectOptimize() : FunctionPass(ID) {
371 initializeSelectOptimizePass(*PassRegistry::getPassRegistry());
372 }
373
374 bool runOnFunction(Function &F) override {
375 return Impl.runOnFunction(F, *this);
376 }
377
378 void getAnalysisUsage(AnalysisUsage &AU) const override {
379 AU.addRequired<ProfileSummaryInfoWrapperPass>();
380 AU.addRequired<TargetPassConfig>();
381 AU.addRequired<TargetTransformInfoWrapperPass>();
382 AU.addRequired<LoopInfoWrapperPass>();
383 AU.addRequired<BlockFrequencyInfoWrapperPass>();
384 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
385 }
386};
387
388} // namespace
389
390PreservedAnalyses SelectOptimizePass::run(Function &F,
391 FunctionAnalysisManager &FAM) {
392 SelectOptimizeImpl Impl(TM);
393 return Impl.run(F, FAM);
394}
395
396char SelectOptimize::ID = 0;
397
398INITIALIZE_PASS_BEGIN(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
399 false)
400INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
401INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
402INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
403INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
404INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
405INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
406INITIALIZE_PASS_END(SelectOptimize, DEBUG_TYPE, "Optimize selects", false,
407 false)
408
409FunctionPass *llvm::createSelectOptimizePass() { return new SelectOptimize(); }
410
411PreservedAnalyses SelectOptimizeImpl::run(Function &F,
412 FunctionAnalysisManager &FAM) {
413 TSI = TM->getSubtargetImpl(F);
414 TLI = TSI->getTargetLowering();
415
416 // If none of the select types are supported then skip this pass.
417 // This is an optimization pass. Legality issues will be handled by
418 // instruction selection.
419 if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
420 !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
421 !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
422 return PreservedAnalyses::all();
423
424 TTI = &FAM.getResult<TargetIRAnalysis>(F);
425 if (!TTI->enableSelectOptimize())
426 return PreservedAnalyses::all();
427
428 PSI = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F)
429 .getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
430 assert(PSI && "This pass requires module analysis pass `profile-summary`!");
431 BFI = &FAM.getResult<BlockFrequencyAnalysis>(F);
432
433 // When optimizing for size, selects are preferable over branches.
434 if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI))
435 return PreservedAnalyses::all();
436
437 LI = &FAM.getResult<LoopAnalysis>(F);
438 ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
439 TSchedModel.init(TSI);
440
441 bool Changed = optimizeSelects(F);
442 return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
443}
444
445bool SelectOptimizeImpl::runOnFunction(Function &F, Pass &P) {
446 TM = &P.getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
447 TSI = TM->getSubtargetImpl(F);
448 TLI = TSI->getTargetLowering();
449
450 // If none of the select types are supported then skip this pass.
451 // This is an optimization pass. Legality issues will be handled by
452 // instruction selection.
453 if (!TLI->isSelectSupported(TargetLowering::ScalarValSelect) &&
454 !TLI->isSelectSupported(TargetLowering::ScalarCondVectorVal) &&
455 !TLI->isSelectSupported(TargetLowering::VectorMaskSelect))
456 return false;
457
458 TTI = &P.getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
459
460 if (!TTI->enableSelectOptimize())
461 return false;
462
463 LI = &P.getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
464 BFI = &P.getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
465 PSI = &P.getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
466 ORE = &P.getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
467 TSchedModel.init(TSI);
468
469 // When optimizing for size, selects are preferable over branches.
470 if (F.hasOptSize() || llvm::shouldOptimizeForSize(&F, PSI, BFI))
471 return false;
472
473 return optimizeSelects(F);
474}
475
476bool SelectOptimizeImpl::optimizeSelects(Function &F) {
477 // Determine for which select groups it is profitable to convert to branches.
478 SelectGroups ProfSIGroups;
479 // Base heuristics apply only to non-loops and outer loops.
480 optimizeSelectsBase(F, ProfSIGroups);
481 // Separate heuristics for inner-most loops.
482 optimizeSelectsInnerLoops(F, ProfSIGroups);
483
484 // Convert to branches the select groups that were deemed
485 // profitable-to-convert.
486 convertProfitableSIGroups(ProfSIGroups);
487
488 // Code modified if at least one select group was converted.
489 return !ProfSIGroups.empty();
490}
491
492void SelectOptimizeImpl::optimizeSelectsBase(Function &F,
493 SelectGroups &ProfSIGroups) {
494 // Collect all the select groups.
495 SelectGroups SIGroups;
496 for (BasicBlock &BB : F) {
497 // Base heuristics apply only to non-loops and outer loops.
498 Loop *L = LI->getLoopFor(&BB);
499 if (L && L->isInnermost())
500 continue;
501 collectSelectGroups(BB, SIGroups);
502 }
503
504 // Determine for which select groups it is profitable to convert to branches.
505 findProfitableSIGroupsBase(SIGroups, ProfSIGroups);
506}
507
508void SelectOptimizeImpl::optimizeSelectsInnerLoops(Function &F,
509 SelectGroups &ProfSIGroups) {
510 SmallVector<Loop *, 4> Loops(LI->begin(), LI->end());
511 // Need to check size on each iteration as we accumulate child loops.
512 for (unsigned long i = 0; i < Loops.size(); ++i)
513 for (Loop *ChildL : Loops[i]->getSubLoops())
514 Loops.push_back(ChildL);
515
516 for (Loop *L : Loops) {
517 if (!L->isInnermost())
518 continue;
519
520 SelectGroups SIGroups;
521 for (BasicBlock *BB : L->getBlocks())
522 collectSelectGroups(*BB, SIGroups);
523
524 findProfitableSIGroupsInnerLoops(L, SIGroups, ProfSIGroups);
525 }
526}
527
528/// If \p isTrue is true, return the true value of \p SI, otherwise return
529/// false value of \p SI. If the true/false value of \p SI is defined by any
530/// select instructions in \p Selects, look through the defining select
531/// instruction until the true/false value is not defined in \p Selects.
532static Value *
533getTrueOrFalseValue(SelectOptimizeImpl::SelectLike SI, bool isTrue,
534 SmallPtrSet<const Instruction *, 2> &Selects,
535 IRBuilder<> &IB) {
536 Value *V = nullptr;
537 for (SelectInst *DefSI = dyn_cast<SelectInst>(SI.getI());
538 DefSI != nullptr && Selects.count(DefSI);
539 DefSI = dyn_cast<SelectInst>(V)) {
540 if (DefSI->getCondition() == SI.getCondition())
541 V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
542 else // Handle inverted SI
543 V = (!isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
544 }
545
546 if (isa<BinaryOperator>(SI.getI())) {
547 assert(SI.getI()->getOpcode() == Instruction::Or &&
548 "Only currently handling Or instructions.");
549 V = SI.getFalseValue();
550 if (isTrue)
551 V = IB.CreateOr(V, ConstantInt::get(V->getType(), 1));
552 }
553
554 assert(V && "Failed to get select true/false value");
555 return V;
556}
557
558void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
559 for (SelectGroup &ASI : ProfSIGroups) {
560 // The code transformation here is a modified version of the sinking
561 // transformation in CodeGenPrepare::optimizeSelectInst with a more
562 // aggressive strategy of which instructions to sink.
563 //
564 // TODO: eliminate the redundancy of logic transforming selects to branches
565 // by removing CodeGenPrepare::optimizeSelectInst and optimizing here
566 // selects for all cases (with and without profile information).
567
568 // Transform a sequence like this:
569 // start:
570 // %cmp = cmp uge i32 %a, %b
571 // %sel = select i1 %cmp, i32 %c, i32 %d
572 //
573 // Into:
574 // start:
575 // %cmp = cmp uge i32 %a, %b
576 // %cmp.frozen = freeze %cmp
577 // br i1 %cmp.frozen, label %select.true, label %select.false
578 // select.true:
579 // br label %select.end
580 // select.false:
581 // br label %select.end
582 // select.end:
583 // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
584 //
585 // %cmp should be frozen, otherwise it may introduce undefined behavior.
586 // In addition, we may sink instructions that produce %c or %d into the
587 // destination(s) of the new branch.
588 // If the true or false blocks do not contain a sunken instruction, that
589 // block and its branch may be optimized away. In that case, one side of the
590 // first branch will point directly to select.end, and the corresponding PHI
591 // predecessor block will be the start block.
592
593 // Find all the instructions that can be soundly sunk to the true/false
594 // blocks. These are instructions that are computed solely for producing the
595 // operands of the select instructions in the group and can be sunk without
596 // breaking the semantics of the LLVM IR (e.g., cannot sink instructions
597 // with side effects).
598 SmallVector<std::stack<Instruction *>, 2> TrueSlices, FalseSlices;
599 typedef std::stack<Instruction *>::size_type StackSizeType;
600 StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0;
601 for (SelectLike SI : ASI) {
602 // For each select, compute the sinkable dependence chains of the true and
603 // false operands.
604 if (auto *TI = dyn_cast_or_null<Instruction>(SI.getTrueValue())) {
605 std::stack<Instruction *> TrueSlice;
606 getExclBackwardsSlice(TI, TrueSlice, SI.getI(), true);
607 maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size());
608 TrueSlices.push_back(TrueSlice);
609 }
610 if (auto *FI = dyn_cast_or_null<Instruction>(SI.getFalseValue())) {
611 if (isa<SelectInst>(SI.getI()) || !FI->hasOneUse()) {
612 std::stack<Instruction *> FalseSlice;
613 getExclBackwardsSlice(FI, FalseSlice, SI.getI(), true);
614 maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size());
615 FalseSlices.push_back(FalseSlice);
616 }
617 }
618 }
619 // In the case of multiple select instructions in the same group, the order
620 // of non-dependent instructions (instructions of different dependence
621 // slices) in the true/false blocks appears to affect performance.
622 // Experimentally, interleaving the slices appears to be the optimal approach.
623 // This interleaved scheduling allows for more ILP (with the natural downside
624 // of slightly increasing register pressure) compared to a simple ordering of
625 // one whole chain after another. One would expect that this ordering would
626 // not matter since the scheduling in the backend of the compiler would
627 // take care of it, but apparently the scheduler fails to deliver optimal
628 // ILP with a naive ordering here.
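 // For example, two true-side slices {a1, a2} and {b1, b2} are emitted as
 // a1, b1, a2, b2 rather than a1, a2, b1, b2, so the independent chains can
 // issue in parallel.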
629 SmallVector<Instruction *, 2> TrueSlicesInterleaved, FalseSlicesInterleaved;
630 for (StackSizeType IS = 0; IS < maxTrueSliceLen; ++IS) {
631 for (auto &S : TrueSlices) {
632 if (!S.empty()) {
633 TrueSlicesInterleaved.push_back(S.top());
634 S.pop();
635 }
636 }
637 }
638 for (StackSizeType IS = 0; IS < maxFalseSliceLen; ++IS) {
639 for (auto &S : FalseSlices) {
640 if (!S.empty()) {
641 FalseSlicesInterleaved.push_back(S.top());
642 S.pop();
643 }
644 }
645 }
646
647 // We split the block containing the select(s) into two blocks.
648 SelectLike SI = ASI.front();
649 SelectLike LastSI = ASI.back();
650 BasicBlock *StartBlock = SI.getI()->getParent();
651 BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI.getI()));
652 // With RemoveDIs turned off, SplitPt can be a dbg.* intrinsic. With
653 // RemoveDIs turned on, SplitPt would instead point to the next
654 // instruction. To match existing dbg.* intrinsic behaviour with RemoveDIs,
655 // tell splitBasicBlock that we want to include any DbgVariableRecords
656 // attached to SplitPt in the splice.
657 SplitPt.setHeadBit(true);
658 BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
659 BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock));
660 // Delete the unconditional branch that was just created by the split.
661 StartBlock->getTerminator()->eraseFromParent();
662
663 // Move any debug/pseudo instructions and not's that were in-between the
664 // select group to the newly-created end block.
665 SmallVector<Instruction *, 2> SinkInstrs;
666 auto DIt = SI.getI()->getIterator();
667 while (&*DIt != LastSI.getI()) {
668 if (DIt->isDebugOrPseudoInst())
669 SinkInstrs.push_back(&*DIt);
670 if (match(&*DIt, m_Not(m_Specific(SI.getCondition()))))
671 SinkInstrs.push_back(&*DIt);
672 DIt++;
673 }
674 for (auto *DI : SinkInstrs)
675 DI->moveBeforePreserving(&*EndBlock->getFirstInsertionPt());
676
677 // Duplicate implementation for DbgRecords, the non-instruction debug-info
678 // format. Helper lambda for moving DbgRecords to the end block.
679 auto TransferDbgRecords = [&](Instruction &I) {
680 for (auto &DbgRecord :
681 llvm::make_early_inc_range(I.getDbgRecordRange())) {
682 DbgRecord.removeFromParent();
683 EndBlock->insertDbgRecordBefore(&DbgRecord,
684 EndBlock->getFirstInsertionPt());
685 }
686 };
687
688 // Iterate over all instructions in between SI and LastSI, not including
689 // SI itself. These are all the variable assignments that happen "in the
690 // middle" of the select group.
691 auto R = make_range(std::next(SI.getI()->getIterator()),
692 std::next(LastSI.getI()->getIterator()));
693 llvm::for_each(R, TransferDbgRecords);
694
695 // These are the new basic blocks for the conditional branch.
696 // At least one will become an actual new basic block.
697 BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr;
698 BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr;
699 if (!TrueSlicesInterleaved.empty()) {
700 TrueBlock = BasicBlock::Create(EndBlock->getContext(), "select.true.sink",
701 EndBlock->getParent(), EndBlock);
702 TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
703 TrueBranch->setDebugLoc(LastSI.getI()->getDebugLoc());
704 for (Instruction *TrueInst : TrueSlicesInterleaved)
705 TrueInst->moveBefore(TrueBranch);
706 }
707 if (!FalseSlicesInterleaved.empty()) {
708 FalseBlock =
709 BasicBlock::Create(EndBlock->getContext(), "select.false.sink",
710 EndBlock->getParent(), EndBlock);
711 FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
712 FalseBranch->setDebugLoc(LastSI.getI()->getDebugLoc());
713 for (Instruction *FalseInst : FalseSlicesInterleaved)
714 FalseInst->moveBefore(FalseBranch);
715 }
716 // If there was nothing to sink, then arbitrarily choose the 'false' side
717 // for a new input value to the PHI.
718 if (TrueBlock == FalseBlock) {
719 assert(TrueBlock == nullptr &&
720 "Unexpected basic block transform while optimizing select");
721
722 FalseBlock = BasicBlock::Create(StartBlock->getContext(), "select.false",
723 EndBlock->getParent(), EndBlock);
724 auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
725 FalseBranch->setDebugLoc(SI.getI()->getDebugLoc());
726 }
727
728 // Insert the real conditional branch based on the original condition.
729 // If we did not create a new block for one of the 'true' or 'false' paths
730 // of the condition, it means that side of the branch goes to the end block
731 // directly and the path originates from the start block from the point of
732 // view of the new PHI.
733 BasicBlock *TT, *FT;
734 if (TrueBlock == nullptr) {
735 TT = EndBlock;
736 FT = FalseBlock;
737 TrueBlock = StartBlock;
738 } else if (FalseBlock == nullptr) {
739 TT = TrueBlock;
740 FT = EndBlock;
741 FalseBlock = StartBlock;
742 } else {
743 TT = TrueBlock;
744 FT = FalseBlock;
745 }
746 IRBuilder<> IB(SI.getI());
747 auto *CondFr = IB.CreateFreeze(SI.getCondition(),
748 SI.getCondition()->getName() + ".frozen");
749
750 SmallPtrSet<const Instruction *, 2> INS;
751 for (auto SI : ASI)
752 INS.insert(SI.getI());
753
754 // Use reverse iterator because later select may use the value of the
755 // earlier select, and we need to propagate value through earlier select
756 // to get the PHI operand.
757 for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) {
758 SelectLike SI = *It;
759 // The select itself is replaced with a PHI Node.
760 PHINode *PN = PHINode::Create(SI.getType(), 2, "");
761 PN->insertBefore(EndBlock->begin());
762 PN->takeName(SI.getI());
763 PN->addIncoming(getTrueOrFalseValue(SI, true, INS, IB), TrueBlock);
764 PN->addIncoming(getTrueOrFalseValue(SI, false, INS, IB), FalseBlock);
765 PN->setDebugLoc(SI.getI()->getDebugLoc());
766 SI.getI()->replaceAllUsesWith(PN);
767 INS.erase(SI.getI());
768 ++NumSelectsConverted;
769 }
770 IB.CreateCondBr(CondFr, TT, FT, SI.getI());
771
772 // Remove the old select instructions, now that they are no longer used.
773 for (auto SI : ASI)
774 SI.getI()->eraseFromParent();
775 }
776}
777
778void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB,
779 SelectGroups &SIGroups) {
780 BasicBlock::iterator BBIt = BB.begin();
781 while (BBIt != BB.end()) {
782 Instruction *I = &*BBIt++;
783 if (SelectLike SI = SelectLike::match(I)) {
784 if (!TTI->shouldTreatInstructionLikeSelect(I))
785 continue;
786
787 SelectGroup SIGroup;
788 SIGroup.push_back(SI);
789 while (BBIt != BB.end()) {
790 Instruction *NI = &*BBIt;
791 // Debug/pseudo instructions should be skipped and not prevent the
792 // formation of a select group.
793 if (NI->isDebugOrPseudoInst()) {
794 ++BBIt;
795 continue;
796 }
797
798 // Skip not(select(..)), if the not is part of the same select group
799 if (match(NI, m_Not(m_Specific(SI.getCondition())))) {
800 ++BBIt;
801 continue;
802 }
803
804 // We only allow selects in the same group, not other select-like
805 // instructions.
806 if (!isa<SelectInst>(NI))
807 break;
808
809 SelectLike NSI = SelectLike::match(NI);
810 if (NSI && SI.getCondition() == NSI.getCondition()) {
811 SIGroup.push_back(NSI);
812 } else if (NSI && match(NSI.getCondition(),
813 m_Not(m_Specific(SI.getCondition())))) {
814 NSI.setInverted();
815 SIGroup.push_back(NSI);
816 } else
817 break;
818 ++BBIt;
819 }
820
821 // If the select type is not supported, no point optimizing it.
822 // Instruction selection will take care of it.
823 if (!isSelectKindSupported(SI))
824 continue;
825
826 LLVM_DEBUG({
827 dbgs() << "New Select group with\n";
828 for (auto SI : SIGroup)
829 dbgs() << " " << *SI.getI() << "\n";
830 });
831
832 SIGroups.push_back(SIGroup);
833 }
834 }
835}
836
837void SelectOptimizeImpl::findProfitableSIGroupsBase(
838 SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
839 for (SelectGroup &ASI : SIGroups) {
840 ++NumSelectOptAnalyzed;
841 if (isConvertToBranchProfitableBase(ASI))
842 ProfSIGroups.push_back(ASI);
843 }
844}
845
846static void EmitAndPrintRemark(OptimizationRemarkEmitter *ORE,
847 DiagnosticInfoOptimizationBase &Rem) {
848 LLVM_DEBUG(dbgs() << Rem.getMsg() << "\n");
849 ORE->emit(Rem);
850}
851
852void SelectOptimizeImpl::findProfitableSIGroupsInnerLoops(
853 const Loop *L, SelectGroups &SIGroups, SelectGroups &ProfSIGroups) {
854 NumSelectOptAnalyzed += SIGroups.size();
855 // For each select group in an inner-most loop,
856 // a branch is more preferable than a select/conditional-move if:
857 // i) conversion to branches for all the select groups of the loop satisfies
858 // loop-level heuristics including reducing the loop's critical path by
859 // some threshold (see SelectOptimizeImpl::checkLoopHeuristics); and
860 // ii) the total cost of the select group is cheaper with a branch compared
861 // to its predicated version. The cost is in terms of latency and the cost
862 // of a select group is the cost of its most expensive select instruction
863 // (assuming infinite resources and thus fully leveraging available ILP).
864
865 DenseMap<const Instruction *, CostInfo> InstCostMap;
866 CostInfo LoopCost[2] = {{Scaled64::getZero(), Scaled64::getZero()},
867 {Scaled64::getZero(), Scaled64::getZero()}};
868 if (!computeLoopCosts(L, SIGroups, InstCostMap, LoopCost) ||
869 !checkLoopHeuristics(L, LoopCost)) {
870 return;
871 }
872
873 for (SelectGroup &ASI : SIGroups) {
874 // Assuming infinite resources, the cost of a group of instructions is the
875 // cost of the most expensive instruction of the group.
876 Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero();
877 for (SelectLike SI : ASI) {
878 SelectCost = std::max(SelectCost, InstCostMap[SI.getI()].PredCost);
879 BranchCost = std::max(BranchCost, InstCostMap[SI.getI()].NonPredCost);
880 }
881 if (BranchCost < SelectCost) {
882 OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", ASI.front().getI());
883 OR << "Profitable to convert to branch (loop analysis). BranchCost="
884 << BranchCost.toString() << ", SelectCost=" << SelectCost.toString()
885 << ". ";
886 EmitAndPrintRemark(ORE, OR);
887 ++NumSelectConvertedLoop;
888 ProfSIGroups.push_back(ASI);
889 } else {
890 OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti",
891 ASI.front().getI());
892 ORmiss << "Select is more profitable (loop analysis). BranchCost="
893 << BranchCost.toString()
894 << ", SelectCost=" << SelectCost.toString() << ". ";
895 EmitAndPrintRemark(ORE, ORmiss);
896 }
897 }
898}
899
900bool SelectOptimizeImpl::isConvertToBranchProfitableBase(
901 const SelectGroup &ASI) {
902 SelectLike SI = ASI.front();
903 LLVM_DEBUG(dbgs() << "Analyzing select group containing " << *SI.getI()
904 << "\n");
905 OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI.getI());
906 OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI.getI());
907
908 // Skip cold basic blocks. Better to optimize for size for cold blocks.
909 if (PSI->isColdBlock(SI.getI()->getParent(), BFI)) {
910 ++NumSelectColdBB;
911 ORmiss << "Not converted to branch because of cold basic block. ";
912 EmitAndPrintRemark(ORE, ORmiss);
913 return false;
914 }
915
916 // If unpredictable, branch form is less profitable.
917 if (SI.getI()->getMetadata(LLVMContext::MD_unpredictable)) {
918 ++NumSelectUnPred;
919 ORmiss << "Not converted to branch because of unpredictable branch. ";
920 EmitAndPrintRemark(ORE, ORmiss);
921 return false;
922 }
923
924 // If highly predictable, branch form is more profitable, unless a
925 // predictable select is inexpensive in the target architecture.
926 if (isSelectHighlyPredictable(SI) && TLI->isPredictableSelectExpensive()) {
927 ++NumSelectConvertedHighPred;
928 OR << "Converted to branch because of highly predictable branch. ";
929 EmitAndPrintRemark(ORE, OR);
930 return true;
931 }
932
933 // Look for expensive instructions in the cold operand's (if any) dependence
934 // slice of any of the selects in the group.
935 if (hasExpensiveColdOperand(ASI)) {
936 ++NumSelectConvertedExpColdOperand;
937 OR << "Converted to branch because of expensive cold operand.";
938 EmitAndPrintRemark(ORE, OR);
939 return true;
940 }
941
942 ORmiss << "Not profitable to convert to branch (base heuristic).";
943 EmitAndPrintRemark(ORE, ORmiss);
944 return false;
945}
946
947static InstructionCost divideNearest(const InstructionCost &Numerator,
948 uint64_t Denominator) {
949 return (Numerator + (Denominator / 2)) / Denominator;
950}
951
952static bool extractBranchWeights(const SelectOptimizeImpl::SelectLike SI,
953 uint64_t &TrueVal, uint64_t &FalseVal) {
954 if (isa<SelectInst>(SI.getI()))
955 return extractBranchWeights(*SI.getI(), TrueVal, FalseVal);
956 return false;
957}
958
959bool SelectOptimizeImpl::hasExpensiveColdOperand(const SelectGroup &ASI) {
960 bool ColdOperand = false;
961 uint64_t TrueWeight, FalseWeight, TotalWeight;
962 if (extractBranchWeights(ASI.front(), TrueWeight, FalseWeight)) {
963 uint64_t MinWeight = std::min(TrueWeight, FalseWeight);
964 TotalWeight = TrueWeight + FalseWeight;
965 // Is there a path with frequency <ColdOperandThreshold% (default:20%) ?
966 ColdOperand = TotalWeight * ColdOperandThreshold > 100 * MinWeight;
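 // E.g. with weights 1 and 9, MinWeight = 1 and TotalWeight = 10, so
 // 10 * 20 > 100 * 1 holds and the 10% path is considered cold.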
967 } else if (PSI->hasProfileSummary()) {
968 OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti",
969 ASI.front().getI());
970 ORmiss << "Profile data available but missing branch-weights metadata for "
971 "select instruction. ";
972 EmitAndPrintRemark(ORE, ORmiss);
973 }
974 if (!ColdOperand)
975 return false;
976 // Check if the cold path's dependence slice is expensive for any of the
977 // selects of the group.
978 for (SelectLike SI : ASI) {
979 Instruction *ColdI = nullptr;
980 uint64_t HotWeight;
981 if (TrueWeight < FalseWeight) {
982 ColdI = dyn_cast_or_null<Instruction>(SI.getTrueValue());
983 HotWeight = FalseWeight;
984 } else {
985 ColdI = dyn_cast_or_null<Instruction>(SI.getFalseValue());
986 HotWeight = TrueWeight;
987 }
988 if (ColdI) {
989 std::stack<Instruction *> ColdSlice;
990 getExclBackwardsSlice(ColdI, ColdSlice, SI.getI());
991 InstructionCost SliceCost = 0;
992 while (!ColdSlice.empty()) {
993 SliceCost += TTI->getInstructionCost(ColdSlice.top(),
994 TargetTransformInfo::TCK_Latency);
995 ColdSlice.pop();
996 }
997 // The colder the cold value operand of the select is, the more expensive
998 // the cmov becomes, since it computes the cold value operand every time.
999 // Thus, the colder the cold operand is, the more its cost counts.
1000 // Get nearest integer cost adjusted for coldness.
1001 InstructionCost AdjSliceCost =
1002 divideNearest(SliceCost * HotWeight, TotalWeight);
1003 if (AdjSliceCost >=
1004 ColdOperandMaxCostMultiplier * TargetTransformInfo::TCC_Expensive)
1005 return true;
1006 }
1007 }
1008 return false;
1009}
1010
1011// Check if it is safe to move LoadI next to the SI.
1012// Conservatively assume it is safe only if there is no instruction
1013// modifying memory in-between the load and the select instruction.
1014static bool isSafeToSinkLoad(Instruction *LoadI, Instruction *SI) {
1015 // Assume loads from different basic blocks are unsafe to move.
1016 if (LoadI->getParent() != SI->getParent())
1017 return false;
1018 auto It = LoadI->getIterator();
1019 while (&*It != SI) {
1020 if (It->mayWriteToMemory())
1021 return false;
1022 It++;
1023 }
1024 return true;
1025}
1026
1027// For a given source instruction, collect its backwards dependence slice
1028// consisting of instructions exclusively computed for the purpose of producing
1029// the operands of the source instruction. As an approximation
1030// (sufficiently-accurate in practice), we populate this set with the
1031 // instructions of the backwards dependence slice that only have one use and
1032 // form a one-use chain that leads to the source instruction.
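// For example, if the select's true operand is %t = add (mul %p, %q), 1 and
// neither the add nor the mul has any other user, both become part of the
// slice and can later be sunk into the true block.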
1033void SelectOptimizeImpl::getExclBackwardsSlice(Instruction *I,
1034 std::stack<Instruction *> &Slice,
1035 Instruction *SI,
1036 bool ForSinking) {
1037 SmallPtrSet<Instruction *, 2> Visited;
1038 std::queue<Instruction *> Worklist;
1039 Worklist.push(I);
1040 while (!Worklist.empty()) {
1041 Instruction *II = Worklist.front();
1042 Worklist.pop();
1043
1044 // Avoid cycles.
1045 if (!Visited.insert(II).second)
1046 continue;
1047
1048 if (!II->hasOneUse())
1049 continue;
1050
1051 // Cannot soundly sink instructions with side-effects.
1052 // Terminator or phi instructions cannot be sunk.
1053 // Avoid sinking other select instructions (should be handled separately).
1054 if (ForSinking && (II->isTerminator() || II->mayHaveSideEffects() ||
1055 isa<SelectInst>(II) || isa<PHINode>(II)))
1056 continue;
1057
1058 // Avoid sinking loads in order not to skip state-modifying instructions,
1059 // that may alias with the loaded address.
1060 // Only allow sinking of loads within the same basic block that are
1061 // conservatively proven to be safe.
1062 if (ForSinking && II->mayReadFromMemory() && !isSafeToSinkLoad(II, SI))
1063 continue;
1064
1065 // Avoid considering instructions with less frequency than the source
1066 // instruction (i.e., avoid colder code regions of the dependence slice).
1067 if (BFI->getBlockFreq(II->getParent()) < BFI->getBlockFreq(I->getParent()))
1068 continue;
1069
1070 // Eligible one-use instruction added to the dependence slice.
1071 Slice.push(II);
1072
1073 // Explore all the operands of the current instruction to expand the slice.
1074 for (unsigned k = 0; k < II->getNumOperands(); ++k)
1075 if (auto *OpI = dyn_cast<Instruction>(II->getOperand(k)))
1076 Worklist.push(OpI);
1077 }
1078}
1079
1080bool SelectOptimizeImpl::isSelectHighlyPredictable(const SelectLike SI) {
1081 uint64_t TrueWeight, FalseWeight;
1082 if (extractBranchWeights(SI, TrueWeight, FalseWeight)) {
1083 uint64_t Max = std::max(TrueWeight, FalseWeight);
1084 uint64_t Sum = TrueWeight + FalseWeight;
1085 if (Sum != 0) {
1086 auto Probability = BranchProbability::getBranchProbability(Max, Sum);
1087 if (Probability > TTI->getPredictableBranchThreshold())
1088 return true;
1089 }
1090 }
1091 return false;
1092}
1093
1094bool SelectOptimizeImpl::checkLoopHeuristics(const Loop *L,
1095 const CostInfo LoopCost[2]) {
1096 // Loop-level checks to determine if a non-predicated version (with branches)
1097 // of the loop is more profitable than its predicated version.
1098
1099 if (DisableLoopLevelHeuristics)
1100 return true;
1101
1102 OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti",
1103 L->getHeader()->getFirstNonPHI());
1104
1105 if (LoopCost[0].NonPredCost > LoopCost[0].PredCost ||
1106 LoopCost[1].NonPredCost >= LoopCost[1].PredCost) {
1107 ORmissL << "No select conversion in the loop due to no reduction of loop's "
1108 "critical path. ";
1109 EmitAndPrintRemark(ORE, ORmissL);
1110 return false;
1111 }
1112
1113 Scaled64 Gain[2] = {LoopCost[0].PredCost - LoopCost[0].NonPredCost,
1114 LoopCost[1].PredCost - LoopCost[1].NonPredCost};
1115
1116 // Profitably converting to branches needs to reduce the loop's critical path
1117 // by at least some threshold (absolute gain of GainCycleThreshold cycles and
1118 // relative gain of 12.5%).
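 // With the defaults this means Gain[1] >= 4 cycles and
 // Gain[1] * 8 >= LoopCost[1].PredCost (i.e. at least a 1/8 = 12.5% cut).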
1119 if (Gain[1] < Scaled64::get(GainCycleThreshold) ||
1120 Gain[1] * Scaled64::get(GainRelativeThreshold) < LoopCost[1].PredCost) {
1121 Scaled64 RelativeGain = Scaled64::get(100) * Gain[1] / LoopCost[1].PredCost;
1122 ORmissL << "No select conversion in the loop due to small reduction of "
1123 "loop's critical path. Gain="
1124 << Gain[1].toString()
1125 << ", RelativeGain=" << RelativeGain.toString() << "%. ";
1126 EmitAndPrintRemark(ORE, ORmissL);
1127 return false;
1128 }
1129
1130 // If the loop's critical path involves loop-carried dependences, the gradient
1131 // of the gain needs to be at least GainGradientThreshold% (defaults to 25%).
1132 // This check ensures that the latency reduction for the loop's critical path
1133 // keeps decreasing with sufficient rate beyond the two analyzed loop
1134 // iterations.
1135 if (Gain[1] > Gain[0]) {
1136 Scaled64 GradientGain = Scaled64::get(100) * (Gain[1] - Gain[0]) /
1137 (LoopCost[1].PredCost - LoopCost[0].PredCost);
1138 if (GradientGain < Scaled64::get(GainGradientThreshold)) {
1139 ORmissL << "No select conversion in the loop due to small gradient gain. "
1140 "GradientGain="
1141 << GradientGain.toString() << "%. ";
1142 EmitAndPrintRemark(ORE, ORmissL);
1143 return false;
1144 }
1145 }
1146 // If the gain decreases it is not profitable to convert.
1147 else if (Gain[1] < Gain[0]) {
1148 ORmissL
1149 << "No select conversion in the loop due to negative gradient gain. ";
1150 EmitAndPrintRemark(ORE, ORmissL);
1151 return false;
1152 }
1153
1154 // Non-predicated version of the loop is more profitable than its
1155 // predicated version.
1156 return true;
1157}
1158
1159// Computes instruction and loop-critical-path costs for both the predicated
1160// and non-predicated version of the given loop.
1161// Returns false if unable to compute these costs due to invalid cost of loop
1162// instruction(s).
1163bool SelectOptimizeImpl::computeLoopCosts(
1164 const Loop *L, const SelectGroups &SIGroups,
1165 DenseMap<const Instruction *, CostInfo> &InstCostMap, CostInfo *LoopCost) {
1166 LLVM_DEBUG(dbgs() << "Calculating Latency / IPredCost / INonPredCost of loop "
1167 << L->getHeader()->getName() << "\n");
1168 const auto &SImap = getSImap(SIGroups);
1169 // Compute instruction and loop-critical-path costs across two iterations for
1170 // both predicated and non-predicated version.
1171 const unsigned Iterations = 2;
1172 for (unsigned Iter = 0; Iter < Iterations; ++Iter) {
1173 // Cost of the loop's critical path.
1174 CostInfo &MaxCost = LoopCost[Iter];
1175 for (BasicBlock *BB : L->getBlocks()) {
1176 for (const Instruction &I : *BB) {
1177 if (I.isDebugOrPseudoInst())
1178 continue;
1179 // Compute the predicated and non-predicated cost of the instruction.
1180 Scaled64 IPredCost = Scaled64::getZero(),
1181 INonPredCost = Scaled64::getZero();
1182
1183 // Assume infinite resources that allow to fully exploit the available
1184 // instruction-level parallelism.
1185 // InstCost = InstLatency + max(Op1Cost, Op2Cost, … OpNCost)
1186 for (const Use &U : I.operands()) {
1187 auto UI = dyn_cast<Instruction>(U.get());
1188 if (!UI)
1189 continue;
1190 if (InstCostMap.count(UI)) {
1191 IPredCost = std::max(IPredCost, InstCostMap[UI].PredCost);
1192 INonPredCost = std::max(INonPredCost, InstCostMap[UI].NonPredCost);
1193 }
1194 }
1195 auto ILatency = computeInstCost(&I);
1196 if (!ILatency) {
1197 OptimizationRemarkMissed ORmissL(DEBUG_TYPE, "SelectOpti", &I);
1198 ORmissL << "Invalid instruction cost preventing analysis and "
1199 "optimization of the inner-most loop containing this "
1200 "instruction. ";
1201 EmitAndPrintRemark(ORE, ORmissL);
1202 return false;
1203 }
1204 IPredCost += Scaled64::get(*ILatency);
1205 INonPredCost += Scaled64::get(*ILatency);
1206
1207 // For a select that can be converted to branch,
1208 // compute its cost as a branch (non-predicated cost).
1209 //
1210 // BranchCost = PredictedPathCost + MispredictCost
1211 // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb
1212 // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate
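 // E.g. with a 10-cycle MispredictPenalty, a cheap condition and the default
 // 25% rate, MispredictCost = 10 * 25 / 100 = 2.5 cycles.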
1213 if (SImap.contains(&I)) {
1214 auto SI = SImap.at(&I);
1215 Scaled64 TrueOpCost = SI.getTrueOpCost(InstCostMap, TTI);
1216 Scaled64 FalseOpCost = SI.getFalseOpCost(InstCostMap, TTI);
1217 Scaled64 PredictedPathCost =
1218 getPredictedPathCost(TrueOpCost, FalseOpCost, SI);
1219
1220 Scaled64 CondCost = Scaled64::getZero();
1221 if (auto *CI = dyn_cast<Instruction>(SI.getCondition()))
1222 if (InstCostMap.count(CI))
1223 CondCost = InstCostMap[CI].NonPredCost;
1224 Scaled64 MispredictCost = getMispredictionCost(SI, CondCost);
1225
1226 INonPredCost = PredictedPathCost + MispredictCost;
1227 }
1228 LLVM_DEBUG(dbgs() << " " << ILatency << "/" << IPredCost << "/"
1229 << INonPredCost << " for " << I << "\n");
1230
1231 InstCostMap[&I] = {IPredCost, INonPredCost};
1232 MaxCost.PredCost = std::max(MaxCost.PredCost, IPredCost);
1233 MaxCost.NonPredCost = std::max(MaxCost.NonPredCost, INonPredCost);
1234 }
1235 }
1236 LLVM_DEBUG(dbgs() << "Iteration " << Iter + 1
1237 << " MaxCost = " << MaxCost.PredCost << " "
1238 << MaxCost.NonPredCost << "\n");
1239 }
1240 return true;
1241}
1242
1243SmallDenseMap<const Instruction *, SelectOptimizeImpl::SelectLike, 2>
1244SelectOptimizeImpl::getSImap(const SelectGroups &SIGroups) {
1245 SmallDenseMap<const Instruction *, SelectLike, 2> SImap;
1246 for (const SelectGroup &ASI : SIGroups)
1247 for (SelectLike SI : ASI)
1248 SImap.try_emplace(SI.getI(), SI);
1249 return SImap;
1250}
1251
1252std::optional<uint64_t>
1253SelectOptimizeImpl::computeInstCost(const Instruction *I) {
1254 InstructionCost ICost =
1255 TTI->getInstructionCost(I, TargetTransformInfo::TCK_Latency);
1256 if (auto OC = ICost.getValue())
1257 return std::optional<uint64_t>(*OC);
1258 return std::nullopt;
1259}
1260
1261SelectOptimizeImpl::Scaled64
1262SelectOptimizeImpl::getMispredictionCost(const SelectLike SI,
1263 const Scaled64 CondCost) {
1264 uint64_t MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
1265
1266 // Account for the default misprediction rate when using a branch
1267 // (conservatively set to 25% by default).
1268 uint64_t MispredictRate = MispredictDefaultRate;
1269 // If the select condition is obviously predictable, then the misprediction
1270 // rate is zero.
1271 if (isSelectHighlyPredictable(SI))
1272 MispredictRate = 0;
1273
1274 // CondCost is included to account for cases where the computation of the
1275 // condition is part of a long dependence chain (potentially loop-carried)
1276 // that would delay detection of a misprediction and increase its cost.
1277 Scaled64 MispredictCost =
1278 std::max(Scaled64::get(MispredictPenalty), CondCost) *
1279 Scaled64::get(MispredictRate);
1280 MispredictCost /= Scaled64::get(100);
1281
1282 return MispredictCost;
1283}
1284
1285// Returns the cost of a branch when the prediction is correct.
1286// TrueCost * TrueProbability + FalseCost * FalseProbability.
1287SelectOptimizeImpl::Scaled64
1288SelectOptimizeImpl::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost,
1289 const SelectLike SI) {
1290 Scaled64 PredPathCost;
1291 uint64_t TrueWeight, FalseWeight;
1292 if (extractBranchWeights(SI, TrueWeight, FalseWeight)) {
1293 uint64_t SumWeight = TrueWeight + FalseWeight;
1294 if (SumWeight != 0) {
1295 PredPathCost = TrueCost * Scaled64::get(TrueWeight) +
1296 FalseCost * Scaled64::get(FalseWeight);
1297 PredPathCost /= Scaled64::get(SumWeight);
1298 return PredPathCost;
1299 }
1300 }
1301 // Without branch weight metadata, we assume 75% for the one path and 25% for
1302 // the other, and pick the result with the biggest cost.
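 // That is, max(0.75 * TrueCost + 0.25 * FalseCost,
 //              0.75 * FalseCost + 0.25 * TrueCost).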
1303 PredPathCost = std::max(TrueCost * Scaled64::get(3) + FalseCost,
1304 FalseCost * Scaled64::get(3) + TrueCost);
1305 PredPathCost /= Scaled64::get(4);
1306 return PredPathCost;
1307}
1308
1309bool SelectOptimizeImpl::isSelectKindSupported(const SelectLike SI) {
1310 bool VectorCond = !SI.getCondition()->getType()->isIntegerTy(1);
1311 if (VectorCond)
1312 return false;
1313 TargetLowering::SelectSupportKind SelectKind;
1314 if (SI.getType()->isVectorTy())
1315 SelectKind = TargetLowering::ScalarCondVectorVal;
1316 else
1317 SelectKind = TargetLowering::ScalarValSelect;
1318 return TLI->isSelectSupported(SelectKind);
1319}