[X86] X86::CMOV to Branch heuristic based optimization. The LLVM compiler recognizes opportunities to transform a branch into IR select instruction(s), which will later be lowered into an X86::CMOV instruction, assuming no other optimization eliminates the SelectInst. However, it is not always profitable to emit an X86::CMOV instruction. For example, a branch is preferable to an X86::CMOV instruction when: 1. The branch is well predicted. 2. The condition operand is expensive compared to the True-value and False-value operands. The CodeGenPrepare pass contains a shallow optimization that tries to convert a SelectInst into a branch, but it is not enough. This commit implements a machine optimization pass that converts X86::CMOV instruction(s) into branches, based on a conservative heuristic. Differential Revision: https://reviews.llvm.org/D34769 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@308142 91177308-0d34-0410-b5e6-96231b3b80d8 Amjad Aboud 3 years ago
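As an illustration of the heuristic (a minimal sketch, not part of the patch; the function name and constants are made up), consider a loop where the select condition depends on a slow division while both value operands are ready early and the branch is well predicted. Here a branch tends to beat a CMOV, since the CMOV would chain the store onto the division:

```cpp
// Illustrative only; resembles the CmovInHotPath/Transform tests added below,
// with a division used to make the condition expensive.
#include <cstddef>

void clampExpensive(int *c, std::size_t n, int a, int b) {
  for (std::size_t i = 0; i < n; ++i) {
    // The compare waits for the division; the two possible results
    // (10 and c[i]) are available much earlier. A CMOV would make the
    // store wait for the division, while a well-predicted branch lets
    // the pipeline continue past it speculatively.
    int t = (c[i] / a > b) ? 10 : c[i];
    c[i] = t;
  }
}
```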
8 changed file(s) with 1006 addition(s) and 29 deletion(s).
3636 set(sources
3737 X86AsmPrinter.cpp
3838 X86CallFrameOptimization.cpp
39 X86CmovConversion.cpp
3940 X86ExpandPseudo.cpp
4041 X86FastISel.cpp
4142 X86FixupBWInsts.cpp
8282 /// the MachineInstr to MC.
8383 FunctionPass *createX86ExpandPseudoPass();
8484
85 /// This pass converts X86 cmov instructions into branches when profitable.
86 FunctionPass *createX86CmovConverterPass();
87
8588 /// Return a Machine IR pass that selectively replaces
8689 /// certain byte and word instructions by equivalent 32 bit instructions,
8790 /// in order to eliminate partial register usage, false dependences on
0 //====-- X86CmovConversion.cpp - Convert Cmov to Branch -------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements a pass that converts X86 cmov instructions into
10 /// branches when profitable. This pass is conservative, i.e., it applies the
11 /// transformation if and only if it can guarantee a gain with high confidence.
12 ///
13 /// Thus, the optimization applies under the following conditions:
14 /// 1. Consider as candidates only CMOV instructions in innermost loops,
15 ///    assuming that most hotspots are represented by these loops.
16 /// 2. Given a group of CMOV instructions that use the same EFLAGS def
17 ///    instruction:
18 ///    a. Consider them as candidates only if all have the same condition code
19 ///       or the opposite one, to prevent generating more than one conditional
20 ///       jump per EFLAGS def instruction.
21 ///    b. Consider them as candidates only if all are profitable to be
22 ///       converted, assuming that one bad conversion may cause a degradation.
23 /// 3. Apply the conversion only for loops that are found profitable and only
24 ///    for CMOV candidates that were found profitable.
25 ///    a. A loop is considered profitable only if the conversion will reduce
26 ///       its depth cost by some threshold.
27 ///    b. A CMOV is considered profitable if the cost of its condition is
28 ///       higher than the average cost of its true-value and false-value by
29 ///       25% of the branch-misprediction penalty; this ensures no degradation
30 ///       even with 25% branch misprediction.
31 ///
32 /// Note: This pass is assumed to run on SSA machine code.
33 //===----------------------------------------------------------------------===//
34 //
35 // External interfaces:
36 // FunctionPass *llvm::createX86CmovConverterPass();
37 // bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF);
38 //
39
40 #include "X86.h"
41 #include "X86InstrInfo.h"
42 #include "X86Subtarget.h"
43 #include "llvm/ADT/Statistic.h"
44 #include "llvm/CodeGen/MachineFunctionPass.h"
45 #include "llvm/CodeGen/MachineInstrBuilder.h"
46 #include "llvm/CodeGen/MachineLoopInfo.h"
47 #include "llvm/CodeGen/MachineRegisterInfo.h"
48 #include "llvm/CodeGen/Passes.h"
49 #include "llvm/CodeGen/TargetSchedule.h"
50 #include "llvm/IR/InstIterator.h"
51 #include "llvm/Support/Debug.h"
52 #include "llvm/Support/raw_ostream.h"
53 using namespace llvm;
54
55 #define DEBUG_TYPE "x86-cmov-converter"
56
57 STATISTIC(NumOfSkippedCmovGroups, "Number of unsupported CMOV-groups");
58 STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates");
59 STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops");
60 STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups");
61
62 namespace {
63 // This internal switch can be used to turn off the cmov/branch optimization.
64 static cl::opt<bool>
65 EnableCmovConverter("x86-cmov-converter",
66 cl::desc("Enable the X86 cmov-to-branch optimization."),
67 cl::init(true), cl::Hidden);
68
69 /// Converts X86 cmov instructions into branches when profitable.
70 class X86CmovConverterPass : public MachineFunctionPass {
71 public:
72 X86CmovConverterPass() : MachineFunctionPass(ID) {}
73 ~X86CmovConverterPass() {}
74
75 StringRef getPassName() const override { return "X86 cmov Conversion"; }
76 bool runOnMachineFunction(MachineFunction &MF) override;
77 void getAnalysisUsage(AnalysisUsage &AU) const override;
78
79 private:
80 /// Pass identification, replacement for typeid.
81 static char ID;
82
83 const MachineRegisterInfo *MRI;
84 const TargetInstrInfo *TII;
85 TargetSchedModel TSchedModel;
86
87 /// List of consecutive CMOV instructions.
88 typedef SmallVector<MachineInstr *, 2> CmovGroup;
89 typedef SmallVector<CmovGroup, 2> CmovGroups;
90
91 /// Collect all CMOV-group-candidates in \p CurrLoop and update \p
92 /// CmovInstGroups accordingly.
93 ///
94 /// \param CurrLoop Loop being processed.
95 /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
96 /// \returns true iff it found any CMOV-group-candidate.
97 bool collectCmovCandidates(MachineLoop *CurrLoop, CmovGroups &CmovInstGroups);
98
99 /// Check if it is profitable to transform each CMOV-group-candidate into a
100 /// branch. Remove all groups that are not profitable from \p CmovInstGroups.
101 ///
102 /// \param CurrLoop Loop being processed.
103 /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
104 /// \returns true iff any CMOV-group-candidate remains.
105 bool checkForProfitableCmovCandidates(MachineLoop *CurrLoop,
106 CmovGroups &CmovInstGroups);
107
108 /// Convert the given list of consecutive CMOV instructions into a branch.
109 ///
110 /// \param Group Consecutive CMOV instructions to be converted into branch.
111 void convertCmovInstsToBranches(SmallVectorImpl<MachineInstr *> &Group) const;
112 };
113
114 char X86CmovConverterPass::ID = 0;
115
116 void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const {
117 MachineFunctionPass::getAnalysisUsage(AU);
118 AU.addRequired<MachineLoopInfo>();
119 }
120
121 bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
122 if (skipFunction(*MF.getFunction()))
123 return false;
124 if (!EnableCmovConverter)
125 return false;
126
127 DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
128 << "**********\n");
129
130 bool Changed = false;
131 MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
132 const TargetSubtargetInfo &STI = MF.getSubtarget();
133 MRI = &MF.getRegInfo();
134 TII = STI.getInstrInfo();
135 TSchedModel.init(STI.getSchedModel(), &STI, TII);
136
137 //===--------------------------------------------------------------------===//
138 // Algorithm
139 // ---------
140 // For each innermost loop
141 // collectCmovCandidates() {
142 // Find all CMOV-group-candidates.
143 // }
144 //
145 // checkForProfitableCmovCandidates() {
146 // * Calculate both loop-depth and optimized-loop-depth.
147 // * Use these depths to check for loop transformation profitability.
148 // * Check for CMOV-group-candidate transformation profitability.
149 // }
150 //
151 // For each profitable CMOV-group-candidate
152 // convertCmovInstsToBranches() {
153 // * Create FalseBB, SinkBB, Conditional branch to SinkBB.
154 // * Replace each CMOV instruction with a PHI instruction in SinkBB.
155 // }
156 //
157 // Note: For more details, see each function description.
158 //===--------------------------------------------------------------------===//
159 for (MachineBasicBlock &MBB : MF) {
160 MachineLoop *CurrLoop = MLI.getLoopFor(&MBB);
161
162 // Optimize only innermost loops.
163 if (!CurrLoop || CurrLoop->getHeader() != &MBB ||
164 !CurrLoop->getSubLoops().empty())
165 continue;
166
167 // List of consecutive CMOV instructions to be processed.
168 CmovGroups CmovInstGroups;
169
170 if (!collectCmovCandidates(CurrLoop, CmovInstGroups))
171 continue;
172
173 if (!checkForProfitableCmovCandidates(CurrLoop, CmovInstGroups))
174 continue;
175
176 Changed = true;
177 for (auto &Group : CmovInstGroups)
178 convertCmovInstsToBranches(Group);
179 }
180 return Changed;
181 }
182
183 bool X86CmovConverterPass::collectCmovCandidates(MachineLoop *CurrLoop,
184 CmovGroups &CmovInstGroups) {
185 //===--------------------------------------------------------------------===//
186 // Collect all CMOV-group-candidates and add them into CmovInstGroups.
187 //
188 // CMOV-group:
189 // CMOV instructions, in the same MBB, that use the same EFLAGS def instruction.
190 //
191 // CMOV-group-candidate:
192 // CMOV-group where all the CMOV instructions are
193 // 1. consecutive.
194 // 2. have the same condition code or the opposite one.
195 // 3. have only register operands (X86::CMOVrr).
196 //===--------------------------------------------------------------------===//
197 // List of possible improvement (TODO's):
198 // --------------------------------------
199 // TODO: Add support for X86::CMOVrm instructions.
200 // TODO: Add support for X86::SETcc instructions.
201 // TODO: Add support for CMOV-groups with non-consecutive CMOV instructions.
202 //===--------------------------------------------------------------------===//
203
204 // Current processed CMOV-Group.
205 CmovGroup Group;
206 for (auto *MBB : CurrLoop->getBlocks()) {
207 Group.clear();
208 // Condition code of the first CMOV instruction in the currently processed
209 // range and its opposite condition code.
210 X86::CondCode FirstCC, FirstOppCC;
211 // Indicator of a non-CMOVrr instruction in the currently processed range.
212 bool FoundNonCMOVInst = false;
213 // Indicator of whether the currently processed CMOV-group should be skipped.
214 bool SkipGroup = false;
215
216 for (auto &I : *MBB) {
217 X86::CondCode CC = X86::getCondFromCMovOpc(I.getOpcode());
218 // Check if we found an X86::CMOVrr instruction.
219 if (CC != X86::COND_INVALID && !I.mayLoad()) {
220 if (Group.empty()) {
221 // We found the first CMOV in the range; reset the flags.
222 FirstCC = CC;
223 FirstOppCC = X86::GetOppositeBranchCondition(CC);
224 FoundNonCMOVInst = false;
225 SkipGroup = false;
226 }
227 Group.push_back(&I);
228 // Check if it is a non-consecutive CMOV instruction or if it has a different
229 // condition code than FirstCC or FirstOppCC.
230 if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC))
231 // Set the SkipGroup indicator to skip the currently processed CMOV-group.
232 SkipGroup = true;
233 continue;
234 }
235 // If Group is empty, keep looking for the first CMOV in the range.
236 if (Group.empty())
237 continue;
238
239 // We found a non-X86::CMOVrr instruction.
240 FoundNonCMOVInst = true;
241 // Check if this instruction defines EFLAGS, to determine the end of the
242 // processed range, as there will be no more instructions using this EFLAGS def.
243 if (I.definesRegister(X86::EFLAGS)) {
244 // Check if the currently processed CMOV-group should not be skipped and
245 // add it as a CMOV-group-candidate.
246 if (!SkipGroup)
247 CmovInstGroups.push_back(Group);
248 else
249 ++NumOfSkippedCmovGroups;
250 Group.clear();
251 }
252 }
253 // The end of the basic block is considered the end of the range; check if the
254 // currently processed CMOV-group should not be skipped and add it as a CMOV-group-candidate.
255 if (Group.empty())
256 continue;
257 if (!SkipGroup)
258 CmovInstGroups.push_back(Group);
259 else
260 ++NumOfSkippedCmovGroups;
261 }
262
263 NumOfCmovGroupCandidate += CmovInstGroups.size();
264 return !CmovInstGroups.empty();
265 }
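The grouping rule above can be restated as a small standalone sketch (plain C++ with a simplified condition-code enum, not the pass's real types): all CMOVs hanging off one EFLAGS def form a candidate group only when each uses the first CMOV's condition code or its opposite, so a single conditional jump can serve the whole group.

```cpp
// Standalone sketch of the same-or-opposite condition-code rule; the enum and
// helper are simplified stand-ins for the X86::CondCode utilities.
#include <cassert>
#include <vector>

enum CondCode { GE, LT, NE, EQ };

static CondCode opposite(CondCode CC) {
  switch (CC) {
  case GE: return LT;
  case LT: return GE;
  case NE: return EQ;
  case EQ: return NE;
  }
  return CC;
}

static bool isCandidateGroup(const std::vector<CondCode> &Group) {
  if (Group.empty())
    return false;
  CondCode FirstCC = Group.front(), FirstOppCC = opposite(FirstCC);
  for (CondCode CC : Group)
    if (CC != FirstCC && CC != FirstOppCC)
      return false; // would need a second conditional jump; skip the group
  return true;
}

int main() {
  assert(isCandidateGroup({GE, LT, GE})); // matches the Before/After example
  assert(!isCandidateGroup({GE, NE}));    // unrelated condition codes
  return 0;
}
```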
266
267 /// \returns Depth of the CMOV instruction as if it were converted into a branch.
268 /// \param TrueOpDepth depth cost of CMOV true value operand.
269 /// \param FalseOpDepth depth cost of CMOV false value operand.
270 static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) {
271 //===--------------------------------------------------------------------===//
272 // With no info about branch weight, we assume 50% for each value operand.
273 // Thus, depth of optimized CMOV instruction is the rounded up average of
274 // its True-Operand-Value-Depth and False-Operand-Value-Depth.
275 //===--------------------------------------------------------------------===//
276 return (TrueOpDepth + FalseOpDepth + 1) / 2;
277 }
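A quick standalone check of the formula (the depth values are made up): with no branch-weight information, the converted CMOV is modeled as the 50/50 average of its two value-operand depths, rounded up on odd sums.

```cpp
// Restates the formula above so the rounding behavior can be checked in
// isolation; the depth values are arbitrary.
#include <cassert>

static unsigned depthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) {
  return (TrueOpDepth + FalseOpDepth + 1) / 2;
}

int main() {
  assert(depthOfOptCmov(3, 6) == 5); // (3 + 6 + 1) / 2, rounded up from 4.5
  assert(depthOfOptCmov(4, 4) == 4); // equal depths are unchanged
  return 0;
}
```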
278
279 bool X86CmovConverterPass::checkForProfitableCmovCandidates(
280 MachineLoop *CurrLoop, CmovGroups &CmovInstGroups) {
281 struct DepthInfo {
282 /// Depth of original loop.
283 unsigned Depth;
284 /// Depth of optimized loop.
285 unsigned OptDepth;
286 };
287 /// Number of loop iterations for which to calculate the depth.
288 static const unsigned LoopIterations = 2;
289 DenseMap<MachineInstr *, DepthInfo> DepthMap;
290 DepthInfo LoopDepth[LoopIterations] = {{0, 0}, {0, 0}};
291 enum { PhyRegType = 0, VirRegType = 1, RegTypeNum = 2 };
292 /// For each register type maps the register to its last def instruction.
293 DenseMap<unsigned, MachineInstr *> RegDefMaps[RegTypeNum];
294 /// Maps register operand to its def instruction, which can be nullptr if it
295 /// is unknown (e.g., operand is defined outside the loop).
296 DenseMap<MachineOperand *, MachineInstr *> OperandToDefMap;
297
298 // Set depth of unknown instruction (i.e., nullptr) to zero.
299 DepthMap[nullptr] = {0, 0};
300
301 SmallPtrSet<MachineInstr *, 4> CmovInstructions;
302 for (auto &Group : CmovInstGroups)
303 CmovInstructions.insert(Group.begin(), Group.end());
304
305 //===--------------------------------------------------------------------===//
306 // Step 1: Calculate instruction depth and loop depth.
307 // Optimized-Loop:
308 // loop with CMOV-group-candidates converted into branches.
309 //
310 // Instruction-Depth:
311 // instruction latency + max operand depth.
312 // * For CMOV instruction in optimized loop the depth is calculated as:
313 // CMOV latency + getDepthOfOptCmov(True-Op-Depth, False-Op-depth)
314 // TODO: Find a better way to estimate the latency of the branch instruction
315 // rather than using the CMOV latency.
316 //
317 // Loop-Depth:
318 // max instruction depth of all instructions in the loop.
319 // Note: instruction with max depth represents the critical-path in the loop.
320 //
321 // Loop-Depth[i]:
322 // Loop-Depth calculated for first `i` iterations.
323 // Note: it is enough to calculate depth for up to two iterations.
324 //
325 // Depth-Diff[i]:
326 // Number of cycles saved in the first `i` iterations by optimizing the loop.
327 //===--------------------------------------------------------------------===//
328 for (unsigned I = 0; I < LoopIterations; ++I) {
329 DepthInfo &MaxDepth = LoopDepth[I];
330 for (auto *MBB : CurrLoop->getBlocks()) {
331 // Clear physical registers Def map.
332 RegDefMaps[PhyRegType].clear();
333 for (MachineInstr &MI : *MBB) {
334 unsigned MIDepth = 0;
335 unsigned MIDepthOpt = 0;
336 bool IsCMOV = CmovInstructions.count(&MI);
337 for (auto &MO : MI.uses()) {
338 // Check for "isUse()" because "uses()" also returns implicit definitions.
339 if (!MO.isReg() || !MO.isUse())
340 continue;
341 unsigned Reg = MO.getReg();
342 auto &RDM = RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)];
343 if (MachineInstr *DefMI = RDM.lookup(Reg)) {
344 OperandToDefMap[&MO] = DefMI;
345 DepthInfo Info = DepthMap.lookup(DefMI);
346 MIDepth = std::max(MIDepth, Info.Depth);
347 if (!IsCMOV)
348 MIDepthOpt = std::max(MIDepthOpt, Info.OptDepth);
349 }
350 }
351
352 if (IsCMOV)
353 MIDepthOpt = getDepthOfOptCmov(
354 DepthMap[OperandToDefMap.lookup(&MI.getOperand(1))].OptDepth,
355 DepthMap[OperandToDefMap.lookup(&MI.getOperand(2))].OptDepth);
356
357 // Iterates over all operands to handle implicit definitions as well.
358 for (auto &MO : MI.operands()) {
359 if (!MO.isReg() || !MO.isDef())
360 continue;
361 unsigned Reg = MO.getReg();
362 RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)][Reg] = &MI;
363 }
364
365 unsigned Latency = TSchedModel.computeInstrLatency(&MI);
366 DepthMap[&MI] = {MIDepth += Latency, MIDepthOpt += Latency};
367 MaxDepth.Depth = std::max(MaxDepth.Depth, MIDepth);
368 MaxDepth.OptDepth = std::max(MaxDepth.OptDepth, MIDepthOpt);
369 }
370 }
371 }
372
373 unsigned Diff[LoopIterations] = {LoopDepth[0].Depth - LoopDepth[0].OptDepth,
374 LoopDepth[1].Depth - LoopDepth[1].OptDepth};
375
376 //===--------------------------------------------------------------------===//
377 // Step 2: Check if the loop is worth optimizing.
378 // Worth-Optimize-Loop:
379 // case 1: Diff[1] == Diff[0]
380 // Critical-path is iteration independent - there is no dependency
381 // of critical-path instructions on critical-path instructions of
382 // previous iteration.
383 // Thus, it is enough to check the gain percent of the 1st iteration -
384 // to be conservative, the optimized loop needs to have a depth that is at
385 // least 12.5% (in cycles) less than the original loop, per iteration.
386 //
387 // case 2: Diff[1] > Diff[0]
388 // Critical-path is iteration dependent - there is dependency of
389 // critical-path instructions on critical-path instructions of
390 // previous iteration.
391 // Thus, it is required to check the gradient of the gain - the
392 // change in Depth-Diff compared to the change in Loop-Depth between
393 // 1st and 2nd iterations.
394 // To be conservative, the gradient needs to be at least 50%.
395 //
396 // If the loop is not worth optimizing, remove all CMOV-group-candidates.
397 //===--------------------------------------------------------------------===//
398 bool WorthOptLoop = false;
399 if (Diff[1] == Diff[0])
400 WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth;
401 else if (Diff[1] > Diff[0])
402 WorthOptLoop =
403 (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth);
404
405 if (!WorthOptLoop)
406 return false;
407
408 ++NumOfLoopCandidate;
409
410 //===--------------------------------------------------------------------===//
411 // Step 3: Check for each CMOV-group-candidate whether it is worth optimizing.
412 // Worth-Optimize-Group:
413 // Iff it is worth optimizing all CMOV instructions in the group.
414 //
415 // Worth-Optimize-CMOV:
416 // A predicted branch is faster than a CMOV by the difference between the depth
417 // of the condition operand and the depth of the taken (predicted) value operand.
418 // To be conservative, the gain of such a CMOV transformation should cover at
419 // least 25% of the branch-misprediction penalty.
420 //===--------------------------------------------------------------------===//
421 unsigned MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty;
422 CmovGroups TempGroups;
423 std::swap(TempGroups, CmovInstGroups);
424 for (auto &Group : TempGroups) {
425 bool WorthOpGroup = true;
426 for (auto *MI : Group) {
427 // Avoid CMOV instructions whose value is used as a pointer to load from.
428 // This is another conservative check to avoid converting a CMOV instruction
429 // used in tree-search-like algorithms, where the branch is unpredictable.
430 auto UIs = MRI->use_instructions(MI->defs().begin()->getReg());
431 if (UIs.begin() != UIs.end() && ++UIs.begin() == UIs.end()) {
432 unsigned Op = UIs.begin()->getOpcode();
433 if (Op == X86::MOV64rm || Op == X86::MOV32rm) {
434 WorthOpGroup = false;
435 break;
436 }
437 }
438
439 unsigned CondCost =
440 DepthMap[OperandToDefMap.lookup(&MI->getOperand(3))].Depth;
441 unsigned ValCost = getDepthOfOptCmov(
442 DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth,
443 DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth);
444 if (ValCost > CondCost || (CondCost - ValCost) * 4 < MispredictPenalty) {
445 WorthOpGroup = false;
446 break;
447 }
448 }
449
450 if (WorthOpGroup)
451 CmovInstGroups.push_back(Group);
452 }
453
454 return !CmovInstGroups.empty();
455 }
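The two thresholds used above can be exercised with a short numeric sketch (all numbers are assumed for illustration; the real MispredictPenalty comes from the target's scheduling model): Step 2, case 1 requires the conversion to save at least one eighth (12.5%) of the loop depth, and Step 3 requires the condition to be deeper than the averaged value operands by at least a quarter of the misprediction penalty.

```cpp
// Numeric sketch of the Step 2 (case 1) and Step 3 checks; values are assumed.
#include <cassert>

int main() {
  // Step 2, case 1: Diff[1] == Diff[0], so only the first iteration matters.
  unsigned LoopDepth0 = 24, OptLoopDepth0 = 20;
  unsigned Diff0 = LoopDepth0 - OptLoopDepth0;  // 4 cycles saved per iteration
  bool WorthOptLoop = Diff0 * 8 >= LoopDepth0;  // 32 >= 24 -> worth optimizing
  assert(WorthOptLoop);

  // Step 3: per-CMOV check against the branch-misprediction penalty.
  unsigned MispredictPenalty = 16;              // assumed penalty in cycles
  unsigned CondCost = 10, ValCost = 4;
  bool WorthOptCmov =
      ValCost <= CondCost && (CondCost - ValCost) * 4 >= MispredictPenalty;
  assert(WorthOptCmov);                         // (10 - 4) * 4 = 24 >= 16
  return 0;
}
```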
456
457 static bool checkEFLAGSLive(MachineInstr *MI) {
458 if (MI->killsRegister(X86::EFLAGS))
459 return false;
460
461 // The EFLAGS operand of MI might be missing a kill marker.
462 // Figure out whether EFLAGS should be live after the MI instruction.
463 MachineBasicBlock *BB = MI->getParent();
464 MachineBasicBlock::iterator ItrMI = MI;
465
466 // Scan forward through BB for a use/def of EFLAGS.
467 for (auto I = std::next(ItrMI), E = BB->end(); I != E; ++I) {
468 if (I->readsRegister(X86::EFLAGS))
469 return true;
470 if (I->definesRegister(X86::EFLAGS))
471 return false;
472 }
473
474 // We hit the end of the block, check whether EFLAGS is live into a successor.
475 for (auto I = BB->succ_begin(), E = BB->succ_end(); I != E; ++I) {
476 if ((*I)->isLiveIn(X86::EFLAGS))
477 return true;
478 }
479
480 return false;
481 }
482
483 void X86CmovConverterPass::convertCmovInstsToBranches(
484 SmallVectorImpl<MachineInstr *> &Group) const {
485 assert(!Group.empty() && "No CMOV instructions to convert");
486 ++NumOfOptimizedCmovGroups;
487
488 // To convert a CMOVcc instruction, we actually have to insert the diamond
489 // control-flow pattern. The incoming instruction knows the destination vreg
490 // to set, the condition code register to branch on, the true/false values to
491 // select between, and a branch opcode to use.
492
493 // Before
494 // -----
495 // MBB:
496 // cond = cmp ...
497 // v1 = CMOVge t1, f1, cond
498 // v2 = CMOVlt t2, f2, cond
499 // v3 = CMOVge v1, f3, cond
500 //
501 // After
502 // -----
503 // MBB:
504 // cond = cmp ...
505 // jge %SinkMBB
506 //
507 // FalseMBB:
508 // jmp %SinkMBB
509 //
510 // SinkMBB:
511 // %v1 = phi[%f1, %FalseMBB], [%t1, %MBB]
512 // %v2 = phi[%t2, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch
513 // ; true-value with false-value
514 // %v3 = phi[%f3, %FalseMBB], [%t1, %MBB] ; Phi instruction cannot use
515 // ; previous Phi instruction result
516
517 MachineInstr &MI = *Group.front();
518 MachineInstr *LastCMOV = Group.back();
519 DebugLoc DL = MI.getDebugLoc();
520 X86::CondCode CC = X86::CondCode(X86::getCondFromCMovOpc(MI.getOpcode()));
521 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
522 MachineBasicBlock *MBB = MI.getParent();
523 MachineFunction::iterator It = ++MBB->getIterator();
524 MachineFunction *F = MBB->getParent();
525 const BasicBlock *BB = MBB->getBasicBlock();
526
527 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(BB);
528 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(BB);
529 F->insert(It, FalseMBB);
530 F->insert(It, SinkMBB);
531
532 // If the EFLAGS register isn't dead in the terminator, then claim that it's
533 // live into the sink and copy blocks.
534 if (checkEFLAGSLive(LastCMOV)) {
535 FalseMBB->addLiveIn(X86::EFLAGS);
536 SinkMBB->addLiveIn(X86::EFLAGS);
537 }
538
539 // Transfer the remainder of BB and its successor edges to SinkMBB.
540 SinkMBB->splice(SinkMBB->begin(), MBB,
541 std::next(MachineBasicBlock::iterator(LastCMOV)), MBB->end());
542 SinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
543
544 // Add the false and sink blocks as its successors.
545 MBB->addSuccessor(FalseMBB);
546 MBB->addSuccessor(SinkMBB);
547
548 // Create the conditional branch instruction.
549 BuildMI(MBB, DL, TII->get(X86::GetCondBranchFromCond(CC))).addMBB(SinkMBB);
550
551 // Add the sink block to the false block successors.
552 FalseMBB->addSuccessor(SinkMBB);
553
554 MachineInstrBuilder MIB;
555 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
556 MachineBasicBlock::iterator MIItEnd =
557 std::next(MachineBasicBlock::iterator(LastCMOV));
558 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
559 // As we are creating the PHIs, we have to be careful if there is more than
560 // one. Later CMOVs may reference the results of earlier CMOVs, but later
561 // PHIs have to reference the individual true/false inputs from earlier PHIs.
562 // That also means that PHI construction must work forward from earlier to
563 // later, and that the code must maintain a mapping from earlier PHI's
564 // destination registers, and the registers that went into the PHI.
565 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
566
567 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
568 unsigned DestReg = MIIt->getOperand(0).getReg();
569 unsigned Op1Reg = MIIt->getOperand(1).getReg();
570 unsigned Op2Reg = MIIt->getOperand(2).getReg();
571
572 // If the CMOV we are processing uses the opposite condition from the jump we
573 // generated, then we have to swap the operands for the PHI that is going to
574 // be generated.
575 if (X86::getCondFromCMovOpc(MIIt->getOpcode()) == OppCC)
576 std::swap(Op1Reg, Op2Reg);
577
578 auto Op1Itr = RegRewriteTable.find(Op1Reg);
579 if (Op1Itr != RegRewriteTable.end())
580 Op1Reg = Op1Itr->second.first;
581
582 auto Op2Itr = RegRewriteTable.find(Op2Reg);
583 if (Op2Itr != RegRewriteTable.end())
584 Op2Reg = Op2Itr->second.second;
585
586 // SinkMBB:
587 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, MBB ]
588 // ...
589 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
590 .addReg(Op1Reg)
591 .addMBB(FalseMBB)
592 .addReg(Op2Reg)
593 .addMBB(MBB);
594 (void)MIB;
595 DEBUG(dbgs() << "\tFrom: "; MIIt->dump());
596 DEBUG(dbgs() << "\tTo: "; MIB->dump());
597
598 // Add this PHI to the rewrite table.
599 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
600 }
601
602 // Now remove the CMOV(s).
603 MBB->erase(MIItBegin, MIItEnd);
604 }
605
606 } // End anonymous namespace.
607
608 FunctionPass *llvm::createX86CmovConverterPass() {
609 return new X86CmovConverterPass();
610 }
374374 addPass(&EarlyIfConverterID);
375375 if (EnableMachineCombinerPass)
376376 addPass(&MachineCombinerID);
377 addPass(createX86CmovConverterPass());
377378 return true;
378379 }
379380
None ; RUN: llc < %s -march=x86 -mattr=+cmov | FileCheck %s
0 ; RUN: llc < %s -march=x86 -mattr=+cmov -x86-cmov-converter=false | FileCheck %s
11 ;
22 ; Test scheduling a multi-use compare. We should neither spill flags
33 ; nor clone the compare.
88 ; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
99 ; LINUX: cmpl
1010 ; LINUX: sbbl
11 ; LINUX: cmovne
12 ; LINUX: cmovne
11 ; LINUX: jne
12 ; LINUX: jne
1313 ; LINUX: lock cmpxchg8b
1414 ; LINUX: jne [[LABEL]]
1515 %2 = atomicrmw min i64* @sc64, i64 6 acquire
1616 ; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
1717 ; LINUX: cmpl
1818 ; LINUX: sbbl
19 ; LINUX: cmovne
20 ; LINUX: cmovne
19 ; LINUX: jne
20 ; LINUX: jne
2121 ; LINUX: lock cmpxchg8b
2222 ; LINUX: jne [[LABEL]]
2323 %3 = atomicrmw umax i64* @sc64, i64 7 acquire
2424 ; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
2525 ; LINUX: cmpl
2626 ; LINUX: sbbl
27 ; LINUX: cmovne
28 ; LINUX: cmovne
27 ; LINUX: jne
28 ; LINUX: jne
2929 ; LINUX: lock cmpxchg8b
3030 ; LINUX: jne [[LABEL]]
3131 %4 = atomicrmw umin i64* @sc64, i64 8 acquire
3232 ; LINUX: [[LABEL:.LBB[0-9]+_[0-9]+]]
3333 ; LINUX: cmpl
3434 ; LINUX: sbbl
35 ; LINUX: cmovne
36 ; LINUX: cmovne
35 ; LINUX: jne
36 ; LINUX: jne
3737 ; LINUX: lock cmpxchg8b
3838 ; LINUX: jne [[LABEL]]
3939 ret void
166166 ; CHECK-NEXT: sbbq %rdx, %rcx
167167 ; CHECK-NEXT: setge %cl
168168 ; CHECK-NEXT: andb $1, %cl
169 ; CHECK-NEXT: movq %rsi, %rbx
170 ; CHECK-NEXT: cmovneq %rax, %rbx
169 ; CHECK-NEXT: movq %rax, %rbx
170 ; CHECK-NEXT: jne LBB5_3
171 ; CHECK-NEXT: ## BB#2: ## %atomicrmw.start
172 ; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1
173 ; CHECK-NEXT: movq %rsi, %rbx
174 ; CHECK-NEXT: LBB5_3: ## %atomicrmw.start
175 ; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1
171176 ; CHECK-NEXT: testb %cl, %cl
172 ; CHECK-NEXT: movq %r8, %rcx
173 ; CHECK-NEXT: cmovneq %rdx, %rcx
177 ; CHECK-NEXT: movq %rdx, %rcx
178 ; CHECK-NEXT: jne LBB5_5
179 ; CHECK-NEXT: ## BB#4: ## %atomicrmw.start
180 ; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1
181 ; CHECK-NEXT: movq %r8, %rcx
182 ; CHECK-NEXT: LBB5_5: ## %atomicrmw.start
183 ; CHECK-NEXT: ## in Loop: Header=BB5_1 Depth=1
174184 ; CHECK-NEXT: lock cmpxchg16b (%rdi)
175185 ; CHECK-NEXT: jne LBB5_1
176 ; CHECK-NEXT: ## BB#2: ## %atomicrmw.end
186 ; CHECK-NEXT: ## BB#6: ## %atomicrmw.end
177187 ; CHECK-NEXT: movq %rax, {{.*}}(%rip)
178188 ; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip)
179189 ; CHECK-NEXT: popq %rbx
202212 ; CHECK-NEXT: sbbq %r8, %rcx
203213 ; CHECK-NEXT: setge %cl
204214 ; CHECK-NEXT: andb $1, %cl
205 ; CHECK-NEXT: movq %rsi, %rbx
206 ; CHECK-NEXT: cmovneq %rax, %rbx
215 ; CHECK-NEXT: movq %rax, %rbx
216 ; CHECK-NEXT: jne LBB6_3
217 ; CHECK-NEXT: ## BB#2: ## %atomicrmw.start
218 ; CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1
219 ; CHECK-NEXT: movq %rsi, %rbx
220 ; CHECK-NEXT: LBB6_3: ## %atomicrmw.start
221 ; CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1
207222 ; CHECK-NEXT: testb %cl, %cl
208 ; CHECK-NEXT: movq %r8, %rcx
209 ; CHECK-NEXT: cmovneq %rdx, %rcx
223 ; CHECK-NEXT: movq %rdx, %rcx
224 ; CHECK-NEXT: jne LBB6_5
225 ; CHECK-NEXT: ## BB#4: ## %atomicrmw.start
226 ; CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1
227 ; CHECK-NEXT: movq %r8, %rcx
228 ; CHECK-NEXT: LBB6_5: ## %atomicrmw.start
229 ; CHECK-NEXT: ## in Loop: Header=BB6_1 Depth=1
210230 ; CHECK-NEXT: lock cmpxchg16b (%rdi)
211231 ; CHECK-NEXT: jne LBB6_1
212 ; CHECK-NEXT: ## BB#2: ## %atomicrmw.end
232 ; CHECK-NEXT: ## BB#6: ## %atomicrmw.end
213233 ; CHECK-NEXT: movq %rax, {{.*}}(%rip)
214234 ; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip)
215235 ; CHECK-NEXT: popq %rbx
238258 ; CHECK-NEXT: sbbq %rdx, %rcx
239259 ; CHECK-NEXT: setae %cl
240260 ; CHECK-NEXT: andb $1, %cl
241 ; CHECK-NEXT: movq %rsi, %rbx
242 ; CHECK-NEXT: cmovneq %rax, %rbx
261 ; CHECK-NEXT: movq %rax, %rbx
262 ; CHECK-NEXT: jne LBB7_3
263 ; CHECK-NEXT: ## BB#2: ## %atomicrmw.start
264 ; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1
265 ; CHECK-NEXT: movq %rsi, %rbx
266 ; CHECK-NEXT: LBB7_3: ## %atomicrmw.start
267 ; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1
243268 ; CHECK-NEXT: testb %cl, %cl
244 ; CHECK-NEXT: movq %r8, %rcx
245 ; CHECK-NEXT: cmovneq %rdx, %rcx
269 ; CHECK-NEXT: movq %rdx, %rcx
270 ; CHECK-NEXT: jne LBB7_5
271 ; CHECK-NEXT: ## BB#4: ## %atomicrmw.start
272 ; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1
273 ; CHECK-NEXT: movq %r8, %rcx
274 ; CHECK-NEXT: LBB7_5: ## %atomicrmw.start
275 ; CHECK-NEXT: ## in Loop: Header=BB7_1 Depth=1
246276 ; CHECK-NEXT: lock cmpxchg16b (%rdi)
247277 ; CHECK-NEXT: jne LBB7_1
248 ; CHECK-NEXT: ## BB#2: ## %atomicrmw.end
278 ; CHECK-NEXT: ## BB#6: ## %atomicrmw.end
249279 ; CHECK-NEXT: movq %rax, {{.*}}(%rip)
250280 ; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip)
251281 ; CHECK-NEXT: popq %rbx
274304 ; CHECK-NEXT: sbbq %rdx, %rcx
275305 ; CHECK-NEXT: setb %cl
276306 ; CHECK-NEXT: andb $1, %cl
277 ; CHECK-NEXT: movq %rsi, %rbx
278 ; CHECK-NEXT: cmovneq %rax, %rbx
307 ; CHECK-NEXT: movq %rax, %rbx
308 ; CHECK-NEXT: jne LBB8_3
309 ; CHECK-NEXT: ## BB#2: ## %atomicrmw.start
310 ; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1
311 ; CHECK-NEXT: movq %rsi, %rbx
312 ; CHECK-NEXT: LBB8_3: ## %atomicrmw.start
313 ; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1
279314 ; CHECK-NEXT: testb %cl, %cl
280 ; CHECK-NEXT: movq %r8, %rcx
281 ; CHECK-NEXT: cmovneq %rdx, %rcx
315 ; CHECK-NEXT: movq %rdx, %rcx
316 ; CHECK-NEXT: jne LBB8_5
317 ; CHECK-NEXT: ## BB#4: ## %atomicrmw.start
318 ; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1
319 ; CHECK-NEXT: movq %r8, %rcx
320 ; CHECK-NEXT: LBB8_5: ## %atomicrmw.start
321 ; CHECK-NEXT: ## in Loop: Header=BB8_1 Depth=1
282322 ; CHECK-NEXT: lock cmpxchg16b (%rdi)
283323 ; CHECK-NEXT: jne LBB8_1
284 ; CHECK-NEXT: ## BB#2: ## %atomicrmw.end
324 ; CHECK-NEXT: ## BB#6: ## %atomicrmw.end
285325 ; CHECK-NEXT: movq %rax, {{.*}}(%rip)
286326 ; CHECK-NEXT: movq %rdx, _var+{{.*}}(%rip)
287327 ; CHECK-NEXT: popq %rbx
0 ; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s
1
2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2 ;; This test checks that the x86-cmov-converter optimization transforms CMOV
3 ;; instructions into branches when it is profitable.
4 ;; There are 5 cases below:
5 ;; 1. CmovInHotPath:
6 ;;      The CMOV depends on the condition and it is in the hot path.
7 ;;      Thus, it is worth transforming.
8 ;;
9 ;; 2. CmovNotInHotPath:
10 ;;      Similar to (1), except that the CMOV is not in the hot path.
11 ;;      Thus, it is not worth transforming.
12 ;;
13 ;; 3. MaxIndex:
14 ;;      Maximum-calculation algorithm that looks for the max index;
15 ;;      calculating the CMOV value is cheaper than calculating the CMOV condition.
16 ;;      Thus, it is worth transforming.
17 ;;
18 ;; 4. MaxValue:
19 ;;      Maximum-calculation algorithm that looks for the max value;
20 ;;      calculating the CMOV value is not cheaper than calculating the CMOV condition.
21 ;;      Thus, it is not worth transforming.
22 ;;
23 ;; 5. BinarySearch:
24 ;;      Usually, a binary-search CMOV is not predictable.
25 ;;      Thus, it is not worth transforming.
27 ;;
28 ;; Test was created using the following command line:
29 ;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o -
30 ;; Where foo.c is:
31 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
32 ;;void CmovInHotPath(int n, int a, int b, int *c, int *d) {
33 ;; for (int i = 0; i < n; i++) {
34 ;; int t = c[i];
35 ;; if (c[i] * a > b)
36 ;; t = 10;
37 ;; c[i] = t;
38 ;; }
39 ;;}
40 ;;
41 ;;
42 ;;void CmovNotInHotPath(int n, int a, int b, int *c, int *d) {
43 ;; for (int i = 0; i < n; i++) {
44 ;; int t = c[i];
45 ;; if (c[i] * a > b)
46 ;; t = 10;
47 ;; c[i] = t;
48 ;; d[i] /= b;
49 ;; }
50 ;;}
51 ;;
52 ;;
53 ;;int MaxIndex(int n, int *a) {
54 ;; int t = 0;
55 ;; for (int i = 1; i < n; i++) {
56 ;; if (a[i] > a[t])
57 ;; t = i;
58 ;; }
59 ;; return a[t];
60 ;;}
61 ;;
62 ;;
63 ;;int MaxValue(int n, int *a) {
64 ;; int t = a[0];
65 ;; for (int i = 1; i < n; i++) {
66 ;; if (a[i] > t)
67 ;; t = a[i];
68 ;; }
69 ;; return t;
70 ;;}
71 ;;
72 ;;typedef struct Node Node;
73 ;;struct Node {
74 ;; unsigned Val;
75 ;; Node *Right;
76 ;; Node *Left;
77 ;;};
78 ;;
79 ;;unsigned BinarySearch(unsigned Mask, Node *Curr, Node *Next) {
80 ;; while (Curr->Val > Next->Val) {
81 ;; Curr = Next;
82 ;; if (Mask & (0x1 << Curr->Val))
83 ;; Next = Curr->Right;
84 ;; else
85 ;; Next = Curr->Left;
86 ;; }
87 ;; return Curr->Val;
88 ;;}
89 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
90
91 %struct.Node = type { i32, %struct.Node*, %struct.Node* }
92
93 ; CHECK-LABEL: CmovInHotPath
94 ; CHECK-NOT: cmov
95 ; CHECK: jg
96
97 define void @CmovInHotPath(i32 %n, i32 %a, i32 %b, i32* nocapture %c, i32* nocapture readnone %d) #0 {
98 entry:
99 %cmp14 = icmp sgt i32 %n, 0
100 br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup
101
102 for.body.preheader: ; preds = %entry
103 %wide.trip.count = zext i32 %n to i64
104 br label %for.body
105
106 for.cond.cleanup: ; preds = %for.body, %entry
107 ret void
108
109 for.body: ; preds = %for.body.preheader, %for.body
110 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
111 %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
112 %0 = load i32, i32* %arrayidx, align 4
113 %mul = mul nsw i32 %0, %a
114 %cmp3 = icmp sgt i32 %mul, %b
115 %. = select i1 %cmp3, i32 10, i32 %0
116 store i32 %., i32* %arrayidx, align 4
117 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
118 %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
119 br i1 %exitcond, label %for.cond.cleanup, label %for.body
120 }
121
122 ; CHECK-LABEL: CmovNotInHotPath
123 ; CHECK: cmovg
124
125 define void @CmovNotInHotPath(i32 %n, i32 %a, i32 %b, i32* nocapture %c, i32* nocapture %d) #0 {
126 entry:
127 %cmp18 = icmp sgt i32 %n, 0
128 br i1 %cmp18, label %for.body.preheader, label %for.cond.cleanup
129
130 for.body.preheader: ; preds = %entry
131 %wide.trip.count = zext i32 %n to i64
132 br label %for.body
133
134 for.cond.cleanup: ; preds = %for.body, %entry
135 ret void
136
137 for.body: ; preds = %for.body.preheader, %for.body
138 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
139 %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
140 %0 = load i32, i32* %arrayidx, align 4
141 %mul = mul nsw i32 %0, %a
142 %cmp3 = icmp sgt i32 %mul, %b
143 %. = select i1 %cmp3, i32 10, i32 %0
144 store i32 %., i32* %arrayidx, align 4
145 %arrayidx7 = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
146 %1 = load i32, i32* %arrayidx7, align 4
147 %div = sdiv i32 %1, %b
148 store i32 %div, i32* %arrayidx7, align 4
149 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
150 %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
151 br i1 %exitcond, label %for.cond.cleanup, label %for.body
152 }
153
154 ; CHECK-LABEL: MaxIndex
155 ; CHECK-NOT: cmov
156 ; CHECK: jg
157
158 define i32 @MaxIndex(i32 %n, i32* nocapture readonly %a) #0 {
159 entry:
160 %cmp14 = icmp sgt i32 %n, 1
161 br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup
162
163 for.body.preheader: ; preds = %entry
164 %wide.trip.count = zext i32 %n to i64
165 br label %for.body
166
167 for.cond.cleanup.loopexit: ; preds = %for.body
168 %phitmp = sext i32 %i.0.t.0 to i64
169 br label %for.cond.cleanup
170
171 for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
172 %t.0.lcssa = phi i64 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ]
173 %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %t.0.lcssa
174 %0 = load i32, i32* %arrayidx5, align 4
175 ret i32 %0
176
177 for.body: ; preds = %for.body.preheader, %for.body
178 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 1, %for.body.preheader ]
179 %t.015 = phi i32 [ %i.0.t.0, %for.body ], [ 0, %for.body.preheader ]
180 %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
181 %1 = load i32, i32* %arrayidx, align 4
182 %idxprom1 = sext i32 %t.015 to i64
183 %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %idxprom1
184 %2 = load i32, i32* %arrayidx2, align 4
185 %cmp3 = icmp sgt i32 %1, %2
186 %3 = trunc i64 %indvars.iv to i32
187 %i.0.t.0 = select i1 %cmp3, i32 %3, i32 %t.015
188 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
189 %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
190 br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
191 }
192
193 ; CHECK-LABEL: MaxValue
194 ; CHECK-NOT: jg
195 ; CHECK: cmovg
196
197 define i32 @MaxValue(i32 %n, i32* nocapture readonly %a) #0 {
198 entry:
199 %0 = load i32, i32* %a, align 4
200 %cmp13 = icmp sgt i32 %n, 1
201 br i1 %cmp13, label %for.body.preheader, label %for.cond.cleanup
202
203 for.body.preheader: ; preds = %entry
204 %wide.trip.count = zext i32 %n to i64
205 br label %for.body
206
207 for.cond.cleanup: ; preds = %for.body, %entry
208 %t.0.lcssa = phi i32 [ %0, %entry ], [ %.t.0, %for.body ]
209 ret i32 %t.0.lcssa
210
211 for.body: ; preds = %for.body.preheader, %for.body
212 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 1, %for.body.preheader ]
213 %t.014 = phi i32 [ %.t.0, %for.body ], [ %0, %for.body.preheader ]
214 %arrayidx1 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
215 %1 = load i32, i32* %arrayidx1, align 4
216 %cmp2 = icmp sgt i32 %1, %t.014
217 %.t.0 = select i1 %cmp2, i32 %1, i32 %t.014
218 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
219 %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
220 br i1 %exitcond, label %for.cond.cleanup, label %for.body
221 }
222
223 ; CHECK-LABEL: BinarySearch
224 ; CHECK: cmov
225
226 define i32 @BinarySearch(i32 %Mask, %struct.Node* nocapture readonly %Curr, %struct.Node* nocapture readonly %Next) #0 {
227 entry:
228 %Val8 = getelementptr inbounds %struct.Node, %struct.Node* %Curr, i64 0, i32 0
229 %0 = load i32, i32* %Val8, align 8
230 %Val19 = getelementptr inbounds %struct.Node, %struct.Node* %Next, i64 0, i32 0
231 %1 = load i32, i32* %Val19, align 8
232 %cmp10 = icmp ugt i32 %0, %1
233 br i1 %cmp10, label %while.body, label %while.end
234
235 while.body: ; preds = %entry, %while.body
236 %2 = phi i32 [ %4, %while.body ], [ %1, %entry ]
237 %Next.addr.011 = phi %struct.Node* [ %3, %while.body ], [ %Next, %entry ]
238 %shl = shl i32 1, %2
239 %and = and i32 %shl, %Mask
240 %tobool = icmp eq i32 %and, 0
241 %Left = getelementptr inbounds %struct.Node, %struct.Node* %Next.addr.011, i64 0, i32 2
242 %Right = getelementptr inbounds %struct.Node, %struct.Node* %Next.addr.011, i64 0, i32 1
243 %Left.sink = select i1 %tobool, %struct.Node** %Left, %struct.Node** %Right
244 %3 = load %struct.Node*, %struct.Node** %Left.sink, align 8
245 %Val1 = getelementptr inbounds %struct.Node, %struct.Node* %3, i64 0, i32 0
246 %4 = load i32, i32* %Val1, align 8
247 %cmp = icmp ugt i32 %2, %4
248 br i1 %cmp, label %while.body, label %while.end
249
250 while.end: ; preds = %while.body, %entry
251 %.lcssa = phi i32 [ %0, %entry ], [ %2, %while.body ]
252 ret i32 %.lcssa
253 }
254
255 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
256 ;; The following test checks that the x86-cmov-converter optimization transforms
257 ;; CMOV instructions into branches correctly.
258 ;;
259 ;; MBB:
260 ;; cond = cmp ...
261 ;; v1 = CMOVgt t1, f1, cond
262 ;; v2 = CMOVle s1, f2, cond
263 ;;
264 ;; Where: t1 = 11, f1 = 22, f2 = a
265 ;;
266 ;; After CMOV transformation
267 ;; -------------------------
268 ;; MBB:
269 ;; cond = cmp ...
270 ;; ja %SinkMBB
271 ;;
272 ;; FalseMBB:
273 ;; jmp %SinkMBB
274 ;;
275 ;; SinkMBB:
276 ;; %v1 = phi[%f1, %FalseMBB], [%t1, %MBB]
277 ;; %v2 = phi[%f1, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch
278 ;; ; true-value with false-value
279 ;; ; Phi instruction cannot use
280 ;; ; previous Phi instruction result
281 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
282
283 ; CHECK-LABEL: Transform
284 ; CHECK-NOT: cmov
285 ; CHECK: divl [[a:%[0-9a-z]*]]
286 ; CHECK: cmpl [[a]], %eax
287 ; CHECK: movl $11, [[s1:%[0-9a-z]*]]
288 ; CHECK: movl [[a]], [[s2:%[0-9a-z]*]]
289 ; CHECK: ja [[SinkBB:.*]]
290 ; CHECK: [[FalseBB:.*]]:
291 ; CHECK: movl $22, [[s1]]
292 ; CHECK: movl $22, [[s2]]
293 ; CHECK: [[SinkBB]]:
294 ; CHECK: ja
295
296 define void @Transform(i32 *%arr, i32 *%arr2, i32 %a, i32 %b, i32 %c, i32 %n) #0 {
297 entry:
298 %cmp10 = icmp ugt i32 0, %n
299 br i1 %cmp10, label %while.body, label %while.end
300
301 while.body: ; preds = %entry, %while.body
302 %i = phi i32 [ %i_inc, %while.body ], [ 0, %entry ]
303 %arr_i = getelementptr inbounds i32, i32* %arr, i32 %i
304 %x = load i32, i32* %arr_i, align 4
305 %div = udiv i32 %x, %a
306 %cond = icmp ugt i32 %div, %a
307 %condOpp = icmp ule i32 %div, %a
308 %s1 = select i1 %cond, i32 11, i32 22
309 %s2 = select i1 %condOpp, i32 %s1, i32 %a
310 %sum = urem i32 %s1, %s2
311 store i32 %sum, i32* %arr_i, align 4
312 %i_inc = add i32 %i, 1
313 %cmp = icmp ugt i32 %i_inc, %n
314 br i1 %cmp, label %while.body, label %while.end
315
316 while.end: ; preds = %while.body, %entry
317 ret void
318 }
319
320 attributes #0 = {"target-cpu"="x86-64"}