llvm.org GIT mirror: llvm / 5f81eab
[x86] Teach the EFLAGS copy lowering to handle much more complex control flow patterns including forks, merges, and even cycles.

This tries to cover a reasonably comprehensive set of patterns that still don't require PHIs or PHI placement. The coverage was inspired by the amazing variety of patterns produced when copying EFLAGS and restoring it to implement Speculative Load Hardening. Without this patch, we simply cannot make such complex and invasive changes to x86 instruction sequences due to EFLAGS.

I've added "just" one test, but this test covers many different complexities and corner cases of this approach. It is actually more comprehensive, as far as I can tell, than anything that I have encountered in the wild on SLH. Because the test is so complex, I've tried to give somewhat thorough comments and an ASCII-art diagram of the control flows to make it a bit easier to read and maintain long-term.

Differential Revision: https://reviews.llvm.org/D49220

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@336985 91177308-0d34-0410-b5e6-96231b3b80d8

Chandler Carruth (1 year, 8 months ago)
2 changed files with 359 additions and 44 deletions.
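For readers new to this pass, the shape of the rewrite it performs can be sketched as follows. This is a hand-written, simplified MIR fragment for illustration only (it is not taken from the patch; the virtual register numbers and block label are made up): rather than copying $eflags through a general purpose register and restoring it later, the pass captures the needed condition with a SETcc next to the flags definition and re-creates the flags at each use by testing that byte, flipping the consumed condition to match.

    ; Before: $eflags is saved and restored through a GR64 copy.
    CMP64rr %0, %1, implicit-def $eflags
    %2:gr64 = COPY $eflags
    ...
    $eflags = COPY %2
    JE_1 %bb.1, implicit $eflags

    ; After (sketch): capture ZF as a byte near the def and rebuild the flags
    ; at the use. The branch flips from JE to JNE because TEST8rr recomputes
    ; ZF from the captured byte (ZF is now set only when the byte is zero).
    CMP64rr %0, %1, implicit-def $eflags
    %3:gr8 = SETEr implicit $eflags
    ...
    TEST8rr %3, %3, implicit-def $eflags
    JNE_1 %bb.1, implicit killed $eflags

The test added below exercises exactly this kind of rewrite, but threaded through nested loops and diamonds where the SETcc instructions additionally have to be hoisted to a block that dominates every use.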
@@ -26,6 +26,7 @@
 #include "X86Subtarget.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -101,7 +102,7 @@
   MachineDominatorTree *MDT;
 
   CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
-                                  MachineInstr &CopyDefI);
+                                  MachineBasicBlock::iterator CopyDefI);
 
   unsigned promoteCondToReg(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator TestPos,
@@ -355,9 +356,14 @@
     // Nothing to do for a degenerate empty function...
     return false;
 
+  // Collect the copies in RPO so that when there are chains where a copy is in
+  // turn copied again we visit the first one first. This ensures we can find
+  // viable locations for testing the original EFLAGS that dominate all the
+  // uses across complex CFGs.
   SmallVector<MachineInstr *, 4> Copies;
-  for (MachineBasicBlock &MBB : MF)
-    for (MachineInstr &MI : MBB)
+  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+  for (MachineBasicBlock *MBB : RPOT)
+    for (MachineInstr &MI : *MBB)
       if (MI.getOpcode() == TargetOpcode::COPY &&
           MI.getOperand(0).getReg() == X86::EFLAGS)
         Copies.push_back(&MI);
@@ -406,11 +412,98 @@
     if (DOp.isDead())
       continue;
 
-    MachineBasicBlock &TestMBB = *CopyDefI.getParent();
+    MachineBasicBlock *TestMBB = CopyDefI.getParent();
     auto TestPos = CopyDefI.getIterator();
     DebugLoc TestLoc = CopyDefI.getDebugLoc();
 
     LLVM_DEBUG(dbgs() << "Rewriting copy: "; CopyI->dump());
+
+    // Walk up across live-in EFLAGS to find where they were actually def'ed.
+    //
+    // This copy's def may just be part of a region of blocks covered by
+    // a single def of EFLAGS and we want to find the top of that region where
+    // possible.
+    //
+    // This is essentially a search for a *candidate* reaching definition
+    // location. We don't need to ever find the actual reaching definition here,
+    // but we want to walk up the dominator tree to find the highest point which
+    // would be viable for such a definition.
+    auto HasEFLAGSClobber = [&](MachineBasicBlock::iterator Begin,
+                                MachineBasicBlock::iterator End) {
+      // Scan backwards as we expect these to be relatively short and often find
+      // a clobber near the end.
+      return llvm::any_of(
+          llvm::reverse(llvm::make_range(Begin, End)), [&](MachineInstr &MI) {
+            // Flag any instruction (other than the copy we are
+            // currently rewriting) that defs EFLAGS.
+            return &MI != CopyI && MI.findRegisterDefOperand(X86::EFLAGS);
+          });
+    };
+    auto HasEFLAGSClobberPath = [&](MachineBasicBlock *BeginMBB,
+                                    MachineBasicBlock *EndMBB) {
+      assert(MDT->dominates(BeginMBB, EndMBB) &&
+             "Only support paths down the dominator tree!");
+      SmallPtrSet<MachineBasicBlock *, 4> Visited;
+      SmallVector<MachineBasicBlock *, 4> Worklist;
+      // We terminate at the beginning. No need to scan it.
+      Visited.insert(BeginMBB);
+      Worklist.push_back(EndMBB);
+      do {
+        auto *MBB = Worklist.pop_back_val();
+        for (auto *PredMBB : MBB->predecessors()) {
+          if (!Visited.insert(PredMBB).second)
+            continue;
+          if (HasEFLAGSClobber(PredMBB->begin(), PredMBB->end()))
+            return true;
+          // Enqueue this block to walk its predecessors.
+          Worklist.push_back(PredMBB);
+        }
+      } while (!Worklist.empty());
+      // No clobber found along a path from the begin to end.
+      return false;
+    };
+    while (TestMBB->isLiveIn(X86::EFLAGS) && !TestMBB->pred_empty() &&
+           !HasEFLAGSClobber(TestMBB->begin(), TestPos)) {
+      // Find the nearest common dominator of the predecessors, as
+      // that will be the best candidate to hoist into.
+      MachineBasicBlock *HoistMBB =
+          std::accumulate(std::next(TestMBB->pred_begin()), TestMBB->pred_end(),
+                          *TestMBB->pred_begin(),
+                          [&](MachineBasicBlock *LHS, MachineBasicBlock *RHS) {
+                            return MDT->findNearestCommonDominator(LHS, RHS);
+                          });
+
+      // Now we need to scan all predecessors that may be reached along paths to
+      // the hoist block. A clobber anywhere in any of these blocks prevents the hoist.
+      // Note that this even handles loops because we require *no* clobbers.
+      if (HasEFLAGSClobberPath(HoistMBB, TestMBB))
+        break;
+
+      // We also need the terminators to not sneakily clobber flags.
+      if (HasEFLAGSClobber(HoistMBB->getFirstTerminator()->getIterator(),
+                           HoistMBB->instr_end()))
+        break;
+
+      // We found a viable location, hoist our test position to it.
+      TestMBB = HoistMBB;
+      TestPos = TestMBB->getFirstTerminator()->getIterator();
+      // Clear the debug location as it would just be confusing after hoisting.
+      TestLoc = DebugLoc();
+    }
+    LLVM_DEBUG({
+      auto DefIt = llvm::find_if(
+          llvm::reverse(llvm::make_range(TestMBB->instr_begin(), TestPos)),
+          [&](MachineInstr &MI) {
+            return MI.findRegisterDefOperand(X86::EFLAGS);
+          });
+      if (DefIt.base() != TestMBB->instr_begin()) {
+        dbgs() << "  Using EFLAGS defined by: ";
+        DefIt->dump();
+      } else {
+        dbgs() << "  Using live-in flags for BB:\n";
+        TestMBB->dump();
+      }
+    });
 
     // While rewriting uses, we buffer jumps and rewrite them in a second pass
     // because doing so will perturb the CFG that we are walking to find the
@@ -422,7 +515,7 @@
     // very few of them and we expect to not revisit the same copy definition
     // many times. If either of those change sufficiently we could build a map
     // of these up front instead.
-    CondRegArray CondRegs = collectCondsInRegs(TestMBB, CopyDefI);
+    CondRegArray CondRegs = collectCondsInRegs(*TestMBB, TestPos);
 
     // Collect the basic blocks we need to scan. Typically this will just be
     // a single basic block but we may have to scan multiple blocks if the
@@ -430,7 +523,6 @@
     SmallVector<MachineBasicBlock *, 2> Blocks;
     SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks;
     Blocks.push_back(&MBB);
-    VisitedBlocks.insert(&MBB);
 
     do {
       MachineBasicBlock &UseMBB = *Blocks.pop_back_val();
@@ -438,36 +530,32 @@
       // Track if/when we find a kill of the flags in this block.
       bool FlagsKilled = false;
 
-      // We currently don't do any PHI insertion and so we require that the
-      // test basic block dominates all of the use basic blocks.
-      //
-      // We could in theory do PHI insertion here if it becomes useful by just
-      // taking undef values in along every edge that we don't trace this
-      // EFLAGS copy along. This isn't as bad as fully general PHI insertion,
-      // but still seems like a great deal of complexity.
-      //
-      // Because it is theoretically possible that some earlier MI pass or
-      // other lowering transformation could induce this to happen, we do
-      // a hard check even in non-debug builds here.
-      if (&TestMBB != &UseMBB && !MDT->dominates(&TestMBB, &UseMBB)) {
-        LLVM_DEBUG({
-          dbgs() << "ERROR: Encountered use that is not dominated by our test "
-                    "basic block! Rewriting this would require inserting PHI "
-                    "nodes to track the flag state across the CFG.\n\nTest "
-                    "block:\n";
-          TestMBB.dump();
-          dbgs() << "Use block:\n";
-          UseMBB.dump();
-        });
-        report_fatal_error("Cannot lower EFLAGS copy when original copy def "
-                           "does not dominate all uses.");
-      }
-
-      for (auto MII = &UseMBB == &MBB ? std::next(CopyI->getIterator())
-                                      : UseMBB.instr_begin(),
+      // In most cases, we walk from the beginning to the end of the block. But
+      // when the block is the same block as the copy is from, we will visit it
+      // twice. The first time we start from the copy and go to the end. The
+      // second time we start from the beginning and go to the copy. This lets
+      // us handle copies inside of cycles.
+      // FIXME: This loop is *super* confusing. This is at least in part
+      // a symptom of all of this routine needing to be refactored into
+      // documentable components. Once done, there may be a better way to write
+      // this loop.
+      for (auto MII = (&UseMBB == &MBB && !VisitedBlocks.count(&UseMBB))
+                          ? std::next(CopyI->getIterator())
+                          : UseMBB.instr_begin(),
                 MIE = UseMBB.instr_end();
            MII != MIE;) {
         MachineInstr &MI = *MII++;
+        // If we are in the original copy block and encounter either the copy
+        // def or the copy itself, break so that we don't re-process any part of
+        // the block or process the instructions in the range that was copied
+        // over.
+        if (&MI == CopyI || &MI == &CopyDefI) {
+          assert(&UseMBB == &MBB && VisitedBlocks.count(&MBB) &&
+                 "Should only encounter these on the second pass over the "
+                 "original block.");
+          break;
+        }
+
         MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
         if (!FlagUse) {
           if (MI.findRegisterDefOperand(X86::EFLAGS)) {
@@ -511,10 +599,10 @@
 
         // Otherwise we can just rewrite in-place.
         if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) {
-          rewriteCMov(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+          rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
         } else if (X86::getCondFromSETOpc(MI.getOpcode()) !=
                    X86::COND_INVALID) {
-          rewriteSetCC(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+          rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
         } else if (MI.getOpcode() == TargetOpcode::COPY) {
           rewriteCopy(MI, *FlagUse, CopyDefI);
         } else {
@@ -537,13 +625,13 @@
           case X86::SETB_C64r:
             // Use custom lowering for arithmetic that is merely extending the
            // carry flag. We model this as the SETB_C* pseudo instructions.
-            rewriteSetCarryExtended(TestMBB, TestPos, TestLoc, MI, *FlagUse,
+            rewriteSetCarryExtended(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
                                     CondRegs);
             break;
 
           default:
             // Generically handle remaining uses as arithmetic instructions.
-            rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse,
+            rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
                               CondRegs);
             break;
           }
@@ -563,8 +651,38 @@
       // and queue those up for processing.
       for (MachineBasicBlock *SuccMBB : UseMBB.successors())
         if (SuccMBB->isLiveIn(X86::EFLAGS) &&
-            VisitedBlocks.insert(SuccMBB).second)
+            VisitedBlocks.insert(SuccMBB).second) {
+          // We currently don't do any PHI insertion and so we require that the
+          // test basic block dominates all of the use basic blocks. Further, we
+          // can't have a cycle from the test block back to itself as that would
+          // create a cycle requiring a PHI to break it.
+          //
+          // We could in theory do PHI insertion here if it becomes useful by
+          // just taking undef values in along every edge that we don't trace
+          // this EFLAGS copy along. This isn't as bad as fully general PHI
+          // insertion, but still seems like a great deal of complexity.
+          //
+          // Because it is theoretically possible that some earlier MI pass or
+          // other lowering transformation could induce this to happen, we do
+          // a hard check even in non-debug builds here.
+          if (SuccMBB == TestMBB || !MDT->dominates(TestMBB, SuccMBB)) {
+            LLVM_DEBUG({
+              dbgs()
+                  << "ERROR: Encountered use that is not dominated by our test "
+                     "basic block! Rewriting this would require inserting PHI "
+                     "nodes to track the flag state across the CFG.\n\nTest "
+                     "block:\n";
+              TestMBB->dump();
+              dbgs() << "Use block:\n";
+              SuccMBB->dump();
+            });
+            report_fatal_error(
+                "Cannot lower EFLAGS copy when original copy def "
+                "does not dominate all uses.");
+          }
+
           Blocks.push_back(SuccMBB);
+        }
     } while (!Blocks.empty());
 
     // Now rewrite the jumps that use the flags. These we handle specially
@@ -579,7 +697,7 @@
       else
         LastJmpMBB = JmpI->getParent();
 
-      rewriteCondJmp(TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
+      rewriteCondJmp(*TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
     }
 
     // FIXME: Mark the last use of EFLAGS before the copy's def as a kill if
@@ -603,14 +721,13 @@
 
 /// Collect any conditions that have already been set in registers so that we
 /// can re-use them rather than adding duplicates.
-CondRegArray
-X86FlagsCopyLoweringPass::collectCondsInRegs(MachineBasicBlock &MBB,
-                                             MachineInstr &CopyDefI) {
+CondRegArray X86FlagsCopyLoweringPass::collectCondsInRegs(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator TestPos) {
   CondRegArray CondRegs = {};
 
   // Scan backwards across the range of instructions with live EFLAGS.
-  for (MachineInstr &MI : llvm::reverse(
-           llvm::make_range(MBB.instr_begin(), CopyDefI.getIterator()))) {
+  for (MachineInstr &MI :
+       llvm::reverse(llvm::make_range(MBB.begin(), TestPos))) {
     X86::CondCode Cond = X86::getCondFromSETOpc(MI.getOpcode());
     if (Cond != X86::COND_INVALID && MI.getOperand(0).isReg() &&
         TRI->isVirtualRegister(MI.getOperand(0).getReg()))
@@ -79,6 +79,12 @@
 }
 
 define i64 @test_branch_with_interleaved_livein_and_kill(i64 %a, i64 %b) {
+entry:
+  call void @foo()
+  ret i64 0
+}
+
+define i64 @test_mid_cycle_copies(i64 %a, i64 %b) {
 entry:
   call void @foo()
   ret i64 0
@@ -737,3 +743,195 @@
     RET 0, $rax
 
 ...
+---
+# This test case is designed to exercise a particularly challenging situation:
+# when the flags are copied and restored *inside* of a complex and cyclic CFG
+# all of which have live-in flags. To correctly handle this case we have to walk
+# up the dominator tree and locate a viable reaching definition location,
+# checking for clobbers along any path. The CFG for this function looks like the
+# following diagram, control flowing out the bottom of blocks and in the top:
+#
+#   bb.0
+#    | _________________________
+#    |/                         \
+#   bb.1                         |
+#    |\_________                 |
+#    | __       \  _______       |
+#    |/  \       |/       \      |
+#   bb.2 |       bb.4      |     |
+#    |\__/      /    \     |     |
+#    |         /      \    |     |
+#   bb.3     bb.5    bb.6  |     |
+#    |         \      /    |     |
+#    |          \    /     |     |
+#    |           bb.7      |     |
+#    | _________/    \____/      |
+#    |/                          |
+#   bb.8                         |
+#    |\_________________________/
+#    |
+#   bb.9
+#
+# We set EFLAGS in bb.0, clobber them in bb.3, and copy them in bb.2 and bb.6.
+# Because of the cycles this requires hoisting the `SETcc` instructions to
+# capture the flags for the bb.6 copy to bb.1 and using them for the copy in
+# `bb.2` as well despite the clobber in `bb.3`. The clobber in `bb.3` also
+# prevents hoisting the `SETcc`s up to `bb.0`.
+#
+# Throughout the test we use branch instructions that are totally bogus (as the
+# flags are obviously not changing!) but this is just to allow us to send
+# a small but complex CFG structure through the backend and force it to choose
+# plausible lowering decisions based on the core CFG presented, regardless of
+# the futility of the actual branches.
+name: test_mid_cycle_copies
+# CHECK-LABEL: name: test_mid_cycle_copies
+liveins:
+  - { reg: '$rdi', virtual-reg: '%0' }
+  - { reg: '$rsi', virtual-reg: '%1' }
+body: |
+  bb.0:
+    successors: %bb.1
+    liveins: $rdi, $rsi
+
+    %0:gr64 = COPY $rdi
+    %1:gr64 = COPY $rsi
+    CMP64rr %0, %1, implicit-def $eflags
+    ; CHECK: bb.0:
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    ; CHECK: CMP64rr %0, %1, implicit-def $eflags
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    JMP_1 %bb.1
+
+  bb.1:
+    successors: %bb.2, %bb.4
+    liveins: $eflags
+
+    ; Outer loop header, target for one set of hoisting.
+    JE_1 %bb.2, implicit $eflags
+    JMP_1 %bb.4
+    ; CHECK: bb.1:
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    ; CHECK: %[[A_REG:[^:]*]]:gr8 = SETAr implicit $eflags
+    ; CHECK-NEXT: %[[E_REG:[^:]*]]:gr8 = SETEr implicit $eflags
+    ; CHECK-NEXT: %[[B_REG:[^:]*]]:gr8 = SETBr implicit $eflags
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+  bb.2:
+    successors: %bb.2, %bb.3
+    liveins: $eflags
+
+    ; Inner loop with a local copy. We should eliminate this but can't hoist.
+    %2:gr64 = COPY $eflags
+    $eflags = COPY %2
+    JE_1 %bb.2, implicit $eflags
+    JMP_1 %bb.3
+    ; CHECK: bb.2:
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    ; CHECK: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+    ; CHECK-NEXT: JNE_1 %bb.2, implicit killed $eflags
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+  bb.3:
+    successors: %bb.8
+    liveins: $eflags
+
+    ; Use and then clobber $eflags. Then hop to the outer loop latch.
+    %3:gr64 = ADC64ri32 %0, 42, implicit-def dead $eflags, implicit $eflags
+    ; CHECK: bb.3:
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    ; CHECK: dead %{{[^:]*}}:gr8 = ADD8ri %[[B_REG]], 255, implicit-def $eflags
+    ; CHECK-NEXT: %3:gr64 = ADC64ri32 %0, 42, implicit-def{{( dead)?}} $eflags, implicit{{( killed)?}} $eflags
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %3
+    JMP_1 %bb.8
+
+  bb.4:
+    successors: %bb.5, %bb.6
+    liveins: $eflags
+
+    ; Another inner loop, this one with a diamond.
+    JE_1 %bb.5, implicit $eflags
+    JMP_1 %bb.6
+    ; CHECK: bb.4:
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    ; CHECK: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+    ; CHECK-NEXT: JNE_1 %bb.5, implicit killed $eflags
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+  bb.5:
+    successors: %bb.7
+    liveins: $eflags
+
+    ; Just use $eflags on this side of the diamond.
+    %4:gr64 = CMOVA64rr %0, %1, implicit $eflags
+    ; CHECK: bb.5:
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags
+    ; CHECK-NEXT: %4:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %4
+    JMP_1 %bb.7
+
+  bb.6:
+    successors: %bb.7
+    liveins: $eflags
+
+    ; Use, copy, and then use $eflags again.
+    %5:gr64 = CMOVA64rr %0, %1, implicit $eflags
+    ; CHECK: bb.6:
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags
+    ; CHECK-NEXT: %5:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %5
+
+    %6:gr64 = COPY $eflags
+    $eflags = COPY %6:gr64
+
+    %7:gr64 = CMOVA64rr %0, %1, implicit $eflags
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    ; CHECK: TEST8rr %[[A_REG]], %[[A_REG]], implicit-def $eflags
+    ; CHECK-NEXT: %7:gr64 = CMOVNE64rr %0, %1, implicit killed $eflags
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    MOV64mr $rsp, 1, $noreg, -16, $noreg, killed %7
+    JMP_1 %bb.7
+
+  bb.7:
+    successors: %bb.4, %bb.8
+    liveins: $eflags
+
+    ; Inner loop latch.
+    JE_1 %bb.4, implicit $eflags
+    JMP_1 %bb.8
+    ; CHECK: bb.7:
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    ; CHECK: TEST8rr %[[E_REG]], %[[E_REG]], implicit-def $eflags
+    ; CHECK-NEXT: JNE_1 %bb.4, implicit killed $eflags
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+  bb.8:
+    successors: %bb.1, %bb.9
+
+    ; Outer loop latch. Note that we cannot have EFLAGS live-in here as that
+    ; would immediately require PHIs.
+    CMP64rr %0, %1, implicit-def $eflags
+    JE_1 %bb.1, implicit $eflags
+    JMP_1 %bb.9
+    ; CHECK: bb.8:
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+    ; CHECK: CMP64rr %0, %1, implicit-def $eflags
+    ; CHECK-NEXT: JE_1 %bb.1, implicit $eflags
+    ; CHECK-NOT: COPY{{( killed)?}} $eflags
+
+  bb.9:
+    liveins: $eflags
+
+    ; And we're done.
+    %8:gr64 = CMOVE64rr %0, %1, implicit killed $eflags
+    $rax = COPY %8
+    RET 0, $rax
+    ; CHECK: bb.9:
+    ; CHECK-NOT: $eflags
+    ; CHECK: %8:gr64 = CMOVE64rr %0, %1, implicit killed $eflags
+
+...