llvm.org GIT mirror llvm / 7384652
[CodeGen] Print "%vreg0" as "%0" in both MIR and debug output As part of the unification of the debug format and the MIR format, avoid printing "vreg" for virtual registers (which is one of the current MIR possibilities). Basically: * find . \( -name "*.mir" -o -name "*.cpp" -o -name "*.h" -o -name "*.ll" \) -type f -print0 | xargs -0 sed -i '' -E "s/%vreg([0-9]+)/%\1/g" * grep -nr '%vreg' . and fix if needed * find . \( -name "*.mir" -o -name "*.cpp" -o -name "*.h" -o -name "*.ll" \) -type f -print0 | xargs -0 sed -i '' -E "s/ vreg([0-9]+)/ %\1/g" * grep -nr 'vreg[0-9]\+' . and fix if needed Differential Revision: https://reviews.llvm.org/D40420 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@319427 91177308-0d34-0410-b5e6-96231b3b80d8 Francis Visoiu Mistrih 2 years ago
79 changed file(s) with 525 addition(s) and 525 deletion(s). Raw diff Collapse all Expand all
115115 /// the same register. In that case, the instruction may depend on those
116116 /// operands reading the same dont-care value. For example:
117117 ///
118 /// %vreg1 = XOR %vreg2, %vreg2
119 ///
120 /// Any register can be used for %vreg2, and its value doesn't matter, but
118 /// %1 = XOR %2, %2
119 ///
120 /// Any register can be used for %2, and its value doesn't matter, but
121121 /// the two operands must be the same register.
122122 ///
123123 bool IsUndef : 1;
421421 /// and \p DefIdx.
422422 /// \p [out] InputRegs of the equivalent REG_SEQUENCE. Each element of
423423 /// the list is modeled as .
424 /// E.g., REG_SEQUENCE vreg1:sub1, sub0, vreg2, sub1 would produce
424 /// E.g., REG_SEQUENCE %1:sub1, sub0, %2, sub1 would produce
425425 /// two elements:
426 /// - vreg1:sub1, sub0
427 /// - vreg2<:0>, sub1
426 /// - %1:sub1, sub0
427 /// - %2<:0>, sub1
428428 ///
429429 /// \returns true if it is possible to build such an input sequence
430430 /// with the pair \p MI, \p DefIdx. False otherwise.
441441 /// Build the equivalent inputs of a EXTRACT_SUBREG for the given \p MI
442442 /// and \p DefIdx.
443443 /// \p [out] InputReg of the equivalent EXTRACT_SUBREG.
444 /// E.g., EXTRACT_SUBREG vreg1:sub1, sub0, sub1 would produce:
445 /// - vreg1:sub1, sub0
444 /// E.g., EXTRACT_SUBREG %1:sub1, sub0, sub1 would produce:
445 /// - %1:sub1, sub0
446446 ///
447447 /// \returns true if it is possible to build such an input sequence
448448 /// with the pair \p MI, \p DefIdx. False otherwise.
459459 /// and \p DefIdx.
460460 /// \p [out] BaseReg and \p [out] InsertedReg contain
461461 /// the equivalent inputs of INSERT_SUBREG.
462 /// E.g., INSERT_SUBREG vreg0:sub0, vreg1:sub1, sub3 would produce:
463 /// - BaseReg: vreg0:sub0
464 /// - InsertedReg: vreg1:sub1, sub3
462 /// E.g., INSERT_SUBREG %0:sub0, %1:sub1, sub3 would produce:
463 /// - BaseReg: %0:sub0
464 /// - InsertedReg: %1:sub1, sub3
465465 ///
466466 /// \returns true if it is possible to build such an input sequence
467467 /// with the pair \p MI, \p DefIdx. False otherwise.
11371137 ///
11381138 /// The format is:
11391139 /// %noreg - NoRegister
1140 /// %vreg5 - a virtual register.
1141 /// %vreg5:sub_8bit - a virtual register with sub-register index (with TRI).
1140 /// %5 - a virtual register.
1141 /// %5:sub_8bit - a virtual register with sub-register index (with TRI).
11421142 /// %eax - a physical register
11431143 /// %physreg17 - a physical register when no TRI instance given.
11441144 ///
1616 /// when subregisters are involved.
1717 ///
1818 /// Example:
19 /// %vreg0 = some definition
20 /// %vreg1 = IMPLICIT_DEF
21 /// %vreg2 = REG_SEQUENCE %vreg0, sub0, %vreg1, sub1
22 /// %vreg3 = EXTRACT_SUBREG %vreg2, sub1
23 /// = use %vreg3
24 /// The %vreg0 definition is dead and %vreg3 contains an undefined value.
19 /// %0 = some definition
20 /// %1 = IMPLICIT_DEF
21 /// %2 = REG_SEQUENCE %0, sub0, %1, sub1
22 /// %3 = EXTRACT_SUBREG %2, sub1
23 /// = use %3
24 /// The %0 definition is dead and %3 contains an undefined value.
2525 //
2626 //===----------------------------------------------------------------------===//
2727
697697 // Check if any of the regunits are live beyond the end of RI. That could
698698 // happen when a physreg is defined as a copy of a virtreg:
699699 //
700 // %eax = COPY %vreg5
701 // FOO %vreg5 <--- MI, cancel kill because %eax is live.
700 // %eax = COPY %5
701 // FOO %5 <--- MI, cancel kill because %eax is live.
702702 // BAR %eax
703703 //
704 // There should be no kill flag on FOO when %vreg5 is rewritten as %eax.
704 // There should be no kill flag on FOO when %5 is rewritten as %eax.
705705 for (auto &RUP : RU) {
706706 const LiveRange &RURange = *RUP.first;
707707 LiveRange::const_iterator &I = RUP.second;
718718 // When reading a partial undefined value we must not add a kill flag.
719719 // The regalloc might have used the undef lane for something else.
720720 // Example:
721 // %vreg1 = ... ; R32: %vreg1
722 // %vreg2:high16 = ... ; R64: %vreg2
723 // = read %vreg2 ; R64: %vreg2
724 // = read %vreg1 ; R32: %vreg1
725 // The flag is correct for %vreg2, but the register allocator may
726 // assign R0L to %vreg1, and R0 to %vreg2 because the low 32bits of R0
727 // are actually never written by %vreg2. After assignment the
721 // %1 = ... ; R32: %1
722 // %2:high16 = ... ; R64: %2
723 // = read %2 ; R64: %2
724 // = read %1 ; R32: %1
725 // The flag is correct for %2, but the register allocator may
726 // assign R0L to %1, and R0 to %2 because the low 32bits of R0
727 // are actually never written by %2. After assignment the
728728 // flag at the read instruction is invalid.
729729 LaneBitmask DefinedLanesMask;
730730 if (!SRs.empty()) {
19601960 if (MOI->isDef()) {
19611961 if (Sub != 0) {
19621962 hasSubRegDef = true;
1963 // An operand vreg0:sub0 reads vreg0:sub1..n. Invert the lane
1963 // An operand %0:sub0 reads %0:sub1..n. Invert the lane
19641964 // mask for subregister defs. Read-undef defs will be handled by
19651965 // readsReg below.
19661966 SLM = ~SLM;
14521452 // only the first copy is considered.
14531453 //
14541454 // e.g.
1455 // %vreg1 = COPY %vreg0
1456 // %vreg2 = COPY %vreg0:sub1
1457 //
1458 // Should replace %vreg2 uses with %vreg1:sub1
1455 // %1 = COPY %0
1456 // %2 = COPY %0:sub1
1457 //
1458 // Should replace %2 uses with %1:sub1
14591459 bool PeepholeOptimizer::foldRedundantCopy(
14601460 MachineInstr *MI, SmallSet &CopySrcRegs,
14611461 DenseMap &CopyMIs) {
16201620 /// from the phi. For example, if there is a recurrence of
16211621 ///
16221622 /// LoopHeader:
1623 /// %vreg1 = phi(%vreg0, %vreg100)
1623 /// %1 = phi(%0, %100)
16241624 /// LoopLatch:
1625 /// %vreg0 = ADD %vreg2, %vreg1
1625 /// %0 = ADD %2, %1
16261626 ///
1627 /// , the fact that vreg0 and vreg2 are in the same tied operands set makes
1627 /// , the fact that %0 and %2 are in the same tied operands set makes
16281628 /// the coalescing of copy instruction generated from the phi in
1629 /// LoopHeader(i.e. %vreg1 = COPY %vreg0) impossible, because %vreg1 and
1630 /// %vreg2 have overlapping live range. This introduces additional move
1631 /// instruction to the final assembly. However, if we commute %vreg2 and
1632 /// %vreg1 of ADD instruction, the redundant move instruction can be
1629 /// LoopHeader(i.e. %1 = COPY %0) impossible, because %1 and
1630 /// %2 have overlapping live range. This introduces additional move
1631 /// instruction to the final assembly. However, if we commute %2 and
1632 /// %1 of ADD instruction, the redundant move instruction can be
16331633 /// avoided.
16341634 bool PeepholeOptimizer::optimizeRecurrence(MachineInstr &PHI) {
16351635 SmallSet TargetRegs;
13951395 /// Such sequences are created in 2 scenarios:
13961396 ///
13971397 /// Scenario #1:
1398 /// vreg0 is evicted from physreg0 by vreg1.
1399 /// Evictee vreg0 is intended for region splitting with split candidate
1400 /// physreg0 (the reg vreg0 was evicted from).
1398 /// %0 is evicted from physreg0 by %1.
1399 /// Evictee %0 is intended for region splitting with split candidate
1400 /// physreg0 (the reg %0 was evicted from).
14011401 /// Region splitting creates a local interval because of interference with the
1402 /// evictor vreg1 (normally region spliitting creates 2 interval, the "by reg"
1402 /// evictor %1 (normally region splitting creates 2 intervals, the "by reg"
14031403 /// and "by stack" intervals and local interval created when interference
14041404 /// occurs).
1405 /// One of the split intervals ends up evicting vreg2 from physreg1.
1406 /// Evictee vreg2 is intended for region splitting with split candidate
1405 /// One of the split intervals ends up evicting %2 from physreg1.
1406 /// Evictee %2 is intended for region splitting with split candidate
14071407 /// physreg1.
1408 /// One of the split intervals ends up evicting vreg3 from physreg2, etc.
1408 /// One of the split intervals ends up evicting %3 from physreg2, etc.
14091409 ///
14101410 /// Scenario #2
1411 /// vreg0 is evicted from physreg0 by vreg1.
1412 /// vreg2 is evicted from physreg2 by vreg3 etc.
1413 /// Evictee vreg0 is intended for region splitting with split candidate
1411 /// %0 is evicted from physreg0 by %1.
1412 /// %2 is evicted from physreg2 by %3 etc.
1413 /// Evictee %0 is intended for region splitting with split candidate
14141414 /// physreg1.
14151415 /// Region splitting creates a local interval because of interference with the
1416 /// evictor vreg1.
1417 /// One of the split intervals ends up evicting back original evictor vreg1
1418 /// from physreg0 (the reg vreg0 was evicted from).
1419 /// Another evictee vreg2 is intended for region splitting with split candidate
1416 /// evictor %1.
1417 /// One of the split intervals ends up evicting back original evictor %1
1418 /// from physreg0 (the reg %0 was evicted from).
1419 /// Another evictee %2 is intended for region splitting with split candidate
14201420 /// physreg1.
1421 /// One of the split intervals ends up evicting vreg3 from physreg2, etc.
1421 /// One of the split intervals ends up evicting %3 from physreg2, etc.
14221422 ///
14231423 /// \param Evictee The register considered to be split.
14241424 /// \param Cand The split candidate that determines the physical register
227227 /// flag.
228228 /// This can happen when undef uses were previously concealed by a copy
229229 /// which we coalesced. Example:
230 /// %vreg0:sub0 = ...
231 /// %vreg1 = COPY %vreg0 <-- Coalescing COPY reveals undef
232 /// = use %vreg1:sub1 <-- hidden undef use
230 /// %0:sub0 = ...
231 /// %1 = COPY %0 <-- Coalescing COPY reveals undef
232 /// = use %1:sub1 <-- hidden undef use
233233 void addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx,
234234 MachineOperand &MO, unsigned SubRegIdx);
235235
11421142 NewMI.setDebugLoc(DL);
11431143
11441144 // In a situation like the following:
1145 // %vreg0:subreg = instr ; DefMI, subreg = DstIdx
1146 // %vreg1 = copy %vreg0:subreg ; CopyMI, SrcIdx = 0
1147 // instead of widening %vreg1 to the register class of %vreg0 simply do:
1148 // %vreg1 = instr
1145 // %0:subreg = instr ; DefMI, subreg = DstIdx
1146 // %1 = copy %0:subreg ; CopyMI, SrcIdx = 0
1147 // instead of widening %1 to the register class of %0 simply do:
1148 // %1 = instr
11491149 const TargetRegisterClass *NewRC = CP.getNewRC();
11501150 if (DstIdx != 0) {
11511151 MachineOperand &DefMO = NewMI.getOperand(0);
12251225 // This could happen if the rematerialization instruction is rematerializing
12261226 // more than actually is used in the register.
12271227 // An example would be:
1228 // vreg1 = LOAD CONSTANTS 5, 8 ; Loading both 5 and 8 in different subregs
1228 // %1 = LOAD CONSTANTS 5, 8 ; Loading both 5 and 8 in different subregs
12291229 // ; Copying only part of the register here, but the rest is undef.
1230 // vreg2:sub_16bit = COPY vreg1:sub_16bit
1230 // %2:sub_16bit = COPY %1:sub_16bit
12311231 // ==>
12321232 // ; Materialize all the constants but only using one
1233 // vreg2 = LOAD_CONSTANTS 5, 8
1233 // %2 = LOAD_CONSTANTS 5, 8
12341234 //
12351235 // at this point for the part that wasn't defined before we could have
12361236 // subranges missing the definition.
12531253
12541254 // Make sure that the subrange for resultant undef is removed
12551255 // For example:
1256 // vreg1:sub1 = LOAD CONSTANT 1
1257 // vreg2 = COPY vreg1
1256 // %1:sub1 = LOAD CONSTANT 1
1257 // %2 = COPY %1
12581258 // ==>
1259 // vreg2:sub1 = LOAD CONSTANT 1
1260 // ; Correct but need to remove the subrange for vreg2:sub0
1259 // %2:sub1 = LOAD CONSTANT 1
1260 // ; Correct but need to remove the subrange for %2:sub0
12611261 // ; as it is now undef
12621262 if (NewIdx != 0 && DstInt.hasSubRanges()) {
12631263 // The affected subregister segments can be removed.
12911291 // Otherwise, variables that live through may miss some
12921292 // interferences, thus creating invalid allocation.
12931293 // E.g., i386 code:
1294 // vreg1 = somedef ; vreg1 GR8
1295 // vreg2 = remat ; vreg2 GR32
1296 // CL = COPY vreg2.sub_8bit
1297 // = somedef vreg1 ; vreg1 GR8
1294 // %1 = somedef ; %1 GR8
1295 // %2 = remat ; %2 GR32
1296 // CL = COPY %2.sub_8bit
1297 // = somedef %1 ; %1 GR8
12981298 // =>
1299 // vreg1 = somedef ; vreg1 GR8
1299 // %1 = somedef ; %1 GR8
13001300 // ECX = remat ; CL
1301 // = somedef vreg1 ; vreg1 GR8
1302 // vreg1 will see the inteferences with CL but not with CH since
1301 // = somedef %1 ; %1 GR8
1302 // %1 will see the interferences with CL but not with CH since
13031303 // no live-ranges would have been created for ECX.
13041304 // Fix that!
13051305 SlotIndex NewMIIdx = LIS->getInstructionIndex(NewMI);
13521352 // ProcessImpicitDefs may leave some copies of values, it only removes
13531353 // local variables. When we have a copy like:
13541354 //
1355 // %vreg1 = COPY %vreg2
1355 // %1 = COPY %2
13561356 //
1357 // We delete the copy and remove the corresponding value number from %vreg1.
1357 // We delete the copy and remove the corresponding value number from %1.
13581358 // Any uses of that value number are marked as .
13591359
13601360 // Note that we do not query CoalescerPair here but redo isMoveInstr as the
18191819 MachineInstr *CopyMI;
18201820 if (CP.isFlipped()) {
18211821 // Physreg is copied into vreg
1822 // %vregY = COPY %x
1822 // %y = COPY %physreg_x
18231823 // ... //< no other def of %x here
1824 // use %vregY
1824 // use %y
18251825 // =>
18261826 // ...
18271827 // use %x
18281828 CopyMI = MRI->getVRegDef(SrcReg);
18291829 } else {
18301830 // VReg is copied into physreg:
1831 // %vregX = def
1831 // %y = def
18321832 // ... //< no other def or use of %y here
1833 // %y = COPY %vregX
1833 // %y = COPY %physreg_x
18341834 // =>
18351835 // %y = def
18361836 // ...
99 /// Rename independent subregisters looks for virtual registers with
1010 /// independently used subregisters and renames them to new virtual registers.
1111 /// Example: In the following:
12 /// %vreg0:sub0 = ...
13 /// %vreg0:sub1 = ...
14 /// use %vreg0:sub0
15 /// %vreg0:sub0 = ...
16 /// use %vreg0:sub0
17 /// use %vreg0:sub1
12 /// %0:sub0 = ...
13 /// %0:sub1 = ...
14 /// use %0:sub0
15 /// %0:sub0 = ...
16 /// use %0:sub0
17 /// use %0:sub1
1818 /// sub0 and sub1 are never used together, and we have two independent sub0
1919 /// definitions. This pass will rename to:
20 /// %vreg0:sub0 = ...
21 /// %vreg1:sub1 = ...
22 /// use %vreg1:sub1
23 /// %vreg2:sub1 = ...
24 /// use %vreg2:sub1
25 /// use %vreg0:sub0
20 /// %0:sub0 = ...
21 /// %1:sub1 = ...
22 /// use %1:sub1
23 /// %2:sub1 = ...
24 /// use %2:sub1
25 /// use %0:sub0
2626 //
2727 //===----------------------------------------------------------------------===//
2828
13741374 continue;
13751375 // The problem here can be that the new register may have been created
13761376 // for a partially defined original register. For example:
1377 // %vreg827:subreg_hireg = ...
1377 // %0:subreg_hireg = ...
13781378 // ...
1379 // %vreg828 = COPY %vreg827
1379 // %1 = COPY %0
13801380 if (S.empty())
13811381 continue;
13821382 SubLRC.reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT,
9292 else if (TargetRegisterInfo::isStackSlot(Reg))
9393 OS << "SS#" << TargetRegisterInfo::stackSlot2Index(Reg);
9494 else if (TargetRegisterInfo::isVirtualRegister(Reg))
95 OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Reg);
95 OS << '%' << TargetRegisterInfo::virtReg2Index(Reg);
9696 else if (TRI && Reg < TRI->getNumRegs()) {
9797 OS << '%';
9898 printLowerCase(TRI->getName(Reg), OS);
133133 Printable printVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
134134 return Printable([Unit, TRI](raw_ostream &OS) {
135135 if (TRI && TRI->isVirtualRegister(Unit)) {
136 OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Unit);
136 OS << '%' << TargetRegisterInfo::virtReg2Index(Unit);
137137 } else {
138138 OS << printRegUnit(Unit, TRI);
139139 }
28002800 LiveIntervals *LIS) const {
28012801 // This is a bit of a hack. Consider this instruction:
28022802 //
2803 // %vreg0 = COPY %sp; GPR64all:%vreg0
2803 // %0 = COPY %sp; GPR64all:%0
28042804 //
28052805 // We explicitly chose GPR64all for the virtual register so such a copy might
28062806 // be eliminated by RegisterCoalescer. However, that may not be possible, and
2807 // %vreg0 may even spill. We can't spill %sp, and since it is in the GPR64all
2807 // %0 may even spill. We can't spill %sp, and since it is in the GPR64all
28082808 // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
28092809 //
2810 // To prevent that, we are going to constrain the %vreg0 register class here.
2810 // To prevent that, we are going to constrain the %0 register class here.
28112811 //
28122812 //
28132813 //
28292829 // Handle the case where a copy is being spilled or filled but the source
28302830 // and destination register class don't match. For example:
28312831 //
2832 // %vreg0 = COPY %xzr; GPR64common:%vreg0
2832 // %0 = COPY %xzr; GPR64common:%0
28332833 //
28342834 // In this case we can still safely fold away the COPY and generate the
28352835 // following spill code:
28392839 // This also eliminates spilled cross register class COPYs (e.g. between x and
28402840 // d regs) of the same size. For example:
28412841 //
2842 // %vreg0 = COPY %vreg1; GPR64:%vreg0, FPR64:%vreg1
2842 // %0 = COPY %1; GPR64:%0, FPR64:%1
28432843 //
28442844 // will be filled as
28452845 //
2846 // LDRDui %vreg0, fi<#0>
2846 // LDRDui %0, fi<#0>
28472847 //
28482848 // instead of
28492849 //
2850 // LDRXui %vregTemp, fi<#0>
2851 // %vreg0 = FMOV %vregTemp
2850 // LDRXui %Temp, fi<#0>
2851 // %0 = FMOV %Temp
28522852 //
28532853 if (MI.isCopy() && Ops.size() == 1 &&
28542854 // Make sure we're only folding the explicit COPY defs/uses.
28852885
28862886 // Handle cases like spilling def of:
28872887 //
2888 // %vreg0:sub_32 = COPY %wzr; GPR64common:%vreg0
2888 // %0:sub_32 = COPY %wzr; GPR64common:%0
28892889 //
28902890 // where the physical register source can be widened and stored to the full
28912891 // virtual reg destination stack slot, in this case producing:
29332933
29342934 // Handle cases like filling use of:
29352935 //
2936 // %vreg0:sub_32 = COPY %vreg1; GPR64:%vreg0, GPR32:%vreg1
2936 // %0:sub_32 = COPY %1; GPR64:%0, GPR32:%1
29372937 //
29382938 // where we can load the full virtual reg source stack slot, into the subreg
29392939 // destination, in this case producing:
29402940 //
2941 // LDRWui %vreg0:sub_32,
2941 // LDRWui %0:sub_32,
29422942 //
29432943 if (IsFill && SrcMO.getSubReg() == 0 && DstMO.isUndef()) {
29442944 const TargetRegisterClass *FillRC;
1111 /// common data and/or have enough undef subreg using swizzle abilities.
1212 ///
1313 /// For instance let's consider the following pseudo code :
14 /// vreg5 = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3
14 /// %5 = REG_SEQ %1, sub0, %2, sub1, %3, sub2, undef, sub3
1515 /// ...
16 /// vreg7 = REG_SEQ vreg1, sub0, vreg3, sub1, undef, sub2, vreg4, sub3
17 /// (swizzable Inst) vreg7, SwizzleMask : sub0, sub1, sub2, sub3
16 /// %7 = REG_SEQ %1, sub0, %3, sub1, undef, sub2, %4, sub3
17 /// (swizzable Inst) %7, SwizzleMask : sub0, sub1, sub2, sub3
1818 ///
1919 /// is turned into :
20 /// vreg5 = REG_SEQ vreg1, sub0, vreg2, sub1, vreg3, sub2, undef, sub3
20 /// %5 = REG_SEQ %1, sub0, %2, sub1, %3, sub2, undef, sub3
2121 /// ...
22 /// vreg7 = INSERT_SUBREG vreg4, sub3
23 /// (swizzable Inst) vreg7, SwizzleMask : sub0, sub2, sub1, sub3
22 /// %7 = INSERT_SUBREG %4, sub3
23 /// (swizzable Inst) %7, SwizzleMask : sub0, sub2, sub1, sub3
2424 ///
2525 /// This allows regalloc to reduce register pressure for vector registers and
2626 /// to reduce MOV count.
1313 /// Register Class is the union of and
1414 ///
1515 /// BB0:
16 /// %vreg0 = SCALAR_INST
17 /// %vreg1 = COPY %vreg0
16 /// %0 = SCALAR_INST
17 /// %1 = COPY %0
1818 /// ...
1919 /// BRANCH %cond BB1, BB2
2020 /// BB1:
21 /// %vreg2 = VECTOR_INST
22 /// %vreg3 = COPY %vreg2
21 /// %2 = VECTOR_INST
22 /// %3 = COPY %2
2323 /// BB2:
24 /// %vreg4 = PHI %vreg1 , , %vreg3 ,
25 /// %vreg5 = VECTOR_INST %vreg4 >
24 /// %4 = PHI %1 , , %3 , >
25 /// %5 = VECTOR_INST %4
2626 ///
2727 ///
2828 /// The coalescer will begin at BB0 and eliminate its copy, then the resulting
2929 /// code will look like this:
3030 ///
3131 /// BB0:
32 /// %vreg0 = SCALAR_INST
32 /// %0 = SCALAR_INST
3333 /// ...
3434 /// BRANCH %cond BB1, BB2
3535 /// BB1:
36 /// %vreg2 = VECTOR_INST
37 /// %vreg3 = COPY %vreg2
36 /// %2 = VECTOR_INST
37 /// %3 = COPY %2
3838 /// BB2:
39 /// %vreg4 = PHI %vreg0 , , %vreg3 ,
40 /// %vreg5 = VECTOR_INST %vreg4 >
39 /// %4 = PHI %0 , , %3 , >
40 /// %5 = VECTOR_INST %4
4141 ///
4242 /// Now that the result of the PHI instruction is an SGPR, the register
43 /// allocator is now forced to constrain the register class of %vreg3 to
43 /// allocator is now forced to constrain the register class of %3 to
4444 /// so we end up with final code like this:
4545 ///
4646 /// BB0:
47 /// %vreg0 = SCALAR_INST
47 /// %0 = SCALAR_INST
4848 /// ...
4949 /// BRANCH %cond BB1, BB2
5050 /// BB1:
51 /// %vreg2 = VECTOR_INST
52 /// %vreg3 = COPY %vreg2
51 /// %2 = VECTOR_INST
52 /// %3 = COPY %2
5353 /// BB2:
54 /// %vreg4 = PHI %vreg0 , , %vreg3 ,
55 /// %vreg5 = VECTOR_INST %vreg4 >
54 /// %4 = PHI %0 , , %3 , >
55 /// %5 = VECTOR_INST %4
5656 ///
5757 /// Now this code contains an illegal copy from a VGPR to an SGPR.
5858 ///
289289 // copy since a subregister use tied to a full register def doesn't really
290290 // make sense. e.g. don't fold:
291291 //
292 // %vreg1 = COPY %vreg0:sub1
293 // %vreg2 = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg1
292 // %1 = COPY %0:sub1
293 // %2 = V_MAC_{F16, F32} %3, %4, %1
294294 //
295295 // into
296 // %vreg2 = V_MAC_{F16, F32} %vreg3, %vreg4, %vreg0:sub1
296 // %2 = V_MAC_{F16, F32} %3, %4, %0:sub1
297297 if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
298298 return;
299299 }
970970 // Prevent folding operands backwards in the function. For example,
971971 // the COPY opcode must not be replaced by 1 in this example:
972972 //
973 // %vreg3 = COPY %vgpr0; VGPR_32:%vreg3
973 // %3 = COPY %vgpr0; VGPR_32:%3
974974 // ...
975975 // %vgpr0 = V_MOV_B32_e32 1, %exec
976976 MachineOperand &Dst = MI.getOperand(0);
99 /// \file This pass tries to apply several peephole SDWA patterns.
1010 ///
1111 /// E.g. original:
12 /// V_LSHRREV_B32_e32 %vreg0, 16, %vreg1
13 /// V_ADD_I32_e32 %vreg2, %vreg0, %vreg3
14 /// V_LSHLREV_B32_e32 %vreg4, 16, %vreg2
12 /// V_LSHRREV_B32_e32 %0, 16, %1
13 /// V_ADD_I32_e32 %2, %0, %3
14 /// V_LSHLREV_B32_e32 %4, 16, %2
1515 ///
1616 /// Replace:
17 /// V_ADD_I32_sdwa %vreg4, %vreg1, %vreg3
17 /// V_ADD_I32_sdwa %4, %1, %3
1818 /// dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
1919 ///
2020 //===----------------------------------------------------------------------===//
409409 }
410410
411411 // If this is not immediate then it can be copy of immediate value, e.g.:
412 // %vreg1 = S_MOV_B32 255;
412 // %1 = S_MOV_B32 255;
413413 if (Op.isReg()) {
414414 for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
415415 if (!isSameReg(Op, Def))
13461346 // class.
13471347 //
13481348 // e.g. if we have something like
1349 // vreg0 = ...
1350 // vreg1 = ...
1351 // vreg2 = REG_SEQUENCE vreg0, sub0, vreg1, sub1, vreg2, sub2
1352 // vreg3 = COPY vreg2, sub0
1349 // %0 = ...
1350 // %1 = ...
1351 // %2 = REG_SEQUENCE %0, sub0, %1, sub1, %2, sub2
1352 // %3 = COPY %2, sub0
13531353 //
13541354 // We want to look through the COPY to find:
1355 // => vreg3 = COPY vreg0
1355 // => %3 = COPY %0
13561356
13571357 // Plain copy.
13581358 return getCommonSubClass(DefRC, SrcRC) != nullptr;
16491649 }
16501650
16511651 for (unsigned i = 3, e = MI0.getNumOperands(); i != e; ++i) {
1652 // %vreg12 = PICLDR %vreg11, 0, pred:14, pred:%noreg
1652 // %12 = PICLDR %11, 0, pred:14, pred:%noreg
16531653 const MachineOperand &MO0 = MI0.getOperand(i);
16541654 const MachineOperand &MO1 = MI1.getOperand(i);
16551655 if (!MO0.isIdenticalTo(MO1))
4646 /// and \p DefIdx.
4747 /// \p [out] InputRegs of the equivalent REG_SEQUENCE. Each element of
4848 /// the list is modeled as .
49 /// E.g., REG_SEQUENCE vreg1:sub1, sub0, vreg2, sub1 would produce
49 /// E.g., REG_SEQUENCE %1:sub1, sub0, %2, sub1 would produce
5050 /// two elements:
51 /// - vreg1:sub1, sub0
52 /// - vreg2<:0>, sub1
51 /// - %1:sub1, sub0
52 /// - %2<:0>, sub1
5353 ///
5454 /// \returns true if it is possible to build such an input sequence
5555 /// with the pair \p MI, \p DefIdx. False otherwise.
6262 /// Build the equivalent inputs of a EXTRACT_SUBREG for the given \p MI
6363 /// and \p DefIdx.
6464 /// \p [out] InputReg of the equivalent EXTRACT_SUBREG.
65 /// E.g., EXTRACT_SUBREG vreg1:sub1, sub0, sub1 would produce:
66 /// - vreg1:sub1, sub0
65 /// E.g., EXTRACT_SUBREG %1:sub1, sub0, sub1 would produce:
66 /// - %1:sub1, sub0
6767 ///
6868 /// \returns true if it is possible to build such an input sequence
6969 /// with the pair \p MI, \p DefIdx. False otherwise.
7676 /// and \p DefIdx.
7777 /// \p [out] BaseReg and \p [out] InsertedReg contain
7878 /// the equivalent inputs of INSERT_SUBREG.
79 /// E.g., INSERT_SUBREG vreg0:sub0, vreg1:sub1, sub3 would produce:
80 /// - BaseReg: vreg0:sub0
81 /// - InsertedReg: vreg1:sub1, sub3
79 /// E.g., INSERT_SUBREG %0:sub0, %1:sub1, sub3 would produce:
80 /// - BaseReg: %0:sub0
81 /// - InsertedReg: %1:sub1, sub3
8282 ///
8383 /// \returns true if it is possible to build such an input sequence
8484 /// with the pair \p MI, \p DefIdx. False otherwise.
545545 if (!RegN || !TargetRegisterInfo::isVirtualRegister(RegN->getReg()))
546546 return;
547547 unsigned AndOpReg = RegN->getReg();
548 DEBUG(dbgs() << "Examine %vreg" << TargetRegisterInfo::virtReg2Index(AndOpReg)
548 DEBUG(dbgs() << "Examine %" << TargetRegisterInfo::virtReg2Index(AndOpReg)
549549 << '\n');
550550
551551 // Examine the PHI insns in the MachineBasicBlock to find out the
573573 return;
574574 } else {
575575 // The PHI node looks like:
576 // %vreg2 = PHI %vreg0, , %vreg1,
577 // Trace each incoming definition, e.g., (%vreg0, BB#1) and (%vreg1, BB#3)
578 // The AND operation can be removed if both %vreg0 in BB#1 and %vreg1 in
576 // %2 = PHI %0, , %1,
577 // Trace each incoming definition, e.g., (%0, BB#1) and (%1, BB#3)
578 // The AND operation can be removed if both %0 in BB#1 and %1 in
579579 // BB#3 are defined with a load matching the MaskN.
580580 DEBUG(dbgs() << "Check PHI Insn: "; MII->dump(); dbgs() << '\n');
581581 unsigned PrevReg = -1;
1717 // A "ref" value is associated with a BitRef structure, which indicates
1818 // which virtual register, and which bit in that register is the origin
1919 // of the value. For example, given an instruction
20 // vreg2 = ASL vreg1, 1
21 // assuming that nothing is known about bits of vreg1, bit 1 of vreg2
22 // will be a "ref" to (vreg1, 0). If there is a subsequent instruction
23 // vreg3 = ASL vreg2, 2
24 // then bit 3 of vreg3 will be a "ref" to (vreg1, 0) as well.
20 // %2 = ASL %1, 1
21 // assuming that nothing is known about bits of %1, bit 1 of %2
22 // will be a "ref" to (%1, 0). If there is a subsequent instruction
23 // %3 = ASL %2, 2
24 // then bit 3 of %3 will be a "ref" to (%1, 0) as well.
2525 // The "bottom" case means that the bit's value cannot be determined,
2626 // and that this virtual register actually defines it. The "bottom" case
2727 // is discussed in detail in BitTracker.h. In fact, "bottom" is a "ref
28 // to self", so for the vreg1 above, the bit 0 of it will be a "ref" to
29 // (vreg1, 0), bit 1 will be a "ref" to (vreg1, 1), etc.
28 // to self", so for the %1 above, the bit 0 of it will be a "ref" to
29 // (%1, 0), bit 1 will be a "ref" to (%1, 1), etc.
3030 //
3131 // The tracker implements the Wegman-Zadeck algorithm, originally developed
3232 // for SSA-based constant propagation. Each register is represented as
7474
7575 namespace {
7676
77 // Local trickery to pretty print a register (without the whole "%vreg"
77 // Local trickery to pretty print a register (without the whole "%number"
7878 // business).
7979 struct printv {
8080 printv(unsigned r) : R(r) {}
894894 }
895895
896896 // Calculate the register class that matches Reg:Sub. For example, if
897 // vreg1 is a double register, then vreg1:isub_hi would match the "int"
897 // %1 is a double register, then %1:isub_hi would match the "int"
898898 // register class.
899899 const TargetRegisterClass *HexagonBitSimplify::getFinalVRegClass(
900900 const BitTracker::RegisterRef &RR, MachineRegisterInfo &MRI) {
12451245 // holds the bits for the entire register. To keep track of that, the
12461246 // argument Begin indicates where in Bits is the lowest-significant bit
12471247 // of the register used in operand OpN. For example, in instruction:
1248 // vreg1 = S2_lsr_i_r vreg2:isub_hi, 10
1248 // %1 = S2_lsr_i_r %2:isub_hi, 10
12491249 // the operand 1 is a 32-bit register, which happens to be a subregister
1250 // of the 64-bit register vreg2, and that subregister starts at position 32.
1250 // of the 64-bit register %2, and that subregister starts at position 32.
12511251 // In this case Begin=32, since Bits[32] would be the lowest-significant bit
1252 // of vreg2:isub_hi.
1252 // of %2:isub_hi.
12531253 bool RedundantInstrElimination::computeUsedBits(const MachineInstr &MI,
12541254 unsigned OpN, BitVector &Bits, uint16_t Begin) {
12551255 unsigned Opc = MI.getOpcode();
13551355 // This pass can create copies between registers that don't have the
13561356 // exact same values. Updating the tracker has to involve updating
13571357 // all dependent cells. Example:
1358 // vreg1 = inst vreg2 ; vreg1 != vreg2, but used bits are equal
1358 // %1 = inst %2 ; %1 != %2, but used bits are equal
13591359 //
1360 // vreg3 = copy vreg2 ; <- inserted
1361 // ... = vreg3 ; <- replaced from vreg2
1362 // Indirectly, we can create a "copy" between vreg1 and vreg2 even
1360 // %3 = copy %2 ; <- inserted
1361 // ... = %3 ; <- replaced from %2
1362 // Indirectly, we can create a "copy" between %1 and %2 even
13631363 // though their exact values do not match.
13641364 BT.visit(*CopyI);
13651365 Changed = true;
23122312
23132313 // Check for tstbit simplification opportunity, where the bit being checked
23142314 // can be tracked back to another register. For example:
2315 // vreg2 = S2_lsr_i_r vreg1, 5
2316 // vreg3 = S2_tstbit_i vreg2, 0
2315 // %2 = S2_lsr_i_r %1, 5
2316 // %3 = S2_tstbit_i %2, 0
23172317 // =>
2318 // vreg3 = S2_tstbit_i vreg1, 5
2318 // %3 = S2_tstbit_i %1, 5
23192319 bool BitSimplification::simplifyTstbit(MachineInstr *MI,
23202320 BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC) {
23212321 unsigned Opc = MI->getOpcode();
367367 }
368368 }
369369 // Defs and clobbers can overlap, e.g.
370 // %d0 = COPY %vreg5, %r0, %r1
370 // %d0 = COPY %5, %r0, %r1
371371 for (RegisterRef R : Defs)
372372 Clobbers.erase(R);
373373
19731973 {
19741974 const MachineOperand &VO = MI.getOperand(1);
19751975 // The operand of CONST32 can be a blockaddress, e.g.
1976 // %vreg0 = CONST32
1976 // %0 = CONST32
19771977 // Do this check for all instructions for safety.
19781978 if (!VO.isImm())
19791979 return false;
2424 //
2525 // Example:
2626 //
27 // %vreg40 = L2_loadrub_io %vreg39, 1
28 // %vreg41 = S2_tstbit_i %vreg40, 0
29 // J2_jumpt %vreg41, , %pc
27 // %40 = L2_loadrub_io %39, 1
28 // %41 = S2_tstbit_i %40, 0
29 // J2_jumpt %41, , %pc
3030 // J2_jump , %pc
3131 // Successors according to CFG: BB#4(62) BB#5(62)
3232 //
3333 // BB#4: derived from LLVM BB %if.then
3434 // Predecessors according to CFG: BB#3
35 // %vreg11 = A2_addp %vreg6, %vreg10
36 // S2_storerd_io %vreg32, 16, %vreg11
35 // %11 = A2_addp %6, %10
36 // S2_storerd_io %32, 16, %11
3737 // Successors according to CFG: BB#5
3838 //
3939 // BB#5: derived from LLVM BB %if.end
4040 // Predecessors according to CFG: BB#3 BB#4
41 // %vreg12 = PHI %vreg6, , %vreg11,
42 // %vreg13 = A2_addp %vreg7, %vreg12
43 // %vreg42 = C2_cmpeqi %vreg9, 10
44 // J2_jumpf %vreg42, , %pc>
41 // %12 = PHI %6, , %11,
42 // %13 = A2_addp %7, %12
43 // %42 = C2_cmpeqi %9, 10
44 // J2_jumpf %42, , %pc
4545 // J2_jump , %pc
4646 // Successors according to CFG: BB#6(4) BB#3(124)
4747 //
4848 // would become:
4949 //
50 // %vreg40 = L2_loadrub_io %vreg39, 1
51 // %vreg41 = S2_tstbit_i %vreg40, 0
52 // spec-> %vreg11 = A2_addp %vreg6, %vreg10
53 // pred-> S2_pstorerdf_io %vreg41, %vreg32, 16, %vreg11
54 // %vreg46 = PS_pselect %vreg41, %vreg6, %vreg11
55 // %vreg13 = A2_addp %vreg7, %vreg46
56 // %vreg42 = C2_cmpeqi %vreg9, 10
57 // J2_jumpf %vreg42, , %pc
50 // %40 = L2_loadrub_io %39, 1
51 // %41 = S2_tstbit_i %40, 0
52 // spec-> %11 = A2_addp %6, %10
53 // pred-> S2_pstorerdf_io %41, %32, 16, %11
54 // %46 = PS_pselect %41, %6, %11
55 // %13 = A2_addp %7, %46
56 // %42 = C2_cmpeqi %9, 10
57 // J2_jumpf %42, , %pc
5858 // J2_jump , %pc
5959 // Successors according to CFG: BB#6 BB#3
6060
1616 //
1717 // Liveness tracking aside, the main functionality of this pass is divided
1818 // into two steps. The first step is to replace an instruction
19 // vreg0 = C2_mux vreg1, vreg2, vreg3
19 // %0 = C2_mux %1, %2, %3
2020 // with a pair of conditional transfers
21 // vreg0 = A2_tfrt vreg1, vreg2
22 // vreg0 = A2_tfrf vreg1, vreg3
21 // %0 = A2_tfrt %1, %2
22 // %0 = A2_tfrf %1, %3
2323 // It is the intention that the execution of this pass could be terminated
2424 // after this step, and the code generated would be functionally correct.
2525 //
26 // If the uses of the source values vreg1 and vreg2 are kills, and their
26 // If the uses of the source values %1 and %2 are kills, and their
2727 // definitions are predicable, then in the second step, the conditional
2828 // transfers will then be rewritten as predicated instructions. E.g.
29 // vreg0 = A2_or vreg1, vreg2
30 // vreg3 = A2_tfrt vreg99, vreg0
29 // %0 = A2_or %1, %2
30 // %3 = A2_tfrt %99, %0
3131 // will be rewritten as
32 // vreg3 = A2_port vreg99, vreg1, vreg2
32 // %3 = A2_port %99, %1, %2
3333 //
3434 // This replacement has two variants: "up" and "down". Consider this case:
35 // vreg0 = A2_or vreg1, vreg2
35 // %0 = A2_or %1, %2
3636 // ... [intervening instructions] ...
37 // vreg3 = A2_tfrt vreg99, vreg0
37 // %3 = A2_tfrt %99, %0
3838 // variant "up":
39 // vreg3 = A2_port vreg99, vreg1, vreg2
40 // ... [intervening instructions, vreg0->vreg3] ...
39 // %3 = A2_port %99, %1, %2
40 // ... [intervening instructions, %0->%3] ...
4141 // [deleted]
4242 // variant "down":
4343 // [deleted]
4444 // ... [intervening instructions] ...
45 // vreg3 = A2_port vreg99, vreg1, vreg2
45 // %3 = A2_port %99, %1, %2
4646 //
4747 // Both, one or none of these variants may be valid, and checks are made
4848 // to rule out inapplicable variants.
5050 // As an additional optimization, before either of the two steps above is
5151 // executed, the pass attempts to coalesce the target register with one of
5252 // the source registers, e.g. given an instruction
53 // vreg3 = C2_mux vreg0, vreg1, vreg2
54 // vreg3 will be coalesced with either vreg1 or vreg2. If this succeeds,
53 // %3 = C2_mux %0, %1, %2
54 // %3 will be coalesced with either %1 or %2. If this succeeds,
5555 // the instruction would then be (for example)
56 // vreg3 = C2_mux vreg0, vreg3, vreg2
56 // %3 = C2_mux %0, %3, %2
5757 // and, under certain circumstances, this could result in only one predicated
5858 // instruction:
59 // vreg3 = A2_tfrf vreg0, vreg2
59 // %3 = A2_tfrf %0, %2
6060 //
6161
6262 // Splitting a definition of a register into two predicated transfers
6464 // will see both instructions as actual definitions, and will mark the
6565 // first one as dead. The definition is not actually dead, and this
6666 // situation will need to be fixed. For example:
67 // vreg1 = A2_tfrt ... ; marked as dead
68 // vreg1 = A2_tfrf ...
67 // %1 = A2_tfrt ... ; marked as dead
68 // %1 = A2_tfrf ...
6969 //
7070 // Since any of the individual predicated transfers may end up getting
7171 // removed (in case it is an identity copy), some pre-existing def may
7272 // be marked as dead after live interval recomputation:
73 // vreg1 = ... ; marked as dead
73 // %1 = ... ; marked as dead
7474 // ...
75 // vreg1 = A2_tfrf ... ; if A2_tfrt is removed
76 // This case happens if vreg1 was used as a source in A2_tfrt, which means
75 // %1 = A2_tfrf ... ; if A2_tfrt is removed
76 // This case happens if %1 was used as a source in A2_tfrt, which means
7777 // that is it actually live at the A2_tfrf, and so the now dead definition
78 // of vreg1 will need to be updated to non-dead at some point.
78 // of %1 will need to be updated to non-dead at some point.
7979 //
8080 // This issue could be remedied by adding implicit uses to the predicated
8181 // transfers, but this will create a problem with subsequent predication,
759759 if (RR.Reg != RD.Reg)
760760 continue;
761761 // If the "Reg" part agrees, there is still the subregister to check.
762 // If we are looking for vreg1:loreg, we can skip vreg1:hireg, but
763 // not vreg1 (w/o subregisters).
762 // If we are looking for %1:loreg, we can skip %1:hireg, but
763 // not %1 (w/o subregisters).
764764 if (RR.Sub == RD.Sub)
765765 return MI;
766766 if (RR.Sub == 0 || RD.Sub == 0)
10701070 bool Done = predicate(*I, (Opc == Hexagon::A2_tfrt), UpdRegs);
10711071 if (!Done) {
10721072 // If we didn't predicate I, we may need to remove it in case it is
1073 // an "identity" copy, e.g. vreg1 = A2_tfrt vreg2, vreg1.
1073 // an "identity" copy, e.g. %1 = A2_tfrt %2, %1.
10741074 if (RegisterRef(I->getOperand(0)) == RegisterRef(I->getOperand(2))) {
10751075 for (auto &Op : I->operands())
10761076 if (Op.isReg())
11971197 MachineOperand &S1 = CI->getOperand(2), &S2 = CI->getOperand(3);
11981198 bool Done = false;
11991199 // Consider this case:
1200 // vreg1 = instr1 ...
1201 // vreg2 = instr2 ...
1202 // vreg0 = C2_mux ..., vreg1, vreg2
1203 // If vreg0 was coalesced with vreg1, we could end up with the following
1200 // %1 = instr1 ...
1201 // %2 = instr2 ...
1202 // %0 = C2_mux ..., %1, %2
1203 // If %0 was coalesced with %1, we could end up with the following
12041204 // code:
1205 // vreg0 = instr1 ...
1206 // vreg2 = instr2 ...
1207 // vreg0 = A2_tfrf ..., vreg2
1205 // %0 = instr1 ...
1206 // %2 = instr2 ...
1207 // %0 = A2_tfrf ..., %2
12081208 // which will later become:
1209 // vreg0 = instr1 ...
1210 // vreg0 = instr2_cNotPt ...
1211 // i.e. there will be an unconditional definition (instr1) of vreg0
1209 // %0 = instr1 ...
1210 // %0 = instr2_cNotPt ...
1211 // i.e. there will be an unconditional definition (instr1) of %0
12121212 // followed by a conditional one. The output dependency was there before
12131213 // and it unavoidable, but if instr1 is predicable, we will no longer be
12141214 // able to predicate it here.
11051105
11061106 // Now, remove those whose sets of potentially removable registers are
11071107 // contained in another IF candidate for VR. For example, given these
1108 // candidates for vreg45,
1109 // %vreg45:
1110 // (%vreg44,%vreg41,#9,#8), { %vreg42 }
1111 // (%vreg43,%vreg41,#9,#8), { %vreg42 %vreg44 }
1108 // candidates for %45,
1109 // %45:
1110 // (%44,%41,#9,#8), { %42 }
1111 // (%43,%41,#9,#8), { %42 %44 }
11121112 // remove the first one, since it is contained in the second one.
11131113 for (unsigned i = 0, n = LL.size(); i < n; ) {
11141114 const RegisterSet &RMi = LL[i].second;
16211621 RegisterInductionSet IndRegs;
16221622
16231623 // Look for induction patterns:
1624 // vreg1 = PHI ..., [ latch, vreg2 ]
1625 // vreg2 = ADD vreg1, imm
1624 // %1 = PHI ..., [ latch, %2 ]
1625 // %2 = ADD %1, imm
16261626 using instr_iterator = MachineBasicBlock::instr_iterator;
16271627
16281628 for (instr_iterator I = Header->instr_begin(), E = Header->instr_end();
17191719 MachineOperand &MO = PredDef->getOperand(i);
17201720 if (MO.isReg()) {
17211721 // Skip all implicit references. In one case there was:
1722 // %vreg140 = FCMPUGT32_rr %vreg138, %vreg139, %usr
1722 // %140 = FCMPUGT32_rr %138, %139, %usr
17231723 if (MO.isImplicit())
17241724 continue;
17251725 if (MO.isUse()) {
77 // This peephole pass optimizes in the following cases.
88 // 1. Optimizes redundant sign extends for the following case
99 // Transform the following pattern
10 // %vreg170 = SXTW %vreg166
10 // %170 = SXTW %166
1111 // ...
12 // %vreg176 = COPY %vreg170:isub_lo
12 // %176 = COPY %170:isub_lo
1313 //
1414 // Into
15 // %vreg176 = COPY vreg166
15 // %176 = COPY %166
1616 //
1717 // 2. Optimizes redundant negation of predicates.
18 // %vreg15 = CMPGTrr %vreg6, %vreg2
18 // %15 = CMPGTrr %6, %2
1919 // ...
20 // %vreg16 = NOT_p %vreg15
20 // %16 = NOT_p %15
2121 // ...
22 // JMP_c %vreg16, , %pc
22 // JMP_c %16, , %pc
2323 //
2424 // Into
25 // %vreg15 = CMPGTrr %vreg6, %vreg2;
25 // %15 = CMPGTrr %6, %2;
2626 // ...
27 // JMP_cNot %vreg15, , %pc;
27 // JMP_cNot %15, , %pc;
2828 //
2929 // Note: The peephole pass makes the instrucstions like
30 // %vreg170 = SXTW %vreg166 or %vreg16 = NOT_p %vreg15
30 // %170 = SXTW %166 or %16 = NOT_p %15
3131 // redundant and relies on some form of dead removal instructions, like
3232 // DCE or DIE to actually eliminate them.
3333
132132 NextI = std::next(I);
133133 MachineInstr &MI = *I;
134134 // Look for sign extends:
135 // %vreg170 = SXTW %vreg166
135 // %170 = SXTW %166
136136 if (!DisableOptSZExt && MI.getOpcode() == Hexagon::A2_sxtw) {
137137 assert(MI.getNumOperands() == 2);
138138 MachineOperand &Dst = MI.getOperand(0);
143143 if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
144144 TargetRegisterInfo::isVirtualRegister(SrcReg)) {
145145 // Map the following:
146 // %vreg170 = SXTW %vreg166
147 // PeepholeMap[170] = vreg166
146 // %170 = SXTW %166
147 // PeepholeMap[170] = %166
148148 PeepholeMap[DstReg] = SrcReg;
149149 }
150150 }
151151
152 // Look for %vreg170 = COMBINE_ir_V4 (0, %vreg169)
153 // %vreg170:DoublRegs, %vreg169:IntRegs
152 // Look for %170 = COMBINE_ir_V4 (0, %169)
153 // %170:DoublRegs, %169:IntRegs
154154 if (!DisableOptExtTo64 && MI.getOpcode() == Hexagon::A4_combineir) {
155155 assert(MI.getNumOperands() == 3);
156156 MachineOperand &Dst = MI.getOperand(0);
164164 }
165165
166166 // Look for this sequence below
167 // %vregDoubleReg1 = LSRd_ri %vregDoubleReg0, 32
168 // %vregIntReg = COPY %vregDoubleReg1:isub_lo.
167 // %DoubleReg1 = LSRd_ri %DoubleReg0, 32
168 // %IntReg = COPY %DoubleReg1:isub_lo.
169169 // and convert into
170 // %vregIntReg = COPY %vregDoubleReg0:isub_hi.
170 // %IntReg = COPY %DoubleReg0:isub_hi.
171171 if (MI.getOpcode() == Hexagon::S2_lsr_i_p) {
172172 assert(MI.getNumOperands() == 3);
173173 MachineOperand &Dst = MI.getOperand(0);
192192 if (TargetRegisterInfo::isVirtualRegister(DstReg) &&
193193 TargetRegisterInfo::isVirtualRegister(SrcReg)) {
194194 // Map the following:
195 // %vreg170 = NOT_xx %vreg166
196 // PeepholeMap[170] = vreg166
195 // %170 = NOT_xx %166
196 // PeepholeMap[170] = %166
197197 PeepholeMap[DstReg] = SrcReg;
198198 }
199199 }
200200
201201 // Look for copy:
202 // %vreg176 = COPY %vreg170:isub_lo
202 // %176 = COPY %170:isub_lo
203203 if (!DisableOptSZExt && MI.isCopy()) {
204204 assert(MI.getNumOperands() == 2);
205205 MachineOperand &Dst = MI.getOperand(0);
88 // Replace sequences of "narrow" stores to adjacent memory locations with
99 // a fewer "wide" stores that have the same effect.
1010 // For example, replace:
11 // S4_storeirb_io %vreg100, 0, 0 ; store-immediate-byte
12 // S4_storeirb_io %vreg100, 1, 0 ; store-immediate-byte
11 // S4_storeirb_io %100, 0, 0 ; store-immediate-byte
12 // S4_storeirb_io %100, 1, 0 ; store-immediate-byte
1313 // with
14 // S4_storeirh_io %vreg100, 0, 0 ; store-immediate-halfword
14 // S4_storeirh_io %100, 0, 0 ; store-immediate-halfword
1515 // The above is the general idea. The actual cases handled by the code
1616 // may be a bit more complex.
1717 // The purpose of this pass is to reduce the number of outstanding stores,
222222 // both the return value and the argument for the next call being in %r0.
223223 // Example:
224224 // 1:
225 // 2: %vregX = COPY %r0
226 // 3:
225 // 2: %vreg = COPY %r0
226 // 3:
227227 // 4: %r0 = ...
228228 // 5:
229229 // The scheduler would often swap 3 and 4, so an additional register is
233233 const MachineInstr *MI = DAG->SUnits[su].getInstr();
234234 if (MI->isCopy() && (MI->readsRegister(Hexagon::R0, &TRI) ||
235235 MI->readsRegister(Hexagon::V0, &TRI))) {
236 // %vregX = COPY %r0
236 // %vreg = COPY %r0
237237 VRegHoldingRet = MI->getOperand(0).getReg();
238238 RetRegister = MI->getOperand(1).getReg();
239239 LastUseOfRet = nullptr;
240240 } else if (VRegHoldingRet && MI->readsVirtualRegister(VRegHoldingRet))
241 // vregX>
241 // X>
242242 LastUseOfRet = &DAG->SUnits[su];
243243 else if (LastUseOfRet && MI->definesRegister(RetRegister, &TRI))
244244 // %r0 = ...
2121 // This peephole pass optimizes these cases, for example
2222 //
2323 // It will transform the following pattern
24 // %vreg0 = LEA_ADDRi64 %VRFrame, 4
25 // %vreg1 = cvta_to_local_yes_64 %vreg0
24 // %0 = LEA_ADDRi64 %VRFrame, 4
25 // %1 = cvta_to_local_yes_64 %0
2626 //
2727 // into
28 // %vreg1 = LEA_ADDRi64 %VRFrameLocal, 4
28 // %1 = LEA_ADDRi64 %VRFrameLocal, 4
2929 //
3030 // %VRFrameLocal is the virtual register name of %SPL
3131 //
6161 /// BB#0: derived from LLVM BB %entry
6262 /// Live Ins: %f1 %f3 %x6
6363 ///
64 /// %vreg0 = COPY %f1; F8RC:%vreg0
65 /// %vreg5 = CMPLWI %vreg4, 0; CRRC:%vreg5 GPRC:%vreg4
66 /// %vreg8 = LXSDX %zero8, %vreg7, %rm;
67 /// mem:LD8[ConstantPool] F8RC:%vreg8 G8RC:%vreg7
68 /// BCC 76, %vreg5, ; CRRC:%vreg5
64 /// %0 = COPY %f1; F8RC:%0
65 /// %5 = CMPLWI %4, 0; CRRC:%5 GPRC:%4
66 /// %8 = LXSDX %zero8, %7, %rm;
67 /// mem:LD8[ConstantPool] F8RC:%8 G8RC:%7
68 /// BCC 76, %5, ; CRRC:%5
6969 /// Successors according to CFG: BB#1(?%) BB#2(?%)
7070 ///
7171 /// BB#1: derived from LLVM BB %entry
7474 ///
7575 /// BB#2: derived from LLVM BB %entry
7676 /// Predecessors according to CFG: BB#0 BB#1
77 /// %vreg9 = PHI %vreg8, , %vreg0, ;
78 /// F8RC:%vreg9,%vreg8,%vreg0
77 /// %9 = PHI %8, , %0, ;
78 /// F8RC:%9,%8,%0
7979 ///
80 /// BCC 76, %vreg5, ; CRRC:%vreg5
80 /// BCC 76, %5, ; CRRC:%5
8181 /// Successors according to CFG: BB#3(?%) BB#4(?%)
8282 ///
8383 /// BB#3: derived from LLVM BB %entry
8686 ///
8787 /// BB#4: derived from LLVM BB %entry
8888 /// Predecessors according to CFG: BB#2 BB#3
89 /// %vreg13 = PHI %vreg12, , %vreg2, ;
90 /// F8RC:%vreg13,%vreg12,%vreg2
89 /// %13 = PHI %12, , %2, ;
90 /// F8RC:%13,%12,%2
9191 ///
9292 /// BLR8 %lr8, %rm, %f1
9393 ///
9999 /// BB#0: derived from LLVM BB %entry
100100 /// Live Ins: %f1 %f3 %x6
101101 ///
102 /// %vreg0 = COPY %f1; F8RC:%vreg0
103 /// %vreg5 = CMPLWI %vreg4, 0; CRRC:%vreg5 GPRC:%vreg4
104 /// %vreg8 = LXSDX %zero8, %vreg7, %rm;
105 /// mem:LD8[ConstantPool] F8RC:%vreg8 G8RC:%vreg7
102 /// %0 = COPY %f1; F8RC:%0
103 /// %5 = CMPLWI %4, 0; CRRC:%5 GPRC:%4
104 /// %8 = LXSDX %zero8, %7, %rm;
105 /// mem:LD8[ConstantPool] F8RC:%8 G8RC:%7
106106 ///
107 /// BCC 76, %vreg5, ; CRRC:%vreg5
107 /// BCC 76, %5, ; CRRC:%5
108108 /// Successors according to CFG: BB#1(0x2aaaaaaa / 0x80000000 = 33.33%)
109109 /// BB#4(0x55555554 / 0x80000000 = 66.67%)
110110 ///
114114 ///
115115 /// BB#4: derived from LLVM BB %entry
116116 /// Predecessors according to CFG: BB#0 BB#1
117 /// %vreg9 = PHI %vreg8, , %vreg0, ;
118 /// F8RC:%vreg9,%vreg8,%vreg0
119 /// %vreg13 = PHI %vreg12, , %vreg2, ;
120 /// F8RC:%vreg13,%vreg12,%vreg2
117 /// %9 = PHI %8, , %0, ;
118 /// F8RC:%9,%8,%0
119 /// %13 = PHI %12, , %2, ;
120 /// F8RC:%13,%12,%2
121121 ///
122122 /// BLR8 %lr8, %rm, %f1
123123 ///
23172317 // ADJCALLSTACKDOWN 32, %r1, %r1
23182318 // BL8_NOP ,...
23192319 // ADJCALLSTACKUP 32, 0, %r1, %r1
2320 // %vreg5 = COPY %x3; G8RC:%vreg5
2320 // %5 = COPY %x3; G8RC:%5
23212321 if (SrcReg == PPC::X3) {
23222322 const MachineBasicBlock *MBB = MI.getParent();
23232323 MachineBasicBlock::const_instr_iterator II =
584584 // We can eliminate RLDICL (e.g. for zero-extension)
585585 // if all bits to clear are already zero in the input.
586586 // This code assume following code sequence for zero-extension.
587 // %vreg6 = COPY %vreg5:sub_32; (optional)
588 // %vreg8 = IMPLICIT_DEF;
589 // %vreg7 = INSERT_SUBREG %vreg8, %vreg6, sub_32;
587 // %6 = COPY %5:sub_32; (optional)
588 // %8 = IMPLICIT_DEF;
589 // %7 = INSERT_SUBREG %8, %6, sub_32;
590590 if (!EnableZExtElimination) break;
591591
592592 if (MI.getOperand(2).getImm() != 0)
684684 DEBUG(dbgs() << "Optimizing LI to ADDI: ");
685685 DEBUG(LiMI->dump());
686686
687 // There could be repeated registers in the PHI, e.g: %vreg1 =
688 // PHI %vreg6, , %vreg8, , %vreg8, ; So if we've
687 // There could be repeated registers in the PHI, e.g: %1 =
688 // PHI %6, , %8, , %8, ; So if we've
689689 // already replaced the def instruction, skip.
690690 if (LiMI->getOpcode() == PPC::ADDI || LiMI->getOpcode() == PPC::ADDI8)
691691 continue;
8989 // This pass is run after register coalescing, and so we're looking for
9090 // a situation like this:
9191 // ...
92 // %vreg5 = COPY %vreg9; VSLRC:%vreg5,%vreg9
93 // %vreg5 = XSMADDADP %vreg5, %vreg17, %vreg16,
94 // %rm; VSLRC:%vreg5,%vreg17,%vreg16
92 // %5 = COPY %9; VSLRC:%5,%9
93 // %5 = XSMADDADP %5, %17, %16,
94 // %rm; VSLRC:%5,%17,%16
9595 // ...
96 // %vreg9 = XSMADDADP %vreg9, %vreg17, %vreg19,
97 // %rm; VSLRC:%vreg9,%vreg17,%vreg19
96 // %9 = XSMADDADP %9, %17, %19,
97 // %rm; VSLRC:%9,%17,%19
9898 // ...
9999 // Where we can eliminate the copy by changing from the A-type to the
100100 // M-type instruction. Specifically, for this example, this means:
101 // %vreg5 = XSMADDADP %vreg5, %vreg17, %vreg16,
102 // %rm; VSLRC:%vreg5,%vreg17,%vreg16
101 // %5 = XSMADDADP %5, %17, %16,
102 // %rm; VSLRC:%5,%17,%16
103103 // is replaced by:
104 // %vreg16 = XSMADDMDP %vreg16, %vreg18, %vreg9,
105 // %rm; VSLRC:%vreg16,%vreg18,%vreg9
106 // and we remove: %vreg5 = COPY %vreg9; VSLRC:%vreg5,%vreg9
104 // %16 = XSMADDMDP %16, %18, %9,
105 // %rm; VSLRC:%16,%18,%9
106 // and we remove: %5 = COPY %9; VSLRC:%5,%9
107107
108108 SlotIndex FMAIdx = LIS->getInstructionIndex(MI);
109109
149149 // walking the MIs we may as well test liveness here.
150150 //
151151 // FIXME: There is a case that occurs in practice, like this:
152 // %vreg9 = COPY %f1; VSSRC:%vreg9
152 // %9 = COPY %f1; VSSRC:%9
153153 // ...
154 // %vreg6 = COPY %vreg9; VSSRC:%vreg6,%vreg9
155 // %vreg7 = COPY %vreg9; VSSRC:%vreg7,%vreg9
156 // %vreg9 = XSMADDASP %vreg9, %vreg1, %vreg4; VSSRC:
157 // %vreg6 = XSMADDASP %vreg6, %vreg1, %vreg2; VSSRC:
158 // %vreg7 = XSMADDASP %vreg7, %vreg1, %vreg3; VSSRC:
154 // %6 = COPY %9; VSSRC:%6,%9
155 // %7 = COPY %9; VSSRC:%7,%9
156 // %9 = XSMADDASP %9, %1, %4; VSSRC:
157 // %6 = XSMADDASP %6, %1, %2; VSSRC:
158 // %7 = XSMADDASP %7, %1, %3; VSSRC:
159159 // which prevents an otherwise-profitable transformation.
160160 bool OtherUsers = false, KillsAddendSrc = false;
161161 for (auto J = std::prev(I), JE = MachineBasicBlock::iterator(AddendMI);
176176
177177
178178 // The transformation doesn't work well with things like:
179 // %vreg5 = A-form-op %vreg5, %vreg11, %vreg5;
180 // unless vreg11 is also a kill, so skip when it is not,
179 // %5 = A-form-op %5, %11, %5;
180 // unless %11 is also a kill, so skip when it is not,
181181 // and check operand 3 to see it is also a kill to handle the case:
182 // %vreg5 = A-form-op %vreg5, %vreg5, %vreg11;
183 // where vreg5 and vreg11 are both kills. This case would be skipped
182 // %5 = A-form-op %5, %5, %11;
183 // where %5 and %11 are both kills. This case would be skipped
184184 // otherwise.
185185 unsigned OldFMAReg = MI.getOperand(0).getReg();
186186
69476947
69486948 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
69496949 // lowered this:
6950 // (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6950 // (extract_vector_elt (v8f32 %1), Constant<6>)
69516951 // to:
69526952 // (extract_vector_elt (vector_shuffle<2,u,u,u>
6953 // (extract_subvector (v8f32 %vreg0), Constant<4>),
6953 // (extract_subvector (v8f32 %0), Constant<4>),
69546954 // undef)
69556955 // Constant<0>)
69566956 // In this case the vector is the extract_subvector expression and the index
4242 ; The key problem here is that we may fail to create an MBB referenced by a
4343 ; PHI. If so, we cannot complete the G_PHI and mustn't try or bad things
4444 ; happen.
45 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: cannot select: G_STORE %vreg6, %vreg2; mem:ST4[%addr] GPR:%vreg6,%vreg2 (in function: pending_phis)
45 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: cannot select: G_STORE %6, %2; mem:ST4[%addr] GPR:%6,%2 (in function: pending_phis)
4646 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for pending_phis
4747 ; FALLBACK-WITH-REPORT-OUT-LABEL: pending_phis:
4848 define i32 @pending_phis(i1 %tst, i32 %val, i32* %addr) {
6262 }
6363
6464 ; General legalizer inability to handle types whose size wasn't a power of 2.
65 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: G_STORE %vreg1, %vreg0; mem:ST6[%addr](align=8) (in function: odd_type)
65 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: G_STORE %1, %0; mem:ST6[%addr](align=8) (in function: odd_type)
6666 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for odd_type
6767 ; FALLBACK-WITH-REPORT-OUT-LABEL: odd_type:
6868 define void @odd_type(i42* %addr) {
7171 ret void
7272 }
7373
74 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: G_STORE %vreg1, %vreg0; mem:ST28[%addr](align=32) (in function: odd_vector)
74 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: G_STORE %1, %0; mem:ST28[%addr](align=32) (in function: odd_vector)
7575 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for odd_vector
7676 ; FALLBACK-WITH-REPORT-OUT-LABEL: odd_vector:
7777 define void @odd_vector(<7 x i32>* %addr) {
9090 }
9191
9292 ; Just to make sure we don't accidentally emit a normal load/store.
93 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: cannot select: %vreg2(s64) = G_LOAD %vreg0; mem:LD8[%addr] GPR:%vreg2,%vreg0 (in function: atomic_ops)
93 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: cannot select: %2(s64) = G_LOAD %0; mem:LD8[%addr] GPR:%2,%0 (in function: atomic_ops)
9494 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for atomic_ops
9595 ; FALLBACK-WITH-REPORT-LABEL: atomic_ops:
9696 define i64 @atomic_ops(i64* %addr) {
131131 }
132132
133133 ; Check that we fallback on invoke translation failures.
134 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: %vreg0(s128) = G_FCONSTANT quad 2
134 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: %0(s128) = G_FCONSTANT quad 2
135135 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for test_quad_dump
136136 ; FALLBACK-WITH-REPORT-OUT-LABEL: test_quad_dump:
137137 define fp128 @test_quad_dump() {
138138 ret fp128 0xL00000000000000004000000000000000
139139 }
140140
141 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: %vreg0(p0) = G_EXTRACT_VECTOR_ELT %vreg1, %vreg2; (in function: vector_of_pointers_extractelement)
141 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: %0(p0) = G_EXTRACT_VECTOR_ELT %1, %2; (in function: vector_of_pointers_extractelement)
142142 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for vector_of_pointers_extractelement
143143 ; FALLBACK-WITH-REPORT-OUT-LABEL: vector_of_pointers_extractelement:
144144 @var = global <2 x i16*> zeroinitializer
155155 br label %block
156156 }
157157
158 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: G_STORE %vreg0, %vreg4; mem:ST16[undef] (in function: vector_of_pointers_insertelement)
158 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: G_STORE %0, %4; mem:ST16[undef] (in function: vector_of_pointers_insertelement)
159159 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for vector_of_pointers_insertelement
160160 ; FALLBACK-WITH-REPORT-OUT-LABEL: vector_of_pointers_insertelement:
161161 define void @vector_of_pointers_insertelement() {
171171 br label %block
172172 }
173173
174 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: G_STORE %vreg1, %vreg3; mem:ST12[undef](align=4) (in function: nonpow2_insertvalue_narrowing)
174 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: G_STORE %1, %3; mem:ST12[undef](align=4) (in function: nonpow2_insertvalue_narrowing)
175175 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_insertvalue_narrowing
176176 ; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_insertvalue_narrowing:
177177 %struct96 = type { float, float, float }
181181 ret void
182182 }
183183
184 ; FALLBACK-WITH-REPORT-ERR remark: :0:0: unable to legalize instruction: G_STORE %vreg3, %vreg4; mem:ST12[undef](align=16) (in function: nonpow2_add_narrowing)
184 ; FALLBACK-WITH-REPORT-ERR remark: :0:0: unable to legalize instruction: G_STORE %3, %4; mem:ST12[undef](align=16) (in function: nonpow2_add_narrowing)
185185 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_add_narrowing
186186 ; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_add_narrowing:
187187 define void @nonpow2_add_narrowing() {
192192 ret void
193193 }
194194
195 ; FALLBACK-WITH-REPORT-ERR remark: :0:0: unable to legalize instruction: G_STORE %vreg3, %vreg4; mem:ST12[undef](align=16) (in function: nonpow2_add_narrowing)
195 ; FALLBACK-WITH-REPORT-ERR remark: :0:0: unable to legalize instruction: G_STORE %3, %4; mem:ST12[undef](align=16) (in function: nonpow2_add_narrowing)
196196 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_or_narrowing
197197 ; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_or_narrowing:
198198 define void @nonpow2_or_narrowing() {
203203 ret void
204204 }
205205
206 ; FALLBACK-WITH-REPORT-ERR remark: :0:0: unable to legalize instruction: G_STORE %vreg0, %vreg1; mem:ST12[undef](align=16) (in function: nonpow2_load_narrowing)
206 ; FALLBACK-WITH-REPORT-ERR remark: :0:0: unable to legalize instruction: G_STORE %0, %1; mem:ST12[undef](align=16) (in function: nonpow2_load_narrowing)
207207 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_load_narrowing
208208 ; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_load_narrowing:
209209 define void @nonpow2_load_narrowing() {
212212 ret void
213213 }
214214
215 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: G_STORE %vreg3, %vreg0; mem:ST12[%c](align=16) (in function: nonpow2_store_narrowing
215 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: G_STORE %3, %0; mem:ST12[%c](align=16) (in function: nonpow2_store_narrowing
216216 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_store_narrowing
217217 ; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_store_narrowing:
218218 define void @nonpow2_store_narrowing(i96* %c) {
222222 ret void
223223 }
224224
225 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: G_STORE %vreg0, %vreg1; mem:ST12[undef](align=16) (in function: nonpow2_constant_narrowing)
225 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: G_STORE %0, %1; mem:ST12[undef](align=16) (in function: nonpow2_constant_narrowing)
226226 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_constant_narrowing
227227 ; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_constant_narrowing:
228228 define void @nonpow2_constant_narrowing() {
232232
233233 ; Currently can't handle vector lengths that aren't an exact multiple of
234234 ; natively supported vector lengths. Test that the fall-back works for those.
235 ; FALLBACK-WITH-REPORT-ERR-G_IMPLICIT_DEF-LEGALIZABLE: (FIXME: this is what is expected once we can legalize non-pow-of-2 G_IMPLICIT_DEF) remark: :0:0: unable to legalize instruction: %vreg1(<7 x s64>) = G_ADD %vreg0, %vreg0; (in function: nonpow2_vector_add_fewerelements
236 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: %vreg2(s64) = G_EXTRACT_VECTOR_ELT %vreg1, %vreg3; (in function: nonpow2_vector_add_fewerelements)
235 ; FALLBACK-WITH-REPORT-ERR-G_IMPLICIT_DEF-LEGALIZABLE: (FIXME: this is what is expected once we can legalize non-pow-of-2 G_IMPLICIT_DEF) remark: :0:0: unable to legalize instruction: %1(<7 x s64>) = G_ADD %0, %0; (in function: nonpow2_vector_add_fewerelements
236 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: %2(s64) = G_EXTRACT_VECTOR_ELT %1, %3; (in function: nonpow2_vector_add_fewerelements)
237237 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for nonpow2_vector_add_fewerelements
238238 ; FALLBACK-WITH-REPORT-OUT-LABEL: nonpow2_vector_add_fewerelements:
239239 define void @nonpow2_vector_add_fewerelements() {
88 ...
99 ---
1010 # CHECK: *** Bad machine code: Generic virtual register must have a bank in a RegBankSelected function ***
11 # CHECK: instruction: %vreg0(s64) = COPY
12 # CHECK: operand 0: %vreg0
11 # CHECK: instruction: %0(s64) = COPY
12 # CHECK: operand 0: %0
1313 name: test
1414 regBankSelected: true
1515 registers:
2121 %0 = COPY %x0
2222
2323 ; CHECK: *** Bad machine code: Unexpected generic instruction in a Selected function ***
24 ; CHECK: instruction: %vreg1 = G_ADD
24 ; CHECK: instruction: %1 = G_ADD
2525 %1 = G_ADD %0, %0
2626
2727 ; CHECK: *** Bad machine code: Generic virtual register invalid in a Selected function ***
28 ; CHECK: instruction: %vreg2(s64) = COPY
29 ; CHECK: operand 0: %vreg2
28 ; CHECK: instruction: %2(s64) = COPY
29 ; CHECK: operand 0: %2
3030 %2(s64) = COPY %x0
3131 ...
44 ; CHECK-LABEL: stp_i64_scale:BB#0
55 ; CHECK:Cluster ld/st SU(4) - SU(3)
66 ; CHECK:Cluster ld/st SU(2) - SU(5)
7 ; CHECK:SU(4): STRXui %vreg1, %vreg0, 1
8 ; CHECK:SU(3): STRXui %vreg1, %vreg0, 2
9 ; CHECK:SU(2): STRXui %vreg1, %vreg0, 3
10 ; CHECK:SU(5): STRXui %vreg1, %vreg0, 4
7 ; CHECK:SU(4): STRXui %1, %0, 1
8 ; CHECK:SU(3): STRXui %1, %0, 2
9 ; CHECK:SU(2): STRXui %1, %0, 3
10 ; CHECK:SU(5): STRXui %1, %0, 4
1111 define i64 @stp_i64_scale(i64* nocapture %P, i64 %v) {
1212 entry:
1313 %arrayidx = getelementptr inbounds i64, i64* %P, i64 3
2525 ; CHECK-LABEL: stp_i32_scale:BB#0
2626 ; CHECK:Cluster ld/st SU(4) - SU(3)
2727 ; CHECK:Cluster ld/st SU(2) - SU(5)
28 ; CHECK:SU(4): STRWui %vreg1, %vreg0, 1
29 ; CHECK:SU(3): STRWui %vreg1, %vreg0, 2
30 ; CHECK:SU(2): STRWui %vreg1, %vreg0, 3
31 ; CHECK:SU(5): STRWui %vreg1, %vreg0, 4
28 ; CHECK:SU(4): STRWui %1, %0, 1
29 ; CHECK:SU(3): STRWui %1, %0, 2
30 ; CHECK:SU(2): STRWui %1, %0, 3
31 ; CHECK:SU(5): STRWui %1, %0, 4
3232 define i32 @stp_i32_scale(i32* nocapture %P, i32 %v) {
3333 entry:
3434 %arrayidx = getelementptr inbounds i32, i32* %P, i32 3
4646 ; CHECK-LABEL:stp_i64_unscale:BB#0 entry
4747 ; CHECK:Cluster ld/st SU(5) - SU(2)
4848 ; CHECK:Cluster ld/st SU(4) - SU(3)
49 ; CHECK:SU(5): STURXi %vreg1, %vreg0, -32
50 ; CHECK:SU(2): STURXi %vreg1, %vreg0, -24
51 ; CHECK:SU(4): STURXi %vreg1, %vreg0, -16
52 ; CHECK:SU(3): STURXi %vreg1, %vreg0, -8
49 ; CHECK:SU(5): STURXi %1, %0, -32
50 ; CHECK:SU(2): STURXi %1, %0, -24
51 ; CHECK:SU(4): STURXi %1, %0, -16
52 ; CHECK:SU(3): STURXi %1, %0, -8
5353 define void @stp_i64_unscale(i64* nocapture %P, i64 %v) #0 {
5454 entry:
5555 %arrayidx = getelementptr inbounds i64, i64* %P, i64 -3
6767 ; CHECK-LABEL:stp_i32_unscale:BB#0 entry
6868 ; CHECK:Cluster ld/st SU(5) - SU(2)
6969 ; CHECK:Cluster ld/st SU(4) - SU(3)
70 ; CHECK:SU(5): STURWi %vreg1, %vreg0, -16
71 ; CHECK:SU(2): STURWi %vreg1, %vreg0, -12
72 ; CHECK:SU(4): STURWi %vreg1, %vreg0, -8
73 ; CHECK:SU(3): STURWi %vreg1, %vreg0, -4
70 ; CHECK:SU(5): STURWi %1, %0, -16
71 ; CHECK:SU(2): STURWi %1, %0, -12
72 ; CHECK:SU(4): STURWi %1, %0, -8
73 ; CHECK:SU(3): STURWi %1, %0, -4
7474 define void @stp_i32_unscale(i32* nocapture %P, i32 %v) #0 {
7575 entry:
7676 %arrayidx = getelementptr inbounds i32, i32* %P, i32 -3
8888 ; CHECK-LABEL:stp_double:BB#0
8989 ; CHECK:Cluster ld/st SU(3) - SU(4)
9090 ; CHECK:Cluster ld/st SU(2) - SU(5)
91 ; CHECK:SU(3): STRDui %vreg1, %vreg0, 1
92 ; CHECK:SU(4): STRDui %vreg1, %vreg0, 2
93 ; CHECK:SU(2): STRDui %vreg1, %vreg0, 3
94 ; CHECK:SU(5): STRDui %vreg1, %vreg0, 4
91 ; CHECK:SU(3): STRDui %1, %0, 1
92 ; CHECK:SU(4): STRDui %1, %0, 2
93 ; CHECK:SU(2): STRDui %1, %0, 3
94 ; CHECK:SU(5): STRDui %1, %0, 4
9595 define void @stp_double(double* nocapture %P, double %v) {
9696 entry:
9797 %arrayidx = getelementptr inbounds double, double* %P, i64 3
109109 ; CHECK-LABEL:stp_float:BB#0
110110 ; CHECK:Cluster ld/st SU(3) - SU(4)
111111 ; CHECK:Cluster ld/st SU(2) - SU(5)
112 ; CHECK:SU(3): STRSui %vreg1, %vreg0, 1
113 ; CHECK:SU(4): STRSui %vreg1, %vreg0, 2
114 ; CHECK:SU(2): STRSui %vreg1, %vreg0, 3
115 ; CHECK:SU(5): STRSui %vreg1, %vreg0, 4
112 ; CHECK:SU(3): STRSui %1, %0, 1
113 ; CHECK:SU(4): STRSui %1, %0, 2
114 ; CHECK:SU(2): STRSui %1, %0, 3
115 ; CHECK:SU(5): STRSui %1, %0, 4
116116 define void @stp_float(float* nocapture %P, float %v) {
117117 entry:
118118 %arrayidx = getelementptr inbounds float, float* %P, i64 3
129129 ; CHECK: ********** MI Scheduling **********
130130 ; CHECK-LABEL: stp_volatile:BB#0
131131 ; CHECK-NOT: Cluster ld/st
132 ; CHECK:SU(2): STRXui %vreg1, %vreg0, 3; mem:Volatile
133 ; CHECK:SU(3): STRXui %vreg1, %vreg0, 2; mem:Volatile
134 ; CHECK:SU(4): STRXui %vreg1, %vreg0, 1; mem:Volatile
135 ; CHECK:SU(5): STRXui %vreg1, %vreg0, 4; mem:Volatile
132 ; CHECK:SU(2): STRXui %1, %0, 3; mem:Volatile
133 ; CHECK:SU(3): STRXui %1, %0, 2; mem:Volatile
134 ; CHECK:SU(4): STRXui %1, %0, 1; mem:Volatile
135 ; CHECK:SU(5): STRXui %1, %0, 4; mem:Volatile
136136 define i64 @stp_volatile(i64* nocapture %P, i64 %v) {
137137 entry:
138138 %arrayidx = getelementptr inbounds i64, i64* %P, i64 3
33
44 ; CHECK-SSA-LABEL: Machine code for function t1
55
6 ; CHECK-SSA: [[QUOTREG:%vreg[0-9]+]] = SDIVWr
6 ; CHECK-SSA: [[QUOTREG:%[0-9]+]] = SDIVWr
77 ; CHECK-SSA-NOT: [[QUOTREG]] =
8 ; CHECK-SSA: {{%vreg[0-9]+}} = MSUBWrrr [[QUOTREG]]
8 ; CHECK-SSA: {{%[0-9]+}} = MSUBWrrr [[QUOTREG]]
99
1010 ; CHECK-SSA-LABEL: Machine code for function t2
1111
55 ; CHECK: ********** MI Scheduling **********
66 ; CHECK-LABEL: ldr_int:BB#0
77 ; CHECK: Cluster ld/st SU(1) - SU(2)
8 ; CHECK: SU(1): %vreg{{[0-9]+}} = LDRWui
9 ; CHECK: SU(2): %vreg{{[0-9]+}} = LDRWui
8 ; CHECK: SU(1): %{{[0-9]+}} = LDRWui
9 ; CHECK: SU(2): %{{[0-9]+}} = LDRWui
1010 ; EXYNOS: ********** MI Scheduling **********
1111 ; EXYNOS-LABEL: ldr_int:BB#0
1212 ; EXYNOS: Cluster ld/st SU(1) - SU(2)
13 ; EXYNOS: SU(1): %vreg{{[0-9]+}} = LDRWui
14 ; EXYNOS: SU(2): %vreg{{[0-9]+}} = LDRWui
13 ; EXYNOS: SU(1): %{{[0-9]+}} = LDRWui
14 ; EXYNOS: SU(2): %{{[0-9]+}} = LDRWui
1515 define i32 @ldr_int(i32* %a) nounwind {
1616 %p1 = getelementptr inbounds i32, i32* %a, i32 1
1717 %tmp1 = load i32, i32* %p1, align 2
2525 ; CHECK: ********** MI Scheduling **********
2626 ; CHECK-LABEL: ldp_sext_int:BB#0
2727 ; CHECK: Cluster ld/st SU(1) - SU(2)
28 ; CHECK: SU(1): %vreg{{[0-9]+}} = LDRSWui
29 ; CHECK: SU(2): %vreg{{[0-9]+}} = LDRSWui
28 ; CHECK: SU(1): %{{[0-9]+}} = LDRSWui
29 ; CHECK: SU(2): %{{[0-9]+}} = LDRSWui
3030 ; EXYNOS: ********** MI Scheduling **********
3131 ; EXYNOS-LABEL: ldp_sext_int:BB#0
3232 ; EXYNOS: Cluster ld/st SU(1) - SU(2)
33 ; EXYNOS: SU(1): %vreg{{[0-9]+}} = LDRSWui
34 ; EXYNOS: SU(2): %vreg{{[0-9]+}} = LDRSWui
33 ; EXYNOS: SU(1): %{{[0-9]+}} = LDRSWui
34 ; EXYNOS: SU(2): %{{[0-9]+}} = LDRSWui
3535 define i64 @ldp_sext_int(i32* %p) nounwind {
3636 %tmp = load i32, i32* %p, align 4
3737 %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
4646 ; CHECK: ********** MI Scheduling **********
4747 ; CHECK-LABEL: ldur_int:BB#0
4848 ; CHECK: Cluster ld/st SU(2) - SU(1)
49 ; CHECK: SU(1): %vreg{{[0-9]+}} = LDURWi
50 ; CHECK: SU(2): %vreg{{[0-9]+}} = LDURWi
49 ; CHECK: SU(1): %{{[0-9]+}} = LDURWi
50 ; CHECK: SU(2): %{{[0-9]+}} = LDURWi
5151 ; EXYNOS: ********** MI Scheduling **********
5252 ; EXYNOS-LABEL: ldur_int:BB#0
5353 ; EXYNOS: Cluster ld/st SU(2) - SU(1)
54 ; EXYNOS: SU(1): %vreg{{[0-9]+}} = LDURWi
55 ; EXYNOS: SU(2): %vreg{{[0-9]+}} = LDURWi
54 ; EXYNOS: SU(1): %{{[0-9]+}} = LDURWi
55 ; EXYNOS: SU(2): %{{[0-9]+}} = LDURWi
5656 define i32 @ldur_int(i32* %a) nounwind {
5757 %p1 = getelementptr inbounds i32, i32* %a, i32 -1
5858 %tmp1 = load i32, i32* %p1, align 2
6666 ; CHECK: ********** MI Scheduling **********
6767 ; CHECK-LABEL: ldp_half_sext_zext_int:BB#0
6868 ; CHECK: Cluster ld/st SU(3) - SU(4)
69 ; CHECK: SU(3): %vreg{{[0-9]+}} = LDRSWui
70 ; CHECK: SU(4): %vreg{{[0-9]+}}:sub_32 = LDRWui
69 ; CHECK: SU(3): %{{[0-9]+}} = LDRSWui
70 ; CHECK: SU(4): %{{[0-9]+}}:sub_32 = LDRWui
7171 ; EXYNOS: ********** MI Scheduling **********
7272 ; EXYNOS-LABEL: ldp_half_sext_zext_int:BB#0
7373 ; EXYNOS: Cluster ld/st SU(3) - SU(4)
74 ; EXYNOS: SU(3): %vreg{{[0-9]+}} = LDRSWui
75 ; EXYNOS: SU(4): %vreg{{[0-9]+}}:sub_32 = LDRWui
74 ; EXYNOS: SU(3): %{{[0-9]+}} = LDRSWui
75 ; EXYNOS: SU(4): %{{[0-9]+}}:sub_32 = LDRWui
7676 define i64 @ldp_half_sext_zext_int(i64* %q, i32* %p) nounwind {
7777 %tmp0 = load i64, i64* %q, align 4
7878 %tmp = load i32, i32* %p, align 4
8989 ; CHECK: ********** MI Scheduling **********
9090 ; CHECK-LABEL: ldp_half_zext_sext_int:BB#0
9191 ; CHECK: Cluster ld/st SU(3) - SU(4)
92 ; CHECK: SU(3): %vreg{{[0-9]+}}:sub_32 = LDRWui
93 ; CHECK: SU(4): %vreg{{[0-9]+}} = LDRSWui
92 ; CHECK: SU(3): %{{[0-9]+}}:sub_32 = LDRWui
93 ; CHECK: SU(4): %{{[0-9]+}} = LDRSWui
9494 ; EXYNOS: ********** MI Scheduling **********
9595 ; EXYNOS-LABEL: ldp_half_zext_sext_int:BB#0
9696 ; EXYNOS: Cluster ld/st SU(3) - SU(4)
97 ; EXYNOS: SU(3): %vreg{{[0-9]+}}:sub_32 = LDRWui
98 ; EXYNOS: SU(4): %vreg{{[0-9]+}} = LDRSWui
97 ; EXYNOS: SU(3): %{{[0-9]+}}:sub_32 = LDRWui
98 ; EXYNOS: SU(4): %{{[0-9]+}} = LDRSWui
9999 define i64 @ldp_half_zext_sext_int(i64* %q, i32* %p) nounwind {
100100 %tmp0 = load i64, i64* %q, align 4
101101 %tmp = load i32, i32* %p, align 4
112112 ; CHECK: ********** MI Scheduling **********
113113 ; CHECK-LABEL: ldr_int_volatile:BB#0
114114 ; CHECK-NOT: Cluster ld/st
115 ; CHECK: SU(1): %vreg{{[0-9]+}} = LDRWui
116 ; CHECK: SU(2): %vreg{{[0-9]+}} = LDRWui
115 ; CHECK: SU(1): %{{[0-9]+}} = LDRWui
116 ; CHECK: SU(2): %{{[0-9]+}} = LDRWui
117117 ; EXYNOS: ********** MI Scheduling **********
118118 ; EXYNOS-LABEL: ldr_int_volatile:BB#0
119119 ; EXYNOS-NOT: Cluster ld/st
120 ; EXYNOS: SU(1): %vreg{{[0-9]+}} = LDRWui
121 ; EXYNOS: SU(2): %vreg{{[0-9]+}} = LDRWui
120 ; EXYNOS: SU(1): %{{[0-9]+}} = LDRWui
121 ; EXYNOS: SU(2): %{{[0-9]+}} = LDRWui
122122 define i32 @ldr_int_volatile(i32* %a) nounwind {
123123 %p1 = getelementptr inbounds i32, i32* %a, i32 1
124124 %tmp1 = load volatile i32, i32* %p1, align 2
132132 ; CHECK: ********** MI Scheduling **********
133133 ; CHECK-LABEL: ldq_cluster:BB#0
134134 ; CHECK: Cluster ld/st SU(1) - SU(3)
135 ; CHECK: SU(1): %vreg{{[0-9]+}} = LDRQui
136 ; CHECK: SU(3): %vreg{{[0-9]+}} = LDRQui
135 ; CHECK: SU(1): %{{[0-9]+}} = LDRQui
136 ; CHECK: SU(3): %{{[0-9]+}} = LDRQui
137137 ; EXYNOS: ********** MI Scheduling **********
138138 ; EXYNOS-LABEL: ldq_cluster:BB#0
139139 ; EXYNOS-NOT: Cluster ld/st
55 ;
66 ; CHECK: ********** MI Scheduling **********
77 ; CHECK: shiftable
8 ; CHECK: SU(2): %vreg2 = SUBXri %vreg1, 20, 0
8 ; CHECK: SU(2): %2 = SUBXri %1, 20, 0
99 ; CHECK: Successors:
10 ; CHECK-NEXT: SU(4): Data Latency=1 Reg=%vreg2
11 ; CHECK-NEXT: SU(3): Data Latency=2 Reg=%vreg2
10 ; CHECK-NEXT: SU(4): Data Latency=1 Reg=%2
11 ; CHECK-NEXT: SU(3): Data Latency=2 Reg=%2
1212 ; CHECK: ********** INTERVALS **********
1313 define i64 @shiftable(i64 %A, i64 %B) {
1414 %tmp0 = sub i64 %B, 20
44 ;
55 ; CHECK: ********** MI Scheduling **********
66 ; CHECK: misched_bug:BB#0 entry
7 ; CHECK: SU(2): %vreg2 = LDRWui %vreg0, 1; mem:LD4[%ptr1_plus1] GPR32:%vreg2 GPR64common:%vreg0
7 ; CHECK: SU(2): %2 = LDRWui %0, 1; mem:LD4[%ptr1_plus1] GPR32:%2 GPR64common:%0
88 ; CHECK: Successors:
9 ; CHECK-NEXT: SU(5): Data Latency=4 Reg=%vreg2
9 ; CHECK-NEXT: SU(5): Data Latency=4 Reg=%2
1010 ; CHECK-NEXT: SU(4): Ord Latency=0
11 ; CHECK: SU(3): STRWui %wzr, %vreg0, 0; mem:ST4[%ptr1] GPR64common:%vreg0
11 ; CHECK: SU(3): STRWui %wzr, %0, 0; mem:ST4[%ptr1] GPR64common:%0
1212 ; CHECK: Successors:
1313 ; CHECK: SU(4): Ord Latency=0
14 ; CHECK: SU(4): STRWui %wzr, %vreg1, 0; mem:ST4[%ptr2] GPR64common:%vreg1
15 ; CHECK: SU(5): %w0 = COPY %vreg2; GPR32:%vreg2
14 ; CHECK: SU(4): STRWui %wzr, %1, 0; mem:ST4[%ptr2] GPR64common:%1
15 ; CHECK: SU(5): %w0 = COPY %2; GPR32:%2
1616 ; CHECK: ** ScheduleDAGMI::schedule picking next node
1717 define i32 @misched_bug(i32* %ptr1, i32* %ptr2) {
1818 entry:
2525 ; CHECK: fi#-2: {{.*}} fixed, at location [SP+8]
2626 ; CHECK: fi#-1: {{.*}} fixed, at location [SP]
2727
28 ; CHECK: [[VRA:%vreg.*]] = LDRXui
29 ; CHECK: [[VRB:%vreg.*]] = LDRXui
30 ; CHECK: STRXui %vreg{{.*}}, >
28 ; CHECK: [[VRA:%.*]] = LDRXui >
29 ; CHECK: [[VRB:%.*]] = LDRXui
30 ; CHECK: STRXui %{{.*}},
3131 ; CHECK: STRXui [[VRB]],
3232
3333 ; Make sure that there is an dependence edge between fi#-2 and fi#-4.
3939 ; CHECK: SU([[DEPSTOREB:.*]]): Ord Latency=0
4040 ; CHECK: SU([[DEPSTOREA:.*]]): Ord Latency=0
4141
42 ; CHECK: SU([[DEPSTOREA]]): STRXui %vreg{{.*}},
43 ; CHECK: SU([[DEPSTOREB]]): STRXui %vreg{{.*}}, >
42 ; CHECK: SU([[DEPSTOREA]]): STRXui %{{.*}}, >
43 ; CHECK: SU([[DEPSTOREB]]): STRXui %{{.*}},
4545 ;
4646 ; The instruction selection phase will generate ISA that looks like this:
4747 ; %oqap = LDS_READ_RET
48 ; %vreg0 = MOV %oqap
49 ; %vreg1 = VTX_READ_32
50 ; %vreg2 = ADD_INT %vreg1, %vreg0
48 ; %0 = MOV %oqap
49 ; %1 = VTX_READ_32
50 ; %2 = ADD_INT %1, %0
5151 ;
5252 ; The bottom scheduler will schedule the two ALU instructions first:
5353 ;
5454 ; UNSCHEDULED:
5555 ; %oqap = LDS_READ_RET
56 ; %vreg1 = VTX_READ_32
56 ; %1 = VTX_READ_32
5757 ;
5858 ; SCHEDULED:
5959 ;
60 ; vreg0 = MOV %oqap
61 ; vreg2 = ADD_INT %vreg1, %vreg2
60 ; %0 = MOV %oqap
61 ; %2 = ADD_INT %1, %2
6262 ;
6363 ; The lack of proper aliasing results in the local memory read (LDS_READ_RET)
6464 ; to consider the global memory read (VTX_READ_32) has a chain dependency, so
6868 ; Alu clause:
6969 ; %oqap = LDS_READ_RET
7070 ; VTX clause:
71 ; %vreg1 = VTX_READ_32
71 ; %1 = VTX_READ_32
7272 ; Alu clause:
73 ; vreg0 = MOV %oqap
74 ; vreg2 = ADD_INT %vreg1, %vreg2
73 ; %0 = MOV %oqap
74 ; %2 = ADD_INT %1, %2
7575 ;
7676 ; This is an illegal program because the oqap def and use know occur in
7777 ; different ALU clauses.
55 # liveranges needed it.
66 #
77 # Should see three distinct value numbers:
8 # CHECK: %vreg0 [{{.*}}:0)[{{.*}}:1)[{{.*}}:2) 0@{{[0-9]+[Berd]}} 1@{{[0-9]+[Berd]}} 2@{{[0-9]+B-phi}}
8 # CHECK: %0 [{{.*}}:0)[{{.*}}:1)[{{.*}}:2) 0@{{[0-9]+[Berd]}} 1@{{[0-9]+[Berd]}} 2@{{[0-9]+B-phi}}
99 --- |
1010 define amdgpu_kernel void @test0() { ret void }
1111 ...
11 # https://bugs.llvm.org/show_bug.cgi?id=33620
22
33 ---
4 # This would assert due to the empty live interval created for %vreg9
4 # This would assert due to the empty live interval created for %9
55 # on the last S_NOP with an undef subreg use.
66
77 # CHECK-LABEL: name: expecting_non_empty_interval
11 # REQUIRES: asserts
22
33 # CHECK: INTERVALS
4 # CHECK: vreg0
4 # CHECK: %0
55 # CHECK-LABEL: Machine code for function test0:
66
77 # CHECK: INTERVALS
8 # CHECK: vreg0
8 # CHECK: %0
99 # CHECK-LABEL: Machine code for function test1:
1010
1111 --- |
44 ; This test calls shrinkToUses with an early-clobber redefined live range during
55 ; spilling.
66 ;
7 ; Shrink: %vreg47,1.158257e-02 = [384r,400e:0)[400e,420r:1) 0@384r 1@400e
7 ; Shrink: %47,1.158257e-02 = [384r,400e:0)[400e,420r:1) 0@384r 1@400e
88 ;
99 ; The early-clobber instruction is an str:
1010 ;
11 ; %vreg12 = t2STR_PRE %vreg6, %vreg12, 32, pred:14, pred:%noreg
11 ; %12 = t2STR_PRE %6, %12, 32, pred:14, pred:%noreg
1212 ;
1313 ; This tests that shrinkToUses handles the EC redef correctly.
1414
118118 ; CHECK-CFG-DAG: t2B
119119
120120 ; CHECK-CFG-DAG: BB#2
121 ; CHECK-CFG-DAG: tCMPi8 %vreg{{[0-9]}}, 0
121 ; CHECK-CFG-DAG: tCMPi8 %{{[0-9]}}, 0
122122 ; CHECK-CFG-DAG: t2Bcc
123123
124124 ; CHECK-CFG-DAG: BB#4
6060
6161 ; CHECK: insert_elem
6262 ; This test has a sub-register copy with a kill flag:
63 ; %vreg6:ssub_3 = COPY %vreg6:ssub_2; QPR_VFP2:%vreg6
63 ; %6:ssub_3 = COPY %6:ssub_2; QPR_VFP2:%6
6464 ; The rewriter must do something sensible with that, or the scavenger crashes.
6565 define void @insert_elem() nounwind {
6666 entry:
3232 ; This case was a crasher in constrainLocalCopy.
3333 ; The problem was the t2LDR_PRE defining both the global and local lrg.
3434 ; CHECK-LABEL: *** Final schedule for BB#5 ***
35 ; CHECK: %[[R4:vreg[0-9]+]], %[[R1:vreg[0-9]+]] = t2LDR_PRE %[[R1]]
36 ; CHECK: %vreg{{[0-9]+}} = COPY %[[R1]]
37 ; CHECK: %vreg{{[0-9]+}} = COPY %[[R4]]
35 ; CHECK: %[[R4:[0-9]+]], %[[R1:[0-9]+]] = t2LDR_PRE %[[R1]]
36 ; CHECK: %{{[0-9]+}} = COPY %[[R1]]
37 ; CHECK: %{{[0-9]+}} = COPY %[[R4]]
3838 ; CHECK-LABEL: MACHINEINSTRS
3939 %struct.rtx_def = type { [4 x i8], [1 x %union.rtunion_def] }
4040 %union.rtunion_def = type { i64 }
3636 }
3737 #
3838 # CHECK: ********** MI Scheduling **********
39 # CHECK: SU(2): %vreg2 = t2MOVi32imm ; rGPR:%vreg2
39 # CHECK: SU(2): %2 = t2MOVi32imm ; rGPR:%2
4040 # CHECK_A9: Latency : 2
4141 # CHECK_SWIFT: Latency : 2
4242 # CHECK_R52: Latency : 2
4343 #
44 # CHECK: SU(3): %vreg3 = t2LDRi12 %vreg2, 0, pred:14, pred:%noreg; mem:LD4[@g1](dereferenceable) rGPR:%vreg3,%vreg2
44 # CHECK: SU(3): %3 = t2LDRi12 %2, 0, pred:14, pred:%noreg; mem:LD4[@g1](dereferenceable) rGPR:%3,%2
4545 # CHECK_A9: Latency : 1
4646 # CHECK_SWIFT: Latency : 3
4747 # CHECK_R52: Latency : 4
4848 #
49 # CHECK : SU(6): %vreg6 = t2ADDrr %vreg3, %vreg3, pred:14, pred:%noreg, opt:%noreg; rGPR:%vreg6,%vreg3,%vreg3
49 # CHECK : SU(6): %6 = t2ADDrr %3, %3, pred:14, pred:%noreg, opt:%noreg; rGPR:%6,%3,%3
5050 # CHECK_A9: Latency : 1
5151 # CHECK_SWIFT: Latency : 1
5252 # CHECK_R52: Latency : 3
5353
54 # CHECK: SU(7): %vreg7 = t2SDIV %vreg6, %vreg5, pred:14, pred:%noreg; rGPR:%vreg7,%vreg6,%vreg5
54 # CHECK: SU(7): %7 = t2SDIV %6, %5, pred:14, pred:%noreg; rGPR:%7,%6,%5
5555 # CHECK_A9: Latency : 0
5656 # CHECK_SWIFT: Latency : 14
5757 # CHECK_R52: Latency : 8
5858
59 # CHECK: SU(8): t2STRi12 %vreg7, %vreg2, 0, pred:14, pred:%noreg; mem:ST4[@g1] rGPR:%vreg7,%vreg2
59 # CHECK: SU(8): t2STRi12 %7, %2, 0, pred:14, pred:%noreg; mem:ST4[@g1] rGPR:%7,%2
6060 # CHECK_A9: Latency : 1
6161 # CHECK_SWIFT: Latency : 0
6262 # CHECK_R52: Latency : 4
6363 #
64 # CHECK: SU(9): %vreg8 = t2SMULBB %vreg1, %vreg1, pred:14, pred:%noreg; rGPR:%vreg8,%vreg1,%vreg1
64 # CHECK: SU(9): %8 = t2SMULBB %1, %1, pred:14, pred:%noreg; rGPR:%8,%1,%1
6565 # CHECK_A9: Latency : 2
6666 # CHECK_SWIFT: Latency : 4
6767 # CHECK_R52: Latency : 4
6868 #
69 # CHECK: SU(10): %vreg9 = t2SMLABB %vreg0, %vreg0, %vreg8, pred:14, pred:%noreg; rGPR:%vreg9,%vreg0,%vreg0,%vreg8
69 # CHECK: SU(10): %9 = t2SMLABB %0, %0, %8, pred:14, pred:%noreg; rGPR:%9,%0,%0,%8
7070 # CHECK_A9: Latency : 2
7171 # CHECK_SWIFT: Latency : 4
7272 # CHECK_R52: Latency : 4
7373 #
74 # CHECK: SU(11): %vreg10 = t2UXTH %vreg9, 0, pred:14, pred:%noreg; rGPR:%vreg10,%vreg9
74 # CHECK: SU(11): %10 = t2UXTH %9, 0, pred:14, pred:%noreg; rGPR:%10,%9
7575 # CHECK_A9: Latency : 1
7676 # CHECK_SWIFT: Latency : 1
7777 # CHECK_R52: Latency : 3
7878 #
79 # CHECK: SU(12): %vreg11 = t2MUL %vreg10, %vreg7, pred:14, pred:%noreg; rGPR:%vreg11,%vreg10,%vreg7
79 # CHECK: SU(12): %11 = t2MUL %10, %7, pred:14, pred:%noreg; rGPR:%11,%10,%7
8080 # CHECK_A9: Latency : 2
8181 # CHECK_SWIFT: Latency : 4
8282 # CHECK_R52: Latency : 4
8383 #
84 # CHECK: SU(13): %vreg12 = t2MLA %vreg11, %vreg11, %vreg11, pred:14, pred:%noreg; rGPR:%vreg12,%vreg11,%vreg11,%vreg11
84 # CHECK: SU(13): %12 = t2MLA %11, %11, %11, pred:14, pred:%noreg; rGPR:%12,%11,%11,%11
8585 # CHECK_A9: Latency : 2
8686 # CHECK_SWIFT: Latency : 4
8787 # CHECK_R52: Latency : 4
8888 #
89 # CHECK: SU(14): %vreg13, %vreg14 = t2UMULL %vreg12, %vreg12, pred:14, pred:%noreg; rGPR:%vreg13,%vreg14,%vreg12,%vreg12
89 # CHECK: SU(14): %13, %14 = t2UMULL %12, %12, pred:14, pred:%noreg; rGPR:%13,%14,%12,%12
9090 # CHECK_A9: Latency : 3
9191 # CHECK_SWIFT: Latency : 5
9292 # CHECK_R52: Latency : 4
9393 #
94 # CHECK: SU(18): %vreg19, %vreg20 = t2UMLAL %vreg12, %vreg12, %vreg19, %vreg20, pred:14, pred:%noreg; rGPR:%vreg19,%vreg20,%vreg12,%vreg12,%vreg20
94 # CHECK: SU(18): %19, %20 = t2UMLAL %12, %12, %19, %20, pred:14, pred:%noreg; rGPR:%19,%20,%12,%12,%20
9595 # CHECK_A9: Latency : 3
9696 # CHECK_SWIFT: Latency : 7
9797 # CHECK_R52: Latency : 4
2727 }
2828
2929 # CHECK: ********** MI Scheduling **********
30 # CHECK: SU(2): %vreg2 = SMULBB %vreg1, %vreg1, pred:14, pred:%noreg; GPR:%vreg2,%vreg1,%vreg1
30 # CHECK: SU(2): %2 = SMULBB %1, %1, pred:14, pred:%noreg; GPR:%2,%1,%1
3131 # CHECK_A9: Latency : 2
3232 # CHECK_SWIFT: Latency : 4
3333 # CHECK_R52: Latency : 4
3434 #
35 # CHECK: SU(3): %vreg3 = SMLABB %vreg0, %vreg0, %vreg2, pred:14, pred:%noreg; GPRnopc:%vreg3,%vreg0,%vreg0 GPR:%vreg2
35 # CHECK: SU(3): %3 = SMLABB %0, %0, %2, pred:14, pred:%noreg; GPRnopc:%3,%0,%0 GPR:%2
3636 # CHECK_A9: Latency : 2
3737 # CHECK_SWIFT: Latency : 4
3838 # CHECK_R52: Latency : 4
3939 #
40 # CHECK: SU(4): %vreg4 = UXTH %vreg3, 0, pred:14, pred:%noreg; GPRnopc:%vreg4,%vreg3
40 # CHECK: SU(4): %4 = UXTH %3, 0, pred:14, pred:%noreg; GPRnopc:%4,%3
4141 # CHECK_A9: Latency : 1
4242 # CHECK_SWIFT: Latency : 1
4343 # CHECK_R52: Latency : 3
4444 #
45 # CHECK: SU(5): %vreg5 = MUL %vreg4, %vreg4, pred:14, pred:%noreg, opt:%noreg; GPRnopc:%vreg5,%vreg4,%vreg4
45 # CHECK: SU(5): %5 = MUL %4, %4, pred:14, pred:%noreg, opt:%noreg; GPRnopc:%5,%4,%4
4646 # CHECK_A9: Latency : 2
4747 # CHECK_SWIFT: Latency : 4
4848 # CHECK_R52: Latency : 4
4949 #
50 # CHECK: SU(6): %vreg6 = MLA %vreg5, %vreg5, %vreg5, pred:14, pred:%noreg, opt:%noreg; GPRnopc:%vreg6,%vreg5,%vreg5,%vreg5
50 # CHECK: SU(6): %6 = MLA %5, %5, %5, pred:14, pred:%noreg, opt:%noreg; GPRnopc:%6,%5,%5,%5
5151 # CHECK_A9: Latency : 2
5252 # CHECK_SWIFT: Latency : 4
5353 # CHECK_R52: Latency : 4
5454 #
55 # CHECK: SU(7): %vreg7, %vreg8 = UMULL %vreg6, %vreg6, pred:14, pred:%noreg, opt:%noreg; GPRnopc:%vreg7,%vreg8,%vreg6,%vreg6
55 # CHECK: SU(7): %7, %8 = UMULL %6, %6, pred:14, pred:%noreg, opt:%noreg; GPRnopc:%7,%8,%6,%6
5656 # CHECK_A9: Latency : 3
5757 # CHECK_SWIFT: Latency : 5
5858 # CHECK_R52: Latency : 4
5959 #
60 # CHECK: SU(11): %vreg13, %vreg14 = UMLAL %vreg6, %vreg6, %vreg13, %vreg14, pred:14, pred:%noreg, opt:%noreg; GPR:%vreg13 GPRnopc:%vreg14,%vreg6,%vreg6
60 # CHECK: SU(11): %13, %14 = UMLAL %6, %6, %13, %14, pred:14, pred:%noreg, opt:%noreg; GPR:%13 GPRnopc:%14,%6,%6
6161 # CHECK_SWIFT: Latency : 7
6262 # CHECK_A9: Latency : 3
6363 # CHECK_R52: Latency : 4
1919
2020 # CHECK: ********** MI Scheduling **********
2121 # CHECK: ScheduleDAGMILive::schedule starting
22 # CHECK: SU(1): %vreg1 = VLD4d8Pseudo %vreg0, 8, pred:14, pred:%noreg; mem:LD32[%A](align=8) QQPR:%vreg1 GPR:%vreg0
22 # CHECK: SU(1): %1 = VLD4d8Pseudo %0, 8, pred:14, pred:%noreg; mem:LD32[%A](align=8) QQPR:%1 GPR:%0
2323 # CHECK: Latency : 8
2424 # CHECK: Single Issue : true;
25 # CHECK: SU(2): %vreg4 = VADDv8i8 %vreg1:dsub_0, %vreg1:dsub_1, pred:14, pred:%noreg; DPR:%vreg4 QQPR:%vreg1
25 # CHECK: SU(2): %4 = VADDv8i8 %1:dsub_0, %1:dsub_1, pred:14, pred:%noreg; DPR:%4 QQPR:%1
2626 # CHECK: Latency : 5
2727 # CHECK: Single Issue : false;
28 # CHECK: SU(3): %vreg5, %vreg6 = VMOVRRD %vreg4, pred:14, pred:%noreg; GPR:%vreg5,%vreg6 DPR:%vreg4
28 # CHECK: SU(3): %5, %6 = VMOVRRD %4, pred:14, pred:%noreg; GPR:%5,%6 DPR:%4
2929 # CHECK: Latency : 4
3030 # CHECK: Single Issue : false;
3131
32 # TOPDOWN: Scheduling SU(1) %vreg1 = VLD4d8Pseudo
32 # TOPDOWN: Scheduling SU(1) %1 = VLD4d8Pseudo
3333 # TOPDOWN: Bump cycle to end group
34 # TOPDOWN: Scheduling SU(2) %vreg4 = VADDv8i8
34 # TOPDOWN: Scheduling SU(2) %4 = VADDv8i8
3535
36 # BOTTOMUP: Scheduling SU(2) %vreg4 = VADDv8i8
37 # BOTTOMUP: Scheduling SU(1) %vreg1 = VLD4d8Pseudo
36 # BOTTOMUP: Scheduling SU(2) %4 = VADDv8i8
37 # BOTTOMUP: Scheduling SU(1) %1 = VLD4d8Pseudo
3838 # BOTTOMUP: Bump cycle to begin group
3939
4040 ...
33 ;
44 ; The vector %v2 is built like this:
55 ;
6 ; %vreg6:ssub_1 = ...
7 ; %vreg6:ssub_0 = VLDRS , 0, pred:14, pred:%noreg; mem:LD4[ConstantPool] DPR_VFP2:%vreg6
6 ; %6:ssub_1 = ...
7 ; %6:ssub_0 = VLDRS , 0, pred:14, pred:%noreg; mem:LD4[ConstantPool] DPR_VFP2:%6
88 ;
9 ; When %vreg6 spills, the VLDRS constant pool load cannot be rematerialized
9 ; When %6 spills, the VLDRS constant pool load cannot be rematerialized
1010 ; since it implicitly reads the ssub_1 sub-register.
1111 ;
1212 ; CHECK: f1
3030 ; because the bits are undef, we should rematerialize. The vector is now built
3131 ; like this:
3232 ;
33 ; %vreg2:ssub_0 = VLDRS , 0, pred:14, pred:%noreg, %vreg2; mem:LD4[ConstantPool]
33 ; %2:ssub_0 = VLDRS , 0, pred:14, pred:%noreg, %2; mem:LD4[ConstantPool]
3434 ;
3535 ; The extra operand indicates that the instruction fully defines the
3636 ; virtual register. It doesn't read the old value.
1010 ;
1111 ; BB#2: derived from LLVM BB %finish
1212 ; Predecessors according to CFG: BB#0 BB#1
13 ; %vreg0 = PHI %vreg3, , %vreg5,
14 ; %vreg7 = LDIRdK 2
15 ; %vreg8 = LDIRdK 1
16 ; CPRdRr %vreg2, %vreg0, %SREG>
13 ; %0 = PHI %3, , %5, >
14 ; %7 = LDIRdK 2
15 ; %8 = LDIRdK 1
16 ; CPRdRr %2, %0, %SREG
1717 ; BREQk , %SREG
1818 ; Successors according to CFG: BB#5(?%) BB#6(?%)
1919 ;
66 ; UNREACHABLE executed at llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp:615!
77 ; This happened because after unrolling a loop with a ldd_circ instruction we
88 ; would have several TFCR and ldd_circ instruction sequences.
9 ; %vreg0 (CRRegs) = TFCR %vreg0 (IntRegs)
10 ; = ldd_circ( , , vreg0)
11 ; %vreg1 (CRRegs) = TFCR %vreg1 (IntRegs)
12 ; = ldd_circ( , , vreg0)
9 ; %0 (CRRegs) = TFCR %0 (IntRegs)
10 ; = ldd_circ( , , %0)
11 ; %1 (CRRegs) = TFCR %1 (IntRegs)
12 ; = ldd_circ( , , %0)
1313 ; The scheduler would move the CRRegs to the top of the loop. The allocator
1414 ; would try to spill the CRRegs after running out of them. We don't have code to
1515 ; spill CRRegs and the above assertion would be triggered.
22
33 # Check that coalesced registers are removed from live intervals.
44 #
5 # Check that vreg3 is coalesced into vreg4, and that after coalescing
5 # Check that %3 is coalesced into %4, and that after coalescing
66 # it is no longer in live intervals.
77
88 # CHECK-LABEL: After expand-condsets
99 # CHECK: INTERVALS
10 # CHECK-NOT: vreg3
10 # CHECK-NOT: %3
1111 # CHECK: MACHINEINSTRS
1212
1313
22
33 ; Check that the generated post-increment load has TBAA information.
44 ; CHECK-LABEL: Machine code for function fred:
5 ; CHECK: = V6_vL32b_pi %vreg{{[0-9]+}}, 64; mem:LD64[{{.*}}](tbaa=
5 ; CHECK: = V6_vL32b_pi %{{[0-9]+}}, 64; mem:LD64[{{.*}}](tbaa=
66
77 target triple = "hexagon"
88
3535 ; CHECK-LABEL: SU({{.*}}): SW_RI{{.*}}, 4,
3636 ; CHECK: # preds left : 2
3737 ; CHECK: # succs left : 0
38 ; CHECK-LABEL: SU({{.*}}): %vreg{{.*}} = LDW_RI{{.*}}, 12,
38 ; CHECK-LABEL: SU({{.*}}): %{{.*}} = LDW_RI{{.*}}, 12,
3939 ; CHECK: # preds left : 1
4040 ; CHECK: # succs left : 4
4141 ; CHECK-LABEL: SU({{.*}}): STH_RI{{.*}}, 10,
2121 ...
2222 ---
2323 # CHECK-LABEL: name: test_subreg_spill_fold2
24 # Similar to test_subreg_spill_fold, but with a vreg0 register class not containing %WZR.
24 # Similar to test_subreg_spill_fold, but with a %0 register class not containing %WZR.
2525 name: test_subreg_spill_fold2
2626 registers:
2727 - { id: 0, class: gpr64sp }
1313
1414 ; CHECK: ********** Function: foo
1515 ; CHECK: ********** FAST REGISTER ALLOCATION **********
16 ; CHECK: %x3 = COPY %vreg
17 ; CHECK-NEXT: %x4 = COPY %vreg
16 ; CHECK: %x3 = COPY %{{[0-9]+}}
17 ; CHECK-NEXT: %x4 = COPY %{{[0-9]+}}
1818 ; CHECK-NEXT: BLR
0 ; RUN: llc < %s -O0 -verify-machineinstrs -mtriple=wasm32-unknown-unknown-wasm | FileCheck %s
11
22 ; CHECK: BB#0
3 ; CHECK: #DEBUG_VALUE: usage:self <- %vreg4
3 ; CHECK: #DEBUG_VALUE: usage:self <- %4
44 ; CHECK: BB#1
55 ; CHECK: DW_TAG_variable
66 source_filename = "test/CodeGen/WebAssembly/dbgvalue.ll"
11 ;
22 ; Test RegistersDefinedFromSameValue. We have multiple copies of the same vreg:
33 ; while.body85.i:
4 ; vreg1 = copy vreg2
5 ; vreg2 = add
4 ; %1 = copy %2
5 ; %2 = add
66 ; critical edge from land.lhs.true.i -> if.end117.i:
7 ; vreg27 = vreg2
7 ; %27 = %2
88 ; critical edge from land.lhs.true103.i -> if.end117.i:
9 ; vreg27 = vreg2
9 ; %27 = %2
1010 ; if.then108.i:
11 ; vreg27 = vreg1
11 ; %27 = %1
1212 ;
1313 ; Prior to fixing PR10920 401.bzip miscompile, the coalescer would
14 ; consider vreg1 and vreg27 to be copies of the same value. It would
14 ; consider %1 and %27 to be copies of the same value. It would
1515 ; then remove one of the critical edge copes, which cannot safely be removed.
1616
1717 ; There are two obvious ways the register-allocator could go here, either
77 ; the fallback path.
88
99 ; Check that we fallback on invoke translation failures.
10 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: G_STORE %vreg1, %vreg0; mem:ST10[%ptr](align=16) (in function: test_x86_fp80_dump)
10 ; FALLBACK-WITH-REPORT-ERR: remark: :0:0: unable to legalize instruction: G_STORE %1, %0; mem:ST10[%ptr](align=16) (in function: test_x86_fp80_dump)
1111 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for test_x86_fp80_dump
1212 ; FALLBACK-WITH-REPORT-OUT-LABEL: test_x86_fp80_dump:
1313 define void @test_x86_fp80_dump(x86_fp80* %ptr){
226226
227227 ; The following test failed because llvm had a bug where a structure like:
228228 ;
229 ; %vreg12 = CMOV_GR8 %vreg7, %vreg11 ... (lt)
230 ; %vreg13 = CMOV_GR8 %vreg12, %vreg11 ... (gt)
229 ; %12 = CMOV_GR8 %7, %11 ... (lt)
230 ; %13 = CMOV_GR8 %12, %11 ... (gt)
231231 ;
232232 ; was lowered to:
233233 ;
238238 ; JG_1 BB#9
239239 ; BB#8:
240240 ; BB#9:
241 ; vreg12 = phi(vreg7, BB#8, vreg11, BB#0, vreg12, BB#7)
242 ; vreg13 = COPY vreg12
243 ; Which was invalid as %vreg12 is not the same value as %vreg13
241 ; %12 = phi(%7, BB#8, %11, BB#0, %12, BB#7)
242 ; %13 = COPY %12
243 ; Which was invalid as %12 is not the same value as %13
244244
245245 ; CHECK-LABEL: no_cascade_opt:
246246 ; CMOV-DAG: cmpl %edx, %esi
33
44 ; This test case has a sub-register join followed by a remat:
55 ;
6 ; 256L %vreg2 = COPY %vreg7:sub_32bit; GR32:%vreg2 GR64:%vreg7
7 ; Considering merging %vreg2 with %vreg7:sub_32bit
6 ; 256L %2 = COPY %7:sub_32bit; GR32:%2 GR64:%7
7 ; Considering merging %2 with %7:sub_32bit
88 ; Cross-class to GR64.
9 ; RHS = %vreg2 = [256d,272d:0) 0@256d
10 ; LHS = %vreg7 = [208d,256d:0)[304L,480L:0) 0@208d
11 ; updated: 272L %vreg0 = COPY %vreg7:sub_32bit; GR32:%vreg0 GR64:%vreg7
12 ; Joined. Result = %vreg7 = [208d,272d:0)[304L,480L:0) 0@208d
9 ; RHS = %2 = [256d,272d:0) 0@256d
10 ; LHS = %7 = [208d,256d:0)[304L,480L:0) 0@208d
11 ; updated: 272L %0 = COPY %7:sub_32bit; GR32:%0 GR64:%7
12 ; Joined. Result = %7 = [208d,272d:0)[304L,480L:0) 0@208d
1313 ;
14 ; 272L %vreg10:sub_32bit = COPY %vreg7:sub_32bit, %vreg10; GR64:%vreg10,%vreg7
15 ; Considering merging %vreg7 with %vreg10
16 ; RHS = %vreg7 = [208d,272d:0)[304L,480L:0) 0@208d
17 ; LHS = %vreg10 = [16d,64L:2)[64L,160L:1)[192L,240L:1)[272d,304L:3)[304L,352d:1)[352d,400d:0)[400d,400S:4) 0@352d 1@64L-phidef 2@16d-phikill 3@272d-phikill 4@400d
18 ; Remat: %vreg10 = MOV64r0 %vreg10, %eflags, %vreg10; GR64:%vreg10
19 ; Shrink: %vreg7 = [208d,272d:0)[304L,480L:0) 0@208d
14 ; 272L %10:sub_32bit = COPY %7:sub_32bit, %10; GR64:%10,%7
15 ; Considering merging %7 with %10
16 ; RHS = %7 = [208d,272d:0)[304L,480L:0) 0@208d
17 ; LHS = %10 = [16d,64L:2)[64L,160L:1)[192L,240L:1)[272d,304L:3)[304L,352d:1)[352d,400d:0)[400d,400S:4) 0@352d 1@64L-phidef 2@16d-phikill 3@272d-phikill 4@400d
18 ; Remat: %10 = MOV64r0 %10, %eflags, %10; GR64:%10
19 ; Shrink: %7 = [208d,272d:0)[304L,480L:0) 0@208d
2020 ; live-in at 240L
2121 ; live-in at 416L
2222 ; live-in at 320L
2323 ; live-in at 304L
24 ; Shrunk: %vreg7 = [208d,256d:0)[304L,480L:0) 0@208d
24 ; Shrunk: %7 = [208d,256d:0)[304L,480L:0) 0@208d
2525 ;
2626 ; The COPY at 256L is rewritten as a partial def, and that would artificially
27 ; extend the live range of %vreg7 to end at 256d. When the joined copy is
27 ; extend the live range of %7 to end at 256d. When the joined copy is
2828 ; removed, -verify-coalescing complains about the dangling kill.
2929 ;
3030 ;
480480
481481 ; Check coalescing of IMPLICIT_DEF instructions:
482482 ;
483 ; %vreg1 = IMPLICIT_DEF
484 ; %vreg2 = MOV32r0
483 ; %1 = IMPLICIT_DEF
484 ; %2 = MOV32r0
485485 ;
486 ; When coalescing %vreg1 and %vreg2, the IMPLICIT_DEF instruction should be
486 ; When coalescing %1 and %2, the IMPLICIT_DEF instruction should be
487487 ; erased along with its value number.
488488 ;
489489 define void @rdar12474033() nounwind ssp {
77 ; %edx has a live range into the function and is used by the DIV32r.
88 ;
99 ; Here sinking a kill + dead def:
10 ; 144B -> 180B: DIV32r %vreg4, %eax, %edx, %EFLAGS, %eax, %edx
11 ; %vreg4: [48r,144r:0) 0@48r
10 ; 144B -> 180B: DIV32r %4, %eax, %edx, %EFLAGS, %eax, %edx
11 ; %4: [48r,144r:0) 0@48r
1212 ; --> [48r,180r:0) 0@48r
1313 ; DH: [0B,16r:0)[128r,144r:2)[144r,144d:1) 0@0B-phi 1@144r 2@128r
1414 ; --> [0B,16r:0)[128r,180r:2)[180r,180d:1) 0@0B-phi 1@180r 2@128r
2424 }
2525
2626 ; Same as above, but moving a kill + live def:
27 ; 144B -> 180B: DIV32r %vreg4, %eax, %edx, %EFLAGS, %eax, %edx
28 ; %vreg4: [48r,144r:0) 0@48r
27 ; 144B -> 180B: DIV32r %4, %eax, %edx, %EFLAGS, %eax, %edx
28 ; %4: [48r,144r:0) 0@48r
2929 ; --> [48r,180r:0) 0@48r
3030 ; DH: [0B,16r:0)[128r,144r:2)[144r,184r:1) 0@0B-phi 1@144r 2@128r
3131 ; --> [0B,16r:0)[128r,180r:2)[180r,184r:1) 0@0B-phi 1@180r 2@128r
4040 ret i32 %add
4141 }
4242
43 ; Moving a use below the existing kill (%vreg5):
44 ; Moving a tied virtual register def (%vreg11):
43 ; Moving a use below the existing kill (%5):
44 ; Moving a tied virtual register def (%11):
4545 ;
46 ; 96B -> 120B: %vreg11 = SUB32rr %vreg11, %vreg5
47 ; %vreg11: [80r,96r:1)[96r,144r:0) 0@96r 1@80r
46 ; 96B -> 120B: %11 = SUB32rr %11, %5
47 ; %11: [80r,96r:1)[96r,144r:0) 0@96r 1@80r
4848 ; --> [80r,120r:1)[120r,144r:0) 0@120r 1@80r
49 ; %vreg5: [16r,112r:0) 0@16r
49 ; %5: [16r,112r:0) 0@16r
5050 ; --> [16r,120r:0) 0@16r
5151 ;
5252 define i32 @f3(i32 %a, i32 %b) nounwind uwtable readnone ssp {
44 define void @func() { ret void }
55 ...
66 ---
7 # Liveness calculation should detect that we do not have a definition for vreg0
8 # on all paths; In this example a def for vreg0 is missing when jumping from
7 # Liveness calculation should detect that we do not have a definition for %0
8 # on all paths; In this example a def for %0 is missing when jumping from
99 # bb.0 to bb.3.
1010 #
11 # CHECK: Use of %vreg0 does not have a corresponding definition on every path
11 # CHECK: Use of %0 does not have a corresponding definition on every path
1212 # CHECK: ERROR: Use not jointly dominated by defs.
1313 name: func
1414 registers:
6161
6262
6363 ; RAFast would forget to add a super-register when rewriting:
64 ; %vreg10:sub_32bit = COPY %R9D
64 ; %10:sub_32bit = COPY %R9D
6565 ; This trips up the machine code verifier.
6666 define void @autogen_SD24657(i8*, i32*, i64*, i32, i64, i8) {
6767 BB:
99 ;
1010 ; CHECK: *** Final schedule for BB#1 ***
1111 ; CHECK: %eax = COPY
12 ; CHECK-NEXT: MUL32r %vreg{{[0-9]+}}, %eax, %edx, %eflags, %eax;
12 ; CHECK-NEXT: MUL32r %{{[0-9]+}}, %eax, %edx, %eflags, %eax;
1313 ; CHECK-NEXT: COPY %e{{[ad]}}x
1414 ; CHECK-NEXT: COPY %e{{[ad]}}x
1515 ; CHECK: DIVSSrm
4040
4141 ; This test case extracts a sub_8bit_hi sub-register:
4242 ;
43 ; %vreg2 = COPY %vreg1:sub_8bit_hi; GR8:%vreg2 GR64_ABCD:%vreg1
44 ; TEST8ri %vreg2, 1, %eflags; GR8:%vreg2
43 ; %2 = COPY %1:sub_8bit_hi; GR8:%2 GR64_ABCD:%1
44 ; TEST8ri %2, 1, %eflags; GR8:%2
4545 ;
46 ; %vreg2 must be constrained to GR8_NOREX, or the COPY could become impossible.
46 ; %2 must be constrained to GR8_NOREX, or the COPY could become impossible.
4747 ;
4848 ; PR11088
4949
0 ; RUN: llc < %s -verify-machineinstrs -mtriple=i386-apple-darwin -mcpu=corei7 | FileCheck %s
11 ; rdar://5571034
22
3 ; This requires physreg joining, %vreg13 is live everywhere:
4 ; 304L %cl = COPY %vreg13:sub_8bit; GR32_ABCD:%vreg13
5 ; 320L %vreg15 = COPY %vreg19; GR32:%vreg15 GR32_NOSP:%vreg19
6 ; 336L %vreg15 = SAR32rCL %vreg15, %eflags, %cl; GR32:%vreg15
3 ; This requires physreg joining, %13 is live everywhere:
4 ; 304L %cl = COPY %13:sub_8bit; GR32_ABCD:%13
5 ; 320L %15 = COPY %19; GR32:%15 GR32_NOSP:%19
6 ; 336L %15 = SAR32rCL %15, %eflags, %cl; GR32:%15
77
88 define void @foo(i32* nocapture %quadrant, i32* nocapture %ptr, i32 %bbSize, i32 %bbStart, i32 %shifts) nounwind ssp {
99 ; CHECK-LABEL: foo:
147147
148148 # Let's verify that the slot index ranges for the unused variables argc/argv,
149149 # connected to physical regs %edi and %rsi, does not overlap with the ranges
150 # for %vreg2 and %vreg3. The register allocator is actually allocating the
150 # for %2 and %3. The register allocator is actually allocating the
151151 # virtual registers # to %edi and %esi, so the ranges for argc/argv should
152152 # not cover the whole BB.
153153 #
156156 # CHECKDBG-NEXT: [0B;0e):0 BB#0-160B
157157 # CHECKDBG-NEXT: !"argv,5" [0B;0e):0 Loc0=%rsi
158158 # CHECKDBG-NEXT: [0B;0e):0 BB#0-160B
159 # CHECKDBG-NEXT: !"a0,7" [16r;64r):0 Loc0=%vreg2
159 # CHECKDBG-NEXT: !"a0,7" [16r;64r):0 Loc0=%2
160160 # CHECKDBG-NEXT: [16r;64r):0 BB#0-160B
161 # CHECKDBG-NEXT: !"a1,8" [32r;80r):0 Loc0=%vreg3
161 # CHECKDBG-NEXT: !"a1,8" [32r;80r):0 Loc0=%3
162162 # CHECKDBG-NEXT: [32r;80r):0 BB#0-160B