llvm.org GIT mirror: llvm / 20de1ea

[CodeGen] Move MacroFusion to the target

This patch moves MacroFusion, the class for scheduling adjacent instructions, into the target. On AArch64, it also expands fusion to all instruction pairs in a scheduling block, beyond just the predecessors of the branch at the end.

Differential revision: https://reviews.llvm.org/D28489

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@293737 91177308-0d34-0410-b5e6-96231b3b80d8

Evandro Menezes, 3 years ago

16 changed files with 538 additions and 345 deletions.
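The shape of the change, before the hunks: the generic createMacroFusionDAGMutation() factory and the TargetInstrInfo::shouldScheduleAdjacent() hook are deleted, and each target gains its own MacroFusion source file with a private predicate and factory that its pass config registers. A condensed sketch of the call sites, assembled from the hunks below (not a complete listing):

    // Before: generic CodeGen owned the mutation and queried the target hook.
    DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));

    // After: each target owns both the predicate and the mutation.
    DAG->addMutation(createAArch64MacroFusionDAGMutation());  // AArch64
    DAG->addMutation(createX86MacroFusionDAGMutation());      // X86

Either factory returns nullptr when its fusion flag (aarch64-misched-fusion, x86-misched-fusion) is off, so the call sites stay unconditional; addMutation() is expected to discard a null mutation.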
10321032 const TargetRegisterInfo *TRI);
10331033
10341034 std::unique_ptr<ScheduleDAGMutation>
1035 createMacroFusionDAGMutation(const TargetInstrInfo *TII);
1036
1037 std::unique_ptr<ScheduleDAGMutation>
10381035 createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
10391036 const TargetRegisterInfo *TRI);
10401037
10691069 llvm_unreachable("target did not implement shouldClusterMemOps()");
10701070 }
10711071
1072 /// Can this target fuse the given instructions if they are scheduled
1073 /// adjacent? Note that you have to add:
1074 /// DAG.addMutation(createMacroFusionDAGMutation());
1075 /// to TargetPassConfig::createMachineScheduler() to have an effect.
1076 virtual bool shouldScheduleAdjacent(const MachineInstr &First,
1077 const MachineInstr &Second) const {
1078 llvm_unreachable("target did not implement shouldScheduleAdjacent()");
1079 }
1080
10811072 /// Reverses the branch condition of the specified condition list,
10821073 /// returning false on success and true if it cannot be reversed.
10831074 virtual
7878 static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,
7979 cl::desc("Enable memop clustering."),
8080 cl::init(true));
81
82 // Experimental heuristics
83 static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden,
84 cl::desc("Enable scheduling for macro fusion."), cl::init(true));
8581
8682 static cl::opt<bool> VerifyScheduling("verify-misched", cl::Hidden,
8783 cl::desc("Verify machine instrs before and after machine scheduling"));
15431539 }
15441540
15451541 //===----------------------------------------------------------------------===//
1546 // MacroFusion - DAG post-processing to encourage fusion of macro ops.
1547 //===----------------------------------------------------------------------===//
1548
1549 namespace {
1550 /// \brief Post-process the DAG to create cluster edges between instructions
1551 /// that may be fused by the processor into a single operation.
1552 class MacroFusion : public ScheduleDAGMutation {
1553 const TargetInstrInfo &TII;
1554 public:
1555 MacroFusion(const TargetInstrInfo &TII)
1556 : TII(TII) {}
1557
1558 void apply(ScheduleDAGInstrs *DAGInstrs) override;
1559 };
1560 } // anonymous
1561
1562 namespace llvm {
1563
1564 std::unique_ptr<ScheduleDAGMutation>
1565 createMacroFusionDAGMutation(const TargetInstrInfo *TII) {
1566 return EnableMacroFusion ? make_unique<MacroFusion>(*TII) : nullptr;
1567 }
1568
1569 } // namespace llvm
1570
1571 /// \brief Callback from DAG postProcessing to create cluster edges to encourage
1572 /// fused operations.
1573 void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
1574 ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
1575
1576 // For now, assume targets can only fuse with the branch.
1577 SUnit &ExitSU = DAG->ExitSU;
1578 MachineInstr *Branch = ExitSU.getInstr();
1579 if (!Branch)
1580 return;
1581
1582 for (SDep &PredDep : ExitSU.Preds) {
1583 if (PredDep.isWeak())
1584 continue;
1585 SUnit &SU = *PredDep.getSUnit();
1586 MachineInstr &Pred = *SU.getInstr();
1587 if (!TII.shouldScheduleAdjacent(Pred, *Branch))
1588 continue;
1589
1590 // Create a single weak edge from SU to ExitSU. The only effect is to cause
1591 // bottom-up scheduling to heavily prioritize the clustered SU. There is no
1592 // need to copy predecessor edges from ExitSU to SU, since top-down
1593 // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
1594 // of SU, we could create an artificial edge from the deepest root, but it
1595 // hasn't been needed yet.
1596 bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster));
1597 (void)Success;
1598 assert(Success && "No DAG nodes should be reachable from ExitSU");
1599
1600 // Adjust latency of data deps between the nodes.
1601 for (SDep &PredDep : ExitSU.Preds) {
1602 if (PredDep.getSUnit() == &SU)
1603 PredDep.setLatency(0);
1604 }
1605 for (SDep &SuccDep : SU.Succs) {
1606 if (SuccDep.getSUnit() == &ExitSU)
1607 SuccDep.setLatency(0);
1608 }
1609
1610 DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n");
1611 break;
1612 }
1613 }
1614
1615 //===----------------------------------------------------------------------===//
16161542 // CopyConstrain - DAG post-processing to encourage copy elimination.
16171543 //===----------------------------------------------------------------------===//
16181544
19111911 // The caller should already have ordered First/SecondLdSt by offset.
19121912 assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
19131913 return Offset1 + 1 == Offset2;
1914 }
1915
1916 bool AArch64InstrInfo::shouldScheduleAdjacent(
1917 const MachineInstr &First, const MachineInstr &Second) const {
1918 if (Subtarget.hasArithmeticBccFusion()) {
1919 // Fuse CMN, CMP, TST followed by Bcc.
1920 unsigned SecondOpcode = Second.getOpcode();
1921 if (SecondOpcode == AArch64::Bcc) {
1922 switch (First.getOpcode()) {
1923 default:
1924 return false;
1925 case AArch64::ADDSWri:
1926 case AArch64::ADDSWrr:
1927 case AArch64::ADDSXri:
1928 case AArch64::ADDSXrr:
1929 case AArch64::ANDSWri:
1930 case AArch64::ANDSWrr:
1931 case AArch64::ANDSXri:
1932 case AArch64::ANDSXrr:
1933 case AArch64::SUBSWri:
1934 case AArch64::SUBSWrr:
1935 case AArch64::SUBSXri:
1936 case AArch64::SUBSXrr:
1937 case AArch64::BICSWrr:
1938 case AArch64::BICSXrr:
1939 return true;
1940 case AArch64::ADDSWrs:
1941 case AArch64::ADDSXrs:
1942 case AArch64::ANDSWrs:
1943 case AArch64::ANDSXrs:
1944 case AArch64::SUBSWrs:
1945 case AArch64::SUBSXrs:
1946 case AArch64::BICSWrs:
1947 case AArch64::BICSXrs:
1948 // Shift value can be 0, making these behave like the "rr" variant...
1949 return !hasShiftedReg(Second);
1950 }
1951 }
1952 }
1953 if (Subtarget.hasArithmeticCbzFusion()) {
1954 // Fuse ALU operations followed by CBZ/CBNZ.
1955 unsigned SecondOpcode = Second.getOpcode();
1956 if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
1957 SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
1958 switch (First.getOpcode()) {
1959 default:
1960 return false;
1961 case AArch64::ADDWri:
1962 case AArch64::ADDWrr:
1963 case AArch64::ADDXri:
1964 case AArch64::ADDXrr:
1965 case AArch64::ANDWri:
1966 case AArch64::ANDWrr:
1967 case AArch64::ANDXri:
1968 case AArch64::ANDXrr:
1969 case AArch64::EORWri:
1970 case AArch64::EORWrr:
1971 case AArch64::EORXri:
1972 case AArch64::EORXrr:
1973 case AArch64::ORRWri:
1974 case AArch64::ORRWrr:
1975 case AArch64::ORRXri:
1976 case AArch64::ORRXrr:
1977 case AArch64::SUBWri:
1978 case AArch64::SUBWrr:
1979 case AArch64::SUBXri:
1980 case AArch64::SUBXrr:
1981 return true;
1982 case AArch64::ADDWrs:
1983 case AArch64::ADDXrs:
1984 case AArch64::ANDWrs:
1985 case AArch64::ANDXrs:
1986 case AArch64::SUBWrs:
1987 case AArch64::SUBXrs:
1988 case AArch64::BICWrs:
1989 case AArch64::BICXrs:
1990 // Shift value can be 0, making these behave like the "rr" variant...
1991 return !hasShiftedReg(Second);
1992 }
1993 }
1994 }
1995 return false;
19961914 }
19971915
19981916 MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(
137137 bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
138138 unsigned NumLoads) const override;
139139
140 bool shouldScheduleAdjacent(const MachineInstr &First,
141 const MachineInstr &Second) const override;
142
143140 MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
144141 uint64_t Offset, const MDNode *Var,
145142 const MDNode *Expr,
0 //===- AArch64MacroFusion.cpp - AArch64 Macro Fusion ----------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file This file contains the AArch64 implementation of the DAG scheduling mutation
10 // to pair instructions back to back.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "AArch64MacroFusion.h"
15 #include "AArch64Subtarget.h"
16 #include "llvm/Support/CommandLine.h"
17 #include "llvm/Target/TargetInstrInfo.h"
18
19 #define DEBUG_TYPE "misched"
20
21 using namespace llvm;
22
23 static cl::opt<bool> EnableMacroFusion("aarch64-misched-fusion", cl::Hidden,
24 cl::desc("Enable scheduling for macro fusion."), cl::init(true));
25
26 namespace {
27
28 /// \brief Verify that the instruction pair, \param First and \param Second,
29 /// should be scheduled back to back. Given an anchor instruction, if the other
30 /// instruction is unspecified, then verify that the anchor instruction may be
31 /// part of a pair at all.
32 static bool shouldScheduleAdjacent(const AArch64InstrInfo &TII,
33 const AArch64Subtarget &ST,
34 const MachineInstr *First,
35 const MachineInstr *Second) {
36 unsigned FirstOpcode = First ?
37 First->getOpcode() : AArch64::INSTRUCTION_LIST_END;
38 unsigned SecondOpcode = Second ?
39 Second->getOpcode() : AArch64::INSTRUCTION_LIST_END;
40
41 if (ST.hasArithmeticBccFusion())
42 // Fuse CMN, CMP, TST followed by Bcc.
43 if (SecondOpcode == AArch64::Bcc)
44 switch (FirstOpcode) {
45 default:
46 return false;
47 case AArch64::ADDSWri:
48 case AArch64::ADDSWrr:
49 case AArch64::ADDSXri:
50 case AArch64::ADDSXrr:
51 case AArch64::ANDSWri:
52 case AArch64::ANDSWrr:
53 case AArch64::ANDSXri:
54 case AArch64::ANDSXrr:
55 case AArch64::SUBSWri:
56 case AArch64::SUBSWrr:
57 case AArch64::SUBSXri:
58 case AArch64::SUBSXrr:
59 case AArch64::BICSWrr:
60 case AArch64::BICSXrr:
61 return true;
62 case AArch64::ADDSWrs:
63 case AArch64::ADDSXrs:
64 case AArch64::ANDSWrs:
65 case AArch64::ANDSXrs:
66 case AArch64::SUBSWrs:
67 case AArch64::SUBSXrs:
68 case AArch64::BICSWrs:
69 case AArch64::BICSXrs:
70 // Shift value can be 0, making these behave like the "rr" variant...
71 return !TII.hasShiftedReg(*First);
72 case AArch64::INSTRUCTION_LIST_END:
73 return true;
74 }
75
76 if (ST.hasArithmeticCbzFusion())
77 // Fuse ALU operations followed by CBZ/CBNZ.
78 if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
79 SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX)
80 switch (FirstOpcode) {
81 default:
82 return false;
83 case AArch64::ADDWri:
84 case AArch64::ADDWrr:
85 case AArch64::ADDXri:
86 case AArch64::ADDXrr:
87 case AArch64::ANDWri:
88 case AArch64::ANDWrr:
89 case AArch64::ANDXri:
90 case AArch64::ANDXrr:
91 case AArch64::EORWri:
92 case AArch64::EORWrr:
93 case AArch64::EORXri:
94 case AArch64::EORXrr:
95 case AArch64::ORRWri:
96 case AArch64::ORRWrr:
97 case AArch64::ORRXri:
98 case AArch64::ORRXrr:
99 case AArch64::SUBWri:
100 case AArch64::SUBWrr:
101 case AArch64::SUBXri:
102 case AArch64::SUBXrr:
103 return true;
104 case AArch64::ADDWrs:
105 case AArch64::ADDXrs:
106 case AArch64::ANDWrs:
107 case AArch64::ANDXrs:
108 case AArch64::SUBWrs:
109 case AArch64::SUBXrs:
110 case AArch64::BICWrs:
111 case AArch64::BICXrs:
112 // Shift value can be 0, making these behave like the "rr" variant...
113 return !TII.hasShiftedReg(*First);
114 case AArch64::INSTRUCTION_LIST_END:
115 return true;
116 }
117
118 return false;
119 }
120
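A note on the anchor convention above: a nullptr instruction maps to INSTRUCTION_LIST_END, which the FirstOpcode switches accept as a wildcard, so the same predicate also answers whether an anchor could be half of some pair. Hypothetical queries against the predicate as written (Cmp a SUBSWri, Br a Bcc, on a subtarget with arithmetic/Bcc fusion):

    shouldScheduleAdjacent(TII, ST, nullptr, Br);  // true: some First fuses with
                                                   // Bcc (INSTRUCTION_LIST_END case)
    shouldScheduleAdjacent(TII, ST, Cmp, Br);      // true: SUBSWri + Bcc fuse
    shouldScheduleAdjacent(TII, ST, Cmp, nullptr); // false as written: both
                                                   // switches are guarded on
                                                   // SecondOpcode being a branch

scheduleAdjacentImpl() below issues exactly these First-only and Second-only queries to filter anchors before scanning their neighbors.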
121 /// \brief Implement the fusion of instruction pairs in the scheduling
122 /// \param DAG, anchored at the instruction in \param ASU. \param Preds
123 /// indicates if its dependencies in \param APreds are predecessors instead of
124 /// successors.
125 static bool scheduleAdjacentImpl(ScheduleDAGMI *DAG, SUnit *ASU,
126 SmallVectorImpl<SDep> &APreds, bool Preds) {
127 const AArch64InstrInfo *TII = static_cast<const AArch64InstrInfo *>(DAG->TII);
128 const AArch64Subtarget &ST = DAG->MF.getSubtarget<AArch64Subtarget>();
129
130 const MachineInstr *AMI = ASU->getInstr();
131 if (!AMI || AMI->isPseudo() || AMI->isTransient() ||
132 (Preds && !shouldScheduleAdjacent(*TII, ST, nullptr, AMI)) ||
133 (!Preds && !shouldScheduleAdjacent(*TII, ST, AMI, nullptr)))
134 return false;
135
136 for (SDep &BDep : APreds) {
137 if (BDep.isWeak())
138 continue;
139
140 SUnit *BSU = BDep.getSUnit();
141 const MachineInstr *BMI = BSU->getInstr();
142 if (!BMI || BMI->isPseudo() || BMI->isTransient() ||
143 (Preds && !shouldScheduleAdjacent(*TII, ST, BMI, AMI)) ||
144 (!Preds && !shouldScheduleAdjacent(*TII, ST, AMI, BMI)))
145 continue;
146
147 // Create a single weak edge between the adjacent instrs. The only
148 // effect is to cause bottom-up scheduling to heavily prioritize the
149 // clustered instrs.
150 if (Preds)
151 DAG->addEdge(ASU, SDep(BSU, SDep::Cluster));
152 else
153 DAG->addEdge(BSU, SDep(ASU, SDep::Cluster));
154
155 // Adjust the latency between the 1st instr and its predecessors/successors.
156 for (SDep &Dep : APreds)
157 if (Dep.getSUnit() == BSU)
158 Dep.setLatency(0);
159
160 // Adjust the latency between the 2nd instr and its successors/predecessors.
161 auto &BSuccs = Preds ? BSU->Succs : BSU->Preds;
162 for (SDep &Dep : BSuccs)
163 if (Dep.getSUnit() == ASU)
164 Dep.setLatency(0);
165
166 DEBUG(dbgs() << "Macro fuse ";
167 Preds ? BSU->print(dbgs(), DAG) : ASU->print(dbgs(), DAG);
168 dbgs() << " - ";
169 Preds ? ASU->print(dbgs(), DAG) : BSU->print(dbgs(), DAG);
170 dbgs() << '\n');
171
172 return true;
173 }
174
175 return false;
176 }
177
178 /// \brief Post-process the DAG to create cluster edges between instructions
179 /// that may be fused by the processor into a single operation.
180 class AArch64MacroFusion : public ScheduleDAGMutation {
181 public:
182 AArch64MacroFusion() {}
183
184 void apply(ScheduleDAGInstrs *DAGInstrs) override;
185 };
186
187 void AArch64MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
188 ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
189
190 // For each of the SUnits in the scheduling block, try to fuse the instruction
191 // in it with one in its successors.
192 for (SUnit &ASU : DAG->SUnits)
193 scheduleAdjacentImpl(DAG, &ASU, ASU.Succs, false);
194
195 // Try to fuse the instruction in the ExitSU with one in its predecessors.
196 scheduleAdjacentImpl(DAG, &DAG->ExitSU, DAG->ExitSU.Preds, true);
197 }
198
199 } // end namespace
200
201
202 namespace llvm {
203
204 std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation() {
205 return EnableMacroFusion ? make_unique<AArch64MacroFusion>() : nullptr;
206 }
207
208 } // end namespace llvm
0 //===- AArch64MacroFusion.h - AArch64 Macro Fusion ------------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the AArch64 definition of the DAG scheduling mutation
10 // to pair instructions back to back.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "AArch64InstrInfo.h"
15 #include "llvm/CodeGen/MachineScheduler.h"
16
17 //===----------------------------------------------------------------------===//
18 // AArch64MacroFusion - DAG post-processing to encourage fusion of macro ops.
19 //===----------------------------------------------------------------------===//
20
21 namespace llvm {
22
23 /// Note that you have to add:
24 /// DAG.addMutation(createAArch64MacroFusionDAGMutation());
25 /// to AArch64PassConfig::createMachineScheduler() to have an effect.
26 std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation();
27
28 } // llvm
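For reference, the registration that the note above calls for appears in the AArch64TargetMachine.cpp hunk further down; a sketch of the resulting override (the enclosing signature is reconstructed from context and may differ in detail):

    ScheduleDAGInstrs *
    AArch64PassConfig::createMachineScheduler(MachineSchedContext *C) const {
      ScheduleDAGMILive *DAG = createGenericSchedLive(C);
      DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
      DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
      // New in this patch: target-owned macro fusion.
      DAG->addMutation(createAArch64MacroFusionDAGMutation());
      return DAG;
    }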
1313 #include "AArch64CallLowering.h"
1414 #include "AArch64InstructionSelector.h"
1515 #include "AArch64LegalizerInfo.h"
16 #include "AArch64MacroFusion.h"
1617 #ifdef LLVM_BUILD_GLOBAL_ISEL
1718 #include "AArch64RegisterBankInfo.h"
1819 #endif
324325 ScheduleDAGMILive *DAG = createGenericSchedLive(C);
325326 DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
326327 DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
327 DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
328 DAG->addMutation(createAArch64MacroFusionDAGMutation());
328329 return DAG;
329330 }
330331
5555 AArch64ISelLowering.cpp
5656 AArch64InstrInfo.cpp
5757 AArch64LoadStoreOptimizer.cpp
58 AArch64MacroFusion.cpp
5859 AArch64MCInstLower.cpp
5960 AArch64PromoteConstant.cpp
6061 AArch64PBQPRegAlloc.cpp
4242 X86EvexToVex.cpp
4343 X86MCInstLower.cpp
4444 X86MachineFunctionInfo.cpp
45 X86MacroFusion.cpp
4546 X86OptimizeLEAs.cpp
4647 X86PadShortFunction.cpp
4748 X86RegisterInfo.cpp
84188418 return true;
84198419 }
84208420
8421 bool X86InstrInfo::shouldScheduleAdjacent(const MachineInstr &First,
8422 const MachineInstr &Second) const {
8423 // Check if this processor supports macro-fusion. Since this is a minor
8424 // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
8425 // proxy for SandyBridge+.
8426 if (!Subtarget.hasAVX())
8427 return false;
8428
8429 enum {
8430 FuseTest,
8431 FuseCmp,
8432 FuseInc
8433 } FuseKind;
8434
8435 switch (Second.getOpcode()) {
8436 default:
8437 return false;
8438 case X86::JE_1:
8439 case X86::JNE_1:
8440 case X86::JL_1:
8441 case X86::JLE_1:
8442 case X86::JG_1:
8443 case X86::JGE_1:
8444 FuseKind = FuseInc;
8445 break;
8446 case X86::JB_1:
8447 case X86::JBE_1:
8448 case X86::JA_1:
8449 case X86::JAE_1:
8450 FuseKind = FuseCmp;
8451 break;
8452 case X86::JS_1:
8453 case X86::JNS_1:
8454 case X86::JP_1:
8455 case X86::JNP_1:
8456 case X86::JO_1:
8457 case X86::JNO_1:
8458 FuseKind = FuseTest;
8459 break;
8460 }
8461 switch (First.getOpcode()) {
8462 default:
8463 return false;
8464 case X86::TEST8rr:
8465 case X86::TEST16rr:
8466 case X86::TEST32rr:
8467 case X86::TEST64rr:
8468 case X86::TEST8ri:
8469 case X86::TEST16ri:
8470 case X86::TEST32ri:
8471 case X86::TEST32i32:
8472 case X86::TEST64i32:
8473 case X86::TEST64ri32:
8474 case X86::TEST8rm:
8475 case X86::TEST16rm:
8476 case X86::TEST32rm:
8477 case X86::TEST64rm:
8478 case X86::TEST8ri_NOREX:
8479 case X86::AND16i16:
8480 case X86::AND16ri:
8481 case X86::AND16ri8:
8482 case X86::AND16rm:
8483 case X86::AND16rr:
8484 case X86::AND32i32:
8485 case X86::AND32ri:
8486 case X86::AND32ri8:
8487 case X86::AND32rm:
8488 case X86::AND32rr:
8489 case X86::AND64i32:
8490 case X86::AND64ri32:
8491 case X86::AND64ri8:
8492 case X86::AND64rm:
8493 case X86::AND64rr:
8494 case X86::AND8i8:
8495 case X86::AND8ri:
8496 case X86::AND8rm:
8497 case X86::AND8rr:
8498 return true;
8499 case X86::CMP16i16:
8500 case X86::CMP16ri:
8501 case X86::CMP16ri8:
8502 case X86::CMP16rm:
8503 case X86::CMP16rr:
8504 case X86::CMP32i32:
8505 case X86::CMP32ri:
8506 case X86::CMP32ri8:
8507 case X86::CMP32rm:
8508 case X86::CMP32rr:
8509 case X86::CMP64i32:
8510 case X86::CMP64ri32:
8511 case X86::CMP64ri8:
8512 case X86::CMP64rm:
8513 case X86::CMP64rr:
8514 case X86::CMP8i8:
8515 case X86::CMP8ri:
8516 case X86::CMP8rm:
8517 case X86::CMP8rr:
8518 case X86::ADD16i16:
8519 case X86::ADD16ri:
8520 case X86::ADD16ri8:
8521 case X86::ADD16ri8_DB:
8522 case X86::ADD16ri_DB:
8523 case X86::ADD16rm:
8524 case X86::ADD16rr:
8525 case X86::ADD16rr_DB:
8526 case X86::ADD32i32:
8527 case X86::ADD32ri:
8528 case X86::ADD32ri8:
8529 case X86::ADD32ri8_DB:
8530 case X86::ADD32ri_DB:
8531 case X86::ADD32rm:
8532 case X86::ADD32rr:
8533 case X86::ADD32rr_DB:
8534 case X86::ADD64i32:
8535 case X86::ADD64ri32:
8536 case X86::ADD64ri32_DB:
8537 case X86::ADD64ri8:
8538 case X86::ADD64ri8_DB:
8539 case X86::ADD64rm:
8540 case X86::ADD64rr:
8541 case X86::ADD64rr_DB:
8542 case X86::ADD8i8:
8543 case X86::ADD8mi:
8544 case X86::ADD8mr:
8545 case X86::ADD8ri:
8546 case X86::ADD8rm:
8547 case X86::ADD8rr:
8548 case X86::SUB16i16:
8549 case X86::SUB16ri:
8550 case X86::SUB16ri8:
8551 case X86::SUB16rm:
8552 case X86::SUB16rr:
8553 case X86::SUB32i32:
8554 case X86::SUB32ri:
8555 case X86::SUB32ri8:
8556 case X86::SUB32rm:
8557 case X86::SUB32rr:
8558 case X86::SUB64i32:
8559 case X86::SUB64ri32:
8560 case X86::SUB64ri8:
8561 case X86::SUB64rm:
8562 case X86::SUB64rr:
8563 case X86::SUB8i8:
8564 case X86::SUB8ri:
8565 case X86::SUB8rm:
8566 case X86::SUB8rr:
8567 return FuseKind == FuseCmp || FuseKind == FuseInc;
8568 case X86::INC16r:
8569 case X86::INC32r:
8570 case X86::INC64r:
8571 case X86::INC8r:
8572 case X86::DEC16r:
8573 case X86::DEC32r:
8574 case X86::DEC64r:
8575 case X86::DEC8r:
8576 return FuseKind == FuseInc;
8577 }
8578 }
8579
85808421 bool X86InstrInfo::
85818422 reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
85828423 assert(Cond.size() == 1 && "Invalid X86 branch condition!");
442442 int64_t Offset1, int64_t Offset2,
443443 unsigned NumLoads) const override;
444444
445 bool shouldScheduleAdjacent(const MachineInstr &First,
446 const MachineInstr &Second) const override;
447
448445 void getNoopForMachoTarget(MCInst &NopInst) const override;
449446
450447 bool
0 //===- X86MacroFusion.cpp - X86 Macro Fusion ------------------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the X86 implementation of the DAG scheduling mutation to
10 // pair instructions back to back.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "X86MacroFusion.h"
15 #include "X86Subtarget.h"
16 #include "llvm/Support/CommandLine.h"
17 #include "llvm/Target/TargetInstrInfo.h"
18
19 #define DEBUG_TYPE "misched"
20
21 using namespace llvm;
22
23 static cl::opt<bool> EnableMacroFusion("x86-misched-fusion", cl::Hidden,
24 cl::desc("Enable scheduling for macro fusion."), cl::init(true));
25
26 namespace {
27
28 /// \brief Verify that the instruction pair, \param First and \param Second,
29 /// should be scheduled back to back. If either instruction is unspecified,
30 /// then verify that the other instruction may be part of a pair at all.
31 static bool shouldScheduleAdjacent(const X86Subtarget &ST,
32 const MachineInstr *First,
33 const MachineInstr *Second) {
34 // Check if this processor supports macro-fusion. Since this is a minor
35 // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
36 // proxy for SandyBridge+.
37 if (!ST.hasAVX())
38 return false;
39
40 enum {
41 FuseTest,
42 FuseCmp,
43 FuseInc
44 } FuseKind;
45
46 unsigned FirstOpcode = First ?
47 First->getOpcode() : X86::INSTRUCTION_LIST_END;
48 unsigned SecondOpcode = Second ?
49 Second->getOpcode() : X86::INSTRUCTION_LIST_END;
50
51 switch (SecondOpcode) {
52 default:
53 return false;
54 case X86::JE_1:
55 case X86::JNE_1:
56 case X86::JL_1:
57 case X86::JLE_1:
58 case X86::JG_1:
59 case X86::JGE_1:
60 FuseKind = FuseInc;
61 break;
62 case X86::JB_1:
63 case X86::JBE_1:
64 case X86::JA_1:
65 case X86::JAE_1:
66 FuseKind = FuseCmp;
67 break;
68 case X86::JS_1:
69 case X86::JNS_1:
70 case X86::JP_1:
71 case X86::JNP_1:
72 case X86::JO_1:
73 case X86::JNO_1:
74 FuseKind = FuseTest;
75 break;
76 }
77
78 switch (FirstOpcode) {
79 default:
80 return false;
81 case X86::TEST8rr:
82 case X86::TEST16rr:
83 case X86::TEST32rr:
84 case X86::TEST64rr:
85 case X86::TEST8ri:
86 case X86::TEST16ri:
87 case X86::TEST32ri:
88 case X86::TEST32i32:
89 case X86::TEST64i32:
90 case X86::TEST64ri32:
91 case X86::TEST8rm:
92 case X86::TEST16rm:
93 case X86::TEST32rm:
94 case X86::TEST64rm:
95 case X86::TEST8ri_NOREX:
96 case X86::AND16i16:
97 case X86::AND16ri:
98 case X86::AND16ri8:
99 case X86::AND16rm:
100 case X86::AND16rr:
101 case X86::AND32i32:
102 case X86::AND32ri:
103 case X86::AND32ri8:
104 case X86::AND32rm:
105 case X86::AND32rr:
106 case X86::AND64i32:
107 case X86::AND64ri32:
108 case X86::AND64ri8:
109 case X86::AND64rm:
110 case X86::AND64rr:
111 case X86::AND8i8:
112 case X86::AND8ri:
113 case X86::AND8rm:
114 case X86::AND8rr:
115 return true;
116 case X86::CMP16i16:
117 case X86::CMP16ri:
118 case X86::CMP16ri8:
119 case X86::CMP16rm:
120 case X86::CMP16rr:
121 case X86::CMP32i32:
122 case X86::CMP32ri:
123 case X86::CMP32ri8:
124 case X86::CMP32rm:
125 case X86::CMP32rr:
126 case X86::CMP64i32:
127 case X86::CMP64ri32:
128 case X86::CMP64ri8:
129 case X86::CMP64rm:
130 case X86::CMP64rr:
131 case X86::CMP8i8:
132 case X86::CMP8ri:
133 case X86::CMP8rm:
134 case X86::CMP8rr:
135 case X86::ADD16i16:
136 case X86::ADD16ri:
137 case X86::ADD16ri8:
138 case X86::ADD16ri8_DB:
139 case X86::ADD16ri_DB:
140 case X86::ADD16rm:
141 case X86::ADD16rr:
142 case X86::ADD16rr_DB:
143 case X86::ADD32i32:
144 case X86::ADD32ri:
145 case X86::ADD32ri8:
146 case X86::ADD32ri8_DB:
147 case X86::ADD32ri_DB:
148 case X86::ADD32rm:
149 case X86::ADD32rr:
150 case X86::ADD32rr_DB:
151 case X86::ADD64i32:
152 case X86::ADD64ri32:
153 case X86::ADD64ri32_DB:
154 case X86::ADD64ri8:
155 case X86::ADD64ri8_DB:
156 case X86::ADD64rm:
157 case X86::ADD64rr:
158 case X86::ADD64rr_DB:
159 case X86::ADD8i8:
160 case X86::ADD8mi:
161 case X86::ADD8mr:
162 case X86::ADD8ri:
163 case X86::ADD8rm:
164 case X86::ADD8rr:
165 case X86::SUB16i16:
166 case X86::SUB16ri:
167 case X86::SUB16ri8:
168 case X86::SUB16rm:
169 case X86::SUB16rr:
170 case X86::SUB32i32:
171 case X86::SUB32ri:
172 case X86::SUB32ri8:
173 case X86::SUB32rm:
174 case X86::SUB32rr:
175 case X86::SUB64i32:
176 case X86::SUB64ri32:
177 case X86::SUB64ri8:
178 case X86::SUB64rm:
179 case X86::SUB64rr:
180 case X86::SUB8i8:
181 case X86::SUB8ri:
182 case X86::SUB8rm:
183 case X86::SUB8rr:
184 return FuseKind == FuseCmp || FuseKind == FuseInc;
185 case X86::INC16r:
186 case X86::INC32r:
187 case X86::INC64r:
188 case X86::INC8r:
189 case X86::DEC16r:
190 case X86::DEC32r:
191 case X86::DEC64r:
192 case X86::DEC8r:
193 return FuseKind == FuseInc;
194 case X86::INSTRUCTION_LIST_END:
195 return true;
196 }
197 }
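To make the FuseKind classification above concrete, a few worked pairs (hypothetical; the third is the interesting one, since INC/DEC leave CF untouched and so are excluded from the carry-based branches classified as FuseCmp):

    // First     Second    FuseKind    result
    // CMP32rr + JA_1  ->  FuseCmp  -> true  (CMP fuses with cmp and inc kinds)
    // INC64r  + JE_1  ->  FuseInc  -> true  (INC fuses only with the inc kind)
    // INC64r  + JA_1  ->  FuseCmp  -> false (INC/DEC do not set CF)
    // TEST8rr + JS_1  ->  FuseTest -> true  (TEST/AND fuse with every kind)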
198
199 /// \brief Post-process the DAG to create cluster edges between instructions
200 /// that may be fused by the processor into a single operation.
201 class X86MacroFusion : public ScheduleDAGMutation {
202 public:
203 X86MacroFusion() {}
204
205 void apply(ScheduleDAGInstrs *DAGInstrs) override;
206 };
207
208 void X86MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
209 ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
210 const X86Subtarget &ST = DAG->MF.getSubtarget<X86Subtarget>();
211
212 // For now, assume targets can only fuse with the branch.
213 SUnit &ExitSU = DAG->ExitSU;
214 MachineInstr *Branch = ExitSU.getInstr();
215 if (!shouldScheduleAdjacent(ST, nullptr, Branch))
216 return;
217
218 for (SDep &PredDep : ExitSU.Preds) {
219 if (PredDep.isWeak())
220 continue;
221 SUnit &SU = *PredDep.getSUnit();
222 MachineInstr &Pred = *SU.getInstr();
223 if (!shouldScheduleAdjacent(ST, &Pred, Branch))
224 continue;
225
226 // Create a single weak edge from SU to ExitSU. The only effect is to cause
227 // bottom-up scheduling to heavily prioritize the clustered SU. There is no
228 // need to copy predecessor edges from ExitSU to SU, since top-down
229 // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
230 // of SU, we could create an artificial edge from the deepest root, but it
231 // hasn't been needed yet.
232 bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster));
233 (void)Success;
234 assert(Success && "No DAG nodes should be reachable from ExitSU");
235
236 // Adjust latency of data deps between the nodes.
237 for (SDep &PredDep : ExitSU.Preds)
238 if (PredDep.getSUnit() == &SU)
239 PredDep.setLatency(0);
240 for (SDep &SuccDep : SU.Succs)
241 if (SuccDep.getSUnit() == &ExitSU)
242 SuccDep.setLatency(0);
243
244 DEBUG(dbgs() << "Macro fuse ";
245 SU.print(dbgs(), DAG);
246 dbgs() << " - ExitSU" << '\n');
247
248 break;
249 }
250 }
251
252 } // end namespace
253
254 namespace llvm {
255
256 std::unique_ptr<ScheduleDAGMutation>
257 createX86MacroFusionDAGMutation() {
258 return EnableMacroFusion ? make_unique<X86MacroFusion>() : nullptr;
259 }
260
261 } // end namespace llvm
0 //===- X86MacroFusion.h - X86 Macro Fusion --------------------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains the X86 definition of the DAG scheduling mutation to pair
10 // instructions back to back.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "X86InstrInfo.h"
15 #include "llvm/CodeGen/MachineScheduler.h"
16
17 //===----------------------------------------------------------------------===//
18 // X86MacroFusion - DAG post-processing to encourage fusion of macro ops.
19 //===----------------------------------------------------------------------===//
20
21 namespace llvm {
22
23 /// Note that you have to add:
24 /// DAG.addMutation(createX86MacroFusionDAGMutation());
25 /// to X86PassConfig::createMachineScheduler() to have an effect.
26 std::unique_ptr<ScheduleDAGMutation>
27 createX86MacroFusionDAGMutation();
28
29 } // end namespace llvm
1313 #include "X86TargetMachine.h"
1414 #include "X86.h"
1515 #include "X86CallLowering.h"
16 #include "X86MacroFusion.h"
1617 #include "X86TargetObjectFile.h"
1718 #include "X86TargetTransformInfo.h"
1819 #include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
288289 ScheduleDAGInstrs *
289290 createMachineScheduler(MachineSchedContext *C) const override {
290291 ScheduleDAGMILive *DAG = createGenericSchedLive(C);
291 DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
292 DAG->addMutation(createX86MacroFusionDAGMutation());
292293 return DAG;
293294 }
294295
0 ; RUN: llc -o - %s -mattr=+arith-cbz-fusion | FileCheck %s
11 ; RUN: llc -o - %s -mcpu=cyclone | FileCheck %s
22
3 target triple = "arm64-apple-ios"
3 target triple = "aarch64-unknown"
44
55 declare void @foobar(i32 %v0, i32 %v1)
66
77 ; Make sure sub is scheduled in front of cbnz
88 ; CHECK-LABEL: test_sub_cbz:
9 ; CHECK: add w[[ADDRES:[0-9]+]], w1, #7
109 ; CHECK: sub w[[SUBRES:[0-9]+]], w0, #13
11 ; CHECK-NEXT: cbnz w[[SUBRES]], [[SKIPBLOCK:LBB[0-9_]+]]
12 ; CHECK: mov [[REGTY:[x,w]]]0, [[REGTY]][[ADDRES]]
13 ; CHECK: mov [[REGTY]]1, [[REGTY]][[SUBRES]]
14 ; CHECK: bl _foobar
15 ; CHECK: [[SKIPBLOCK]]:
16 ; CHECK: mov [[REGTY]]0, [[REGTY]][[SUBRES]]
17 ; CHECK: mov [[REGTY]]1, [[REGTY]][[ADDRES]]
18 ; CHECK: bl _foobar
10 ; CHECK-NEXT: cbnz w[[SUBRES]], {{.?LBB[0-9_]+}}
1911 define void @test_sub_cbz(i32 %a0, i32 %a1) {
2012 entry:
2113 ; except for the fusion opportunity the sub/add should be equal so the