llvm.org GIT mirror llvm / 13141f0
[PowerPC] Fix the PPCInstrInfo::getInstrLatency implementation PowerPC uses itineraries to describe processor pipelines (and dispatch-group restrictions for P7/P8 cores). Unfortunately, the target-independent implementation of TII.getInstrLatency calls ItinData->getStageLatency, and that looks for the largest cycle count in the pipeline for any given instruction. This, however, yields the wrong answer for the PPC itineraries, because we don't encode the full pipeline. Because the functional units are fully pipelined, we only model the initial stages (there are no relevant hazards in the later stages to model), and so the technique employed by getStageLatency does not really work. Instead, we should take the maximum output operand latency, and that's what PPCInstrInfo::getInstrLatency now does. This caused some test-case churn, including two unfortunate side effects. First, the new arrangement of copies we get from function parameters now sometimes blocks VSX FMA mutation (a FIXME has been added to the code and the test cases), and we have one significant test-suite regression: SingleSource/Benchmarks/BenchmarkGame/spectral-norm 56.4185% +/- 18.9398% In this benchmark we have a loop with a vectorized FP divide, and it with the new scheduling both divides end up in the same dispatch group (which in this case seems to cause a problem, although why is not exactly clear). The grouping structure is hard to predict from the bottom of the loop, and there may not be much we can do to fix this. Very few other test-suite performance effects were really significant, but almost all weakly favor this change. However, in light of the issues highlighted above, I've left the old behavior available via a command-line flag. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@242188 91177308-0d34-0410-b5e6-96231b3b80d8 Hal Finkel 4 years ago
12 changed file(s) with 96 addition(s) and 34 deletion(s). Raw diff Collapse all Expand all
5656 cl::desc("Causes the backend to crash instead of generating a nop VSX copy"),
5757 cl::Hidden);
5858
59 static cl::opt
60 UseOldLatencyCalc("ppc-old-latency-calc", cl::Hidden,
61 cl::desc("Use the old (incorrect) instruction latency calculation"));
62
5963 // Pin the vtable to this file.
6064 void PPCInstrInfo::anchor() {}
6165
102106 return new ScoreboardHazardRecognizer(II, DAG);
103107 }
104108
109 unsigned PPCInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
110 const MachineInstr *MI,
111 unsigned *PredCost) const {
112 if (!ItinData || UseOldLatencyCalc)
113 return PPCGenInstrInfo::getInstrLatency(ItinData, MI, PredCost);
114
115 // The default implementation of getInstrLatency calls getStageLatency, but
116 // getStageLatency does not do the right thing for us. While we have
117 // itinerary, most cores are fully pipelined, and so the itineraries only
118 // express the first part of the pipeline, not every stage. Instead, we need
119 // to use the listed output operand cycle number (using operand 0 here, which
120 // is an output).
121
122 unsigned Latency = 1;
123 unsigned DefClass = MI->getDesc().getSchedClass();
124 for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
125 const MachineOperand &MO = MI->getOperand(i);
126 if (!MO.isReg() || !MO.isDef() || MO.isImplicit())
127 continue;
128
129 int Cycle = ItinData->getOperandCycle(DefClass, i);
130 if (Cycle < 0)
131 continue;
132
133 Latency = std::max(Latency, (unsigned) Cycle);
134 }
135
136 return Latency;
137 }
105138
106139 int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
107140 const MachineInstr *DefMI, unsigned DefIdx,
9494 CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
9595 const ScheduleDAG *DAG) const override;
9696
97 unsigned getInstrLatency(const InstrItineraryData *ItinData,
98 const MachineInstr *MI,
99 unsigned *PredCost = nullptr) const override;
100
97101 int getOperandLatency(const InstrItineraryData *ItinData,
98102 const MachineInstr *DefMI, unsigned DefIdx,
99103 const MachineInstr *UseMI,
314314 P7_DU3, P7_DU4], 0>,
315315 InstrStage<1, [P7_VS1, P7_VS2]>],
316316 [5, 1, 1]>,
317 InstrItinData
318 P7_DU3, P7_DU4], 0>,
319 InstrStage<1, [P7_VS1, P7_VS2]>],
320 [5, 1, 1]>,
317321 InstrItinData
318322 P7_DU3, P7_DU4], 0>,
319323 InstrStage<1, [P7_VS1, P7_VS2]>],
322322 P8_DU4, P8_DU5, P8_DU6], 0>,
323323 InstrStage<1, [P8_FPU1, P8_FPU2]>],
324324 [5, 1, 1]>,
325 InstrItinData
326 P8_DU4, P8_DU5, P8_DU6], 0>,
327 InstrStage<1, [P8_FPU1, P8_FPU2]>],
328 [5, 1, 1]>,
325329 InstrItinData
326330 P8_DU4, P8_DU5, P8_DU6], 0>,
327331 InstrStage<1, [P8_FPU1, P8_FPU2]>],
135135 // source of the copy, it must still be live here. We can't use
136136 // interval testing for a physical register, so as long as we're
137137 // walking the MIs we may as well test liveness here.
138 //
139 // FIXME: There is a case that occurs in practice, like this:
140 // %vreg9 = COPY %F1; VSSRC:%vreg9
141 // ...
142 // %vreg6 = COPY %vreg9; VSSRC:%vreg6,%vreg9
143 // %vreg7 = COPY %vreg9; VSSRC:%vreg7,%vreg9
144 // %vreg9 = XSMADDASP %vreg9, %vreg1, %vreg4; VSSRC:
145 // %vreg6 = XSMADDASP %vreg6, %vreg1, %vreg2; VSSRC:
146 // %vreg7 = XSMADDASP %vreg7, %vreg1, %vreg3; VSSRC:
147 // which prevents an otherwise-profitable transformation.
138148 bool OtherUsers = false, KillsAddendSrc = false;
139149 for (auto J = std::prev(I), JE = MachineBasicBlock::iterator(AddendMI);
140150 J != JE; --J) {
1414 ; CHECK-DAG: cmplwi {{[0-9]+}}, 3, 0
1515 ; CHECK-DAG: li [[REG2:[0-9]+]], 1
1616 ; CHECK-DAG: cntlzw [[REG3:[0-9]+]],
17 ; CHECK: isel 3, 0, [[REG2]]
18 ; CHECK: and 3, 3, [[REG3]]
17 ; CHECK: isel [[REG4:[0-9]+]], 0, [[REG2]]
18 ; CHECK: and 3, [[REG4]], [[REG3]]
1919 ; CHECK: blr
2020 }
2121
3434 }
3535
3636 define void @cg2(i64 %v) #0 {
37 tail call fastcc i64 @g1(i64 0, double 0.0, i64 %v, double 0.0, i64 0, double 0.0, i64 0, double 0.0)
37 call fastcc i64 @g1(i64 0, double 0.0, i64 %v, double 0.0, i64 0, double 0.0, i64 0, double 0.0)
3838 ret void
3939
4040 ; CHECK-LABEL: @cg2
4343 }
4444
4545 define void @cf2(double %v) #0 {
46 tail call fastcc i64 @g1(i64 0, double 0.0, i64 0, double %v, i64 0, double 0.0, i64 0, double 0.0)
46 call fastcc i64 @g1(i64 0, double 0.0, i64 0, double %v, i64 0, double 0.0, i64 0, double 0.0)
4747 ret void
4848
4949 ; CHECK-LABEL: @cf2
50 ; CHECK: mr 2, 1
50 ; CHECK: fmr 2, 1
5151 ; CHECK: blr
5252 }
5353
520520 ret void
521521
522522 ; CHECK-LABEL: @cv13
523 ; CHECK: li [[REG1:[0-9]+]], 96
524 ; CHECK: stvx 2, 1, [[REG1]]
523 ; CHECK-DAG: li [[REG1:[0-9]+]], 96
524 ; CHECK-DAG: vor [[REG2:[0-9]+]], 2, 2
525 ; CHECK: stvx [[REG2]], 1, [[REG1]]
525526 ; CHECK: blr
526527 }
527528
530531 ret void
531532
532533 ; CHECK-LABEL: @cv14
533 ; CHECK: li [[REG1:[0-9]+]], 128
534 ; CHECK: stvx 2, 1, [[REG1]]
534 ; CHECK-DAG: li [[REG1:[0-9]+]], 128
535 ; CHECK-DAG: vor [[REG2:[0-9]+]], 2, 2
536 ; CHECK: stvx [[REG2]], 1, [[REG1]]
535537 ; CHECK: blr
536538 }
537539
1717 ; CHECK: addi [[REG]], [[REG]], env_sigill@toc@l
1818 ; CHECK: ld 31, 0([[REG]])
1919 ; CHECK: ld [[REG2:[0-9]+]], 8([[REG]])
20 ; CHECK: ld 1, 16([[REG]])
21 ; CHECK: mtctr [[REG2]]
22 ; CHECK: ld 30, 32([[REG]])
23 ; CHECK: ld 2, 24([[REG]])
20 ; CHECK-DAG: ld 1, 16([[REG]])
21 ; CHECK-DAG: mtctr [[REG2]]
22 ; CHECK-DAG: ld 30, 32([[REG]])
23 ; CHECK-DAG: ld 2, 24([[REG]])
2424 ; CHECK: bctr
2525
2626 return: ; No predecessors!
2828 ; CHECK: addi 3, {{[0-9]+}}, __once_call@got@tlsgd@l
2929 ; CHECK: bl __tls_get_addr(__once_call@tlsgd)
3030 ; CHECK-NEXT: nop
31 ; CHECK: std {{[0-9]+}}, 0(3)
31 ; FIXME: We don't really need the copy here either, we could move the store up.
32 ; CHECK: mr [[REG1:[0-9]+]], 3
33 ; CHECK: std {{[0-9]+}}, 0([[REG1]])
3234
3335 declare void @__once_call_impl()
4848 ; CHECK-LABEL: @test2
4949 ; CHECK-DAG: li [[C1:[0-9]+]], 8
5050 ; CHECK-DAG: li [[C2:[0-9]+]], 16
51 ; CHECK-DAG: xsmaddmdp 3, 2, 1
52 ; CHECK-DAG: xsmaddmdp 4, 2, 1
53 ; CHECK-DAG: xsmaddadp 1, 2, 5
54 ; CHECK-DAG: stxsdx 3, 0, 8
55 ; CHECK-DAG: stxsdx 4, 8, [[C1]]
56 ; CHECK-DAG: stxsdx 1, 8, [[C2]]
51 ; FIXME: We no longer get this because of copy ordering at the MI level.
52 ; CHECX-DAG: xsmaddmdp 3, 2, 1
53 ; CHECX-DAG: xsmaddmdp 4, 2, 1
54 ; CHECX-DAG: xsmaddadp 1, 2, 5
55 ; CHECX-DAG: stxsdx 3, 0, 8
56 ; CHECX-DAG: stxsdx 4, 8, [[C1]]
57 ; CHECX-DAG: stxsdx 1, 8, [[C2]]
5758 ; CHECK: blr
5859
5960 ; CHECK-FISL-LABEL: @test2
212213 ret void
213214
214215 ; CHECK-LABEL: @testv2
215 ; CHECK-DAG: xvmaddmdp 36, 35, 34
216 ; CHECK-DAG: xvmaddmdp 37, 35, 34
217 ; CHECK-DAG: li [[C1:[0-9]+]], 16
218 ; CHECK-DAG: li [[C2:[0-9]+]], 32
219 ; CHECK-DAG: xvmaddadp 34, 35, 38
220 ; CHECK-DAG: stxvd2x 36, 0, 3
221 ; CHECK-DAG: stxvd2x 37, 3, [[C1:[0-9]+]]
222 ; CHECK-DAG: stxvd2x 34, 3, [[C2:[0-9]+]]
216 ; FIXME: We currently don't get this because of copy ordering on the MI level.
217 ; CHECX-DAG: xvmaddmdp 36, 35, 34
218 ; CHECX-DAG: xvmaddmdp 37, 35, 34
219 ; CHECX-DAG: li [[C1:[0-9]+]], 16
220 ; CHECX-DAG: li [[C2:[0-9]+]], 32
221 ; CHECX-DAG: xvmaddadp 34, 35, 38
222 ; CHECX-DAG: stxvd2x 36, 0, 3
223 ; CHECX-DAG: stxvd2x 37, 3, [[C1:[0-9]+]]
224 ; CHECX-DAG: stxvd2x 34, 3, [[C2:[0-9]+]]
223225 ; CHECK: blr
224226
225227 ; CHECK-FISL-LABEL: @testv2
4141 ; CHECK-LABEL: @test2sp
4242 ; CHECK-DAG: li [[C1:[0-9]+]], 4
4343 ; CHECK-DAG: li [[C2:[0-9]+]], 8
44 ; CHECK-DAG: xsmaddmsp 3, 2, 1
45 ; CHECK-DAG: xsmaddmsp 4, 2, 1
46 ; CHECK-DAG: xsmaddasp 1, 2, 5
47 ; CHECK-DAG: stxsspx 3, 0, 8
48 ; CHECK-DAG: stxsspx 4, 8, [[C1]]
49 ; CHECK-DAG: stxsspx 1, 8, [[C2]]
44 ; FIXME: We now miss this because of copy ordering at the MI level.
45 ; CHECX-DAG: xsmaddmsp 3, 2, 1
46 ; CHECX-DAG: xsmaddmsp 4, 2, 1
47 ; CHECX-DAG: xsmaddasp 1, 2, 5
48 ; CHECX-DAG: stxsspx 3, 0, 8
49 ; CHECX-DAG: stxsspx 4, 8, [[C1]]
50 ; CHECX-DAG: stxsspx 1, 8, [[C2]]
5051 ; CHECK: blr
5152
5253 ; CHECK-FISL-LABEL: @test2sp