This patch combines several changes from Evan Cheng for rdar://8659675.

Making use of VFP / NEON floating-point multiply-accumulate / subtract is
difficult on current ARM implementations for a few reasons.

1. Even though a single vmla has a latency that is one cycle shorter than a
   pair of vmul + vadd, a RAW hazard during the first few cycles (4? on
   Cortex-A8) can cause an additional pipeline stall. So it's frequently
   better to simply codegen vmul + vadd.

2. A vmla followed by a vmul, vadd, or vsub causes the second fp instruction
   to stall for 4 cycles. We need to schedule them apart.

3. A vmla followed by another vmla is a special case. Obviously, issuing
   back-to-back RAW vmla + vmla is very bad. But this isn't ideal either:
     vmul
     vadd
     vmla
   Instead, we want to expand the second vmla:
     vmla
     vmul
     vadd
   Even with the 4 cycle vmul stall, the second sequence is still 2 cycles
   faster.

Up to now, isel has simply avoided codegen'ing fp vmla / vmls. This works
well enough, but it isn't the optimal solution. This patch attempts to make
it possible to use vmla / vmls in cases where it is profitable:

A. Add missing isel predicates which cause vmla to be codegen'ed.
B. Make sure the fmul in (fadd (fmul)) has a single use. We don't want to
   compute both an fmul and an fmla.
C. Add additional isel checks for vmla, avoiding cases where the vmla feeds
   into other fp instructions (except for the exceptional case in #3).
D. Add an ARM hazard recognizer to model the vmla / vmls hazards.
E. Add a special pre-regalloc pass to expand vmla / vmls when it's likely the
   vmla / vmls will trigger one of the special hazards.

Enable these fp vmlx codegen changes for Cortex-A9.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@129775 91177308-0d34-0410-b5e6-96231b3b80d8

Bob Wilson 8 years ago
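As a rough sketch of the expansion described in case #3 and item E above (this is not the actual MLxExpansion pass; the Inst struct, the expandMLx helper, and register names such as d4 are invented for illustration), here is a minimal standalone C++ model that splits the second of two dependent vmla instructions into a vmul into a fresh temporary followed by a vadd:

// Toy model (not LLVM code) of the expansion idea: rewrite the second of two
// accumulator-dependent vmla instructions, "dst = dst + src0 * src1", as a
// vmul into a fresh temporary followed by a vadd, so it no longer issues as a
// back-to-back RAW vmla.
#include <cstdio>
#include <string>
#include <vector>

struct Inst {
  std::string opc;              // "vmla", "vmul", "vadd", ...
  std::string dst, src0, src1;  // register names (dst doubles as accumulator)
};

// Expand "vmla dst, src0, src1" into vmul + vadd. Tmp is a caller-supplied
// (hypothetical) scratch register; a real pass would create a virtual register.
static std::vector<Inst> expandMLx(const Inst &MLA, const std::string &Tmp) {
  return {
      {"vmul", Tmp, MLA.src0, MLA.src1},  // Tmp = src0 * src1
      {"vadd", MLA.dst, MLA.dst, Tmp},    // dst = acc + Tmp
  };
}

int main() {
  // Second of two back-to-back, accumulator-dependent vmla instructions.
  Inst Second = {"vmla", "d3", "d1", "d2"};
  for (const Inst &I : expandMLx(Second, "d4"))
    std::printf("%s %s, %s, %s\n", I.opc.c_str(), I.dst.c_str(),
                I.src0.c_str(), I.src1.c_str());
  return 0;
}

The printed result is the vmul / vadd pair; together with the untouched first vmla, this matches the preferred vmla, vmul, vadd ordering from case #3.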
6 changed files with 80 additions and 12 deletions.
117117 FeatureT2XtPk]>;
118118 def ProcA9 : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9",
119119 "Cortex-A9 ARM processors",
120 [FeatureHasSlowFPVMLx, FeatureVMLxForwarding,
120 [FeatureVMLxForwarding,
121121 FeatureT2XtPk, FeatureFP16,
122122 FeatureAvoidPartialCPSR]>;
123123
4848 const TargetInstrDesc &LastTID = LastMI->getDesc();
4949 // Skip over one non-VFP / NEON instruction.
5050 if (!LastTID.isBarrier() &&
51 // On A9, AGU and NEON/FPU are muxed.
52 !(STI.isCortexA9() && (LastTID.mayLoad() || LastTID.mayStore())) &&
5153 (LastTID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) {
5254 MachineBasicBlock::iterator I = LastMI;
5355 if (I != LastMI->getParent()->begin()) {
4444 static cl::opt<bool>
4545 CheckVMLxHazard("check-vmlx-hazard", cl::Hidden,
4646 cl::desc("Check fp vmla / vmls hazard at isel time"),
47 cl::init(false));
47 cl::init(true));
4848
4949 //===--------------------------------------------------------------------===//
5050 /// ARMDAGToDAGISel - ARM specific code to select ARM machine
2020 #include "llvm/Target/TargetOptions.h"
2121 #include "llvm/Target/TargetRegistry.h"
2222 using namespace llvm;
23
24 static cl::opt<bool> ExpandMLx("expand-fp-mlx", cl::init(false), cl::Hidden);
2523
2624 static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
2725 Triple TheTriple(TT);
147145 // FIXME: temporarily disabling load / store optimization pass for Thumb1.
148146 if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only())
149147 PM.add(createARMLoadStoreOptimizationPass(true));
150 if (ExpandMLx &&
151 OptLevel != CodeGenOpt::None && Subtarget.hasVFP2())
148 if (OptLevel != CodeGenOpt::None && Subtarget.isCortexA9())
152149 PM.add(createMLxExpansionPass());
153150
154151 return true;
1414 #define DEBUG_TYPE "mlx-expansion"
1515 #include "ARM.h"
1616 #include "ARMBaseInstrInfo.h"
17 #include "ARMSubtarget.h"
1718 #include "llvm/CodeGen/MachineInstr.h"
1819 #include "llvm/CodeGen/MachineInstrBuilder.h"
1920 #include "llvm/CodeGen/MachineFunctionPass.h"
2021 #include "llvm/CodeGen/MachineRegisterInfo.h"
2122 #include "llvm/Target/TargetRegisterInfo.h"
23 #include "llvm/ADT/SmallPtrSet.h"
2224 #include "llvm/ADT/Statistic.h"
2325 #include "llvm/Support/CommandLine.h"
2426 #include "llvm/Support/Debug.h"
4850 const TargetRegisterInfo *TRI;
4951 MachineRegisterInfo *MRI;
5052
53 bool isA9;
5154 unsigned MIIdx;
5255 MachineInstr* LastMIs[4];
56 SmallPtrSet<MachineInstr*, 4> IgnoreStall;
5357
5458 void clearStack();
5559 void pushStack(MachineInstr *MI);
5660 MachineInstr *getAccDefMI(MachineInstr *MI) const;
5761 unsigned getDefReg(MachineInstr *MI) const;
5862 bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const;
59 bool FindMLxHazard(MachineInstr *MI) const;
63 bool FindMLxHazard(MachineInstr *MI);
6064 void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
6165 unsigned MulOpc, unsigned AddSubOpc,
6266 bool NegAcc, bool HasLane);
145149 }
146150
147151
148 bool MLxExpansion::FindMLxHazard(MachineInstr *MI) const {
152 bool MLxExpansion::FindMLxHazard(MachineInstr *MI) {
149153 if (NumExpand >= ExpandLimit)
150154 return false;
151155
153157 return true;
154158
155159 MachineInstr *DefMI = getAccDefMI(MI);
156 if (TII->isFpMLxInstruction(DefMI->getOpcode()))
160 if (TII->isFpMLxInstruction(DefMI->getOpcode())) {
157161 // r0 = vmla
158162 // r3 = vmla r0, r1, r2
159163 // takes 16 - 17 cycles
162166 // r4 = vmul r1, r2
163167 // r3 = vadd r0, r4
164168 // takes about 14 - 15 cycles even with vmul stalling for 4 cycles.
169 IgnoreStall.insert(DefMI);
165170 return true;
171 }
172
173 if (IgnoreStall.count(MI))
174 return false;
166175
167176 // If a VMLA.F is followed by an VADD.F or VMUL.F with no RAW hazard, the
168177 // VADD.F or VMUL.F will stall 4 cycles before issue. The 4 cycle stall
169178 // preserves the in-order retirement of the instructions.
170179 // Look at the next few instructions, if *most* of them can cause hazards,
171180 // then the scheduler can't *fix* this, we'd better break up the VMLA.
181 unsigned Limit1 = isA9 ? 1 : 4;
182 unsigned Limit2 = isA9 ? 1 : 4;
172183 for (unsigned i = 1; i <= 4; ++i) {
173184 int Idx = ((int)MIIdx - i + 4) % 4;
174185 MachineInstr *NextMI = LastMIs[Idx];
175186 if (!NextMI)
176187 continue;
177188
178 if (TII->canCauseFpMLxStall(NextMI->getOpcode()))
179 return true;
189 if (TII->canCauseFpMLxStall(NextMI->getOpcode())) {
190 if (i <= Limit1)
191 return true;
192 }
180193
181194 // Look for VMLx RAW hazard.
182 if (hasRAWHazard(getDefReg(MI), NextMI))
195 if (i <= Limit2 && hasRAWHazard(getDefReg(MI), NextMI))
183196 return true;
184197 }
185198
247260 bool Changed = false;
248261
249262 clearStack();
263 IgnoreStall.clear();
250264
251265 unsigned Skip = 0;
252266 MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend();
298312 TII = static_cast<const ARMBaseInstrInfo*>(Fn.getTarget().getInstrInfo());
299313 TRI = Fn.getTarget().getRegisterInfo();
300314 MRI = &Fn.getRegInfo();
315 const ARMSubtarget *STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
316 isA9 = STI->isCortexA9();
301317
302318 bool Modified = false;
303319 for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
0 ; RUN: llc < %s -march=arm -mattr=+vfp2 | FileCheck %s -check-prefix=VFP2
11 ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s -check-prefix=NEON
22 ; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8
3 ; RUN: llc < %s -march=arm -mcpu=cortex-a9 | FileCheck %s -check-prefix=A9
4 ; RUN: llc < %s -mtriple=arm-linux-gnueabi -mcpu=cortex-a9 -float-abi=hard | FileCheck %s -check-prefix=HARD
35
46 define float @t1(float %acc, float %a, float %b) {
57 entry:
4850 %1 = fadd float %0, %acc
4951 ret float %1
5052 }
53
54 ; It's possible to make use of fp vmla / vmls on Cortex-A9.
55 ; rdar://8659675
56 define void @t4(float %acc1, float %a, float %b, float %acc2, float %c, float* %P1, float* %P2) {
57 entry:
58 ; A8: t4:
59 ; A8: vmul.f32
60 ; A8: vmul.f32
61 ; A8: vadd.f32
62 ; A8: vadd.f32
63
64 ; Two vmlas with no RAW hazard
65 ; A9: t4:
66 ; A9: vmla.f32
67 ; A9: vmla.f32
68
69 ; HARD: t4:
70 ; HARD: vmla.f32 s0, s1, s2
71 ; HARD: vmla.f32 s3, s1, s4
72 %0 = fmul float %a, %b
73 %1 = fadd float %acc1, %0
74 %2 = fmul float %a, %c
75 %3 = fadd float %acc2, %2
76 store float %1, float* %P1
77 store float %3, float* %P2
78 ret void
79 }
80
81 define float @t5(float %a, float %b, float %c, float %d, float %e) {
82 entry:
83 ; A8: t5:
84 ; A8: vmul.f32
85 ; A8: vmul.f32
86 ; A8: vadd.f32
87 ; A8: vadd.f32
88
89 ; A9: t5:
90 ; A9: vmla.f32
91 ; A9: vmul.f32
92 ; A9: vadd.f32
93
94 ; HARD: t5:
95 ; HARD: vmla.f32 s4, s0, s1
96 ; HARD: vmul.f32 s0, s2, s3
97 ; HARD: vadd.f32 s0, s4, s0
98 %0 = fmul float %a, %b
99 %1 = fadd float %e, %0
100 %2 = fmul float %c, %d
101 %3 = fadd float %1, %2
102 ret float %3
103 }