llvm.org GIT mirror llvm / 48575f6
Making use of VFP / NEON floating point multiply-accumulate / subtraction is difficult on current ARM implementations for a few reasons. 1. Even though a single vmla has latency that is one cycle shorter than a pair of vmul + vadd, a RAW hazard during the first (4? on Cortex-A8) cycles can cause an additional pipeline stall. So it's frequently better to simply codegen vmul + vadd. 2. A vmla followed by a vmul, vmadd, or vsub causes the second fp instruction to stall for 4 cycles. We need to schedule them apart. 3. A vmla followed by a vmla is a special case. Obviously, issuing back-to-back RAW vmla + vmla is very bad. But this isn't ideal either: vmul vadd vmla Instead, we want to expand the second vmla: vmla vmul vadd Even with the 4 cycle vmul stall, the second sequence is still 2 cycles faster. Up to now, isel simply avoids codegen'ing fp vmla / vmls. This works well enough but it isn't the optimal solution. This patch attempts to make it possible to use vmla / vmls in cases where it is profitable. A. Add missing isel predicates which cause vmla to be codegen'ed. B. Make sure the fmul in (fadd (fmul)) has a single use. We don't want to compute a fmul and a fmla. C. Add additional isel checks for vmla, avoid cases where vmla is feeding into fp instructions (except for the #3 exceptional case). D. Add ARM hazard recognizer to model the vmla / vmls hazards. E. Add a special pre-regalloc case to expand vmla / vmls when it's likely the vmla / vmls will trigger one of the special hazards. Work in progress, only A+B are enabled. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@120960 91177308-0d34-0410-b5e6-96231b3b80d8 Evan Cheng 9 years ago
20 changed file(s) with 778 addition(s) and 197 deletion(s). Raw diff Collapse all Expand all
4848 FunctionPass *createARMGlobalMergePass(const TargetLowering* tli);
4949 FunctionPass *createARMConstantIslandPass();
5050 FunctionPass *createNEONMoveFixPass();
51 FunctionPass *createMLxExpansionPass();
5152 FunctionPass *createThumb2ITBlockPass();
5253 FunctionPass *createThumb2SizeReductionPass();
5354
4545 def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true",
4646 "Floating point unit supports single precision only">;
4747
48 // Some processors have multiply-accumulate instructions that don't
49 // play nicely with other VFP instructions, and it's generally better
48 // Some processors have FP multiply-accumulate instructions that don't
49 // play nicely with other VFP / NEON instructions, and it's generally better
5050 // to just not use them.
51 // FIXME: Currently, this is only flagged for Cortex-A8. It may be true for
52 // others as well. We should do more benchmarking and confirm one way or
53 // the other.
54 def FeatureHasSlowVMLx : SubtargetFeature<"vmlx", "SlowVMLx", "true",
55 "Disable VFP MAC instructions">;
51 def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true",
52 "Disable VFP / NEON MAC instructions">;
5653 // Some processors benefit from using NEON instructions for scalar
5754 // single-precision FP operations.
5855 def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP",
149146 // V6 Processors.
150147 def : Processor<"arm1136j-s", ARMV6Itineraries, [ArchV6]>;
151148 def : Processor<"arm1136jf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2,
152 FeatureHasSlowVMLx]>;
149 FeatureHasSlowFPVMLx]>;
153150 def : Processor<"arm1176jz-s", ARMV6Itineraries, [ArchV6]>;
154 def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2]>;
151 def : Processor<"arm1176jzf-s", ARMV6Itineraries, [ArchV6, FeatureVFP2,
152 FeatureHasSlowFPVMLx]>;
155153 def : Processor<"mpcorenovfp", ARMV6Itineraries, [ArchV6]>;
156 def : Processor<"mpcore", ARMV6Itineraries, [ArchV6, FeatureVFP2]>;
154 def : Processor<"mpcore", ARMV6Itineraries, [ArchV6, FeatureVFP2,
155 FeatureHasSlowFPVMLx]>;
157156
158157 // V6M Processors.
159158 def : Processor<"cortex-m0", ARMV6Itineraries, [ArchV6M]>;
160159
161160 // V6T2 Processors.
162161 def : Processor<"arm1156t2-s", ARMV6Itineraries, [ArchV6T2]>;
163 def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ArchV6T2, FeatureVFP2]>;
162 def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ArchV6T2, FeatureVFP2,
163 FeatureHasSlowFPVMLx]>;
164164
165165 // V7 Processors.
166166 def : Processor<"cortex-a8", CortexA8Itineraries,
167167 [ArchV7A, ProcA8,
168 FeatureHasSlowVMLx, FeatureT2XtPk]>;
168 FeatureHasSlowFPVMLx, FeatureT2XtPk]>;
169169 def : Processor<"cortex-a9", CortexA9Itineraries,
170170 [ArchV7A, ProcA9,
171 FeatureHasSlowVMLx, FeatureT2XtPk]>;
171 FeatureHasSlowFPVMLx, FeatureT2XtPk]>;
172172
173173 // V7M Processors.
174174 def : ProcNoItin<"cortex-m3", [ArchV7M]>;
1414 #include "ARM.h"
1515 #include "ARMAddressingModes.h"
1616 #include "ARMConstantPoolValue.h"
17 #include "ARMHazardRecognizer.h"
1718 #include "ARMMachineFunctionInfo.h"
1819 #include "ARMRegisterInfo.h"
1920 #include "ARMGenInstrInfo.inc"
3940 EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
4041 cl::desc("Enable ARM 2-addr to 3-addr conv"));
4142
43
/// ARM_MLxEntry - Record information about MLA / MLS instructions.
struct ARM_MLxEntry {
  unsigned MLxOpc;     // MLA / MLS opcode
  unsigned MulOpc;     // Expanded multiplication opcode
  unsigned AddSubOpc;  // Expanded add / sub opcode
  bool NegAcc;         // True if the acc is negated before the add / sub.
  bool HasLane;        // True if instruction has an extra "lane" operand.
};

/// ARM_MLxTable - One entry per fp MLA / MLS opcode, giving the multiply and
/// add / sub pair it can be expanded into. The ARMBaseInstrInfo constructor
/// builds MLxEntryMap and MLxHazardOpcodes from this table, so each entry's
/// position in the table is its map index.
static const ARM_MLxEntry ARM_MLxTable[] = {
  // MLxOpc,           MulOpc,          AddSubOpc,       NegAcc, HasLane
  // fp scalar ops
  { ARM::VMLAS,        ARM::VMULS,      ARM::VADDS,      false,  false },
  { ARM::VMLSS,        ARM::VMULS,      ARM::VSUBS,      false,  false },
  { ARM::VMLAD,        ARM::VMULD,      ARM::VADDD,      false,  false },
  { ARM::VMLSD,        ARM::VMULD,      ARM::VSUBD,      false,  false },
  { ARM::VMLAfd_sfp,   ARM::VMULfd_sfp, ARM::VADDfd_sfp, false,  false },
  { ARM::VMLSfd_sfp,   ARM::VMULfd_sfp, ARM::VSUBfd_sfp, false,  false },
  { ARM::VNMLAS,       ARM::VNMULS,     ARM::VSUBS,      true,   false },
  { ARM::VNMLSS,       ARM::VMULS,      ARM::VSUBS,      true,   false },
  { ARM::VNMLAD,       ARM::VNMULD,     ARM::VSUBD,      true,   false },
  { ARM::VNMLSD,       ARM::VMULD,      ARM::VSUBD,      true,   false },

  // fp SIMD ops
  { ARM::VMLAfd,       ARM::VMULfd,     ARM::VADDfd,     false,  false },
  { ARM::VMLSfd,       ARM::VMULfd,     ARM::VSUBfd,     false,  false },
  { ARM::VMLAfq,       ARM::VMULfq,     ARM::VADDfq,     false,  false },
  { ARM::VMLSfq,       ARM::VMULfq,     ARM::VSUBfq,     false,  false },
  { ARM::VMLAslfd,     ARM::VMULslfd,   ARM::VADDfd,     false,  true  },
  { ARM::VMLSslfd,     ARM::VMULslfd,   ARM::VSUBfd,     false,  true  },
  { ARM::VMLAslfq,     ARM::VMULslfq,   ARM::VADDfq,     false,  true  },
  { ARM::VMLSslfq,     ARM::VMULslfq,   ARM::VSUBfq,     false,  true  },
};
77
4278 ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI)
4379 : TargetInstrInfoImpl(ARMInsts, array_lengthof(ARMInsts)),
4480 Subtarget(STI) {
81 for (unsigned i = 0, e = array_lengthof(ARM_MLxTable); i != e; ++i) {
82 if (!MLxEntryMap.insert(std::make_pair(ARM_MLxTable[i].MLxOpc, i)).second)
83 assert(false && "Duplicated entries?");
84 MLxHazardOpcodes.insert(ARM_MLxTable[i].AddSubOpc);
85 MLxHazardOpcodes.insert(ARM_MLxTable[i].MulOpc);
86 }
87 }
88
89 ScheduleHazardRecognizer *ARMBaseInstrInfo::
90 CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II) const {
91 if (Subtarget.isThumb2() || Subtarget.hasVFP2())
92 return (ScheduleHazardRecognizer *)
93 new ARMHazardRecognizer(II, *this, getRegisterInfo(), Subtarget);
94 return TargetInstrInfoImpl::CreateTargetPostRAHazardRecognizer(II);
4595 }
4696
4797 MachineInstr *
195245 MFI->insert(MBBI, NewMIs[0]);
196246 return NewMIs[0];
197247 }
198
199248
200249 // Branch analysis.
201250 bool
21952244 }
21962245 return false;
21972246 }
2247
2248 bool
2249 ARMBaseInstrInfo::isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc,
2250 unsigned &AddSubOpc,
2251 bool &NegAcc, bool &HasLane) const {
2252 DenseMap::const_iterator I = MLxEntryMap.find(Opcode);
2253 if (I == MLxEntryMap.end())
2254 return false;
2255
2256 const ARM_MLxEntry &Entry = ARM_MLxTable[I->second];
2257 MulOpc = Entry.MulOpc;
2258 AddSubOpc = Entry.AddSubOpc;
2259 NegAcc = Entry.NegAcc;
2260 HasLane = Entry.HasLane;
2261 return true;
2262 }
1616 #include "ARM.h"
1717 #include "llvm/CodeGen/MachineInstrBuilder.h"
1818 #include "llvm/Target/TargetInstrInfo.h"
19 #include "llvm/ADT/DenseMap.h"
20 #include "llvm/ADT/SmallSet.h"
1921
2022 namespace llvm {
2123 class ARMSubtarget;
190192
191193 class ARMBaseInstrInfo : public TargetInstrInfoImpl {
192194 const ARMSubtarget &Subtarget;
195
193196 protected:
194197 // Can be only subclassed.
195198 explicit ARMBaseInstrInfo(const ARMSubtarget &STI);
199
196200 public:
197201 // Return the non-pre/post incrementing version of 'Opc'. Return 0
198202 // if there is not such an opcode.
205209 virtual const ARMBaseRegisterInfo &getRegisterInfo() const =0;
206210 const ARMSubtarget &getSubtarget() const { return Subtarget; }
207211
208 public:
212 ScheduleHazardRecognizer *
213 CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II) const;
214
209215 // Branch analysis.
210216 virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
211217 MachineBasicBlock *&FBB,
392398 const MachineInstr *UseMI, unsigned UseIdx) const;
393399 bool hasLowDefLatency(const InstrItineraryData *ItinData,
394400 const MachineInstr *DefMI, unsigned DefIdx) const;
401
402 private:
403 /// Modeling special VFP / NEON fp MLA / MLS hazards.
404
405 /// MLxEntryMap - Map fp MLA / MLS to the corresponding entry in the internal
406 /// MLx table.
407 DenseMap MLxEntryMap;
408
409 /// MLxHazardOpcodes - Set of add / sub and multiply opcodes that would cause
410 /// stalls when scheduled together with fp MLA / MLS opcodes.
411 SmallSet MLxHazardOpcodes;
412
413 public:
414 /// isFpMLxInstruction - Return true if the specified opcode is a fp MLA / MLS
415 /// instruction.
416 bool isFpMLxInstruction(unsigned Opcode) const {
417 return MLxEntryMap.count(Opcode);
418 }
419
420 /// isFpMLxInstruction - This version also returns the multiply opcode and the
421 /// addition / subtraction opcode to expand to. Return true for 'HasLane' for
422 /// the MLX instructions with an extra lane operand.
423 bool isFpMLxInstruction(unsigned Opcode, unsigned &MulOpc,
424 unsigned &AddSubOpc, bool &NegAcc,
425 bool &HasLane) const;
426
427 /// canCauseFpMLxStall - Return true if an instruction of the specified opcode
428 /// will cause stalls when scheduled after (within 4-cycle window) a fp
429 /// MLA / MLS instruction.
430 bool canCauseFpMLxStall(unsigned Opcode) const {
431 return MLxHazardOpcodes.count(Opcode);
432 }
395433 };
396434
397435 static inline
0 //===-- ARMHazardRecognizer.cpp - ARM postra hazard recognizer ------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "ARMHazardRecognizer.h"
10 #include "ARMBaseInstrInfo.h"
11 #include "ARMSubtarget.h"
12 #include "llvm/CodeGen/MachineInstr.h"
13 #include "llvm/CodeGen/ScheduleDAG.h"
14 #include "llvm/Target/TargetRegisterInfo.h"
15 using namespace llvm;
16
/// hasRAWHazard - Return true if MI reads the register defined by DefMI's
/// first operand, i.e. a read-after-write dependence that matters for the
/// fp MLx stall window. VFP stores / transfers to core registers and NEON
/// loads / stores are exempted — NOTE(review): exemption list presumably
/// covers instructions that read the value late; confirm against the
/// Cortex-A8 TRM.
static bool hasRAWHazard(MachineInstr *DefMI, MachineInstr *MI,
                         const TargetRegisterInfo &TRI) {
  // FIXME: Detect integer instructions properly.
  const TargetInstrDesc &TID = MI->getDesc();
  unsigned Domain = TID.TSFlags & ARMII::DomainMask;
  if (Domain == ARMII::DomainVFP) {
    unsigned Opcode = MI->getOpcode();
    // VFP stores and VFP -> GPR moves are not checked for the hazard.
    if (Opcode == ARM::VSTRS || Opcode == ARM::VSTRD ||
        Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
      return false;
  } else if (Domain == ARMII::DomainNEON) {
    // NEON loads / stores are likewise exempt.
    if (MI->getDesc().mayStore() || MI->getDesc().mayLoad())
      return false;
  } else
    // General-domain (integer) instructions are not modeled (see FIXME).
    return false;
  return MI->readsRegister(DefMI->getOperand(0).getReg(), &TRI);
}
34
/// getHazardType - In addition to the base itinerary checks, enforce two
/// ARM-specific rules: (1) instructions inside a Thumb2 IT block must issue
/// in their recorded order, and (2) a VMUL / VADD / VSUB (or a RAW-dependent
/// fp instruction) right after a fp VMLA / VMLS incurs a 4-cycle stall.
ScheduleHazardRecognizer::HazardType
ARMHazardRecognizer::getHazardType(SUnit *SU) {
  MachineInstr *MI = SU->getInstr();

  if (!MI->isDebugValue()) {
    // Inside an IT block, only the next recorded MI may issue; anything else
    // is a hazard.
    if (ITBlockSize && MI != ITBlockMIs[ITBlockSize-1])
      return Hazard;

    // Look for special VMLA / VMLS hazards. A VMUL / VADD / VSUB following
    // a VMLA / VMLS will cause 4 cycle stall.
    const TargetInstrDesc &TID = MI->getDesc();
    if (LastMI && (TID.TSFlags & ARMII::DomainMask) != ARMII::DomainGeneral) {
      MachineInstr *DefMI = LastMI;
      const TargetInstrDesc &LastTID = LastMI->getDesc();
      // Skip over one non-VFP / NEON instruction.
      if (!LastTID.isBarrier() &&
          (LastTID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) {
        MachineBasicBlock::iterator I = LastMI;
        if (I != LastMI->getParent()->begin()) {
          I = llvm::prior(I);
          DefMI = &*I;
        }
      }

      // Hazard if the previous (or skipped-to) instruction is a fp MLx and
      // MI either is a known stall-causing opcode or has a RAW dependence.
      if (TII.isFpMLxInstruction(DefMI->getOpcode()) &&
          (TII.canCauseFpMLxStall(MI->getOpcode()) ||
           hasRAWHazard(DefMI, MI, TRI))) {
        // Try to schedule another instruction for the next 4 cycles.
        if (Stalls == 0)
          Stalls = 4;
        return Hazard;
      }
    }
  }

  return PostRAHazardRecognizer::getHazardType(SU);
}
72
/// Reset - Clear all recognizer state (MLx stall tracking and IT block
/// bookkeeping) between scheduling regions.
void ARMHazardRecognizer::Reset() {
  LastMI = 0;
  Stalls = 0;
  ITBlockSize = 0;
  PostRAHazardRecognizer::Reset();
}
79
/// EmitInstruction - Record state needed by getHazardType: the members of a
/// newly seen IT block (so their order can be enforced) and the last real
/// instruction emitted (for MLx stall detection).
void ARMHazardRecognizer::EmitInstruction(SUnit *SU) {
  MachineInstr *MI = SU->getInstr();
  unsigned Opcode = MI->getOpcode();
  if (ITBlockSize) {
    // One more instruction of the current IT block has been issued.
    --ITBlockSize;
  } else if (Opcode == ARM::t2IT) {
    // Decode the IT mask: the number of trailing zeros determines how many
    // of the following instructions (1-4) the block predicates.
    unsigned Mask = MI->getOperand(1).getImm();
    unsigned NumTZ = CountTrailingZeros_32(Mask);
    assert(NumTZ <= 3 && "Invalid IT mask!");
    ITBlockSize = 4 - NumTZ;
    MachineBasicBlock::iterator I = MI;
    for (unsigned i = 0; i < ITBlockSize; ++i) {
      // Advance to the next instruction, skipping any dbg_value instructions.
      do {
        ++I;
      } while (I->isDebugValue());
      // Stored in reverse so ITBlockMIs[ITBlockSize-1] is always the next
      // instruction expected to issue.
      ITBlockMIs[ITBlockSize-1-i] = &*I;
    }
  }

  if (!MI->isDebugValue()) {
    // Remember the last non-debug instruction for the MLx hazard check.
    LastMI = MI;
    Stalls = 0;
  }

  PostRAHazardRecognizer::EmitInstruction(SU);
}
107
/// AdvanceCycle - Count down the MLx stall window; once it expires, stop
/// treating the remembered MLx instruction as a hazard source.
void ARMHazardRecognizer::AdvanceCycle() {
  if (Stalls && --Stalls == 0)
    // Stalled for 4 cycles but still can't schedule any other instructions.
    LastMI = 0;
  PostRAHazardRecognizer::AdvanceCycle();
}
0 //===-- ARMHazardRecognizer.h - ARM Hazard Recognizers ----------*- C++ -*-===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines hazard recognizers for scheduling ARM functions.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #ifndef ARMHAZARDRECOGNIZER_H
14 #define ARMHAZARDRECOGNIZER_H
15
16 #include "llvm/CodeGen/PostRAHazardRecognizer.h"
17
18 namespace llvm {
19
20 class ARMBaseInstrInfo;
21 class ARMBaseRegisterInfo;
22 class ARMSubtarget;
23 class MachineInstr;
24
25 class ARMHazardRecognizer : public PostRAHazardRecognizer {
26 const ARMBaseInstrInfo &TII;
27 const ARMBaseRegisterInfo &TRI;
28 const ARMSubtarget &STI;
29
30 MachineInstr *LastMI;
31 unsigned Stalls;
32 unsigned ITBlockSize; // No. of MIs in current IT block yet to be scheduled.
33 MachineInstr *ITBlockMIs[4];
34
35 public:
36 ARMHazardRecognizer(const InstrItineraryData *ItinData,
37 const ARMBaseInstrInfo &tii,
38 const ARMBaseRegisterInfo &tri,
39 const ARMSubtarget &sti) :
40 PostRAHazardRecognizer(ItinData), TII(tii), TRI(tri), STI(sti),
41 LastMI(0), ITBlockSize(0) {}
42
43 virtual HazardType getHazardType(SUnit *SU);
44 virtual void Reset();
45 virtual void EmitInstruction(SUnit *SU);
46 virtual void AdvanceCycle();
47 };
48
49
50 } // end namespace llvm
51
52 #endif // ARMHAZARDRECOGNIZER_H
1212
1313 #define DEBUG_TYPE "arm-isel"
1414 #include "ARM.h"
15 #include "ARMBaseInstrInfo.h"
1516 #include "ARMAddressingModes.h"
1617 #include "ARMTargetMachine.h"
1718 #include "llvm/CallingConv.h"
4041 cl::desc("Disable isel of shifter-op"),
4142 cl::init(false));
4243
44 static cl::opt
45 CheckVMLxHazard("check-vmlx-hazard", cl::Hidden,
46 cl::desc("Check fp vmla / vmls hazard at isel time"),
47 cl::init(false));
48
4349 //===--------------------------------------------------------------------===//
4450 /// ARMDAGToDAGISel - ARM specific code to select ARM machine
4551 /// instructions for SelectionDAG operations.
5359
5460 class ARMDAGToDAGISel : public SelectionDAGISel {
5561 ARMBaseTargetMachine &TM;
62 const ARMBaseInstrInfo *TII;
5663
5764 /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
5865 /// make the right decision when generating code for different targets.
6269 explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm,
6370 CodeGenOpt::Level OptLevel)
6471 : SelectionDAGISel(tm, OptLevel), TM(tm),
65 Subtarget(&TM.getSubtarget()) {
72 TII(static_cast(TM.getInstrInfo())),
73 Subtarget(&TM.getSubtarget()) {
6674 }
6775
6876 virtual const char *getPassName() const {
7785
7886 SDNode *Select(SDNode *N);
7987
88
89 bool hasNoVMLxHazardUse(SDNode *N) const;
8090 bool isShifterOpProfitable(const SDValue &Shift,
8191 ARM_AM::ShiftOpc ShOpcVal, unsigned ShAmt);
8292 bool SelectShifterOperandReg(SDValue N, SDValue &A,
271281 isInt32Immediate(N->getOperand(1).getNode(), Imm);
272282 }
273283
/// hasNoVMLxHazardUse - Return true if it's desirable to select a FP MLA / MLS
/// node. VFP / NEON fp VMLA / VMLS instructions have special RAW hazards (at
/// least on current ARM implementations) which should be avoided.
bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
  // At -O0 we don't model hazards; always allow MLx formation.
  if (OptLevel == CodeGenOpt::None)
    return true;

  // The check is opt-in via -check-vmlx-hazard.
  if (!CheckVMLxHazard)
    return true;

  // Only Cortex-A8 / Cortex-A9 are checked here.
  if (!Subtarget->isCortexA8() && !Subtarget->isCortexA9())
    return true;

  // With multiple uses we can't reason about the single consumer below.
  if (!N->hasOneUse())
    return false;

  SDNode *Use = *N->use_begin();
  if (Use->getOpcode() == ISD::CopyToReg)
    return true;
  if (Use->isMachineOpcode()) {
    const TargetInstrDesc &TID = TII->get(Use->getMachineOpcode());
    // Stores and FP -> core-register moves are treated as hazard-free uses.
    if (TID.mayStore())
      return true;
    unsigned Opcode = TID.getOpcode();
    if (Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
      return true;
    // vmlx feeding into another vmlx. We actually want to unfold
    // the use later in the MLxExpansion pass. e.g.
    // vmla
    // vmla (stall 8 cycles)
    //
    // vmul (5 cycles)
    // vadd (5 cycles)
    // vmla
    // This adds up to about 18 - 19 cycles.
    //
    // vmla
    // vmul (stall 4 cycles)
    // vadd adds up to about 14 cycles.
    return TII->isFpMLxInstruction(Opcode);
  }

  // Unknown (non-machine) user: conservatively assume a hazard.
  return false;
}
274328
275329 bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift,
276330 ARM_AM::ShiftOpc ShOpcVal,
174174 // FIXME: Eventually this will be just "hasV6T2Ops".
175175 def UseMovt : Predicate<"Subtarget->useMovt()">;
176176 def DontUseMovt : Predicate<"!Subtarget->useMovt()">;
177 def UseVMLx : Predicate<"Subtarget->useVMLx()">;
177 def UseFPVMLx : Predicate<"Subtarget->useFPVMLx()">;
178178
179179 //===----------------------------------------------------------------------===//
180180 // ARM Flag Definitions.
276276 // An 'xor' node with a single use.
277277 def xor_su : PatFrag<(ops node:$lhs, node:$rhs), (xor node:$lhs, node:$rhs), [{
278278 return N->hasOneUse();
279 }]>;
280
281 // An 'fmul' node with a single use.
282 def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs),[{
283 return N->hasOneUse();
284 }]>;
285
286 // An 'fadd' node which checks for single non-hazardous use.
287 def fadd_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fadd node:$lhs, node:$rhs),[{
288 return hasNoVMLxHazardUse(N);
289 }]>;
290
291 // An 'fsub' node which checks for single non-hazardous use.
292 def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
293 return hasNoVMLxHazardUse(N);
279294 }]>;
280295
281296 //===----------------------------------------------------------------------===//
19061906 // Multiply-Add/Sub operations: single-, double- and quad-register.
19071907 class N3VSMulOp op21_20, bits<4> op11_8, bit op4,
19081908 InstrItinClass itin, string OpcodeStr, string Dt,
1909 ValueType Ty, SDNode MulOp, SDNode OpNode>
1909 ValueType Ty, SDPatternOperator MulOp, SDNode OpNode>
19101910 : N3V
19111911 (outs DPR_VFP2:$Vd),
19121912 (ins DPR_VFP2:$src1, DPR_VFP2:$Vn, DPR_VFP2:$Vm), N3RegFrm, itin,
19141914
19151915 class N3VDMulOp op21_20, bits<4> op11_8, bit op4,
19161916 InstrItinClass itin, string OpcodeStr, string Dt,
1917 ValueType Ty, SDNode MulOp, SDNode OpNode>
1917 ValueType Ty, SDPatternOperator MulOp, SDPatternOperator OpNode>
19181918 : N3V
19191919 (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR:$Vm), N3RegFrm, itin,
19201920 OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
19231923
19241924 class N3VDMulOpSL op21_20, bits<4> op11_8, InstrItinClass itin,
19251925 string OpcodeStr, string Dt,
1926 ValueType Ty, SDNode MulOp, SDNode ShOp>
1926 ValueType Ty, SDPatternOperator MulOp, SDPatternOperator ShOp>
19271927 : N3V<0, 1, op21_20, op11_8, 1, 0,
19281928 (outs DPR:$Vd),
19291929 (ins DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane),
19501950
19511951 class N3VQMulOp op21_20, bits<4> op11_8, bit op4,
19521952 InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty,
1953 SDNode MulOp, SDNode OpNode>
1953 SDPatternOperator MulOp, SDPatternOperator OpNode>
19541954 : N3V
19551955 (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, QPR:$Vm), N3RegFrm, itin,
19561956 OpcodeStr, Dt, "$Vd, $Vn, $Vm", "$src1 = $Vd",
19581958 (Ty (MulOp QPR:$Vn, QPR:$Vm)))))]>;
19591959 class N3VQMulOpSL op21_20, bits<4> op11_8, InstrItinClass itin,
19601960 string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
1961 SDNode MulOp, SDNode ShOp>
1961 SDPatternOperator MulOp, SDPatternOperator ShOp>
19621962 : N3V<1, 1, op21_20, op11_8, 1, 0,
19631963 (outs QPR:$Vd),
19641964 (ins QPR:$src1, QPR:$Vn, DPR_VFP2:$Vm, nohash_imm:$lane),
32813281 defm VMLA : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
32823282 IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
32833283 def VMLAfd : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32",
3284 v2f32, fmul, fadd>;
3284 v2f32, fmul_su, fadd_mlx>,
3285 Requires<[HasNEON, UseFPVMLx]>;
32853286 def VMLAfq : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32",
3286 v4f32, fmul, fadd>;
3287 v4f32, fmul_su, fadd_mlx>,
3288 Requires<[HasNEON, UseFPVMLx]>;
32873289 defm VMLAsl : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D,
32883290 IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
32893291 def VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32",
3290 v2f32, fmul, fadd>;
3292 v2f32, fmul_su, fadd_mlx>,
3293 Requires<[HasNEON, UseFPVMLx]>;
32913294 def VMLAslfq : N3VQMulOpSL<0b10, 0b0001, IIC_VMACQ, "vmla", "f32",
3292 v4f32, v2f32, fmul, fadd>;
3295 v4f32, v2f32, fmul_su, fadd_mlx>,
3296 Requires<[HasNEON, UseFPVMLx]>;
32933297
32943298 def : Pat<(v8i16 (add (v8i16 QPR:$src1),
32953299 (mul (v8i16 QPR:$src2),
33073311 (DSubReg_i32_reg imm:$lane))),
33083312 (SubReg_i32_lane imm:$lane)))>;
33093313
3310 def : Pat<(v4f32 (fadd (v4f32 QPR:$src1),
3311 (fmul (v4f32 QPR:$src2),
3314 def : Pat<(v4f32 (fadd_mlx (v4f32 QPR:$src1),
3315 (fmul_su (v4f32 QPR:$src2),
33123316 (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
33133317 (v4f32 (VMLAslfq (v4f32 QPR:$src1),
33143318 (v4f32 QPR:$src2),
33153319 (v2f32 (EXTRACT_SUBREG QPR:$src3,
33163320 (DSubReg_i32_reg imm:$lane))),
3317 (SubReg_i32_lane imm:$lane)))>;
3321 (SubReg_i32_lane imm:$lane)))>,
3322 Requires<[HasNEON, UseFPVMLx]>;
33183323
33193324 // VMLAL : Vector Multiply Accumulate Long (Q += D * D)
33203325 defm VMLALs : N3VLMulOp_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
33343339 defm VMLS : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
33353340 IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
33363341 def VMLSfd : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32",
3337 v2f32, fmul, fsub>;
3342 v2f32, fmul_su, fsub_mlx>,
3343 Requires<[HasNEON, UseFPVMLx]>;
33383344 def VMLSfq : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32",
3339 v4f32, fmul, fsub>;
3345 v4f32, fmul_su, fsub_mlx>,
3346 Requires<[HasNEON, UseFPVMLx]>;
33403347 defm VMLSsl : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D,
33413348 IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
33423349 def VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32",
3343 v2f32, fmul, fsub>;
3350 v2f32, fmul_su, fsub_mlx>,
3351 Requires<[HasNEON, UseFPVMLx]>;
33443352 def VMLSslfq : N3VQMulOpSL<0b10, 0b0101, IIC_VMACQ, "vmls", "f32",
3345 v4f32, v2f32, fmul, fsub>;
3353 v4f32, v2f32, fmul_su, fsub_mlx>,
3354 Requires<[HasNEON, UseFPVMLx]>;
33463355
33473356 def : Pat<(v8i16 (sub (v8i16 QPR:$src1),
33483357 (mul (v8i16 QPR:$src2),
33603369 (DSubReg_i32_reg imm:$lane))),
33613370 (SubReg_i32_lane imm:$lane)))>;
33623371
3363 def : Pat<(v4f32 (fsub (v4f32 QPR:$src1),
3364 (fmul (v4f32 QPR:$src2),
3372 def : Pat<(v4f32 (fsub_mlx (v4f32 QPR:$src1),
3373 (fmul_su (v4f32 QPR:$src2),
33653374 (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
33663375 (v4f32 (VMLSslfq (v4f32 QPR:$src1), (v4f32 QPR:$src2),
33673376 (v2f32 (EXTRACT_SUBREG QPR:$src3,
33683377 (DSubReg_i32_reg imm:$lane))),
3369 (SubReg_i32_lane imm:$lane)))>;
3378 (SubReg_i32_lane imm:$lane)))>,
3379 Requires<[HasNEON, UseFPVMLx]>;
33703380
33713381 // VMLSL : Vector Multiply Subtract Long (Q -= D * D)
33723382 defm VMLSLs : N3VLMulOp_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
47054715 // vml[as].f32 can cause 4-8 cycle stalls in following ASIMD instructions, so
47064716 // we want to avoid them for now. e.g., alternating vmla/vadd instructions.
47074717
4708 //let neverHasSideEffects = 1 in
4709 //def VMLAfd_sfp : N3VSMulOp<0,0,0b00,0b1101,1, IIC_VMACD, "vmla", "f32",
4710 // v2f32, fmul, fadd>;
4711 //def : N3VSMulOpPat;
4712
4713 //let neverHasSideEffects = 1 in
4714 //def VMLSfd_sfp : N3VSMulOp<0,0,0b10,0b1101,1, IIC_VMACD, "vmls", "f32",
4715 // v2f32, fmul, fsub>;
4716 //def : N3VSMulOpPat;
4718 let neverHasSideEffects = 1 in
4719 def VMLAfd_sfp : N3VSMulOp<0,0,0b00,0b1101,1, IIC_VMACD, "vmla", "f32",
4720 v2f32, fmul_su, fadd>;
4721 def : N3VSMulOpPat,
4722 Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>;
4723
4724 let neverHasSideEffects = 1 in
4725 def VMLSfd_sfp : N3VSMulOp<0,0,0b10,0b1101,1, IIC_VMACD, "vmls", "f32",
4726 v2f32, fmul_su, fsub>;
4727 def : N3VSMulOpPat,
4728 Requires<[HasNEON, UseNEONForFP, UseFPVMLx]>;
47174729
47184730 // Vector Absolute used for single-precision FP
47194731 let neverHasSideEffects = 1 in
750750 def VMLAD : ADbI<0b11100, 0b00, 0, 0,
751751 (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
752752 IIC_fpMAC64, "vmla", ".f64\t$Dd, $Dn, $Dm",
753 [(set DPR:$Dd, (fadd (fmul DPR:$Dn, DPR:$Dm),
754 (f64 DPR:$Ddin)))]>,
753 [(set DPR:$Dd, (fadd_mlx (fmul_su DPR:$Dn, DPR:$Dm),
754 (f64 DPR:$Ddin)))]>,
755755 RegConstraint<"$Ddin = $Dd">,
756 Requires<[HasVFP2,UseVMLx]>;
756 Requires<[HasVFP2,UseFPVMLx]>;
757757
758758 def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
759759 (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
760760 IIC_fpMAC32, "vmla", ".f32\t$Sd, $Sn, $Sm",
761 [(set SPR:$Sd, (fadd (fmul SPR:$Sn, SPR:$Sm),
762 SPR:$Sdin))]>,
761 [(set SPR:$Sd, (fadd_mlx (fmul_su SPR:$Sn, SPR:$Sm),
762 SPR:$Sdin))]>,
763763 RegConstraint<"$Sdin = $Sd">,
764 Requires<[HasVFP2,DontUseNEONForFP,UseVMLx]>;
765
766 def : Pat<(fadd DPR:$dstin, (fmul DPR:$a, (f64 DPR:$b))),
764 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
765
766 def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
767767 (VMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
768 Requires<[HasVFP2,UseVMLx]>;
769 def : Pat<(fadd SPR:$dstin, (fmul SPR:$a, SPR:$b)),
768 Requires<[HasVFP2,UseFPVMLx]>;
769 def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
770770 (VMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
771 Requires<[HasVFP2,DontUseNEONForFP, UseVMLx]>;
771 Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx]>;
772772
773773 def VMLSD : ADbI<0b11100, 0b00, 1, 0,
774774 (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
775775 IIC_fpMAC64, "vmls", ".f64\t$Dd, $Dn, $Dm",
776 [(set DPR:$Dd, (fadd (fneg (fmul DPR:$Dn,DPR:$Dm)),
777 (f64 DPR:$Ddin)))]>,
776 [(set DPR:$Dd, (fadd_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
777 (f64 DPR:$Ddin)))]>,
778778 RegConstraint<"$Ddin = $Dd">,
779 Requires<[HasVFP2,UseVMLx]>;
779 Requires<[HasVFP2,UseFPVMLx]>;
780780
781781 def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
782782 (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
783783 IIC_fpMAC32, "vmls", ".f32\t$Sd, $Sn, $Sm",
784 [(set SPR:$Sd, (fadd (fneg (fmul SPR:$Sn, SPR:$Sm)),
785 SPR:$Sdin))]>,
784 [(set SPR:$Sd, (fadd_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
785 SPR:$Sdin))]>,
786786 RegConstraint<"$Sdin = $Sd">,
787 Requires<[HasVFP2,DontUseNEONForFP,UseVMLx]>;
788
789 def : Pat<(fsub DPR:$dstin, (fmul DPR:$a, (f64 DPR:$b))),
787 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
788
789 def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
790790 (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
791 Requires<[HasVFP2,UseVMLx]>;
792 def : Pat<(fsub SPR:$dstin, (fmul SPR:$a, SPR:$b)),
791 Requires<[HasVFP2,UseFPVMLx]>;
792 def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
793793 (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
794 Requires<[HasVFP2,DontUseNEONForFP,UseVMLx]>;
794 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
795795
796796 def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
797797 (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
798798 IIC_fpMAC64, "vnmla", ".f64\t$Dd, $Dn, $Dm",
799 [(set DPR:$Dd,(fsub (fneg (fmul DPR:$Dn,DPR:$Dm)),
800 (f64 DPR:$Ddin)))]>,
799 [(set DPR:$Dd,(fsub_mlx (fneg (fmul_su DPR:$Dn,DPR:$Dm)),
800 (f64 DPR:$Ddin)))]>,
801801 RegConstraint<"$Ddin = $Dd">,
802 Requires<[HasVFP2,UseVMLx]>;
802 Requires<[HasVFP2,UseFPVMLx]>;
803803
804804 def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
805805 (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
806806 IIC_fpMAC32, "vnmla", ".f32\t$Sd, $Sn, $Sm",
807 [(set SPR:$Sd, (fsub (fneg (fmul SPR:$Sn, SPR:$Sm)),
808 SPR:$Sdin))]>,
807 [(set SPR:$Sd, (fsub_mlx (fneg (fmul_su SPR:$Sn, SPR:$Sm)),
808 SPR:$Sdin))]>,
809809 RegConstraint<"$Sdin = $Sd">,
810 Requires<[HasVFP2,DontUseNEONForFP,UseVMLx]>;
811
812 def : Pat<(fsub (fneg (fmul DPR:$a, (f64 DPR:$b))), DPR:$dstin),
810 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
811
812 def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
813813 (VNMLAD DPR:$dstin, DPR:$a, DPR:$b)>,
814 Requires<[HasVFP2,UseVMLx]>;
815 def : Pat<(fsub (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin),
814 Requires<[HasVFP2,UseFPVMLx]>;
815 def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
816816 (VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
817 Requires<[HasVFP2,DontUseNEONForFP,UseVMLx]>;
817 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
818818
819819 def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
820820 (outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
821821 IIC_fpMAC64, "vnmls", ".f64\t$Dd, $Dn, $Dm",
822 [(set DPR:$Dd, (fsub (fmul DPR:$Dn, DPR:$Dm),
823 (f64 DPR:$Ddin)))]>,
822 [(set DPR:$Dd, (fsub_mlx (fmul_su DPR:$Dn, DPR:$Dm),
823 (f64 DPR:$Ddin)))]>,
824824 RegConstraint<"$Ddin = $Dd">,
825 Requires<[HasVFP2,UseVMLx]>;
825 Requires<[HasVFP2,UseFPVMLx]>;
826826
827827 def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
828828 (outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
829829 IIC_fpMAC32, "vnmls", ".f32\t$Sd, $Sn, $Sm",
830 [(set SPR:$Sd, (fsub (fmul SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>,
830 [(set SPR:$Sd, (fsub_mlx (fmul_su SPR:$Sn, SPR:$Sm), SPR:$Sdin))]>,
831831 RegConstraint<"$Sdin = $Sd">,
832 Requires<[HasVFP2,DontUseNEONForFP,UseVMLx]>;
833
834 def : Pat<(fsub (fmul DPR:$a, (f64 DPR:$b)), DPR:$dstin),
832 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
833
834 def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
835835 (VNMLSD DPR:$dstin, DPR:$a, DPR:$b)>,
836 Requires<[HasVFP2,UseVMLx]>;
837 def : Pat<(fsub (fmul SPR:$a, SPR:$b), SPR:$dstin),
836 Requires<[HasVFP2,UseFPVMLx]>;
837 def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
838838 (VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
839 Requires<[HasVFP2,DontUseNEONForFP,UseVMLx]>;
839 Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
840840
841841
842842 //===----------------------------------------------------------------------===//
3636 , ARMProcFamily(Others)
3737 , ARMFPUType(None)
3838 , UseNEONForSinglePrecisionFP(false)
39 , SlowVMLx(false)
39 , SlowFPVMLx(false)
4040 , SlowFPBrcc(false)
4141 , IsThumb(isT)
4242 , ThumbMode(Thumb1)
5656 /// determine if NEON should actually be used.
5757 bool UseNEONForSinglePrecisionFP;
5858
59 /// SlowVMLx - If the VFP2 instructions are available, indicates whether
60 /// the VML[AS] instructions are slow (if so, don't use them).
61 bool SlowVMLx;
59 /// SlowFPVMLx - If the VFP2 / NEON instructions are available, indicates
60 /// whether the FP VML[AS] instructions are slow (if so, don't use them).
61 bool SlowFPVMLx;
6262
6363 /// SlowFPBrcc - True if floating point compare + branch is slow.
6464 bool SlowFPBrcc;
175175 bool hasDivide() const { return HasHardwareDivide; }
176176 bool hasT2ExtractPack() const { return HasT2ExtractPack; }
177177 bool hasDataBarrier() const { return HasDataBarrier; }
178 bool useVMLx() const {return hasVFP2() && !SlowVMLx; }
178 bool useFPVMLx() const { return !SlowFPVMLx; }
179179 bool isFPBrccSlow() const { return SlowFPBrcc; }
180180 bool isFPOnlySP() const { return FPOnlySP; }
181181 bool prefers32BitThumb() const { return Pref32BitThumb; }
1515 #include "ARM.h"
1616 #include "llvm/PassManager.h"
1717 #include "llvm/CodeGen/Passes.h"
18 #include "llvm/Support/CommandLine.h"
1819 #include "llvm/Support/FormattedStream.h"
1920 #include "llvm/Target/TargetOptions.h"
2021 #include "llvm/Target/TargetRegistry.h"
2122 using namespace llvm;
23
24 static cl::opt<bool> ExpandMLx("expand-fp-mlx", cl::init(false), cl::Hidden);
2225
2326 static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
2427 Triple TheTriple(TT);
145148 // FIXME: temporarily disabling load / store optimization pass for Thumb1.
146149 if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only())
147150 PM.add(createARMLoadStoreOptimizationPass(true));
151 if (ExpandMLx &&
152 OptLevel != CodeGenOpt::None && Subtarget.hasVFP2())
153 PM.add(createMLxExpansionPass());
148154
149155 return true;
150156 }
2828 ARMFastISel.cpp
2929 ARMFrameInfo.cpp
3030 ARMGlobalMerge.cpp
31 ARMHazardRecognizer.cpp
3132 ARMISelDAGToDAG.cpp
3233 ARMISelLowering.cpp
3334 ARMInstrInfo.cpp
4546 Thumb1InstrInfo.cpp
4647 Thumb1FrameInfo.cpp
4748 Thumb1RegisterInfo.cpp
48 Thumb2HazardRecognizer.cpp
4949 Thumb2ITBlockPass.cpp
5050 Thumb2InstrInfo.cpp
5151 Thumb2RegisterInfo.cpp
0 //===-- MLxExpansionPass.cpp - Expand MLx instrs to avoid hazards ----------=//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Expand VFP / NEON floating point MLA / MLS instructions (each to a pair of
10 // multiple and add / sub instructions) when special VMLx hazards are detected.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #define DEBUG_TYPE "mlx-expansion"
15 #include "ARM.h"
16 #include "ARMBaseInstrInfo.h"
17 #include "llvm/CodeGen/MachineInstr.h"
18 #include "llvm/CodeGen/MachineInstrBuilder.h"
19 #include "llvm/CodeGen/MachineFunctionPass.h"
20 #include "llvm/CodeGen/MachineRegisterInfo.h"
21 #include "llvm/Target/TargetRegisterInfo.h"
22 #include "llvm/ADT/DenseMap.h"
23 #include "llvm/ADT/SmallSet.h"
24 #include "llvm/ADT/Statistic.h"
25 #include "llvm/Support/CommandLine.h"
26 #include "llvm/Support/Debug.h"
27 #include "llvm/Support/raw_ostream.h"
28 using namespace llvm;
29
30 static cl::opt<bool>
31 ForceExapnd("expand-all-fp-mlx", cl::init(false), cl::Hidden);
32 static cl::opt<unsigned>
33 ExpandLimit("expand-limit", cl::init(~0U), cl::Hidden);
34
35 STATISTIC(NumExpand, "Number of fp MLA / MLS instructions expanded");
36
37 namespace {
38 struct MLxExpansion : public MachineFunctionPass {
39 static char ID;
40 MLxExpansion() : MachineFunctionPass(ID) {}
41
42 virtual bool runOnMachineFunction(MachineFunction &Fn);
43
44 virtual const char *getPassName() const {
45 return "ARM MLA / MLS expansion pass";
46 }
47
48 private:
49 const ARMBaseInstrInfo *TII;
50 const TargetRegisterInfo *TRI;
51 MachineRegisterInfo *MRI;
52
53 unsigned HazardLimit;
54 unsigned MIIdx;
55 MachineInstr* LastMIs[4];
56
57 void clearStack();
58 void pushStack(MachineInstr *MI);
59 MachineInstr *getAccDefMI(MachineInstr *MI) const;
60 unsigned getDefReg(MachineInstr *MI) const;
61 bool hasRAWHazard(unsigned Reg, MachineInstr *MI) const;
62 bool FindMLxHazard(MachineInstr *MI) const;
63 void ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
64 unsigned MulOpc, unsigned AddSubOpc,
65 bool NegAcc, bool HasLane);
66 bool ExpandFPMLxInstructions(MachineBasicBlock &MBB);
67 };
68 char MLxExpansion::ID = 0;
69 }
70
71 void MLxExpansion::clearStack() {
72 std::fill(LastMIs, LastMIs + 4, (MachineInstr*)0);
73 MIIdx = 0;
74 }
75
76 void MLxExpansion::pushStack(MachineInstr *MI) {
77 LastMIs[MIIdx] = MI;
78 if (++MIIdx == 4)
79 MIIdx = 0;
80 }
81
82 MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const {
83 // Look past COPY and INSERT_SUBREG instructions to find the
84 // real definition MI. This is important for _sfp instructions.
85 unsigned Reg = MI->getOperand(1).getReg();
86 if (TargetRegisterInfo::isPhysicalRegister(Reg))
87 return 0;
88
89 MachineBasicBlock *MBB = MI->getParent();
90 MachineInstr *DefMI = MRI->getVRegDef(Reg);
91 while (true) {
92 if (DefMI->getParent() != MBB)
93 break;
94 if (DefMI->isCopyLike()) {
95 Reg = DefMI->getOperand(1).getReg();
96 if (TargetRegisterInfo::isVirtualRegister(Reg)) {
97 DefMI = MRI->getVRegDef(Reg);
98 continue;
99 }
100 } else if (DefMI->isInsertSubreg()) {
101 Reg = DefMI->getOperand(2).getReg();
102 if (TargetRegisterInfo::isVirtualRegister(Reg)) {
103 DefMI = MRI->getVRegDef(Reg);
104 continue;
105 }
106 }
107 break;
108 }
109 return DefMI;
110 }
111
112 unsigned MLxExpansion::getDefReg(MachineInstr *MI) const {
113 unsigned Reg = MI->getOperand(0).getReg();
114 if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
115 !MRI->hasOneNonDBGUse(Reg))
116 return Reg;
117
118 MachineBasicBlock *MBB = MI->getParent();
119 MachineInstr *UseMI = &*MRI->use_nodbg_begin(Reg);
120 if (UseMI->getParent() != MBB)
121 return Reg;
122
123 while (UseMI->isCopy() || UseMI->isInsertSubreg()) {
124 Reg = UseMI->getOperand(0).getReg();
125 if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
126 !MRI->hasOneNonDBGUse(Reg))
127 return Reg;
128 UseMI = &*MRI->use_nodbg_begin(Reg);
129 if (UseMI->getParent() != MBB)
130 return Reg;
131 }
132
133 return Reg;
134 }
135
136 bool MLxExpansion::hasRAWHazard(unsigned Reg, MachineInstr *MI) const {
137 const TargetInstrDesc &TID = MI->getDesc();
138 // FIXME: Detect integer instructions properly.
139 unsigned Domain = TID.TSFlags & ARMII::DomainMask;
140 if (Domain == ARMII::DomainVFP) {
141 unsigned Opcode = TID.getOpcode();
142 if (Opcode == ARM::VSTRS || Opcode == ARM::VSTRD ||
143 Opcode == ARM::VMOVRS || Opcode == ARM::VMOVRRD)
144 return false;
145 } else if (Domain == ARMII::DomainNEON) {
146 if (TID.mayStore() || TID.mayLoad())
147 return false;
148 } else {
149 return false;
150 }
151
152 return MI->readsRegister(Reg, TRI);
153 return false;
154 }
155
156
157 bool MLxExpansion::FindMLxHazard(MachineInstr *MI) const {
158 if (NumExpand >= ExpandLimit)
159 return false;
160
161 if (ForceExapnd)
162 return true;
163
164 MachineInstr *DefMI = getAccDefMI(MI);
165 if (TII->isFpMLxInstruction(DefMI->getOpcode()))
166 // r0 = vmla
167 // r3 = vmla r0, r1, r2
168 // takes 16 - 17 cycles
169 //
170 // r0 = vmla
171 // r4 = vmul r1, r2
172 // r3 = vadd r0, r4
173 // takes about 14 - 15 cycles even with vmul stalling for 4 cycles.
174 return true;
175
176 // If a VMLA.F is followed by an VADD.F or VMUL.F with no RAW hazard, the
177 // VADD.F or VMUL.F will stall 4 cycles before issue. The 4 cycle stall
178 // preserves the in-order retirement of the instructions.
179 // Look at the next few instructions, if *most* of them can cause hazards,
180 // then the scheduler can't *fix* this, we'd better break up the VMLA.
181 for (unsigned i = 1; i <= 4; ++i) {
182 int Idx = ((int)MIIdx - i + 4) % 4;
183 MachineInstr *NextMI = LastMIs[Idx];
184 if (!NextMI)
185 continue;
186
187 if (TII->canCauseFpMLxStall(NextMI->getOpcode()))
188 return true;
189
190 // Look for VMLx RAW hazard.
191 if (hasRAWHazard(getDefReg(MI), NextMI))
192 return true;
193 }
194
195 return false;
196 }
197
198 /// ExpandFPMLxInstructions - Expand a MLA / MLS instruction into a pair
199 /// of MUL + ADD / SUB instructions.
200 void
201 MLxExpansion::ExpandFPMLxInstruction(MachineBasicBlock &MBB, MachineInstr *MI,
202 unsigned MulOpc, unsigned AddSubOpc,
203 bool NegAcc, bool HasLane) {
204 unsigned DstReg = MI->getOperand(0).getReg();
205 bool DstDead = MI->getOperand(0).isDead();
206 unsigned AccReg = MI->getOperand(1).getReg();
207 unsigned Src1Reg = MI->getOperand(2).getReg();
208 unsigned Src2Reg = MI->getOperand(3).getReg();
209 bool Src1Kill = MI->getOperand(2).isKill();
210 bool Src2Kill = MI->getOperand(3).isKill();
211 unsigned LaneImm = HasLane ? MI->getOperand(4).getImm() : 0;
212 unsigned NextOp = HasLane ? 5 : 4;
213 ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NextOp).getImm();
214 unsigned PredReg = MI->getOperand(++NextOp).getReg();
215
216 const TargetInstrDesc &TID1 = TII->get(MulOpc);
217 const TargetInstrDesc &TID2 = TII->get(AddSubOpc);
218 unsigned TmpReg = MRI->createVirtualRegister(TID1.getRegClass(0, TRI));
219
220 MachineInstrBuilder MIB = BuildMI(MBB, *MI, MI->getDebugLoc(), TID1, TmpReg)
221 .addReg(Src1Reg, getKillRegState(Src1Kill))
222 .addReg(Src2Reg, getKillRegState(Src2Kill));
223 if (HasLane)
224 MIB.addImm(LaneImm);
225 MIB.addImm(Pred).addReg(PredReg);
226
227 MIB = BuildMI(MBB, *MI, MI->getDebugLoc(), TID2)
228 .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstDead));
229
230 if (NegAcc) {
231 bool AccKill = MRI->hasOneNonDBGUse(AccReg);
232 MIB.addReg(TmpReg, getKillRegState(true))
233 .addReg(AccReg, getKillRegState(AccKill));
234 } else {
235 MIB.addReg(AccReg).addReg(TmpReg, getKillRegState(true));
236 }
237 MIB.addImm(Pred).addReg(PredReg);
238
239 DEBUG({
240 dbgs() << "Expanding: " << *MI;
241 dbgs() << " to:\n";
242 MachineBasicBlock::iterator MII = MI;
243 MII = llvm::prior(MII);
244 MachineInstr &MI2 = *MII;
245 MII = llvm::prior(MII);
246 MachineInstr &MI1 = *MII;
247 dbgs() << " " << MI1;
248 dbgs() << " " << MI2;
249 });
250
251 MI->eraseFromParent();
252 ++NumExpand;
253 }
254
/// ExpandFPMLxInstructions - Walk MBB bottom-up, expanding each fp MLA / MLS
/// for which FindMLxHazard predicts a VMLx pipeline hazard into a discrete
/// multiply + add / sub pair. Returns true if anything changed.
255 bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) {
256 bool Changed = false;
257
258 clearStack();
259
260 unsigned Skip = 0;
261 MachineBasicBlock::reverse_iterator MII = MBB.rbegin(), E = MBB.rend(); // Bottom-up: LastMIs holds the MIs that *follow* the one being inspected.
262 while (MII != E) {
263 MachineInstr *MI = &*MII;
264
265 if (MI->isLabel() || MI->isImplicitDef() || MI->isCopy()) { // Transparent; don't count toward the hazard window.
266 ++MII;
267 continue;
268 }
269
270 const TargetInstrDesc &TID = MI->getDesc();
271 if (TID.isBarrier()) { // No hazard crosses a barrier; restart tracking.
272 clearStack();
273 Skip = 0;
274 ++MII;
275 continue;
276 }
277
278 unsigned Domain = TID.TSFlags & ARMII::DomainMask;
279 if (Domain == ARMII::DomainGeneral) {
280 if (++Skip == 2)
281 // Assume dual issues of non-VFP / NEON instructions.
282 pushStack(0);
283 } else {
284 Skip = 0;
285
286 unsigned MulOpc, AddSubOpc;
287 bool NegAcc, HasLane;
288 if (!TII->isFpMLxInstruction(TID.getOpcode(),
289 MulOpc, AddSubOpc, NegAcc, HasLane) ||
290 !FindMLxHazard(MI))
291 pushStack(MI);
292 else {
293 ExpandFPMLxInstruction(MBB, MI, MulOpc, AddSubOpc, NegAcc, HasLane);
294 E = MBB.rend(); // May have changed if MI was the 1st instruction.
295 Changed = true;
296 continue; // MI was erased; skip the ++MII below.
297 }
298 }
299
300 ++MII;
301 }
302
303 return Changed;
304 }
305
306 bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) {
307 TII = static_cast(Fn.getTarget().getInstrInfo());
308 TRI = Fn.getTarget().getRegisterInfo();
309 MRI = &Fn.getRegInfo();
310
311 bool Modified = false;
312 for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
313 ++MFI) {
314 MachineBasicBlock &MBB = *MFI;
315 Modified |= ExpandFPMLxInstructions(MBB);
316 }
317
318 return Modified;
319 }
320
/// createMLxExpansionPass - Factory entry point used by the ARM target
/// machine to create the fp MLA / MLS expansion pass.
321 FunctionPass *llvm::createMLxExpansionPass() {
322 return new MLxExpansion();
323 }
+0
-53
lib/Target/ARM/Thumb2HazardRecognizer.cpp less more
None //===-- Thumb2HazardRecognizer.cpp - Thumb2 postra hazard recognizer ------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "ARM.h"
10 #include "Thumb2HazardRecognizer.h"
11 #include "llvm/CodeGen/MachineInstr.h"
12 #include "llvm/CodeGen/ScheduleDAG.h"
13 using namespace llvm;
14
/// getHazardType - While inside an IT block, report any instruction other
/// than the next expected member of the block (debug values excepted) as a
/// Hazard, so the scheduler cannot reorder into the block. Otherwise defer
/// to the base PostRAHazardRecognizer.
15 ScheduleHazardRecognizer::HazardType
16 Thumb2HazardRecognizer::getHazardType(SUnit *SU) {
17 if (ITBlockSize) {
18 MachineInstr *MI = SU->getInstr();
19 if (!MI->isDebugValue() && MI != ITBlockMIs[ITBlockSize-1]) // [ITBlockSize-1] is the next MI due to issue.
20 return Hazard;
21 }
22
23 return PostRAHazardRecognizer::getHazardType(SU);
24 }
25
/// Reset - Forget any open IT block and reset the base recognizer's state.
26 void Thumb2HazardRecognizer::Reset() {
27 ITBlockSize = 0;
28 PostRAHazardRecognizer::Reset();
29 }
30
/// EmitInstruction - Track IT blocks as instructions issue. When a t2IT is
/// emitted, record the following ITBlockSize instructions (skipping debug
/// values) so getHazardType can force them to issue in order.
31 void Thumb2HazardRecognizer::EmitInstruction(SUnit *SU) {
32 MachineInstr *MI = SU->getInstr();
33 unsigned Opcode = MI->getOpcode();
34 if (ITBlockSize) {
35 --ITBlockSize; // One more member of the current IT block has issued.
36 } else if (Opcode == ARM::t2IT) {
37 unsigned Mask = MI->getOperand(1).getImm();
38 unsigned NumTZ = CountTrailingZeros_32(Mask);
39 assert(NumTZ <= 3 && "Invalid IT mask!");
40 ITBlockSize = 4 - NumTZ; // Trailing zeros of the mask encode the block length (1-4).
41 MachineBasicBlock::iterator I = MI;
42 for (unsigned i = 0; i < ITBlockSize; ++i) {
43 // Advance to the next instruction, skipping any dbg_value instructions.
44 do {
45 ++I;
46 } while (I->isDebugValue());
47 ITBlockMIs[ITBlockSize-1-i] = &*I; // Stored in reverse: first-to-issue ends up at the highest index.
48 }
49 }
50
51 PostRAHazardRecognizer::EmitInstruction(SU);
52 }
+0
-40
lib/Target/ARM/Thumb2HazardRecognizer.h less more
None //===-- Thumb2HazardRecognizer.h - Thumb2 Hazard Recognizers ----*- C++ -*-===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines hazard recognizers for scheduling Thumb2 functions on
10 // ARM processors.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #ifndef THUMB2HAZARDRECOGNIZER_H
15 #define THUMB2HAZARDRECOGNIZER_H
16
17 #include "llvm/CodeGen/PostRAHazardRecognizer.h"
18
19 namespace llvm {
20
21 class MachineInstr;
22
/// Thumb2HazardRecognizer - Post-RA hazard recognizer that, on top of the
/// base PostRAHazardRecognizer, keeps the instructions of a t2IT block
/// issuing in their original order.
23 class Thumb2HazardRecognizer : public PostRAHazardRecognizer {
24 unsigned ITBlockSize; // No. of MIs in current IT block yet to be scheduled.
25 MachineInstr *ITBlockMIs[4]; // Pending IT-block MIs; next-to-issue at the highest live index.
26
27 public:
28 Thumb2HazardRecognizer(const InstrItineraryData *ItinData) :
29 PostRAHazardRecognizer(ItinData) {}
30
31 virtual HazardType getHazardType(SUnit *SU);
32 virtual void Reset();
33 virtual void EmitInstruction(SUnit *SU);
34 };
35
36
37 } // end namespace llvm
38
39 #endif // THUMB2HAZARDRECOGNIZER_H
1616 #include "ARMAddressingModes.h"
1717 #include "ARMGenInstrInfo.inc"
1818 #include "ARMMachineFunctionInfo.h"
19 #include "Thumb2HazardRecognizer.h"
2019 #include "Thumb2InstrInfo.h"
2120 #include "llvm/CodeGen/MachineFrameInfo.h"
2221 #include "llvm/CodeGen/MachineInstrBuilder.h"
172171 }
173172
174173 ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC, TRI);
175 }
176
177 ScheduleHazardRecognizer *Thumb2InstrInfo::
178 CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II) const {
179 return (ScheduleHazardRecognizer *)new Thumb2HazardRecognizer(II);
180174 }
181175
182176 void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
6464 /// always be able to get register info as well (through this method).
6565 ///
6666 const Thumb2RegisterInfo &getRegisterInfo() const { return RI; }
67
68 ScheduleHazardRecognizer *
69 CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II) const;
7067 };
7168
7269 /// getITInstrPredicate - Valid only in Thumb2 mode. This function is identical
269269 define arm_aapcs_vfpcc i32 @t10() nounwind {
270270 entry:
271271 ; CHECK: t10:
272 ; CHECK: vmul.f32 q8, q8, d0[0]
272273 ; CHECK: vmov.i32 q9, #0x3F000000
273 ; CHECK: vmla.f32 q8, q8, d0[0]
274 ; CHECK: vadd.f32 q8, q8, q8
274275 %0 = shufflevector <4 x float> zeroinitializer, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
275276 %1 = insertelement <4 x float> %0, float undef, i32 1 ; <<4 x float>> [#uses=1]
276277 %2 = insertelement <4 x float> %1, float undef, i32 2 ; <<4 x float>> [#uses=1]