llvm.org GIT mirror llvm / 8239daf
Two sets of changes; sorry they are intermingled.

1. Fix the pre-RA scheduler so it doesn't try to push instructions above calls to "optimize for latency". Call instructions don't have the right latency, and doing so is more likely to introduce spills.

2. Fix the if-converter cost function. For ARM, it should use instruction latencies, not the number of micro-ops, since a multi-cycle instruction is executed in full even when the predicate is false. Also, some instructions are slower when predicated, because the register def becomes an implicit input.

rdar://8598427

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@118135 91177308-0d34-0410-b5e6-96231b3b80d8

Evan Cheng · 9 years ago
17 changed files with 301 additions and 170 deletions.
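The core of change #2 is the predication-vs-branch comparison that ARMBaseInstrInfo::isProfitableToIfCvt adopts below. A minimal standalone sketch of that comparison, with illustrative numbers that are not taken from the commit:

#include <cstdio>

// Standalone model of the predication-vs-branch comparison introduced in this
// commit (see ARMBaseInstrInfo::isProfitableToIfCvt in the diff below).
static bool profitableToIfCvt(unsigned NumCycles, unsigned ExtraPredCycles,
                              float Probability, float Confidence,
                              unsigned MispredictPenalty) {
  if (!NumCycles)
    return false;
  float UnpredCost = Probability * NumCycles;            // expected cycles spent in the block
  UnpredCost += 1.0f;                                    // the branch itself
  UnpredCost += (1.0f - Confidence) * MispredictPenalty; // expected misprediction cost
  return (float)(NumCycles + ExtraPredCycles) < UnpredCost;
}

int main() {
  // A 3-cycle block that costs one extra cycle when predicated, a 50/50
  // branch, 90% prediction confidence, and an 8-cycle mispredict penalty:
  // UnpredCost = 0.5*3 + 1 + 0.1*8 = 3.3, predicated cost = 4, so the
  // branch is kept. Prints 0.
  std::printf("%d\n", profitableToIfCvt(3, 1, 0.5f, 0.9f, 8));
  return 0;
}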
@@ -246 +246 @@
 unsigned NumSuccs; // # of SDep::Data sucss.
 unsigned NumPredsLeft; // # of preds not scheduled.
 unsigned NumSuccsLeft; // # of succs not scheduled.
+bool isCall : 1; // Is a function call.
 bool isTwoAddress : 1; // Is a two-address instruction.
 bool isCommutable : 1; // Is a commutable instruction.
 bool hasPhysRegDefs : 1; // Has physreg defs that are being used.
@@ -272 +273 @@
 SUnit(SDNode *node, unsigned nodenum)
 : Node(node), Instr(0), OrigNode(0), NodeNum(nodenum),
 NodeQueueId(0), Latency(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0),
-NumSuccsLeft(0), isTwoAddress(false), isCommutable(false),
+NumSuccsLeft(0),
+isCall(false), isTwoAddress(false), isCommutable(false),
 hasPhysRegDefs(false), hasPhysRegClobbers(false),
 isPending(false), isAvailable(false), isScheduled(false),
 isScheduleHigh(false), isCloned(false),
@@ -285 +287 @@
 SUnit(MachineInstr *instr, unsigned nodenum)
 : Node(0), Instr(instr), OrigNode(0), NodeNum(nodenum),
 NodeQueueId(0), Latency(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0),
-NumSuccsLeft(0), isTwoAddress(false), isCommutable(false),
+NumSuccsLeft(0),
+isCall(false), isTwoAddress(false), isCommutable(false),
 hasPhysRegDefs(false), hasPhysRegClobbers(false),
 isPending(false), isAvailable(false), isScheduled(false),
 isScheduleHigh(false), isCloned(false),
@@ -297 +300 @@
 SUnit()
 : Node(0), Instr(0), OrigNode(0), NodeNum(~0u),
 NodeQueueId(0), Latency(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0),
-NumSuccsLeft(0), isTwoAddress(false), isCommutable(false),
+NumSuccsLeft(0),
+isCall(false), isTwoAddress(false), isCommutable(false),
 hasPhysRegDefs(false), hasPhysRegClobbers(false),
 isPending(false), isAvailable(false), isScheduled(false),
 isScheduleHigh(false), isCloned(false),
@@ -303 +303 @@
 return true;
 }

-/// isProfitableToIfCvt - Return true if it's profitable to first "NumInstrs"
+/// isProfitableToIfCvt - Return true if it's profitable to predicate
+/// instructions with accumulated instruction latency of "NumCycles"
 /// of the specified basic block, where the probability of the instructions
 /// being executed is given by Probability, and Confidence is a measure
 /// of our confidence that it will be properly predicted.
 virtual
-bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumInstrs,
+bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
+unsigned ExtraPredCycles,
 float Probability, float Confidence) const {
 return false;
 }
@@ -320 +322 @@
 /// by Probability, and Confidence is a measure of our confidence that it
 /// will be properly predicted.
 virtual bool
-isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumTInstrs,
-MachineBasicBlock &FMBB, unsigned NumFInstrs,
+isProfitableToIfCvt(MachineBasicBlock &TMBB,
+unsigned NumTCycles, unsigned ExtraTCycles,
+MachineBasicBlock &FMBB,
+unsigned NumFCycles, unsigned ExtraFCycles,
 float Probability, float Confidence) const {
 return false;
 }

 /// isProfitableToDupForIfCvt - Return true if it's profitable for
-/// if-converter to duplicate a specific number of instructions in the
-/// specified MBB to enable if-conversion, where the probability of the
-/// instructions being executed is given by Probability, and Confidence is
-/// a measure of our confidence that it will be properly predicted.
+/// if-converter to duplicate instructions of specified accumulated
+/// instruction latencies in the specified MBB to enable if-conversion.
+/// The probability of the instructions being executed is given by
+/// Probability, and Confidence is a measure of our confidence that it
+/// will be properly predicted.
 virtual bool
-isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumInstrs,
+isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
 float Probability, float Confidence) const {
 return false;
 }
@@ -607 +612 @@

 /// getNumMicroOps - Return the number of u-operations the given machine
 /// instruction will be decoded to on the target cpu.
-virtual unsigned getNumMicroOps(const MachineInstr *MI,
-const InstrItineraryData *ItinData) const;
+virtual unsigned getNumMicroOps(const InstrItineraryData *ItinData,
+const MachineInstr *MI) const;

 /// getOperandLatency - Compute and return the use operand latency of a given
-/// itinerary class and operand index if the value is produced by an
-/// instruction of the specified itinerary class and def operand index.
+/// pair of def and use.
 /// In most cases, the static scheduling itinerary was enough to determine the
 /// operand latency. But it may not be possible for instructions with variable
 /// number of defs / uses.
-virtual
-int getOperandLatency(const InstrItineraryData *ItinData,
-const MachineInstr *DefMI, unsigned DefIdx,
-const MachineInstr *UseMI, unsigned UseIdx) const;
-
-virtual
-int getOperandLatency(const InstrItineraryData *ItinData,
-SDNode *DefNode, unsigned DefIdx,
-SDNode *UseNode, unsigned UseIdx) const;
+virtual int getOperandLatency(const InstrItineraryData *ItinData,
+const MachineInstr *DefMI, unsigned DefIdx,
+const MachineInstr *UseMI, unsigned UseIdx) const;
+
+virtual int getOperandLatency(const InstrItineraryData *ItinData,
+SDNode *DefNode, unsigned DefIdx,
+SDNode *UseNode, unsigned UseIdx) const;
+
+/// getInstrLatency - Compute the instruction latency of a given instruction.
+/// If the instruction has higher cost when predicated, it's returned via
+/// PredCost.
+virtual int getInstrLatency(const InstrItineraryData *ItinData,
+const MachineInstr *MI,
+unsigned *PredCost = 0) const;
+
+virtual int getInstrLatency(const InstrItineraryData *ItinData,
+SDNode *Node) const;

 /// hasHighOperandLatency - Compute operand latency between a def of 'Reg'
 /// and an use in the current loop, return true if the target considered
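The new getOperandLatency/getInstrLatency hooks let targets answer latency queries from the scheduling itinerary. As a rough sketch of the itinerary-based idea (a simplification, not LLVM's exact implementation): if the def's result is ready at pipeline cycle DefCycle and the use reads its operand at cycle UseCycle, the effective operand latency is roughly DefCycle - UseCycle + 1, clamped at zero:

// Rough model of itinerary-based operand latency. DefCycle is the cycle at
// which the defining instruction produces its result; UseCycle is the stage
// at which the using instruction reads the operand. Forwarding and alignment
// adjustments that a real target would apply are deliberately omitted.
static int operandLatency(int DefCycle, int UseCycle) {
  if (DefCycle < 0 || UseCycle < 0)
    return -1;                        // unknown: caller falls back to a default
  int Latency = DefCycle - UseCycle + 1;
  return Latency < 0 ? 0 : Latency;   // latency is never negative
}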
@@ -92 +92 @@
 /// ClobbersPred - True if BB could modify predicates (e.g. has
 /// cmp, call, etc.)
 /// NonPredSize - Number of non-predicated instructions.
-/// ExtraCost - Extra cost for microcoded instructions.
+/// ExtraCost - Extra cost for multi-cycle instructions.
+/// ExtraCost2 - Some instructions are slower when predicated
 /// BB - Corresponding MachineBasicBlock.
 /// TrueBB / FalseBB- See AnalyzeBranch().
 /// BrCond - Conditions for end of block conditional branches.
@@ -109 +110 @@
 bool ClobbersPred : 1;
 unsigned NonPredSize;
 unsigned ExtraCost;
+unsigned ExtraCost2;
 MachineBasicBlock *BB;
 MachineBasicBlock *TrueBB;
 MachineBasicBlock *FalseBB;
@@ -118 +120 @@
 IsAnalyzed(false), IsEnqueued(false), IsBrAnalyzable(false),
 HasFallThrough(false), IsUnpredicable(false),
 CannotBeCopied(false), ClobbersPred(false), NonPredSize(0),
-ExtraCost(0), BB(0), TrueBB(0), FalseBB(0) {}
+ExtraCost(0), ExtraCost2(0), BB(0), TrueBB(0), FalseBB(0) {}
 };

 /// IfcvtToken - Record information about pending if-conversions to attempt:
@@ -202 +204 @@
 bool IgnoreBr = false);
 void MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges = true);

-bool MeetIfcvtSizeLimit(MachineBasicBlock &BB, unsigned Size,
+bool MeetIfcvtSizeLimit(MachineBasicBlock &BB,
+unsigned Cycle, unsigned Extra,
 float Prediction, float Confidence) const {
-return Size > 0 && TII->isProfitableToIfCvt(BB, Size,
-Prediction, Confidence);
-}
-
-bool MeetIfcvtSizeLimit(MachineBasicBlock &TBB, unsigned TSize,
-MachineBasicBlock &FBB, unsigned FSize,
+return Cycle > 0 && TII->isProfitableToIfCvt(BB, Cycle, Extra,
+Prediction, Confidence);
+}
+
+bool MeetIfcvtSizeLimit(MachineBasicBlock &TBB,
+unsigned TCycle, unsigned TExtra,
+MachineBasicBlock &FBB,
+unsigned FCycle, unsigned FExtra,
 float Prediction, float Confidence) const {
-return TSize > 0 && FSize > 0 &&
-TII->isProfitableToIfCvt(TBB, TSize, FBB, FSize,
+return TCycle > 0 && FCycle > 0 &&
+TII->isProfitableToIfCvt(TBB, TCycle, TExtra, FBB, FCycle, FExtra,
 Prediction, Confidence);
 }

@@ -648 +653 @@
 // Then scan all the instructions.
 BBI.NonPredSize = 0;
 BBI.ExtraCost = 0;
+BBI.ExtraCost2 = 0;
 BBI.ClobbersPred = false;
 for (MachineBasicBlock::iterator I = BBI.BB->begin(), E = BBI.BB->end();
 I != E; ++I) {
@@ -664 +670 @@
 if (!isCondBr) {
 if (!isPredicated) {
 BBI.NonPredSize++;
-unsigned NumOps = TII->getNumMicroOps(&*I, InstrItins);
-if (NumOps > 1)
-BBI.ExtraCost += NumOps-1;
+unsigned ExtraPredCost = 0;
+unsigned NumCycles = TII->getInstrLatency(InstrItins, &*I,
+&ExtraPredCost);
+if (NumCycles > 1)
+BBI.ExtraCost += NumCycles-1;
+BBI.ExtraCost2 += ExtraPredCost;
 } else if (!AlreadyPredicated) {
 // FIXME: This instruction is already predicated before the
 // if-conversion pass. It's probably something like a conditional move.
@@ -814 +823 @@

 if (CanRevCond && ValidDiamond(TrueBBI, FalseBBI, Dups, Dups2) &&
 MeetIfcvtSizeLimit(*TrueBBI.BB, (TrueBBI.NonPredSize - (Dups + Dups2) +
-TrueBBI.ExtraCost),
+TrueBBI.ExtraCost), TrueBBI.ExtraCost2,
 *FalseBBI.BB, (FalseBBI.NonPredSize - (Dups + Dups2) +
-FalseBBI.ExtraCost),
+FalseBBI.ExtraCost),FalseBBI.ExtraCost2,
 Prediction, Confidence) &&
 FeasibilityAnalysis(TrueBBI, BBI.BrCond) &&
 FeasibilityAnalysis(FalseBBI, RevCond)) {
@@ -835 +844 @@

 if (ValidTriangle(TrueBBI, FalseBBI, false, Dups, Prediction, Confidence) &&
 MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize + TrueBBI.ExtraCost,
-Prediction, Confidence) &&
+TrueBBI.ExtraCost2, Prediction, Confidence) &&
 FeasibilityAnalysis(TrueBBI, BBI.BrCond, true)) {
 // Triangle:
 // EBB
@@ -850 +859 @@

 if (ValidTriangle(TrueBBI, FalseBBI, true, Dups, Prediction, Confidence) &&
 MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize + TrueBBI.ExtraCost,
-Prediction, Confidence) &&
+TrueBBI.ExtraCost2, Prediction, Confidence) &&
 FeasibilityAnalysis(TrueBBI, BBI.BrCond, true, true)) {
 Tokens.push_back(new IfcvtToken(BBI, ICTriangleRev, TNeedSub, Dups));
 Enqueued = true;
@@ -858 +867 @@

 if (ValidSimple(TrueBBI, Dups, Prediction, Confidence) &&
 MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize + TrueBBI.ExtraCost,
-Prediction, Confidence) &&
+TrueBBI.ExtraCost2, Prediction, Confidence) &&
 FeasibilityAnalysis(TrueBBI, BBI.BrCond)) {
 // Simple (split, no rejoin):
 // EBB
@@ -877 +886 @@
 1.0-Prediction, Confidence) &&
 MeetIfcvtSizeLimit(*FalseBBI.BB,
 FalseBBI.NonPredSize + FalseBBI.ExtraCost,
-1.0-Prediction, Confidence) &&
+FalseBBI.ExtraCost2, 1.0-Prediction, Confidence) &&
 FeasibilityAnalysis(FalseBBI, RevCond, true)) {
 Tokens.push_back(new IfcvtToken(BBI, ICTriangleFalse, FNeedSub, Dups));
 Enqueued = true;
@@ -887 +896 @@
 1.0-Prediction, Confidence) &&
 MeetIfcvtSizeLimit(*FalseBBI.BB,
 FalseBBI.NonPredSize + FalseBBI.ExtraCost,
-1.0-Prediction, Confidence) &&
+FalseBBI.ExtraCost2, 1.0-Prediction, Confidence) &&
 FeasibilityAnalysis(FalseBBI, RevCond, true, true)) {
 Tokens.push_back(new IfcvtToken(BBI, ICTriangleFRev, FNeedSub, Dups));
 Enqueued = true;
@@ -896 +905 @@
 if (ValidSimple(FalseBBI, Dups, 1.0-Prediction, Confidence) &&
 MeetIfcvtSizeLimit(*FalseBBI.BB,
 FalseBBI.NonPredSize + FalseBBI.ExtraCost,
-1.0-Prediction, Confidence) &&
+FalseBBI.ExtraCost2, 1.0-Prediction, Confidence) &&
 FeasibilityAnalysis(FalseBBI, RevCond)) {
 Tokens.push_back(new IfcvtToken(BBI, ICSimpleFalse, FNeedSub, Dups));
 Enqueued = true;
@@ -1426 +1435 @@
 MachineInstr *MI = MF.CloneMachineInstr(I);
 ToBBI.BB->insert(ToBBI.BB->end(), MI);
 ToBBI.NonPredSize++;
-unsigned NumOps = TII->getNumMicroOps(MI, InstrItins);
-if (NumOps > 1)
-ToBBI.ExtraCost += NumOps-1;
+unsigned ExtraPredCost = 0;
+unsigned NumCycles = TII->getInstrLatency(InstrItins, &*I, &ExtraPredCost);
+if (NumCycles > 1)
+ToBBI.ExtraCost += NumCycles-1;
+ToBBI.ExtraCost2 += ExtraPredCost;

 if (!TII->isPredicated(I) && !MI->isDebugValue()) {
 if (!TII->PredicateInstruction(MI, Cond)) {
@@ -1503 +1514 @@

 ToBBI.NonPredSize += FromBBI.NonPredSize;
 ToBBI.ExtraCost += FromBBI.ExtraCost;
+ToBBI.ExtraCost2 += FromBBI.ExtraCost2;
 FromBBI.NonPredSize = 0;
 FromBBI.ExtraCost = 0;
+FromBBI.ExtraCost2 = 0;

 ToBBI.ClobbersPred |= FromBBI.ClobbersPred;
 ToBBI.HasFallThrough = FromBBI.HasFallThrough;
237237 "Cannot schedule terminators or labels!");
238238 // Create the SUnit for this MI.
239239 SUnit *SU = NewSUnit(MI);
240 SU->isCall = TID.isCall();
241 SU->isCommutable = TID.isCommutable();
240242
241243 // Assign the Latency field of SU using target-provided information.
242244 if (UnitLatencies)
563565 // extra time.
564566 if (SU->getInstr()->getDesc().mayLoad())
565567 SU->Latency += 2;
566 } else
567 SU->Latency =
568 InstrItins->getStageLatency(SU->getInstr()->getDesc().getSchedClass());
568 } else {
569 SU->Latency = TII->getInstrLatency(InstrItins, SU->getInstr());
570 }
569571 }
570572
571573 void ScheduleDAGInstrs::ComputeOperandLatency(SUnit *Def, SUnit *Use,
@@ -1588 +1588 @@
 }

 bool hybrid_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const{
+if (left->isCall || right->isCall)
+// No way to compute latency of calls.
+return BURRSort(left, right, SPQ);
+
 bool LHigh = SPQ->HighRegPressure(left);
 bool RHigh = SPQ->HighRegPressure(right);
 // Avoid causing spills. If register pressure is high, schedule for
@@ -1647 +1651 @@

 bool ilp_ls_rr_sort::operator()(const SUnit *left,
 const SUnit *right) const {
+if (left->isCall || right->isCall)
+// No way to compute latency of calls.
+return BURRSort(left, right, SPQ);
+
 bool LHigh = SPQ->HighRegPressure(left);
 bool RHigh = SPQ->HighRegPressure(right);
 // Avoid causing spills. If register pressure is high, schedule for
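Change #1 surfaces here: both list-scheduler comparators now refuse to reason about latency when a call is involved, because an itinerary entry for a call says nothing about the callee. A toy model of the guard, with Node and the pressure ordering standing in for SUnit and BURRSort:

// Toy model of the guard above: fall back to a pressure-reducing order when
// either node is a call, instead of "optimizing for latency" across it.
struct Node {
  bool IsCall;
  int Latency;      // meaningless for calls
  int PressureRank; // stand-in for the BURRSort tie-breakers
};

static bool scheduleBefore(const Node &L, const Node &R) {
  if (L.IsCall || R.IsCall)                 // no way to compute latency of calls
    return L.PressureRank < R.PressureRank; // register-pressure order instead
  return L.Latency > R.Latency;             // otherwise schedule for latency
}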
@@ -71 +71 @@
 SUnit *SU = NewSUnit(Old->getNode());
 SU->OrigNode = Old->OrigNode;
 SU->Latency = Old->Latency;
+SU->isCall = Old->isCall;
 SU->isTwoAddress = Old->isTwoAddress;
 SU->isCommutable = Old->isCommutable;
 SU->hasPhysRegDefs = Old->hasPhysRegDefs;
@@ -299 +300 @@
 N = N->getOperand(N->getNumOperands()-1).getNode();
 assert(N->getNodeId() == -1 && "Node already inserted!");
 N->setNodeId(NodeSUnit->NodeNum);
+if (N->isMachineOpcode() && TII->get(N->getMachineOpcode()).isCall())
+NodeSUnit->isCall = true;
 }

 // Scan down to find any flagged succs.
@@ -315 +318 @@
 assert(N->getNodeId() == -1 && "Node already inserted!");
 N->setNodeId(NodeSUnit->NodeNum);
 N = *UI;
+if (N->isMachineOpcode() && TII->get(N->getMachineOpcode()).isCall())
+NodeSUnit->isCall = true;
 break;
 }
 if (!HasFlagUse) break;
@@ -437 +442 @@
 // all nodes flagged together into this SUnit.
 SU->Latency = 0;
 for (SDNode *N = SU->getNode(); N; N = N->getFlaggedNode())
-if (N->isMachineOpcode()) {
-SU->Latency += InstrItins->
-getStageLatency(TII->get(N->getMachineOpcode()).getSchedClass());
-}
+if (N->isMachineOpcode())
+SU->Latency += TII->getInstrLatency(InstrItins, N);
 }

 void ScheduleDAGSDNodes::ComputeOperandLatency(SDNode *Def, SDNode *Use,
@@ -38 +38 @@
 static cl::opt
 EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
 cl::desc("Enable ARM 2-addr to 3-addr conv"));
-
-static cl::opt
-OldARMIfCvt("old-arm-ifcvt", cl::Hidden,
-cl::desc("Use old-style ARM if-conversion heuristics"));

 ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI)
 : TargetInstrInfoImpl(ARMInsts, array_lengthof(ARMInsts)),
@@ -1204 +1200 @@
 }

 bool ARMBaseInstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
-unsigned NumInstrs,
+unsigned NumCyles,
+unsigned ExtraPredCycles,
 float Probability,
 float Confidence) const {
-if (!NumInstrs)
-return false;
-
-// Use old-style heuristics
-if (OldARMIfCvt) {
-if (Subtarget.getCPUString() == "generic")
-// Generic (and overly aggressive) if-conversion limits for testing.
-return NumInstrs <= 10;
-if (Subtarget.hasV7Ops())
-return NumInstrs <= 3;
-return NumInstrs <= 2;
-}
+if (!NumCyles)
+return false;

 // Attempt to estimate the relative costs of predication versus branching.
-float UnpredCost = Probability * NumInstrs;
+float UnpredCost = Probability * NumCyles;
 UnpredCost += 1.0; // The branch itself
 UnpredCost += (1.0 - Confidence) * Subtarget.getMispredictionPenalty();

-float PredCost = NumInstrs;
-
-return PredCost < UnpredCost;
-
+return (float)(NumCyles + ExtraPredCycles) < UnpredCost;
 }

 bool ARMBaseInstrInfo::
-isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT,
-MachineBasicBlock &FMBB, unsigned NumF,
+isProfitableToIfCvt(MachineBasicBlock &TMBB,
+unsigned TCycles, unsigned TExtra,
+MachineBasicBlock &FMBB,
+unsigned FCycles, unsigned FExtra,
 float Probability, float Confidence) const {
-// Use old-style if-conversion heuristics
-if (OldARMIfCvt) {
-return NumT && NumF && NumT <= 2 && NumF <= 2;
-}
-
-if (!NumT || !NumF)
+if (!TCycles || !FCycles)
 return false;

 // Attempt to estimate the relative costs of predication versus branching.
-float UnpredCost = Probability * NumT + (1.0 - Probability) * NumF;
+float UnpredCost = Probability * TCycles + (1.0 - Probability) * FCycles;
 UnpredCost += 1.0; // The branch itself
 UnpredCost += (1.0 - Confidence) * Subtarget.getMispredictionPenalty();

-float PredCost = NumT + NumF;
-
-return PredCost < UnpredCost;
+return (float)(TCycles + FCycles + TExtra + FExtra) < UnpredCost;
 }

 /// getInstrPredicate - If instruction is predicated, returns its predicate
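To see the two-sided comparison above with concrete, purely illustrative numbers: with TCycles = 2, FCycles = 3, TExtra = FExtra = 1, Probability = 0.5, Confidence = 0.8, and an 8-cycle misprediction penalty, UnpredCost = 0.5*2 + 0.5*3 + 1.0 + 0.2*8 = 5.1, while the predicated cost is 2 + 3 + 1 + 1 = 7, so this diamond would not be if-converted.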
@@ -1590 +1569 @@
 }

 unsigned
-ARMBaseInstrInfo::getNumMicroOps(const MachineInstr *MI,
-const InstrItineraryData *ItinData) const {
+ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
+const MachineInstr *MI) const {
 if (!ItinData || ItinData->isEmpty())
 return 1;

@@ -1648 +1627 @@
 case ARM::t2STM_UPD: {
 unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands() + 1;
 if (Subtarget.isCortexA8()) {
-// 4 registers would be issued: 1, 2, 1.
-// 5 registers would be issued: 1, 2, 2.
-return 1 + (NumRegs / 2);
+if (NumRegs < 4)
+return 2;
+// 4 registers would be issued: 2, 2.
+// 5 registers would be issued: 2, 2, 1.
+UOps = (NumRegs / 2);
+if (NumRegs % 2)
+++UOps;
+return UOps;
 } else if (Subtarget.isCortexA9()) {
 UOps = (NumRegs / 2);
 // If there are odd number of registers or if it's not 64-bit aligned,
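The new Cortex-A8 LDM/STM bucket above amounts to "never fewer than 2 uops; otherwise one uop per register pair, plus one for a leftover register". A standalone check of that arithmetic (the function name is made up for illustration):

#include <cstdio>

// Model of the Cortex-A8 LDM/STM micro-op count from the hunk above.
static unsigned ldmUOpsA8(unsigned NumRegs) {
  if (NumRegs < 4)
    return 2;               // short register lists still take two issue slots
  unsigned UOps = NumRegs / 2;
  if (NumRegs % 2)
    ++UOps;                 // one extra uop for the odd leftover register
  return UOps;
}

int main() {
  // Prints "2 3": 4 registers issue as 2,2 and 5 registers as 2,2,1.
  std::printf("%u %u\n", ldmUOpsA8(4), ldmUOpsA8(5));
  return 0;
}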
@@ -2024 +2008 @@
 return Latency;
 }

+int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+const MachineInstr *MI,
+unsigned *PredCost) const {
+if (MI->isCopyLike() || MI->isInsertSubreg() ||
+MI->isRegSequence() || MI->isImplicitDef())
+return 1;
+
+if (!ItinData || ItinData->isEmpty())
+return 1;
+
+const TargetInstrDesc &TID = MI->getDesc();
+unsigned Class = TID.getSchedClass();
+unsigned UOps = ItinData->Itineraries[Class].NumMicroOps;
+if (PredCost && TID.hasImplicitDefOfPhysReg(ARM::CPSR))
+// When predicated, CPSR is an additional source operand for CPSR updating
+// instructions, this apparently increases their latencies.
+*PredCost = 1;
+if (UOps)
+return ItinData->getStageLatency(Class);
+return getNumMicroOps(ItinData, MI);
+}
+
+int ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+SDNode *Node) const {
+if (!Node->isMachineOpcode())
+return 1;
+
+if (!ItinData || ItinData->isEmpty())
+return 1;
+
+unsigned Opcode = Node->getMachineOpcode();
+switch (Opcode) {
+default:
+return ItinData->getStageLatency(get(Opcode).getSchedClass());
+case ARM::VLDMQ:
+case ARM::VSTMQ:
+return 2;
+}
+}
+
 bool ARMBaseInstrInfo::
 hasHighOperandLatency(const InstrItineraryData *ItinData,
 const MachineRegisterInfo *MRI,
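In outline, the new ARM getInstrLatency treats copies as single-cycle, trusts the itinerary's stage latency when the micro-op count is fixed, and falls back to the dynamic micro-op count for variable-uop encodings such as LDM/STM; an instruction that implicitly defines CPSR additionally reports one cycle of predication cost. A compact model with a stubbed itinerary entry (illustrative types, not the LLVM API):

struct ItinEntry {
  unsigned NumMicroOps;  // 0 means "variable", e.g. LDM/STM register lists
  unsigned StageLatency;
};

// Sketch of the shape of ARMBaseInstrInfo::getInstrLatency above: fixed-uop
// instructions trust the itinerary, variable-uop instructions fall back to
// their dynamic micro-op count as a latency estimate. A CPSR-defining
// instruction reports one extra cycle of predication cost, because
// predicating it turns CPSR into an additional input operand.
static int instrLatency(const ItinEntry &E, unsigned DynamicUOps,
                        bool ImplicitlyDefsCPSR, unsigned *PredCost) {
  if (PredCost && ImplicitlyDefsCPSR)
    *PredCost = 1;
  if (E.NumMicroOps)
    return (int)E.StageLatency;
  return (int)DynamicUOps;
}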
@@ -317 +317 @@
 const MachineFunction &MF) const;

 virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB,
-unsigned NumInstrs,
+unsigned NumCyles, unsigned ExtraPredCycles,
 float Prob, float Confidence) const;

-virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB,unsigned NumT,
-MachineBasicBlock &FMBB,unsigned NumF,
+virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+unsigned NumT, unsigned ExtraT,
+MachineBasicBlock &FMBB,
+unsigned NumF, unsigned ExtraF,
 float Probability, float Confidence) const;

 virtual bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
-unsigned NumInstrs,
+unsigned NumCyles,
 float Probability,
 float Confidence) const {
-return NumInstrs == 1;
+return NumCyles == 1;
 }

 /// AnalyzeCompare - For a comparison instruction, return the source register
@@ -344 +346 @@
 const MachineRegisterInfo *MRI,
 MachineBasicBlock::iterator &MII) const;

-virtual unsigned getNumMicroOps(const MachineInstr *MI,
-const InstrItineraryData *ItinData) const;
+virtual unsigned getNumMicroOps(const InstrItineraryData *ItinData,
+const MachineInstr *MI) const;

 virtual
 int getOperandLatency(const InstrItineraryData *ItinData,
@@ -378 +380 @@
 const TargetInstrDesc &UseTID,
 unsigned UseIdx, unsigned UseAlign) const;

+int getInstrLatency(const InstrItineraryData *ItinData,
+const MachineInstr *MI, unsigned *PredCost = 0) const;
+
+int getInstrLatency(const InstrItineraryData *ItinData,
+SDNode *Node) const;
+
 bool hasHighOperandLatency(const InstrItineraryData *ItinData,
 const MachineRegisterInfo *MRI,
 const MachineInstr *DefMI, unsigned DefIdx,
@@ -40 +40 @@
 // FIXME
 return 0;
 }
-
-bool Thumb2InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
-unsigned NumInstrs,
-float Prediction,
-float Confidence) const {
-if (!OldT2IfCvt)
-return ARMBaseInstrInfo::isProfitableToIfCvt(MBB, NumInstrs,
-Prediction, Confidence);
-return NumInstrs && NumInstrs <= 3;
-}
-
-bool Thumb2InstrInfo::
-isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumT,
-MachineBasicBlock &FMBB, unsigned NumF,
-float Prediction, float Confidence) const {
-if (!OldT2IfCvt)
-return ARMBaseInstrInfo::isProfitableToIfCvt(TMBB, NumT,
-FMBB, NumF,
-Prediction, Confidence);
-
-// FIXME: Catch optimization such as:
-// r0 = movne
-// r0 = moveq
-return NumT && NumF &&
-NumT <= 3 && NumF <= 3;
-}
-

 void
 Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
@@ -36 +36 @@

 bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
 MachineBasicBlock::iterator MBBI) const;
-
-bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumInstrs,
-float Prediction, float Confidence) const;
-bool isProfitableToIfCvt(MachineBasicBlock &TMBB, unsigned NumTInstrs,
-MachineBasicBlock &FMBB, unsigned NumFInstrs,
-float Prediction, float Confidence) const;

 void copyPhysReg(MachineBasicBlock &MBB,
 MachineBasicBlock::iterator I, DebugLoc DL,
@@ -49 +49 @@
 }

 unsigned
-TargetInstrInfo::getNumMicroOps(const MachineInstr *MI,
-const InstrItineraryData *ItinData) const {
+TargetInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
+const MachineInstr *MI) const {
 if (!ItinData || ItinData->isEmpty())
 return 1;

@@ -91 +91 @@
 return ItinData->getOperandCycle(DefClass, DefIdx);
 unsigned UseClass = get(UseNode->getMachineOpcode()).getSchedClass();
 return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx);
+}
+
+int TargetInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+const MachineInstr *MI,
+unsigned *PredCost) const {
+if (!ItinData || ItinData->isEmpty())
+return 1;
+
+return ItinData->getStageLatency(MI->getDesc().getSchedClass());
+}
+
+int TargetInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+SDNode *N) const {
+if (!ItinData || ItinData->isEmpty())
+return 1;
+
+if (!N->isMachineOpcode())
+return 1;
+
+return ItinData->getStageLatency(get(N->getMachineOpcode()).getSchedClass());
 }

 bool TargetInstrInfo::hasLowDefLatency(const InstrItineraryData *ItinData,
@@ -7 +7 @@
 define fastcc i32 @dct_luma_sp(i32 %block_x, i32 %block_y, i32* %coeff_cost) {
 entry:
 ; Make sure to use base-updating stores for saving callee-saved registers.
+; CHECK: push
 ; CHECK-NOT: sub sp
-; CHECK: vpush
+; CHECK: push
 %predicted_block = alloca [4 x [4 x i32]], align 4 ; <[4 x [4 x i32]]*> [#uses=1]
 br label %cond_next489

@@ -3 +3 @@
 ; micro-coded and would have long issue latency even if predicated on
 ; false predicate.

-%0 = type { float, float, float, float }
-%pln = type { %vec, float }
-%vec = type { [4 x float] }
+define void @t(double %a, double %b, double %c, double %d, i32* nocapture %solutions, double* nocapture %x) nounwind {
+entry:
+; CHECK: t:
+; CHECK: vpop {d8}
+; CHECK-NOT: vpopne
+; CHECK: ldmia sp!, {r7, pc}
+; CHECK: vpop {d8}
+; CHECK: ldmia sp!, {r7, pc}
+br i1 undef, label %if.else, label %if.then

-define arm_aapcs_vfpcc float @aaa(%vec* nocapture %ustart, %vec* nocapture %udir, %vec* nocapture %vstart, %vec* nocapture %vdir, %vec* %upoint, %vec* %vpoint) {
-; CHECK: aaa:
-; CHECK: vldr.32
-; CHECK-NOT: vldrne
-; CHECK-NOT: vpopne
-; CHECK-NOT: popne
-; CHECK: vpop
-; CHECK: pop
-entry:
-br i1 undef, label %bb81, label %bb48
+if.then: ; preds = %entry
+%mul73 = fmul double undef, 0.000000e+00
+%sub76 = fsub double %mul73, undef
+store double %sub76, double* undef, align 4
+%call88 = tail call double @cos(double 0.000000e+00) nounwind
+%mul89 = fmul double undef, %call88
+%sub92 = fsub double %mul89, undef
+store double %sub92, double* undef, align 4
+ret void

-bb48: ; preds = %entry
-%0 = call arm_aapcs_vfpcc %0 @bbb(%pln* undef, %vec* %vstart, %vec* undef) nounwind ; <%0> [#uses=0]
-ret float 0.000000e+00
-
-bb81: ; preds = %entry
-ret float 0.000000e+00
+if.else: ; preds = %entry
+%tmp101 = tail call double @llvm.pow.f64(double undef, double 0x3FD5555555555555)
+%add112 = fadd double %tmp101, undef
+%mul118 = fmul double %add112, undef
+store double 0.000000e+00, double* %x, align 4
+ret void
 }

-declare arm_aapcs_vfpcc %0 @bbb(%pln* nocapture, %vec* nocapture, %vec* nocapture) nounwind
+declare double @acos(double)
+
+declare double @sqrt(double) readnone
+
+declare double @cos(double) readnone
+
+declare double @fabs(double)
+
+declare double @llvm.pow.f64(double, double) nounwind readonly
@@ -0,0 +1 @@
+; RUN: llc < %s -mtriple=arm-apple-darwin -mcpu=cortex-a8 | FileCheck %s
+; rdar://8598427
+; Adjust if-converter heuristics to avoid predicating vmrs which can cause
+; significant regression.
+
+%struct.xyz_t = type { double, double, double }
+
+define i32 @effie(i32 %tsets, %struct.xyz_t* nocapture %p, i32 %a, i32 %b, i32 %c) nounwind readonly noinline {
+; CHECK: effie:
+entry:
+%0 = icmp sgt i32 %tsets, 0
+br i1 %0, label %bb.nph, label %bb6
+
+bb.nph: ; preds = %entry
+%1 = add nsw i32 %b, %a
+%2 = add nsw i32 %1, %c
+br label %bb
+
+bb: ; preds = %bb4, %bb.nph
+; CHECK: vcmpe.f64
+; CHECK: vmrs apsr_nzcv, fpscr
+%r.19 = phi i32 [ 0, %bb.nph ], [ %r.0, %bb4 ]
+%n.08 = phi i32 [ 0, %bb.nph ], [ %10, %bb4 ]
+%scevgep10 = getelementptr inbounds %struct.xyz_t* %p, i32 %n.08, i32 0
+%scevgep11 = getelementptr %struct.xyz_t* %p, i32 %n.08, i32 1
+%3 = load double* %scevgep10, align 4
+%4 = load double* %scevgep11, align 4
+%5 = fcmp uge double %3, %4
+br i1 %5, label %bb3, label %bb1
+
+bb1: ; preds = %bb
+; CHECK-NOT: it
+; CHECK-NOT: vcmpemi
+; CHECK-NOT: vmrsmi
+; CHECK: vcmpe.f64
+; CHECK: vmrs apsr_nzcv, fpscr
+%scevgep12 = getelementptr %struct.xyz_t* %p, i32 %n.08, i32 2
+%6 = load double* %scevgep12, align 4
+%7 = fcmp uge double %3, %6
+br i1 %7, label %bb3, label %bb2
+
+bb2: ; preds = %bb1
+%8 = add nsw i32 %2, %r.19
+br label %bb4
+
+bb3: ; preds = %bb1, %bb
+%9 = add nsw i32 %r.19, 1
+br label %bb4
+
+bb4: ; preds = %bb3, %bb2
+%r.0 = phi i32 [ %9, %bb3 ], [ %8, %bb2 ]
+%10 = add nsw i32 %n.08, 1
+%exitcond = icmp eq i32 %10, %tsets
+br i1 %exitcond, label %bb6, label %bb
+
+bb6: ; preds = %bb4, %entry
+%r.1.lcssa = phi i32 [ 0, %entry ], [ %r.0, %bb4 ]
+ret i32 %r.1.lcssa
+}
@@ -3 +3 @@
 ; constant offset addressing, so that each of the following stores
 ; uses the same register.

-; CHECK: vstr.32 s{{.*}}, [r{{.*}}, #-128]
-; CHECK: vstr.32 s{{.*}}, [r{{.*}}, #-96]
-; CHECK: vstr.32 s{{.*}}, [r{{.*}}, #-64]
-; CHECK: vstr.32 s{{.*}}, [r{{.*}}, #-32]
-; CHECK: vstr.32 s{{.*}}, [r{{.*}}]
-; CHECK: vstr.32 s{{.*}}, [r{{.*}}, #32]
-; CHECK: vstr.32 s{{.*}}, [r{{.*}}, #64]
-; CHECK: vstr.32 s{{.*}}, [r{{.*}}, #96]
+; CHECK: vstr.32 s{{.*}}, [lr, #-128]
+; CHECK: vstr.32 s{{.*}}, [lr, #-96]
+; CHECK: vstr.32 s{{.*}}, [lr, #-64]
+; CHECK: vstr.32 s{{.*}}, [lr, #-32]
+; CHECK: vstr.32 s{{.*}}, [lr]
+; CHECK: vstr.32 s{{.*}}, [lr, #32]
+; CHECK: vstr.32 s{{.*}}, [lr, #64]
+; CHECK: vstr.32 s{{.*}}, [lr, #96]

 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32-n32"

@@ -270 +270 @@
 entry:
 ; CHECK: t10:
 ; CHECK: vmov.i32 q9, #0x3F000000
-; CHECK: vmov d0, d17
 ; CHECK: vmla.f32 q8, q8, d0[0]
 %0 = shufflevector <4 x float> zeroinitializer, <4 x float> undef, <4 x i32> zeroinitializer ; <<4 x float>> [#uses=1]
 %1 = insertelement <4 x float> %0, float undef, i32 1 ; <<4 x float>> [#uses=1]
@@ -22 +22 @@
 %4 = insertelement <2 x double> %2, double %V.0.ph, i32 1 ; <<2 x double>> [#uses=2]
 ; Constant pool load followed by add.
 ; Then clobber the loaded register, not the sum.
-; CHECK: vldr.64
-; CHECK: vadd.f64
 ; CHECK: vldr.64 [[LDR:d.*]],
 ; CHECK: LPC0_0:
 ; CHECK: vadd.f64 [[ADD:d.*]], [[LDR]], [[LDR]]