llvm.org GIT mirror llvm / b569379
[AMDGPU] gfx908 scheduling Differential Revision: https://reviews.llvm.org/D64590 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@365826 91177308-0d34-0410-b5e6-96231b3b80d8 Stanislav Mekhanoshin 1 year, 30 days ago
3 changed file(s) with 163 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
3838 #define GET_SUBTARGETINFO_CTOR
3939 #undef AMDGPUSubtarget
4040 #include "R600GenSubtargetInfo.inc"
41
42 static cl::opt DisablePowerSched(
43 "amdgpu-disable-power-sched",
44 cl::desc("Disable scheduling to minimize mAI power bursts"),
45 cl::init(false));
4146
4247 GCNSubtarget::~GCNSubtarget() = default;
4348
750755 }
751756 }
752757 };
758
759 struct FillMFMAShadowMutation : ScheduleDAGMutation {
760 const SIInstrInfo *TII;
761
762 ScheduleDAGMI *DAG;
763
764 FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}
765
766 bool isSALU(const SUnit *SU) const {
767 const MachineInstr &MI = *SU->getInstr();
768 return TII->isSALU(MI) && !MI.isTerminator();
769 }
770
771 bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
772 if (Pred->NodeNum < Succ->NodeNum)
773 return true;
774
775 SmallVector Succs({Succ}), Preds({Pred});
776
777 for (unsigned I = 0; I < Succs.size(); ++I) {
778 for (const SDep &SI : Succs[I]->Succs) {
779 const SUnit *SU = SI.getSUnit();
780 if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
781 Succs.push_back(SU);
782 }
783 }
784
785 SmallPtrSet Visited;
786 while (!Preds.empty()) {
787 const SUnit *SU = Preds.pop_back_val();
788 if (llvm::find(Succs, SU) != Succs.end())
789 return false;
790 Visited.insert(SU);
791 for (const SDep &SI : SU->Preds)
792 if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
793 Preds.push_back(SI.getSUnit());
794 }
795
796 return true;
797 }
798
799 // Link as much SALU intructions in chain as possible. Return the size
800 // of the chain. Links up to MaxChain instructions.
801 unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
802 SmallPtrSetImpl &Visited) const {
803 SmallVector Worklist({To});
804 unsigned Linked = 0;
805
806 while (!Worklist.empty() && MaxChain-- > 0) {
807 SUnit *SU = Worklist.pop_back_val();
808 if (!Visited.insert(SU).second)
809 continue;
810
811 LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
812 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');
813
814 if (SU->addPred(SDep(From, SDep::Artificial), false))
815 ++Linked;
816
817 for (SDep &SI : From->Succs) {
818 SUnit *SUv = SI.getSUnit();
819 if (SUv != From && TII->isVALU(*SUv->getInstr()) && canAddEdge(SUv, SU))
820 SUv->addPred(SDep(SU, SDep::Artificial), false);
821 }
822
823 for (SDep &SI : SU->Succs) {
824 SUnit *Succ = SI.getSUnit();
825 if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
826 Worklist.push_back(Succ);
827 }
828 }
829
830 return Linked;
831 }
832
833 void apply(ScheduleDAGInstrs *DAGInstrs) override {
834 const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget();
835 if (!ST.hasMAIInsts() || DisablePowerSched)
836 return;
837 DAG = static_cast(DAGInstrs);
838 const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
839 if (!TSchedModel || DAG->SUnits.empty())
840 return;
841
842 // Scan for MFMA long latency instructions and try to add a dependency
843 // of available SALU instructions to give them a chance to fill MFMA
844 // shadow. That is desirable to fill MFMA shadow with SALU instructions
845 // rather than VALU to prevent power consumption bursts and throttle.
846 auto LastSALU = DAG->SUnits.begin();
847 auto E = DAG->SUnits.end();
848 SmallPtrSet Visited;
849 for (SUnit &SU : DAG->SUnits) {
850 MachineInstr &MAI = *SU.getInstr();
851 if (!TII->isMAI(MAI) ||
852 MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
853 MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
854 continue;
855
856 unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;
857
858 LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
859 dbgs() << "Need " << Lat
860 << " instructions to cover latency.\n");
861
862 // Find up to Lat independent scalar instructions as early as
863 // possible such that they can be scheduled after this MFMA.
864 for ( ; Lat && LastSALU != E; ++LastSALU) {
865 if (Visited.count(&*LastSALU))
866 continue;
867
868 if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
869 continue;
870
871 Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
872 }
873 }
874 }
875 };
753876 } // namespace
754877
755878 void GCNSubtarget::getPostRAMutations(
756879 std::vector> &Mutations) const {
757880 Mutations.push_back(llvm::make_unique(&InstrInfo));
881 Mutations.push_back(llvm::make_unique(&InstrInfo));
758882 }
759883
760884 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
659659 return !RI.isSGPRReg(MRI, Dest);
660660 }
661661
662 bool hasVGPRUses(const MachineInstr &MI) const {
663 const MachineFunction &MF = *MI.getParent()->getParent();
664 const MachineRegisterInfo &MRI = MF.getRegInfo();
665 return llvm::any_of(MI.explicit_uses(),
666 [&MRI, this](const MachineOperand &MO) {
667 return MO.isReg() && RI.isVGPR(MRI, MO.getReg());});
668 }
669
662670 /// Whether we must prevent this instruction from executing with EXEC = 0.
663671 bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const;
664672
2323 def WriteVMEM : SchedWrite;
2424 def WriteBarrier : SchedWrite;
2525
// Distinct SchedReads for mAI operands: a VGPR source read and a read of a
// preceding MFMA result, so each can get its own ReadAdvance adjustment.
def MIVGPRRead  : SchedRead;
def MIMFMARead  : SchedRead;

2629 // Vector ALU instructions
2730 def Write32Bit : SchedWrite;
2831 def WriteQuarterRate32 : SchedWrite;
4144
4245 // Half rate 64-bit instructions.
4346 def Write64Bit : SchedWrite;

// mAI multipass instructions, named by the number of passes they take
// through the matrix unit (2/8/16).
def Write2PassMAI  : SchedWrite;
def Write8PassMAI  : SchedWrite;
def Write16PassMAI : SchedWrite;
4452
4553 // FIXME: Should there be a class for instructions which are VALU
4654 // instructions and have VALU rates, but write to the SALU (i.e. VOPC
// Convenience wrapper: a WriteRes on the VALU resource with the given
// latency (template parameters reconstructed — the HTML rendering dropped
// all angle-bracketed arguments).
class HWVALUWriteRes<SchedWrite write, int latency> :
  HWWriteRes<write, [HWVALU], latency>;

// Predicate: the instruction has at least one explicit VGPR use
// (evaluated via SIInstrInfo::hasVGPRUses at sched-model time).
def PredMIReadVGPR : SchedPredicate<[{TII->hasVGPRUses(*MI)}]>;

// Variant read: use MIVGPRRead when the instruction reads a VGPR,
// otherwise the default read (SchedVar arguments reconstructed — dropped
// by the HTML rendering).
def MIReadVGPR : SchedReadVariant<[
      SchedVar<PredMIReadVGPR, [MIVGPRRead]>,
      SchedVar<NoSchedPred, [ReadDefault]>]>;
99112
100113 // The latency numbers are taken from AMD Accelerated Parallel Processing
101114 // guide. They may not be accurate.
114127 def : HWVALUWriteRes;
115128 def : HWVALUWriteRes;
116129 def : HWVALUWriteRes;
// MFMA write latencies scale with the pass count of the instruction.
def : HWVALUWriteRes<Write2PassMAI,  2>;
def : HWVALUWriteRes<Write8PassMAI,  8>;
def : HWVALUWriteRes<Write16PassMAI, 16>;

def : ReadAdvance<MIVGPRRead, -2>;
def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32$")>;

// Technically mfma reads can be from 0 to 4 cycles but that does not make
// sense to model because its register setup is huge. In particular if we
// properly model read advance as -2 for a vgpr read it will result in a
// bad scheduling of acc writes before that mfma. To avoid it we would
// need to consume 2 or 4 more vgprs to be initialized before the acc
// write sequence. Just assume worst case here.
def : ReadAdvance<MIMFMARead, -2>;

def : InstRW<[Write2PassMAI,  MIMFMARead], (instregex "^V_MFMA_..._4X4X")>;
def : InstRW<[Write8PassMAI,  MIMFMARead], (instregex "^V_MFMA_..._16X16X")>;
def : InstRW<[Write16PassMAI, MIMFMARead], (instregex "^V_MFMA_..._32X32X")>;
117148 }
118149
119150 def PredIsVGPR32Copy : SchedPredicate<[{TII->isVGPRCopy(*MI) && TII->getOpSize(*MI, 0) <= 32}]>;