AMDGPU/SI: Handle hazard with > 8 byte VMEM stores

Reviewers: arsenm

Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, tony-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D25577

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@285359 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Tom Stellard (3 years ago)
Commit: 5480a24 (llvm.org GIT mirror of llvm)

7 changed files with 206 additions and 29 deletions.
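The hazard being handled: on subtargets newer than Southern Islands, a VMEM or FLAT store whose data operand is wider than 64 bits (more than 8 bytes, i.e. more than two dwords) needs one wait state before a following VALU instruction that writes a VGPR overlapping the store data, otherwise the store data can be overwritten before it is read. For MUBUF/MTBUF stores the hazard only applies when the soffset field is not a register. A minimal sketch of that rule, using made-up names rather than the LLVM API, assuming the behavior implemented by the patch below:

// Illustrative sketch only; the names here are hypothetical, not LLVM API.
#include <cassert>

struct StoreHazardQuery {
  unsigned StoreDataBits;    // width of the store's data operand, in bits
  bool SOffsetIsReg;         // MUBUF/MTBUF only: is soffset given as a register?
  bool NextVALUDefOverlaps;  // does the next VALU write a VGPR overlapping the data?
};

// Number of S_NOPs needed between the store and the following VALU.
int nopsNeeded(const StoreHazardQuery &Q, bool IsSouthernIslands) {
  if (IsSouthernIslands)       // SI does not have this hazard
    return 0;
  if (Q.StoreDataBits <= 64)   // only stores of more than 8 bytes (2 dwords)
    return 0;
  if (Q.SOffsetIsReg)          // a register soffset sidesteps the MUBUF/MTBUF case
    return 0;
  return Q.NextVALUDefOverlaps ? 1 : 0;  // one wait state is enough
}

int main() {
  // DWORDX4 store (128 bits), zero soffset, next VALU clobbers a data VGPR.
  assert(nopsNeeded({128, false, true}, /*IsSouthernIslands=*/false) == 1);
  assert(nopsNeeded({128, false, true}, /*IsSouthernIslands=*/true) == 0);
}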
     return SGPRInitBug;
   }
 
+  bool has12DWordStoreHazard() const {
+    return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
+  }
+
   unsigned getKernArgSegmentSize(unsigned ExplictArgBytes) const;
 
   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
 defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_si <0x5b>;
 defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_si <0x5c>;
 defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_si <0x5d>;
+// FIXME: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on CI.
 //defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomic_si <0x5e">; // isn't on VI
 //defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomic_si <0x5f>; // isn't on VI
 //defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomic_si <0x60>; // isn't on VI
   if (SIInstrInfo::isVMEM(*MI) && checkVMEMHazards(MI) > 0)
     return NoopHazard;
 
+  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
+    return NoopHazard;
+
   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
     return NoopHazard;
 
   if (SIInstrInfo::isSMRD(*MI))
     return std::max(0, checkSMRDHazards(MI));
 
-  if (SIInstrInfo::isVMEM(*MI))
-    return std::max(0, checkVMEMHazards(MI));
-
-  if (SIInstrInfo::isDPP(*MI))
-    return std::max(0, checkDPPHazards(MI));
-
-  if (isDivFMas(MI->getOpcode()))
-    return std::max(0, checkDivFMasHazards(MI));
+  if (SIInstrInfo::isVALU(*MI)) {
+    int WaitStates = std::max(0, checkVALUHazards(MI));
+
+    if (SIInstrInfo::isVMEM(*MI))
+      WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
+
+    if (SIInstrInfo::isDPP(*MI))
+      WaitStates = std::max(WaitStates, checkDPPHazards(MI));
+
+    if (isDivFMas(MI->getOpcode()))
+      WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
+
+    return WaitStates;
+  }
 
   if (isSGetReg(MI->getOpcode()))
     return std::max(0, checkGetRegHazards(MI));
 // Helper Functions
 //===----------------------------------------------------------------------===//
 
+int GCNHazardRecognizer::getWaitStatesSince(
+    function_ref<bool(MachineInstr *)> IsHazard) {
+
+  int WaitStates = -1;
+  for (MachineInstr *MI : EmittedInstrs) {
+    ++WaitStates;
+    if (!MI || !IsHazard(MI))
+      continue;
+    return WaitStates;
+  }
+  return std::numeric_limits<int>::max();
+}
+
 int GCNHazardRecognizer::getWaitStatesSinceDef(
     unsigned Reg, function_ref<bool(MachineInstr *)> IsHazardDef) {
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
 
-  int WaitStates = -1;
-  for (MachineInstr *MI : EmittedInstrs) {
-    ++WaitStates;
-    if (!MI || !IsHazardDef(MI))
-      continue;
-    if (MI->modifiesRegister(Reg, TRI))
-      return WaitStates;
-  }
-  return std::numeric_limits<int>::max();
+  auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
+    return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
+  };
+
+  return getWaitStatesSince(IsHazardFn);
 }
 
 int GCNHazardRecognizer::getWaitStatesSinceSetReg(
     function_ref<bool(MachineInstr *)> IsHazard) {
 
-  int WaitStates = -1;
-  for (MachineInstr *MI : EmittedInstrs) {
-    ++WaitStates;
-    if (!MI || !isSSetReg(MI->getOpcode()) || !IsHazard(MI))
-      continue;
-    return WaitStates;
-  }
-  return std::numeric_limits<int>::max();
+  auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
+    return isSSetReg(MI->getOpcode()) && IsHazard(MI);
+  };
+
+  return getWaitStatesSince(IsHazardFn);
 }
 
 //===----------------------------------------------------------------------===//
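The scan over EmittedInstrs that was previously duplicated in getWaitStatesSinceDef and getWaitStatesSinceSetReg is factored into the predicate-driven getWaitStatesSince above, which the new checkVALUHazards below reuses with its own lambda. A self-contained sketch of the same idiom, with stand-in types instead of MachineInstr and function_ref:

// Simplified, self-contained sketch of the predicate-driven scan; not LLVM code.
#include <cassert>
#include <deque>
#include <functional>
#include <limits>

struct Instr { int Opcode; };  // stand-in for MachineInstr

// Walk already-emitted instructions (newest first), counting wait states until
// one matching IsHazard is found; "infinite" if none is in the tracked window.
int waitStatesSince(const std::deque<Instr *> &Emitted,
                    const std::function<bool(const Instr *)> &IsHazard) {
  int WaitStates = -1;
  for (const Instr *I : Emitted) {
    ++WaitStates;
    if (!I || !IsHazard(I))  // null slots stand in for already-counted cycles
      continue;
    return WaitStates;
  }
  return std::numeric_limits<int>::max();
}

int main() {
  Instr Store{1}, Mov{2};
  std::deque<Instr *> Emitted = {&Mov, &Store};  // newest first
  auto IsStore = [](const Instr *I) { return I->Opcode == 1; };
  assert(waitStatesSince(Emitted, IsStore) == 1);  // one instruction in between
}

A wrapper like the rewritten getWaitStatesSinceDef then reduces to passing a lambda that additionally checks modifiesRegister, as in the hunk above.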
   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
   return SetRegWaitStates - WaitStatesNeeded;
 }
+
+int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
+  if (!MI.mayStore())
+    return -1;
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  unsigned Opcode = MI.getOpcode();
+  const MCInstrDesc &Desc = MI.getDesc();
+
+  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
+  int VDataRCID = -1;
+  if (VDataIdx != -1)
+    VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
+
+  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
+    // For MUBUF/MTBUF instructions this hazard only exists if the
+    // instruction is not using a register in the soffset field.
+    const MachineOperand *SOffset =
+        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
+    // If we have no soffset operand, then assume this field has been
+    // hardcoded to zero.
+    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
+        (!SOffset || !SOffset->isReg()))
+      return VDataIdx;
+  }
+
+  // MIMG instructions create a hazard if they don't use a 256-bit T# and
+  // the store size is greater than 8 bytes and they have more than two bits
+  // of their dmask set.
+  // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
+  if (TII->isMIMG(MI)) {
+    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
+    assert(SRsrcIdx != -1 &&
+           AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
+  }
+
+  if (TII->isFLAT(MI)) {
+    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::data);
+    if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
+      return DataIdx;
+  }
+
+  return -1;
+}
+
+int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
+  // This checks for the hazard where VMEM instructions that store more than
+  // 8 bytes can have their store data overwritten by the next instruction.
+  if (!ST.has12DWordStoreHazard())
+    return 0;
+
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const MachineRegisterInfo &MRI = VALU->getParent()->getParent()->getRegInfo();
+
+  const int VALUWaitStates = 1;
+  int WaitStatesNeeded = 0;
+
+  for (const MachineOperand &Def : VALU->defs()) {
+    if (!TRI->isVGPR(MRI, Def.getReg()))
+      continue;
+    unsigned Reg = Def.getReg();
+    auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
+      int DataIdx = createsVALUHazard(*MI);
+      return DataIdx >= 0 &&
+             TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
+    };
+    int WaitStatesNeededForDef =
+        VALUWaitStates - getWaitStatesSince(IsHazardFn);
+    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
+  }
+  return WaitStatesNeeded;
+}
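To tie the two new functions to the MIR test below: for BUFFER_STORE_DWORDX4_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5 with a zero soffset followed immediately by %vgpr3 = V_MOV_B32_e32, createsVALUHazard returns the vdata operand index (128-bit data, no register soffset), the lambda sees %vgpr3 overlapping the store data, getWaitStatesSince reports a distance of 0, and checkVALUHazards asks for 1 - 0 = 1 wait state, which the recognizer satisfies with the single S_NOP the CIVI check lines expect. A small sketch of just that arithmetic, with a hypothetical function name:

// Sketch of checkVALUHazards' arithmetic for one hazardous VGPR def;
// valuWaitStatesNeeded is a made-up name used only for illustration.
#include <algorithm>
#include <cassert>

int valuWaitStatesNeeded(int WaitStatesSinceHazardStore) {
  const int VALUWaitStates = 1;  // one wait state of separation is required
  return std::max(0, VALUWaitStates - WaitStatesSinceHazardStore);
}

int main() {
  assert(valuWaitStatesNeeded(0) == 1);  // store immediately precedes the VALU
  assert(valuWaitStatesNeeded(1) == 0);  // one instruction in between: no S_NOP
}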
   const MachineFunction &MF;
   const SISubtarget &ST;
 
+  int getWaitStatesSince(function_ref<bool(MachineInstr *)> IsHazard);
   int getWaitStatesSinceDef(unsigned Reg,
                             function_ref<bool(MachineInstr *)> IsHazardDef =
                                 [](MachineInstr *) { return true; });
   int checkDivFMasHazards(MachineInstr *DivFMas);
   int checkGetRegHazards(MachineInstr *GetRegInstr);
   int checkSetRegHazards(MachineInstr *SetRegInstr);
+  int createsVALUHazard(const MachineInstr &MI);
+  int checkVALUHazards(MachineInstr *VALU);
 public:
   GCNHazardRecognizer(const MachineFunction &MF);
   // We can only issue one instruction per cycle.
 
 // Avoid using MCRegisterClass::getSize, since that function will go away
 // (move from MC* level to Target* level). Return size in bits.
-unsigned getRegBitWidth(const MCRegisterClass &RC) {
-  switch (RC.getID()) {
+unsigned getRegBitWidth(unsigned RCID) {
+  switch (RCID) {
   case AMDGPU::SGPR_32RegClassID:
   case AMDGPU::VGPR_32RegClassID:
   case AMDGPU::VS_32RegClassID:
   }
 }
 
+unsigned getRegBitWidth(const MCRegisterClass &RC) {
+  return getRegBitWidth(RC.getID());
+}
+
 unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
                            unsigned OpNo) {
   unsigned RCID = Desc.OpInfo[OpNo].RegClass;
 bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo);
 
 /// \brief Get the size in bits of a register from the register class \p RC.
+unsigned getRegBitWidth(unsigned RCID);
+
+/// \brief Get the size in bits of a register from the register class \p RC.
 unsigned getRegBitWidth(const MCRegisterClass &RC);
 
 /// \brief Get size of register operand
 # RUN: llc -march=amdgcn -mcpu=tahiti -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN
-# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN
-# RUN: llc -march=amdgcn -mcpu=fiji -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,VI
+# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass post-RA-hazard-rec %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,VI
 
 --- |
   define void @div_fmas() { ret void }
   define void @s_getreg() { ret void }
   define void @s_setreg() { ret void }
+  define void @vmem_gt_8dw_store() { ret void }
 ...
 ---
 # GCN-LABEL: name: div_fmas
     S_SETREG_B32 %sgpr1, 0
     S_ENDPGM
 ...
+
+...
+---
+# GCN-LABEL: name: vmem_gt_8dw_store
+
+# GCN-LABEL: bb.0:
+# GCN: BUFFER_STORE_DWORD_OFFSET
+# GCN-NEXT: V_MOV_B32
+# GCN: BUFFER_STORE_DWORDX3_OFFSET
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: BUFFER_STORE_DWORDX4_OFFSET
+# GCN-NEXT: V_MOV_B32
+# GCN: BUFFER_STORE_DWORDX4_OFFSET
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: BUFFER_STORE_FORMAT_XYZ_OFFSET
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+
+# GCN-LABEL: bb.1:
+# GCN: FLAT_STORE_DWORDX2
+# GCN-NEXT: V_MOV_B32
+# GCN: FLAT_STORE_DWORDX3
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: FLAT_STORE_DWORDX4
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: FLAT_ATOMIC_CMPSWAP_X2
+# CIVI: S_NOP
+# GCN-NEXT: V_MOV_B32
+# GCN: FLAT_ATOMIC_FCMPSWAP_X2
+# CIVI: S_NOP
+# GCN: V_MOV_B32
+
+name: vmem_gt_8dw_store
+
+body: |
+  bb.0:
+    successors: %bb.1
+    BUFFER_STORE_DWORD_OFFSET %vgpr3, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    BUFFER_STORE_DWORDX3_OFFSET %vgpr2_vgpr3_vgpr4, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    BUFFER_STORE_DWORDX4_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr4, 0, 0, 0, 0, implicit %exec
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    BUFFER_STORE_DWORDX4_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    BUFFER_STORE_FORMAT_XYZ_OFFSET %vgpr2_vgpr3_vgpr4, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    BUFFER_STORE_FORMAT_XYZW_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit %exec
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    BUFFER_ATOMIC_CMPSWAP_X2_OFFSET %vgpr2_vgpr3_vgpr4_vgpr5, %sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit %exec
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    FLAT_STORE_DWORDX2 %vgpr0_vgpr1, %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    FLAT_STORE_DWORDX3 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    FLAT_STORE_DWORDX4 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    FLAT_ATOMIC_CMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    FLAT_ATOMIC_FCMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr
+    %vgpr3 = V_MOV_B32_e32 0, implicit %exec
+    S_ENDPGM
+
+...