1//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the InstructionSelector class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPUInstructionSelector.h"
15#include "AMDGPU.h"
16#include "AMDGPUGlobalISelUtils.h"
17#include "AMDGPUInstrInfo.h"
18#include "AMDGPURegisterBankInfo.h"
19#include "AMDGPUTargetMachine.h"
20#include "SIMachineFunctionInfo.h"
21#include "Utils/AMDGPUBaseInfo.h"
22#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
23#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
24#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
25#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
26#include "llvm/CodeGen/MachineFrameInfo.h"
27#include "llvm/IR/DiagnosticInfo.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32#define DEBUG_TYPE "amdgpu-isel"
33
34using namespace llvm;
35using namespace MIPatternMatch;
36
37#define GET_GLOBALISEL_IMPL
38#define AMDGPUSubtarget GCNSubtarget
39#include "AMDGPUGenGlobalISel.inc"
40#undef GET_GLOBALISEL_IMPL
41#undef AMDGPUSubtarget
42
43AMDGPUInstructionSelector::AMDGPUInstructionSelector(
44 const GCNSubtarget &STI, const AMDGPURegisterBankInfo &RBI,
45 const AMDGPUTargetMachine &TM)
46 : TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()), RBI(RBI), TM(TM),
47 STI(STI),
48 EnableLateStructurizeCFG(AMDGPUTargetMachine::EnableLateStructurizeCFG),
49#define GET_GLOBALISEL_PREDICATES_INIT
50#include "AMDGPUGenGlobalISel.inc"
51#undef GET_GLOBALISEL_PREDICATES_INIT
52#define GET_GLOBALISEL_TEMPORARIES_INIT
53#include "AMDGPUGenGlobalISel.inc"
54#undef GET_GLOBALISEL_TEMPORARIES_INIT
55{
56}
57
58const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
59
60void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
61 CodeGenCoverage *CoverageInfo,
62 ProfileSummaryInfo *PSI,
63 BlockFrequencyInfo *BFI) {
64 MRI = &MF.getRegInfo();
65 Subtarget = &MF.getSubtarget<GCNSubtarget>();
66 Subtarget->checkSubtargetFeatures(MF.getFunction());
67 InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
68}
69
70// Return the wave level SGPR base address if this is a wave address.
71static Register getWaveAddress(const MachineInstr *Def) {
72 return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
73 ? Def->getOperand(1).getReg()
74 : Register();
75}
76
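// Returns true if Reg is a virtual register that should be treated as a
// wave-wide condition mask (the VCC interpretation of s1) rather than a
// 32-bit SGPR boolean. Physical registers and G_TRUNC results never count.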
77bool AMDGPUInstructionSelector::isVCC(Register Reg,
78 const MachineRegisterInfo &MRI) const {
79 // The verifier is oblivious to s1 being a valid value for wavesize registers.
80 if (Reg.isPhysical())
81 return false;
82
83 auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
84 const TargetRegisterClass *RC =
85 RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
86 if (RC) {
87 const LLT Ty = MRI.getType(Reg);
88 if (!Ty.isValid() || Ty.getSizeInBits() != 1)
89 return false;
90 // G_TRUNC s1 result is never vcc.
91 return MRI.getVRegDef(Reg)->getOpcode() != AMDGPU::G_TRUNC &&
92 RC->hasSuperClassEq(TRI.getBoolRC());
93 }
94
95 const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
96 return RB->getID() == AMDGPU::VCCRegBankID;
97}
98
99bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
100 unsigned NewOpc) const {
101 MI.setDesc(TII.get(NewOpc));
102 MI.removeOperand(1); // Remove intrinsic ID.
103 MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
104
105 MachineOperand &Dst = MI.getOperand(0);
106 MachineOperand &Src = MI.getOperand(1);
107
108 // TODO: This should be legalized to s32 if needed
109 if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
110 return false;
111
112 const TargetRegisterClass *DstRC
113 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
114 const TargetRegisterClass *SrcRC
115 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
116 if (!DstRC || DstRC != SrcRC)
117 return false;
118
119 return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
120 RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
121}
122
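// Select a generic COPY. Most copies only need their operands constrained to
// the classes implied by the assigned register banks. The interesting case is
// copying a 32-bit boolean into a VCC (lane mask) destination: a constant
// source becomes an S_MOV of 0 or -1, while a non-constant source has its
// unknown high bits cleared and is compared against zero, roughly:
//   %masked = V_AND_B32_e32 1, %src       ; S_AND_B32 for an SGPR source
//   %dst    = V_CMP_NE_U32_e64 0, %masked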
123bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
124 const DebugLoc &DL = I.getDebugLoc();
125 MachineBasicBlock *BB = I.getParent();
126 I.setDesc(TII.get(TargetOpcode::COPY));
127
128 const MachineOperand &Src = I.getOperand(1);
129 MachineOperand &Dst = I.getOperand(0);
130 Register DstReg = Dst.getReg();
131 Register SrcReg = Src.getReg();
132
133 if (isVCC(DstReg, *MRI)) {
134 if (SrcReg == AMDGPU::SCC) {
135 const TargetRegisterClass *RC
136 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
137 if (!RC)
138 return true;
139 return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
140 }
141
142 if (!isVCC(SrcReg, *MRI)) {
143 // TODO: Should probably leave the copy and let copyPhysReg expand it.
144 if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
145 return false;
146
147 const TargetRegisterClass *SrcRC
148 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
149
150 std::optional<ValueAndVReg> ConstVal =
151 getIConstantVRegValWithLookThrough(SrcReg, *MRI, true);
152 if (ConstVal) {
153 unsigned MovOpc =
154 STI.isWave64() ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
155 BuildMI(*BB, &I, DL, TII.get(MovOpc), DstReg)
156 .addImm(ConstVal->Value.getBoolValue() ? -1 : 0);
157 } else {
158 Register MaskedReg = MRI->createVirtualRegister(SrcRC);
159
160 // We can't trust the high bits at this point, so clear them.
161
162 // TODO: Skip masking high bits if def is known boolean.
163
164 bool IsSGPR = TRI.isSGPRClass(SrcRC);
165 unsigned AndOpc =
166 IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
167 auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
168 .addImm(1)
169 .addReg(SrcReg);
170 if (IsSGPR)
171 And.setOperandDead(3); // Dead scc
172
173 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
174 .addImm(0)
175 .addReg(MaskedReg);
176 }
177
178 if (!MRI->getRegClassOrNull(SrcReg))
179 MRI->setRegClass(SrcReg, SrcRC);
180 I.eraseFromParent();
181 return true;
182 }
183
184 const TargetRegisterClass *RC =
185 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
186 if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
187 return false;
188
189 return true;
190 }
191
192 for (const MachineOperand &MO : I.operands()) {
193 if (MO.getReg().isPhysical())
194 continue;
195
196 const TargetRegisterClass *RC =
197 TRI.getConstrainedRegClassForOperand(MO, *MRI);
198 if (!RC)
199 continue;
200 RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
201 }
202 return true;
203}
204
205bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
206 const Register DefReg = I.getOperand(0).getReg();
207 const LLT DefTy = MRI->getType(DefReg);
208
209 // S1 G_PHIs should not be selected in instruction-select, instead:
210 // - divergent S1 G_PHI should go through lane mask merging algorithm
211 // and be fully inst-selected in AMDGPUGlobalISelDivergenceLowering
212 // - uniform S1 G_PHI should be lowered into S32 G_PHI in AMDGPURegBankSelect
213 if (DefTy == LLT::scalar(1))
214 return false;
215
216 // TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
217
218 const RegClassOrRegBank &RegClassOrBank =
219 MRI->getRegClassOrRegBank(DefReg);
220
221 const TargetRegisterClass *DefRC
222 = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
223 if (!DefRC) {
224 if (!DefTy.isValid()) {
225 LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
226 return false;
227 }
228
229 const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
230 DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
231 if (!DefRC) {
232 LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
233 return false;
234 }
235 }
236
237 // TODO: Verify that all registers have the same bank
238 I.setDesc(TII.get(TargetOpcode::PHI));
239 return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
240}
241
242MachineOperand
243AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
244 const TargetRegisterClass &SubRC,
245 unsigned SubIdx) const {
246
247 MachineInstr *MI = MO.getParent();
248 MachineBasicBlock *BB = MO.getParent()->getParent();
249 Register DstReg = MRI->createVirtualRegister(&SubRC);
250
251 if (MO.isReg()) {
252 unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
253 Register Reg = MO.getReg();
254 BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
255 .addReg(Reg, 0, ComposedSubIdx);
256
257 return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
258 MO.isKill(), MO.isDead(), MO.isUndef(),
259 MO.isEarlyClobber(), 0, MO.isDebug(),
260 MO.isInternalRead());
261 }
262
263 assert(MO.isImm());
264
265 APInt Imm(64, MO.getImm());
266
267 switch (SubIdx) {
268 default:
269 llvm_unreachable("do not know to split immediate with this sub index.");
270 case AMDGPU::sub0:
271 return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
272 case AMDGPU::sub1:
273 return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
274 }
275}
276
277static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
278 switch (Opc) {
279 case AMDGPU::G_AND:
280 return Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
281 case AMDGPU::G_OR:
282 return Is64 ? AMDGPU::S_OR_B64 : AMDGPU::S_OR_B32;
283 case AMDGPU::G_XOR:
284 return Is64 ? AMDGPU::S_XOR_B64 : AMDGPU::S_XOR_B32;
285 default:
286 llvm_unreachable("not a bit op");
287 }
288}
289
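// Select G_AND/G_OR/G_XOR whose result lives on the SGPR or VCC bank. Lane
// masks are stored in SGPRs, so on wave64 targets the VCC case uses the
// 64-bit S_AND_B64/S_OR_B64/S_XOR_B64 forms; other banks are rejected here.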
290bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
291 Register DstReg = I.getOperand(0).getReg();
292 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
293
294 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
295 if (DstRB->getID() != AMDGPU::SGPRRegBankID &&
296 DstRB->getID() != AMDGPU::VCCRegBankID)
297 return false;
298
299 bool Is64 = Size > 32 || (DstRB->getID() == AMDGPU::VCCRegBankID &&
300 STI.isWave64());
301 I.setDesc(TII.get(getLogicalBitOpcode(I.getOpcode(), Is64)));
302
303 // Dead implicit-def of scc
304 I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
305 true, // isImp
306 false, // isKill
307 true)); // isDead
308 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
309}
310
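// Select G_ADD/G_SUB (and G_PTR_ADD, which is forwarded here). 32-bit values
// map onto S_ADD/S_SUB, the no-carry VALU forms when available, or the
// carry-out _CO forms with a dead carry def. 64-bit adds are split into a
// lo/hi pair chained through carry, roughly:
//   %lo  = S_ADD_U32  %lo1, %lo2          ; or V_ADD_CO_U32_e64
//   %hi  = S_ADDC_U32 %hi1, %hi2          ; or V_ADDC_U32_e64
//   %dst = REG_SEQUENCE %lo, sub0, %hi, sub1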
311bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
312 MachineBasicBlock *BB = I.getParent();
313 MachineFunction *MF = BB->getParent();
314 Register DstReg = I.getOperand(0).getReg();
315 const DebugLoc &DL = I.getDebugLoc();
316 LLT Ty = MRI->getType(DstReg);
317 if (Ty.isVector())
318 return false;
319
320 unsigned Size = Ty.getSizeInBits();
321 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
322 const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
323 const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
324
325 if (Size == 32) {
326 if (IsSALU) {
327 const unsigned Opc = Sub ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
328 MachineInstr *Add =
329 BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
330 .add(I.getOperand(1))
331 .add(I.getOperand(2))
332 .setOperandDead(3); // Dead scc
333 I.eraseFromParent();
334 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
335 }
336
337 if (STI.hasAddNoCarry()) {
338 const unsigned Opc = Sub ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_ADD_U32_e64;
339 I.setDesc(TII.get(Opc));
340 I.addOperand(*MF, MachineOperand::CreateImm(0));
341 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
342 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
343 }
344
345 const unsigned Opc = Sub ? AMDGPU::V_SUB_CO_U32_e64 : AMDGPU::V_ADD_CO_U32_e64;
346
347 Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
348 MachineInstr *Add
349 = BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
350 .addDef(UnusedCarry, RegState::Dead)
351 .add(I.getOperand(1))
352 .add(I.getOperand(2))
353 .addImm(0);
354 I.eraseFromParent();
355 return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
356 }
357
358 assert(!Sub && "illegal sub should not reach here");
359
360 const TargetRegisterClass &RC
361 = IsSALU ? AMDGPU::SReg_64_XEXECRegClass : AMDGPU::VReg_64RegClass;
362 const TargetRegisterClass &HalfRC
363 = IsSALU ? AMDGPU::SReg_32RegClass : AMDGPU::VGPR_32RegClass;
364
365 MachineOperand Lo1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub0));
366 MachineOperand Lo2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub0));
367 MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
368 MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
369
370 Register DstLo = MRI->createVirtualRegister(&HalfRC);
371 Register DstHi = MRI->createVirtualRegister(&HalfRC);
372
373 if (IsSALU) {
374 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
375 .add(Lo1)
376 .add(Lo2);
377 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
378 .add(Hi1)
379 .add(Hi2)
380 .setOperandDead(3); // Dead scc
381 } else {
382 const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
383 Register CarryReg = MRI->createVirtualRegister(CarryRC);
384 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_CO_U32_e64), DstLo)
385 .addDef(CarryReg)
386 .add(Lo1)
387 .add(Lo2)
388 .addImm(0);
389 MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
390 .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
391 .add(Hi1)
392 .add(Hi2)
393 .addReg(CarryReg, RegState::Kill)
394 .addImm(0);
395
396 if (!constrainSelectedInstRegOperands(*Addc, TII, TRI, RBI))
397 return false;
398 }
399
400 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
401 .addReg(DstLo)
402 .addImm(AMDGPU::sub0)
403 .addReg(DstHi)
404 .addImm(AMDGPU::sub1);
405
406
407 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
408 return false;
409
410 I.eraseFromParent();
411 return true;
412}
413
414bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
415 MachineInstr &I) const {
416 MachineBasicBlock *BB = I.getParent();
417 MachineFunction *MF = BB->getParent();
418 const DebugLoc &DL = I.getDebugLoc();
419 Register Dst0Reg = I.getOperand(0).getReg();
420 Register Dst1Reg = I.getOperand(1).getReg();
421 const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO ||
422 I.getOpcode() == AMDGPU::G_UADDE;
423 const bool HasCarryIn = I.getOpcode() == AMDGPU::G_UADDE ||
424 I.getOpcode() == AMDGPU::G_USUBE;
425
426 if (isVCC(Dst1Reg, *MRI)) {
427 unsigned NoCarryOpc =
428 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
429 unsigned CarryOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
430 I.setDesc(TII.get(HasCarryIn ? CarryOpc : NoCarryOpc));
431 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
432 I.addOperand(*MF, MachineOperand::CreateImm(0));
433 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
434 }
435
436 Register Src0Reg = I.getOperand(2).getReg();
437 Register Src1Reg = I.getOperand(3).getReg();
438
439 if (HasCarryIn) {
440 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
441 .addReg(I.getOperand(4).getReg());
442 }
443
444 unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
445 unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
446
447 auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
448 .add(I.getOperand(2))
449 .add(I.getOperand(3));
450
451 if (MRI->use_nodbg_empty(Dst1Reg)) {
452 CarryInst.setOperandDead(3); // Dead scc
453 } else {
454 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
455 .addReg(AMDGPU::SCC);
456 if (!MRI->getRegClassOrNull(Dst1Reg))
457 MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
458 }
459
460 if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
461 !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
462 !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, *MRI))
463 return false;
464
465 if (HasCarryIn &&
466 !RBI.constrainGenericRegister(I.getOperand(4).getReg(),
467 AMDGPU::SReg_32RegClass, *MRI))
468 return false;
469
470 I.eraseFromParent();
471 return true;
472}
473
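// G_AMDGPU_MAD_U64_U32 / G_AMDGPU_MAD_I64_I32 map directly onto
// V_MAD_U64_U32_e64 / V_MAD_I64_I32_e64; subtargets with the MAD intra-wave
// forwarding bug use the dedicated _gfx11 pseudos instead.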
474bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32(
475 MachineInstr &I) const {
476 MachineBasicBlock *BB = I.getParent();
477 MachineFunction *MF = BB->getParent();
478 const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32;
479
480 unsigned Opc;
481 if (Subtarget->hasMADIntraFwdBug())
482 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_gfx11_e64
483 : AMDGPU::V_MAD_I64_I32_gfx11_e64;
484 else
485 Opc = IsUnsigned ? AMDGPU::V_MAD_U64_U32_e64 : AMDGPU::V_MAD_I64_I32_e64;
486 I.setDesc(TII.get(Opc));
487 I.addOperand(*MF, MachineOperand::CreateImm(0));
488 I.addImplicitDefUseOperands(*MF);
489 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
490}
491
492// TODO: We should probably legalize these to only using 32-bit results.
493bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
494 MachineBasicBlock *BB = I.getParent();
495 Register DstReg = I.getOperand(0).getReg();
496 Register SrcReg = I.getOperand(1).getReg();
497 LLT DstTy = MRI->getType(DstReg);
498 LLT SrcTy = MRI->getType(SrcReg);
499 const unsigned SrcSize = SrcTy.getSizeInBits();
500 unsigned DstSize = DstTy.getSizeInBits();
501
502 // TODO: Should handle any multiple of 32 offset.
503 unsigned Offset = I.getOperand(2).getImm();
504 if (Offset % 32 != 0 || DstSize > 128)
505 return false;
506
507 // 16-bit operations really use 32-bit registers.
508 // FIXME: Probably should not allow 16-bit G_EXTRACT results.
509 if (DstSize == 16)
510 DstSize = 32;
511
512 const TargetRegisterClass *DstRC =
513 TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
514 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
515 return false;
516
517 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
518 const TargetRegisterClass *SrcRC =
519 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
520 if (!SrcRC)
521 return false;
522 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32,
523 DstSize / 32);
524 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
525 if (!SrcRC)
526 return false;
527
528 SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
529 *SrcRC, I.getOperand(1));
530 const DebugLoc &DL = I.getDebugLoc();
531 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
532 .addReg(SrcReg, 0, SubReg);
533
534 I.eraseFromParent();
535 return true;
536}
537
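// G_MERGE_VALUES (and wide G_BUILD_VECTOR, which is forwarded here) becomes a
// single REG_SEQUENCE over the subregister indices returned by
// getRegSplitParts; merges with sub-32-bit sources are left to the imported
// TableGen patterns.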
538bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
539 MachineBasicBlock *BB = MI.getParent();
540 Register DstReg = MI.getOperand(0).getReg();
541 LLT DstTy = MRI->getType(DstReg);
542 LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
543
544 const unsigned SrcSize = SrcTy.getSizeInBits();
545 if (SrcSize < 32)
546 return selectImpl(MI, *CoverageInfo);
547
548 const DebugLoc &DL = MI.getDebugLoc();
549 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
550 const unsigned DstSize = DstTy.getSizeInBits();
551 const TargetRegisterClass *DstRC =
552 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
553 if (!DstRC)
554 return false;
555
556 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(DstRC, SrcSize / 8);
557 MachineInstrBuilder MIB =
558 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::REG_SEQUENCE), DstReg);
559 for (int I = 0, E = MI.getNumOperands() - 1; I != E; ++I) {
560 MachineOperand &Src = MI.getOperand(I + 1);
561 MIB.addReg(Src.getReg(), getUndefRegState(Src.isUndef()));
562 MIB.addImm(SubRegs[I]);
563
564 const TargetRegisterClass *SrcRC
565 = TRI.getConstrainedRegClassForOperand(Src, *MRI);
566 if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
567 return false;
568 }
569
570 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
571 return false;
572
573 MI.eraseFromParent();
574 return true;
575}
576
577bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
578 MachineBasicBlock *BB = MI.getParent();
579 const int NumDst = MI.getNumOperands() - 1;
580
581 MachineOperand &Src = MI.getOperand(NumDst);
582
583 Register SrcReg = Src.getReg();
584 Register DstReg0 = MI.getOperand(0).getReg();
585 LLT DstTy = MRI->getType(DstReg0);
586 LLT SrcTy = MRI->getType(SrcReg);
587
588 const unsigned DstSize = DstTy.getSizeInBits();
589 const unsigned SrcSize = SrcTy.getSizeInBits();
590 const DebugLoc &DL = MI.getDebugLoc();
591 const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
592
593 const TargetRegisterClass *SrcRC =
594 TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank);
595 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
596 return false;
597
598 // Note we could have mixed SGPR and VGPR destination banks for an SGPR
599 // source, and this relies on the fact that the same subregister indices are
600 // used for both.
601 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
602 for (int I = 0, E = NumDst; I != E; ++I) {
603 MachineOperand &Dst = MI.getOperand(I);
604 BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::COPY), Dst.getReg())
605 .addReg(SrcReg, 0, SubRegs[I]);
606
607 // Make sure the subregister index is valid for the source register.
608 SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegs[I]);
609 if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
610 return false;
611
612 const TargetRegisterClass *DstRC =
613 TRI.getConstrainedRegClassForOperand(Dst, *MRI);
614 if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
615 return false;
616 }
617
618 MI.eraseFromParent();
619 return true;
620}
621
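// Select v2s16 G_BUILD_VECTOR / G_BUILD_VECTOR_TRUNC. Two constant operands
// are folded into a single 32-bit move; otherwise the SALU path uses the
// S_PACK_{LL,LH,HL,HH}_B32_B16 family (picking a variant when an operand is a
// one-use right-shift by 16), and the VALU path masks the low half and merges
// the halves with V_LSHL_OR_B32. Wider G_BUILD_VECTOR is forwarded to
// selectG_MERGE_VALUES.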
622bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
623 assert(MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC ||
624 MI.getOpcode() == AMDGPU::G_BUILD_VECTOR);
625
626 Register Src0 = MI.getOperand(1).getReg();
627 Register Src1 = MI.getOperand(2).getReg();
628 LLT SrcTy = MRI->getType(Src0);
629 const unsigned SrcSize = SrcTy.getSizeInBits();
630
631 // BUILD_VECTOR with >=32 bits source is handled by MERGE_VALUE.
632 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR && SrcSize >= 32) {
633 return selectG_MERGE_VALUES(MI);
634 }
635
636 // Selection logic below is for V2S16 only.
637 // For G_BUILD_VECTOR_TRUNC, additionally check that the operands are s32.
638 Register Dst = MI.getOperand(0).getReg();
639 if (MRI->getType(Dst) != LLT::fixed_vector(2, 16) ||
640 (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC &&
641 SrcTy != LLT::scalar(32)))
642 return selectImpl(MI, *CoverageInfo);
643
644 const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
645 if (DstBank->getID() == AMDGPU::AGPRRegBankID)
646 return false;
647
648 assert(DstBank->getID() == AMDGPU::SGPRRegBankID ||
649 DstBank->getID() == AMDGPU::VGPRRegBankID);
650 const bool IsVector = DstBank->getID() == AMDGPU::VGPRRegBankID;
651
652 const DebugLoc &DL = MI.getDebugLoc();
653 MachineBasicBlock *BB = MI.getParent();
654
655 // First, before trying TableGen patterns, check if both sources are
656 // constants. In those cases, we can trivially compute the final constant
657 // and emit a simple move.
658 auto ConstSrc1 = getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
659 if (ConstSrc1) {
660 auto ConstSrc0 =
661 getAnyConstantVRegValWithLookThrough(Src0, *MRI, true, true);
662 if (ConstSrc0) {
663 const int64_t K0 = ConstSrc0->Value.getSExtValue();
664 const int64_t K1 = ConstSrc1->Value.getSExtValue();
665 uint32_t Lo16 = static_cast<uint32_t>(K0) & 0xffff;
666 uint32_t Hi16 = static_cast<uint32_t>(K1) & 0xffff;
667 uint32_t Imm = Lo16 | (Hi16 << 16);
668
669 // VALU
670 if (IsVector) {
671 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), Dst).addImm(Imm);
672 MI.eraseFromParent();
673 return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI);
674 }
675
676 // SALU
677 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), Dst).addImm(Imm);
678 MI.eraseFromParent();
679 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
680 }
681 }
682
683 // Now try TableGen patterns.
684 if (selectImpl(MI, *CoverageInfo))
685 return true;
686
687 // TODO: This should probably be a combine somewhere
688 // (build_vector $src0, undef) -> copy $src0
689 MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
690 if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
691 MI.setDesc(TII.get(AMDGPU::COPY));
692 MI.removeOperand(2);
693 const auto &RC =
694 IsVector ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
695 return RBI.constrainGenericRegister(Dst, RC, *MRI) &&
696 RBI.constrainGenericRegister(Src0, RC, *MRI);
697 }
698
699 // TODO: Can be improved?
700 if (IsVector) {
701 Register TmpReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
702 auto MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_AND_B32_e32), TmpReg)
703 .addImm(0xFFFF)
704 .addReg(Src0);
705 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
706 return false;
707
708 MIB = BuildMI(*BB, MI, DL, TII.get(AMDGPU::V_LSHL_OR_B32_e64), Dst)
709 .addReg(Src1)
710 .addImm(16)
711 .addReg(TmpReg);
712 if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
713 return false;
714
715 MI.eraseFromParent();
716 return true;
717 }
718
719 Register ShiftSrc0;
720 Register ShiftSrc1;
721
722 // With multiple uses of the shift, this will duplicate the shift and
723 // increase register pressure.
724 //
725 // (build_vector (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16)
726 // => (S_PACK_HH_B32_B16 $src0, $src1)
727 // (build_vector (lshr_oneuse SReg_32:$src0, 16), $src1)
728 // => (S_PACK_HL_B32_B16 $src0, $src1)
729 // (build_vector $src0, (lshr_oneuse SReg_32:$src1, 16))
730 // => (S_PACK_LH_B32_B16 $src0, $src1)
731 // (build_vector $src0, $src1)
732 // => (S_PACK_LL_B32_B16 $src0, $src1)
733
734 bool Shift0 = mi_match(
735 Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_SpecificICst(16))));
736
737 bool Shift1 = mi_match(
738 Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_SpecificICst(16))));
739
740 unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
741 if (Shift0 && Shift1) {
742 Opc = AMDGPU::S_PACK_HH_B32_B16;
743 MI.getOperand(1).setReg(ShiftSrc0);
744 MI.getOperand(2).setReg(ShiftSrc1);
745 } else if (Shift1) {
746 Opc = AMDGPU::S_PACK_LH_B32_B16;
747 MI.getOperand(2).setReg(ShiftSrc1);
748 } else if (Shift0) {
749 auto ConstSrc1 =
750 getAnyConstantVRegValWithLookThrough(Src1, *MRI, true, true);
751 if (ConstSrc1 && ConstSrc1->Value == 0) {
752 // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
753 auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
754 .addReg(ShiftSrc0)
755 .addImm(16)
756 .setOperandDead(3); // Dead scc
757
758 MI.eraseFromParent();
759 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
760 }
761 if (STI.hasSPackHL()) {
762 Opc = AMDGPU::S_PACK_HL_B32_B16;
763 MI.getOperand(1).setReg(ShiftSrc0);
764 }
765 }
766
767 MI.setDesc(TII.get(Opc));
768 return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
769}
770
771bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
772 return selectG_ADD_SUB(I);
773}
774
775bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
776 const MachineOperand &MO = I.getOperand(0);
777
778 // FIXME: Interface for getConstrainedRegClassForOperand needs work. The
779 // regbank check here is to know why getConstrainedRegClassForOperand failed.
780 const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
781 if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
782 (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
783 I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
784 return true;
785 }
786
787 return false;
788}
789
790bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
791 MachineBasicBlock *BB = I.getParent();
792
793 Register DstReg = I.getOperand(0).getReg();
794 Register Src0Reg = I.getOperand(1).getReg();
795 Register Src1Reg = I.getOperand(2).getReg();
796 LLT Src1Ty = MRI->getType(Src1Reg);
797
798 unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
799 unsigned InsSize = Src1Ty.getSizeInBits();
800
801 int64_t Offset = I.getOperand(3).getImm();
802
803 // FIXME: These cases should have been illegal and unnecessary to check here.
804 if (Offset % 32 != 0 || InsSize % 32 != 0)
805 return false;
806
807 // Currently not handled by getSubRegFromChannel.
808 if (InsSize > 128)
809 return false;
810
811 unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
812 if (SubReg == AMDGPU::NoSubRegister)
813 return false;
814
815 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
816 const TargetRegisterClass *DstRC =
817 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
818 if (!DstRC)
819 return false;
820
821 const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
822 const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
823 const TargetRegisterClass *Src0RC =
824 TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank);
825 const TargetRegisterClass *Src1RC =
826 TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank);
827
828 // Deal with weird cases where the class only partially supports the subreg
829 // index.
830 Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
831 if (!Src0RC || !Src1RC)
832 return false;
833
834 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
835 !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
836 !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
837 return false;
838
839 const DebugLoc &DL = I.getDebugLoc();
840 BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
841 .addReg(Src0Reg)
842 .addReg(Src1Reg)
843 .addImm(SubReg);
844
845 I.eraseFromParent();
846 return true;
847}
848
849bool AMDGPUInstructionSelector::selectG_SBFX_UBFX(MachineInstr &MI) const {
850 Register DstReg = MI.getOperand(0).getReg();
851 Register SrcReg = MI.getOperand(1).getReg();
852 Register OffsetReg = MI.getOperand(2).getReg();
853 Register WidthReg = MI.getOperand(3).getReg();
854
855 assert(RBI.getRegBank(DstReg, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID &&
856 "scalar BFX instructions are expanded in regbankselect");
857 assert(MRI->getType(MI.getOperand(0).getReg()).getSizeInBits() == 32 &&
858 "64-bit vector BFX instructions are expanded in regbankselect");
859
860 const DebugLoc &DL = MI.getDebugLoc();
861 MachineBasicBlock *MBB = MI.getParent();
862
863 bool IsSigned = MI.getOpcode() == TargetOpcode::G_SBFX;
864 unsigned Opc = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
865 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), DstReg)
866 .addReg(SrcReg)
867 .addReg(OffsetReg)
868 .addReg(WidthReg);
869 MI.eraseFromParent();
870 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
871}
872
873bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
874 if (STI.getLDSBankCount() != 16)
875 return selectImpl(MI, *CoverageInfo);
876
877 Register Dst = MI.getOperand(0).getReg();
878 Register Src0 = MI.getOperand(2).getReg();
879 Register M0Val = MI.getOperand(6).getReg();
880 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
881 !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
882 !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
883 return false;
884
885 // This requires 2 instructions. It is possible to write a pattern to support
886 // this, but the generated isel emitter doesn't correctly deal with multiple
887 // output instructions using the same physical register input. The copy to m0
888 // is incorrectly placed before the second instruction.
889 //
890 // TODO: Match source modifiers.
891
892 Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
893 const DebugLoc &DL = MI.getDebugLoc();
894 MachineBasicBlock *MBB = MI.getParent();
895
896 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
897 .addReg(M0Val);
898 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
899 .addImm(2)
900 .addImm(MI.getOperand(4).getImm()) // $attr
901 .addImm(MI.getOperand(3).getImm()); // $attrchan
902
903 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
904 .addImm(0) // $src0_modifiers
905 .addReg(Src0) // $src0
906 .addImm(MI.getOperand(4).getImm()) // $attr
907 .addImm(MI.getOperand(3).getImm()) // $attrchan
908 .addImm(0) // $src2_modifiers
909 .addReg(InterpMov) // $src2 - 2 f16 values selected by high
910 .addImm(MI.getOperand(5).getImm()) // $high
911 .addImm(0) // $clamp
912 .addImm(0); // $omod
913
914 MI.eraseFromParent();
915 return true;
916}
917
918// Writelane is special in that it can use SGPR and M0 (which would normally
919// count as using the constant bus twice - but in this case it is allowed since
920// the lane selector doesn't count as a use of the constant bus). However, it is
921// still required to abide by the 1 SGPR rule. Fix this up if we might have
922// multiple SGPRs.
923bool AMDGPUInstructionSelector::selectWritelane(MachineInstr &MI) const {
924 // With a constant bus limit of at least 2, there's no issue.
925 if (STI.getConstantBusLimit(AMDGPU::V_WRITELANE_B32) > 1)
926 return selectImpl(MI, *CoverageInfo);
927
928 MachineBasicBlock *MBB = MI.getParent();
929 const DebugLoc &DL = MI.getDebugLoc();
930 Register VDst = MI.getOperand(0).getReg();
931 Register Val = MI.getOperand(2).getReg();
932 Register LaneSelect = MI.getOperand(3).getReg();
933 Register VDstIn = MI.getOperand(4).getReg();
934
935 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_WRITELANE_B32), VDst);
936
937 std::optional<ValueAndVReg> ConstSelect =
938 getIConstantVRegValWithLookThrough(LaneSelect, *MRI);
939 if (ConstSelect) {
940 // The selector has to be an inline immediate, so we can use whatever for
941 // the other operands.
942 MIB.addReg(Val);
943 MIB.addImm(ConstSelect->Value.getSExtValue() &
944 maskTrailingOnes<uint64_t>(STI.getWavefrontSizeLog2()));
945 } else {
946 std::optional<ValueAndVReg> ConstVal =
947 getIConstantVRegValWithLookThrough(Val, *MRI);
948
949 // If the value written is an inline immediate, we can get away without a
950 // copy to m0.
951 if (ConstVal && AMDGPU::isInlinableLiteral32(ConstVal->Value.getSExtValue(),
952 STI.hasInv2PiInlineImm())) {
953 MIB.addImm(ConstVal->Value.getSExtValue());
954 MIB.addReg(LaneSelect);
955 } else {
956 MIB.addReg(Val);
957
958 // If the lane selector was originally in a VGPR and copied with
959 // readfirstlane, there's a hazard to read the same SGPR from the
960 // VALU. Constrain to a different SGPR to help avoid needing a nop later.
961 RBI.constrainGenericRegister(LaneSelect, AMDGPU::SReg_32_XM0RegClass, *MRI);
962
963 BuildMI(*MBB, *MIB, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
964 .addReg(LaneSelect);
965 MIB.addReg(AMDGPU::M0);
966 }
967 }
968
969 MIB.addReg(VDstIn);
970
971 MI.eraseFromParent();
972 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
973}
974
975// We need to handle this here because tablegen doesn't support matching
976// instructions with multiple outputs.
977bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
978 Register Dst0 = MI.getOperand(0).getReg();
979 Register Dst1 = MI.getOperand(1).getReg();
980
981 LLT Ty = MRI->getType(Dst0);
982 unsigned Opc;
983 if (Ty == LLT::scalar(32))
984 Opc = AMDGPU::V_DIV_SCALE_F32_e64;
985 else if (Ty == LLT::scalar(64))
986 Opc = AMDGPU::V_DIV_SCALE_F64_e64;
987 else
988 return false;
989
990 // TODO: Match source modifiers.
991
992 const DebugLoc &DL = MI.getDebugLoc();
993 MachineBasicBlock *MBB = MI.getParent();
994
995 Register Numer = MI.getOperand(3).getReg();
996 Register Denom = MI.getOperand(4).getReg();
997 unsigned ChooseDenom = MI.getOperand(5).getImm();
998
999 Register Src0 = ChooseDenom != 0 ? Numer : Denom;
1000
1001 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
1002 .addDef(Dst1)
1003 .addImm(0) // $src0_modifiers
1004 .addUse(Src0) // $src0
1005 .addImm(0) // $src1_modifiers
1006 .addUse(Denom) // $src1
1007 .addImm(0) // $src2_modifiers
1008 .addUse(Numer) // $src2
1009 .addImm(0) // $clamp
1010 .addImm(0); // $omod
1011
1012 MI.eraseFromParent();
1013 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1014}
1015
1016bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
1017 unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
1018 switch (IntrinsicID) {
1019 case Intrinsic::amdgcn_if_break: {
1020 MachineBasicBlock *BB = I.getParent();
1021
1022 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1023 // SelectionDAG uses for wave32 vs wave64.
1024 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::SI_IF_BREAK))
1025 .add(I.getOperand(0))
1026 .add(I.getOperand(2))
1027 .add(I.getOperand(3));
1028
1029 Register DstReg = I.getOperand(0).getReg();
1030 Register Src0Reg = I.getOperand(2).getReg();
1031 Register Src1Reg = I.getOperand(3).getReg();
1032
1033 I.eraseFromParent();
1034
1035 for (Register Reg : { DstReg, Src0Reg, Src1Reg })
1036 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1037
1038 return true;
1039 }
1040 case Intrinsic::amdgcn_interp_p1_f16:
1041 return selectInterpP1F16(I);
1042 case Intrinsic::amdgcn_wqm:
1043 return constrainCopyLikeIntrin(I, AMDGPU::WQM);
1044 case Intrinsic::amdgcn_softwqm:
1045 return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
1046 case Intrinsic::amdgcn_strict_wwm:
1047 case Intrinsic::amdgcn_wwm:
1048 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1049 case Intrinsic::amdgcn_strict_wqm:
1050 return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
1051 case Intrinsic::amdgcn_writelane:
1052 return selectWritelane(I);
1053 case Intrinsic::amdgcn_div_scale:
1054 return selectDivScale(I);
1055 case Intrinsic::amdgcn_icmp:
1056 case Intrinsic::amdgcn_fcmp:
1057 if (selectImpl(I, *CoverageInfo))
1058 return true;
1059 return selectIntrinsicCmp(I);
1060 case Intrinsic::amdgcn_ballot:
1061 return selectBallot(I);
1062 case Intrinsic::amdgcn_inverse_ballot:
1063 return selectInverseBallot(I);
1064 case Intrinsic::amdgcn_reloc_constant:
1065 return selectRelocConstant(I);
1066 case Intrinsic::amdgcn_groupstaticsize:
1067 return selectGroupStaticSize(I);
1068 case Intrinsic::returnaddress:
1069 return selectReturnAddress(I);
1070 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
1071 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
1072 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
1073 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
1074 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
1075 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
1076 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
1077 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
1078 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
1079 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
1080 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
1081 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
1082 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
1083 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
1084 return selectSMFMACIntrin(I);
1085 default:
1086 return selectImpl(I, *CoverageInfo);
1087 }
1088}
1089
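// Map a CmpInst predicate plus operand size (16/32/64 bits) to the
// corresponding VOPC V_CMP opcode, preferring the true16 variants when the
// subtarget has them. Returns -1 if there is no usable VALU compare.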
1090static int getV_CMPOpcode(CmpInst::Predicate P, unsigned Size,
1091 const GCNSubtarget &ST) {
1092 if (Size != 16 && Size != 32 && Size != 64)
1093 return -1;
1094
1095 if (Size == 16 && !ST.has16BitInsts())
1096 return -1;
1097
1098 const auto Select = [&](unsigned S16Opc, unsigned TrueS16Opc, unsigned S32Opc,
1099 unsigned S64Opc) {
1100 if (Size == 16)
1101 return ST.hasTrue16BitInsts() ? TrueS16Opc : S16Opc;
1102 if (Size == 32)
1103 return S32Opc;
1104 return S64Opc;
1105 };
1106
1107 switch (P) {
1108 default:
1109 llvm_unreachable("Unknown condition code!");
1110 case CmpInst::ICMP_NE:
1111 return Select(AMDGPU::V_CMP_NE_U16_e64, AMDGPU::V_CMP_NE_U16_t16_e64,
1112 AMDGPU::V_CMP_NE_U32_e64, AMDGPU::V_CMP_NE_U64_e64);
1113 case CmpInst::ICMP_EQ:
1114 return Select(AMDGPU::V_CMP_EQ_U16_e64, AMDGPU::V_CMP_EQ_U16_t16_e64,
1115 AMDGPU::V_CMP_EQ_U32_e64, AMDGPU::V_CMP_EQ_U64_e64);
1116 case CmpInst::ICMP_SGT:
1117 return Select(AMDGPU::V_CMP_GT_I16_e64, AMDGPU::V_CMP_GT_I16_t16_e64,
1118 AMDGPU::V_CMP_GT_I32_e64, AMDGPU::V_CMP_GT_I64_e64);
1119 case CmpInst::ICMP_SGE:
1120 return Select(AMDGPU::V_CMP_GE_I16_e64, AMDGPU::V_CMP_GE_I16_t16_e64,
1121 AMDGPU::V_CMP_GE_I32_e64, AMDGPU::V_CMP_GE_I64_e64);
1122 case CmpInst::ICMP_SLT:
1123 return Select(AMDGPU::V_CMP_LT_I16_e64, AMDGPU::V_CMP_LT_I16_t16_e64,
1124 AMDGPU::V_CMP_LT_I32_e64, AMDGPU::V_CMP_LT_I64_e64);
1125 case CmpInst::ICMP_SLE:
1126 return Select(AMDGPU::V_CMP_LE_I16_e64, AMDGPU::V_CMP_LE_I16_t16_e64,
1127 AMDGPU::V_CMP_LE_I32_e64, AMDGPU::V_CMP_LE_I64_e64);
1128 case CmpInst::ICMP_UGT:
1129 return Select(AMDGPU::V_CMP_GT_U16_e64, AMDGPU::V_CMP_GT_U16_t16_e64,
1130 AMDGPU::V_CMP_GT_U32_e64, AMDGPU::V_CMP_GT_U64_e64);
1131 case CmpInst::ICMP_UGE:
1132 return Select(AMDGPU::V_CMP_GE_U16_e64, AMDGPU::V_CMP_GE_U16_t16_e64,
1133 AMDGPU::V_CMP_GE_U32_e64, AMDGPU::V_CMP_GE_U64_e64);
1134 case CmpInst::ICMP_ULT:
1135 return Select(AMDGPU::V_CMP_LT_U16_e64, AMDGPU::V_CMP_LT_U16_t16_e64,
1136 AMDGPU::V_CMP_LT_U32_e64, AMDGPU::V_CMP_LT_U64_e64);
1137 case CmpInst::ICMP_ULE:
1138 return Select(AMDGPU::V_CMP_LE_U16_e64, AMDGPU::V_CMP_LE_U16_t16_e64,
1139 AMDGPU::V_CMP_LE_U32_e64, AMDGPU::V_CMP_LE_U64_e64);
1140
1141 case CmpInst::FCMP_OEQ:
1142 return Select(AMDGPU::V_CMP_EQ_F16_e64, AMDGPU::V_CMP_EQ_F16_t16_e64,
1143 AMDGPU::V_CMP_EQ_F32_e64, AMDGPU::V_CMP_EQ_F64_e64);
1144 case CmpInst::FCMP_OGT:
1145 return Select(AMDGPU::V_CMP_GT_F16_e64, AMDGPU::V_CMP_GT_F16_t16_e64,
1146 AMDGPU::V_CMP_GT_F32_e64, AMDGPU::V_CMP_GT_F64_e64);
1147 case CmpInst::FCMP_OGE:
1148 return Select(AMDGPU::V_CMP_GE_F16_e64, AMDGPU::V_CMP_GE_F16_t16_e64,
1149 AMDGPU::V_CMP_GE_F32_e64, AMDGPU::V_CMP_GE_F64_e64);
1150 case CmpInst::FCMP_OLT:
1151 return Select(AMDGPU::V_CMP_LT_F16_e64, AMDGPU::V_CMP_LT_F16_t16_e64,
1152 AMDGPU::V_CMP_LT_F32_e64, AMDGPU::V_CMP_LT_F64_e64);
1153 case CmpInst::FCMP_OLE:
1154 return Select(AMDGPU::V_CMP_LE_F16_e64, AMDGPU::V_CMP_LE_F16_t16_e64,
1155 AMDGPU::V_CMP_LE_F32_e64, AMDGPU::V_CMP_LE_F64_e64);
1156 case CmpInst::FCMP_ONE:
1157 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1158 AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1159 case CmpInst::FCMP_ORD:
1160 return Select(AMDGPU::V_CMP_O_F16_e64, AMDGPU::V_CMP_O_F16_t16_e64,
1161 AMDGPU::V_CMP_O_F32_e64, AMDGPU::V_CMP_O_F64_e64);
1162 case CmpInst::FCMP_UNO:
1163 return Select(AMDGPU::V_CMP_U_F16_e64, AMDGPU::V_CMP_U_F16_t16_e64,
1164 AMDGPU::V_CMP_U_F32_e64, AMDGPU::V_CMP_U_F64_e64);
1165 case CmpInst::FCMP_UEQ:
1166 return Select(AMDGPU::V_CMP_NLG_F16_e64, AMDGPU::V_CMP_NLG_F16_t16_e64,
1167 AMDGPU::V_CMP_NLG_F32_e64, AMDGPU::V_CMP_NLG_F64_e64);
1168 case CmpInst::FCMP_UGT:
1169 return Select(AMDGPU::V_CMP_NLE_F16_e64, AMDGPU::V_CMP_NLE_F16_t16_e64,
1170 AMDGPU::V_CMP_NLE_F32_e64, AMDGPU::V_CMP_NLE_F64_e64);
1171 case CmpInst::FCMP_UGE:
1172 return Select(AMDGPU::V_CMP_NLT_F16_e64, AMDGPU::V_CMP_NLT_F16_t16_e64,
1173 AMDGPU::V_CMP_NLT_F32_e64, AMDGPU::V_CMP_NLT_F64_e64);
1174 case CmpInst::FCMP_ULT:
1175 return Select(AMDGPU::V_CMP_NGE_F16_e64, AMDGPU::V_CMP_NGE_F16_t16_e64,
1176 AMDGPU::V_CMP_NGE_F32_e64, AMDGPU::V_CMP_NGE_F64_e64);
1177 case CmpInst::FCMP_ULE:
1178 return Select(AMDGPU::V_CMP_NGT_F16_e64, AMDGPU::V_CMP_NGT_F16_t16_e64,
1179 AMDGPU::V_CMP_NGT_F32_e64, AMDGPU::V_CMP_NGT_F64_e64);
1180 case CmpInst::FCMP_UNE:
1181 return Select(AMDGPU::V_CMP_NEQ_F16_e64, AMDGPU::V_CMP_NEQ_F16_t16_e64,
1182 AMDGPU::V_CMP_NEQ_F32_e64, AMDGPU::V_CMP_NEQ_F64_e64);
1183 case CmpInst::FCMP_TRUE:
1184 return Select(AMDGPU::V_CMP_TRU_F16_e64, AMDGPU::V_CMP_TRU_F16_t16_e64,
1185 AMDGPU::V_CMP_TRU_F32_e64, AMDGPU::V_CMP_TRU_F64_e64);
1186 case CmpInst::FCMP_FALSE:
1187 return Select(AMDGPU::V_CMP_F_F16_e64, AMDGPU::V_CMP_F_F16_t16_e64,
1188 AMDGPU::V_CMP_F_F32_e64, AMDGPU::V_CMP_F_F64_e64);
1189 }
1190}
1191
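// Map a predicate to a scalar S_CMP opcode writing SCC. 64-bit compares only
// support integer equality and inequality, and 16-bit floating-point compares
// require the SALU float instructions; -1 means the compare has to go to the
// VALU instead.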
1192int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
1193 unsigned Size) const {
1194 if (Size == 64) {
1195 if (!STI.hasScalarCompareEq64())
1196 return -1;
1197
1198 switch (P) {
1199 case CmpInst::ICMP_NE:
1200 return AMDGPU::S_CMP_LG_U64;
1201 case CmpInst::ICMP_EQ:
1202 return AMDGPU::S_CMP_EQ_U64;
1203 default:
1204 return -1;
1205 }
1206 }
1207
1208 if (Size == 32) {
1209 switch (P) {
1210 case CmpInst::ICMP_NE:
1211 return AMDGPU::S_CMP_LG_U32;
1212 case CmpInst::ICMP_EQ:
1213 return AMDGPU::S_CMP_EQ_U32;
1214 case CmpInst::ICMP_SGT:
1215 return AMDGPU::S_CMP_GT_I32;
1216 case CmpInst::ICMP_SGE:
1217 return AMDGPU::S_CMP_GE_I32;
1218 case CmpInst::ICMP_SLT:
1219 return AMDGPU::S_CMP_LT_I32;
1220 case CmpInst::ICMP_SLE:
1221 return AMDGPU::S_CMP_LE_I32;
1222 case CmpInst::ICMP_UGT:
1223 return AMDGPU::S_CMP_GT_U32;
1224 case CmpInst::ICMP_UGE:
1225 return AMDGPU::S_CMP_GE_U32;
1226 case CmpInst::ICMP_ULT:
1227 return AMDGPU::S_CMP_LT_U32;
1228 case CmpInst::ICMP_ULE:
1229 return AMDGPU::S_CMP_LE_U32;
1230 case CmpInst::FCMP_OEQ:
1231 return AMDGPU::S_CMP_EQ_F32;
1232 case CmpInst::FCMP_OGT:
1233 return AMDGPU::S_CMP_GT_F32;
1234 case CmpInst::FCMP_OGE:
1235 return AMDGPU::S_CMP_GE_F32;
1236 case CmpInst::FCMP_OLT:
1237 return AMDGPU::S_CMP_LT_F32;
1238 case CmpInst::FCMP_OLE:
1239 return AMDGPU::S_CMP_LE_F32;
1240 case CmpInst::FCMP_ONE:
1241 return AMDGPU::S_CMP_LG_F32;
1242 case CmpInst::FCMP_ORD:
1243 return AMDGPU::S_CMP_O_F32;
1244 case CmpInst::FCMP_UNO:
1245 return AMDGPU::S_CMP_U_F32;
1246 case CmpInst::FCMP_UEQ:
1247 return AMDGPU::S_CMP_NLG_F32;
1248 case CmpInst::FCMP_UGT:
1249 return AMDGPU::S_CMP_NLE_F32;
1250 case CmpInst::FCMP_UGE:
1251 return AMDGPU::S_CMP_NLT_F32;
1252 case CmpInst::FCMP_ULT:
1253 return AMDGPU::S_CMP_NGE_F32;
1254 case CmpInst::FCMP_ULE:
1255 return AMDGPU::S_CMP_NGT_F32;
1256 case CmpInst::FCMP_UNE:
1257 return AMDGPU::S_CMP_NEQ_F32;
1258 default:
1259 llvm_unreachable("Unknown condition code!");
1260 }
1261 }
1262
1263 if (Size == 16) {
1264 if (!STI.hasSALUFloatInsts())
1265 return -1;
1266
1267 switch (P) {
1268 case CmpInst::FCMP_OEQ:
1269 return AMDGPU::S_CMP_EQ_F16;
1270 case CmpInst::FCMP_OGT:
1271 return AMDGPU::S_CMP_GT_F16;
1272 case CmpInst::FCMP_OGE:
1273 return AMDGPU::S_CMP_GE_F16;
1274 case CmpInst::FCMP_OLT:
1275 return AMDGPU::S_CMP_LT_F16;
1276 case CmpInst::FCMP_OLE:
1277 return AMDGPU::S_CMP_LE_F16;
1278 case CmpInst::FCMP_ONE:
1279 return AMDGPU::S_CMP_LG_F16;
1280 case CmpInst::FCMP_ORD:
1281 return AMDGPU::S_CMP_O_F16;
1282 case CmpInst::FCMP_UNO:
1283 return AMDGPU::S_CMP_U_F16;
1284 case CmpInst::FCMP_UEQ:
1285 return AMDGPU::S_CMP_NLG_F16;
1286 case CmpInst::FCMP_UGT:
1287 return AMDGPU::S_CMP_NLE_F16;
1288 case CmpInst::FCMP_UGE:
1289 return AMDGPU::S_CMP_NLT_F16;
1290 case CmpInst::FCMP_ULT:
1291 return AMDGPU::S_CMP_NGE_F16;
1292 case CmpInst::FCMP_ULE:
1293 return AMDGPU::S_CMP_NGT_F16;
1294 case CmpInst::FCMP_UNE:
1295 return AMDGPU::S_CMP_NEQ_F16;
1296 default:
1297 llvm_unreachable("Unknown condition code!");
1298 }
1299 }
1300
1301 return -1;
1302}
1303
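// G_ICMP/G_FCMP with an SGPR ("scalar boolean") result are selected to S_CMP
// followed by a copy of SCC into the destination; results on the VCC bank use
// the corresponding V_CMP, which writes a lane mask directly.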
1304bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
1305
1306 MachineBasicBlock *BB = I.getParent();
1307 const DebugLoc &DL = I.getDebugLoc();
1308
1309 Register SrcReg = I.getOperand(2).getReg();
1310 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1311
1312 auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
1313
1314 Register CCReg = I.getOperand(0).getReg();
1315 if (!isVCC(CCReg, *MRI)) {
1316 int Opcode = getS_CMPOpcode(Pred, Size);
1317 if (Opcode == -1)
1318 return false;
1319 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode))
1320 .add(I.getOperand(2))
1321 .add(I.getOperand(3));
1322 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg)
1323 .addReg(AMDGPU::SCC);
1324 bool Ret =
1325 constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
1326 RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
1327 I.eraseFromParent();
1328 return Ret;
1329 }
1330
1331 if (I.getOpcode() == AMDGPU::G_FCMP)
1332 return false;
1333
1334 int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1335 if (Opcode == -1)
1336 return false;
1337
1338 MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode),
1339 I.getOperand(0).getReg())
1340 .add(I.getOperand(2))
1341 .add(I.getOperand(3));
1342 RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
1343 *TRI.getBoolRC(), *MRI);
1344 bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
1345 I.eraseFromParent();
1346 return Ret;
1347}
1348
1349bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
1350 Register Dst = I.getOperand(0).getReg();
1351 if (isVCC(Dst, *MRI))
1352 return false;
1353
1354 LLT DstTy = MRI->getType(Dst);
1355 if (DstTy.getSizeInBits() != STI.getWavefrontSize())
1356 return false;
1357
1358 MachineBasicBlock *BB = I.getParent();
1359 const DebugLoc &DL = I.getDebugLoc();
1360 Register SrcReg = I.getOperand(2).getReg();
1361 unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
1362
1363 // i1 inputs are not supported in GlobalISel.
1364 if (Size == 1)
1365 return false;
1366
1367 auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
1368 if (!CmpInst::isIntPredicate(Pred) && !CmpInst::isFPPredicate(Pred)) {
1369 BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Dst);
1370 I.eraseFromParent();
1371 return RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1372 }
1373
1374 const int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
1375 if (Opcode == -1)
1376 return false;
1377
1378 MachineInstrBuilder SelectedMI;
1379 MachineOperand &LHS = I.getOperand(2);
1380 MachineOperand &RHS = I.getOperand(3);
1381 auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
1382 auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
1383 Register Src0Reg =
1384 copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
1385 Register Src1Reg =
1386 copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
1387 SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
1388 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
1389 SelectedMI.addImm(Src0Mods);
1390 SelectedMI.addReg(Src0Reg);
1391 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
1392 SelectedMI.addImm(Src1Mods);
1393 SelectedMI.addReg(Src1Reg);
1394 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
1395 SelectedMI.addImm(0); // clamp
1396 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
1397 SelectedMI.addImm(0); // op_sel
1398
1399 RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
1400 if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
1401 return false;
1402
1403 I.eraseFromParent();
1404 return true;
1405}
1406
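// amdgcn.ballot: a constant-false operand becomes an all-zero mask, constant
// true copies EXEC, and a non-constant operand is simply copied, since the
// comparison feeding the ballot already produced a lane mask. For i64 ballots
// on wave32 the upper 32 bits are filled with zero via REG_SEQUENCE.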
1407bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
1408 MachineBasicBlock *BB = I.getParent();
1409 const DebugLoc &DL = I.getDebugLoc();
1410 Register DstReg = I.getOperand(0).getReg();
1411 const unsigned Size = MRI->getType(DstReg).getSizeInBits();
1412 const bool Is64 = Size == 64;
1413 const bool IsWave32 = (STI.getWavefrontSize() == 32);
1414
1415 // In the common case, the return type matches the wave size.
1416 // However we also support emitting i64 ballots in wave32 mode.
1417 if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
1418 return false;
1419
1420 std::optional<ValueAndVReg> Arg =
1421 getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
1422
1423 const auto BuildCopy = [&](Register SrcReg) {
1424 if (Size == STI.getWavefrontSize()) {
1425 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1426 .addReg(SrcReg);
1427 return;
1428 }
1429
1430 // If emitting a i64 ballot in wave32, fill the upper bits with zeroes.
1431 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1432 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
1433 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
1434 .addReg(SrcReg)
1435 .addImm(AMDGPU::sub0)
1436 .addReg(HiReg)
1437 .addImm(AMDGPU::sub1);
1438 };
1439
1440 if (Arg) {
1441 const int64_t Value = Arg->Value.getSExtValue();
1442 if (Value == 0) {
1443 unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
1444 BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
1445 } else if (Value == -1) // all ones
1446 BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
1447 else
1448 return false;
1449 } else
1450 BuildCopy(I.getOperand(2).getReg());
1451
1452 I.eraseFromParent();
1453 return true;
1454}
1455
1456bool AMDGPUInstructionSelector::selectInverseBallot(MachineInstr &I) const {
1457 MachineBasicBlock *BB = I.getParent();
1458 const DebugLoc &DL = I.getDebugLoc();
1459 const Register DstReg = I.getOperand(0).getReg();
1460 const Register MaskReg = I.getOperand(2).getReg();
1461
1462 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(MaskReg);
1463 I.eraseFromParent();
1464 return true;
1465}
1466
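// amdgcn.reloc.constant is materialized as a 32-bit move of the address of an
// external global named by the metadata operand, emitted with an ABS32_LO
// relocation so the final value is patched in at load time.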
1467bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
1468 Register DstReg = I.getOperand(0).getReg();
1469 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
1470 const TargetRegisterClass *DstRC = TRI.getRegClassForSizeOnBank(32, *DstBank);
1471 if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
1472 return false;
1473
1474 const bool IsVALU = DstBank->getID() == AMDGPU::VGPRRegBankID;
1475
1476 Module *M = MF->getFunction().getParent();
1477 const MDNode *Metadata = I.getOperand(2).getMetadata();
1478 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
1479 auto RelocSymbol = cast<GlobalVariable>(
1480 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
1481
1482 MachineBasicBlock *BB = I.getParent();
1483 BuildMI(*BB, &I, I.getDebugLoc(),
1484 TII.get(IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32), DstReg)
1485 .addGlobalAddress(RelocSymbol, 0, SIInstrInfo::MO_ABS32_LO);
1486
1487 I.eraseFromParent();
1488 return true;
1489}
1490
1491bool AMDGPUInstructionSelector::selectGroupStaticSize(MachineInstr &I) const {
1492 Triple::OSType OS = TM.getTargetTriple().getOS();
1493
1494 Register DstReg = I.getOperand(0).getReg();
1495 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
1496 unsigned Mov = DstRB->getID() == AMDGPU::SGPRRegBankID ?
1497 AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1498
1499 MachineBasicBlock *MBB = I.getParent();
1500 const DebugLoc &DL = I.getDebugLoc();
1501
1502 auto MIB = BuildMI(*MBB, &I, DL, TII.get(Mov), DstReg);
1503
1504 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) {
1505 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1506 MIB.addImm(MFI->getLDSSize());
1507 } else {
1508 Module *M = MF->getFunction().getParent();
1509 const GlobalValue *GV
1510 = Intrinsic::getDeclaration(M, Intrinsic::amdgcn_groupstaticsize);
1511 MIB.addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
1512 }
1513
1514 I.eraseFromParent();
1515 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1516}
1517
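// llvm.returnaddress: entry functions (kernels/shaders) and any non-zero
// depth return 0; otherwise the incoming return-address SGPR pair is marked
// live-in and copied to the destination.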
1518bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
1519 MachineBasicBlock *MBB = I.getParent();
1520 MachineFunction &MF = *MBB->getParent();
1521 const DebugLoc &DL = I.getDebugLoc();
1522
1523 MachineOperand &Dst = I.getOperand(0);
1524 Register DstReg = Dst.getReg();
1525 unsigned Depth = I.getOperand(2).getImm();
1526
1527 const TargetRegisterClass *RC
1528 = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
1529 if (!RC->hasSubClassEq(&AMDGPU::SGPR_64RegClass) ||
1530 !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
1531 return false;
1532
1533 // Check for kernel and shader functions
1534 if (Depth != 0 ||
1535 MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
1536 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
1537 .addImm(0);
1538 I.eraseFromParent();
1539 return true;
1540 }
1541
1542 MachineFrameInfo &MFI = MF.getFrameInfo();
1543 // There is a call to @llvm.returnaddress in this function
1544 MFI.setReturnAddressIsTaken(true);
1545
1546 // Get the return address reg and mark it as an implicit live-in
1547 Register ReturnAddrReg = TRI.getReturnAddressReg(MF);
1548 Register LiveIn = getFunctionLiveInPhysReg(MF, TII, ReturnAddrReg,
1549 AMDGPU::SReg_64RegClass, DL);
1550 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
1551 .addReg(LiveIn);
1552 I.eraseFromParent();
1553 return true;
1554}
1555
1556bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1557 // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
1558 // SelectionDAG uses for wave32 vs wave64.
1559 MachineBasicBlock *BB = MI.getParent();
1560 BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1561 .add(MI.getOperand(1));
1562
1563 Register Reg = MI.getOperand(1).getReg();
1564 MI.eraseFromParent();
1565
1566 if (!MRI->getRegClassOrNull(Reg))
1567 MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
1568 return true;
1569}
1570
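// amdgcn.ds.ordered.add / amdgcn.ds.ordered.swap. The index, wave_release,
// wave_done, shader type and dword count are packed into the immediate offset
// of a single DS_ORDERED_COUNT: offset = offset0 | (offset1 << 8), with
// offset0 = index * 4. E.g. (sketch, ignoring the shader-type and dword-count
// fields) index 1 with only wave_release set gives offset0 = 4, offset1 = 1,
// i.e. an immediate of 0x104.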
1571bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
1572 MachineInstr &MI, Intrinsic::ID IntrID) const {
1573 MachineBasicBlock *MBB = MI.getParent();
1574 MachineFunction *MF = MBB->getParent();
1575 const DebugLoc &DL = MI.getDebugLoc();
1576
1577 unsigned IndexOperand = MI.getOperand(7).getImm();
1578 bool WaveRelease = MI.getOperand(8).getImm() != 0;
1579 bool WaveDone = MI.getOperand(9).getImm() != 0;
1580
1581 if (WaveDone && !WaveRelease)
1582 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
1583
1584 unsigned OrderedCountIndex = IndexOperand & 0x3f;
1585 IndexOperand &= ~0x3f;
1586 unsigned CountDw = 0;
1587
1588 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10) {
1589 CountDw = (IndexOperand >> 24) & 0xf;
1590 IndexOperand &= ~(0xf << 24);
1591
1592 if (CountDw < 1 || CountDw > 4) {
1594 "ds_ordered_count: dword count must be between 1 and 4");
1595 }
1596 }
1597
1598 if (IndexOperand)
1599 report_fatal_error("ds_ordered_count: bad index operand");
1600
1601 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
1602 unsigned ShaderType = SIInstrInfo::getDSShaderTypeValue(*MF);
1603
1604 unsigned Offset0 = OrderedCountIndex << 2;
1605 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
1606
1607 if (STI.getGeneration() >= AMDGPUSubtarget::GFX10)
1608 Offset1 |= (CountDw - 1) << 6;
1609
1610 if (STI.getGeneration() < AMDGPUSubtarget::GFX11)
1611 Offset1 |= ShaderType << 2;
1612
1613 unsigned Offset = Offset0 | (Offset1 << 8);
1614
1615 Register M0Val = MI.getOperand(2).getReg();
1616 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1617 .addReg(M0Val);
1618
1619 Register DstReg = MI.getOperand(0).getReg();
1620 Register ValReg = MI.getOperand(3).getReg();
1621 MachineInstrBuilder DS =
1622 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_ORDERED_COUNT), DstReg)
1623 .addReg(ValReg)
1624 .addImm(Offset)
1625 .cloneMemRefs(MI);
1626
1627 if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI))
1628 return false;
1629
1630 bool Ret = constrainSelectedInstRegOperands(*DS, TII, TRI, RBI);
1631 MI.eraseFromParent();
1632 return Ret;
1633}
1634
1635static unsigned gwsIntrinToOpcode(unsigned IntrID) {
1636 switch (IntrID) {
1637 case Intrinsic::amdgcn_ds_gws_init:
1638 return AMDGPU::DS_GWS_INIT;
1639 case Intrinsic::amdgcn_ds_gws_barrier:
1640 return AMDGPU::DS_GWS_BARRIER;
1641 case Intrinsic::amdgcn_ds_gws_sema_v:
1642 return AMDGPU::DS_GWS_SEMA_V;
1643 case Intrinsic::amdgcn_ds_gws_sema_br:
1644 return AMDGPU::DS_GWS_SEMA_BR;
1645 case Intrinsic::amdgcn_ds_gws_sema_p:
1646 return AMDGPU::DS_GWS_SEMA_P;
1647 case Intrinsic::amdgcn_ds_gws_sema_release_all:
1648 return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
1649 default:
1650 llvm_unreachable("not a gws intrinsic");
1651 }
1652}
1653
1654bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
1655 Intrinsic::ID IID) const {
1656 if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
1657 !STI.hasGWSSemaReleaseAll()))
1658 return false;
1659
1660 // intrinsic ID, vsrc, offset
1661 const bool HasVSrc = MI.getNumOperands() == 3;
1662 assert(HasVSrc || MI.getNumOperands() == 2);
1663
1664 Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
1665 const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
1666 if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
1667 return false;
1668
1669 MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1670 unsigned ImmOffset;
1671
1672 MachineBasicBlock *MBB = MI.getParent();
1673 const DebugLoc &DL = MI.getDebugLoc();
1674
1675 MachineInstr *Readfirstlane = nullptr;
1676
1677 // If we legalized the VGPR input, strip out the readfirstlane to analyze the
1678 // incoming offset, in case there's an add of a constant. We'll have to put it
1679 // back later.
1680 if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
1681 Readfirstlane = OffsetDef;
1682 BaseOffset = OffsetDef->getOperand(1).getReg();
1683 OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
1684 }
1685
1686 if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
1687 // If we have a constant offset, try to use the 0 in m0 as the base.
1688 // TODO: Look into changing the default m0 initialization value. If the
1689 // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
1690 // the immediate offset.
1691
1692 ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
1693 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
1694 .addImm(0);
1695 } else {
1696 std::tie(BaseOffset, ImmOffset) =
1697 AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB);
1698
1699 if (Readfirstlane) {
1700 // We have the constant offset now, so put the readfirstlane back on the
1701 // variable component.
1702 if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
1703 return false;
1704
1705 Readfirstlane->getOperand(1).setReg(BaseOffset);
1706 BaseOffset = Readfirstlane->getOperand(0).getReg();
1707 } else {
1708 if (!RBI.constrainGenericRegister(BaseOffset,
1709 AMDGPU::SReg_32RegClass, *MRI))
1710 return false;
1711 }
1712
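// The constant part of the offset lives in the instruction's offset field, so
// shift the remaining variable component into the high half of m0, which the
// hardware combines with the offset field (see the comment below).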
1713 Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
1714 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
1715 .addReg(BaseOffset)
1716 .addImm(16)
1717 .setOperandDead(3); // Dead scc
1718
1719 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1720 .addReg(M0Base);
1721 }
1722
1723 // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
1724 // offset field) % 64. Some versions of the programming guide omit the m0
1725 // part, or claim it's from offset 0.
1726 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
1727
1728 if (HasVSrc) {
1729 Register VSrc = MI.getOperand(1).getReg();
1730 MIB.addReg(VSrc);
1731
1732 if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
1733 return false;
1734 }
1735
1736 MIB.addImm(ImmOffset)
1737 .cloneMemRefs(MI);
1738
1739 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::data0);
1740
1741 MI.eraseFromParent();
1742 return true;
1743}
1744
1745bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
1746 bool IsAppend) const {
1747 Register PtrBase = MI.getOperand(2).getReg();
1748 LLT PtrTy = MRI->getType(PtrBase);
1749 bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
1750
1751 unsigned Offset;
1752 std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
1753
1754 // TODO: Should this try to look through readfirstlane like GWS?
1755 if (!isDSOffsetLegal(PtrBase, Offset)) {
1756 PtrBase = MI.getOperand(2).getReg();
1757 Offset = 0;
1758 }
1759
1760 MachineBasicBlock *MBB = MI.getParent();
1761 const DebugLoc &DL = MI.getDebugLoc();
1762 const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
1763
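// ds_append/ds_consume take their base address in m0.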
1764 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
1765 .addReg(PtrBase);
1766 if (!RBI.constrainGenericRegister(PtrBase, AMDGPU::SReg_32RegClass, *MRI))
1767 return false;
1768
1769 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
1770 .addImm(Offset)
1771 .addImm(IsGDS ? -1 : 0)
1772 .cloneMemRefs(MI);
1773 MI.eraseFromParent();
1774 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1775}
1776
1777bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
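// When the whole workgroup fits in one wave, the barrier is a no-op; a
// wave_barrier is enough to keep the scheduler from reordering memory
// operations across it.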
1778 if (TM.getOptLevel() > CodeGenOptLevel::None) {
1779 unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
1780 if (WGSize <= STI.getWavefrontSize()) {
1781 MachineBasicBlock *MBB = MI.getParent();
1782 const DebugLoc &DL = MI.getDebugLoc();
1783 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::WAVE_BARRIER));
1784 MI.eraseFromParent();
1785 return true;
1786 }
1787 }
1788
1789 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
1790 if (STI.hasSplitBarriers()) {
1791 MachineBasicBlock *MBB = MI.getParent();
1792 const DebugLoc &DL = MI.getDebugLoc();
1793 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
1794 .addImm(AMDGPU::Barrier::WORKGROUP);
1795 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
1796 .addImm(AMDGPU::Barrier::WORKGROUP);
1797 MI.eraseFromParent();
1798 return true;
1799 }
1800
1801 return selectImpl(MI, *CoverageInfo);
1802}
1803
1804static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
1805 bool &IsTexFail) {
1806 if (TexFailCtrl)
1807 IsTexFail = true;
1808
1809 TFE = (TexFailCtrl & 0x1) ? true : false;
1810 TexFailCtrl &= ~(uint64_t)0x1;
1811 LWE = (TexFailCtrl & 0x2) ? true : false;
1812 TexFailCtrl &= ~(uint64_t)0x2;
1813
1814 return TexFailCtrl == 0;
1815}
1816
1817bool AMDGPUInstructionSelector::selectImageIntrinsic(
1818 MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
1819 MachineBasicBlock *MBB = MI.getParent();
1820 const DebugLoc &DL = MI.getDebugLoc();
1821
1822 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1823 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1824
1825 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
1826 unsigned IntrOpcode = Intr->BaseOpcode;
1827 const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
1828 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
1829 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
1830
1831 const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
1832
1833 Register VDataIn, VDataOut;
1834 LLT VDataTy;
1835 int NumVDataDwords = -1;
1836 bool IsD16 = MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16 ||
1837 MI.getOpcode() == AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16;
1838
1839 bool Unorm;
1840 if (!BaseOpcode->Sampler)
1841 Unorm = true;
1842 else
1843 Unorm = MI.getOperand(ArgOffset + Intr->UnormIndex).getImm() != 0;
1844
1845 bool TFE;
1846 bool LWE;
1847 bool IsTexFail = false;
1848 if (!parseTexFail(MI.getOperand(ArgOffset + Intr->TexFailCtrlIndex).getImm(),
1849 TFE, LWE, IsTexFail))
1850 return false;
1851
1852 const int Flags = MI.getOperand(ArgOffset + Intr->NumArgs).getImm();
1853 const bool IsA16 = (Flags & 1) != 0;
1854 const bool IsG16 = (Flags & 2) != 0;
1855
1856 // A16 implies 16 bit gradients if subtarget doesn't support G16
1857 if (IsA16 && !STI.hasG16() && !IsG16)
1858 return false;
1859
1860 unsigned DMask = 0;
1861 unsigned DMaskLanes = 0;
1862
1863 if (BaseOpcode->Atomic) {
1864 VDataOut = MI.getOperand(0).getReg();
1865 VDataIn = MI.getOperand(2).getReg();
1866 LLT Ty = MRI->getType(VDataIn);
1867
1868 // Be careful to allow atomic swap on 16-bit element vectors.
1869 const bool Is64Bit = BaseOpcode->AtomicX2 ?
1870 Ty.getSizeInBits() == 128 :
1871 Ty.getSizeInBits() == 64;
1872
1873 if (BaseOpcode->AtomicX2) {
1874 assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
1875
1876 DMask = Is64Bit ? 0xf : 0x3;
1877 NumVDataDwords = Is64Bit ? 4 : 2;
1878 } else {
1879 DMask = Is64Bit ? 0x3 : 0x1;
1880 NumVDataDwords = Is64Bit ? 2 : 1;
1881 }
1882 } else {
1883 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
1884 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
1885
1886 if (BaseOpcode->Store) {
1887 VDataIn = MI.getOperand(1).getReg();
1888 VDataTy = MRI->getType(VDataIn);
1889 NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
1890 } else {
1891 VDataOut = MI.getOperand(0).getReg();
1892 VDataTy = MRI->getType(VDataOut);
1893 NumVDataDwords = DMaskLanes;
1894
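// With packed D16, two 16-bit components share a single dword of return data.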
1895 if (IsD16 && !STI.hasUnpackedD16VMem())
1896 NumVDataDwords = (DMaskLanes + 1) / 2;
1897 }
1898 }
1899
1900 // Set G16 opcode
1901 if (Subtarget->hasG16() && IsG16) {
1902 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
1903 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
1904 assert(G16MappingInfo);
1905 IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
1906 }
1907
1908 // TODO: Check this in verifier.
1909 assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
1910
1911 unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
1912 if (BaseOpcode->Atomic)
1913 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
1914 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
1915 AMDGPU::CPol::VOLATILE))
1916 return false;
1917
1918 int NumVAddrRegs = 0;
1919 int NumVAddrDwords = 0;
1920 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
1921 // Skip the $noregs and 0s inserted during legalization.
1922 MachineOperand &AddrOp = MI.getOperand(ArgOffset + I);
1923 if (!AddrOp.isReg())
1924 continue; // XXX - Break?
1925
1926 Register Addr = AddrOp.getReg();
1927 if (!Addr)
1928 break;
1929
1930 ++NumVAddrRegs;
1931 NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
1932 }
1933
1934 // The legalizer preprocessed the intrinsic arguments. If we aren't using
1935 // NSA, these should have been packed into a single value in the first
1936 // address register
1937 const bool UseNSA =
1938 NumVAddrRegs != 1 &&
1939 (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
1940 : NumVAddrDwords == NumVAddrRegs);
1941 if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
1942 LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
1943 return false;
1944 }
1945
1946 if (IsTexFail)
1947 ++NumVDataDwords;
1948
1949 int Opcode = -1;
1950 if (IsGFX12Plus) {
1951 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
1952 NumVDataDwords, NumVAddrDwords);
1953 } else if (IsGFX11Plus) {
1954 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1955 UseNSA ? AMDGPU::MIMGEncGfx11NSA
1956 : AMDGPU::MIMGEncGfx11Default,
1957 NumVDataDwords, NumVAddrDwords);
1958 } else if (IsGFX10Plus) {
1959 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
1960 UseNSA ? AMDGPU::MIMGEncGfx10NSA
1961 : AMDGPU::MIMGEncGfx10Default,
1962 NumVDataDwords, NumVAddrDwords);
1963 } else {
1964 if (Subtarget->hasGFX90AInsts()) {
1965 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
1966 NumVDataDwords, NumVAddrDwords);
1967 if (Opcode == -1) {
1968 LLVM_DEBUG(
1969 dbgs()
1970 << "requested image instruction is not supported on this GPU\n");
1971 return false;
1972 }
1973 }
1974 if (Opcode == -1 &&
1975 STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
1976 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
1977 NumVDataDwords, NumVAddrDwords);
1978 if (Opcode == -1)
1979 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
1980 NumVDataDwords, NumVAddrDwords);
1981 }
1982 if (Opcode == -1)
1983 return false;
1984
1985 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
1986 .cloneMemRefs(MI);
1987
1988 if (VDataOut) {
1989 if (BaseOpcode->AtomicX2) {
1990 const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
1991
1992 Register TmpReg = MRI->createVirtualRegister(
1993 Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
1994 unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
1995
1996 MIB.addDef(TmpReg);
1997 if (!MRI->use_empty(VDataOut)) {
1998 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
1999 .addReg(TmpReg, RegState::Kill, SubReg);
2000 }
2001
2002 } else {
2003 MIB.addDef(VDataOut); // vdata output
2004 }
2005 }
2006
2007 if (VDataIn)
2008 MIB.addReg(VDataIn); // vdata input
2009
2010 for (int I = 0; I != NumVAddrRegs; ++I) {
2011 MachineOperand &SrcOp = MI.getOperand(ArgOffset + Intr->VAddrStart + I);
2012 if (SrcOp.isReg()) {
2013 assert(SrcOp.getReg() != 0);
2014 MIB.addReg(SrcOp.getReg());
2015 }
2016 }
2017
2018 MIB.addReg(MI.getOperand(ArgOffset + Intr->RsrcIndex).getReg());
2019 if (BaseOpcode->Sampler)
2020 MIB.addReg(MI.getOperand(ArgOffset + Intr->SampIndex).getReg());
2021
2022 MIB.addImm(DMask); // dmask
2023
2024 if (IsGFX10Plus)
2025 MIB.addImm(DimInfo->Encoding);
2026 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
2027 MIB.addImm(Unorm);
2028
2029 MIB.addImm(CPol);
2030 MIB.addImm(IsA16 && // a16 or r128
2031 STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
2032 if (IsGFX10Plus)
2033 MIB.addImm(IsA16 ? -1 : 0);
2034
2035 if (!Subtarget->hasGFX90AInsts()) {
2036 MIB.addImm(TFE); // tfe
2037 } else if (TFE) {
2038 LLVM_DEBUG(dbgs() << "TFE is not supported on this GPU\n");
2039 return false;
2040 }
2041
2042 if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
2043 MIB.addImm(LWE); // lwe
2044 if (!IsGFX10Plus)
2045 MIB.addImm(DimInfo->DA ? -1 : 0);
2046 if (BaseOpcode->HasD16)
2047 MIB.addImm(IsD16 ? -1 : 0);
2048
2049 MI.eraseFromParent();
2050 constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2051 TII.enforceOperandRCAlignment(*MIB, AMDGPU::OpName::vaddr);
2052 return true;
2053}
2054
2055// We need to handle this here because tablegen doesn't support matching
2056// instructions with multiple outputs.
2057bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
2058 MachineInstr &MI) const {
2059 Register Dst0 = MI.getOperand(0).getReg();
2060 Register Dst1 = MI.getOperand(1).getReg();
2061
2062 const DebugLoc &DL = MI.getDebugLoc();
2063 MachineBasicBlock *MBB = MI.getParent();
2064
2065 Register Addr = MI.getOperand(3).getReg();
2066 Register Data0 = MI.getOperand(4).getReg();
2067 Register Data1 = MI.getOperand(5).getReg();
2068 unsigned Offset = MI.getOperand(6).getImm();
2069
2070 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::DS_BVH_STACK_RTN_B32), Dst0)
2071 .addDef(Dst1)
2072 .addUse(Addr)
2073 .addUse(Data0)
2074 .addUse(Data1)
2075 .addImm(Offset)
2076 .cloneMemRefs(MI);
2077
2078 MI.eraseFromParent();
2079 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2080}
2081
2082bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
2083 MachineInstr &I) const {
2084 unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
2085 switch (IntrinsicID) {
2086 case Intrinsic::amdgcn_end_cf:
2087 return selectEndCfIntrinsic(I);
2088 case Intrinsic::amdgcn_ds_ordered_add:
2089 case Intrinsic::amdgcn_ds_ordered_swap:
2090 return selectDSOrderedIntrinsic(I, IntrinsicID);
2091 case Intrinsic::amdgcn_ds_gws_init:
2092 case Intrinsic::amdgcn_ds_gws_barrier:
2093 case Intrinsic::amdgcn_ds_gws_sema_v:
2094 case Intrinsic::amdgcn_ds_gws_sema_br:
2095 case Intrinsic::amdgcn_ds_gws_sema_p:
2096 case Intrinsic::amdgcn_ds_gws_sema_release_all:
2097 return selectDSGWSIntrinsic(I, IntrinsicID);
2098 case Intrinsic::amdgcn_ds_append:
2099 return selectDSAppendConsume(I, true);
2100 case Intrinsic::amdgcn_ds_consume:
2101 return selectDSAppendConsume(I, false);
2102 case Intrinsic::amdgcn_s_barrier:
2103 return selectSBarrier(I);
2104 case Intrinsic::amdgcn_raw_buffer_load_lds:
2105 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
2106 case Intrinsic::amdgcn_struct_buffer_load_lds:
2107 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
2108 return selectBufferLoadLds(I);
2109 case Intrinsic::amdgcn_global_load_lds:
2110 return selectGlobalLoadLds(I);
2111 case Intrinsic::amdgcn_exp_compr:
2112 if (!STI.hasCompressedExport()) {
2113 Function &F = I.getMF()->getFunction();
2114 DiagnosticInfoUnsupported NoFpRet(
2115 F, "intrinsic not supported on subtarget", I.getDebugLoc(), DS_Error);
2116 F.getContext().diagnose(NoFpRet);
2117 return false;
2118 }
2119 break;
2120 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
2121 return selectDSBvhStackIntrinsic(I);
2122 case Intrinsic::amdgcn_s_barrier_init:
2123 case Intrinsic::amdgcn_s_barrier_join:
2124 case Intrinsic::amdgcn_s_wakeup_barrier:
2125 case Intrinsic::amdgcn_s_get_barrier_state:
2126 return selectNamedBarrierInst(I, IntrinsicID);
2127 case Intrinsic::amdgcn_s_barrier_signal_isfirst:
2128 case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
2129 return selectSBarrierSignalIsfirst(I, IntrinsicID);
2130 case Intrinsic::amdgcn_s_barrier_leave:
2131 return selectSBarrierLeave(I);
2132 }
2133 return selectImpl(I, *CoverageInfo);
2134}
2135
2136bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
2137 if (selectImpl(I, *CoverageInfo))
2138 return true;
2139
2140 MachineBasicBlock *BB = I.getParent();
2141 const DebugLoc &DL = I.getDebugLoc();
2142
2143 Register DstReg = I.getOperand(0).getReg();
2144 unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
2145 assert(Size <= 32 || Size == 64);
2146 const MachineOperand &CCOp = I.getOperand(1);
2147 Register CCReg = CCOp.getReg();
2148 if (!isVCC(CCReg, *MRI)) {
2149 unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
2150 AMDGPU::S_CSELECT_B32;
2151 MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
2152 .addReg(CCReg);
2153
2154 // The generic constrainSelectedInstRegOperands doesn't work for the scc register
2155 // bank, because it does not cover the register class that we use to represent
2156 // it with. So we need to manually set the register class here.
2157 if (!MRI->getRegClassOrNull(CCReg))
2158 MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
2159 MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
2160 .add(I.getOperand(2))
2161 .add(I.getOperand(3));
2162
2163 bool Ret = false;
2164 Ret |= constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2165 Ret |= constrainSelectedInstRegOperands(*CopySCC, TII, TRI, RBI);
2166 I.eraseFromParent();
2167 return Ret;
2168 }
2169
2170 // Wide VGPR select should have been split in RegBankSelect.
2171 if (Size > 32)
2172 return false;
2173
2174 MachineInstr *Select =
2175 BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
2176 .addImm(0)
2177 .add(I.getOperand(3))
2178 .addImm(0)
2179 .add(I.getOperand(2))
2180 .add(I.getOperand(1));
2181
2182 bool Ret = constrainSelectedInstRegOperands(*Select, TII, TRI, RBI);
2183 I.eraseFromParent();
2184 return Ret;
2185}
2186
2187static int sizeToSubRegIndex(unsigned Size) {
2188 switch (Size) {
2189 case 32:
2190 return AMDGPU::sub0;
2191 case 64:
2192 return AMDGPU::sub0_sub1;
2193 case 96:
2194 return AMDGPU::sub0_sub1_sub2;
2195 case 128:
2196 return AMDGPU::sub0_sub1_sub2_sub3;
2197 case 256:
2198 return AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7;
2199 default:
2200 if (Size < 32)
2201 return AMDGPU::sub0;
2202 if (Size > 256)
2203 return -1;
2204 return sizeToSubRegIndex(llvm::bit_ceil(Size));
2205 }
2206}
2207
2208bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
2209 Register DstReg = I.getOperand(0).getReg();
2210 Register SrcReg = I.getOperand(1).getReg();
2211 const LLT DstTy = MRI->getType(DstReg);
2212 const LLT SrcTy = MRI->getType(SrcReg);
2213 const LLT S1 = LLT::scalar(1);
2214
2215 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2216 const RegisterBank *DstRB;
2217 if (DstTy == S1) {
2218 // This is a special case. We don't treat s1 for legalization artifacts as
2219 // vcc booleans.
2220 DstRB = SrcRB;
2221 } else {
2222 DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2223 if (SrcRB != DstRB)
2224 return false;
2225 }
2226
2227 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
2228
2229 unsigned DstSize = DstTy.getSizeInBits();
2230 unsigned SrcSize = SrcTy.getSizeInBits();
2231
2232 const TargetRegisterClass *SrcRC =
2233 TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB);
2234 const TargetRegisterClass *DstRC =
2235 TRI.getRegClassForSizeOnBank(DstSize, *DstRB);
2236 if (!SrcRC || !DstRC)
2237 return false;
2238
2239 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2240 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
2241 LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
2242 return false;
2243 }
2244
2245 if (DstTy == LLT::fixed_vector(2, 16) && SrcTy == LLT::fixed_vector(2, 32)) {
2246 MachineBasicBlock *MBB = I.getParent();
2247 const DebugLoc &DL = I.getDebugLoc();
2248
2249 Register LoReg = MRI->createVirtualRegister(DstRC);
2250 Register HiReg = MRI->createVirtualRegister(DstRC);
2251 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
2252 .addReg(SrcReg, 0, AMDGPU::sub0);
2253 BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
2254 .addReg(SrcReg, 0, AMDGPU::sub1);
2255
2256 if (IsVALU && STI.hasSDWA()) {
2257 // Write the low 16-bits of the high element into the high 16-bits of the
2258 // low element.
2259 MachineInstr *MovSDWA =
2260 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
2261 .addImm(0) // $src0_modifiers
2262 .addReg(HiReg) // $src0
2263 .addImm(0) // $clamp
2264 .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
2265 .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
2266 .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
2267 .addReg(LoReg, RegState::Implicit);
2268 MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
2269 } else {
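// No SDWA: assemble the packed value with a shift, a mask, and an or.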
2270 Register TmpReg0 = MRI->createVirtualRegister(DstRC);
2271 Register TmpReg1 = MRI->createVirtualRegister(DstRC);
2272 Register ImmReg = MRI->createVirtualRegister(DstRC);
2273 if (IsVALU) {
2274 BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
2275 .addImm(16)
2276 .addReg(HiReg);
2277 } else {
2278 BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
2279 .addReg(HiReg)
2280 .addImm(16)
2281 .setOperandDead(3); // Dead scc
2282 }
2283
2284 unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
2285 unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2286 unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
2287
2288 BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
2289 .addImm(0xffff);
2290 auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
2291 .addReg(LoReg)
2292 .addReg(ImmReg);
2293 auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
2294 .addReg(TmpReg0)
2295 .addReg(TmpReg1);
2296
2297 if (!IsVALU) {
2298 And.setOperandDead(3); // Dead scc
2299 Or.setOperandDead(3); // Dead scc
2300 }
2301 }
2302
2303 I.eraseFromParent();
2304 return true;
2305 }
2306
2307 if (!DstTy.isScalar())
2308 return false;
2309
2310 if (SrcSize > 32) {
2311 int SubRegIdx = sizeToSubRegIndex(DstSize);
2312 if (SubRegIdx == -1)
2313 return false;
2314
2315 // Deal with weird cases where the class only partially supports the subreg
2316 // index.
2317 const TargetRegisterClass *SrcWithSubRC
2318 = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
2319 if (!SrcWithSubRC)
2320 return false;
2321
2322 if (SrcWithSubRC != SrcRC) {
2323 if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
2324 return false;
2325 }
2326
2327 I.getOperand(1).setSubReg(SubRegIdx);
2328 }
2329
2330 I.setDesc(TII.get(TargetOpcode::COPY));
2331 return true;
2332}
2333
2334/// \returns true if a bitmask for \p Size bits will be an inline immediate.
2335static bool shouldUseAndMask(unsigned Size, unsigned &Mask) {
2336 Mask = maskTrailingOnes<unsigned>(Size);
2337 int SignedMask = static_cast<int>(Mask);
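// AMDGPU inline immediates cover the signed integers in [-16, 64].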
2338 return SignedMask >= -16 && SignedMask <= 64;
2339}
2340
2341// Like RegisterBankInfo::getRegBank, but don't assume vcc for s1.
2342const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
2343 Register Reg, const MachineRegisterInfo &MRI,
2344 const TargetRegisterInfo &TRI) const {
2345 const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2346 if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
2347 return RB;
2348
2349 // Ignore the type, since we don't use vcc in artifacts.
2350 if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
2351 return &RBI.getRegBankFromRegClass(*RC, LLT());
2352 return nullptr;
2353}
2354
2355bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
2356 bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
2357 bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
2358 const DebugLoc &DL = I.getDebugLoc();
2359 MachineBasicBlock &MBB = *I.getParent();
2360 const Register DstReg = I.getOperand(0).getReg();
2361 const Register SrcReg = I.getOperand(1).getReg();
2362
2363 const LLT DstTy = MRI->getType(DstReg);
2364 const LLT SrcTy = MRI->getType(SrcReg);
2365 const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
2366 I.getOperand(2).getImm() : SrcTy.getSizeInBits();
2367 const unsigned DstSize = DstTy.getSizeInBits();
2368 if (!DstTy.isScalar())
2369 return false;
2370
2371 // Artifact casts should never use vcc.
2372 const RegisterBank *SrcBank = getArtifactRegBank(SrcReg, *MRI, TRI);
2373
2374 // FIXME: This should probably be illegal and split earlier.
2375 if (I.getOpcode() == AMDGPU::G_ANYEXT) {
2376 if (DstSize <= 32)
2377 return selectCOPY(I);
2378
2379 const TargetRegisterClass *SrcRC =
2380 TRI.getRegClassForTypeOnBank(SrcTy, *SrcBank);
2381 const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
2382 const TargetRegisterClass *DstRC =
2383 TRI.getRegClassForSizeOnBank(DstSize, *DstBank);
2384
2385 Register UndefReg = MRI->createVirtualRegister(SrcRC);
2386 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2387 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2388 .addReg(SrcReg)
2389 .addImm(AMDGPU::sub0)
2390 .addReg(UndefReg)
2391 .addImm(AMDGPU::sub1);
2392 I.eraseFromParent();
2393
2394 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) &&
2395 RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI);
2396 }
2397
2398 if (SrcBank->getID() == AMDGPU::VGPRRegBankID && DstSize <= 32) {
2399 // 64-bit should have been split up in RegBankSelect
2400
2401 // Try to use an and with a mask if it will save code size.
2402 unsigned Mask;
2403 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2404 MachineInstr *ExtI =
2405 BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
2406 .addImm(Mask)
2407 .addReg(SrcReg);
2408 I.eraseFromParent();
2409 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2410 }
2411
2412 const unsigned BFE = Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
2413 MachineInstr *ExtI =
2414 BuildMI(MBB, I, DL, TII.get(BFE), DstReg)
2415 .addReg(SrcReg)
2416 .addImm(0) // Offset
2417 .addImm(SrcSize); // Width
2418 I.eraseFromParent();
2419 return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
2420 }
2421
2422 if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
2423 const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
2424 AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
2425 if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
2426 return false;
2427
2428 if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
2429 const unsigned SextOpc = SrcSize == 8 ?
2430 AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
2431 BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
2432 .addReg(SrcReg);
2433 I.eraseFromParent();
2434 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2435 }
2436
2437 // Using a single 32-bit SALU to calculate the high half is smaller than
2438 // S_BFE with a literal constant operand.
2439 if (DstSize > 32 && SrcSize == 32) {
2440 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2441 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2442 if (Signed) {
2443 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
2444 .addReg(SrcReg, 0, SubReg)
2445 .addImm(31)
2446 .setOperandDead(3); // Dead scc
2447 } else {
2448 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
2449 .addImm(0);
2450 }
2451 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2452 .addReg(SrcReg, 0, SubReg)
2453 .addImm(AMDGPU::sub0)
2454 .addReg(HiReg)
2455 .addImm(AMDGPU::sub1);
2456 I.eraseFromParent();
2457 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass,
2458 *MRI);
2459 }
2460
2461 const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
2462 const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
2463
2464 // Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
2465 if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
2466 // We need a 64-bit register source, but the high bits don't matter.
2467 Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
2468 Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2469 unsigned SubReg = InReg ? AMDGPU::sub0 : AMDGPU::NoSubRegister;
2470
2471 BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
2472 BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
2473 .addReg(SrcReg, 0, SubReg)
2474 .addImm(AMDGPU::sub0)
2475 .addReg(UndefReg)
2476 .addImm(AMDGPU::sub1);
2477
2478 BuildMI(MBB, I, DL, TII.get(BFE64), DstReg)
2479 .addReg(ExtReg)
2480 .addImm(SrcSize << 16);
2481
2482 I.eraseFromParent();
2483 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
2484 }
2485
2486 unsigned Mask;
2487 if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
2488 BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
2489 .addReg(SrcReg)
2490 .addImm(Mask)
2491 .setOperandDead(3); // Dead scc
2492 } else {
2493 BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
2494 .addReg(SrcReg)
2495 .addImm(SrcSize << 16);
2496 }
2497
2498 I.eraseFromParent();
2499 return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
2500 }
2501
2502 return false;
2503}
2504
2505 static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
2506 Register &Out) {
2507 Register LShlSrc;
2508 if (mi_match(In, MRI,
2509 m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) {
2510 Out = LShlSrc;
2511 return true;
2512 }
2513 return false;
2514}
2515
2516bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
2517 if (!Subtarget->hasSALUFloatInsts())
2518 return false;
2519
2520 Register Dst = I.getOperand(0).getReg();
2521 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2522 if (DstRB->getID() != AMDGPU::SGPRRegBankID)
2523 return false;
2524
2525 Register Src = I.getOperand(1).getReg();
2526
2527 if (MRI->getType(Dst) == LLT::scalar(32) &&
2528 MRI->getType(Src) == LLT::scalar(16)) {
2529 if (isExtractHiElt(*MRI, Src, Src)) {
2530 MachineBasicBlock *BB = I.getParent();
2531 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
2532 .addUse(Src);
2533 I.eraseFromParent();
2534 return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
2535 }
2536 }
2537
2538 return false;
2539}
2540
2541bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
2542 MachineBasicBlock *BB = I.getParent();
2543 MachineOperand &ImmOp = I.getOperand(1);
2544 Register DstReg = I.getOperand(0).getReg();
2545 unsigned Size = MRI->getType(DstReg).getSizeInBits();
2546 bool IsFP = false;
2547
2548 // The AMDGPU backend only supports Imm operands and not CImm or FPImm.
2549 if (ImmOp.isFPImm()) {
2550 const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
2551 ImmOp.ChangeToImmediate(Imm.getZExtValue());
2552 IsFP = true;
2553 } else if (ImmOp.isCImm()) {
2554 ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
2555 } else {
2556 llvm_unreachable("Not supported by g_constants");
2557 }
2558
2559 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2560 const bool IsSgpr = DstRB->getID() == AMDGPU::SGPRRegBankID;
2561
2562 unsigned Opcode;
2563 if (DstRB->getID() == AMDGPU::VCCRegBankID) {
2564 Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2565 } else if (Size == 64 &&
2566 AMDGPU::isValid32BitLiteral(I.getOperand(1).getImm(), IsFP)) {
2567 Opcode = IsSgpr ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::V_MOV_B64_PSEUDO;
2568 I.setDesc(TII.get(Opcode));
2569 I.addImplicitDefUseOperands(*MF);
2570 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2571 } else {
2572 Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2573
2574 // We should never produce s1 values on banks other than VCC. If the user of
2575 // this already constrained the register, we may incorrectly think it's VCC
2576 // if it wasn't originally.
2577 if (Size == 1)
2578 return false;
2579 }
2580
2581 if (Size != 64) {
2582 I.setDesc(TII.get(Opcode));
2583 I.addImplicitDefUseOperands(*MF);
2584 return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2585 }
2586
2587 const DebugLoc &DL = I.getDebugLoc();
2588
2589 APInt Imm(Size, I.getOperand(1).getImm());
2590
2591 MachineInstr *ResInst;
2592 if (IsSgpr && TII.isInlineConstant(Imm)) {
2593 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
2594 .addImm(I.getOperand(1).getImm());
2595 } else {
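// Non-inline 64-bit constants are built from two 32-bit moves joined with a
// REG_SEQUENCE.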
2596 const TargetRegisterClass *RC = IsSgpr ?
2597 &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
2598 Register LoReg = MRI->createVirtualRegister(RC);
2599 Register HiReg = MRI->createVirtualRegister(RC);
2600
2601 BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
2602 .addImm(Imm.trunc(32).getZExtValue());
2603
2604 BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
2605 .addImm(Imm.ashr(32).getZExtValue());
2606
2607 ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
2608 .addReg(LoReg)
2609 .addImm(AMDGPU::sub0)
2610 .addReg(HiReg)
2611 .addImm(AMDGPU::sub1);
2612 }
2613
2614 // We can't call constrainSelectedInstRegOperands here, because it doesn't
2615 // work for target independent opcodes
2616 I.eraseFromParent();
2617 const TargetRegisterClass *DstRC =
2618 TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
2619 if (!DstRC)
2620 return true;
2621 return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
2622}
2623
2624bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
2625 // Only manually handle the f64 SGPR case.
2626 //
2627 // FIXME: This is a workaround for 2.5 different tablegen problems. Because
2628 // the bit ops theoretically have a second result due to the implicit def of
2629 // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
2630 // that is easy by disabling the check. The result works, but uses a
2631 // nonsensical sreg32orlds_and_sreg_1 regclass.
2632 //
2633 // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
2634 // the variadic REG_SEQUENCE operands.
2635
2636 Register Dst = MI.getOperand(0).getReg();
2637 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2638 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2639 MRI->getType(Dst) != LLT::scalar(64))
2640 return false;
2641
2642 Register Src = MI.getOperand(1).getReg();
2643 MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
2644 if (Fabs)
2645 Src = Fabs->getOperand(1).getReg();
2646
2647 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2648 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2649 return false;
2650
2651 MachineBasicBlock *BB = MI.getParent();
2652 const DebugLoc &DL = MI.getDebugLoc();
2653 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2654 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2655 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2656 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2657
2658 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2659 .addReg(Src, 0, AMDGPU::sub0);
2660 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2661 .addReg(Src, 0, AMDGPU::sub1);
2662 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2663 .addImm(0x80000000);
2664
2665 // Set or toggle sign bit.
2666 unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
2667 BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
2668 .addReg(HiReg)
2669 .addReg(ConstReg)
2670 .setOperandDead(3); // Dead scc
2671 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2672 .addReg(LoReg)
2673 .addImm(AMDGPU::sub0)
2674 .addReg(OpReg)
2675 .addImm(AMDGPU::sub1);
2676 MI.eraseFromParent();
2677 return true;
2678}
2679
2680// FIXME: This is a workaround for the same tablegen problems as G_FNEG
2681bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
2682 Register Dst = MI.getOperand(0).getReg();
2683 const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
2684 if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
2685 MRI->getType(Dst) != LLT::scalar(64))
2686 return false;
2687
2688 Register Src = MI.getOperand(1).getReg();
2689 MachineBasicBlock *BB = MI.getParent();
2690 const DebugLoc &DL = MI.getDebugLoc();
2691 Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2692 Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2693 Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2694 Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
2695
2696 if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
2697 !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
2698 return false;
2699
2700 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
2701 .addReg(Src, 0, AMDGPU::sub0);
2702 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
2703 .addReg(Src, 0, AMDGPU::sub1);
2704 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
2705 .addImm(0x7fffffff);
2706
2707 // Clear sign bit.
2708 // TODO: Should this use S_BITSET0_*?
2709 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
2710 .addReg(HiReg)
2711 .addReg(ConstReg)
2712 .setOperandDead(3); // Dead scc
2713 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
2714 .addReg(LoReg)
2715 .addImm(AMDGPU::sub0)
2716 .addReg(OpReg)
2717 .addImm(AMDGPU::sub1);
2718
2719 MI.eraseFromParent();
2720 return true;
2721}
2722
2723static bool isConstant(const MachineInstr &MI) {
2724 return MI.getOpcode() == TargetOpcode::G_CONSTANT;
2725}
2726
2727void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
2728 const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
2729
2730 unsigned OpNo = Load.getOpcode() == AMDGPU::G_PREFETCH ? 0 : 1;
2731 const MachineInstr *PtrMI =
2732 MRI.getUniqueVRegDef(Load.getOperand(OpNo).getReg());
2733
2734 assert(PtrMI);
2735
2736 if (PtrMI->getOpcode() != TargetOpcode::G_PTR_ADD)
2737 return;
2738
2739 GEPInfo GEPInfo;
2740
2741 for (unsigned i = 1; i != 3; ++i) {
2742 const MachineOperand &GEPOp = PtrMI->getOperand(i);
2743 const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
2744 assert(OpDef);
2745 if (i == 2 && isConstant(*OpDef)) {
2746 // TODO: Could handle constant base + variable offset, but a combine
2747 // probably should have commuted it.
2748 assert(GEPInfo.Imm == 0);
2749 GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
2750 continue;
2751 }
2752 const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
2753 if (OpBank->getID() == AMDGPU::SGPRRegBankID)
2754 GEPInfo.SgprParts.push_back(GEPOp.getReg());
2755 else
2756 GEPInfo.VgprParts.push_back(GEPOp.getReg());
2757 }
2758
2759 AddrInfo.push_back(GEPInfo);
2760 getAddrModeInfo(*PtrMI, MRI, AddrInfo);
2761}
2762
2763bool AMDGPUInstructionSelector::isSGPR(Register Reg) const {
2764 return RBI.getRegBank(Reg, *MRI, TRI)->getID() == AMDGPU::SGPRRegBankID;
2765}
2766
2767bool AMDGPUInstructionSelector::isInstrUniform(const MachineInstr &MI) const {
2768 if (!MI.hasOneMemOperand())
2769 return false;
2770
2771 const MachineMemOperand *MMO = *MI.memoperands_begin();
2772 const Value *Ptr = MMO->getValue();
2773
2774 // UndefValue means this is a load of a kernel input. These are uniform.
2775 // Sometimes LDS instructions have constant pointers.
2776 // If Ptr is null, then that means this mem operand contains a
2777 // PseudoSourceValue like GOT.
2778 if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
2779 isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
2780 return true;
2781
2782 if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
2783 return true;
2784
2785 if (MI.getOpcode() == AMDGPU::G_PREFETCH)
2786 return RBI.getRegBank(MI.getOperand(0).getReg(), *MRI, TRI)->getID() ==
2787 AMDGPU::SGPRRegBankID;
2788
2789 const Instruction *I = dyn_cast<Instruction>(Ptr);
2790 return I && I->getMetadata("amdgpu.uniform");
2791}
2792
2793bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
2794 for (const GEPInfo &GEPInfo : AddrInfo) {
2795 if (!GEPInfo.VgprParts.empty())
2796 return true;
2797 }
2798 return false;
2799}
2800
2801void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
2802 const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
2803 unsigned AS = PtrTy.getAddressSpace();
2804 if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
2805 STI.ldsRequiresM0Init()) {
2806 MachineBasicBlock *BB = I.getParent();
2807
2808 // If DS instructions require M0 initialization, insert it before selecting.
2809 BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2810 .addImm(-1);
2811 }
2812}
2813
2814bool AMDGPUInstructionSelector::selectG_LOAD_STORE_ATOMICRMW(
2815 MachineInstr &I) const {
2816 initM0(I);
2817 return selectImpl(I, *CoverageInfo);
2818}
2819
2820 static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
2821 if (Reg.isPhysical())
2822 return false;
2823
2824 MachineInstr &MI = *MRI.getUniqueVRegDef(Reg);
2825 const unsigned Opcode = MI.getOpcode();
2826
2827 if (Opcode == AMDGPU::COPY)
2828 return isVCmpResult(MI.getOperand(1).getReg(), MRI);
2829
2830 if (Opcode == AMDGPU::G_AND || Opcode == AMDGPU::G_OR ||
2831 Opcode == AMDGPU::G_XOR)
2832 return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
2833 isVCmpResult(MI.getOperand(2).getReg(), MRI);
2834
2835 if (auto *GI = dyn_cast<GIntrinsic>(&MI))
2836 return GI->is(Intrinsic::amdgcn_class);
2837
2838 return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
2839}
2840
2841bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
2842 MachineBasicBlock *BB = I.getParent();
2843 MachineOperand &CondOp = I.getOperand(0);
2844 Register CondReg = CondOp.getReg();
2845 const DebugLoc &DL = I.getDebugLoc();
2846
2847 unsigned BrOpcode;
2848 Register CondPhysReg;
2849 const TargetRegisterClass *ConstrainRC;
2850
2851 // In SelectionDAG, we inspect the IR block for uniformity metadata to decide
2852 // whether the branch is uniform when selecting the instruction. In
2853 // GlobalISel, we should push that decision into RegBankSelect. Assume for now
2854 // RegBankSelect knows what it's doing if the branch condition is scc, even
2855 // though it currently does not.
2856 if (!isVCC(CondReg, *MRI)) {
2857 if (MRI->getType(CondReg) != LLT::scalar(32))
2858 return false;
2859
2860 CondPhysReg = AMDGPU::SCC;
2861 BrOpcode = AMDGPU::S_CBRANCH_SCC1;
2862 ConstrainRC = &AMDGPU::SReg_32RegClass;
2863 } else {
2864 // FIXME: Should scc->vcc copies and with exec?
2865
2866 // Unless the value of CondReg is a result of a V_CMP* instruction then we
2867 // need to insert an and with exec.
2868 if (!isVCmpResult(CondReg, *MRI)) {
2869 const bool Is64 = STI.isWave64();
2870 const unsigned Opcode = Is64 ? AMDGPU::S_AND_B64 : AMDGPU::S_AND_B32;
2871 const Register Exec = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
2872
2873 Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
2874 BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
2875 .addReg(CondReg)
2876 .addReg(Exec)
2877 .setOperandDead(3); // Dead scc
2878 CondReg = TmpReg;
2879 }
2880
2881 CondPhysReg = TRI.getVCC();
2882 BrOpcode = AMDGPU::S_CBRANCH_VCCNZ;
2883 ConstrainRC = TRI.getBoolRC();
2884 }
2885
2886 if (!MRI->getRegClassOrNull(CondReg))
2887 MRI->setRegClass(CondReg, ConstrainRC);
2888
2889 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
2890 .addReg(CondReg);
2891 BuildMI(*BB, &I, DL, TII.get(BrOpcode))
2892 .addMBB(I.getOperand(1).getMBB());
2893
2894 I.eraseFromParent();
2895 return true;
2896}
2897
2898bool AMDGPUInstructionSelector::selectG_GLOBAL_VALUE(
2899 MachineInstr &I) const {
2900 Register DstReg = I.getOperand(0).getReg();
2901 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2902 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2903 I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
2904 if (IsVGPR)
2905 I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
2906
2907 return RBI.constrainGenericRegister(
2908 DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
2909}
2910
2911bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
2912 Register DstReg = I.getOperand(0).getReg();
2913 Register SrcReg = I.getOperand(1).getReg();
2914 Register MaskReg = I.getOperand(2).getReg();
2915 LLT Ty = MRI->getType(DstReg);
2916 LLT MaskTy = MRI->getType(MaskReg);
2917 MachineBasicBlock *BB = I.getParent();
2918 const DebugLoc &DL = I.getDebugLoc();
2919
2920 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
2921 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
2922 const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
2923 const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
2924 if (DstRB != SrcRB) // Should only happen for hand written MIR.
2925 return false;
2926
2927 // Try to avoid emitting a bit operation when we only need to touch half of
2928 // the 64-bit pointer.
2929 APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64);
2930 const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
2931 const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
2932
2933 const bool CanCopyLow32 = (MaskOnes & MaskLo32) == MaskLo32;
2934 const bool CanCopyHi32 = (MaskOnes & MaskHi32) == MaskHi32;
2935
2936 if (!IsVGPR && Ty.getSizeInBits() == 64 &&
2937 !CanCopyLow32 && !CanCopyHi32) {
2938 auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
2939 .addReg(SrcReg)
2940 .addReg(MaskReg)
2941 .setOperandDead(3); // Dead scc
2942 I.eraseFromParent();
2943 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
2944 }
2945
2946 unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
2947 const TargetRegisterClass &RegRC
2948 = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
2949
2950 const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB);
2951 const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB);
2952 const TargetRegisterClass *MaskRC =
2953 TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB);
2954
2955 if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
2956 !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
2957 !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
2958 return false;
2959
2960 if (Ty.getSizeInBits() == 32) {
2961 assert(MaskTy.getSizeInBits() == 32 &&
2962 "ptrmask should have been narrowed during legalize");
2963
2964 auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
2965 .addReg(SrcReg)
2966 .addReg(MaskReg);
2967
2968 if (!IsVGPR)
2969 NewOp.setOperandDead(3); // Dead scc
2970 I.eraseFromParent();
2971 return true;
2972 }
2973
2974 Register HiReg = MRI->createVirtualRegister(&RegRC);
2975 Register LoReg = MRI->createVirtualRegister(&RegRC);
2976
2977 // Extract the subregisters from the source pointer.
2978 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
2979 .addReg(SrcReg, 0, AMDGPU::sub0);
2980 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
2981 .addReg(SrcReg, 0, AMDGPU::sub1);
2982
2983 Register MaskedLo, MaskedHi;
2984
2985 if (CanCopyLow32) {
2986 // If all the bits in the low half are 1, we only need a copy for it.
2987 MaskedLo = LoReg;
2988 } else {
2989 // Extract the mask subregister and apply the and.
2990 Register MaskLo = MRI->createVirtualRegister(&RegRC);
2991 MaskedLo = MRI->createVirtualRegister(&RegRC);
2992
2993 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
2994 .addReg(MaskReg, 0, AMDGPU::sub0);
2995 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
2996 .addReg(LoReg)
2997 .addReg(MaskLo);
2998 }
2999
3000 if (CanCopyHi32) {
3001 // If all the bits in the high half are 1, we only need a copy for it.
3002 MaskedHi = HiReg;
3003 } else {
3004 Register MaskHi = MRI->createVirtualRegister(&RegRC);
3005 MaskedHi = MRI->createVirtualRegister(&RegRC);
3006
3007 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
3008 .addReg(MaskReg, 0, AMDGPU::sub1);
3009 BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
3010 .addReg(HiReg)
3011 .addReg(MaskHi);
3012 }
3013
3014 BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
3015 .addReg(MaskedLo)
3016 .addImm(AMDGPU::sub0)
3017 .addReg(MaskedHi)
3018 .addImm(AMDGPU::sub1);
3019 I.eraseFromParent();
3020 return true;
3021}
3022
3023/// Return the register to use for the index value, and the subregister to use
3024/// for the indirectly accessed register.
3025static std::pair<Register, unsigned>
3026 computeIndirectRegIndex(MachineRegisterInfo &MRI, const SIRegisterInfo &TRI,
3027 const TargetRegisterClass *SuperRC, Register IdxReg,
3028 unsigned EltSize, GISelKnownBits &KnownBits) {
3029 Register IdxBaseReg;
3030 int Offset;
3031
3032 std::tie(IdxBaseReg, Offset) =
3033 AMDGPU::getBaseWithConstantOffset(MRI, IdxReg, &KnownBits);
3034 if (IdxBaseReg == AMDGPU::NoRegister) {
3035 // This will happen if the index is a known constant. This should ordinarily
3036 // be legalized out, but handle it as a register just in case.
3037 assert(Offset == 0);
3038 IdxBaseReg = IdxReg;
3039 }
3040
3041 ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
3042
3043 // Skip out of bounds offsets, or else we would end up using an undefined
3044 // register.
3045 if (static_cast<unsigned>(Offset) >= SubRegs.size())
3046 return std::pair(IdxReg, SubRegs[0]);
3047 return std::pair(IdxBaseReg, SubRegs[Offset]);
3048}
3049
3050bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
3051 MachineInstr &MI) const {
3052 Register DstReg = MI.getOperand(0).getReg();
3053 Register SrcReg = MI.getOperand(1).getReg();
3054 Register IdxReg = MI.getOperand(2).getReg();
3055
3056 LLT DstTy = MRI->getType(DstReg);
3057 LLT SrcTy = MRI->getType(SrcReg);
3058
3059 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3060 const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
3061 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3062
3063 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3064 // into a waterfall loop.
3065 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3066 return false;
3067
3068 const TargetRegisterClass *SrcRC =
3069 TRI.getRegClassForTypeOnBank(SrcTy, *SrcRB);
3070 const TargetRegisterClass *DstRC =
3071 TRI.getRegClassForTypeOnBank(DstTy, *DstRB);
3072 if (!SrcRC || !DstRC)
3073 return false;
3074 if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
3075 !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
3076 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3077 return false;
3078
3079 MachineBasicBlock *BB = MI.getParent();
3080 const DebugLoc &DL = MI.getDebugLoc();
3081 const bool Is64 = DstTy.getSizeInBits() == 64;
3082
3083 unsigned SubReg;
3084 std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
3085 *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB);
3086
3087 if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
3088 if (DstTy.getSizeInBits() != 32 && !Is64)
3089 return false;
3090
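// S_MOVRELS reads the dynamic index from m0.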
3091 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3092 .addReg(IdxReg);
3093
3094 unsigned Opc = Is64 ? AMDGPU::S_MOVRELS_B64 : AMDGPU::S_MOVRELS_B32;
3095 BuildMI(*BB, &MI, DL, TII.get(Opc), DstReg)
3096 .addReg(SrcReg, 0, SubReg)
3097 .addReg(SrcReg, RegState::Implicit);
3098 MI.eraseFromParent();
3099 return true;
3100 }
3101
3102 if (SrcRB->getID() != AMDGPU::VGPRRegBankID || DstTy.getSizeInBits() != 32)
3103 return false;
3104
3105 if (!STI.useVGPRIndexMode()) {
3106 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3107 .addReg(IdxReg);
3108 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::V_MOVRELS_B32_e32), DstReg)
3109 .addReg(SrcReg, 0, SubReg)
3110 .addReg(SrcReg, RegState::Implicit);
3111 MI.eraseFromParent();
3112 return true;
3113 }
3114
3115 const MCInstrDesc &GPRIDXDesc =
3116 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*SrcRC), true);
3117 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3118 .addReg(SrcReg)
3119 .addReg(IdxReg)
3120 .addImm(SubReg);
3121
3122 MI.eraseFromParent();
3123 return true;
3124}
3125
3126// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
3127bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
3128 MachineInstr &MI) const {
3129 Register DstReg = MI.getOperand(0).getReg();
3130 Register VecReg = MI.getOperand(1).getReg();
3131 Register ValReg = MI.getOperand(2).getReg();
3132 Register IdxReg = MI.getOperand(3).getReg();
3133
3134 LLT VecTy = MRI->getType(DstReg);
3135 LLT ValTy = MRI->getType(ValReg);
3136 unsigned VecSize = VecTy.getSizeInBits();
3137 unsigned ValSize = ValTy.getSizeInBits();
3138
3139 const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
3140 const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
3141 const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
3142
3143 assert(VecTy.getElementType() == ValTy);
3144
3145 // The index must be scalar. If it wasn't RegBankSelect should have moved this
3146 // into a waterfall loop.
3147 if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
3148 return false;
3149
3150 const TargetRegisterClass *VecRC =
3151 TRI.getRegClassForTypeOnBank(VecTy, *VecRB);
3152 const TargetRegisterClass *ValRC =
3153 TRI.getRegClassForTypeOnBank(ValTy, *ValRB);
3154
3155 if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
3156 !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
3157 !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
3158 !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
3159 return false;
3160
3161 if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
3162 return false;
3163
3164 unsigned SubReg;
3165 std::tie(IdxReg, SubReg) =
3166 computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB);
3167
3168 const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
3169 STI.useVGPRIndexMode();
3170
3171 MachineBasicBlock *BB = MI.getParent();
3172 const DebugLoc &DL = MI.getDebugLoc();
3173
3174 if (!IndexMode) {
3175 BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3176 .addReg(IdxReg);
3177
3178 const MCInstrDesc &RegWriteOp = TII.getIndirectRegWriteMovRelPseudo(
3179 VecSize, ValSize, VecRB->getID() == AMDGPU::SGPRRegBankID);
3180 BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
3181 .addReg(VecReg)
3182 .addReg(ValReg)
3183 .addImm(SubReg);
3184 MI.eraseFromParent();
3185 return true;
3186 }
3187
3188 const MCInstrDesc &GPRIDXDesc =
3189 TII.getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
3190 BuildMI(*BB, MI, DL, GPRIDXDesc, DstReg)
3191 .addReg(VecReg)
3192 .addReg(ValReg)
3193 .addReg(IdxReg)
3194 .addImm(SubReg);
3195
3196 MI.eraseFromParent();
3197 return true;
3198}
3199
3200bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
3202 unsigned Opc;
3203 unsigned Size = MI.getOperand(3).getImm();
3204
3205 // The struct intrinsic variants add one additional operand over raw.
3206 const bool HasVIndex = MI.getNumOperands() == 9;
3207 Register VIndex;
3208 int OpOffset = 0;
3209 if (HasVIndex) {
3210 VIndex = MI.getOperand(4).getReg();
3211 OpOffset = 1;
3212 }
3213
3214 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
3215 std::optional<ValueAndVReg> MaybeVOffset =
3216 getIConstantVRegValWithLookThrough(VOffset, *MRI);
3217 const bool HasVOffset = !MaybeVOffset || MaybeVOffset->Value.getZExtValue();
3218
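// Pick the BOTHEN/IDXEN/OFFEN/OFFSET variant based on which of vindex and
// voffset are actually present.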
3219 switch (Size) {
3220 default:
3221 return false;
3222 case 1:
3223 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
3224 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
3225 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
3226 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
3227 break;
3228 case 2:
3229 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
3230 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
3231 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
3232 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
3233 break;
3234 case 4:
3235 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
3236 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
3237 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
3238 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
3239 break;
3240 }
3241
3242 MachineBasicBlock *MBB = MI.getParent();
3243 const DebugLoc &DL = MI.getDebugLoc();
3244 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3245 .add(MI.getOperand(2));
3246
3247 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc));
3248
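 // When both vindex and voffset are present (the BOTHEN form), they are
 // packed into a single 64-bit vaddr operand via a REG_SEQUENCE
 // (sub0 = vindex, sub1 = voffset); otherwise the single component is used
 // directly.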
3249 if (HasVIndex && HasVOffset) {
3250 Register IdxReg = MRI->createVirtualRegister(TRI.getVGPR64Class());
3251 BuildMI(*MBB, &*MIB, DL, TII.get(AMDGPU::REG_SEQUENCE), IdxReg)
3252 .addReg(VIndex)
3253 .addImm(AMDGPU::sub0)
3254 .addReg(VOffset)
3255 .addImm(AMDGPU::sub1);
3256
3257 MIB.addReg(IdxReg);
3258 } else if (HasVIndex) {
3259 MIB.addReg(VIndex);
3260 } else if (HasVOffset) {
3261 MIB.addReg(VOffset);
3262 }
3263
3264 MIB.add(MI.getOperand(1)); // rsrc
3265 MIB.add(MI.getOperand(5 + OpOffset)); // soffset
3266 MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
3267 unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
3268 MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol
3269 MIB.addImm(Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0); // swz
3270
3271 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3272 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3273 LoadPtrI.Offset = MI.getOperand(6 + OpOffset).getImm();
3274 MachinePointerInfo StorePtrI = LoadPtrI;
3275 StorePtrI.V = nullptr;
3276 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3277
3278 auto F = LoadMMO->getFlags() &
3279 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3280 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3281 Size, LoadMMO->getBaseAlign());
3282
3283 MachineMemOperand *StoreMMO =
3284 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3285 sizeof(int32_t), LoadMMO->getBaseAlign());
3286
3287 MIB.setMemRefs({LoadMMO, StoreMMO});
3288
3289 MI.eraseFromParent();
3290 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3291}
3292
3293/// Match a zero extend from a 32-bit value to 64-bits.
3294 static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3295 Register ZExtSrc;
3296 if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3297 return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3298
3299 // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3300 const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3301 if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3302 return Register();
3303
3304 assert(Def->getNumOperands() == 3 &&
3305 MRI.getType(Def->getOperand(0).getReg()) == LLT::scalar(64));
3306 if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3307 return Def->getOperand(1).getReg();
3308 }
3309
3310 return Register();
3311}
3312
3313bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
3314 unsigned Opc;
3315 unsigned Size = MI.getOperand(3).getImm();
3316
3317 switch (Size) {
3318 default:
3319 return false;
3320 case 1:
3321 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
3322 break;
3323 case 2:
3324 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
3325 break;
3326 case 4:
3327 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
3328 break;
3329 }
3330
3331 MachineBasicBlock *MBB = MI.getParent();
3332 const DebugLoc &DL = MI.getDebugLoc();
3333 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
3334 .add(MI.getOperand(2));
3335
3336 Register Addr = MI.getOperand(1).getReg();
3337 Register VOffset;
3338 // Try to split SAddr and VOffset. Global and LDS pointers share the same
3339 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
3340 if (!isSGPR(Addr)) {
3341 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
3342 if (isSGPR(AddrDef->Reg)) {
3343 Addr = AddrDef->Reg;
3344 } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
3345 Register SAddr =
3346 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
3347 if (isSGPR(SAddr)) {
3348 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
3349 if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
3350 Addr = SAddr;
3351 VOffset = Off;
3352 }
3353 }
3354 }
3355 }
3356
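 // If the address ended up in an SGPR, use the SADDR form of the
 // instruction. That form still takes a VGPR offset operand, so materialize
 // a zero VOffset if one was not split out above.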
3357 if (isSGPR(Addr)) {
3358 Opc = AMDGPU::getGlobalSaddrOp(Opc);
3359 if (!VOffset) {
3360 VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
3361 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
3362 .addImm(0);
3363 }
3364 }
3365
3366 auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
3367 .addReg(Addr);
3368
3369 if (isSGPR(Addr))
3370 MIB.addReg(VOffset);
3371
3372 MIB.add(MI.getOperand(4)) // offset
3373 .add(MI.getOperand(5)); // cpol
3374
3375 MachineMemOperand *LoadMMO = *MI.memoperands_begin();
3376 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
3377 LoadPtrI.Offset = MI.getOperand(4).getImm();
3378 MachinePointerInfo StorePtrI = LoadPtrI;
3379 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
3380 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
3381 auto F = LoadMMO->getFlags() &
3382 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
3383 LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
3384 Size, LoadMMO->getBaseAlign());
3385 MachineMemOperand *StoreMMO =
3386 MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
3387 sizeof(int32_t), Align(4));
3388
3389 MIB.setMemRefs({LoadMMO, StoreMMO});
3390
3391 MI.eraseFromParent();
3392 return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
3393}
3394
3395bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
3396 MI.setDesc(TII.get(MI.getOperand(1).getImm()));
3397 MI.removeOperand(1);
3398 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3399 return true;
3400}
3401
3402bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
3403 unsigned Opc;
3404 switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
3405 case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
3406 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
3407 break;
3408 case Intrinsic::amdgcn_smfmac_f32_32x32x16_f16:
3409 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_F16_e64;
3410 break;
3411 case Intrinsic::amdgcn_smfmac_f32_16x16x32_bf16:
3412 Opc = AMDGPU::V_SMFMAC_F32_16X16X32_BF16_e64;
3413 break;
3414 case Intrinsic::amdgcn_smfmac_f32_32x32x16_bf16:
3415 Opc = AMDGPU::V_SMFMAC_F32_32X32X16_BF16_e64;
3416 break;
3417 case Intrinsic::amdgcn_smfmac_i32_16x16x64_i8:
3418 Opc = AMDGPU::V_SMFMAC_I32_16X16X64_I8_e64;
3419 break;
3420 case Intrinsic::amdgcn_smfmac_i32_32x32x32_i8:
3421 Opc = AMDGPU::V_SMFMAC_I32_32X32X32_I8_e64;
3422 break;
3423 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_bf8:
3424 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_BF8_e64;
3425 break;
3426 case Intrinsic::amdgcn_smfmac_f32_16x16x64_bf8_fp8:
3427 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_BF8_FP8_e64;
3428 break;
3429 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_bf8:
3430 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_BF8_e64;
3431 break;
3432 case Intrinsic::amdgcn_smfmac_f32_16x16x64_fp8_fp8:
3433 Opc = AMDGPU::V_SMFMAC_F32_16X16X64_FP8_FP8_e64;
3434 break;
3435 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_bf8:
3436 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_BF8_e64;
3437 break;
3438 case Intrinsic::amdgcn_smfmac_f32_32x32x32_bf8_fp8:
3439 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_BF8_FP8_e64;
3440 break;
3441 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_bf8:
3442 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_BF8_e64;
3443 break;
3444 case Intrinsic::amdgcn_smfmac_f32_32x32x32_fp8_fp8:
3445 Opc = AMDGPU::V_SMFMAC_F32_32X32X32_FP8_FP8_e64;
3446 break;
3447 default:
3448 llvm_unreachable("unhandled smfmac intrinsic");
3449 }
3450
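 // Rewrite the generic intrinsic into the selected _e64 pseudo: drop the
 // intrinsic ID and move the accumulator input (VDst_In) to the end of the
 // operand list, where the pseudo expects it.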
3451 auto VDst_In = MI.getOperand(4);
3452
3453 MI.setDesc(TII.get(Opc));
3454 MI.removeOperand(4); // VDst_In
3455 MI.removeOperand(1); // Intrinsic ID
3456 MI.addOperand(VDst_In); // Readd VDst_In to the end
3457 MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
3458 return true;
3459}
3460
3461bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
3462 Register DstReg = MI.getOperand(0).getReg();
3463 Register SrcReg = MI.getOperand(1).getReg();
3464 const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
3465 const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
3466 MachineBasicBlock *MBB = MI.getParent();
3467 const DebugLoc &DL = MI.getDebugLoc();
3468
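 // The wave-level address is obtained by shifting the per-lane scratch
 // offset right by log2(wavefront size); use a VALU or scalar shift
 // depending on which bank the result lives in.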
3469 if (IsVALU) {
3470 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
3471 .addImm(Subtarget->getWavefrontSizeLog2())
3472 .addReg(SrcReg);
3473 } else {
3474 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
3475 .addReg(SrcReg)
3476 .addImm(Subtarget->getWavefrontSizeLog2())
3477 .setOperandDead(3); // Dead scc
3478 }
3479
3480 const TargetRegisterClass &RC =
3481 IsVALU ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
3482 if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
3483 return false;
3484
3485 MI.eraseFromParent();
3486 return true;
3487}
3488
3489bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
3490 Register SrcReg = MI.getOperand(0).getReg();
3491 if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
3492 return false;
3493
3494 MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
3495 Register SP =
3496 Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
3497 Register WaveAddr = getWaveAddress(DefMI);
3498 MachineBasicBlock *MBB = MI.getParent();
3499 const DebugLoc &DL = MI.getDebugLoc();
3500
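 // If the source was not already produced by G_AMDGPU_WAVE_ADDRESS, convert
 // it to a wave-level address with a right shift by log2(wavefront size)
 // before copying it into the stack pointer register.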
3501 if (!WaveAddr) {
3502 WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
3503 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
3504 .addReg(SrcReg)
3505 .addImm(Subtarget->getWavefrontSizeLog2())
3506 .setOperandDead(3); // Dead scc
3507 }
3508
3509 BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
3510 .addReg(WaveAddr);
3511
3512 MI.eraseFromParent();
3513 return true;
3514}
3515
3517 bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3518 if (!I.isPreISelOpcode()) {
3519 if (I.isCopy())
3520 return selectCOPY(I);
3521 return true;
3522 }
3523
3524 switch (I.getOpcode()) {
3525 case TargetOpcode::G_AND:
3526 case TargetOpcode::G_OR:
3527 case TargetOpcode::G_XOR:
3528 if (selectImpl(I, *CoverageInfo))
3529 return true;
3530 return selectG_AND_OR_XOR(I);
3531 case TargetOpcode::G_ADD:
3532 case TargetOpcode::G_SUB:
3533 if (selectImpl(I, *CoverageInfo))
3534 return true;
3535 return selectG_ADD_SUB(I);
3536 case TargetOpcode::G_UADDO:
3537 case TargetOpcode::G_USUBO:
3538 case TargetOpcode::G_UADDE:
3539 case TargetOpcode::G_USUBE:
3540 return selectG_UADDO_USUBO_UADDE_USUBE(I);
3541 case AMDGPU::G_AMDGPU_MAD_U64_U32:
3542 case AMDGPU::G_AMDGPU_MAD_I64_I32:
3543 return selectG_AMDGPU_MAD_64_32(I);
3544 case TargetOpcode::G_INTTOPTR:
3545 case TargetOpcode::G_BITCAST:
3546 case TargetOpcode::G_PTRTOINT:
3547 return selectCOPY(I);
3548 case TargetOpcode::G_CONSTANT:
3549 case TargetOpcode::G_FCONSTANT:
3550 return selectG_CONSTANT(I);
3551 case TargetOpcode::G_FNEG:
3552 if (selectImpl(I, *CoverageInfo))
3553 return true;
3554 return selectG_FNEG(I);
3555 case TargetOpcode::G_FABS:
3556 if (selectImpl(I, *CoverageInfo))
3557 return true;
3558 return selectG_FABS(I);
3559 case TargetOpcode::G_EXTRACT:
3560 return selectG_EXTRACT(I);
3561 case TargetOpcode::G_MERGE_VALUES:
3562 case TargetOpcode::G_CONCAT_VECTORS:
3563 return selectG_MERGE_VALUES(I);
3564 case TargetOpcode::G_UNMERGE_VALUES:
3565 return selectG_UNMERGE_VALUES(I);
3566 case TargetOpcode::G_BUILD_VECTOR:
3567 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
3568 return selectG_BUILD_VECTOR(I);
3569 case TargetOpcode::G_PTR_ADD:
3570 if (selectImpl(I, *CoverageInfo))
3571 return true;
3572 return selectG_PTR_ADD(I);
3573 case TargetOpcode::G_IMPLICIT_DEF:
3574 return selectG_IMPLICIT_DEF(I);
3575 case TargetOpcode::G_FREEZE:
3576 return selectCOPY(I);
3577 case TargetOpcode::G_INSERT:
3578 return selectG_INSERT(I);
3579 case TargetOpcode::G_INTRINSIC:
3580 case TargetOpcode::G_INTRINSIC_CONVERGENT:
3581 return selectG_INTRINSIC(I);
3582 case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3583 case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
3584 return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
3585 case TargetOpcode::G_ICMP:
3586 case TargetOpcode::G_FCMP:
3587 if (selectG_ICMP_or_FCMP(I))
3588 return true;
3589 return selectImpl(I, *CoverageInfo);
3590 case TargetOpcode::G_LOAD:
3591 case TargetOpcode::G_STORE:
3592 case TargetOpcode::G_ATOMIC_CMPXCHG:
3593 case TargetOpcode::G_ATOMICRMW_XCHG:
3594 case TargetOpcode::G_ATOMICRMW_ADD:
3595 case TargetOpcode::G_ATOMICRMW_SUB:
3596 case TargetOpcode::G_ATOMICRMW_AND:
3597 case TargetOpcode::G_ATOMICRMW_OR:
3598 case TargetOpcode::G_ATOMICRMW_XOR:
3599 case TargetOpcode::G_ATOMICRMW_MIN:
3600 case TargetOpcode::G_ATOMICRMW_MAX:
3601 case TargetOpcode::G_ATOMICRMW_UMIN:
3602 case TargetOpcode::G_ATOMICRMW_UMAX:
3603 case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
3604 case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
3605 case TargetOpcode::G_ATOMICRMW_FADD:
3606 case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
3607 case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
3608 return selectG_LOAD_STORE_ATOMICRMW(I);
3609 case TargetOpcode::G_SELECT:
3610 return selectG_SELECT(I);
3611 case TargetOpcode::G_TRUNC:
3612 return selectG_TRUNC(I);
3613 case TargetOpcode::G_SEXT:
3614 case TargetOpcode::G_ZEXT:
3615 case TargetOpcode::G_ANYEXT:
3616 case TargetOpcode::G_SEXT_INREG:
3617 // This is a workaround. For extension from type i1, `selectImpl()` uses
3618 // patterns from the TD file and generates an illegal VGPR to SGPR COPY, as
3619 // type i1 can only be held in an SGPR class.
3620 if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
3621 selectImpl(I, *CoverageInfo))
3622 return true;
3623 return selectG_SZA_EXT(I);
3624 case TargetOpcode::G_FPEXT:
3625 if (selectG_FPEXT(I))
3626 return true;
3627 return selectImpl(I, *CoverageInfo);
3628 case TargetOpcode::G_BRCOND:
3629 return selectG_BRCOND(I);
3630 case TargetOpcode::G_GLOBAL_VALUE:
3631 return selectG_GLOBAL_VALUE(I);
3632 case TargetOpcode::G_PTRMASK:
3633 return selectG_PTRMASK(I);
3634 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3635 return selectG_EXTRACT_VECTOR_ELT(I);
3636 case TargetOpcode::G_INSERT_VECTOR_ELT:
3637 return selectG_INSERT_VECTOR_ELT(I);
3638 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
3639 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
3640 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
3641 case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
3642 const AMDGPU::ImageDimIntrinsicInfo *Intr =
3643 AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
3644 assert(Intr && "not an image intrinsic with image pseudo");
3645 return selectImageIntrinsic(I, Intr);
3646 }
3647 case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY:
3648 return selectBVHIntrinsic(I);
3649 case AMDGPU::G_SBFX:
3650 case AMDGPU::G_UBFX:
3651 return selectG_SBFX_UBFX(I);
3652 case AMDGPU::G_SI_CALL:
3653 I.setDesc(TII.get(AMDGPU::SI_CALL));
3654 return true;
3655 case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
3656 return selectWaveAddress(I);
3657 case AMDGPU::G_STACKRESTORE:
3658 return selectStackRestore(I);
3659 case AMDGPU::G_PHI:
3660 return selectPHI(I);
3661 default:
3662 return selectImpl(I, *CoverageInfo);
3663 }
3664 return false;
3665}
3666
3667 InstructionSelector::ComplexRendererFns
3668AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
3669 return {{
3670 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3671 }};
3672
3673}
3674
3675std::pair<Register, unsigned>
3676AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
3677 bool IsCanonicalizing,
3678 bool AllowAbs, bool OpSel) const {
3679 Register Src = Root.getReg();
3680 unsigned Mods = 0;
3681 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
3682
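 // Fold unary FP operations on the source into VOP3 source modifiers:
 // G_FNEG (and, when canonicalizing, an fsub from +/-0) becomes NEG, and
 // G_FABS becomes ABS when abs modifiers are allowed.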
3683 if (MI->getOpcode() == AMDGPU::G_FNEG) {
3684 Src = MI->getOperand(1).getReg();
3685 Mods |= SISrcMods::NEG;
3686 MI = getDefIgnoringCopies(Src, *MRI);
3687 } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
3688 // Fold fsub [+-]0 into fneg. This may not have folded depending on the
3689 // denormal mode, but we're implicitly canonicalizing in a source operand.
3690 const ConstantFP *LHS =
3691 getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
3692 if (LHS && LHS->isZero()) {
3693 Mods |= SISrcMods::NEG;
3694 Src = MI->getOperand(2).getReg();
3695 }
3696 }
3697
3698 if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
3699 Src = MI->getOperand(1).getReg();
3700 Mods |= SISrcMods::ABS;
3701 }
3702
3703 if (OpSel)
3704 Mods |= SISrcMods::OP_SEL_0;
3705
3706 return std::pair(Src, Mods);
3707}
3708
3709Register AMDGPUInstructionSelector::copyToVGPRIfSrcFolded(
3710 Register Src, unsigned Mods, MachineOperand Root, MachineInstr *InsertPt,
3711 bool ForceVGPR) const {
3712 if ((Mods != 0 || ForceVGPR) &&
3713 RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
3714
3715 // If we looked through copies to find source modifiers on an SGPR operand,
3716 // we now have an SGPR register source. To avoid potentially violating the
3717 // constant bus restriction, we need to insert a copy to a VGPR.
3718 Register VGPRSrc = MRI->cloneVirtualRegister(Root.getReg());
3719 BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
3720 TII.get(AMDGPU::COPY), VGPRSrc)
3721 .addReg(Src);
3722 Src = VGPRSrc;
3723 }
3724
3725 return Src;
3726}
3727
3728///
3729/// This will select either an SGPR or VGPR operand and will save us from
3730/// having to write an extra tablegen pattern.
3731 InstructionSelector::ComplexRendererFns
3732AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
3733 return {{
3734 [=](MachineInstrBuilder &MIB) { MIB.add(Root); }
3735 }};
3736}
3737
3738 InstructionSelector::ComplexRendererFns
3739AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
3740 Register Src;
3741 unsigned Mods;
3742 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3743
3744 return {{
3745 [=](MachineInstrBuilder &MIB) {
3746 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3747 },
3748 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3749 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3750 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3751 }};
3752}
3753
3754 InstructionSelector::ComplexRendererFns
3755AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
3756 Register Src;
3757 unsigned Mods;
3758 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
3759 /*IsCanonicalizing=*/true,
3760 /*AllowAbs=*/false);
3761
3762 return {{
3763 [=](MachineInstrBuilder &MIB) {
3764 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3765 },
3766 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
3767 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3768 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3769 }};
3770}
3771
3772 InstructionSelector::ComplexRendererFns
3773AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
3774 return {{
3775 [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
3776 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
3777 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
3778 }};
3779}
3780
3781 InstructionSelector::ComplexRendererFns
3782AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
3783 Register Src;
3784 unsigned Mods;
3785 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
3786
3787 return {{
3788 [=](MachineInstrBuilder &MIB) {
3789 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3790 },
3791 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3792 }};
3793}
3794
3795 InstructionSelector::ComplexRendererFns
3796AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
3797 MachineOperand &Root) const {
3798 Register Src;
3799 unsigned Mods;
3800 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false);
3801
3802 return {{
3803 [=](MachineInstrBuilder &MIB) {
3804 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3805 },
3806 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3807 }};
3808}
3809
3810 InstructionSelector::ComplexRendererFns
3811AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
3812 Register Src;
3813 unsigned Mods;
3814 std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true,
3815 /*AllowAbs=*/false);
3816
3817 return {{
3818 [=](MachineInstrBuilder &MIB) {
3819 MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
3820 },
3821 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3822 }};
3823}
3824
3825 InstructionSelector::ComplexRendererFns
3826AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
3827 Register Reg = Root.getReg();
3828 const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
3829 if (Def->getOpcode() == AMDGPU::G_FNEG || Def->getOpcode() == AMDGPU::G_FABS)
3830 return {};
3831 return {{
3832 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
3833 }};
3834}
3835
3836std::pair<Register, unsigned>
3837AMDGPUInstructionSelector::selectVOP3PModsImpl(
3838 Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
3839 unsigned Mods = 0;
3840 MachineInstr *MI = MRI.getVRegDef(Src);
3841
3842 if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
3843 // It's possible to see an f32 fneg here, but unlikely.
3844 // TODO: Treat f32 fneg as only high bit.
3845 MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
3846 Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
3847 Src = MI->getOperand(1).getReg();
3848 MI = MRI.getVRegDef(Src);
3849 }
3850
3851 // TODO: Handle G_FSUB 0 as fneg
3852
3853 // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
3854 (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
3855
3856 // Packed instructions do not have abs modifiers.
3857 Mods |= SISrcMods::OP_SEL_1;
3858
3859 return std::pair(Src, Mods);
3860}
3861
3862 InstructionSelector::ComplexRendererFns
3863AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
3864 MachineRegisterInfo &MRI
3865 = Root.getParent()->getParent()->getParent()->getRegInfo();
3866
3867 Register Src;
3868 unsigned Mods;
3869 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
3870
3871 return {{
3872 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3873 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3874 }};
3875}
3876
3877 InstructionSelector::ComplexRendererFns
3878AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
3879 MachineRegisterInfo &MRI
3880 = Root.getParent()->getParent()->getParent()->getRegInfo();
3881
3882 Register Src;
3883 unsigned Mods;
3884 std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
3885
3886 return {{
3887 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
3888 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3889 }};
3890}
3891
3892 InstructionSelector::ComplexRendererFns
3893AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
3894 // Literal i1 value set in intrinsic, represents SrcMods for the next operand.
3895 // Value is in Imm operand as i1 sign extended to int64_t.
3896 // 1(-1) promotes packed values to signed, 0 treats them as unsigned.
3897 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3898 "expected i1 value");
3899 unsigned Mods = SISrcMods::OP_SEL_1;
3900 if (Root.getImm() == -1)
3901 Mods ^= SISrcMods::NEG;
3902 return {{
3903 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3904 }};
3905}
3906
3907 InstructionSelector::ComplexRendererFns
3908AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods(
3909 MachineOperand &Root) const {
3910 assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) &&
3911 "expected i1 value");
3912 unsigned Mods = SISrcMods::OP_SEL_1;
3913 if (Root.getImm() != 0)
3914 Mods |= SISrcMods::OP_SEL_0;
3915
3916 return {{
3917 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
3918 }};
3919}
3920
3921 static Register buildRegSequence(SmallVectorImpl<Register> &Elts,
3922 MachineInstr *InsertPt,
3923 MachineRegisterInfo &MRI) {
3924 const TargetRegisterClass *DstRegClass;
3925 switch (Elts.size()) {
3926 case 8:
3927 DstRegClass = &AMDGPU::VReg_256RegClass;
3928 break;
3929 case 4:
3930 DstRegClass = &AMDGPU::VReg_128RegClass;
3931 break;
3932 case 2:
3933 DstRegClass = &AMDGPU::VReg_64RegClass;
3934 break;
3935 default:
3936 llvm_unreachable("unhandled Reg sequence size");
3937 }
3938
3939 MachineIRBuilder B(*InsertPt);
3940 auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE)
3941 .addDef(MRI.createVirtualRegister(DstRegClass));
3942 for (unsigned i = 0; i < Elts.size(); ++i) {
3943 MIB.addReg(Elts[i]);
3944 MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i));
3945 }
3946 return MIB->getOperand(0).getReg();
3947}
3948
3949static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods,
3950 SmallVectorImpl<Register> &Elts, Register &Src,
3951 MachineInstr *InsertPt,
3952 MachineRegisterInfo &MRI) {
3953 if (ModOpcode == TargetOpcode::G_FNEG) {
3954 Mods |= SISrcMods::NEG;
3955 // Check if all elements also have abs modifier
3956 SmallVector<Register, 8> NegAbsElts;
3957 for (auto El : Elts) {
3958 Register FabsSrc;
3959 if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc))))
3960 break;
3961 NegAbsElts.push_back(FabsSrc);
3962 }
3963 if (Elts.size() != NegAbsElts.size()) {
3964 // Neg
3965 Src = buildRegSequence(Elts, InsertPt, MRI);
3966 } else {
3967 // Neg and Abs
3968 Mods |= SISrcMods::NEG_HI;
3969 Src = buildRegSequence(NegAbsElts, InsertPt, MRI);
3970 }
3971 } else {
3972 assert(ModOpcode == TargetOpcode::G_FABS);
3973 // Abs
3974 Mods |= SISrcMods::NEG_HI;
3975 Src = buildRegSequence(Elts, InsertPt, MRI);
3976 }
3977}
3978
3979 InstructionSelector::ComplexRendererFns
3980AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const {
3981 Register Src = Root.getReg();
3982 unsigned Mods = SISrcMods::OP_SEL_1;
3983 SmallVector<Register, 8> EltsF32;
3984
3985 if (GBuildVector *BV = dyn_cast<GBuildVector>(MRI->getVRegDef(Src))) {
3986 assert(BV->getNumSources() > 0);
3987 // Based on first element decide which mod we match, neg or abs
3988 MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(0));
3989 unsigned ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG)
3990 ? AMDGPU::G_FNEG
3991 : AMDGPU::G_FABS;
3992 for (unsigned i = 0; i < BV->getNumSources(); ++i) {
3993 ElF32 = MRI->getVRegDef(BV->getSourceReg(i));
3994 if (ElF32->getOpcode() != ModOpcode)
3995 break;
3996 EltsF32.push_back(ElF32->getOperand(1).getReg());
3997 }
3998
3999 // All elements had ModOpcode modifier
4000 if (BV->getNumSources() == EltsF32.size()) {
4001 selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(),
4002 *MRI);
4003 }
4004 }
4005
4006 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4007 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4008}
4009
4010 InstructionSelector::ComplexRendererFns
4011AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const {
4012 Register Src = Root.getReg();
4013 unsigned Mods = SISrcMods::OP_SEL_1;
4014 SmallVector<Register, 8> EltsV2F16;
4015
4016 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4017 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4018 Register FNegSrc;
4019 if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc))))
4020 break;
4021 EltsV2F16.push_back(FNegSrc);
4022 }
4023
4024 // All elements had ModOpcode modifier
4025 if (CV->getNumSources() == EltsV2F16.size()) {
4026 Mods |= SISrcMods::NEG;
4027 Mods |= SISrcMods::NEG_HI;
4028 Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI);
4029 }
4030 }
4031
4032 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4033 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4034}
4035
4036 InstructionSelector::ComplexRendererFns
4037AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const {
4038 Register Src = Root.getReg();
4039 unsigned Mods = SISrcMods::OP_SEL_1;
4040 SmallVector<Register, 8> EltsV2F16;
4041
4042 if (GConcatVectors *CV = dyn_cast<GConcatVectors>(MRI->getVRegDef(Src))) {
4043 assert(CV->getNumSources() > 0);
4044 MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(0));
4045 // Based on first element decide which mod we match, neg or abs
4046 unsigned ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG)
4047 ? AMDGPU::G_FNEG
4048 : AMDGPU::G_FABS;
4049
4050 for (unsigned i = 0; i < CV->getNumSources(); ++i) {
4051 ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i));
4052 if (ElV2F16->getOpcode() != ModOpcode)
4053 break;
4054 EltsV2F16.push_back(ElV2F16->getOperand(1).getReg());
4055 }
4056
4057 // All elements had ModOpcode modifier
4058 if (CV->getNumSources() == EltsV2F16.size()) {
4059 MachineIRBuilder B(*Root.getParent());
4060 selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(),
4061 *MRI);
4062 }
4063 }
4064
4065 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4066 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}};
4067}
4068
4069 InstructionSelector::ComplexRendererFns
4070AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
4071 std::optional<FPValueAndVReg> FPValReg;
4072 if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
4073 if (TII.isInlineConstant(FPValReg->Value)) {
4074 return {{[=](MachineInstrBuilder &MIB) {
4075 MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
4076 }}};
4077 }
4078 // Non-inlineable splat floats should not fall-through for integer immediate
4079 // checks.
4080 return {};
4081 }
4082
4083 APInt ICst;
4084 if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) {
4085 if (TII.isInlineConstant(ICst)) {
4086 return {
4087 {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}};
4088 }
4089 }
4090
4091 return {};
4092}
4093
4094 InstructionSelector::ComplexRendererFns
4095AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const {
4096 Register Src =
4097 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4098 unsigned Key = 0;
4099
4100 Register ShiftSrc;
4101 std::optional<ValueAndVReg> ShiftAmt;
4102 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4103 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4104 ShiftAmt->Value.getZExtValue() % 8 == 0) {
4105 Key = ShiftAmt->Value.getZExtValue() / 8;
4106 Src = ShiftSrc;
4107 }
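 // A byte-aligned logical shift right of the 32-bit source is folded away:
 // (x >> (8 * K)) selects byte K of x, which is encoded in the index_key
 // operand instead of emitting the shift.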
4108
4109 return {{
4110 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4111 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4112 }};
4113}
4114
4115 InstructionSelector::ComplexRendererFns
4116AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const {
4117
4118 Register Src =
4119 getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg();
4120 unsigned Key = 0;
4121
4122 Register ShiftSrc;
4123 std::optional<ValueAndVReg> ShiftAmt;
4124 if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) &&
4125 MRI->getType(ShiftSrc).getSizeInBits() == 32 &&
4126 ShiftAmt->Value.getZExtValue() == 16) {
4127 Src = ShiftSrc;
4128 Key = 1;
4129 }
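 // Likewise, (x >> 16) selects the high half of the 32-bit source and is
 // encoded as index_key = 1 rather than emitting the shift.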
4130
4131 return {{
4132 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4133 [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key
4134 }};
4135}
4136
4137 InstructionSelector::ComplexRendererFns
4138AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
4139 Register Src;
4140 unsigned Mods;
4141 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
4142
4143 // FIXME: Handle op_sel
4144 return {{
4145 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
4146 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
4147 }};
4148}
4149
4150 InstructionSelector::ComplexRendererFns
4151AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
4152 Register Src;
4153 unsigned Mods;
4154 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4155 /*IsCanonicalizing=*/true,
4156 /*AllowAbs=*/false,
4157 /*OpSel=*/false);
4158
4159 return {{
4160 [=](MachineInstrBuilder &MIB) {
4161 MIB.addReg(
4162 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4163 },
4164 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4165 }};
4166}
4167
4168 InstructionSelector::ComplexRendererFns
4169AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
4170 Register Src;
4171 unsigned Mods;
4172 std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
4173 /*IsCanonicalizing=*/true,
4174 /*AllowAbs=*/false,
4175 /*OpSel=*/true);
4176
4177 return {{
4178 [=](MachineInstrBuilder &MIB) {
4179 MIB.addReg(
4180 copyToVGPRIfSrcFolded(Src, Mods, Root, MIB, /* ForceVGPR */ true));
4181 },
4182 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
4183 }};
4184}
4185
4186bool AMDGPUInstructionSelector::selectSmrdOffset(MachineOperand &Root,
4187 Register &Base,
4188 Register *SOffset,
4189 int64_t *Offset) const {
4190 MachineInstr *MI = Root.getParent();
4191 MachineBasicBlock *MBB = MI->getParent();
4192
4193 // FIXME: We should shrink the GEP if the offset is known to be <= 32-bits,
4194 // then we can select all ptr + 32-bit offsets.
4195 SmallVector<GEPInfo, 4> AddrInfo;
4196 getAddrModeInfo(*MI, *MRI, AddrInfo);
4197
4198 if (AddrInfo.empty())
4199 return false;
4200
4201 const GEPInfo &GEPI = AddrInfo[0];
4202 std::optional<int64_t> EncodedImm =
4203 AMDGPU::getSMRDEncodedOffset(STI, GEPI.Imm, false);
4204
4205 if (SOffset && Offset) {
4206 if (GEPI.SgprParts.size() == 1 && GEPI.Imm != 0 && EncodedImm &&
4207 AddrInfo.size() > 1) {
4208 const GEPInfo &GEPI2 = AddrInfo[1];
4209 if (GEPI2.SgprParts.size() == 2 && GEPI2.Imm == 0) {
4210 if (Register OffsetReg =
4211 matchZeroExtendFromS32(*MRI, GEPI2.SgprParts[1])) {
4212 Base = GEPI2.SgprParts[0];
4213 *SOffset = OffsetReg;
4214 *Offset = *EncodedImm;
4215 return true;
4216 }
4217 }
4218 }
4219 return false;
4220 }
4221
4222 if (Offset && GEPI.SgprParts.size() == 1 && EncodedImm) {
4223 Base = GEPI.SgprParts[0];
4224 *Offset = *EncodedImm;
4225 return true;
4226 }
4227
4228 // SGPR offset is unsigned.
4229 if (SOffset && GEPI.SgprParts.size() == 1 && isUInt<32>(GEPI.Imm) &&
4230 GEPI.Imm != 0) {
4231 // If we make it this far we have a load with a 32-bit immediate offset.
4232 // It is OK to select this using an SGPR offset, because we have already
4233 // failed trying to select this load into one of the _IMM variants since
4234 // the _IMM Patterns are considered before the _SGPR patterns.
4235 Base = GEPI.SgprParts[0];
4236 *SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4237 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), *SOffset)
4238 .addImm(GEPI.Imm);
4239 return true;
4240 }
4241
4242 if (SOffset && GEPI.SgprParts.size() && GEPI.Imm == 0) {
4243 if (Register OffsetReg = matchZeroExtendFromS32(*MRI, GEPI.SgprParts[1])) {
4244 Base = GEPI.SgprParts[0];
4245 *SOffset = OffsetReg;
4246 return true;
4247 }
4248 }
4249
4250 return false;
4251}
4252
4253 InstructionSelector::ComplexRendererFns
4254AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
4255 Register Base;
4256 int64_t Offset;
4257 if (!selectSmrdOffset(Root, Base, /* SOffset= */ nullptr, &Offset))
4258 return std::nullopt;
4259
4260 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4261 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4262}
4263
4264 InstructionSelector::ComplexRendererFns
4265AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
4266 SmallVector<GEPInfo, 4> AddrInfo;
4267 getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
4268
4269 if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
4270 return std::nullopt;
4271
4272 const GEPInfo &GEPInfo = AddrInfo[0];
4273 Register PtrReg = GEPInfo.SgprParts[0];
4274 std::optional<int64_t> EncodedImm =
4275 AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
4276 if (!EncodedImm)
4277 return std::nullopt;
4278
4279 return {{
4280 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
4281 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
4282 }};
4283}
4284
4285 InstructionSelector::ComplexRendererFns
4286AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
4287 Register Base, SOffset;
4288 if (!selectSmrdOffset(Root, Base, &SOffset, /* Offset= */ nullptr))
4289 return std::nullopt;
4290
4291 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4292 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
4293}
4294
4295 InstructionSelector::ComplexRendererFns
4296AMDGPUInstructionSelector::selectSmrdSgprImm(MachineOperand &Root) const {
4297 Register Base, SOffset;
4298 int64_t Offset;
4299 if (!selectSmrdOffset(Root, Base, &SOffset, &Offset))
4300 return std::nullopt;
4301
4302 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Base); },
4303 [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
4304 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }}};
4305}
4306
4307std::pair<Register, int>
4308AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
4309 uint64_t FlatVariant) const {
4310 MachineInstr *MI = Root.getParent();
4311
4312 auto Default = std::pair(Root.getReg(), 0);
4313
4314 if (!STI.hasFlatInstOffsets())
4315 return Default;
4316
4317 Register PtrBase;
4318 int64_t ConstOffset;
4319 std::tie(PtrBase, ConstOffset) =
4320 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4321
4322 if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
4323 !isFlatScratchBaseLegal(Root.getReg())))
4324 return Default;
4325
4326 unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
4327 if (!TII.isLegalFLATOffset(ConstOffset, AddrSpace, FlatVariant))
4328 return Default;
4329
4330 return std::pair(PtrBase, ConstOffset);
4331}
4332
4333 InstructionSelector::ComplexRendererFns
4334AMDGPUInstructionSelector::selectFlatOffset(MachineOperand &Root) const {
4335 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FLAT);
4336
4337 return {{
4338 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4339 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4340 }};
4341}
4342
4343 InstructionSelector::ComplexRendererFns
4344AMDGPUInstructionSelector::selectGlobalOffset(MachineOperand &Root) const {
4345 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatGlobal);
4346
4347 return {{
4348 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4349 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4350 }};
4351}
4352
4353 InstructionSelector::ComplexRendererFns
4354AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
4355 auto PtrWithOffset = selectFlatOffsetImpl(Root, SIInstrFlags::FlatScratch);
4356
4357 return {{
4358 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrWithOffset.first); },
4359 [=](MachineInstrBuilder &MIB) { MIB.addImm(PtrWithOffset.second); },
4360 }};
4361}
4362
4363// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
4364 InstructionSelector::ComplexRendererFns
4365AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {
4366 Register Addr = Root.getReg();
4367 Register PtrBase;
4368 int64_t ConstOffset;
4369 int64_t ImmOffset = 0;
4370
4371 // Match the immediate offset first, which canonically is moved as low as
4372 // possible.
4373 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4374
4375 if (ConstOffset != 0) {
4376 if (TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::GLOBAL_ADDRESS,
4377 SIInstrFlags::FlatGlobal)) {
4378 Addr = PtrBase;
4379 ImmOffset = ConstOffset;
4380 } else {
4381 auto PtrBaseDef = getDefSrcRegIgnoringCopies(PtrBase, *MRI);
4382 if (isSGPR(PtrBaseDef->Reg)) {
4383 if (ConstOffset > 0) {
4384 // Offset is too large.
4385 //
4386 // saddr + large_offset -> saddr +
4387 // (voffset = large_offset & ~MaxOffset) +
4388 // (large_offset & MaxOffset);
4389 int64_t SplitImmOffset, RemainderOffset;
4390 std::tie(SplitImmOffset, RemainderOffset) = TII.splitFlatOffset(
4391 ConstOffset, AMDGPUAS::GLOBAL_ADDRESS, SIInstrFlags::FlatGlobal);
4392
4393 if (isUInt<32>(RemainderOffset)) {
4394 MachineInstr *MI = Root.getParent();
4395 MachineBasicBlock *MBB = MI->getParent();
4396 Register HighBits =
4397 MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4398
4399 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4400 HighBits)
4401 .addImm(RemainderOffset);
4402
4403 return {{
4404 [=](MachineInstrBuilder &MIB) { MIB.addReg(PtrBase); }, // saddr
4405 [=](MachineInstrBuilder &MIB) {
4406 MIB.addReg(HighBits);
4407 }, // voffset
4408 [=](MachineInstrBuilder &MIB) { MIB.addImm(SplitImmOffset); },
4409 }};
4410 }
4411 }
4412
4413 // We are adding a 64 bit SGPR and a constant. If constant bus limit
4414 // is 1 we would need to perform 1 or 2 extra moves for each half of
4415 // the constant and it is better to do a scalar add and then issue a
4416 // single VALU instruction to materialize zero. Otherwise it is less
4417 // instructions to perform VALU adds with immediates or inline literals.
4418 unsigned NumLiterals =
4419 !TII.isInlineConstant(APInt(32, ConstOffset & 0xffffffff)) +
4420 !TII.isInlineConstant(APInt(32, ConstOffset >> 32));
4421 if (STI.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) > NumLiterals)
4422 return std::nullopt;
4423 }
4424 }
4425 }
4426
4427 // Match the variable offset.
4428 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4429 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4430 // Look through the SGPR->VGPR copy.
4431 Register SAddr =
4432 getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
4433
4434 if (isSGPR(SAddr)) {
4435 Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
4436
4437 // It's possible voffset is an SGPR here, but the copy to VGPR will be
4438 // inserted later.
4439 if (Register VOffset = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
4440 return {{[=](MachineInstrBuilder &MIB) { // saddr
4441 MIB.addReg(SAddr);
4442 },
4443 [=](MachineInstrBuilder &MIB) { // voffset
4444 MIB.addReg(VOffset);
4445 },
4446 [=](MachineInstrBuilder &MIB) { // offset
4447 MIB.addImm(ImmOffset);
4448 }}};
4449 }
4450 }
4451 }
4452
4453 // FIXME: We should probably have folded COPY (G_IMPLICIT_DEF) earlier, and
4454 // drop this.
4455 if (AddrDef->MI->getOpcode() == AMDGPU::G_IMPLICIT_DEF ||
4456 AddrDef->MI->getOpcode() == AMDGPU::G_CONSTANT || !isSGPR(AddrDef->Reg))
4457 return std::nullopt;
4458
4459 // It's cheaper to materialize a single 32-bit zero for vaddr than the two
4460 // moves required to copy a 64-bit SGPR to VGPR.
4461 MachineInstr *MI = Root.getParent();
4462 MachineBasicBlock *MBB = MI->getParent();
4463 Register VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4464
4465 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
4466 .addImm(0);
4467
4468 return {{
4469 [=](MachineInstrBuilder &MIB) { MIB.addReg(AddrDef->Reg); }, // saddr
4470 [=](MachineInstrBuilder &MIB) { MIB.addReg(VOffset); }, // voffset
4471 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4472 }};
4473}
4474
4475 InstructionSelector::ComplexRendererFns
4476AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
4477 Register Addr = Root.getReg();
4478 Register PtrBase;
4479 int64_t ConstOffset;
4480 int64_t ImmOffset = 0;
4481
4482 // Match the immediate offset first, which canonically is moved as low as
4483 // possible.
4484 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4485
4486 if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
4487 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
4488 SIInstrFlags::FlatScratch)) {
4489 Addr = PtrBase;
4490 ImmOffset = ConstOffset;
4491 }
4492
4493 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4494 if (AddrDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4495 int FI = AddrDef->MI->getOperand(1).getIndex();
4496 return {{
4497 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4498 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4499 }};
4500 }
4501
4502 Register SAddr = AddrDef->Reg;
4503
4504 if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
4505 Register LHS = AddrDef->MI->getOperand(1).getReg();
4506 Register RHS = AddrDef->MI->getOperand(2).getReg();
4507 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4508 auto RHSDef = getDefSrcRegIgnoringCopies(RHS, *MRI);
4509
4510 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX &&
4511 isSGPR(RHSDef->Reg)) {
4512 int FI = LHSDef->MI->getOperand(1).getIndex();
4513 MachineInstr &I = *Root.getParent();
4514 MachineBasicBlock *BB = I.getParent();
4515 const DebugLoc &DL = I.getDebugLoc();
4516 SAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
4517
4518 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
4519 .addFrameIndex(FI)
4520 .addReg(RHSDef->Reg)
4521 .setOperandDead(3); // Dead scc
4522 }
4523 }
4524
4525 if (!isSGPR(SAddr))
4526 return std::nullopt;
4527
4528 return {{
4529 [=](MachineInstrBuilder &MIB) { MIB.addReg(SAddr); }, // saddr
4530 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4531 }};
4532}
4533
4534// Check whether the flat scratch SVS swizzle bug affects this access.
4535bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
4536 Register VAddr, Register SAddr, uint64_t ImmOffset) const {
4537 if (!Subtarget->hasFlatScratchSVSSwizzleBug())
4538 return false;
4539
4540 // The bug affects the swizzling of SVS accesses if there is any carry out
4541 // from the two low order bits (i.e. from bit 1 into bit 2) when adding
4542 // voffset to (soffset + inst_offset).
4543 auto VKnown = KB->getKnownBits(VAddr);
4544 auto SKnown = KnownBits::computeForAddSub(
4545 /*Add=*/true, /*NSW=*/false, /*NUW=*/false, KB->getKnownBits(SAddr),
4546 KnownBits::makeConstant(APInt(32, ImmOffset)));
4547 uint64_t VMax = VKnown.getMaxValue().getZExtValue();
4548 uint64_t SMax = SKnown.getMaxValue().getZExtValue();
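 // Conservatively use the maximum possible values of the two addends: a
 // carry out of bit 1 is only possible if their low two bits can sum to 4
 // or more.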
4549 return (VMax & 3) + (SMax & 3) >= 4;
4550}
4551
4552 InstructionSelector::ComplexRendererFns
4553AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
4554 Register Addr = Root.getReg();
4555 Register PtrBase;
4556 int64_t ConstOffset;
4557 int64_t ImmOffset = 0;
4558
4559 // Match the immediate offset first, which canonically is moved as low as
4560 // possible.
4561 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
4562
4563 Register OrigAddr = Addr;
4564 if (ConstOffset != 0 &&
4565 TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
4566 Addr = PtrBase;
4567 ImmOffset = ConstOffset;
4568 }
4569
4570 auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
4571 if (AddrDef->MI->getOpcode() != AMDGPU::G_PTR_ADD)
4572 return std::nullopt;
4573
4574 Register RHS = AddrDef->MI->getOperand(2).getReg();
4575 if (RBI.getRegBank(RHS, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID)
4576 return std::nullopt;
4577
4578 Register LHS = AddrDef->MI->getOperand(1).getReg();
4579 auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
4580
4581 if (OrigAddr != Addr) {
4582 if (!isFlatScratchBaseLegalSVImm(OrigAddr))
4583 return std::nullopt;
4584 } else {
4585 if (!isFlatScratchBaseLegalSV(OrigAddr))
4586 return std::nullopt;
4587 }
4588
4589 if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
4590 return std::nullopt;
4591
4592 if (LHSDef->MI->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4593 int FI = LHSDef->MI->getOperand(1).getIndex();
4594 return {{
4595 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4596 [=](MachineInstrBuilder &MIB) { MIB.addFrameIndex(FI); }, // saddr
4597 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4598 }};
4599 }
4600
4601 if (!isSGPR(LHS))
4602 return std::nullopt;
4603
4604 return {{
4605 [=](MachineInstrBuilder &MIB) { MIB.addReg(RHS); }, // vaddr
4606 [=](MachineInstrBuilder &MIB) { MIB.addReg(LHS); }, // saddr
4607 [=](MachineInstrBuilder &MIB) { MIB.addImm(ImmOffset); } // offset
4608 }};
4609}
4610
4611 InstructionSelector::ComplexRendererFns
4612AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
4613 MachineInstr *MI = Root.getParent();
4614 MachineBasicBlock *MBB = MI->getParent();
4615 MachineFunction *MF = MBB->getParent();
4616 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4617
4618 int64_t Offset = 0;
4619 if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
4620 Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
4621 Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4622
4623 // TODO: Should this be inside the render function? The iterator seems to
4624 // move.
4625 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
4626 BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
4627 HighBits)
4628 .addImm(Offset & ~MaxOffset);
4629
4630 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4631 MIB.addReg(Info->getScratchRSrcReg());
4632 },
4633 [=](MachineInstrBuilder &MIB) { // vaddr
4634 MIB.addReg(HighBits);
4635 },
4636 [=](MachineInstrBuilder &MIB) { // soffset
4637 // Use constant zero for soffset and rely on eliminateFrameIndex
4638 // to choose the appropriate frame register if need be.
4639 MIB.addImm(0);
4640 },
4641 [=](MachineInstrBuilder &MIB) { // offset
4642 MIB.addImm(Offset & MaxOffset);
4643 }}};
4644 }
4645
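 // The address was not folded as a plain constant offset above; Offset is
 // either still zero or the constant value that was rejected by the check
 // above.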
4646 assert(Offset == 0 || Offset == -1);
4647
4648 // Try to fold a frame index directly into the MUBUF vaddr field, and any
4649 // offsets.
4650 std::optional<int> FI;
4651 Register VAddr = Root.getReg();
4652 if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
4653 Register PtrBase;
4654 int64_t ConstOffset;
4655 std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
4656 if (ConstOffset != 0) {
4657 if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
4658 (!STI.privateMemoryResourceIsRangeChecked() ||
4659 KB->signBitIsZero(PtrBase))) {
4660 const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
4661 if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
4662 FI = PtrBaseDef->getOperand(1).getIndex();
4663 else
4664 VAddr = PtrBase;
4665 Offset = ConstOffset;
4666 }
4667 } else if (RootDef->getOpcode() == AMDGPU::G_FRAME_INDEX) {
4668 FI = RootDef->getOperand(1).getIndex();
4669 }
4670 }
4671
4672 return {{[=](MachineInstrBuilder &MIB) { // rsrc
4673 MIB.addReg(Info->getScratchRSrcReg());
4674 },
4675 [=](MachineInstrBuilder &MIB) { // vaddr
4676 if (FI)
4677 MIB.addFrameIndex(*FI);
4678 else
4679 MIB.addReg(VAddr);
4680 },
4681 [=](MachineInstrBuilder &MIB) { // soffset
4682 // Use constant zero for soffset and rely on eliminateFrameIndex
4683 // to choose the appropriate frame register if need be.
4684 MIB.addImm(0);
4685 },
4686 [=](MachineInstrBuilder &MIB) { // offset
4687 MIB.addImm(Offset);
4688 }}};
4689}
4690
4691bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
4692 int64_t Offset) const {
4693 if (!isUInt<16>(Offset))
4694 return false;
4695
4696 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4697 return true;
4698
4699 // On Southern Islands, instructions with a negative base value and an
4700 // offset don't seem to work.
4701 return KB->signBitIsZero(Base);
4702}
4703
4704bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
4705 int64_t Offset1,
4706 unsigned Size) const {
4707 if (Offset0 % Size != 0 || Offset1 % Size != 0)
4708 return false;
4709 if (!isUInt<8>(Offset0 / Size) || !isUInt<8>(Offset1 / Size))
4710 return false;
4711
4712 if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
4713 return true;
4714
4715 // On Southern Islands, instructions with a negative base value and an
4716 // offset don't seem to work.
4717 return KB->signBitIsZero(Base);
4718}
4719
4720// Return whether the operation has NoUnsignedWrap property.
4721 static bool isNoUnsignedWrap(MachineInstr *Addr) {
4722 return Addr->getOpcode() == TargetOpcode::G_OR ||
4723 (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
4724 Addr->getFlag(MachineInstr::NoUWrap));
4725}
4726
4727 // Check that the base address of a flat scratch load/store in the form of
4728 // `base + offset` is legal to be put in an SGPR/VGPR (i.e. unsigned per
4729 // hardware requirement). We always treat the first operand as the base address here.
4730bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
4731 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4732
4733 if (isNoUnsignedWrap(AddrMI))
4734 return true;
4735
4736 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4737 // values.
4738 if (STI.hasSignedScratchOffsets())
4739 return true;
4740
4741 Register LHS = AddrMI->getOperand(1).getReg();
4742 Register RHS = AddrMI->getOperand(2).getReg();
4743
4744 if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
4745 std::optional<ValueAndVReg> RhsValReg =
4746 getIConstantVRegValWithLookThrough(RHS, *MRI);
4747 // If the immediate offset is negative and within certain range, the base
4748 // address cannot also be negative. If the base is also negative, the sum
4749 // would be either negative or much larger than the valid range of scratch
4750 // memory a thread can access.
4751 if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
4752 RhsValReg->Value.getSExtValue() > -0x40000000)
4753 return true;
4754 }
4755
4756 return KB->signBitIsZero(LHS);
4757}
4758
4759 // Check that the address values in SGPR/VGPR are legal for flat scratch in
4760 // the form of: SGPR + VGPR.
4761bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
4762 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4763
4764 if (isNoUnsignedWrap(AddrMI))
4765 return true;
4766
4767 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4768 // values.
4769 if (STI.hasSignedScratchOffsets())
4770 return true;
4771
4772 Register LHS = AddrMI->getOperand(1).getReg();
4773 Register RHS = AddrMI->getOperand(2).getReg();
4774 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4775}
4776
4777 // Check that the address values in SGPR/VGPR are legal for flat scratch in
4778 // the form of: SGPR + VGPR + Imm.
4779bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
4780 Register Addr) const {
4781 // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
4782 // values.
4783 if (STI.hasSignedScratchOffsets())
4784 return true;
4785
4786 MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
4787 Register Base = AddrMI->getOperand(1).getReg();
4788 std::optional<DefinitionAndSourceRegister> BaseDef =
4789 getDefSrcRegIgnoringCopies(Base, *MRI);
4790 std::optional<ValueAndVReg> RHSOffset =
4791 getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
4792 assert(RHSOffset);
4793
4794 // If the immediate offset is negative and within certain range, the base
4795 // address cannot also be negative. If the base is also negative, the sum
4796 // would be either negative or much larger than the valid range of scratch
4797 // memory a thread can access.
4798 if (isNoUnsignedWrap(BaseDef->MI) &&
4799 (isNoUnsignedWrap(AddrMI) ||
4800 (RHSOffset->Value.getSExtValue() < 0 &&
4801 RHSOffset->Value.getSExtValue() > -0x40000000)))
4802 return true;
4803
4804 Register LHS = BaseDef->MI->getOperand(1).getReg();
4805 Register RHS = BaseDef->MI->getOperand(2).getReg();
4806 return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
4807}
4808
4809bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
4810 unsigned ShAmtBits) const {
4811 assert(MI.getOpcode() == TargetOpcode::G_AND);
4812
4813 std::optional<APInt> RHS =
4814 getIConstantVRegVal(MI.getOperand(2).getReg(), *MRI);
4815 if (!RHS)
4816 return false;
4817
4818 if (RHS->countr_one() >= ShAmtBits)
4819 return true;
4820
4821 const APInt &LHSKnownZeros = KB->getKnownZeroes(MI.getOperand(1).getReg());
4822 return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
4823}
4824
4825 InstructionSelector::ComplexRendererFns
4826AMDGPUInstructionSelector::selectMUBUFScratchOffset(
4827 MachineOperand &Root) const {
4828 Register Reg = Root.getReg();
4829 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
4830
4831 std::optional<DefinitionAndSourceRegister> Def =
4832 getDefSrcRegIgnoringCopies(Reg, *MRI);
4833 assert(Def && "this shouldn't be an optional result");
4834 Reg = Def->Reg;
4835
4836 if (Register WaveBase = getWaveAddress(Def->MI)) {
4837 return {{
4838 [=](MachineInstrBuilder &MIB) { // rsrc
4839 MIB.addReg(Info->getScratchRSrcReg());
4840 },
4841 [=](MachineInstrBuilder &MIB) { // soffset
4842 MIB.addReg(WaveBase);
4843 },
4844 [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // offset
4845 }};
4846 }
4847
4848 int64_t Offset = 0;
4849
4850 // FIXME: Copy check is a hack
4851 Register BasePtr;
4852 if (mi_match(Reg, *MRI,
4853 m_GPtrAdd(m_Reg(BasePtr),
4854 m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
4855 if (!TII.isLegalMUBUFImmOffset(Offset))
4856 return {};
4857 MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
4858 Register WaveBase = getWaveAddress(BasePtrDef);
4859 if (!WaveBase)
4860 return {};
4861
4862 return {{
4863 [=](MachineInstrBuilder &MIB) { // rsrc
4864 MIB.addReg(Info->getScratchRSrcReg());
4865 },
4866 [=](MachineInstrBuilder &MIB) { // soffset
4867 MIB.addReg(WaveBase);
4868 },
4869 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4870 }};
4871 }
4872
4873 if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
4874 !TII.isLegalMUBUFImmOffset(Offset))
4875 return {};
4876
4877 return {{
4878 [=](MachineInstrBuilder &MIB) { // rsrc
4879 MIB.addReg(Info->getScratchRSrcReg());
4880 },
4881 [=](MachineInstrBuilder &MIB) { // soffset
4882 MIB.addImm(0);
4883 },
4884 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
4885 }};
4886}
4887
4888std::pair<Register, unsigned>
4889AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
4890 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4891 if (!RootDef)
4892 return std::pair(Root.getReg(), 0);
4893
4894 int64_t ConstAddr = 0;
4895
4896 Register PtrBase;
4897 int64_t Offset;
4898 std::tie(PtrBase, Offset) =
4899 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4900
4901 if (Offset) {
4902 if (isDSOffsetLegal(PtrBase, Offset)) {
4903 // (add n0, c0)
4904 return std::pair(PtrBase, Offset);
4905 }
4906 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4907 // TODO
4908
4909
4910 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4911 // TODO
4912
4913 }
4914
4915 return std::pair(Root.getReg(), 0);
4916}
4917
4918 InstructionSelector::ComplexRendererFns
4919AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
4920 Register Reg;
4921 unsigned Offset;
4922 std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
4923 return {{
4924 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4925 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
4926 }};
4927}
4928
4929 InstructionSelector::ComplexRendererFns
4930AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
4931 return selectDSReadWrite2(Root, 4);
4932}
4933
4934 InstructionSelector::ComplexRendererFns
4935AMDGPUInstructionSelector::selectDS128Bit8ByteAligned(MachineOperand &Root) const {
4936 return selectDSReadWrite2(Root, 8);
4937}
4938
4939 InstructionSelector::ComplexRendererFns
4940AMDGPUInstructionSelector::selectDSReadWrite2(MachineOperand &Root,
4941 unsigned Size) const {
4942 Register Reg;
4943 unsigned Offset;
4944 std::tie(Reg, Offset) = selectDSReadWrite2Impl(Root, Size);
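 // The two rendered offsets are in units of the element size: the impl
 // returns the base plus a scaled offset, and offset1 is simply the next
 // element slot.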
4945 return {{
4946 [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
4947 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
4948 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
4949 }};
4950}
4951
4952std::pair<Register, unsigned>
4953AMDGPUInstructionSelector::selectDSReadWrite2Impl(MachineOperand &Root,
4954 unsigned Size) const {
4955 const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
4956 if (!RootDef)
4957 return std::pair(Root.getReg(), 0);
4958
4959 int64_t ConstAddr = 0;
4960
4961 Register PtrBase;
4962 int64_t Offset;
4963 std::tie(PtrBase, Offset) =
4964 getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
4965
4966 if (Offset) {
4967 int64_t OffsetValue0 = Offset;
4968 int64_t OffsetValue1 = Offset + Size;
4969 if (isDSOffset2Legal(PtrBase, OffsetValue0, OffsetValue1, Size)) {
4970 // (add n0, c0)
4971 return std::pair(PtrBase, OffsetValue0 / Size);
4972 }
4973 } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
4974 // TODO
4975
4976 } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
4977 // TODO
4978
4979 }
4980
4981 return std::pair(Root.getReg(), 0);
4982}
4983
4984/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
4985/// the base value with the constant offset. There may be intervening copies
4986/// between \p Root and the identified constant. Returns \p Root, 0 if this does
4987/// not match the pattern.
4988std::pair<Register, int64_t>
4989AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
4990 Register Root, const MachineRegisterInfo &MRI) const {
4991 MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
4992 if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
4993 return {Root, 0};
4994
4995 MachineOperand &RHS = RootI->getOperand(2);
4996 std::optional<ValueAndVReg> MaybeOffset =
4997      getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
4998  if (!MaybeOffset)
4999 return {Root, 0};
5000 return {RootI->getOperand(1).getReg(), MaybeOffset->Value.getSExtValue()};
5001}
5002
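// Adds a zero immediate operand; used by the buffer complex patterns for the
// trailing cpol/tfe/swz operands.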
5003 static void addZeroImm(MachineInstrBuilder &MIB) {
5004  MIB.addImm(0);
5005}
5006
5007/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
5008/// BasePtr is not valid, a null base pointer will be used.
5009 static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5010                           uint32_t FormatLo, uint32_t FormatHi,
5011 Register BasePtr) {
5012 Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5013 Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5014 Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5015 Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
5016
5017 B.buildInstr(AMDGPU::S_MOV_B32)
5018 .addDef(RSrc2)
5019 .addImm(FormatLo);
5020 B.buildInstr(AMDGPU::S_MOV_B32)
5021 .addDef(RSrc3)
5022 .addImm(FormatHi);
5023
5024 // Build the half of the subregister with the constants before building the
5025 // full 128-bit register. If we are building multiple resource descriptors,
5026 // this will allow CSEing of the 2-component register.
5027 B.buildInstr(AMDGPU::REG_SEQUENCE)
5028 .addDef(RSrcHi)
5029 .addReg(RSrc2)
5030 .addImm(AMDGPU::sub0)
5031 .addReg(RSrc3)
5032 .addImm(AMDGPU::sub1);
5033
5034 Register RSrcLo = BasePtr;
5035 if (!BasePtr) {
5036 RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
5037 B.buildInstr(AMDGPU::S_MOV_B64)
5038 .addDef(RSrcLo)
5039 .addImm(0);
5040 }
5041
5042 B.buildInstr(AMDGPU::REG_SEQUENCE)
5043 .addDef(RSrc)
5044 .addReg(RSrcLo)
5045 .addImm(AMDGPU::sub0_sub1)
5046 .addReg(RSrcHi)
5047 .addImm(AMDGPU::sub2_sub3);
5048
5049 return RSrc;
5050}
5051
5052 static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5053                                 const SIInstrInfo &TII, Register BasePtr) {
5054 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5055
5056 // FIXME: Why are half the "default" bits ignored based on the addressing
5057 // mode?
5058 return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
5059}
5060
5061 static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
5062                                const SIInstrInfo &TII, Register BasePtr) {
5063 uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
5064
5065 // FIXME: Why are half the "default" bits ignored based on the addressing
5066 // mode?
5067 return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
5068}
5069
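// Decompose a MUBUF address into its components: a constant offset that fits
// in 32 bits, and, if the remaining base is itself a G_PTR_ADD, the two added
// registers (N2, N3).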
5070AMDGPUInstructionSelector::MUBUFAddressData
5071AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
5072 MUBUFAddressData Data;
5073 Data.N0 = Src;
5074
5075 Register PtrBase;
5076 int64_t Offset;
5077
5078 std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
5079 if (isUInt<32>(Offset)) {
5080 Data.N0 = PtrBase;
5081 Data.Offset = Offset;
5082 }
5083
5084 if (MachineInstr *InputAdd
5085 = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
5086 Data.N2 = InputAdd->getOperand(1).getReg();
5087 Data.N3 = InputAdd->getOperand(2).getReg();
5088
5089    // FIXME: Need to fix extra SGPR->VGPR copies inserted
5090 // FIXME: Don't know this was defined by operand 0
5091 //
5092 // TODO: Remove this when we have copy folding optimizations after
5093 // RegBankSelect.
5094 Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
5095 Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
5096 }
5097
5098 return Data;
5099}
5100
5101 /// Return whether the addr64 MUBUF mode should be used for the given address.
5102bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
5103 // (ptr_add N2, N3) -> addr64, or
5104 // (ptr_add (ptr_add N2, N3), C1) -> addr64
5105 if (Addr.N2)
5106 return true;
5107
5108 const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
5109 return N0Bank->getID() == AMDGPU::VGPRRegBankID;
5110}
5111
5112/// Split an immediate offset \p ImmOffset depending on whether it fits in the
5113/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
5114/// component.
5115void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
5116 MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
5117 if (TII.isLegalMUBUFImmOffset(ImmOffset))
5118 return;
5119
5120 // Illegal offset, store it in soffset.
5121 SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5122 B.buildInstr(AMDGPU::S_MOV_B32)
5123 .addDef(SOffset)
5124 .addImm(ImmOffset);
5125 ImmOffset = 0;
5126}
5127
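// Match the addr64 form of a MUBUF access: decide which components of the
// address become the 64-bit VGPR address, the SRD base pointer, and the
// immediate offset, based on which components are divergent.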
5128bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
5129 MachineOperand &Root, Register &VAddr, Register &RSrcReg,
5130 Register &SOffset, int64_t &Offset) const {
5131 // FIXME: Predicates should stop this from reaching here.
5132 // addr64 bit was removed for volcanic islands.
5133 if (!STI.hasAddr64() || STI.useFlatForGlobal())
5134 return false;
5135
5136 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5137 if (!shouldUseAddr64(AddrData))
5138 return false;
5139
5140 Register N0 = AddrData.N0;
5141 Register N2 = AddrData.N2;
5142 Register N3 = AddrData.N3;
5143 Offset = AddrData.Offset;
5144
5145 // Base pointer for the SRD.
5146 Register SRDPtr;
5147
5148 if (N2) {
5149 if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5150 assert(N3);
5151 if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5152 // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
5153 // addr64, and construct the default resource from a 0 address.
5154 VAddr = N0;
5155 } else {
5156 SRDPtr = N3;
5157 VAddr = N2;
5158 }
5159 } else {
5160 // N2 is not divergent.
5161 SRDPtr = N2;
5162 VAddr = N3;
5163 }
5164 } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
5165 // Use the default null pointer in the resource
5166 VAddr = N0;
5167 } else {
5168 // N0 -> offset, or
5169 // (N0 + C1) -> offset
5170 SRDPtr = N0;
5171 }
5172
5173 MachineIRBuilder B(*Root.getParent());
5174 RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
5175 splitIllegalMUBUFOffset(B, SOffset, Offset);
5176 return true;
5177}
5178
5179bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
5180 MachineOperand &Root, Register &RSrcReg, Register &SOffset,
5181 int64_t &Offset) const {
5182
5183 // FIXME: Pattern should not reach here.
5184 if (STI.useFlatForGlobal())
5185 return false;
5186
5187 MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
5188 if (shouldUseAddr64(AddrData))
5189 return false;
5190
5191 // N0 -> offset, or
5192 // (N0 + C1) -> offset
5193 Register SRDPtr = AddrData.N0;
5194 Offset = AddrData.Offset;
5195
5196 // TODO: Look through extensions for 32-bit soffset.
5197 MachineIRBuilder B(*Root.getParent());
5198
5199 RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
5200 splitIllegalMUBUFOffset(B, SOffset, Offset);
5201 return true;
5202}
5203
5204 AMDGPUInstructionSelector::ComplexRendererFns
5205 AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
5206 Register VAddr;
5207 Register RSrcReg;
5208 Register SOffset;
5209 int64_t Offset = 0;
5210
5211 if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
5212 return {};
5213
5214 // FIXME: Use defaulted operands for trailing 0s and remove from the complex
5215 // pattern.
5216 return {{
5217 [=](MachineInstrBuilder &MIB) { // rsrc
5218 MIB.addReg(RSrcReg);
5219 },
5220 [=](MachineInstrBuilder &MIB) { // vaddr
5221 MIB.addReg(VAddr);
5222 },
5223 [=](MachineInstrBuilder &MIB) { // soffset
5224 if (SOffset)
5225 MIB.addReg(SOffset);
5226 else if (STI.hasRestrictedSOffset())
5227 MIB.addReg(AMDGPU::SGPR_NULL);
5228 else
5229 MIB.addImm(0);
5230 },
5231 [=](MachineInstrBuilder &MIB) { // offset
5232 MIB.addImm(Offset);
5233 },
5234 addZeroImm, // cpol
5235 addZeroImm, // tfe
5236 addZeroImm // swz
5237 }};
5238}
5239
5240 AMDGPUInstructionSelector::ComplexRendererFns
5241 AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
5242 Register RSrcReg;
5243 Register SOffset;
5244 int64_t Offset = 0;
5245
5246 if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
5247 return {};
5248
5249 return {{
5250 [=](MachineInstrBuilder &MIB) { // rsrc
5251 MIB.addReg(RSrcReg);
5252 },
5253 [=](MachineInstrBuilder &MIB) { // soffset
5254 if (SOffset)
5255 MIB.addReg(SOffset);
5256 else if (STI.hasRestrictedSOffset())
5257 MIB.addReg(AMDGPU::SGPR_NULL);
5258 else
5259 MIB.addImm(0);
5260 },
5261 [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
5262 addZeroImm, // cpol
5263 addZeroImm, // tfe
5264 addZeroImm, // swz
5265 }};
5266}
5267
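// Select a raw SGPR soffset operand; on subtargets with the restricted
// soffset encoding, a constant zero is replaced with the null SGPR.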
5268 AMDGPUInstructionSelector::ComplexRendererFns
5269 AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
5270
5271 Register SOffset = Root.getReg();
5272
5273 if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
5274 SOffset = AMDGPU::SGPR_NULL;
5275
5276 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
5277}
5278
5279/// Get an immediate that must be 32-bits, and treated as zero extended.
5280static std::optional<uint64_t>
5281 getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
5282  // getIConstantVRegVal sexts any values, so see if that matters.
5283 std::optional<int64_t> OffsetVal = getIConstantVRegSExtVal(Reg, MRI);
5284 if (!OffsetVal || !isInt<32>(*OffsetVal))
5285 return std::nullopt;
5286 return Lo_32(*OffsetVal);
5287}
5288
5289 AMDGPUInstructionSelector::ComplexRendererFns
5290 AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
5291 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5292 if (!OffsetVal)
5293 return {};
5294
5295 std::optional<int64_t> EncodedImm =
5296 AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
5297 if (!EncodedImm)
5298 return {};
5299
5300 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5301}
5302
5303 AMDGPUInstructionSelector::ComplexRendererFns
5304 AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
5305  assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
5306
5307 std::optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
5308 if (!OffsetVal)
5309 return {};
5310
5311 std::optional<int64_t> EncodedImm =
5312      AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
5313  if (!EncodedImm)
5314 return {};
5315
5316 return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
5317}
5318
5319 AMDGPUInstructionSelector::ComplexRendererFns
5320 AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
5321 // Match the (soffset + offset) pair as a 32-bit register base and
5322 // an immediate offset.
5323 Register SOffset;
5324 unsigned Offset;
5325 std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
5326 *MRI, Root.getReg(), KB, /*CheckNUW*/ true);
5327 if (!SOffset)
5328 return std::nullopt;
5329
5330 std::optional<int64_t> EncodedOffset =
5331 AMDGPU::getSMRDEncodedOffset(STI, Offset, /* IsBuffer */ true);
5332 if (!EncodedOffset)
5333 return std::nullopt;
5334
5335 assert(MRI->getType(SOffset) == LLT::scalar(32));
5336 return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); },
5337 [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedOffset); }}};
5338}
5339
5340// Variant of stripBitCast that returns the instruction instead of a
5341// MachineOperand.
5342 static MachineInstr *stripBitCast(MachineInstr *MI, MachineRegisterInfo &MRI) {
5343  if (MI->getOpcode() == AMDGPU::G_BITCAST)
5344 return getDefIgnoringCopies(MI->getOperand(1).getReg(), MRI);
5345 return MI;
5346}
5347
5348// Figure out if this is really an extract of the high 16-bits of a dword,
5349// returns nullptr if it isn't.
5350 static MachineInstr *isExtractHiElt(MachineInstr *Inst,
5351                                     MachineRegisterInfo &MRI) {
5352  Inst = stripBitCast(Inst, MRI);
5353
5354 if (Inst->getOpcode() != AMDGPU::G_TRUNC)
5355 return nullptr;
5356
5357 MachineInstr *TruncOp =
5358      getDefIgnoringCopies(Inst->getOperand(1).getReg(), MRI);
5359  TruncOp = stripBitCast(TruncOp, MRI);
5360
5361 // G_LSHR x, (G_CONSTANT i32 16)
5362 if (TruncOp->getOpcode() == AMDGPU::G_LSHR) {
5363 auto SrlAmount = getIConstantVRegValWithLookThrough(
5364 TruncOp->getOperand(2).getReg(), MRI);
5365 if (SrlAmount && SrlAmount->Value.getZExtValue() == 16) {
5366 MachineInstr *SrlOp =
5367 getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5368 return stripBitCast(SrlOp, MRI);
5369 }
5370 }
5371
5372 // G_SHUFFLE_VECTOR x, y, shufflemask(1, 1|0)
5373 // 1, 0 swaps the low/high 16 bits.
5374 // 1, 1 sets the high 16 bits to be the same as the low 16.
5375 // in any case, it selects the high elts.
5376 if (TruncOp->getOpcode() == AMDGPU::G_SHUFFLE_VECTOR) {
5377 assert(MRI.getType(TruncOp->getOperand(0).getReg()) ==
5378 LLT::fixed_vector(2, 16));
5379
5380 ArrayRef<int> Mask = TruncOp->getOperand(3).getShuffleMask();
5381 assert(Mask.size() == 2);
5382
5383 if (Mask[0] == 1 && Mask[1] <= 1) {
5384 MachineInstr *LHS =
5385 getDefIgnoringCopies(TruncOp->getOperand(1).getReg(), MRI);
5386 return stripBitCast(LHS, MRI);
5387 }
5388 }
5389
5390 return nullptr;
5391}
5392
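// Match a mad_mix source operand: look through a G_FPEXT from f16 (and
// bitcasts), fold abs/neg modifiers, and use op_sel/op_sel_hi to mark the
// operand as f16 and, if needed, to pick the high half of the 32-bit source
// register. Matched is set only when the f16 extension pattern is found.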
5393std::pair<Register, unsigned>
5394AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
5395 bool &Matched) const {
5396 Matched = false;
5397
5398 Register Src;
5399 unsigned Mods;
5400 std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
5401
5402 MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
5403 if (MI->getOpcode() == AMDGPU::G_FPEXT) {
5404 MachineOperand *MO = &MI->getOperand(1);
5405 Src = MO->getReg();
5406 MI = getDefIgnoringCopies(Src, *MRI);
5407
5408 assert(MRI->getType(Src) == LLT::scalar(16));
5409
5410 // See through bitcasts.
5411 // FIXME: Would be nice to use stripBitCast here.
5412 if (MI->getOpcode() == AMDGPU::G_BITCAST) {
5413 MO = &MI->getOperand(1);
5414 Src = MO->getReg();
5415 MI = getDefIgnoringCopies(Src, *MRI);
5416 }
5417
5418 const auto CheckAbsNeg = [&]() {
5419 // Be careful about folding modifiers if we already have an abs. fneg is
5420 // applied last, so we don't want to apply an earlier fneg.
5421 if ((Mods & SISrcMods::ABS) == 0) {
5422 unsigned ModsTmp;
5423 std::tie(Src, ModsTmp) = selectVOP3ModsImpl(*MO);
5424 MI = getDefIgnoringCopies(Src, *MRI);
5425
5426 if ((ModsTmp & SISrcMods::NEG) != 0)
5427 Mods ^= SISrcMods::NEG;
5428
5429 if ((ModsTmp & SISrcMods::ABS) != 0)
5430 Mods |= SISrcMods::ABS;
5431 }
5432 };
5433
5434 CheckAbsNeg();
5435
5436 // op_sel/op_sel_hi decide the source type and source.
5437 // If the source's op_sel_hi is set, it indicates to do a conversion from
5438    // fp16. If the source's op_sel is set, it picks the high half of the
5439 // source register.
5440
5441 Mods |= SISrcMods::OP_SEL_1;
5442
5443 if (MachineInstr *ExtractHiEltMI = isExtractHiElt(MI, *MRI)) {
5444 Mods |= SISrcMods::OP_SEL_0;
5445 MI = ExtractHiEltMI;
5446 MO = &MI->getOperand(0);
5447 Src = MO->getReg();
5448
5449 CheckAbsNeg();
5450 }
5451
5452 Matched = true;
5453 }
5454
5455 return {Src, Mods};
5456}
5457
5458 AMDGPUInstructionSelector::ComplexRendererFns
5459 AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
5460 MachineOperand &Root) const {
5461 Register Src;
5462 unsigned Mods;
5463 bool Matched;
5464 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5465 if (!Matched)
5466 return {};
5467
5468 return {{
5469 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5470 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5471 }};
5472}
5473
5474 AMDGPUInstructionSelector::ComplexRendererFns
5475 AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
5476 Register Src;
5477 unsigned Mods;
5478 bool Matched;
5479 std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
5480
5481 return {{
5482 [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
5483 [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
5484 }};
5485}
5486
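// Select s_barrier_signal_isfirst: the *_var intrinsic passes the barrier id
// in M0, the non-variable form encodes it as an immediate; the boolean result
// is copied out of SCC.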
5487bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
5488 MachineInstr &I, Intrinsic::ID IntrID) const {
5489 MachineBasicBlock *MBB = I.getParent();
5490 const DebugLoc &DL = I.getDebugLoc();
5491 Register CCReg = I.getOperand(0).getReg();
5492
5493 bool HasM0 = IntrID == Intrinsic::amdgcn_s_barrier_signal_isfirst_var;
5494
5495 if (HasM0) {
5496 auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
5497 .addReg(I.getOperand(2).getReg());
5498 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0));
5499 if (!constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI))
5500 return false;
5501 } else {
5502 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
5503 .addImm(I.getOperand(2).getImm());
5504 }
5505
5506 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5507
5508 I.eraseFromParent();
5509 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5510 *MRI);
5511}
5512
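// Map a named-barrier intrinsic to the IMM or M0 form of the corresponding
// machine instruction, depending on whether the barrier id is an inline
// constant.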
5513unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
5514 if (HasInlineConst) {
5515 switch (IntrID) {
5516 default:
5517 llvm_unreachable("not a named barrier op");
5518 case Intrinsic::amdgcn_s_barrier_init:
5519 return AMDGPU::S_BARRIER_INIT_IMM;
5520 case Intrinsic::amdgcn_s_barrier_join:
5521 return AMDGPU::S_BARRIER_JOIN_IMM;
5522 case Intrinsic::amdgcn_s_wakeup_barrier:
5523 return AMDGPU::S_WAKEUP_BARRIER_IMM;
5524 case Intrinsic::amdgcn_s_get_barrier_state:
5525 return AMDGPU::S_GET_BARRIER_STATE_IMM;
5526 };
5527 } else {
5528 switch (IntrID) {
5529 default:
5530 llvm_unreachable("not a named barrier op");
5531 case Intrinsic::amdgcn_s_barrier_init:
5532 return AMDGPU::S_BARRIER_INIT_M0;
5533 case Intrinsic::amdgcn_s_barrier_join:
5534 return AMDGPU::S_BARRIER_JOIN_M0;
5535 case Intrinsic::amdgcn_s_wakeup_barrier:
5536 return AMDGPU::S_WAKEUP_BARRIER_M0;
5537 case Intrinsic::amdgcn_s_get_barrier_state:
5538 return AMDGPU::S_GET_BARRIER_STATE_M0;
5539 };
5540 }
5541}
5542
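// Select the named-barrier intrinsics (init/join/wakeup/get_state). A
// non-inline barrier id, and for s_barrier_init the member count, are packed
// into M0; an inline barrier id is emitted as an immediate operand instead.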
5543bool AMDGPUInstructionSelector::selectNamedBarrierInst(
5544 MachineInstr &I, Intrinsic::ID IntrID) const {
5545 MachineBasicBlock *MBB = I.getParent();
5546 const DebugLoc &DL = I.getDebugLoc();
5547 MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state
5548 ? I.getOperand(2)
5549 : I.getOperand(1);
5550 std::optional<int64_t> BarValImm =
5551 getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
5552 Register M0Val;
5553 Register TmpReg0;
5554
5555 // For S_BARRIER_INIT, member count will always be read from M0[16:22]
5556 if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5557 Register MemberCount = I.getOperand(2).getReg();
5558 TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5559    // TODO: This should be expanded during legalization so that the S_LSHL
5560 // and S_OR can be constant-folded
5561 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
5562 .addImm(16)
5563 .addReg(MemberCount);
5564 M0Val = TmpReg0;
5565 }
5566
5567 // If not inlinable, get reference to barrier depending on the instruction
5568 if (!BarValImm) {
5569 if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
5570 // If reference to barrier id is not an inlinable constant then it must be
5571 // referenced with M0[4:0]. Perform an OR with the member count to include
5572 // it in M0 for S_BARRIER_INIT.
5573 Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
5574 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg1)
5575 .addReg(BarOp.getReg())
5576 .addReg(TmpReg0);
5577 M0Val = TmpReg1;
5578 } else {
5579 M0Val = BarOp.getReg();
5580 }
5581 }
5582
5583 // Build copy to M0 if needed. For S_BARRIER_INIT, M0 is always required.
5584 if (M0Val) {
5585 auto CopyMIB =
5586 BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val);
5587 constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
5588 }
5589
5590  MachineInstrBuilder MIB;
5591  unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
5592 MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
5593
5594 if (IntrID == Intrinsic::amdgcn_s_get_barrier_state)
5595 MIB.addDef(I.getOperand(0).getReg());
5596
5597 if (BarValImm)
5598 MIB.addImm(*BarValImm);
5599
5600 I.eraseFromParent();
5601 return true;
5602}
5603
5604bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const {
5605 MachineBasicBlock *BB = I.getParent();
5606 const DebugLoc &DL = I.getDebugLoc();
5607 Register CCReg = I.getOperand(0).getReg();
5608
5609 BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BARRIER_LEAVE));
5610 BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
5611
5612 I.eraseFromParent();
5613 return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
5614 *MRI);
5615}
5616
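// The render* helpers below convert matched constant operands into the
// immediate encodings expected by the selected instructions.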
5617void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
5618 const MachineInstr &MI,
5619 int OpIdx) const {
5620 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5621 "Expected G_CONSTANT");
5622 MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
5623}
5624
5625void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
5626 const MachineInstr &MI,
5627 int OpIdx) const {
5628 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5629 "Expected G_CONSTANT");
5630 MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue());
5631}
5632
5633void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB,
5634 const MachineInstr &MI,
5635 int OpIdx) const {
5636 assert(OpIdx == -1);
5637
5638 const MachineOperand &Op = MI.getOperand(1);
5639 if (MI.getOpcode() == TargetOpcode::G_FCONSTANT)
5640 MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
5641 else {
5642 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
5643 MIB.addImm(Op.getCImm()->getSExtValue());
5644 }
5645}
5646
5647void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
5648 const MachineInstr &MI,
5649 int OpIdx) const {
5650 assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
5651 "Expected G_CONSTANT");
5652 MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
5653}
5654
5655 /// This only really exists to satisfy DAG type-checking machinery, so it is
5656 /// a no-op here.
5657void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
5658 const MachineInstr &MI,
5659 int OpIdx) const {
5660 MIB.addImm(MI.getOperand(OpIdx).getImm());
5661}
5662
5663void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
5664 const MachineInstr &MI,
5665 int OpIdx) const {
5666 assert(OpIdx >= 0 && "expected to match an immediate operand");
5667 MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
5668}
5669
5670void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
5671 const MachineInstr &MI,
5672 int OpIdx) const {
5673 assert(OpIdx >= 0 && "expected to match an immediate operand");
5674  MIB.addImm(MI.getOperand(OpIdx).getImm() &
5675             (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5676                                       : AMDGPU::CPol::ALL_pregfx12));
5677 }
5678
5679void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
5680 const MachineInstr &MI,
5681 int OpIdx) const {
5682 assert(OpIdx >= 0 && "expected to match an immediate operand");
5683  const bool Swizzle = MI.getOperand(OpIdx).getImm() &
5684                       (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
5685                                                 : AMDGPU::CPol::SWZ_pregfx12);
5686  MIB.addImm(Swizzle);
5687}
5688
5689void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
5690 MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
5691 assert(OpIdx >= 0 && "expected to match an immediate operand");
5692  const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
5693                        (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
5694                                                  : AMDGPU::CPol::ALL_pregfx12);
5695  MIB.addImm(Cpol | AMDGPU::CPol::GLC);
5696}
5697
5698void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
5699 const MachineInstr &MI,
5700 int OpIdx) const {
5701 MIB.addFrameIndex(MI.getOperand(1).getIndex());
5702}
5703
5704void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
5705 const MachineInstr &MI,
5706 int OpIdx) const {
5707 const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
5708 int ExpVal = APF.getExactLog2Abs();
5709 assert(ExpVal != INT_MIN);
5710 MIB.addImm(ExpVal);
5711}
5712
5713bool AMDGPUInstructionSelector::isInlineImmediate(const APInt &Imm) const {
5714 return TII.isInlineConstant(Imm);
5715}
5716
5717bool AMDGPUInstructionSelector::isInlineImmediate(const APFloat &Imm) const {
5718 return TII.isInlineConstant(Imm);
5719}