1//===- SIInstrInfo.cpp - SI Instruction Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// SI Implementation of TargetInstrInfo.
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIInstrInfo.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "GCNHazardRecognizer.h"
18#include "GCNSubtarget.h"
31#include "llvm/IR/IntrinsicsAMDGPU.h"
32#include "llvm/MC/MCContext.h"
35
36using namespace llvm;
37
38#define DEBUG_TYPE "si-instr-info"
39
40#define GET_INSTRINFO_CTOR_DTOR
41#include "AMDGPUGenInstrInfo.inc"
42
43namespace llvm {
44namespace AMDGPU {
45#define GET_D16ImageDimIntrinsics_IMPL
46#define GET_ImageDimIntrinsicTable_IMPL
47#define GET_RsrcIntrinsics_IMPL
48#include "AMDGPUGenSearchableTables.inc"
49}
50}
51
52
53// Must be at least 4 to be able to branch over minimum unconditional branch
54// code. This is only for making it possible to write reasonably small tests for
55// long branches.
56static cl::opt<unsigned>
57BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
58 cl::desc("Restrict range of branch instructions (DEBUG)"));
59
60static cl::opt<bool> Fix16BitCopies(
61 "amdgpu-fix-16-bit-physreg-copies",
62 cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
63 cl::init(true),
64 cl::ReallyHidden);
65
66SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
67 : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
68 RI(ST), ST(ST) {
69 SchedModel.init(&ST);
70}
71
72//===----------------------------------------------------------------------===//
73// TargetInstrInfo callbacks
74//===----------------------------------------------------------------------===//
75
76static unsigned getNumOperandsNoGlue(SDNode *Node) {
77 unsigned N = Node->getNumOperands();
78 while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
79 --N;
80 return N;
81}
82
83/// Returns true if both nodes have the same value for the given
84/// operand \p Op, or if both nodes do not have this operand.
85static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
86 unsigned Opc0 = N0->getMachineOpcode();
87 unsigned Opc1 = N1->getMachineOpcode();
88
89 int Op0Idx = AMDGPU::getNamedOperandIdx(Opc0, OpName);
90 int Op1Idx = AMDGPU::getNamedOperandIdx(Opc1, OpName);
91
92 if (Op0Idx == -1 && Op1Idx == -1)
93 return true;
94
95
96 if ((Op0Idx == -1 && Op1Idx != -1) ||
97 (Op1Idx == -1 && Op0Idx != -1))
98 return false;
99
100 // getNamedOperandIdx returns the index for the MachineInstr's operands,
101 // which includes the result as the first operand. We are indexing into the
102 // MachineSDNode's operands, so we need to skip the result operand to get
103 // the real index.
104 --Op0Idx;
105 --Op1Idx;
106
107 return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
108}
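// Illustration (editorial, not part of the original source): if a named
// operand sits at MachineInstr operand index 2, the matching MachineSDNode
// input operand is at index 1, because the SDNode operand list omits the
// result that the MachineInstr counts first.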
109
110static bool canRemat(const MachineInstr &MI) {
111
112 if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
113 SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
114 SIInstrInfo::isSALU(MI))
115 return true;
116
117 if (SIInstrInfo::isSMRD(MI)) {
118 return !MI.memoperands_empty() &&
119 llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
120 return MMO->isLoad() && MMO->isInvariant();
121 });
122 }
123
124 return false;
125}
126
127bool SIInstrInfo::isReallyTriviallyReMaterializable(
128 const MachineInstr &MI) const {
129
130 if (canRemat(MI)) {
131 // Normally a VALU use of exec would block rematerialization, but an
132 // implicit exec read is OK here, since all VALU instructions have one.
133 // We really want all of the generic logic except for this one check.
134
135 // Another potential implicit use is mode register. The core logic of
136 // the RA will not attempt rematerialization if mode is set anywhere
137 // in the function, otherwise it is safe since mode is not changed.
138
139 // This differs from the generic method, which does not allow
140 // rematerialization if there are virtual register uses. We allow this,
141 // and therefore this method covers SOP instructions as well.
142 if (!MI.hasImplicitDef() &&
143 MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
144 !MI.mayRaiseFPException())
145 return true;
146 }
147
148 return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
149}
150
151// Returns true if the scalar result of a VALU instruction depends on exec.
152static bool resultDependsOnExec(const MachineInstr &MI) {
153 // Ignore comparisons which are only used masked with exec.
154 // This allows some hoisting/sinking of VALU comparisons.
155 if (MI.isCompare()) {
156 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
157 Register DstReg = MI.getOperand(0).getReg();
158 if (!DstReg.isVirtual())
159 return true;
160 for (MachineInstr &Use : MRI.use_nodbg_instructions(DstReg)) {
161 switch (Use.getOpcode()) {
162 case AMDGPU::S_AND_SAVEEXEC_B32:
163 case AMDGPU::S_AND_SAVEEXEC_B64:
164 break;
165 case AMDGPU::S_AND_B32:
166 case AMDGPU::S_AND_B64:
167 if (!Use.readsRegister(AMDGPU::EXEC, /*TRI=*/nullptr))
168 return true;
169 break;
170 default:
171 return true;
172 }
173 }
174 return false;
175 }
176
177 switch (MI.getOpcode()) {
178 default:
179 break;
180 case AMDGPU::V_READFIRSTLANE_B32:
181 return true;
182 }
183
184 return false;
185}
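// Illustration (editorial): a VALU compare whose result feeds only
// S_AND_SAVEEXEC_* or an S_AND_* that also reads $exec is reported as not
// depending on exec, which permits hoisting/sinking of the compare; any
// other use of the result, or a V_READFIRSTLANE_B32, is treated as
// exec-dependent.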
186
187bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
188 // Any implicit use of exec by VALU is not a real register read.
189 return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
190 isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
191}
192
193bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
194 MachineBasicBlock *SuccToSinkTo,
195 MachineCycleInfo *CI) const {
196 // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
197 if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
198 return true;
199
200 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
201 // Check if sinking of MI would create temporal divergent use.
202 for (auto Op : MI.uses()) {
203 if (Op.isReg() && Op.getReg().isVirtual() &&
204 RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
205 MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
206
207 // SgprDef defined inside cycle
208 MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
209 if (FromCycle == nullptr)
210 continue;
211
212 MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
213 // Check if there is a FromCycle that contains SgprDef's basic block but
214 // does not contain SuccToSinkTo and also has divergent exit condition.
215 while (FromCycle && !FromCycle->contains(ToCycle)) {
216 SmallVector<MachineBasicBlock *, 1> ExitingBlocks;
217 FromCycle->getExitingBlocks(ExitingBlocks);
218
219 // FromCycle has divergent exit condition.
220 for (MachineBasicBlock *ExitingBlock : ExitingBlocks) {
221 if (hasDivergentBranch(ExitingBlock))
222 return false;
223 }
224
225 FromCycle = FromCycle->getParentCycle();
226 }
227 }
228 }
229
230 return true;
231}
232
233bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
234 int64_t &Offset0,
235 int64_t &Offset1) const {
236 if (!Load0->isMachineOpcode() || !Load1->isMachineOpcode())
237 return false;
238
239 unsigned Opc0 = Load0->getMachineOpcode();
240 unsigned Opc1 = Load1->getMachineOpcode();
241
242 // Make sure both are actually loads.
243 if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
244 return false;
245
246 // A mayLoad instruction without a def is not a load. Likely a prefetch.
247 if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
248 return false;
249
250 if (isDS(Opc0) && isDS(Opc1)) {
251
252 // FIXME: Handle this case:
253 if (getNumOperandsNoGlue(Load0) != getNumOperandsNoGlue(Load1))
254 return false;
255
256 // Check base reg.
257 if (Load0->getOperand(0) != Load1->getOperand(0))
258 return false;
259
260 // Skip read2 / write2 variants for simplicity.
261 // TODO: We should report true if the used offsets are adjacent (excluding
262 // the st64 versions).
263 int Offset0Idx = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
264 int Offset1Idx = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
265 if (Offset0Idx == -1 || Offset1Idx == -1)
266 return false;
267
268 // XXX - be careful of dataless loads
269 // getNamedOperandIdx returns the index for MachineInstrs. Since they
270 // include the output in the operand list, but SDNodes don't, we need to
271 // subtract the index by one.
272 Offset0Idx -= get(Opc0).NumDefs;
273 Offset1Idx -= get(Opc1).NumDefs;
274 Offset0 = Load0->getConstantOperandVal(Offset0Idx);
275 Offset1 = Load1->getConstantOperandVal(Offset1Idx);
276 return true;
277 }
278
279 if (isSMRD(Opc0) && isSMRD(Opc1)) {
280 // Skip time and cache invalidation instructions.
281 if (!AMDGPU::hasNamedOperand(Opc0, AMDGPU::OpName::sbase) ||
282 !AMDGPU::hasNamedOperand(Opc1, AMDGPU::OpName::sbase))
283 return false;
284
285 unsigned NumOps = getNumOperandsNoGlue(Load0);
286 if (NumOps != getNumOperandsNoGlue(Load1))
287 return false;
288
289 // Check base reg.
290 if (Load0->getOperand(0) != Load1->getOperand(0))
291 return false;
292
293 // Match register offsets, if both register and immediate offsets present.
294 assert(NumOps == 4 || NumOps == 5);
295 if (NumOps == 5 && Load0->getOperand(1) != Load1->getOperand(1))
296 return false;
297
298 const ConstantSDNode *Load0Offset =
299 dyn_cast<ConstantSDNode>(Load0->getOperand(NumOps - 3));
300 const ConstantSDNode *Load1Offset =
301 dyn_cast<ConstantSDNode>(Load1->getOperand(NumOps - 3));
302
303 if (!Load0Offset || !Load1Offset)
304 return false;
305
306 Offset0 = Load0Offset->getZExtValue();
307 Offset1 = Load1Offset->getZExtValue();
308 return true;
309 }
310
311 // MUBUF and MTBUF can access the same addresses.
312 if ((isMUBUF(Opc0) || isMTBUF(Opc0)) && (isMUBUF(Opc1) || isMTBUF(Opc1))) {
313
314 // MUBUF and MTBUF have vaddr at different indices.
315 if (!nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::soffset) ||
316 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::vaddr) ||
317 !nodesHaveSameOperandValue(Load0, Load1, AMDGPU::OpName::srsrc))
318 return false;
319
320 int OffIdx0 = AMDGPU::getNamedOperandIdx(Opc0, AMDGPU::OpName::offset);
321 int OffIdx1 = AMDGPU::getNamedOperandIdx(Opc1, AMDGPU::OpName::offset);
322
323 if (OffIdx0 == -1 || OffIdx1 == -1)
324 return false;
325
326 // getNamedOperandIdx returns the index for MachineInstrs. Since they
327 // include the output in the operand list, but SDNodes don't, we need to
328 // subtract the index by one.
329 OffIdx0 -= get(Opc0).NumDefs;
330 OffIdx1 -= get(Opc1).NumDefs;
331
332 SDValue Off0 = Load0->getOperand(OffIdx0);
333 SDValue Off1 = Load1->getOperand(OffIdx1);
334
335 // The offset might be a FrameIndexSDNode.
336 if (!isa<ConstantSDNode>(Off0) || !isa<ConstantSDNode>(Off1))
337 return false;
338
339 Offset0 = Off0->getAsZExtVal();
340 Offset1 = Off1->getAsZExtVal();
341 return true;
342 }
343
344 return false;
345}
346
347static bool isStride64(unsigned Opc) {
348 switch (Opc) {
349 case AMDGPU::DS_READ2ST64_B32:
350 case AMDGPU::DS_READ2ST64_B64:
351 case AMDGPU::DS_WRITE2ST64_B32:
352 case AMDGPU::DS_WRITE2ST64_B64:
353 return true;
354 default:
355 return false;
356 }
357}
358
359bool SIInstrInfo::getMemOperandsWithOffsetWidth(
360 const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
361 int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width,
362 const TargetRegisterInfo *TRI) const {
363 if (!LdSt.mayLoadOrStore())
364 return false;
365
366 unsigned Opc = LdSt.getOpcode();
367 OffsetIsScalable = false;
368 const MachineOperand *BaseOp, *OffsetOp;
369 int DataOpIdx;
370
371 if (isDS(LdSt)) {
372 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
373 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
374 if (OffsetOp) {
375 // Normal, single offset LDS instruction.
376 if (!BaseOp) {
377 // DS_CONSUME/DS_APPEND use M0 for the base address.
378 // TODO: find the implicit use operand for M0 and use that as BaseOp?
379 return false;
380 }
381 BaseOps.push_back(BaseOp);
382 Offset = OffsetOp->getImm();
383 // Get appropriate operand, and compute width accordingly.
384 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
385 if (DataOpIdx == -1)
386 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
387 Width = getOpSize(LdSt, DataOpIdx);
388 } else {
389 // The 2 offset instructions use offset0 and offset1 instead. We can treat
390 // these as a load with a single offset if the 2 offsets are consecutive.
391 // We will use this for some partially aligned loads.
392 const MachineOperand *Offset0Op =
393 getNamedOperand(LdSt, AMDGPU::OpName::offset0);
394 const MachineOperand *Offset1Op =
395 getNamedOperand(LdSt, AMDGPU::OpName::offset1);
396
397 unsigned Offset0 = Offset0Op->getImm() & 0xff;
398 unsigned Offset1 = Offset1Op->getImm() & 0xff;
399 if (Offset0 + 1 != Offset1)
400 return false;
401
402 // Each of these offsets is in element sized units, so we need to convert
403 // to bytes of the individual reads.
404
405 unsigned EltSize;
406 if (LdSt.mayLoad())
407 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, 0)) / 16;
408 else {
409 assert(LdSt.mayStore());
410 int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
411 EltSize = TRI->getRegSizeInBits(*getOpRegClass(LdSt, Data0Idx)) / 8;
412 }
413
414 if (isStride64(Opc))
415 EltSize *= 64;
416
417 BaseOps.push_back(BaseOp);
418 Offset = EltSize * Offset0;
419 // Get appropriate operand(s), and compute width accordingly.
420 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
421 if (DataOpIdx == -1) {
422 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
423 Width = getOpSize(LdSt, DataOpIdx);
424 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
425 Width = Width.getValue() + getOpSize(LdSt, DataOpIdx);
426 } else {
427 Width = getOpSize(LdSt, DataOpIdx);
428 }
429 }
430 return true;
431 }
432
433 if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
434 const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
435 if (!RSrc) // e.g. BUFFER_WBINVL1_VOL
436 return false;
437 BaseOps.push_back(RSrc);
438 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
439 if (BaseOp && !BaseOp->isFI())
440 BaseOps.push_back(BaseOp);
441 const MachineOperand *OffsetImm =
442 getNamedOperand(LdSt, AMDGPU::OpName::offset);
443 Offset = OffsetImm->getImm();
444 const MachineOperand *SOffset =
445 getNamedOperand(LdSt, AMDGPU::OpName::soffset);
446 if (SOffset) {
447 if (SOffset->isReg())
448 BaseOps.push_back(SOffset);
449 else
450 Offset += SOffset->getImm();
451 }
452 // Get appropriate operand, and compute width accordingly.
453 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
454 if (DataOpIdx == -1)
455 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
456 if (DataOpIdx == -1) // LDS DMA
457 return false;
458 Width = getOpSize(LdSt, DataOpIdx);
459 return true;
460 }
461
462 if (isImage(LdSt)) {
463 auto RsrcOpName =
464 isMIMG(LdSt) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
465 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcOpName);
466 BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
467 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
468 if (VAddr0Idx >= 0) {
469 // GFX10 possible NSA encoding.
470 for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
471 BaseOps.push_back(&LdSt.getOperand(I));
472 } else {
473 BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
474 }
475 Offset = 0;
476 // Get appropriate operand, and compute width accordingly.
477 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
478 Width = getOpSize(LdSt, DataOpIdx);
479 return true;
480 }
481
482 if (isSMRD(LdSt)) {
483 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
484 if (!BaseOp) // e.g. S_MEMTIME
485 return false;
486 BaseOps.push_back(BaseOp);
487 OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
488 Offset = OffsetOp ? OffsetOp->getImm() : 0;
489 // Get appropriate operand, and compute width accordingly.
490 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
491 if (DataOpIdx == -1)
492 return false;
493 Width = getOpSize(LdSt, DataOpIdx);
494 return true;
495 }
496
497 if (isFLAT(LdSt)) {
498 // Instructions have either vaddr or saddr or both or none.
499 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
500 if (BaseOp)
501 BaseOps.push_back(BaseOp);
502 BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
503 if (BaseOp)
504 BaseOps.push_back(BaseOp);
505 Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
506 // Get appropriate operand, and compute width accordingly.
507 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
508 if (DataOpIdx == -1)
509 DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
510 if (DataOpIdx == -1) // LDS DMA
511 return false;
512 Width = getOpSize(LdSt, DataOpIdx);
513 return true;
514 }
515
516 return false;
517}
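// Worked example (editorial, assuming a ds_read2_b32 with offset0 = 4 and
// offset1 = 5): the two element offsets are consecutive, EltSize is the
// 64-bit destination divided by 16, i.e. 4 bytes, so the pair is reported
// as one access with Offset = 4 * 4 = 16 and Width = 8 bytes.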
518
519static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
520 ArrayRef<const MachineOperand *> BaseOps1,
521 const MachineInstr &MI2,
522 ArrayRef<const MachineOperand *> BaseOps2) {
523 // Only examine the first "base" operand of each instruction, on the
524 // assumption that it represents the real base address of the memory access.
525 // Other operands are typically offsets or indices from this base address.
526 if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
527 return true;
528
529 if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
530 return false;
531
532 auto MO1 = *MI1.memoperands_begin();
533 auto MO2 = *MI2.memoperands_begin();
534 if (MO1->getAddrSpace() != MO2->getAddrSpace())
535 return false;
536
537 auto Base1 = MO1->getValue();
538 auto Base2 = MO2->getValue();
539 if (!Base1 || !Base2)
540 return false;
541 Base1 = getUnderlyingObject(Base1);
542 Base2 = getUnderlyingObject(Base2);
543
544 if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
545 return false;
546
547 return Base1 == Base2;
548}
549
550bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
551 int64_t Offset1, bool OffsetIsScalable1,
552 ArrayRef<const MachineOperand *> BaseOps2,
553 int64_t Offset2, bool OffsetIsScalable2,
554 unsigned ClusterSize,
555 unsigned NumBytes) const {
556 // If the mem ops (to be clustered) do not have the same base ptr, then they
557 // should not be clustered
558 if (!BaseOps1.empty() && !BaseOps2.empty()) {
559 const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
560 const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
561 if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
562 return false;
563 } else if (!BaseOps1.empty() || !BaseOps2.empty()) {
564 // If only one base op is empty, they do not have the same base ptr
565 return false;
566 }
567
568 // To avoid register pressure, the number of DWORDs loaded together by all
569 // clustered mem ops should not, on average, exceed 8. This is an
570 // empirical value based on certain observations and performance related
571 // experiments.
572 // The good thing about this heuristic is that it avoids clustering too many
573 // sub-word loads and also avoids clustering wide loads. Below is a
574 // brief summary of how the heuristic behaves for various `LoadSize` values.
575 // (1) 1 <= LoadSize <= 4: cluster at max 8 mem ops
576 // (2) 5 <= LoadSize <= 8: cluster at max 4 mem ops
577 // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
578 // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
579 // (5) LoadSize >= 17: do not cluster
580 const unsigned LoadSize = NumBytes / ClusterSize;
581 const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize;
582 return NumDWORDs <= 8;
583}
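// Worked examples (editorial) for the DWORD heuristic above: two 16-byte
// loads give LoadSize = 16 and NumDWORDs = ((16 + 3) / 4) * 2 = 8, so the
// pair may be clustered; a third such load gives NumDWORDs = 12 and
// clustering stops, matching case (4). Eight 4-byte loads give
// NumDWORDs = 8, matching case (1).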
584
585// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
586// the first 16 loads will be interleaved with the stores, and the next 16 will
587// be clustered as expected. It should really split into two batches of 16 stores.
588//
589// Loads are clustered until this returns false, rather than trying to schedule
590// groups of stores. This also means we have to deal with saying different
591// address space loads should be clustered, and ones which might cause bank
592// conflicts.
593//
594// This might be deprecated so it might not be worth that much effort to fix.
595bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
596 int64_t Offset0, int64_t Offset1,
597 unsigned NumLoads) const {
598 assert(Offset1 > Offset0 &&
599 "Second offset should be larger than first offset!");
600 // If we have less than 16 loads in a row, and the offsets are within 64
601 // bytes, then schedule together.
602
603 // A cacheline is 64 bytes (for global memory).
604 return (NumLoads <= 16 && (Offset1 - Offset0) < 64);
605}
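// Worked example (editorial): with NumLoads = 10, Offset0 = 0 and
// Offset1 = 48, both conditions hold (10 <= 16 and 48 < 64), so the loads
// are scheduled together; Offset1 = 100 would span more than one 64-byte
// cacheline, so they are not.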
606
607static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
608 MachineBasicBlock::iterator MI,
609 const DebugLoc &DL, MCRegister DestReg,
610 MCRegister SrcReg, bool KillSrc,
611 const char *Msg = "illegal VGPR to SGPR copy") {
612 MachineFunction *MF = MBB.getParent();
613 DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
614 LLVMContext &C = MF->getFunction().getContext();
615 C.diagnose(IllegalCopy);
616
617 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
618 .addReg(SrcReg, getKillRegState(KillSrc));
619}
620
621/// Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908. It is not
622/// possible to have a direct copy in these cases on GFX908, so an intermediate
623/// VGPR copy is required.
624static void indirectCopyToAGPR(const SIInstrInfo &TII,
625 MachineBasicBlock &MBB,
626 MachineBasicBlock::iterator MI,
627 const DebugLoc &DL, MCRegister DestReg,
628 MCRegister SrcReg, bool KillSrc,
629 RegScavenger &RS, bool RegsOverlap,
630 Register ImpDefSuperReg = Register(),
631 Register ImpUseSuperReg = Register()) {
632 assert((TII.getSubtarget().hasMAIInsts() &&
633 !TII.getSubtarget().hasGFX90AInsts()) &&
634 "Expected GFX908 subtarget.");
635
636 assert((AMDGPU::SReg_32RegClass.contains(SrcReg) ||
637 AMDGPU::AGPR_32RegClass.contains(SrcReg)) &&
638 "Source register of the copy should be either an SGPR or an AGPR.");
639
640 assert(AMDGPU::AGPR_32RegClass.contains(DestReg) &&
641 "Destination register of the copy should be an AGPR.");
642
643 const SIRegisterInfo &RI = TII.getRegisterInfo();
644
645 // First try to find defining accvgpr_write to avoid temporary registers.
646 // In the case of copies of overlapping AGPRs, we conservatively do not
647 // reuse previous accvgpr_writes. Otherwise, we may incorrectly pick up
648 // an accvgpr_write used for this same copy due to implicit-defs
649 if (!RegsOverlap) {
650 for (auto Def = MI, E = MBB.begin(); Def != E; ) {
651 --Def;
652
653 if (!Def->modifiesRegister(SrcReg, &RI))
654 continue;
655
656 if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
657 Def->getOperand(0).getReg() != SrcReg)
658 break;
659
660 MachineOperand &DefOp = Def->getOperand(1);
661 assert(DefOp.isReg() || DefOp.isImm());
662
663 if (DefOp.isReg()) {
664 bool SafeToPropagate = true;
665 // Check that register source operand is not clobbered before MI.
666 // Immediate operands are always safe to propagate.
667 for (auto I = Def; I != MI && SafeToPropagate; ++I)
668 if (I->modifiesRegister(DefOp.getReg(), &RI))
669 SafeToPropagate = false;
670
671 if (!SafeToPropagate)
672 break;
673
674 DefOp.setIsKill(false);
675 }
676
677 MachineInstrBuilder Builder =
678 BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
679 .add(DefOp);
680 if (ImpDefSuperReg)
681 Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
682
683 if (ImpUseSuperReg) {
684 Builder.addReg(ImpUseSuperReg,
685 getKillRegState(KillSrc) | RegState::Implicit);
686 }
687
688 return;
689 }
690 }
691
692 RS.enterBasicBlockEnd(MBB);
693 RS.backward(std::next(MI));
694
695 // Ideally we want to have three registers for a long reg_sequence copy
696 // to hide 2 waitstates between v_mov_b32 and accvgpr_write.
697 unsigned MaxVGPRs = RI.getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
698 *MBB.getParent());
699
700 // Registers in the sequence are allocated contiguously so we can just
701 // use register number to pick one of three round-robin temps.
702 unsigned RegNo = (DestReg - AMDGPU::AGPR0) % 3;
703 Register Tmp =
704 MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getVGPRForAGPRCopy();
706 "VGPR used for an intermediate copy should have been reserved.");
707
708 // Only loop through if there are any free registers left. We don't want to
709 // spill.
710 while (RegNo--) {
711 Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
712 /* RestoreAfter */ false, 0,
713 /* AllowSpill */ false);
714 if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
715 break;
716 Tmp = Tmp2;
717 RS.setRegUsed(Tmp);
718 }
719
720 // Insert copy to temporary VGPR.
721 unsigned TmpCopyOp = AMDGPU::V_MOV_B32_e32;
722 if (AMDGPU::AGPR_32RegClass.contains(SrcReg)) {
723 TmpCopyOp = AMDGPU::V_ACCVGPR_READ_B32_e64;
724 } else {
725 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
726 }
727
728 MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp)
729 .addReg(SrcReg, getKillRegState(KillSrc));
730 if (ImpUseSuperReg) {
731 UseBuilder.addReg(ImpUseSuperReg,
732 getKillRegState(KillSrc) | RegState::Implicit);
733 }
734
735 MachineInstrBuilder DefBuilder
736 = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
737 .addReg(Tmp, RegState::Kill);
738
739 if (ImpDefSuperReg)
740 DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit);
741}
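// Illustration (editorial): the round-robin selection above maps copies
// into, e.g., AGPR3, AGPR4 and AGPR5 to RegNo 0, 1 and 2, so up to three
// scavenged VGPR temporaries rotate as intermediates and help hide the two
// waitstates between v_mov_b32 and v_accvgpr_write in long reg_sequence
// copies.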
742
743static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
744 MachineBasicBlock::iterator MI, const DebugLoc &DL,
745 MCRegister DestReg, MCRegister SrcReg, bool KillSrc,
746 const TargetRegisterClass *RC, bool Forward) {
747 const SIRegisterInfo &RI = TII.getRegisterInfo();
748 ArrayRef<int16_t> BaseIndices = RI.getRegSplitParts(RC, 4);
749 MachineBasicBlock::iterator I = MI;
750 MachineInstr *FirstMI = nullptr, *LastMI = nullptr;
751
752 for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
753 int16_t SubIdx = BaseIndices[Idx];
754 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
755 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
756 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
757 unsigned Opcode = AMDGPU::S_MOV_B32;
758
759 // Is SGPR aligned? If so try to combine with next.
760 bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
761 bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
762 if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
763 // Can use SGPR64 copy
764 unsigned Channel = RI.getChannelFromSubReg(SubIdx);
765 SubIdx = RI.getSubRegFromChannel(Channel, 2);
766 DestSubReg = RI.getSubReg(DestReg, SubIdx);
767 SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
768 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
769 Opcode = AMDGPU::S_MOV_B64;
770 Idx++;
771 }
772
773 LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
774 .addReg(SrcSubReg)
775 .addReg(SrcReg, RegState::Implicit);
776
777 if (!FirstMI)
778 FirstMI = LastMI;
779
780 if (!Forward)
781 I--;
782 }
783
784 assert(FirstMI && LastMI);
785 if (!Forward)
786 std::swap(FirstMI, LastMI);
787
788 FirstMI->addOperand(
789 MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/));
790
791 if (KillSrc)
792 LastMI->addRegisterKilled(SrcReg, &RI);
793}
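// Worked example (editorial): copying s[4:7] to s[8:11] with the expansion
// above pairs even-aligned subregisters, so it emits two S_MOV_B64
// (s[8:9] = s[4:5], s[10:11] = s[6:7]) instead of four S_MOV_B32.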
794
795void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
796 MachineBasicBlock::iterator MI,
797 const DebugLoc &DL, MCRegister DestReg,
798 MCRegister SrcReg, bool KillSrc) const {
799 const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
800 unsigned Size = RI.getRegSizeInBits(*RC);
801 const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
802 unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
803
804 // The rest of copyPhysReg assumes Src and Dst size are the same size.
805 // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can
806 // we remove Fix16BitCopies and this code block?
807 if (Fix16BitCopies) {
808 if (((Size == 16) != (SrcSize == 16))) {
809 // Non-VGPR Src and Dst will later be expanded back to 32 bits.
811 MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
812 MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
813 RegToFix = SubReg;
814
815 if (DestReg == SrcReg) {
816 // Identity copy. Insert empty bundle since ExpandPostRA expects an
817 // instruction here.
818 BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
819 return;
820 }
821 RC = RI.getPhysRegBaseClass(DestReg);
822 Size = RI.getRegSizeInBits(*RC);
823 SrcRC = RI.getPhysRegBaseClass(SrcReg);
824 SrcSize = RI.getRegSizeInBits(*SrcRC);
825 }
826 }
827
828 if (RC == &AMDGPU::VGPR_32RegClass) {
829 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
830 AMDGPU::SReg_32RegClass.contains(SrcReg) ||
831 AMDGPU::AGPR_32RegClass.contains(SrcReg));
832 unsigned Opc = AMDGPU::AGPR_32RegClass.contains(SrcReg) ?
833 AMDGPU::V_ACCVGPR_READ_B32_e64 : AMDGPU::V_MOV_B32_e32;
834 BuildMI(MBB, MI, DL, get(Opc), DestReg)
835 .addReg(SrcReg, getKillRegState(KillSrc));
836 return;
837 }
838
839 if (RC == &AMDGPU::SReg_32_XM0RegClass ||
840 RC == &AMDGPU::SReg_32RegClass) {
841 if (SrcReg == AMDGPU::SCC) {
842 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B32), DestReg)
843 .addImm(1)
844 .addImm(0);
845 return;
846 }
847
848 if (DestReg == AMDGPU::VCC_LO) {
849 if (AMDGPU::SReg_32RegClass.contains(SrcReg)) {
850 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::VCC_LO)
851 .addReg(SrcReg, getKillRegState(KillSrc));
852 } else {
853 // FIXME: Hack until VReg_1 removed.
854 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
855 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
856 .addImm(0)
857 .addReg(SrcReg, getKillRegState(KillSrc));
858 }
859
860 return;
861 }
862
863 if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
864 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
865 return;
866 }
867
868 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
869 .addReg(SrcReg, getKillRegState(KillSrc));
870 return;
871 }
872
873 if (RC == &AMDGPU::SReg_64RegClass) {
874 if (SrcReg == AMDGPU::SCC) {
875 BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
876 .addImm(1)
877 .addImm(0);
878 return;
879 }
880
881 if (DestReg == AMDGPU::VCC) {
882 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
883 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
884 .addReg(SrcReg, getKillRegState(KillSrc));
885 } else {
886 // FIXME: Hack until VReg_1 removed.
887 assert(AMDGPU::VGPR_32RegClass.contains(SrcReg));
888 BuildMI(MBB, MI, DL, get(AMDGPU::V_CMP_NE_U32_e32))
889 .addImm(0)
890 .addReg(SrcReg, getKillRegState(KillSrc));
891 }
892
893 return;
894 }
895
896 if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
897 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
898 return;
899 }
900
901 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
902 .addReg(SrcReg, getKillRegState(KillSrc));
903 return;
904 }
905
906 if (DestReg == AMDGPU::SCC) {
907 // Copying 64-bit or 32-bit sources to SCC barely makes sense,
908 // but SelectionDAG emits such copies for i1 sources.
909 if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
910 // This copy can only be produced by patterns
911 // with explicit SCC, which are known to be enabled
912 // only for subtargets with S_CMP_LG_U64 present.
913 assert(ST.hasScalarCompareEq64());
914 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U64))
915 .addReg(SrcReg, getKillRegState(KillSrc))
916 .addImm(0);
917 } else {
918 assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
919 BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
920 .addReg(SrcReg, getKillRegState(KillSrc))
921 .addImm(0);
922 }
923
924 return;
925 }
926
927 if (RC == &AMDGPU::AGPR_32RegClass) {
928 if (AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
929 (ST.hasGFX90AInsts() && AMDGPU::SReg_32RegClass.contains(SrcReg))) {
930 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg)
931 .addReg(SrcReg, getKillRegState(KillSrc));
932 return;
933 }
934
935 if (AMDGPU::AGPR_32RegClass.contains(SrcReg) && ST.hasGFX90AInsts()) {
936 BuildMI(MBB, MI, DL, get(AMDGPU::V_ACCVGPR_MOV_B32), DestReg)
937 .addReg(SrcReg, getKillRegState(KillSrc));
938 return;
939 }
940
941 // FIXME: Pass should maintain scavenger to avoid scan through the block on
942 // every AGPR spill.
943 RegScavenger RS;
944 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
945 indirectCopyToAGPR(*this, MBB, MI, DL, DestReg, SrcReg, KillSrc, RS, Overlap);
946 return;
947 }
948
949 if (Size == 16) {
950 assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
951 AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
952 AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
953
954 bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
955 bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
956 bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
957 bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
958 bool DstLow = !AMDGPU::isHi(DestReg, RI);
959 bool SrcLow = !AMDGPU::isHi(SrcReg, RI);
960 MCRegister NewDestReg = RI.get32BitRegister(DestReg);
961 MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
962
963 if (IsSGPRDst) {
964 if (!IsSGPRSrc) {
965 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
966 return;
967 }
968
969 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
970 .addReg(NewSrcReg, getKillRegState(KillSrc));
971 return;
972 }
973
974 if (IsAGPRDst || IsAGPRSrc) {
975 if (!DstLow || !SrcLow) {
976 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
977 "Cannot use hi16 subreg with an AGPR!");
978 }
979
980 copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
981 return;
982 }
983
984 if (ST.hasTrue16BitInsts()) {
985 if (IsSGPRSrc) {
986 assert(SrcLow);
987 SrcReg = NewSrcReg;
988 }
989 // Use the smaller instruction encoding if possible.
990 if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
991 (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
992 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
993 .addReg(SrcReg);
994 } else {
995 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
996 .addImm(0) // src0_modifiers
997 .addReg(SrcReg)
998 .addImm(0); // op_sel
999 }
1000 return;
1001 }
1002
1003 if (IsSGPRSrc && !ST.hasSDWAScalar()) {
1004 if (!DstLow || !SrcLow) {
1005 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
1006 "Cannot use hi16 subreg on VI!");
1007 }
1008
1009 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
1010 .addReg(NewSrcReg, getKillRegState(KillSrc));
1011 return;
1012 }
1013
1014 auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
1015 .addImm(0) // src0_modifiers
1016 .addReg(NewSrcReg)
1017 .addImm(0) // clamp
1024 // First implicit operand is $exec.
1025 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1026 return;
1027 }
1028
1029 if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
1030 if (ST.hasMovB64()) {
1031 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
1032 .addReg(SrcReg, getKillRegState(KillSrc));
1033 return;
1034 }
1035 if (ST.hasPkMovB32()) {
1036 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
1037 .addImm(SISrcMods::OP_SEL_1)
1038 .addReg(SrcReg)
1039 .addImm(SISrcMods::OP_SEL_1)
1040 .addReg(SrcReg)
1041 .addImm(0) // op_sel_lo
1042 .addImm(0) // op_sel_hi
1043 .addImm(0) // neg_lo
1044 .addImm(0) // neg_hi
1045 .addImm(0) // clamp
1046 .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
1047 return;
1048 }
1049 }
1050
1051 const bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
1052 if (RI.isSGPRClass(RC)) {
1053 if (!RI.isSGPRClass(SrcRC)) {
1054 reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
1055 return;
1056 }
1057 const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg);
1058 expandSGPRCopy(*this, MBB, MI, DL, DestReg, SrcReg, CanKillSuperReg, RC,
1059 Forward);
1060 return;
1061 }
1062
1063 unsigned EltSize = 4;
1064 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1065 if (RI.isAGPRClass(RC)) {
1066 if (ST.hasGFX90AInsts() && RI.isAGPRClass(SrcRC))
1067 Opcode = AMDGPU::V_ACCVGPR_MOV_B32;
1068 else if (RI.hasVGPRs(SrcRC) ||
1069 (ST.hasGFX90AInsts() && RI.isSGPRClass(SrcRC)))
1070 Opcode = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
1071 else
1072 Opcode = AMDGPU::INSTRUCTION_LIST_END;
1073 } else if (RI.hasVGPRs(RC) && RI.isAGPRClass(SrcRC)) {
1074 Opcode = AMDGPU::V_ACCVGPR_READ_B32_e64;
1075 } else if ((Size % 64 == 0) && RI.hasVGPRs(RC) &&
1076 (RI.isProperlyAlignedRC(*RC) &&
1077 (SrcRC == RC || RI.isSGPRClass(SrcRC)))) {
1078 // TODO: In 96-bit case, could do a 64-bit mov and then a 32-bit mov.
1079 if (ST.hasMovB64()) {
1080 Opcode = AMDGPU::V_MOV_B64_e32;
1081 EltSize = 8;
1082 } else if (ST.hasPkMovB32()) {
1083 Opcode = AMDGPU::V_PK_MOV_B32;
1084 EltSize = 8;
1085 }
1086 }
1087
1088 // For the cases where we need an intermediate instruction/temporary register
1089 // (destination is an AGPR), we need a scavenger.
1090 //
1091 // FIXME: The pass should maintain this for us so we don't have to re-scan the
1092 // whole block for every handled copy.
1093 std::unique_ptr<RegScavenger> RS;
1094 if (Opcode == AMDGPU::INSTRUCTION_LIST_END)
1095 RS.reset(new RegScavenger());
1096
1097 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
1098
1099 // If there is an overlap, we can't kill the super-register on the last
1100 // instruction, since it will also kill the components made live by this def.
1101 const bool Overlap = RI.regsOverlap(SrcReg, DestReg);
1102 const bool CanKillSuperReg = KillSrc && !Overlap;
1103
1104 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1105 unsigned SubIdx;
1106 if (Forward)
1107 SubIdx = SubIndices[Idx];
1108 else
1109 SubIdx = SubIndices[SubIndices.size() - Idx - 1];
1110 Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
1111 Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
1112 assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
1113
1114 bool IsFirstSubreg = Idx == 0;
1115 bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
1116
1117 if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
1118 Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
1119 Register ImpUseSuper = SrcReg;
1120 indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
1121 *RS, Overlap, ImpDefSuper, ImpUseSuper);
1122 } else if (Opcode == AMDGPU::V_PK_MOV_B32) {
1123 MachineInstrBuilder MIB =
1124 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
1125 .addImm(SISrcMods::OP_SEL_1)
1126 .addReg(SrcSubReg)
1127 .addImm(SISrcMods::OP_SEL_1)
1128 .addReg(SrcSubReg)
1129 .addImm(0) // op_sel_lo
1130 .addImm(0) // op_sel_hi
1131 .addImm(0) // neg_lo
1132 .addImm(0) // neg_hi
1133 .addImm(0) // clamp
1134 .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1135 if (IsFirstSubreg)
1136 MIB.addReg(DestReg, RegState::Implicit | RegState::Define);
1137 } else {
1138 MachineInstrBuilder Builder =
1139 BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
1140 if (IsFirstSubreg)
1141 Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
1142
1143 Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
1144 }
1145 }
1146}
1147
1148int SIInstrInfo::commuteOpcode(unsigned Opcode) const {
1149 int NewOpc;
1150
1151 // Try to map original to commuted opcode
1152 NewOpc = AMDGPU::getCommuteRev(Opcode);
1153 if (NewOpc != -1)
1154 // Check if the commuted (REV) opcode exists on the target.
1155 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1156
1157 // Try to map commuted to original opcode
1158 NewOpc = AMDGPU::getCommuteOrig(Opcode);
1159 if (NewOpc != -1)
1160 // Check if the original (non-REV) opcode exists on the target.
1161 return pseudoToMCOpcode(NewOpc) != -1 ? NewOpc : -1;
1162
1163 return Opcode;
1164}
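// Illustration (editorial): getCommuteRev/getCommuteOrig map between an
// opcode and its operand-swapped "REV" form, e.g. V_SUB_F32 and
// V_SUBREV_F32. commuteOpcode returns the mapped opcode only when it has a
// real MC encoding on this subtarget, -1 when it does not, and the original
// opcode when no mapping exists at all.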
1165
1166void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
1167 MachineBasicBlock::iterator MI,
1168 const DebugLoc &DL, Register DestReg,
1169 int64_t Value) const {
1170 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1171 const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
1172 if (RegClass == &AMDGPU::SReg_32RegClass ||
1173 RegClass == &AMDGPU::SGPR_32RegClass ||
1174 RegClass == &AMDGPU::SReg_32_XM0RegClass ||
1175 RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
1176 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
1177 .addImm(Value);
1178 return;
1179 }
1180
1181 if (RegClass == &AMDGPU::SReg_64RegClass ||
1182 RegClass == &AMDGPU::SGPR_64RegClass ||
1183 RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
1184 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
1185 .addImm(Value);
1186 return;
1187 }
1188
1189 if (RegClass == &AMDGPU::VGPR_32RegClass) {
1190 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
1191 .addImm(Value);
1192 return;
1193 }
1194 if (RegClass->hasSuperClassEq(&AMDGPU::VReg_64RegClass)) {
1195 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg)
1196 .addImm(Value);
1197 return;
1198 }
1199
1200 unsigned EltSize = 4;
1201 unsigned Opcode = AMDGPU::V_MOV_B32_e32;
1202 if (RI.isSGPRClass(RegClass)) {
1203 if (RI.getRegSizeInBits(*RegClass) > 32) {
1204 Opcode = AMDGPU::S_MOV_B64;
1205 EltSize = 8;
1206 } else {
1207 Opcode = AMDGPU::S_MOV_B32;
1208 EltSize = 4;
1209 }
1210 }
1211
1212 ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
1213 for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
1214 int64_t IdxValue = Idx == 0 ? Value : 0;
1215
1216 MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
1217 get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
1218 Builder.addImm(IdxValue);
1219 }
1220}
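// Worked example (editorial): materializing a value into a 128-bit SGPR
// tuple takes the SubIndices path with S_MOV_B64 pieces; only the first
// 64-bit piece receives Value and the remaining pieces are written with 0.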
1221
1222const TargetRegisterClass *
1223SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
1224 return &AMDGPU::VGPR_32RegClass;
1225}
1226
1227void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
1228 MachineBasicBlock::iterator I,
1229 const DebugLoc &DL, Register DstReg,
1230 ArrayRef<MachineOperand> Cond,
1231 Register TrueReg,
1232 Register FalseReg) const {
1233 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1234 const TargetRegisterClass *BoolXExecRC =
1235 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
1236 assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass &&
1237 "Not a VGPR32 reg");
1238
1239 if (Cond.size() == 1) {
1240 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1241 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1242 .add(Cond[0]);
1243 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1244 .addImm(0)
1245 .addReg(FalseReg)
1246 .addImm(0)
1247 .addReg(TrueReg)
1248 .addReg(SReg);
1249 } else if (Cond.size() == 2) {
1250 assert(Cond[0].isImm() && "Cond[0] is not an immediate");
1251 switch (Cond[0].getImm()) {
1252 case SIInstrInfo::SCC_TRUE: {
1253 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1254 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1255 : AMDGPU::S_CSELECT_B64), SReg)
1256 .addImm(1)
1257 .addImm(0);
1258 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1259 .addImm(0)
1260 .addReg(FalseReg)
1261 .addImm(0)
1262 .addReg(TrueReg)
1263 .addReg(SReg);
1264 break;
1265 }
1266 case SIInstrInfo::SCC_FALSE: {
1267 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1268 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1269 : AMDGPU::S_CSELECT_B64), SReg)
1270 .addImm(0)
1271 .addImm(1);
1272 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1273 .addImm(0)
1274 .addReg(FalseReg)
1275 .addImm(0)
1276 .addReg(TrueReg)
1277 .addReg(SReg);
1278 break;
1279 }
1280 case SIInstrInfo::VCCNZ: {
1281 MachineOperand RegOp = Cond[1];
1282 RegOp.setImplicit(false);
1283 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1284 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1285 .add(RegOp);
1286 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1287 .addImm(0)
1288 .addReg(FalseReg)
1289 .addImm(0)
1290 .addReg(TrueReg)
1291 .addReg(SReg);
1292 break;
1293 }
1294 case SIInstrInfo::VCCZ: {
1295 MachineOperand RegOp = Cond[1];
1296 RegOp.setImplicit(false);
1297 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1298 BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
1299 .add(RegOp);
1300 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1301 .addImm(0)
1302 .addReg(TrueReg)
1303 .addImm(0)
1304 .addReg(FalseReg)
1305 .addReg(SReg);
1306 break;
1307 }
1308 case SIInstrInfo::EXECNZ: {
1309 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1310 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1311 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1312 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1313 .addImm(0);
1314 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1315 : AMDGPU::S_CSELECT_B64), SReg)
1316 .addImm(1)
1317 .addImm(0);
1318 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1319 .addImm(0)
1320 .addReg(FalseReg)
1321 .addImm(0)
1322 .addReg(TrueReg)
1323 .addReg(SReg);
1324 break;
1325 }
1326 case SIInstrInfo::EXECZ: {
1327 Register SReg = MRI.createVirtualRegister(BoolXExecRC);
1328 Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
1329 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
1330 : AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
1331 .addImm(0);
1332 BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
1333 : AMDGPU::S_CSELECT_B64), SReg)
1334 .addImm(0)
1335 .addImm(1);
1336 BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
1337 .addImm(0)
1338 .addReg(FalseReg)
1339 .addImm(0)
1340 .addReg(TrueReg)
1341 .addReg(SReg);
1342 llvm_unreachable("Unhandled branch predicate EXECZ");
1343 break;
1344 }
1345 default:
1346 llvm_unreachable("invalid branch predicate");
1347 }
1348 } else {
1349 llvm_unreachable("Can only handle Cond size 1 or 2");
1350 }
1351}
1352
1353Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
1354 MachineBasicBlock::iterator I,
1355 const DebugLoc &DL,
1356 Register SrcReg, int Value) const {
1357 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1358 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1359 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
1360 .addImm(Value)
1361 .addReg(SrcReg);
1362
1363 return Reg;
1364}
1365
1366Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
1367 MachineBasicBlock::iterator I,
1368 const DebugLoc &DL,
1369 Register SrcReg, int Value) const {
1370 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
1371 Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
1372 BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
1373 .addImm(Value)
1374 .addReg(SrcReg);
1375
1376 return Reg;
1377}
1378
1378
1379unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
1380
1381 if (RI.isAGPRClass(DstRC))
1382 return AMDGPU::COPY;
1383 if (RI.getRegSizeInBits(*DstRC) == 16) {
1384 // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
1385 // before RA.
1386 return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
1387 } else if (RI.getRegSizeInBits(*DstRC) == 32) {
1388 return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1389 } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
1390 return AMDGPU::S_MOV_B64;
1391 } else if (RI.getRegSizeInBits(*DstRC) == 64 && !RI.isSGPRClass(DstRC)) {
1392 return AMDGPU::V_MOV_B64_PSEUDO;
1393 }
1394 return AMDGPU::COPY;
1395}
1396
1397const MCInstrDesc &
1398SIInstrInfo::getIndirectGPRIDXPseudo(unsigned VecSize,
1399 bool IsIndirectSrc) const {
1400 if (IsIndirectSrc) {
1401 if (VecSize <= 32) // 4 bytes
1402 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1);
1403 if (VecSize <= 64) // 8 bytes
1404 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2);
1405 if (VecSize <= 96) // 12 bytes
1406 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3);
1407 if (VecSize <= 128) // 16 bytes
1408 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4);
1409 if (VecSize <= 160) // 20 bytes
1410 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5);
1411 if (VecSize <= 256) // 32 bytes
1412 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8);
1413 if (VecSize <= 288) // 36 bytes
1414 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9);
1415 if (VecSize <= 320) // 40 bytes
1416 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10);
1417 if (VecSize <= 352) // 44 bytes
1418 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11);
1419 if (VecSize <= 384) // 48 bytes
1420 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12);
1421 if (VecSize <= 512) // 64 bytes
1422 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16);
1423 if (VecSize <= 1024) // 128 bytes
1424 return get(AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32);
1425
1426 llvm_unreachable("unsupported size for IndirectRegReadGPRIDX pseudos");
1427 }
1428
1429 if (VecSize <= 32) // 4 bytes
1430 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1);
1431 if (VecSize <= 64) // 8 bytes
1432 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2);
1433 if (VecSize <= 96) // 12 bytes
1434 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3);
1435 if (VecSize <= 128) // 16 bytes
1436 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4);
1437 if (VecSize <= 160) // 20 bytes
1438 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5);
1439 if (VecSize <= 256) // 32 bytes
1440 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8);
1441 if (VecSize <= 288) // 36 bytes
1442 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9);
1443 if (VecSize <= 320) // 40 bytes
1444 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10);
1445 if (VecSize <= 352) // 44 bytes
1446 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11);
1447 if (VecSize <= 384) // 48 bytes
1448 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12);
1449 if (VecSize <= 512) // 64 bytes
1450 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16);
1451 if (VecSize <= 1024) // 128 bytes
1452 return get(AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32);
1453
1454 llvm_unreachable("unsupported size for IndirectRegWriteGPRIDX pseudos");
1455}
1456
1457static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize) {
1458 if (VecSize <= 32) // 4 bytes
1459 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1460 if (VecSize <= 64) // 8 bytes
1461 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1462 if (VecSize <= 96) // 12 bytes
1463 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1464 if (VecSize <= 128) // 16 bytes
1465 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1466 if (VecSize <= 160) // 20 bytes
1467 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1468 if (VecSize <= 256) // 32 bytes
1469 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1470 if (VecSize <= 288) // 36 bytes
1471 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1472 if (VecSize <= 320) // 40 bytes
1473 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1474 if (VecSize <= 352) // 44 bytes
1475 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1476 if (VecSize <= 384) // 48 bytes
1477 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1478 if (VecSize <= 512) // 64 bytes
1479 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1480 if (VecSize <= 1024) // 128 bytes
1481 return AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1482
1483 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1484}
1485
1486static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
1487 if (VecSize <= 32) // 4 bytes
1488 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1;
1489 if (VecSize <= 64) // 8 bytes
1490 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2;
1491 if (VecSize <= 96) // 12 bytes
1492 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3;
1493 if (VecSize <= 128) // 16 bytes
1494 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4;
1495 if (VecSize <= 160) // 20 bytes
1496 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
1497 if (VecSize <= 256) // 32 bytes
1498 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
1499 if (VecSize <= 288) // 36 bytes
1500 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
1501 if (VecSize <= 320) // 40 bytes
1502 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
1503 if (VecSize <= 352) // 44 bytes
1504 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
1505 if (VecSize <= 384) // 48 bytes
1506 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
1507 if (VecSize <= 512) // 64 bytes
1508 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
1509 if (VecSize <= 1024) // 128 bytes
1510 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32;
1511
1512 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1513}
1514
1515static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize) {
1516 if (VecSize <= 64) // 8 bytes
1517 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1;
1518 if (VecSize <= 128) // 16 bytes
1519 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2;
1520 if (VecSize <= 256) // 32 bytes
1521 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4;
1522 if (VecSize <= 512) // 64 bytes
1523 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8;
1524 if (VecSize <= 1024) // 128 bytes
1525 return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16;
1526
1527 llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
1528}
1529
1530const MCInstrDesc &
1531SIInstrInfo::getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize,
1532 bool IsSGPR) const {
1533 if (IsSGPR) {
1534 switch (EltSize) {
1535 case 32:
1536 return get(getIndirectSGPRWriteMovRelPseudo32(VecSize));
1537 case 64:
1538 return get(getIndirectSGPRWriteMovRelPseudo64(VecSize));
1539 default:
1540 llvm_unreachable("invalid reg indexing elt size");
1541 }
1542 }
1543
1544 assert(EltSize == 32 && "invalid reg indexing elt size");
1545 return get(getIndirectVGPRWriteMovRelPseudoOpc(VecSize));
1546}
1547
1548static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
1549 switch (Size) {
1550 case 4:
1551 return AMDGPU::SI_SPILL_S32_SAVE;
1552 case 8:
1553 return AMDGPU::SI_SPILL_S64_SAVE;
1554 case 12:
1555 return AMDGPU::SI_SPILL_S96_SAVE;
1556 case 16:
1557 return AMDGPU::SI_SPILL_S128_SAVE;
1558 case 20:
1559 return AMDGPU::SI_SPILL_S160_SAVE;
1560 case 24:
1561 return AMDGPU::SI_SPILL_S192_SAVE;
1562 case 28:
1563 return AMDGPU::SI_SPILL_S224_SAVE;
1564 case 32:
1565 return AMDGPU::SI_SPILL_S256_SAVE;
1566 case 36:
1567 return AMDGPU::SI_SPILL_S288_SAVE;
1568 case 40:
1569 return AMDGPU::SI_SPILL_S320_SAVE;
1570 case 44:
1571 return AMDGPU::SI_SPILL_S352_SAVE;
1572 case 48:
1573 return AMDGPU::SI_SPILL_S384_SAVE;
1574 case 64:
1575 return AMDGPU::SI_SPILL_S512_SAVE;
1576 case 128:
1577 return AMDGPU::SI_SPILL_S1024_SAVE;
1578 default:
1579 llvm_unreachable("unknown register size");
1580 }
1581}
1582
1583static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
1584 switch (Size) {
1585 case 4:
1586 return AMDGPU::SI_SPILL_V32_SAVE;
1587 case 8:
1588 return AMDGPU::SI_SPILL_V64_SAVE;
1589 case 12:
1590 return AMDGPU::SI_SPILL_V96_SAVE;
1591 case 16:
1592 return AMDGPU::SI_SPILL_V128_SAVE;
1593 case 20:
1594 return AMDGPU::SI_SPILL_V160_SAVE;
1595 case 24:
1596 return AMDGPU::SI_SPILL_V192_SAVE;
1597 case 28:
1598 return AMDGPU::SI_SPILL_V224_SAVE;
1599 case 32:
1600 return AMDGPU::SI_SPILL_V256_SAVE;
1601 case 36:
1602 return AMDGPU::SI_SPILL_V288_SAVE;
1603 case 40:
1604 return AMDGPU::SI_SPILL_V320_SAVE;
1605 case 44:
1606 return AMDGPU::SI_SPILL_V352_SAVE;
1607 case 48:
1608 return AMDGPU::SI_SPILL_V384_SAVE;
1609 case 64:
1610 return AMDGPU::SI_SPILL_V512_SAVE;
1611 case 128:
1612 return AMDGPU::SI_SPILL_V1024_SAVE;
1613 default:
1614 llvm_unreachable("unknown register size");
1615 }
1616}
1617
1618static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
1619 switch (Size) {
1620 case 4:
1621 return AMDGPU::SI_SPILL_A32_SAVE;
1622 case 8:
1623 return AMDGPU::SI_SPILL_A64_SAVE;
1624 case 12:
1625 return AMDGPU::SI_SPILL_A96_SAVE;
1626 case 16:
1627 return AMDGPU::SI_SPILL_A128_SAVE;
1628 case 20:
1629 return AMDGPU::SI_SPILL_A160_SAVE;
1630 case 24:
1631 return AMDGPU::SI_SPILL_A192_SAVE;
1632 case 28:
1633 return AMDGPU::SI_SPILL_A224_SAVE;
1634 case 32:
1635 return AMDGPU::SI_SPILL_A256_SAVE;
1636 case 36:
1637 return AMDGPU::SI_SPILL_A288_SAVE;
1638 case 40:
1639 return AMDGPU::SI_SPILL_A320_SAVE;
1640 case 44:
1641 return AMDGPU::SI_SPILL_A352_SAVE;
1642 case 48:
1643 return AMDGPU::SI_SPILL_A384_SAVE;
1644 case 64:
1645 return AMDGPU::SI_SPILL_A512_SAVE;
1646 case 128:
1647 return AMDGPU::SI_SPILL_A1024_SAVE;
1648 default:
1649 llvm_unreachable("unknown register size");
1650 }
1651}
1652
1653static unsigned getAVSpillSaveOpcode(unsigned Size) {
1654 switch (Size) {
1655 case 4:
1656 return AMDGPU::SI_SPILL_AV32_SAVE;
1657 case 8:
1658 return AMDGPU::SI_SPILL_AV64_SAVE;
1659 case 12:
1660 return AMDGPU::SI_SPILL_AV96_SAVE;
1661 case 16:
1662 return AMDGPU::SI_SPILL_AV128_SAVE;
1663 case 20:
1664 return AMDGPU::SI_SPILL_AV160_SAVE;
1665 case 24:
1666 return AMDGPU::SI_SPILL_AV192_SAVE;
1667 case 28:
1668 return AMDGPU::SI_SPILL_AV224_SAVE;
1669 case 32:
1670 return AMDGPU::SI_SPILL_AV256_SAVE;
1671 case 36:
1672 return AMDGPU::SI_SPILL_AV288_SAVE;
1673 case 40:
1674 return AMDGPU::SI_SPILL_AV320_SAVE;
1675 case 44:
1676 return AMDGPU::SI_SPILL_AV352_SAVE;
1677 case 48:
1678 return AMDGPU::SI_SPILL_AV384_SAVE;
1679 case 64:
1680 return AMDGPU::SI_SPILL_AV512_SAVE;
1681 case 128:
1682 return AMDGPU::SI_SPILL_AV1024_SAVE;
1683 default:
1684 llvm_unreachable("unknown register size");
1685 }
1686}
1687
1688static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
1689 bool IsVectorSuperClass) {
1690 // Currently, only 32-bit WWM register spills are needed.
1691 if (Size != 4)
1692 llvm_unreachable("unknown wwm register spill size");
1693
1694 if (IsVectorSuperClass)
1695 return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
1696
1697 return AMDGPU::SI_SPILL_WWM_V32_SAVE;
1698}
1699
1699static unsigned getVectorRegSpillSaveOpcode(Register Reg,
1700 const TargetRegisterClass *RC,
1702 unsigned Size,
1703 const SIRegisterInfo &TRI,
1704 const SIMachineFunctionInfo &MFI) {
1705 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1706
1707 // Choose the right opcode if spilling a WWM register.
1708 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1709 return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
1710
1711 if (IsVectorSuperClass)
1712 return getAVSpillSaveOpcode(Size);
1713
1714 return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
1715 : getVGPRSpillSaveOpcode(Size);
1716}
1717
1718void SIInstrInfo::storeRegToStackSlot(
1719 MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
1720 bool isKill, int FrameIndex, const TargetRegisterClass *RC,
1721 const TargetRegisterInfo *TRI, Register VReg) const {
1722 MachineFunction *MF = MBB.getParent();
1723 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1724 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1725 const DebugLoc &DL = MBB.findDebugLoc(MI);
1726
1727 MachinePointerInfo PtrInfo
1728 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1729 MachineMemOperand *MMO = MF->getMachineMemOperand(
1730 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
1731 FrameInfo.getObjectAlign(FrameIndex));
1732 unsigned SpillSize = TRI->getSpillSize(*RC);
1733
1735 if (RI.isSGPRClass(RC)) {
1736 MFI->setHasSpilledSGPRs();
1737 assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
1738 assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
1739 SrcReg != AMDGPU::EXEC && "exec should not be spilled");
1740
1741 // We are only allowed to create one new instruction when spilling
1742 // registers, so we need to use pseudo instruction for spilling SGPRs.
1743 const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
1744
1745 // The SGPR spill/restore instructions only work on numbered SGPRs, so we need
1746 // to make sure we are using the correct register class.
1747 if (SrcReg.isVirtual() && SpillSize == 4) {
1748 MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1749 }
1750
1751 BuildMI(MBB, MI, DL, OpDesc)
1752 .addReg(SrcReg, getKillRegState(isKill)) // data
1753 .addFrameIndex(FrameIndex) // addr
1754 .addMemOperand(MMO)
1755 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1756
1757 if (RI.spillSGPRToVGPR())
1758 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1759 return;
1760 }
1761
1762 unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
1763 SpillSize, RI, *MFI);
1764 MFI->setHasSpilledVGPRs();
1765
1766 BuildMI(MBB, MI, DL, get(Opcode))
1767 .addReg(SrcReg, getKillRegState(isKill)) // data
1768 .addFrameIndex(FrameIndex) // addr
1769 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1770 .addImm(0) // offset
1771 .addMemOperand(MMO);
1772}
1773
1774static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
1775 switch (Size) {
1776 case 4:
1777 return AMDGPU::SI_SPILL_S32_RESTORE;
1778 case 8:
1779 return AMDGPU::SI_SPILL_S64_RESTORE;
1780 case 12:
1781 return AMDGPU::SI_SPILL_S96_RESTORE;
1782 case 16:
1783 return AMDGPU::SI_SPILL_S128_RESTORE;
1784 case 20:
1785 return AMDGPU::SI_SPILL_S160_RESTORE;
1786 case 24:
1787 return AMDGPU::SI_SPILL_S192_RESTORE;
1788 case 28:
1789 return AMDGPU::SI_SPILL_S224_RESTORE;
1790 case 32:
1791 return AMDGPU::SI_SPILL_S256_RESTORE;
1792 case 36:
1793 return AMDGPU::SI_SPILL_S288_RESTORE;
1794 case 40:
1795 return AMDGPU::SI_SPILL_S320_RESTORE;
1796 case 44:
1797 return AMDGPU::SI_SPILL_S352_RESTORE;
1798 case 48:
1799 return AMDGPU::SI_SPILL_S384_RESTORE;
1800 case 64:
1801 return AMDGPU::SI_SPILL_S512_RESTORE;
1802 case 128:
1803 return AMDGPU::SI_SPILL_S1024_RESTORE;
1804 default:
1805 llvm_unreachable("unknown register size");
1806 }
1807}
1808
1809static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
1810 switch (Size) {
1811 case 4:
1812 return AMDGPU::SI_SPILL_V32_RESTORE;
1813 case 8:
1814 return AMDGPU::SI_SPILL_V64_RESTORE;
1815 case 12:
1816 return AMDGPU::SI_SPILL_V96_RESTORE;
1817 case 16:
1818 return AMDGPU::SI_SPILL_V128_RESTORE;
1819 case 20:
1820 return AMDGPU::SI_SPILL_V160_RESTORE;
1821 case 24:
1822 return AMDGPU::SI_SPILL_V192_RESTORE;
1823 case 28:
1824 return AMDGPU::SI_SPILL_V224_RESTORE;
1825 case 32:
1826 return AMDGPU::SI_SPILL_V256_RESTORE;
1827 case 36:
1828 return AMDGPU::SI_SPILL_V288_RESTORE;
1829 case 40:
1830 return AMDGPU::SI_SPILL_V320_RESTORE;
1831 case 44:
1832 return AMDGPU::SI_SPILL_V352_RESTORE;
1833 case 48:
1834 return AMDGPU::SI_SPILL_V384_RESTORE;
1835 case 64:
1836 return AMDGPU::SI_SPILL_V512_RESTORE;
1837 case 128:
1838 return AMDGPU::SI_SPILL_V1024_RESTORE;
1839 default:
1840 llvm_unreachable("unknown register size");
1841 }
1842}
1843
1844static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
1845 switch (Size) {
1846 case 4:
1847 return AMDGPU::SI_SPILL_A32_RESTORE;
1848 case 8:
1849 return AMDGPU::SI_SPILL_A64_RESTORE;
1850 case 12:
1851 return AMDGPU::SI_SPILL_A96_RESTORE;
1852 case 16:
1853 return AMDGPU::SI_SPILL_A128_RESTORE;
1854 case 20:
1855 return AMDGPU::SI_SPILL_A160_RESTORE;
1856 case 24:
1857 return AMDGPU::SI_SPILL_A192_RESTORE;
1858 case 28:
1859 return AMDGPU::SI_SPILL_A224_RESTORE;
1860 case 32:
1861 return AMDGPU::SI_SPILL_A256_RESTORE;
1862 case 36:
1863 return AMDGPU::SI_SPILL_A288_RESTORE;
1864 case 40:
1865 return AMDGPU::SI_SPILL_A320_RESTORE;
1866 case 44:
1867 return AMDGPU::SI_SPILL_A352_RESTORE;
1868 case 48:
1869 return AMDGPU::SI_SPILL_A384_RESTORE;
1870 case 64:
1871 return AMDGPU::SI_SPILL_A512_RESTORE;
1872 case 128:
1873 return AMDGPU::SI_SPILL_A1024_RESTORE;
1874 default:
1875 llvm_unreachable("unknown register size");
1876 }
1877}
1878
1879static unsigned getAVSpillRestoreOpcode(unsigned Size) {
1880 switch (Size) {
1881 case 4:
1882 return AMDGPU::SI_SPILL_AV32_RESTORE;
1883 case 8:
1884 return AMDGPU::SI_SPILL_AV64_RESTORE;
1885 case 12:
1886 return AMDGPU::SI_SPILL_AV96_RESTORE;
1887 case 16:
1888 return AMDGPU::SI_SPILL_AV128_RESTORE;
1889 case 20:
1890 return AMDGPU::SI_SPILL_AV160_RESTORE;
1891 case 24:
1892 return AMDGPU::SI_SPILL_AV192_RESTORE;
1893 case 28:
1894 return AMDGPU::SI_SPILL_AV224_RESTORE;
1895 case 32:
1896 return AMDGPU::SI_SPILL_AV256_RESTORE;
1897 case 36:
1898 return AMDGPU::SI_SPILL_AV288_RESTORE;
1899 case 40:
1900 return AMDGPU::SI_SPILL_AV320_RESTORE;
1901 case 44:
1902 return AMDGPU::SI_SPILL_AV352_RESTORE;
1903 case 48:
1904 return AMDGPU::SI_SPILL_AV384_RESTORE;
1905 case 64:
1906 return AMDGPU::SI_SPILL_AV512_RESTORE;
1907 case 128:
1908 return AMDGPU::SI_SPILL_AV1024_RESTORE;
1909 default:
1910 llvm_unreachable("unknown register size");
1911 }
1912}
1913
1914static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
1915 bool IsVectorSuperClass) {
1916 // Currently, only 32-bit WWM register spills are needed.
1917 if (Size != 4)
1918 llvm_unreachable("unknown wwm register spill size");
1919
1920 if (IsVectorSuperClass)
1921 return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
1922
1923 return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
1924}
1925
1926 static unsigned
1927 getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
1928 unsigned Size, const SIRegisterInfo &TRI,
1929 const SIMachineFunctionInfo &MFI) {
1930 bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
1931
1932 // Choose the right opcode if restoring a WWM register.
1933 if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
1934 return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
1935
1936 if (IsVectorSuperClass)
1937 return getAVSpillRestoreOpcode(Size);
1938
1939 return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
1940 : getVGPRSpillRestoreOpcode(Size);
1941 }
1942
1943 void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
1944 MachineBasicBlock::iterator MI,
1945 Register DestReg, int FrameIndex,
1946 const TargetRegisterClass *RC,
1947 const TargetRegisterInfo *TRI,
1948 Register VReg) const {
1949 MachineFunction *MF = MBB.getParent();
1950 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1951 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1952 const DebugLoc &DL = MBB.findDebugLoc(MI);
1953 unsigned SpillSize = TRI->getSpillSize(*RC);
1954
1955 MachinePointerInfo PtrInfo
1956 = MachinePointerInfo::getFixedStack(*MF, FrameIndex);
1957
1958 MachineMemOperand *MMO = MF->getMachineMemOperand(
1959 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
1960 FrameInfo.getObjectAlign(FrameIndex));
1961
1962 if (RI.isSGPRClass(RC)) {
1963 MFI->setHasSpilledSGPRs();
1964 assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
1965 assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
1966 DestReg != AMDGPU::EXEC && "exec should not be spilled");
1967
1968 // FIXME: Maybe this should not include a memoperand because it will be
1969 // lowered to non-memory instructions.
1970 const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
1971 if (DestReg.isVirtual() && SpillSize == 4) {
1972 MachineRegisterInfo &MRI = MF->getRegInfo();
1973 MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
1974 }
1975
1976 if (RI.spillSGPRToVGPR())
1977 FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
1978 BuildMI(MBB, MI, DL, OpDesc, DestReg)
1979 .addFrameIndex(FrameIndex) // addr
1980 .addMemOperand(MMO)
1981 .addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
1982
1983 return;
1984 }
1985
1986 unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
1987 SpillSize, RI, *MFI);
1988 BuildMI(MBB, MI, DL, get(Opcode), DestReg)
1989 .addFrameIndex(FrameIndex) // vaddr
1990 .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
1991 .addImm(0) // offset
1992 .addMemOperand(MMO);
1993}
1994
1995 void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
1996 MachineBasicBlock::iterator MI) const {
1997 insertNoops(MBB, MI, 1);
1998}
1999
2000 void SIInstrInfo::insertNoops(MachineBasicBlock &MBB,
2001 MachineBasicBlock::iterator MI,
2002 unsigned Quantity) const {
2003 DebugLoc DL;
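// A minimal worked example (the s_nop immediate below is wait-states minus
// one and each s_nop covers at most 8 wait states): Quantity = 10 expands to
// "s_nop 7" (8 wait states) followed by "s_nop 1" (2 wait states).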
2004 while (Quantity > 0) {
2005 unsigned Arg = std::min(Quantity, 8u);
2006 Quantity -= Arg;
2007 BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP)).addImm(Arg - 1);
2008 }
2009}
2010
2012 auto MF = MBB.getParent();
2014
2015 assert(Info->isEntryFunction());
2016
2017 if (MBB.succ_empty()) {
2018 bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end();
2019 if (HasNoTerminator) {
2020 if (Info->returnsVoid()) {
2021 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::S_ENDPGM)).addImm(0);
2022 } else {
2023 BuildMI(MBB, MBB.end(), DebugLoc(), get(AMDGPU::SI_RETURN_TO_EPILOG));
2024 }
2025 }
2026 }
2027}
2028
2032 const DebugLoc &DL) const {
2034 MachineBasicBlock *SplitBB = MBB.splitAt(MI, /*UpdateLiveIns=*/false);
2036 MF->push_back(HaltLoop);
2037
2038 constexpr unsigned DoorbellIDMask = 0x3ff;
2039 constexpr unsigned ECQueueWaveAbort = 0x400;
2040
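// Summary of the sequence built below: read this wave's doorbell ID with
// s_sendmsg_rtn, set the queue-wave-abort bit in it, and send it back via m0
// (preserving the original m0 in ttmp2), then park the wave in a halt loop.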
2041 // Start with an `s_trap 2`; if we're in PRIV=1 and we need the workaround,
2042 // this will be a nop.
2043 BuildMI(MBB, MI, DL, get(AMDGPU::S_TRAP))
2044 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
2045 Register DoorbellReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2046 BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG_RTN_B32), DoorbellReg)
2048 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::TTMP2)
2049 .addUse(AMDGPU::M0);
2050 Register DoorbellRegMasked =
2051 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2052 BuildMI(MBB, MI, DL, get(AMDGPU::S_AND_B32), DoorbellRegMasked)
2053 .addUse(DoorbellReg)
2054 .addImm(DoorbellIDMask);
2055 Register SetWaveAbortBit =
2056 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
2057 BuildMI(MBB, MI, DL, get(AMDGPU::S_OR_B32), SetWaveAbortBit)
2058 .addUse(DoorbellRegMasked)
2059 .addImm(ECQueueWaveAbort);
2060 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2061 .addUse(SetWaveAbortBit);
2062 BuildMI(MBB, MI, DL, get(AMDGPU::S_SENDMSG))
2064 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), AMDGPU::M0)
2065 .addUse(AMDGPU::TTMP2);
2066 BuildMI(MBB, MI, DL, get(AMDGPU::S_BRANCH)).addMBB(HaltLoop);
2067
2068 BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_SETHALT)).addImm(5);
2069 BuildMI(*HaltLoop, HaltLoop->end(), DL, get(AMDGPU::S_BRANCH))
2070 .addMBB(HaltLoop);
2071
2072 if (SplitBB != &MBB)
2073 MBB.removeSuccessor(SplitBB);
2074 MBB.addSuccessor(HaltLoop);
2075 HaltLoop->addSuccessor(HaltLoop);
2076
2077 return SplitBB;
2078}
2079
2081 switch (MI.getOpcode()) {
2082 default:
2083 if (MI.isMetaInstruction())
2084 return 0;
2085 return 1; // FIXME: Do wait states equal cycles?
2086
2087 case AMDGPU::S_NOP:
2088 return MI.getOperand(0).getImm() + 1;
2089 // SI_RETURN_TO_EPILOG is a fallthrough to code outside of the function. The
2090 // hazard, even if one exists, won't really be visible. Should we handle it?
2091 }
2092}
2093
2094 bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
2095 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2096 MachineBasicBlock &MBB = *MI.getParent();
2097 DebugLoc DL = MBB.findDebugLoc(MI);
2098 switch (MI.getOpcode()) {
2099 default: return TargetInstrInfo::expandPostRAPseudo(MI);
2100 case AMDGPU::S_MOV_B64_term:
2101 // This is only a terminator to get the correct spill code placement during
2102 // register allocation.
2103 MI.setDesc(get(AMDGPU::S_MOV_B64));
2104 break;
2105
2106 case AMDGPU::S_MOV_B32_term:
2107 // This is only a terminator to get the correct spill code placement during
2108 // register allocation.
2109 MI.setDesc(get(AMDGPU::S_MOV_B32));
2110 break;
2111
2112 case AMDGPU::S_XOR_B64_term:
2113 // This is only a terminator to get the correct spill code placement during
2114 // register allocation.
2115 MI.setDesc(get(AMDGPU::S_XOR_B64));
2116 break;
2117
2118 case AMDGPU::S_XOR_B32_term:
2119 // This is only a terminator to get the correct spill code placement during
2120 // register allocation.
2121 MI.setDesc(get(AMDGPU::S_XOR_B32));
2122 break;
2123 case AMDGPU::S_OR_B64_term:
2124 // This is only a terminator to get the correct spill code placement during
2125 // register allocation.
2126 MI.setDesc(get(AMDGPU::S_OR_B64));
2127 break;
2128 case AMDGPU::S_OR_B32_term:
2129 // This is only a terminator to get the correct spill code placement during
2130 // register allocation.
2131 MI.setDesc(get(AMDGPU::S_OR_B32));
2132 break;
2133
2134 case AMDGPU::S_ANDN2_B64_term:
2135 // This is only a terminator to get the correct spill code placement during
2136 // register allocation.
2137 MI.setDesc(get(AMDGPU::S_ANDN2_B64));
2138 break;
2139
2140 case AMDGPU::S_ANDN2_B32_term:
2141 // This is only a terminator to get the correct spill code placement during
2142 // register allocation.
2143 MI.setDesc(get(AMDGPU::S_ANDN2_B32));
2144 break;
2145
2146 case AMDGPU::S_AND_B64_term:
2147 // This is only a terminator to get the correct spill code placement during
2148 // register allocation.
2149 MI.setDesc(get(AMDGPU::S_AND_B64));
2150 break;
2151
2152 case AMDGPU::S_AND_B32_term:
2153 // This is only a terminator to get the correct spill code placement during
2154 // register allocation.
2155 MI.setDesc(get(AMDGPU::S_AND_B32));
2156 break;
2157
2158 case AMDGPU::S_AND_SAVEEXEC_B64_term:
2159 // This is only a terminator to get the correct spill code placement during
2160 // register allocation.
2161 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
2162 break;
2163
2164 case AMDGPU::S_AND_SAVEEXEC_B32_term:
2165 // This is only a terminator to get the correct spill code placement during
2166 // register allocation.
2167 MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
2168 break;
2169
2170 case AMDGPU::SI_SPILL_S32_TO_VGPR:
2171 MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
2172 break;
2173
2174 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
2175 MI.setDesc(get(AMDGPU::V_READLANE_B32));
2176 break;
2177
2178 case AMDGPU::V_MOV_B64_PSEUDO: {
2179 Register Dst = MI.getOperand(0).getReg();
2180 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2181 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2182
2183 const MachineOperand &SrcOp = MI.getOperand(1);
2184 // FIXME: Will this work for 64-bit floating point immediates?
2185 assert(!SrcOp.isFPImm());
2186 if (ST.hasMovB64()) {
2187 MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
2188 if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
2189 isUInt<32>(SrcOp.getImm()))
2190 break;
2191 }
2192 if (SrcOp.isImm()) {
2193 APInt Imm(64, SrcOp.getImm());
2194 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2195 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2196 if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
2197 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2199 .addImm(Lo.getSExtValue())
2201 .addImm(Lo.getSExtValue())
2202 .addImm(0) // op_sel_lo
2203 .addImm(0) // op_sel_hi
2204 .addImm(0) // neg_lo
2205 .addImm(0) // neg_hi
2206 .addImm(0); // clamp
2207 } else {
2208 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2209 .addImm(Lo.getSExtValue())
2211 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2212 .addImm(Hi.getSExtValue())
2214 }
2215 } else {
2216 assert(SrcOp.isReg());
2217 if (ST.hasPkMovB32() &&
2218 !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
2219 BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
2220 .addImm(SISrcMods::OP_SEL_1) // src0_mod
2221 .addReg(SrcOp.getReg())
2223 .addReg(SrcOp.getReg())
2224 .addImm(0) // op_sel_lo
2225 .addImm(0) // op_sel_hi
2226 .addImm(0) // neg_lo
2227 .addImm(0) // neg_hi
2228 .addImm(0); // clamp
2229 } else {
2230 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
2231 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
2233 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
2234 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
2236 }
2237 }
2238 MI.eraseFromParent();
2239 break;
2240 }
2241 case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
2242 expandMovDPP64(MI);
2243 break;
2244 }
2245 case AMDGPU::S_MOV_B64_IMM_PSEUDO: {
2246 const MachineOperand &SrcOp = MI.getOperand(1);
2247 assert(!SrcOp.isFPImm());
2248 APInt Imm(64, SrcOp.getImm());
2249 if (Imm.isIntN(32) || isInlineConstant(Imm)) {
2250 MI.setDesc(get(AMDGPU::S_MOV_B64));
2251 break;
2252 }
2253
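// The 64-bit immediate is split into two 32-bit moves of its halves, e.g.
// 0x0000000100000002 becomes "s_mov_b32 dst.sub0, 2" and
// "s_mov_b32 dst.sub1, 1".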
2254 Register Dst = MI.getOperand(0).getReg();
2255 Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
2256 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2257
2258 APInt Lo(32, Imm.getLoBits(32).getZExtValue());
2259 APInt Hi(32, Imm.getHiBits(32).getZExtValue());
2260 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo)
2261 .addImm(Lo.getSExtValue())
2263 BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi)
2264 .addImm(Hi.getSExtValue())
2266 MI.eraseFromParent();
2267 break;
2268 }
2269 case AMDGPU::V_SET_INACTIVE_B32: {
2270 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2271 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2272 // FIXME: We may be able to optimize the COPY once we find ways to make LLVM
2273 // optimizations (mainly Register Coalescer) aware of WWM register liveness.
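// The expansion below writes the active-lane value, inverts exec, writes the
// inactive-lane value to the same destination, and then inverts exec back.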
2274 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2275 .add(MI.getOperand(1));
2276 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2277 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2278 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
2279 .add(MI.getOperand(2));
2280 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2281 .addReg(Exec);
2282 MI.eraseFromParent();
2283 break;
2284 }
2285 case AMDGPU::V_SET_INACTIVE_B64: {
2286 unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
2287 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2288 MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2289 MI.getOperand(0).getReg())
2290 .add(MI.getOperand(1));
2291 expandPostRAPseudo(*Copy);
2292 auto FirstNot = BuildMI(MBB, MI, DL, get(NotOpc), Exec).addReg(Exec);
2293 FirstNot->addRegisterDead(AMDGPU::SCC, TRI); // SCC is overwritten
2294 Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
2295 MI.getOperand(0).getReg())
2296 .add(MI.getOperand(2));
2297 expandPostRAPseudo(*Copy);
2298 BuildMI(MBB, MI, DL, get(NotOpc), Exec)
2299 .addReg(Exec);
2300 MI.eraseFromParent();
2301 break;
2302 }
2303 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2304 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2305 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2306 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2307 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2308 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2309 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2310 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2311 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2312 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2313 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2314 case AMDGPU::V_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2315 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V1:
2316 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V2:
2317 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V3:
2318 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
2319 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
2320 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
2321 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
2322 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
2323 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
2324 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
2325 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
2326 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
2327 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
2328 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V2:
2329 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V4:
2330 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V8:
2331 case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V16: {
2332 const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
2333
2334 unsigned Opc;
2335 if (RI.hasVGPRs(EltRC)) {
2336 Opc = AMDGPU::V_MOVRELD_B32_e32;
2337 } else {
2338 Opc = RI.getRegSizeInBits(*EltRC) == 64 ? AMDGPU::S_MOVRELD_B64
2339 : AMDGPU::S_MOVRELD_B32;
2340 }
2341
2342 const MCInstrDesc &OpDesc = get(Opc);
2343 Register VecReg = MI.getOperand(0).getReg();
2344 bool IsUndef = MI.getOperand(1).isUndef();
2345 unsigned SubReg = MI.getOperand(3).getImm();
2346 assert(VecReg == MI.getOperand(1).getReg());
2347
2349 BuildMI(MBB, MI, DL, OpDesc)
2350 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2351 .add(MI.getOperand(2))
2353 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2354
2355 const int ImpDefIdx =
2356 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2357 const int ImpUseIdx = ImpDefIdx + 1;
2358 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2359 MI.eraseFromParent();
2360 break;
2361 }
2362 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V1:
2363 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V2:
2364 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V3:
2365 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V4:
2366 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V5:
2367 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V8:
2368 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V9:
2369 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V10:
2370 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V11:
2371 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V12:
2372 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V16:
2373 case AMDGPU::V_INDIRECT_REG_WRITE_GPR_IDX_B32_V32: {
2375 Register VecReg = MI.getOperand(0).getReg();
2376 bool IsUndef = MI.getOperand(1).isUndef();
2377 Register Idx = MI.getOperand(3).getReg();
2378 Register SubReg = MI.getOperand(4).getImm();
2379
2380 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2381 .addReg(Idx)
2383 SetOn->getOperand(3).setIsUndef();
2384
2385 const MCInstrDesc &OpDesc = get(AMDGPU::V_MOV_B32_indirect_write);
2387 BuildMI(MBB, MI, DL, OpDesc)
2388 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2389 .add(MI.getOperand(2))
2391 .addReg(VecReg,
2392 RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2393
2394 const int ImpDefIdx =
2395 OpDesc.getNumOperands() + OpDesc.implicit_uses().size();
2396 const int ImpUseIdx = ImpDefIdx + 1;
2397 MIB->tieOperands(ImpDefIdx, ImpUseIdx);
2398
2399 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2400
2401 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2402
2403 MI.eraseFromParent();
2404 break;
2405 }
2406 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V1:
2407 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V2:
2408 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V3:
2409 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V4:
2410 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V5:
2411 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V8:
2412 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V9:
2413 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V10:
2414 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V11:
2415 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V12:
2416 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V16:
2417 case AMDGPU::V_INDIRECT_REG_READ_GPR_IDX_B32_V32: {
2419 Register Dst = MI.getOperand(0).getReg();
2420 Register VecReg = MI.getOperand(1).getReg();
2421 bool IsUndef = MI.getOperand(1).isUndef();
2422 Register Idx = MI.getOperand(2).getReg();
2423 Register SubReg = MI.getOperand(3).getImm();
2424
2425 MachineInstr *SetOn = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_ON))
2426 .addReg(Idx)
2428 SetOn->getOperand(3).setIsUndef();
2429
2430 BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_indirect_read))
2431 .addDef(Dst)
2432 .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
2433 .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
2434
2435 MachineInstr *SetOff = BuildMI(MBB, MI, DL, get(AMDGPU::S_SET_GPR_IDX_OFF));
2436
2437 finalizeBundle(MBB, SetOn->getIterator(), std::next(SetOff->getIterator()));
2438
2439 MI.eraseFromParent();
2440 break;
2441 }
2442 case AMDGPU::SI_PC_ADD_REL_OFFSET: {
2443 MachineFunction &MF = *MBB.getParent();
2444 Register Reg = MI.getOperand(0).getReg();
2445 Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
2446 Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
2447 MachineOperand OpLo = MI.getOperand(1);
2448 MachineOperand OpHi = MI.getOperand(2);
2449
2450 // Create a bundle so these instructions won't be re-ordered by the
2451 // post-RA scheduler.
2452 MIBundleBuilder Bundler(MBB, MI);
2453 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
2454
2455 // What we want here is an offset from the value returned by s_getpc (which
2456 // is the address of the s_add_u32 instruction) to the global variable, but
2457 // since the encoding of $symbol starts 4 bytes after the start of the
2458 // s_add_u32 instruction, we end up with an offset that is 4 bytes too
2459 // small. This requires us to add 4 to the global variable offset in order
2460 // to compute the correct address. Similarly for the s_addc_u32 instruction,
2461 // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
2462 // instruction.
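// E.g. (illustrative register numbers, using the usual @rel32 relocation
// syntax):
// s_getpc_b64 s[0:1]
// s_add_u32 s0, s0, sym@rel32@lo+4
// s_addc_u32 s1, s1, sym@rel32@hi+12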
2463
2464 int64_t Adjust = 0;
2465 if (ST.hasGetPCZeroExtension()) {
2466 // Fix up hardware that does not sign-extend the 48-bit PC value by
2467 // inserting: s_sext_i32_i16 reghi, reghi
2468 Bundler.append(
2469 BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
2470 Adjust += 4;
2471 }
2472
2473 if (OpLo.isGlobal())
2474 OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
2475 Bundler.append(
2476 BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
2477
2478 if (OpHi.isGlobal())
2479 OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
2480 Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
2481 .addReg(RegHi)
2482 .add(OpHi));
2483
2484 finalizeBundle(MBB, Bundler.begin());
2485
2486 MI.eraseFromParent();
2487 break;
2488 }
2489 case AMDGPU::ENTER_STRICT_WWM: {
2490 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2491 // Whole Wave Mode is entered.
2492 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
2493 : AMDGPU::S_OR_SAVEEXEC_B64));
2494 break;
2495 }
2496 case AMDGPU::ENTER_STRICT_WQM: {
2497 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2498 // STRICT_WQM is entered.
2499 const unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
2500 const unsigned WQMOp = ST.isWave32() ? AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64;
2501 const unsigned MovOp = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
2502 BuildMI(MBB, MI, DL, get(MovOp), MI.getOperand(0).getReg()).addReg(Exec);
2503 BuildMI(MBB, MI, DL, get(WQMOp), Exec).addReg(Exec);
2504
2505 MI.eraseFromParent();
2506 break;
2507 }
2508 case AMDGPU::EXIT_STRICT_WWM:
2509 case AMDGPU::EXIT_STRICT_WQM: {
2510 // This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
2511 // WWM/STRICT_WQM is exited.
2512 MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
2513 break;
2514 }
2515 case AMDGPU::ENTER_PSEUDO_WM:
2516 case AMDGPU::EXIT_PSEUDO_WM: {
2517 // These do nothing.
2518 MI.eraseFromParent();
2519 break;
2520 }
2521 case AMDGPU::SI_RETURN: {
2522 const MachineFunction *MF = MBB.getParent();
2523 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2524 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2525 // Hiding the return address use with SI_RETURN may lead to extra kills in
2526 // the function and missing live-ins. We are fine in practice because callee
2527 // saved register handling ensures the register value is restored before
2528 // RET, but we need the undef flag here to appease the MachineVerifier
2529 // liveness checks.
2531 BuildMI(MBB, MI, DL, get(AMDGPU::S_SETPC_B64_return))
2532 .addReg(TRI->getReturnAddressReg(*MF), RegState::Undef);
2533
2534 MIB.copyImplicitOps(MI);
2535 MI.eraseFromParent();
2536 break;
2537 }
2538
2539 case AMDGPU::S_MUL_U64_U32_PSEUDO:
2540 case AMDGPU::S_MUL_I64_I32_PSEUDO:
2541 MI.setDesc(get(AMDGPU::S_MUL_U64));
2542 break;
2543
2544 case AMDGPU::S_GETPC_B64_pseudo:
2545 MI.setDesc(get(AMDGPU::S_GETPC_B64));
2546 if (ST.hasGetPCZeroExtension()) {
2547 Register Dst = MI.getOperand(0).getReg();
2548 Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
2549 // Fix up hardware that does not sign-extend the 48-bit PC value by
2550 // inserting: s_sext_i32_i16 dsthi, dsthi
2551 BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
2552 DstHi)
2553 .addReg(DstHi);
2554 }
2555 break;
2556 }
2557 return true;
2558}
2559
2562 unsigned SubIdx, const MachineInstr &Orig,
2563 const TargetRegisterInfo &RI) const {
2564
2565 // Try shrinking the instruction to remat only the part needed for the current
2566 // context.
2567 // TODO: Handle more cases.
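// E.g. if the only user reads a 128-bit subregister of an S_LOAD_DWORDX8_IMM
// result, the clone created below is shrunk to an S_LOAD_DWORDX4_IMM covering
// just that half, with the load offset adjusted accordingly.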
2568 unsigned Opcode = Orig.getOpcode();
2569 switch (Opcode) {
2570 case AMDGPU::S_LOAD_DWORDX16_IMM:
2571 case AMDGPU::S_LOAD_DWORDX8_IMM: {
2572 if (SubIdx != 0)
2573 break;
2574
2575 if (I == MBB.end())
2576 break;
2577
2578 if (I->isBundled())
2579 break;
2580
2581 // Look for a single use of the register that is also a subreg.
2582 Register RegToFind = Orig.getOperand(0).getReg();
2583 MachineOperand *UseMO = nullptr;
2584 for (auto &CandMO : I->operands()) {
2585 if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
2586 continue;
2587 if (UseMO) {
2588 UseMO = nullptr;
2589 break;
2590 }
2591 UseMO = &CandMO;
2592 }
2593 if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
2594 break;
2595
2596 unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
2597 unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
2598
2599 MachineFunction *MF = MBB.getParent();
2600 MachineRegisterInfo &MRI = MF->getRegInfo();
2601 assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
2602
2603 unsigned NewOpcode = -1;
2604 if (SubregSize == 256)
2605 NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
2606 else if (SubregSize == 128)
2607 NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
2608 else
2609 break;
2610
2611 const MCInstrDesc &TID = get(NewOpcode);
2612 const TargetRegisterClass *NewRC =
2613 RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
2614 MRI.setRegClass(DestReg, NewRC);
2615
2616 UseMO->setReg(DestReg);
2617 UseMO->setSubReg(AMDGPU::NoSubRegister);
2618
2619 // Use a smaller load of the desired size, possibly with an updated offset.
2620 MachineInstr *MI = MF->CloneMachineInstr(&Orig);
2621 MI->setDesc(TID);
2622 MI->getOperand(0).setReg(DestReg);
2623 MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
2624 if (Offset) {
2625 MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
2626 int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
2627 OffsetMO->setImm(FinalOffset);
2628 }
2629 SmallVector<MachineMemOperand *> NewMMOs;
2630 for (const MachineMemOperand *MemOp : Orig.memoperands())
2631 NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
2632 SubregSize / 8));
2633 MI->setMemRefs(*MF, NewMMOs);
2634
2635 MBB.insert(I, MI);
2636 return;
2637 }
2638
2639 default:
2640 break;
2641 }
2642
2643 TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
2644}
2645
2646std::pair<MachineInstr*, MachineInstr*>
2648 assert(MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
2649
2650 if (ST.hasMovB64() &&
2652 getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
2653 MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
2654 return std::pair(&MI, nullptr);
2655 }
2656
2657 MachineBasicBlock &MBB = *MI.getParent();
2661 Register Dst = MI.getOperand(0).getReg();
2662 unsigned Part = 0;
2663 MachineInstr *Split[2];
2664
2665 for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
2666 auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
2667 if (Dst.isPhysical()) {
2668 MovDPP.addDef(RI.getSubReg(Dst, Sub));
2669 } else {
2670 assert(MRI.isSSA());
2671 auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
2672 MovDPP.addDef(Tmp);
2673 }
2674
2675 for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
2676 const MachineOperand &SrcOp = MI.getOperand(I);
2677 assert(!SrcOp.isFPImm());
2678 if (SrcOp.isImm()) {
2679 APInt Imm(64, SrcOp.getImm());
2680 Imm.ashrInPlace(Part * 32);
2681 MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
2682 } else {
2683 assert(SrcOp.isReg());
2684 Register Src = SrcOp.getReg();
2685 if (Src.isPhysical())
2686 MovDPP.addReg(RI.getSubReg(Src, Sub));
2687 else
2688 MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
2689 }
2690 }
2691
2692 for (const MachineOperand &MO : llvm::drop_begin(MI.explicit_operands(), 3))
2693 MovDPP.addImm(MO.getImm());
2694
2695 Split[Part] = MovDPP;
2696 ++Part;
2697 }
2698
2699 if (Dst.isVirtual())
2700 BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
2701 .addReg(Split[0]->getOperand(0).getReg())
2702 .addImm(AMDGPU::sub0)
2703 .addReg(Split[1]->getOperand(0).getReg())
2704 .addImm(AMDGPU::sub1);
2705
2706 MI.eraseFromParent();
2707 return std::pair(Split[0], Split[1]);
2708}
2709
2710std::optional<DestSourcePair>
2712 if (MI.getOpcode() == AMDGPU::WWM_COPY)
2713 return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
2714
2715 return std::nullopt;
2716}
2717
2719 MachineOperand &Src0,
2720 unsigned Src0OpName,
2721 MachineOperand &Src1,
2722 unsigned Src1OpName) const {
2723 MachineOperand *Src0Mods = getNamedOperand(MI, Src0OpName);
2724 if (!Src0Mods)
2725 return false;
2726
2727 MachineOperand *Src1Mods = getNamedOperand(MI, Src1OpName);
2728 assert(Src1Mods &&
2729 "All commutable instructions have both src0 and src1 modifiers");
2730
2731 int Src0ModsVal = Src0Mods->getImm();
2732 int Src1ModsVal = Src1Mods->getImm();
2733
2734 Src1Mods->setImm(Src0ModsVal);
2735 Src0Mods->setImm(Src1ModsVal);
2736 return true;
2737}
2738
2740 MachineOperand &RegOp,
2741 MachineOperand &NonRegOp) {
2742 Register Reg = RegOp.getReg();
2743 unsigned SubReg = RegOp.getSubReg();
2744 bool IsKill = RegOp.isKill();
2745 bool IsDead = RegOp.isDead();
2746 bool IsUndef = RegOp.isUndef();
2747 bool IsDebug = RegOp.isDebug();
2748
2749 if (NonRegOp.isImm())
2750 RegOp.ChangeToImmediate(NonRegOp.getImm());
2751 else if (NonRegOp.isFI())
2752 RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
2753 else if (NonRegOp.isGlobal()) {
2754 RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
2755 NonRegOp.getTargetFlags());
2756 } else
2757 return nullptr;
2758
2759 // Make sure we don't reinterpret a subreg index in the target flags.
2760 RegOp.setTargetFlags(NonRegOp.getTargetFlags());
2761
2762 NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
2763 NonRegOp.setSubReg(SubReg);
2764
2765 return &MI;
2766}
2767
2769 unsigned Src0Idx,
2770 unsigned Src1Idx) const {
2771 assert(!NewMI && "this should never be used");
2772
2773 unsigned Opc = MI.getOpcode();
2774 int CommutedOpcode = commuteOpcode(Opc);
2775 if (CommutedOpcode == -1)
2776 return nullptr;
2777
2778 if (Src0Idx > Src1Idx)
2779 std::swap(Src0Idx, Src1Idx);
2780
2781 assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
2782 static_cast<int>(Src0Idx) &&
2783 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
2784 static_cast<int>(Src1Idx) &&
2785 "inconsistency with findCommutedOpIndices");
2786
2787 MachineOperand &Src0 = MI.getOperand(Src0Idx);
2788 MachineOperand &Src1 = MI.getOperand(Src1Idx);
2789
2790 MachineInstr *CommutedMI = nullptr;
2791 if (Src0.isReg() && Src1.isReg()) {
2792 if (isOperandLegal(MI, Src1Idx, &Src0)) {
2793 // Be sure to copy the source modifiers to the right place.
2794 CommutedMI
2795 = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, Src0Idx, Src1Idx);
2796 }
2797
2798 } else if (Src0.isReg() && !Src1.isReg()) {
2799 // src0 should always be able to support any operand type, so no need to
2800 // check operand legality.
2801 CommutedMI = swapRegAndNonRegOperand(MI, Src0, Src1);
2802 } else if (!Src0.isReg() && Src1.isReg()) {
2803 if (isOperandLegal(MI, Src1Idx, &Src0))
2804 CommutedMI = swapRegAndNonRegOperand(MI, Src1, Src0);
2805 } else {
2806 // FIXME: Found two non-register operands to commute. This does happen.
2807 return nullptr;
2808 }
2809
2810 if (CommutedMI) {
2811 swapSourceModifiers(MI, Src0, AMDGPU::OpName::src0_modifiers,
2812 Src1, AMDGPU::OpName::src1_modifiers);
2813
2814 CommutedMI->setDesc(get(CommutedOpcode));
2815 }
2816
2817 return CommutedMI;
2818}
2819
2820// This needs to be implemented because the source modifiers may be inserted
2821// between the true commutable operands, and the base
2822// TargetInstrInfo::commuteInstruction uses it.
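// E.g. with VOP3 source modifiers the operand order is
// (dst, src0_modifiers, src0, src1_modifiers, src1, ...), so src0 and src1
// are not adjacent and the default operand indices would be wrong.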
2823 bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
2824 unsigned &SrcOpIdx0,
2825 unsigned &SrcOpIdx1) const {
2826 return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
2827}
2828
2829 bool SIInstrInfo::findCommutedOpIndices(const MCInstrDesc &Desc,
2830 unsigned &SrcOpIdx0,
2831 unsigned &SrcOpIdx1) const {
2832 if (!Desc.isCommutable())
2833 return false;
2834
2835 unsigned Opc = Desc.getOpcode();
2836 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
2837 if (Src0Idx == -1)
2838 return false;
2839
2840 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
2841 if (Src1Idx == -1)
2842 return false;
2843
2844 return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
2845}
2846
2848 int64_t BrOffset) const {
2849 // BranchRelaxation should never have to check s_setpc_b64 because its dest
2850 // block is unanalyzable.
2851 assert(BranchOp != AMDGPU::S_SETPC_B64);
2852
2853 // Convert to dwords.
2854 BrOffset /= 4;
2855
2856 // The branch instructions do PC += signext(SIMM16 * 4) + 4, so the offset is
2857 // from the next instruction.
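// E.g. with 16 branch-offset bits the reachable range is +/-2^15 dwords,
// i.e. roughly +/-128 KiB of code around the instruction after the branch.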
2858 BrOffset -= 1;
2859
2860 return isIntN(BranchOffsetBits, BrOffset);
2861}
2862
2865 return MI.getOperand(0).getMBB();
2866}
2867
2869 for (const MachineInstr &MI : MBB->terminators()) {
2870 if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO ||
2871 MI.getOpcode() == AMDGPU::SI_IF || MI.getOpcode() == AMDGPU::SI_ELSE ||
2872 MI.getOpcode() == AMDGPU::SI_LOOP)
2873 return true;
2874 }
2875 return false;
2876}
2877
2879 MachineBasicBlock &DestBB,
2880 MachineBasicBlock &RestoreBB,
2881 const DebugLoc &DL, int64_t BrOffset,
2882 RegScavenger *RS) const {
2883 assert(RS && "RegScavenger required for long branching");
2884 assert(MBB.empty() &&
2885 "new block should be inserted for expanding unconditional branch");
2886 assert(MBB.pred_size() == 1);
2887 assert(RestoreBB.empty() &&
2888 "restore block should be inserted for restoring clobbered registers");
2889
2893
2894 // FIXME: Virtual register workaround for RegScavenger not working with empty
2895 // blocks.
2896 Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
2897
2898 auto I = MBB.end();
2899
2900 // We need to compute the offset relative to the instruction immediately after
2901 // s_getpc_b64. Insert the PC arithmetic code before the last terminator.
2902 MachineInstr *GetPC = BuildMI(MBB, I, DL, get(AMDGPU::S_GETPC_B64), PCReg);
2903
2904 auto &MCCtx = MF->getContext();
2905 MCSymbol *PostGetPCLabel =
2906 MCCtx.createTempSymbol("post_getpc", /*AlwaysAddSuffix=*/true);
2907 GetPC->setPostInstrSymbol(*MF, PostGetPCLabel);
2908
2909 MCSymbol *OffsetLo =
2910 MCCtx.createTempSymbol("offset_lo", /*AlwaysAddSuffix=*/true);
2911 MCSymbol *OffsetHi =
2912 MCCtx.createTempSymbol("offset_hi", /*AlwaysAddSuffix=*/true);
2913 BuildMI(MBB, I, DL, get(AMDGPU::S_ADD_U32))
2914 .addReg(PCReg, RegState::Define, AMDGPU::sub0)
2915 .addReg(PCReg, 0, AMDGPU::sub0)
2916 .addSym(OffsetLo, MO_FAR_BRANCH_OFFSET);
2917 BuildMI(MBB, I, DL, get(AMDGPU::S_ADDC_U32))
2918 .addReg(PCReg, RegState::Define, AMDGPU::sub1)
2919 .addReg(PCReg, 0, AMDGPU::sub1)
2920 .addSym(OffsetHi, MO_FAR_BRANCH_OFFSET);
2921
2922 // Insert the indirect branch after the other terminator.
2923 BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
2924 .addReg(PCReg);
2925
2926 // If a spill is needed for the pc register pair, we need to insert a spill
2927 // restore block right before the destination block, and insert a short branch
2928 // into the old destination block's fallthrough predecessor.
2929 // e.g.:
2930 //
2931 // s_cbranch_scc0 skip_long_branch:
2932 //
2933 // long_branch_bb:
2934 // spill s[8:9]
2935 // s_getpc_b64 s[8:9]
2936 // s_add_u32 s8, s8, restore_bb
2937 // s_addc_u32 s9, s9, 0
2938 // s_setpc_b64 s[8:9]
2939 //
2940 // skip_long_branch:
2941 // foo;
2942 //
2943 // .....
2944 //
2945 // dest_bb_fallthrough_predecessor:
2946 // bar;
2947 // s_branch dest_bb
2948 //
2949 // restore_bb:
2950 // restore s[8:9]
2951 // fallthrough dest_bb
2952 //
2953 // dest_bb:
2954 // buzz;
2955
2956 Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
2957 Register Scav;
2958
2959 // If we've previously reserved a register for long branches,
2960 // avoid running the scavenger and just use that register.
2961 if (LongBranchReservedReg) {
2962 RS->enterBasicBlock(MBB);
2963 Scav = LongBranchReservedReg;
2964 } else {
2966 Scav = RS->scavengeRegisterBackwards(
2967 AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
2968 /* RestoreAfter */ false, 0, /* AllowSpill */ false);
2969 }
2970 if (Scav) {
2971 RS->setRegUsed(Scav);
2972 MRI.replaceRegWith(PCReg, Scav);
2973 MRI.clearVirtRegs();
2974 } else {
2975 // Since spilling an SGPR requires a VGPR, we reuse the temporary VGPR's
2976 // slot for the SGPR spill.
2977 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
2978 const SIRegisterInfo *TRI = ST.getRegisterInfo();
2979 TRI->spillEmergencySGPR(GetPC, RestoreBB, AMDGPU::SGPR0_SGPR1, RS);
2980 MRI.replaceRegWith(PCReg, AMDGPU::SGPR0_SGPR1);
2981 MRI.clearVirtRegs();
2982 }
2983
2984 MCSymbol *DestLabel = Scav ? DestBB.getSymbol() : RestoreBB.getSymbol();
2985 // Now that both labels are placed, the distance between them can be defined.
2987 MCSymbolRefExpr::create(DestLabel, MCCtx),
2988 MCSymbolRefExpr::create(PostGetPCLabel, MCCtx), MCCtx);
2989 // Add offset assignments.
2990 auto *Mask = MCConstantExpr::create(0xFFFFFFFFULL, MCCtx);
2991 OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Offset, Mask, MCCtx));
2992 auto *ShAmt = MCConstantExpr::create(32, MCCtx);
2993 OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Offset, ShAmt, MCCtx));
2994}
2995
2996unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
2997 switch (Cond) {
2998 case SIInstrInfo::SCC_TRUE:
2999 return AMDGPU::S_CBRANCH_SCC1;
3000 case SIInstrInfo::SCC_FALSE:
3001 return AMDGPU::S_CBRANCH_SCC0;
3002 case SIInstrInfo::VCCNZ:
3003 return AMDGPU::S_CBRANCH_VCCNZ;
3004 case SIInstrInfo::VCCZ:
3005 return AMDGPU::S_CBRANCH_VCCZ;
3006 case SIInstrInfo::EXECNZ:
3007 return AMDGPU::S_CBRANCH_EXECNZ;
3008 case SIInstrInfo::EXECZ:
3009 return AMDGPU::S_CBRANCH_EXECZ;
3010 default:
3011 llvm_unreachable("invalid branch predicate");
3012 }
3013}
3014
3015SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
3016 switch (Opcode) {
3017 case AMDGPU::S_CBRANCH_SCC0:
3018 return SCC_FALSE;
3019 case AMDGPU::S_CBRANCH_SCC1:
3020 return SCC_TRUE;
3021 case AMDGPU::S_CBRANCH_VCCNZ:
3022 return VCCNZ;
3023 case AMDGPU::S_CBRANCH_VCCZ:
3024 return VCCZ;
3025 case AMDGPU::S_CBRANCH_EXECNZ:
3026 return EXECNZ;
3027 case AMDGPU::S_CBRANCH_EXECZ:
3028 return EXECZ;
3029 default:
3030 return INVALID_BR;
3031 }
3032}
3033
3037 MachineBasicBlock *&FBB,
3039 bool AllowModify) const {
3040 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3041 // Unconditional Branch
3042 TBB = I->getOperand(0).getMBB();
3043 return false;
3044 }
3045
3046 MachineBasicBlock *CondBB = nullptr;
3047
3048 if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
3049 CondBB = I->getOperand(1).getMBB();
3050 Cond.push_back(I->getOperand(0));
3051 } else {
3052 BranchPredicate Pred = getBranchPredicate(I->getOpcode());
3053 if (Pred == INVALID_BR)
3054 return true;
3055
3056 CondBB = I->getOperand(0).getMBB();
3057 Cond.push_back(MachineOperand::CreateImm(Pred));
3058 Cond.push_back(I->getOperand(1)); // Save the branch register.
3059 }
3060 ++I;
3061
3062 if (I == MBB.end()) {
3063 // Conditional branch followed by fall-through.
3064 TBB = CondBB;
3065 return false;
3066 }
3067
3068 if (I->getOpcode() == AMDGPU::S_BRANCH) {
3069 TBB = CondBB;
3070 FBB = I->getOperand(0).getMBB();
3071 return false;
3072 }
3073
3074 return true;
3075}
3076
3078 MachineBasicBlock *&FBB,
3080 bool AllowModify) const {
3082 auto E = MBB.end();
3083 if (I == E)
3084 return false;
3085
3086 // Skip over the instructions that are artificial terminators for special
3087 // exec management.
3088 while (I != E && !I->isBranch() && !I->isReturn()) {
3089 switch (I->getOpcode()) {
3090 case AMDGPU::S_MOV_B64_term:
3091 case AMDGPU::S_XOR_B64_term:
3092 case AMDGPU::S_OR_B64_term:
3093 case AMDGPU::S_ANDN2_B64_term:
3094 case AMDGPU::S_AND_B64_term:
3095 case AMDGPU::S_AND_SAVEEXEC_B64_term:
3096 case AMDGPU::S_MOV_B32_term:
3097 case AMDGPU::S_XOR_B32_term:
3098 case AMDGPU::S_OR_B32_term:
3099 case AMDGPU::S_ANDN2_B32_term:
3100 case AMDGPU::S_AND_B32_term:
3101 case AMDGPU::S_AND_SAVEEXEC_B32_term:
3102 break;
3103 case AMDGPU::SI_IF:
3104 case AMDGPU::SI_ELSE:
3105 case AMDGPU::SI_KILL_I1_TERMINATOR:
3106 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
3107 // FIXME: It's messy that these need to be considered here at all.
3108 return true;
3109 default:
3110 llvm_unreachable("unexpected non-branch terminator inst");
3111 }
3112
3113 ++I;
3114 }
3115
3116 if (I == E)
3117 return false;
3118
3119 return analyzeBranchImpl(MBB, I, TBB, FBB, Cond, AllowModify);
3120}
3121
3123 int *BytesRemoved) const {
3124 unsigned Count = 0;
3125 unsigned RemovedSize = 0;
3127 // Skip over artificial terminators when removing instructions.
3128 if (MI.isBranch() || MI.isReturn()) {
3129 RemovedSize += getInstSizeInBytes(MI);
3130 MI.eraseFromParent();
3131 ++Count;
3132 }
3133 }
3134
3135 if (BytesRemoved)
3136 *BytesRemoved = RemovedSize;
3137
3138 return Count;
3139}
3140
3141// Copy the flags onto the implicit condition register operand.
3143 const MachineOperand &OrigCond) {
3144 CondReg.setIsUndef(OrigCond.isUndef());
3145 CondReg.setIsKill(OrigCond.isKill());
3146}
3147
3150 MachineBasicBlock *FBB,
3152 const DebugLoc &DL,
3153 int *BytesAdded) const {
3154 if (!FBB && Cond.empty()) {
3155 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3156 .addMBB(TBB);
3157 if (BytesAdded)
3158 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3159 return 1;
3160 }
3161
3162 if (Cond.size() == 1 && Cond[0].isReg()) {
3163 BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
3164 .add(Cond[0])
3165 .addMBB(TBB);
3166 return 1;
3167 }
3168
3169 assert(TBB && Cond[0].isImm());
3170
3171 unsigned Opcode
3172 = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
3173
3174 if (!FBB) {
3175 MachineInstr *CondBr =
3176 BuildMI(&MBB, DL, get(Opcode))
3177 .addMBB(TBB);
3178
3179 // Copy the flags onto the implicit condition register operand.
3180 preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
3181 fixImplicitOperands(*CondBr);
3182
3183 if (BytesAdded)
3184 *BytesAdded = ST.hasOffset3fBug() ? 8 : 4;
3185 return 1;
3186 }
3187
3188 assert(TBB && FBB);
3189
3190 MachineInstr *CondBr =
3191 BuildMI(&MBB, DL, get(Opcode))
3192 .addMBB(TBB);
3193 fixImplicitOperands(*CondBr);
3194 BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
3195 .addMBB(FBB);
3196
3197 MachineOperand &CondReg = CondBr->getOperand(1);
3198 CondReg.setIsUndef(Cond[1].isUndef());
3199 CondReg.setIsKill(Cond[1].isKill());
3200
3201 if (BytesAdded)
3202 *BytesAdded = ST.hasOffset3fBug() ? 16 : 8;
3203
3204 return 2;
3205}
3206
3209 if (Cond.size() != 2) {
3210 return true;
3211 }
3212
3213 if (Cond[0].isImm()) {
3214 Cond[0].setImm(-Cond[0].getImm());
3215 return false;
3216 }
3217
3218 return true;
3219}
3220
3223 Register DstReg, Register TrueReg,
3224 Register FalseReg, int &CondCycles,
3225 int &TrueCycles, int &FalseCycles) const {
3226 switch (Cond[0].getImm()) {
3227 case VCCNZ:
3228 case VCCZ: {
3230 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3231 if (MRI.getRegClass(FalseReg) != RC)
3232 return false;
3233
3234 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3235 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3236
3237 // Limit to equal cost for branch vs. N v_cndmask_b32s.
3238 return RI.hasVGPRs(RC) && NumInsts <= 6;
3239 }
3240 case SCC_TRUE:
3241 case SCC_FALSE: {
3242 // FIXME: We could insert for VGPRs if we could replace the original compare
3243 // with a vector one.
3245 const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
3246 if (MRI.getRegClass(FalseReg) != RC)
3247 return false;
3248
3249 int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
3250
3251 // Sizes that are a multiple of 64 bits can use s_cselect_b64.
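// E.g. a 128-bit SGPR select lowers to 2 x s_cselect_b64 instead of
// 4 x s_cselect_b32.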
3252 if (NumInsts % 2 == 0)
3253 NumInsts /= 2;
3254
3255 CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
3256 return RI.isSGPRClass(RC);
3257 }
3258 default:
3259 return false;
3260 }
3261}
3262
3266 Register TrueReg, Register FalseReg) const {
3267 BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
3268 if (Pred == VCCZ || Pred == SCC_FALSE) {
3269 Pred = static_cast<BranchPredicate>(-Pred);
3270 std::swap(TrueReg, FalseReg);
3271 }
3272
3274 const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
3275 unsigned DstSize = RI.getRegSizeInBits(*DstRC);
3276
3277 if (DstSize == 32) {
3279 if (Pred == SCC_TRUE) {
3280 Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
3281 .addReg(TrueReg)
3282 .addReg(FalseReg);
3283 } else {
3284 // Instruction's operands are backwards from what is expected.
3285 Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
3286 .addReg(FalseReg)
3287 .addReg(TrueReg);
3288 }
3289
3290 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3291 return;
3292 }
3293
3294 if (DstSize == 64 && Pred == SCC_TRUE) {
3296 BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
3297 .addReg(TrueReg)
3298 .addReg(FalseReg);
3299
3300 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3301 return;
3302 }
3303
3304 static const int16_t Sub0_15[] = {
3305 AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
3306 AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
3307 AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
3308 AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
3309 };
3310
3311 static const int16_t Sub0_15_64[] = {
3312 AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
3313 AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
3314 AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
3315 AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
3316 };
3317
3318 unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
3319 const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
3320 const int16_t *SubIndices = Sub0_15;
3321 int NElts = DstSize / 32;
3322
3323 // 64-bit select is only available for SALU.
3324 // TODO: Split 96-bit into 64-bit and 32-bit, not 3x 32-bit.
3325 if (Pred == SCC_TRUE) {
3326 if (NElts % 2) {
3327 SelOp = AMDGPU::S_CSELECT_B32;
3328 EltRC = &AMDGPU::SGPR_32RegClass;
3329 } else {
3330 SelOp = AMDGPU::S_CSELECT_B64;
3331 EltRC = &AMDGPU::SGPR_64RegClass;
3332 SubIndices = Sub0_15_64;
3333 NElts /= 2;
3334 }
3335 }
3336
3338 MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
3339
3340 I = MIB->getIterator();
3341
3343 for (int Idx = 0; Idx != NElts; ++Idx) {
3344 Register DstElt = MRI.createVirtualRegister(EltRC);
3345 Regs.push_back(DstElt);
3346
3347 unsigned SubIdx = SubIndices[Idx];
3348
3350 if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
3351 Select =
3352 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3353 .addReg(FalseReg, 0, SubIdx)
3354 .addReg(TrueReg, 0, SubIdx);
3355 } else {
3356 Select =
3357 BuildMI(MBB, I, DL, get(SelOp), DstElt)
3358 .addReg(TrueReg, 0, SubIdx)
3359 .addReg(FalseReg, 0, SubIdx);
3360 }
3361
3362 preserveCondRegFlags(Select->getOperand(3), Cond[1]);
3364
3365 MIB.addReg(DstElt)
3366 .addImm(SubIdx);
3367 }
3368}
3369
3371 switch (MI.getOpcode()) {
3372 case AMDGPU::V_MOV_B32_e32:
3373 case AMDGPU::V_MOV_B32_e64:
3374 case AMDGPU::V_MOV_B64_PSEUDO:
3375 case AMDGPU::V_MOV_B64_e32:
3376 case AMDGPU::V_MOV_B64_e64:
3377 case AMDGPU::S_MOV_B32:
3378 case AMDGPU::S_MOV_B64:
3379 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3380 case AMDGPU::COPY:
3381 case AMDGPU::WWM_COPY:
3382 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3383 case AMDGPU::V_ACCVGPR_READ_B32_e64:
3384 case AMDGPU::V_ACCVGPR_MOV_B32:
3385 return true;
3386 default:
3387 return false;
3388 }
3389}
3390
3391static constexpr unsigned ModifierOpNames[] = {
3392 AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers,
3393 AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::clamp,
3394 AMDGPU::OpName::omod, AMDGPU::OpName::op_sel};
3395
3397 unsigned Opc = MI.getOpcode();
3398 for (unsigned Name : reverse(ModifierOpNames)) {
3400 if (Idx >= 0)
3401 MI.removeOperand(Idx);
3402 }
3403}
3404
3406 Register Reg, MachineRegisterInfo *MRI) const {
3407 if (!MRI->hasOneNonDBGUse(Reg))
3408 return false;
3409
3410 switch (DefMI.getOpcode()) {
3411 default:
3412 return false;
3413 case AMDGPU::V_MOV_B64_e32:
3414 case AMDGPU::S_MOV_B64:
3415 case AMDGPU::V_MOV_B64_PSEUDO:
3416 case AMDGPU::S_MOV_B64_IMM_PSEUDO:
3417 case AMDGPU::V_MOV_B32_e32:
3418 case AMDGPU::S_MOV_B32:
3419 case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
3420 break;
3421 }
3422
3423 const MachineOperand *ImmOp = getNamedOperand(DefMI, AMDGPU::OpName::src0);
3424 assert(ImmOp);
3425 // FIXME: We could handle FrameIndex values here.
3426 if (!ImmOp->isImm())
3427 return false;
3428
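// getImmFor slices the materialized 64-bit value according to the use's
// subregister, e.g. a use through sub1 folds the high 32 bits and a use
// through hi16 folds bits [31:16] of the low half.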
3429 auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t {
3430 int64_t Imm = ImmOp->getImm();
3431 switch (UseOp.getSubReg()) {
3432 default:
3433 return Imm;
3434 case AMDGPU::sub0:
3435 return Lo_32(Imm);
3436 case AMDGPU::sub1:
3437 return Hi_32(Imm);
3438 case AMDGPU::lo16:
3439 return APInt(16, Imm).getSExtValue();
3440 case AMDGPU::hi16:
3441 return APInt(32, Imm).ashr(16).getSExtValue();
3442 case AMDGPU::sub1_lo16:
3443 return APInt(16, Hi_32(Imm)).getSExtValue();
3444 case AMDGPU::sub1_hi16:
3445 return APInt(32, Hi_32(Imm)).ashr(16).getSExtValue();
3446 }
3447 };
3448
3449 assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
3450
3451 unsigned Opc = UseMI.getOpcode();
3452 if (Opc == AMDGPU::COPY) {
3453 assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
3454
3455 Register DstReg = UseMI.getOperand(0).getReg();
3456 unsigned OpSize = getOpSize(UseMI, 0);
3457 bool Is16Bit = OpSize == 2;
3458 bool Is64Bit = OpSize == 8;
3459 bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
3460 unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
3461 : AMDGPU::V_MOV_B32_e32
3462 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
3463 : AMDGPU::S_MOV_B32;
3464 APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)));
3465
3466 if (RI.isAGPR(*MRI, DstReg)) {
3467 if (Is64Bit || !isInlineConstant(Imm))
3468 return false;
3469 NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
3470 }
3471
3472 if (Is16Bit) {
3473 if (isVGPRCopy)
3474 return false; // Do not clobber vgpr_hi16
3475
3476 if (DstReg.isVirtual() && UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
3477 return false;
3478
3479 UseMI.getOperand(0).setSubReg(0);
3480 if (DstReg.isPhysical()) {
3481 DstReg = RI.get32BitRegister(DstReg);
3482 UseMI.getOperand(0).setReg(DstReg);
3483 }
3484 assert(UseMI.getOperand(1).getReg().isVirtual());
3485 }
3486
3487 const MCInstrDesc &NewMCID = get(NewOpc);
3488 if (DstReg.isPhysical() &&
3489 !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg))
3490 return false;
3491
3492 UseMI.setDesc(NewMCID);
3493 UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
3494 UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
3495 return true;
3496 }
3497
3498 if (Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3499 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3500 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3501 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3502 Opc == AMDGPU::V_FMAC_F16_t16_e64) {
3503 // Don't fold if we are using source or output modifiers. The new VOP2
3504 // instructions don't have them.
3506 return false;
3507
3508 // If this is a free constant, there's no reason to do this.
3509 // TODO: We could fold this here instead of letting SIFoldOperands do it
3510 // later.
3511 MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
3512
3513 // Any src operand can be used for the legality check.
3514 if (isInlineConstant(UseMI, *Src0, *ImmOp))
3515 return false;
3516
3517 bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 ||
3518 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64;
3519 bool IsFMA =
3520 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3521 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3522 Opc == AMDGPU::V_FMAC_F16_t16_e64;
3523 MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
3524 MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
3525
3526 // Multiplied part is the constant: Use v_madmk_{f16, f32}.
3527 if ((Src0->isReg() && Src0->getReg() == Reg) ||
3528 (Src1->isReg() && Src1->getReg() == Reg)) {
3529 MachineOperand *RegSrc =
3530 Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
3531 if (!RegSrc->isReg())
3532 return false;
3533 if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
3534 ST.getConstantBusLimit(Opc) < 2)
3535 return false;
3536
3537 if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
3538 return false;
3539
3540 // If src2 is also a literal constant then we have to choose which one to
3541 // fold. In general it is better to choose madak so that the other literal
3542 // can be materialized in an sgpr instead of a vgpr:
3543 // s_mov_b32 s0, literal
3544 // v_madak_f32 v0, s0, v0, literal
3545 // Instead of:
3546 // v_mov_b32 v1, literal
3547 // v_madmk_f32 v0, v0, literal, v1
3548 MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
3549 if (Def && Def->isMoveImmediate() &&
3550 !isInlineConstant(Def->getOperand(1)))
3551 return false;
3552
3553 unsigned NewOpc =
3554 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3555 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
3556 : AMDGPU::V_FMAMK_F16)
3557 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
3558 if (pseudoToMCOpcode(NewOpc) == -1)
3559 return false;
3560
3561 // V_FMAMK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
3562 // would also require restricting their register classes. For now
3563 // just bail out.
3564 if (NewOpc == AMDGPU::V_FMAMK_F16_t16)
3565 return false;
3566
3567 const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
3568
3569 // FIXME: This would be a lot easier if we could return a new instruction
3570 // instead of having to modify in place.
3571
3572 Register SrcReg = RegSrc->getReg();
3573 unsigned SrcSubReg = RegSrc->getSubReg();
3574 Src0->setReg(SrcReg);
3575 Src0->setSubReg(SrcSubReg);
3576 Src0->setIsKill(RegSrc->isKill());
3577
3578 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3579 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3580 Opc == AMDGPU::V_FMAC_F16_e64)
3581 UseMI.untieRegOperand(
3582 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3583
3584 Src1->ChangeToImmediate(Imm);
3585
3586 removeModOperands(UseMI);
3587 UseMI.setDesc(get(NewOpc));
3588
3589 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3590 if (DeleteDef)
3591 DefMI.eraseFromParent();
3592
3593 return true;
3594 }
3595
3596 // Added part is the constant: Use v_madak_{f16, f32}.
3597 if (Src2->isReg() && Src2->getReg() == Reg) {
3598 if (ST.getConstantBusLimit(Opc) < 2) {
3599 // Not allowed to use constant bus for another operand.
3600 // We can however allow an inline immediate as src0.
3601 bool Src0Inlined = false;
3602 if (Src0->isReg()) {
3603 // Try to inline the constant if possible.
3604 // If the def is a move of an immediate and this is its only use,
3605 // we save a VGPR here.
3606 MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
3607 if (Def && Def->isMoveImmediate() &&
3608 isInlineConstant(Def->getOperand(1)) &&
3609 MRI->hasOneUse(Src0->getReg())) {
3610 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3611 Src0Inlined = true;
3612 } else if (ST.getConstantBusLimit(Opc) <= 1 &&
3613 RI.isSGPRReg(*MRI, Src0->getReg())) {
3614 return false;
3615 }
3616 // VGPR is okay as Src0 - fallthrough
3617 }
3618
3619 if (Src1->isReg() && !Src0Inlined) {
3620 // We still have one slot available for an inlinable constant - try to fill it.
3621 MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
3622 if (Def && Def->isMoveImmediate() &&
3623 isInlineConstant(Def->getOperand(1)) &&
3624 MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
3625 Src0->ChangeToImmediate(Def->getOperand(1).getImm());
3626 else if (RI.isSGPRReg(*MRI, Src1->getReg()))
3627 return false;
3628 // VGPR is okay as Src1 - fallthrough
3629 }
3630 }
3631
3632 unsigned NewOpc =
3633 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3634 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
3635 : AMDGPU::V_FMAAK_F16)
3636 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
3637 if (pseudoToMCOpcode(NewOpc) == -1)
3638 return false;
3639
3640 // V_FMAAK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
3641 // would also require restricting their register classes. For now
3642 // just bail out.
3643 if (NewOpc == AMDGPU::V_FMAAK_F16_t16)
3644 return false;
3645
3646 // FIXME: This would be a lot easier if we could return a new instruction
3647 // instead of having to modify in place.
3648
3649 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3650 Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3651 Opc == AMDGPU::V_FMAC_F16_e64)
3652 UseMI.untieRegOperand(
3653 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
3654
3655 // ChangingToImmediate adds Src2 back to the instruction.
3656 Src2->ChangeToImmediate(getImmFor(*Src2));
3657
3658 // These come before src2.
3659 removeModOperands(UseMI);
3660 UseMI.setDesc(get(NewOpc));
3661 // It might happen that UseMI was commuted
3662 // and we now have an SGPR as src1. If so, two inlined
3663 // constants and an SGPR are illegal, so legalize the operands.
3664 legalizeOperands(UseMI);
3665
3666 bool DeleteDef = MRI->use_nodbg_empty(Reg);
3667 if (DeleteDef)
3668 DefMI.eraseFromParent();
3669
3670 return true;
3671 }
3672 }
3673
3674 return false;
3675}
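// Illustrative example (annotation, not part of the original file): with a
// literal 100.0f (0x42c80000) known to be in v1, the folding above rewrites,
// in rough GCN assembly,
//   v_mov_b32 v1, 0x42c80000
//   v_mad_f32 v0, v1, v2, v3     ; a multiplied operand holds the literal
// into
//   v_madmk_f32 v0, v2, 0x42c80000, v3
// and, when the added operand (src2) holds the literal,
//   v_mad_f32 v0, v2, v3, v1
// into
//   v_madak_f32 v0, v2, v3, 0x42c80000
// so the constant is encoded in the instruction and DefMI can be erased.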
3676
3677 static bool
3678 memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
3679 ArrayRef<const MachineOperand *> BaseOps2) {
3680 if (BaseOps1.size() != BaseOps2.size())
3681 return false;
3682 for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
3683 if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
3684 return false;
3685 }
3686 return true;
3687}
3688
3689static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA,
3690 LocationSize WidthB, int OffsetB) {
3691 int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
3692 int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
3693 LocationSize LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
3694 return LowWidth.hasValue() &&
3695 LowOffset + (int)LowWidth.getValue() <= HighOffset;
3696}
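// A minimal standalone sketch (annotation, not part of the original file) of
// the disjointness test above, using plain integers in place of LocationSize:
// two known-width accesses are disjoint when the lower one ends at or before
// the higher one begins.
#if 0
#include <algorithm>
#include <cassert>

static bool offsetsDoNotOverlapSketch(int WidthA, int OffsetA,
                                      int WidthB, int OffsetB) {
  int LowOffset = std::min(OffsetA, OffsetB);
  int HighOffset = std::max(OffsetA, OffsetB);
  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
  return LowOffset + LowWidth <= HighOffset;
}

int main() {
  assert(offsetsDoNotOverlapSketch(4, 0, 4, 4));  // [0,4) vs [4,8): disjoint
  assert(!offsetsDoNotOverlapSketch(8, 0, 4, 4)); // [0,8) covers [4,8)
  return 0;
}
#endif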
3697
3698bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
3699 const MachineInstr &MIb) const {
3700 SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
3701 int64_t Offset0, Offset1;
3702 LocationSize Dummy0 = 0, Dummy1 = 0;
3703 bool Offset0IsScalable, Offset1IsScalable;
3704 if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
3705 Dummy0, &RI) ||
3706 !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
3707 Dummy1, &RI))
3708 return false;
3709
3710 if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
3711 return false;
3712
3713 if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
3714 // FIXME: Handle ds_read2 / ds_write2.
3715 return false;
3716 }
3717 LocationSize Width0 = MIa.memoperands().front()->getSize();
3718 LocationSize Width1 = MIb.memoperands().front()->getSize();
3719 return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
3720}
3721
3722 bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
3723 const MachineInstr &MIb) const {
3724 assert(MIa.mayLoadOrStore() &&
3725 "MIa must load from or modify a memory location");
3726 assert(MIb.mayLoadOrStore() &&
3727 "MIb must load from or modify a memory location");
3728
3729 if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
3730 return false;
3731
3732 // XXX - Can we relax this between address spaces?
3733 if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
3734 return false;
3735
3736 if (isLDSDMA(MIa) || isLDSDMA(MIb))
3737 return false;
3738
3739 // TODO: Should we check the address space from the MachineMemOperand? That
3740 // would allow us to distinguish objects we know don't alias based on the
3741 // underlying address space, even if it was lowered to a different one,
3742 // e.g. private accesses lowered to use MUBUF instructions on a scratch
3743 // buffer.
3744 if (isDS(MIa)) {
3745 if (isDS(MIb))
3746 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3747
3748 return !isFLAT(MIb) || isSegmentSpecificFLAT(MIb);
3749 }
3750
3751 if (isMUBUF(MIa) || isMTBUF(MIa)) {
3752 if (isMUBUF(MIb) || isMTBUF(MIb))
3753 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3754
3755 if (isFLAT(MIb))
3756 return isFLATScratch(MIb);
3757
3758 return !isSMRD(MIb);
3759 }
3760
3761 if (isSMRD(MIa)) {
3762 if (isSMRD(MIb))
3763 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3764
3765 if (isFLAT(MIb))
3766 return isFLATScratch(MIb);
3767
3768 return !isMUBUF(MIb) && !isMTBUF(MIb);
3769 }
3770
3771 if (isFLAT(MIa)) {
3772 if (isFLAT(MIb)) {
3773 if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
3774 (isFLATGlobal(MIa) && isFLATScratch(MIb)))
3775 return true;
3776
3777 return checkInstOffsetsDoNotOverlap(MIa, MIb);
3778 }
3779
3780 return false;
3781 }
3782
3783 return false;
3784}
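// Annotation (not part of the original file): the cascade above encodes which
// memory kinds can never alias by construction -- LDS (DS) accesses cannot
// alias buffer (MUBUF/MTBUF) or scalar (SMRD) memory, and scratch FLAT cannot
// alias global FLAT -- while two accesses of the same kind fall back to the
// base-plus-offset test in checkInstOffsetsDoNotOverlap(). Generic FLAT
// accesses are conservatively assumed to alias everything else.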
3785
3786 static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI,
3787 int64_t &Imm, MachineInstr **DefMI = nullptr) {
3788 if (Reg.isPhysical())
3789 return false;
3790 auto *Def = MRI.getUniqueVRegDef(Reg);
3791 if (Def && SIInstrInfo::isFoldableCopy(*Def) && Def->getOperand(1).isImm()) {
3792 Imm = Def->getOperand(1).getImm();
3793 if (DefMI)
3794 *DefMI = Def;
3795 return true;
3796 }
3797 return false;
3798}
3799
3800static bool getFoldableImm(const MachineOperand *MO, int64_t &Imm,
3801 MachineInstr **DefMI = nullptr) {
3802 if (!MO->isReg())
3803 return false;
3804 const MachineFunction *MF = MO->getParent()->getParent()->getParent();
3805 const MachineRegisterInfo &MRI = MF->getRegInfo();
3806 return getFoldableImm(MO->getReg(), MRI, Imm, DefMI);
3807}
3808
3809 static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI,
3810 MachineInstr &NewMI) {
3811 if (LV) {
3812 unsigned NumOps = MI.getNumOperands();
3813 for (unsigned I = 1; I < NumOps; ++I) {
3814 MachineOperand &Op = MI.getOperand(I);
3815 if (Op.isReg() && Op.isKill())
3816 LV->replaceKillInstruction(Op.getReg(), MI, NewMI);
3817 }
3818 }
3819}
3820
3821 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
3822 LiveVariables *LV,
3823 LiveIntervals *LIS) const {
3824 MachineBasicBlock &MBB = *MI.getParent();
3825 unsigned Opc = MI.getOpcode();
3826
3827 // Handle MFMA.
3828 int NewMFMAOpc = AMDGPU::getMFMAEarlyClobberOp(Opc);
3829 if (NewMFMAOpc != -1) {
3830 MachineInstrBuilder MIB =
3831 BuildMI(MBB, MI, MI.getDebugLoc(), get(NewMFMAOpc));
3832 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3833 MIB.add(MI.getOperand(I));
3834 updateLiveVariables(LV, MI, *MIB);
3835 if (LIS) {
3836 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3837 // SlotIndex of defs needs to be updated when converting to early-clobber
3838 MachineOperand &Def = MIB->getOperand(0);
3839 if (Def.isEarlyClobber() && Def.isReg() &&
3840 LIS->hasInterval(Def.getReg())) {
3841 SlotIndex OldIndex = LIS->getInstructionIndex(*MIB).getRegSlot(false);
3842 SlotIndex NewIndex = LIS->getInstructionIndex(*MIB).getRegSlot(true);
3843 auto &LI = LIS->getInterval(Def.getReg());
3844 auto UpdateDefIndex = [&](LiveRange &LR) {
3845 auto S = LR.find(OldIndex);
3846 if (S != LR.end() && S->start == OldIndex) {
3847 assert(S->valno && S->valno->def == OldIndex);
3848 S->start = NewIndex;
3849 S->valno->def = NewIndex;
3850 }
3851 };
3852 UpdateDefIndex(LI);
3853 for (auto &SR : LI.subranges())
3854 UpdateDefIndex(SR);
3855 }
3856 }
3857 return MIB;
3858 }
3859
3860 if (SIInstrInfo::isWMMA(MI)) {
3861 unsigned NewOpc = AMDGPU::mapWMMA2AddrTo3AddrOpcode(MI.getOpcode());
3862 MachineInstrBuilder MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3863 .setMIFlags(MI.getFlags());
3864 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
3865 MIB->addOperand(MI.getOperand(I));
3866
3867 updateLiveVariables(LV, MI, *MIB);
3868 if (LIS)
3869 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3870
3871 return MIB;
3872 }
3873
3874 assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
3875 "V_FMAC_F16_t16_e32 is not supported and not expected to be present "
3876 "pre-RA");
3877
3878 // Handle MAC/FMAC.
3879 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
3880 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3881 Opc == AMDGPU::V_FMAC_F16_t16_e64;
3882 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
3883 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3884 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
3885 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3886 Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
3887 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3888 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
3889 bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 ||
3890 Opc == AMDGPU::V_MAC_LEGACY_F32_e64 ||
3891 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
3892 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64;
3893 bool Src0Literal = false;
3894
3895 switch (Opc) {
3896 default:
3897 return nullptr;
3898 case AMDGPU::V_MAC_F16_e64:
3899 case AMDGPU::V_FMAC_F16_e64:
3900 case AMDGPU::V_FMAC_F16_t16_e64:
3901 case AMDGPU::V_MAC_F32_e64:
3902 case AMDGPU::V_MAC_LEGACY_F32_e64:
3903 case AMDGPU::V_FMAC_F32_e64:
3904 case AMDGPU::V_FMAC_LEGACY_F32_e64:
3905 case AMDGPU::V_FMAC_F64_e64:
3906 break;
3907 case AMDGPU::V_MAC_F16_e32:
3908 case AMDGPU::V_FMAC_F16_e32:
3909 case AMDGPU::V_MAC_F32_e32:
3910 case AMDGPU::V_MAC_LEGACY_F32_e32:
3911 case AMDGPU::V_FMAC_F32_e32:
3912 case AMDGPU::V_FMAC_LEGACY_F32_e32:
3913 case AMDGPU::V_FMAC_F64_e32: {
3914 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
3915 AMDGPU::OpName::src0);
3916 const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
3917 if (!Src0->isReg() && !Src0->isImm())
3918 return nullptr;
3919
3920 if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
3921 Src0Literal = true;
3922
3923 break;
3924 }
3925 }
3926
3927 MachineInstrBuilder MIB;
3928 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
3929 const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
3930 const MachineOperand *Src0Mods =
3931 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
3932 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
3933 const MachineOperand *Src1Mods =
3934 getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
3935 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
3936 const MachineOperand *Src2Mods =
3937 getNamedOperand(MI, AMDGPU::OpName::src2_modifiers);
3938 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
3939 const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
3940 const MachineOperand *OpSel = getNamedOperand(MI, AMDGPU::OpName::op_sel);
3941
3942 if (!Src0Mods && !Src1Mods && !Src2Mods && !Clamp && !Omod && !IsF64 &&
3943 !IsLegacy &&
3944 // If we have an SGPR input, we will violate the constant bus restriction.
3945 (ST.getConstantBusLimit(Opc) > 1 || !Src0->isReg() ||
3946 !RI.isSGPRReg(MBB.getParent()->getRegInfo(), Src0->getReg()))) {
3947 MachineInstr *DefMI;
3948 const auto killDef = [&]() -> void {
3949 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
3950 // The only user is the instruction which will be killed.
3951 Register DefReg = DefMI->getOperand(0).getReg();
3952 if (!MRI.hasOneNonDBGUse(DefReg))
3953 return;
3954 // We cannot just remove DefMI here; the calling pass would crash.
3955 DefMI->setDesc(get(AMDGPU::IMPLICIT_DEF));
3956 for (unsigned I = DefMI->getNumOperands() - 1; I != 0; --I)
3957 DefMI->removeOperand(I);
3958 if (LV)
3959 LV->getVarInfo(DefReg).AliveBlocks.clear();
3960 };
3961
3962 int64_t Imm;
3963 if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) {
3964 unsigned NewOpc =
3965 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16
3966 : AMDGPU::V_FMAAK_F16)
3967 : AMDGPU::V_FMAAK_F32)
3968 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
3969 if (pseudoToMCOpcode(NewOpc) != -1) {
3970 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3971 .add(*Dst)
3972 .add(*Src0)
3973 .add(*Src1)
3974 .addImm(Imm);
3975 updateLiveVariables(LV, MI, *MIB);
3976 if (LIS)
3977 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3978 killDef();
3979 return MIB;
3980 }
3981 }
3982 unsigned NewOpc =
3983 IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
3984 : AMDGPU::V_FMAMK_F16)
3985 : AMDGPU::V_FMAMK_F32)
3986 : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
3987 if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) {
3988 if (pseudoToMCOpcode(NewOpc) != -1) {
3989 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
3990 .add(*Dst)
3991 .add(*Src0)
3992 .addImm(Imm)
3993 .add(*Src2);
3994 updateLiveVariables(LV, MI, *MIB);
3995 if (LIS)
3996 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
3997 killDef();
3998 return MIB;
3999 }
4000 }
4001 if (Src0Literal || getFoldableImm(Src0, Imm, &DefMI)) {
4002 if (Src0Literal) {
4003 Imm = Src0->getImm();
4004 DefMI = nullptr;
4005 }
4006 if (pseudoToMCOpcode(NewOpc) != -1 &&
4007 isOperandLegal(
4008 MI, AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::src0),
4009 Src1)) {
4010 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4011 .add(*Dst)
4012 .add(*Src1)
4013 .addImm(Imm)
4014 .add(*Src2);
4015 updateLiveVariables(LV, MI, *MIB);
4016 if (LIS)
4017 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4018 if (DefMI)
4019 killDef();
4020 return MIB;
4021 }
4022 }
4023 }
4024
4025 // VOP2 mac/fmac with a literal operand cannot be converted to VOP3 mad/fma
4026 // if VOP3 does not allow a literal operand.
4027 if (Src0Literal && !ST.hasVOP3Literal())
4028 return nullptr;
4029
4030 unsigned NewOpc = IsFMA ? IsF16 ? AMDGPU::V_FMA_F16_gfx9_e64
4031 : IsF64 ? AMDGPU::V_FMA_F64_e64
4032 : IsLegacy
4033 ? AMDGPU::V_FMA_LEGACY_F32_e64
4034 : AMDGPU::V_FMA_F32_e64
4035 : IsF16 ? AMDGPU::V_MAD_F16_e64
4036 : IsLegacy ? AMDGPU::V_MAD_LEGACY_F32_e64
4037 : AMDGPU::V_MAD_F32_e64;
4038 if (pseudoToMCOpcode(NewOpc) == -1)
4039 return nullptr;
4040
4041 MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc))
4042 .add(*Dst)
4043 .addImm(Src0Mods ? Src0Mods->getImm() : 0)
4044 .add(*Src0)
4045 .addImm(Src1Mods ? Src1Mods->getImm() : 0)
4046 .add(*Src1)
4047 .addImm(Src2Mods ? Src2Mods->getImm() : 0)
4048 .add(*Src2)
4049 .addImm(Clamp ? Clamp->getImm() : 0)
4050 .addImm(Omod ? Omod->getImm() : 0);
4051 if (AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
4052 MIB.addImm(OpSel ? OpSel->getImm() : 0);
4053 updateLiveVariables(LV, MI, *MIB);
4054 if (LIS)
4055 LIS->ReplaceMachineInstrInMaps(MI, *MIB);
4056 return MIB;
4057}
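// Annotation (not part of the original file): the two-address MAC/FMAC forms
// tie the destination to src2, e.g. (rough assembly)
//   v_mac_f32_e32 v0, v1, v2          ; v0 = v1 * v2 + v0, v0 tied to src2
// is converted to the untied three-address form
//   v_mad_f32_e64 v0, v1, v2, v0
// and, when one source is produced by a foldable move-immediate, to
// v_madak/v_madmk (or v_fmaak/v_fmamk) so the constant is carried as an
// instruction-encoded literal instead of occupying a register.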
4058
4059// It's not generally safe to move VALU instructions across these since it will
4060// start using the register as a base index rather than directly.
4061// XXX - Why isn't hasSideEffects sufficient for these?
4062 static bool changesVGPRIndexingMode(const MachineInstr &MI) {
4063 switch (MI.getOpcode()) {
4064 case AMDGPU::S_SET_GPR_IDX_ON:
4065 case AMDGPU::S_SET_GPR_IDX_MODE:
4066 case AMDGPU::S_SET_GPR_IDX_OFF:
4067 return true;
4068 default:
4069 return false;
4070 }
4071}
4072
4073 bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
4074 const MachineBasicBlock *MBB,
4075 const MachineFunction &MF) const {
4076 // Skipping the check for SP writes in the base implementation. The reason it
4077 // was added was apparently due to compile time concerns.
4078 //
4079 // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
4080 // but is probably avoidable.
4081
4082 // Copied from base implementation.
4083 // Terminators and labels can't be scheduled around.
4084 if (MI.isTerminator() || MI.isPosition())
4085 return true;
4086
4087 // INLINEASM_BR can jump to another block
4088 if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
4089 return true;
4090
4091 if (MI.getOpcode() == AMDGPU::SCHED_BARRIER && MI.getOperand(0).getImm() == 0)
4092 return true;
4093
4094 // Target-independent instructions do not have an implicit-use of EXEC, even
4095 // when they operate on VGPRs. Treating EXEC modifications as scheduling
4096 // boundaries prevents incorrect movements of such instructions.
4097 return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
4098 MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
4099 MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
4100 MI.getOpcode() == AMDGPU::S_SETPRIO ||
4101 changesVGPRIndexingMode(MI);
4102}
4103
4104 bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
4105 return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
4106}
4107
4108 bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
4109 // Skip the full operand and register alias search modifiesRegister
4110 // does. There's only a handful of instructions that touch this, it's only an
4111 // implicit def, and doesn't alias any other registers.
4112 return is_contained(MI.getDesc().implicit_defs(), AMDGPU::MODE);
4113}
4114
4115 bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
4116 unsigned Opcode = MI.getOpcode();
4117
4118 if (MI.mayStore() && isSMRD(MI))
4119 return true; // scalar store or atomic
4120
4121 // This will terminate the function when other lanes may need to continue.
4122 if (MI.isReturn())
4123 return true;
4124
4125 // These instructions cause shader I/O that may cause hardware lockups
4126 // when executed with an empty EXEC mask.
4127 //
4128 // Note: exp with VM = DONE = 0 is automatically skipped by hardware when
4129 // EXEC = 0, but checking for that case here seems not worth it
4130 // given the typical code patterns.
4131 if (Opcode == AMDGPU::S_SENDMSG || Opcode == AMDGPU::S_SENDMSGHALT ||
4132 isEXP(Opcode) ||
4133 Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::S_TRAP ||
4134 Opcode == AMDGPU::DS_GWS_INIT || Opcode == AMDGPU::DS_GWS_BARRIER)
4135 return true;
4136
4137 if (MI.isCall() || MI.isInlineAsm())
4138 return true; // conservative assumption
4139
4140 // A mode change is a scalar operation that influences vector instructions.
4141 if (modifiesModeRegister(MI))
4142 return true;
4143
4144 // These are like SALU instructions in terms of effects, so it's questionable
4145 // whether we should return true for those.
4146 //
4147 // However, executing them with EXEC = 0 causes them to operate on undefined
4148 // data, which we avoid by returning true here.
4149 if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
4150 Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
4151 Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
4152 Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
4153 return true;
4154
4155 return false;
4156}
4157
4158 bool SIInstrInfo::mayReadEXEC(const MachineRegisterInfo &MRI,
4159 const MachineInstr &MI) const {
4160 if (MI.isMetaInstruction())
4161 return false;
4162
4163 // This won't read exec if this is an SGPR->SGPR copy.
4164 if (MI.isCopyLike()) {
4165 if (!RI.isSGPRReg(MRI, MI.getOperand(0).getReg()))
4166 return true;
4167
4168 // Make sure this isn't copying exec as a normal operand
4169 return MI.readsRegister(AMDGPU::EXEC, &RI);
4170 }
4171
4172 // Make a conservative assumption about the callee.
4173 if (MI.isCall())
4174 return true;
4175
4176 // Be conservative with any unhandled generic opcodes.
4177 if (!isTargetSpecificOpcode(MI.getOpcode()))
4178 return true;
4179
4180 return !isSALU(MI) || MI.readsRegister(AMDGPU::EXEC, &RI);
4181}
4182
4183bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
4184 switch (Imm.getBitWidth()) {
4185 case 1: // This likely will be a condition code mask.
4186 return true;
4187
4188 case 32:
4189 return AMDGPU::isInlinableLiteral32(Imm.getSExtValue(),
4190 ST.hasInv2PiInlineImm());
4191 case 64:
4192 return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
4193 ST.hasInv2PiInlineImm());
4194 case 16:
4195 return ST.has16BitInsts() &&
4196 AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
4197 ST.hasInv2PiInlineImm());
4198 default:
4199 llvm_unreachable("invalid bitwidth");
4200 }
4201}
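// A standalone sketch (annotation, not part of the original file) of the
// 32-bit rule that AMDGPU::isInlinableLiteral32 is understood to implement:
// small integers in [-16, 64] and a fixed set of float bit patterns (+/-0.5,
// +/-1.0, +/-2.0, +/-4.0, plus 1/(2*pi) when the subtarget supports it) are
// free inline constants; anything else must be encoded as a literal dword.
#if 0
#include <bit>
#include <cstdint>

static bool isInlinableLiteral32Sketch(int32_t Literal, bool HasInv2Pi) {
  if (Literal >= -16 && Literal <= 64)
    return true;
  uint32_t Val = static_cast<uint32_t>(Literal);
  for (float F : {0.5f, -0.5f, 1.0f, -1.0f, 2.0f, -2.0f, 4.0f, -4.0f})
    if (Val == std::bit_cast<uint32_t>(F))
      return true;
  return HasInv2Pi && Val == 0x3e22f983u; // bit pattern of 1/(2*pi)
}
#endif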
4202
4203 bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
4204 APInt IntImm = Imm.bitcastToAPInt();
4205 int64_t IntImmVal = IntImm.getSExtValue();
4206 bool HasInv2Pi = ST.hasInv2PiInlineImm();
4207 switch (APFloat::SemanticsToEnum(Imm.getSemantics())) {
4208 default:
4209 llvm_unreachable("invalid fltSemantics");
4210 case APFloatBase::S_IEEEsingle:
4211 case APFloatBase::S_IEEEdouble:
4212 return isInlineConstant(IntImm);
4213 case APFloatBase::S_BFloat:
4214 return ST.has16BitInsts() &&
4215 AMDGPU::isInlinableLiteralBF16(IntImmVal, HasInv2Pi);
4216 case APFloatBase::S_IEEEhalf:
4217 return ST.has16BitInsts() &&
4218 AMDGPU::isInlinableLiteralFP16(IntImmVal, HasInv2Pi);
4219 }
4220}
4221
4222 bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
4223 uint8_t OperandType) const {
4224 assert(!MO.isReg() && "isInlineConstant called on register operand!");
4225 if (!MO.isImm())
4226 return false;
4227
4228 // MachineOperand provides no way to tell the true operand size, since it only
4229 // records a 64-bit value. We need to know the size to determine if a 32-bit
4230 // floating point immediate bit pattern is legal for an integer immediate. It
4231 // would be for any 32-bit integer operand, but would not be for a 64-bit one.
4232
4233 int64_t Imm = MO.getImm();
4234 switch (OperandType) {
4247 int32_t Trunc = static_cast<int32_t>(Imm);
4248 return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
4249 }
4255 return AMDGPU::isInlinableLiteral64(MO.getImm(),
4256 ST.hasInv2PiInlineImm());
4260 // We would expect inline immediates to not be concerned with an integer/fp
4261 // distinction. However, in the case of 16-bit integer operations, the
4262 // "floating point" values appear to not work. It seems read the low 16-bits
4263 // of 32-bit immediates, which happens to always work for the integer
4264 // values.
4265 //
4266 // See llvm bugzilla 46302.
4267 //
4268 // TODO: Theoretically we could use op-sel to use the high bits of the
4269 // 32-bit FP values.
4287 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4288 // A few special case instructions have 16-bit operands on subtargets
4289 // where 16-bit instructions are not legal.
4290 // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
4291 // constants in these cases
4292 int16_t Trunc = static_cast<int16_t>(Imm);
4293 return ST.has16BitInsts() &&
4294 AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
4295 }
4296
4297 return false;
4298 }
4303 if (isInt<16>(Imm) || isUInt<16>(Imm)) {
4304 int16_t Trunc = static_cast<int16_t>(Imm);
4305 return ST.has16BitInsts() &&
4306 AMDGPU::isInlinableLiteralBF16(Trunc, ST.hasInv2PiInlineImm());
4307 }
4308 return false;
4309 }
4312 return false;
4315 // Always embedded in the instruction for free.
4316 return true;
4326 // Just ignore anything else.
4327 return true;
4328 default:
4329 llvm_unreachable("invalid operand type");
4330 }
4331}
4332
4333static bool compareMachineOp(const MachineOperand &Op0,
4334 const MachineOperand &Op1) {
4335 if (Op0.getType() != Op1.getType())
4336 return false;
4337
4338 switch (Op0.getType()) {
4340 return Op0.getReg() == Op1.getReg();
4342 return Op0.getImm() == Op1.getImm();
4343 default:
4344 llvm_unreachable("Didn't expect to be comparing these operand types");
4345 }
4346}
4347
4348 bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
4349 const MachineOperand &MO) const {
4350 const MCInstrDesc &InstDesc = MI.getDesc();
4351 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
4352
4353 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
4354
4355 if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
4356 return true;
4357
4358 if (OpInfo.RegClass < 0)
4359 return false;
4360
4361 if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
4362 if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
4363 OpNo ==(unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
4364 AMDGPU::OpName::src2))
4365 return false;
4366 return RI.opCanUseInlineConstant(OpInfo.OperandType);
4367 }
4368
4369 if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
4370 return false;
4371
4372 if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
4373 return true;
4374
4375 return ST.hasVOP3Literal();
4376}
4377
4378bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
4379 // GFX90A does not have V_MUL_LEGACY_F32_e32.
4380 if (Opcode == AMDGPU::V_MUL_LEGACY_F32_e64 && ST.hasGFX90AInsts())
4381 return false;
4382
4383 int Op32 = AMDGPU::getVOPe32(Opcode);
4384 if (Op32 == -1)
4385 return false;
4386
4387 return pseudoToMCOpcode(Op32) != -1;
4388}
4389
4390bool SIInstrInfo::hasModifiers(unsigned Opcode) const {
4391 // The src0_modifier operand is present on all instructions
4392 // that have modifiers.
4393
4394 return AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers);
4395}
4396
4397 bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
4398 unsigned OpName) const {
4399 const MachineOperand *Mods = getNamedOperand(MI, OpName);
4400 return Mods && Mods->getImm();
4401}
4402
4403 bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
4404 return any_of(ModifierOpNames,
4405 [&](unsigned Name) { return hasModifiersSet(MI, Name); });
4406}
4407
4408 bool SIInstrInfo::canShrink(const MachineInstr &MI,
4409 const MachineRegisterInfo &MRI) const {
4410 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4411 // Can't shrink instruction with three operands.
4412 if (Src2) {
4413 switch (MI.getOpcode()) {
4414 default: return false;
4415
4416 case AMDGPU::V_ADDC_U32_e64:
4417 case AMDGPU::V_SUBB_U32_e64:
4418 case AMDGPU::V_SUBBREV_U32_e64: {
4419 const MachineOperand *Src1
4420 = getNamedOperand(MI, AMDGPU::OpName::src1);
4421 if (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()))
4422 return false;
4423 // Additional verification is needed for sdst/src2.
4424 return true;
4425 }
4426 case AMDGPU::V_MAC_F16_e64:
4427 case AMDGPU::V_MAC_F32_e64:
4428 case AMDGPU::V_MAC_LEGACY_F32_e64:
4429 case AMDGPU::V_FMAC_F16_e64:
4430 case AMDGPU::V_FMAC_F16_t16_e64:
4431 case AMDGPU::V_FMAC_F32_e64:
4432 case AMDGPU::V_FMAC_F64_e64:
4433 case AMDGPU::V_FMAC_LEGACY_F32_e64:
4434 if (!Src2->isReg() || !RI.isVGPR(MRI, Src2->getReg()) ||
4435 hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers))
4436 return false;
4437 break;
4438
4439 case AMDGPU::V_CNDMASK_B32_e64:
4440 break;
4441 }
4442 }
4443
4444 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4445 if (Src1 && (!Src1->isReg() || !RI.isVGPR(MRI, Src1->getReg()) ||
4446 hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers)))
4447 return false;
4448
4449 // We don't need to check src0, all input types are legal, so just make sure
4450 // src0 isn't using any modifiers.
4451 if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers))
4452 return false;
4453
4454 // Can it be shrunk to a valid 32 bit opcode?
4455 if (!hasVALU32BitEncoding(MI.getOpcode()))
4456 return false;
4457
4458 // Check output modifiers
4459 return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
4460 !hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
4461 !hasModifiersSet(MI, AMDGPU::OpName::byte_sel);
4462}
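// Annotation (not part of the original file): for example (rough assembly),
//   v_add_f32_e64 v0, v1, v2          ; shrinkable to v_add_f32_e32
//   v_add_f32_e64 v0, -v1, v2         ; not shrinkable: e32 has no modifiers
//   v_add_f32_e64 v0, v1, s2          ; not shrinkable: e32 needs a VGPR src1
//   v_add_f32_e64 v0, v1, v2 clamp    ; not shrinkable: e32 has no clamp/omod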
4463
4464// Set VCC operand with all flags from \p Orig, except for setting it as
4465// implicit.
4466 static void copyFlagsToImplicitVCC(MachineInstr &MI,
4467 const MachineOperand &Orig) {
4468
4469 for (MachineOperand &Use : MI.implicit_operands()) {
4470 if (Use.isUse() &&
4471 (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
4472 Use.setIsUndef(Orig.isUndef());
4473 Use.setIsKill(Orig.isKill());
4474 return;
4475 }
4476 }
4477}
4478
4479 MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
4480 unsigned Op32) const {
4481 MachineBasicBlock *MBB = MI.getParent();
4482 MachineInstrBuilder Inst32 =
4483 BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
4484 .setMIFlags(MI.getFlags());
4485
4486 // Add the dst operand if the 32-bit encoding also has an explicit $vdst.
4487 // For VOPC instructions, this is replaced by an implicit def of vcc.
4488 if (AMDGPU::hasNamedOperand(Op32, AMDGPU::OpName::vdst)) {
4489 // dst
4490 Inst32.add(MI.getOperand(0));
4491 } else if (AMDGPU::hasNamedOperand(Op32, AMDGPU::OpName::sdst)) {
4492 // VOPCX instructions won't be writing to an explicit dst, so this should
4493 // not fail for these instructions.
4494 assert(((MI.getOperand(0).getReg() == AMDGPU::VCC) ||
4495 (MI.getOperand(0).getReg() == AMDGPU::VCC_LO)) &&
4496 "Unexpected case");
4497 }
4498
4499 Inst32.add(*getNamedOperand(MI, AMDGPU::OpName::src0));
4500
4501 const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
4502 if (Src1)
4503 Inst32.add(*Src1);
4504
4505 const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
4506
4507 if (Src2) {
4508 int Op32Src2Idx = AMDGPU::getNamedOperandIdx(Op32, AMDGPU::OpName::src2);
4509 if (Op32Src2Idx != -1) {
4510 Inst32.add(*Src2);
4511 } else {
4512 // In the case of V_CNDMASK_B32_e32, the explicit operand src2 is
4513 // replaced with an implicit read of vcc or vcc_lo. The implicit read
4514 // of vcc was already added during the initial BuildMI, but we
4515 // 1) may need to change vcc to vcc_lo to preserve the original register
4516 // 2) have to preserve the original flags.
4517 fixImplicitOperands(*Inst32);
4518 copyFlagsToImplicitVCC(*Inst32, *Src2);
4519 }
4520 }
4521
4522 return Inst32;
4523}
4524
4525 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
4526 const MachineOperand &MO,
4527 const MCOperandInfo &OpInfo) const {
4528 // Literal constants use the constant bus.
4529 if (!MO.isReg())
4530 return !isInlineConstant(MO, OpInfo);
4531
4532 if (!MO.isUse())
4533 return false;
4534
4535 if (MO.getReg().isVirtual())
4536 return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
4537
4538 // Null is free
4539 if (MO.getReg() == AMDGPU::SGPR_NULL || MO.getReg() == AMDGPU::SGPR_NULL64)
4540 return false;
4541
4542 // SGPRs use the constant bus
4543 if (MO.isImplicit()) {
4544 return MO.getReg() == AMDGPU::M0 ||
4545 MO.getReg() == AMDGPU::VCC ||
4546 MO.getReg() == AMDGPU::VCC_LO;
4547 } else {
4548 return AMDGPU::SReg_32RegClass.contains(MO.getReg()) ||
4549 AMDGPU::SReg_64RegClass.contains(MO.getReg());
4550 }
4551}
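// A standalone sketch (annotation, not part of the original file) of how the
// per-operand result of usesConstantBus() is typically accumulated for a VALU
// instruction: each distinct SGPR costs one slot, all literal operands share
// one slot, and the total must not exceed the subtarget limit (commonly 1
// before GFX10 and 2 on GFX10+).
#if 0
#include <set>

enum class SrcKind { VGPR, SGPR, InlineConst, Literal };

struct Src {
  SrcKind Kind;
  unsigned Reg; // meaningful only for SGPR sources
};

static bool fitsConstantBusSketch(const Src *Srcs, unsigned NumSrcs,
                                  unsigned BusLimit) {
  std::set<unsigned> SGPRs; // distinct SGPRs each take one slot
  bool HasLiteral = false;  // at most one literal slot in total
  for (unsigned I = 0; I != NumSrcs; ++I) {
    if (Srcs[I].Kind == SrcKind::SGPR)
      SGPRs.insert(Srcs[I].Reg);
    else if (Srcs[I].Kind == SrcKind::Literal)
      HasLiteral = true;
  }
  return SGPRs.size() + (HasLiteral ? 1 : 0) <= BusLimit;
}
#endif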
4552
4553 static Register findImplicitSGPRRead(const MachineInstr &MI) {
4554 for (const MachineOperand &MO : MI.implicit_operands()) {
4555 // We only care about reads.
4556 if (MO.isDef())
4557 continue;
4558
4559 switch (MO.getReg()) {
4560 case AMDGPU::VCC:
4561 case AMDGPU::VCC_LO:
4562 case AMDGPU::VCC_HI:
4563 case AMDGPU::M0:
4564 case AMDGPU::FLAT_SCR:
4565 return MO.getReg();
4566
4567 default:
4568 break;
4569 }
4570 }
4571
4572 return Register();
4573}
4574
4575static bool shouldReadExec(const MachineInstr &MI) {
4576 if (SIInstrInfo::isVALU(MI)) {
4577 switch (MI.getOpcode()) {
4578 case AMDGPU::V_READLANE_B32:
4579 case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
4580 case AMDGPU::V_WRITELANE_B32:
4581 case AMDGPU::SI_SPILL_S32_TO_VGPR:
4582 return false;
4583 }
4584
4585 return true;
4586 }
4587
4588 if (MI.isPreISelOpcode() ||
4589 SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
4590 SIInstrInfo::isSALU(MI) ||
4591 SIInstrInfo::isSMRD(MI))
4592 return false;
4593
4594 return true;
4595}
4596
4597static bool isSubRegOf(const SIRegisterInfo &TRI,
4598 const MachineOperand &SuperVec,
4599 const MachineOperand &SubReg) {
4600 if (SubReg.getReg().isPhysical())
4601 return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
4602
4603 return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
4604 SubReg.getReg() == SuperVec.getReg();
4605}
4606
4607 bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
4608 StringRef &ErrInfo) const {
4609 uint16_t Opcode = MI.getOpcode();
4610 if (SIInstrInfo::isGenericOpcode(MI.getOpcode()))
4611 return true;
4612
4613 const MachineFunction *MF = MI.getParent()->getParent();
4614 const MachineRegisterInfo &MRI = MF->getRegInfo();
4615
4616 int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
4617 int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
4618 int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
4619 int Src3Idx = -1;
4620 if (Src0Idx == -1) {
4621 // VOPD V_DUAL_* instructions use different operand names.
4622 Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
4623 Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
4624 Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
4625 Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
4626 }
4627
4628 // Make sure the number of operands is correct.
4629 const MCInstrDesc &Desc = get(Opcode);
4630 if (!Desc.isVariadic() &&
4631 Desc.getNumOperands() != MI.getNumExplicitOperands()) {
4632 ErrInfo = "Instruction has wrong number of operands.";
4633 return false;
4634 }
4635
4636 if (MI.isInlineAsm()) {
4637 // Verify register classes for inlineasm constraints.
4638 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = MI.getNumOperands();
4639 I != E; ++I) {
4640 const TargetRegisterClass *RC = MI.getRegClassConstraint(I, this, &RI);
4641 if (!RC)
4642 continue;
4643
4644 const MachineOperand &Op = MI.getOperand(I);
4645 if (!Op.isReg())
4646 continue;
4647
4648 Register Reg = Op.getReg();
4649 if (!Reg.isVirtual() && !RC->contains(Reg)) {
4650 ErrInfo = "inlineasm operand has incorrect register class.";
4651 return false;
4652 }
4653 }
4654
4655 return true;
4656 }
4657
4658 if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
4659 ErrInfo = "missing memory operand from image instruction.";
4660 return false;
4661 }
4662
4663 // Make sure the register classes are correct.
4664 for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
4665 const MachineOperand &MO = MI.getOperand(i);
4666 if (MO.isFPImm()) {
4667 ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
4668 "all fp values to integers.";
4669 return false;
4670 }
4671
4672 int RegClass = Desc.operands()[i].RegClass;
4673
4674 switch (Desc.operands()[i].OperandType) {
4676 if (MI.getOperand(i).isImm() || MI.getOperand(i).isGlobal()) {
4677 ErrInfo = "Illegal immediate value for operand.";
4678 return false;
4679 }
4680 break;
4685 break;
4697 if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
4698 ErrInfo = "Illegal immediate value for operand.";
4699 return false;
4700 }
4701 break;
4702 }
4704 if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
4705 ErrInfo = "Expected inline constant for operand.";
4706 return false;
4707 }
4708 break;
4711 // Check if this operand is an immediate.
4712 // FrameIndex operands will be replaced by immediates, so they are
4713 // allowed.
4714 if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
4715 ErrInfo = "Expected immediate, but got non-immediate";
4716 return false;
4717 }
4718 [[fallthrough]];
4719 default:
4720 continue;
4721 }
4722
4723 if (!MO.isReg())
4724 continue;
4725 Register Reg = MO.getReg();
4726 if (!Reg)
4727 continue;
4728
4729 // FIXME: Ideally we would have separate instruction definitions with the
4730 // aligned register constraint.
4731 // FIXME: We do not verify inline asm operands, but custom inline asm
4732 // verification is broken anyway
4733 if (ST.needsAlignedVGPRs()) {
4734 const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
4735 if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
4736 const TargetRegisterClass *SubRC =
4737 RI.getSubRegisterClass(RC, MO.getSubReg());
4738 RC = RI.getCompatibleSubRegClass(RC, SubRC, MO.getSubReg());
4739 if (RC)
4740 RC = SubRC;
4741 }
4742
4743 // Check that this is the aligned version of the class.
4744 if (!RC || !RI.isProperlyAlignedRC(*RC)) {
4745 ErrInfo = "Subtarget requires even aligned vector registers";
4746 return false;
4747 }
4748 }
4749
4750 if (RegClass != -1) {
4751 if (Reg.isVirtual())
4752 continue;
4753
4754 const TargetRegisterClass *RC = RI.getRegClass(RegClass);
4755 if (!RC->contains(Reg)) {
4756 ErrInfo = "Operand has incorrect register class.";
4757 return false;
4758 }
4759 }
4760 }
4761
4762 // Verify SDWA
4763 if (isSDWA(MI)) {
4764 if (!ST.hasSDWA()) {
4765 ErrInfo = "SDWA is not supported on this target";
4766 return false;
4767 }
4768
4769 int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
4770
4771 for (int OpIdx : {DstIdx, Src0Idx, Src1Idx, Src2Idx}) {
4772 if (OpIdx == -1)
4773 continue;
4774 const MachineOperand &MO = MI.getOperand(OpIdx);
4775
4776 if (!ST.hasSDWAScalar()) {
4777 // Only VGPRS on VI
4778 if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
4779 ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
4780 return false;
4781 }
4782 } else {
4783 // No immediates on GFX9
4784 if (!MO.isReg()) {
4785 ErrInfo =
4786 "Only reg allowed as operands in SDWA instructions on GFX9+";
4787 return false;
4788 }
4789 }
4790 }
4791
4792 if (!ST.hasSDWAOmod()) {
4793 // No omod allowed on VI
4794 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4795 if (OMod != nullptr &&
4796 (!OMod->isImm() || OMod->getImm() != 0)) {
4797 ErrInfo = "OMod not allowed in SDWA instructions on VI";
4798 return false;
4799 }
4800 }
4801
4802 if (Opcode == AMDGPU::V_CVT_F32_FP8_sdwa ||
4803 Opcode == AMDGPU::V_CVT_F32_BF8_sdwa ||
4804 Opcode == AMDGPU::V_CVT_PK_F32_FP8_sdwa ||
4805 Opcode == AMDGPU::V_CVT_PK_F32_BF8_sdwa) {
4806 const MachineOperand *Src0ModsMO =
4807 getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
4808 unsigned Mods = Src0ModsMO->getImm();
4809 if (Mods & SISrcMods::ABS || Mods & SISrcMods::NEG ||
4810 Mods & SISrcMods::SEXT) {
4811 ErrInfo = "sext, abs and neg are not allowed on this instruction";
4812 return false;
4813 }
4814 }
4815
4816 uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
4817 if (isVOPC(BasicOpcode)) {
4818 if (!ST.hasSDWASdst() && DstIdx != -1) {
4819 // Only vcc allowed as dst on VI for VOPC
4820 const MachineOperand &Dst = MI.getOperand(DstIdx);
4821 if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
4822 ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
4823 return false;
4824 }
4825 } else if (!ST.hasSDWAOutModsVOPC()) {
4826 // No clamp allowed on GFX9 for VOPC
4827 const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
4828 if (Clamp && (!Clamp->isImm() || Clamp->getImm() != 0)) {
4829 ErrInfo = "Clamp not allowed in VOPC SDWA instructions on VI";
4830 return false;
4831 }
4832
4833 // No omod allowed on GFX9 for VOPC
4834 const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
4835 if (OMod && (!OMod->isImm() || OMod->getImm() != 0)) {
4836 ErrInfo = "OMod not allowed in VOPC SDWA instructions on VI";
4837 return false;
4838 }
4839 }
4840 }
4841
4842 const MachineOperand *DstUnused = getNamedOperand(MI, AMDGPU::OpName::dst_unused);
4843 if (DstUnused && DstUnused->isImm() &&
4844 DstUnused->getImm() == AMDGPU::SDWA::UNUSED_PRESERVE) {
4845 const MachineOperand &Dst = MI.getOperand(DstIdx);
4846 if (!Dst.isReg() || !Dst.isTied()) {
4847 ErrInfo = "Dst register should have tied register";
4848 return false;
4849 }
4850
4851 const MachineOperand &TiedMO =
4852 MI.getOperand(MI.findTiedOperandIdx(DstIdx));
4853 if (!TiedMO.isReg() || !TiedMO.isImplicit() || !TiedMO.isUse()) {
4854 ErrInfo =
4855 "Dst register should be tied to implicit use of preserved register";
4856 return false;
4857 } else if (TiedMO.getReg().isPhysical() &&
4858 Dst.getReg() != TiedMO.getReg()) {
4859 ErrInfo = "Dst register should use same physical register as preserved";
4860 return false;
4861 }
4862 }
4863 }
4864
4865 // Verify MIMG / VIMAGE / VSAMPLE
4866 if (isImage(MI.getOpcode()) && !MI.mayStore()) {
4867 // Ensure that the return type used is large enough for all the options
4868 // being used; TFE/LWE require an extra result register.
4869 const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
4870 if (DMask) {
4871 uint64_t DMaskImm = DMask->getImm();
4872 uint32_t RegCount =
4873 isGather4(MI.getOpcode()) ? 4 : llvm::popcount(DMaskImm);
4874 const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
4875 const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
4876 const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
4877
4878 // Adjust for packed 16 bit values
4879 if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
4880 RegCount = divideCeil(RegCount, 2);
4881
4882 // Adjust if using LWE or TFE
4883 if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
4884 RegCount += 1;
4885
4886 const uint32_t DstIdx =
4887 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
4888 const MachineOperand &Dst = MI.getOperand(DstIdx);
4889 if (Dst.isReg()) {
4890 const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
4891 uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
4892 if (RegCount > DstSize) {
4893 ErrInfo = "Image instruction returns too many registers for dst "
4894 "register class";
4895 return false;
4896 }
4897 }
4898 }
4899 }
4900
4901 // Verify VOP*. Ignore multiple sgpr operands on writelane.
4902 if (isVALU(MI) && Desc.getOpcode() != AMDGPU::V_WRITELANE_B32) {
4903 unsigned ConstantBusCount = 0;
4904 bool UsesLiteral = false;
4905 const MachineOperand *LiteralVal = nullptr;
4906
4907 int ImmIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm);
4908 if (ImmIdx != -1) {
4909 ++ConstantBusCount;
4910 UsesLiteral = true;
4911 LiteralVal = &MI.getOperand(ImmIdx);
4912 }
4913
4914 SmallVector<Register, 2> SGPRsUsed;
4915 Register SGPRUsed;
4916
4917 // Only look at the true operands. Only a real operand can use the constant
4918 // bus, and we don't want to check pseudo-operands like the source modifier
4919 // flags.
4920 for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
4921 if (OpIdx == -1)
4922 continue;
4923 const MachineOperand &MO = MI.getOperand(OpIdx);
4924 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4925 if (MO.isReg()) {
4926 SGPRUsed = MO.getReg();
4927 if (!llvm::is_contained(SGPRsUsed, SGPRUsed)) {
4928 ++ConstantBusCount;
4929 SGPRsUsed.push_back(SGPRUsed);
4930 }
4931 } else {
4932 if (!UsesLiteral) {
4933 ++ConstantBusCount;
4934 UsesLiteral = true;
4935 LiteralVal = &MO;
4936 } else if (!MO.isIdenticalTo(*LiteralVal)) {
4937 assert(isVOP2(MI) || isVOP3(MI));
4938 ErrInfo = "VOP2/VOP3 instruction uses more than one literal";
4939 return false;
4940 }
4941 }
4942 }
4943 }
4944
4945 SGPRUsed = findImplicitSGPRRead(MI);
4946 if (SGPRUsed) {
4947 // Implicit uses may safely overlap true operands
4948 if (llvm::all_of(SGPRsUsed, [this, SGPRUsed](unsigned SGPR) {
4949 return !RI.regsOverlap(SGPRUsed, SGPR);
4950 })) {
4951 ++ConstantBusCount;
4952 SGPRsUsed.push_back(SGPRUsed);
4953 }
4954 }
4955
4956 // v_writelane_b32 is an exception from constant bus restriction:
4957 // vsrc0 can be sgpr, const or m0 and lane select sgpr, m0 or inline-const
4958 if (ConstantBusCount > ST.getConstantBusLimit(Opcode) &&
4959 Opcode != AMDGPU::V_WRITELANE_B32) {
4960 ErrInfo = "VOP* instruction violates constant bus restriction";
4961 return false;
4962 }
4963
4964 if (isVOP3(MI) && UsesLiteral && !ST.hasVOP3Literal()) {
4965 ErrInfo = "VOP3 instruction uses literal";
4966 return false;
4967 }
4968 }
4969
4970 // Special case for writelane - this can break the multiple constant bus rule,
4971 // but still can't use more than one SGPR register
4972 if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
4973 unsigned SGPRCount = 0;
4974 Register SGPRUsed;
4975
4976 for (int OpIdx : {Src0Idx, Src1Idx}) {
4977 if (OpIdx == -1)
4978 break;
4979
4980 const MachineOperand &MO = MI.getOperand(OpIdx);
4981
4982 if (usesConstantBus(MRI, MO, MI.getDesc().operands()[OpIdx])) {
4983 if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
4984 if (MO.getReg() != SGPRUsed)
4985 ++SGPRCount;
4986 SGPRUsed = MO.getReg();
4987 }
4988 }
4989 if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
4990 ErrInfo = "WRITELANE instruction violates constant bus restriction";
4991 return false;
4992 }
4993 }
4994 }
4995
4996 // Verify misc. restrictions on specific instructions.
4997 if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32_e64 ||
4998 Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64_e64) {
4999 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5000 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5001 const MachineOperand &Src2 = MI.getOperand(Src2Idx);
5002 if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
5003 if (!compareMachineOp(Src0, Src1) &&
5004 !compareMachineOp(Src0, Src2)) {
5005 ErrInfo = "v_div_scale_{f32|f64} require src0 = src1 or src2";
5006 return false;
5007 }
5008 }
5009 if ((getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
5010 SISrcMods::ABS) ||
5011 (getNamedOperand(MI, AMDGPU::OpName::src1_modifiers)->getImm() &
5012 SISrcMods::ABS) ||
5013 (getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
5014 SISrcMods::ABS)) {
5015 ErrInfo = "ABS not allowed in VOP3B instructions";
5016 return false;
5017 }
5018 }
5019
5020 if (isSOP2(MI) || isSOPC(MI)) {
5021 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5022 const MachineOperand &Src1 = MI.getOperand(Src1Idx);
5023
5024 if (!Src0.isReg() && !Src1.isReg() &&
5025 !isInlineConstant(Src0, Desc.operands()[Src0Idx]) &&
5026 !isInlineConstant(Src1, Desc.operands()[Src1Idx]) &&
5027 !Src0.isIdenticalTo(Src1)) {
5028 ErrInfo = "SOP2/SOPC instruction requires too many immediate constants";
5029 return false;
5030 }
5031 }
5032
5033 if (isSOPK(MI)) {
5034 auto Op = getNamedOperand(MI, AMDGPU::OpName::simm16);
5035 if (Desc.isBranch()) {
5036 if (!Op->isMBB()) {
5037 ErrInfo = "invalid branch target for SOPK instruction";
5038 return false;
5039 }
5040 } else {
5041 uint64_t Imm = Op->getImm();
5042 if (sopkIsZext(Opcode)) {
5043 if (!isUInt<16>(Imm)) {
5044 ErrInfo = "invalid immediate for SOPK instruction";
5045 return false;
5046 }
5047 } else {
5048 if (!isInt<16>(Imm)) {
5049 ErrInfo = "invalid immediate for SOPK instruction";
5050 return false;
5051 }
5052 }
5053 }
5054 }
5055
5056 if (Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e32 ||
5057 Desc.getOpcode() == AMDGPU::V_MOVRELS_B32_e64 ||
5058 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5059 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64) {
5060 const bool IsDst = Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e32 ||
5061 Desc.getOpcode() == AMDGPU::V_MOVRELD_B32_e64;
5062
5063 const unsigned StaticNumOps =
5064 Desc.getNumOperands() + Desc.implicit_uses().size();
5065 const unsigned NumImplicitOps = IsDst ? 2 : 1;
5066
5067 // Allow additional implicit operands. This allows a fixup done by the post
5068 // RA scheduler where the main implicit operand is killed and implicit-defs
5069 // are added for sub-registers that remain live after this instruction.
5070 if (MI.getNumOperands() < StaticNumOps + NumImplicitOps) {
5071 ErrInfo = "missing implicit register operands";
5072 return false;
5073 }
5074
5075 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5076 if (IsDst) {
5077 if (!Dst->isUse()) {
5078 ErrInfo = "v_movreld_b32 vdst should be a use operand";
5079 return false;
5080 }
5081
5082 unsigned UseOpIdx;
5083 if (!MI.isRegTiedToUseOperand(StaticNumOps, &UseOpIdx) ||
5084 UseOpIdx != StaticNumOps + 1) {
5085 ErrInfo = "movrel implicit operands should be tied";
5086 return false;
5087 }
5088 }
5089
5090 const MachineOperand &Src0 = MI.getOperand(Src0Idx);
5091 const MachineOperand &ImpUse
5092 = MI.getOperand(StaticNumOps + NumImplicitOps - 1);
5093 if (!ImpUse.isReg() || !ImpUse.isUse() ||
5094 !isSubRegOf(RI, ImpUse, IsDst ? *Dst : Src0)) {
5095 ErrInfo = "src0 should be subreg of implicit vector use";
5096 return false;
5097 }
5098 }
5099
5100 // Make sure we aren't losing exec uses in the td files. This mostly requires
5101 // being careful when using let Uses to try to add other use registers.
5102 if (shouldReadExec(MI)) {
5103 if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
5104 ErrInfo = "VALU instruction does not implicitly read exec mask";
5105 return false;
5106 }
5107 }
5108
5109 if (isSMRD(MI)) {
5110 if (MI.mayStore() &&
5111 ST.getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS) {
5112 // The register offset form of scalar stores may only use m0 as the
5113 // soffset register.
5114 const MachineOperand *Soff = getNamedOperand(MI, AMDGPU::OpName::soffset);
5115 if (Soff && Soff->getReg() != AMDGPU::M0) {
5116 ErrInfo = "scalar stores must use m0 as offset register";
5117 return false;
5118 }
5119 }
5120 }
5121
5122 if (isFLAT(MI) && !ST.hasFlatInstOffsets()) {
5123 const MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
5124 if (Offset->getImm() != 0) {
5125 ErrInfo = "subtarget does not support offsets in flat instructions";
5126 return false;
5127 }
5128 }
5129
5130 if (isDS(MI) && !ST.hasGDS()) {
5131 const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
5132 if (GDSOp && GDSOp->getImm() != 0) {
5133 ErrInfo = "GDS is not supported on this subtarget";
5134 return false;
5135 }
5136 }
5137
5138 if (isImage(MI)) {
5139 const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
5140 if (DimOp) {
5141 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
5142 AMDGPU::OpName::vaddr0);
5143 int RSrcOpName =
5144 isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
5145 int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
5146 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
5147 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
5148 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
5149 const AMDGPU::MIMGDimInfo *Dim =
5150 AMDGPU::getMIMGDimInfoByEncoding(DimOp->getImm());
5151
5152 if (!Dim) {
5153 ErrInfo = "dim is out of range";
5154 return false;
5155 }
5156
5157 bool IsA16 = false;
5158 if (ST.hasR128A16()) {
5159 const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
5160 IsA16 = R128A16->getImm() != 0;
5161 } else if (ST.hasA16()) {
5162 const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
5163 IsA16 = A16->getImm() != 0;
5164 }
5165
5166 bool IsNSA = RsrcIdx - VAddr0Idx > 1;
5167
5168 unsigned AddrWords =
5169 AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
5170
5171 unsigned VAddrWords;
5172 if (IsNSA) {
5173 VAddrWords = RsrcIdx - VAddr0Idx;
5174 if (ST.hasPartialNSAEncoding() &&
5175 AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
5176 unsigned LastVAddrIdx = RsrcIdx - 1;
5177 VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
5178 }
5179 } else {
5180 VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
5181 if (AddrWords > 12)
5182 AddrWords = 16;
5183 }
5184
5185 if (VAddrWords != AddrWords) {
5186 LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
5187 << " but got " << VAddrWords << "\n");
5188 ErrInfo = "bad vaddr size";
5189 return false;
5190 }
5191 }
5192 }
5193
5194 const MachineOperand *DppCt = getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl);
5195 if (DppCt) {
5196 using namespace AMDGPU::DPP;
5197
5198 unsigned DC = DppCt->getImm();
5199 if (DC == DppCtrl::DPP_UNUSED1 || DC == DppCtrl::DPP_UNUSED2 ||
5200 DC == DppCtrl::DPP_UNUSED3 || DC > DppCtrl::DPP_LAST ||
5201 (DC >= DppCtrl::DPP_UNUSED4_FIRST && DC <= DppCtrl::DPP_UNUSED4_LAST) ||
5202 (DC >= DppCtrl::DPP_UNUSED5_FIRST && DC <= DppCtrl::DPP_UNUSED5_LAST) ||
5203 (DC >= DppCtrl::DPP_UNUSED6_FIRST && DC <= DppCtrl::DPP_UNUSED6_LAST) ||
5204 (DC >= DppCtrl::DPP_UNUSED7_FIRST && DC <= DppCtrl::DPP_UNUSED7_LAST) ||
5205 (DC >= DppCtrl::DPP_UNUSED8_FIRST && DC <= DppCtrl::DPP_UNUSED8_LAST)) {
5206 ErrInfo = "Invalid dpp_ctrl value";
5207 return false;
5208 }
5209 if (DC >= DppCtrl::WAVE_SHL1 && DC <= DppCtrl::WAVE_ROR1 &&
5211 ErrInfo = "Invalid dpp_ctrl value: "
5212 "wavefront shifts are not supported on GFX10+";
5213 return false;
5214 }
5215 if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
5217 ErrInfo = "Invalid dpp_ctrl value: "
5218 "broadcasts are not supported on GFX10+";
5219 return false;
5220 }
5221 if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
5222 ST.getGeneration() < AMDGPUSubtarget::GFX10) {
5223 if (DC >= DppCtrl::ROW_NEWBCAST_FIRST &&
5224 DC <= DppCtrl::ROW_NEWBCAST_LAST &&
5225 !ST.hasGFX90AInsts()) {
5226 ErrInfo = "Invalid dpp_ctrl value: "
5227 "row_newbroadcast/row_share is not supported before "
5228 "GFX90A/GFX10";
5229 return false;
5230 } else if (DC > DppCtrl::ROW_NEWBCAST_LAST || !ST.hasGFX90AInsts()) {
5231 ErrInfo = "Invalid dpp_ctrl value: "
5232 "row_share and row_xmask are not supported before GFX10";
5233 return false;
5234 }
5235 }
5236
5237 if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
5239 ErrInfo = "Invalid dpp_ctrl value: "
5240 "DP ALU dpp only support row_newbcast";
5241 return false;
5242 }
5243 }
5244
5245 if ((MI.mayStore() || MI.mayLoad()) && !isVGPRSpill(MI)) {
5246 const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
5247 uint16_t DataNameIdx = isDS(Opcode) ? AMDGPU::OpName::data0
5248 : AMDGPU::OpName::vdata;
5249 const MachineOperand *Data = getNamedOperand(MI, DataNameIdx);
5250 const MachineOperand *Data2 = getNamedOperand(MI, AMDGPU::OpName::data1);
5251 if (Data && !Data->isReg())
5252 Data = nullptr;
5253
5254 if (ST.hasGFX90AInsts()) {
5255 if (Dst && Data &&
5256 (RI.isAGPR(MRI, Dst->getReg()) != RI.isAGPR(MRI, Data->getReg()))) {
5257 ErrInfo = "Invalid register class: "
5258 "vdata and vdst should be both VGPR or AGPR";
5259 return false;
5260 }
5261 if (Data && Data2 &&
5262 (RI.isAGPR(MRI, Data->getReg()) != RI.isAGPR(MRI, Data2->getReg()))) {
5263 ErrInfo = "Invalid register class: "
5264 "both data operands should be VGPR or AGPR";
5265 return false;
5266 }
5267 } else {
5268 if ((Dst && RI.isAGPR(MRI, Dst->getReg())) ||
5269 (Data && RI.isAGPR(MRI, Data->getReg())) ||
5270 (Data2 && RI.isAGPR(MRI, Data2->getReg()))) {
5271 ErrInfo = "Invalid register class: "
5272 "agpr loads and stores not supported on this GPU";
5273 return false;
5274 }
5275 }
5276 }
5277
5278 if (ST.needsAlignedVGPRs()) {
5279 const auto isAlignedReg = [&MI, &MRI, this](unsigned OpName) -> bool {
5280 const MachineOperand *Op = getNamedOperand(MI, OpName);
5281 if (!Op)
5282 return true;
5283 Register Reg = Op->getReg();
5284 if (Reg.isPhysical())
5285 return !(RI.getHWRegIndex(Reg) & 1);
5286 const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
5287 return RI.getRegSizeInBits(RC) > 32 && RI.isProperlyAlignedRC(RC) &&
5288 !(RI.getChannelFromSubReg(Op->getSubReg()) & 1);
5289 };
5290
5291 if (MI.getOpcode() == AMDGPU::DS_GWS_INIT ||
5292 MI.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
5293 MI.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
5294
5295 if (!isAlignedReg(AMDGPU::OpName::data0)) {
5296 ErrInfo = "Subtarget requires even aligned vector registers "
5297 "for DS_GWS instructions";
5298 return false;
5299 }
5300 }
5301
5302 if (isMIMG(MI)) {
5303 if (!isAlignedReg(AMDGPU::OpName::vaddr)) {
5304 ErrInfo = "Subtarget requires even aligned vector registers "
5305 "for vaddr operand of image instructions";
5306 return false;
5307 }
5308 }
5309 }
5310
5311 if (MI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
5312 !ST.hasGFX90AInsts()) {
5313 const MachineOperand *Src = getNamedOperand(MI, AMDGPU::OpName::src0);
5314 if (Src->isReg() && RI.isSGPRReg(MRI, Src->getReg())) {
5315 ErrInfo = "Invalid register class: "
5316 "v_accvgpr_write with an SGPR is not supported on this GPU";
5317 return false;
5318 }
5319 }
5320
5321 if (Desc.getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS) {
5322 const MachineOperand &SrcOp = MI.getOperand(1);
5323 if (!SrcOp.isReg() || SrcOp.getReg().isVirtual()) {
5324 ErrInfo = "pseudo expects only physical SGPRs";
5325 return false;
5326 }
5327 }
5328
5329 return true;
5330}
5331
5332// It is more readable to list mapped opcodes on the same line.
5333// clang-format off
5334
5335 unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
5336 switch (MI.getOpcode()) {
5337 default: return AMDGPU::INSTRUCTION_LIST_END;
5338 case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
5339 case AMDGPU::COPY: return AMDGPU::COPY;
5340 case AMDGPU::PHI: return AMDGPU::PHI;
5341 case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
5342 case AMDGPU::WQM: return AMDGPU::WQM;
5343 case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
5344 case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
5345 case AMDGPU::STRICT_WQM: return AMDGPU::STRICT_WQM;
5346 case AMDGPU::S_MOV_B32: {
5347 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5348 return MI.getOperand(1).isReg() ||
5349 RI.isAGPR(MRI, MI.getOperand(0).getReg()) ?
5350 AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
5351 }
5352 case AMDGPU::S_ADD_I32:
5353 return ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
5354 case AMDGPU::S_ADDC_U32:
5355 return AMDGPU::V_ADDC_U32_e32;
5356 case AMDGPU::S_SUB_I32:
5357 return ST.hasAddNoCarry() ? AMDGPU::V_SUB_U32_e64 : AMDGPU::V_SUB_CO_U32_e32;
5358 // FIXME: These are not consistently handled, and selected when the carry is
5359 // used.
5360 case AMDGPU::S_ADD_U32:
5361 return AMDGPU::V_ADD_CO_U32_e32;
5362 case AMDGPU::S_SUB_U32:
5363 return AMDGPU::V_SUB_CO_U32_e32;
5364 case AMDGPU::S_SUBB_U32: return AMDGPU::V_SUBB_U32_e32;
5365 case AMDGPU::S_MUL_I32: return AMDGPU::V_MUL_LO_U32_e64;
5366 case AMDGPU::S_MUL_HI_U32: return AMDGPU::V_MUL_HI_U32_e64;
5367 case AMDGPU::S_MUL_HI_I32: return AMDGPU::V_MUL_HI_I32_e64;
5368 case AMDGPU::S_AND_B32: return AMDGPU::V_AND_B32_e64;
5369 case AMDGPU::S_OR_B32: return AMDGPU::V_OR_B32_e64;
5370 case AMDGPU::S_XOR_B32: return AMDGPU::V_XOR_B32_e64;
5371 case AMDGPU::S_XNOR_B32:
5372 return ST.hasDLInsts() ? AMDGPU::V_XNOR_B32_e64 : AMDGPU::INSTRUCTION_LIST_END;
5373 case AMDGPU::S_MIN_I32: return AMDGPU::V_MIN_I32_e64;
5374 case AMDGPU::S_MIN_U32: return AMDGPU::V_MIN_U32_e64;
5375 case AMDGPU::S_MAX_I32: return AMDGPU::V_MAX_I32_e64;
5376 case AMDGPU::S_MAX_U32: return AMDGPU::V_MAX_U32_e64;
5377 case AMDGPU::S_ASHR_I32: return AMDGPU::V_ASHR_I32_e32;
5378 case AMDGPU::S_ASHR_I64: return AMDGPU::V_ASHR_I64_e64;
5379 case AMDGPU::S_LSHL_B32: return AMDGPU::V_LSHL_B32_e32;
5380 case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64_e64;
5381 case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
5382 case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64_e64;
5383 case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32_e64;
5384 case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32_e64;
5385 case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32_e64;
5386 case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32_e64;
5387 case AMDGPU::S_BFM_B32: return AMDGPU::V_BFM_B32_e64;
5388 case AMDGPU::S_BREV_B32: return AMDGPU::V_BFREV_B32_e32;
5389 case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
5390 case AMDGPU::S_NOT_B64: return AMDGPU::V_NOT_B32_e32;
5391 case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e64;
5392 case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e64;
5393 case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e64;
5394 case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e64;
5395 case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e64;
5396 case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e64;
5397 case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e64;
5398 case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e64;
5399 case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e64;
5400 case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e64;
5401 case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e64;
5402 case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e64;
5403 case AMDGPU::S_CMP_EQ_U64: return AMDGPU::V_CMP_EQ_U64_e64;
5404 case AMDGPU::S_CMP_LG_U64: return AMDGPU::V_CMP_NE_U64_e64;
5405 case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
5406 case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
5407 case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
5408 case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
5409 case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
5410 case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
5411 case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
5412 case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
5413 case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
5414 case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
5415 case AMDGPU::S_CVT_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
5416 case AMDGPU::S_CVT_HI_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
5417 case AMDGPU::S_CVT_F16_F32: return AMDGPU::V_CVT_F16_F32_t16_e64;
5418 case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
5419 case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
5420 case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
5421 case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
5422 case AMDGPU::S_CEIL_F16:
5423 return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
5424 : AMDGPU::V_CEIL_F16_fake16_e64;
5425 case AMDGPU::S_FLOOR_F16:
5426 return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
5427 : AMDGPU::V_FLOOR_F16_fake16_e64;
5428 case AMDGPU::S_TRUNC_F16:
5429 return AMDGPU::V_TRUNC_F16_fake16_e64;
5430 case AMDGPU::S_RNDNE_F16:
5431 return AMDGPU::V_RNDNE_F16_fake16_e64;
5432 case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
5433 case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
5434 case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
5435 case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
5436 case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
5437 case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
5438 case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
5439 case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
5440 case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
5441 case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
5442 case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
5443 case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
5444 case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
5445 case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
5446 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
5447 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5448 case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_t16_e64;
5449 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
5450 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
5451 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
5452 case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
5453 case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
5454 case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
5455 case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
5456 case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
5457 case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
5458 case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
5459 case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
5460 case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
5461 case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
5462 case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
5463 case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
5464 case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
5465 case AMDGPU::S_CMP_LT_F16: return AMDGPU::V_CMP_LT_F16_t16_e64;
5466 case AMDGPU::S_CMP_EQ_F16: return AMDGPU::V_CMP_EQ_F16_t16_e64;
5467 case AMDGPU::S_CMP_LE_F16: return AMDGPU::V_CMP_LE_F16_t16_e64;
5468 case AMDGPU::S_CMP_GT_F16: return AMDGPU::V_CMP_GT_F16_t16_e64;
5469 case AMDGPU::S_CMP_LG_F16: return AMDGPU::V_CMP_LG_F16_t16_e64;
5470 case AMDGPU::S_CMP_GE_F16: return AMDGPU::V_CMP_GE_F16_t16_e64;
5471 case AMDGPU::S_CMP_O_F16: return AMDGPU::V_CMP_O_F16_t16_e64;
5472 case AMDGPU::S_CMP_U_F16: return AMDGPU::V_CMP_U_F16_t16_e64;
5473 case AMDGPU::S_CMP_NGE_F16: return AMDGPU::V_CMP_NGE_F16_t16_e64;
5474 case AMDGPU::S_CMP_NLG_F16: return AMDGPU::V_CMP_NLG_F16_t16_e64;
5475 case AMDGPU::S_CMP_NGT_F16: return AMDGPU::V_CMP_NGT_F16_t16_e64;
5476 case AMDGPU::S_CMP_NLE_F16: return AMDGPU::V_CMP_NLE_F16_t16_e64;
5477 case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64;
5478 case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64;
5479 case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
5480 case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_fake16_e64;
5481 case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
5482 case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_fake16_e64;
5483 case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
5484 case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_fake16_e64;
5485 case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
5486 case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_fake16_e64;
5487 case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
5488 case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_fake16_e64;
5489 }
5490 llvm_unreachable(
5491 "Unexpected scalar opcode without corresponding vector one!");
5492}
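// Examples of the mapping above: S_AND_B32 maps to V_AND_B32_e64, and
// S_ADD_I32 maps to V_ADD_U32_e64 on subtargets with add-no-carry, otherwise
// to V_ADD_CO_U32_e32; INSTRUCTION_LIST_END is returned for scalar opcodes
// with no direct VALU equivalent.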
5493
5494// clang-format on
5495
5496void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
5497 MachineBasicBlock &MBB,
5498 MachineBasicBlock::iterator MBBI,
5499 const DebugLoc &DL, Register Reg,
5500 bool IsSCCLive,
5501 SlotIndexes *Indexes) const {
5502 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
5503 const SIInstrInfo *TII = ST.getInstrInfo();
5504 bool IsWave32 = ST.isWave32();
5505 if (IsSCCLive) {
5506 // Insert two move instructions, one to save the original value of EXEC and
5507 // the other to turn on all bits in EXEC. This is required because the
5508 // single-instruction alternative, S_OR_SAVEEXEC, clobbers SCC.
5509 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5510 MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5511 auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
5512 .addReg(Exec, RegState::Kill);
5513 auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
5514 if (Indexes) {
5515 Indexes->insertMachineInstrInMaps(*StoreExecMI);
5516 Indexes->insertMachineInstrInMaps(*FlipExecMI);
5517 }
5518 } else {
5519 const unsigned OrSaveExec =
5520 IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
5521 auto SaveExec =
5522 BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
5523 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
5524 if (Indexes)
5525 Indexes->insertMachineInstrInMaps(*SaveExec);
5526 }
5527}
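// For example, with SCC live this emits (wave64 shown):
//   S_MOV_B64 <Reg>, EXEC
//   S_MOV_B64 EXEC, -1
// and with SCC dead a single S_OR_SAVEEXEC_B64 <Reg>, -1 whose SCC def is
// immediately marked dead.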
5528
5529void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
5530 MachineBasicBlock::iterator MBBI,
5531 const DebugLoc &DL, Register Reg,
5532 SlotIndexes *Indexes) const {
5533 unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5534 MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5535 auto ExecRestoreMI =
5536 BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
5537 if (Indexes)
5538 Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
5539}
5540
5541static const TargetRegisterClass *
5542adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
5543 const MachineRegisterInfo &MRI,
5544 const MCInstrDesc &TID, unsigned RCID,
5545 bool IsAllocatable) {
5546 if ((IsAllocatable || !ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5547 (((TID.mayLoad() || TID.mayStore()) &&
5548 !(TID.TSFlags & SIInstrFlags::Spill)) ||
5549 (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::MIMG)))) {
5550 switch (RCID) {
5551 case AMDGPU::AV_32RegClassID:
5552 RCID = AMDGPU::VGPR_32RegClassID;
5553 break;
5554 case AMDGPU::AV_64RegClassID:
5555 RCID = AMDGPU::VReg_64RegClassID;
5556 break;
5557 case AMDGPU::AV_96RegClassID:
5558 RCID = AMDGPU::VReg_96RegClassID;
5559 break;
5560 case AMDGPU::AV_128RegClassID:
5561 RCID = AMDGPU::VReg_128RegClassID;
5562 break;
5563 case AMDGPU::AV_160RegClassID:
5564 RCID = AMDGPU::VReg_160RegClassID;
5565 break;
5566 case AMDGPU::AV_512RegClassID:
5567 RCID = AMDGPU::VReg_512RegClassID;
5568 break;
5569 default:
5570 break;
5571 }
5572 }
5573
5574 return RI.getProperlyAlignedRC(RI.getRegClass(RCID));
5575}
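// In effect, for loads/stores other than spills (and for the other flagged
// instruction classes above) an AV_* operand class, which allows either VGPRs
// or AGPRs, is narrowed to the plain VGPR/VReg class of the same width unless
// AGPRs are genuinely allocatable for it; the result is then constrained to a
// properly aligned register class via getProperlyAlignedRC().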
5576
5577const TargetRegisterClass *SIInstrInfo::getRegClass(const MCInstrDesc &TID,
5578 unsigned OpNum, const TargetRegisterInfo *TRI,
5579 const MachineFunction &MF)
5580 const {
5581 if (OpNum >= TID.getNumOperands())
5582 return nullptr;
5583 auto RegClass = TID.operands()[OpNum].RegClass;
5584 bool IsAllocatable = false;
5585 if (TID.TSFlags & (SIInstrFlags::DS | SIInstrFlags::FLAT)) {
5586 // vdst and vdata should be both VGPR or AGPR, same for the DS instructions
5587 // with two data operands. Request register class constrained to VGPR only
5588 // if both operands are present, as Machine Copy Propagation cannot check this
5589 // constraint (and possibly other passes cannot either).
5590 //
5591 // The check is limited to FLAT and DS because atomics in non-flat encoding
5592 // have their vdst and vdata tied to be the same register.
5593 const int VDstIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5594 AMDGPU::OpName::vdst);
5595 const int DataIdx = AMDGPU::getNamedOperandIdx(TID.Opcode,
5596 (TID.TSFlags & SIInstrFlags::DS) ? AMDGPU::OpName::data0
5597 : AMDGPU::OpName::vdata);
5598 if (DataIdx != -1) {
5599 IsAllocatable = VDstIdx != -1 || AMDGPU::hasNamedOperand(
5600 TID.Opcode, AMDGPU::OpName::data1);
5601 }
5602 }
5603 return adjustAllocatableRegClass(ST, RI, MF.getRegInfo(), TID, RegClass,
5604 IsAllocatable);
5605}
5606
5607const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
5608 unsigned OpNo) const {
5609 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
5610 const MCInstrDesc &Desc = get(MI.getOpcode());
5611 if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
5612 Desc.operands()[OpNo].RegClass == -1) {
5613 Register Reg = MI.getOperand(OpNo).getReg();
5614
5615 if (Reg.isVirtual())
5616 return MRI.getRegClass(Reg);
5617 return RI.getPhysRegBaseClass(Reg);
5618 }
5619
5620 unsigned RCID = Desc.operands()[OpNo].RegClass;
5621 return adjustAllocatableRegClass(ST, RI, MRI, Desc, RCID, true);
5622}
5623
5624void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
5625 MachineBasicBlock::iterator I = MI;
5626 MachineBasicBlock *MBB = MI.getParent();
5627 MachineOperand &MO = MI.getOperand(OpIdx);
5628 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
5629 unsigned RCID = get(MI.getOpcode()).operands()[OpIdx].RegClass;
5630 const TargetRegisterClass *RC = RI.getRegClass(RCID);
5631 unsigned Size = RI.getRegSizeInBits(*RC);
5632 unsigned Opcode = (Size == 64) ? AMDGPU::V_MOV_B64_PSEUDO : AMDGPU::V_MOV_B32_e32;
5633 if (MO.isReg())
5634 Opcode = AMDGPU::COPY;
5635 else if (RI.isSGPRClass(RC))
5636 Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
5637
5638 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
5639 Register Reg = MRI.createVirtualRegister(VRC);
5640 DebugLoc DL = MBB->findDebugLoc(I);
5641 BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
5642 MO.ChangeToRegister(Reg, false);
5643}
5644
5645unsigned SIInstrInfo::buildExtractSubReg(
5646 MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
5647 const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
5648 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5649 MachineBasicBlock *MBB = MI->getParent();
5650 DebugLoc DL = MI->getDebugLoc();
5651 Register SubReg = MRI.createVirtualRegister(SubRC);
5652
5653 if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
5654 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
5655 .addReg(SuperReg.getReg(), 0, SubIdx);
5656 return SubReg;
5657 }
5658
5659 // Just in case the super register is itself a sub-register, copy it to a new
5660 // value so we don't need to worry about merging its subreg index with the
5661 // SubIdx passed to this function. The register coalescer should be able to
5662 // eliminate this extra copy.
5663 Register NewSuperReg = MRI.createVirtualRegister(SuperRC);
5664
5665 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
5666 .addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
5667
5668 BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
5669 .addReg(NewSuperReg, 0, SubIdx);
5670
5671 return SubReg;
5672}
5673
5674MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
5675 MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
5676 const MachineOperand &Op, const TargetRegisterClass *SuperRC,
5677 unsigned SubIdx, const TargetRegisterClass *SubRC) const {
5678 if (Op.isImm()) {
5679 if (SubIdx == AMDGPU::sub0)
5680 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
5681 if (SubIdx == AMDGPU::sub1)
5682 return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm() >> 32));
5683
5684 llvm_unreachable("Unhandled register index for immediate");
5685 }
5686
5687 unsigned SubReg = buildExtractSubReg(MII, MRI, Op, SuperRC,
5688 SubIdx, SubRC);
5689 return MachineOperand::CreateReg(SubReg, false);
5690}
5691
5692// Change the order of operands from (0, 1, 2) to (0, 2, 1)
5693void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
5694 assert(Inst.getNumExplicitOperands() == 3);
5695 MachineOperand Op1 = Inst.getOperand(1);
5696 Inst.removeOperand(1);
5697 Inst.addOperand(Op1);
5698}
5699
5700bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
5701 const MCOperandInfo &OpInfo,
5702 const MachineOperand &MO) const {
5703 if (!MO.isReg())
5704 return false;
5705
5706 Register Reg = MO.getReg();
5707
5708 const TargetRegisterClass *DRC = RI.getRegClass(OpInfo.RegClass);
5709 if (Reg.isPhysical())
5710 return DRC->contains(Reg);
5711
5712 const TargetRegisterClass *RC = MRI.getRegClass(Reg);
5713
5714 if (MO.getSubReg()) {
5715 const MachineFunction *MF = MO.getParent()->getParent()->getParent();
5716 const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
5717 if (!SuperRC)
5718 return false;
5719
5720 DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
5721 if (!DRC)
5722 return false;
5723 }
5724 return RC->hasSuperClassEq(DRC);
5725}
5726
5727bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
5728 const MCOperandInfo &OpInfo,
5729 const MachineOperand &MO) const {
5730 if (MO.isReg())
5731 return isLegalRegOperand(MRI, OpInfo, MO);
5732
5733 // Handle non-register types that are treated like immediates.
5734 assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
5735 return true;
5736}
5737
5738bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
5739 const MachineOperand *MO) const {
5740 const MachineFunction &MF = *MI.getParent()->getParent();
5741 const MachineRegisterInfo &MRI = MF.getRegInfo();
5742 const MCInstrDesc &InstDesc = MI.getDesc();
5743 const MCOperandInfo &OpInfo = InstDesc.operands()[OpIdx];
5744 const TargetRegisterClass *DefinedRC =
5745 OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
5746 if (!MO)
5747 MO = &MI.getOperand(OpIdx);
5748
5749 int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode());
5750 int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0;
5751 if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
5752 if (!MO->isReg() && !isInlineConstant(*MO, OpInfo) && !LiteralLimit--)
5753 return false;
5754
5755 SmallDenseSet<RegSubRegPair> SGPRsUsed;
5756 if (MO->isReg())
5757 SGPRsUsed.insert(RegSubRegPair(MO->getReg(), MO->getSubReg()));
5758
5759 for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
5760 if (i == OpIdx)
5761 continue;
5762 const MachineOperand &Op = MI.getOperand(i);
5763 if (Op.isReg()) {
5764 RegSubRegPair SGPR(Op.getReg(), Op.getSubReg());
5765 if (!SGPRsUsed.count(SGPR) &&
5766 // FIXME: This can access off the end of the operands() array.
5767 usesConstantBus(MRI, Op, InstDesc.operands().begin()[i])) {
5768 if (--ConstantBusLimit <= 0)
5769 return false;
5770 SGPRsUsed.insert(SGPR);
5771 }
5772 } else if (AMDGPU::isSISrcOperand(InstDesc, i) &&
5773 !isInlineConstant(Op, InstDesc.operands()[i])) {
5774 if (!LiteralLimit--)
5775 return false;
5776 if (--ConstantBusLimit <= 0)
5777 return false;
5778 }
5779 }
5780 }
5781
5782 if (MO->isReg()) {
5783 if (!DefinedRC)
5784 return OpInfo.OperandType == MCOI::OPERAND_UNKNOWN;
5785 if (!isLegalRegOperand(MRI, OpInfo, *MO))
5786 return false;
5787 bool IsAGPR = RI.isAGPR(MRI, MO->getReg());
5788 if (IsAGPR && !ST.hasMAIInsts())
5789 return false;
5790 unsigned Opc = MI.getOpcode();
5791 if (IsAGPR &&
5792 (!ST.hasGFX90AInsts() || !MRI.reservedRegsFrozen()) &&
5793 (MI.mayLoad() || MI.mayStore() || isDS(Opc) || isMIMG(Opc)))
5794 return false;
5795 // Atomics should have both vdst and vdata either vgpr or agpr.
5796 const int VDstIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
5797 const int DataIdx = AMDGPU::getNamedOperandIdx(Opc,
5798 isDS(Opc) ? AMDGPU::OpName::data0 : AMDGPU::OpName::vdata);
5799 if ((int)OpIdx == VDstIdx && DataIdx != -1 &&
5800 MI.getOperand(DataIdx).isReg() &&
5801 RI.isAGPR(MRI, MI.getOperand(DataIdx).getReg()) != IsAGPR)
5802 return false;
5803 if ((int)OpIdx == DataIdx) {
5804 if (VDstIdx != -1 &&
5805 RI.isAGPR(MRI, MI.getOperand(VDstIdx).getReg()) != IsAGPR)
5806 return false;
5807 // DS instructions with 2 src operands also must have tied RC.
5808 const int Data1Idx = AMDGPU::getNamedOperandIdx(Opc,
5809 AMDGPU::OpName::data1);
5810 if (Data1Idx != -1 && MI.getOperand(Data1Idx).isReg() &&
5811 RI.isAGPR(MRI, MI.getOperand(Data1Idx).getReg()) != IsAGPR)
5812 return false;
5813 }
5814 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64 && !ST.hasGFX90AInsts() &&
5815 (int)OpIdx == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) &&
5816 RI.isSGPRReg(MRI, MO->getReg()))
5817 return false;
5818 return true;
5819 }
5820
5821 if (MO->isImm()) {
5822 uint64_t Imm = MO->getImm();
5823 bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
5824 bool Is64BitOp = Is64BitFPOp ||
5825 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
5826 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
5827 OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
5828 if (Is64BitOp &&
5829 !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
5830 if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp))
5831 return false;
5832
5833 // FIXME: We can use sign extended 64-bit literals, but only for signed
5834 // operands. At the moment we do not know if an operand is signed.
5835 // Such an operand will be encoded as its low 32 bits and then either
5836 // correctly sign extended or incorrectly zero extended by HW.
5837 if (!Is64BitFPOp && (int32_t)Imm < 0)
5838 return false;
5839 }
5840 }
5841
5842 // Handle non-register types that are treated like immediates.
5843 assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
5844
5845 if (!DefinedRC) {
5846 // This operand expects an immediate.
5847 return true;
5848 }
5849
5850 return isImmOperandLegal(MI, OpIdx, *MO);
5851}
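// For example, with a constant bus limit of 1 (pre-GFX10), a VALU instruction
// that already reads one SGPR cannot accept a second SGPR or a literal in
// another source operand; isOperandLegal() then returns false and callers
// such as legalizeOperandsVOP3() fall back to legalizeOpWithMove().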
5852
5853void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
5854 MachineInstr &MI) const {
5855 unsigned Opc = MI.getOpcode();
5856 const MCInstrDesc &InstrDesc = get(Opc);
5857
5858 int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
5859 MachineOperand &Src0 = MI.getOperand(Src0Idx);
5860
5861 int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
5862 MachineOperand &Src1 = MI.getOperand(Src1Idx);
5863
5864 // An implicit SGPR use, such as the VCC use of v_addc_u32/v_subb_u32, already
5865 // occupies the only constant bus slot available before GFX10.
5866 bool HasImplicitSGPR = findImplicitSGPRRead(MI);
5867 if (HasImplicitSGPR && ST.getConstantBusLimit(Opc) <= 1 && Src0.isReg() &&
5868 RI.isSGPRReg(MRI, Src0.getReg()))
5869 legalizeOpWithMove(MI, Src0Idx);
5870
5871 // Special case: V_WRITELANE_B32 accepts only immediate or SGPR operands for
5872 // both the value to write (src0) and lane select (src1). Fix up non-SGPR
5873 // src0/src1 with V_READFIRSTLANE.
5874 if (Opc == AMDGPU::V_WRITELANE_B32) {
5875 const DebugLoc &DL = MI.getDebugLoc();
5876 if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
5877 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5878 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5879 .add(Src0);
5880 Src0.ChangeToRegister(Reg, false);
5881 }
5882 if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
5883 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5884 const DebugLoc &DL = MI.getDebugLoc();
5885 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5886 .add(Src1);
5887 Src1.ChangeToRegister(Reg, false);
5888 }
5889 return;
5890 }
5891
5892 // No VOP2 instructions support AGPRs.
5893 if (Src0.isReg() && RI.isAGPR(MRI, Src0.getReg()))
5894 legalizeOpWithMove(MI, Src0Idx);
5895
5896 if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
5897 legalizeOpWithMove(MI, Src1Idx);
5898
5899 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
5900 if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
5901 int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
5902 if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
5903 legalizeOpWithMove(MI, Src2Idx);
5904 }
5905
5906 // VOP2 src0 operands support all operand types, so we don't need to check
5907 // their legality. If src1 is already legal, we don't need to do anything.
5908 if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
5909 return;
5910
5911 // Special case: V_READLANE_B32 accepts only immediate or SGPR operands for
5912 // lane select. Fix up using V_READFIRSTLANE, since we assume that the lane
5913 // select is uniform.
5914 if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
5915 RI.isVGPR(MRI, Src1.getReg())) {
5916 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5917 const DebugLoc &DL = MI.getDebugLoc();
5918 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5919 .add(Src1);
5920 Src1.ChangeToRegister(Reg, false);
5921 return;
5922 }
5923
5924 // We do not use commuteInstruction here because it is too aggressive and will
5925 // commute if it is possible. We only want to commute here if it improves
5926 // legality. This can be called a fairly large number of times so don't waste
5927 // compile time pointlessly swapping and checking legality again.
5928 if (HasImplicitSGPR || !MI.isCommutable()) {
5929 legalizeOpWithMove(MI, Src1Idx);
5930 return;
5931 }
5932
5933 // If src0 can be used as src1, commuting will make the operands legal.
5934 // Otherwise we have to give up and insert a move.
5935 //
5936 // TODO: Other immediate-like operand kinds could be commuted if there was a
5937 // MachineOperand::ChangeTo* for them.
5938 if ((!Src1.isImm() && !Src1.isReg()) ||
5939 !isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src0)) {
5940 legalizeOpWithMove(MI, Src1Idx);
5941 return;
5942 }
5943
5944 int CommutedOpc = commuteOpcode(MI);
5945 if (CommutedOpc == -1) {
5946 legalizeOpWithMove(MI, Src1Idx);
5947 return;
5948 }
5949
5950 MI.setDesc(get(CommutedOpc));
5951
5952 Register Src0Reg = Src0.getReg();
5953 unsigned Src0SubReg = Src0.getSubReg();
5954 bool Src0Kill = Src0.isKill();
5955
5956 if (Src1.isImm())
5957 Src0.ChangeToImmediate(Src1.getImm());
5958 else if (Src1.isReg()) {
5959 Src0.ChangeToRegister(Src1.getReg(), false, false, Src1.isKill());
5960 Src0.setSubReg(Src1.getSubReg());
5961 } else
5962 llvm_unreachable("Should only have register or immediate operands");
5963
5964 Src1.ChangeToRegister(Src0Reg, false, false, Src0Kill);
5965 Src1.setSubReg(Src0SubReg);
5966 fixImplicitOperands(MI);
5967}
5968
5969// Legalize VOP3 operands. All operand types are supported for any operand,
5970// but only one literal constant is allowed, and only starting from GFX10.
5971void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
5972 MachineInstr &MI) const {
5973 unsigned Opc = MI.getOpcode();
5974
5975 int VOP3Idx[3] = {
5976 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
5977 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1),
5978 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)
5979 };
5980
5981 if (Opc == AMDGPU::V_PERMLANE16_B32_e64 ||
5982 Opc == AMDGPU::V_PERMLANEX16_B32_e64) {
5983 // src1 and src2 must be scalar
5984 MachineOperand &Src1 = MI.getOperand(VOP3Idx[1]);
5985 MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
5986 const DebugLoc &DL = MI.getDebugLoc();
5987 if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
5988 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5989 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5990 .add(Src1);
5991 Src1.ChangeToRegister(Reg, false);
5992 }
5993 if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
5994 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5995 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
5996 .add(Src2);
5997 Src2.ChangeToRegister(Reg, false);
5998 }
5999 }
6000
6001 // Find the one SGPR operand we are allowed to use.
6002 int ConstantBusLimit = ST.getConstantBusLimit(Opc);
6003 int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
6004 SmallDenseSet<unsigned> SGPRsUsed;
6005 Register SGPRReg = findUsedSGPR(MI, VOP3Idx);
6006 if (SGPRReg) {
6007 SGPRsUsed.insert(SGPRReg);
6008 --ConstantBusLimit;
6009 }
6010
6011 for (int Idx : VOP3Idx) {
6012 if (Idx == -1)
6013 break;
6014 MachineOperand &MO = MI.getOperand(Idx);
6015
6016 if (!MO.isReg()) {
6017 if (isInlineConstant(MO, get(Opc).operands()[Idx]))
6018 continue;
6019
6020 if (LiteralLimit > 0 && ConstantBusLimit > 0) {
6021 --LiteralLimit;
6022 --ConstantBusLimit;
6023 continue;
6024 }
6025
6026 --LiteralLimit;
6027 --ConstantBusLimit;
6028 legalizeOpWithMove(MI, Idx);
6029 continue;
6030 }
6031
6032 if (RI.hasAGPRs(RI.getRegClassForReg(MRI, MO.getReg())) &&
6033 !isOperandLegal(MI, Idx, &MO)) {
6034 legalizeOpWithMove(MI, Idx);
6035 continue;
6036 }
6037
6038 if (!RI.isSGPRClass(RI.getRegClassForReg(MRI, MO.getReg())))
6039 continue; // VGPRs are legal
6040
6041 // We can use one SGPR in each VOP3 instruction prior to GFX10
6042 // and two starting from GFX10.
6043 if (SGPRsUsed.count(MO.getReg()))
6044 continue;
6045 if (ConstantBusLimit > 0) {
6046 SGPRsUsed.insert(MO.getReg());
6047 --ConstantBusLimit;
6048 continue;
6049 }
6050
6051 // If we make it this far, then the operand is not legal and we must
6052 // legalize it.
6053 legalizeOpWithMove(MI, Idx);
6054 }
6055
6056 // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
6057 if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
6058 !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
6059 legalizeOpWithMove(MI, VOP3Idx[2]);
6060}
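// To summarize the budget above: prior to GFX10 a VOP3 may use a single SGPR
// and no literal on the constant bus; starting with GFX10 two SGPRs are
// allowed, plus one literal when the subtarget has VOP3 literals. Any source
// operand beyond that budget is rewritten with legalizeOpWithMove().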
6061
6062Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI,
6063 MachineRegisterInfo &MRI) const {
6064 const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
6065 const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
6066 Register DstReg = MRI.createVirtualRegister(SRC);
6067 unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
6068
6069 if (RI.hasAGPRs(VRC)) {
6070 VRC = RI.getEquivalentVGPRClass(VRC);
6071 Register NewSrcReg = MRI.createVirtualRegister(VRC);
6072 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6073 get(TargetOpcode::COPY), NewSrcReg)
6074 .addReg(SrcReg);
6075 SrcReg = NewSrcReg;
6076 }
6077
6078 if (SubRegs == 1) {
6079 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6080 get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6081 .addReg(SrcReg);
6082 return DstReg;
6083 }
6084
6085 SmallVector<Register, 8> SRegs;
6086 for (unsigned i = 0; i < SubRegs; ++i) {
6087 Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6088 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6089 get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
6090 .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
6091 SRegs.push_back(SGPR);
6092 }
6093
6094 MachineInstrBuilder MIB =
6095 BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
6096 get(AMDGPU::REG_SEQUENCE), DstReg);
6097 for (unsigned i = 0; i < SubRegs; ++i) {
6098 MIB.addReg(SRegs[i]);
6099 MIB.addImm(RI.getSubRegFromChannel(i));
6100 }
6101 return DstReg;
6102}
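// For example, a 64-bit VGPR source is lowered to two V_READFIRSTLANE_B32
// reads, one per 32-bit channel, whose SGPR results are recombined with a
// REG_SEQUENCE; a 32-bit source needs just one V_READFIRSTLANE_B32.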
6103
6104void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
6105 MachineInstr &MI) const {
6106
6107 // If the pointer is stored in VGPRs, then we need to move it to
6108 // SGPRs using v_readfirstlane. This is safe because we only select
6109 // loads with uniform pointers to SMRD instructions, so we know the
6110 // pointer value is uniform.
6111 MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
6112 if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
6113 Register SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
6114 SBase->setReg(SGPR);
6115 }
6116 MachineOperand *SOff = getNamedOperand(MI, AMDGPU::OpName::soffset);
6117 if (SOff && !RI.isSGPRClass(MRI.getRegClass(SOff->getReg()))) {
6118 Register SGPR = readlaneVGPRToSGPR(SOff->getReg(), MI, MRI);
6119 SOff->setReg(SGPR);
6120 }
6121}
6122
6123bool SIInstrInfo::moveFlatAddrToVGPR(MachineInstr &Inst) const {
6124 unsigned Opc = Inst.getOpcode();
6125 int OldSAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr);
6126 if (OldSAddrIdx < 0)
6127 return false;
6128
6129 assert(isSegmentSpecificFLAT(Inst));
6130
6131 int NewOpc = AMDGPU::getGlobalVaddrOp(Opc);
6132 if (NewOpc < 0)
6133 NewOpc = AMDGPU::getFlatScratchInstSVfromSS(Opc);
6134 if (NewOpc < 0)
6135 return false;
6136
6137 MachineRegisterInfo &MRI = Inst.getMF()->getRegInfo();
6138 MachineOperand &SAddr = Inst.getOperand(OldSAddrIdx);
6139 if (RI.isSGPRReg(MRI, SAddr.getReg()))
6140 return false;
6141
6142 int NewVAddrIdx = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vaddr);
6143 if (NewVAddrIdx < 0)
6144 return false;
6145
6146 int OldVAddrIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
6147
6148 // Check vaddr, it shall be zero or absent.
6149 MachineInstr *VAddrDef = nullptr;
6150 if (OldVAddrIdx >= 0) {
6151 MachineOperand &VAddr = Inst.getOperand(OldVAddrIdx);
6152 VAddrDef = MRI.getUniqueVRegDef(VAddr.getReg());
6153 if (!VAddrDef || VAddrDef->getOpcode() != AMDGPU::V_MOV_B32_e32 ||
6154 !VAddrDef->getOperand(1).isImm() ||
6155 VAddrDef->getOperand(1).getImm() != 0)
6156 return false;
6157 }
6158
6159 const MCInstrDesc &NewDesc = get(NewOpc);
6160 Inst.setDesc(NewDesc);
6161
6162 // Callers expect iterator to be valid after this call, so modify the
6163 // instruction in place.
6164 if (OldVAddrIdx == NewVAddrIdx) {
6165 MachineOperand &NewVAddr = Inst.getOperand(NewVAddrIdx);
6166 // Clear use list from the old vaddr holding a zero register.
6167 MRI.removeRegOperandFromUseList(&NewVAddr);
6168 MRI.moveOperands(&NewVAddr, &SAddr, 1);
6169 Inst.removeOperand(OldSAddrIdx);
6170 // Update the use list with the pointer we have just moved from vaddr to
6171 // saddr position. Otherwise new vaddr will be missing from the use list.
6172 MRI.removeRegOperandFromUseList(&NewVAddr);
6173 MRI.addRegOperandToUseList(&NewVAddr);
6174 } else {
6175 assert(OldSAddrIdx == NewVAddrIdx);
6176
6177 if (OldVAddrIdx >= 0) {
6178 int NewVDstIn = AMDGPU::getNamedOperandIdx(NewOpc,
6179 AMDGPU::OpName::vdst_in);
6180
6181 // removeOperand doesn't try to fix up tied operand indexes as it goes, so
6182 // it asserts. Untie the operands for now and retie them afterwards.
6183 if (NewVDstIn != -1) {
6184 int OldVDstIn = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
6185 Inst.untieRegOperand(OldVDstIn);
6186 }
6187
6188 Inst.removeOperand(OldVAddrIdx);
6189
6190 if (NewVDstIn != -1) {
6191 int NewVDst = AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
6192 Inst.tieOperands(NewVDst, NewVDstIn);
6193 }
6194 }
6195 }
6196
6197 if (VAddrDef && MRI.use_nodbg_empty(VAddrDef->getOperand(0).getReg()))
6198 VAddrDef->eraseFromParent();
6199
6200 return true;
6201}
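// For example, a GLOBAL_*_SADDR load or store whose saddr register has ended
// up in a VGPR, and whose vaddr (if present) is a materialized zero, is
// rewritten to the corresponding vaddr form with the pointer moved into the
// vaddr slot; the now-dead V_MOV_B32 of zero is erased afterwards.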
6202
6203// FIXME: Remove this when SelectionDAG is obsoleted.
6204void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI,
6205 MachineInstr &MI) const {
6206 if (!isSegmentSpecificFLAT(MI))
6207 return;
6208
6209 // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence
6210 // thinks they are uniform, so a readfirstlane should be valid.
6211 MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr);
6212 if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg())))
6213 return;
6214
6215 if (moveFlatAddrToVGPR(MI))
6216 return;
6217
6218 Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI);
6219 SAddr->setReg(ToSGPR);
6220}
6221
6222void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
6223 MachineBasicBlock::iterator I,
6224 const TargetRegisterClass *DstRC,
6225 MachineOperand &Op,
6226 MachineRegisterInfo &MRI,
6227 const DebugLoc &DL) const {
6228 Register OpReg = Op.getReg();
6229 unsigned OpSubReg = Op.getSubReg();
6230
6231 const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
6232 RI.getRegClassForReg(MRI, OpReg), OpSubReg);
6233
6234 // Check if operand is already the correct register class.
6235 if (DstRC == OpRC)
6236 return;
6237
6238 Register DstReg = MRI.createVirtualRegister(DstRC);
6239 auto Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
6240
6241 Op.setReg(DstReg);
6242 Op.setSubReg(0);
6243
6244 MachineInstr *Def = MRI.getVRegDef(OpReg);
6245 if (!Def)
6246 return;
6247
6248 // Try to eliminate the copy if it is copying an immediate value.
6249 if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
6250 foldImmediate(*Copy, *Def, OpReg, &MRI);
6251
6252 bool ImpDef = Def->isImplicitDef();
6253 while (!ImpDef && Def && Def->isCopy()) {
6254 if (Def->getOperand(1).getReg().isPhysical())
6255 break;
6256 Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
6257 ImpDef = Def && Def->isImplicitDef();
6258 }
6259 if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
6260 !ImpDef)
6261 Copy.addReg(AMDGPU::EXEC, RegState::Implicit);
6262}
6263
6264// Emit the actual waterfall loop, executing the wrapped instruction for each
6265// unique value of \p ScalarOps across all lanes. In the best case we execute 1
6266// iteration, in the worst case we execute 64 (once per lane).
6267static void emitLoadScalarOpsFromVGPRLoop(
6268 const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB,
6269 MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL,
6270 ArrayRef<MachineOperand *> ScalarOps) {
6271 MachineFunction &MF = *OrigBB.getParent();
6272 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6273 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6274 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6275 unsigned SaveExecOpc =
6276 ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
6277 unsigned XorTermOpc =
6278 ST.isWave32() ? AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
6279 unsigned AndOpc =
6280 ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6281 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6282
6283 MachineBasicBlock::iterator I = LoopBB.begin();
6284
6285 SmallVector<Register, 8> ReadlanePieces;
6286 Register CondReg;
6287
6288 for (MachineOperand *ScalarOp : ScalarOps) {
6289 unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
6290 unsigned NumSubRegs = RegSize / 32;
6291 Register VScalarOp = ScalarOp->getReg();
6292
6293 if (NumSubRegs == 1) {
6294 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6295
6296 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
6297 .addReg(VScalarOp);
6298
6299 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6300
6301 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
6302 .addReg(CurReg)
6303 .addReg(VScalarOp);
6304
6305 // Combine the comparison results with AND.
6306 if (!CondReg) // First.
6307 CondReg = NewCondReg;
6308 else { // If not the first, we create an AND.
6309 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6310 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6311 .addReg(CondReg)
6312 .addReg(NewCondReg);
6313 CondReg = AndReg;
6314 }
6315
6316 // Update ScalarOp operand to use the SGPR ScalarOp.
6317 ScalarOp->setReg(CurReg);
6318 ScalarOp->setIsKill();
6319 } else {
6320 unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
6321 assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
6322 "Unhandled register size");
6323
6324 for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
6325 Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6326 Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6327
6328 // Read the next variant <- also loop target.
6329 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
6330 .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
6331
6332 // Read the next variant <- also loop target.
6333 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
6334 .addReg(VScalarOp, VScalarOpUndef,
6335 TRI->getSubRegFromChannel(Idx + 1));
6336
6337 ReadlanePieces.push_back(CurRegLo);
6338 ReadlanePieces.push_back(CurRegHi);
6339
6340 // Comparison is to be done as 64-bit.
6341 Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
6342 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
6343 .addReg(CurRegLo)
6344 .addImm(AMDGPU::sub0)
6345 .addReg(CurRegHi)
6346 .addImm(AMDGPU::sub1);
6347
6348 Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
6349 auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
6350 NewCondReg)
6351 .addReg(CurReg);
6352 if (NumSubRegs <= 2)
6353 Cmp.addReg(VScalarOp);
6354 else
6355 Cmp.addReg(VScalarOp, VScalarOpUndef,
6356 TRI->getSubRegFromChannel(Idx, 2));
6357
6358 // Combine the comparison results with AND.
6359 if (!CondReg) // First.
6360 CondReg = NewCondReg;
6361 else { // If not the first, we create an AND.
6362 Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
6363 BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
6364 .addReg(CondReg)
6365 .addReg(NewCondReg);
6366 CondReg = AndReg;
6367 }
6368 } // End for loop.
6369
6370 auto SScalarOpRC =
6371 TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
6372 Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
6373
6374 // Build scalar ScalarOp.
6375 auto Merge =
6376 BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
6377 unsigned Channel = 0;
6378 for (Register Piece : ReadlanePieces) {
6379 Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
6380 }
6381
6382 // Update ScalarOp operand to use the SGPR ScalarOp.
6383 ScalarOp->setReg(SScalarOp);
6384 ScalarOp->setIsKill();
6385 }
6386 }
6387
6388 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6389 MRI.setSimpleHint(SaveExec, CondReg);
6390
6391 // Update EXEC to matching lanes, saving original to SaveExec.
6392 BuildMI(LoopBB, I, DL, TII.get(SaveExecOpc), SaveExec)
6393 .addReg(CondReg, RegState::Kill);
6394
6395 // The original instruction is here; we insert the terminators after it.
6396 I = BodyBB.end();
6397
6398 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
6399 BuildMI(BodyBB, I, DL, TII.get(XorTermOpc), Exec)
6400 .addReg(Exec)
6401 .addReg(SaveExec);
6402
6403 BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
6404}
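// Roughly, for a single 32-bit scalar operand the loop body built above is:
//   %sgpr = V_READFIRSTLANE_B32 %vgpr_op
//   %cond = V_CMP_EQ_U32 %sgpr, %vgpr_op
//   %save = S_AND_SAVEEXEC_B32/_B64 %cond
//   ... the original instruction, now reading %sgpr ...
//   EXEC  = S_XOR_B32/_B64_term EXEC, %save
//   SI_WATERFALL_LOOP %LoopBB
// Wider operands are read 64 bits at a time, compared with V_CMP_EQ_U64, and
// the per-piece conditions are combined with S_AND.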
6405
6406// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
6407// with SGPRs by iterating over all unique values across all lanes.
6408// Returns the loop basic block that now contains \p MI.
6409static MachineBasicBlock *
6410loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
6411 ArrayRef<MachineOperand *> ScalarOps,
6412 MachineDominatorTree *MDT,
6413 MachineBasicBlock::iterator Begin = nullptr,
6414 MachineBasicBlock::iterator End = nullptr) {
6415 MachineBasicBlock &MBB = *MI.getParent();
6416 MachineFunction &MF = *MBB.getParent();
6417 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
6418 const SIRegisterInfo *TRI = ST.getRegisterInfo();
6419 MachineRegisterInfo &MRI = MF.getRegInfo();
6420 if (!Begin.isValid())
6421 Begin = &MI;
6422 if (!End.isValid()) {
6423 End = &MI;
6424 ++End;
6425 }
6426 const DebugLoc &DL = MI.getDebugLoc();
6427 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6428 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6429 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6430
6431 // Save SCC. Waterfall Loop may overwrite SCC.
6432 Register SaveSCCReg;
6433 bool SCCNotDead = (MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI, 30) !=
6434 MachineBasicBlock::LQR_Dead);
6435 if (SCCNotDead) {
6436 SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6437 BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
6438 .addImm(1)
6439 .addImm(0);
6440 }
6441
6442 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
6443
6444 // Save the EXEC mask
6445 BuildMI(MBB, Begin, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
6446
6447 // Killed uses in the instruction we are waterfalling around will be
6448 // incorrect due to the added control-flow.
6449 MachineBasicBlock::iterator AfterMI = MI;
6450 ++AfterMI;
6451 for (auto I = Begin; I != AfterMI; I++) {
6452 for (auto &MO : I->all_uses())
6453 MRI.clearKillFlags(MO.getReg());
6454 }
6455
6456 // To insert the loop we need to split the block. Move everything after this
6457 // point to a new block, and insert a new empty block between the two.
6458 MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
6459 MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock();
6460 MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
6461 MachineFunction::iterator MBBI(MBB);
6462 ++MBBI;
6463
6464 MF.insert(MBBI, LoopBB);
6465 MF.insert(MBBI, BodyBB);
6466 MF.insert(MBBI, RemainderBB);
6467
6468 LoopBB->addSuccessor(BodyBB);
6469 BodyBB->addSuccessor(LoopBB);
6470 BodyBB->addSuccessor(RemainderBB);
6471
6472 // Move the instructions from Begin up to and including MI into BodyBB, and
6473 // the remainder of the block to RemainderBB.
6474 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
6475 RemainderBB->splice(RemainderBB->begin(), &MBB, End, MBB.end());
6476 BodyBB->splice(BodyBB->begin(), &MBB, Begin, MBB.end());
6477
6478 MBB.addSuccessor(LoopBB);
6479
6480 // Update dominators. We know that MBB immediately dominates LoopBB, that
6481 // LoopBB immediately dominates BodyBB, and BodyBB immediately dominates
6482 // RemainderBB. RemainderBB immediately dominates all of the successors
6483 // transferred to it from MBB that MBB used to properly dominate.
6484 if (MDT) {
6485 MDT->addNewBlock(LoopBB, &MBB);
6486 MDT->addNewBlock(BodyBB, LoopBB);
6487 MDT->addNewBlock(RemainderBB, BodyBB);
6488 for (auto &Succ : RemainderBB->successors()) {
6489 if (MDT->properlyDominates(&MBB, Succ)) {
6490 MDT->changeImmediateDominator(Succ, RemainderBB);
6491 }
6492 }
6493 }
6494
6495 emitLoadScalarOpsFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, ScalarOps);
6496
6497 MachineBasicBlock::iterator First = RemainderBB->begin();
6498 // Restore SCC
6499 if (SCCNotDead) {
6500 BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
6501 .addReg(SaveSCCReg, RegState::Kill)
6502 .addImm(0);
6503 }
6504
6505 // Restore the EXEC mask
6506 BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
6507 return BodyBB;
6508}
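// The control flow produced above is
//   MBB -> LoopBB -> BodyBB -> RemainderBB
// with a BodyBB -> LoopBB back edge. EXEC (and SCC, if it was live) is saved
// in MBB and restored at the top of RemainderBB, and the returned BodyBB is
// the block that now contains MI.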
6509
6510// Extract pointer from Rsrc and return a zero-value Rsrc replacement.
6511static std::tuple<unsigned, unsigned>
6513 MachineBasicBlock &MBB = *MI.getParent();
6514 MachineFunction &MF = *MBB.getParent();
6515 MachineRegisterInfo &MRI = MF.getRegInfo();
6516
6517 // Extract the ptr from the resource descriptor.
6518 unsigned RsrcPtr =
6519 TII.buildExtractSubReg(MI, MRI, Rsrc, &AMDGPU::VReg_128RegClass,
6520 AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
6521
6522 // Create an empty resource descriptor
6523 Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6524 Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6525 Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
6526 Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
6527 uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
6528
6529 // Zero64 = 0
6530 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B64), Zero64)
6531 .addImm(0);
6532
6533 // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0}
6534 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatLo)
6535 .addImm(RsrcDataFormat & 0xFFFFFFFF);
6536
6537 // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32}
6538 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), SRsrcFormatHi)
6539 .addImm(RsrcDataFormat >> 32);
6540
6541 // NewSRsrc = {Zero64, SRsrcFormat}
6542 BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(AMDGPU::REG_SEQUENCE), NewSRsrc)
6543 .addReg(Zero64)
6544 .addImm(AMDGPU::sub0_sub1)
6545 .addReg(SRsrcFormatLo)
6546 .addImm(AMDGPU::sub2)
6547 .addReg(SRsrcFormatHi)
6548 .addImm(AMDGPU::sub3);
6549
6550 return std::tuple(RsrcPtr, NewSRsrc);
6551}
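// The replacement descriptor built here is { Zero64, RSRC_DATA_FORMAT }: a
// null 64-bit base pointer in dwords 0-1 and the default resource data format
// in dwords 2-3, while the real pointer extracted from the original Rsrc is
// returned separately so the caller can fold it into the address computation.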
6552
6553MachineBasicBlock *
6554SIInstrInfo::legalizeOperands(MachineInstr &MI,
6555 MachineDominatorTree *MDT) const {
6556 MachineFunction &MF = *MI.getParent()->getParent();
6557 MachineRegisterInfo &MRI = MF.getRegInfo();
6558 MachineBasicBlock *CreatedBB = nullptr;
6559
6560 // Legalize VOP2
6561 if (isVOP2(MI) || isVOPC(MI)) {
6562 legalizeOperandsVOP2(MRI, MI);
6563 return CreatedBB;
6564 }
6565
6566 // Legalize VOP3
6567 if (isVOP3(MI)) {
6568 legalizeOperandsVOP3(MRI, MI);
6569 return CreatedBB;
6570 }
6571
6572 // Legalize SMRD
6573 if (isSMRD(MI)) {
6574 legalizeOperandsSMRD(MRI, MI);
6575 return CreatedBB;
6576 }
6577
6578 // Legalize FLAT
6579 if (isFLAT(MI)) {
6580 legalizeOperandsFLAT(MRI, MI);
6581 return CreatedBB;
6582 }
6583
6584 // Legalize REG_SEQUENCE and PHI
6585 // The register class of the operands must be the same type as the register
6586 // class of the output.
6587 if (MI.getOpcode() == AMDGPU::PHI) {
6588 const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
6589 for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
6590 if (!MI.getOperand(i).isReg() || !MI.getOperand(i).getReg().isVirtual())
6591 continue;
6592 const TargetRegisterClass *OpRC =
6593 MRI.getRegClass(MI.getOperand(i).getReg());
6594 if (RI.hasVectorRegisters(OpRC)) {
6595 VRC = OpRC;
6596 } else {
6597 SRC = OpRC;
6598 }
6599 }
6600
6601 // If any of the operands are VGPR registers, then they all must be VGPRs;
6602 // otherwise we will create illegal VGPR->SGPR copies when legalizing
6603 // them.
6604 if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
6605 if (!VRC) {
6606 assert(SRC);
6607 if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
6608 VRC = &AMDGPU::VReg_1RegClass;
6609 } else
6610 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6611 ? RI.getEquivalentAGPRClass(SRC)
6612 : RI.getEquivalentVGPRClass(SRC);
6613 } else {
6614 VRC = RI.isAGPRClass(getOpRegClass(MI, 0))
6615 ? RI.getEquivalentAGPRClass(VRC)
6616 : RI.getEquivalentVGPRClass(VRC);
6617 }
6618 RC = VRC;
6619 } else {
6620 RC = SRC;
6621 }
6622
6623 // Update all the operands so they have the same type.
6624 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6625 MachineOperand &Op = MI.getOperand(I);
6626 if (!Op.isReg() || !Op.getReg().isVirtual())
6627 continue;
6628
6629 // MI is a PHI instruction.
6630 MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB();
6631 MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator();
6632
6633 // Avoid creating no-op copies with the same src and dst reg class. These
6634 // confuse some of the machine passes.
6635 legalizeGenericOperand(*InsertBB, Insert, RC, Op, MRI, MI.getDebugLoc());
6636 }
6637 }
6638
6639 // REG_SEQUENCE doesn't really require operand legalization, but if one has a
6640 // VGPR dest type and SGPR sources, insert copies so all operands are
6641 // VGPRs. This seems to help operand folding / the register coalescer.
6642 if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) {
6643 MachineBasicBlock *MBB = MI.getParent();
6644 const TargetRegisterClass *DstRC = getOpRegClass(MI, 0);
6645 if (RI.hasVGPRs(DstRC)) {
6646 // Update all the operands so they are VGPR register classes. These may
6647 // not be the same register class because REG_SEQUENCE supports mixing
6648 // subregister index types e.g. sub0_sub1 + sub2 + sub3
6649 for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
6650 MachineOperand &Op = MI.getOperand(I);
6651 if (!Op.isReg() || !Op.getReg().isVirtual())
6652 continue;
6653
6654 const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
6655 const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(OpRC);
6656 if (VRC == OpRC)
6657 continue;
6658
6659 legalizeGenericOperand(*MBB, MI, VRC, Op, MRI, MI.getDebugLoc());
6660 Op.setIsKill();
6661 }
6662 }
6663
6664 return CreatedBB;
6665 }
6666
6667 // Legalize INSERT_SUBREG
6668 // src0 must have the same register class as dst
6669 if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
6670 Register Dst = MI.getOperand(0).getReg();
6671 Register Src0 = MI.getOperand(1).getReg();
6672 const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
6673 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
6674 if (DstRC != Src0RC) {
6675 MachineBasicBlock *MBB = MI.getParent();
6676 MachineOperand &Op = MI.getOperand(1);
6677 legalizeGenericOperand(*MBB, MI, DstRC, Op, MRI, MI.getDebugLoc());
6678 }
6679 return CreatedBB;
6680 }
6681
6682 // Legalize SI_INIT_M0
6683 if (MI.getOpcode() == AMDGPU::SI_INIT_M0) {
6684 MachineOperand &Src = MI.getOperand(0);
6685 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6686 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6687 return CreatedBB;
6688 }
6689
6690 // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
6691 if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
6692 MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
6693 MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
6694 MI.getOpcode() == AMDGPU::S_WQM_B32 ||
6695 MI.getOpcode() == AMDGPU::S_WQM_B64) {
6696 MachineOperand &Src = MI.getOperand(1);
6697 if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
6698 Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
6699 return CreatedBB;
6700 }
6701
6702 // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
6703 //
6704 // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
6705 // scratch memory access. In both cases, the legalization never involves
6706 // conversion to the addr64 form.
6707 if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
6708 (isMUBUF(MI) || isMTBUF(MI)))) {
6709 int RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI)) ? AMDGPU::OpName::rsrc
6710 : AMDGPU::OpName::srsrc;
6711 MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
6712 if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
6713 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
6714
6715 int SampOpName = isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
6716 MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
6717 if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
6718 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
6719
6720 return CreatedBB;
6721 }
6722
6723 // Legalize SI_CALL
6724 if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
6725 MachineOperand *Dest = &MI.getOperand(0);
6726 if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
6727 // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and
6728 // following copies, we also need to move copies from and to physical
6729 // registers into the loop block.
6730 unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
6731 unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
6732
6733 // Also move the copies to physical registers into the loop block
6734 MachineBasicBlock &MBB = *MI.getParent();
6735 MachineBasicBlock::iterator Start(&MI);
6736 while (Start->getOpcode() != FrameSetupOpcode)
6737 --Start;
6738 MachineBasicBlock::iterator End(&MI);
6739 while (End->getOpcode() != FrameDestroyOpcode)
6740 ++End;
6741 // Also include following copies of the return value
6742 ++End;
6743 while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
6744 MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
6745 ++End;
6746 CreatedBB =
6747 loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
6748 }
6749 }
6750
6751 // Legalize s_sleep_var.
6752 if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
6753 const DebugLoc &DL = MI.getDebugLoc();
6754 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6755 int Src0Idx =
6756 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
6757 MachineOperand &Src0 = MI.getOperand(Src0Idx);
6758 BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
6759 .add(Src0);
6760 Src0.ChangeToRegister(Reg, false);
6761 return nullptr;
6762 }
6763
6764 // Legalize MUBUF instructions.
6765 bool isSoffsetLegal = true;
6766 int SoffsetIdx =
6767 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
6768 if (SoffsetIdx != -1) {
6769 MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
6770 if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
6771 !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
6772 isSoffsetLegal = false;
6773 }
6774 }
6775
6776 bool isRsrcLegal = true;
6777 int RsrcIdx =
6778 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
6779 if (RsrcIdx != -1) {
6780 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6781 if (Rsrc->isReg() && !RI.isSGPRClass(MRI.getRegClass(Rsrc->getReg()))) {
6782 isRsrcLegal = false;
6783 }
6784 }
6785
6786 // The operands are legal.
6787 if (isRsrcLegal && isSoffsetLegal)
6788 return CreatedBB;
6789
6790 if (!isRsrcLegal) {
6791 // Legalize a VGPR Rsrc
6792 //
6793 // If the instruction is _ADDR64, we can avoid a waterfall by extracting
6794 // the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
6795 // a zero-value SRsrc.
6796 //
6797 // If the instruction is _OFFSET (both idxen and offen disabled), and we
6798 // support ADDR64 instructions, we can convert to ADDR64 and do the same as
6799 // above.
6800 //
6801 // Otherwise we are on non-ADDR64 hardware, and/or we have
6802 // idxen/offen/bothen and we fall back to a waterfall loop.
6803
6804 MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
6805 MachineBasicBlock &MBB = *MI.getParent();
6806
6807 MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
6808 if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
6809 // This is already an ADDR64 instruction so we need to add the pointer
6810 // extracted from the resource descriptor to the current value of VAddr.
6811 Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6812 Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6813 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6814
6815 const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
6816 Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
6817 Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
6818
6819 unsigned RsrcPtr, NewSRsrc;
6820 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6821
6822 // NewVaddrLo = RsrcPtr:sub0 + VAddr:sub0
6823 const DebugLoc &DL = MI.getDebugLoc();
6824 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_CO_U32_e64), NewVAddrLo)
6825 .addDef(CondReg0)
6826 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6827 .addReg(VAddr->getReg(), 0, AMDGPU::sub0)
6828 .addImm(0);
6829
6830 // NewVaddrHi = RsrcPtr:sub1 + VAddr:sub1
6831 BuildMI(MBB, MI, DL, get(AMDGPU::V_ADDC_U32_e64), NewVAddrHi)
6832 .addDef(CondReg1, RegState::Dead)
6833 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6834 .addReg(VAddr->getReg(), 0, AMDGPU::sub1)
6835 .addReg(CondReg0, RegState::Kill)
6836 .addImm(0);
6837
6838 // NewVaddr = {NewVaddrHi, NewVaddrLo}
6839 BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr)
6840 .addReg(NewVAddrLo)
6841 .addImm(AMDGPU::sub0)
6842 .addReg(NewVAddrHi)
6843 .addImm(AMDGPU::sub1);
6844
6845 VAddr->setReg(NewVAddr);
6846 Rsrc->setReg(NewSRsrc);
6847 } else if (!VAddr && ST.hasAddr64()) {
6848 // This instruction is the _OFFSET variant, so we need to convert it to
6849 // ADDR64.
6850 assert(ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS &&
6851 "FIXME: Need to emit flat atomics here");
6852
6853 unsigned RsrcPtr, NewSRsrc;
6854 std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
6855
6856 Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
6857 MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
6858 MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
6859 MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6860 unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode());
6861
6862 // Atomics with return have an additional tied operand and are
6863 // missing some of the special bits.
6864 MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in);
6865 MachineInstr *Addr64;
6866
6867 if (!VDataIn) {
 6868 // Regular buffer load / store.
 6869 MachineInstrBuilder MIB =
 6870 BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6871 .add(*VData)
6872 .addReg(NewVAddr)
6873 .addReg(NewSRsrc)
6874 .add(*SOffset)
6875 .add(*Offset);
6876
6877 if (const MachineOperand *CPol =
6878 getNamedOperand(MI, AMDGPU::OpName::cpol)) {
6879 MIB.addImm(CPol->getImm());
6880 }
6881
6882 if (const MachineOperand *TFE =
6883 getNamedOperand(MI, AMDGPU::OpName::tfe)) {
6884 MIB.addImm(TFE->getImm());
6885 }
6886
6887 MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
6888
6889 MIB.cloneMemRefs(MI);
6890 Addr64 = MIB;
6891 } else {
6892 // Atomics with return.
6893 Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
6894 .add(*VData)
6895 .add(*VDataIn)
6896 .addReg(NewVAddr)
6897 .addReg(NewSRsrc)
6898 .add(*SOffset)
6899 .add(*Offset)
6900 .addImm(getNamedImmOperand(MI, AMDGPU::OpName::cpol))
6901 .cloneMemRefs(MI);
6902 }
6903
6904 MI.removeFromParent();
6905
 6906 // NewVaddr = {RsrcPtr:sub1, RsrcPtr:sub0}, i.e. the base pointer extracted from the Rsrc
6907 BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE),
6908 NewVAddr)
6909 .addReg(RsrcPtr, 0, AMDGPU::sub0)
6910 .addImm(AMDGPU::sub0)
6911 .addReg(RsrcPtr, 0, AMDGPU::sub1)
6912 .addImm(AMDGPU::sub1);
6913 } else {
6914 // Legalize a VGPR Rsrc and soffset together.
6915 if (!isSoffsetLegal) {
6916 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6917 CreatedBB =
6918 loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
6919 return CreatedBB;
6920 }
6921 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
6922 return CreatedBB;
6923 }
6924 }
6925
6926 // Legalize a VGPR soffset.
6927 if (!isSoffsetLegal) {
6928 MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
6929 CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
6930 return CreatedBB;
6931 }
6932 return CreatedBB;
6933}
6934
6935void SIInstrWorklist::insert(MachineInstr *MI) {
 6936 InstrList.insert(MI);
 6937 // Add MBUF instructions to the deferred list.
6938 int RsrcIdx =
6939 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
6940 if (RsrcIdx != -1) {
6941 DeferredList.insert(MI);
6942 }
6943}
6944
6945bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
 6946 return DeferredList.contains(MI);
6947}
6948
6949void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
 6950 MachineDominatorTree *MDT) const {
6951
6952 while (!Worklist.empty()) {
6953 MachineInstr &Inst = *Worklist.top();
6954 Worklist.erase_top();
6955 // Skip MachineInstr in the deferred list.
6956 if (Worklist.isDeferred(&Inst))
6957 continue;
6958 moveToVALUImpl(Worklist, MDT, Inst);
6959 }
6960
 6961 // The deferred list of instructions will be processed once
 6962 // all the MachineInstrs in the worklist are done.
6963 for (MachineInstr *Inst : Worklist.getDeferredList()) {
6964 moveToVALUImpl(Worklist, MDT, *Inst);
6965 assert(Worklist.empty() &&
6966 "Deferred MachineInstr are not supposed to re-populate worklist");
6967 }
6968}
6969
6970void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
 6971 MachineDominatorTree *MDT,
 6972 MachineInstr &Inst) const {
 6973
 6974 MachineBasicBlock *MBB = Inst.getParent();
 6975 if (!MBB)
 6976 return;
 6977 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
 6978 unsigned Opcode = Inst.getOpcode();
6979 unsigned NewOpcode = getVALUOp(Inst);
6980 // Handle some special cases
6981 switch (Opcode) {
6982 default:
6983 break;
6984 case AMDGPU::S_ADD_U64_PSEUDO:
6985 NewOpcode = AMDGPU::V_ADD_U64_PSEUDO;
6986 break;
6987 case AMDGPU::S_SUB_U64_PSEUDO:
6988 NewOpcode = AMDGPU::V_SUB_U64_PSEUDO;
6989 break;
6990 case AMDGPU::S_ADD_I32:
6991 case AMDGPU::S_SUB_I32: {
6992 // FIXME: The u32 versions currently selected use the carry.
6993 bool Changed;
6994 MachineBasicBlock *CreatedBBTmp = nullptr;
6995 std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
6996 if (Changed)
6997 return;
6998
6999 // Default handling
7000 break;
7001 }
7002
7003 case AMDGPU::S_MUL_U64:
 7004 // Split s_mul_u64 into 32-bit vector multiplications.
7005 splitScalarSMulU64(Worklist, Inst, MDT);
7006 Inst.eraseFromParent();
7007 return;
7008
7009 case AMDGPU::S_MUL_U64_U32_PSEUDO:
7010 case AMDGPU::S_MUL_I64_I32_PSEUDO:
7011 // This is a special case of s_mul_u64 where all the operands are either
7012 // zero extended or sign extended.
7013 splitScalarSMulPseudo(Worklist, Inst, MDT);
7014 Inst.eraseFromParent();
7015 return;
7016
7017 case AMDGPU::S_AND_B64:
7018 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
7019 Inst.eraseFromParent();
7020 return;
7021
7022 case AMDGPU::S_OR_B64:
7023 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
7024 Inst.eraseFromParent();
7025 return;
7026
7027 case AMDGPU::S_XOR_B64:
7028 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
7029 Inst.eraseFromParent();
7030 return;
7031
7032 case AMDGPU::S_NAND_B64:
7033 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
7034 Inst.eraseFromParent();
7035 return;
7036
7037 case AMDGPU::S_NOR_B64:
7038 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
7039 Inst.eraseFromParent();
7040 return;
7041
7042 case AMDGPU::S_XNOR_B64:
7043 if (ST.hasDLInsts())
7044 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
7045 else
7046 splitScalar64BitXnor(Worklist, Inst, MDT);
7047 Inst.eraseFromParent();
7048 return;
7049
7050 case AMDGPU::S_ANDN2_B64:
7051 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
7052 Inst.eraseFromParent();
7053 return;
7054
7055 case AMDGPU::S_ORN2_B64:
7056 splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
7057 Inst.eraseFromParent();
7058 return;
7059
7060 case AMDGPU::S_BREV_B64:
7061 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
7062 Inst.eraseFromParent();
7063 return;
7064
7065 case AMDGPU::S_NOT_B64:
7066 splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
7067 Inst.eraseFromParent();
7068 return;
7069
7070 case AMDGPU::S_BCNT1_I32_B64:
7071 splitScalar64BitBCNT(Worklist, Inst);
7072 Inst.eraseFromParent();
7073 return;
7074
7075 case AMDGPU::S_BFE_I64:
7076 splitScalar64BitBFE(Worklist, Inst);
7077 Inst.eraseFromParent();
7078 return;
7079
7080 case AMDGPU::S_FLBIT_I32_B64:
7081 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
7082 Inst.eraseFromParent();
7083 return;
7084 case AMDGPU::S_FF1_I32_B64:
7085 splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
7086 Inst.eraseFromParent();
7087 return;
7088
7089 case AMDGPU::S_LSHL_B32:
7090 if (ST.hasOnlyRevVALUShifts()) {
7091 NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
7092 swapOperands(Inst);
7093 }
7094 break;
7095 case AMDGPU::S_ASHR_I32:
7096 if (ST.hasOnlyRevVALUShifts()) {
7097 NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
7098 swapOperands(Inst);
7099 }
7100 break;
7101 case AMDGPU::S_LSHR_B32:
7102 if (ST.hasOnlyRevVALUShifts()) {
7103 NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
7104 swapOperands(Inst);
7105 }
7106 break;
7107 case AMDGPU::S_LSHL_B64:
7108 if (ST.hasOnlyRevVALUShifts()) {
7109 NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
7110 ? AMDGPU::V_LSHLREV_B64_pseudo_e64
7111 : AMDGPU::V_LSHLREV_B64_e64;
7112 swapOperands(Inst);
7113 }
7114 break;
7115 case AMDGPU::S_ASHR_I64:
7116 if (ST.hasOnlyRevVALUShifts()) {
7117 NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
7118 swapOperands(Inst);
7119 }
7120 break;
7121 case AMDGPU::S_LSHR_B64:
7122 if (ST.hasOnlyRevVALUShifts()) {
7123 NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
7124 swapOperands(Inst);
7125 }
7126 break;
7127
7128 case AMDGPU::S_ABS_I32:
7129 lowerScalarAbs(Worklist, Inst);
7130 Inst.eraseFromParent();
7131 return;
7132
7133 case AMDGPU::S_CBRANCH_SCC0:
7134 case AMDGPU::S_CBRANCH_SCC1: {
7135 // Clear unused bits of vcc
7136 Register CondReg = Inst.getOperand(1).getReg();
7137 bool IsSCC = CondReg == AMDGPU::SCC;
7138 Register VCC = RI.getVCC();
7139 Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
7140 unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
7141 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
7142 .addReg(EXEC)
7143 .addReg(IsSCC ? VCC : CondReg);
7144 Inst.removeOperand(1);
7145 } break;
7146
7147 case AMDGPU::S_BFE_U64:
7148 case AMDGPU::S_BFM_B64:
7149 llvm_unreachable("Moving this op to VALU not implemented");
7150
7151 case AMDGPU::S_PACK_LL_B32_B16:
7152 case AMDGPU::S_PACK_LH_B32_B16:
7153 case AMDGPU::S_PACK_HL_B32_B16:
7154 case AMDGPU::S_PACK_HH_B32_B16:
7155 movePackToVALU(Worklist, MRI, Inst);
7156 Inst.eraseFromParent();
7157 return;
7158
7159 case AMDGPU::S_XNOR_B32:
7160 lowerScalarXnor(Worklist, Inst);
7161 Inst.eraseFromParent();
7162 return;
7163
7164 case AMDGPU::S_NAND_B32:
7165 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
7166 Inst.eraseFromParent();
7167 return;
7168
7169 case AMDGPU::S_NOR_B32:
7170 splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
7171 Inst.eraseFromParent();
7172 return;
7173
7174 case AMDGPU::S_ANDN2_B32:
7175 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
7176 Inst.eraseFromParent();
7177 return;
7178
7179 case AMDGPU::S_ORN2_B32:
7180 splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
7181 Inst.eraseFromParent();
7182 return;
7183
7184 // TODO: remove as soon as everything is ready
7185 // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
7186 // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
7187 // can only be selected from the uniform SDNode.
7188 case AMDGPU::S_ADD_CO_PSEUDO:
7189 case AMDGPU::S_SUB_CO_PSEUDO: {
7190 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
7191 ? AMDGPU::V_ADDC_U32_e64
7192 : AMDGPU::V_SUBB_U32_e64;
7193 const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
7194
7195 Register CarryInReg = Inst.getOperand(4).getReg();
7196 if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
7197 Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
7198 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
7199 .addReg(CarryInReg);
7200 }
7201
7202 Register CarryOutReg = Inst.getOperand(1).getReg();
7203
7204 Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
7205 MRI.getRegClass(Inst.getOperand(0).getReg())));
7206 MachineInstr *CarryOp =
7207 BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
7208 .addReg(CarryOutReg, RegState::Define)
7209 .add(Inst.getOperand(2))
7210 .add(Inst.getOperand(3))
7211 .addReg(CarryInReg)
7212 .addImm(0);
7213 legalizeOperands(*CarryOp);
7214 MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
7215 addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
7216 Inst.eraseFromParent();
7217 }
7218 return;
7219 case AMDGPU::S_UADDO_PSEUDO:
7220 case AMDGPU::S_USUBO_PSEUDO: {
7221 const DebugLoc &DL = Inst.getDebugLoc();
7222 MachineOperand &Dest0 = Inst.getOperand(0);
7223 MachineOperand &Dest1 = Inst.getOperand(1);
7224 MachineOperand &Src0 = Inst.getOperand(2);
7225 MachineOperand &Src1 = Inst.getOperand(3);
7226
7227 unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
7228 ? AMDGPU::V_ADD_CO_U32_e64
7229 : AMDGPU::V_SUB_CO_U32_e64;
7230 const TargetRegisterClass *NewRC =
7231 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
7232 Register DestReg = MRI.createVirtualRegister(NewRC);
7233 MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
7234 .addReg(Dest1.getReg(), RegState::Define)
7235 .add(Src0)
7236 .add(Src1)
7237 .addImm(0); // clamp bit
7238
7239 legalizeOperands(*NewInstr, MDT);
7240 MRI.replaceRegWith(Dest0.getReg(), DestReg);
7241 addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
7242 Worklist);
7243 Inst.eraseFromParent();
7244 }
7245 return;
7246
7247 case AMDGPU::S_CSELECT_B32:
7248 case AMDGPU::S_CSELECT_B64:
7249 lowerSelect(Worklist, Inst, MDT);
7250 Inst.eraseFromParent();
7251 return;
7252 case AMDGPU::S_CMP_EQ_I32:
7253 case AMDGPU::S_CMP_LG_I32:
7254 case AMDGPU::S_CMP_GT_I32:
7255 case AMDGPU::S_CMP_GE_I32:
7256 case AMDGPU::S_CMP_LT_I32:
7257 case AMDGPU::S_CMP_LE_I32:
7258 case AMDGPU::S_CMP_EQ_U32:
7259 case AMDGPU::S_CMP_LG_U32:
7260 case AMDGPU::S_CMP_GT_U32:
7261 case AMDGPU::S_CMP_GE_U32:
7262 case AMDGPU::S_CMP_LT_U32:
7263 case AMDGPU::S_CMP_LE_U32:
7264 case AMDGPU::S_CMP_EQ_U64:
7265 case AMDGPU::S_CMP_LG_U64:
7266 case AMDGPU::S_CMP_LT_F32:
7267 case AMDGPU::S_CMP_EQ_F32:
7268 case AMDGPU::S_CMP_LE_F32:
7269 case AMDGPU::S_CMP_GT_F32:
7270 case AMDGPU::S_CMP_LG_F32:
7271 case AMDGPU::S_CMP_GE_F32:
7272 case AMDGPU::S_CMP_O_F32:
7273 case AMDGPU::S_CMP_U_F32:
7274 case AMDGPU::S_CMP_NGE_F32:
7275 case AMDGPU::S_CMP_NLG_F32:
7276 case AMDGPU::S_CMP_NGT_F32:
7277 case AMDGPU::S_CMP_NLE_F32:
7278 case AMDGPU::S_CMP_NEQ_F32:
7279 case AMDGPU::S_CMP_NLT_F32:
7280 case AMDGPU::S_CMP_LT_F16:
7281 case AMDGPU::S_CMP_EQ_F16:
7282 case AMDGPU::S_CMP_LE_F16:
7283 case AMDGPU::S_CMP_GT_F16:
7284 case AMDGPU::S_CMP_LG_F16:
7285 case AMDGPU::S_CMP_GE_F16:
7286 case AMDGPU::S_CMP_O_F16:
7287 case AMDGPU::S_CMP_U_F16:
7288 case AMDGPU::S_CMP_NGE_F16:
7289 case AMDGPU::S_CMP_NLG_F16:
7290 case AMDGPU::S_CMP_NGT_F16:
7291 case AMDGPU::S_CMP_NLE_F16:
7292 case AMDGPU::S_CMP_NEQ_F16:
7293 case AMDGPU::S_CMP_NLT_F16: {
7294 Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
7295 auto NewInstr =
7296 BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
7297 .setMIFlags(Inst.getFlags());
7298 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7299 AMDGPU::OpName::src0_modifiers) >= 0) {
7300 NewInstr
7301 .addImm(0) // src0_modifiers
7302 .add(Inst.getOperand(0)) // src0
7303 .addImm(0) // src1_modifiers
7304 .add(Inst.getOperand(1)) // src1
7305 .addImm(0); // clamp
7306 } else {
7307 NewInstr
7308 .add(Inst.getOperand(0))
7309 .add(Inst.getOperand(1));
7310 }
7311 legalizeOperands(*NewInstr, MDT);
7312 int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC, /*TRI=*/nullptr);
7313 MachineOperand SCCOp = Inst.getOperand(SCCIdx);
7314 addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
7315 Inst.eraseFromParent();
7316 return;
7317 }
7318 case AMDGPU::S_CVT_HI_F32_F16: {
7319 const DebugLoc &DL = Inst.getDebugLoc();
7320 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7321 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7322 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7323 .addImm(16)
7324 .add(Inst.getOperand(1));
7325 BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7326 .addImm(0) // src0_modifiers
7327 .addReg(TmpReg)
7328 .addImm(0) // clamp
7329 .addImm(0); // omod
7330
7331 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7332 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7333 Inst.eraseFromParent();
7334 return;
7335 }
7336 case AMDGPU::S_MINIMUM_F32:
7337 case AMDGPU::S_MAXIMUM_F32:
7338 case AMDGPU::S_MINIMUM_F16:
7339 case AMDGPU::S_MAXIMUM_F16: {
7340 const DebugLoc &DL = Inst.getDebugLoc();
7341 Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7342 MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7343 .addImm(0) // src0_modifiers
7344 .add(Inst.getOperand(1))
7345 .addImm(0) // src1_modifiers
7346 .add(Inst.getOperand(2))
7347 .addImm(0) // clamp
7348 .addImm(0); // omod
7349 MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
7350
7351 legalizeOperands(*NewInstr, MDT);
7352 addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
7353 Inst.eraseFromParent();
7354 return;
7355 }
7356 }
7357
7358 if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
7359 // We cannot move this instruction to the VALU, so we should try to
7360 // legalize its operands instead.
7361 legalizeOperands(Inst, MDT);
7362 return;
7363 }
7364 // Handle converting generic instructions like COPY-to-SGPR into
7365 // COPY-to-VGPR.
7366 if (NewOpcode == Opcode) {
7367 Register DstReg = Inst.getOperand(0).getReg();
7368 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
7369
7370 // If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
7371 // hope for the best.
7372 if (Inst.isCopy() && DstReg.isPhysical() &&
7373 RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
7374 // TODO: Only works for 32 bit registers.
7375 BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
7376 get(AMDGPU::V_READFIRSTLANE_B32), Inst.getOperand(0).getReg())
7377 .add(Inst.getOperand(1));
7378 Inst.eraseFromParent();
7379 return;
7380 }
7381
7382 if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
7383 NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
7384 // Instead of creating a copy where src and dst are the same register
7385 // class, we just replace all uses of dst with src. These kinds of
7386 // copies interfere with the heuristics MachineSink uses to decide
 7387 // whether or not to split a critical edge, since the pass assumes
7388 // that copies will end up as machine instructions and not be
7389 // eliminated.
7390 addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
7391 MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
7392 MRI.clearKillFlags(Inst.getOperand(1).getReg());
7393 Inst.getOperand(0).setReg(DstReg);
7394 // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
7395 // these are deleted later, but at -O0 it would leave a suspicious
7396 // looking illegal copy of an undef register.
7397 for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
7398 Inst.removeOperand(I);
7399 Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
7400 return;
7401 }
7402 Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
7403 MRI.replaceRegWith(DstReg, NewDstReg);
7404 legalizeOperands(Inst, MDT);
7405 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7406 return;
7407 }
7408
7409 // Use the new VALU Opcode.
7410 auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
7411 .setMIFlags(Inst.getFlags());
7412 if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
7413 // Intersperse VOP3 modifiers among the SALU operands.
7414 NewInstr->addOperand(Inst.getOperand(0));
7415 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7416 AMDGPU::OpName::src0_modifiers) >= 0)
7417 NewInstr.addImm(0);
7418 if (AMDGPU::hasNamedOperand(NewOpcode, AMDGPU::OpName::src0)) {
7419 MachineOperand Src = Inst.getOperand(1);
7420 if (AMDGPU::isTrue16Inst(NewOpcode) && ST.useRealTrue16Insts() &&
7421 Src.isReg() && RI.isVGPR(MRI, Src.getReg()))
7422 NewInstr.addReg(Src.getReg(), 0, AMDGPU::lo16);
7423 else
7424 NewInstr->addOperand(Src);
7425 }
7426
7427 if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
7428 // We are converting these to a BFE, so we need to add the missing
7429 // operands for the size and offset.
7430 unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
7431 NewInstr.addImm(0);
7432 NewInstr.addImm(Size);
7433 } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
7434 // The VALU version adds the second operand to the result, so insert an
7435 // extra 0 operand.
7436 NewInstr.addImm(0);
7437 } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
7438 const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
7439 // If we need to move this to VGPRs, we need to unpack the second
7440 // operand back into the 2 separate ones for bit offset and width.
7441 assert(OffsetWidthOp.isImm() &&
7442 "Scalar BFE is only implemented for constant width and offset");
7443 uint32_t Imm = OffsetWidthOp.getImm();
7444
7445 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
7446 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
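 // e.g. an immediate of 0x00100008 unpacks to Offset = 8 and BitWidth = 16.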
7447 NewInstr.addImm(Offset);
7448 NewInstr.addImm(BitWidth);
7449 } else {
7450 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7451 AMDGPU::OpName::src1_modifiers) >= 0)
7452 NewInstr.addImm(0);
7453 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
7454 NewInstr->addOperand(Inst.getOperand(2));
7455 if (AMDGPU::getNamedOperandIdx(NewOpcode,
7456 AMDGPU::OpName::src2_modifiers) >= 0)
7457 NewInstr.addImm(0);
7458 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
7459 NewInstr->addOperand(Inst.getOperand(3));
7460 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
7461 NewInstr.addImm(0);
7462 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
7463 NewInstr.addImm(0);
7464 if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
7465 NewInstr.addImm(0);
7466 }
7467 } else {
7468 // Just copy the SALU operands.
7469 for (const MachineOperand &Op : Inst.explicit_operands())
7470 NewInstr->addOperand(Op);
7471 }
7472
 7473 // Remove any references to SCC. Vector instructions can't read from it, and
 7474 // we're just about to add the implicit use / defs of VCC; we don't want
 7475 // both.
7476 for (MachineOperand &Op : Inst.implicit_operands()) {
7477 if (Op.getReg() == AMDGPU::SCC) {
7478 // Only propagate through live-def of SCC.
7479 if (Op.isDef() && !Op.isDead())
7480 addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
7481 if (Op.isUse())
7482 addSCCDefsToVALUWorklist(NewInstr, Worklist);
7483 }
7484 }
7485 Inst.eraseFromParent();
7486 Register NewDstReg;
7487 if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
7488 Register DstReg = NewInstr->getOperand(0).getReg();
7489 assert(DstReg.isVirtual());
7490 // Update the destination register class.
7491 const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
7492 assert(NewDstRC);
7493 NewDstReg = MRI.createVirtualRegister(NewDstRC);
7494 MRI.replaceRegWith(DstReg, NewDstReg);
7495 }
7496 fixImplicitOperands(*NewInstr);
7497 // Legalize the operands
7498 legalizeOperands(*NewInstr, MDT);
7499 if (NewDstReg)
7500 addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
7501}
7502
7503// Add/sub require special handling to deal with carry outs.
7504std::pair<bool, MachineBasicBlock *>
7505SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
7506 MachineDominatorTree *MDT) const {
7507 if (ST.hasAddNoCarry()) {
7508 // Assume there is no user of scc since we don't select this in that case.
7509 // Since scc isn't used, it doesn't really matter if the i32 or u32 variant
7510 // is used.
7511
 7512 MachineBasicBlock &MBB = *Inst.getParent();
 7513 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7514
7515 Register OldDstReg = Inst.getOperand(0).getReg();
7516 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7517
7518 unsigned Opc = Inst.getOpcode();
7519 assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
7520
7521 unsigned NewOpc = Opc == AMDGPU::S_ADD_I32 ?
7522 AMDGPU::V_ADD_U32_e64 : AMDGPU::V_SUB_U32_e64;
7523
7524 assert(Inst.getOperand(3).getReg() == AMDGPU::SCC);
7525 Inst.removeOperand(3);
7526
7527 Inst.setDesc(get(NewOpc));
7528 Inst.addOperand(MachineOperand::CreateImm(0)); // clamp bit
 7529 fixImplicitOperands(Inst);
 7530 MRI.replaceRegWith(OldDstReg, ResultReg);
7531 MachineBasicBlock *NewBB = legalizeOperands(Inst, MDT);
7532
7533 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7534 return std::pair(true, NewBB);
7535 }
7536
7537 return std::pair(false, nullptr);
7538}
7539
7540void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
7541 MachineDominatorTree *MDT) const {
7542
 7543 MachineBasicBlock &MBB = *Inst.getParent();
 7544 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7545 MachineBasicBlock::iterator MII = Inst;
7546 DebugLoc DL = Inst.getDebugLoc();
7547
7548 MachineOperand &Dest = Inst.getOperand(0);
7549 MachineOperand &Src0 = Inst.getOperand(1);
7550 MachineOperand &Src1 = Inst.getOperand(2);
7551 MachineOperand &Cond = Inst.getOperand(3);
7552
7553 Register CondReg = Cond.getReg();
7554 bool IsSCC = (CondReg == AMDGPU::SCC);
7555
7556 // If this is a trivial select where the condition is effectively not SCC
7557 // (CondReg is a source of copy to SCC), then the select is semantically
7558 // equivalent to copying CondReg. Hence, there is no need to create
7559 // V_CNDMASK, we can just use that and bail out.
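 // (With Src0 == -1 and Src1 == 0 the select computes "Cond ? all-ones : 0",
 // which over a lane-mask condition is just the value already in CondReg.)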
7560 if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
7561 (Src1.getImm() == 0)) {
7562 MRI.replaceRegWith(Dest.getReg(), CondReg);
7563 return;
7564 }
7565
7566 Register NewCondReg = CondReg;
7567 if (IsSCC) {
7568 const TargetRegisterClass *TC =
7569 RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
7570 NewCondReg = MRI.createVirtualRegister(TC);
7571
7572 // Now look for the closest SCC def if it is a copy
7573 // replacing the CondReg with the COPY source register
7574 bool CopyFound = false;
 7575 for (MachineInstr &CandI :
 7576 make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
7577 Inst.getParent()->rend())) {
7578 if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) !=
7579 -1) {
7580 if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
7581 BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
7582 .addReg(CandI.getOperand(1).getReg());
7583 CopyFound = true;
7584 }
7585 break;
7586 }
7587 }
7588 if (!CopyFound) {
7589 // SCC def is not a copy
7590 // Insert a trivial select instead of creating a copy, because a copy from
7591 // SCC would semantically mean just copying a single bit, but we may need
7592 // the result to be a vector condition mask that needs preserving.
7593 unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
7594 : AMDGPU::S_CSELECT_B32;
7595 auto NewSelect =
7596 BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
7597 NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
7598 }
7599 }
7600
7601 Register NewDestReg = MRI.createVirtualRegister(
7602 RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
7603 MachineInstr *NewInst;
7604 if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
7605 NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
7606 .addImm(0)
7607 .add(Src1) // False
7608 .addImm(0)
7609 .add(Src0) // True
7610 .addReg(NewCondReg);
7611 } else {
7612 NewInst =
7613 BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
7614 .add(Src1) // False
7615 .add(Src0) // True
7616 .addReg(NewCondReg);
7617 }
7618 MRI.replaceRegWith(Dest.getReg(), NewDestReg);
7619 legalizeOperands(*NewInst, MDT);
7620 addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
7621}
7622
7623void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
7624 MachineInstr &Inst) const {
 7625 MachineBasicBlock &MBB = *Inst.getParent();
 7626 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7627 MachineBasicBlock::iterator MII = Inst;
7628 DebugLoc DL = Inst.getDebugLoc();
7629
7630 MachineOperand &Dest = Inst.getOperand(0);
7631 MachineOperand &Src = Inst.getOperand(1);
7632 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7633 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7634
7635 unsigned SubOp = ST.hasAddNoCarry() ?
7636 AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_CO_U32_e32;
7637
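 // |x| is computed as max(x, 0 - x): negate Src into TmpReg, then take the
 // signed maximum of the original value and its negation.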
7638 BuildMI(MBB, MII, DL, get(SubOp), TmpReg)
7639 .addImm(0)
7640 .addReg(Src.getReg());
7641
7642 BuildMI(MBB, MII, DL, get(AMDGPU::V_MAX_I32_e64), ResultReg)
7643 .addReg(Src.getReg())
7644 .addReg(TmpReg);
7645
7646 MRI.replaceRegWith(Dest.getReg(), ResultReg);
7647 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
7648}
7649
7650void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
7651 MachineInstr &Inst) const {
 7652 MachineBasicBlock &MBB = *Inst.getParent();
 7653 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7654 MachineBasicBlock::iterator MII = Inst;
7655 const DebugLoc &DL = Inst.getDebugLoc();
7656
7657 MachineOperand &Dest = Inst.getOperand(0);
7658 MachineOperand &Src0 = Inst.getOperand(1);
7659 MachineOperand &Src1 = Inst.getOperand(2);
7660
7661 if (ST.hasDLInsts()) {
7662 Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7663 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
7664 legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
7665
7666 BuildMI(MBB, MII, DL, get(AMDGPU::V_XNOR_B32_e64), NewDest)
7667 .add(Src0)
7668 .add(Src1);
7669
7670 MRI.replaceRegWith(Dest.getReg(), NewDest);
7671 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7672 } else {
7673 // Using the identity !(x ^ y) == (!x ^ y) == (x ^ !y), we can
7674 // invert either source and then perform the XOR. If either source is a
7675 // scalar register, then we can leave the inversion on the scalar unit to
7676 // achieve a better distribution of scalar and vector instructions.
7677 bool Src0IsSGPR = Src0.isReg() &&
7678 RI.isSGPRClass(MRI.getRegClass(Src0.getReg()));
7679 bool Src1IsSGPR = Src1.isReg() &&
7680 RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
 7681 MachineInstr *Xor;
 7682 Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7683 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7684
7685 // Build a pair of scalar instructions and add them to the work list.
7686 // The next iteration over the work list will lower these to the vector
7687 // unit as necessary.
7688 if (Src0IsSGPR) {
7689 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src0);
7690 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7691 .addReg(Temp)
7692 .add(Src1);
7693 } else if (Src1IsSGPR) {
7694 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Temp).add(Src1);
7695 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), NewDest)
7696 .add(Src0)
7697 .addReg(Temp);
7698 } else {
7699 Xor = BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B32), Temp)
7700 .add(Src0)
7701 .add(Src1);
7702 MachineInstr *Not =
7703 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest).addReg(Temp);
7704 Worklist.insert(Not);
7705 }
7706
7707 MRI.replaceRegWith(Dest.getReg(), NewDest);
7708
7709 Worklist.insert(Xor);
7710
7711 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7712 }
7713}
7714
7715void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
7716 MachineInstr &Inst,
7717 unsigned Opcode) const {
 7718 MachineBasicBlock &MBB = *Inst.getParent();
 7719 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7720 MachineBasicBlock::iterator MII = Inst;
7721 const DebugLoc &DL = Inst.getDebugLoc();
7722
7723 MachineOperand &Dest = Inst.getOperand(0);
7724 MachineOperand &Src0 = Inst.getOperand(1);
7725 MachineOperand &Src1 = Inst.getOperand(2);
7726
7727 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7728 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7729
7730 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
7731 .add(Src0)
7732 .add(Src1);
7733
7734 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), NewDest)
7735 .addReg(Interm);
7736
7737 Worklist.insert(&Op);
7738 Worklist.insert(&Not);
7739
7740 MRI.replaceRegWith(Dest.getReg(), NewDest);
7741 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7742}
7743
7744void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
7745 MachineInstr &Inst,
7746 unsigned Opcode) const {
 7747 MachineBasicBlock &MBB = *Inst.getParent();
 7748 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7749 MachineBasicBlock::iterator MII = Inst;
7750 const DebugLoc &DL = Inst.getDebugLoc();
7751
7752 MachineOperand &Dest = Inst.getOperand(0);
7753 MachineOperand &Src0 = Inst.getOperand(1);
7754 MachineOperand &Src1 = Inst.getOperand(2);
7755
7756 Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7757 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
7758
7759 MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
7760 .add(Src1);
7761
7762 MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), NewDest)
7763 .add(Src0)
7764 .addReg(Interm);
7765
7766 Worklist.insert(&Not);
7767 Worklist.insert(&Op);
7768
7769 MRI.replaceRegWith(Dest.getReg(), NewDest);
7770 addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
7771}
7772
7773void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
7774 MachineInstr &Inst, unsigned Opcode,
7775 bool Swap) const {
 7776 MachineBasicBlock &MBB = *Inst.getParent();
 7777 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7778
7779 MachineOperand &Dest = Inst.getOperand(0);
7780 MachineOperand &Src0 = Inst.getOperand(1);
7781 DebugLoc DL = Inst.getDebugLoc();
7782
7783 MachineBasicBlock::iterator MII = Inst;
7784
7785 const MCInstrDesc &InstDesc = get(Opcode);
7786 const TargetRegisterClass *Src0RC = Src0.isReg() ?
7787 MRI.getRegClass(Src0.getReg()) :
7788 &AMDGPU::SGPR_32RegClass;
7789
7790 const TargetRegisterClass *Src0SubRC =
7791 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7792
7793 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7794 AMDGPU::sub0, Src0SubRC);
7795
7796 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
7797 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
7798 const TargetRegisterClass *NewDestSubRC =
7799 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
7800
7801 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
7802 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
7803
7804 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
7805 AMDGPU::sub1, Src0SubRC);
7806
7807 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
7808 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
7809
7810 if (Swap)
7811 std::swap(DestSub0, DestSub1);
7812
7813 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
7814 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7815 .addReg(DestSub0)
7816 .addImm(AMDGPU::sub0)
7817 .addReg(DestSub1)
7818 .addImm(AMDGPU::sub1);
7819
7820 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7821
7822 Worklist.insert(&LoHalf);
7823 Worklist.insert(&HiHalf);
7824
7825 // We don't need to legalizeOperands here because for a single operand, src0
7826 // will support any kind of input.
7827
7828 // Move all users of this moved value.
7829 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7830}
7831
 7832 // There is no vector equivalent of s_mul_u64. For this reason, we need to
 7833 // split the s_mul_u64 into 32-bit vector multiplications.
7834void SIInstrInfo::splitScalarSMulU64(SIInstrWorklist &Worklist,
7835 MachineInstr &Inst,
7836 MachineDominatorTree *MDT) const {
 7837 MachineBasicBlock &MBB = *Inst.getParent();
 7838 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7839
7840 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7841 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7842 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7843
7844 MachineOperand &Dest = Inst.getOperand(0);
7845 MachineOperand &Src0 = Inst.getOperand(1);
7846 MachineOperand &Src1 = Inst.getOperand(2);
7847 const DebugLoc &DL = Inst.getDebugLoc();
7848 MachineBasicBlock::iterator MII = Inst;
7849
7850 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7851 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7852 const TargetRegisterClass *Src0SubRC =
7853 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7854 if (RI.isSGPRClass(Src0SubRC))
7855 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
7856 const TargetRegisterClass *Src1SubRC =
7857 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7858 if (RI.isSGPRClass(Src1SubRC))
7859 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
7860
7861 // First, we extract the low 32-bit and high 32-bit values from each of the
7862 // operands.
7863 MachineOperand Op0L =
7864 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
7865 MachineOperand Op1L =
7866 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
7867 MachineOperand Op0H =
7868 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
7869 MachineOperand Op1H =
7870 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
7871
 7872 // The multiplication is done as follows:
7873 //
7874 // Op1H Op1L
7875 // * Op0H Op0L
7876 // --------------------
7877 // Op1H*Op0L Op1L*Op0L
7878 // + Op1H*Op0H Op1L*Op0H
7879 // -----------------------------------------
7880 // (Op1H*Op0L + Op1L*Op0H + carry) Op1L*Op0L
7881 //
7882 // We drop Op1H*Op0H because the result of the multiplication is a 64-bit
7883 // value and that would overflow.
7884 // The low 32-bit value is Op1L*Op0L.
7885 // The high 32-bit value is Op1H*Op0L + Op1L*Op0H + carry (from Op1L*Op0L).
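 // The "carry" term is simply the high 32 bits of Op1L*Op0L, which
 // V_MUL_HI_U32 produces directly below.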
7886
7887 Register Op1L_Op0H_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7888 MachineInstr *Op1L_Op0H =
7889 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1L_Op0H_Reg)
7890 .add(Op1L)
7891 .add(Op0H);
7892
7893 Register Op1H_Op0L_Reg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7894 MachineInstr *Op1H_Op0L =
7895 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), Op1H_Op0L_Reg)
7896 .add(Op1H)
7897 .add(Op0L);
7898
7899 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7900 MachineInstr *Carry =
7901 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_HI_U32_e64), CarryReg)
7902 .add(Op1L)
7903 .add(Op0L);
7904
7905 MachineInstr *LoHalf =
7906 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
7907 .add(Op1L)
7908 .add(Op0L);
7909
7910 Register AddReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7911 MachineInstr *Add = BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), AddReg)
7912 .addReg(Op1L_Op0H_Reg)
7913 .addReg(Op1H_Op0L_Reg);
7914
7915 MachineInstr *HiHalf =
7916 BuildMI(MBB, MII, DL, get(AMDGPU::V_ADD_U32_e32), DestSub1)
7917 .addReg(AddReg)
7918 .addReg(CarryReg);
7919
7920 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7921 .addReg(DestSub0)
7922 .addImm(AMDGPU::sub0)
7923 .addReg(DestSub1)
7924 .addImm(AMDGPU::sub1);
7925
7926 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7927
7928 // Try to legalize the operands in case we need to swap the order to keep it
7929 // valid.
7930 legalizeOperands(*Op1L_Op0H, MDT);
7931 legalizeOperands(*Op1H_Op0L, MDT);
7932 legalizeOperands(*Carry, MDT);
7933 legalizeOperands(*LoHalf, MDT);
7934 legalizeOperands(*Add, MDT);
7935 legalizeOperands(*HiHalf, MDT);
7936
7937 // Move all users of this moved value.
7938 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
7939}
7940
 7941 // Lower S_MUL_U64_U32_PSEUDO/S_MUL_I64_I32_PSEUDO into two 32-bit vector
 7942 // multiplications.
7943void SIInstrInfo::splitScalarSMulPseudo(SIInstrWorklist &Worklist,
7944 MachineInstr &Inst,
7945 MachineDominatorTree *MDT) const {
 7946 MachineBasicBlock &MBB = *Inst.getParent();
 7947 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
7948
7949 Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
7950 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7951 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7952
7953 MachineOperand &Dest = Inst.getOperand(0);
7954 MachineOperand &Src0 = Inst.getOperand(1);
7955 MachineOperand &Src1 = Inst.getOperand(2);
7956 const DebugLoc &DL = Inst.getDebugLoc();
7957 MachineBasicBlock::iterator MII = Inst;
7958
7959 const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
7960 const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
7961 const TargetRegisterClass *Src0SubRC =
7962 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
7963 if (RI.isSGPRClass(Src0SubRC))
7964 Src0SubRC = RI.getEquivalentVGPRClass(Src0SubRC);
7965 const TargetRegisterClass *Src1SubRC =
7966 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
7967 if (RI.isSGPRClass(Src1SubRC))
7968 Src1SubRC = RI.getEquivalentVGPRClass(Src1SubRC);
7969
7970 // First, we extract the low 32-bit and high 32-bit values from each of the
7971 // operands.
7972 MachineOperand Op0L =
7973 buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
7974 MachineOperand Op1L =
7975 buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
7976
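 // Because both operands are known to be zero- or sign-extended from 32 bits,
 // the 64-bit product is just {mul_hi, mul_lo} of the low halves: V_MUL_HI_U32
 // (or V_MUL_HI_I32 for the signed pseudo) supplies the high word and
 // V_MUL_LO_U32 the low word.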
7977 unsigned Opc = Inst.getOpcode();
7978 unsigned NewOpc = Opc == AMDGPU::S_MUL_U64_U32_PSEUDO
7979 ? AMDGPU::V_MUL_HI_U32_e64
7980 : AMDGPU::V_MUL_HI_I32_e64;
7981 MachineInstr *HiHalf =
7982 BuildMI(MBB, MII, DL, get(NewOpc), DestSub1).add(Op1L).add(Op0L);
7983
7984 MachineInstr *LoHalf =
7985 BuildMI(MBB, MII, DL, get(AMDGPU::V_MUL_LO_U32_e64), DestSub0)
7986 .add(Op1L)
7987 .add(Op0L);
7988
7989 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
7990 .addReg(DestSub0)
7991 .addImm(AMDGPU::sub0)
7992 .addReg(DestSub1)
7993 .addImm(AMDGPU::sub1);
7994
7995 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
7996
7997 // Try to legalize the operands in case we need to swap the order to keep it
7998 // valid.
7999 legalizeOperands(*HiHalf, MDT);
8000 legalizeOperands(*LoHalf, MDT);
8001
8002 // Move all users of this moved value.
8003 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8004}
8005
8006void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
8007 MachineInstr &Inst, unsigned Opcode,
8008 MachineDominatorTree *MDT) const {
 8009 MachineBasicBlock &MBB = *Inst.getParent();
 8010 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8011
8012 MachineOperand &Dest = Inst.getOperand(0);
8013 MachineOperand &Src0 = Inst.getOperand(1);
8014 MachineOperand &Src1 = Inst.getOperand(2);
8015 DebugLoc DL = Inst.getDebugLoc();
8016
8017 MachineBasicBlock::iterator MII = Inst;
8018
8019 const MCInstrDesc &InstDesc = get(Opcode);
8020 const TargetRegisterClass *Src0RC = Src0.isReg() ?
8021 MRI.getRegClass(Src0.getReg()) :
8022 &AMDGPU::SGPR_32RegClass;
8023
8024 const TargetRegisterClass *Src0SubRC =
8025 RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
8026 const TargetRegisterClass *Src1RC = Src1.isReg() ?
8027 MRI.getRegClass(Src1.getReg()) :
8028 &AMDGPU::SGPR_32RegClass;
8029
8030 const TargetRegisterClass *Src1SubRC =
8031 RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
8032
8033 MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8034 AMDGPU::sub0, Src0SubRC);
8035 MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8036 AMDGPU::sub0, Src1SubRC);
8037 MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
8038 AMDGPU::sub1, Src0SubRC);
8039 MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
8040 AMDGPU::sub1, Src1SubRC);
8041
8042 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8043 const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
8044 const TargetRegisterClass *NewDestSubRC =
8045 RI.getSubRegisterClass(NewDestRC, AMDGPU::sub0);
8046
8047 Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
8048 MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
8049 .add(SrcReg0Sub0)
8050 .add(SrcReg1Sub0);
8051
8052 Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
8053 MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
8054 .add(SrcReg0Sub1)
8055 .add(SrcReg1Sub1);
8056
8057 Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
8058 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
8059 .addReg(DestSub0)
8060 .addImm(AMDGPU::sub0)
8061 .addReg(DestSub1)
8062 .addImm(AMDGPU::sub1);
8063
8064 MRI.replaceRegWith(Dest.getReg(), FullDestReg);
8065
8066 Worklist.insert(&LoHalf);
8067 Worklist.insert(&HiHalf);
8068
8069 // Move all users of this moved value.
8070 addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
8071}
8072
8073void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
8074 MachineInstr &Inst,
8075 MachineDominatorTree *MDT) const {
 8076 MachineBasicBlock &MBB = *Inst.getParent();
 8077 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8078
8079 MachineOperand &Dest = Inst.getOperand(0);
8080 MachineOperand &Src0 = Inst.getOperand(1);
8081 MachineOperand &Src1 = Inst.getOperand(2);
8082 const DebugLoc &DL = Inst.getDebugLoc();
8083
8084 MachineBasicBlock::iterator MII = Inst;
8085
8086 const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
8087
8088 Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
8089
8090 MachineOperand* Op0;
8091 MachineOperand* Op1;
8092
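 // Apply the S_NOT to Src0 if it is an SGPR, otherwise to Src1, so the NOT is
 // more likely to stay legal on the scalar unit; only the XOR below is queued
 // to be moved to the VALU.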
8093 if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg())) {
8094 Op0 = &Src0;
8095 Op1 = &Src1;
8096 } else {
8097 Op0 = &Src1;
8098 Op1 = &Src0;
8099 }
8100
8101 BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
8102 .add(*Op0);
8103
8104 Register NewDest = MRI.createVirtualRegister(DestRC);
8105
8106 MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
8107 .addReg(Interm)
8108 .add(*Op1);
8109
8110 MRI.replaceRegWith(Dest.getReg(), NewDest);
8111
8112 Worklist.insert(&Xor);
8113}
8114
8115void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
8116 MachineInstr &Inst) const {
 8117 MachineBasicBlock &MBB = *Inst.getParent();
 8118 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8119
8120 MachineBasicBlock::iterator MII = Inst;
8121 const DebugLoc &DL = Inst.getDebugLoc();
8122
8123 MachineOperand &Dest = Inst.getOperand(0);
8124 MachineOperand &Src = Inst.getOperand(1);
8125
8126 const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64);
8127 const TargetRegisterClass *SrcRC = Src.isReg() ?
8128 MRI.getRegClass(Src.getReg()) :
8129 &AMDGPU::SGPR_32RegClass;
8130
8131 Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8132 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8133
8134 const TargetRegisterClass *SrcSubRC =
8135 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8136
8137 MachineOperand SrcRegSub0 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8138 AMDGPU::sub0, SrcSubRC);
8139 MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
8140 AMDGPU::sub1, SrcSubRC);
8141
8142 BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
8143
8144 BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
8145
8146 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8147
8148 // We don't need to legalize operands here. src0 for either instruction can be
8149 // an SGPR, and the second input is unused or determined here.
8150 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8151}
8152
8153void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
8154 MachineInstr &Inst) const {
 8155 MachineBasicBlock &MBB = *Inst.getParent();
 8156 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8157 MachineBasicBlock::iterator MII = Inst;
8158 const DebugLoc &DL = Inst.getDebugLoc();
8159
8160 MachineOperand &Dest = Inst.getOperand(0);
8161 uint32_t Imm = Inst.getOperand(2).getImm();
8162 uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
8163 uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
8164
8165 (void) Offset;
8166
8167 // Only sext_inreg cases handled.
8168 assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 &&
8169 Offset == 0 && "Not implemented");
8170
8171 if (BitWidth < 32) {
8172 Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8173 Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8174 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8175
8176 BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32_e64), MidRegLo)
8177 .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
8178 .addImm(0)
8179 .addImm(BitWidth);
8180
8181 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi)
8182 .addImm(31)
8183 .addReg(MidRegLo);
8184
8185 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8186 .addReg(MidRegLo)
8187 .addImm(AMDGPU::sub0)
8188 .addReg(MidRegHi)
8189 .addImm(AMDGPU::sub1);
8190
8191 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8192 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8193 return;
8194 }
8195
8196 MachineOperand &Src = Inst.getOperand(1);
8197 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8198 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
8199
8200 BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
8201 .addImm(31)
8202 .addReg(Src.getReg(), 0, AMDGPU::sub0);
8203
8204 BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), ResultReg)
8205 .addReg(Src.getReg(), 0, AMDGPU::sub0)
8206 .addImm(AMDGPU::sub0)
8207 .addReg(TmpReg)
8208 .addImm(AMDGPU::sub1);
8209
8210 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8211 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8212}
8213
8214void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
8215 MachineInstr &Inst, unsigned Opcode,
8216 MachineDominatorTree *MDT) const {
8217 // (S_FLBIT_I32_B64 hi:lo) ->
8218 // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
8219 // (S_FF1_I32_B64 hi:lo) ->
8220 // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
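 // The clamp bit set on the 32-bit add below makes it a saturating add (the
 // uaddsat above): if one half reports -1 (no bit found), adding 32 sticks at
 // 0xffffffff instead of wrapping, so the V_MIN selects the other half's count.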
8221
 8222 MachineBasicBlock &MBB = *Inst.getParent();
 8223 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
8224 MachineBasicBlock::iterator MII = Inst;
8225 const DebugLoc &DL = Inst.getDebugLoc();
8226
8227 MachineOperand &Dest = Inst.getOperand(0);
8228 MachineOperand &Src = Inst.getOperand(1);
8229
8230 const MCInstrDesc &InstDesc = get(Opcode);
8231
8232 bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
8233 unsigned OpcodeAdd =
8234 ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
8235
8236 const TargetRegisterClass *SrcRC =
8237 Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
8238 const TargetRegisterClass *SrcSubRC =
8239 RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
8240
8241 MachineOperand SrcRegSub0 =
8242 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
8243 MachineOperand SrcRegSub1 =
8244 buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
8245
8246 Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8247 Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8248 Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8249 Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8250
8251 BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
8252
8253 BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
8254
8255 BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
8256 .addReg(IsCtlz ? MidReg1 : MidReg2)
8257 .addImm(32)
8258 .addImm(1); // enable clamp
8259
8260 BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
8261 .addReg(MidReg3)
8262 .addReg(IsCtlz ? MidReg2 : MidReg1);
8263
8264 MRI.replaceRegWith(Dest.getReg(), MidReg4);
8265
8266 addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
8267}
8268
8269void SIInstrInfo::addUsersToMoveToVALUWorklist(
 8270 Register DstReg, MachineRegisterInfo &MRI,
 8271 SIInstrWorklist &Worklist) const {
8272 for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
8273 E = MRI.use_end(); I != E;) {
8274 MachineInstr &UseMI = *I->getParent();
8275
8276 unsigned OpNo = 0;
8277
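 // For the value-forwarding opcodes below, the class of the result (operand 0)
 // decides whether the user must itself move to the VALU; for any other
 // instruction, check the class required at the operand that actually uses
 // DstReg.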
8278 switch (UseMI.getOpcode()) {
8279 case AMDGPU::COPY:
8280 case AMDGPU::WQM:
8281 case AMDGPU::SOFT_WQM:
8282 case AMDGPU::STRICT_WWM:
8283 case AMDGPU::STRICT_WQM:
8284 case AMDGPU::REG_SEQUENCE:
8285 case AMDGPU::PHI:
8286 case AMDGPU::INSERT_SUBREG:
8287 break;
8288 default:
8289 OpNo = I.getOperandNo();
8290 break;
8291 }
8292
8293 if (!RI.hasVectorRegisters(getOpRegClass(UseMI, OpNo))) {
8294 Worklist.insert(&UseMI);
8295
8296 do {
8297 ++I;
8298 } while (I != E && I->getParent() == &UseMI);
8299 } else {
8300 ++I;
8301 }
8302 }
8303}
8304
8305void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
 8306 MachineRegisterInfo &MRI,
 8307 MachineInstr &Inst) const {
 8308 Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
 8309 MachineBasicBlock *MBB = Inst.getParent();
 8310 MachineOperand &Src0 = Inst.getOperand(1);
8311 MachineOperand &Src1 = Inst.getOperand(2);
8312 const DebugLoc &DL = Inst.getDebugLoc();
8313
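 // Each S_PACK_*_B32_B16 packs two 16-bit halves into a 32-bit result, with
 // src0 supplying the low half and src1 the high half; L/H selects the low or
 // high 16 bits of the corresponding source, e.g.
 // S_PACK_LL: result = (src1[15:0] << 16) | src0[15:0]
 // S_PACK_HH: result = (src1[31:16] << 16) | src0[31:16]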
8314 switch (Inst.getOpcode()) {
8315 case AMDGPU::S_PACK_LL_B32_B16: {
8316 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8317 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8318
8319 // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
8320 // 0.
8321 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8322 .addImm(0xffff);
8323
8324 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
8325 .addReg(ImmReg, RegState::Kill)
8326 .add(Src0);
8327
8328 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8329 .add(Src1)
8330 .addImm(16)
8331 .addReg(TmpReg, RegState::Kill);
8332 break;
8333 }
8334 case AMDGPU::S_PACK_LH_B32_B16: {
8335 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8336 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8337 .addImm(0xffff);
8338 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32_e64), ResultReg)
8339 .addReg(ImmReg, RegState::Kill)
8340 .add(Src0)
8341 .add(Src1);
8342 break;
8343 }
8344 case AMDGPU::S_PACK_HL_B32_B16: {
8345 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8346 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8347 .addImm(16)
8348 .add(Src0);
8349 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32_e64), ResultReg)
8350 .add(Src1)
8351 .addImm(16)
8352 .addReg(TmpReg, RegState::Kill);
8353 break;
8354 }
8355 case AMDGPU::S_PACK_HH_B32_B16: {
8356 Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8357 Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
8358 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
8359 .addImm(16)
8360 .add(Src0);
8361 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
8362 .addImm(0xffff0000);
8363 BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32_e64), ResultReg)
8364 .add(Src1)
8365 .addReg(ImmReg, RegState::Kill)
8366 .addReg(TmpReg, RegState::Kill);
8367 break;
8368 }
8369 default:
8370 llvm_unreachable("unhandled s_pack_* instruction");
8371 }
8372
8373 MachineOperand &Dest = Inst.getOperand(0);
8374 MRI.replaceRegWith(Dest.getReg(), ResultReg);
8375 addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
8376}
8377
8378void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
8379 MachineInstr &SCCDefInst,
8380 SIInstrWorklist &Worklist,
8381 Register NewCond) const {
8382
8383 // Ensure that def inst defines SCC, which is still live.
8384 assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
8385 !Op.isDead() && Op.getParent() == &SCCDefInst);
8386 SmallVector<MachineInstr *, 4> CopyToDelete;
8387 // This assumes that all the users of SCC are in the same block
8388 // as the SCC def.
8389 for (MachineInstr &MI : // Skip the def inst itself.
8390 make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
8391 SCCDefInst.getParent()->end())) {
8392 // Check if SCC is used first.
8393 int SCCIdx = MI.findRegisterUseOperandIdx(AMDGPU::SCC, &RI, false);
8394 if (SCCIdx != -1) {
8395 if (MI.isCopy()) {
8396 MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8397 Register DestReg = MI.getOperand(0).getReg();
8398
8399 MRI.replaceRegWith(DestReg, NewCond);
8400 CopyToDelete.push_back(&MI);
8401 } else {
8402
8403 if (NewCond.isValid())
8404 MI.getOperand(SCCIdx).setReg(NewCond);
8405
8406 Worklist.insert(&MI);
8407 }
8408 }
8409 // Exit if we find another SCC def.
8410 if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, &RI, false, false) != -1)
8411 break;
8412 }
8413 for (auto &Copy : CopyToDelete)
8414 Copy->eraseFromParent();
8415}
8416
8417// Instructions that use SCC may be converted to VALU instructions. When that
8418// happens, the SCC register is changed to VCC_LO. The instruction that defines
8419// SCC must be changed to an instruction that defines VCC. This function makes
8420// sure that the instruction that defines SCC is added to the moveToVALU
8421// worklist.
8422void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
8423 SIInstrWorklist &Worklist) const {
8424 // Look for a preceding instruction that either defines VCC or SCC. If VCC
8425 // then there is nothing to do because the defining instruction has been
8426 // converted to a VALU already. If SCC then that instruction needs to be
8427 // converted to a VALU.
8428 for (MachineInstr &MI :
8429 make_range(std::next(MachineBasicBlock::reverse_iterator(SCCUseInst)),
8430 SCCUseInst->getParent()->rend())) {
8431 if (MI.modifiesRegister(AMDGPU::VCC, &RI))
8432 break;
8433 if (MI.definesRegister(AMDGPU::SCC, &RI)) {
8434 Worklist.insert(&MI);
8435 break;
8436 }
8437 }
8438}
8439
8440const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
8441 const MachineInstr &Inst) const {
8442 const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0);
8443
8444 switch (Inst.getOpcode()) {
8445 // For target instructions, getOpRegClass just returns the virtual register
8446 // class associated with the operand, so we need to find an equivalent VGPR
8447 // register class in order to move the instruction to the VALU.
8448 case AMDGPU::COPY:
8449 case AMDGPU::PHI:
8450 case AMDGPU::REG_SEQUENCE:
8451 case AMDGPU::INSERT_SUBREG:
8452 case AMDGPU::WQM:
8453 case AMDGPU::SOFT_WQM:
8454 case AMDGPU::STRICT_WWM:
8455 case AMDGPU::STRICT_WQM: {
8456 const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
8457 if (RI.isAGPRClass(SrcRC)) {
8458 if (RI.isAGPRClass(NewDstRC))
8459 return nullptr;
8460
8461 switch (Inst.getOpcode()) {
8462 case AMDGPU::PHI:
8463 case AMDGPU::REG_SEQUENCE:
8464 case AMDGPU::INSERT_SUBREG:
8465 NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
8466 break;
8467 default:
8468 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8469 }
8470
8471 if (!NewDstRC)
8472 return nullptr;
8473 } else {
8474 if (RI.isVGPRClass(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
8475 return nullptr;
8476
8477 NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
8478 if (!NewDstRC)
8479 return nullptr;
8480 }
8481
8482 return NewDstRC;
8483 }
8484 default:
8485 return NewDstRC;
8486 }
8487}
8488
8489// Find the one SGPR operand we are allowed to use.
8490Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
8491 int OpIndices[3]) const {
8492 const MCInstrDesc &Desc = MI.getDesc();
8493
8494 // Find the one SGPR operand we are allowed to use.
8495 //
8496 // First we need to consider the instruction's operand requirements before
8497 // legalizing. Some operands are required to be SGPRs, such as implicit uses
8498 // of VCC, but we are still bound by the constant bus requirement to only use
8499 // one.
8500 //
8501 // If the operand's class is an SGPR, we can never move it.
8502
8503 Register SGPRReg = findImplicitSGPRRead(MI);
8504 if (SGPRReg)
8505 return SGPRReg;
8506
8507 Register UsedSGPRs[3] = {Register()};
8508 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8509
8510 for (unsigned i = 0; i < 3; ++i) {
8511 int Idx = OpIndices[i];
8512 if (Idx == -1)
8513 break;
8514
8515 const MachineOperand &MO = MI.getOperand(Idx);
8516 if (!MO.isReg())
8517 continue;
8518
8519 // Is this operand statically required to be an SGPR based on the operand
8520 // constraints?
8521 const TargetRegisterClass *OpRC =
8522 RI.getRegClass(Desc.operands()[Idx].RegClass);
8523 bool IsRequiredSGPR = RI.isSGPRClass(OpRC);
8524 if (IsRequiredSGPR)
8525 return MO.getReg();
8526
8527 // If this could be a VGPR or an SGPR, check the dynamic register class.
8528 Register Reg = MO.getReg();
8529 const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
8530 if (RI.isSGPRClass(RegRC))
8531 UsedSGPRs[i] = Reg;
8532 }
8533
8534 // We don't have a required SGPR operand, so we have a bit more freedom in
8535 // selecting operands to move.
8536
8537 // Try to select the most used SGPR. If an SGPR is equal to one of the
8538 // others, we choose that.
8539 //
8540 // e.g.
8541 // V_FMA_F32 v0, s0, s0, s0 -> No moves
8542 // V_FMA_F32 v0, s0, s1, s0 -> Move s1
8543
8544 // TODO: If some of the operands are 64-bit SGPRs and some 32, we should
8545 // prefer those.
8546
8547 if (UsedSGPRs[0]) {
8548 if (UsedSGPRs[0] == UsedSGPRs[1] || UsedSGPRs[0] == UsedSGPRs[2])
8549 SGPRReg = UsedSGPRs[0];
8550 }
8551
8552 if (!SGPRReg && UsedSGPRs[1]) {
8553 if (UsedSGPRs[1] == UsedSGPRs[2])
8554 SGPRReg = UsedSGPRs[1];
8555 }
8556
8557 return SGPRReg;
8558}
8559
8560MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
8561 unsigned OperandName) const {
8562 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName);
8563 if (Idx == -1)
8564 return nullptr;
8565
8566 return &MI.getOperand(Idx);
8567}
8568
8574 return (Format << 44) |
8575 (1ULL << 56) | // RESOURCE_LEVEL = 1
8576 (3ULL << 60); // OOB_SELECT = 3
8577 }
8578
8579 uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
8580 if (ST.isAmdHsaOS()) {
8581 // Set ATC = 1. GFX9 doesn't have this bit.
8583 RsrcDataFormat |= (1ULL << 56);
8584
8585 // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
8586 // Note that it also disables the TC L2 cache and therefore decreases performance.
8588 RsrcDataFormat |= (2ULL << 59);
8589 }
8590
8591 return RsrcDataFormat;
8592}
8593
8597 0xffffffff; // Size;
8598
8599 // GFX9 doesn't have ELEMENT_SIZE.
8601 uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
8602 Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
8603 }
8604
8605 // IndexStride = 64 for wave64, 32 for wave32 (field values 3 and 2, respectively).
8606 uint64_t IndexStride = ST.getWavefrontSize() == 64 ? 3 : 2;
8607 Rsrc23 |= IndexStride << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
8608
8609 // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
8610 // Clear them unless we want a huge stride.
8613 Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT;
8614
8615 return Rsrc23;
8616}
8617
8619 unsigned Opc = MI.getOpcode();
8620
8621 return isSMRD(Opc);
8622}
8623
8625 return get(Opc).mayLoad() &&
8626 (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
8627}
8628
8629Register SIInstrInfo::isStackAccess(const MachineInstr &MI,
8630 int &FrameIndex) const {
8631 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
8632 if (!Addr || !Addr->isFI())
8633 return Register();
8634
8635 assert(!MI.memoperands_empty() &&
8636 (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
8637
8638 FrameIndex = Addr->getIndex();
8639 return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
8640}
8641
8642Register SIInstrInfo::isSGPRStackAccess(const MachineInstr &MI,
8643 int &FrameIndex) const {
8644 const MachineOperand *Addr = getNamedOperand(MI, AMDGPU::OpName::addr);
8645 assert(Addr && Addr->isFI());
8646 FrameIndex = Addr->getIndex();
8647 return getNamedOperand(MI, AMDGPU::OpName::data)->getReg();
8648}
8649
8650Register SIInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
8651 int &FrameIndex) const {
8652 if (!MI.mayLoad())
8653 return Register();
8654
8655 if (isMUBUF(MI) || isVGPRSpill(MI))
8656 return isStackAccess(MI, FrameIndex);
8657
8658 if (isSGPRSpill(MI))
8659 return isSGPRStackAccess(MI, FrameIndex);
8660
8661 return Register();
8662}
8663
8664Register SIInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
8665 int &FrameIndex) const {
8666 if (!MI.mayStore())
8667 return Register();
8668
8669 if (isMUBUF(MI) || isVGPRSpill(MI))
8670 return isStackAccess(MI, FrameIndex);
8671
8672 if (isSGPRSpill(MI))
8673 return isSGPRStackAccess(MI, FrameIndex);
8674
8675 return Register();
8676}
8677
8678unsigned SIInstrInfo::getInstBundleSize(const MachineInstr &MI) const {
8679 unsigned Size = 0;
8681 MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
8682 while (++I != E && I->isInsideBundle()) {
8683 assert(!I->isBundle() && "No nested bundle!");
8685 }
8686
8687 return Size;
8688}
8689
8690unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
8691 unsigned Opc = MI.getOpcode();
8692 const MCInstrDesc &Desc = get(Opc);
8693 unsigned DescSize = Desc.getSize();
8694
8695 // If we have a definitive size, we can use it. Otherwise we need to inspect
8696 // the operands to know the size.
8697 if (isFixedSize(MI)) {
8698 unsigned Size = DescSize;
8699
8700 // If we hit the buggy offset, an extra nop will be inserted in MC, so
8701 // estimate the worst case.
8702 if (MI.isBranch() && ST.hasOffset3fBug())
8703 Size += 4;
8704
8705 return Size;
8706 }
8707
8708 // Instructions may have a 32-bit literal encoded after them. Check
8709 // operands that could ever be literals.
8710 if (isVALU(MI) || isSALU(MI)) {
8711 if (isDPP(MI))
8712 return DescSize;
8713 bool HasLiteral = false;
8714 for (int I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
8715 const MachineOperand &Op = MI.getOperand(I);
8716 const MCOperandInfo &OpInfo = Desc.operands()[I];
8717 if (!Op.isReg() && !isInlineConstant(Op, OpInfo)) {
8718 HasLiteral = true;
8719 break;
8720 }
8721 }
8722 return HasLiteral ? DescSize + 4 : DescSize;
8723 }
8724
8725 // Check whether we have extra NSA words.
8726 if (isMIMG(MI)) {
8727 int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
8728 if (VAddr0Idx < 0)
8729 return 8;
8730
8731 int RSrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
8732 return 8 + 4 * ((RSrcIdx - VAddr0Idx + 2) / 4);
8733 }
8734
8735 switch (Opc) {
8736 case TargetOpcode::BUNDLE:
8737 return getInstBundleSize(MI);
8738 case TargetOpcode::INLINEASM:
8739 case TargetOpcode::INLINEASM_BR: {
8740 const MachineFunction *MF = MI.getParent()->getParent();
8741 const char *AsmStr = MI.getOperand(0).getSymbolName();
8742 return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo(), &ST);
8743 }
8744 default:
8745 if (MI.isMetaInstruction())
8746 return 0;
8747 return DescSize;
8748 }
8749}
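// Sizing example (illustrative): a VOP2 such as
//   v_add_f32_e32 v0, 0x40490fdb, v1
// uses a 32-bit literal that is not an inline constant, so the VALU path above
// reports DescSize + 4 = 8 bytes, whereas the same instruction with an inline
// constant source (e.g. 1.0) stays at its 4-byte DescSize.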
8750
8752 if (!isFLAT(MI))
8753 return false;
8754
8755 if (MI.memoperands_empty())
8756 return true;
8757
8758 for (const MachineMemOperand *MMO : MI.memoperands()) {
8759 if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
8760 return true;
8761 }
8762 return false;
8763}
8764
8766 return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO;
8767}
8768
8770 MachineBasicBlock *IfEnd) const {
8772 assert(TI != IfEntry->end());
8773
8774 MachineInstr *Branch = &(*TI);
8775 MachineFunction *MF = IfEntry->getParent();
8777
8778 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
8779 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
8780 MachineInstr *SIIF =
8781 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
8782 .add(Branch->getOperand(0))
8783 .add(Branch->getOperand(1));
8784 MachineInstr *SIEND =
8785 BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF))
8786 .addReg(DstReg);
8787
8788 IfEntry->erase(TI);
8789 IfEntry->insert(IfEntry->end(), SIIF);
8790 IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND);
8791 }
8792}
8793
8795 MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
8797 // We expect 2 terminators, one conditional and one unconditional.
8798 assert(TI != LoopEnd->end());
8799
8800 MachineInstr *Branch = &(*TI);
8801 MachineFunction *MF = LoopEnd->getParent();
8803
8804 if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
8805
8806 Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
8807 Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
8808 MachineInstrBuilder HeaderPHIBuilder =
8809 BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
8810 for (MachineBasicBlock *PMBB : LoopEntry->predecessors()) {
8811 if (PMBB == LoopEnd) {
8812 HeaderPHIBuilder.addReg(BackEdgeReg);
8813 } else {
8814 Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
8815 materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
8816 ZeroReg, 0);
8817 HeaderPHIBuilder.addReg(ZeroReg);
8818 }
8819 HeaderPHIBuilder.addMBB(PMBB);
8820 }
8821 MachineInstr *HeaderPhi = HeaderPHIBuilder;
8822 MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(),
8823 get(AMDGPU::SI_IF_BREAK), BackEdgeReg)
8824 .addReg(DstReg)
8825 .add(Branch->getOperand(0));
8826 MachineInstr *SILOOP =
8827 BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP))
8828 .addReg(BackEdgeReg)
8829 .addMBB(LoopEntry);
8830
8831 LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
8832 LoopEnd->erase(TI);
8833 LoopEnd->insert(LoopEnd->end(), SIIFBREAK);
8834 LoopEnd->insert(LoopEnd->end(), SILOOP);
8835 }
8836}
8837
8840 static const std::pair<int, const char *> TargetIndices[] = {
8841 {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"},
8842 {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"},
8843 {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"},
8844 {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"},
8845 {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}};
8846 return ArrayRef(TargetIndices);
8847}
8848
8849/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The
8850/// post-RA version of misched uses CreateTargetMIHazardRecognizer.
8853 const ScheduleDAG *DAG) const {
8854 return new GCNHazardRecognizer(DAG->MF);
8855}
8856
8857/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer
8858/// pass.
8861 return new GCNHazardRecognizer(MF);
8862}
8863
8864// Called during:
8865// - pre-RA scheduling and post-RA scheduling
8868 const ScheduleDAGMI *DAG) const {
8869 // Borrowed from the ARM target.
8870 // We would like to restrict this hazard recognizer to only
8871 // post-RA scheduling; we can tell that we're post-RA because we don't
8872 // track VRegLiveness.
8873 if (!DAG->hasVRegLiveness())
8874 return new GCNHazardRecognizer(DAG->MF);
8876}
8877
8878std::pair<unsigned, unsigned>
8880 return std::pair(TF & MO_MASK, TF & ~MO_MASK);
8881}
8882
8885 static const std::pair<unsigned, const char *> TargetFlags[] = {
8886 { MO_GOTPCREL, "amdgpu-gotprel" },
8887 { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" },
8888 { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" },
8889 { MO_REL32_LO, "amdgpu-rel32-lo" },
8890 { MO_REL32_HI, "amdgpu-rel32-hi" },
8891 { MO_ABS32_LO, "amdgpu-abs32-lo" },
8892 { MO_ABS32_HI, "amdgpu-abs32-hi" },
8893 };
8894
8895 return ArrayRef(TargetFlags);
8896}
8897
8900 static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
8901 {
8902 {MONoClobber, "amdgpu-noclobber"},
8903 {MOLastUse, "amdgpu-last-use"},
8904 };
8905
8906 return ArrayRef(TargetFlags);
8907}
8908
8910 const MachineFunction &MF) const {
8912 assert(SrcReg.isVirtual());
8913 if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
8914 return AMDGPU::WWM_COPY;
8915
8916 return AMDGPU::COPY;
8917}
8918
8920 Register Reg) const {
8921 // We need to handle instructions which may be inserted during register
8922 // allocation to handle the prolog. The initial prolog instruction may have
8923 // been separated from the start of the block by spills and copies inserted
8924 // for the prolog. However, the insertions for scalar registers can
8925 // always be placed at the BB top as they are independent of the exec mask
8926 // value.
8927 bool IsNullOrVectorRegister = true;
8928 if (Reg) {
8929 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
8930 IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
8931 }
8932
8933 uint16_t Opcode = MI.getOpcode();
8934 // FIXME: Copies inserted in the block prolog for live-range split should also
8935 // be included.
8936 return IsNullOrVectorRegister &&
8937 (isSpill(Opcode) || (!MI.isTerminator() && Opcode != AMDGPU::COPY &&
8938 MI.modifiesRegister(AMDGPU::EXEC, &RI)));
8939}
8940
8944 const DebugLoc &DL,
8945 Register DestReg) const {
8946 if (ST.hasAddNoCarry())
8947 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
8948
8950 Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
8951 MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
8952
8953 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
8954 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
8955}
8956
8959 const DebugLoc &DL,
8960 Register DestReg,
8961 RegScavenger &RS) const {
8962 if (ST.hasAddNoCarry())
8963 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
8964
8965 // If available, prefer to use vcc.
8966 Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
8967 ? Register(RI.getVCC())
8969 *RI.getBoolRC(), I, /* RestoreAfter */ false,
8970 0, /* AllowSpill */ false);
8971
8972 // TODO: Users need to deal with this.
8973 if (!UnusedCarry.isValid())
8974 return MachineInstrBuilder();
8975
8976 return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_CO_U32_e64), DestReg)
8977 .addReg(UnusedCarry, RegState::Define | RegState::Dead);
8978}
8979
8980bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
8981 switch (Opcode) {
8982 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
8983 case AMDGPU::SI_KILL_I1_TERMINATOR:
8984 return true;
8985 default:
8986 return false;
8987 }
8988}
8989
8991 switch (Opcode) {
8992 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
8993 return get(AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR);
8994 case AMDGPU::SI_KILL_I1_PSEUDO:
8995 return get(AMDGPU::SI_KILL_I1_TERMINATOR);
8996 default:
8997 llvm_unreachable("invalid opcode, expected SI_KILL_*_PSEUDO");
8998 }
8999}
9000
9001bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
9002 return Imm <= getMaxMUBUFImmOffset(ST);
9003}
9004
9005unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
9006 // The GFX12 field is a 24-bit signed byte offset, of which only the non-negative range is usable here.
9007 const unsigned OffsetBits =
9008 ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
9009 return (1 << OffsetBits) - 1;
9010}
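// For reference (illustrative): the 12-bit field gives (1 << 12) - 1 = 4095,
// and the 23-bit non-negative GFX12 range gives (1 << 23) - 1 = 8388607.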
9011
9013 if (!ST.isWave32())
9014 return;
9015
9016 if (MI.isInlineAsm())
9017 return;
9018
9019 for (auto &Op : MI.implicit_operands()) {
9020 if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
9021 Op.setReg(AMDGPU::VCC_LO);
9022 }
9023}
9024
9026 if (!isSMRD(MI))
9027 return false;
9028
9029 // Check that it is using a buffer resource.
9030 int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sbase);
9031 if (Idx == -1) // e.g. s_memtime
9032 return false;
9033
9034 const auto RCID = MI.getDesc().operands()[Idx].RegClass;
9035 return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
9036}
9037
9038// Given Imm, split it into the values to put into the SOffset and ImmOffset
9039// fields in an MUBUF instruction. Return false if it is not possible (due to a
9040// hardware bug needing a workaround).
9041//
9042// The required alignment ensures that individual address components remain
9043// aligned if they are aligned to begin with. It also ensures that additional
9044// offsets within the given alignment can be added to the resulting ImmOffset.
9045bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
9046 uint32_t &ImmOffset, Align Alignment) const {
9047 const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
9048 const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
9049 uint32_t Overflow = 0;
9050
9051 if (Imm > MaxImm) {
9052 if (Imm <= MaxImm + 64) {
9053 // Use an SOffset inline constant for 4..64
9054 Overflow = Imm - MaxImm;
9055 Imm = MaxImm;
9056 } else {
9057 // Try to keep the same value in SOffset for adjacent loads, so that
9058 // the corresponding register contents can be re-used.
9059 //
9060 // Load values with all low-bits (except for alignment bits) set into
9061 // SOffset, so that a larger range of values can be covered using
9062 // s_movk_i32.
9063 //
9064 // Atomic operations fail to work correctly when individual address
9065 // components are unaligned, even if their sum is aligned.
9066 uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
9067 uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
9068 Imm = Low;
9069 Overflow = High - Alignment.value();
9070 }
9071 }
9072
9073 if (Overflow > 0) {
9074 // There is a hardware bug in SI and CI which prevents address clamping in
9075 // MUBUF instructions from working correctly with SOffsets. The immediate
9076 // offset is unaffected.
9078 return false;
9079
9080 // On some targets it is not possible to set an immediate in the SOffset field.
9081 if (ST.hasRestrictedSOffset())
9082 return false;
9083 }
9084
9085 ImmOffset = Imm;
9086 SOffset = Overflow;
9087 return true;
9088}
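// Worked example (illustrative, assuming MaxOffset = 4095, Alignment = 4 and
// no SOffset workarounds, so MaxImm = alignDown(4095, 4) = 4092):
//   Imm = 4100  -> ImmOffset = 4092, SOffset = 8     (inline-constant path)
//   Imm = 20000 -> ImmOffset = 3620, SOffset = 16380 (s_movk_i32 path)
// In both cases ImmOffset + SOffset reconstructs the original Imm.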
9089
9090// Depending on the used address space and instructions, some immediate offsets
9091// are allowed and some are not.
9092 // Pre-GFX12, flat instruction offsets can only be non-negative, while global and
9093// scratch instruction offsets can also be negative. On GFX12, offsets can be
9094// negative for all variants.
9095//
9096// There are several bugs related to these offsets:
9097// On gfx10.1, flat instructions that go into the global address space cannot
9098// use an offset.
9099//
9100// For scratch instructions, the address can be either an SGPR or a VGPR.
9101// The following offsets can be used, depending on the architecture (x means
9102// cannot be used):
9103// +----------------------------+------+------+
9104// | Address-Mode | SGPR | VGPR |
9105// +----------------------------+------+------+
9106// | gfx9 | | |
9107// | negative, 4-aligned offset | x | ok |
9108// | negative, unaligned offset | x | ok |
9109// +----------------------------+------+------+
9110// | gfx10 | | |
9111// | negative, 4-aligned offset | ok | ok |
9112// | negative, unaligned offset | ok | x |
9113// +----------------------------+------+------+
9114// | gfx10.3 | | |
9115// | negative, 4-aligned offset | ok | ok |
9116// | negative, unaligned offset | ok | ok |
9117// +----------------------------+------+------+
9118//
9119// This function ignores the addressing mode, so if an offset cannot be used in
9120// one addressing mode, it is considered illegal.
9121bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
9122 uint64_t FlatVariant) const {
9123 // TODO: Should 0 be special cased?
9124 if (!ST.hasFlatInstOffsets())
9125 return false;
9126
9127 if (ST.hasFlatSegmentOffsetBug() && FlatVariant == SIInstrFlags::FLAT &&
9128 (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
9129 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
9130 return false;
9131
9133 FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
9134 (Offset % 4) != 0) {
9135 return false;
9136 }
9137
9138 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9139 unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
9140 return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
9141}
9142
9143// See comment on SIInstrInfo::isLegalFLATOffset for what is legal and what not.
9144std::pair<int64_t, int64_t>
9145SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
9146 uint64_t FlatVariant) const {
9147 int64_t RemainderOffset = COffsetVal;
9148 int64_t ImmField = 0;
9149
9150 bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
9151 const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
9152
9153 if (AllowNegative) {
9154 // Use signed division by a power of two to truncate towards 0.
9155 int64_t D = 1LL << NumBits;
9156 RemainderOffset = (COffsetVal / D) * D;
9157 ImmField = COffsetVal - RemainderOffset;
9158
9160 FlatVariant == SIInstrFlags::FlatScratch && ImmField < 0 &&
9161 (ImmField % 4) != 0) {
9162 // Make ImmField a multiple of 4
9163 RemainderOffset += ImmField % 4;
9164 ImmField -= ImmField % 4;
9165 }
9166 } else if (COffsetVal >= 0) {
9167 ImmField = COffsetVal & maskTrailingOnes<uint64_t>(NumBits);
9168 RemainderOffset = COffsetVal - ImmField;
9169 }
9170
9171 assert(isLegalFLATOffset(ImmField, AddrSpace, FlatVariant));
9172 assert(RemainderOffset + ImmField == COffsetVal);
9173 return {ImmField, RemainderOffset};
9174}
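// Worked example (illustrative, assuming a 13-bit signed offset field, i.e.
// NumBits = 12 and D = 4096):
//   COffsetVal = 5000  -> ImmField = 904,  RemainderOffset = 4096
//   COffsetVal = -5000 -> ImmField = -904, RemainderOffset = -4096
// In the non-negative-only case, ImmField is simply COffsetVal & 4095.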
9175
9176bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
9177 if (ST.hasNegativeScratchOffsetBug() &&
9178 FlatVariant == SIInstrFlags::FlatScratch)
9179 return false;
9180
9181 return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
9182}
9183
9184static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
9185 switch (ST.getGeneration()) {
9186 default:
9187 break;
9190 return SIEncodingFamily::SI;
9193 return SIEncodingFamily::VI;
9200 }
9201 llvm_unreachable("Unknown subtarget generation!");
9202}
9203
9204bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
9205 switch(MCOp) {
9206 // These opcodes use indirect register addressing, so
9207 // they need special handling by codegen (currently missing).
9208 // Therefore it is too risky to allow these opcodes
9209 // to be selected by the DPP combiner or the SDWA peephole pass.
9210 case AMDGPU::V_MOVRELS_B32_dpp_gfx10:
9211 case AMDGPU::V_MOVRELS_B32_sdwa_gfx10:
9212 case AMDGPU::V_MOVRELD_B32_dpp_gfx10:
9213 case AMDGPU::V_MOVRELD_B32_sdwa_gfx10:
9214 case AMDGPU::V_MOVRELSD_B32_dpp_gfx10:
9215 case AMDGPU::V_MOVRELSD_B32_sdwa_gfx10:
9216 case AMDGPU::V_MOVRELSD_2_B32_dpp_gfx10:
9217 case AMDGPU::V_MOVRELSD_2_B32_sdwa_gfx10:
9218 return true;
9219 default:
9220 return false;
9221 }
9222}
9223
9224int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
9225 Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
9226
9227 unsigned Gen = subtargetEncodingFamily(ST);
9228
9229 if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
9232
9233 // Adjust the encoding family to GFX80 for D16 buffer instructions when the
9234 // subtarget has the UnpackedD16VMem feature.
9235 // TODO: remove this when we discard GFX80 encoding.
9236 if (ST.hasUnpackedD16VMem() && (get(Opcode).TSFlags & SIInstrFlags::D16Buf))
9238
9239 if (get(Opcode).TSFlags & SIInstrFlags::SDWA) {
9240 switch (ST.getGeneration()) {
9241 default:
9243 break;
9246 break;
9249 break;
9250 }
9251 }
9252
9253 if (isMAI(Opcode)) {
9254 int MFMAOp = AMDGPU::getMFMAEarlyClobberOp(Opcode);
9255 if (MFMAOp != -1)
9256 Opcode = MFMAOp;
9257 }
9258
9259 int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
9260
9261 // -1 means that Opcode is already a native instruction.
9262 if (MCOp == -1)
9263 return Opcode;
9264
9265 if (ST.hasGFX90AInsts()) {
9266 uint16_t NMCOp = (uint16_t)-1;
9267 if (ST.hasGFX940Insts())
9269 if (NMCOp == (uint16_t)-1)
9271 if (NMCOp == (uint16_t)-1)
9273 if (NMCOp != (uint16_t)-1)
9274 MCOp = NMCOp;
9275 }
9276
9277 // (uint16_t)-1 means that Opcode is a pseudo instruction that has
9278 // no encoding in the given subtarget generation.
9279 if (MCOp == (uint16_t)-1)
9280 return -1;
9281
9282 if (isAsmOnlyOpcode(MCOp))
9283 return -1;
9284
9285 return MCOp;
9286}
9287
9288static
9290 assert(RegOpnd.isReg());
9291 return RegOpnd.isUndef() ? TargetInstrInfo::RegSubRegPair() :
9292 getRegSubRegPair(RegOpnd);
9293}
9294
9297 assert(MI.isRegSequence());
9298 for (unsigned I = 0, E = (MI.getNumOperands() - 1)/ 2; I < E; ++I)
9299 if (MI.getOperand(1 + 2 * I + 1).getImm() == SubReg) {
9300 auto &RegOp = MI.getOperand(1 + 2 * I);
9301 return getRegOrUndef(RegOp);
9302 }
9304}
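// REG_SEQUENCE operand layout, for reference:
//   %vec:vreg_64 = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// Operand 0 is the def, operand 1 + 2*I is the I-th source register and
// operand 1 + 2*I + 1 is its subregister index, which is what the loop above
// compares against SubReg.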
9305
9306// Try to find the definition of reg:subreg in subreg-manipulation pseudos
9307// Following a subreg of reg:subreg isn't supported
9310 if (!RSR.SubReg)
9311 return false;
9312 switch (MI.getOpcode()) {
9313 default: break;
9314 case AMDGPU::REG_SEQUENCE:
9315 RSR = getRegSequenceSubReg(MI, RSR.SubReg);
9316 return true;
9317 // EXTRACT_SUBREG isn't supported as this would follow a subreg of a subreg
9318 case AMDGPU::INSERT_SUBREG:
9319 if (RSR.SubReg == (unsigned)MI.getOperand(3).getImm())
9320 // inserted the subreg we're looking for
9321 RSR = getRegOrUndef(MI.getOperand(2));
9322 else { // the subreg in the rest of the reg
9323 auto R1 = getRegOrUndef(MI.getOperand(1));
9324 if (R1.SubReg) // subreg of subreg isn't supported
9325 return false;
9326 RSR.Reg = R1.Reg;
9327 }
9328 return true;
9329 }
9330 return false;
9331}
9332
9335 assert(MRI.isSSA());
9336 if (!P.Reg.isVirtual())
9337 return nullptr;
9338
9339 auto RSR = P;
9340 auto *DefInst = MRI.getVRegDef(RSR.Reg);
9341 while (auto *MI = DefInst) {
9342 DefInst = nullptr;
9343 switch (MI->getOpcode()) {
9344 case AMDGPU::COPY:
9345 case AMDGPU::V_MOV_B32_e32: {
9346 auto &Op1 = MI->getOperand(1);
9347 if (Op1.isReg() && Op1.getReg().isVirtual()) {
9348 if (Op1.isUndef())
9349 return nullptr;
9350 RSR = getRegSubRegPair(Op1);
9351 DefInst = MRI.getVRegDef(RSR.Reg);
9352 }
9353 break;
9354 }
9355 default:
9356 if (followSubRegDef(*MI, RSR)) {
9357 if (!RSR.Reg)
9358 return nullptr;
9359 DefInst = MRI.getVRegDef(RSR.Reg);
9360 }
9361 }
9362 if (!DefInst)
9363 return MI;
9364 }
9365 return nullptr;
9366}
9367
9369 Register VReg,
9370 const MachineInstr &DefMI,
9371 const MachineInstr &UseMI) {
9372 assert(MRI.isSSA() && "Must be run on SSA");
9373
9374 auto *TRI = MRI.getTargetRegisterInfo();
9375 auto *DefBB = DefMI.getParent();
9376
9377 // Don't bother searching between blocks, although it is possible this block
9378 // doesn't modify exec.
9379 if (UseMI.getParent() != DefBB)
9380 return true;
9381
9382 const int MaxInstScan = 20;
9383 int NumInst = 0;
9384
9385 // Stop scan at the use.
9386 auto E = UseMI.getIterator();
9387 for (auto I = std::next(DefMI.getIterator()); I != E; ++I) {
9388 if (I->isDebugInstr())
9389 continue;
9390
9391 if (++NumInst > MaxInstScan)
9392 return true;
9393
9394 if (I->modifiesRegister(AMDGPU::EXEC, TRI))
9395 return true;
9396 }
9397
9398 return false;
9399}
9400
9402 Register VReg,
9403 const MachineInstr &DefMI) {
9404 assert(MRI.isSSA() && "Must be run on SSA");
9405
9406 auto *TRI = MRI.getTargetRegisterInfo();
9407 auto *DefBB = DefMI.getParent();
9408
9409 const int MaxUseScan = 10;
9410 int NumUse = 0;
9411
9412 for (auto &Use : MRI.use_nodbg_operands(VReg)) {
9413 auto &UseInst = *Use.getParent();
9414 // Don't bother searching between blocks, although it is possible this block
9415 // doesn't modify exec.
9416 if (UseInst.getParent() != DefBB || UseInst.isPHI())
9417 return true;
9418
9419 if (++NumUse > MaxUseScan)
9420 return true;
9421 }
9422
9423 if (NumUse == 0)
9424 return false;
9425
9426 const int MaxInstScan = 20;
9427 int NumInst = 0;
9428
9429 // Stop scan when we have seen all the uses.
9430 for (auto I = std::next(DefMI.getIterator()); ; ++I) {
9431 assert(I != DefBB->end());
9432
9433 if (I->isDebugInstr())
9434 continue;
9435
9436 if (++NumInst > MaxInstScan)
9437 return true;
9438
9439 for (const MachineOperand &Op : I->operands()) {
9440 // We don't check reg masks here as they're used only on calls:
9441 // 1. EXEC is only considered const within one BB
9442 // 2. A call should be a terminator instruction if present in a BB
9443
9444 if (!Op.isReg())
9445 continue;
9446
9447 Register Reg = Op.getReg();
9448 if (Op.isUse()) {
9449 if (Reg == VReg && --NumUse == 0)
9450 return false;
9451 } else if (TRI->regsOverlap(Reg, AMDGPU::EXEC))
9452 return true;
9453 }
9454 }
9455}
9456
9459 const DebugLoc &DL, Register Src, Register Dst) const {
9460 auto Cur = MBB.begin();
9461 if (Cur != MBB.end())
9462 do {
9463 if (!Cur->isPHI() && Cur->readsRegister(Dst, /*TRI=*/nullptr))
9464 return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
9465 ++Cur;
9466 } while (Cur != MBB.end() && Cur != LastPHIIt);
9467
9468 return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
9469 Dst);
9470}
9471
9474 const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const {
9475 if (InsPt != MBB.end() &&
9476 (InsPt->getOpcode() == AMDGPU::SI_IF ||
9477 InsPt->getOpcode() == AMDGPU::SI_ELSE ||
9478 InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
9479 InsPt->definesRegister(Src, /*TRI=*/nullptr)) {
9480 InsPt++;
9481 return BuildMI(MBB, InsPt, DL,
9482 get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
9483 : AMDGPU::S_MOV_B64_term),
9484 Dst)
9485 .addReg(Src, 0, SrcSubReg)
9486 .addReg(AMDGPU::EXEC, RegState::Implicit);
9487 }
9488 return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
9489 Dst);
9490}
9491
9492bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
9493
9496 MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
9497 VirtRegMap *VRM) const {
9498 // This is a bit of a hack (copied from AArch64). Consider this instruction:
9499 //
9500 // %0:sreg_32 = COPY $m0
9501 //
9502 // We explicitly chose SReg_32 for the virtual register so such a copy might
9503 // be eliminated by RegisterCoalescer. However, that may not be possible, and
9504 // %0 may even spill. We can't spill $m0 normally (it would require copying to
9505 // a numbered SGPR anyway), and since it is in the SReg_32 register class,
9506 // TargetInstrInfo::foldMemoryOperand() is going to try.
9507 // A similar issue also exists with spilling and reloading $exec registers.
9508 //
9509 // To prevent that, constrain the %0 register class here.
9510 if (isFullCopyInstr(MI)) {
9511 Register DstReg = MI.getOperand(0).getReg();
9512 Register SrcReg = MI.getOperand(1).getReg();
9513 if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
9514 (DstReg.isVirtual() != SrcReg.isVirtual())) {
9516 Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
9517 const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
9518 if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
9519 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
9520 return nullptr;
9521 } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
9522 MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
9523 return nullptr;
9524 }
9525 }
9526 }
9527
9528 return nullptr;
9529}
9530
9532 const MachineInstr &MI,
9533 unsigned *PredCost) const {
9534 if (MI.isBundle()) {
9536 MachineBasicBlock::const_instr_iterator E(MI.getParent()->instr_end());
9537 unsigned Lat = 0, Count = 0;
9538 for (++I; I != E && I->isBundledWithPred(); ++I) {
9539 ++Count;
9540 Lat = std::max(Lat, SchedModel.computeInstrLatency(&*I));
9541 }
9542 return Lat + Count - 1;
9543 }
9544
9545 return SchedModel.computeInstrLatency(&MI);
9546}
9547
9550 unsigned opcode = MI.getOpcode();
9551 if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
9552 auto IID = GI->getIntrinsicID();
9557
9558 switch (IID) {
9559 case Intrinsic::amdgcn_if:
9560 case Intrinsic::amdgcn_else:
9561 // FIXME: Uniform if second result
9562 break;
9563 }
9564
9566 }
9567
9568 // Loads from the private and flat address spaces are divergent, because
9569 // threads can execute the load instruction with the same inputs and get
9570 // different results.
9571 //
9572 // All other loads are not divergent, because if threads issue loads with the
9573 // same arguments, they will always get the same result.
9574 if (opcode == AMDGPU::G_LOAD) {
9575 if (MI.memoperands_empty())
9576 return InstructionUniformity::NeverUniform; // conservative assumption
9577
9578 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9579 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9580 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9581 })) {
9582 // At least one MMO in a non-global address space.
9584 }
9586 }
9587
9588 if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
9589 opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
9590 opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
9591 AMDGPU::isGenericAtomic(opcode)) {
9593 }
9595}
9596
9599
9600 if (isNeverUniform(MI))
9602
9603 unsigned opcode = MI.getOpcode();
9604 if (opcode == AMDGPU::V_READLANE_B32 ||
9605 opcode == AMDGPU::V_READFIRSTLANE_B32 ||
9606 opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
9608
9609 if (isCopyInstr(MI)) {
9610 const MachineOperand &srcOp = MI.getOperand(1);
9611 if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
9612 const TargetRegisterClass *regClass =
9613 RI.getPhysRegBaseClass(srcOp.getReg());
9616 }
9618 }
9619
9620 // GMIR handling
9621 if (MI.isPreISelOpcode())
9623
9624 // Atomics are divergent because they are executed sequentially: if an
9625 // atomic operation refers to the same address in each thread, then each
9626 // thread after the first sees the value written by the previous thread as
9627 // its original value.
9628
9629 if (isAtomic(MI))
9631
9632 // Loads from the private and flat address spaces are divergent, because
9633 // threads can execute the load instruction with the same inputs and get
9634 // different results.
9635 if (isFLAT(MI) && MI.mayLoad()) {
9636 if (MI.memoperands_empty())
9637 return InstructionUniformity::NeverUniform; // conservative assumption
9638
9639 if (llvm::any_of(MI.memoperands(), [](const MachineMemOperand *mmo) {
9640 return mmo->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
9641 mmo->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS;
9642 })) {
9643 // At least one MMO in a non-global address space.
9645 }
9646
9648 }
9649
9650 const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
9651 const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
9652
9653 // FIXME: It's conceptually broken to report this for an instruction, and not
9654 // a specific def operand. For inline asm in particular, there could be mixed
9655 // uniform and divergent results.
9656 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
9657 const MachineOperand &SrcOp = MI.getOperand(I);
9658 if (!SrcOp.isReg())
9659 continue;
9660
9661 Register Reg = SrcOp.getReg();
9662 if (!Reg || !SrcOp.readsReg())
9663 continue;
9664
9665 // If RegBank is null, the register is unassigned or an unallocatable
9666 // special register; both of these are scalars.
9667 const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
9668 if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
9670 }
9671
9672 // TODO: The uniformity check conditions above can be rearranged for more
9673 // readability.
9674
9675 // TODO: amdgcn.{ballot, [if]cmp} should be AlwaysUniform, but they are
9676 // currently turned into no-op COPYs by SelectionDAG ISel and are
9677 // therefore no longer recognizable.
9678
9680}
9681
9683 switch (MF.getFunction().getCallingConv()) {
9685 return 1;
9687 return 2;
9689 return 3;
9693 report_fatal_error("ds_ordered_count unsupported for this calling conv");
9696 case CallingConv::C:
9697 case CallingConv::Fast:
9698 default:
9699 // Assume other calling conventions are various compute callable functions
9700 return 0;
9701 }
9702}
9703
9705 Register &SrcReg2, int64_t &CmpMask,
9706 int64_t &CmpValue) const {
9707 if (!MI.getOperand(0).isReg() || MI.getOperand(0).getSubReg())
9708 return false;
9709
9710 switch (MI.getOpcode()) {
9711 default:
9712 break;
9713 case AMDGPU::S_CMP_EQ_U32:
9714 case AMDGPU::S_CMP_EQ_I32:
9715 case AMDGPU::S_CMP_LG_U32:
9716 case AMDGPU::S_CMP_LG_I32:
9717 case AMDGPU::S_CMP_LT_U32:
9718 case AMDGPU::S_CMP_LT_I32:
9719 case AMDGPU::S_CMP_GT_U32:
9720 case AMDGPU::S_CMP_GT_I32:
9721 case AMDGPU::S_CMP_LE_U32:
9722 case AMDGPU::S_CMP_LE_I32:
9723 case AMDGPU::S_CMP_GE_U32:
9724 case AMDGPU::S_CMP_GE_I32:
9725 case AMDGPU::S_CMP_EQ_U64:
9726 case AMDGPU::S_CMP_LG_U64:
9727 SrcReg = MI.getOperand(0).getReg();
9728 if (MI.getOperand(1).isReg()) {
9729 if (MI.getOperand(1).getSubReg())
9730 return false;
9731 SrcReg2 = MI.getOperand(1).getReg();
9732 CmpValue = 0;
9733 } else if (MI.getOperand(1).isImm()) {
9734 SrcReg2 = Register();
9735 CmpValue = MI.getOperand(1).getImm();
9736 } else {
9737 return false;
9738 }
9739 CmpMask = ~0;
9740 return true;
9741 case AMDGPU::S_CMPK_EQ_U32:
9742 case AMDGPU::S_CMPK_EQ_I32:
9743 case AMDGPU::S_CMPK_LG_U32:
9744 case AMDGPU::S_CMPK_LG_I32:
9745 case AMDGPU::S_CMPK_LT_U32:
9746 case AMDGPU::S_CMPK_LT_I32:
9747 case AMDGPU::S_CMPK_GT_U32:
9748 case AMDGPU::S_CMPK_GT_I32:
9749 case AMDGPU::S_CMPK_LE_U32:
9750 case AMDGPU::S_CMPK_LE_I32:
9751 case AMDGPU::S_CMPK_GE_U32:
9752 case AMDGPU::S_CMPK_GE_I32:
9753 SrcReg = MI.getOperand(0).getReg();
9754 SrcReg2 = Register();
9755 CmpValue = MI.getOperand(1).getImm();
9756 CmpMask = ~0;
9757 return true;
9758 }
9759
9760 return false;
9761}
9762
9764 Register SrcReg2, int64_t CmpMask,
9765 int64_t CmpValue,
9766 const MachineRegisterInfo *MRI) const {
9767 if (!SrcReg || SrcReg.isPhysical())
9768 return false;
9769
9770 if (SrcReg2 && !getFoldableImm(SrcReg2, *MRI, CmpValue))
9771 return false;
9772
9773 const auto optimizeCmpAnd = [&CmpInstr, SrcReg, CmpValue, MRI,
9774 this](int64_t ExpectedValue, unsigned SrcSize,
9775 bool IsReversible, bool IsSigned) -> bool {
9776 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9777 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9778 // s_cmp_ge_u32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9779 // s_cmp_ge_i32 (s_and_b32 $src, 1 << n), 1 << n => s_and_b32 $src, 1 << n
9780 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 1 << n => s_and_b64 $src, 1 << n
9781 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9782 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9783 // s_cmp_gt_u32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9784 // s_cmp_gt_i32 (s_and_b32 $src, 1 << n), 0 => s_and_b32 $src, 1 << n
9785 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 0 => s_and_b64 $src, 1 << n
9786 //
9787 // Signed ge/gt are not used for the sign bit.
9788 //
9789 // If result of the AND is unused except in the compare:
9790 // s_and_b(32|64) $src, 1 << n => s_bitcmp1_b(32|64) $src, n
9791 //
9792 // s_cmp_eq_u32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9793 // s_cmp_eq_i32 (s_and_b32 $src, 1 << n), 0 => s_bitcmp0_b32 $src, n
9794 // s_cmp_eq_u64 (s_and_b64 $src, 1 << n), 0 => s_bitcmp0_b64 $src, n
9795 // s_cmp_lg_u32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9796 // s_cmp_lg_i32 (s_and_b32 $src, 1 << n), 1 << n => s_bitcmp0_b32 $src, n
9797 // s_cmp_lg_u64 (s_and_b64 $src, 1 << n), 1 << n => s_bitcmp0_b64 $src, n
9798
9799 MachineInstr *Def = MRI->getUniqueVRegDef(SrcReg);
9800 if (!Def || Def->getParent() != CmpInstr.getParent())
9801 return false;
9802
9803 if (Def->getOpcode() != AMDGPU::S_AND_B32 &&
9804 Def->getOpcode() != AMDGPU::S_AND_B64)
9805 return false;
9806
9807 int64_t Mask;
9808 const auto isMask = [&Mask, SrcSize](const MachineOperand *MO) -> bool {
9809 if (MO->isImm())
9810 Mask = MO->getImm();
9811 else if (!getFoldableImm(MO, Mask))
9812 return false;
9813 Mask &= maxUIntN(SrcSize);
9814 return isPowerOf2_64(Mask);
9815 };
9816
9817 MachineOperand *SrcOp = &Def->getOperand(1);
9818 if (isMask(SrcOp))
9819 SrcOp = &Def->getOperand(2);
9820 else if (isMask(&Def->getOperand(2)))
9821 SrcOp = &Def->getOperand(1);
9822 else
9823 return false;
9824
9825 unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
9826 if (IsSigned && BitNo == SrcSize - 1)
9827 return false;
9828
9829 ExpectedValue <<= BitNo;
9830
9831 bool IsReversedCC = false;
9832 if (CmpValue != ExpectedValue) {
9833 if (!IsReversible)
9834 return false;
9835 IsReversedCC = CmpValue == (ExpectedValue ^ Mask);
9836 if (!IsReversedCC)
9837 return false;
9838 }
9839
9840 Register DefReg = Def->getOperand(0).getReg();
9841 if (IsReversedCC && !MRI->hasOneNonDBGUse(DefReg))
9842 return false;
9843
9844 for (auto I = std::next(Def->getIterator()), E = CmpInstr.getIterator();
9845 I != E; ++I) {
9846 if (I->modifiesRegister(AMDGPU::SCC, &RI) ||
9847 I->killsRegister(AMDGPU::SCC, &RI))
9848 return false;
9849 }
9850
9851 MachineOperand *SccDef =
9852 Def->findRegisterDefOperand(AMDGPU::SCC, /*TRI=*/nullptr);
9853 SccDef->setIsDead(false);
9854 CmpInstr.eraseFromParent();
9855
9856 if (!MRI->use_nodbg_empty(DefReg)) {
9857 assert(!IsReversedCC);
9858 return true;
9859 }
9860
9861 // Replace AND with unused result with a S_BITCMP.
9862 MachineBasicBlock *MBB = Def->getParent();
9863
9864 unsigned NewOpc = (SrcSize == 32) ? IsReversedCC ? AMDGPU::S_BITCMP0_B32
9865 : AMDGPU::S_BITCMP1_B32
9866 : IsReversedCC ? AMDGPU::S_BITCMP0_B64
9867 : AMDGPU::S_BITCMP1_B64;
9868
9869 BuildMI(*MBB, Def, Def->getDebugLoc(), get(NewOpc))
9870 .add(*SrcOp)
9871 .addImm(BitNo);
9872 Def->eraseFromParent();
9873
9874 return true;
9875 };
9876
9877 switch (CmpInstr.getOpcode()) {
9878 default:
9879 break;
9880 case AMDGPU::S_CMP_EQ_U32:
9881 case AMDGPU::S_CMP_EQ_I32:
9882 case AMDGPU::S_CMPK_EQ_U32:
9883 case AMDGPU::S_CMPK_EQ_I32:
9884 return optimizeCmpAnd(1, 32, true, false);
9885 case AMDGPU::S_CMP_GE_U32:
9886 case AMDGPU::S_CMPK_GE_U32:
9887 return optimizeCmpAnd(1, 32, false, false);
9888 case AMDGPU::S_CMP_GE_I32:
9889 case AMDGPU::S_CMPK_GE_I32:
9890 return optimizeCmpAnd(1, 32, false, true);
9891 case AMDGPU::S_CMP_EQ_U64:
9892 return optimizeCmpAnd(1, 64, true, false);
9893 case AMDGPU::S_CMP_LG_U32:
9894 case AMDGPU::S_CMP_LG_I32:
9895 case AMDGPU::S_CMPK_LG_U32:
9896 case AMDGPU::S_CMPK_LG_I32:
9897 return optimizeCmpAnd(0, 32, true, false);
9898 case AMDGPU::S_CMP_GT_U32:
9899 case AMDGPU::S_CMPK_GT_U32:
9900 return optimizeCmpAnd(0, 32, false, false);
9901 case AMDGPU::S_CMP_GT_I32:
9902 case AMDGPU::S_CMPK_GT_I32:
9903 return optimizeCmpAnd(0, 32, false, true);
9904 case AMDGPU::S_CMP_LG_U64:
9905 return optimizeCmpAnd(0, 64, true, false);
9906 }
9907
9908 return false;
9909}
9910
9912 unsigned OpName) const {
9913 if (!ST.needsAlignedVGPRs())
9914 return;
9915
9916 int OpNo = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
9917 if (OpNo < 0)
9918 return;
9919 MachineOperand &Op = MI.getOperand(OpNo);
9920 if (getOpSize(MI, OpNo) > 4)
9921 return;
9922
9923 // Add an implicit aligned super-reg to force alignment on the data operand.
9924 const DebugLoc &DL = MI.getDebugLoc();
9925 MachineBasicBlock *BB = MI.getParent();
9927 Register DataReg = Op.getReg();
9928 bool IsAGPR = RI.isAGPR(MRI, DataReg);
9929 Register Undef = MRI.createVirtualRegister(
9930 IsAGPR ? &AMDGPU::AGPR_32RegClass : &AMDGPU::VGPR_32RegClass);
9931 BuildMI(*BB, MI, DL, get(AMDGPU::IMPLICIT_DEF), Undef);
9932 Register NewVR =
9933 MRI.createVirtualRegister(IsAGPR ? &AMDGPU::AReg_64_Align2RegClass
9934 : &AMDGPU::VReg_64_Align2RegClass);
9935 BuildMI(*BB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewVR)
9936 .addReg(DataReg, 0, Op.getSubReg())
9937 .addImm(AMDGPU::sub0)
9938 .addReg(Undef)
9939 .addImm(AMDGPU::sub1);
9940 Op.setReg(NewVR);
9941 Op.setSubReg(AMDGPU::sub0);
9942 MI.addOperand(MachineOperand::CreateReg(NewVR, false, true));
9943}
unsigned SubReg
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
unsigned RegSize
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
amdgpu AMDGPU Register Bank Select
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
static bool isUndef(ArrayRef< int > Mask)
IRTranslator LLVM IR MI
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
uint64_t High
#define P(N)
TargetInstrInfo::RegSubRegPair RegSubRegPair
R600 Clause Merge
const SmallVectorImpl< MachineOperand > MachineBasicBlock * TBB
const SmallVectorImpl< MachineOperand > & Cond
This file declares the machine register scavenger class.
static cl::opt< bool > Fix16BitCopies("amdgpu-fix-16-bit-physreg-copies", cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), cl::init(true), cl::ReallyHidden)
static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RC, bool Forward)
static void indirectCopyToAGPR(const SIInstrInfo &TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, RegScavenger &RS, bool RegsOverlap, Register ImpDefSuperReg=Register(), Register ImpUseSuperReg=Register())
Handle copying from SGPR to AGPR, or from AGPR to AGPR on GFX908.
static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize)
static bool compareMachineOp(const MachineOperand &Op0, const MachineOperand &Op1)
static bool isStride64(unsigned Opc)
static std::tuple< unsigned, unsigned > extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc)
static bool followSubRegDef(MachineInstr &MI, TargetInstrInfo::RegSubRegPair &RSR)
static unsigned getIndirectSGPRWriteMovRelPseudo64(unsigned VecSize)
static unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillRestoreOpcode(unsigned Size)
static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig)
static bool offsetsDoNotOverlap(LocationSize WidthA, int OffsetA, LocationSize WidthB, int OffsetB)
static unsigned getWWMRegSpillSaveOpcode(unsigned Size, bool IsVectorSuperClass)
static bool memOpsHaveSameBaseOperands(ArrayRef< const MachineOperand * > BaseOps1, ArrayRef< const MachineOperand * > BaseOps2)
static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, bool IsVectorSuperClass)
static const TargetRegisterClass * adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI, const MachineRegisterInfo &MRI, const MCInstrDesc &TID, unsigned RCID, bool IsAllocatable)
static unsigned getVectorRegSpillSaveOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI)
static unsigned getAGPRSpillSaveOpcode(unsigned Size)
static bool resultDependsOnExec(const MachineInstr &MI)
static bool getFoldableImm(Register Reg, const MachineRegisterInfo &MRI, int64_t &Imm, MachineInstr **DefMI=nullptr)
static unsigned getIndirectVGPRWriteMovRelPseudoOpc(unsigned VecSize)
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST)
static void preserveCondRegFlags(MachineOperand &CondReg, const MachineOperand &OrigCond)
static Register findImplicitSGPRRead(const MachineInstr &MI)
static cl::opt< unsigned > BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)"))
static void updateLiveVariables(LiveVariables *LV, MachineInstr &MI, MachineInstr &NewMI)
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, ArrayRef< const MachineOperand * > BaseOps1, const MachineInstr &MI2, ArrayRef< const MachineOperand * > BaseOps2)
static bool nodesHaveSameOperandValue(SDNode *N0, SDNode *N1, unsigned OpName)
Returns true if both nodes have the same value for the given operand Op, or if both nodes do not have...
Definition: SIInstrInfo.cpp:85
static unsigned getSGPRSpillRestoreOpcode(unsigned Size)
static unsigned getSGPRSpillSaveOpcode(unsigned Size)
static void emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL, ArrayRef< MachineOperand * > ScalarOps)
static unsigned getVGPRSpillSaveOpcode(unsigned Size)
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, const char *Msg="illegal VGPR to SGPR copy")
static MachineInstr * swapRegAndNonRegOperand(MachineInstr &MI, MachineOperand &RegOp, MachineOperand &NonRegOp)
static bool shouldReadExec(const MachineInstr &MI)
static TargetInstrInfo::RegSubRegPair getRegOrUndef(const MachineOperand &RegOpnd)
static constexpr unsigned ModifierOpNames[]
static bool changesVGPRIndexingMode(const MachineInstr &MI)
static bool isSubRegOf(const SIRegisterInfo &TRI, const MachineOperand &SuperVec, const MachineOperand &SubReg)
static unsigned getAVSpillSaveOpcode(unsigned Size)
static unsigned getNumOperandsNoGlue(SDNode *Node)
Definition: SIInstrInfo.cpp:76
static bool canRemat(const MachineInstr &MI)
static MachineBasicBlock * loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, ArrayRef< MachineOperand * > ScalarOps, MachineDominatorTree *MDT, MachineBasicBlock::iterator Begin=nullptr, MachineBasicBlock::iterator End=nullptr)
static unsigned getAVSpillRestoreOpcode(unsigned Size)
static unsigned getVGPRSpillRestoreOpcode(unsigned Size)
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
bool IsDead
static bool isImm(const MachineOperand &MO, MachineRegisterInfo *MRI)
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition: Value.cpp:469
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool has16BitInsts() const
bool hasTrue16BitInsts() const
Return true if the subtarget supports True16 instructions.
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
Class for arbitrary precision integers.
Definition: APInt.h:76
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:805
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1513
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:168
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
uint64_t getZExtValue() const
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:264
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:358
bool useVGPRIndexMode() const
bool hasSDWAOmod() const
Definition: GCNSubtarget.h:739
bool hasA16() const
bool hasSDWAScalar() const
Definition: GCNSubtarget.h:743
bool hasScalarCompareEq64() const
Definition: GCNSubtarget.h:954
bool hasOnlyRevVALUShifts() const
Definition: GCNSubtarget.h:382
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:618
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:759
bool hasMAIInsts() const
Definition: GCNSubtarget.h:809
bool hasMFMAInlineLiteralBug() const
bool hasNegativeScratchOffsetBug() const
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasPkMovB32() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
bool hasR128A16() const
bool hasOffset3fBug() const
bool hasGetPCZeroExtension() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:269
const AMDGPURegisterBankInfo * getRegBankInfo() const override
Definition: GCNSubtarget.h:289
bool hasSDWAOutModsVOPC() const
Definition: GCNSubtarget.h:755
bool hasRestrictedSOffset() const
bool hasFlatSegmentOffsetBug() const
Definition: GCNSubtarget.h:674
bool hasGFX940Insts() const
bool hasSDWASdst() const
Definition: GCNSubtarget.h:747
bool hasMovB64() const
bool isWave32() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:335
bool hasNegativeUnalignedScratchOffsetBug() const
bool hasG16() const
unsigned getNSAMaxSize(bool HasSampler=false) const
Generation getGeneration() const
Definition: GCNSubtarget.h:308
bool hasVOP3Literal() const
Definition: GCNSubtarget.h:879
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:726
bool hasAddr64() const
Definition: GCNSubtarget.h:372
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:718
bool hasGDS() const
bool hasPartialNSAEncoding() const
CycleT * getCycle(const BlockT *Block) const
Find the innermost cycle containing a given block.
A possibly irreducible generalization of a Loop.
void getExitingBlocks(SmallVectorImpl< BlockT * > &TmpStorage) const
Return all blocks of this cycle that have successor outside of this cycle.
bool contains(const BlockT *Block) const
Return whether Block is contained in the cycle.
const GenericCycle * getParentCycle() const
Itinerary data supplied by a subtarget to be used by a target.
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
bool hasInterval(Register Reg) const
SlotIndex getInstructionIndex(const MachineInstr &Instr) const
Returns the base index of the given instruction.
LiveInterval & getInterval(Register Reg)
SlotIndex ReplaceMachineInstrInMaps(MachineInstr &MI, MachineInstr &NewMI)
This class represents the liveness of a register, stack slot, etc.
Definition: LiveInterval.h:157
iterator find(SlotIndex Pos)
find - Return an iterator pointing to the first segment that ends after Pos, or end().
void replaceKillInstruction(Register Reg, MachineInstr &OldMI, MachineInstr &NewMI)
replaceKillInstruction - Update register kill info by replacing a kill instruction with a new one.
VarInfo & getVarInfo(Register Reg)
getVarInfo - Return the VarInfo structure for the specified VIRTUAL register.
bool hasValue() const
TypeSize getValue() const
static const MCBinaryExpr * createAnd(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:541
static const MCBinaryExpr * createAShr(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:611
static const MCBinaryExpr * createSub(const MCExpr *LHS, const MCExpr *RHS, MCContext &Ctx)
Definition: MCExpr.h:621
static const MCConstantExpr * create(int64_t Value, MCContext &Ctx, bool PrintInHex=false, unsigned SizeInBytes=0)
Definition: MCExpr.cpp:194
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
unsigned getNumOperands() const
Return the number of declared MachineOperands for this MachineInstruction.
Definition: MCInstrDesc.h:237
ArrayRef< MCOperandInfo > operands() const
Definition: MCInstrDesc.h:239
bool mayStore() const
Return true if this instruction could possibly modify memory.
Definition: MCInstrDesc.h:444
bool mayLoad() const
Return true if this instruction could possibly read memory.
Definition: MCInstrDesc.h:438
unsigned short Opcode
Definition: MCInstrDesc.h:205
ArrayRef< MCPhysReg > implicit_uses() const
Return a list of registers that are potentially read by any instance of this machine instruction.
Definition: MCInstrDesc.h:565
This holds information about one operand of a machine instruction, indicating the register class for ...
Definition: MCInstrDesc.h:85
uint8_t OperandType
Information about the type of the operand.
Definition: MCInstrDesc.h:97
int16_t RegClass
This specifies the register class enumeration of the operand if the operand is a register.
Definition: MCInstrDesc.h:91
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
static const MCSymbolRefExpr * create(const MCSymbol *Symbol, MCContext &Ctx)
Definition: MCExpr.h:397
MCSymbol - Instances of this class represent a symbol name in the MC file, and MCSymbols are created ...
Definition: MCSymbol.h:40
void setVariableValue(const MCExpr *Value)
Definition: MCSymbol.cpp:47
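
These MCExpr and MCSymbol helpers are what lets backend code describe a branch distance that is only known at assembly time. A minimal sketch, assuming three pre-created labels and a 32-bit lo/hi split chosen purely for illustration, not a fixed backend recipe:

// Sketch: bind (DestLabel - PostGetPCLabel), split into low and high halves,
// to two symbols so the assembler resolves the distance later.
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
using namespace llvm;

static void bindOffsetHalves(MCContext &Ctx, MCSymbol *DestLabel,
                             MCSymbol *PostGetPCLabel, MCSymbol *OffsetLo,
                             MCSymbol *OffsetHi) {
  const MCExpr *Dest = MCSymbolRefExpr::create(DestLabel, Ctx);
  const MCExpr *Src = MCSymbolRefExpr::create(PostGetPCLabel, Ctx);
  const MCExpr *Diff = MCBinaryExpr::createSub(Dest, Src, Ctx);
  const MCExpr *Mask32 = MCConstantExpr::create(0xffffffff, Ctx);
  const MCExpr *ShAmt = MCConstantExpr::create(32, Ctx);
  // Low half: (Dest - Src) & 0xffffffff.  High half: (Dest - Src) >> 32.
  OffsetLo->setVariableValue(MCBinaryExpr::createAnd(Diff, Mask32, Ctx));
  OffsetHi->setVariableValue(MCBinaryExpr::createAShr(Diff, ShAmt, Ctx));
}
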
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
MIBundleBuilder & append(MachineInstr *MI)
Insert MI into MBB by appending it to the instructions in the bundle.
unsigned pred_size() const
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
MCSymbol * getSymbol() const
Return the MCSymbol for this basic block.
reverse_iterator rend()
instr_iterator insert(instr_iterator I, MachineInstr *M)
Insert MI into the instruction list before I, possibly inside a bundle.
LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, MCRegister Reg, const_iterator Before, unsigned Neighborhood=10) const
Return whether (physical) register Reg has been defined and not killed as of just before Before.
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
iterator getFirstNonPHI()
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
DebugLoc findDebugLoc(instr_iterator MBBI)
Find the next valid DebugLoc starting at MBBI, skipping any debug instructions.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into two pieces at SplitInst.
Instructions::const_iterator const_instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< iterator > terminators()
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
@ LQR_Dead
Register is known to be fully dead.
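
computeRegisterLiveness together with LQR_Dead is the usual way to ask whether a status register may be clobbered at an insertion point. A small sketch, treating anything other than a provably dead result as unsafe, which is a conservative policy assumed for this example:

// Sketch: returns true only when Reg is provably dead just before the
// insertion point, so a new definition cannot clobber a live value.
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/MC/MCRegister.h"
using namespace llvm;

static bool isRegDeadAt(const MachineBasicBlock &MBB,
                        MachineBasicBlock::const_iterator Before,
                        MCRegister Reg, const TargetRegisterInfo *TRI) {
  // Unknown liveness counts as "not dead" under this conservative policy.
  return MBB.computeRegisterLiveness(TRI, Reg, Before) ==
         MachineBasicBlock::LQR_Dead;
}
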
DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to compute a normal dominat...
MachineDomTreeNode * addNewBlock(MachineBasicBlock *BB, MachineBasicBlock *DomBB)
addNewBlock - Add a new node to the dominator tree information.
bool properlyDominates(const MachineDomTreeNode *A, const MachineDomTreeNode *B) const
void changeImmediateDominator(MachineBasicBlock *N, MachineBasicBlock *NewIDom)
changeImmediateDominator - This method is used to update the dominator tree information when a node's...
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MCContext & getContext() const
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineInstr * CloneMachineInstr(const MachineInstr *Orig)
Create a new MachineInstr which is a copy of Orig, identical in all ways except the instruction has n...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addSym(MCSymbol *Sym, unsigned char TargetFlags=0) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & addUse(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & copyImplicitOps(const MachineInstr &OtherMI) const
Copy all the implicit operands from OtherMI onto this one.
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
const MachineInstrBuilder & addDef(Register RegNo, unsigned Flags=0, unsigned SubReg=0) const
Add a virtual register definition operand.
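
The MachineInstrBuilder methods above are normally used as one fluent chain off BuildMI. A minimal sketch, with the opcode descriptor and immediate supplied by the caller as placeholders rather than a specific AMDGPU instruction:

// Sketch: emit "DstReg = <MovDesc> Imm" at the insertion point.
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;

static MachineInstr *emitMoveImm(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator InsertPt,
                                 const DebugLoc &DL,
                                 const MCInstrDesc &MovDesc,
                                 Register DstReg, int64_t Imm) {
  // addDef appends the destination register, addImm the immediate operand.
  return BuildMI(MBB, InsertPt, DL, MovDesc)
             .addDef(DstReg)
             .addImm(Imm)
             .getInstr();
}
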
Representation of each machine instruction.
Definition: MachineInstr.h:69
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Definition: MachineInstr.h:558
bool mayLoadOrStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read or modify memory.
bool isCopy() const
const MachineBasicBlock * getParent() const
Definition: MachineInstr.h:341
void addImplicitDefUseOperands(MachineFunction &MF)
Add all implicit def and use operands to this instruction.
unsigned getNumOperands() const
Returns the total number of operands.
Definition: MachineInstr.h:561
void addOperand(MachineFunction &MF, const MachineOperand &Op)
Add the specified operand to the instruction.
iterator_range< mop_iterator > explicit_operands()
Definition: MachineInstr.h:680
unsigned getNumExplicitOperands() const
Returns the number of non-implicit operands.
bool mayLoad(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly read memory.
bool hasUnmodeledSideEffects() const
Return true if this instruction has side effects that are not modeled by mayLoad / mayStore,...
void untieRegOperand(unsigned OpIdx)
Break any tie involving OpIdx.
void setDesc(const MCInstrDesc &TID)
Replace the instruction descriptor (thus opcode) of the current instruction with a new one.
bool hasOneMemOperand() const
Return true if this instruction has exactly one MachineMemOperand.
Definition: MachineInstr.h:804
void tieOperands(unsigned DefIdx, unsigned UseIdx)
Add a tie between the register operands at DefIdx and UseIdx.
mmo_iterator memoperands_begin() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:789
bool hasOrderedMemoryRef() const
Return true if this instruction may have an ordered or volatile memory reference, or if the informati...
const MachineFunction * getMF() const
Return the function that contains the basic block that this instruction belongs to.
ArrayRef< MachineMemOperand * > memoperands() const
Access to memory operands of the instruction.
Definition: MachineInstr.h:771
bool mayStore(QueryType Type=AnyInBundle) const
Return true if this instruction could possibly modify memory.
const DebugLoc & getDebugLoc() const
Returns the debug location id of this MachineInstr.
Definition: MachineInstr.h:487
void eraseFromParent()
Unlink 'this' from the containing basic block and delete it.
void removeOperand(unsigned OpNo)
Erase an operand from an instruction, leaving it with one fewer operand than it started with.
void setPostInstrSymbol(MachineFunction &MF, MCSymbol *Symbol)
Set a symbol that will be emitted just after the instruction itself.
iterator_range< mop_iterator > implicit_operands()
Definition: MachineInstr.h:688
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:568
uint32_t getFlags() const
Return the MI flags bitvector.
Definition: MachineInstr.h:386
int findRegisterDefOperandIdx(Register Reg, const TargetRegisterInfo *TRI, bool isDead=false, bool Overlap=false) const
Returns the operand index that is a def of the specified register or -1 if it is not found.
bool addRegisterDead(Register Reg, const TargetRegisterInfo *RegInfo, bool AddIfNotFound=false)
We have determined MI defined a register without a use.
A description of a memory reference used in the backend.
@ MOLoad
The memory access reads data.
@ MOStore
The memory access writes data.
MachineOperand class - Representation of each machine instruction operand.
void setSubReg(unsigned subReg)
unsigned getSubReg() const
const GlobalValue * getGlobal() const
void setImplicit(bool Val=true)
void ChangeToFrameIndex(int Idx, unsigned TargetFlags=0)
Replace this operand with a frame index.
void setImm(int64_t immVal)
int64_t getImm() const
bool isImplicit() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setIsDead(bool Val=true)
void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
void ChangeToImmediate(int64_t ImmVal, unsigned TargetFlags=0)
ChangeToImmediate - Replace this operand with a new immediate operand of the specified value.
void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags=0)
ChangeToGA - Replace this operand with a new global address operand.
void setIsKill(bool Val=true)
void ChangeToRegister(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isDebug=false)
ChangeToRegister - Replace this operand with a new register operand of the specified value.
MachineInstr * getParent()
getParent - Return the instruction that this operand belongs to.
void setOffset(int64_t Offset)
unsigned getTargetFlags() const
static MachineOperand CreateImm(int64_t Val)
bool isGlobal() const
isGlobal - Tests if this is a MO_GlobalAddress operand.
MachineOperandType getType() const
getType - Returns the MachineOperandType for this operand.
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
bool isTargetIndex() const
isTargetIndex - Tests if this is a MO_TargetIndex operand.
void setTargetFlags(unsigned F)
bool isFI() const
isFI - Tests if this is a MO_FrameIndex operand.
bool isIdenticalTo(const MachineOperand &Other) const
Returns true if this operand is identical to the specified operand except for liveness related flags ...
@ MO_Immediate
Immediate operand.
@ MO_Register
Register operand.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
int64_t getOffset() const
Return the offset from the symbol in this operand.
bool isFPImm() const
isFPImm - Tests if this is a MO_FPImmediate operand.
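
The MachineOperand setters above allow rewriting an operand in place, for example folding a known constant into a register use. A sketch, assuming a separate legality check such as isImmOperandLegal has already accepted the immediate:

// Sketch: replace a register use with an immediate operand of value Imm.
#include "llvm/CodeGen/MachineOperand.h"
using namespace llvm;

static void foldConstantIntoUse(MachineOperand &UseMO, int64_t Imm) {
  if (UseMO.isReg() && !UseMO.isImplicit()) {
    // Turns the MO_Register operand into an MO_Immediate one; kill/dead
    // flags on the old register operand are dropped along with it.
    UseMO.ChangeToImmediate(Imm);
  }
}
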
reg_begin/reg_end - Provide iteration support to walk over all definitions and uses of a register wit...
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
bool isReserved(MCRegister PhysReg) const
isReserved - Returns true when PhysReg is a reserved register.
void enterBasicBlockEnd(MachineBasicBlock &MBB)
Start tracking liveness from the end of basic block MBB.
bool isRegUsed(Register Reg, bool includeReserved=true) const
Return if a specific register is currently used.
void setRegUsed(Register Reg, LaneBitmask LaneMask=LaneBitmask::getAll())
Tell the scavenger a register is used.
void backward()
Update internal register state and move MBB iterator backwards.
void enterBasicBlock(MachineBasicBlock &MBB)
Start tracking liveness from the begin of basic block MBB.
Register scavengeRegisterBackwards(const TargetRegisterClass &RC, MachineBasicBlock::iterator To, bool RestoreAfter, int SPAdj, bool AllowSpill=true)
Make a register of the specific register class available from the current position backwards to the p...
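
The RegScavenger interface above supports finding a temporary register late, after allocation. A sketch of the backwards-scavenging pattern, with SPAdj=0 and spilling disallowed chosen as example policy; the result may be an invalid Register if nothing is free:

// Sketch: scavenge a free register of class RC, searching backwards from
// the end of MBB down to the restore point, without spilling.
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
using namespace llvm;

static Register scavengeNoSpill(RegScavenger &RS, MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator RestorePt,
                                const TargetRegisterClass &RC) {
  RS.enterBasicBlockEnd(MBB); // start tracking liveness from the block end
  return RS.scavengeRegisterBackwards(RC, RestorePt,
                                      /*RestoreAfter=*/false, /*SPAdj=*/0,
                                      /*AllowSpill=*/false);
}
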
const RegisterBank & getRegBank(unsigned ID)
Get the register bank identified by ID.
This class implements the register bank concept.
Definition: RegisterBank.h:28
unsigned getID() const
Get the identifier of this register bank.
Definition: RegisterBank.h:45
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
constexpr bool isValid() const
Definition: Register.h:116
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition: Register.h:91
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Represents one node in the SelectionDAG.
bool isMachineOpcode() const
Test if this node has a post-isel opcode, directly corresponding to a MachineInstr opcode.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isLegalMUBUFImmOffset(unsigned Imm) const
bool isInlineConstant(const APInt &Imm) const
static bool isMAI(const MachineInstr &MI)
Definition: SIInstrInfo.h:792
void legalizeOperandsVOP3(MachineRegisterInfo &MRI, MachineInstr &MI) const
Fix operands in MI to satisfy constant bus requirements.
static bool isDS(const MachineInstr &MI)
Definition: SIInstrInfo.h:554
MachineBasicBlock * legalizeOperands(MachineInstr &MI, MachineDominatorTree *MDT=nullptr) const
Legalize all operands in this instruction.
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const override
bool isNonUniformBranchInstr(MachineInstr &Instr) const
static bool isVOP3(const MachineInstr &MI)
Definition: SIInstrInfo.h:504
unsigned getLiveRangeSplitOpcode(Register Reg, const MachineFunction &MF) const override
bool getMemOperandsWithOffsetWidth(const MachineInstr &LdSt, SmallVectorImpl< const MachineOperand * > &BaseOps, int64_t &Offset, bool &OffsetIsScalable, LocationSize &Width, const TargetRegisterInfo *TRI) const final
unsigned getInstSizeInBytes(const MachineInstr &MI) const override
static bool isNeverUniform(const MachineInstr &MI)
Definition: SIInstrInfo.h:924
unsigned getOpSize(uint16_t Opcode, unsigned OpNo) const
Return the size in bytes of the operand OpNo for the given opcode.
Definition: SIInstrInfo.h:1105
bool isBasicBlockPrologue(const MachineInstr &MI, Register Reg=Register()) const override
uint64_t getDefaultRsrcDataFormat() const
InstructionUniformity getGenericInstructionUniformity(const MachineInstr &MI) const
static bool isFLATScratch(const MachineInstr &MI)
Definition: SIInstrInfo.h:636
const MCInstrDesc & getIndirectRegWriteMovRelPseudo(unsigned VecSize, unsigned EltSize, bool IsSGPR) const
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DestReg) const
Return a partially built integer add instruction without carry.
bool mayAccessFlatAddressSpace(const MachineInstr &MI) const
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0, int64_t Offset1, unsigned NumLoads) const override
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset, Align Alignment=Align(4)) const
ArrayRef< std::pair< unsigned, const char * > > getSerializableDirectMachineOperandTargetFlags() const override
void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const
Replace the instruction's opcode with the equivalent VALU opcode.
static bool isSMRD(const MachineInstr &MI)
Definition: SIInstrInfo.h:544
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, SlotIndexes *Indexes=nullptr) const
bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, const MCOperandInfo &OpInfo) const
Returns true if this operand uses the constant bus.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
Register isStoreToStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int64_t CmpMask, int64_t CmpValue, const MachineRegisterInfo *MRI) const override
int64_t getNamedImmOperand(const MachineInstr &MI, unsigned OpName) const
Get required immediate operand.
Definition: SIInstrInfo.h:1233
static bool isMTBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:536
const MCInstrDesc & getIndirectGPRIDXPseudo(unsigned VecSize, bool IsIndirectSrc) const
void insertReturn(MachineBasicBlock &MBB) const
static bool isEXP(const MachineInstr &MI)
Definition: SIInstrInfo.h:649
static bool isSALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:408
void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, MachineOperand &Op, MachineRegisterInfo &MRI, const DebugLoc &DL) const
MachineInstr * buildShrunkInst(MachineInstr &MI, unsigned NewOpcode) const
unsigned getInstBundleSize(const MachineInstr &MI) const
static bool isVOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:496
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify=false) const override
static bool isSDWA(const MachineInstr &MI)
Definition: SIInstrInfo.h:512
const MCInstrDesc & getKillTerminatorFromPseudo(unsigned Opcode) const
void insertNoops(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned Quantity) const override
static bool isGather4(const MachineInstr &MI)
Definition: SIInstrInfo.h:604
bool isLegalVSrcOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO would be a valid operand for the given operand definition OpInfo.
MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const override
Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI) const
Copy a value from a VGPR (SrcReg) to an SGPR.
bool hasModifiers(unsigned Opcode) const
Return true if this instruction has any modifiers.
bool shouldClusterMemOps(ArrayRef< const MachineOperand * > BaseOps1, int64_t Offset1, bool OffsetIsScalable1, ArrayRef< const MachineOperand * > BaseOps2, int64_t Offset2, bool OffsetIsScalable2, unsigned ClusterSize, unsigned NumBytes) const override
ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override
bool isWave32() const
bool isHighLatencyDef(int Opc) const override
void legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const
Legalize the OpIndex operand of this instruction by inserting a MOV.
bool reverseBranchCondition(SmallVectorImpl< MachineOperand > &Cond) const override
static bool isVOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:520
void removeModOperands(MachineInstr &MI) const
std::pair< int64_t, int64_t > splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace, uint64_t FlatVariant) const
Split COffsetVal into {immediate offset field, remainder offset} values.
bool isSpill(uint16_t Opcode) const
Definition: SIInstrInfo.h:740
static bool isVIMAGE(const MachineInstr &MI)
Definition: SIInstrInfo.h:588
static bool isSOP2(const MachineInstr &MI)
Definition: SIInstrInfo.h:448
static bool isGWS(const MachineInstr &MI)
Definition: SIInstrInfo.h:570
LLVM_READONLY MachineOperand * getNamedOperand(MachineInstr &MI, unsigned OperandName) const
Returns the operand named Op.
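
getNamedOperand and getNamedOperandIdx let code address operands by name instead of by a fixed index. A sketch, assuming the instruction may or may not carry an AMDGPU::OpName::offset operand (the null check covers the absent case):

// Sketch: read the named offset immediate of MI, defaulting to zero.
#include "SIInstrInfo.h" // pulls in the generated AMDGPU::OpName enum
using namespace llvm;

static int64_t readOffsetOrZero(const SIInstrInfo &TII, MachineInstr &MI) {
  if (const MachineOperand *Off =
          TII.getNamedOperand(MI, AMDGPU::OpName::offset))
    return Off->isImm() ? Off->getImm() : 0;
  return 0;
}
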
const TargetRegisterClass * getPreferredSelectRegClass(unsigned Size) const
bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const override
bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, MachineOperand &Src1, unsigned Src1OpName) const
static bool isFLATGlobal(const MachineInstr &MI)
Definition: SIInstrInfo.h:628
static bool isVSAMPLE(const MachineInstr &MI)
Definition: SIInstrInfo.h:596
bool isBufferSMRD(const MachineInstr &MI) const
static bool isKillTerminator(unsigned Opcode)
bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx0, unsigned &SrcOpIdx1) const override
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, bool IsSCCLive, SlotIndexes *Indexes=nullptr) const
bool hasVALU32BitEncoding(unsigned Opcode) const
Return true if this 64-bit VALU instruction has a 32-bit encoding.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const
unsigned isSGPRStackAccess(const MachineInstr &MI, int &FrameIndex) const
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const override
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
void legalizeOperandsVOP2(MachineRegisterInfo &MRI, MachineInstr &MI) const
Legalize operands in MI by either commuting it or inserting a copy of src1.
bool foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const final
static bool isImage(const MachineInstr &MI)
Definition: SIInstrInfo.h:424
static bool isSOPK(const MachineInstr &MI)
Definition: SIInstrInfo.h:464
const TargetRegisterClass * getOpRegClass(const MachineInstr &MI, unsigned OpNo) const
Return the correct register class for OpNo.
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode)
Definition: SIInstrInfo.h:947
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
static bool isFoldableCopy(const MachineInstr &MI)
void loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const
bool isIgnorableUse(const MachineOperand &MO) const override
static bool isMUBUF(const MachineInstr &MI)
Definition: SIInstrInfo.h:528
bool expandPostRAPseudo(MachineInstr &MI) const override
bool analyzeCompare(const MachineInstr &MI, Register &SrcReg, Register &SrcReg2, int64_t &CmpMask, int64_t &CmpValue) const override
void convertNonUniformLoopRegion(MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const
InstructionUniformity getInstructionUniformity(const MachineInstr &MI) const override final
static bool isSegmentSpecificFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:618
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const override
static bool isDPP(const MachineInstr &MI)
Definition: SIInstrInfo.h:760
bool analyzeBranchImpl(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl< MachineOperand > &Cond, bool AllowModify) const
bool isLowLatencyInstruction(const MachineInstr &MI) const
void materializeImmediate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, Register DestReg, int64_t Value) const
std::optional< DestSourcePair > isCopyInstrImpl(const MachineInstr &MI) const override
If the specific machine instruction is an instruction that moves/copies a value from one register to ano...
bool isAlwaysGDS(uint16_t Opcode) const
Register isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override
void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT, MachineInstr &Inst) const
bool canShrink(const MachineInstr &MI, const MachineRegisterInfo &MRI) const
bool isAsmOnlyOpcode(int MCOp) const
Check if this instruction should only be used by assembler.
static bool isVGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:716
ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override
This is used by the post-RA scheduler (SchedulePostRAList.cpp).
bool verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const override
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
unsigned getInstrLatency(const InstrItineraryData *ItinData, const MachineInstr &MI, unsigned *PredCost=nullptr) const override
MachineInstr * foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef< unsigned > Ops, MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS=nullptr, VirtRegMap *VRM=nullptr) const override
ArrayRef< std::pair< int, const char * > > getSerializableTargetIndices() const override
bool isVGPRCopy(const MachineInstr &MI) const
Definition: SIInstrInfo.h:970
static bool isMIMG(const MachineInstr &MI)
Definition: SIInstrInfo.h:580
MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const
bool isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const override
bool isLegalRegOperand(const MachineRegisterInfo &MRI, const MCOperandInfo &OpInfo, const MachineOperand &MO) const
Check if MO (a register operand) is a legal register for the given operand description.
bool allowNegativeFlatOffset(uint64_t FlatVariant) const
Returns true if negative offsets are allowed for the given FlatVariant.
static unsigned getNumWaitStates(const MachineInstr &MI)
Return the number of wait states that result from executing this instruction.
const TargetRegisterClass * getRegClass(const MCInstrDesc &TID, unsigned OpNum, const TargetRegisterInfo *TRI, const MachineFunction &MF) const override
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const override
unsigned getVALUOp(const MachineInstr &MI) const
static bool modifiesModeRegister(const MachineInstr &MI)
Return true if the instruction modifies the mode register.
void convertNonUniformIfRegion(MachineBasicBlock *IfEntry, MachineBasicBlock *IfEnd) const
bool hasDivergentBranch(const MachineBasicBlock *MBB) const
Return whether the block terminates with a divergent branch.
unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved=nullptr) const override
void fixImplicitOperands(MachineInstr &MI) const
bool moveFlatAddrToVGPR(MachineInstr &Inst) const
Change the SADDR form of a FLAT Inst to its VADDR form if the saddr operand was moved to a VGPR.
Register insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
MachineBasicBlock * getBranchDestBlock(const MachineInstr &MI) const override
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
Whether we must prevent this instruction from executing with EXEC = 0.
static bool isAtomic(const MachineInstr &MI)
Definition: SIInstrInfo.h:681
bool canInsertSelect(const MachineBasicBlock &MBB, ArrayRef< MachineOperand > Cond, Register DstReg, Register TrueReg, Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const override
static bool sopkIsZext(unsigned Opcode)
Definition: SIInstrInfo.h:863
void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI, Register VReg) const override
static bool isSGPRSpill(const MachineInstr &MI)
Definition: SIInstrInfo.h:728
static bool isWMMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:809
ArrayRef< std::pair< MachineMemOperand::Flags, const char * > > getSerializableMachineMemOperandTargetFlags() const override
MachineInstr * convertToThreeAddress(MachineInstr &MI, LiveVariables *LV, LiveIntervals *LIS) const override
bool mayReadEXEC(const MachineRegisterInfo &MRI, const MachineInstr &MI) const
Returns true if the instruction could potentially depend on the value of exec.
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const
bool isBranchOffsetInRange(unsigned BranchOpc, int64_t BrOffset) const override
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, ArrayRef< MachineOperand > Cond, const DebugLoc &DL, int *BytesAdded=nullptr) const override
void insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, ArrayRef< MachineOperand > Cond, Register TrueReg, Register FalseReg) const
void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override
std::pair< MachineInstr *, MachineInstr * > expandMovDPP64(MachineInstr &MI) const
Register insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, Register SrcReg, int Value) const
static bool isSOPC(const MachineInstr &MI)
Definition: SIInstrInfo.h:456
static bool isFLAT(const MachineInstr &MI)
Definition: SIInstrInfo.h:612
static bool isVALU(const MachineInstr &MI)
Definition: SIInstrInfo.h:416
MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx0, unsigned OpIdx1) const override
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const
int pseudoToMCOpcode(int Opcode) const
Return a target-specific opcode if Opcode is a pseudo instruction.
const MCInstrDesc & getMCOpcodeFromPseudo(unsigned Opcode) const
Return the descriptor of the target-specific machine instruction that corresponds to the specified ps...
Definition: SIInstrInfo.h:1246
MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const override
bool hasModifiersSet(const MachineInstr &MI, unsigned OpName) const
static bool isFixedSize(const MachineInstr &MI)
Definition: SIInstrInfo.h:880
bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo, MachineCycleInfo *CI) const override
LLVM_READONLY int commuteOpcode(unsigned Opc) const
uint64_t getScratchRsrcWords23() const
std::pair< unsigned, unsigned > decomposeMachineOperandsTargetFlags(unsigned TF) const override
bool areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, const MachineInstr &MIb) const override
bool isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand *MO=nullptr) const
Check if MO is a legal operand if it was the OpIdx Operand for MI.
static bool isLDSDMA(const MachineInstr &MI)
Definition: SIInstrInfo.h:562
unsigned isStackAccess(const MachineInstr &MI, int &FrameIndex) const
static bool isVOP1(const MachineInstr &MI)
Definition: SIInstrInfo.h:488
SIInstrInfo(const GCNSubtarget &ST)
Definition: SIInstrInfo.cpp:66
void insertIndirectBranch(MachineBasicBlock &MBB, MachineBasicBlock &NewDestBB, MachineBasicBlock &RestoreBB, const DebugLoc &DL, int64_t BrOffset, RegScavenger *RS) const override
bool hasAnyModifiersSet(const MachineInstr &MI) const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
void setHasSpilledVGPRs(bool Spill=true)
bool checkFlag(Register Reg, uint8_t Flag) const
void setHasSpilledSGPRs(bool Spill=true)
const TargetRegisterClass * getRegClass(unsigned RCID) const
const TargetRegisterClass * getCompatibleSubRegClass(const TargetRegisterClass *SuperRC, const TargetRegisterClass *SubRC, unsigned SubIdx) const
Returns a register class which is compatible with SuperRC, such that a subregister exists with class ...
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
MCPhysReg get32BitRegister(MCPhysReg Reg) const
const TargetRegisterClass * getProperlyAlignedRC(const TargetRegisterClass *RC) const
bool isProperlyAlignedRC(const TargetRegisterClass &RC) const
static bool hasVectorRegisters(const TargetRegisterClass *RC)
const TargetRegisterClass * getEquivalentVGPRClass(const TargetRegisterClass *SRC) const
ArrayRef< int16_t > getRegSplitParts(const TargetRegisterClass *RC, unsigned EltSize) const
const TargetRegisterClass * getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const override
bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const
bool opCanUseInlineConstant(unsigned OpType) const
const TargetRegisterClass * getRegClassForReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentAGPRClass(const TargetRegisterClass *SRC) const
bool opCanUseLiteralConstant(unsigned OpType) const
static bool hasVGPRs(const TargetRegisterClass *RC)
static bool isVGPRClass(const TargetRegisterClass *RC)
unsigned getHWRegIndex(MCRegister Reg) const
bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const
const TargetRegisterClass * getEquivalentSGPRClass(const TargetRegisterClass *VRC) const
unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override
const TargetRegisterClass * getBoolRC() const
bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const
unsigned getChannelFromSubReg(unsigned SubReg) const
MCRegister getVCC() const
static bool hasAGPRs(const TargetRegisterClass *RC)
const TargetRegisterClass * getWaveMaskRegClass() const
bool spillSGPRToVGPR() const
const TargetRegisterClass * getVGPR64Class() const
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
virtual bool hasVRegLiveness() const
Return true if this DAG supports VReg liveness and RegPressure.
MachineFunction & MF
Machine function.
Definition: ScheduleDAG.h:559
HazardRecognizer - This determines whether or not an instruction can be issued this cycle,...
SlotIndex - An opaque wrapper around machine indexes.
Definition: SlotIndexes.h:68
SlotIndex getRegSlot(bool EC=false) const
Returns the register use/def slot in the current instruction for a normal or early-clobber def.
Definition: SlotIndexes.h:240
SlotIndexes pass.
Definition: SlotIndexes.h:300
SlotIndex insertMachineInstrInMaps(MachineInstr &MI, bool Late=false)
Insert the given machine instruction into the mapping.
Definition: SlotIndexes.h:523
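
When an instruction is rewritten while SlotIndexes or LiveIntervals are preserved, the index mapping has to be kept in sync. A sketch of that bookkeeping; the nullable-analysis convention and the fallback choice are assumptions of this example:

// Sketch: transfer or create a slot index for NewMI after rewriting OldMI.
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/SlotIndexes.h"
using namespace llvm;

static void updateIndexes(LiveIntervals *LIS, SlotIndexes *Indexes,
                          MachineInstr &OldMI, MachineInstr &NewMI) {
  if (LIS) {
    // Transfers OldMI's SlotIndex to NewMI in the LIS-owned index maps.
    LIS->ReplaceMachineInstrInMaps(OldMI, NewMI);
  } else if (Indexes) {
    // Without LIS, give the new instruction its own index.
    Indexes->insertMachineInstrInMaps(NewMI);
  }
}
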
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
int64_t getImm() const
Register getReg() const
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
virtual ScheduleHazardRecognizer * CreateTargetMIHazardRecognizer(const InstrItineraryData *, const ScheduleDAGMI *DAG) const
Allocate and return a hazard recognizer to use for this target when scheduling the machine instructio...
virtual MachineInstr * createPHIDestinationCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register DestReg, unsigned SubIdx, const MachineInstr &Orig, const TargetRegisterInfo &TRI) const
Re-issue the specified 'original' instruction at the specific location targeting a new destination re...
virtual MachineInstr * createPHISourceCopy(MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt, const DebugLoc &DL, Register Src, unsigned SrcSubReg, Register Dst) const
During PHI elimination, lets the target make necessary checks and insert the copy to the PHI destinati...
virtual bool isReallyTriviallyReMaterializable(const MachineInstr &MI) const
For instructions with opcodes for which the M_REMATERIALIZABLE flag is set, this hook lets the target...
virtual MachineInstr * commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned OpIdx1, unsigned OpIdx2) const
This method commutes the operands of the given machine instruction MI.
virtual bool expandPostRAPseudo(MachineInstr &MI) const
This function is called for all pseudo instructions that remain after register allocation.
const MCAsmInfo * getMCAsmInfo() const
Return target specific asm information.
bool contains(Register Reg) const
Return true if the specified register is included in this register class.
bool hasSubClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a sub-class of or equal to this class.
bool hasSuperClassEq(const TargetRegisterClass *RC) const
Returns true if RC is a super-class of or equal to this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
const TargetRegisterClass * getAllocatableClass(const TargetRegisterClass *RC) const
Return the maximal subclass of the given register class that is allocatable or NULL.
unsigned getSubRegIdxSize(unsigned Idx) const
Get the size of the bit range covered by a sub-register index.
unsigned getSubRegIdxOffset(unsigned Idx) const
Get the offset of the bit range covered by a sub-register index.
void init(const TargetSubtargetInfo *TSInfo)
Initialize the machine model for instruction scheduling.
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
LLVM Value Representation.
Definition: Value.h:74
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
self_iterator getIterator()
Definition: ilist_node.h:109
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ PRIVATE_ADDRESS
Address space for private memory.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
const uint64_t RSRC_DATA_FORMAT
Definition: SIInstrInfo.h:1524
LLVM_READONLY int getBasicFromSDWAOp(uint16_t Opcode)
LLVM_READONLY const MIMGInfo * getMIMGInfo(unsigned Opc)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY int getVOPe32(uint16_t Opcode)
LLVM_READNONE bool isLegalDPALU_DPPControl(unsigned DC)
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc)
bool isInlinableLiteralV2I16(uint32_t Literal)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
bool isInlinableLiteralV2BF16(uint32_t Literal)
LLVM_READONLY int getFlatScratchInstSVfromSS(uint16_t Opcode)
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST)
For pre-GFX12 FLAT instructions the offset must be positive; MSB is ignored and forced to zero.
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isInlinableLiteralV2F16(uint32_t Literal)
LLVM_READONLY int getGlobalVaddrOp(uint16_t Opcode)
bool isValid32BitLiteral(uint64_t Val, bool IsFP64)
bool isDPALU_DPP(const MCInstrDesc &OpDesc)
const uint64_t RSRC_ELEMENT_SIZE_SHIFT
Definition: SIInstrInfo.h:1525
LLVM_READONLY int getAddr64Inst(uint16_t Opcode)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
LLVM_READONLY int getMFMAEarlyClobberOp(uint16_t Opcode)
bool isTrue16Inst(unsigned Opc)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfoByEncoding(uint8_t DimEnc)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo)
Is this an AMDGPU specific source operand? These include registers, inline constants,...
const uint64_t RSRC_TID_ENABLE
Definition: SIInstrInfo.h:1527
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isGenericAtomic(unsigned Opc)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool isHi(unsigned Reg, const MCRegisterInfo &MRI)
LLVM_READONLY int getCommuteRev(uint16_t Opcode)
unsigned getAddrSizeMIMGOp(const MIMGBaseOpcodeInfo *BaseOpcode, const MIMGDimInfo *Dim, bool IsA16, bool IsG16Supported)
@ OPERAND_KIMM32
Operand with 32-bit immediate that uses the constant bus.
Definition: SIDefines.h:234
@ OPERAND_REG_IMM_INT64
Definition: SIDefines.h:201
@ OPERAND_REG_IMM_V2FP16
Definition: SIDefines.h:211
@ OPERAND_REG_INLINE_C_V2INT32
Definition: SIDefines.h:227
@ OPERAND_REG_INLINE_C_FP64
Definition: SIDefines.h:223
@ OPERAND_REG_INLINE_C_BF16
Definition: SIDefines.h:220
@ OPERAND_REG_INLINE_C_V2BF16
Definition: SIDefines.h:225
@ OPERAND_REG_IMM_V2INT16
Definition: SIDefines.h:212
@ OPERAND_REG_IMM_BF16
Definition: SIDefines.h:205
@ OPERAND_REG_INLINE_AC_V2FP16
Definition: SIDefines.h:246
@ OPERAND_REG_IMM_INT32
Operands with register or 32-bit immediate.
Definition: SIDefines.h:200
@ OPERAND_REG_IMM_V2BF16
Definition: SIDefines.h:210
@ OPERAND_REG_IMM_BF16_DEFERRED
Definition: SIDefines.h:207
@ OPERAND_REG_IMM_FP16
Definition: SIDefines.h:206
@ OPERAND_REG_INLINE_C_INT64
Definition: SIDefines.h:219
@ OPERAND_REG_INLINE_AC_BF16
Definition: SIDefines.h:240
@ OPERAND_REG_INLINE_C_INT16
Operands with register or inline constant.
Definition: SIDefines.h:217
@ OPERAND_REG_INLINE_AC_INT16
Operands with an AccVGPR register or inline constant.
Definition: SIDefines.h:238
@ OPERAND_REG_IMM_FP64
Definition: SIDefines.h:204
@ OPERAND_REG_INLINE_C_V2FP16
Definition: SIDefines.h:226
@ OPERAND_REG_INLINE_AC_V2INT16
Definition: SIDefines.h:244
@ OPERAND_REG_INLINE_AC_FP16
Definition: SIDefines.h:241
@ OPERAND_REG_INLINE_AC_INT32
Definition: SIDefines.h:239
@ OPERAND_REG_INLINE_AC_FP32
Definition: SIDefines.h:242
@ OPERAND_REG_INLINE_AC_V2BF16
Definition: SIDefines.h:245
@ OPERAND_REG_IMM_V2INT32
Definition: SIDefines.h:213
@ OPERAND_REG_IMM_FP32
Definition: SIDefines.h:203
@ OPERAND_INPUT_MODS
Definition: SIDefines.h:251
@ OPERAND_REG_INLINE_C_FP32
Definition: SIDefines.h:222
@ OPERAND_REG_INLINE_C_INT32
Definition: SIDefines.h:218
@ OPERAND_REG_INLINE_C_V2INT16
Definition: SIDefines.h:224
@ OPERAND_REG_IMM_V2FP32
Definition: SIDefines.h:214
@ OPERAND_REG_INLINE_AC_FP64
Definition: SIDefines.h:243
@ OPERAND_REG_INLINE_C_FP16
Definition: SIDefines.h:221
@ OPERAND_REG_IMM_INT16
Definition: SIDefines.h:202
@ OPERAND_REG_INLINE_C_V2FP32
Definition: SIDefines.h:228
@ OPERAND_INLINE_SPLIT_BARRIER_INT32
Definition: SIDefines.h:231
@ OPERAND_REG_IMM_FP32_DEFERRED
Definition: SIDefines.h:209
@ OPERAND_REG_IMM_FP16_DEFERRED
Definition: SIDefines.h:208
@ TI_SCRATCH_RSRC_DWORD1
Definition: AMDGPU.h:409
@ TI_SCRATCH_RSRC_DWORD3
Definition: AMDGPU.h:411
@ TI_SCRATCH_RSRC_DWORD0
Definition: AMDGPU.h:408
@ TI_SCRATCH_RSRC_DWORD2
Definition: AMDGPU.h:410
@ TI_CONSTDATA_START
Definition: AMDGPU.h:407
LLVM_READONLY int getCommuteOrig(uint16_t Opcode)
unsigned getRegBitWidth(const TargetRegisterClass &RC)
Get the size in bits of a register from the register class RC.
int getMCOpcode(uint16_t Opcode, unsigned Gen)
const uint64_t RSRC_INDEX_STRIDE_SHIFT
Definition: SIInstrInfo.h:1526
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
LLVM_READONLY int getIfAddr64Inst(uint16_t Opcode)
Check if Opcode is an Addr64 opcode.
bool isGraphics(CallingConv::ID cc)
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
Definition: CallingConv.h:188
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
Definition: CallingConv.h:206
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
Definition: CallingConv.h:191
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
Definition: CallingConv.h:218
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
Definition: CallingConv.h:213
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ OPERAND_GENERIC_4
Definition: MCInstrDesc.h:70
@ OPERAND_GENERIC_2
Definition: MCInstrDesc.h:68
@ OPERAND_GENERIC_1
Definition: MCInstrDesc.h:67
@ OPERAND_REGISTER
Definition: MCInstrDesc.h:61
@ OPERAND_GENERIC_3
Definition: MCInstrDesc.h:69
@ OPERAND_IMMEDIATE
Definition: MCInstrDesc.h:60
@ OPERAND_UNKNOWN
Definition: MCInstrDesc.h:59
@ OPERAND_GENERIC_0
Definition: MCInstrDesc.h:66
@ OPERAND_GENERIC_5
Definition: MCInstrDesc.h:71
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Not(const Pred &P) -> Not< Pred >
Reg
All possible values of the reg field in the ModR/M byte.
@ ReallyHidden
Definition: CommandLine.h:139
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
@ Offset
Definition: DWP.cpp:456
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
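
finalizeBundle takes a half-open [FirstMI, LastMI) range of already-inserted instructions. A sketch that bundles two adjacent instructions, using getIterator plus std::next to form that range (the two-instruction scenario is illustrative):

// Sketch: glue First and the instruction Last that follows it into one bundle.
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include <iterator>
using namespace llvm;

static void bundlePair(MachineBasicBlock &MBB, MachineInstr &First,
                       MachineInstr &Last) {
  // LastMI is exclusive, so advance one past Last.
  finalizeBundle(MBB, First.getIterator(), std::next(Last.getIterator()));
}
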
TargetInstrInfo::RegSubRegPair getRegSubRegPair(const MachineOperand &O)
Create RegSubRegPair from a register MachineOperand.
Definition: SIInstrInfo.h:1415
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:428
bool execMayBeModifiedBeforeUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI, const MachineInstr &UseMI)
Return false if EXEC is not changed between the def of VReg at DefMI and the use at UseMI.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:280
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition: bit.h:215
TargetInstrInfo::RegSubRegPair getRegSequenceSubReg(MachineInstr &MI, unsigned SubReg)
Return the SubReg component from REG_SEQUENCE.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:41
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:138
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
MachineInstr * getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P, MachineRegisterInfo &MRI)
Return the defining instruction for a given reg:subreg pair skipping copy like instructions and subre...
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:143
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
@ Xor
Bitwise or logical XOR of integers.
@ Add
Sum of integers.
unsigned getKillRegState(bool B)
bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition: MathExtras.h:244
bool isTargetSpecificOpcode(unsigned Opcode)
Check whether the given Opcode is a target-specific opcode.
Definition: TargetOpcodes.h:36
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Error
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:45
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value that is Skew mod Align.
Definition: MathExtras.h:439
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition: Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
@ NeverUniform
The result values can never be assumed to be uniform.
@ Default
The result values are uniform if and only if all operands are uniform.
uint64_t maxUIntN(uint64_t N)
Gets the maximum value for a N-bit unsigned integer.
Definition: MathExtras.h:203
bool execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI, Register VReg, const MachineInstr &DefMI)
Return false if EXEC is not changed between the def of VReg at DefMI and all its uses.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
static Semantics SemanticsToEnum(const llvm::fltSemantics &Sem)
Definition: APFloat.cpp:216
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
Description of the encoding of one expression Op.
SparseBitVector AliveBlocks
AliveBlocks - Set of blocks in which this value is alive completely through.
Definition: LiveVariables.h:85
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
Utility to store machine instructions worklist.
Definition: SIInstrInfo.h:49
MachineInstr * top() const
Definition: SIInstrInfo.h:54
bool empty() const
Definition: SIInstrInfo.h:64
bool isDeferred(MachineInstr *MI)
SetVector< MachineInstr * > & getDeferredList()
Definition: SIInstrInfo.h:73
void insert(MachineInstr *MI)
A pair composed of a register and a sub-register index.