1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPUTargetMachine.h"
18#include "GCNSubtarget.h"
21#include "SIRegisterInfo.h"
22#include "llvm/ADT/APInt.h"
24#include "llvm/ADT/Statistic.h"
38#include "llvm/IR/IRBuilder.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
44#include "llvm/Support/ModRef.h"
45#include <optional>
46
47using namespace llvm;
48
49#define DEBUG_TYPE "si-lower"
50
51STATISTIC(NumTailCalls, "Number of tail calls");
52
54 "amdgpu-disable-loop-alignment",
55 cl::desc("Do not align and prefetch loops"),
56 cl::init(false));
57
59 "amdgpu-use-divergent-register-indexing",
61 cl::desc("Use indirect register addressing for divergent indexes"),
62 cl::init(false));
63
64 static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
65 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
66 return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
67}
68
69 static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
70 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
71 return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
72}
73
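// Return the first SGPR not yet allocated by the calling-convention state;
// typically used to reserve additional system SGPRs after the user SGPRs.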
74static unsigned findFirstFreeSGPR(CCState &CCInfo) {
75 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
76 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
77 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
78 return AMDGPU::SGPR0 + Reg;
79 }
80 }
81 llvm_unreachable("Cannot allocate sgpr");
82}
83
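// The constructor below registers the register classes that define the legal
// types for this target and then marks, per subtarget feature, how each ISD
// operation on those types is handled (Legal, Custom, Promote or Expand).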
84 SITargetLowering::SITargetLowering(const TargetMachine &TM,
85 const GCNSubtarget &STI)
86 : AMDGPUTargetLowering(TM, STI),
87 Subtarget(&STI) {
88 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
89 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
90
91 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
92 addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
93
94 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
95
96 const SIRegisterInfo *TRI = STI.getRegisterInfo();
97 const TargetRegisterClass *V64RegClass = TRI->getVGPR64Class();
98
99 addRegisterClass(MVT::f64, V64RegClass);
100 addRegisterClass(MVT::v2f32, V64RegClass);
101 addRegisterClass(MVT::Untyped, V64RegClass);
102
103 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
104 addRegisterClass(MVT::v3f32, TRI->getVGPRClassForBitWidth(96));
105
106 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
107 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
108
109 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
110 addRegisterClass(MVT::v4f32, TRI->getVGPRClassForBitWidth(128));
111
112 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
113 addRegisterClass(MVT::v5f32, TRI->getVGPRClassForBitWidth(160));
114
115 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
116 addRegisterClass(MVT::v6f32, TRI->getVGPRClassForBitWidth(192));
117
118 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
119 addRegisterClass(MVT::v3f64, TRI->getVGPRClassForBitWidth(192));
120
121 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
122 addRegisterClass(MVT::v7f32, TRI->getVGPRClassForBitWidth(224));
123
124 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
125 addRegisterClass(MVT::v8f32, TRI->getVGPRClassForBitWidth(256));
126
127 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
128 addRegisterClass(MVT::v4f64, TRI->getVGPRClassForBitWidth(256));
129
130 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
131 addRegisterClass(MVT::v9f32, TRI->getVGPRClassForBitWidth(288));
132
133 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
134 addRegisterClass(MVT::v10f32, TRI->getVGPRClassForBitWidth(320));
135
136 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
137 addRegisterClass(MVT::v11f32, TRI->getVGPRClassForBitWidth(352));
138
139 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
140 addRegisterClass(MVT::v12f32, TRI->getVGPRClassForBitWidth(384));
141
142 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
143 addRegisterClass(MVT::v16f32, TRI->getVGPRClassForBitWidth(512));
144
145 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
146 addRegisterClass(MVT::v8f64, TRI->getVGPRClassForBitWidth(512));
147
148 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
149 addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
150
151 if (Subtarget->has16BitInsts()) {
152 if (Subtarget->useRealTrue16Insts()) {
153 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
154 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
155 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
156 } else {
157 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
158 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
159 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
160 }
161
162 // Unless there are also VOP3P operations, no operations are really legal.
163 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
164 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
165 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
166 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
167 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
168 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
169 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
170 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
171 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
172 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
173 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
174 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
175 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
176 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
177 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
178 }
179
180 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
181 addRegisterClass(MVT::v32f32, TRI->getVGPRClassForBitWidth(1024));
182
184
185 // The boolean content concept here is too inflexible. Compares only ever
186 // really produce a 1-bit result. Any copy/extend from these will turn into a
187 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
188 // it's what most targets use.
191
192 // We need to custom lower vector stores from local memory
194 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
195 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
196 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
197 MVT::i1, MVT::v32i32},
198 Custom);
199
201 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
202 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
203 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
204 MVT::i1, MVT::v32i32},
205 Custom);
206
207 if (isTypeLegal(MVT::bf16)) {
208 for (unsigned Opc :
217 ISD::SETCC}) {
218 // FIXME: The promoted-to type shouldn't need to be explicit
219 setOperationAction(Opc, MVT::bf16, Promote);
220 AddPromotedToType(Opc, MVT::bf16, MVT::f32);
221 }
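    // Promote with AddPromotedToType(..., MVT::f32) means, for example, that a
    // bf16 fadd is legalized by extending the operands to f32, performing the
    // add, and rounding the result back to bf16.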
222
224
226 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
227
231
232 // We only need to custom lower because we can't specify an action for bf16
233 // sources.
236
238 AddPromotedToType(ISD::BUILD_VECTOR, MVT::v2bf16, MVT::v2i16);
239 }
240
241 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
242 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
243 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
244 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
245 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
246 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
247 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
248 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
249 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
250 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
251 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
252 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
253 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
254 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
255 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
256 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
257
258 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
259 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
260 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
261 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
262 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
263 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
264 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
265
266 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
267
271 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
272
273 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
274
276 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
277
279 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
280 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
281
283 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
284 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
285 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
286 Expand);
288 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
289 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
290 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
291 Expand);
292
294 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
295 MVT::v3i16, MVT::v4i16, MVT::Other},
296 Custom);
297
300 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
301
303
305
307 Expand);
308
309#if 0
311#endif
312
313 // We only support LOAD/STORE and vector manipulation ops for vectors
314 // with > 4 elements.
315 for (MVT VT :
316 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
317 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
318 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
319 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
320 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
321 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
322 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
323 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
324 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
325 switch (Op) {
326 case ISD::LOAD:
327 case ISD::STORE:
329 case ISD::BITCAST:
330 case ISD::UNDEF:
334 case ISD::IS_FPCLASS:
335 break;
340 break;
341 default:
343 break;
344 }
345 }
346 }
347
349
350 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
351 // is expanded to avoid having two separate loops in case the index is a VGPR.
352
353 // Most operations are naturally 32-bit vector operations. We only support
354 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
355 for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
357 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
358
360 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
361
363 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
364
366 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
367 }
368
369 for (MVT Vec64 : { MVT::v3i64, MVT::v3f64 }) {
371 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
372
374 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
375
377 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
378
380 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
381 }
382
383 for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
385 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
386
388 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
389
391 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
392
394 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
395 }
396
397 for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
399 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
400
402 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
403
405 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
406
408 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
409 }
410
411 for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
413 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
414
416 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
417
419 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
420
422 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
423 }
424
426 {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
427 Expand);
428
429 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
430 Custom);
431
432 // Avoid stack access for these.
433 // TODO: Generalize to more vector types.
435 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
436 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
437 Custom);
438
439 // Deal with vec3 vector operations when widened to vec4.
441 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
442
443 // Deal with vec5/6/7 vector operations when widened to vec8.
445 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
446 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
447 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
448 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
449 Custom);
450
451 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
452 // and output demarshalling
453 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
454
455 // We can't return success/failure, only the old value;
456 // let LLVM add the comparison
458 Expand);
459
460 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
461
462 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
463
464 // FIXME: This should be narrowed to i32, but that only happens if i64 is
465 // illegal.
466 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
467 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
468
469 // On SI this is s_memtime; on VI it is s_memrealtime.
471
472 if (Subtarget->hasSMemRealTime() ||
476
477 if (Subtarget->has16BitInsts()) {
480 } else {
482 }
483
484 if (Subtarget->hasMadMacF32Insts())
486
487 if (!Subtarget->hasBFI())
488 // fcopysign can be done in a single instruction with BFI.
489 setOperationAction(ISD::FCOPYSIGN, {MVT::f32, MVT::f64}, Expand);
490
491 if (!Subtarget->hasBCNT(32))
493
494 if (!Subtarget->hasBCNT(64))
496
497 if (Subtarget->hasFFBH())
499
500 if (Subtarget->hasFFBL())
502
503 // We only really have 32-bit BFE instructions (and 16-bit on VI).
504 //
505 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
506 // effort to match them now. We want this to be false for i64 cases when the
507 // extraction isn't restricted to the upper or lower half. Ideally we would
508 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
509 // span the midpoint are probably relatively rare, so don't worry about them
510 // for now.
511 if (Subtarget->hasBFE())
513
514 // Clamp modifier on add/sub
515 if (Subtarget->hasIntClamp())
517
518 if (Subtarget->hasAddNoCarry())
519 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
520 Legal);
521
522 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, {MVT::f32, MVT::f64},
523 Custom);
524
525 // These are really only legal for ieee_mode functions. We should be avoiding
526 // them for functions that don't have ieee_mode enabled, so just say they are
527 // legal.
529 {MVT::f32, MVT::f64}, Legal);
530
531 if (Subtarget->haveRoundOpsF64())
533 Legal);
534 else
536 MVT::f64, Custom);
537
539 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
540 Legal);
541 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
542
545
546 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
547 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
548
549 // Custom lower these because we can't specify a rule based on an illegal
550 // source bf16.
553
554 if (Subtarget->has16BitInsts()) {
557 MVT::i16, Legal);
558
559 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
560
562 MVT::i16, Expand);
563
567 ISD::CTPOP},
568 MVT::i16, Promote);
569
571
572 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
573
575 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
577 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
578
582
584
585 // F16 - Constant Actions.
588
589 // F16 - Load/Store Actions.
591 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
593 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
594
595 // BF16 - Load/Store Actions.
597 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
599 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
600
601 // F16 - VOP1 Actions.
604 MVT::f16, Custom);
605
608
609 // F16 - VOP2 Actions.
610 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
611 Expand);
615
616 // F16 - VOP3 Actions.
618 if (STI.hasMadF16())
620
621 for (MVT VT :
622 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
623 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
624 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
625 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
626 switch (Op) {
627 case ISD::LOAD:
628 case ISD::STORE:
630 case ISD::BITCAST:
631 case ISD::UNDEF:
637 case ISD::IS_FPCLASS:
638 break;
641 break;
642 default:
644 break;
645 }
646 }
647 }
648
649 // v_perm_b32 can handle either of these.
650 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
652
653 // XXX - Do these do anything? Vector constants turn into build_vector.
654 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
655
656 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
657 Legal);
658
660 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
662 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
663
665 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
667 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
668
669 setOperationAction(ISD::AND, MVT::v2i16, Promote);
670 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
671 setOperationAction(ISD::OR, MVT::v2i16, Promote);
672 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
673 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
674 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
675
677 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
679 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
680 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
681 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
682
684 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
686 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
688 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
689
691 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
693 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
694 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
695 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
696
698 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
700 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
701
703 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
705 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
707 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
708
709 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
710 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
711 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
712 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
713 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
714 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
715
717 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
719 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
720 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
721 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
722
723 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
724 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
725 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
726 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
727 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
728 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
729
731 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
733 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
734 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
735 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
736
738 MVT::v2i32, Expand);
740
742 MVT::v4i32, Expand);
743
745 MVT::v8i32, Expand);
746
747 if (!Subtarget->hasVOP3PInsts())
749 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
750
751 setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
752 // This isn't really legal, but this avoids the legalizer unrolling it (and
753 // allows matching fneg (fabs x) patterns)
754 setOperationAction(ISD::FABS, MVT::v2f16, Legal);
755
758
760 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
761 Custom);
762
764 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
765 Expand);
766
767 for (MVT Vec16 :
768 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
769 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
772 Vec16, Custom);
774 }
775 }
776
777 if (Subtarget->hasVOP3PInsts()) {
781 MVT::v2i16, Legal);
782
785 MVT::v2f16, Legal);
786
787 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
788 Custom);
789
791 {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
792 MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
793 Custom);
794
795 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
796 // Split vector operations.
801 VT, Custom);
802
803 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
804 // Split vector operations.
806 VT, Custom);
807
808 setOperationAction({ISD::FMAXNUM, ISD::FMINNUM}, {MVT::v2f16, MVT::v4f16},
809 Custom);
810
811 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
812 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
813 Custom);
814
815 if (Subtarget->hasPackedFP32Ops()) {
817 MVT::v2f32, Legal);
819 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
820 Custom);
821 }
822 }
823
825
826 if (Subtarget->has16BitInsts()) {
828 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
830 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
831 } else {
832 // Legalization hack.
833 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
834
836 }
837
839 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
840 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
841 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
842 MVT::v32f16, MVT::v32bf16},
843 Custom);
844
846
847 if (Subtarget->hasScalarSMulU64())
849
850 if (Subtarget->hasMad64_32())
852
853 if (Subtarget->hasPrefetch())
855
856 if (Subtarget->hasIEEEMinMax()) {
858 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
860 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
861 Custom);
862 }
863
865 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
866 MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8},
867 Custom);
868
870 {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
871 MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
872 MVT::i16, MVT::i8, MVT::i128},
873 Custom);
874
876 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
877 MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
878 MVT::i8, MVT::i128},
879 Custom);
880
886
887 // TODO: Could move this to custom lowering, could benefit from combines on
888 // extract of relevant bits.
890
892
895 ISD::SUB,
897 ISD::FADD,
898 ISD::FSUB,
899 ISD::FDIV,
906 ISD::FMA,
907 ISD::SMIN,
908 ISD::SMAX,
909 ISD::UMIN,
910 ISD::UMAX,
912 ISD::AND,
913 ISD::OR,
914 ISD::XOR,
915 ISD::FSHR,
925
926 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
928
929 // All memory operations. Some folding on the pointer operand is done to help
930 // matching the constant offsets in the addressing modes.
953
954 // FIXME: In other contexts we pretend this is a per-function property.
956
958}
959
961 return Subtarget;
962}
963
964//===----------------------------------------------------------------------===//
965// TargetLowering queries
966//===----------------------------------------------------------------------===//
967
968// v_mad_mix* support a conversion from f16 to f32.
969//
970 // There is only one special case when denormals are enabled that we don't
971 // currently handle, where this would still be OK to use.
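// Example (illustrative): (fma (fpext f16:$x), (fpext f16:$y), f32:$z) can be
// selected to v_fma_mix_f32 when the mix instructions are available and f32
// denormals are flushed, avoiding separate f16-to-f32 conversions.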
972bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
973 EVT DestVT, EVT SrcVT) const {
974 return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
975 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
976 DestVT.getScalarType() == MVT::f32 &&
977 SrcVT.getScalarType() == MVT::f16 &&
978 // TODO: This probably only requires no input flushing?
980}
981
983 LLT DestTy, LLT SrcTy) const {
984 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
985 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
986 DestTy.getScalarSizeInBits() == 32 &&
987 SrcTy.getScalarSizeInBits() == 16 &&
988 // TODO: This probably only requires no input flushing?
990}
991
993 // SI has some legal vector types, but no legal vector operations. Say no
994 // shuffles are legal in order to prefer scalarizing some vector operations.
995 return false;
996}
997
1000 EVT VT) const {
1003
1004 if (VT.isVector()) {
1005 EVT ScalarVT = VT.getScalarType();
1006 unsigned Size = ScalarVT.getSizeInBits();
1007 if (Size == 16) {
1008 if (Subtarget->has16BitInsts()) {
1009 if (VT.isInteger())
1010 return MVT::v2i16;
1011 return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);
1012 }
1013 return VT.isInteger() ? MVT::i32 : MVT::f32;
1014 }
1015
1016 if (Size < 16)
1017 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1018 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1019 }
1020
1021 if (VT.getSizeInBits() > 32)
1022 return MVT::i32;
1023
1025}
1026
1029 EVT VT) const {
1032
1033 if (VT.isVector()) {
1034 unsigned NumElts = VT.getVectorNumElements();
1035 EVT ScalarVT = VT.getScalarType();
1036 unsigned Size = ScalarVT.getSizeInBits();
1037
1038 // FIXME: Should probably promote 8-bit vectors to i16.
1039 if (Size == 16 && Subtarget->has16BitInsts())
1040 return (NumElts + 1) / 2;
1041
1042 if (Size <= 32)
1043 return NumElts;
1044
1045 if (Size > 32)
1046 return NumElts * ((Size + 31) / 32);
1047 } else if (VT.getSizeInBits() > 32)
1048 return (VT.getSizeInBits() + 31) / 32;
1049
1051}
1052
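// Example (illustrative): with 16-bit instructions available, a v3f16 argument
// is broken into two v2f16 intermediate parts, i.e. NumIntermediates == 2.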
1053 unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
1054 LLVMContext &Context, CallingConv::ID CC,
1055 EVT VT, EVT &IntermediateVT,
1056 unsigned &NumIntermediates, MVT &RegisterVT) const {
1057 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1058 unsigned NumElts = VT.getVectorNumElements();
1059 EVT ScalarVT = VT.getScalarType();
1060 unsigned Size = ScalarVT.getSizeInBits();
1061 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1062 // support, but unless we can properly handle 3-vectors, it will still be
1063 // inconsistent.
1064 if (Size == 16 && Subtarget->has16BitInsts()) {
1065 if (ScalarVT == MVT::bf16) {
1066 RegisterVT = MVT::i32;
1067 IntermediateVT = MVT::v2bf16;
1068 } else {
1069 RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
1070 IntermediateVT = RegisterVT;
1071 }
1072 NumIntermediates = (NumElts + 1) / 2;
1073 return NumIntermediates;
1074 }
1075
1076 if (Size == 32) {
1077 RegisterVT = ScalarVT.getSimpleVT();
1078 IntermediateVT = RegisterVT;
1079 NumIntermediates = NumElts;
1080 return NumIntermediates;
1081 }
1082
1083 if (Size < 16 && Subtarget->has16BitInsts()) {
1084 // FIXME: Should probably form v2i16 pieces
1085 RegisterVT = MVT::i16;
1086 IntermediateVT = ScalarVT;
1087 NumIntermediates = NumElts;
1088 return NumIntermediates;
1089 }
1090
1091
1092 if (Size != 16 && Size <= 32) {
1093 RegisterVT = MVT::i32;
1094 IntermediateVT = ScalarVT;
1095 NumIntermediates = NumElts;
1096 return NumIntermediates;
1097 }
1098
1099 if (Size > 32) {
1100 RegisterVT = MVT::i32;
1101 IntermediateVT = RegisterVT;
1102 NumIntermediates = NumElts * ((Size + 31) / 32);
1103 return NumIntermediates;
1104 }
1105 }
1106
1108 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1109}
1110
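// Compute the memory VT accessed by an image/buffer load intrinsic, clamping
// the IR result type to the number of lanes the instruction actually loads.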
1111static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes) {
1112 assert(MaxNumLanes != 0);
1113
1114 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1115 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1116 return EVT::getVectorVT(Ty->getContext(),
1117 EVT::getEVT(VT->getElementType()),
1118 NumElts);
1119 }
1120
1121 return EVT::getEVT(Ty);
1122}
1123
1124// Peek through TFE struct returns to only use the data size.
1125static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
1126 auto *ST = dyn_cast<StructType>(Ty);
1127 if (!ST)
1128 return memVTFromLoadIntrData(Ty, MaxNumLanes);
1129
1130 // TFE intrinsics return an aggregate type.
1131 assert(ST->getNumContainedTypes() == 2 &&
1132 ST->getContainedType(1)->isIntegerTy(32));
1133 return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes);
1134}
1135
1136/// Map address space 7 to MVT::v5i32 because that's its in-memory
1137/// representation. This return value is vector-typed because there is no
1138/// MVT::i160 and it is not clear if one can be added. While this could
1139/// cause issues during codegen, these address space 7 pointers will be
1140/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
1141/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
1142/// modeling, to work.
1144 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1145 return MVT::v5i32;
1147 DL.getPointerSizeInBits(AS) == 192)
1148 return MVT::v6i32;
1150}
1151/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1152/// v8i32 when padding is added.
1153/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1154/// also v8i32 with padding.
1156 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1157 DL.getPointerSizeInBits(AS) == 160) ||
1159 DL.getPointerSizeInBits(AS) == 192))
1160 return MVT::v8i32;
1162}
1163
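// Describe the memory access performed by a target intrinsic (opcode, memory
// VT, pointer value and flags) so a MachineMemOperand can be attached to it.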
1164 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
1165 const CallInst &CI,
1166 MachineFunction &MF,
1167 unsigned IntrID) const {
1169 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1171
1172 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1175 (Intrinsic::ID)IntrID);
1176 MemoryEffects ME = Attr.getMemoryEffects();
1177 if (ME.doesNotAccessMemory())
1178 return false;
1179
1180 // TODO: Should images get their own address space?
1181 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1182
1183 if (RsrcIntr->IsImage)
1184 Info.align.reset();
1185
1186 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1187 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1188 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1189 // We conservatively set the memory operand of a buffer intrinsic to the
1190 // base resource pointer, so that we can access alias information about
1191 // those pointers. Cases like "this points at the same value
1192 // but with a different offset" are handled in
1193 // areMemAccessesTriviallyDisjoint.
1194 Info.ptrVal = RsrcArg;
1195 }
1196
1197 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1198 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1201 if (ME.onlyReadsMemory()) {
1202 unsigned MaxNumLanes = 4;
1203
1204 if (RsrcIntr->IsImage) {
1207 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
1209
1210 if (!BaseOpcode->Gather4) {
1211 // If this isn't a gather, we may have excess loaded elements in the
1212 // IR type. Check the dmask for the real number of elements loaded.
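        // For example (illustrative): a dmask of 0b0101 loads two lanes, so
        // the reported memory VT is narrowed to a 2-element vector.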
1213 unsigned DMask
1214 = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1215 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1216 }
1217 }
1218
1219 Info.memVT = memVTFromLoadIntrReturn(CI.getType(), MaxNumLanes);
1220
1221 // FIXME: What does alignment mean for an image?
1224 } else if (ME.onlyWritesMemory()) {
1226
1227 Type *DataTy = CI.getArgOperand(0)->getType();
1228 if (RsrcIntr->IsImage) {
1229 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1230 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1231 Info.memVT = memVTFromLoadIntrData(DataTy, DMaskLanes);
1232 } else
1233 Info.memVT = EVT::getEVT(DataTy);
1234
1236 } else {
1237 // Atomic
1238 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID :
1240 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1244
1245 switch (IntrID) {
1246 default:
1247 // XXX - Should this be volatile without known ordering?
1249 break;
1250 case Intrinsic::amdgcn_raw_buffer_load_lds:
1251 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1252 case Intrinsic::amdgcn_struct_buffer_load_lds:
1253 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
1254 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1255 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1256 Info.ptrVal = CI.getArgOperand(1);
1257 return true;
1258 }
1259 }
1260 }
1261 return true;
1262 }
1263
1264 switch (IntrID) {
1265 case Intrinsic::amdgcn_ds_ordered_add:
1266 case Intrinsic::amdgcn_ds_ordered_swap:
1267 case Intrinsic::amdgcn_ds_fadd:
1268 case Intrinsic::amdgcn_ds_fmin:
1269 case Intrinsic::amdgcn_ds_fmax: {
1271 Info.memVT = MVT::getVT(CI.getType());
1272 Info.ptrVal = CI.getOperand(0);
1273 Info.align.reset();
1275
1276 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1277 if (!Vol->isZero())
1279
1280 return true;
1281 }
1282 case Intrinsic::amdgcn_buffer_atomic_fadd: {
1284 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1285 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1286 Info.align.reset();
1288
1289 const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
1290 if (!Vol || !Vol->isZero())
1292
1293 return true;
1294 }
1295 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1296 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1298 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1299 Info.ptrVal = nullptr;
1300 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1302 return true;
1303 }
1304 case Intrinsic::amdgcn_ds_append:
1305 case Intrinsic::amdgcn_ds_consume: {
1307 Info.memVT = MVT::getVT(CI.getType());
1308 Info.ptrVal = CI.getOperand(0);
1309 Info.align.reset();
1311
1312 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1313 if (!Vol->isZero())
1315
1316 return true;
1317 }
1318 case Intrinsic::amdgcn_global_atomic_csub: {
1320 Info.memVT = MVT::getVT(CI.getType());
1321 Info.ptrVal = CI.getOperand(0);
1322 Info.align.reset();
1326 return true;
1327 }
1328 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
1330 Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
1331
1332 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1333 Info.align.reset();
1336 return true;
1337 }
1338 case Intrinsic::amdgcn_global_atomic_fadd:
1339 case Intrinsic::amdgcn_global_atomic_fmin:
1340 case Intrinsic::amdgcn_global_atomic_fmax:
1341 case Intrinsic::amdgcn_global_atomic_fmin_num:
1342 case Intrinsic::amdgcn_global_atomic_fmax_num:
1343 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1344 case Intrinsic::amdgcn_flat_atomic_fadd:
1345 case Intrinsic::amdgcn_flat_atomic_fmin:
1346 case Intrinsic::amdgcn_flat_atomic_fmax:
1347 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1348 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1349 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1350 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1351 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
1353 Info.memVT = MVT::getVT(CI.getType());
1354 Info.ptrVal = CI.getOperand(0);
1355 Info.align.reset();
1360 return true;
1361 }
1362 case Intrinsic::amdgcn_global_load_tr_b64:
1363 case Intrinsic::amdgcn_global_load_tr_b128: {
1365 Info.memVT = MVT::getVT(CI.getType());
1366 Info.ptrVal = CI.getOperand(0);
1367 Info.align.reset();
1369 return true;
1370 }
1371 case Intrinsic::amdgcn_ds_gws_init:
1372 case Intrinsic::amdgcn_ds_gws_barrier:
1373 case Intrinsic::amdgcn_ds_gws_sema_v:
1374 case Intrinsic::amdgcn_ds_gws_sema_br:
1375 case Intrinsic::amdgcn_ds_gws_sema_p:
1376 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1378
1379 const GCNTargetMachine &TM =
1380 static_cast<const GCNTargetMachine &>(getTargetMachine());
1381
1383 Info.ptrVal = MFI->getGWSPSV(TM);
1384
1385 // This is an abstract access, but we need to specify a type and size.
1386 Info.memVT = MVT::i32;
1387 Info.size = 4;
1388 Info.align = Align(4);
1389
1390 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1392 else
1394 return true;
1395 }
1396 case Intrinsic::amdgcn_global_load_lds: {
1398 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1399 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1400 Info.ptrVal = CI.getArgOperand(1);
1402 return true;
1403 }
1404 case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
1406
1407 const GCNTargetMachine &TM =
1408 static_cast<const GCNTargetMachine &>(getTargetMachine());
1409
1411 Info.ptrVal = MFI->getGWSPSV(TM);
1412
1413 // This is an abstract access, but we need to specify a type and size.
1414 Info.memVT = MVT::i32;
1415 Info.size = 4;
1416 Info.align = Align(4);
1417
1419 return true;
1420 }
1421 default:
1422 return false;
1423 }
1424}
1425
1427 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1428 switch (cast<IntrinsicInst>(I).getIntrinsicID()) {
1429 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1430 // The DAG's ValueType loses the addrspaces.
1431 // Add them as 2 extra Constant operands "from" and "to".
1432 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1433 unsigned DstAS = I.getType()->getPointerAddressSpace();
1434 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1435 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1436 break;
1437 }
1438 default:
1439 break;
1440 }
1441}
1442
1445 Type *&AccessTy) const {
1446 Value *Ptr = nullptr;
1447 switch (II->getIntrinsicID()) {
1448 case Intrinsic::amdgcn_atomic_cond_sub_u32:
1449 case Intrinsic::amdgcn_ds_append:
1450 case Intrinsic::amdgcn_ds_consume:
1451 case Intrinsic::amdgcn_ds_fadd:
1452 case Intrinsic::amdgcn_ds_fmax:
1453 case Intrinsic::amdgcn_ds_fmin:
1454 case Intrinsic::amdgcn_ds_ordered_add:
1455 case Intrinsic::amdgcn_ds_ordered_swap:
1456 case Intrinsic::amdgcn_flat_atomic_fadd:
1457 case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
1458 case Intrinsic::amdgcn_flat_atomic_fmax:
1459 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1460 case Intrinsic::amdgcn_flat_atomic_fmin:
1461 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1462 case Intrinsic::amdgcn_global_atomic_csub:
1463 case Intrinsic::amdgcn_global_atomic_fadd:
1464 case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
1465 case Intrinsic::amdgcn_global_atomic_fmax:
1466 case Intrinsic::amdgcn_global_atomic_fmax_num:
1467 case Intrinsic::amdgcn_global_atomic_fmin:
1468 case Intrinsic::amdgcn_global_atomic_fmin_num:
1469 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1470 case Intrinsic::amdgcn_global_load_tr_b64:
1471 case Intrinsic::amdgcn_global_load_tr_b128:
1472 Ptr = II->getArgOperand(0);
1473 break;
1474 case Intrinsic::amdgcn_global_load_lds:
1475 Ptr = II->getArgOperand(1);
1476 break;
1477 default:
1478 return false;
1479 }
1480 AccessTy = II->getType();
1481 Ops.push_back(Ptr);
1482 return true;
1483}
1484
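// A flat addressing mode is legal only as a bare register address on targets
// without FLAT instruction offsets; otherwise a base register plus an
// immediate offset that fits the FLAT offset field for the given address space
// and flat variant is also accepted. Scaled indices are never folded.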
1485bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
1486 unsigned AddrSpace,
1487 uint64_t FlatVariant) const {
1488 if (!Subtarget->hasFlatInstOffsets()) {
1489 // Flat instructions do not have offsets, and only have the register
1490 // address.
1491 return AM.BaseOffs == 0 && AM.Scale == 0;
1492 }
1493
1494 return AM.Scale == 0 &&
1495 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1496 AM.BaseOffs, AddrSpace, FlatVariant));
1497}
1498
1499 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
1500 if (Subtarget->hasFlatGlobalInsts())
1501 return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS,
1503
1504 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1505 // Assume that we will use FLAT for all global memory accesses
1506 // on VI.
1507 // FIXME: This assumption is currently wrong. On VI we still use
1508 // MUBUF instructions for the r + i addressing mode. As currently
1509 // implemented, the MUBUF instructions only work on buffer < 4GB.
1510 // It may be possible to support > 4GB buffers with MUBUF instructions,
1511 // by setting the stride value in the resource descriptor which would
1512 // increase the size limit to (stride * 4GB). However, this is risky,
1513 // because it has never been validated.
1514 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS,
1516 }
1517
1518 return isLegalMUBUFAddressingMode(AM);
1519}
1520
1521bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1522 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1523 // additionally can do r + r + i with addr64. 32-bit has more addressing
1524 // mode options. Depending on the resource constant, it can also do
1525 // (i64 r0) + (i32 r1) * (i14 i).
1526 //
1527 // Private arrays end up using a scratch buffer most of the time, so also
1528 // assume those use MUBUF instructions. Scratch loads / stores are currently
1529 // implemented as mubuf instructions with offen bit set, so slightly
1530 // different than the normal addr64.
1531 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1532 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1533 return false;
1534
1535 // FIXME: Since we can split immediate into soffset and immediate offset,
1536 // would it make sense to allow any immediate?
1537
1538 switch (AM.Scale) {
1539 case 0: // r + i or just i, depending on HasBaseReg.
1540 return true;
1541 case 1:
1542 return true; // We have r + r or r + i.
1543 case 2:
1544 if (AM.HasBaseReg) {
1545 // Reject 2 * r + r.
1546 return false;
1547 }
1548
1549 // Allow 2 * r as r + r,
1550 // and 2 * r + i as r + r + i.
1551 return true;
1552 default: // Don't allow n * r
1553 return false;
1554 }
1555}
1556
1557 bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1558 const AddrMode &AM, Type *Ty,
1559 unsigned AS, Instruction *I) const {
1560 // No global is ever allowed as a base.
1561 if (AM.BaseGV)
1562 return false;
1563
1564 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1565 return isLegalGlobalAddressingMode(AM);
1566
1567 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1571 // If the offset isn't a multiple of 4, it probably isn't going to be
1572 // correctly aligned.
1573 // FIXME: Can we get the real alignment here?
1574 if (AM.BaseOffs % 4 != 0)
1575 return isLegalMUBUFAddressingMode(AM);
1576
1577 if (!Subtarget->hasScalarSubwordLoads()) {
1578 // There are no SMRD extloads, so if we have to do a small type access we
1579 // will use a MUBUF load.
1580 // FIXME?: We also need to do this if unaligned, but we don't know the
1581 // alignment here.
1582 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1583 return isLegalGlobalAddressingMode(AM);
1584 }
1585
1587 // SMRD instructions have an 8-bit, dword offset on SI.
1588 if (!isUInt<8>(AM.BaseOffs / 4))
1589 return false;
1590 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1591 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1592 // in 8-bits, it can use a smaller encoding.
1593 if (!isUInt<32>(AM.BaseOffs / 4))
1594 return false;
1595 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1596 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
1597 if (!isUInt<20>(AM.BaseOffs))
1598 return false;
1599 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
1600 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
1601 // for S_BUFFER_* instructions).
1602 if (!isInt<21>(AM.BaseOffs))
1603 return false;
1604 } else {
1605 // On GFX12, all offsets are signed 24-bit in bytes.
1606 if (!isInt<24>(AM.BaseOffs))
1607 return false;
1608 }
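    // Example of the limits above (illustrative): on SI the SMRD offset is an
    // 8-bit dword count, so a byte offset of 1020 (255 dwords) is encodable
    // while 1024 is not.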
1609
1610 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1611 return true;
1612
1613 if (AM.Scale == 1 && AM.HasBaseReg)
1614 return true;
1615
1616 return false;
1617 }
1618
1619 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
1620 return Subtarget->enableFlatScratch()
1621 ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS,
1623 : isLegalMUBUFAddressingMode(AM);
1624
1625 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
1626 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
1627 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
1628 // field.
1629 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
1630 // an 8-bit dword offset but we don't know the alignment here.
1631 if (!isUInt<16>(AM.BaseOffs))
1632 return false;
1633
1634 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
1635 return true;
1636
1637 if (AM.Scale == 1 && AM.HasBaseReg)
1638 return true;
1639
1640 return false;
1641 }
1642
1644 // For an unknown address space, this usually means that this is for some
1645 // reason being used for pure arithmetic, and not based on some addressing
1646 // computation. We don't have instructions that compute pointers with any
1647 // addressing modes, so treat them as having no offset like flat
1648 // instructions.
1649 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS,
1651 }
1652
1653 // Assume a user alias of global for unknown address spaces.
1654 return isLegalGlobalAddressingMode(AM);
1655}
1656
1658 const MachineFunction &MF) const {
1660 return (MemVT.getSizeInBits() <= 4 * 32);
1661 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
1662 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
1663 return (MemVT.getSizeInBits() <= MaxPrivateBits);
1664 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
1665 return (MemVT.getSizeInBits() <= 2 * 32);
1666 }
1667 return true;
1668}
1669
1670 bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
1671 unsigned Size, unsigned AddrSpace, Align Alignment,
1672 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
1673 if (IsFast)
1674 *IsFast = 0;
1675
1676 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1677 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
1678 // Check if alignment requirements for ds_read/write instructions are
1679 // disabled.
1680 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
1681 return false;
1682
1683 Align RequiredAlignment(PowerOf2Ceil(Size/8)); // Natural alignment.
1684 if (Subtarget->hasLDSMisalignedBug() && Size > 32 &&
1685 Alignment < RequiredAlignment)
1686 return false;
1687
1688 // Either the alignment requirements are "enabled", or there is an
1689 // unaligned-LDS-access-related hardware bug even though the alignment
1690 // requirements are "disabled". In either case, we need to check for proper
1691 // alignment requirements.
1692 //
1693 switch (Size) {
1694 case 64:
1695 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
1696 // address is negative, then the instruction is incorrectly treated as
1697 // out-of-bounds even if base + offsets is in bounds. Split vectorized
1698 // loads here to avoid emitting ds_read2_b32. We may re-combine the
1699 // load later in the SILoadStoreOptimizer.
1700 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
1701 return false;
1702
1703 // 8 byte accessing via ds_read/write_b64 requires 8-byte alignment, but we
1704 // can do a 4 byte aligned, 8 byte access in a single operation using
1705 // ds_read2/write2_b32 with adjacent offsets.
1706 RequiredAlignment = Align(4);
1707
1708 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1709 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
1710 // ds_write2_b32 depending on the alignment. In either case with either
1711 // alignment there is no faster way of doing this.
1712
1713 // The numbers returned here and below are not additive; they form a 'speed
1714 // rank'. They are just meant to be compared to decide if a certain way
1715 // of lowering an operation is faster than another. For that purpose a
1716 // naturally aligned operation gets its bitsize to indicate that "it
1717 // operates with a speed comparable to an N-bit wide load". With full
1718 // alignment ds128 is slower than ds96, for example. If underaligned, it
1719 // is comparable to the speed of a single dword access, which would then
1720 // mean 32 < 128 and it is faster to issue a wide load regardless.
1721 // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to
1722 // a wider load which will not be aligned anymore, the latter is slower.
1723 if (IsFast)
1724 *IsFast = (Alignment >= RequiredAlignment) ? 64
1725 : (Alignment < Align(4)) ? 32
1726 : 1;
1727 return true;
1728 }
1729
1730 break;
1731 case 96:
1732 if (!Subtarget->hasDS96AndDS128())
1733 return false;
1734
1735 // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
1736 // gfx8 and older.
1737
1738 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1739 // Naturally aligned access is fastest. However, also report it is Fast
1740 // if memory is aligned to less than a dword. A narrow load or store will
1741 // be equally slow as a single ds_read_b96/ds_write_b96, but there will
1742 // be more of them, so overall we will pay less penalty issuing a single
1743 // instruction.
1744
1745 // See comment on the values above.
1746 if (IsFast)
1747 *IsFast = (Alignment >= RequiredAlignment) ? 96
1748 : (Alignment < Align(4)) ? 32
1749 : 1;
1750 return true;
1751 }
1752
1753 break;
1754 case 128:
1755 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
1756 return false;
1757
1758 // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
1759 // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
1760 // single operation using ds_read2/write2_b64.
1761 RequiredAlignment = Align(8);
1762
1763 if (Subtarget->hasUnalignedDSAccessEnabled()) {
1764 // Naturally aligned access is fastest. However, also report it is Fast
1765 // if memory is aligned to less than a dword. A narrow load or store will
1766 // be equally slow as a single ds_read_b128/ds_write_b128, but there
1767 // will be more of them, so overall we will pay less penalty issuing a
1768 // single instruction.
1769
1770 // See comment on the values above.
1771 if (IsFast)
1772 *IsFast = (Alignment >= RequiredAlignment) ? 128
1773 : (Alignment < Align(4)) ? 32
1774 : 1;
1775 return true;
1776 }
1777
1778 break;
1779 default:
1780 if (Size > 32)
1781 return false;
1782
1783 break;
1784 }
1785
1786 // See comment on the values above.
1787 // Note that we have a single-dword or sub-dword access here, so if it is
1788 // underaligned it is the slowest possible access, hence the returned value is 0.
1789 if (IsFast)
1790 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
1791
1792 return Alignment >= RequiredAlignment ||
1793 Subtarget->hasUnalignedDSAccessEnabled();
1794 }
1795
1796 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
1797 bool AlignedBy4 = Alignment >= Align(4);
1798 if (IsFast)
1799 *IsFast = AlignedBy4;
1800
1801 return AlignedBy4 ||
1802 Subtarget->enableFlatScratch() ||
1803 Subtarget->hasUnalignedScratchAccess();
1804 }
1805
1806 // FIXME: We have to be conservative here and assume that flat operations
1807 // will access scratch. If we had access to the IR function, then we
1808 // could determine if any private memory was used in the function.
1809 if (AddrSpace == AMDGPUAS::FLAT_ADDRESS &&
1810 !Subtarget->hasUnalignedScratchAccess()) {
1811 bool AlignedBy4 = Alignment >= Align(4);
1812 if (IsFast)
1813 *IsFast = AlignedBy4;
1814
1815 return AlignedBy4;
1816 }
1817
1818 // So long as they are correct, wide global memory operations perform better
1819 // than multiple smaller memory ops -- even when misaligned
1820 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
1821 if (IsFast)
1822 *IsFast = Size;
1823
1824 return Alignment >= Align(4) ||
1826 }
1827
1828 // Values smaller than a dword must be aligned.
1829 if (Size < 32)
1830 return false;
1831
1832 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
1833 // byte-address are ignored, thus forcing Dword alignment.
1834 // This applies to private, global, and constant memory.
1835 if (IsFast)
1836 *IsFast = 1;
1837
1838 return Size >= 32 && Alignment >= Align(4);
1839}
1840
1842 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
1843 unsigned *IsFast) const {
1845 Alignment, Flags, IsFast);
1846}
1847
1849 const MemOp &Op, const AttributeList &FuncAttributes) const {
1850 // FIXME: Should account for address space here.
1851
1852 // The default fallback uses the private pointer size as a guess for a type to
1853 // use. Make sure we switch these to 64-bit accesses.
1854
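  // Example (illustrative): a 16-byte memcpy with a 4-byte aligned destination
  // is lowered with v4i32 (dwordx4) accesses rather than four separate i32
  // operations.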
1855 if (Op.size() >= 16 &&
1856 Op.isDstAligned(Align(4))) // XXX: Should only do for global
1857 return MVT::v4i32;
1858
1859 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
1860 return MVT::v2i32;
1861
1862 // Use the default.
1863 return MVT::Other;
1864}
1865
1867 const MemSDNode *MemNode = cast<MemSDNode>(N);
1868 return MemNode->getMemOperand()->getFlags() & MONoClobber;
1869}
1870
1872 return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
1874}
1875
1877 unsigned DestAS) const {
1878 // Flat -> private/local is a simple truncate.
1879 // Flat -> global is no-op
1880 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1881 return true;
1882
1883 const GCNTargetMachine &TM =
1884 static_cast<const GCNTargetMachine &>(getTargetMachine());
1885 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
1886}
1887
1889 const MemSDNode *MemNode = cast<MemSDNode>(N);
1890
1892}
1893
1896 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1897 VT.getScalarType().bitsLE(MVT::i16))
1900}
1901
1903 Type *Ty) const {
1904 // FIXME: Could be smarter if called for vector constants.
1905 return true;
1906}
1907
1909 unsigned Index) const {
1911 return false;
1912
1913 // TODO: Add more cases that are cheap.
1914 return Index == 0;
1915}
1916
1918 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
1919 switch (Op) {
1920 case ISD::LOAD:
1921 case ISD::STORE:
1922
1923 // These operations are done with 32-bit instructions anyway.
1924 case ISD::AND:
1925 case ISD::OR:
1926 case ISD::XOR:
1927 case ISD::SELECT:
1928 // TODO: Extensions?
1929 return true;
1930 default:
1931 return false;
1932 }
1933 }
1934
1935 // SimplifySetCC uses this function to determine whether or not it should
1936 // create setcc with i1 operands. We don't have instructions for i1 setcc.
1937 if (VT == MVT::i1 && Op == ISD::SETCC)
1938 return false;
1939
1941}
1942
1943SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
1944 const SDLoc &SL,
1945 SDValue Chain,
1946 uint64_t Offset) const {
1947 const DataLayout &DL = DAG.getDataLayout();
1950
1951 const ArgDescriptor *InputPtrReg;
1952 const TargetRegisterClass *RC;
1953 LLT ArgTy;
1955
1956 std::tie(InputPtrReg, RC, ArgTy) =
1958
1959 // We may not have the kernarg segment argument if we have no kernel
1960 // arguments.
1961 if (!InputPtrReg)
1962 return DAG.getConstant(Offset, SL, PtrVT);
1963
1965 SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
1966 MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
1967
1968 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
1969}
1970
1971SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
1972 const SDLoc &SL) const {
1975 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
1976}
1977
1978SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
1979 const SDLoc &SL) const {
1980
1982 std::optional<uint32_t> KnownSize =
1984 if (KnownSize.has_value())
1985 return DAG.getConstant(*KnownSize, SL, MVT::i32);
1986 return SDValue();
1987}
1988
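// Convert a value loaded from the kernarg segment (of type MemVT) to the IR
// argument type VT: narrow widened vectors, then apply sign/zero extension or
// FP rounding as required.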
1989SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
1990 const SDLoc &SL, SDValue Val,
1991 bool Signed,
1992 const ISD::InputArg *Arg) const {
1993 // First, if it is a widened vector, narrow it.
1994 if (VT.isVector() &&
1996 EVT NarrowedVT =
1999 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2000 DAG.getConstant(0, SL, MVT::i32));
2001 }
2002
2003 // Then convert the vector elements or scalar value.
2004 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
2005 VT.bitsLT(MemVT)) {
2006 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2007 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2008 }
2009
2010 if (MemVT.isFloatingPoint())
2011 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2012 else if (Signed)
2013 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2014 else
2015 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2016
2017 return Val;
2018}
2019
2020SDValue SITargetLowering::lowerKernargMemParameter(
2021 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2022 uint64_t Offset, Align Alignment, bool Signed,
2023 const ISD::InputArg *Arg) const {
2025
 2026 // Try to avoid using an extload by loading earlier than the argument address
 2027 // and extracting the relevant bits. The load should hopefully be merged with
 2028 // the load for the previous argument.
2029 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2030 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2031 int64_t AlignDownOffset = alignDown(Offset, 4);
2032 int64_t OffsetDiff = Offset - AlignDownOffset;
2033
2034 EVT IntVT = MemVT.changeTypeToInteger();
2035
2036 // TODO: If we passed in the base kernel offset we could have a better
2037 // alignment than 4, but we don't really need it.
2038 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2039 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr, PtrInfo, Align(4),
2042
2043 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2044 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2045
2046 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2047 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2048 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2049
2050
2051 return DAG.getMergeValues({ ArgVal, Load.getValue(1) }, SL);
2052 }
2053
2054 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2055 SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
2058
2059 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2060 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
2061}
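// Editorial sketch (not part of the lowering; the helper name is made up): given
// the 32-bit value loaded from alignDown(Offset, 4), a sub-dword kernel argument
// (here a 16-bit one) is recovered the same way the SRL + TRUNCATE sequence
// above does it. Assumes little-endian byte order, matching the target, and
// <cstdint> fixed-width types.
static uint16_t sketchExtractSubDwordKernArg(uint32_t AlignedDword,
                                             uint64_t Offset) {
  uint64_t AlignDownOffset = Offset & ~uint64_t(3); // alignDown(Offset, 4)
  uint64_t OffsetDiff = Offset - AlignDownOffset;   // 0..3 bytes into the dword
  return uint16_t(AlignedDword >> (OffsetDiff * 8)); // SRL, then TRUNCATE
}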
2062
2063SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
2064 const SDLoc &SL, SDValue Chain,
2065 const ISD::InputArg &Arg) const {
2067 MachineFrameInfo &MFI = MF.getFrameInfo();
2068
2069 if (Arg.Flags.isByVal()) {
2070 unsigned Size = Arg.Flags.getByValSize();
2071 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2072 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2073 }
2074
2075 unsigned ArgOffset = VA.getLocMemOffset();
2076 unsigned ArgSize = VA.getValVT().getStoreSize();
2077
2078 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2079
2080 // Create load nodes to retrieve arguments from the stack.
2081 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2082 SDValue ArgValue;
2083
 2084 // For NON_EXTLOAD, the generic code in getLoad asserts that ValVT == MemVT.
2086 MVT MemVT = VA.getValVT();
2087
2088 switch (VA.getLocInfo()) {
2089 default:
2090 break;
2091 case CCValAssign::BCvt:
2092 MemVT = VA.getLocVT();
2093 break;
2094 case CCValAssign::SExt:
2095 ExtType = ISD::SEXTLOAD;
2096 break;
2097 case CCValAssign::ZExt:
2098 ExtType = ISD::ZEXTLOAD;
2099 break;
2100 case CCValAssign::AExt:
2101 ExtType = ISD::EXTLOAD;
2102 break;
2103 }
2104
2105 ArgValue = DAG.getExtLoad(
2106 ExtType, SL, VA.getLocVT(), Chain, FIN,
2108 MemVT);
2109 return ArgValue;
2110}
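// In short, the CCValAssign::LocInfo picks the extension kind for the stack
// load above: SExt -> SEXTLOAD, ZExt -> ZEXTLOAD, AExt -> EXTLOAD, BCvt loads
// with the location VT instead of the value VT, and everything else is a plain
// NON_EXTLOAD of the value VT.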
2111
2112SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
2113 const SIMachineFunctionInfo &MFI,
2114 EVT VT,
2116 const ArgDescriptor *Reg = nullptr;
2117 const TargetRegisterClass *RC;
2118 LLT Ty;
2119
2121 const ArgDescriptor WorkGroupIDX =
2122 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2123 // If GridZ is not programmed in an entry function then the hardware will set
2124 // it to all zeros, so there is no need to mask the GridY value in the low
2125 // order bits.
2126 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2127 AMDGPU::TTMP7,
2128 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2129 const ArgDescriptor WorkGroupIDZ =
2130 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2131 if (Subtarget->hasArchitectedSGPRs() &&
2133 switch (PVID) {
2135 Reg = &WorkGroupIDX;
2136 RC = &AMDGPU::SReg_32RegClass;
2137 Ty = LLT::scalar(32);
2138 break;
2140 Reg = &WorkGroupIDY;
2141 RC = &AMDGPU::SReg_32RegClass;
2142 Ty = LLT::scalar(32);
2143 break;
2145 Reg = &WorkGroupIDZ;
2146 RC = &AMDGPU::SReg_32RegClass;
2147 Ty = LLT::scalar(32);
2148 break;
2149 default:
2150 break;
2151 }
2152 }
2153
2154 if (!Reg)
2155 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2156 if (!Reg) {
2158 // It's possible for a kernarg intrinsic call to appear in a kernel with
2159 // no allocated segment, in which case we do not add the user sgpr
2160 // argument, so just return null.
2161 return DAG.getConstant(0, SDLoc(), VT);
2162 }
2163
2164 // It's undefined behavior if a function marked with the amdgpu-no-*
2165 // attributes uses the corresponding intrinsic.
2166 return DAG.getUNDEF(VT);
2167 }
2168
2169 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2170}
2171
2173 CallingConv::ID CallConv,
2174 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2175 FunctionType *FType,
2176 SIMachineFunctionInfo *Info) {
2177 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2178 const ISD::InputArg *Arg = &Ins[I];
2179
2180 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2181 "vector type argument should have been split");
2182
2183 // First check if it's a PS input addr.
2184 if (CallConv == CallingConv::AMDGPU_PS &&
2185 !Arg->Flags.isInReg() && PSInputNum <= 15) {
2186 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2187
2188 // Inconveniently only the first part of the split is marked as isSplit,
2189 // so skip to the end. We only want to increment PSInputNum once for the
2190 // entire split argument.
2191 if (Arg->Flags.isSplit()) {
2192 while (!Arg->Flags.isSplitEnd()) {
2193 assert((!Arg->VT.isVector() ||
2194 Arg->VT.getScalarSizeInBits() == 16) &&
2195 "unexpected vector split in ps argument type");
2196 if (!SkipArg)
2197 Splits.push_back(*Arg);
2198 Arg = &Ins[++I];
2199 }
2200 }
2201
2202 if (SkipArg) {
2203 // We can safely skip PS inputs.
2204 Skipped.set(Arg->getOrigArgIndex());
2205 ++PSInputNum;
2206 continue;
2207 }
2208
2209 Info->markPSInputAllocated(PSInputNum);
2210 if (Arg->Used)
2211 Info->markPSInputEnabled(PSInputNum);
2212
2213 ++PSInputNum;
2214 }
2215
2216 Splits.push_back(*Arg);
2217 }
2218}
2219
2220// Allocate special inputs passed in VGPRs.
2222 MachineFunction &MF,
2223 const SIRegisterInfo &TRI,
2224 SIMachineFunctionInfo &Info) const {
2225 const LLT S32 = LLT::scalar(32);
2227
2228 if (Info.hasWorkItemIDX()) {
2229 Register Reg = AMDGPU::VGPR0;
2230 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2231
2232 CCInfo.AllocateReg(Reg);
2233 unsigned Mask = (Subtarget->hasPackedTID() &&
2234 Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2235 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2236 }
2237
2238 if (Info.hasWorkItemIDY()) {
2239 assert(Info.hasWorkItemIDX());
2240 if (Subtarget->hasPackedTID()) {
2241 Info.setWorkItemIDY(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2242 0x3ff << 10));
2243 } else {
2244 unsigned Reg = AMDGPU::VGPR1;
2245 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2246
2247 CCInfo.AllocateReg(Reg);
2248 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2249 }
2250 }
2251
2252 if (Info.hasWorkItemIDZ()) {
2253 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2254 if (Subtarget->hasPackedTID()) {
2255 Info.setWorkItemIDZ(ArgDescriptor::createRegister(AMDGPU::VGPR0,
2256 0x3ff << 20));
2257 } else {
2258 unsigned Reg = AMDGPU::VGPR2;
2259 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2260
2261 CCInfo.AllocateReg(Reg);
2262 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2263 }
2264 }
2265}
2266
2267// Try to allocate a VGPR at the end of the argument list, or, if no argument
2268// VGPRs are left, allocate a stack slot instead.
2269// If \p Mask is given it indicates the bitfield position in the register.
2270// If \p Arg is given, reuse it with the new \p Mask instead of allocating anew.
2271static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2272 ArgDescriptor Arg = ArgDescriptor()) {
2273 if (Arg.isSet())
2274 return ArgDescriptor::createArg(Arg, Mask);
2275
2276 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2277 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2278 if (RegIdx == ArgVGPRs.size()) {
2279 // Spill to stack required.
2280 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2281
2282 return ArgDescriptor::createStack(Offset, Mask);
2283 }
2284
2285 unsigned Reg = ArgVGPRs[RegIdx];
2286 Reg = CCInfo.AllocateReg(Reg);
2287 assert(Reg != AMDGPU::NoRegister);
2288
2289 MachineFunction &MF = CCInfo.getMachineFunction();
2290 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2291 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2292 return ArgDescriptor::createRegister(Reg, Mask);
2293}
2294
2296 const TargetRegisterClass *RC,
2297 unsigned NumArgRegs) {
2298 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2299 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2300 if (RegIdx == ArgSGPRs.size())
2301 report_fatal_error("ran out of SGPRs for arguments");
2302
2303 unsigned Reg = ArgSGPRs[RegIdx];
2304 Reg = CCInfo.AllocateReg(Reg);
2305 assert(Reg != AMDGPU::NoRegister);
2306
2307 MachineFunction &MF = CCInfo.getMachineFunction();
2308 MF.addLiveIn(Reg, RC);
2310}
2311
2312// If this has a fixed position, we still should allocate the register in the
2313// CCInfo state. Technically we could get away with this for values passed
2314// outside of the normal argument range.
2316 const TargetRegisterClass *RC,
2317 MCRegister Reg) {
2318 Reg = CCInfo.AllocateReg(Reg);
2319 assert(Reg != AMDGPU::NoRegister);
2320 MachineFunction &MF = CCInfo.getMachineFunction();
2321 MF.addLiveIn(Reg, RC);
2322}
2323
2324static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2325 if (Arg) {
2326 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2327 Arg.getRegister());
2328 } else
2329 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2330}
2331
2332static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2333 if (Arg) {
2334 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2335 Arg.getRegister());
2336 } else
2337 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2338}
2339
2340/// Allocate implicit function VGPR arguments at the end of allocated user
2341/// arguments.
2343 CCState &CCInfo, MachineFunction &MF,
2344 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2345 const unsigned Mask = 0x3ff;
2346 ArgDescriptor Arg;
2347
2348 if (Info.hasWorkItemIDX()) {
2349 Arg = allocateVGPR32Input(CCInfo, Mask);
2350 Info.setWorkItemIDX(Arg);
2351 }
2352
2353 if (Info.hasWorkItemIDY()) {
2354 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2355 Info.setWorkItemIDY(Arg);
2356 }
2357
2358 if (Info.hasWorkItemIDZ())
2359 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2360}
2361
2362/// Allocate implicit function VGPR arguments in fixed registers.
2364 CCState &CCInfo, MachineFunction &MF,
2365 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2366 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2367 if (!Reg)
2368 report_fatal_error("failed to allocated VGPR for implicit arguments");
2369
2370 const unsigned Mask = 0x3ff;
2371 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2372 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2373 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2374}
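// Editorial sketch (helper names made up): with the fixed allocation above, all
// three workitem IDs share one 32-bit VGPR; X occupies bits 9:0, Y bits 19:10,
// and Z bits 29:20.
struct SketchWorkItemIDs { uint32_t X, Y, Z; };
static SketchWorkItemIDs sketchUnpackWorkItemIDs(uint32_t Packed) {
  return {Packed & 0x3ffu,           // Mask       -> X, bits 9:0
          (Packed >> 10) & 0x3ffu,   // Mask << 10 -> Y, bits 19:10
          (Packed >> 20) & 0x3ffu};  // Mask << 20 -> Z, bits 29:20
}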
2375
2377 CCState &CCInfo,
2378 MachineFunction &MF,
2379 const SIRegisterInfo &TRI,
2380 SIMachineFunctionInfo &Info) const {
2381 auto &ArgInfo = Info.getArgInfo();
2382 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2383
2384 // TODO: Unify handling with private memory pointers.
2385 if (UserSGPRInfo.hasDispatchPtr())
2386 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2387
2388 const Module *M = MF.getFunction().getParent();
2389 if (UserSGPRInfo.hasQueuePtr() &&
2391 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2392
2393 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2394 // constant offset from the kernarg segment.
2395 if (Info.hasImplicitArgPtr())
2396 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2397
2398 if (UserSGPRInfo.hasDispatchID())
2399 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2400
2401 // flat_scratch_init is not applicable for non-kernel functions.
2402
2403 if (Info.hasWorkGroupIDX())
2404 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2405
2406 if (Info.hasWorkGroupIDY())
2407 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2408
2409 if (Info.hasWorkGroupIDZ())
2410 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2411
2412 if (Info.hasLDSKernelId())
2413 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2414}
2415
2416// Allocate special inputs passed in user SGPRs.
2418 MachineFunction &MF,
2419 const SIRegisterInfo &TRI,
2420 SIMachineFunctionInfo &Info) const {
2421 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2422 if (UserSGPRInfo.hasImplicitBufferPtr()) {
2423 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
2424 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
2425 CCInfo.AllocateReg(ImplicitBufferPtrReg);
2426 }
2427
2428 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
2429 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
2430 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
2431 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
2432 CCInfo.AllocateReg(PrivateSegmentBufferReg);
2433 }
2434
2435 if (UserSGPRInfo.hasDispatchPtr()) {
2436 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
2437 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
2438 CCInfo.AllocateReg(DispatchPtrReg);
2439 }
2440
2441 const Module *M = MF.getFunction().getParent();
2442 if (UserSGPRInfo.hasQueuePtr() &&
2444 Register QueuePtrReg = Info.addQueuePtr(TRI);
2445 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
2446 CCInfo.AllocateReg(QueuePtrReg);
2447 }
2448
2449 if (UserSGPRInfo.hasKernargSegmentPtr()) {
2451 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
2452 CCInfo.AllocateReg(InputPtrReg);
2453
2454 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
2455 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
2456 }
2457
2458 if (UserSGPRInfo.hasDispatchID()) {
2459 Register DispatchIDReg = Info.addDispatchID(TRI);
2460 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
2461 CCInfo.AllocateReg(DispatchIDReg);
2462 }
2463
2464 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
2465 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
2466 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
2467 CCInfo.AllocateReg(FlatScratchInitReg);
2468 }
2469
2470 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
2471 // these from the dispatch pointer.
2472}
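// The checks above claim the user SGPRs in their fixed order: implicit buffer
// pointer, private segment buffer, dispatch pointer, queue pointer, kernarg
// segment pointer, dispatch ID, and flat scratch init, each only when the
// corresponding GCNUserSGPRUsageInfo bit is set.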
2473
2474// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
2475// sequential, starting from the first argument.
2477 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
2479 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
2480 Function &F = MF.getFunction();
2481 unsigned LastExplicitArgOffset =
2482 MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
2483 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
2484 bool InPreloadSequence = true;
2485 unsigned InIdx = 0;
2486 for (auto &Arg : F.args()) {
2487 if (!InPreloadSequence || !Arg.hasInRegAttr())
2488 break;
2489
2490 int ArgIdx = Arg.getArgNo();
2491 // Don't preload non-original args or parts not in the current preload
2492 // sequence.
2493 if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
2494 (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
2495 break;
2496
2497 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
2498 (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
2499 InIdx++) {
2500 assert(ArgLocs[ArgIdx].isMemLoc());
2501 auto &ArgLoc = ArgLocs[InIdx];
2502 const Align KernelArgBaseAlign = Align(16);
2503 unsigned ArgOffset = ArgLoc.getLocMemOffset();
2504 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
2505 unsigned NumAllocSGPRs =
2506 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
2507
2508 // Arg is preloaded into the previous SGPR.
2509 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
2510 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
2511 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
2512 continue;
2513 }
2514
2515 unsigned Padding = ArgOffset - LastExplicitArgOffset;
2516 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
2517 // Check for free user SGPRs for preloading.
2518 if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
2519 SGPRInfo.getNumFreeUserSGPRs()) {
2520 InPreloadSequence = false;
2521 break;
2522 }
2523
2524 // Preload this argument.
2525 const TargetRegisterClass *RC =
2526 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
2527 SmallVectorImpl<MCRegister> *PreloadRegs =
2528 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
2529
2530 if (PreloadRegs->size() > 1)
2531 RC = &AMDGPU::SGPR_32RegClass;
2532 for (auto &Reg : *PreloadRegs) {
2533 assert(Reg);
2534 MF.addLiveIn(Reg, RC);
2535 CCInfo.AllocateReg(Reg);
2536 }
2537
2538 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
2539 }
2540 }
2541}
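// Editorial sketch of the free-SGPR check above (the helper name and flattened
// parameter list are made up; the real bookkeeping lives in
// GCNUserSGPRUsageInfo and SIMachineFunctionInfo).
static bool sketchFitsInFreeUserSGPRs(unsigned ArgSizeInBits, unsigned ArgOffset,
                                      unsigned LastExplicitArgOffset,
                                      unsigned NumFreeUserSGPRs) {
  unsigned NumAllocSGPRs = (ArgSizeInBits + 31) / 32;   // alignTo(Bits, 32) / 32
  unsigned Padding = ArgOffset - LastExplicitArgOffset; // gap to previous arg
  unsigned PaddingSGPRs = (Padding + 3) / 4;            // alignTo(Bytes, 4) / 4
  return PaddingSGPRs + NumAllocSGPRs + 1 /*synthetic*/ <= NumFreeUserSGPRs;
}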
2542
2544 const SIRegisterInfo &TRI,
2545 SIMachineFunctionInfo &Info) const {
2546 // Always allocate this last since it is a synthetic preload.
2547 if (Info.hasLDSKernelId()) {
2548 Register Reg = Info.addLDSKernelId();
2549 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2550 CCInfo.AllocateReg(Reg);
2551 }
2552}
2553
2554// Allocate special input registers that are initialized per-wave.
2556 MachineFunction &MF,
2558 CallingConv::ID CallConv,
2559 bool IsShader) const {
2560 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
2561 if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
2562 // Note: user SGPRs are handled by the front-end for graphics shaders
2563 // Pad up the used user SGPRs with dead inputs.
2564
2565 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
2566 // before enabling architected SGPRs for workgroup IDs.
2567 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
2568
2569 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
2570 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
 2571 // rely on it to reach 16 since, if we end up having no stack usage, it will
2572 // not really be added.
2573 unsigned NumRequiredSystemSGPRs = Info.hasWorkGroupIDX() +
2574 Info.hasWorkGroupIDY() +
2575 Info.hasWorkGroupIDZ() +
2576 Info.hasWorkGroupInfo();
2577 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
2578 Register Reg = Info.addReservedUserSGPR();
2579 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2580 CCInfo.AllocateReg(Reg);
2581 }
2582 }
2583
2584 if (!HasArchitectedSGPRs) {
2585 if (Info.hasWorkGroupIDX()) {
2586 Register Reg = Info.addWorkGroupIDX();
2587 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2588 CCInfo.AllocateReg(Reg);
2589 }
2590
2591 if (Info.hasWorkGroupIDY()) {
2592 Register Reg = Info.addWorkGroupIDY();
2593 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2594 CCInfo.AllocateReg(Reg);
2595 }
2596
2597 if (Info.hasWorkGroupIDZ()) {
2598 Register Reg = Info.addWorkGroupIDZ();
2599 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2600 CCInfo.AllocateReg(Reg);
2601 }
2602 }
2603
2604 if (Info.hasWorkGroupInfo()) {
2605 Register Reg = Info.addWorkGroupInfo();
2606 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
2607 CCInfo.AllocateReg(Reg);
2608 }
2609
2610 if (Info.hasPrivateSegmentWaveByteOffset()) {
2611 // Scratch wave offset passed in system SGPR.
2612 unsigned PrivateSegmentWaveByteOffsetReg;
2613
2614 if (IsShader) {
2615 PrivateSegmentWaveByteOffsetReg =
2616 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
2617
2618 // This is true if the scratch wave byte offset doesn't have a fixed
2619 // location.
2620 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
2621 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
2622 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
2623 }
2624 } else
2625 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
2626
2627 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
2628 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
2629 }
2630
2631 assert(!Subtarget->hasUserSGPRInit16Bug() || IsShader ||
2632 Info.getNumPreloadedSGPRs() >= 16);
2633}
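// Editorial sketch (helper name made up): the number of dead user SGPRs the
// UserSGPRInit16Bug padding loop above adds so the preloaded SGPR count
// reaches 16.
static unsigned sketchNumPadUserSGPRs(unsigned CurrentUserSGPRs,
                                      unsigned NumRequiredSystemSGPRs) {
  unsigned Used = CurrentUserSGPRs + NumRequiredSystemSGPRs;
  return Used < 16 ? 16 - Used : 0;
}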
2634
2636 MachineFunction &MF,
2637 const SIRegisterInfo &TRI,
2638 SIMachineFunctionInfo &Info) {
 2639 // Now that we've figured out where the scratch register inputs are, see if
 2640 // we should reserve the arguments and use them directly.
2641 MachineFrameInfo &MFI = MF.getFrameInfo();
2642 bool HasStackObjects = MFI.hasStackObjects();
2643 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2644
2645 // Record that we know we have non-spill stack objects so we don't need to
2646 // check all stack objects later.
2647 if (HasStackObjects)
2648 Info.setHasNonSpillStackObjects(true);
2649
2650 // Everything live out of a block is spilled with fast regalloc, so it's
2651 // almost certain that spilling will be required.
2652 if (TM.getOptLevel() == CodeGenOptLevel::None)
2653 HasStackObjects = true;
2654
2655 // For now assume stack access is needed in any callee functions, so we need
2656 // the scratch registers to pass in.
2657 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
2658
2659 if (!ST.enableFlatScratch()) {
2660 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
2661 // If we have stack objects, we unquestionably need the private buffer
2662 // resource. For the Code Object V2 ABI, this will be the first 4 user
2663 // SGPR inputs. We can reserve those and use them directly.
2664
2665 Register PrivateSegmentBufferReg =
2667 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
2668 } else {
2669 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
 2670 // We tentatively reserve the last registers (skipping those which may
 2671 // contain VCC, FLAT_SCR, and XNACK). After register allocation, we'll
 2672 // replace these with the ones immediately after those which were really
 2673 // allocated. In the prologue, copies will be inserted from the argument to
 2674 // these reserved registers.
2675
2676 // Without HSA, relocations are used for the scratch pointer and the
2677 // buffer resource setup is always inserted in the prologue. Scratch wave
2678 // offset is still in an input SGPR.
2679 Info.setScratchRSrcReg(ReservedBufferReg);
2680 }
2681 }
2682
2684
2685 // For entry functions we have to set up the stack pointer if we use it,
2686 // whereas non-entry functions get this "for free". This means there is no
2687 // intrinsic advantage to using S32 over S34 in cases where we do not have
2688 // calls but do need a frame pointer (i.e. if we are requested to have one
2689 // because frame pointer elimination is disabled). To keep things simple we
2690 // only ever use S32 as the call ABI stack pointer, and so using it does not
2691 // imply we need a separate frame pointer.
2692 //
2693 // Try to use s32 as the SP, but move it if it would interfere with input
2694 // arguments. This won't work with calls though.
2695 //
2696 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
2697 // registers.
2698 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
2699 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
2700 } else {
2702
2703 if (MFI.hasCalls())
2704 report_fatal_error("call in graphics shader with too many input SGPRs");
2705
2706 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
2707 if (!MRI.isLiveIn(Reg)) {
2708 Info.setStackPtrOffsetReg(Reg);
2709 break;
2710 }
2711 }
2712
2713 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
2714 report_fatal_error("failed to find register for SP");
2715 }
2716
2717 // hasFP should be accurate for entry functions even before the frame is
2718 // finalized, because it does not rely on the known stack size, only
2719 // properties like whether variable sized objects are present.
2720 if (ST.getFrameLowering()->hasFP(MF)) {
2721 Info.setFrameOffsetReg(AMDGPU::SGPR33);
2722 }
2723}
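// Net effect for entry functions: the scratch resource descriptor lives in the
// SGPR quad chosen above, SGPR32 is preferred as the call ABI stack pointer
// (moved to the first SGPR that is not a live-in if it would clash with
// inputs), and SGPR33 becomes the frame pointer only when frame lowering
// decides one is needed.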
2724
2727 return !Info->isEntryFunction();
2728}
2729
2731
2732}
2733
2735 MachineBasicBlock *Entry,
2736 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
2738
2739 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
2740 if (!IStart)
2741 return;
2742
2743 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
2744 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
2745 MachineBasicBlock::iterator MBBI = Entry->begin();
2746 for (const MCPhysReg *I = IStart; *I; ++I) {
2747 const TargetRegisterClass *RC = nullptr;
2748 if (AMDGPU::SReg_64RegClass.contains(*I))
2749 RC = &AMDGPU::SGPR_64RegClass;
2750 else if (AMDGPU::SReg_32RegClass.contains(*I))
2751 RC = &AMDGPU::SGPR_32RegClass;
2752 else
2753 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2754
2755 Register NewVR = MRI->createVirtualRegister(RC);
2756 // Create copy from CSR to a virtual register.
2757 Entry->addLiveIn(*I);
2758 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
2759 .addReg(*I);
2760
2761 // Insert the copy-back instructions right before the terminator.
2762 for (auto *Exit : Exits)
2763 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
2764 TII->get(TargetOpcode::COPY), *I)
2765 .addReg(NewVR);
2766 }
2767}
2768
2770 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2771 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
2772 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2774
2776 const Function &Fn = MF.getFunction();
2779
2780 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
2781 DiagnosticInfoUnsupported NoGraphicsHSA(
2782 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
2783 DAG.getContext()->diagnose(NoGraphicsHSA);
2784 return DAG.getEntryNode();
2785 }
2786
2789 BitVector Skipped(Ins.size());
2790 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2791 *DAG.getContext());
2792
2793 bool IsGraphics = AMDGPU::isGraphics(CallConv);
2794 bool IsKernel = AMDGPU::isKernel(CallConv);
2795 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
2796
2797 if (IsGraphics) {
2798 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
2799 assert(!UserSGPRInfo.hasDispatchPtr() &&
2800 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
2801 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
2802 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
2803 (void)UserSGPRInfo;
2804 if (!Subtarget->enableFlatScratch())
2805 assert(!UserSGPRInfo.hasFlatScratchInit());
2806 if ((CallConv != CallingConv::AMDGPU_CS &&
2807 CallConv != CallingConv::AMDGPU_Gfx) ||
2808 !Subtarget->hasArchitectedSGPRs())
2809 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
2810 !Info->hasWorkGroupIDZ());
2811 }
2812
2813 if (CallConv == CallingConv::AMDGPU_PS) {
2814 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
2815
2816 // At least one interpolation mode must be enabled or else the GPU will
2817 // hang.
2818 //
2819 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
2820 // set PSInputAddr, the user wants to enable some bits after the compilation
 2821 // based on run-time states. Since we can't know what the final PSInputEna
 2822 // will look like, we shouldn't do anything here; the user should take
2823 // responsibility for the correct programming.
2824 //
2825 // Otherwise, the following restrictions apply:
2826 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
2827 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
2828 // enabled too.
2829 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
2830 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
2831 CCInfo.AllocateReg(AMDGPU::VGPR0);
2832 CCInfo.AllocateReg(AMDGPU::VGPR1);
2833 Info->markPSInputAllocated(0);
2834 Info->markPSInputEnabled(0);
2835 }
2836 if (Subtarget->isAmdPalOS()) {
2837 // For isAmdPalOS, the user does not enable some bits after compilation
2838 // based on run-time states; the register values being generated here are
2839 // the final ones set in hardware. Therefore we need to apply the
2840 // workaround to PSInputAddr and PSInputEnable together. (The case where
2841 // a bit is set in PSInputAddr but not PSInputEnable is where the
2842 // frontend set up an input arg for a particular interpolation mode, but
2843 // nothing uses that input arg. Really we should have an earlier pass
2844 // that removes such an arg.)
2845 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
2846 if ((PsInputBits & 0x7F) == 0 ||
2847 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
2848 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
2849 }
2850 } else if (IsKernel) {
2851 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
2852 } else {
2853 Splits.append(Ins.begin(), Ins.end());
2854 }
2855
2856 if (IsKernel)
2857 analyzeFormalArgumentsCompute(CCInfo, Ins);
2858
2859 if (IsEntryFunc) {
2860 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
2861 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
2862 if (IsKernel && Subtarget->hasKernargPreload())
2863 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
2864
2865 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
2866 } else if (!IsGraphics) {
2867 // For the fixed ABI, pass workitem IDs in the last argument register.
2868 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
2869
2870 // FIXME: Sink this into allocateSpecialInputSGPRs
2871 if (!Subtarget->enableFlatScratch())
2872 CCInfo.AllocateReg(Info->getScratchRSrcReg());
2873
2874 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
2875 }
2876
2877 if (!IsKernel) {
2878 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
2879 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
2880 }
2881
2883
2884 // FIXME: This is the minimum kernel argument alignment. We should improve
2885 // this to the maximum alignment of the arguments.
2886 //
2887 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
2888 // kern arg offset.
2889 const Align KernelArgBaseAlign = Align(16);
2890
2891 for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
2892 const ISD::InputArg &Arg = Ins[i];
2893 if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
2894 InVals.push_back(DAG.getUNDEF(Arg.VT));
2895 continue;
2896 }
2897
2898 CCValAssign &VA = ArgLocs[ArgIdx++];
2899 MVT VT = VA.getLocVT();
2900
2901 if (IsEntryFunc && VA.isMemLoc()) {
2902 VT = Ins[i].VT;
2903 EVT MemVT = VA.getLocVT();
2904
2905 const uint64_t Offset = VA.getLocMemOffset();
2906 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
2907
2908 if (Arg.Flags.isByRef()) {
2909 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
2910
2911 const GCNTargetMachine &TM =
2912 static_cast<const GCNTargetMachine &>(getTargetMachine());
2913 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
2914 Arg.Flags.getPointerAddrSpace())) {
2917 }
2918
2919 InVals.push_back(Ptr);
2920 continue;
2921 }
2922
2923 SDValue NewArg;
2924 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
2925 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2926 // In this case the argument is packed into the previous preload SGPR.
2927 int64_t AlignDownOffset = alignDown(Offset, 4);
2928 int64_t OffsetDiff = Offset - AlignDownOffset;
2929 EVT IntVT = MemVT.changeTypeToInteger();
2930
2934 Register Reg =
2935 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
2936
2937 assert(Reg);
2938 Register VReg = MRI.getLiveInVirtReg(Reg);
2939 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2940
2941 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
2942 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
2943
2944 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
2945 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
2946 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
2947 Ins[i].Flags.isSExt(), &Ins[i]);
2948
2949 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
2950 } else {
2954 const SmallVectorImpl<MCRegister> &PreloadRegs =
2955 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
2956
2957 SDValue Copy;
2958 if (PreloadRegs.size() == 1) {
2959 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
2960 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
2961 NewArg = DAG.getCopyFromReg(
2962 Chain, DL, VReg,
2964 TRI->getRegSizeInBits(*RC)));
2965
2966 } else {
2967 // If the kernarg alignment does not match the alignment of the SGPR
2968 // tuple RC that can accommodate this argument, it will be built up
 2969 // via copies from the individual SGPRs that the argument was
2970 // preloaded to.
2972 for (auto Reg : PreloadRegs) {
2973 Register VReg = MRI.getLiveInVirtReg(Reg);
2974 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
2975 Elts.push_back(Copy);
2976 }
2977 NewArg =
2978 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
2979 PreloadRegs.size()),
2980 DL, Elts);
2981 }
2982
2983 SDValue CMemVT;
2984 if (VT.isScalarInteger() && VT.bitsLT(NewArg.getSimpleValueType()))
2985 CMemVT = DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewArg);
2986 else
2987 CMemVT = DAG.getBitcast(MemVT, NewArg);
2988 NewArg = convertArgType(DAG, VT, MemVT, DL, CMemVT,
2989 Ins[i].Flags.isSExt(), &Ins[i]);
2990 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
2991 }
2992 } else {
2993 NewArg =
2994 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
2995 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
2996 }
2997 Chains.push_back(NewArg.getValue(1));
2998
2999 auto *ParamTy =
3000 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3002 ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3003 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
 3004 // On SI local pointers are just offsets into LDS, so they always fit in
 3005 // 16 bits. On CI and newer they could potentially be
3006 // real pointers, so we can't guarantee their size.
3007 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3008 DAG.getValueType(MVT::i16));
3009 }
3010
3011 InVals.push_back(NewArg);
3012 continue;
3013 } else if (!IsEntryFunc && VA.isMemLoc()) {
3014 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3015 InVals.push_back(Val);
3016 if (!Arg.Flags.isByVal())
3017 Chains.push_back(Val.getValue(1));
3018 continue;
3019 }
3020
3021 assert(VA.isRegLoc() && "Parameter must be in a register!");
3022
3023 Register Reg = VA.getLocReg();
3024 const TargetRegisterClass *RC = nullptr;
3025 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3026 RC = &AMDGPU::VGPR_32RegClass;
3027 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3028 RC = &AMDGPU::SGPR_32RegClass;
3029 else
3030 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3031 EVT ValVT = VA.getValVT();
3032
3033 Reg = MF.addLiveIn(Reg, RC);
3034 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3035
3036 if (Arg.Flags.isSRet()) {
3037 // The return object should be reasonably addressable.
3038
 3039 // FIXME: This helps when the return is a real sret. If it is an
 3040 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3041 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3042 unsigned NumBits
3044 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3045 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3046 }
3047
3048 // If this is an 8 or 16-bit value, it is really passed promoted
3049 // to 32 bits. Insert an assert[sz]ext to capture this, then
3050 // truncate to the right size.
3051 switch (VA.getLocInfo()) {
3052 case CCValAssign::Full:
3053 break;
3054 case CCValAssign::BCvt:
3055 Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val);
3056 break;
3057 case CCValAssign::SExt:
3058 Val = DAG.getNode(ISD::AssertSext, DL, VT, Val,
3059 DAG.getValueType(ValVT));
3060 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3061 break;
3062 case CCValAssign::ZExt:
3063 Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
3064 DAG.getValueType(ValVT));
3065 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3066 break;
3067 case CCValAssign::AExt:
3068 Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val);
3069 break;
3070 default:
3071 llvm_unreachable("Unknown loc info!");
3072 }
3073
3074 InVals.push_back(Val);
3075 }
3076
3077 // Start adding system SGPRs.
3078 if (IsEntryFunc)
3079 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3080
3081 auto &ArgUsageInfo =
3083 ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
3084
3085 unsigned StackArgSize = CCInfo.getStackSize();
3086 Info->setBytesInStackArgArea(StackArgSize);
3087
3088 return Chains.empty() ? Chain :
3089 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3090}
3091
3092// TODO: If return values can't fit in registers, we should return as many as
3093// possible in registers before passing on stack.
3095 CallingConv::ID CallConv,
3096 MachineFunction &MF, bool IsVarArg,
3098 LLVMContext &Context) const {
3099 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3100 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3101 // for shaders. Vector types should be explicitly handled by CC.
3102 if (AMDGPU::isEntryFunctionCC(CallConv))
3103 return true;
3104
3106 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3107 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3108 return false;
3109
3110 // We must use the stack if return would require unavailable registers.
3111 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3112 unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
3113 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3114 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3115 return false;
3116
3117 return true;
3118}
3119
3120SDValue
3122 bool isVarArg,
3124 const SmallVectorImpl<SDValue> &OutVals,
3125 const SDLoc &DL, SelectionDAG &DAG) const {
3128
3129 if (AMDGPU::isKernel(CallConv)) {
3130 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3131 OutVals, DL, DAG);
3132 }
3133
3134 bool IsShader = AMDGPU::isShader(CallConv);
3135
3136 Info->setIfReturnsVoid(Outs.empty());
3137 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3138
3139 // CCValAssign - represent the assignment of the return value to a location.
3142
3143 // CCState - Info about the registers and stack slots.
3144 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3145 *DAG.getContext());
3146
3147 // Analyze outgoing return values.
3148 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3149
3150 SDValue Glue;
3152 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3153
3154 // Copy the result values into the output registers.
3155 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3156 ++I, ++RealRVLocIdx) {
3157 CCValAssign &VA = RVLocs[I];
3158 assert(VA.isRegLoc() && "Can only return in registers!");
3159 // TODO: Partially return in registers if return values don't fit.
3160 SDValue Arg = OutVals[RealRVLocIdx];
3161
3162 // Copied from other backends.
3163 switch (VA.getLocInfo()) {
3164 case CCValAssign::Full:
3165 break;
3166 case CCValAssign::BCvt:
3167 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3168 break;
3169 case CCValAssign::SExt:
3170 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3171 break;
3172 case CCValAssign::ZExt:
3173 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3174 break;
3175 case CCValAssign::AExt:
3176 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3177 break;
3178 default:
3179 llvm_unreachable("Unknown loc info!");
3180 }
3181
3182 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3183 Glue = Chain.getValue(1);
3184 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3185 }
3186
3187 // FIXME: Does sret work properly?
3188 if (!Info->isEntryFunction()) {
3189 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3190 const MCPhysReg *I =
3191 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3192 if (I) {
3193 for (; *I; ++I) {
3194 if (AMDGPU::SReg_64RegClass.contains(*I))
3195 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3196 else if (AMDGPU::SReg_32RegClass.contains(*I))
3197 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3198 else
3199 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3200 }
3201 }
3202 }
3203
3204 // Update chain and glue.
3205 RetOps[0] = Chain;
3206 if (Glue.getNode())
3207 RetOps.push_back(Glue);
3208
3209 unsigned Opc = AMDGPUISD::ENDPGM;
3210 if (!IsWaveEnd)
3212 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3213}
3214
3216 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3217 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3218 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3219 SDValue ThisVal) const {
3220 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3221
3222 // Assign locations to each value returned by this call.
3224 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3225 *DAG.getContext());
3226 CCInfo.AnalyzeCallResult(Ins, RetCC);
3227
3228 // Copy all of the result registers out of their specified physreg.
3229 for (unsigned i = 0; i != RVLocs.size(); ++i) {
3230 CCValAssign VA = RVLocs[i];
3231 SDValue Val;
3232
3233 if (VA.isRegLoc()) {
3234 Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3235 Chain = Val.getValue(1);
3236 InGlue = Val.getValue(2);
3237 } else if (VA.isMemLoc()) {
3238 report_fatal_error("TODO: return values in memory");
3239 } else
3240 llvm_unreachable("unknown argument location type");
3241
3242 switch (VA.getLocInfo()) {
3243 case CCValAssign::Full:
3244 break;
3245 case CCValAssign::BCvt:
3246 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3247 break;
3248 case CCValAssign::ZExt:
3249 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3250 DAG.getValueType(VA.getValVT()));
3251 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3252 break;
3253 case CCValAssign::SExt:
3254 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3255 DAG.getValueType(VA.getValVT()));
3256 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3257 break;
3258 case CCValAssign::AExt:
3259 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3260 break;
3261 default:
3262 llvm_unreachable("Unknown loc info!");
3263 }
3264
3265 InVals.push_back(Val);
3266 }
3267
3268 return Chain;
3269}
3270
3271// Add code to pass special inputs that are required depending on the features
3272// used, separate from the explicit user arguments present in the IR.
3274 CallLoweringInfo &CLI,
3275 CCState &CCInfo,
3276 const SIMachineFunctionInfo &Info,
3277 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3278 SmallVectorImpl<SDValue> &MemOpChains,
3279 SDValue Chain) const {
3280 // If we don't have a call site, this was a call inserted by
3281 // legalization. These can never use special inputs.
3282 if (!CLI.CB)
3283 return;
3284
3285 SelectionDAG &DAG = CLI.DAG;
3286 const SDLoc &DL = CLI.DL;
3287 const Function &F = DAG.getMachineFunction().getFunction();
3288
3289 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3290 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3291
3292 const AMDGPUFunctionArgInfo *CalleeArgInfo
3294 if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
3295 auto &ArgUsageInfo =
3297 CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
3298 }
3299
3300 // TODO: Unify with private memory register handling. This is complicated by
3301 // the fact that at least in kernels, the input argument is not necessarily
3302 // in the same location as the input.
3303 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
 3305 {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
 3306 {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr"},
 3307 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
 3308 {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
 3309 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
 3310 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
 3311 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"},
 3312 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id"},
3313 };
3314
3315 for (auto Attr : ImplicitAttrs) {
3316 const ArgDescriptor *OutgoingArg;
3317 const TargetRegisterClass *ArgRC;
3318 LLT ArgTy;
3319
3320 AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
3321
3322 // If the callee does not use the attribute value, skip copying the value.
3323 if (CLI.CB->hasFnAttr(Attr.second))
3324 continue;
3325
3326 std::tie(OutgoingArg, ArgRC, ArgTy) =
3327 CalleeArgInfo->getPreloadedValue(InputID);
3328 if (!OutgoingArg)
3329 continue;
3330
3331 const ArgDescriptor *IncomingArg;
3332 const TargetRegisterClass *IncomingArgRC;
3333 LLT Ty;
3334 std::tie(IncomingArg, IncomingArgRC, Ty) =
3335 CallerArgInfo.getPreloadedValue(InputID);
3336 assert(IncomingArgRC == ArgRC);
3337
3338 // All special arguments are ints for now.
3339 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3340 SDValue InputReg;
3341
3342 if (IncomingArg) {
3343 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3344 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3345 // The implicit arg ptr is special because it doesn't have a corresponding
3346 // input for kernels, and is computed from the kernarg segment pointer.
3347 InputReg = getImplicitArgPtr(DAG, DL);
3348 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3349 std::optional<uint32_t> Id =
3351 if (Id.has_value()) {
3352 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3353 } else {
3354 InputReg = DAG.getUNDEF(ArgVT);
3355 }
3356 } else {
 3357 // We may have proven the input wasn't needed, although the ABI still
 3358 // requires it. We just need to allocate the register appropriately.
3359 InputReg = DAG.getUNDEF(ArgVT);
3360 }
3361
3362 if (OutgoingArg->isRegister()) {
3363 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3364 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3365 report_fatal_error("failed to allocate implicit input argument");
3366 } else {
3367 unsigned SpecialArgOffset =
3368 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3369 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3370 SpecialArgOffset);
3371 MemOpChains.push_back(ArgStore);
3372 }
3373 }
3374
 3375 // Pack workitem IDs into a single register, or pass them as is if already
3376 // packed.
3377 const ArgDescriptor *OutgoingArg;
3378 const TargetRegisterClass *ArgRC;
3379 LLT Ty;
3380
3381 std::tie(OutgoingArg, ArgRC, Ty) =
3383 if (!OutgoingArg)
3384 std::tie(OutgoingArg, ArgRC, Ty) =
3386 if (!OutgoingArg)
3387 std::tie(OutgoingArg, ArgRC, Ty) =
3389 if (!OutgoingArg)
3390 return;
3391
3392 const ArgDescriptor *IncomingArgX = std::get<0>(
3394 const ArgDescriptor *IncomingArgY = std::get<0>(
3396 const ArgDescriptor *IncomingArgZ = std::get<0>(
3398
3399 SDValue InputReg;
3400 SDLoc SL;
3401
3402 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
3403 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
3404 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
3405
3406 // If incoming ids are not packed we need to pack them.
3407 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
3408 NeedWorkItemIDX) {
3409 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
3410 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
3411 } else {
3412 InputReg = DAG.getConstant(0, DL, MVT::i32);
3413 }
3414 }
3415
3416 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
3417 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
3418 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
3419 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
3420 DAG.getShiftAmountConstant(10, MVT::i32, SL));
3421 InputReg = InputReg.getNode() ?
3422 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
3423 }
3424
3425 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
3426 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
3427 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
3428 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
3429 DAG.getShiftAmountConstant(20, MVT::i32, SL));
3430 InputReg = InputReg.getNode() ?
3431 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
3432 }
3433
3434 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
3435 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
3436 // We're in a situation where the outgoing function requires the workitem
 3437 // ID, but the calling function does not have it (e.g. a graphics function
3438 // calling a C calling convention function). This is illegal, but we need
3439 // to produce something.
3440 InputReg = DAG.getUNDEF(MVT::i32);
3441 } else {
 3442 // Workitem ids are already packed; any of the present incoming arguments
3443 // will carry all required fields.
3445 IncomingArgX ? *IncomingArgX :
3446 IncomingArgY ? *IncomingArgY :
3447 *IncomingArgZ, ~0u);
3448 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
3449 }
3450 }
3451
3452 if (OutgoingArg->isRegister()) {
3453 if (InputReg)
3454 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3455
3456 CCInfo.AllocateReg(OutgoingArg->getRegister());
3457 } else {
3458 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
3459 if (InputReg) {
3460 SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
3461 SpecialArgOffset);
3462 MemOpChains.push_back(ArgStore);
3463 }
3464 }
3465}
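// Editorial sketch (helper name made up) of the packing performed above: X
// stays in bits 9:0, Y is shifted into bits 19:10, and Z into bits 29:20
// before the values are OR'd together. Inputs are assumed to already be
// limited to 10 bits, as the incoming ArgDescriptors guarantee.
static uint32_t sketchPackWorkItemIDs(uint32_t X, uint32_t Y, uint32_t Z) {
  return X | (Y << 10) | (Z << 20);
}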
3466
3468 return CC == CallingConv::Fast;
3469}
3470
3471/// Return true if we might ever do TCO for calls with this calling convention.
3473 switch (CC) {
3474 case CallingConv::C:
3476 return true;
3477 default:
3478 return canGuaranteeTCO(CC);
3479 }
3480}
3481
3483 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
3485 const SmallVectorImpl<SDValue> &OutVals,
3486 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3487 if (AMDGPU::isChainCC(CalleeCC))
3488 return true;
3489
3490 if (!mayTailCallThisCC(CalleeCC))
3491 return false;
3492
3493 // For a divergent call target, we need to do a waterfall loop over the
3494 // possible callees which precludes us from using a simple jump.
3495 if (Callee->isDivergent())
3496 return false;
3497
3499 const Function &CallerF = MF.getFunction();
3500 CallingConv::ID CallerCC = CallerF.getCallingConv();
3502 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3503
 3504 // Kernels aren't callable, and don't have a live-in return address, so it
3505 // doesn't make sense to do a tail call with entry functions.
3506 if (!CallerPreserved)
3507 return false;
3508
3509 bool CCMatch = CallerCC == CalleeCC;
3510
3512 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3513 return true;
3514 return false;
3515 }
3516
3517 // TODO: Can we handle var args?
3518 if (IsVarArg)
3519 return false;
3520
3521 for (const Argument &Arg : CallerF.args()) {
3522 if (Arg.hasByValAttr())
3523 return false;
3524 }
3525
3526 LLVMContext &Ctx = *DAG.getContext();
3527
3528 // Check that the call results are passed in the same way.
3529 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
3530 CCAssignFnForCall(CalleeCC, IsVarArg),
3531 CCAssignFnForCall(CallerCC, IsVarArg)))
3532 return false;
3533
3534 // The callee has to preserve all registers the caller needs to preserve.
3535 if (!CCMatch) {
3536 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3537 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3538 return false;
3539 }
3540
3541 // Nothing more to check if the callee is taking no arguments.
3542 if (Outs.empty())
3543 return true;
3544
3546 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
3547
3548 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
3549
3550 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
3551 // If the stack arguments for this call do not fit into our own save area then
3552 // the call cannot be made tail.
3553 // TODO: Is this really necessary?
3554 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
3555 return false;
3556
3557 const MachineRegisterInfo &MRI = MF.getRegInfo();
3558 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
3559}
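// To summarize the checks above: chain calls are always eligible; otherwise the
// callee's calling convention must allow TCO, the callee must not be divergent,
// the call must not be variadic, the caller must not be an entry function or
// take byval arguments, results and callee-saved registers must be compatible
// between the two conventions, and the callee's stack arguments must fit within
// the caller's incoming argument area.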
3560
3562 if (!CI->isTailCall())
3563 return false;
3564
3565 const Function *ParentFn = CI->getParent()->getParent();
3567 return false;
3568 return true;
3569}
3570
3571// The wave scratch offset register is used as the global base pointer.
3573 SmallVectorImpl<SDValue> &InVals) const {
3574 CallingConv::ID CallConv = CLI.CallConv;
3575 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
3576
3577 SelectionDAG &DAG = CLI.DAG;
3578
3579 TargetLowering::ArgListEntry RequestedExec;
3580 if (IsChainCallConv) {
3581 // The last argument should be the value that we need to put in EXEC.
3582 // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
3583 // don't treat it like the rest of the arguments.
3584 RequestedExec = CLI.Args.back();
3585 assert(RequestedExec.Node && "No node for EXEC");
3586
3587 if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
3588 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
3589
3590 assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
3591 CLI.Outs.pop_back();
3592 CLI.OutVals.pop_back();
3593
3594 if (RequestedExec.Ty->isIntegerTy(64)) {
3595 assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
3596 CLI.Outs.pop_back();
3597 CLI.OutVals.pop_back();
3598 }
3599
3600 assert(CLI.Outs.back().OrigArgIndex != 2 &&
3601 "Haven't popped all the pieces of the EXEC mask");
3602 }
3603
3604 const SDLoc &DL = CLI.DL;
3606 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
3608 SDValue Chain = CLI.Chain;
3609 SDValue Callee = CLI.Callee;
3610 bool &IsTailCall = CLI.IsTailCall;
3611 bool IsVarArg = CLI.IsVarArg;
3612 bool IsSibCall = false;
3614
3615 if (Callee.isUndef() || isNullConstant(Callee)) {
3616 if (!CLI.IsTailCall) {
3617 for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
3618 InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
3619 }
3620
3621 return Chain;
3622 }
3623
3624 if (IsVarArg) {
3625 return lowerUnhandledCall(CLI, InVals,
3626 "unsupported call to variadic function ");
3627 }
3628
3629 if (!CLI.CB)
3630 report_fatal_error("unsupported libcall legalization");
3631
3632 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
3633 return lowerUnhandledCall(CLI, InVals,
3634 "unsupported required tail call to function ");
3635 }
3636
3637 if (IsTailCall) {
3639 Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
3640 if (!IsTailCall &&
3641 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
3642 report_fatal_error("failed to perform tail call elimination on a call "
3643 "site marked musttail or on llvm.amdgcn.cs.chain");
3644 }
3645
3646 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
3647
3648 // A sibling call is one where we're under the usual C ABI and not planning
3649 // to change that but can still do a tail call:
3650 if (!TailCallOpt && IsTailCall)
3651 IsSibCall = true;
3652
3653 if (IsTailCall)
3654 ++NumTailCalls;
3655 }
3656
3659 SmallVector<SDValue, 8> MemOpChains;
3660
3661 // Analyze operands of the call, assigning locations to each operand.
3663 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
3664 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
3665
3666 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
3667 // With a fixed ABI, allocate fixed registers before user arguments.
3668 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
3669 }
3670
3671 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
3672
3673 // Get a count of how many bytes are to be pushed on the stack.
3674 unsigned NumBytes = CCInfo.getStackSize();
3675
3676 if (IsSibCall) {
3677 // Since we're not changing the ABI to make this a tail call, the memory
3678 // operands are already available in the caller's incoming argument space.
3679 NumBytes = 0;
3680 }
3681
3682 // FPDiff is the byte offset of the call's argument area from the callee's.
3683 // Stores to callee stack arguments will be placed in FixedStackSlots offset
3684 // by this amount for a tail call. In a sibling call it must be 0 because the
3685 // caller will deallocate the entire stack and the callee still expects its
3686 // arguments to begin at SP+0. Completely unused for non-tail calls.
3687 int32_t FPDiff = 0;
3688 MachineFrameInfo &MFI = MF.getFrameInfo();
3689
3690 // Adjust the stack pointer for the new arguments...
3691 // These operations are automatically eliminated by the prolog/epilog pass
3692 if (!IsSibCall)
3693 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
3694
3695 if (!IsSibCall || IsChainCallConv) {
3696 if (!Subtarget->enableFlatScratch()) {
3697 SmallVector<SDValue, 4> CopyFromChains;
3698
3699 // In the HSA case, this should be an identity copy.
3700 SDValue ScratchRSrcReg
3701 = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
3702 RegsToPass.emplace_back(IsChainCallConv
3703 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
3704 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
3705 ScratchRSrcReg);
3706 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
3707 Chain = DAG.getTokenFactor(DL, CopyFromChains);
3708 }
3709 }
3710
3711 MVT PtrVT = MVT::i32;
3712
3713 // Walk the register/memloc assignments, inserting copies/loads.
3714 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3715 CCValAssign &VA = ArgLocs[i];
3716 SDValue Arg = OutVals[i];
3717
3718 // Promote the value if needed.
3719 switch (VA.getLocInfo()) {
3720 case CCValAssign::Full:
3721 break;
3722 case CCValAssign::BCvt:
3723 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3724 break;
3725 case CCValAssign::ZExt:
3726 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3727 break;
3728 case CCValAssign::SExt:
3729 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3730 break;
3731 case CCValAssign::AExt:
3732 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3733 break;
3734 case CCValAssign::FPExt:
3735 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
3736 break;
3737 default:
3738 llvm_unreachable("Unknown loc info!");
3739 }
3740
3741 if (VA.isRegLoc()) {
3742 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
3743 } else {
3744 assert(VA.isMemLoc());
3745
3746 SDValue DstAddr;
3747 MachinePointerInfo DstInfo;
3748
3749 unsigned LocMemOffset = VA.getLocMemOffset();
3750 int32_t Offset = LocMemOffset;
3751
3752 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
3753 MaybeAlign Alignment;
3754
3755 if (IsTailCall) {
3756 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3757 unsigned OpSize = Flags.isByVal() ?
3758 Flags.getByValSize() : VA.getValVT().getStoreSize();
3759
3760 // FIXME: We can do better than the minimum required byval alignment.
3761 Alignment =
3762 Flags.isByVal()
3763 ? Flags.getNonZeroByValAlign()
3764 : commonAlignment(Subtarget->getStackAlignment(), Offset);
3765
3766 Offset = Offset + FPDiff;
3767 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
3768
3769 DstAddr = DAG.getFrameIndex(FI, PtrVT);
3770 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
3771
3772 // Make sure any stack arguments overlapping with where we're storing
3773 // are loaded before this eventual operation. Otherwise they'll be
3774 // clobbered.
3775
3776 // FIXME: Why is this really necessary? This seems to just result in a
3777 // lot of code to copy the stack arguments and write them back to the same
3778 // locations, which are supposed to be immutable?
3779 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
3780 } else {
3781 // Stores to the argument stack area are relative to the stack pointer.
3782 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
3783 MVT::i32);
3784 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
3785 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
3786 Alignment =
3787 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
3788 }
3789
3790 if (Outs[i].Flags.isByVal()) {
3791 SDValue SizeNode =
3792 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
3793 SDValue Cpy =
3794 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
3795 Outs[i].Flags.getNonZeroByValAlign(),
3796 /*isVol = */ false, /*AlwaysInline = */ true,
3797 /*isTailCall = */ false, DstInfo,
3799
3800 MemOpChains.push_back(Cpy);
3801 } else {
3802 SDValue Store =
3803 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
3804 MemOpChains.push_back(Store);
3805 }
3806 }
3807 }
3808
3809 if (!MemOpChains.empty())
3810 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
3811
3812 // Build a sequence of copy-to-reg nodes chained together with token chain
3813 // and flag operands which copy the outgoing args into the appropriate regs.
3814 SDValue InGlue;
3815 for (auto &RegToPass : RegsToPass) {
3816 Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
3817 RegToPass.second, InGlue);
3818 InGlue = Chain.getValue(1);
3819 }
3820
3821
3822 // We don't usually want to end the call-sequence here because we would tidy
3823 // the frame up *after* the call; however, in the ABI-changing tail-call case
3824 // we've carefully laid out the parameters so that when sp is reset they'll be
3825 // in the correct location.
3826 if (IsTailCall && !IsSibCall) {
3827 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
3828 InGlue = Chain.getValue(1);
3829 }
3830
3831 std::vector<SDValue> Ops;
3832 Ops.push_back(Chain);
3833 Ops.push_back(Callee);
3834 // Add a redundant copy of the callee global which will not be legalized, as
3835 // we need direct access to the callee later.
3836 if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
3837 const GlobalValue *GV = GSD->getGlobal();
3838 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
3839 } else {
3840 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
3841 }
3842
3843 if (IsTailCall) {
3844 // Each tail call may have to adjust the stack by a different amount, so
3845 // this information must travel along with the operation for eventual
3846 // consumption by emitEpilogue.
3847 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
3848 }
3849
3850 if (IsChainCallConv)
3851 Ops.push_back(RequestedExec.Node);
3852
3853 // Add argument registers to the end of the list so that they are known live
3854 // into the call.
3855 for (auto &RegToPass : RegsToPass) {
3856 Ops.push_back(DAG.getRegister(RegToPass.first,
3857 RegToPass.second.getValueType()));
3858 }
3859
3860 // Add a register mask operand representing the call-preserved registers.
3861 auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
3862 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
3863 assert(Mask && "Missing call preserved mask for calling convention");
3864 Ops.push_back(DAG.getRegisterMask(Mask));
3865
3866 if (InGlue.getNode())
3867 Ops.push_back(InGlue);
3868
3869 // NOTE: This potentially results in *two* glue operands, and the wrong one
3870 // might possibly show up where the other was intended. In particular,
3871 // Emitter::EmitMachineNode() expects only the glued convergence token if it
3872 // exists. Similarly, the selection of the call expects to match only the
3873 // InGlue operand if it exists.
3874 if (SDValue Token = CLI.ConvergenceControlToken) {
3875 Ops.push_back(SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE,
3876 DL, MVT::Glue, Token),
3877 0));
3878 }
3879
3880 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3881
3882 // If we're doing a tail call, use a TC_RETURN here rather than an
3883 // actual call instruction.
3884 if (IsTailCall) {
3885 MFI.setHasTailCall();
3886 unsigned OPC = AMDGPUISD::TC_RETURN;
3887 switch (CallConv) {
3890 break;
3894 break;
3895 }
3896
3897 return DAG.getNode(OPC, DL, NodeTys, Ops);
3898 }
3899
3900 // Returns a chain and a flag for retval copy to use.
3901 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
3902 Chain = Call.getValue(0);
3903 InGlue = Call.getValue(1);
3904
3905 uint64_t CalleePopBytes = NumBytes;
3906 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
3907 if (!Ins.empty())
3908 InGlue = Chain.getValue(1);
3909
3910 // Handle result values, copying them out of physregs into vregs that we
3911 // return.
3912 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
3913 InVals, /*IsThisReturn=*/false, SDValue());
3914}
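As a quick illustration of the two stack-argument paths above: for a tail call the store goes into a fixed frame object at LocMemOffset + FPDiff, while for a regular call it is addressed relative to the stack pointer at LocMemOffset. The following is a minimal host-side sketch of just that offset arithmetic; OutgoingArgAddr and computeOutgoingArgAddr are hypothetical names, not part of this file.

#include <cstdint>

// Sketch only: mirrors the offset arithmetic used when placing outgoing
// stack arguments. FPDiff is the byte offset of the call's argument area
// from the callee's; it is 0 for sibling calls and for non-tail calls.
struct OutgoingArgAddr {
  bool UseFixedFrameObject; // tail call: store into a fixed stack slot
  int32_t Offset;           // byte offset of the store
};

static OutgoingArgAddr computeOutgoingArgAddr(bool IsTailCall,
                                              int32_t LocMemOffset,
                                              int32_t FPDiff) {
  if (IsTailCall)
    return {true, LocMemOffset + FPDiff}; // FixedStackSlot offset by FPDiff
  return {false, LocMemOffset};           // SP-relative store
}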
3915
3916// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
3917// except for applying the wave size scale to the increment amount.
3919 SDValue Op, SelectionDAG &DAG) const {
3920 const MachineFunction &MF = DAG.getMachineFunction();
3922
3923 SDLoc dl(Op);
3924 EVT VT = Op.getValueType();
3925 SDValue Tmp1 = Op;
3926 SDValue Tmp2 = Op.getValue(1);
3927 SDValue Tmp3 = Op.getOperand(2);
3928 SDValue Chain = Tmp1.getOperand(0);
3929
3930 Register SPReg = Info->getStackPtrOffsetReg();
3931
3932 // Chain the dynamic stack allocation so that it doesn't modify the stack
3933 // pointer when other instructions are using the stack.
3934 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
3935
3936 SDValue Size = Tmp2.getOperand(1);
3937 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
3938 Chain = SP.getValue(1);
3939 MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
3940 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
3941 unsigned Opc =
3944
3945 SDValue ScaledSize = DAG.getNode(
3946 ISD::SHL, dl, VT, Size,
3947 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
3948
3949 Align StackAlign = TFL->getStackAlign();
3950 Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
3951 if (Alignment && *Alignment > StackAlign) {
3952 Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
3953 DAG.getConstant(-(uint64_t)Alignment->value()
3954 << Subtarget->getWavefrontSizeLog2(),
3955 dl, VT));
3956 }
3957
3958 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
3959 Tmp2 = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
3960
3961 return DAG.getMergeValues({Tmp1, Tmp2}, dl);
3962}
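The only difference from the generic DYNAMIC_STACKALLOC expansion is that the requested size (and any extra alignment mask) is scaled by the wavefront size, because the stack pointer here is a per-wave byte offset. A small sketch of that arithmetic follows; bumpWaveStackPointer is a hypothetical helper, and it assumes a grows-up stack (the real code picks add or sub from the stack growth direction) and that the extra alignment exceeds the default stack alignment.

#include <cstdint>

// Sketch: scale a per-lane allocation size to the per-wave stack pointer
// increment, then apply the alignment mask. WavefrontSizeLog2 would be 5
// (wave32) or 6 (wave64); Alignment is the requested per-lane alignment.
static uint64_t bumpWaveStackPointer(uint64_t SP, uint64_t SizePerLane,
                                     uint64_t Alignment,
                                     unsigned WavefrontSizeLog2) {
  uint64_t ScaledSize = SizePerLane << WavefrontSizeLog2;
  uint64_t NewSP = SP + ScaledSize; // ISD::ADD path (stack grows up)
  if (Alignment > 1) {
    // Same mask as the ISD::AND above: -(Alignment << WavefrontSizeLog2).
    uint64_t Mask = ~((Alignment << WavefrontSizeLog2) - 1);
    NewSP &= Mask;
  }
  return NewSP;
}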
3963
3965 SelectionDAG &DAG) const {
3966 // We only handle constant sizes here to allow non-entry block, static sized
3967 // allocas. A truly dynamic value is more difficult to support because we
3968 // don't know if the size value is uniform or not. If the size isn't uniform,
3969 // we would need to do a wave reduction to get the maximum size to know how
3970 // much to increment the uniform stack pointer.
3971 SDValue Size = Op.getOperand(1);
3972 if (isa<ConstantSDNode>(Size))
3973 return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
3974
3976}
3977
3979 if (Op.getValueType() != MVT::i32)
3980 return Op; // Defer to cannot select error.
3981
3983 SDLoc SL(Op);
3984
3985 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
3986
3987 // Convert from wave uniform to swizzled vector address. This should protect
3988 // from any edge cases where the stacksave result isn't directly used with
3989 // stackrestore.
3990 SDValue VectorAddress =
3991 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
3992 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
3993}
3994
3996 SelectionDAG &DAG) const {
3997 SDLoc SL(Op);
3998 assert(Op.getValueType() == MVT::i32);
3999
4000 uint32_t BothRoundHwReg =
4002 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4003
4004 SDValue IntrinID =
4005 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4006 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4007 Op.getOperand(0), IntrinID, GetRoundBothImm);
4008
4009 // There are two rounding modes, one for f32 and one for f64/f16. We only
4010 // report in the standard value range if both are the same.
4011 //
4012 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4013 // ties away from zero is not supported, and the other values are rotated by
4014 // 1.
4015 //
4016 // If the two rounding modes are not the same, report a target defined value.
4017
4018 // Mode register rounding mode fields:
4019 //
4020 // [1:0] Single-precision round mode.
4021 // [3:2] Double/Half-precision round mode.
4022 //
4023 // 0 = nearest even; 1 = +infinity; 2 = -infinity; 3 = toward zero.
4024 //
4025 //              Hardware   Spec
4026 // Toward-0     3          0
4027 // Nearest Even 0          1
4028 // +Inf         1          2
4029 // -Inf         2          3
4030 // NearestAway0 N/A        4
4031 //
4032 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4033 // table we can index by the raw hardware mode.
4034 //
4035 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4036
4037 SDValue BitTable =
4039
4040 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4041 SDValue RoundModeTimesNumBits =
4042 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4043
4044 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4045 // knew only one mode was demanded.
4046 SDValue TableValue =
4047 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4048 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4049
4050 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4051 SDValue TableEntry =
4052 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4053
4054 // There's a gap between the 4-bit encoded table values and the actual enum
4055 // values, so offset the result if it's an extended value.
4056 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4057 SDValue IsStandardValue =
4058 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4059 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4060 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4061 TableEntry, EnumOffset);
4062
4063 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4064}
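To make the table lookup described in the comments above more concrete, here is a host-side sketch. decodeFltRounds is a hypothetical name and FakeTable is a placeholder constant, not the real AMDGPU::FltRoundConversionTable; only the indexing scheme (4 bits per 4-bit MODE.fp_round value, plus the +4 offset for extended entries) follows the code.

#include <cstdint>

// Sketch: map the 4-bit MODE.fp_round value (f32 mode in [1:0], f64/f16
// mode in [3:2]) to a FLT_ROUNDS-style result via a 64-bit bit table.
static uint32_t decodeFltRounds(uint32_t HwFpRound /* 0..15 */) {
  const uint64_t FakeTable = 0x0123456789abcdefULL; // placeholder only
  uint32_t Entry = static_cast<uint32_t>(FakeTable >> (HwFpRound * 4)) & 0xf;
  // Entries 0..3 are the standard FLT_ROUNDS values; larger entries are
  // target-defined extended values, offset by 4 to skip the gap.
  return Entry < 4 ? Entry : Entry + 4;
}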
4065
4067 SelectionDAG &DAG) const {
4068 SDLoc SL(Op);
4069
4070 SDValue NewMode = Op.getOperand(1);
4071 assert(NewMode.getValueType() == MVT::i32);
4072
4073 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4074 // hardware MODE.fp_round values.
4075 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4076 uint32_t ClampedVal = std::min(
4077 static_cast<uint32_t>(ConstMode->getZExtValue()),
4079 NewMode = DAG.getConstant(
4080 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4081 } else {
4082 // If we know the input can only be one of the supported standard modes in
4083 // the range 0-3, we can use a simplified mapping to hardware values.
4084 KnownBits KB = DAG.computeKnownBits(NewMode);
4085 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4086 // The supported standard values are 0-3. The extended values start at 8. We
4087 // need to offset by 4 if the value is in the extended range.
4088
4089 if (UseReducedTable) {
4090 // Truncate to the low 32-bits.
4091 SDValue BitTable = DAG.getConstant(
4092 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4093
4094 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4095 SDValue RoundModeTimesNumBits =
4096 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4097
4098 NewMode =
4099 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4100
4101 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4102 // the table extracted bits into inline immediates.
4103 } else {
4104 // table_index = umin(value, value - 4)
4105 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4106 SDValue BitTable =
4108
4109 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4110 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4111 SDValue IndexVal =
4112 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4113
4114 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4115 SDValue RoundModeTimesNumBits =
4116 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4117
4118 SDValue TableValue =
4119 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4120 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4121
4122 // No need to mask out the high bits since the setreg will ignore them
4123 // anyway.
4124 NewMode = TruncTable;
4125 }
4126
4127 // Insert a readfirstlane in case the value is a VGPR. We could do this
4128 // earlier and keep more operations scalar, but that interferes with
4129 // combining the source.
4130 SDValue ReadFirstLaneID =
4131 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4132 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4133 ReadFirstLaneID, NewMode);
4134 }
4135
4136 // N.B. The setreg will be later folded into s_round_mode on supported
4137 // targets.
4138 SDValue IntrinID =
4139 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4140 uint32_t BothRoundHwReg =
4142 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4143
4144 SDValue SetReg =
4145 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4146 IntrinID, RoundBothImm, NewMode);
4147
4148 return SetReg;
4149}
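The non-constant path above relies on table_index = umin(value, value - 4), so the standard FLT_ROUNDS values 0-3 select the first four table entries while the extended values starting at 8 wrap around into entries 4 and up. A quick sketch of that index computation; the helper names are hypothetical, the table value is a placeholder, and a valid FLT_ROUNDS-style input (0-3 or 8-11) is assumed.

#include <algorithm>
#include <cstdint>

// Sketch: compute the 4-bit table index for a FLT_ROUNDS-style input.
// Inputs 0..3 map to indices 0..3; inputs 8..11 map to 4..7 because the
// unsigned subtraction wraps for small values and umin picks the smaller.
static uint32_t fltRoundsToTableIndex(uint32_t Value) {
  return std::min(Value, Value - 4u); // table_index = umin(value, value - 4)
}

static uint32_t fltRoundsToHwMode(uint32_t Value) {
  const uint64_t FakeTable = 0xfedcba9876543210ULL; // placeholder only
  uint32_t Index = fltRoundsToTableIndex(Value);
  return static_cast<uint32_t>(FakeTable >> (Index * 4)) & 0xf;
}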
4150
4152 if (Op->isDivergent())
4153 return SDValue();
4154
4155 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4160 break;
4161 default:
4162 return SDValue();
4163 }
4164
4165 return Op;
4166}
4167
4168// Work around DAG legality rules only based on the result type.
4170 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4171 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4172 EVT SrcVT = Src.getValueType();
4173
4174 if (SrcVT.getScalarType() != MVT::bf16)
4175 return Op;
4176
4177 SDLoc SL(Op);
4178 SDValue BitCast =
4179 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4180
4181 EVT DstVT = Op.getValueType();
4182 if (IsStrict)
4183 llvm_unreachable("Need STRICT_BF16_TO_FP");
4184
4185 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4186}
4187
4189 SDLoc SL(Op);
4190 if (Op.getValueType() != MVT::i64)
4191 return Op;
4192
4193 uint32_t ModeHwReg =
4195 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4196 uint32_t TrapHwReg =
4198 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4199
4200 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4201 SDValue IntrinID =
4202 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4203 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4204 Op.getOperand(0), IntrinID, ModeHwRegImm);
4205 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4206 Op.getOperand(0), IntrinID, TrapHwRegImm);
4207 SDValue TokenReg =
4208 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4209 GetTrapReg.getValue(1));
4210
4211 SDValue CvtPtr =
4212 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4213 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4214
4215 return DAG.getMergeValues({Result, TokenReg}, SL);
4216}
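The i64 FP environment value is assembled by building a v2i32 from the two 32-bit hardware-register reads and bitcasting it. On this little-endian target that should place the MODE read (element 0) in the low half and the trap-control read in the high half; the sketch below encodes that assumption, and packFPEnv is a hypothetical name.

#include <cstdint>

// Sketch: pack the two s_getreg results into the i64 FP environment value,
// assuming element 0 of the v2i32 becomes the low 32 bits after the bitcast.
static uint64_t packFPEnv(uint32_t ModeReg, uint32_t TrapReg) {
  return (static_cast<uint64_t>(TrapReg) << 32) | ModeReg;
}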
4217
4219 SDLoc SL(Op);
4220 if (Op.getOperand(1).getValueType() != MVT::i64)
4221 return Op;
4222
4223 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4224 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4225 DAG.getConstant(0, SL, MVT::i32));
4226 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4227 DAG.getConstant(1, SL, MVT::i32));
4228
4229 SDValue ReadFirstLaneID =
4230 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4231 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4232 ReadFirstLaneID, NewModeReg);
4233 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4234 ReadFirstLaneID, NewTrapReg);
4235
4236 unsigned ModeHwReg =
4238 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4239 unsigned TrapHwReg =
4241 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4242
4243 SDValue IntrinID =
4244 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4245 SDValue SetModeReg =
4246 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4247 IntrinID, ModeHwRegImm, NewModeReg);
4248 SDValue SetTrapReg =
4249 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4250 IntrinID, TrapHwRegImm, NewTrapReg);
4251 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4252}
4253
4255 const MachineFunction &MF) const {
4257 .Case("m0", AMDGPU::M0)
4258 .Case("exec", AMDGPU::EXEC)
4259 .Case("exec_lo", AMDGPU::EXEC_LO)
4260 .Case("exec_hi", AMDGPU::EXEC_HI)
4261 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4262 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4263 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4264 .Default(Register());
4265
4266 if (Reg == AMDGPU::NoRegister) {
4267 report_fatal_error(Twine("invalid register name \""
4268 + StringRef(RegName) + "\"."));
4269
4270 }
4271
4272 if (!Subtarget->hasFlatScrRegister() &&
4273 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
4274 report_fatal_error(Twine("invalid register \""
4275 + StringRef(RegName) + "\" for subtarget."));
4276 }
4277
4278 switch (Reg) {
4279 case AMDGPU::M0:
4280 case AMDGPU::EXEC_LO:
4281 case AMDGPU::EXEC_HI:
4282 case AMDGPU::FLAT_SCR_LO:
4283 case AMDGPU::FLAT_SCR_HI:
4284 if (VT.getSizeInBits() == 32)
4285 return Reg;
4286 break;
4287 case AMDGPU::EXEC:
4288 case AMDGPU::FLAT_SCR:
4289 if (VT.getSizeInBits() == 64)
4290 return Reg;
4291 break;
4292 default:
4293 llvm_unreachable("missing register type checking");
4294 }
4295
4296 report_fatal_error(Twine("invalid type for register \""
4297 + StringRef(RegName) + "\"."));
4298}
4299
4300// If kill is not the last instruction, split the block so kill is always a
4301// proper terminator.
4304 MachineBasicBlock *BB) const {
4305 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
4307 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
4308 return SplitBB;
4309}
4310
4311 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
4312// \p MI will be the only instruction in the loop body block. Otherwise, it will
4313// be the first instruction in the remainder block.
4314//
4315/// \returns { LoopBody, Remainder }
4316static std::pair<MachineBasicBlock *, MachineBasicBlock *>
4320
4321 // To insert the loop we need to split the block. Move everything after this
4322 // point to a new block, and insert a new empty block between the two.
4324 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
4326 ++MBBI;
4327
4328 MF->insert(MBBI, LoopBB);
4329 MF->insert(MBBI, RemainderBB);
4330
4331 LoopBB->addSuccessor(LoopBB);
4332 LoopBB->addSuccessor(RemainderBB);
4333
4334 // Move the rest of the block into a new block.
4335 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
4336
4337 if (InstInLoop) {
4338 auto Next = std::next(I);
4339
4340 // Move instruction to loop body.
4341 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
4342
4343 // Move the rest of the block.
4344 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
4345 } else {
4346 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
4347 }
4348
4349 MBB.addSuccessor(LoopBB);
4350
4351 return std::pair(LoopBB, RemainderBB);
4352}
4353
4354/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
4356 MachineBasicBlock *MBB = MI.getParent();
4358 auto I = MI.getIterator();
4359 auto E = std::next(I);
4360
4361 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
4362 .addImm(0);
4363
4364 MIBundleBuilder Bundler(*MBB, I, E);
4365 finalizeBundle(*MBB, Bundler.begin());
4366}
4367
4370 MachineBasicBlock *BB) const {
4371 const DebugLoc &DL = MI.getDebugLoc();
4372
4374
4375 MachineBasicBlock *LoopBB;
4376 MachineBasicBlock *RemainderBB;
4378
4379 // Apparently kill flags are only valid if the def is in the same block?
4380 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
4381 Src->setIsKill(false);
4382
4383 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
4384
4385 MachineBasicBlock::iterator I = LoopBB->end();
4386
4387 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
4389
4390 // Clear TRAP_STS.MEM_VIOL
4391 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
4392 .addImm(0)
4393 .addImm(EncodedReg);
4394
4396
4397 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4398
4399 // Load and check TRAP_STS.MEM_VIOL
4400 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
4401 .addImm(EncodedReg);
4402
4403 // FIXME: Do we need to use an isel pseudo that may clobber scc?
4404 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
4405 .addReg(Reg, RegState::Kill)
4406 .addImm(0);
4407 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4408 .addMBB(LoopBB);
4409
4410 return RemainderBB;
4411}
4412
4413// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
4414// wavefront. If the value is uniform and just happens to be in a VGPR, this
4415// will only do one iteration. In the worst case, this will loop 64 times.
4416//
4417// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
4420 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
4421 const DebugLoc &DL, const MachineOperand &Idx,
4422 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
4423 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
4424 Register &SGPRIdxReg) {
4425
4426 MachineFunction *MF = OrigBB.getParent();
4427 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4428 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4430
4431 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4432 Register PhiExec = MRI.createVirtualRegister(BoolRC);
4433 Register NewExec = MRI.createVirtualRegister(BoolRC);
4434 Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4435 Register CondReg = MRI.createVirtualRegister(BoolRC);
4436
4437 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
4438 .addReg(InitReg)
4439 .addMBB(&OrigBB)
4440 .addReg(ResultReg)
4441 .addMBB(&LoopBB);
4442
4443 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
4444 .addReg(InitSaveExecReg)
4445 .addMBB(&OrigBB)
4446 .addReg(NewExec)
4447 .addMBB(&LoopBB);
4448
4449 // Read the next variant <- also loop target.
4450 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
4451 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
4452
4453 // Compare the just read M0 value to all possible Idx values.
4454 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
4455 .addReg(CurrentIdxReg)
4456 .addReg(Idx.getReg(), 0, Idx.getSubReg());
4457
4458 // Update EXEC, save the original EXEC value to VCC.
4459 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
4460 : AMDGPU::S_AND_SAVEEXEC_B64),
4461 NewExec)
4462 .addReg(CondReg, RegState::Kill);
4463
4464 MRI.setSimpleHint(NewExec, CondReg);
4465
4466 if (UseGPRIdxMode) {
4467 if (Offset == 0) {
4468 SGPRIdxReg = CurrentIdxReg;
4469 } else {
4470 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
4471 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
4472 .addReg(CurrentIdxReg, RegState::Kill)
4473 .addImm(Offset);
4474 }
4475 } else {
4476 // Move index from VCC into M0
4477 if (Offset == 0) {
4478 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
4479 .addReg(CurrentIdxReg, RegState::Kill);
4480 } else {
4481 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4482 .addReg(CurrentIdxReg, RegState::Kill)
4483 .addImm(Offset);
4484 }
4485 }
4486
4487 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
4488 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4489 MachineInstr *InsertPt =
4490 BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
4491 : AMDGPU::S_XOR_B64_term), Exec)
4492 .addReg(Exec)
4493 .addReg(NewExec);
4494
4495 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
4496 // s_cbranch_scc0?
4497
4498 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
4499 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
4500 .addMBB(&LoopBB);
4501
4502 return InsertPt->getIterator();
4503}
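To make the waterfall structure above easier to follow, here is a host-side simulation of what the loop does across the lanes of a wave: read the first still-active lane's index with readfirstlane, restrict EXEC to the lanes that share that index, run the indexed operation once for them, then clear those lanes and repeat. This is only an illustrative model (the waterfall function and its callback are hypothetical), not code from this backend, and it assumes at most 64 lanes.

#include <bit>
#include <cstdint>
#include <vector>

// Sketch: simulate the per-lane waterfall loop. Idx holds each lane's
// (possibly divergent) index; Visit is called once per unique index with
// the mask of lanes that requested it (the temporary EXEC for the body).
template <typename Fn>
void waterfall(const std::vector<uint32_t> &Idx, Fn Visit) {
  uint64_t Exec = Idx.size() >= 64 ? ~0ull : (1ull << Idx.size()) - 1;
  while (Exec) {
    unsigned FirstLane = std::countr_zero(Exec); // v_readfirstlane_b32
    uint32_t CurIdx = Idx[FirstLane];
    uint64_t Match = 0;                          // v_cmp_eq_u32
    for (unsigned L = 0; L < Idx.size(); ++L)
      if (((Exec >> L) & 1) && Idx[L] == CurIdx)
        Match |= 1ull << L;
    Visit(CurIdx, Match); // loop body runs with EXEC restricted to Match
    Exec &= ~Match;       // s_and_saveexec + s_xor: retire covered lanes
  }
}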
4504
4505 // This has slightly sub-optimal register allocation when the source vector is
4506 // killed by the read. The register allocator does not understand that the kill
4507 // is per-workitem, so the vector is kept alive for the whole loop and we end up
4508 // not reusing a subregister from it, using one more VGPR than necessary. This
4509 // extra VGPR was saved when the expansion was done after register allocation.
4512 unsigned InitResultReg, unsigned PhiReg, int Offset,
4513 bool UseGPRIdxMode, Register &SGPRIdxReg) {
4515 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4516 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4518 const DebugLoc &DL = MI.getDebugLoc();
4520
4521 const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
4522 Register DstReg = MI.getOperand(0).getReg();
4523 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
4524 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
4525 unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4526 unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4527
4528 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
4529
4530 // Save the EXEC mask
4531 BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
4532 .addReg(Exec);
4533
4534 MachineBasicBlock *LoopBB;
4535 MachineBasicBlock *RemainderBB;
4536 std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
4537
4538 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4539
4540 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
4541 InitResultReg, DstReg, PhiReg, TmpExec,
4542 Offset, UseGPRIdxMode, SGPRIdxReg);
4543
4544 MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
4546 ++MBBI;
4547 MF->insert(MBBI, LandingPad);
4548 LoopBB->removeSuccessor(RemainderBB);
4549 LandingPad->addSuccessor(RemainderBB);
4550 LoopBB->addSuccessor(LandingPad);
4551 MachineBasicBlock::iterator First = LandingPad->begin();
4552 BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
4553 .addReg(SaveExec);
4554
4555 return InsPt;
4556}
4557
4558// Returns subreg index, offset
4559static std::pair<unsigned, int>
4561 const TargetRegisterClass *SuperRC,
4562 unsigned VecReg,
4563 int Offset) {
4564 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
4565
4566 // Skip out of bounds offsets, or else we would end up using an undefined
4567 // register.
4568 if (Offset >= NumElts || Offset < 0)
4569 return std::pair(AMDGPU::sub0, Offset);
4570
4571 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
4572}
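The helper above either folds an in-bounds constant offset into the subregister channel or, for out-of-bounds offsets, keeps the raw offset to be applied dynamically through M0. A small sketch of the same decision, using a plain integer channel number in place of the subregister index (the helper name is hypothetical):

#include <utility>

// Sketch of the decision for a register class of NumElts 32-bit elements:
// in-bounds constant offsets become a subregister channel with offset 0;
// anything else keeps the raw offset and channel 0 (i.e. AMDGPU::sub0).
static std::pair<unsigned, int> indirectRegAndOffset(int NumElts, int Offset) {
  if (Offset >= NumElts || Offset < 0)
    return {0u, Offset};                        // {sub0, Offset}
  return {static_cast<unsigned>(Offset), 0};    // {subN, 0}, offset folded
}
// e.g. for a 4-element vector: (4, 2) -> {sub2, 0}, while (4, 7) -> {sub0, 7}
// and the remaining offset is added into M0 at runtime.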
4573
4576 int Offset) {
4577 MachineBasicBlock *MBB = MI.getParent();
4578 const DebugLoc &DL = MI.getDebugLoc();
4580
4581 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4582
4583 assert(Idx->getReg() != AMDGPU::NoRegister);
4584
4585 if (Offset == 0) {
4586 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
4587 } else {
4588 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
4589 .add(*Idx)
4590 .addImm(Offset);
4591 }
4592}
4593
4596 int Offset) {
4597 MachineBasicBlock *MBB = MI.getParent();
4598 const DebugLoc &DL = MI.getDebugLoc();
4600
4601 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4602
4603 if (Offset == 0)
4604 return Idx->getReg();
4605
4606 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
4607 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
4608 .add(*Idx)
4609 .addImm(Offset);
4610 return Tmp;
4611}
4612
4615 const GCNSubtarget &ST) {
4616 const SIInstrInfo *TII = ST.getInstrInfo();
4617 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4620
4621 Register Dst = MI.getOperand(0).getReg();
4622 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4623 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
4624 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4625
4626 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
4627 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4628
4629 unsigned SubReg;
4630 std::tie(SubReg, Offset)
4631 = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
4632
4633 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4634
4635 // Check for a SGPR index.
4636 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4638 const DebugLoc &DL = MI.getDebugLoc();
4639
4640 if (UseGPRIdxMode) {
4641 // TODO: Look at the uses to avoid the copy. This may require rescheduling
4642 // to avoid interfering with other uses, so probably requires a new
4643 // optimization pass.
4645
4646 const MCInstrDesc &GPRIDXDesc =
4647 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4648 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4649 .addReg(SrcReg)
4650 .addReg(Idx)
4651 .addImm(SubReg);
4652 } else {
4654
4655 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4656 .addReg(SrcReg, 0, SubReg)
4657 .addReg(SrcReg, RegState::Implicit);
4658 }
4659
4660 MI.eraseFromParent();
4661
4662 return &MBB;
4663 }
4664
4665 // Control flow needs to be inserted if indexing with a VGPR.
4666 const DebugLoc &DL = MI.getDebugLoc();
4668
4669 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4670 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
4671
4672 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
4673
4674 Register SGPRIdxReg;
4675 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
4676 UseGPRIdxMode, SGPRIdxReg);
4677
4678 MachineBasicBlock *LoopBB = InsPt->getParent();
4679
4680 if (UseGPRIdxMode) {
4681 const MCInstrDesc &GPRIDXDesc =
4682 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
4683
4684 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4685 .addReg(SrcReg)
4686 .addReg(SGPRIdxReg)
4687 .addImm(SubReg);
4688 } else {
4689 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
4690 .addReg(SrcReg, 0, SubReg)
4691 .addReg(SrcReg, RegState::Implicit);
4692 }
4693
4694 MI.eraseFromParent();
4695
4696 return LoopBB;
4697}
4698
4701 const GCNSubtarget &ST) {
4702 const SIInstrInfo *TII = ST.getInstrInfo();
4703 const SIRegisterInfo &TRI = TII->getRegisterInfo();
4706
4707 Register Dst = MI.getOperand(0).getReg();
4708 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
4709 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
4710 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
4711 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
4712 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
4713 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
4714
4715 // This can be an immediate, but will be folded later.
4716 assert(Val->getReg());
4717
4718 unsigned SubReg;
4719 std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
4720 SrcVec->getReg(),
4721 Offset);
4722 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
4723
4724 if (Idx->getReg() == AMDGPU::NoRegister) {
4726 const DebugLoc &DL = MI.getDebugLoc();
4727
4728 assert(Offset == 0);
4729
4730 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
4731 .add(*SrcVec)
4732 .add(*Val)
4733 .addImm(SubReg);
4734
4735 MI.eraseFromParent();
4736 return &MBB;
4737 }
4738
4739 // Check for a SGPR index.
4740 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
4742 const DebugLoc &DL = MI.getDebugLoc();
4743
4744 if (UseGPRIdxMode) {
4746
4747 const MCInstrDesc &GPRIDXDesc =
4748 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4749 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
4750 .addReg(SrcVec->getReg())
4751 .add(*Val)
4752 .addReg(Idx)
4753 .addImm(SubReg);
4754 } else {
4756
4757 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4758 TRI.getRegSizeInBits(*VecRC), 32, false);
4759 BuildMI(MBB, I, DL, MovRelDesc, Dst)
4760 .addReg(SrcVec->getReg())
4761 .add(*Val)
4762 .addImm(SubReg);
4763 }
4764 MI.eraseFromParent();
4765 return &MBB;
4766 }
4767
4768 // Control flow needs to be inserted if indexing with a VGPR.
4769 if (Val->isReg())
4770 MRI.clearKillFlags(Val->getReg());
4771
4772 const DebugLoc &DL = MI.getDebugLoc();
4773
4774 Register PhiReg = MRI.createVirtualRegister(VecRC);
4775
4776 Register SGPRIdxReg;
4777 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
4778 UseGPRIdxMode, SGPRIdxReg);
4779 MachineBasicBlock *LoopBB = InsPt->getParent();
4780
4781 if (UseGPRIdxMode) {
4782 const MCInstrDesc &GPRIDXDesc =
4783 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
4784
4785 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
4786 .addReg(PhiReg)
4787 .add(*Val)
4788 .addReg(SGPRIdxReg)
4789 .addImm(AMDGPU::sub0);
4790 } else {
4791 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
4792 TRI.getRegSizeInBits(*VecRC), 32, false);
4793 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
4794 .addReg(PhiReg)
4795 .add(*Val)
4796 .addImm(AMDGPU::sub0);
4797 }
4798
4799 MI.eraseFromParent();
4800 return LoopBB;
4801}
4802
4805 const GCNSubtarget &ST,
4806 unsigned Opc) {
4808 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4809 const DebugLoc &DL = MI.getDebugLoc();
4810 const SIInstrInfo *TII = ST.getInstrInfo();
4811
4812 // Reduction operations depend on whether the input operand is SGPR or VGPR.
4813 Register SrcReg = MI.getOperand(1).getReg();
4814 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
4815 Register DstReg = MI.getOperand(0).getReg();
4816 MachineBasicBlock *RetBB = nullptr;
4817 if (isSGPR) {
4818 // These operations with a uniform value (i.e. an SGPR) are idempotent.
4819 // The reduced value will be the same as the given SGPR.
4820 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
4821 RetBB = &BB;
4822 } else {
4823 // TODO: Implement the DPP strategy and switch based on the immediate strategy
4824 // operand. For now, for all the cases (default, Iterative and DPP) we use the
4825 // iterative approach by default.
4826
4827 // To reduce the VGPR using the iterative approach, we need to iterate
4828 // over all the active lanes. The lowering consists of a ComputeLoop,
4829 // which iterates over only the active lanes. We use a copy of the EXEC
4830 // register as the induction variable, and each iteration clears the current
4831 // lane's bit with bitset0 so that we get the next active lane to process.
4833 Register SrcReg = MI.getOperand(1).getReg();
4834
4835 // Create control flow for the loop:
4836 // split MI's machine basic block to form the loop.
4837 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
4838
4839 // Create virtual registers required for lowering.
4840 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
4841 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
4842 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
4843 Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
4844
4845 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
4846 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4847 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
4848
4849 Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
4850 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
4851
4852 bool IsWave32 = ST.isWave32();
4853 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
4854 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
4855
4856 // Create initial values of the induction variable (from EXEC) and the
4857 // accumulator, and insert a branch to the newly created ComputeLoop block.
4858 uint32_t InitalValue =
4859 (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
4860 auto TmpSReg =
4861 BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
4862 BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
4863 .addImm(InitalValue);
4864 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
4865
4866 // Start constructing ComputeLoop
4867 I = ComputeLoop->end();
4868 auto Accumulator =
4869 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
4870 .addReg(InitalValReg)
4871 .addMBB(&BB);
4872 auto ActiveBits =
4873 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
4874 .addReg(TmpSReg->getOperand(0).getReg())
4875 .addMBB(&BB);
4876
4877 // Perform the computations
4878 unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
4879 auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
4880 .addReg(ActiveBits->getOperand(0).getReg());
4881 auto LaneValue = BuildMI(*ComputeLoop, I, DL,
4882 TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
4883 .addReg(SrcReg)
4884 .addReg(FF1->getOperand(0).getReg());
4885 auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
4886 .addReg(Accumulator->getOperand(0).getReg())
4887 .addReg(LaneValue->getOperand(0).getReg());
4888
4889 // Manipulate the iterator to get the next active lane
4890 unsigned BITSETOpc =
4891 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
4892 auto NewActiveBits =
4893 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
4894 .addReg(FF1->getOperand(0).getReg())
4895 .addReg(ActiveBits->getOperand(0).getReg());
4896
4897 // Add phi nodes
4898 Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
4899 .addMBB(ComputeLoop);
4900 ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
4901 .addMBB(ComputeLoop);
4902
4903 // Create the loop branch
4904 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
4905 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
4906 .addReg(NewActiveBits->getOperand(0).getReg())
4907 .addImm(0);
4908 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
4909 .addMBB(ComputeLoop);
4910
4911 RetBB = ComputeEnd;
4912 }
4913 MI.eraseFromParent();
4914 return RetBB;
4915}
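A host-side model of the iterative reduction emitted above may help: a copy of EXEC acts as the loop counter, s_ff1 finds the next active lane, v_readlane fetches that lane's value, the scalar min/max accumulates it, and s_bitset0 retires the lane. This is only a simulation of the idea (waveReduceUMin is a hypothetical name), not backend code, and it assumes Src has an entry for every active lane.

#include <algorithm>
#include <bit>
#include <cstdint>
#include <vector>

// Sketch: scalar model of WAVE_REDUCE_UMIN over the active lanes.
// Src holds each lane's 32-bit value; ExecMask marks the active lanes.
static uint32_t waveReduceUMin(const std::vector<uint32_t> &Src,
                               uint64_t ExecMask) {
  uint32_t Accum = UINT32_MAX;                    // initial value for S_MIN_U32
  uint64_t ActiveBits = ExecMask;                 // copy of EXEC
  while (ActiveBits) {
    unsigned Lane = std::countr_zero(ActiveBits); // s_ff1_i32
    Accum = std::min(Accum, Src[Lane]);           // v_readlane + s_min_u32
    ActiveBits &= ActiveBits - 1;                 // s_bitset0 on that lane
  }
  return Accum;
}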
4916
4918 MachineInstr &MI, MachineBasicBlock *BB) const {
4919
4921 MachineFunction *MF = BB->getParent();
4923
4924 switch (MI.getOpcode()) {
4925 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
4926 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
4927 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
4928 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
4929 case AMDGPU::S_UADDO_PSEUDO:
4930 case AMDGPU::S_USUBO_PSEUDO: {
4931 const DebugLoc &DL = MI.getDebugLoc();
4932 MachineOperand &Dest0 = MI.getOperand(0);
4933 MachineOperand &Dest1 = MI.getOperand(1);
4934 MachineOperand &Src0 = MI.getOperand(2);
4935 MachineOperand &Src1 = MI.getOperand(3);
4936
4937 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
4938 ? AMDGPU::S_ADD_I32
4939 : AMDGPU::S_SUB_I32;
4940 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
4941
4942 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
4943 .addImm(1)
4944 .addImm(0);
4945
4946 MI.eraseFromParent();
4947 return BB;
4948 }
4949 case AMDGPU::S_ADD_U64_PSEUDO:
4950 case AMDGPU::S_SUB_U64_PSEUDO: {
4951 // For targets older than GFX12, we emit a sequence of 32-bit operations.
4952 // For GFX12, we emit s_add_u64 and s_sub_u64.
4953 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
4955 const DebugLoc &DL = MI.getDebugLoc();
4956 MachineOperand &Dest = MI.getOperand(0);
4957 MachineOperand &Src0 = MI.getOperand(1);
4958 MachineOperand &Src1 = MI.getOperand(2);
4959 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
4960 if (Subtarget->hasScalarAddSub64()) {
4961 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
4962 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
4963 .add(Src0)
4964 .add(Src1);
4965 } else {
4966 const SIRegisterInfo *TRI = ST.getRegisterInfo();
4967 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
4968
4969 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4970 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
4971
4972 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
4973 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4974 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
4975 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4976
4977 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
4978 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
4979 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
4980 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
4981
4982 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
4983 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
4984 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
4985 .add(Src0Sub0)
4986 .add(Src1Sub0);
4987 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
4988 .add(Src0Sub1)
4989 .add(Src1Sub1);
4990 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
4991 .addReg(DestSub0)
4992 .addImm(AMDGPU::sub0)
4993 .addReg(DestSub1)
4994 .addImm(AMDGPU::sub1);
4995 }
4996 MI.eraseFromParent();
4997 return BB;
4998 }
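For pre-GFX12 targets the 64-bit result above is built from two 32-bit halves with a carry, mirroring s_add_u32 followed by s_addc_u32. A scalar sketch of the same arithmetic, with a hypothetical helper name:

#include <cstdint>

// Sketch: 64-bit add assembled from 32-bit pieces, as the expansion above
// does with S_ADD_U32 (sets the carry) and S_ADDC_U32 (consumes it).
static uint64_t add64ViaHalves(uint64_t A, uint64_t B) {
  uint32_t ALo = static_cast<uint32_t>(A), AHi = static_cast<uint32_t>(A >> 32);
  uint32_t BLo = static_cast<uint32_t>(B), BHi = static_cast<uint32_t>(B >> 32);
  uint32_t Lo = ALo + BLo;          // s_add_u32
  uint32_t Carry = Lo < ALo;        // SCC
  uint32_t Hi = AHi + BHi + Carry;  // s_addc_u32
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}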
4999 case AMDGPU::V_ADD_U64_PSEUDO:
5000 case AMDGPU::V_SUB_U64_PSEUDO: {
5002 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5003 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5004 const DebugLoc &DL = MI.getDebugLoc();
5005
5006 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
5007
5008 MachineOperand &Dest = MI.getOperand(0);
5009 MachineOperand &Src0 = MI.getOperand(1);
5010 MachineOperand &Src1 = MI.getOperand(2);
5011
5012 if (IsAdd && ST.hasLshlAddB64()) {
5013 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
5014 Dest.getReg())
5015 .add(Src0)
5016 .addImm(0)
5017 .add(Src1);
5018 TII->legalizeOperands(*Add);
5019 MI.eraseFromParent();
5020 return BB;
5021 }
5022
5023 const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5024
5025 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5026 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5027
5028 Register CarryReg = MRI.createVirtualRegister(CarryRC);
5029 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
5030
5031 const TargetRegisterClass *Src0RC = Src0.isReg()
5032 ? MRI.getRegClass(Src0.getReg())
5033 : &AMDGPU::VReg_64RegClass;
5034 const TargetRegisterClass *Src1RC = Src1.isReg()
5035 ? MRI.getRegClass(Src1.getReg())
5036 : &AMDGPU::VReg_64RegClass;
5037
5038 const TargetRegisterClass *Src0SubRC =
5039 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5040 const TargetRegisterClass *Src1SubRC =
5041 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5042
5043 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
5044 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5045 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
5046 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5047
5048 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
5049 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5050 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
5051 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5052
5053 unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
5054 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
5055 .addReg(CarryReg, RegState::Define)
5056 .add(SrcReg0Sub0)
5057 .add(SrcReg1Sub0)
5058 .addImm(0); // clamp bit
5059
5060 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
5061 MachineInstr *HiHalf =
5062 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
5063 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
5064 .add(SrcReg0Sub1)
5065 .add(SrcReg1Sub1)
5066 .addReg(CarryReg, RegState::Kill)
5067 .addImm(0); // clamp bit
5068
5069 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5070 .addReg(DestSub0)
5071 .addImm(AMDGPU::sub0)
5072 .addReg(DestSub1)
5073 .addImm(AMDGPU::sub1);
5074 TII->legalizeOperands(*LoHalf);
5075 TII->legalizeOperands(*HiHalf);
5076 MI.eraseFromParent();
5077 return BB;
5078 }
5079 case AMDGPU::S_ADD_CO_PSEUDO:
5080 case AMDGPU::S_SUB_CO_PSEUDO: {
5081 // This pseudo can only be selected
5082 // from a uniform add/subcarry node. All the VGPR operands are
5083 // therefore assumed to be splat vectors.
5085 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5086 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5088 const DebugLoc &DL = MI.getDebugLoc();
5089 MachineOperand &Dest = MI.getOperand(0);
5090 MachineOperand &CarryDest = MI.getOperand(1);
5091 MachineOperand &Src0 = MI.getOperand(2);
5092 MachineOperand &Src1 = MI.getOperand(3);
5093 MachineOperand &Src2 = MI.getOperand(4);
5094 unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
5095 ? AMDGPU::S_ADDC_U32
5096 : AMDGPU::S_SUBB_U32;
5097 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
5098 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5099 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
5100 .addReg(Src0.getReg());
5101 Src0.setReg(RegOp0);
5102 }
5103 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
5104 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5105 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
5106 .addReg(Src1.getReg());
5107 Src1.setReg(RegOp1);
5108 }
5109 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5110 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
5111 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
5112 .addReg(Src2.getReg());
5113 Src2.setReg(RegOp2);
5114 }
5115
5116 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
5117 unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
5118 assert(WaveSize == 64 || WaveSize == 32);
5119
5120 if (WaveSize == 64) {
5121 if (ST.hasScalarCompareEq64()) {
5122 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
5123 .addReg(Src2.getReg())
5124 .addImm(0);
5125 } else {
5126 const TargetRegisterClass *SubRC =
5127 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
5128 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
5129 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
5130 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
5131 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
5132 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5133
5134 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
5135 .add(Src2Sub0)
5136 .add(Src2Sub1);
5137
5138 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5139 .addReg(Src2_32, RegState::Kill)
5140 .addImm(0);
5141 }
5142 } else {
5143 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5144 .addReg(Src2.getReg())
5145 .addImm(0);
5146 }
5147
5148 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
5149
5150 unsigned SelOpc =
5151 (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
5152
5153 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
5154 .addImm(-1)
5155 .addImm(0);
5156
5157 MI.eraseFromParent();
5158 return BB;
5159 }
5160 case AMDGPU::SI_INIT_M0: {
5161 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
5162 TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
5163 .add(MI.getOperand(0));
5164 MI.eraseFromParent();
5165 return BB;
5166 }
5167 case AMDGPU::GET_GROUPSTATICSIZE: {
5168 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
5169 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
5170 DebugLoc DL = MI.getDebugLoc();
5171 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
5172 .add(MI.getOperand(0))
5173 .addImm(MFI->getLDSSize());
5174 MI.eraseFromParent();
5175 return BB;
5176 }
5177 case AMDGPU::GET_SHADERCYCLESHILO: {
5180 const DebugLoc &DL = MI.getDebugLoc();
5181 // The algorithm is:
5182 //
5183 // hi1 = getreg(SHADER_CYCLES_HI)
5184 // lo1 = getreg(SHADER_CYCLES_LO)
5185 // hi2 = getreg(SHADER_CYCLES_HI)
5186 //
5187 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
5188 // Otherwise there was overflow and the result is hi2:0. In both cases the
5189 // result should represent the actual time at some point during the sequence
5190 // of three getregs.
5191 using namespace AMDGPU::Hwreg;
5192 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5193 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
5194 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5195 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5196 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
5197 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
5198 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5199 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
5200 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
5201 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
5202 .addReg(RegHi1)
5203 .addReg(RegHi2);
5204 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5205 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
5206 .addReg(RegLo1)
5207 .addImm(0);
5208 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
5209 .add(MI.getOperand(0))
5210 .addReg(RegLo)
5211 .addImm(AMDGPU::sub0)
5212 .addReg(RegHi2)
5213 .addImm(AMDGPU::sub1);
5214 MI.eraseFromParent();
5215 return BB;
5216 }
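The hi/lo/hi read sequence above can be modelled on the host as follows; ReadHi and ReadLo are hypothetical callbacks standing in for the s_getreg reads of SHADER_CYCLES_HI and SHADER_CYCLES, and readShaderCycles64 is not a function in this file.

#include <cstdint>
#include <functional>

// Sketch: read a 64-bit counter from two 32-bit halves without tearing.
static uint64_t readShaderCycles64(const std::function<uint32_t()> &ReadHi,
                                   const std::function<uint32_t()> &ReadLo) {
  uint32_t Hi1 = ReadHi();
  uint32_t Lo1 = ReadLo();
  uint32_t Hi2 = ReadHi();
  // If the high half did not change, Hi2:Lo1 is consistent; otherwise the
  // low half overflowed in between and Hi2:0 is still a valid timestamp.
  uint32_t Lo = (Hi1 == Hi2) ? Lo1 : 0;
  return (static_cast<uint64_t>(Hi2) << 32) | Lo;
}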
5217 case AMDGPU::SI_INDIRECT_SRC_V1:
5218 case AMDGPU::SI_INDIRECT_SRC_V2:
5219 case AMDGPU::SI_INDIRECT_SRC_V4:
5220 case AMDGPU::SI_INDIRECT_SRC_V8:
5221 case AMDGPU::SI_INDIRECT_SRC_V9:
5222 case AMDGPU::SI_INDIRECT_SRC_V10:
5223 case AMDGPU::SI_INDIRECT_SRC_V11:
5224 case AMDGPU::SI_INDIRECT_SRC_V12:
5225 case AMDGPU::SI_INDIRECT_SRC_V16:
5226 case AMDGPU::SI_INDIRECT_SRC_V32:
5227 return emitIndirectSrc(MI, *BB, *getSubtarget());
5228 case AMDGPU::SI_INDIRECT_DST_V1:
5229 case AMDGPU::SI_INDIRECT_DST_V2:
5230 case AMDGPU::SI_INDIRECT_DST_V4:
5231 case AMDGPU::SI_INDIRECT_DST_V8:
5232 case AMDGPU::SI_INDIRECT_DST_V9:
5233 case AMDGPU::SI_INDIRECT_DST_V10:
5234 case AMDGPU::SI_INDIRECT_DST_V11:
5235 case AMDGPU::SI_INDIRECT_DST_V12:
5236 case AMDGPU::SI_INDIRECT_DST_V16:
5237 case AMDGPU::SI_INDIRECT_DST_V32:
5238 return emitIndirectDst(MI, *BB, *getSubtarget());
5239 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
5240 case AMDGPU::SI_KILL_I1_PSEUDO:
5241 return splitKillBlock(MI, BB);
5242 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
5244 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5245 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5246
5247 Register Dst = MI.getOperand(0).getReg();
5248 const MachineOperand &Src0 = MI.getOperand(1);
5249 const MachineOperand &Src1 = MI.getOperand(2);
5250 const DebugLoc &DL = MI.getDebugLoc();
5251 Register SrcCond = MI.getOperand(3).getReg();
5252
5253 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5254 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5255 const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
5256 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5257
5258 const TargetRegisterClass *Src0RC = Src0.isReg()
5259 ? MRI.getRegClass(Src0.getReg())
5260 : &AMDGPU::VReg_64RegClass;
5261 const TargetRegisterClass *Src1RC = Src1.isReg()
5262 ? MRI.getRegClass(Src1.getReg())
5263 : &AMDGPU::VReg_64RegClass;
5264
5265 const TargetRegisterClass *Src0SubRC =
5266 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5267 const TargetRegisterClass *Src1SubRC =
5268 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5269
5270 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5271 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5272 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5273 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5274
5275 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5276 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5277 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5278 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5279
5280 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
5281 .addReg(SrcCond);
5282 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5283 .addImm(0)
5284 .add(Src0Sub0)
5285 .addImm(0)
5286 .add(Src1Sub0)
5287 .addReg(SrcCondCopy);
5288 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5289 .addImm(0)
5290 .add(Src0Sub1)
5291 .addImm(0)
5292 .add(Src1Sub1)
5293 .addReg(SrcCondCopy);
5294
5295 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5296 .addReg(DstLo)
5297 .addImm(AMDGPU::sub0)
5298 .addReg(DstHi)
5299 .addImm(AMDGPU::sub1);
5300 MI.eraseFromParent();
5301 return BB;
5302 }
5303 case AMDGPU::SI_BR_UNDEF: {
5305 const DebugLoc &DL = MI.getDebugLoc();
5306 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5307 .add(MI.getOperand(0));
5308 Br->getOperand(1).setIsUndef(); // read undef SCC
5309 MI.eraseFromParent();
5310 return BB;
5311 }
5312 case AMDGPU::ADJCALLSTACKUP:
5313 case AMDGPU::ADJCALLSTACKDOWN: {
5315 MachineInstrBuilder MIB(*MF, &MI);
5316 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
5317 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
5318 return BB;
5319 }
5320 case AMDGPU::SI_CALL_ISEL: {
5322 const DebugLoc &DL = MI.getDebugLoc();
5323
5324 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
5325
5327 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
5328
5329 for (const MachineOperand &MO : MI.operands())
5330 MIB.add(MO);
5331
5332 MIB.cloneMemRefs(MI);
5333 MI.eraseFromParent();
5334 return BB;
5335 }
5336 case AMDGPU::V_ADD_CO_U32_e32:
5337 case AMDGPU::V_SUB_CO_U32_e32:
5338 case AMDGPU::V_SUBREV_CO_U32_e32: {
5339 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
5340 const DebugLoc &DL = MI.getDebugLoc();
5341 unsigned Opc = MI.getOpcode();
5342
5343 bool NeedClampOperand = false;
5344 if (TII->pseudoToMCOpcode(Opc) == -1) {
5345 Opc = AMDGPU::getVOPe64(Opc);
5346 NeedClampOperand = true;
5347 }
5348
5349 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
5350 if (TII->isVOP3(*I)) {
5351 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5352 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5353 I.addReg(TRI->getVCC(), RegState::Define);
5354 }
5355 I.add(MI.getOperand(1))
5356 .add(MI.getOperand(2));
5357 if (NeedClampOperand)
5358 I.addImm(0); // clamp bit for e64 encoding
5359
5360 TII->legalizeOperands(*I);
5361
5362 MI.eraseFromParent();
5363 return BB;
5364 }
5365 case AMDGPU::V_ADDC_U32_e32:
5366 case AMDGPU::V_SUBB_U32_e32:
5367 case AMDGPU::V_SUBBREV_U32_e32:
5368 // These instructions have an implicit use of vcc which counts towards the
5369 // constant bus limit.
5370 TII->legalizeOperands(MI);
5371 return BB;
5372 case AMDGPU::DS_GWS_INIT:
5373 case AMDGPU::DS_GWS_SEMA_BR:
5374 case AMDGPU::DS_GWS_BARRIER:
5375 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);
5376 [[fallthrough]];
5377 case AMDGPU::DS_GWS_SEMA_V:
5378 case AMDGPU::DS_GWS_SEMA_P:
5379 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
5380 // An s_waitcnt 0 is required to be the instruction immediately following.
5381 if (getSubtarget()->hasGWSAutoReplay()) {
5382 bundleInstWithWaitcnt(MI);
5383 return BB;
5384 }
5385
5386 return emitGWSMemViolTestLoop(MI, BB);
5387 case AMDGPU::S_SETREG_B32: {
5388 // Try to optimize cases that only set the denormal mode or rounding mode.
5389 //
5390 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
5391 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
5392 // instead.
5393 //
5394 // FIXME: This could be predicated on the immediate, but tablegen doesn't
5395 // allow a no-side-effect instruction in the output of a side-effecting
5396 // pattern.
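// Illustrative examples (assuming the usual MODE register layout, with the
// rounding bits in [3:0] and the denormal bits in [7:4]):
//   s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0x3 -> s_round_mode 0x3
//   s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 4), 0xf -> s_denorm_mode 0xf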
5397 auto [ID, Offset, Width] =
5398 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
5399 if (ID != AMDGPU::Hwreg::ID_MODE)
5400 return BB;
5401
5402 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
5403 const unsigned SetMask = WidthMask << Offset;
5404
5405 if (getSubtarget()->hasDenormModeInst()) {
5406 unsigned SetDenormOp = 0;
5407 unsigned SetRoundOp = 0;
5408
5409 // The dedicated instructions can only set the whole denorm or round mode
5410 // at once, not a subset of bits in either.
5411 if (SetMask ==
5412 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
5413 // If this fully sets both the round and denorm mode, emit the two
5414 // dedicated instructions for these.
5415 SetRoundOp = AMDGPU::S_ROUND_MODE;
5416 SetDenormOp = AMDGPU::S_DENORM_MODE;
5417 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
5418 SetRoundOp = AMDGPU::S_ROUND_MODE;
5419 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
5420 SetDenormOp = AMDGPU::S_DENORM_MODE;
5421 }
5422
5423 if (SetRoundOp || SetDenormOp) {
5425 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
5426 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
5427 unsigned ImmVal = Def->getOperand(1).getImm();
5428 if (SetRoundOp) {
5429 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
5430 .addImm(ImmVal & 0xf);
5431
5432 // If we also have the denorm mode, get just the denorm mode bits.
5433 ImmVal >>= 4;
5434 }
5435
5436 if (SetDenormOp) {
5437 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
5438 .addImm(ImmVal & 0xf);
5439 }
5440
5441 MI.eraseFromParent();
5442 return BB;
5443 }
5444 }
5445 }
5446
5447 // If only FP bits are touched, use the no-side-effects pseudo.
5448 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
5449 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
5450 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
5451
5452 return BB;
5453 }
5454 case AMDGPU::S_INVERSE_BALLOT_U32:
5455 case AMDGPU::S_INVERSE_BALLOT_U64: {
5457 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5458 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5459 const DebugLoc &DL = MI.getDebugLoc();
5460 const Register DstReg = MI.getOperand(0).getReg();
5461 Register MaskReg = MI.getOperand(1).getReg();
5462
5463 const bool IsVALU = TRI->isVectorRegister(MRI, MaskReg);
5464
5465 if (IsVALU) {
5466 MaskReg = TII->readlaneVGPRToSGPR(MaskReg, MI, MRI);
5467 }
5468
5469 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::COPY), DstReg).addReg(MaskReg);
5470 MI.eraseFromParent();
5471 return BB;
5472 }
5473 case AMDGPU::ENDPGM_TRAP: {
5474 const DebugLoc &DL = MI.getDebugLoc();
5475 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
5476 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
5477 MI.addOperand(MachineOperand::CreateImm(0));
5478 return BB;
5479 }
5480
5481 // We need a block split to make the real endpgm a terminator. We also don't
5482 // want to break phis in successor blocks, so we can't just delete to the
5483 // end of the block.
5484
5485 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
5487 MF->push_back(TrapBB);
5488 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
5489 .addImm(0);
5490 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5491 .addMBB(TrapBB);
5492
5493 BB->addSuccessor(TrapBB);
5494 MI.eraseFromParent();
5495 return SplitBB;
5496 }
5497 case AMDGPU::SIMULATED_TRAP: {
5498 assert(Subtarget->hasPrivEnabledTrap2NopBug());
5500 MachineBasicBlock *SplitBB =
5501 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
5502 MI.eraseFromParent();
5503 return SplitBB;
5504 }
5505 default:
5506 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
5507 if (!MI.mayStore())
5509 return BB;
5510 }
5512 }
5513}
5514
5516 // This currently forces unfolding various combinations of fsub into fma with
5517 // free fneg'd operands. As long as we have fast FMA (controlled by
5518 // isFMAFasterThanFMulAndFAdd), we should perform these.
5519
5520 // When fma is quarter rate, for f64 where add / sub are at best half rate,
5521 // most of these combines appear to be cycle neutral but save on instruction
5522 // count / code size.
5523 return true;
5524}
5525
5527
5529 EVT VT) const {
5530 if (!VT.isVector()) {
5531 return MVT::i1;
5532 }
5533 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
5534}
5535
5537 // TODO: Should i16 be used always if legal? For now it would force VALU
5538 // shifts.
5539 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
5540}
5541
5543 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
5544 ? Ty.changeElementSize(16)
5545 : Ty.changeElementSize(32);
5546}
5547
5548 // Answering this is somewhat tricky and depends on the specific device, since
5549 // devices have different rates for fma and for f64 operations in general.
5550//
5551// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
5552// regardless of which device (although the number of cycles differs between
5553// devices), so it is always profitable for f64.
5554//
5555// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
5556// only on full rate devices. Normally, we should prefer selecting v_mad_f32
5557// which we can always do even without fused FP ops since it returns the same
5558// result as the separate operations and since it is always full
5559// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
5560// however does not support denormals, so we do report fma as faster if we have
5561// a fast fma device and require denormals.
5562//
5564 EVT VT) const {
5565 VT = VT.getScalarType();
5566
5567 switch (VT.getSimpleVT().SimpleTy) {
5568 case MVT::f32: {
5569 // If mad is not available this depends only on if f32 fma is full rate.
5570 if (!Subtarget->hasMadMacF32Insts())
5571 return Subtarget->hasFastFMAF32();
5572
5573 // Otherwise f32 mad is always full rate and returns the same result as
5574 // the separate operations so should be preferred over fma.
5575 // However, it does not support denormals.
5576 if (!denormalModeIsFlushAllF32(MF))
5577 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
5578
5579 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
5580 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
5581 }
5582 case MVT::f64:
5583 return true;
5584 case MVT::f16:
5585 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
5586 default:
5587 break;
5588 }
5589
5590 return false;
5591}
5592
5594 LLT Ty) const {
5595 switch (Ty.getScalarSizeInBits()) {
5596 case 16:
5597 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
5598 case 32:
5599 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
5600 case 64:
5601 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
5602 default:
5603 break;
5604 }
5605
5606 return false;
5607}
5608
5610 if (!Ty.isScalar())
5611 return false;
5612
5613 if (Ty.getScalarSizeInBits() == 16)
5614 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
5615 if (Ty.getScalarSizeInBits() == 32)
5616 return Subtarget->hasMadMacF32Insts() &&
5617 denormalModeIsFlushAllF32(*MI.getMF());
5618
5619 return false;
5620}
5621
5623 const SDNode *N) const {
5624 // TODO: Check future ftz flag
5625 // v_mad_f32/v_mac_f32 do not support denormals.
5626 EVT VT = N->getValueType(0);
5627 if (VT == MVT::f32)
5628 return Subtarget->hasMadMacF32Insts() &&
5629 denormalModeIsFlushAllF32(DAG.getMachineFunction());
5630 if (VT == MVT::f16) {
5631 return Subtarget->hasMadF16() &&
5632 denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
5633 }
5634
5635 return false;
5636}
5637
5638//===----------------------------------------------------------------------===//
5639// Custom DAG Lowering Operations
5640//===----------------------------------------------------------------------===//
5641
5642// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5643// wider vector type is legal.
5645 SelectionDAG &DAG) const {
5646 unsigned Opc = Op.getOpcode();
5647 EVT VT = Op.getValueType();
5648 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5649 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5650 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5651 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5652
5653 SDValue Lo, Hi;
5654 std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
5655
5656 SDLoc SL(Op);
5657 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
5658 Op->getFlags());
5659 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
5660 Op->getFlags());
5661
5662 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5663}
5664
5665// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
5666// wider vector type is legal.
5668 SelectionDAG &DAG) const {
5669 unsigned Opc = Op.getOpcode();
5670 EVT VT = Op.getValueType();
5671 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
5672 VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
5673 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5674 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
5675
5676 SDValue Lo0, Hi0;
5677 std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
5678 SDValue Lo1, Hi1;
5679 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5680
5681 SDLoc SL(Op);
5682
5683 SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
5684 Op->getFlags());
5685 SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
5686 Op->getFlags());
5687
5688 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5689}
5690
5692 SelectionDAG &DAG) const {
5693 unsigned Opc = Op.getOpcode();
5694 EVT VT = Op.getValueType();
5695 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
5696 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
5697 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
5698 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
5699 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
5700 VT == MVT::v32bf16);
5701
5702 SDValue Lo0, Hi0;
5703 SDValue Op0 = Op.getOperand(0);
5704 std::tie(Lo0, Hi0) = Op0.getValueType().isVector()
5705 ? DAG.SplitVectorOperand(Op.getNode(), 0)
5706 : std::pair(Op0, Op0);
5707 SDValue Lo1, Hi1;
5708 std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
5709 SDValue Lo2, Hi2;
5710 std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
5711
5712 SDLoc SL(Op);
5713 auto ResVT = DAG.GetSplitDestVTs(VT);
5714
5715 SDValue OpLo = DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
5716 Op->getFlags());
5717 SDValue OpHi = DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
5718 Op->getFlags());
5719
5720 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
5721}
5722
5723
5725 switch (Op.getOpcode()) {
5726 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
5727 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
5728 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
5729 case ISD::LOAD: {
5730 SDValue Result = LowerLOAD(Op, DAG);
5731 assert((!Result.getNode() ||
5732 Result.getNode()->getNumValues() == 2) &&
5733 "Load should return a value and a chain");
5734 return Result;
5735 }
5736 case ISD::FSQRT: {
5737 EVT VT = Op.getValueType();
5738 if (VT == MVT::f32)
5739 return lowerFSQRTF32(Op, DAG);
5740 if (VT == MVT::f64)
5741 return lowerFSQRTF64(Op, DAG);
5742 return SDValue();
5743 }
5744 case ISD::FSIN:
5745 case ISD::FCOS:
5746 return LowerTrig(Op, DAG);
5747 case ISD::SELECT: return LowerSELECT(Op, DAG);
5748 case ISD::FDIV: return LowerFDIV(Op, DAG);
5749 case ISD::FFREXP: return LowerFFREXP(Op, DAG);
5750 case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
5751 case ISD::STORE: return LowerSTORE(Op, DAG);
5752 case ISD::GlobalAddress: {
5755 return LowerGlobalAddress(MFI, Op, DAG);
5756 }
5757 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
5758 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
5759 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
5760 case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
5761 case ISD::INSERT_SUBVECTOR:
5762 return lowerINSERT_SUBVECTOR(Op, DAG);
5763 case ISD::INSERT_VECTOR_ELT:
5764 return lowerINSERT_VECTOR_ELT(Op, DAG);
5765 case ISD::EXTRACT_VECTOR_ELT:
5766 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
5767 case ISD::VECTOR_SHUFFLE:
5768 return lowerVECTOR_SHUFFLE(Op, DAG);
5769 case ISD::SCALAR_TO_VECTOR:
5770 return lowerSCALAR_TO_VECTOR(Op, DAG);
5771 case ISD::BUILD_VECTOR:
5772 return lowerBUILD_VECTOR(Op, DAG);
5773 case ISD::FP_ROUND:
5774 case ISD::STRICT_FP_ROUND:
5775 return lowerFP_ROUND(Op, DAG);
5776 case ISD::FPTRUNC_ROUND: {
5777 unsigned Opc;
5778 SDLoc DL(Op);
5779
5780 if (Op.getOperand(0)->getValueType(0) != MVT::f32)
5781 return SDValue();
5782
5783 // Get the rounding mode from the last operand
5784 int RoundMode = Op.getConstantOperandVal(1);
5785 if (RoundMode == (int)RoundingMode::TowardPositive)
5787 else if (RoundMode == (int)RoundingMode::TowardNegative)
5789 else
5790 return SDValue();
5791
5792 return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0));
5793 }
5794 case ISD::TRAP:
5795 return lowerTRAP(Op, DAG);
5796 case ISD::DEBUGTRAP:
5797 return lowerDEBUGTRAP(Op, DAG);
5798 case ISD::FABS:
5799 case ISD::FNEG:
5800 case ISD::FCANONICALIZE:
5801 case ISD::BSWAP:
5802 return splitUnaryVectorOp(Op, DAG);
5803 case ISD::FMINNUM:
5804 case ISD::FMAXNUM:
5805 return lowerFMINNUM_FMAXNUM(Op, DAG);
5806 case ISD::FLDEXP:
5807 case ISD::STRICT_FLDEXP:
5808 return lowerFLDEXP(Op, DAG);
5809 case ISD::FMA:
5810 return splitTernaryVectorOp(Op, DAG);
5811 case ISD::FP_TO_SINT:
5812 case ISD::FP_TO_UINT:
5813 return LowerFP_TO_INT(Op, DAG);
5814 case ISD::SHL:
5815 case ISD::SRA:
5816 case ISD::SRL:
5817 case ISD::ADD:
5818 case ISD::SUB:
5819 case ISD::SMIN:
5820 case ISD::SMAX:
5821 case ISD::UMIN:
5822 case ISD::UMAX:
5823 case ISD::FADD:
5824 case ISD::FMUL:
5825 case ISD::FMINNUM_IEEE:
5826 case ISD::FMAXNUM_IEEE:
5827 case ISD::FMINIMUM:
5828 case ISD::FMAXIMUM:
5829 case ISD::UADDSAT:
5830 case ISD::USUBSAT:
5831 case ISD::SADDSAT:
5832 case ISD::SSUBSAT:
5833 return splitBinaryVectorOp(Op, DAG);
5834 case ISD::MUL:
5835 return lowerMUL(Op, DAG);
5836 case ISD::SMULO:
5837 case ISD::UMULO:
5838 return lowerXMULO(Op, DAG);
5839 case ISD::SMUL_LOHI:
5840 case ISD::UMUL_LOHI:
5841 return lowerXMUL_LOHI(Op, DAG);
5842 case ISD::DYNAMIC_STACKALLOC:
5843 return LowerDYNAMIC_STACKALLOC(Op, DAG);
5844 case ISD::STACKSAVE:
5845 return LowerSTACKSAVE(Op, DAG);
5846 case ISD::GET_ROUNDING:
5847 return lowerGET_ROUNDING(Op, DAG);
5848 case ISD::SET_ROUNDING:
5849 return lowerSET_ROUNDING(Op, DAG);
5850 case ISD::PREFETCH:
5851 return lowerPREFETCH(Op, DAG);
5852 case ISD::FP_EXTEND:
5853 case ISD::STRICT_FP_EXTEND:
5854 return lowerFP_EXTEND(Op, DAG);
5855 case ISD::GET_FPENV:
5856 return lowerGET_FPENV(Op, DAG);
5857 case ISD::SET_FPENV:
5858 return lowerSET_FPENV(Op, DAG);
5859 }
5860 return SDValue();
5861}
5862
5863// Used for D16: Casts the result of an instruction into the right vector,
5864// packs values if loads return unpacked values.
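// For example, on subtargets with unpacked D16 memory instructions, a d16 load
// of v3f16 comes back as three 32-bit results; each element is truncated to
// i16, padded with an undef element to v4i16, and bitcast to the widened
// v4f16 result type (see the Unpacked path below).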
5866 const SDLoc &DL,
5867 SelectionDAG &DAG, bool Unpacked) {
5868 if (!LoadVT.isVector())
5869 return Result;
5870
5871 // Cast back to the original packed type or to a larger type that is a
5872 // multiple of 32 bits for D16. Widening the return type is required for
5873 // legalization.
5874 EVT FittingLoadVT = LoadVT;
5875 if ((LoadVT.getVectorNumElements() % 2) == 1) {
5876 FittingLoadVT =
5878 LoadVT.getVectorNumElements() + 1);
5879 }
5880
5881 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
5882 // Truncate to v2i16/v4i16.
5883 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
5884
5885 // Work around the legalizer not scalarizing the truncate after vector op
5886 // legalization and not creating an intermediate vector trunc.
5888 DAG.ExtractVectorElements(Result, Elts);
5889 for (SDValue &Elt : Elts)
5890 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
5891
5892 // Pad illegal v1i16/v3f16 to v4i16
5893 if ((LoadVT.getVectorNumElements() % 2) == 1)
5894 Elts.push_back(DAG.getUNDEF(MVT::i16));
5895
5896 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
5897
5898 // Bitcast to original type (v2f16/v4f16).
5899 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5900 }
5901
5902 // Cast back to the original packed type.
5903 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
5904}
5905
5906SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
5907 MemSDNode *M,
5908 SelectionDAG &DAG,
5910 bool IsIntrinsic) const {
5911 SDLoc DL(M);
5912
5913 bool Unpacked = Subtarget->hasUnpackedD16VMem();
5914 EVT LoadVT = M->getValueType(0);
5915
5916 EVT EquivLoadVT = LoadVT;
5917 if (LoadVT.isVector()) {
5918 if (Unpacked) {
5919 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
5920 LoadVT.getVectorNumElements());
5921 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
5922 // Widen v3f16 to legal type
5923 EquivLoadVT =
5925 LoadVT.getVectorNumElements() + 1);
5926 }
5927 }
5928
5929 // Change from v4f16/v2f16 to EquivLoadVT.
5930 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
5931
5933 = DAG.getMemIntrinsicNode(
5934 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
5935 VTList, Ops, M->getMemoryVT(),
5936 M->getMemOperand());
5937
5938 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
5939
5940 return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
5941}
5942
5943SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
5944 SelectionDAG &DAG,
5945 ArrayRef<SDValue> Ops) const {
5946 SDLoc DL(M);
5947 EVT LoadVT = M->getValueType(0);
5948 EVT EltType = LoadVT.getScalarType();
5949 EVT IntVT = LoadVT.changeTypeToInteger();
5950
5951 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
5952
5953 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
5954 bool IsTFE = M->getNumValues() == 3;
5955
5956 unsigned Opc;
5957 if (IsFormat) {
5958 Opc = IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
5959 : AMDGPUISD::BUFFER_LOAD_FORMAT;
5960 } else {
5961 // TODO: Support non-format TFE loads.
5962 if (IsTFE)
5963 return SDValue();
5964 Opc = AMDGPUISD::BUFFER_LOAD;
5965 }
5966
5967 if (IsD16) {
5968 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
5969 }
5970
5971 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
5972 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
5973 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand());
5974
5975 if (isTypeLegal(LoadVT)) {
5976 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
5977 M->getMemOperand(), DAG);
5978 }
5979
5980 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
5981 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
5982 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
5983 M->getMemOperand(), DAG);
5984 return DAG.getMergeValues(
5985 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
5986 DL);
5987}
5988
5990 SDNode *N, SelectionDAG &DAG) {
5991 EVT VT = N->getValueType(0);
5992 unsigned CondCode = N->getConstantOperandVal(3);
5993 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
5994 return DAG.getUNDEF(VT);
5995
5996 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
5997
5998 SDValue LHS = N->getOperand(1);
5999 SDValue RHS = N->getOperand(2);
6000
6001 SDLoc DL(N);
6002
6003 EVT CmpVT = LHS.getValueType();
6004 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
6005 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ?
6007 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
6008 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
6009 }
6010
6011 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
6012
6013 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6014 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6015
6016 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
6017 DAG.getCondCode(CCOpcode));
6018 if (VT.bitsEq(CCVT))
6019 return SetCC;
6020 return DAG.getZExtOrTrunc(SetCC, DL, VT);
6021}
6022
6024 SDNode *N, SelectionDAG &DAG) {
6025 EVT VT = N->getValueType(0);
6026
6027 unsigned CondCode = N->getConstantOperandVal(3);
6028 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
6029 return DAG.getUNDEF(VT);
6030
6031 SDValue Src0 = N->getOperand(1);
6032 SDValue Src1 = N->getOperand(2);
6033 EVT CmpVT = Src0.getValueType();
6034 SDLoc SL(N);
6035
6036 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
6037 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
6038 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
6039 }
6040
6041 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
6042 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
6043 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
6044 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
6045 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0,
6046 Src1, DAG.getCondCode(CCOpcode));
6047 if (VT.bitsEq(CCVT))
6048 return SetCC;
6049 return DAG.getZExtOrTrunc(SetCC, SL, VT);
6050}
6051
6053 SelectionDAG &DAG) {
6054 EVT VT = N->getValueType(0);
6055 SDValue Src = N->getOperand(1);
6056 SDLoc SL(N);
6057
6058 if (Src.getOpcode() == ISD::SETCC) {
6059 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
6060 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
6061 Src.getOperand(1), Src.getOperand(2));
6062 }
6063 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
6064 // (ballot 0) -> 0
6065 if (Arg->isZero())
6066 return DAG.getConstant(0, SL, VT);
6067
6068 // (ballot 1) -> EXEC/EXEC_LO
6069 if (Arg->isOne()) {
6070 Register Exec;
6071 if (VT.getScalarSizeInBits() == 32)
6072 Exec = AMDGPU::EXEC_LO;
6073 else if (VT.getScalarSizeInBits() == 64)
6074 Exec = AMDGPU::EXEC;
6075 else
6076 return SDValue();
6077
6078 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
6079 }
6080 }
6081
6082 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
6083 // ISD::SETNE)
6084 return DAG.getNode(
6085 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
6086 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
6087}
6088
6091 SelectionDAG &DAG) const {
6092 switch (N->getOpcode()) {
6094 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
6095 Results.push_back(Res);
6096 return;
6097 }
6099 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
6100 Results.push_back(Res);
6101 return;
6102 }
6104 unsigned IID = N->getConstantOperandVal(0);
6105 switch (IID) {
6106 case Intrinsic::amdgcn_make_buffer_rsrc:
6107 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
6108 return;
6109 case Intrinsic::amdgcn_cvt_pkrtz: {
6110 SDValue Src0 = N->getOperand(1);
6111 SDValue Src1 = N->getOperand(2);
6112 SDLoc SL(N);
6113 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
6114 Src0, Src1);
6115 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
6116 return;
6117 }
6118 case Intrinsic::amdgcn_cvt_pknorm_i16:
6119 case Intrinsic::amdgcn_cvt_pknorm_u16:
6120 case Intrinsic::amdgcn_cvt_pk_i16:
6121 case Intrinsic::amdgcn_cvt_pk_u16: {
6122 SDValue Src0 = N->getOperand(1);
6123 SDValue Src1 = N->getOperand(2);
6124 SDLoc SL(N);
6125 unsigned Opcode;
6126
6127 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
6128 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
6129 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
6130 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
6131 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
6132 Opcode = AMDGPUISD::CVT_PK_I16_I32;
6133 else
6134 Opcode = AMDGPUISD::CVT_PK_U16_U32;
6135
6136 EVT VT = N->getValueType(0);
6137 if (isTypeLegal(VT))
6138 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
6139 else {
6140 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
6141 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
6142 }
6143 return;
6144 }
6145 case Intrinsic::amdgcn_s_buffer_load: {
6146 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
6147 // s_buffer_load_u8 for both signed and unsigned load instructions. Next, the
6148 // DAG combiner tries to merge the s_buffer_load_u8 with a sext instruction
6149 // (performSignExtendInRegCombine()) and replaces s_buffer_load_u8 with
6150 // s_buffer_load_i8.
6151 if (!Subtarget->hasScalarSubwordLoads())
6152 return;
6153 SDValue Op = SDValue(N, 0);
6154 SDValue Rsrc = Op.getOperand(1);
6155 SDValue Offset = Op.getOperand(2);
6156 SDValue CachePolicy = Op.getOperand(3);
6157 EVT VT = Op.getValueType();
6158 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
6159 SDLoc DL(Op);
6161 const DataLayout &DataLayout = DAG.getDataLayout();
6162 Align Alignment =
6168 VT.getStoreSize(), Alignment);
6169 SDValue LoadVal;
6170 if (!Offset->isDivergent()) {
6171 SDValue Ops[] = {Rsrc, // source register
6172 Offset, CachePolicy};
6173 SDValue BufferLoad =
6175 DAG.getVTList(MVT::i32), Ops, VT, MMO);
6176 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
6177 } else {
6178 SDValue Ops[] = {
6179 DAG.getEntryNode(), // Chain
6180 Rsrc, // rsrc
6181 DAG.getConstant(0, DL, MVT::i32), // vindex
6182 {}, // voffset
6183 {}, // soffset
6184 {}, // offset
6185 CachePolicy, // cachepolicy
6186 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
6187 };
6188 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
6189 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
6190 }
6191 Results.push_back(LoadVal);
6192 return;
6193 }
6194 }
6195 break;
6196 }
6198 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
6199 if (Res.getOpcode() == ISD::MERGE_VALUES) {
6200 // FIXME: Hacky
6201 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
6202 Results.push_back(Res.getOperand(I));
6203 }
6204 } else {
6205 Results.push_back(Res);
6206 Results.push_back(Res.getValue(1));
6207 }
6208 return;
6209 }
6210
6211 break;
6212 }
6213 case ISD::SELECT: {
6214 SDLoc SL(N);
6215 EVT VT = N->getValueType(0);
6216 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
6217 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
6218 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
6219
6220 EVT SelectVT = NewVT;
6221 if (NewVT.bitsLT(MVT::i32)) {
6222 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
6223 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
6224 SelectVT = MVT::i32;
6225 }
6226
6227 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, SelectVT,
6228 N->getOperand(0), LHS, RHS);
6229
6230 if (NewVT != SelectVT)
6231 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
6232 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
6233 return;
6234 }
6235 case ISD::FNEG: {
6236 if (N->getValueType(0) != MVT::v2f16)
6237 break;
6238
6239 SDLoc SL(N);
6240 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6241
6242 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
6243 BC,
6244 DAG.getConstant(0x80008000, SL, MVT::i32));
6245 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6246 return;
6247 }
6248 case ISD::FABS: {
6249 if (N->getValueType(0) != MVT::v2f16)
6250 break;
6251
6252 SDLoc SL(N);
6253 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
6254
6255 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
6256 BC,
6257 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
6258 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
6259 return;
6260 }
6261 case ISD::FSQRT: {
6262 if (N->getValueType(0) != MVT::f16)
6263 break;
6264 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
6265 break;
6266 }
6267 default:
6269 break;
6270 }
6271}
6272
6273/// Helper function for LowerBRCOND
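/// Returns the first user of \p Value whose opcode is \p Opcode, or nullptr if
/// there is none.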
6274static SDNode *findUser(SDValue Value, unsigned Opcode) {
6275
6276 SDNode *Parent = Value.getNode();
6277 for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end();
6278 I != E; ++I) {
6279
6280 if (I.getUse().get() != Value)
6281 continue;
6282
6283 if (I->getOpcode() == Opcode)
6284 return *I;
6285 }
6286 return nullptr;
6287}
6288
6289unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
6290 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
6291 switch (Intr->getConstantOperandVal(1)) {
6292 case Intrinsic::amdgcn_if:
6293 return AMDGPUISD::IF;
6294 case Intrinsic::amdgcn_else:
6295 return AMDGPUISD::ELSE;
6296 case Intrinsic::amdgcn_loop:
6297 return AMDGPUISD::LOOP;
6298 case Intrinsic::amdgcn_end_cf:
6299 llvm_unreachable("should not occur");
6300 default:
6301 return 0;
6302 }
6303 }
6304
6305 // break, if_break, else_break are all only used as inputs to loop, not
6306 // directly as branch conditions.
6307 return 0;
6308}
6309
6311 const Triple &TT = getTargetMachine().getTargetTriple();
6315}
6316
6318 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
6319 return false;
6320
6321 // FIXME: Either avoid relying on address space here or change the default
6322 // address space for functions to avoid the explicit check.
6323 return (GV->getValueType()->isFunctionTy() ||
6326}
6327
6329 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
6330}
6331
6333 if (!GV->hasExternalLinkage())
6334 return true;
6335
6336 const auto OS = getTargetMachine().getTargetTriple().getOS();
6337 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
6338}
6339
6340 /// This transforms the control flow intrinsics to get the branch destination
6341 /// as the last parameter; it also switches the branch target with BR if the need arises.
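/// For example, a BRCOND fed by llvm.amdgcn.if is rebuilt as an AMDGPUISD::IF
/// node that carries the branch target as its final operand, and any existing
/// unconditional BR user is redirected to the BRCOND's original destination.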
6342SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
6343 SelectionDAG &DAG) const {
6344 SDLoc DL(BRCOND);
6345
6346 SDNode *Intr = BRCOND.getOperand(1).getNode();
6347 SDValue Target = BRCOND.getOperand(2);
6348 SDNode *BR = nullptr;
6349 SDNode *SetCC = nullptr;
6350
6351 if (Intr->getOpcode() == ISD::SETCC) {
6352 // As long as we negate the condition everything is fine
6353 SetCC = Intr;
6354 Intr = SetCC->getOperand(0).getNode();
6355
6356 } else {
6357 // Get the target from BR if we don't negate the condition
6358 BR = findUser(BRCOND, ISD::BR);
6359 assert(BR && "brcond missing unconditional branch user");
6360 Target = BR->getOperand(1);
6361 }
6362
6363 unsigned CFNode = isCFIntrinsic(Intr);
6364 if (CFNode == 0) {
6365 // This is a uniform branch so we don't need to legalize.
6366 return BRCOND;
6367 }
6368
6369 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
6370 Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN;
6371
6372 assert(!SetCC ||
6373 (SetCC->getConstantOperandVal(1) == 1 &&
6374 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
6375 ISD::SETNE));
6376
6377 // operands of the new intrinsic call
6379 if (HaveChain)
6380 Ops.push_back(BRCOND.getOperand(0));
6381
6382 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
6383 Ops.push_back(Target);
6384
6385 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
6386
6387 // build the new intrinsic call
6388 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
6389
6390 if (!HaveChain) {
6391 SDValue Ops[] = {
6392 SDValue(Result, 0),
6393 BRCOND.getOperand(0)
6394 };
6395
6396 Result = DAG.getMergeValues(Ops, DL).getNode();
6397 }
6398
6399 if (BR) {
6400 // Give the branch instruction our target
6401 SDValue Ops[] = {
6402 BR->getOperand(0),
6403 BRCOND.getOperand(2)
6404 };
6405 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
6406 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
6407 }
6408
6409 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
6410
6411 // Copy the intrinsic results to registers
6412 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
6414 if (!CopyToReg)
6415 continue;
6416
6417 Chain = DAG.getCopyToReg(
6418 Chain, DL,
6419 CopyToReg->getOperand(1),
6420 SDValue(Result, i - 1),
6421 SDValue());
6422
6423 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
6424 }
6425
6426 // Remove the old intrinsic from the chain
6428 SDValue(Intr, Intr->getNumValues() - 1),
6429 Intr->getOperand(0));
6430
6431 return Chain;
6432}
6433
6434SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
6435 SelectionDAG &DAG) const {
6436 MVT VT = Op.getSimpleValueType();
6437 SDLoc DL(Op);
6438 // Checking the depth
6439 if (Op.getConstantOperandVal(0) != 0)
6440 return DAG.getConstant(0, DL, VT);
6441
6444 // Check for kernel and shader functions
6445 if (Info->isEntryFunction())
6446 return DAG.getConstant(0, DL, VT);
6447
6448 MachineFrameInfo &MFI = MF.getFrameInfo();
6449 // There is a call to @llvm.returnaddress in this function
6450 MFI.setReturnAddressIsTaken(true);
6451
6453 // Get the return address reg and mark it as an implicit live-in
6454 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF), getRegClassFor(VT, Op.getNode()->isDivergent()));
6455
6456 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
6457}
6458
6459SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
6460 SDValue Op,
6461 const SDLoc &DL,
6462 EVT VT) const {
6463 return Op.getValueType().bitsLE(VT) ?
6464 DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
6465 DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
6466 DAG.getTargetConstant(0, DL, MVT::i32));
6467}
6468
6469SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
6470 assert(Op.getValueType() == MVT::f16 &&
6471 "Do not know how to custom lower FP_ROUND for non-f16 type");
6472
6473 SDValue Src = Op.getOperand(0);
6474 EVT SrcVT = Src.getValueType();
6475 if (SrcVT != MVT::f64)
6476 return Op;
6477
6478 // TODO: Handle strictfp
6479 if (Op.getOpcode() != ISD::FP_ROUND)
6480 return Op;
6481
6482 SDLoc DL(Op);
6483
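// Convert the f64 source to the half bit pattern (FP_TO_FP16 returns the bits
// in an i32), then truncate to i16 and bitcast to f16.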
6484 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
6485 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
6486 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
6487}
6488
6489SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
6490 SelectionDAG &DAG) const {
6491 EVT VT = Op.getValueType();
6492 const MachineFunction &MF = DAG.getMachineFunction();
6494 bool IsIEEEMode = Info->getMode().IEEE;
6495
6496 // FIXME: Assert during selection that this is only selected for
6497 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
6498 // mode functions, but this happens to be OK since it's only done in cases
6499 // where there is known no sNaN.
6500 if (IsIEEEMode)
6501 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
6502
6503 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
6504 VT == MVT::v16bf16)
6505 return splitBinaryVectorOp(Op, DAG);
6506 return Op;
6507}
6508
6509SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
6510 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
6511 EVT VT = Op.getValueType();
6512 assert(VT == MVT::f16);
6513
6514 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
6515 EVT ExpVT = Exp.getValueType();
6516 if (ExpVT == MVT::i16)
6517 return Op;
6518
6519 SDLoc DL(Op);
6520
6521 // Correct the exponent type for f16 to i16.
6522 // Clamp the range of the exponent to the instruction's range.
6523
6524 // TODO: This should be a generic narrowing legalization, and can easily be
6525 // done for GlobalISel.
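// For example, an i32 exponent is clamped to [-32768, 32767] with smax/smin
// before being truncated to i16 below.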
6526
6527 SDValue MinExp = DAG.getConstant(minIntN(16), DL, ExpVT);
6528 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
6529
6530 SDValue MaxExp = DAG.getConstant(maxIntN(16), DL, ExpVT);
6531 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
6532
6533 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
6534
6535 if (IsStrict) {
6536 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
6537 {Op.getOperand(0), Op.getOperand(1), TruncExp});
6538 }
6539
6540 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
6541}
6542
6543// Custom lowering for vector multiplications and s_mul_u64.
6544SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
6545 EVT VT = Op.getValueType();
6546
6547 // Split vector operands.
6548 if (VT.isVector())
6549 return splitBinaryVectorOp(Op, DAG);
6550
6551 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
6552
6553 // There are four ways to lower s_mul_u64:
6554 //
6555 // 1. If all the operands are uniform, then we lower it as it is.
6556 //
6557 // 2. If the operands are divergent, then we have to split s_mul_u64 into
6558 // 32-bit multiplications because there is no vector equivalent of s_mul_u64.
6559 //
6560 // 3. If the cost model decides that it is more efficient to use vector
6561 // registers, then we have to split s_mul_u64 into 32-bit multiplications.
6562 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp.
6563 //
6564 // 4. If the cost model decides to use vector registers and both of the
6565 // operands are zero-extended/sign-extended from 32 bits, then we split the
6566 // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
6567 // possible to check if the operands are zero-extended or sign-extended in
6568 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
6569 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
6570 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
6571 // If the cost model decides that we have to use vector registers, then
6572 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
6573 // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
6574 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
6575 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
6576 // SIInstrInfo.cpp.
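// For the uniform case below: if known-bits analysis shows at least 32 leading
// zero bits on both operands, s_mul_u64 is replaced with S_MUL_U64_U32_PSEUDO;
// if both operands have at least 33 sign bits, with S_MUL_I64_I32_PSEUDO;
// otherwise it is left as s_mul_u64.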
6577
6578 if (Op->isDivergent())
6579 return SDValue();
6580
6581 SDValue Op0 = Op.getOperand(0);
6582 SDValue Op1 = Op.getOperand(1);
6583 // If all the operands are zero-extended to 32 bits, then we replace s_mul_u64
6584 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
6585 // 32 bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
6586 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
6587 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
6588 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
6589 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
6590 SDLoc SL(Op);
6591 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
6592 return SDValue(
6593 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
6594 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
6595 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
6596 if (Op0SignBits >= 33 && Op1SignBits >= 33)
6597 return SDValue(
6598 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
6599 // If all the operands are uniform, then we lower s_mul_u64 as it is.
6600 return Op;
6601}
6602
6603SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
6604 EVT VT = Op.getValueType();
6605 SDLoc SL(Op);
6606 SDValue LHS = Op.getOperand(0);
6607 SDValue RHS = Op.getOperand(1);
6608 bool isSigned = Op.getOpcode() == ISD::SMULO;
6609
6610 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
6611 const APInt &C = RHSC->getAPIntValue();
6612 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
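// For example, umulo(x, 8) becomes { x << 3, (x << 3) >> 3 != x } using a
// logical shift right; the signed form uses an arithmetic shift unless the
// constant is the minimum signed value.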
6613 if (C.isPowerOf2()) {
6614 // smulo(x, signed_min) is same as umulo(x, signed_min).
6615 bool UseArithShift = isSigned && !C.isMinSignedValue();
6616 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
6617 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
6618 SDValue Overflow = DAG.getSetCC(SL, MVT::i1,
6619 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
6620 SL, VT, Result, ShiftAmt),
6621 LHS, ISD::SETNE);
6622 return DAG.getMergeValues({ Result, Overflow }, SL);
6623 }
6624 }
6625
6626 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
6628 SL, VT, LHS, RHS);
6629
6630 SDValue Sign = isSigned
6631 ? DAG.getNode(ISD::SRA, SL, VT, Result,
6632 DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32))
6633 : DAG.getConstant(0, SL, VT);
6634 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
6635
6636 return DAG.getMergeValues({ Result, Overflow }, SL);
6637}
6638
6639SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
6640 if (Op->isDivergent()) {
6641 // Select to V_MAD_[IU]64_[IU]32.
6642 return Op;
6643 }
6644 if (Subtarget->hasSMulHi()) {
6645 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
6646 return SDValue();
6647 }
6648 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
6649 // calculate the high part, so we might as well do the whole thing with
6650 // V_MAD_[IU]64_[IU]32.
6651 return Op;
6652}
6653
6654SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
6655 if (!Subtarget->isTrapHandlerEnabled() ||
6657 return lowerTrapEndpgm(Op, DAG);
6658
6659 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :
6660 lowerTrapHsaQueuePtr(Op, DAG);
6661}
6662
6663SDValue SITargetLowering::lowerTrapEndpgm(
6664 SDValue Op, SelectionDAG &DAG) const {
6665 SDLoc SL(Op);
6666 SDValue Chain = Op.getOperand(0);
6667 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
6668}
6669
6670SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
6671 const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {
6674 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
6676 return DAG.getLoad(VT, DL, DAG.getEntryNode(), Ptr, PtrInfo, Alignment,
6679}
6680
6681SDValue SITargetLowering::lowerTrapHsaQueuePtr(
6682 SDValue Op, SelectionDAG &DAG) const {
6683 SDLoc SL(Op);
6684 SDValue Chain = Op.getOperand(0);
6685
6686 SDValue QueuePtr;
6687 // For code object version 5, QueuePtr is passed through implicit kernarg.
6688 const Module *M = DAG.getMachineFunction().getFunction().getParent();
6690 QueuePtr =
6691 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
6692 } else {
6695 Register UserSGPR = Info->getQueuePtrUserSGPR();
6696
6697 if (UserSGPR == AMDGPU::NoRegister) {
6698 // We probably are in a function incorrectly marked with
6699 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
6700 // trap, so just use a null pointer.
6701 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
6702 } else {
6703 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
6704 MVT::i64);
6705 }
6706 }
6707
6708 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
6709 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
6710 QueuePtr, SDValue());
6711
6713 SDValue Ops[] = {
6714 ToReg,
6715 DAG.getTargetConstant(TrapID, SL, MVT::i16),
6716 SGPR01,
6717 ToReg.getValue(1)
6718 };
6719 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6720}
6721
6722SDValue SITargetLowering::lowerTrapHsa(
6723 SDValue Op, SelectionDAG &DAG) const {
6724 SDLoc SL(Op);
6725 SDValue Chain = Op.getOperand(0);
6726
6727 // We need to simulate the 's_trap 2' instruction on targets that run in
6728 // PRIV=1 (where it is treated as a nop).
6729 if (Subtarget->hasPrivEnabledTrap2NopBug())
6730 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
6731
6733 SDValue Ops[] = {
6734 Chain,
6735 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6736 };
6737 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6738}
6739
6740SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
6741 SDLoc SL(Op);
6742 SDValue Chain = Op.getOperand(0);
6744
6745 if (!Subtarget->isTrapHandlerEnabled() ||
6748 "debugtrap handler not supported",
6749 Op.getDebugLoc(),
6750 DS_Warning);
6751 LLVMContext &Ctx = MF.getFunction().getContext();
6752 Ctx.diagnose(NoTrap);
6753 return Chain;
6754 }
6755
6757 SDValue Ops[] = {
6758 Chain,
6759 DAG.getTargetConstant(TrapID, SL, MVT::i16)
6760 };
6761 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
6762}
6763
6764SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
6765 SelectionDAG &DAG) const {
6766 if (Subtarget->hasApertureRegs()) {
6767 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
6768 ? AMDGPU::SRC_SHARED_BASE
6769 : AMDGPU::SRC_PRIVATE_BASE;
6770 // Note: this feature (register) is broken. When used as a 32-bit operand,
6771 // it returns a wrong value (all zeroes?). The real value is in the upper 32
6772 // bits.
6773 //
6774 // To work around the issue, directly emit a 64 bit mov from this register
6775 // then extract the high bits. Note that this shouldn't even result in a
6776 // shift being emitted; it should simply become a pair of registers (e.g.):
6777 // s_mov_b64 s[6:7], src_shared_base
6778 // v_mov_b32_e32 v1, s7
6779 //
6780 // FIXME: It would be more natural to emit a CopyFromReg here, but then copy
6781 // coalescing would kick in and it would think it's okay to use the "HI"
6782 // subregister directly (instead of extracting the HI 32 bits) which is an
6783 // artificial (unusable) register.
6784 // Register TableGen definitions would need an overhaul to get rid of the
6785 // artificial "HI" aperture registers and prevent this kind of issue from
6786 // happening.
6787 SDNode *Mov = DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64,
6788 DAG.getRegister(ApertureRegNo, MVT::i64));
6789 return DAG.getNode(
6790 ISD::TRUNCATE, DL, MVT::i32,
6791 DAG.getNode(ISD::SRL, DL, MVT::i64,
6792 {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));
6793 }
6794
6795 // For code object version 5, private_base and shared_base are passed through
6796 // implicit kernargs.
6797 const Module *M = DAG.getMachineFunction().getFunction().getParent();
6801 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
6802 }
6803
6806 Register UserSGPR = Info->getQueuePtrUserSGPR();
6807 if (UserSGPR == AMDGPU::NoRegister) {
6808 // We probably are in a function incorrectly marked with
6809 // amdgpu-no-queue-ptr. This is undefined.
6810 return DAG.getUNDEF(MVT::i32);
6811 }
6812
6813 SDValue QueuePtr = CreateLiveInRegister(
6814 DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
6815
6816 // Offset into amd_queue_t for group_segment_aperture_base_hi /
6817 // private_segment_aperture_base_hi.
6818 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
6819
6820 SDValue Ptr =
6821 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
6822
6823 // TODO: Use custom target PseudoSourceValue.
6824 // TODO: We should use the value from the IR intrinsic call, but it might not
6825 // be available and how do we get it?
6827 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
6828 commonAlignment(Align(64), StructOffset),
6831}
6832
6833/// Return true if the value is a known valid address, such that a null check is
6834/// not necessary.
6836 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
6837 if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
6838 isa<BasicBlockSDNode>(Val))
6839 return true;
6840
6841 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
6842 return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
6843
6844 // TODO: Search through arithmetic, handle arguments and loads
6845 // marked nonnull.
6846 return false;
6847}
6848
6849SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
6850 SelectionDAG &DAG) const {
6851 SDLoc SL(Op);
6852
6853 const AMDGPUTargetMachine &TM =
6854 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
6855
6856 unsigned DestAS, SrcAS;
6857 SDValue Src;
6858 bool IsNonNull = false;
6859 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
6860 SrcAS = ASC->getSrcAddressSpace();
6861 Src = ASC->getOperand(0);
6862 DestAS = ASC->getDestAddressSpace();
6863 } else {
6864 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
6865 Op.getConstantOperandVal(0) ==
6866 Intrinsic::amdgcn_addrspacecast_nonnull);
6867 Src = Op->getOperand(1);
6868 SrcAS = Op->getConstantOperandVal(2);
6869 DestAS = Op->getConstantOperandVal(3);
6870 IsNonNull = true;
6871 }
6872
6873 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
6874
6875 // flat -> local/private
6876 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
6877 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
6878 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
6879 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
6880
6881 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
6882 return Ptr;
6883
6884 unsigned NullVal = TM.getNullPointerValue(DestAS);
6885 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
6886 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
6887
6888 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
6889 SegmentNullPtr);
6890 }
6891 }
6892
6893 // local/private -> flat
6894 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
6895 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
6896 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
6897
6898 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
6899 SDValue CvtPtr =
6900 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
6901 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
6902
6903 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
6904 return CvtPtr;
6905
6906 unsigned NullVal = TM.getNullPointerValue(SrcAS);
6907 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
6908
6909 SDValue NonNull
6910 = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
6911
6912 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
6913 FlatNullPtr);
6914 }
6915 }
6916
6917 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
6918 Op.getValueType() == MVT::i64) {
6921 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
6922 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
6923 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
6924 }
6925
6926 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
6927 Src.getValueType() == MVT::i64)
6928 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
6929
6930 // global <-> flat are no-ops and never emitted.
6931
6932 const MachineFunction &MF = DAG.getMachineFunction();
6933 DiagnosticInfoUnsupported InvalidAddrSpaceCast(
6934 MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
6935 DAG.getContext()->diagnose(InvalidAddrSpaceCast);
6936
6937 return DAG.getUNDEF(Op->getValueType(0));
6938}
6939
6940// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
6941// the small vector and inserting them into the big vector. That is better than
6942// the default expansion of doing it via a stack slot. Even though the use of
6943// the stack slot would be optimized away afterwards, the stack slot itself
6944// remains.
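// For 16-bit elements inserted at an even index, the work is done in 32-bit
// chunks below; e.g. inserting v2i16 at index 2 of v4i16 becomes a single
// 32-bit element insert (at index 1) into the vector bitcast to v2i32.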
6945SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
6946 SelectionDAG &DAG) const {
6947 SDValue Vec = Op.getOperand(0);
6948 SDValue Ins = Op.getOperand(1);
6949 SDValue Idx = Op.getOperand(2);
6950 EVT VecVT = Vec.getValueType();
6951 EVT InsVT = Ins.getValueType();
6952 EVT EltVT = VecVT.getVectorElementType();
6953 unsigned InsNumElts = InsVT.getVectorNumElements();
6954 unsigned IdxVal = Idx->getAsZExtVal();
6955 SDLoc SL(Op);
6956
6957 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
6958 // Insert 32-bit registers at a time.
6959 assert(InsNumElts % 2 == 0 && "expect legal vector types");
6960
6961 unsigned VecNumElts = VecVT.getVectorNumElements();
6962 EVT NewVecVT =
6963 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
6964 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
6966 MVT::i32, InsNumElts / 2);
6967
6968 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
6969 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
6970
6971 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
6972 SDValue Elt;
6973 if (InsNumElts == 2) {
6974 Elt = Ins;
6975 } else {
6976 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
6977 DAG.getConstant(I, SL, MVT::i32));
6978 }
6979 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
6980 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
6981 }
6982
6983 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
6984 }
6985
6986 for (unsigned I = 0; I != InsNumElts; ++I) {
6987 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
6988 DAG.getConstant(I, SL, MVT::i32));
6989 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
6990 DAG.getConstant(IdxVal + I, SL, MVT::i32));
6991 }
6992 return Vec;
6993}
6994
6995SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
6996 SelectionDAG &DAG) const {
6997 SDValue Vec = Op.getOperand(0);
6998 SDValue InsVal = Op.getOperand(1);
6999 SDValue Idx = Op.getOperand(2);
7000 EVT VecVT = Vec.getValueType();
7001 EVT EltVT = VecVT.getVectorElementType();
7002 unsigned VecSize = VecVT.getSizeInBits();
7003 unsigned EltSize = EltVT.getSizeInBits();
7004 SDLoc SL(Op);
7005
7006 // Specially handle the case of v4i16 with static indexing.
7007 unsigned NumElts = VecVT.getVectorNumElements();
7008 auto KIdx = dyn_cast<ConstantSDNode>(Idx);
7009 if (NumElts == 4 && EltSize == 16 && KIdx) {
7010 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
7011
7012 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7013 DAG.getConstant(0, SL, MVT::i32));
7014 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
7015 DAG.getConstant(1, SL, MVT::i32));
7016
7017 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
7018 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
7019
7020 unsigned Idx = KIdx->getZExtValue();
7021 bool InsertLo = Idx < 2;
7022 SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
7023 InsertLo ? LoVec : HiVec,
7024 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
7025 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
7026
7027 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
7028
7029 SDValue Concat = InsertLo ?
7030 DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
7031 DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
7032
7033 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
7034 }
7035
7036 // Static indexing does not lower to stack access, and hence there is no need
7037 // for special custom lowering to avoid stack access.
7038 if (isa<ConstantSDNode>(Idx))
7039 return SDValue();
7040
7041 // Avoid stack access for dynamic indexing by custom lowering to
7042 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
7043
7044 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
7045
7046 MVT IntVT = MVT::getIntegerVT(VecSize);
7047
7048 // Convert vector index to bit-index and get the required bit mask.
7049 assert(isPowerOf2_32(EltSize));
7050 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
7051 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7052 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7053 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
7054 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
7055
7056 // 1. Create a congruent vector with the target value in each element.
7057 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
7058 DAG.getSplatBuildVector(VecVT, SL, InsVal));
7059
 7060 // 2. Mask off all other indices except the required index within (1).
7061 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
7062
7063 // 3. Mask off the required index within the target vector.
7064 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7065 SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
7066 DAG.getNOT(SL, BFM, IntVT), BCVec);
7067
7068 // 4. Get (2) and (3) ORed into the target vector.
7069 SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
7070
7071 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
7072}
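// Editorial sketch (not in the original source): for a dynamic insert into a
// v4i16 vector (VecSize = 64, EltSize = 16) the code above builds, in
// pseudo-DAG form:
//   ScaledIdx = Idx << 4
//   BFM       = 0xffff << ScaledIdx
//   Result    = bitcast.v4i16 ((BFM & splat(InsVal)) | (~BFM & bitcast.i64 Vec))
// i.e. a bitfield insert on the integer view of the vector rather than a
// stack round-trip.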
7073
7074SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
7075 SelectionDAG &DAG) const {
7076 SDLoc SL(Op);
7077
7078 EVT ResultVT = Op.getValueType();
7079 SDValue Vec = Op.getOperand(0);
7080 SDValue Idx = Op.getOperand(1);
7081 EVT VecVT = Vec.getValueType();
7082 unsigned VecSize = VecVT.getSizeInBits();
7083 EVT EltVT = VecVT.getVectorElementType();
7084
7085 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
7086
7087 // Make sure we do any optimizations that will make it easier to fold
7088 // source modifiers before obscuring it with bit operations.
7089
7090 // XXX - Why doesn't this get called when vector_shuffle is expanded?
7091 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
7092 return Combined;
7093
7094 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
7095 SDValue Lo, Hi;
7096 EVT LoVT, HiVT;
7097 std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
7098
7099 if (VecSize == 128) {
7100 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
7101 Lo = DAG.getBitcast(LoVT,
7102 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7103 DAG.getConstant(0, SL, MVT::i32)));
7104 Hi = DAG.getBitcast(HiVT,
7105 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7106 DAG.getConstant(1, SL, MVT::i32)));
7107 } else if (VecSize == 256) {
7108 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
7109 SDValue Parts[4];
7110 for (unsigned P = 0; P < 4; ++P) {
7111 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7112 DAG.getConstant(P, SL, MVT::i32));
7113 }
7114
7115 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7116 Parts[0], Parts[1]));
7117 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
7118 Parts[2], Parts[3]));
7119 } else {
7120 assert(VecSize == 512);
7121
7122 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
7123 SDValue Parts[8];
7124 for (unsigned P = 0; P < 8; ++P) {
7125 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
7126 DAG.getConstant(P, SL, MVT::i32));
7127 }
7128
7129 Lo = DAG.getBitcast(LoVT,
7130 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7131 Parts[0], Parts[1], Parts[2], Parts[3]));
7132 Hi = DAG.getBitcast(HiVT,
7133 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
7134 Parts[4], Parts[5],Parts[6], Parts[7]));
7135 }
7136
7137 EVT IdxVT = Idx.getValueType();
7138 unsigned NElem = VecVT.getVectorNumElements();
7139 assert(isPowerOf2_32(NElem));
7140 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
7141 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
7142 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
7143 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
7144 }
7145
7146 assert(VecSize <= 64);
7147
7148 MVT IntVT = MVT::getIntegerVT(VecSize);
7149
7150 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
7151 SDValue VecBC = peekThroughBitcasts(Vec);
7152 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
7153 SDValue Src = VecBC.getOperand(0);
7154 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
7155 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
7156 }
7157
7158 unsigned EltSize = EltVT.getSizeInBits();
7159 assert(isPowerOf2_32(EltSize));
7160
7161 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
7162
7163 // Convert vector index to bit-index (* EltSize)
7164 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
7165
7166 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
7167 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
7168
7169 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
7170 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
7171 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
7172 }
7173
7174 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
7175}
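// Editorial sketch (not in the original source): on the small-vector path
// above, extracting element Idx from a v4i16 vector becomes
//   Elt = trunc.i16 ((bitcast.i64 Vec) >> (Idx << 4))
// a shift by the bit index followed by a truncate, plus a final bitcast for
// f16/bf16 result types.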
7176
7177static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
7178 assert(Elt % 2 == 0);
7179 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
7180}
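// Editorial example (not in the original source): for Mask = <4,5,2,3>, both
// pairs (4,5) and (2,3) start at an even source element and are consecutive,
// so each half of the shuffle below can be formed with a single 32-bit
// extract_subvector.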
7181
7182SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
7183 SelectionDAG &DAG) const {
7184 SDLoc SL(Op);
7185 EVT ResultVT = Op.getValueType();
7186 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
7187
7188 EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;
7189 EVT EltVT = PackVT.getVectorElementType();
7190 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
7191
7192 // vector_shuffle <0,1,6,7> lhs, rhs
7193 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
7194 //
7195 // vector_shuffle <6,7,2,3> lhs, rhs
7196 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
7197 //
7198 // vector_shuffle <6,7,0,1> lhs, rhs
7199 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
7200
7201 // Avoid scalarizing when both halves are reading from consecutive elements.
 7202 SmallVector<SDValue, 16> Pieces;
 7203 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
7204 if (elementPairIsContiguous(SVN->getMask(), I)) {
7205 const int Idx = SVN->getMaskElt(I);
7206 int VecIdx = Idx < SrcNumElts ? 0 : 1;
7207 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
7208 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL,
7209 PackVT, SVN->getOperand(VecIdx),
7210 DAG.getConstant(EltIdx, SL, MVT::i32));
7211 Pieces.push_back(SubVec);
7212 } else {
7213 const int Idx0 = SVN->getMaskElt(I);
7214 const int Idx1 = SVN->getMaskElt(I + 1);
7215 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
7216 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
7217 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
7218 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
7219
7220 SDValue Vec0 = SVN->getOperand(VecIdx0);
7221 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7222 Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32));
7223
7224 SDValue Vec1 = SVN->getOperand(VecIdx1);
7225 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
7226 Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32));
7227 Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 }));
7228 }
7229 }
7230
7231 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
7232}
7233
7234SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
7235 SelectionDAG &DAG) const {
7236 SDValue SVal = Op.getOperand(0);
7237 EVT ResultVT = Op.getValueType();
7238 EVT SValVT = SVal.getValueType();
7239 SDValue UndefVal = DAG.getUNDEF(SValVT);
7240 SDLoc SL(Op);
7241
 7242 SmallVector<SDValue, 16> VElts;
 7243 VElts.push_back(SVal);
7244 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
7245 VElts.push_back(UndefVal);
7246
7247 return DAG.getBuildVector(ResultVT, SL, VElts);
7248}
7249
7250SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
7251 SelectionDAG &DAG) const {
7252 SDLoc SL(Op);
7253 EVT VT = Op.getValueType();
7254
7255 if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7256 VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
 7257 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
 7258 VT.getVectorNumElements() / 2);
7259 MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits());
7260
7261 // Turn into pair of packed build_vectors.
7262 // TODO: Special case for constants that can be materialized with s_mov_b64.
7263 SmallVector<SDValue, 4> LoOps, HiOps;
7264 for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {
7265 LoOps.push_back(Op.getOperand(I));
7266 HiOps.push_back(Op.getOperand(I + E));
7267 }
7268 SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps);
7269 SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps);
7270
7271 SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo);
7272 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi);
7273
7274 SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL,
7275 { CastLo, CastHi });
7276 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7277 }
7278
7279 if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {
 7280 MVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
 7281 VT.getVectorNumElements() / 4);
7282 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7283
7284 SmallVector<SDValue, 4> Parts[4];
7285 for (unsigned I = 0, E = VT.getVectorNumElements() / 4; I != E; ++I) {
7286 for (unsigned P = 0; P < 4; ++P)
7287 Parts[P].push_back(Op.getOperand(I + P * E));
7288 }
7289 SDValue Casts[4];
7290 for (unsigned P = 0; P < 4; ++P) {
7291 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7292 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7293 }
7294
7295 SDValue Blend =
7296 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 4), SL, Casts);
7297 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7298 }
7299
7300 if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {
 7301 MVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
 7302 VT.getVectorNumElements() / 8);
7303 MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
7304
7305 SmallVector<SDValue, 8> Parts[8];
7306 for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) {
7307 for (unsigned P = 0; P < 8; ++P)
7308 Parts[P].push_back(Op.getOperand(I + P * E));
7309 }
7310 SDValue Casts[8];
7311 for (unsigned P = 0; P < 8; ++P) {
7312 SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
7313 Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
7314 }
7315
7316 SDValue Blend =
7317 DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 8), SL, Casts);
7318 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
7319 }
7320
7321 assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);
7322 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
7323
7324 SDValue Lo = Op.getOperand(0);
7325 SDValue Hi = Op.getOperand(1);
7326
7327 // Avoid adding defined bits with the zero_extend.
7328 if (Hi.isUndef()) {
7329 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7330 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
7331 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
7332 }
7333
7334 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
7335 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
7336
7337 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
7338 DAG.getConstant(16, SL, MVT::i32));
7339 if (Lo.isUndef())
7340 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
7341
7342 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
7343 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
7344
7345 SDValue Or = DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi);
7346 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
7347}
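// Editorial sketch (not in the original source): the v2i16 tail case above
// produces
//   bitcast.v2i16 (or (zext.i32 (bitcast.i16 Lo)),
//                     (shl (zext.i32 (bitcast.i16 Hi)), 16))
// and drops the zero_extend/or entirely when either operand is undef.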
7348
7349bool
7350SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
7351 // OSes that use ELF REL relocations (instead of RELA) can only store a
7352 // 32-bit addend in the instruction, so it is not safe to allow offset folding
7353 // which can create arbitrary 64-bit addends. (This is only a problem for
7354 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
7355 // the high 32 bits of the addend.)
7356 //
7357 // This should be kept in sync with how HasRelocationAddend is initialized in
7358 // the constructor of ELFAMDGPUAsmBackend.
7359 if (!Subtarget->isAmdHsaOS())
7360 return false;
7361
7362 // We can fold offsets for anything that doesn't require a GOT relocation.
7363 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
 7364 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
 7365 GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
 7366 !shouldEmitGOTReloc(GA->getGlobal());
7367}
7368
7369static SDValue
7370buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
7371 const SDLoc &DL, int64_t Offset, EVT PtrVT,
7372 unsigned GAFlags = SIInstrInfo::MO_NONE) {
7373 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
7374 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
7375 // lowered to the following code sequence:
7376 //
7377 // For constant address space:
7378 // s_getpc_b64 s[0:1]
7379 // s_add_u32 s0, s0, $symbol
7380 // s_addc_u32 s1, s1, 0
7381 //
7382 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7383 // a fixup or relocation is emitted to replace $symbol with a literal
7384 // constant, which is a pc-relative offset from the encoding of the $symbol
7385 // operand to the global variable.
7386 //
7387 // For global address space:
7388 // s_getpc_b64 s[0:1]
7389 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
7390 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
7391 //
7392 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
7393 // fixups or relocations are emitted to replace $symbol@*@lo and
7394 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
7395 // which is a 64-bit pc-relative offset from the encoding of the $symbol
7396 // operand to the global variable.
7397 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
7398 SDValue PtrHi;
7399 if (GAFlags == SIInstrInfo::MO_NONE)
7400 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
7401 else
7402 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
7403 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
7404}
7405
7406SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
7407 SDValue Op,
7408 SelectionDAG &DAG) const {
7409 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
7410 SDLoc DL(GSD);
7411 EVT PtrVT = Op.getValueType();
7412
7413 const GlobalValue *GV = GSD->getGlobal();
 7414 if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
 7415 shouldUseLDSConstAddress(GV)) ||
 7416 GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
 7417 GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
 7418 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
 7419 GV->hasExternalLinkage()) {
7420 Type *Ty = GV->getValueType();
7421 // HIP uses an unsized array `extern __shared__ T s[]` or similar
7422 // zero-sized type in other languages to declare the dynamic shared
 7423 // memory, whose size is not known at compile time. These arrays are
 7424 // allocated by the runtime and placed directly after the statically
 7425 // allocated ones. They all share the same offset.
7426 if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
7427 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
7428 // Adjust alignment for that dynamic shared memory array.
 7429 const Function &F = DAG.getMachineFunction().getFunction();
 7430 MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
7431 MFI->setUsesDynamicLDS(true);
7432 return SDValue(
7433 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
7434 }
 7435 }
 7436 return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
 7437 }
 7438
 7439 if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
 7440 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
 7441 SIInstrInfo::MO_ABS32_LO);
 7442 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
7443 }
7444
7445 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
7446 SDValue AddrLo = DAG.getTargetGlobalAddress(
7447 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
7448 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
7449
7450 SDValue AddrHi = DAG.getTargetGlobalAddress(
7451 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
7452 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
7453
7454 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
7455 }
7456
7457 if (shouldEmitFixup(GV))
7458 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
7459
7460 if (shouldEmitPCReloc(GV))
 7461 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
 7462 SIInstrInfo::MO_REL32);
 7463
 7464 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
 7465 SIInstrInfo::MO_GOTPCREL32);
 7466
 7467 Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
 7468 PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
 7469 const DataLayout &DataLayout = DAG.getDataLayout();
 7470 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
 7471 MachinePointerInfo PtrInfo
 7472 = MachinePointerInfo::getGOT(DAG.getMachineFunction());
 7473
 7474 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
 7475 MachineMemOperand::MODereferenceable |
 7476 MachineMemOperand::MOInvariant);
7477}
7478
7479SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
 7480 const SDLoc &DL, SDValue V) const {
7481 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
7482 // the destination register.
7483 //
7484 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
7485 // so we will end up with redundant moves to m0.
7486 //
7487 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
7488
7489 // A Null SDValue creates a glue result.
7490 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
7491 V, Chain);
7492 return SDValue(M0, 0);
7493}
7494
7495SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
7496 SDValue Op,
7497 MVT VT,
7498 unsigned Offset) const {
7499 SDLoc SL(Op);
7500 SDValue Param = lowerKernargMemParameter(
7501 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
 7502 // The local size values will have the high 16 bits as zero.
7503 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
7504 DAG.getValueType(VT));
7505}
7506
7507static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
 7508 EVT VT) {
 7509 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
 7510 "non-hsa intrinsic with hsa target",
7511 DL.getDebugLoc());
7512 DAG.getContext()->diagnose(BadIntrin);
7513 return DAG.getUNDEF(VT);
7514}
7515
7516static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL,
 7517 EVT VT) {
 7518 DiagnosticInfoUnsupported BadIntrin(DAG.getMachineFunction().getFunction(),
 7519 "intrinsic not supported on subtarget",
7520 DL.getDebugLoc());
7521 DAG.getContext()->diagnose(BadIntrin);
7522 return DAG.getUNDEF(VT);
7523}
7524
7525static SDValue getBuildDwordsVector(SelectionDAG &DAG, const SDLoc &DL,
 7526 ArrayRef<SDValue> Elts) {
7527 assert(!Elts.empty());
7528 MVT Type;
7529 unsigned NumElts = Elts.size();
7530
7531 if (NumElts <= 12) {
7532 Type = MVT::getVectorVT(MVT::f32, NumElts);
7533 } else {
7534 assert(Elts.size() <= 16);
7535 Type = MVT::v16f32;
7536 NumElts = 16;
7537 }
7538
7539 SmallVector<SDValue, 16> VecElts(NumElts);
7540 for (unsigned i = 0; i < Elts.size(); ++i) {
7541 SDValue Elt = Elts[i];
7542 if (Elt.getValueType() != MVT::f32)
7543 Elt = DAG.getBitcast(MVT::f32, Elt);
7544 VecElts[i] = Elt;
7545 }
7546 for (unsigned i = Elts.size(); i < NumElts; ++i)
7547 VecElts[i] = DAG.getUNDEF(MVT::f32);
7548
7549 if (NumElts == 1)
7550 return VecElts[0];
7551 return DAG.getBuildVector(Type, DL, VecElts);
7552}
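// Editorial example (not in the original source): five dword operands produce
// a v5f32 build_vector directly, while thirteen operands fall into the
// padding path above and are widened with undef up to v16f32.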
7553
7554static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
7555 SDValue Src, int ExtraElts) {
7556 EVT SrcVT = Src.getValueType();
7557
 7558 SmallVector<SDValue, 8> Elts;
 7559
7560 if (SrcVT.isVector())
7561 DAG.ExtractVectorElements(Src, Elts);
7562 else
7563 Elts.push_back(Src);
7564
7565 SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
7566 while (ExtraElts--)
7567 Elts.push_back(Undef);
7568
7569 return DAG.getBuildVector(CastVT, DL, Elts);
7570}
7571
 7572 // Re-construct the required return value for an image load intrinsic.
 7573 // This is more complicated due to the optional use of TexFailCtrl, which
 7574 // means the required return type is an aggregate.
7575static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
 7576 ArrayRef<EVT> ResultTypes, bool IsTexFail,
7577 bool Unpacked, bool IsD16, int DMaskPop,
7578 int NumVDataDwords, bool IsAtomicPacked16Bit,
7579 const SDLoc &DL) {
 7580 // Determine the required return type. This is the same regardless of the IsTexFail flag.
7581 EVT ReqRetVT = ResultTypes[0];
7582 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
7583 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
7584 ? (ReqRetNumElts + 1) / 2
7585 : ReqRetNumElts;
7586
7587 int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ?
7588 DMaskPop : (DMaskPop + 1) / 2;
7589
7590 MVT DataDwordVT = NumDataDwords == 1 ?
7591 MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
7592
7593 MVT MaskPopVT = MaskPopDwords == 1 ?
7594 MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
7595
7596 SDValue Data(Result, 0);
7597 SDValue TexFail;
7598
7599 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
7600 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
7601 if (MaskPopVT.isVector()) {
7602 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
7603 SDValue(Result, 0), ZeroIdx);
7604 } else {
7605 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
7606 SDValue(Result, 0), ZeroIdx);
7607 }
7608 }
7609
7610 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
7611 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
7612 NumDataDwords - MaskPopDwords);
7613
7614 if (IsD16)
7615 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
7616
7617 EVT LegalReqRetVT = ReqRetVT;
7618 if (!ReqRetVT.isVector()) {
7619 if (!Data.getValueType().isInteger())
7620 Data = DAG.getNode(ISD::BITCAST, DL,
7621 Data.getValueType().changeTypeToInteger(), Data);
7622 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
7623 } else {
7624 // We need to widen the return vector to a legal type
7625 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
7626 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
7627 LegalReqRetVT =
 7628 EVT::getVectorVT(*DAG.getContext(), ReqRetVT.getVectorElementType(),
 7629 ReqRetVT.getVectorNumElements() + 1);
7630 }
7631 }
7632 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
7633
7634 if (IsTexFail) {
7635 TexFail =
7636 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
7637 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
7638
7639 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
7640 }
7641
7642 if (Result->getNumValues() == 1)
7643 return Data;
7644
7645 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
7646}
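// Editorial example (not in the original source) of the dword accounting
// above: a D16 load returning 3 x f16 with packed D16 (Unpacked == false)
// needs NumDataDwords = (3 + 1) / 2 = 2, and when TexFailCtrl is enabled the
// error dword is extracted from the instruction result at index MaskPopDwords
// and merged into the returned aggregate.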
7647
7648static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
7649 SDValue *LWE, bool &IsTexFail) {
7650 auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
7651
7652 uint64_t Value = TexFailCtrlConst->getZExtValue();
7653 if (Value) {
7654 IsTexFail = true;
7655 }
7656
7657 SDLoc DL(TexFailCtrlConst);
7658 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
7659 Value &= ~(uint64_t)0x1;
7660 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
7661 Value &= ~(uint64_t)0x2;
7662
7663 return Value == 0;
7664}
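// Editorial example (not in the original source): TexFailCtrl = 3 sets both
// TFE and LWE; any bit outside the low two makes this helper return false,
// and the caller then leaves the intrinsic unlowered.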
7665
7666static void packImage16bitOpsToDwords(SelectionDAG &DAG, SDValue Op,
 7667 MVT PackVectorVT,
7668 SmallVectorImpl<SDValue> &PackedAddrs,
7669 unsigned DimIdx, unsigned EndIdx,
7670 unsigned NumGradients) {
7671 SDLoc DL(Op);
7672 for (unsigned I = DimIdx; I < EndIdx; I++) {
7673 SDValue Addr = Op.getOperand(I);
7674
7675 // Gradients are packed with undef for each coordinate.
7676 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
7677 // 1D: undef,dx/dh; undef,dx/dv
7678 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
7679 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
7680 if (((I + 1) >= EndIdx) ||
7681 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
7682 I == DimIdx + NumGradients - 1))) {
7683 if (Addr.getValueType() != MVT::i16)
7684 Addr = DAG.getBitcast(MVT::i16, Addr);
7685 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
7686 } else {
7687 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
7688 I++;
7689 }
7690 Addr = DAG.getBitcast(MVT::f32, Addr);
7691 PackedAddrs.push_back(Addr);
7692 }
7693}
7694
7695SDValue SITargetLowering::lowerImage(SDValue Op,
 7696 const AMDGPU::ImageDimIntrinsicInfo *Intr,
 7697 SelectionDAG &DAG, bool WithChain) const {
 7698 SDLoc DL(Op);
 7699 MachineFunction &MF = DAG.getMachineFunction();
 7700 const GCNSubtarget* ST = &MF.getSubtarget<GCNSubtarget>();
 7701 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
 7702 AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
 7703 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
7704 unsigned IntrOpcode = Intr->BaseOpcode;
7705 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
7706 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
7707 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
7708
7709 SmallVector<EVT, 3> ResultTypes(Op->values());
7710 SmallVector<EVT, 3> OrigResultTypes(Op->values());
7711 bool IsD16 = false;
7712 bool IsG16 = false;
7713 bool IsA16 = false;
7714 SDValue VData;
7715 int NumVDataDwords;
7716 bool AdjustRetType = false;
7717 bool IsAtomicPacked16Bit = false;
7718
7719 // Offset of intrinsic arguments
7720 const unsigned ArgOffset = WithChain ? 2 : 1;
7721
7722 unsigned DMask;
7723 unsigned DMaskLanes = 0;
7724
7725 if (BaseOpcode->Atomic) {
7726 VData = Op.getOperand(2);
7727
7728 IsAtomicPacked16Bit =
7729 (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7730 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7731
7732 bool Is64Bit = VData.getValueSizeInBits() == 64;
7733 if (BaseOpcode->AtomicX2) {
7734 SDValue VData2 = Op.getOperand(3);
7735 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
7736 {VData, VData2});
7737 if (Is64Bit)
7738 VData = DAG.getBitcast(MVT::v4i32, VData);
7739
7740 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
7741 DMask = Is64Bit ? 0xf : 0x3;
7742 NumVDataDwords = Is64Bit ? 4 : 2;
7743 } else {
7744 DMask = Is64Bit ? 0x3 : 0x1;
7745 NumVDataDwords = Is64Bit ? 2 : 1;
7746 }
7747 } else {
7748 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
7749 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
7750
7751 if (BaseOpcode->Store) {
7752 VData = Op.getOperand(2);
7753
7754 MVT StoreVT = VData.getSimpleValueType();
7755 if (StoreVT.getScalarType() == MVT::f16) {
7756 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7757 return Op; // D16 is unsupported for this instruction
7758
7759 IsD16 = true;
7760 VData = handleD16VData(VData, DAG, true);
7761 }
7762
7763 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
7764 } else {
 7765 // Work out the number of dwords based on the dmask popcount and underlying type
7766 // and whether packing is supported.
7767 MVT LoadVT = ResultTypes[0].getSimpleVT();
7768 if (LoadVT.getScalarType() == MVT::f16) {
7769 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
7770 return Op; // D16 is unsupported for this instruction
7771
7772 IsD16 = true;
7773 }
7774
7775 // Confirm that the return type is large enough for the dmask specified
7776 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
7777 (!LoadVT.isVector() && DMaskLanes > 1))
7778 return Op;
7779
 7780 // The SQ block of gfx8 and gfx9 does not estimate register use correctly
7781 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
7782 // instructions.
7783 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
7784 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
7785 NumVDataDwords = (DMaskLanes + 1) / 2;
7786 else
7787 NumVDataDwords = DMaskLanes;
7788
7789 AdjustRetType = true;
7790 }
7791 }
7792
7793 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
 7794 SmallVector<SDValue, 4> VAddrs;
 7795
7796 // Check for 16 bit addresses or derivatives and pack if true.
7797 MVT VAddrVT =
7798 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
7799 MVT VAddrScalarVT = VAddrVT.getScalarType();
7800 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
7801 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
7802
7803 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
7804 VAddrScalarVT = VAddrVT.getScalarType();
7805 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
7806 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
7807
7808 // Push back extra arguments.
7809 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
7810 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
7811 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
 7812 // Special handling of bias when A16 is on. Bias is of type half but
 7813 // occupies a full 32 bits.
7814 SDValue Bias = DAG.getBuildVector(
7815 MVT::v2f16, DL,
7816 {Op.getOperand(ArgOffset + I), DAG.getUNDEF(MVT::f16)});
7817 VAddrs.push_back(Bias);
7818 } else {
7819 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
7820 "Bias needs to be converted to 16 bit in A16 mode");
7821 VAddrs.push_back(Op.getOperand(ArgOffset + I));
7822 }
7823 }
7824
7825 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
 7826 // 16-bit gradients are supported, but they are tied to the A16 control,
 7827 // so both gradients and addresses must be 16 bit.
7828 LLVM_DEBUG(
7829 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
7830 "require 16 bit args for both gradients and addresses");
7831 return Op;
7832 }
7833
7834 if (IsA16) {
7835 if (!ST->hasA16()) {
7836 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
7837 "support 16 bit addresses\n");
7838 return Op;
7839 }
7840 }
7841
 7842 // We've dealt with incorrect input, so we know that if IsA16 or IsG16
 7843 // is set then we have to compress/pack the corresponding operands
 7844 // (address, gradient, or both).
 7845 // In the case where A16 and gradients are tied (no G16 support) we have
 7846 // already verified that both IsA16 and IsG16 are true.
7847 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
7848 // Activate g16
7849 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
 7850 AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
 7851 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
7852 }
7853
7854 // Add gradients (packed or unpacked)
7855 if (IsG16) {
7856 // Pack the gradients
7857 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
7858 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
7859 ArgOffset + Intr->GradientStart,
7860 ArgOffset + Intr->CoordStart, Intr->NumGradients);
7861 } else {
7862 for (unsigned I = ArgOffset + Intr->GradientStart;
7863 I < ArgOffset + Intr->CoordStart; I++)
7864 VAddrs.push_back(Op.getOperand(I));
7865 }
7866
7867 // Add addresses (packed or unpacked)
7868 if (IsA16) {
7869 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
7870 ArgOffset + Intr->CoordStart, VAddrEnd,
7871 0 /* No gradients */);
7872 } else {
7873 // Add uncompressed address
7874 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
7875 VAddrs.push_back(Op.getOperand(I));
7876 }
7877
7878 // If the register allocator cannot place the address registers contiguously
7879 // without introducing moves, then using the non-sequential address encoding
7880 // is always preferable, since it saves VALU instructions and is usually a
7881 // wash in terms of code size or even better.
7882 //
7883 // However, we currently have no way of hinting to the register allocator that
7884 // MIMG addresses should be placed contiguously when it is possible to do so,
7885 // so force non-NSA for the common 2-address case as a heuristic.
7886 //
7887 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
7888 // allocation when possible.
7889 //
7890 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
7891 // set of the remaining addresses.
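// Editorial example (not in the original source, values illustrative): with an
// NSA limit of 5 address registers and 7 address dwords on a subtarget with
// partial NSA, the code below keeps the first 4 addresses as separate operands
// and packs the remaining 3 into one contiguous vector register
// (the UsePartialNSA path).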
7892 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
7893 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
7894 const bool UseNSA = ST->hasNSAEncoding() &&
7895 VAddrs.size() >= ST->getNSAThreshold(MF) &&
7896 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
7897 const bool UsePartialNSA =
7898 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
7899
7900 SDValue VAddr;
7901 if (UsePartialNSA) {
7902 VAddr = getBuildDwordsVector(DAG, DL,
7903 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
7904 }
7905 else if (!UseNSA) {
7906 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
7907 }
7908
7909 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
7910 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
7911 SDValue Unorm;
7912 if (!BaseOpcode->Sampler) {
7913 Unorm = True;
7914 } else {
7915 uint64_t UnormConst =
7916 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
7917
7918 Unorm = UnormConst ? True : False;
7919 }
7920
7921 SDValue TFE;
7922 SDValue LWE;
7923 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
7924 bool IsTexFail = false;
7925 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
7926 return Op;
7927
7928 if (IsTexFail) {
7929 if (!DMaskLanes) {
 7930 // Expecting to get an error flag since TFC is on and dmask is 0.
 7931 // Force dmask to be at least 1, otherwise the instruction will fail.
7932 DMask = 0x1;
7933 DMaskLanes = 1;
7934 NumVDataDwords = 1;
7935 }
7936 NumVDataDwords += 1;
7937 AdjustRetType = true;
7938 }
7939
 7940 // Something earlier may have tagged the return type as needing adjustment.
 7941 // This happens if the instruction is a load or has set TexFailCtrl flags.
7942 if (AdjustRetType) {
7943 // NumVDataDwords reflects the true number of dwords required in the return type
7944 if (DMaskLanes == 0 && !BaseOpcode->Store) {
7945 // This is a no-op load. This can be eliminated
7946 SDValue Undef = DAG.getUNDEF(Op.getValueType());
7947 if (isa<MemSDNode>(Op))
7948 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
7949 return Undef;
7950 }
7951
7952 EVT NewVT = NumVDataDwords > 1 ?
7953 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
7954 : MVT::i32;
7955
7956 ResultTypes[0] = NewVT;
7957 if (ResultTypes.size() == 3) {
 7958 // The original result was an aggregate type used for the TexFailCtrl
 7959 // results. The actual instruction returns as a vector type, which has now
 7960 // been created. Remove the aggregate result.
7961 ResultTypes.erase(&ResultTypes[1]);
7962 }
7963 }
7964
7965 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
7966 if (BaseOpcode->Atomic)
7967 CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
7968 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
 7969 AMDGPU::CPol::VOLATILE))
 7970 return Op;
 7971 SmallVector<SDValue, 26> Ops;
7971
7973 if (BaseOpcode->Store || BaseOpcode->Atomic)
7974 Ops.push_back(VData); // vdata
7975 if (UsePartialNSA) {
7976 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
7977 Ops.push_back(VAddr);
7978 }
7979 else if (UseNSA)
7980 append_range(Ops, VAddrs);
7981 else
7982 Ops.push_back(VAddr);
7983 Ops.push_back(Op.getOperand(ArgOffset + Intr->RsrcIndex));
7984 if (BaseOpcode->Sampler)
7985 Ops.push_back(Op.getOperand(ArgOffset + Intr->SampIndex));
7986 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
7987 if (IsGFX10Plus)
7988 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
7989 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
7990 Ops.push_back(Unorm);
7991 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
7992 Ops.push_back(IsA16 && // r128, a16 for gfx9
7993 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
7994 if (IsGFX10Plus)
7995 Ops.push_back(IsA16 ? True : False);
7996 if (!Subtarget->hasGFX90AInsts()) {
7997 Ops.push_back(TFE); //tfe
7998 } else if (TFE->getAsZExtVal()) {
7999 report_fatal_error("TFE is not supported on this GPU");
8000 }
8001 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8002 Ops.push_back(LWE); // lwe
8003 if (!IsGFX10Plus)
8004 Ops.push_back(DimInfo->DA ? True : False);
8005 if (BaseOpcode->HasD16)
8006 Ops.push_back(IsD16 ? True : False);
8007 if (isa<MemSDNode>(Op))
8008 Ops.push_back(Op.getOperand(0)); // chain
8009
8010 int NumVAddrDwords =
8011 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
8012 int Opcode = -1;
8013
8014 if (IsGFX12Plus) {
8015 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
8016 NumVDataDwords, NumVAddrDwords);
8017 } else if (IsGFX11Plus) {
8018 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8019 UseNSA ? AMDGPU::MIMGEncGfx11NSA
8020 : AMDGPU::MIMGEncGfx11Default,
8021 NumVDataDwords, NumVAddrDwords);
8022 } else if (IsGFX10Plus) {
8023 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
8024 UseNSA ? AMDGPU::MIMGEncGfx10NSA
8025 : AMDGPU::MIMGEncGfx10Default,
8026 NumVDataDwords, NumVAddrDwords);
8027 } else {
8028 if (Subtarget->hasGFX90AInsts()) {
8029 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
8030 NumVDataDwords, NumVAddrDwords);
8031 if (Opcode == -1)
8033 "requested image instruction is not supported on this GPU");
8034 }
8035 if (Opcode == -1 &&
 8036 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
 8037 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
8038 NumVDataDwords, NumVAddrDwords);
8039 if (Opcode == -1)
8040 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
8041 NumVDataDwords, NumVAddrDwords);
8042 }
8043 if (Opcode == -1)
8044 return Op;
8045
8046 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
8047 if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
8048 MachineMemOperand *MemRef = MemOp->getMemOperand();
8049 DAG.setNodeMemRefs(NewNode, {MemRef});
8050 }
8051
8052 if (BaseOpcode->AtomicX2) {
 8053 SmallVector<SDValue, 1> Elt;
 8054 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
8055 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
8056 }
8057 if (BaseOpcode->Store)
8058 return SDValue(NewNode, 0);
8059 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
8060 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
8061 NumVDataDwords, IsAtomicPacked16Bit, DL);
8062}
8063
8064SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
8065 SDValue Offset, SDValue CachePolicy,
8066 SelectionDAG &DAG) const {
 8067 MachineFunction &MF = DAG.getMachineFunction();
 8068
 8069 const DataLayout &DataLayout = DAG.getDataLayout();
 8070 Align Alignment =
 8071 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
 8072
 8073 MachineMemOperand *MMO = MF.getMachineMemOperand(
 8074 MachinePointerInfo(),
 8075 MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
 8076 MachineMemOperand::MOInvariant,
 8077 VT.getStoreSize(), Alignment);
8078
8079 if (!Offset->isDivergent()) {
8080 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
8081
8082 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
8083 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
8084 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
8085 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
8086 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8087 SDValue BufferLoad =
 8088 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
 8089 DAG.getVTList(MVT::i32), Ops, VT, MMO);
8090 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8091 }
8092
8093 // Widen vec3 load to vec4.
8094 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
8095 !Subtarget->hasScalarDwordx3Loads()) {
8096 EVT WidenedVT =
 8097 EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
 8098 auto WidenedOp = DAG.getMemIntrinsicNode(
8099 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
8100 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
8101 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
8102 DAG.getVectorIdxConstant(0, DL));
8103 return Subvector;
8104 }
8105
 8106 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
 8107 DAG.getVTList(VT), Ops, VT, MMO);
8108 }
8109
8110 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
8111 // assume that the buffer is unswizzled.
8112 SDValue Ops[] = {
8113 DAG.getEntryNode(), // Chain
8114 Rsrc, // rsrc
8115 DAG.getConstant(0, DL, MVT::i32), // vindex
8116 {}, // voffset
8117 {}, // soffset
8118 {}, // offset
8119 CachePolicy, // cachepolicy
8120 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8121 };
8122 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
8123 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8124 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8125 }
8126
 8127 SmallVector<SDValue, 4> Loads;
 8128 unsigned NumLoads = 1;
8129 MVT LoadVT = VT.getSimpleVT();
8130 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
8131 assert((LoadVT.getScalarType() == MVT::i32 ||
8132 LoadVT.getScalarType() == MVT::f32));
8133
8134 if (NumElts == 8 || NumElts == 16) {
8135 NumLoads = NumElts / 4;
8136 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
8137 }
8138
8139 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
8140
8141 // Use the alignment to ensure that the required offsets will fit into the
8142 // immediate offsets.
8143 setBufferOffsets(Offset, DAG, &Ops[3],
8144 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
8145
8146 uint64_t InstOffset = Ops[5]->getAsZExtVal();
8147 for (unsigned i = 0; i < NumLoads; ++i) {
8148 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
8149 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
8150 LoadVT, MMO, DAG));
8151 }
8152
8153 if (NumElts == 8 || NumElts == 16)
8154 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
8155
8156 return Loads[0];
8157}
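// Editorial example (not in the original source): a divergent-offset v8f32
// s.buffer.load falls back to two dwordx4 buffer loads at the base offset and
// base offset + 16, whose results are concatenated back into the original
// vector type above.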
8158
8159SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
8160 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
8161 if (!Subtarget->hasArchitectedSGPRs())
8162 return {};
8163 SDLoc SL(Op);
8164 MVT VT = MVT::i32;
8165 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
8166 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
8167 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
8168}
8169
8170SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
8171 unsigned Dim,
8172 const ArgDescriptor &Arg) const {
8173 SDLoc SL(Op);
 8174 MachineFunction &MF = DAG.getMachineFunction();
 8175 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
8176 if (MaxID == 0)
8177 return DAG.getConstant(0, SL, MVT::i32);
8178
8179 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
8180 SDLoc(DAG.getEntryNode()), Arg);
8181
8182 // Don't bother inserting AssertZext for packed IDs since we're emitting the
8183 // masking operations anyway.
8184 //
8185 // TODO: We could assert the top bit is 0 for the source copy.
8186 if (Arg.isMasked())
8187 return Val;
8188
8189 // Preserve the known bits after expansion to a copy.
 8190 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
 8191 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
8192 DAG.getValueType(SmallVT));
8193}
8194
8195SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
8196 SelectionDAG &DAG) const {
 8197 MachineFunction &MF = DAG.getMachineFunction();
 8198 auto MFI = MF.getInfo<SIMachineFunctionInfo>();
8199
8200 EVT VT = Op.getValueType();
8201 SDLoc DL(Op);
8202 unsigned IntrinsicID = Op.getConstantOperandVal(0);
8203
8204 // TODO: Should this propagate fast-math-flags?
8205
8206 switch (IntrinsicID) {
8207 case Intrinsic::amdgcn_implicit_buffer_ptr: {
8208 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
8209 return emitNonHSAIntrinsicError(DAG, DL, VT);
8210 return getPreloadedValue(DAG, *MFI, VT,
 8211 AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR);
 8212 }
8213 case Intrinsic::amdgcn_dispatch_ptr:
8214 case Intrinsic::amdgcn_queue_ptr: {
8215 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
8216 DiagnosticInfoUnsupported BadIntrin(
8217 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
8218 DL.getDebugLoc());
8219 DAG.getContext()->diagnose(BadIntrin);
8220 return DAG.getUNDEF(VT);
8221 }
8222
8223 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr ?
 8224 AMDGPUFunctionArgInfo::DISPATCH_PTR : AMDGPUFunctionArgInfo::QUEUE_PTR;
 8225 return getPreloadedValue(DAG, *MFI, VT, RegID);
8226 }
8227 case Intrinsic::amdgcn_implicitarg_ptr: {
8228 if (MFI->isEntryFunction())
8229 return getImplicitArgPtr(DAG, DL);
8230 return getPreloadedValue(DAG, *MFI, VT,
 8231 AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
 8232 }
8233 case Intrinsic::amdgcn_kernarg_segment_ptr: {
 8234 if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
 8235 // This only makes sense to call in a kernel, so just lower to null.
8236 return DAG.getConstant(0, DL, VT);
8237 }
8238
8239 return getPreloadedValue(DAG, *MFI, VT,
 8240 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
 8241 }
8242 case Intrinsic::amdgcn_dispatch_id: {
8243 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
8244 }
8245 case Intrinsic::amdgcn_rcp:
8246 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
8247 case Intrinsic::amdgcn_rsq:
8248 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8249 case Intrinsic::amdgcn_rsq_legacy:
 8250 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
 8251 return emitRemovedIntrinsicError(DAG, DL, VT);
8252 return SDValue();
8253 case Intrinsic::amdgcn_rcp_legacy:
 8254 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
 8255 return emitRemovedIntrinsicError(DAG, DL, VT);
8256 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
8257 case Intrinsic::amdgcn_rsq_clamp: {
 8258 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
 8259 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
8260
8261 Type *Type = VT.getTypeForEVT(*DAG.getContext());
 8262 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
 8263 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
 8264
8265 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
8266 SDValue Tmp = DAG.getNode(ISD::FMINNUM, DL, VT, Rsq,
8267 DAG.getConstantFP(Max, DL, VT));
8268 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
8269 DAG.getConstantFP(Min, DL, VT));
8270 }
8271 case Intrinsic::r600_read_ngroups_x:
8272 if (Subtarget->isAmdHsaOS())
8273 return emitNonHSAIntrinsicError(DAG, DL, VT);
8274
8275 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
 8276 SI::KernelInputOffsets::NGROUPS_X, Align(4),
 8277 false);
8278 case Intrinsic::r600_read_ngroups_y:
8279 if (Subtarget->isAmdHsaOS())
8280 return emitNonHSAIntrinsicError(DAG, DL, VT);
8281
8282 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
 8283 SI::KernelInputOffsets::NGROUPS_Y, Align(4),
 8284 false);
8285 case Intrinsic::r600_read_ngroups_z:
8286 if (Subtarget->isAmdHsaOS())
8287 return emitNonHSAIntrinsicError(DAG, DL, VT);
8288
8289 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
 8290 SI::KernelInputOffsets::NGROUPS_Z, Align(4),
 8291 false);
8292 case Intrinsic::r600_read_global_size_x:
8293 if (Subtarget->isAmdHsaOS())
8294 return emitNonHSAIntrinsicError(DAG, DL, VT);
8295
8296 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
 8297 SI::KernelInputOffsets::GLOBAL_SIZE_X,
 8298 Align(4), false);
8299 case Intrinsic::r600_read_global_size_y:
8300 if (Subtarget->isAmdHsaOS())
8301 return emitNonHSAIntrinsicError(DAG, DL, VT);
8302
8303 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
 8304 SI::KernelInputOffsets::GLOBAL_SIZE_Y,
 8305 Align(4), false);
8306 case Intrinsic::r600_read_global_size_z:
8307 if (Subtarget->isAmdHsaOS())
8308 return emitNonHSAIntrinsicError(DAG, DL, VT);
8309
8310 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
 8311 SI::KernelInputOffsets::GLOBAL_SIZE_Z,
 8312 Align(4), false);
8313 case Intrinsic::r600_read_local_size_x:
8314 if (Subtarget->isAmdHsaOS())
8315 return emitNonHSAIntrinsicError(DAG, DL, VT);
8316
8317 return lowerImplicitZextParam(DAG, Op, MVT::i16,
 8318 SI::KernelInputOffsets::LOCAL_SIZE_X);
 8319 case Intrinsic::r600_read_local_size_y:
8320 if (Subtarget->isAmdHsaOS())
8321 return emitNonHSAIntrinsicError(DAG, DL, VT);
8322
8323 return lowerImplicitZextParam(DAG, Op, MVT::i16,
 8324 SI::KernelInputOffsets::LOCAL_SIZE_Y);
 8325 case Intrinsic::r600_read_local_size_z:
8326 if (Subtarget->isAmdHsaOS())
8327 return emitNonHSAIntrinsicError(DAG, DL, VT);
8328
8329 return lowerImplicitZextParam(DAG, Op, MVT::i16,
 8330 SI::KernelInputOffsets::LOCAL_SIZE_Z);
 8331 case Intrinsic::amdgcn_workgroup_id_x:
8332 return getPreloadedValue(DAG, *MFI, VT,
 8333 AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
 8334 case Intrinsic::amdgcn_workgroup_id_y:
8335 return getPreloadedValue(DAG, *MFI, VT,
 8336 AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
 8337 case Intrinsic::amdgcn_workgroup_id_z:
8338 return getPreloadedValue(DAG, *MFI, VT,
 8339 AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
 8340 case Intrinsic::amdgcn_wave_id:
8341 return lowerWaveID(DAG, Op);
8342 case Intrinsic::amdgcn_lds_kernel_id: {
8343 if (MFI->isEntryFunction())
8344 return getLDSKernelId(DAG, DL);
8345 return getPreloadedValue(DAG, *MFI, VT,
 8346 AMDGPUFunctionArgInfo::LDS_KERNEL_ID);
 8347 }
8348 case Intrinsic::amdgcn_workitem_id_x:
8349 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
8350 case Intrinsic::amdgcn_workitem_id_y:
8351 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
8352 case Intrinsic::amdgcn_workitem_id_z:
8353 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8354 case Intrinsic::amdgcn_wavefrontsize:
 8355 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
 8356 SDLoc(Op), MVT::i32);
8357 case Intrinsic::amdgcn_s_buffer_load: {
8358 unsigned CPol = Op.getConstantOperandVal(3);
8359 // s_buffer_load, because of how it's optimized, can't be volatile
8360 // so reject ones with the volatile bit set.
8361 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
 8362 ? AMDGPU::CPol::ALL
 8363 : AMDGPU::CPol::ALL_pregfx12))
 8364 return Op;
8365 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8366 DAG);
8367 }
8368 case Intrinsic::amdgcn_fdiv_fast:
8369 return lowerFDIV_FAST(Op, DAG);
8370 case Intrinsic::amdgcn_sin:
8371 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
8372
8373 case Intrinsic::amdgcn_cos:
8374 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
8375
8376 case Intrinsic::amdgcn_mul_u24:
8377 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8378 case Intrinsic::amdgcn_mul_i24:
8379 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));
8380
8381 case Intrinsic::amdgcn_log_clamp: {
 8382 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
 8383 return SDValue();
8384
8385 return emitRemovedIntrinsicError(DAG, DL, VT);
8386 }
8387 case Intrinsic::amdgcn_fract:
8388 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
8389
8390 case Intrinsic::amdgcn_class:
8391 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT,
8392 Op.getOperand(1), Op.getOperand(2));
8393 case Intrinsic::amdgcn_div_fmas:
8394 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
8395 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8396 Op.getOperand(4));
8397
8398 case Intrinsic::amdgcn_div_fixup:
8399 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
8400 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8401
8402 case Intrinsic::amdgcn_div_scale: {
8403 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
8404
8405 // Translate to the operands expected by the machine instruction. The
8406 // first parameter must be the same as the first instruction.
8407 SDValue Numerator = Op.getOperand(1);
8408 SDValue Denominator = Op.getOperand(2);
8409
8410 // Note this order is opposite of the machine instruction's operations,
8411 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
8412 // intrinsic has the numerator as the first operand to match a normal
8413 // division operation.
8414
8415 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
8416
8417 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
8418 Denominator, Numerator);
8419 }
8420 case Intrinsic::amdgcn_icmp: {
8421 // There is a Pat that handles this variant, so return it as-is.
8422 if (Op.getOperand(1).getValueType() == MVT::i1 &&
8423 Op.getConstantOperandVal(2) == 0 &&
8424 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
8425 return Op;
8426 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
8427 }
8428 case Intrinsic::amdgcn_fcmp: {
8429 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
8430 }
8431 case Intrinsic::amdgcn_ballot:
8432 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
8433 case Intrinsic::amdgcn_fmed3:
8434 return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
8435 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8436 case Intrinsic::amdgcn_fdot2:
8437 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT,
8438 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
8439 Op.getOperand(4));
8440 case Intrinsic::amdgcn_fmul_legacy:
8441 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
8442 Op.getOperand(1), Op.getOperand(2));
8443 case Intrinsic::amdgcn_sffbh:
8444 return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
8445 case Intrinsic::amdgcn_sbfe:
8446 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
8447 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8448 case Intrinsic::amdgcn_ubfe:
8449 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
8450 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
8451 case Intrinsic::amdgcn_cvt_pkrtz:
8452 case Intrinsic::amdgcn_cvt_pknorm_i16:
8453 case Intrinsic::amdgcn_cvt_pknorm_u16:
8454 case Intrinsic::amdgcn_cvt_pk_i16:
8455 case Intrinsic::amdgcn_cvt_pk_u16: {
8456 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
8457 EVT VT = Op.getValueType();
8458 unsigned Opcode;
8459
8460 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
 8461 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
 8462 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
 8463 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
 8464 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
 8465 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
 8466 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
 8467 Opcode = AMDGPUISD::CVT_PK_I16_I32;
 8468 else
 8469 Opcode = AMDGPUISD::CVT_PK_U16_U32;
 8470
8471 if (isTypeLegal(VT))
8472 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
8473
8474 SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
8475 Op.getOperand(1), Op.getOperand(2));
8476 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
8477 }
8478 case Intrinsic::amdgcn_fmad_ftz:
8479 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
8480 Op.getOperand(2), Op.getOperand(3));
8481
8482 case Intrinsic::amdgcn_if_break:
8483 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
8484 Op->getOperand(1), Op->getOperand(2)), 0);
8485
8486 case Intrinsic::amdgcn_groupstaticsize: {
 8487 Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
 8488 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
8489 return Op;
8490
8491 const Module *M = MF.getFunction().getParent();
8492 const GlobalValue *GV =
8493 M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
8494 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
 8495 SIInstrInfo::MO_ABS32_LO);
 8496 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8497 }
8498 case Intrinsic::amdgcn_is_shared:
8499 case Intrinsic::amdgcn_is_private: {
8500 SDLoc SL(Op);
8501 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
 8502 AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
 8503 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
8504 SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32,
8505 Op.getOperand(1));
8506
8507 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
8508 DAG.getConstant(1, SL, MVT::i32));
8509 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
8510 }
8511 case Intrinsic::amdgcn_perm:
8512 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
8513 Op.getOperand(2), Op.getOperand(3));
8514 case Intrinsic::amdgcn_reloc_constant: {
8515 Module *M = const_cast<Module *>(MF.getFunction().getParent());
8516 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
8517 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
8518 auto RelocSymbol = cast<GlobalVariable>(
8519 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
8520 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
 8521 SIInstrInfo::MO_ABS32_LO);
 8522 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
8523 }
8524 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8525 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8526 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8527 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8528 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8529 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8530 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8531 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8532 if (Op.getOperand(4).getValueType() == MVT::i32)
8533 return SDValue();
8534
8535 SDLoc SL(Op);
8536 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
8537 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8538 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8539 Op.getOperand(3), IndexKeyi32);
8540 }
8541 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8542 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8543 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8544 if (Op.getOperand(6).getValueType() == MVT::i32)
8545 return SDValue();
8546
8547 SDLoc SL(Op);
8548 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
8549 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
8550 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8551 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8552 IndexKeyi32, Op.getOperand(7)});
8553 }
8554 case Intrinsic::amdgcn_addrspacecast_nonnull:
8555 return lowerADDRSPACECAST(Op, DAG);
8556 default:
8557 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
 8558 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
 8559 return lowerImage(Op, ImageDimIntr, DAG, false);
8560
8561 return Op;
8562 }
8563}
8564
8565// On targets not supporting constant in soffset field, turn zero to
8566// SGPR_NULL to avoid generating an extra s_mov with zero.
8567static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
 8568 const GCNSubtarget *Subtarget) {
8569 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
8570 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
8571 return SOffset;
8572}
8573
8574SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
8575 SelectionDAG &DAG,
8576 unsigned NewOpcode) const {
8577 SDLoc DL(Op);
8578
8579 SDValue VData = Op.getOperand(2);
8580 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8581 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8582 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8583 SDValue Ops[] = {
8584 Op.getOperand(0), // Chain
8585 VData, // vdata
8586 Rsrc, // rsrc
8587 DAG.getConstant(0, DL, MVT::i32), // vindex
8588 Offsets.first, // voffset
8589 SOffset, // soffset
8590 Offsets.second, // offset
8591 Op.getOperand(6), // cachepolicy
8592 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8593 };
8594
8595 auto *M = cast<MemSDNode>(Op);
8596
8597 EVT MemVT = VData.getValueType();
8598 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8599 M->getMemOperand());
8600}
8601
8602// Return a value to use for the idxen operand by examining the vindex operand.
8603static unsigned getIdxEn(SDValue VIndex) {
8604 // No need to set idxen if vindex is known to be zero.
8605 return isNullConstant(VIndex) ? 0 : 1;
8606}
8607
8608SDValue
8609SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
8610 unsigned NewOpcode) const {
8611 SDLoc DL(Op);
8612
8613 SDValue VData = Op.getOperand(2);
8614 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
8615 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
8616 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
8617 SDValue Ops[] = {
8618 Op.getOperand(0), // Chain
8619 VData, // vdata
8620 Rsrc, // rsrc
8621 Op.getOperand(4), // vindex
8622 Offsets.first, // voffset
8623 SOffset, // soffset
8624 Offsets.second, // offset
8625 Op.getOperand(7), // cachepolicy
8626 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8627 };
8628
8629 auto *M = cast<MemSDNode>(Op);
8630
8631 EVT MemVT = VData.getValueType();
8632 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
8633 M->getMemOperand());
8634}
8635
8636SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
8637 SelectionDAG &DAG) const {
8638 unsigned IntrID = Op.getConstantOperandVal(1);
8639 SDLoc DL(Op);
8640
8641 switch (IntrID) {
8642 case Intrinsic::amdgcn_ds_ordered_add:
8643 case Intrinsic::amdgcn_ds_ordered_swap: {
8644 MemSDNode *M = cast<MemSDNode>(Op);
8645 SDValue Chain = M->getOperand(0);
8646 SDValue M0 = M->getOperand(2);
8647 SDValue Value = M->getOperand(3);
8648 unsigned IndexOperand = M->getConstantOperandVal(7);
8649 unsigned WaveRelease = M->getConstantOperandVal(8);
8650 unsigned WaveDone = M->getConstantOperandVal(9);
8651
8652 unsigned OrderedCountIndex = IndexOperand & 0x3f;
8653 IndexOperand &= ~0x3f;
8654 unsigned CountDw = 0;
8655
8656 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
8657 CountDw = (IndexOperand >> 24) & 0xf;
8658 IndexOperand &= ~(0xf << 24);
8659
8660 if (CountDw < 1 || CountDw > 4) {
8661 report_fatal_error(
8662 "ds_ordered_count: dword count must be between 1 and 4");
8663 }
8664 }
8665
8666 if (IndexOperand)
8667 report_fatal_error("ds_ordered_count: bad index operand");
8668
8669 if (WaveDone && !WaveRelease)
8670 report_fatal_error("ds_ordered_count: wave_done requires wave_release");
8671
8672 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
8673 unsigned ShaderType =
8674 SIInstrInfo::getDSShaderTypeValue(DAG.getMachineFunction());
8675 unsigned Offset0 = OrderedCountIndex << 2;
8676 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
8677
8678 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
8679 Offset1 |= (CountDw - 1) << 6;
8680
8681 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
8682 Offset1 |= ShaderType << 2;
8683
8684 unsigned Offset = Offset0 | (Offset1 << 8);
8685
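// Bit layout of the combined offset above (derived from the preceding code,
// added for exposition): bits [7:2] hold the ordered-count index, bit 8
// wave_release, bit 9 wave_done, bits [11:10] the shader type (pre-GFX11
// only), bit 12 the add/swap instruction select, and bits [15:14] the dword
// count minus one (GFX10 and later).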
8686 SDValue Ops[] = {
8687 Chain,
8688 Value,
8689 DAG.getTargetConstant(Offset, DL, MVT::i16),
8690 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
8691 };
8692 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
8693 M->getVTList(), Ops, M->getMemoryVT(),
8694 M->getMemOperand());
8695 }
8696 case Intrinsic::amdgcn_ds_fadd: {
8697 MemSDNode *M = cast<MemSDNode>(Op);
8698 unsigned Opc;
8699 switch (IntrID) {
8700 case Intrinsic::amdgcn_ds_fadd:
8701 Opc = ISD::ATOMIC_LOAD_FADD;
8702 break;
8703 }
8704
8705 return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(),
8706 M->getOperand(0), M->getOperand(2), M->getOperand(3),
8707 M->getMemOperand());
8708 }
8709 case Intrinsic::amdgcn_ds_fmin:
8710 case Intrinsic::amdgcn_ds_fmax: {
8711 MemSDNode *M = cast<MemSDNode>(Op);
8712 unsigned Opc;
8713 switch (IntrID) {
8714 case Intrinsic::amdgcn_ds_fmin:
8715 Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
8716 break;
8717 case Intrinsic::amdgcn_ds_fmax:
8718 Opc = AMDGPUISD::ATOMIC_LOAD_FMAX;
8719 break;
8720 default:
8721 llvm_unreachable("Unknown intrinsic!");
8722 }
8723 SDValue Ops[] = {
8724 M->getOperand(0), // Chain
8725 M->getOperand(2), // Ptr
8726 M->getOperand(3) // Value
8727 };
8728
8729 return DAG.getMemIntrinsicNode(Opc, SDLoc(Op), M->getVTList(), Ops,
8730 M->getMemoryVT(), M->getMemOperand());
8731 }
8732 case Intrinsic::amdgcn_buffer_load:
8733 case Intrinsic::amdgcn_buffer_load_format: {
8734 unsigned Glc = Op.getConstantOperandVal(5);
8735 unsigned Slc = Op.getConstantOperandVal(6);
8736 unsigned IdxEn = getIdxEn(Op.getOperand(3));
8737 SDValue Ops[] = {
8738 Op.getOperand(0), // Chain
8739 Op.getOperand(2), // rsrc
8740 Op.getOperand(3), // vindex
8741 SDValue(), // voffset -- will be set by setBufferOffsets
8742 SDValue(), // soffset -- will be set by setBufferOffsets
8743 SDValue(), // offset -- will be set by setBufferOffsets
8744 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
8745 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
8746 };
8747 setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
8748
8749 unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
8750 AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
8751
8752 EVT VT = Op.getValueType();
8753 EVT IntVT = VT.changeTypeToInteger();
8754 auto *M = cast<MemSDNode>(Op);
8755 EVT LoadVT = Op.getValueType();
8756
8757 if (LoadVT.getScalarType() == MVT::f16)
8758 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
8759 M, DAG, Ops);
8760
8761 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
8762 if (LoadVT.getScalarType() == MVT::i8 || LoadVT.getScalarType() == MVT::i16)
8763 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops,
8764 M->getMemOperand());
8765
8766 return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
8767 M->getMemOperand(), DAG);
8768 }
8769 case Intrinsic::amdgcn_raw_buffer_load:
8770 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8771 case Intrinsic::amdgcn_raw_buffer_load_format:
8772 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
8773 const bool IsFormat =
8774 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
8775 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
8776
8777 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8778 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8779 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8780 SDValue Ops[] = {
8781 Op.getOperand(0), // Chain
8782 Rsrc, // rsrc
8783 DAG.getConstant(0, DL, MVT::i32), // vindex
8784 Offsets.first, // voffset
8785 SOffset, // soffset
8786 Offsets.second, // offset
8787 Op.getOperand(5), // cachepolicy, swizzled buffer
8788 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8789 };
8790
8791 auto *M = cast<MemSDNode>(Op);
8792 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
8793 }
8794 case Intrinsic::amdgcn_struct_buffer_load:
8795 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8796 case Intrinsic::amdgcn_struct_buffer_load_format:
8797 case Intrinsic::amdgcn_struct_ptr_buffer_load_format: {
8798 const bool IsFormat =
8799 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
8800 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
8801
8802 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8803 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8804 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8805 SDValue Ops[] = {
8806 Op.getOperand(0), // Chain
8807 Rsrc, // rsrc
8808 Op.getOperand(3), // vindex
8809 Offsets.first, // voffset
8810 SOffset, // soffset
8811 Offsets.second, // offset
8812 Op.getOperand(6), // cachepolicy, swizzled buffer
8813 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8814 };
8815
8816 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
8817 }
8818 case Intrinsic::amdgcn_tbuffer_load: {
8819 MemSDNode *M = cast<MemSDNode>(Op);
8820 EVT LoadVT = Op.getValueType();
8821
8822 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8823 unsigned Dfmt = Op.getConstantOperandVal(7);
8824 unsigned Nfmt = Op.getConstantOperandVal(8);
8825 unsigned Glc = Op.getConstantOperandVal(9);
8826 unsigned Slc = Op.getConstantOperandVal(10);
8827 unsigned IdxEn = getIdxEn(Op.getOperand(3));
8828 SDValue Ops[] = {
8829 Op.getOperand(0), // Chain
8830 Op.getOperand(2), // rsrc
8831 Op.getOperand(3), // vindex
8832 Op.getOperand(4), // voffset
8833 SOffset, // soffset
8834 Op.getOperand(6), // offset
8835 DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
8836 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
8837 DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
8838 };
8839
8840 if (LoadVT.getScalarType() == MVT::f16)
8841 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8842 M, DAG, Ops);
8843 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8844 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8845 DAG);
8846 }
8847 case Intrinsic::amdgcn_raw_tbuffer_load:
8848 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
8849 MemSDNode *M = cast<MemSDNode>(Op);
8850 EVT LoadVT = Op.getValueType();
8851 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8852 auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
8853 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
8854
8855 SDValue Ops[] = {
8856 Op.getOperand(0), // Chain
8857 Rsrc, // rsrc
8858 DAG.getConstant(0, DL, MVT::i32), // vindex
8859 Offsets.first, // voffset
8860 SOffset, // soffset
8861 Offsets.second, // offset
8862 Op.getOperand(5), // format
8863 Op.getOperand(6), // cachepolicy, swizzled buffer
8864 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8865 };
8866
8867 if (LoadVT.getScalarType() == MVT::f16)
8868 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8869 M, DAG, Ops);
8870 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8871 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8872 DAG);
8873 }
8874 case Intrinsic::amdgcn_struct_tbuffer_load:
8875 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
8876 MemSDNode *M = cast<MemSDNode>(Op);
8877 EVT LoadVT = Op.getValueType();
8878 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
8879 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
8880 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
8881
8882 SDValue Ops[] = {
8883 Op.getOperand(0), // Chain
8884 Rsrc, // rsrc
8885 Op.getOperand(3), // vindex
8886 Offsets.first, // voffset
8887 SOffset, // soffset
8888 Offsets.second, // offset
8889 Op.getOperand(6), // format
8890 Op.getOperand(7), // cachepolicy, swizzled buffer
8891 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
8892 };
8893
8894 if (LoadVT.getScalarType() == MVT::f16)
8895 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16,
8896 M, DAG, Ops);
8897 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
8898 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
8899 DAG);
8900 }
8901 case Intrinsic::amdgcn_buffer_atomic_swap:
8902 case Intrinsic::amdgcn_buffer_atomic_add:
8903 case Intrinsic::amdgcn_buffer_atomic_sub:
8904 case Intrinsic::amdgcn_buffer_atomic_csub:
8905 case Intrinsic::amdgcn_buffer_atomic_smin:
8906 case Intrinsic::amdgcn_buffer_atomic_umin:
8907 case Intrinsic::amdgcn_buffer_atomic_smax:
8908 case Intrinsic::amdgcn_buffer_atomic_umax:
8909 case Intrinsic::amdgcn_buffer_atomic_and:
8910 case Intrinsic::amdgcn_buffer_atomic_or:
8911 case Intrinsic::amdgcn_buffer_atomic_xor:
8912 case Intrinsic::amdgcn_buffer_atomic_fadd: {
8913 unsigned Slc = Op.getConstantOperandVal(6);
8914 unsigned IdxEn = getIdxEn(Op.getOperand(4));
8915 SDValue Ops[] = {
8916 Op.getOperand(0), // Chain
8917 Op.getOperand(2), // vdata
8918 Op.getOperand(3), // rsrc
8919 Op.getOperand(4), // vindex
8920 SDValue(), // voffset -- will be set by setBufferOffsets
8921 SDValue(), // soffset -- will be set by setBufferOffsets
8922 SDValue(), // offset -- will be set by setBufferOffsets
8923 DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
8924 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
8925 };
8926 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
8927
8928 EVT VT = Op.getValueType();
8929
8930 auto *M = cast<MemSDNode>(Op);
8931 unsigned Opcode = 0;
8932
8933 switch (IntrID) {
8934 case Intrinsic::amdgcn_buffer_atomic_swap:
8935 Opcode = AMDGPUISD::BUFFER_ATOMIC_SWAP;
8936 break;
8937 case Intrinsic::amdgcn_buffer_atomic_add:
8938 Opcode = AMDGPUISD::BUFFER_ATOMIC_ADD;
8939 break;
8940 case Intrinsic::amdgcn_buffer_atomic_sub:
8941 Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
8942 break;
8943 case Intrinsic::amdgcn_buffer_atomic_csub:
8944 Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB;
8945 break;
8946 case Intrinsic::amdgcn_buffer_atomic_smin:
8947 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
8948 break;
8949 case Intrinsic::amdgcn_buffer_atomic_umin:
8950 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMIN;
8951 break;
8952 case Intrinsic::amdgcn_buffer_atomic_smax:
8953 Opcode = AMDGPUISD::BUFFER_ATOMIC_SMAX;
8954 break;
8955 case Intrinsic::amdgcn_buffer_atomic_umax:
8956 Opcode = AMDGPUISD::BUFFER_ATOMIC_UMAX;
8957 break;
8958 case Intrinsic::amdgcn_buffer_atomic_and:
8959 Opcode = AMDGPUISD::BUFFER_ATOMIC_AND;
8960 break;
8961 case Intrinsic::amdgcn_buffer_atomic_or:
8962 Opcode = AMDGPUISD::BUFFER_ATOMIC_OR;
8963 break;
8964 case Intrinsic::amdgcn_buffer_atomic_xor:
8965 Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
8966 break;
8967 case Intrinsic::amdgcn_buffer_atomic_fadd:
8968 Opcode = AMDGPUISD::BUFFER_ATOMIC_FADD;
8969 break;
8970 default:
8971 llvm_unreachable("unhandled atomic opcode");
8972 }
8973
8974 return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
8975 M->getMemOperand());
8976 }
8977 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8978 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8979 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
8980 case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
8981 return lowerRawBufferAtomicIntrin(Op, DAG,
8982 AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
8983 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8984 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8985 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
8986 case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
8987 return lowerStructBufferAtomicIntrin(Op, DAG,
8988 AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
8989 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8990 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8991 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
8992 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8993 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8994 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
8995 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8996 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8997 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
8998 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8999 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9000 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
9001 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9002 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9003 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
9004 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9005 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9006 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9007 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9008 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9009 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9010 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9011 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9012 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
9013 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9014 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9015 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
9016 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9017 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9018 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
9019 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9020 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9021 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
9022 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9023 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9024 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9025 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9026 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9027 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9028 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9029 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9030 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9031 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9032 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9033 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9034 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9035 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9036 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9037 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9038 return lowerRawBufferAtomicIntrin(Op, DAG,
9039 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9040 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9041 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9042 return lowerStructBufferAtomicIntrin(Op, DAG,
9043 AMDGPUISD::BUFFER_ATOMIC_SWAP);
9044 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9045 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9046 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
9047 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9048 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9049 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
9050 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9051 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9052 return lowerStructBufferAtomicIntrin(Op, DAG,
9053 AMDGPUISD::BUFFER_ATOMIC_SMIN);
9054 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9055 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9056 return lowerStructBufferAtomicIntrin(Op, DAG,
9057 AMDGPUISD::BUFFER_ATOMIC_UMIN);
9058 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9059 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9060 return lowerStructBufferAtomicIntrin(Op, DAG,
9061 AMDGPUISD::BUFFER_ATOMIC_SMAX);
9062 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9063 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9064 return lowerStructBufferAtomicIntrin(Op, DAG,
9065 AMDGPUISD::BUFFER_ATOMIC_UMAX);
9066 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9067 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9068 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
9069 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9070 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9071 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
9072 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9073 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9074 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
9075 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9076 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9077 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
9078 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9079 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9080 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
9081 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9082 return lowerStructBufferAtomicIntrin(Op, DAG,
9083 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
9084
9085 case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
9086 unsigned Slc = Op.getConstantOperandVal(7);
9087 unsigned IdxEn = getIdxEn(Op.getOperand(5));
9088 SDValue Ops[] = {
9089 Op.getOperand(0), // Chain
9090 Op.getOperand(2), // src
9091 Op.getOperand(3), // cmp
9092 Op.getOperand(4), // rsrc
9093 Op.getOperand(5), // vindex
9094 SDValue(), // voffset -- will be set by setBufferOffsets
9095 SDValue(), // soffset -- will be set by setBufferOffsets
9096 SDValue(), // offset -- will be set by setBufferOffsets
9097 DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
9098 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
9099 };
9100 setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
9101
9102 EVT VT = Op.getValueType();
9103 auto *M = cast<MemSDNode>(Op);
9104
9105 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9106 Op->getVTList(), Ops, VT, M->getMemOperand());
9107 }
9108 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9109 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9110 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
9111 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9112 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9113 SDValue Ops[] = {
9114 Op.getOperand(0), // Chain
9115 Op.getOperand(2), // src
9116 Op.getOperand(3), // cmp
9117 Rsrc, // rsrc
9118 DAG.getConstant(0, DL, MVT::i32), // vindex
9119 Offsets.first, // voffset
9120 SOffset, // soffset
9121 Offsets.second, // offset
9122 Op.getOperand(7), // cachepolicy
9123 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9124 };
9125 EVT VT = Op.getValueType();
9126 auto *M = cast<MemSDNode>(Op);
9127
9128 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9129 Op->getVTList(), Ops, VT, M->getMemOperand());
9130 }
9131 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9132 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9133 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
9134 auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
9135 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
9136 SDValue Ops[] = {
9137 Op.getOperand(0), // Chain
9138 Op.getOperand(2), // src
9139 Op.getOperand(3), // cmp
9140 Rsrc, // rsrc
9141 Op.getOperand(5), // vindex
9142 Offsets.first, // voffset
9143 SOffset, // soffset
9144 Offsets.second, // offset
9145 Op.getOperand(8), // cachepolicy
9146 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9147 };
9148 EVT VT = Op.getValueType();
9149 auto *M = cast<MemSDNode>(Op);
9150
9151 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
9152 Op->getVTList(), Ops, VT, M->getMemOperand());
9153 }
9154 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
9155 MemSDNode *M = cast<MemSDNode>(Op);
9156 SDValue NodePtr = M->getOperand(2);
9157 SDValue RayExtent = M->getOperand(3);
9158 SDValue RayOrigin = M->getOperand(4);
9159 SDValue RayDir = M->getOperand(5);
9160 SDValue RayInvDir = M->getOperand(6);
9161 SDValue TDescr = M->getOperand(7);
9162
9163 assert(NodePtr.getValueType() == MVT::i32 ||
9164 NodePtr.getValueType() == MVT::i64);
9165 assert(RayDir.getValueType() == MVT::v3f16 ||
9166 RayDir.getValueType() == MVT::v3f32);
9167
9168 if (!Subtarget->hasGFX10_AEncoding()) {
9169 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
9170 return SDValue();
9171 }
9172
9173 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
9174 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
9175 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
9176 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
9177 const bool Is64 = NodePtr.getValueType() == MVT::i64;
9178 const unsigned NumVDataDwords = 4;
9179 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
9180 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
9181 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
9182 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
9183 IsGFX12Plus;
9184 const unsigned BaseOpcodes[2][2] = {
9185 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
9186 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
9187 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
9188 int Opcode;
9189 if (UseNSA) {
9190 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9191 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
9192 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
9193 : AMDGPU::MIMGEncGfx10NSA,
9194 NumVDataDwords, NumVAddrDwords);
9195 } else {
9196 assert(!IsGFX12Plus);
9197 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
9198 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
9199 : AMDGPU::MIMGEncGfx10Default,
9200 NumVDataDwords, NumVAddrDwords);
9201 }
9202 assert(Opcode != -1);
9203
9204 SmallVector<SDValue, 16> Ops;
9205
9206 auto packLanes = [&DAG, &Ops, &DL] (SDValue Op, bool IsAligned) {
9207 SmallVector<SDValue, 3> Lanes;
9208 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
9209 if (Lanes[0].getValueSizeInBits() == 32) {
9210 for (unsigned I = 0; I < 3; ++I)
9211 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
9212 } else {
9213 if (IsAligned) {
9214 Ops.push_back(
9215 DAG.getBitcast(MVT::i32,
9216 DAG.getBuildVector(MVT::v2f16, DL,
9217 { Lanes[0], Lanes[1] })));
9218 Ops.push_back(Lanes[2]);
9219 } else {
9220 SDValue Elt0 = Ops.pop_back_val();
9221 Ops.push_back(
9222 DAG.getBitcast(MVT::i32,
9223 DAG.getBuildVector(MVT::v2f16, DL,
9224 { Elt0, Lanes[0] })));
9225 Ops.push_back(
9226 DAG.getBitcast(MVT::i32,
9227 DAG.getBuildVector(MVT::v2f16, DL,
9228 { Lanes[1], Lanes[2] })));
9229 }
9230 }
9231 };
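// Note (added for exposition): packLanes pushes three coordinate lanes onto
// Ops. 32-bit lanes are pushed as three i32 dwords; f16 lanes are packed two
// per dword, either starting a fresh dword pair (IsAligned) or first
// completing the previously pushed half-filled dword (!IsAligned).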
9232
9233 if (UseNSA && IsGFX11Plus) {
9234 Ops.push_back(NodePtr);
9235 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9236 Ops.push_back(RayOrigin);
9237 if (IsA16) {
9238 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
9239 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
9240 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
9241 for (unsigned I = 0; I < 3; ++I) {
9242 MergedLanes.push_back(DAG.getBitcast(
9243 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
9244 {DirLanes[I], InvDirLanes[I]})));
9245 }
9246 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
9247 } else {
9248 Ops.push_back(RayDir);
9249 Ops.push_back(RayInvDir);
9250 }
9251 } else {
9252 if (Is64)
9253 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
9254 2);
9255 else
9256 Ops.push_back(NodePtr);
9257
9258 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
9259 packLanes(RayOrigin, true);
9260 packLanes(RayDir, true);
9261 packLanes(RayInvDir, false);
9262 }
9263
9264 if (!UseNSA) {
9265 // Build a single vector containing all the operands so far prepared.
9266 if (NumVAddrDwords > 12) {
9267 SDValue Undef = DAG.getUNDEF(MVT::i32);
9268 Ops.append(16 - Ops.size(), Undef);
9269 }
9270 assert(Ops.size() >= 8 && Ops.size() <= 12);
9271 SDValue MergedOps = DAG.getBuildVector(
9272 MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
9273 Ops.clear();
9274 Ops.push_back(MergedOps);
9275 }
9276
9277 Ops.push_back(TDescr);
9278 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
9279 Ops.push_back(M->getChain());
9280
9281 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
9282 MachineMemOperand *MemRef = M->getMemOperand();
9283 DAG.setNodeMemRefs(NewNode, {MemRef});
9284 return SDValue(NewNode, 0);
9285 }
9286 case Intrinsic::amdgcn_global_atomic_fmin:
9287 case Intrinsic::amdgcn_global_atomic_fmax:
9288 case Intrinsic::amdgcn_global_atomic_fmin_num:
9289 case Intrinsic::amdgcn_global_atomic_fmax_num:
9290 case Intrinsic::amdgcn_flat_atomic_fmin:
9291 case Intrinsic::amdgcn_flat_atomic_fmax:
9292 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9293 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9294 MemSDNode *M = cast<MemSDNode>(Op);
9295 SDValue Ops[] = {
9296 M->getOperand(0), // Chain
9297 M->getOperand(2), // Ptr
9298 M->getOperand(3) // Value
9299 };
9300 unsigned Opcode = 0;
9301 switch (IntrID) {
9302 case Intrinsic::amdgcn_global_atomic_fmin:
9303 case Intrinsic::amdgcn_global_atomic_fmin_num:
9304 case Intrinsic::amdgcn_flat_atomic_fmin:
9305 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9306 Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
9307 break;
9308 }
9309 case Intrinsic::amdgcn_global_atomic_fmax:
9310 case Intrinsic::amdgcn_global_atomic_fmax_num:
9311 case Intrinsic::amdgcn_flat_atomic_fmax:
9312 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9313 Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
9314 break;
9315 }
9316 default:
9317 llvm_unreachable("unhandled atomic opcode");
9318 }
9319 return DAG.getMemIntrinsicNode(Opcode, SDLoc(Op),
9320 M->getVTList(), Ops, M->getMemoryVT(),
9321 M->getMemOperand());
9322 }
9323 case Intrinsic::amdgcn_s_get_barrier_state: {
9324 SDValue Chain = Op->getOperand(0);
9325 SmallVector<SDValue, 2> Ops;
9326 unsigned Opc;
9327 bool IsInlinableBarID = false;
9328 int64_t BarID;
9329
9330 if (isa<ConstantSDNode>(Op->getOperand(2))) {
9331 BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
9332 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarID);
9333 }
9334
9335 if (IsInlinableBarID) {
9336 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9337 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
9338 Ops.push_back(K);
9339 } else {
9340 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9341 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2));
9342 Ops.push_back(M0Val.getValue(0));
9343 }
9344
9345 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9346 return SDValue(NewMI, 0);
9347 }
9348 default:
9349
9350 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
9351 AMDGPU::getImageDimIntrinsicInfo(IntrID))
9352 return lowerImage(Op, ImageDimIntr, DAG, true);
9353
9354 return SDValue();
9355 }
9356}
9357
9358// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
9359// dwordx4 if on SI and handle TFE loads.
9360SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
9361 SDVTList VTList,
9362 ArrayRef<SDValue> Ops, EVT MemVT,
9363 MachineMemOperand *MMO,
9364 SelectionDAG &DAG) const {
9365 LLVMContext &C = *DAG.getContext();
9366 MachineFunction &MF = DAG.getMachineFunction();
9367 EVT VT = VTList.VTs[0];
9368
9369 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
9370 bool IsTFE = VTList.NumVTs == 3;
9371 if (IsTFE) {
9372 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
9373 unsigned NumOpDWords = NumValueDWords + 1;
9374 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
9375 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
9376 MachineMemOperand *OpDWordsMMO =
9377 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
9378 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
9379 OpDWordsVT, OpDWordsMMO, DAG);
9380 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
9381 DAG.getVectorIdxConstant(NumValueDWords, DL));
9382 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
9383 SDValue ValueDWords =
9384 NumValueDWords == 1
9385 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
9386 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
9387 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
9388 ZeroIdx);
9389 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
9390 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
9391 }
9392
9393 if (!Subtarget->hasDwordx3LoadStores() &&
9394 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
9395 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
9396 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
9397 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
9398 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
9399 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
9400 WidenedMemVT, WidenedMMO);
9401 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
9402 DAG.getVectorIdxConstant(0, DL));
9403 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
9404 }
9405
9406 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
9407}
9408
9409SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
9410 bool ImageStore) const {
9411 EVT StoreVT = VData.getValueType();
9412
9413 // No change for f16 and legal vector D16 types.
9414 if (!StoreVT.isVector())
9415 return VData;
9416
9417 SDLoc DL(VData);
9418 unsigned NumElements = StoreVT.getVectorNumElements();
9419
9420 if (Subtarget->hasUnpackedD16VMem()) {
9421 // We need to unpack the packed data to store.
9422 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9423 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9424
9425 EVT EquivStoreVT =
9426 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
9427 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
9428 return DAG.UnrollVectorOp(ZExt.getNode());
9429 }
9430
9431 // The sq block of gfx8.1 does not estimate register use correctly for d16
9432 // image store instructions. The data operand is computed as if it were not a
9433 // d16 image instruction.
9434 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
9435 // Bitcast to i16
9436 EVT IntStoreVT = StoreVT.changeTypeToInteger();
9437 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9438
9439 // Decompose into scalars
9440 SmallVector<SDValue, 4> Elts;
9441 DAG.ExtractVectorElements(IntVData, Elts);
9442
9443 // Group pairs of i16 into v2i16 and bitcast to i32
9444 SmallVector<SDValue, 4> PackedElts;
9445 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
9446 SDValue Pair =
9447 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
9448 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9449 PackedElts.push_back(IntPair);
9450 }
9451 if ((NumElements % 2) == 1) {
9452 // Handle v3i16
9453 unsigned I = Elts.size() / 2;
9454 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
9455 {Elts[I * 2], DAG.getUNDEF(MVT::i16)});
9456 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
9457 PackedElts.push_back(IntPair);
9458 }
9459
9460 // Pad using UNDEF
9461 PackedElts.resize(Elts.size(), DAG.getUNDEF(MVT::i32));
9462
9463 // Build final vector
9464 EVT VecVT =
9465 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
9466 return DAG.getBuildVector(VecVT, DL, PackedElts);
9467 }
9468
9469 if (NumElements == 3) {
9470 EVT IntStoreVT =
9471 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
9472 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
9473
9474 EVT WidenedStoreVT = EVT::getVectorVT(
9475 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
9476 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
9477 WidenedStoreVT.getStoreSizeInBits());
9478 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
9479 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
9480 }
9481
9482 assert(isTypeLegal(StoreVT));
9483 return VData;
9484}
9485
9486SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
9487 SelectionDAG &DAG) const {
9488 SDLoc DL(Op);
9489 SDValue Chain = Op.getOperand(0);
9490 unsigned IntrinsicID = Op.getConstantOperandVal(1);
9491 MachineFunction &MF = DAG.getMachineFunction();
9492
9493 switch (IntrinsicID) {
9494 case Intrinsic::amdgcn_exp_compr: {
9495 if (!Subtarget->hasCompressedExport()) {
9496 DiagnosticInfoUnsupported BadIntrin(
9497 MF.getFunction(),
9498 "intrinsic not supported on subtarget", DL.getDebugLoc());
9499 DAG.getContext()->diagnose(BadIntrin);
9500 }
9501 SDValue Src0 = Op.getOperand(4);
9502 SDValue Src1 = Op.getOperand(5);
9503 // Hack around illegal type on SI by directly selecting it.
9504 if (isTypeLegal(Src0.getValueType()))
9505 return SDValue();
9506
9507 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
9508 SDValue Undef = DAG.getUNDEF(MVT::f32);
9509 const SDValue Ops[] = {
9510 Op.getOperand(2), // tgt
9511 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
9512 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
9513 Undef, // src2
9514 Undef, // src3
9515 Op.getOperand(7), // vm
9516 DAG.getTargetConstant(1, DL, MVT::i1), // compr
9517 Op.getOperand(3), // en
9518 Op.getOperand(0) // Chain
9519 };
9520
9521 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
9522 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
9523 }
9524 case Intrinsic::amdgcn_s_barrier: {
9525 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
9526 if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
9527 unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
9528 if (WGSize <= ST.getWavefrontSize())
9529 return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
9530 Op.getOperand(0)), 0);
9531 }
9532
9533 // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
9534 if (ST.hasSplitBarriers()) {
9535 SDValue K =
9536 DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
9537 SDValue BarSignal =
9538 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
9539 MVT::Other, K, Op.getOperand(0)),
9540 0);
9541 SDValue BarWait =
9542 SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
9543 BarSignal.getValue(0)),
9544 0);
9545 return BarWait;
9546 }
9547
9548 return SDValue();
9549 };
9550 case Intrinsic::amdgcn_tbuffer_store: {
9551 SDValue VData = Op.getOperand(2);
9552 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9553 if (IsD16)
9554 VData = handleD16VData(VData, DAG);
9555 unsigned Dfmt = Op.getConstantOperandVal(8);
9556 unsigned Nfmt = Op.getConstantOperandVal(9);
9557 unsigned Glc = Op.getConstantOperandVal(10);
9558 unsigned Slc = Op.getConstantOperandVal(11);
9559 unsigned IdxEn = getIdxEn(Op.getOperand(4));
9560 SDValue Ops[] = {
9561 Chain,
9562 VData, // vdata
9563 Op.getOperand(3), // rsrc
9564 Op.getOperand(4), // vindex
9565 Op.getOperand(5), // voffset
9566 Op.getOperand(6), // soffset
9567 Op.getOperand(7), // offset
9568 DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
9569 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
9570 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
9571 };
9572 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9573 AMDGPUISD::TBUFFER_STORE_FORMAT;
9574 MemSDNode *M = cast<MemSDNode>(Op);
9575 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9576 M->getMemoryVT(), M->getMemOperand());
9577 }
9578
9579 case Intrinsic::amdgcn_struct_tbuffer_store:
9580 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9581 SDValue VData = Op.getOperand(2);
9582 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9583 if (IsD16)
9584 VData = handleD16VData(VData, DAG);
9585 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9586 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9587 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9588 SDValue Ops[] = {
9589 Chain,
9590 VData, // vdata
9591 Rsrc, // rsrc
9592 Op.getOperand(4), // vindex
9593 Offsets.first, // voffset
9594 SOffset, // soffset
9595 Offsets.second, // offset
9596 Op.getOperand(7), // format
9597 Op.getOperand(8), // cachepolicy, swizzled buffer
9598 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9599 };
9600 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9601 AMDGPUISD::TBUFFER_STORE_FORMAT;
9602 MemSDNode *M = cast<MemSDNode>(Op);
9603 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9604 M->getMemoryVT(), M->getMemOperand());
9605 }
9606
9607 case Intrinsic::amdgcn_raw_tbuffer_store:
9608 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9609 SDValue VData = Op.getOperand(2);
9610 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9611 if (IsD16)
9612 VData = handleD16VData(VData, DAG);
9613 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9614 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9615 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9616 SDValue Ops[] = {
9617 Chain,
9618 VData, // vdata
9619 Rsrc, // rsrc
9620 DAG.getConstant(0, DL, MVT::i32), // vindex
9621 Offsets.first, // voffset
9622 SOffset, // soffset
9623 Offsets.second, // offset
9624 Op.getOperand(6), // format
9625 Op.getOperand(7), // cachepolicy, swizzled buffer
9626 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9627 };
9628 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
9629 AMDGPUISD::TBUFFER_STORE_FORMAT;
9630 MemSDNode *M = cast<MemSDNode>(Op);
9631 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9632 M->getMemoryVT(), M->getMemOperand());
9633 }
9634
9635 case Intrinsic::amdgcn_buffer_store:
9636 case Intrinsic::amdgcn_buffer_store_format: {
9637 SDValue VData = Op.getOperand(2);
9638 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
9639 if (IsD16)
9640 VData = handleD16VData(VData, DAG);
9641 unsigned Glc = Op.getConstantOperandVal(6);
9642 unsigned Slc = Op.getConstantOperandVal(7);
9643 unsigned IdxEn = getIdxEn(Op.getOperand(4));
9644 SDValue Ops[] = {
9645 Chain,
9646 VData,
9647 Op.getOperand(3), // rsrc
9648 Op.getOperand(4), // vindex
9649 SDValue(), // voffset -- will be set by setBufferOffsets
9650 SDValue(), // soffset -- will be set by setBufferOffsets
9651 SDValue(), // offset -- will be set by setBufferOffsets
9652 DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
9653 DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
9654 };
9655 setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
9656
9657 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
9658 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
9659 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9660 MemSDNode *M = cast<MemSDNode>(Op);
9661
9662 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9663 EVT VDataType = VData.getValueType().getScalarType();
9664 if (VDataType == MVT::i8 || VDataType == MVT::i16)
9665 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9666
9667 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9668 M->getMemoryVT(), M->getMemOperand());
9669 }
9670
9671 case Intrinsic::amdgcn_raw_buffer_store:
9672 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9673 case Intrinsic::amdgcn_raw_buffer_store_format:
9674 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9675 const bool IsFormat =
9676 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9677 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9678
9679 SDValue VData = Op.getOperand(2);
9680 EVT VDataVT = VData.getValueType();
9681 EVT EltType = VDataVT.getScalarType();
9682 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9683 if (IsD16) {
9684 VData = handleD16VData(VData, DAG);
9685 VDataVT = VData.getValueType();
9686 }
9687
9688 if (!isTypeLegal(VDataVT)) {
9689 VData =
9690 DAG.getNode(ISD::BITCAST, DL,
9691 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9692 }
9693
9694 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9695 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
9696 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
9697 SDValue Ops[] = {
9698 Chain,
9699 VData,
9700 Rsrc,
9701 DAG.getConstant(0, DL, MVT::i32), // vindex
9702 Offsets.first, // voffset
9703 SOffset, // soffset
9704 Offsets.second, // offset
9705 Op.getOperand(6), // cachepolicy, swizzled buffer
9706 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
9707 };
9708 unsigned Opc =
9709 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9710 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9711 MemSDNode *M = cast<MemSDNode>(Op);
9712
9713 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9714 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9715 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
9716
9717 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9718 M->getMemoryVT(), M->getMemOperand());
9719 }
9720
9721 case Intrinsic::amdgcn_struct_buffer_store:
9722 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9723 case Intrinsic::amdgcn_struct_buffer_store_format:
9724 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9725 const bool IsFormat =
9726 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9727 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9728
9729 SDValue VData = Op.getOperand(2);
9730 EVT VDataVT = VData.getValueType();
9731 EVT EltType = VDataVT.getScalarType();
9732 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
9733
9734 if (IsD16) {
9735 VData = handleD16VData(VData, DAG);
9736 VDataVT = VData.getValueType();
9737 }
9738
9739 if (!isTypeLegal(VDataVT)) {
9740 VData =
9741 DAG.getNode(ISD::BITCAST, DL,
9742 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
9743 }
9744
9745 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
9746 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
9747 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
9748 SDValue Ops[] = {
9749 Chain,
9750 VData,
9751 Rsrc,
9752 Op.getOperand(4), // vindex
9753 Offsets.first, // voffset
9754 SOffset, // soffset
9755 Offsets.second, // offset
9756 Op.getOperand(7), // cachepolicy, swizzled buffer
9757 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
9758 };
9759 unsigned Opc =
9760 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
9761 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
9762 MemSDNode *M = cast<MemSDNode>(Op);
9763
9764 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
9765 EVT VDataType = VData.getValueType().getScalarType();
9766 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
9767 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
9768
9769 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
9770 M->getMemoryVT(), M->getMemOperand());
9771 }
9772 case Intrinsic::amdgcn_raw_buffer_load_lds:
9773 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9774 case Intrinsic::amdgcn_struct_buffer_load_lds:
9775 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9776 assert(!AMDGPU::isGFX12Plus(*Subtarget));
9777 unsigned Opc;
9778 bool HasVIndex =
9779 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9780 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9781 unsigned OpOffset = HasVIndex ? 1 : 0;
9782 SDValue VOffset = Op.getOperand(5 + OpOffset);
9783 bool HasVOffset = !isNullConstant(VOffset);
9784 unsigned Size = Op->getConstantOperandVal(4);
9785
9786 switch (Size) {
9787 default:
9788 return SDValue();
9789 case 1:
9790 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9791 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9792 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9793 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9794 break;
9795 case 2:
9796 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9797 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9798 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9799 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9800 break;
9801 case 4:
9802 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9803 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9804 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9805 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9806 break;
9807 }
9808
9809 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9810
9811 SmallVector<SDValue, 8> Ops;
9812
9813 if (HasVIndex && HasVOffset)
9814 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
9815 { Op.getOperand(5), // VIndex
9816 VOffset }));
9817 else if (HasVIndex)
9818 Ops.push_back(Op.getOperand(5));
9819 else if (HasVOffset)
9820 Ops.push_back(VOffset);
9821
9822 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
9823 Ops.push_back(Rsrc);
9824 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
9825 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
9826 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
9827 Ops.push_back(
9828 DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
9829 Ops.push_back(DAG.getTargetConstant(
9830 Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, MVT::i8)); // swz
9831 Ops.push_back(M0Val.getValue(0)); // Chain
9832 Ops.push_back(M0Val.getValue(1)); // Glue
9833
9834 auto *M = cast<MemSDNode>(Op);
9835 MachineMemOperand *LoadMMO = M->getMemOperand();
9836 // Don't set the offset value here because the pointer points to the base of
9837 // the buffer.
9838 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9839
9840 MachinePointerInfo StorePtrI = LoadPtrI;
9841 LoadPtrI.V = PoisonValue::get(
9842 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
9843 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
9844 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9845
9846 auto F = LoadMMO->getFlags() &
9847 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9848 LoadMMO =
9849 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9850 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9851
9852 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9853 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
9854 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9855
9856 auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
9857 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9858
9859 return SDValue(Load, 0);
9860 }
9861 case Intrinsic::amdgcn_global_load_lds: {
9862 unsigned Opc;
9863 unsigned Size = Op->getConstantOperandVal(4);
9864 switch (Size) {
9865 default:
9866 return SDValue();
9867 case 1:
9868 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9869 break;
9870 case 2:
9871 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9872 break;
9873 case 4:
9874 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9875 break;
9876 }
9877
9878 auto *M = cast<MemSDNode>(Op);
9879 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
9880
9881 SmallVector<SDValue, 6> Ops;
9882
9883 SDValue Addr = Op.getOperand(2); // Global ptr
9884 SDValue VOffset;
9885 // Try to split SAddr and VOffset. Global and LDS pointers share the same
9886 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
9887 if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
9888 SDValue LHS = Addr.getOperand(0);
9889 SDValue RHS = Addr.getOperand(1);
9890
9891 if (LHS->isDivergent())
9892 std::swap(LHS, RHS);
9893
9894 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
9895 RHS.getOperand(0).getValueType() == MVT::i32) {
9896 // add (i64 sgpr), (zero_extend (i32 vgpr))
9897 Addr = LHS;
9898 VOffset = RHS.getOperand(0);
9899 }
9900 }
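// Illustrative note (added, not in the original source; names are
// hypothetical): given a uniform 64-bit base in an SGPR and a divergent
// 32-bit index in a VGPR, an address of the form
//   add (i64 %sgpr_base), (zero_extend (i32 %vgpr_idx))
// is split here so the SGPR base can feed the saddr form of the instruction
// while the VGPR index becomes the 32-bit voffset.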
9901
9902 Ops.push_back(Addr);
9903 if (!Addr->isDivergent()) {
9904 Opc = AMDGPU::getGlobalSaddrOp(Opc);
9905 if (!VOffset)
9906 VOffset = SDValue(
9907 DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
9908 DAG.getTargetConstant(0, DL, MVT::i32)), 0);
9909 Ops.push_back(VOffset);
9910 }
9911
9912 Ops.push_back(Op.getOperand(5)); // Offset
9913 Ops.push_back(Op.getOperand(6)); // CPol
9914 Ops.push_back(M0Val.getValue(0)); // Chain
9915 Ops.push_back(M0Val.getValue(1)); // Glue
9916
9917 MachineMemOperand *LoadMMO = M->getMemOperand();
9918 MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
9919 LoadPtrI.Offset = Op->getConstantOperandVal(5);
9920 MachinePointerInfo StorePtrI = LoadPtrI;
9921 LoadPtrI.V = PoisonValue::get(
9922 PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
9923 LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
9924 StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
9925 auto F = LoadMMO->getFlags() &
9926 ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
9927 LoadMMO =
9928 MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
9929 LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
9930 MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
9931 StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
9932 LoadMMO->getAAInfo());
9933
9934 auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
9935 DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
9936
9937 return SDValue(Load, 0);
9938 }
9939 case Intrinsic::amdgcn_end_cf:
9940 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
9941 Op->getOperand(2), Chain), 0);
9942 case Intrinsic::amdgcn_s_barrier_init:
9943 case Intrinsic::amdgcn_s_barrier_join:
9944 case Intrinsic::amdgcn_s_wakeup_barrier: {
9945 SDValue Chain = Op->getOperand(0);
9946 SmallVector<SDValue, 2> Ops;
9947 SDValue BarOp = Op->getOperand(2);
9948 unsigned Opc;
9949 bool IsInlinableBarID = false;
9950 int64_t BarVal;
9951
9952 if (isa<ConstantSDNode>(BarOp)) {
9953 BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
9954 IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarVal);
9955 }
9956
9957 if (IsInlinableBarID) {
9958 switch (IntrinsicID) {
9959 default:
9960 return SDValue();
9961 case Intrinsic::amdgcn_s_barrier_init:
9962 Opc = AMDGPU::S_BARRIER_INIT_IMM;
9963 break;
9964 case Intrinsic::amdgcn_s_barrier_join:
9965 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
9966 break;
9967 case Intrinsic::amdgcn_s_wakeup_barrier:
9968 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
9969 break;
9970 }
9971
9972 SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32);
9973 Ops.push_back(K);
9974 } else {
9975 switch (IntrinsicID) {
9976 default:
9977 return SDValue();
9978 case Intrinsic::amdgcn_s_barrier_init:
9979 Opc = AMDGPU::S_BARRIER_INIT_M0;
9980 break;
9981 case Intrinsic::amdgcn_s_barrier_join:
9982 Opc = AMDGPU::S_BARRIER_JOIN_M0;
9983 break;
9984 case Intrinsic::amdgcn_s_wakeup_barrier:
9985 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
9986 break;
9987 }
9988 }
9989
9990 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
9991 SDValue M0Val;
9992 // Member count will be read from M0[16:22]
9993 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3),
9994 DAG.getShiftAmountConstant(16, MVT::i32, DL));
9995
9996 if (!IsInlinableBarID) {
9997 // If reference to barrier id is not an inline constant then it must be
9998 // referenced with M0[4:0]. Perform an OR with the member count to
9999 // include it in M0.
10000 M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32,
10001 Op.getOperand(2), M0Val),
10002 0);
10003 }
10004 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
10005 } else if (!IsInlinableBarID) {
10006 Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0));
10007 }
10008
10009 auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
10010 return SDValue(NewMI, 0);
10011 }
10012 default: {
10013 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
10014 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
10015 return lowerImage(Op, ImageDimIntr, DAG, true);
10016
10017 return Op;
10018 }
10019 }
10020}
10021
10022// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
10023// offset (the offset that is included in bounds checking and swizzling, to be
10024// split between the instruction's voffset and immoffset fields) and soffset
10025// (the offset that is excluded from bounds checking and swizzling, to go in
10026// the instruction's soffset field). This function takes the first kind of
10027// offset and figures out how to split it between voffset and immoffset.
10028std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
10029 SDValue Offset, SelectionDAG &DAG) const {
10030 SDLoc DL(Offset);
10031 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
10032 SDValue N0 = Offset;
10033 ConstantSDNode *C1 = nullptr;
10034
10035 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
10036 N0 = SDValue();
10037 else if (DAG.isBaseWithConstantOffset(N0)) {
10038 C1 = cast<ConstantSDNode>(N0.getOperand(1));
10039 N0 = N0.getOperand(0);
10040 }
10041
10042 if (C1) {
10043 unsigned ImmOffset = C1->getZExtValue();
10044 // If the immediate value is too big for the immoffset field, put only bits
10045 // that would normally fit in the immoffset field. The remaining value that
10046 // is copied/added for the voffset field is a large power of 2, and it
10047 // stands more chance of being CSEd with the copy/add for another similar
10048 // load/store.
10049 // However, do not do that rounding down if the part left for the voffset
10050 // (the overflow) would be negative, as it appears to be illegal to have a
10051 // negative offset in the vgpr, even if adding the immediate offset makes it positive.
10052 unsigned Overflow = ImmOffset & ~MaxImm;
10053 ImmOffset -= Overflow;
10054 if ((int32_t)Overflow < 0) {
10055 Overflow += ImmOffset;
10056 ImmOffset = 0;
10057 }
10058 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
10059 if (Overflow) {
10060 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
10061 if (!N0)
10062 N0 = OverflowVal;
10063 else {
10064 SDValue Ops[] = { N0, OverflowVal };
10065 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
10066 }
10067 }
10068 }
10069 if (!N0)
10070 N0 = DAG.getConstant(0, DL, MVT::i32);
10071 if (!C1)
10072 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
10073 return {N0, SDValue(C1, 0)};
10074}
10075
10076// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
10077// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
10078// pointed to by Offsets.
10079void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
10080 SelectionDAG &DAG, SDValue *Offsets,
10081 Align Alignment) const {
10082 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
10083 SDLoc DL(CombinedOffset);
10084 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
10085 uint32_t Imm = C->getZExtValue();
10086 uint32_t SOffset, ImmOffset;
10087 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
10088 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
10089 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10090 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10091 return;
10092 }
10093 }
10094 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
10095 SDValue N0 = CombinedOffset.getOperand(0);
10096 SDValue N1 = CombinedOffset.getOperand(1);
10097 uint32_t SOffset, ImmOffset;
10098 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
10099 if (Offset >= 0 &&
10100 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
10101 Offsets[0] = N0;
10102 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
10103 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
10104 return;
10105 }
10106 }
10107
10108 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
10109 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
10110 : DAG.getConstant(0, DL, MVT::i32);
10111
10112 Offsets[0] = CombinedOffset;
10113 Offsets[1] = SOffsetZero;
10114 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
10115}
10116
10117SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
10118 SelectionDAG &DAG) const {
10119 if (!MaybePointer.getValueType().isScalarInteger())
10120 return MaybePointer;
10121
10122 SDLoc DL(MaybePointer);
10123
10124 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
10125 return Rsrc;
10126}
10127
10128// Wrap a global or flat pointer into a buffer intrinsic using the flags
10129// specified in the intrinsic.
10130SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
10131 SelectionDAG &DAG) const {
10132 SDLoc Loc(Op);
10133
10134 SDValue Pointer = Op->getOperand(1);
10135 SDValue Stride = Op->getOperand(2);
10136 SDValue NumRecords = Op->getOperand(3);
10137 SDValue Flags = Op->getOperand(4);
10138
10139 auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
10140 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
10141 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
10142 std::optional<uint32_t> ConstStride = std::nullopt;
10143 if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
10144 ConstStride = ConstNode->getZExtValue();
10145
10146 SDValue NewHighHalf = Masked;
10147 if (!ConstStride || *ConstStride != 0) {
10148 SDValue ShiftedStride;
10149 if (ConstStride) {
10150 ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
10151 } else {
10152 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
10153 ShiftedStride =
10154 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
10155 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
10156 }
10157 NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
10158 }
10159
10160 SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
10161 NewHighHalf, NumRecords, Flags);
10162 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
10163 return RsrcPtr;
10164}
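// Rough sketch of the resource words assembled above (field names are
// informal, not definitive):
//   word0 = pointer[31:0]
//   word1 = pointer[47:32] | (stride << 16)
//   word2 = NumRecords
//   word3 = Flags (as passed to the intrinsic)
// The v4i32 is then bitcast to i128 so callers can keep treating the
// resource as a scalar value.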
10165
10166// Handle 8 bit and 16 bit buffer loads
10167SDValue
10168SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT,
10169                                              SDLoc DL, ArrayRef<SDValue> Ops,
10170                                              MachineMemOperand *MMO) const {
10171 EVT IntVT = LoadVT.changeTypeToInteger();
10172  unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
10173         AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
10174
10175 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
10176 SDValue BufferLoad =
10177 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
10178 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
10179 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
10180
10181 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
10182}
10183
10184// Handle 8 bit and 16 bit buffer stores
10185SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
10186 EVT VDataType, SDLoc DL,
10187 SDValue Ops[],
10188 MemSDNode *M) const {
10189 if (VDataType == MVT::f16)
10190 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
10191
10192 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
10193 Ops[1] = BufferStoreExt;
10194  unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
10195                 AMDGPUISD::BUFFER_STORE_SHORT;
10196 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
10197 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
10198 M->getMemOperand());
10199}
10200
10201static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
10202                                 ISD::LoadExtType ExtType, SDValue Op,
10203 const SDLoc &SL, EVT VT) {
10204 if (VT.bitsLT(Op.getValueType()))
10205 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
10206
10207 switch (ExtType) {
10208 case ISD::SEXTLOAD:
10209 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
10210 case ISD::ZEXTLOAD:
10211 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
10212 case ISD::EXTLOAD:
10213 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
10214 case ISD::NON_EXTLOAD:
10215 return Op;
10216 }
10217
10218 llvm_unreachable("invalid ext type");
10219}
10220
10221// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
10222// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
10223SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
10224 SelectionDAG &DAG = DCI.DAG;
10225 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
10226 return SDValue();
10227
10228 // FIXME: Constant loads should all be marked invariant.
10229 unsigned AS = Ld->getAddressSpace();
10230 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
10231      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
10232      (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
10233 return SDValue();
10234
10235 // Don't do this early, since it may interfere with adjacent load merging for
10236 // illegal types. We can avoid losing alignment information for exotic types
10237 // pre-legalize.
10238 EVT MemVT = Ld->getMemoryVT();
10239 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
10240 MemVT.getSizeInBits() >= 32)
10241 return SDValue();
10242
10243 SDLoc SL(Ld);
10244
10245 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
10246 "unexpected vector extload");
10247
10248 // TODO: Drop only high part of range.
10249 SDValue Ptr = Ld->getBasePtr();
10250 SDValue NewLoad = DAG.getLoad(
10251 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
10252 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
10253 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
10254 nullptr); // Drop ranges
10255
10256 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
10257 if (MemVT.isFloatingPoint()) {
10258    assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
10259           "unexpected fp extload");
10260 TruncVT = MemVT.changeTypeToInteger();
10261 }
10262
10263 SDValue Cvt = NewLoad;
10264 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
10265 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
10266 DAG.getValueType(TruncVT));
10267 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
10268             Ld->getExtensionType() == ISD::NON_EXTLOAD) {
10269    Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
10270  } else {
10271    assert(Ld->getExtensionType() == ISD::EXTLOAD);
10272  }
10273
10274 EVT VT = Ld->getValueType(0);
10275 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
10276
10277 DCI.AddToWorklist(Cvt.getNode());
10278
10279 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
10280 // the appropriate extension from the 32-bit load.
10281 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
10282 DCI.AddToWorklist(Cvt.getNode());
10283
10284 // Handle conversion back to floating point if necessary.
10285 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
10286
10287 return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
10288}
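// Hypothetical example of the widening above: a uniform, 4-byte aligned
//   zextload i8 from constant address space
// becomes a full 32-bit scalar load plus an in-register mask, roughly
//   t1 = load i32 from the same pointer
//   t2 = and t1, 0xff               // getZeroExtendInReg
// so it can be selected to an s_load_dword instead of a buffer/flat byte load.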
10289
10290static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
10291                                          const SIMachineFunctionInfo &Info) {
10292 // TODO: Should check if the address can definitely not access stack.
10293 if (Info.isEntryFunction())
10294 return Info.getUserSGPRInfo().hasFlatScratchInit();
10295 return true;
10296}
10297
10298SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
10299 SDLoc DL(Op);
10300 LoadSDNode *Load = cast<LoadSDNode>(Op);
10301 ISD::LoadExtType ExtType = Load->getExtensionType();
10302 EVT MemVT = Load->getMemoryVT();
10303
10304 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
10305 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
10306 return SDValue();
10307
10308 // FIXME: Copied from PPC
10309 // First, load into 32 bits, then truncate to 1 bit.
10310
10311 SDValue Chain = Load->getChain();
10312 SDValue BasePtr = Load->getBasePtr();
10313 MachineMemOperand *MMO = Load->getMemOperand();
10314
10315 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10316
10317 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
10318 BasePtr, RealMemVT, MMO);
10319
10320 if (!MemVT.isVector()) {
10321 SDValue Ops[] = {
10322 DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
10323 NewLD.getValue(1)
10324 };
10325
10326 return DAG.getMergeValues(Ops, DL);
10327 }
10328
10329    SmallVector<SDValue, 3> Elts;
10330    for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
10331 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
10332 DAG.getConstant(I, DL, MVT::i32));
10333
10334 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
10335 }
10336
10337 SDValue Ops[] = {
10338 DAG.getBuildVector(MemVT, DL, Elts),
10339 NewLD.getValue(1)
10340 };
10341
10342 return DAG.getMergeValues(Ops, DL);
10343 }
10344
10345 if (!MemVT.isVector())
10346 return SDValue();
10347
10348 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
10349 "Custom lowering for non-i32 vectors hasn't been implemented.");
10350
10351 Align Alignment = Load->getAlign();
10352 unsigned AS = Load->getAddressSpace();
10353 if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS &&
10354 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
10355 return SplitVectorLoad(Op, DAG);
10356 }
10357
10358  MachineFunction &MF = DAG.getMachineFunction();
10359  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10360  // If there is a possibility that flat instruction access scratch memory
10361  // then we need to use the same legalization rules we use for private.
10362  if (AS == AMDGPUAS::FLAT_ADDRESS &&
10363      !Subtarget->hasMultiDwordFlatScratchAddressing())
10364    AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI) ?
10365         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10366
10367 unsigned NumElements = MemVT.getVectorNumElements();
10368
10369 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10370      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
10371    if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
10372 if (MemVT.isPow2VectorType() ||
10373 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10374 return SDValue();
10375 return WidenOrSplitVectorLoad(Op, DAG);
10376 }
10377 // Non-uniform loads will be selected to MUBUF instructions, so they
10378 // have the same legalization requirements as global and private
10379 // loads.
10380 //
10381 }
10382
10383 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10384      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10385      AS == AMDGPUAS::GLOBAL_ADDRESS) {
10386    if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
10387 Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
10388 Alignment >= Align(4) && NumElements < 32) {
10389 if (MemVT.isPow2VectorType() ||
10390 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
10391 return SDValue();
10392 return WidenOrSplitVectorLoad(Op, DAG);
10393 }
10394 // Non-uniform loads will be selected to MUBUF instructions, so they
10395 // have the same legalization requirements as global and private
10396 // loads.
10397 //
10398 }
10399 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
10400      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
10401      AS == AMDGPUAS::GLOBAL_ADDRESS ||
10402      AS == AMDGPUAS::FLAT_ADDRESS) {
10403 if (NumElements > 4)
10404 return SplitVectorLoad(Op, DAG);
10405 // v3 loads not supported on SI.
10406 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10407 return WidenOrSplitVectorLoad(Op, DAG);
10408
10409 // v3 and v4 loads are supported for private and global memory.
10410 return SDValue();
10411 }
10412 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10413 // Depending on the setting of the private_element_size field in the
10414 // resource descriptor, we can only make private accesses up to a certain
10415 // size.
10416 switch (Subtarget->getMaxPrivateElementSize()) {
10417 case 4: {
10418 SDValue Ops[2];
10419 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
10420 return DAG.getMergeValues(Ops, DL);
10421 }
10422 case 8:
10423 if (NumElements > 2)
10424 return SplitVectorLoad(Op, DAG);
10425 return SDValue();
10426 case 16:
10427 // Same as global/flat
10428 if (NumElements > 4)
10429 return SplitVectorLoad(Op, DAG);
10430 // v3 loads not supported on SI.
10431 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10432 return WidenOrSplitVectorLoad(Op, DAG);
10433
10434 return SDValue();
10435 default:
10436 llvm_unreachable("unsupported private_element_size");
10437 }
10438 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10439 unsigned Fast = 0;
10440 auto Flags = Load->getMemOperand()->getFlags();
10441    if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
10442                                           Load->getAlign(), Flags, &Fast) &&
10443 Fast > 1)
10444 return SDValue();
10445
10446 if (MemVT.isVector())
10447 return SplitVectorLoad(Op, DAG);
10448 }
10449
10450  if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10451                                      MemVT, *Load->getMemOperand())) {
10452 SDValue Ops[2];
10453 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
10454 return DAG.getMergeValues(Ops, DL);
10455 }
10456
10457 return SDValue();
10458}
10459
10460SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
10461 EVT VT = Op.getValueType();
10462 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
10463 VT.getSizeInBits() == 512)
10464 return splitTernaryVectorOp(Op, DAG);
10465
10466 assert(VT.getSizeInBits() == 64);
10467
10468 SDLoc DL(Op);
10469 SDValue Cond = Op.getOperand(0);
10470
10471 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
10472 SDValue One = DAG.getConstant(1, DL, MVT::i32);
10473
10474 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
10475 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
10476
10477 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
10478 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
10479
10480 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
10481
10482 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
10483 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
10484
10485 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
10486
10487 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
10488 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
10489}
10490
10491// Catch division cases where we can use shortcuts with rcp and rsq
10492// instructions.
10493SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
10494 SelectionDAG &DAG) const {
10495 SDLoc SL(Op);
10496 SDValue LHS = Op.getOperand(0);
10497 SDValue RHS = Op.getOperand(1);
10498 EVT VT = Op.getValueType();
10499 const SDNodeFlags Flags = Op->getFlags();
10500
10501  bool AllowInaccurateRcp = Flags.hasApproximateFuncs() ||
10502                            DAG.getTarget().Options.UnsafeFPMath;
10503
10504 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
10505 // Without !fpmath accuracy information, we can't do more because we don't
10506 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
10507 // f16 is always accurate enough
10508 if (!AllowInaccurateRcp && VT != MVT::f16)
10509 return SDValue();
10510
10511 if (CLHS->isExactlyValue(1.0)) {
10512 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
10513      // the CI documentation have a worst case error of 1 ulp.
10514 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
10515 // use it as long as we aren't trying to use denormals.
10516 //
10517      // v_rcp_f16 and v_rsq_f16 DO support denormals and have a 0.51 ulp error.
10518
10519 // 1.0 / sqrt(x) -> rsq(x)
10520
10521 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
10522 // error seems really high at 2^29 ULP.
10523 // 1.0 / x -> rcp(x)
10524 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10525 }
10526
10527 // Same as for 1.0, but expand the sign out of the constant.
10528 if (CLHS->isExactlyValue(-1.0)) {
10529 // -1.0 / x -> rcp (fneg x)
10530 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
10531 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
10532 }
10533 }
10534
10535 // For f16 require afn or arcp.
10536 // For f32 require afn.
10537 if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
10538 return SDValue();
10539
10540 // Turn into multiply by the reciprocal.
10541 // x / y -> x * (1.0 / y)
10542 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
10543 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
10544}
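// Hypothetical before/after for the shortcuts above (fast-math flags
// permitting):
//   fdiv afn float 1.0, %x    -->  AMDGPUISD::RCP %x
//   fdiv afn float -1.0, %x   -->  AMDGPUISD::RCP (fneg %x)
//   fdiv afn float %a, %b     -->  fmul %a, (AMDGPUISD::RCP %b)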
10545
10546SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
10547 SelectionDAG &DAG) const {
10548 SDLoc SL(Op);
10549 SDValue X = Op.getOperand(0);
10550 SDValue Y = Op.getOperand(1);
10551 EVT VT = Op.getValueType();
10552 const SDNodeFlags Flags = Op->getFlags();
10553
10554  bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||
10555                            DAG.getTarget().Options.UnsafeFPMath;
10556 if (!AllowInaccurateDiv)
10557 return SDValue();
10558
10559 SDValue NegY = DAG.getNode(ISD::FNEG, SL, VT, Y);
10560 SDValue One = DAG.getConstantFP(1.0, SL, VT);
10561
10562 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
10563 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10564
10565 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
10566 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
10567 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
10568 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
10569 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
10570 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
10571}
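// The chain of FMAs above is a Newton-Raphson refinement of r ~ 1/y plus a
// final residual correction; roughly (illustrative algebra, not extra code):
//   r1  = r0 + r0 * (1 - y*r0)      // rcp estimate refined once
//   r2  = r1 + r1 * (1 - y*r1)      // refined twice
//   q   = x * r2
//   ret = q + r2 * (x - y*q)        // correct the quotient's last bits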
10572
10573static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10574 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
10575 SDNodeFlags Flags) {
10576 if (GlueChain->getNumValues() <= 1) {
10577 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
10578 }
10579
10580 assert(GlueChain->getNumValues() == 3);
10581
10582 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10583 switch (Opcode) {
10584 default: llvm_unreachable("no chain equivalent for opcode");
10585 case ISD::FMUL:
10586 Opcode = AMDGPUISD::FMUL_W_CHAIN;
10587 break;
10588 }
10589
10590 return DAG.getNode(Opcode, SL, VTList,
10591 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
10592 Flags);
10593}
10594
10595static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
10596 EVT VT, SDValue A, SDValue B, SDValue C,
10597 SDValue GlueChain, SDNodeFlags Flags) {
10598 if (GlueChain->getNumValues() <= 1) {
10599 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
10600 }
10601
10602 assert(GlueChain->getNumValues() == 3);
10603
10604 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
10605 switch (Opcode) {
10606 default: llvm_unreachable("no chain equivalent for opcode");
10607 case ISD::FMA:
10608 Opcode = AMDGPUISD::FMA_W_CHAIN;
10609 break;
10610 }
10611
10612 return DAG.getNode(Opcode, SL, VTList,
10613 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
10614 Flags);
10615}
10616
10617SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
10618 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10619 return FastLowered;
10620
10621 SDLoc SL(Op);
10622 SDValue Src0 = Op.getOperand(0);
10623 SDValue Src1 = Op.getOperand(1);
10624
10625 SDValue CvtSrc0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
10626 SDValue CvtSrc1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
10627
10628 SDValue RcpSrc1 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, CvtSrc1);
10629 SDValue Quot = DAG.getNode(ISD::FMUL, SL, MVT::f32, CvtSrc0, RcpSrc1);
10630
10631 SDValue FPRoundFlag = DAG.getTargetConstant(0, SL, MVT::i32);
10632 SDValue BestQuot = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot, FPRoundFlag);
10633
10634 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, BestQuot, Src1, Src0);
10635}
10636
10637// Faster 2.5 ULP division that does not support denormals.
10638SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
10639 SDNodeFlags Flags = Op->getFlags();
10640 SDLoc SL(Op);
10641 SDValue LHS = Op.getOperand(1);
10642 SDValue RHS = Op.getOperand(2);
10643
10644 SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
10645
10646 const APFloat K0Val(0x1p+96f);
10647 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
10648
10649 const APFloat K1Val(0x1p-32f);
10650 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
10651
10652 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10653
10654 EVT SetCCVT =
10655 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
10656
10657 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
10658
10659 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
10660
10661 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
10662
10663 // rcp does not support denormals.
10664 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
10665
10666 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
10667
10668 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
10669}
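// Worked example of the scaling above (hypothetical numbers): for |y| = 2^127
// the comparison against 2^96 selects r3 = 2^-32, so rcp sees |y| * 2^-32 =
// 2^95 and returns about 2^-95, a normal value. Without the scaling,
// rcp(2^127) would be about 2^-127, a denormal the instruction flushes to
// zero. The final fmul by r3 = 2^-32 restores the quotient:
//   x * 2^-95 * 2^-32 = x / 2^127.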
10670
10671// Returns immediate value for setting the F32 denorm mode when using the
10672// S_DENORM_MODE instruction.
10673static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
10674                                    const SIMachineFunctionInfo *Info,
10675 const GCNSubtarget *ST) {
10676 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
10677 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
10678 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
10679 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
10680}
10681
10682SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
10683 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
10684 return FastLowered;
10685
10686  // The selection matcher assumes anything with a chain selects to a
10687 // mayRaiseFPException machine instruction. Since we're introducing a chain
10688 // here, we need to explicitly report nofpexcept for the regular fdiv
10689 // lowering.
10690 SDNodeFlags Flags = Op->getFlags();
10691 Flags.setNoFPExcept(true);
10692
10693 SDLoc SL(Op);
10694 SDValue LHS = Op.getOperand(0);
10695 SDValue RHS = Op.getOperand(1);
10696
10697 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
10698
10699 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
10700
10701 SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10702 {RHS, RHS, LHS}, Flags);
10703 SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
10704 {LHS, RHS, LHS}, Flags);
10705
10706 // Denominator is scaled to not be denormal, so using rcp is ok.
10707 SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
10708 DenominatorScaled, Flags);
10709 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
10710 DenominatorScaled, Flags);
10711
10712 using namespace AMDGPU::Hwreg;
10713 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
10714 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
10715
10716 const MachineFunction &MF = DAG.getMachineFunction();
10717  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
10718  const DenormalMode DenormMode = Info->getMode().FP32Denormals;
10719
10720 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
10721 const bool HasDynamicDenormals =
10722 (DenormMode.Input == DenormalMode::Dynamic) ||
10723 (DenormMode.Output == DenormalMode::Dynamic);
10724
10725 SDValue SavedDenormMode;
10726
10727 if (!PreservesDenormals) {
10728 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
10729 // lowering. The chain dependence is insufficient, and we need glue. We do
10730 // not need the glue variants in a strictfp function.
10731
10732 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
10733
10734 SDValue Glue = DAG.getEntryNode();
10735 if (HasDynamicDenormals) {
10736 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
10737 DAG.getVTList(MVT::i32, MVT::Glue),
10738 {BitField, Glue});
10739 SavedDenormMode = SDValue(GetReg, 0);
10740
10741 Glue = DAG.getMergeValues(
10742 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
10743 }
10744
10745 SDNode *EnableDenorm;
10746 if (Subtarget->hasDenormModeInst()) {
10747 const SDValue EnableDenormValue =
10748 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
10749
10750 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
10751 EnableDenormValue)
10752 .getNode();
10753 } else {
10754 const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
10755 SL, MVT::i32);
10756 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
10757 {EnableDenormValue, BitField, Glue});
10758 }
10759
10760 SDValue Ops[3] = {
10761 NegDivScale0,
10762 SDValue(EnableDenorm, 0),
10763 SDValue(EnableDenorm, 1)
10764 };
10765
10766 NegDivScale0 = DAG.getMergeValues(Ops, SL);
10767 }
10768
10769 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
10770 ApproxRcp, One, NegDivScale0, Flags);
10771
10772 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
10773 ApproxRcp, Fma0, Flags);
10774
10775 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
10776 Fma1, Fma1, Flags);
10777
10778 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
10779 NumeratorScaled, Mul, Flags);
10780
10781 SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32,
10782 Fma2, Fma1, Mul, Fma2, Flags);
10783
10784 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
10785 NumeratorScaled, Fma3, Flags);
10786
10787 if (!PreservesDenormals) {
10788 SDNode *DisableDenorm;
10789 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
10790 const SDValue DisableDenormValue = getSPDenormModeValue(
10791 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
10792
10793 DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
10794 Fma4.getValue(1), DisableDenormValue,
10795 Fma4.getValue(2)).getNode();
10796 } else {
10797 assert(HasDynamicDenormals == (bool)SavedDenormMode);
10798 const SDValue DisableDenormValue =
10799 HasDynamicDenormals
10800 ? SavedDenormMode
10801 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
10802
10803 DisableDenorm = DAG.getMachineNode(
10804 AMDGPU::S_SETREG_B32, SL, MVT::Other,
10805 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
10806 }
10807
10808 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
10809 SDValue(DisableDenorm, 0), DAG.getRoot());
10810 DAG.setRoot(OutputChain);
10811 }
10812
10813 SDValue Scale = NumeratorScaled.getValue(1);
10814 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
10815 {Fma4, Fma1, Fma3, Scale}, Flags);
10816
10817 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
10818}
10819
10820SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
10821 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
10822 return FastLowered;
10823
10824 SDLoc SL(Op);
10825 SDValue X = Op.getOperand(0);
10826 SDValue Y = Op.getOperand(1);
10827
10828 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
10829
10830 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
10831
10832 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
10833
10834 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
10835
10836 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
10837
10838 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
10839
10840 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
10841
10842 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
10843
10844 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
10845
10846 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
10847 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
10848
10849 SDValue Fma4 = DAG.getNode(ISD::FMA, SL, MVT::f64,
10850 NegDivScale0, Mul, DivScale1);
10851
10852 SDValue Scale;
10853
10854 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
10855 // Workaround a hardware bug on SI where the condition output from div_scale
10856 // is not usable.
10857
10858 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
10859
10860    // Figure out which scale to use for div_fmas.
10861 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
10862 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
10863 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
10864 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
10865
10866 SDValue NumHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
10867 SDValue DenHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
10868
10869 SDValue Scale0Hi
10870 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
10871 SDValue Scale1Hi
10872 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
10873
10874 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
10875 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
10876 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
10877 } else {
10878 Scale = DivScale1.getValue(1);
10879 }
10880
10881 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64,
10882 Fma4, Fma3, Mul, Scale);
10883
10884 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
10885}
10886
10887SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
10888 EVT VT = Op.getValueType();
10889
10890 if (VT == MVT::f32)
10891 return LowerFDIV32(Op, DAG);
10892
10893 if (VT == MVT::f64)
10894 return LowerFDIV64(Op, DAG);
10895
10896 if (VT == MVT::f16)
10897 return LowerFDIV16(Op, DAG);
10898
10899 llvm_unreachable("Unexpected type for fdiv");
10900}
10901
10902SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
10903 SDLoc dl(Op);
10904 SDValue Val = Op.getOperand(0);
10905 EVT VT = Val.getValueType();
10906 EVT ResultExpVT = Op->getValueType(1);
10907 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
10908
10909 SDValue Mant = DAG.getNode(
10910      ISD::INTRINSIC_WO_CHAIN, dl, VT,
10911      DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
10912
10913 SDValue Exp = DAG.getNode(
10914 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
10915 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
10916
10917 if (Subtarget->hasFractBug()) {
10918 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
10919 SDValue Inf = DAG.getConstantFP(
10920        APFloat::getInf(SelectionDAG::EVTToAPFloatSemantics(VT)), dl, VT);
10921
10922 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
10923 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
10924 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
10925 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
10926 }
10927
10928 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
10929 return DAG.getMergeValues({Mant, CastExp}, dl);
10930}
10931
10932SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
10933 SDLoc DL(Op);
10934 StoreSDNode *Store = cast<StoreSDNode>(Op);
10935 EVT VT = Store->getMemoryVT();
10936
10937 if (VT == MVT::i1) {
10938 return DAG.getTruncStore(Store->getChain(), DL,
10939 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
10940 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
10941 }
10942
10943 assert(VT.isVector() &&
10944 Store->getValue().getValueType().getScalarType() == MVT::i32);
10945
10946 unsigned AS = Store->getAddressSpace();
10947 if (Subtarget->hasLDSMisalignedBug() &&
10948 AS == AMDGPUAS::FLAT_ADDRESS &&
10949 Store->getAlign().value() < VT.getStoreSize() && VT.getSizeInBits() > 32) {
10950 return SplitVectorStore(Op, DAG);
10951 }
10952
10953  MachineFunction &MF = DAG.getMachineFunction();
10954  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
10955  // If there is a possibility that flat instruction access scratch memory
10956  // then we need to use the same legalization rules we use for private.
10957  if (AS == AMDGPUAS::FLAT_ADDRESS &&
10958      !Subtarget->hasMultiDwordFlatScratchAddressing())
10959    AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI) ?
10960         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
10961
10962 unsigned NumElements = VT.getVectorNumElements();
10963 if (AS == AMDGPUAS::GLOBAL_ADDRESS ||
10964 AS == AMDGPUAS::FLAT_ADDRESS) {
10965 if (NumElements > 4)
10966 return SplitVectorStore(Op, DAG);
10967 // v3 stores not supported on SI.
10968 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
10969 return SplitVectorStore(Op, DAG);
10970
10971    if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
10972                                        VT, *Store->getMemOperand()))
10973 return expandUnalignedStore(Store, DAG);
10974
10975 return SDValue();
10976 } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
10977 switch (Subtarget->getMaxPrivateElementSize()) {
10978 case 4:
10979 return scalarizeVectorStore(Store, DAG);
10980 case 8:
10981 if (NumElements > 2)
10982 return SplitVectorStore(Op, DAG);
10983 return SDValue();
10984 case 16:
10985 if (NumElements > 4 ||
10986 (NumElements == 3 && !Subtarget->enableFlatScratch()))
10987 return SplitVectorStore(Op, DAG);
10988 return SDValue();
10989 default:
10990 llvm_unreachable("unsupported private_element_size");
10991 }
10992 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
10993 unsigned Fast = 0;
10994 auto Flags = Store->getMemOperand()->getFlags();
10995    if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
10996                                           Store->getAlign(), Flags, &Fast) &&
10997 Fast > 1)
10998 return SDValue();
10999
11000 if (VT.isVector())
11001 return SplitVectorStore(Op, DAG);
11002
11003 return expandUnalignedStore(Store, DAG);
11004 }
11005
11006 // Probably an invalid store. If so we'll end up emitting a selection error.
11007 return SDValue();
11008}
11009
11010// Avoid the full correct expansion for f32 sqrt when promoting from f16.
11011SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
11012 SDLoc SL(Op);
11013 assert(!Subtarget->has16BitInsts());
11014 SDNodeFlags Flags = Op->getFlags();
11015 SDValue Ext =
11016 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
11017
11018 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
11019 SDValue Sqrt =
11020 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
11021
11022 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
11023 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
11024}
11025
11026SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
11027 SDLoc DL(Op);
11028 SDNodeFlags Flags = Op->getFlags();
11029 MVT VT = Op.getValueType().getSimpleVT();
11030 const SDValue X = Op.getOperand(0);
11031
11032 if (allowApproxFunc(DAG, Flags)) {
11033 // Instruction is 1ulp but ignores denormals.
11034 return DAG.getNode(
11035        ISD::INTRINSIC_WO_CHAIN, DL, VT,
11036        DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
11037 }
11038
11039 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
11040 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
11041
11042 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
11043
11044 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
11045
11046 SDValue SqrtX =
11047 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
11048
11049 SDValue SqrtS;
11050 if (needsDenormHandlingF32(DAG, X, Flags)) {
11051 SDValue SqrtID =
11052 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
11053 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
11054
11055 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
11056 SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11057 DAG.getConstant(-1, DL, MVT::i32));
11058 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
11059
11060 SDValue NegSqrtSNextDown =
11061 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
11062
11063 SDValue SqrtVP =
11064 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
11065
11066 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
11067 DAG.getConstant(1, DL, MVT::i32));
11068 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
11069
11070 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
11071 SDValue SqrtVS =
11072 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
11073
11074 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
11075 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
11076
11077 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
11078 Flags);
11079
11080 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
11081 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
11082 Flags);
11083 } else {
11084 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
11085
11086 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
11087
11088 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
11089 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
11090 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
11091
11092 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
11093 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
11094 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
11095
11096 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
11097 SDValue SqrtD =
11098 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
11099 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
11100 }
11101
11102 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
11103
11104 SDValue ScaledDown =
11105 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
11106
11107 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
11108 SDValue IsZeroOrInf =
11109 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11110 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11111
11112 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
11113}
11114
11115SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
11116  // For double type, the SQRT and RSQ instructions don't have the required
11117  // precision, so we apply Goldschmidt's algorithm to improve the result:
11118 //
11119 // y0 = rsq(x)
11120 // g0 = x * y0
11121 // h0 = 0.5 * y0
11122 //
11123 // r0 = 0.5 - h0 * g0
11124 // g1 = g0 * r0 + g0
11125 // h1 = h0 * r0 + h0
11126 //
11127 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
11128 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
11129 // h2 = h1 * r1 + h1
11130 //
11131 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
11132 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
11133 //
11134 // sqrt(x) = g3
11135
11136 SDNodeFlags Flags = Op->getFlags();
11137
11138 SDLoc DL(Op);
11139
11140 SDValue X = Op.getOperand(0);
11141 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
11142
11143 SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
11144
11145 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
11146
11147 // Scale up input if it is too small.
11148 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
11149 SDValue ScaleUp =
11150 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
11151 SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
11152
11153 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
11154
11155 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
11156
11157 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
11158 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
11159
11160 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
11161 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
11162
11163 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
11164
11165 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
11166
11167 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
11168 SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
11169
11170 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
11171
11172 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
11173 SDValue SqrtD1 =
11174 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
11175
11176 SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
11177
11178 SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32);
11179 SDValue ScaleDown =
11180 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
11181 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
11182
11183 // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
11184 // with finite only or nsz because rsq(+/-0) = +/-inf
11185
11186 // TODO: Check for DAZ and expand to subnormals
11187 SDValue IsZeroOrInf =
11188 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
11189 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
11190
11191 // If x is +INF, +0, or -0, use its original value
11192 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
11193 Flags);
11194}
11195
11196SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
11197 SDLoc DL(Op);
11198 EVT VT = Op.getValueType();
11199 SDValue Arg = Op.getOperand(0);
11200 SDValue TrigVal;
11201
11202 // Propagate fast-math flags so that the multiply we introduce can be folded
11203 // if Arg is already the result of a multiply by constant.
11204 auto Flags = Op->getFlags();
11205
11206 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
11207
11208 if (Subtarget->hasTrigReducedRange()) {
11209 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11210 TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
11211 } else {
11212 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
11213 }
11214
11215 switch (Op.getOpcode()) {
11216 case ISD::FCOS:
11217 return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
11218 case ISD::FSIN:
11219 return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
11220 default:
11221 llvm_unreachable("Wrong trig opcode");
11222 }
11223}
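// Hypothetical example of the lowering above: on targets with the reduced
// trig range, fsin float %x becomes
//   %t = fmul float %x, 0.15915494    ; 1/(2*pi)
//   %f = AMDGPUISD::FRACT %t
//   %r = AMDGPUISD::SIN_HW %f
// and without that restriction the FRACT step is omitted.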
11224
11225SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const {
11226 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
11227 assert(AtomicNode->isCompareAndSwap());
11228 unsigned AS = AtomicNode->getAddressSpace();
11229
11230 // No custom lowering required for local address space
11231  if (!AMDGPU::isFlatGlobalAddrSpace(AS))
11232    return Op;
11233
11234 // Non-local address space requires custom lowering for atomic compare
11235 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
11236 SDLoc DL(Op);
11237 SDValue ChainIn = Op.getOperand(0);
11238 SDValue Addr = Op.getOperand(1);
11239 SDValue Old = Op.getOperand(2);
11240 SDValue New = Op.getOperand(3);
11241 EVT VT = Op.getValueType();
11242 MVT SimpleVT = VT.getSimpleVT();
11243 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
11244
11245 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
11246 SDValue Ops[] = { ChainIn, Addr, NewOld };
11247
11248 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL, Op->getVTList(),
11249 Ops, VT, AtomicNode->getMemOperand());
11250}
11251
11252//===----------------------------------------------------------------------===//
11253// Custom DAG optimizations
11254//===----------------------------------------------------------------------===//
11255
11256SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
11257 DAGCombinerInfo &DCI) const {
11258 EVT VT = N->getValueType(0);
11259 EVT ScalarVT = VT.getScalarType();
11260 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11261 return SDValue();
11262
11263 SelectionDAG &DAG = DCI.DAG;
11264 SDLoc DL(N);
11265
11266 SDValue Src = N->getOperand(0);
11267 EVT SrcVT = Src.getValueType();
11268
11269 // TODO: We could try to match extracting the higher bytes, which would be
11270 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
11271 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
11272 // about in practice.
11273 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11274 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
11275 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
11276 DCI.AddToWorklist(Cvt.getNode());
11277
11278 // For the f16 case, fold to a cast to f32 and then cast back to f16.
11279 if (ScalarVT != MVT::f32) {
11280 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
11281 DAG.getTargetConstant(0, DL, MVT::i32));
11282 }
11283 return Cvt;
11284 }
11285 }
11286
11287 return SDValue();
11288}
11289
11290SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
11291 DAGCombinerInfo &DCI) const {
11292 SDValue MagnitudeOp = N->getOperand(0);
11293 SDValue SignOp = N->getOperand(1);
11294 SelectionDAG &DAG = DCI.DAG;
11295 SDLoc DL(N);
11296
11297 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
11298 // lower half with a copy.
11299 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
11300 if (MagnitudeOp.getValueType() == MVT::f64) {
11301 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
11302 SDValue MagLo =
11303 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11304 DAG.getConstant(0, DL, MVT::i32));
11305 SDValue MagHi =
11306 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
11307 DAG.getConstant(1, DL, MVT::i32));
11308
11309 SDValue HiOp =
11310 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
11311
11312 SDValue Vector = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
11313
11314 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
11315 }
11316
11317 if (SignOp.getValueType() != MVT::f64)
11318 return SDValue();
11319
11320 // Reduce width of sign operand, we only need the highest bit.
11321 //
11322 // fcopysign f64:x, f64:y ->
11323 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
11324 // TODO: In some cases it might make sense to go all the way to f16.
11325 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11326 SDValue SignAsF32 =
11327 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11328 DAG.getConstant(1, DL, MVT::i32));
11329
11330 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11331 SignAsF32);
11332}
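// Hypothetical example for the f64 magnitude case above:
//   fcopysign f64 %x, %y
// becomes, in terms of the 32-bit halves,
//   hi  = fcopysign f32 extract(%x, 1), %y
//   res = bitcast v2f32 {extract(%x, 0), hi} to f64
// i.e. only the word holding the sign bit is touched; the low word is a copy.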
11333
11334// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
11335// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
11336// bits
11337
11338// This is a variant of
11339// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
11340//
11341// The normal DAG combiner will do this, but only if the add has one use since
11342// that would increase the number of instructions.
11343//
11344// This prevents us from seeing a constant offset that can be folded into a
11345// memory instruction's addressing mode. If we know the resulting add offset of
11346// a pointer can be folded into an addressing offset, we can replace the pointer
11347// operand with the add of new constant offset. This eliminates one of the uses,
11348// and may allow the remaining use to also be simplified.
11349//
11350SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
11351 unsigned AddrSpace,
11352 EVT MemVT,
11353 DAGCombinerInfo &DCI) const {
11354 SDValue N0 = N->getOperand(0);
11355 SDValue N1 = N->getOperand(1);
11356
11357 // We only do this to handle cases where it's profitable when there are
11358 // multiple uses of the add, so defer to the standard combine.
11359 if ((N0.getOpcode() != ISD::ADD && N0.getOpcode() != ISD::OR) ||
11360 N0->hasOneUse())
11361 return SDValue();
11362
11363 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
11364 if (!CN1)
11365 return SDValue();
11366
11367 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
11368 if (!CAdd)
11369 return SDValue();
11370
11371 SelectionDAG &DAG = DCI.DAG;
11372
11373 if (N0->getOpcode() == ISD::OR &&
11374 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
11375 return SDValue();
11376
11377 // If the resulting offset is too large, we can't fold it into the
11378 // addressing mode offset.
11379 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
11380 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
11381
11382 AddrMode AM;
11383 AM.HasBaseReg = true;
11384 AM.BaseOffs = Offset.getSExtValue();
11385 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
11386 return SDValue();
11387
11388 SDLoc SL(N);
11389 EVT VT = N->getValueType(0);
11390
11391 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
11392 SDValue COffset = DAG.getConstant(Offset, SL, VT);
11393
11393
11394  SDNodeFlags Flags;
11395  Flags.setNoUnsignedWrap(N->getFlags().hasNoUnsignedWrap() &&
11396 (N0.getOpcode() == ISD::OR ||
11397 N0->getFlags().hasNoUnsignedWrap()));
11398
11399 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
11400}
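// Hypothetical input/output for the combine above: with a multi-use add
//   (shl (add %x, 16), 2)
// used as a memory address, the rewrite produces
//   (add (shl %x, 2), 64)
// and, when isLegalAddressingMode agrees, the constant 64 can then be folded
// into the instruction's addressing-mode offset instead of needing a register.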
11401
11402/// MemSDNode::getBasePtr() does not work for intrinsics, which need to be offset
11403/// by the chain and intrinsic ID. Theoretically we would also need to check the
11404/// specific intrinsic, but they all place the pointer operand first.
11405static unsigned getBasePtrIndex(const MemSDNode *N) {
11406 switch (N->getOpcode()) {
11407 case ISD::STORE:
11408  case ISD::INTRINSIC_W_CHAIN:
11409  case ISD::INTRINSIC_VOID:
11410    return 2;
11411 default:
11412 return 1;
11413 }
11414}
11415
11416SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
11417 DAGCombinerInfo &DCI) const {
11418 SelectionDAG &DAG = DCI.DAG;
11419 SDLoc SL(N);
11420
11421 unsigned PtrIdx = getBasePtrIndex(N);
11422 SDValue Ptr = N->getOperand(PtrIdx);
11423
11424 // TODO: We could also do this for multiplies.
11425 if (Ptr.getOpcode() == ISD::SHL) {
11426 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
11427 N->getMemoryVT(), DCI);
11428 if (NewPtr) {
11429 SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
11430
11431 NewOps[PtrIdx] = NewPtr;
11432 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
11433 }
11434 }
11435
11436 return SDValue();
11437}
11438
11439static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
11440 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
11441 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
11442 (Opc == ISD::XOR && Val == 0);
11443}
11444
11445// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
11446// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
11447// integer combine opportunities since most 64-bit operations are decomposed
11448// this way. TODO: We won't want this for SALU especially if it is an inline
11449// immediate.
11450SDValue SITargetLowering::splitBinaryBitConstantOp(
11451 DAGCombinerInfo &DCI,
11452 const SDLoc &SL,
11453 unsigned Opc, SDValue LHS,
11454 const ConstantSDNode *CRHS) const {
11455 uint64_t Val = CRHS->getZExtValue();
11456 uint32_t ValLo = Lo_32(Val);
11457 uint32_t ValHi = Hi_32(Val);
11458  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11459
11460 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
11461 bitOpWithConstantIsReducible(Opc, ValHi)) ||
11462 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
11463 // If we need to materialize a 64-bit immediate, it will be split up later
11464 // anyway. Avoid creating the harder to understand 64-bit immediate
11465 // materialization.
11466 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
11467 }
11468
11469 return SDValue();
11470}
11471
11472bool llvm::isBoolSGPR(SDValue V) {
11473  if (V.getValueType() != MVT::i1)
11474 return false;
11475 switch (V.getOpcode()) {
11476 default:
11477 break;
11478 case ISD::SETCC:
11479  case AMDGPUISD::FP_CLASS:
11480    return true;
11481 case ISD::AND:
11482 case ISD::OR:
11483 case ISD::XOR:
11484 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
11485 }
11486 return false;
11487}
11488
11489// If a constant has all zeroes or all ones within each byte return it.
11490// Otherwise return 0.
11491static uint32_t getConstantPermuteMask(uint32_t C) {
11492  // 0xff for any zero byte in the mask
11493 uint32_t ZeroByteMask = 0;
11494 if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
11495 if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
11496 if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
11497 if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
11498 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
11499 if ((NonZeroByteMask & C) != NonZeroByteMask)
11500 return 0; // Partial bytes selected.
11501 return C;
11502}
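// Example values (illustrative): 0x00ff00ff is returned unchanged because
// every byte is either 0x00 or 0xff, while 0x00f000ff returns 0 because the
// 0xf0 byte is only partially selected.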
11503
11504// Check if a node selects whole bytes from its operand 0 starting at a byte
11505// boundary while masking the rest. Returns select mask as in the v_perm_b32
11506// or ~0 if it does not succeed.
11507// Note byte select encoding:
11508// value 0-3 selects corresponding source byte;
11509// value 0xc selects zero;
11510// value 0xff selects 0xff.
11511static uint32_t getPermuteMask(SDValue V) {
11512  assert(V.getValueSizeInBits() == 32);
11513
11514 if (V.getNumOperands() != 2)
11515 return ~0;
11516
11517 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
11518 if (!N1)
11519 return ~0;
11520
11521 uint32_t C = N1->getZExtValue();
11522
11523 switch (V.getOpcode()) {
11524 default:
11525 break;
11526 case ISD::AND:
11527 if (uint32_t ConstMask = getConstantPermuteMask(C))
11528 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
11529 break;
11530
11531 case ISD::OR:
11532 if (uint32_t ConstMask = getConstantPermuteMask(C))
11533 return (0x03020100 & ~ConstMask) | ConstMask;
11534 break;
11535
11536 case ISD::SHL:
11537 if (C % 8)
11538 return ~0;
11539
11540 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
11541
11542 case ISD::SRL:
11543 if (C % 8)
11544 return ~0;
11545
11546 return uint32_t(0x0c0c0c0c03020100ull >> C);
11547 }
11548
11549 return ~0;
11550}
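// Illustrative results of the mapping above:
//   (and x, 0x0000ffff)  ->  0x0c0c0100   // bytes 1:0 from x, high bytes zero
//   (shl x, 16)          ->  0x01000c0c   // x's bytes 1:0 moved to bytes 3:2
//   (srl x, 8)           ->  0x0c030201   // bytes 3:1 of x shifted down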
11551
11552SDValue SITargetLowering::performAndCombine(SDNode *N,
11553 DAGCombinerInfo &DCI) const {
11554 if (DCI.isBeforeLegalize())
11555 return SDValue();
11556
11557 SelectionDAG &DAG = DCI.DAG;
11558 EVT VT = N->getValueType(0);
11559 SDValue LHS = N->getOperand(0);
11560 SDValue RHS = N->getOperand(1);
11561
11562
11563 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
11564 if (VT == MVT::i64 && CRHS) {
11565 if (SDValue Split
11566 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
11567 return Split;
11568 }
11569
11570 if (CRHS && VT == MVT::i32) {
11571 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
11572 // nb = number of trailing zeroes in mask
11573 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
11574 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
11575 uint64_t Mask = CRHS->getZExtValue();
11576 unsigned Bits = llvm::popcount(Mask);
11577 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
11578 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
11579 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
11580 unsigned Shift = CShift->getZExtValue();
11581 unsigned NB = CRHS->getAPIntValue().countr_zero();
11582 unsigned Offset = NB + Shift;
11583 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
11584 SDLoc SL(N);
11585 SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
11586 LHS->getOperand(0),
11587 DAG.getConstant(Offset, SL, MVT::i32),
11588 DAG.getConstant(Bits, SL, MVT::i32));
11589 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
11590 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
11591 DAG.getValueType(NarrowVT));
11592 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
11593 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
11594 return Shl;
11595 }
11596 }
11597 }
11598
11599 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
11600 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
11601 isa<ConstantSDNode>(LHS.getOperand(2))) {
11602 uint32_t Sel = getConstantPermuteMask(Mask);
11603 if (!Sel)
11604 return SDValue();
11605
11606 // Select 0xc for all zero bytes
11607 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11608 SDLoc DL(N);
11609 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
11610 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
11611 }
11612 }
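  // Worked example of the srl+and transform above (hypothetical values):
  //   (and (srl %x, 4), 0xff0)
  // selects bits 8..15 of %x into bit positions 4..11, so it is rewritten as
  //   (shl (AssertZext (BFE_U32 %x, 8, 8), i8), 4)
  // which the SDWA peephole can later turn into a sub-dword operand access.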
11613
11614 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
11615 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
11616 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
11617 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11618 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
11619
11620 SDValue X = LHS.getOperand(0);
11621 SDValue Y = RHS.getOperand(0);
11622 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
11623 !isTypeLegal(X.getValueType()))
11624 return SDValue();
11625
11626 if (LCC == ISD::SETO) {
11627 if (X != LHS.getOperand(1))
11628 return SDValue();
11629
11630 if (RCC == ISD::SETUNE) {
11631 const ConstantFPSDNode *C1 = dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
11632 if (!C1 || !C1->isInfinity() || C1->isNegative())
11633 return SDValue();
11634
11635        const uint32_t Mask = SIInstrFlags::N_NORMAL |
11636                              SIInstrFlags::N_SUBNORMAL |
11637                              SIInstrFlags::N_ZERO |
11638                              SIInstrFlags::P_ZERO |
11639                              SIInstrFlags::P_SUBNORMAL |
11640                              SIInstrFlags::P_NORMAL;
11641
11642        static_assert(((~(SIInstrFlags::S_NAN |
11643                          SIInstrFlags::Q_NAN |
11644                          SIInstrFlags::N_INFINITY |
11645                          SIInstrFlags::P_INFINITY)) & 0x3ff) == Mask,
11646 "mask not equal");
11647
11648 SDLoc DL(N);
11649 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
11650 X, DAG.getConstant(Mask, DL, MVT::i32));
11651 }
11652 }
11653 }
11654
11655 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
11656 std::swap(LHS, RHS);
11657
11658 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
11659 RHS.hasOneUse()) {
11660 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
11661 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan | n_nan)
11662 // and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan | n_nan)
11663 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
11664 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
11665 (RHS.getOperand(0) == LHS.getOperand(0) &&
11666 LHS.getOperand(0) == LHS.getOperand(1))) {
11667 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
11668 unsigned NewMask = LCC == ISD::SETO ?
11669 Mask->getZExtValue() & ~OrdMask :
11670 Mask->getZExtValue() & OrdMask;
11671
11672 SDLoc DL(N);
11673 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
11674 DAG.getConstant(NewMask, DL, MVT::i32));
11675 }
11676 }
11677
11678 if (VT == MVT::i32 &&
11679 (RHS.getOpcode() == ISD::SIGN_EXTEND || LHS.getOpcode() == ISD::SIGN_EXTEND)) {
11680 // and x, (sext cc from i1) => select cc, x, 0
11681 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
11682 std::swap(LHS, RHS);
11683 if (isBoolSGPR(RHS.getOperand(0)))
11684 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0),
11685 LHS, DAG.getConstant(0, SDLoc(N), MVT::i32));
11686 }
11687
11688 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
11689 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
11690 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
11691 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
11692 uint32_t LHSMask = getPermuteMask(LHS);
11693 uint32_t RHSMask = getPermuteMask(RHS);
11694 if (LHSMask != ~0u && RHSMask != ~0u) {
11695 // Canonicalize the expression in an attempt to have fewer unique masks
11696 // and therefore fewer registers used to hold the masks.
11697 if (LHSMask > RHSMask) {
11698 std::swap(LHSMask, RHSMask);
11699 std::swap(LHS, RHS);
11700 }
11701
11702 // Select 0xc for each lane used from the source operand. Zero has the 0xc
11703 // mask set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
11704 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11705 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
11706
11707 // Check if we need to combine values from two sources within a byte.
11708 if (!(LHSUsedLanes & RHSUsedLanes) &&
11709 // If we select the high and low words, keep it for SDWA.
11710 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
11711 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
11712 // Each byte in each mask is either a selector in the 0-3 range, or has
11713 // higher bits set (0xff for a 0xff byte, 0x0c for a zero byte). If 0x0c
11714 // appears in either mask it must win; otherwise the mask that is not 0xff
11715 // wins. ANDing both masks gives the correct result except that a byte
11716 // containing 0x0c must be forced back to exactly 0x0c.
11717 uint32_t Mask = LHSMask & RHSMask;
11718 for (unsigned I = 0; I < 32; I += 8) {
11719 uint32_t ByteSel = 0xff << I;
11720 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
11721 Mask &= (0x0c << I) & 0xffffffff;
11722 }
11723
11724 // Add 4 to each active LHS lane. It will not affect any existing 0xff
11725 // or 0x0c.
11726 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
11727 SDLoc DL(N);
11728
11729 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
11730 LHS.getOperand(0), RHS.getOperand(0),
11731 DAG.getConstant(Sel, DL, MVT::i32));
11732 }
11733 }
11734 }
11735
11736 return SDValue();
11737}
11738
11739 // A key component of v_perm is a mapping between the byte positions of the
11740 // src operands and the byte positions of the dest. To provide such a mapping,
11741 // we need: 1. the node that provides byte x of the dest of the OR, and 2. the
11742 // byte of that node used to provide byte x. calculateByteProvider finds which
11743 // node provides a certain byte of the dest of the OR, and calculateSrcByte
11744 // takes that node and finds the ultimate src and byte position. For example,
11745 // the supported LoadCombine pattern for vector loads is as follows:
11746// t1
11747// or
11748// / \
11749// t2 t3
11750// zext shl
11751// | | \
11752// t4 t5 16
11753// or anyext
11754// / \ |
11755// t6 t7 t8
11756// srl shl or
11757// / | / \ / \
11758// t9 t10 t11 t12 t13 t14
11759// trunc* 8 trunc* 8 and and
11760// | | / | | \
11761// t15 t16 t17 t18 t19 t20
11762// trunc* 255 srl -256
11763// | / \
11764// t15 t15 16
11765//
11766// *In this example, the truncs are from i32->i16
11767//
11768// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
11769// respectively. calculateSrcByte would find (given node) -> ultimate src &
11770 // byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
11771// After finding the mapping, we can combine the tree into vperm t15, t16,
11772// 0x05000407
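//
// Note on the selector encoding used below: each byte of the 32-bit perm mask
// selects one byte of the result. Selector values 4-7 address bytes 0-3 of the
// first source and values 0-3 address bytes 0-3 of the second source (see
// matchPERM), while 0x0c produces a constant 0x00 byte and 0xff produces a
// constant 0xff byte.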
11773
11774// Find the source and byte position from a node.
11775// \p DestByte is the byte position of the dest of the or that the src
11776// ultimately provides. \p SrcIndex is the byte of the src that maps to this
11777 // byte of the dest of the or. \p Depth tracks how many recursive iterations we have
11778// performed.
11779static const std::optional<ByteProvider<SDValue>>
11780calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
11781 unsigned Depth = 0) {
11782 // We may need to recursively traverse a series of SRLs
11783 if (Depth >= 6)
11784 return std::nullopt;
11785
11786 if (Op.getValueSizeInBits() < 8)
11787 return std::nullopt;
11788
11789 if (Op.getValueType().isVector())
11790 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11791
11792 switch (Op->getOpcode()) {
11793 case ISD::TRUNCATE: {
11794 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11795 }
11796
11797 case ISD::SIGN_EXTEND:
11798 case ISD::ZERO_EXTEND:
11799 case ISD::SIGN_EXTEND_INREG: {
11800 SDValue NarrowOp = Op->getOperand(0);
11801 auto NarrowVT = NarrowOp.getValueType();
11802 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
11803 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11804 NarrowVT = VTSign->getVT();
11805 }
11806 if (!NarrowVT.isByteSized())
11807 return std::nullopt;
11808 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
11809
11810 if (SrcIndex >= NarrowByteWidth)
11811 return std::nullopt;
11812 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11813 }
11814
11815 case ISD::SRA:
11816 case ISD::SRL: {
11817 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11818 if (!ShiftOp)
11819 return std::nullopt;
11820
11821 uint64_t BitShift = ShiftOp->getZExtValue();
11822
11823 if (BitShift % 8 != 0)
11824 return std::nullopt;
11825
11826 SrcIndex += BitShift / 8;
11827
11828 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
11829 }
11830
11831 default: {
11832 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
11833 }
11834 }
11835 llvm_unreachable("fully handled switch");
11836}
11837
11838// For a byte position in the result of an Or, traverse the tree and find the
11839// node (and the byte of the node) which ultimately provides this {Or,
11840// BytePosition}. \p Op is the operand we are currently examining. \p Index is
11841// the byte position of the Op that corresponds with the originally requested
11842 // byte of the Or. \p Depth tracks how many recursive iterations we have
11843// performed. \p StartingIndex is the originally requested byte of the Or
11844static const std::optional<ByteProvider<SDValue>>
11845calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
11846 unsigned StartingIndex = 0) {
11847 // Finding Src tree of RHS of or typically requires at least 1 additional
11848 // depth
11849 if (Depth > 6)
11850 return std::nullopt;
11851
11852 unsigned BitWidth = Op.getScalarValueSizeInBits();
11853 if (BitWidth % 8 != 0)
11854 return std::nullopt;
11855 if (Index > BitWidth / 8 - 1)
11856 return std::nullopt;
11857
11858 bool IsVec = Op.getValueType().isVector();
11859 switch (Op.getOpcode()) {
11860 case ISD::OR: {
11861 if (IsVec)
11862 return std::nullopt;
11863
11864 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
11865 StartingIndex);
11866 if (!RHS)
11867 return std::nullopt;
11868 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
11869 StartingIndex);
11870 if (!LHS)
11871 return std::nullopt;
11872 // A well formed Or will have two ByteProviders for each byte, one of which
11873 // is constant zero
11874 if (!LHS->isConstantZero() && !RHS->isConstantZero())
11875 return std::nullopt;
11876 if (!LHS || LHS->isConstantZero())
11877 return RHS;
11878 if (!RHS || RHS->isConstantZero())
11879 return LHS;
11880 return std::nullopt;
11881 }
11882
11883 case ISD::AND: {
11884 if (IsVec)
11885 return std::nullopt;
11886
11887 auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11888 if (!BitMaskOp)
11889 return std::nullopt;
11890
11891 uint32_t BitMask = BitMaskOp->getZExtValue();
11892 // Bits we expect for our StartingIndex
11893 uint32_t IndexMask = 0xFF << (Index * 8);
11894
11895 if ((IndexMask & BitMask) != IndexMask) {
11896 // If the result of the and partially provides the byte, then it
11897 // is not well formatted
11898 if (IndexMask & BitMask)
11899 return std::nullopt;
11900 return ByteProvider<SDValue>::getConstantZero();
11901 }
11902
11903 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
11904 }
11905
11906 case ISD::FSHR: {
11907 if (IsVec)
11908 return std::nullopt;
11909
11910 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
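// e.g. for fshr(X, Y, 8) on i32, result bytes 0-2 come from bytes 1-3 of Y and
// result byte 3 comes from byte 0 of X.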
11911 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
11912 if (!ShiftOp || Op.getValueType().isVector())
11913 return std::nullopt;
11914
11915 uint64_t BitsProvided = Op.getValueSizeInBits();
11916 if (BitsProvided % 8 != 0)
11917 return std::nullopt;
11918
11919 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
11920 if (BitShift % 8)
11921 return std::nullopt;
11922
11923 uint64_t ConcatSizeInBytes = BitsProvided / 4;
11924 uint64_t ByteShift = BitShift / 8;
11925
11926 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
11927 uint64_t BytesProvided = BitsProvided / 8;
11928 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
11929 NewIndex %= BytesProvided;
11930 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
11931 }
11932
11933 case ISD::SRA:
11934 case ISD::SRL: {
11935 if (IsVec)
11936 return std::nullopt;
11937
11938 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11939 if (!ShiftOp)
11940 return std::nullopt;
11941
11942 uint64_t BitShift = ShiftOp->getZExtValue();
11943 if (BitShift % 8)
11944 return std::nullopt;
11945
11946 auto BitsProvided = Op.getScalarValueSizeInBits();
11947 if (BitsProvided % 8 != 0)
11948 return std::nullopt;
11949
11950 uint64_t BytesProvided = BitsProvided / 8;
11951 uint64_t ByteShift = BitShift / 8;
11952 // The dest of the shift will have good bytes in [0 : BytesProvided - ByteShift).
11953 // If the byte we are trying to provide (as tracked by Index) falls in this
11954 // range, then the SRL provides the byte. The byte of interest of the src of
11955 // the SRL is Index + ByteShift.
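// e.g. for (srl i32:x, 8), result byte 0 is byte 1 of x, while result byte 3
// is outside the provided range and yields std::nullopt.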
11956 return BytesProvided - ByteShift > Index
11957 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
11958 Index + ByteShift)
11959 : std::nullopt;
11960 }
11961
11962 case ISD::SHL: {
11963 if (IsVec)
11964 return std::nullopt;
11965
11966 auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
11967 if (!ShiftOp)
11968 return std::nullopt;
11969
11970 uint64_t BitShift = ShiftOp->getZExtValue();
11971 if (BitShift % 8 != 0)
11972 return std::nullopt;
11973 uint64_t ByteShift = BitShift / 8;
11974
11975 // If we are shifting by an amount greater than (or equal to)
11976 // the index we are trying to provide, then it provides 0s. If not,
11977 // then these bytes are not definitively 0s, and the corresponding byte
11978 // of interest is Index - ByteShift of the src
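// e.g. for (shl i32:x, 16), result bytes 0-1 are constant zero and result
// byte 2 comes from byte 0 of x.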
11979 return Index < ByteShift
11980 ? ByteProvider<SDValue>::getConstantZero()
11981 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
11982 Depth + 1, StartingIndex);
11983 }
11984 case ISD::ANY_EXTEND:
11985 case ISD::SIGN_EXTEND:
11986 case ISD::ZERO_EXTEND:
11987 case ISD::SIGN_EXTEND_INREG:
11988 case ISD::AssertZext:
11989 case ISD::AssertSext: {
11990 if (IsVec)
11991 return std::nullopt;
11992
11993 SDValue NarrowOp = Op->getOperand(0);
11994 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
11995 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
11996 Op->getOpcode() == ISD::AssertZext ||
11997 Op->getOpcode() == ISD::AssertSext) {
11998 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
11999 NarrowBitWidth = VTSign->getVT().getSizeInBits();
12000 }
12001 if (NarrowBitWidth % 8 != 0)
12002 return std::nullopt;
12003 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12004
12005 if (Index >= NarrowByteWidth)
12006 return Op.getOpcode() == ISD::ZERO_EXTEND
12007 ? std::optional<ByteProvider<SDValue>>(
12008 ByteProvider<SDValue>::getConstantZero())
12009 : std::nullopt;
12010 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
12011 }
12012
12013 case ISD::TRUNCATE: {
12014 if (IsVec)
12015 return std::nullopt;
12016
12017 uint64_t NarrowByteWidth = BitWidth / 8;
12018
12019 if (NarrowByteWidth >= Index) {
12020 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
12021 StartingIndex);
12022 }
12023
12024 return std::nullopt;
12025 }
12026
12027 case ISD::CopyFromReg: {
12028 if (BitWidth / 8 > Index)
12029 return calculateSrcByte(Op, StartingIndex, Index);
12030
12031 return std::nullopt;
12032 }
12033
12034 case ISD::LOAD: {
12035 auto L = cast<LoadSDNode>(Op.getNode());
12036
12037 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12038 if (NarrowBitWidth % 8 != 0)
12039 return std::nullopt;
12040 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12041
12042 // If the width of the load does not reach the byte we are trying to provide
12043 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
12044 // question
12045 if (Index >= NarrowByteWidth) {
12046 return L->getExtensionType() == ISD::ZEXTLOAD
12047 ? std::optional<ByteProvider<SDValue>>(
12048 ByteProvider<SDValue>::getConstantZero())
12049 : std::nullopt;
12050 }
12051
12052 if (NarrowByteWidth > Index) {
12053 return calculateSrcByte(Op, StartingIndex, Index);
12054 }
12055
12056 return std::nullopt;
12057 }
12058
12059 case ISD::BSWAP: {
12060 if (IsVec)
12061 return std::nullopt;
12062
12063 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
12064 Depth + 1, StartingIndex);
12065 }
12066
12067 case ISD::EXTRACT_VECTOR_ELT: {
12068 auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
12069 if (!IdxOp)
12070 return std::nullopt;
12071 auto VecIdx = IdxOp->getZExtValue();
12072 auto ScalarSize = Op.getScalarValueSizeInBits();
12073 if (ScalarSize != 32) {
12074 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
12075 }
12076
12077 return calculateSrcByte(ScalarSize == 32 ? Op : Op.getOperand(0),
12078 StartingIndex, Index);
12079 }
12080
12081 case AMDGPUISD::PERM: {
12082 if (IsVec)
12083 return std::nullopt;
12084
12085 auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
12086 if (!PermMask)
12087 return std::nullopt;
12088
12089 auto IdxMask =
12090 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
12091 if (IdxMask > 0x07 && IdxMask != 0x0c)
12092 return std::nullopt;
12093
12094 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12095 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12096
12097 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
12098 : ByteProvider<SDValue>(
12099 ByteProvider<SDValue>::getConstantZero());
12100 }
12101
12102 default: {
12103 return std::nullopt;
12104 }
12105 }
12106
12107 llvm_unreachable("fully handled switch");
12108}
12109
12111 // Returns true if the Operand is a scalar extended from a 16-bit value
12111static bool isExtendedFrom16Bits(SDValue &Operand) {
12112
12113 switch (Operand.getOpcode()) {
12114 case ISD::ANY_EXTEND:
12115 case ISD::SIGN_EXTEND:
12116 case ISD::ZERO_EXTEND: {
12117 auto OpVT = Operand.getOperand(0).getValueType();
12118 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
12119 }
12120 case ISD::LOAD: {
12121 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
12122 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12123 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
12124 ExtType == ISD::EXTLOAD) {
12125 auto MemVT = L->getMemoryVT();
12126 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
12127 }
12128 return L->getMemoryVT().getSizeInBits() == 16;
12129 }
12130 default:
12131 return false;
12132 }
12133}
12134
12135 // Returns true if the mask addresses consecutive bytes and the first byte
12136 // begins at an even (16-bit aligned) offset from the 0th byte
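// e.g. byte-selector pairs 0x0100 and 0x0504 address a whole 16-bit half,
// while 0x0201 starts at an odd byte and is rejected.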
12137static bool addresses16Bits(int Mask) {
12138 int Low8 = Mask & 0xff;
12139 int Hi8 = (Mask & 0xff00) >> 8;
12140
12141 assert(Low8 < 8 && Hi8 < 8);
12142 // Are the bytes contiguous in the order of increasing addresses.
12143 bool IsConsecutive = (Hi8 - Low8 == 1);
12144 // Is the first byte at a location that is aligned for 16-bit instructions?
12145 // A counterexample is taking 2 consecutive bytes starting at the 8th bit.
12146 // In this case, we still need code to extract the 16 bit operand, so it
12147 // is better to use i8 v_perm
12148 bool Is16Aligned = !(Low8 % 2);
12149
12150 return IsConsecutive && Is16Aligned;
12151}
12152
12153// Do not lower into v_perm if the operands are actually 16 bit
12154// and the selected bits (based on PermMask) correspond with two
12155// easily addressable 16 bit operands.
12156 static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
12157 SDValue &OtherOp) {
12158 int Low16 = PermMask & 0xffff;
12159 int Hi16 = (PermMask & 0xffff0000) >> 16;
12160
12161 auto TempOp = peekThroughBitcasts(Op);
12162 auto TempOtherOp = peekThroughBitcasts(OtherOp);
12163
12164 auto OpIs16Bit =
12165 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
12166 if (!OpIs16Bit)
12167 return true;
12168
12169 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12170 isExtendedFrom16Bits(TempOtherOp);
12171 if (!OtherOpIs16Bit)
12172 return true;
12173
12174 // Do we cleanly address both
12175 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
12176}
12177
12178 static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
12179 unsigned DWordOffset) {
12180 SDValue Ret;
12181
12182 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12183 // ByteProvider must be at least 8 bits
12184 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12185
12186 if (TypeSize <= 32)
12187 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
12188
12189 if (Src.getValueType().isVector()) {
12190 auto ScalarTySize = Src.getScalarValueSizeInBits();
12191 auto ScalarTy = Src.getValueType().getScalarType();
12192 if (ScalarTySize == 32) {
12193 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
12194 DAG.getConstant(DWordOffset, SL, MVT::i32));
12195 }
12196 if (ScalarTySize > 32) {
12197 Ret = DAG.getNode(
12198 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
12199 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12200 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12201 if (ShiftVal)
12202 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
12203 DAG.getConstant(ShiftVal, SL, MVT::i32));
12204 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12205 }
12206
12207 assert(ScalarTySize < 32);
12208 auto NumElements = TypeSize / ScalarTySize;
12209 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12210 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12211 auto NumElementsIn32 = 32 / ScalarTySize;
12212 auto NumAvailElements = DWordOffset < Trunc32Elements
12213 ? NumElementsIn32
12214 : NumElements - NormalizedTrunc;
12215
12216 SmallVector<SDValue, 4> VecSrcs;
12217 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
12218 NumAvailElements);
12219
12220 Ret = DAG.getBuildVector(
12221 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
12222 VecSrcs);
12223 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12224 }
12225
12226 /// Scalar Type
12227 auto ShiftVal = 32 * DWordOffset;
12228 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
12229 DAG.getConstant(ShiftVal, SL, MVT::i32));
12230 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
12231}
12232
12233 static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
12234 SelectionDAG &DAG = DCI.DAG;
12235 [[maybe_unused]] EVT VT = N->getValueType(0);
12236 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
12237
12238 // VT is known to be MVT::i32, so we need to provide 4 bytes.
12239 assert(VT == MVT::i32);
12240 for (int i = 0; i < 4; i++) {
12241 // Find the ByteProvider that provides the ith byte of the result of OR
12242 std::optional<ByteProvider<SDValue>> P =
12243 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
12244 // TODO support constantZero
12245 if (!P || P->isConstantZero())
12246 return SDValue();
12247
12248 PermNodes.push_back(*P);
12249 }
12250 if (PermNodes.size() != 4)
12251 return SDValue();
12252
12253 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12254 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12255 uint64_t PermMask = 0x00000000;
12256 for (size_t i = 0; i < PermNodes.size(); i++) {
12257 auto PermOp = PermNodes[i];
12258 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
12259 // by sizeof(Src2) = 4
12260 int SrcByteAdjust = 4;
12261
12262 // If the Src uses a byte from a different DWORD, then it corresponds
12263 // with a different source
12264 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12265 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12266 if (SecondSrc)
12267 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12268 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12269 return SDValue();
12270
12271 // Set the index of the second distinct Src node
12272 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12273 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12274 SrcByteAdjust = 0;
12275 }
12276 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12277 assert(!DAG.getDataLayout().isBigEndian());
12278 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12279 }
12280 SDLoc DL(N);
12281 SDValue Op = *PermNodes[FirstSrc.first].Src;
12282 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
12283 assert(Op.getValueSizeInBits() == 32);
12284
12285 // Check that we are not just extracting the bytes in order from an op
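// (Selectors 0x0100/0x0504 both name bytes 0-1 and 0x0302/0x0706 both name
// bytes 2-3 when only one distinct source is involved.)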
12286 if (!SecondSrc) {
12287 int Low16 = PermMask & 0xffff;
12288 int Hi16 = (PermMask & 0xffff0000) >> 16;
12289
12290 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12291 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12292
12293 // The perm op would really just produce Op. So combine into Op
12294 if (WellFormedLow && WellFormedHi)
12295 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
12296 }
12297
12298 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
12299
12300 if (SecondSrc) {
12301 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
12302 assert(OtherOp.getValueSizeInBits() == 32);
12303 }
12304
12305 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
12306
12307 assert(Op.getValueType().isByteSized() &&
12308 OtherOp.getValueType().isByteSized());
12309
12310 // If the ultimate src is less than 32 bits, then we will only be
12311 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
12312 // CalculateByteProvider would not have returned Op as source if we
12313 // used a byte that is outside its ValueType. Thus, we are free to
12314 // ANY_EXTEND as the extended bits are dont-cares.
12315 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
12316 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
12317
12318 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
12319 DAG.getConstant(PermMask, DL, MVT::i32));
12320 }
12321 return SDValue();
12322}
12323
12324SDValue SITargetLowering::performOrCombine(SDNode *N,
12325 DAGCombinerInfo &DCI) const {
12326 SelectionDAG &DAG = DCI.DAG;
12327 SDValue LHS = N->getOperand(0);
12328 SDValue RHS = N->getOperand(1);
12329
12330 EVT VT = N->getValueType(0);
12331 if (VT == MVT::i1) {
12332 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
12333 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
12334 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
12335 SDValue Src = LHS.getOperand(0);
12336 if (Src != RHS.getOperand(0))
12337 return SDValue();
12338
12339 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
12340 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
12341 if (!CLHS || !CRHS)
12342 return SDValue();
12343
12344 // Only 10 bits are used.
12345 static const uint32_t MaxMask = 0x3ff;
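// (The ten class bits, from low to high: s_nan, q_nan, -inf, -normal,
// -subnormal, -0, +0, +subnormal, +normal, +inf.)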
12346
12347 uint32_t NewMask = (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
12348 SDLoc DL(N);
12349 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1,
12350 Src, DAG.getConstant(NewMask, DL, MVT::i32));
12351 }
12352
12353 return SDValue();
12354 }
12355
12356 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
12357 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
12358 LHS.getOpcode() == AMDGPUISD::PERM &&
12359 isa<ConstantSDNode>(LHS.getOperand(2))) {
12360 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
12361 if (!Sel)
12362 return SDValue();
12363
12364 Sel |= LHS.getConstantOperandVal(2);
12365 SDLoc DL(N);
12366 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
12367 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
12368 }
12369
12370 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
12371 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12372 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
12373 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12374
12375 // If all the uses of an or need to extract the individual elements, do not
12376 // attempt to lower into v_perm
12377 auto usesCombinedOperand = [](SDNode *OrUse) {
12378 // If we have any non-vectorized use, then it is a candidate for v_perm
12379 if (OrUse->getOpcode() != ISD::BITCAST ||
12380 !OrUse->getValueType(0).isVector())
12381 return true;
12382
12383 // If we have any non-vectorized use, then it is a candidate for v_perm
12384 for (auto VUse : OrUse->uses()) {
12385 if (!VUse->getValueType(0).isVector())
12386 return true;
12387
12388 // If the use of a vector is a store, then combining via a v_perm
12389 // is beneficial.
12390 // TODO -- whitelist more uses
12391 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
12392 if (VUse->getOpcode() == VectorwiseOp)
12393 return true;
12394 }
12395 return false;
12396 };
12397
12398 if (!any_of(N->uses(), usesCombinedOperand))
12399 return SDValue();
12400
12401 uint32_t LHSMask = getPermuteMask(LHS);
12402 uint32_t RHSMask = getPermuteMask(RHS);
12403
12404 if (LHSMask != ~0u && RHSMask != ~0u) {
12405 // Canonicalize the expression in an attempt to have fewer unique masks
12406 // and therefore fewer registers used to hold the masks.
12407 if (LHSMask > RHSMask) {
12408 std::swap(LHSMask, RHSMask);
12409 std::swap(LHS, RHS);
12410 }
12411
12412 // Select 0xc for each lane used from the source operand. Zero has the 0xc
12413 // mask set, 0xff has 0xff in the mask, and actual lanes are in the 0-3 range.
12414 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12415 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12416
12417 // Check if we need to combine values from two sources within a byte.
12418 if (!(LHSUsedLanes & RHSUsedLanes) &&
12419 // If we select the high and low words, keep it for SDWA.
12420 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
12421 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12422 // Kill zero bytes selected by other mask. Zero value is 0xc.
12423 LHSMask &= ~RHSUsedLanes;
12424 RHSMask &= ~LHSUsedLanes;
12425 // Add 4 to each active LHS lane
12426 LHSMask |= LHSUsedLanes & 0x04040404;
12427 // Combine masks
12428 uint32_t Sel = LHSMask | RHSMask;
12429 SDLoc DL(N);
12430
12431 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32,
12432 LHS.getOperand(0), RHS.getOperand(0),
12433 DAG.getConstant(Sel, DL, MVT::i32));
12434 }
12435 }
12436 if (LHSMask == ~0u || RHSMask == ~0u) {
12437 if (SDValue Perm = matchPERM(N, DCI))
12438 return Perm;
12439 }
12440 }
12441
12442 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12443 return SDValue();
12444
12445 // TODO: This could be a generic combine with a predicate for extracting the
12446 // high half of an integer being free.
12447
12448 // (or i64:x, (zero_extend i32:y)) ->
12449 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
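// The high half of the zero-extended operand is known to be zero, so only the
// low 32 bits need a real OR and the high half of x passes through unchanged.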
12450 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
12451 RHS.getOpcode() != ISD::ZERO_EXTEND)
12452 std::swap(LHS, RHS);
12453
12454 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
12455 SDValue ExtSrc = RHS.getOperand(0);
12456 EVT SrcVT = ExtSrc.getValueType();
12457 if (SrcVT == MVT::i32) {
12458 SDLoc SL(N);
12459 SDValue LowLHS, HiBits;
12460 std::tie(LowLHS, HiBits) = split64BitValue(LHS, DAG);
12461 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
12462
12463 DCI.AddToWorklist(LowOr.getNode());
12464 DCI.AddToWorklist(HiBits.getNode());
12465
12466 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
12467 LowOr, HiBits);
12468 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
12469 }
12470 }
12471
12472 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
12473 if (CRHS) {
12474 if (SDValue Split
12475 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
12476 N->getOperand(0), CRHS))
12477 return Split;
12478 }
12479
12480 return SDValue();
12481}
12482
12483SDValue SITargetLowering::performXorCombine(SDNode *N,
12484 DAGCombinerInfo &DCI) const {
12485 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
12486 return RV;
12487
12488 SDValue LHS = N->getOperand(0);
12489 SDValue RHS = N->getOperand(1);
12490
12491 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
12492 SelectionDAG &DAG = DCI.DAG;
12493
12494 EVT VT = N->getValueType(0);
12495 if (CRHS && VT == MVT::i64) {
12496 if (SDValue Split
12497 = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
12498 return Split;
12499 }
12500
12501 // Make sure to apply the 64-bit constant splitting fold before trying to fold
12502 // fneg-like xors into 64-bit select.
12503 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
12504 // This looks like an fneg, try to fold as a source modifier.
12505 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
12506 shouldFoldFNegIntoSrc(N, LHS)) {
12507 // xor (select c, a, b), 0x80000000 ->
12508 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
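// XOR with the sign-bit mask flips only the sign bit, i.e. an fneg of the f32
// bit pattern, so the negation can instead be folded into both select arms.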
12509 SDLoc DL(N);
12510 SDValue CastLHS =
12511 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
12512 SDValue CastRHS =
12513 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
12514 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
12515 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
12516 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
12517 LHS->getOperand(0), FNegLHS, FNegRHS);
12518 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
12519 }
12520 }
12521
12522 return SDValue();
12523}
12524
12525SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
12526 DAGCombinerInfo &DCI) const {
12527 if (!Subtarget->has16BitInsts() ||
12528 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
12529 return SDValue();
12530
12531 EVT VT = N->getValueType(0);
12532 if (VT != MVT::i32)
12533 return SDValue();
12534
12535 SDValue Src = N->getOperand(0);
12536 if (Src.getValueType() != MVT::i16)
12537 return SDValue();
12538
12539 return SDValue();
12540}
12541
12542SDValue
12543SITargetLowering::performSignExtendInRegCombine(SDNode *N,
12544 DAGCombinerInfo &DCI) const {
12545 SDValue Src = N->getOperand(0);
12546 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
12547
12548 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
12549 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
12550 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
12551 VTSign->getVT() == MVT::i8) ||
12552 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
12553 VTSign->getVT() == MVT::i16))) {
12554 assert(Subtarget->hasScalarSubwordLoads() &&
12555 "s_buffer_load_{u8, i8} are supported "
12556 "in GFX12 (or newer) architectures.");
12557 EVT VT = Src.getValueType();
12558 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
12559 ? AMDGPUISD::SBUFFER_LOAD_BYTE
12560 : AMDGPUISD::SBUFFER_LOAD_SHORT;
12561 SDLoc DL(N);
12562 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12563 SDValue Ops[] = {
12564 Src.getOperand(0), // source register
12565 Src.getOperand(1), // offset
12566 Src.getOperand(2) // cachePolicy
12567 };
12568 auto *M = cast<MemSDNode>(Src);
12569 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12570 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
12571 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
12572 return LoadVal;
12573 } else if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
12574 VTSign->getVT() == MVT::i8) ||
12575 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
12576 VTSign->getVT() == MVT::i16)) &&
12577 Src.hasOneUse()) {
12578 auto *M = cast<MemSDNode>(Src);
12579 SDValue Ops[] = {
12580 Src.getOperand(0), // Chain
12581 Src.getOperand(1), // rsrc
12582 Src.getOperand(2), // vindex
12583 Src.getOperand(3), // voffset
12584 Src.getOperand(4), // soffset
12585 Src.getOperand(5), // offset
12586 Src.getOperand(6),
12587 Src.getOperand(7)
12588 };
12589 // replace with BUFFER_LOAD_BYTE/SHORT
12590 SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
12591 Src.getOperand(0).getValueType());
12592 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ?
12593 AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT;
12594 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N),
12595 ResList,
12596 Ops, M->getMemoryVT(),
12597 M->getMemOperand());
12598 return DCI.DAG.getMergeValues({BufferLoadSignExt,
12599 BufferLoadSignExt.getValue(1)}, SDLoc(N));
12600 }
12601 return SDValue();
12602}
12603
12604SDValue SITargetLowering::performClassCombine(SDNode *N,
12605 DAGCombinerInfo &DCI) const {
12606 SelectionDAG &DAG = DCI.DAG;
12607 SDValue Mask = N->getOperand(1);
12608
12609 // fp_class x, 0 -> false
12610 if (isNullConstant(Mask))
12611 return DAG.getConstant(0, SDLoc(N), MVT::i1);
12612
12613 if (N->getOperand(0).isUndef())
12614 return DAG.getUNDEF(MVT::i1);
12615
12616 return SDValue();
12617}
12618
12619SDValue SITargetLowering::performRcpCombine(SDNode *N,
12620 DAGCombinerInfo &DCI) const {
12621 EVT VT = N->getValueType(0);
12622 SDValue N0 = N->getOperand(0);
12623
12624 if (N0.isUndef()) {
12625 return DCI.DAG.getConstantFP(
12626 APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT)), SDLoc(N),
12627 VT);
12628 }
12629
12630 if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP ||
12631 N0.getOpcode() == ISD::SINT_TO_FP)) {
12632 return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0,
12633 N->getFlags());
12634 }
12635
12636 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
12637 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
12638 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
12639 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
12640 N0.getOperand(0), N->getFlags());
12641 }
12642
12644}
12645
12646 bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
12647 unsigned MaxDepth) const {
12648 unsigned Opcode = Op.getOpcode();
12649 if (Opcode == ISD::FCANONICALIZE)
12650 return true;
12651
12652 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
12653 const auto &F = CFP->getValueAPF();
12654 if (F.isNaN() && F.isSignaling())
12655 return false;
12656 if (!F.isDenormal())
12657 return true;
12658
12659 DenormalMode Mode =
12660 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
12661 return Mode == DenormalMode::getIEEE();
12662 }
12663
12664 // If source is a result of another standard FP operation it is already in
12665 // canonical form.
12666 if (MaxDepth == 0)
12667 return false;
12668
12669 switch (Opcode) {
12670 // These will flush denorms if required.
12671 case ISD::FADD:
12672 case ISD::FSUB:
12673 case ISD::FMUL:
12674 case ISD::FCEIL:
12675 case ISD::FFLOOR:
12676 case ISD::FMA:
12677 case ISD::FMAD:
12678 case ISD::FSQRT:
12679 case ISD::FDIV:
12680 case ISD::FREM:
12681 case ISD::FP_ROUND:
12682 case ISD::FP_EXTEND:
12683 case ISD::FP16_TO_FP:
12684 case ISD::FP_TO_FP16:
12685 case ISD::BF16_TO_FP:
12686 case ISD::FP_TO_BF16:
12687 case ISD::FLDEXP:
12690 case AMDGPUISD::RCP:
12691 case AMDGPUISD::RSQ:
12695 case AMDGPUISD::LOG:
12696 case AMDGPUISD::EXP:
12700 case AMDGPUISD::FRACT:
12707 case AMDGPUISD::SIN_HW:
12708 case AMDGPUISD::COS_HW:
12709 return true;
12710
12711 // It can/will be lowered or combined as a bit operation.
12712 // Need to check their input recursively to handle.
12713 case ISD::FNEG:
12714 case ISD::FABS:
12715 case ISD::FCOPYSIGN:
12716 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12717
12718 case ISD::AND:
12719 if (Op.getValueType() == MVT::i32) {
12720 // Be careful as we only know it is a bitcast floating point type. It
12721 // could be f32, v2f16, we have no way of knowing. Luckily the constant
12722 // value that we optimize for, which comes up in fp32 to bf16 conversions,
12723 // is valid to optimize for all types.
12724 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
12725 if (RHS->getZExtValue() == 0xffff0000) {
12726 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12727 }
12728 }
12729 }
12730 break;
12731
12732 case ISD::FSIN:
12733 case ISD::FCOS:
12734 case ISD::FSINCOS:
12735 return Op.getValueType().getScalarType() != MVT::f16;
12736
12737 case ISD::FMINNUM:
12738 case ISD::FMAXNUM:
12739 case ISD::FMINNUM_IEEE:
12740 case ISD::FMAXNUM_IEEE:
12741 case ISD::FMINIMUM:
12742 case ISD::FMAXIMUM:
12743 case AMDGPUISD::CLAMP:
12744 case AMDGPUISD::FMED3:
12745 case AMDGPUISD::FMAX3:
12746 case AMDGPUISD::FMIN3:
12747 case AMDGPUISD::FMAXIMUM3:
12748 case AMDGPUISD::FMINIMUM3: {
12749 // FIXME: Shouldn't treat the generic operations differently based on these.
12750 // However, we aren't really required to flush the result from
12751 // minnum/maxnum.
12752
12753 // snans will be quieted, so we only need to worry about denormals.
12754 if (Subtarget->supportsMinMaxDenormModes() ||
12755 // FIXME: denormalsEnabledForType is broken for dynamic
12756 denormalsEnabledForType(DAG, Op.getValueType()))
12757 return true;
12758
12759 // Flushing may be required.
12760 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
12762 // targets we need to check their input recursively.
12762
12763 // FIXME: Does this apply with clamp? It's implemented with max.
12764 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
12765 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
12766 return false;
12767 }
12768
12769 return true;
12770 }
12771 case ISD::SELECT: {
12772 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
12773 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
12774 }
12775 case ISD::BUILD_VECTOR: {
12776 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
12777 SDValue SrcOp = Op.getOperand(i);
12778 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
12779 return false;
12780 }
12781
12782 return true;
12783 }
12784 case ISD::EXTRACT_VECTOR_ELT:
12785 case ISD::EXTRACT_SUBVECTOR: {
12786 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12787 }
12788 case ISD::INSERT_VECTOR_ELT: {
12789 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
12790 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
12791 }
12792 case ISD::UNDEF:
12793 // Could be anything.
12794 return false;
12795
12796 case ISD::BITCAST:
12797 // TODO: This is incorrect as it loses track of the operand's type. We may
12798 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
12799 // same bits that are canonicalized in one type need not be in the other.
12800 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
12801 case ISD::TRUNCATE: {
12802 // Hack around the mess we make when legalizing extract_vector_elt
12803 if (Op.getValueType() == MVT::i16) {
12804 SDValue TruncSrc = Op.getOperand(0);
12805 if (TruncSrc.getValueType() == MVT::i32 &&
12806 TruncSrc.getOpcode() == ISD::BITCAST &&
12807 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
12808 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
12809 }
12810 }
12811 return false;
12812 }
12813 case ISD::INTRINSIC_WO_CHAIN: {
12814 unsigned IntrinsicID = Op.getConstantOperandVal(0);
12815 // TODO: Handle more intrinsics
12816 switch (IntrinsicID) {
12817 case Intrinsic::amdgcn_cvt_pkrtz:
12818 case Intrinsic::amdgcn_cubeid:
12819 case Intrinsic::amdgcn_frexp_mant:
12820 case Intrinsic::amdgcn_fdot2:
12821 case Intrinsic::amdgcn_rcp:
12822 case Intrinsic::amdgcn_rsq:
12823 case Intrinsic::amdgcn_rsq_clamp:
12824 case Intrinsic::amdgcn_rcp_legacy:
12825 case Intrinsic::amdgcn_rsq_legacy:
12826 case Intrinsic::amdgcn_trig_preop:
12827 case Intrinsic::amdgcn_log:
12828 case Intrinsic::amdgcn_exp2:
12829 case Intrinsic::amdgcn_sqrt:
12830 return true;
12831 default:
12832 break;
12833 }
12834
12835 break;
12836 }
12837 default:
12838 break;
12839 }
12840
12841 // FIXME: denormalsEnabledForType is broken for dynamic
12842 return denormalsEnabledForType(DAG, Op.getValueType()) &&
12843 DAG.isKnownNeverSNaN(Op);
12844}
12845
12846 bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
12847 unsigned MaxDepth) const {
12848 const MachineRegisterInfo &MRI = MF.getRegInfo();
12849 MachineInstr *MI = MRI.getVRegDef(Reg);
12850 unsigned Opcode = MI->getOpcode();
12851
12852 if (Opcode == AMDGPU::G_FCANONICALIZE)
12853 return true;
12854
12855 std::optional<FPValueAndVReg> FCR;
12856 // Constant splat (can be padded with undef) or scalar constant.
12857 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
12858 if (FCR->Value.isSignaling())
12859 return false;
12860 if (!FCR->Value.isDenormal())
12861 return true;
12862
12863 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
12864 return Mode == DenormalMode::getIEEE();
12865 }
12866
12867 if (MaxDepth == 0)
12868 return false;
12869
12870 switch (Opcode) {
12871 case AMDGPU::G_FADD:
12872 case AMDGPU::G_FSUB:
12873 case AMDGPU::G_FMUL:
12874 case AMDGPU::G_FCEIL:
12875 case AMDGPU::G_FFLOOR:
12876 case AMDGPU::G_FRINT:
12877 case AMDGPU::G_FNEARBYINT:
12878 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
12879 case AMDGPU::G_INTRINSIC_TRUNC:
12880 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
12881 case AMDGPU::G_FMA:
12882 case AMDGPU::G_FMAD:
12883 case AMDGPU::G_FSQRT:
12884 case AMDGPU::G_FDIV:
12885 case AMDGPU::G_FREM:
12886 case AMDGPU::G_FPOW:
12887 case AMDGPU::G_FPEXT:
12888 case AMDGPU::G_FLOG:
12889 case AMDGPU::G_FLOG2:
12890 case AMDGPU::G_FLOG10:
12891 case AMDGPU::G_FPTRUNC:
12892 case AMDGPU::G_AMDGPU_RCP_IFLAG:
12893 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
12894 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
12895 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
12896 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
12897 return true;
12898 case AMDGPU::G_FNEG:
12899 case AMDGPU::G_FABS:
12900 case AMDGPU::G_FCOPYSIGN:
12901 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
12902 case AMDGPU::G_FMINNUM:
12903 case AMDGPU::G_FMAXNUM:
12904 case AMDGPU::G_FMINNUM_IEEE:
12905 case AMDGPU::G_FMAXNUM_IEEE:
12906 case AMDGPU::G_FMINIMUM:
12907 case AMDGPU::G_FMAXIMUM: {
12908 if (Subtarget->supportsMinMaxDenormModes() ||
12909 // FIXME: denormalsEnabledForType is broken for dynamic
12910 denormalsEnabledForType(MRI.getType(Reg), MF))
12911 return true;
12912
12913 [[fallthrough]];
12914 }
12915 case AMDGPU::G_BUILD_VECTOR:
12916 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
12917 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
12918 return false;
12919 return true;
12920 case AMDGPU::G_INTRINSIC:
12921 case AMDGPU::G_INTRINSIC_CONVERGENT:
12922 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
12923 case Intrinsic::amdgcn_fmul_legacy:
12924 case Intrinsic::amdgcn_fmad_ftz:
12925 case Intrinsic::amdgcn_sqrt:
12926 case Intrinsic::amdgcn_fmed3:
12927 case Intrinsic::amdgcn_sin:
12928 case Intrinsic::amdgcn_cos:
12929 case Intrinsic::amdgcn_log:
12930 case Intrinsic::amdgcn_exp2:
12931 case Intrinsic::amdgcn_log_clamp:
12932 case Intrinsic::amdgcn_rcp:
12933 case Intrinsic::amdgcn_rcp_legacy:
12934 case Intrinsic::amdgcn_rsq:
12935 case Intrinsic::amdgcn_rsq_clamp:
12936 case Intrinsic::amdgcn_rsq_legacy:
12937 case Intrinsic::amdgcn_div_scale:
12938 case Intrinsic::amdgcn_div_fmas:
12939 case Intrinsic::amdgcn_div_fixup:
12940 case Intrinsic::amdgcn_fract:
12941 case Intrinsic::amdgcn_cvt_pkrtz:
12942 case Intrinsic::amdgcn_cubeid:
12943 case Intrinsic::amdgcn_cubema:
12944 case Intrinsic::amdgcn_cubesc:
12945 case Intrinsic::amdgcn_cubetc:
12946 case Intrinsic::amdgcn_frexp_mant:
12947 case Intrinsic::amdgcn_fdot2:
12948 case Intrinsic::amdgcn_trig_preop:
12949 return true;
12950 default:
12951 break;
12952 }
12953
12954 [[fallthrough]];
12955 default:
12956 return false;
12957 }
12958
12959 llvm_unreachable("invalid operation");
12960}
12961
12962// Constant fold canonicalize.
12963SDValue SITargetLowering::getCanonicalConstantFP(
12964 SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
12965 // Flush denormals to 0 if not enabled.
12966 if (C.isDenormal()) {
12967 DenormalMode Mode =
12968 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
12969 if (Mode == DenormalMode::getPreserveSign()) {
12970 return DAG.getConstantFP(
12971 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
12972 }
12973
12974 if (Mode != DenormalMode::getIEEE())
12975 return SDValue();
12976 }
12977
12978 if (C.isNaN()) {
12979 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
12980 if (C.isSignaling()) {
12981 // Quiet a signaling NaN.
12982 // FIXME: Is this supposed to preserve payload bits?
12983 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12984 }
12985
12986 // Make sure it is the canonical NaN bitpattern.
12987 //
12988 // TODO: Can we use -1 as the canonical NaN value since it's an inline
12989 // immediate?
12990 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
12991 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
12992 }
12993
12994 // Already canonical.
12995 return DAG.getConstantFP(C, SL, VT);
12996}
12997
12998 static bool vectorEltWillFoldAway(SDValue Op) {
12999 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
13000}
13001
13002SDValue SITargetLowering::performFCanonicalizeCombine(
13003 SDNode *N,
13004 DAGCombinerInfo &DCI) const {
13005 SelectionDAG &DAG = DCI.DAG;
13006 SDValue N0 = N->getOperand(0);
13007 EVT VT = N->getValueType(0);
13008
13009 // fcanonicalize undef -> qnan
13010 if (N0.isUndef()) {
13011 APFloat QNaN = APFloat::getQNaN(SelectionDAG::EVTToAPFloatSemantics(VT));
13012 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
13013 }
13014
13015 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
13016 EVT VT = N->getValueType(0);
13017 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
13018 }
13019
13020 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
13021 // (fcanonicalize k)
13022 //
13023 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
13024
13025 // TODO: This could be better with wider vectors that will be split to v2f16,
13026 // and to consider uses since there aren't that many packed operations.
13027 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
13028 isTypeLegal(MVT::v2f16)) {
13029 SDLoc SL(N);
13030 SDValue NewElts[2];
13031 SDValue Lo = N0.getOperand(0);
13032 SDValue Hi = N0.getOperand(1);
13033 EVT EltVT = Lo.getValueType();
13034
13036 for (unsigned I = 0; I != 2; ++I) {
13037 SDValue Op = N0.getOperand(I);
13038 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
13039 NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
13040 CFP->getValueAPF());
13041 } else if (Op.isUndef()) {
13042 // Handled below based on what the other operand is.
13043 NewElts[I] = Op;
13044 } else {
13045 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
13046 }
13047 }
13048
13049 // If one half is undef, and one is constant, prefer a splat vector rather
13050 // than the normal qNaN. If it's a register, prefer 0.0 since that's
13051 // cheaper to use and may be free with a packed operation.
13052 if (NewElts[0].isUndef()) {
13053 if (isa<ConstantFPSDNode>(NewElts[1]))
13054 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
13055 NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT);
13056 }
13057
13058 if (NewElts[1].isUndef()) {
13059 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
13060 NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
13061 }
13062
13063 return DAG.getBuildVector(VT, SL, NewElts);
13064 }
13065 }
13066
13067 return SDValue();
13068}
13069
13070static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
13071 switch (Opc) {
13072 case ISD::FMAXNUM:
13073 case ISD::FMAXNUM_IEEE:
13074 return AMDGPUISD::FMAX3;
13075 case ISD::FMAXIMUM:
13076 return AMDGPUISD::FMAXIMUM3;
13077 case ISD::SMAX:
13078 return AMDGPUISD::SMAX3;
13079 case ISD::UMAX:
13080 return AMDGPUISD::UMAX3;
13081 case ISD::FMINNUM:
13082 case ISD::FMINNUM_IEEE:
13083 return AMDGPUISD::FMIN3;
13084 case ISD::FMINIMUM:
13085 return AMDGPUISD::FMINIMUM3;
13086 case ISD::SMIN:
13087 return AMDGPUISD::SMIN3;
13088 case ISD::UMIN:
13089 return AMDGPUISD::UMIN3;
13090 default:
13091 llvm_unreachable("Not a min/max opcode");
13092 }
13093}
13094
13095SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
13096 const SDLoc &SL, SDValue Src,
13097 SDValue MinVal,
13098 SDValue MaxVal,
13099 bool Signed) const {
13100
13101 // med3 comes from
13102 // min(max(x, K0), K1), K0 < K1
13103 // max(min(x, K0), K1), K1 < K0
13104 //
13105 // "MinVal" and "MaxVal" respectively refer to the rhs of the
13106 // min/max op.
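// e.g. smin(smax(x, 0), 255) becomes smed3(x, 0, 255), clamping x to [0, 255].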
13107 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
13108 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
13109
13110 if (!MinK || !MaxK)
13111 return SDValue();
13112
13113 if (Signed) {
13114 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
13115 return SDValue();
13116 } else {
13117 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
13118 return SDValue();
13119 }
13120
13121 EVT VT = MinK->getValueType(0);
13122 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
13123 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
13124 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
13125
13126 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
13127 // not available, but this is unlikely to be profitable as constants
13128 // will often need to be materialized & extended, especially on
13129 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
13130 return SDValue();
13131}
13132
13133 static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
13134 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
13135 return C;
13136
13137 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op)) {
13138 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
13139 return C;
13140 }
13141
13142 return nullptr;
13143}
13144
13145SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
13146 const SDLoc &SL,
13147 SDValue Op0,
13148 SDValue Op1) const {
13149 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
13150 if (!K1)
13151 return SDValue();
13152
13153 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
13154 if (!K0)
13155 return SDValue();
13156
13157 // Ordered >= (although NaN inputs should have folded away by now).
13158 if (K0->getValueAPF() > K1->getValueAPF())
13159 return SDValue();
13160
13161 const MachineFunction &MF = DAG.getMachineFunction();
13162 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13163
13164 // TODO: Check IEEE bit enabled?
13165 EVT VT = Op0.getValueType();
13166 if (Info->getMode().DX10Clamp) {
13167 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
13168 // hardware fmed3 behavior converting to a min.
13169 // FIXME: Should this be allowing -0.0?
13170 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
13171 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
13172 }
13173
13174 // med3 for f16 is only available on gfx9+, and not available for v2f16.
13175 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
13176 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
13177 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
13178 // then give the other result, which is different from med3 with a NaN
13179 // input.
13180 SDValue Var = Op0.getOperand(0);
13181 if (!DAG.isKnownNeverSNaN(Var))
13182 return SDValue();
13183
13184 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
13185
13186 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
13187 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
13188 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
13189 Var, SDValue(K0, 0), SDValue(K1, 0));
13190 }
13191 }
13192
13193 return SDValue();
13194}
13195
13196SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
13197 DAGCombinerInfo &DCI) const {
13198 SelectionDAG &DAG = DCI.DAG;
13199
13200 EVT VT = N->getValueType(0);
13201 unsigned Opc = N->getOpcode();
13202 SDValue Op0 = N->getOperand(0);
13203 SDValue Op1 = N->getOperand(1);
13204
13205 // Only do this if the inner op has one use since this will just increase
13206 // register pressure for no benefit.
13207
13208 if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
13209 !VT.isVector() &&
13210 (VT == MVT::i32 || VT == MVT::f32 ||
13211 ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) {
13212 // max(max(a, b), c) -> max3(a, b, c)
13213 // min(min(a, b), c) -> min3(a, b, c)
13214 if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
13215 SDLoc DL(N);
13216 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13217 DL,
13218 N->getValueType(0),
13219 Op0.getOperand(0),
13220 Op0.getOperand(1),
13221 Op1);
13222 }
13223
13224 // Try commuted.
13225 // max(a, max(b, c)) -> max3(a, b, c)
13226 // min(a, min(b, c)) -> min3(a, b, c)
13227 if (Op1.getOpcode() == Opc && Op1.hasOneUse()) {
13228 SDLoc DL(N);
13229 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
13230 DL,
13231 N->getValueType(0),
13232 Op0,
13233 Op1.getOperand(0),
13234 Op1.getOperand(1));
13235 }
13236 }
13237
13238 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
13239 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
13240 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
13241 if (SDValue Med3 = performIntMed3ImmCombine(
13242 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
13243 return Med3;
13244 }
13245 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
13246 if (SDValue Med3 = performIntMed3ImmCombine(
13247 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
13248 return Med3;
13249 }
13250
13251 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
13252 if (SDValue Med3 = performIntMed3ImmCombine(
13253 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
13254 return Med3;
13255 }
13256 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
13257 if (SDValue Med3 = performIntMed3ImmCombine(
13258 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
13259 return Med3;
13260 }
13261
13262 // fminnum(fmaxnum(x, K0), K1), K0 < K1 && !is_snan(x) -> fmed3(x, K0, K1)
13263 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
13264 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
13265 (Opc == AMDGPUISD::FMIN_LEGACY &&
13266 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
13267 (VT == MVT::f32 || VT == MVT::f64 ||
13268 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
13269 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
13270 Op0.hasOneUse()) {
13271 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
13272 return Res;
13273 }
13274
13275 return SDValue();
13276}
13277
13278 static bool isClampZeroToOne(SDValue A, SDValue B) {
13279 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
13280 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
13281 // FIXME: Should this be allowing -0.0?
13282 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13283 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13284 }
13285 }
13286
13287 return false;
13288}
13289
13290// FIXME: Should only worry about snans for version with chain.
13291SDValue SITargetLowering::performFMed3Combine(SDNode *N,
13292 DAGCombinerInfo &DCI) const {
13293 EVT VT = N->getValueType(0);
13294 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
13295 // NaNs. With a NaN input, the order of the operands may change the result.
13296
13297 SelectionDAG &DAG = DCI.DAG;
13298 SDLoc SL(N);
13299
13300 SDValue Src0 = N->getOperand(0);
13301 SDValue Src1 = N->getOperand(1);
13302 SDValue Src2 = N->getOperand(2);
13303
13304 if (isClampZeroToOne(Src0, Src1)) {
13305 // const_a, const_b, x -> clamp is safe in all cases including signaling
13306 // nans.
13307 // FIXME: Should this be allowing -0.0?
13308 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
13309 }
13310
13311 const MachineFunction &MF = DAG.getMachineFunction();
13312 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13313
13314 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
13315 // handling no dx10-clamp?
13316 if (Info->getMode().DX10Clamp) {
13317 // If NaNs are clamped to 0, we are free to reorder the inputs.
13318
13319 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13320 std::swap(Src0, Src1);
13321
13322 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13323 std::swap(Src1, Src2);
13324
13325 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13326 std::swap(Src0, Src1);
13327
13328 if (isClampZeroToOne(Src1, Src2))
13329 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
13330 }
13331
13332 return SDValue();
13333}
13334
13335SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
13336 DAGCombinerInfo &DCI) const {
13337 SDValue Src0 = N->getOperand(0);
13338 SDValue Src1 = N->getOperand(1);
13339 if (Src0.isUndef() && Src1.isUndef())
13340 return DCI.DAG.getUNDEF(N->getValueType(0));
13341 return SDValue();
13342}
13343
13344// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
13345// expanded into a set of cmp/select instructions.
13346 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
13347 unsigned NumElem,
13348 bool IsDivergentIdx,
13349 const GCNSubtarget *Subtarget) {
13350 if (UseDivergentRegisterIndexing)
13351 return false;
13352
13353 unsigned VecSize = EltSize * NumElem;
13354
13355 // Sub-dword vectors with a total size of 2 dwords or less have a better implementation.
13356 if (VecSize <= 64 && EltSize < 32)
13357 return false;
13358
13359 // Always expand the rest of sub-dword instructions, otherwise it will be
13360 // lowered via memory.
13361 if (EltSize < 32)
13362 return true;
13363
13364 // Always do this if var-idx is divergent, otherwise it will become a loop.
13365 if (IsDivergentIdx)
13366 return true;
13367
13368 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
13369 unsigned NumInsts = NumElem /* Number of compares */ +
13370 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
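// For example, a v4i64 access (NumElem = 4, EltSize = 64) comes out to
// 4 compares + 2 * 4 cndmasks = 12 instructions, which is under both
// thresholds checked below.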
13371
13372 // On some architectures (GFX9) movrel is not available and it's better
13373 // to expand.
13374 if (!Subtarget->hasMovrel())
13375 return NumInsts <= 16;
13376
13377 // If movrel is available, use it instead of expanding for vector of 8
13378 // elements.
13379 return NumInsts <= 15;
13380}
13381
13382bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
13383 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
13384 if (isa<ConstantSDNode>(Idx))
13385 return false;
13386
13387 SDValue Vec = N->getOperand(0);
13388 EVT VecVT = Vec.getValueType();
13389 EVT EltVT = VecVT.getVectorElementType();
13390 unsigned EltSize = EltVT.getSizeInBits();
13391 unsigned NumElem = VecVT.getVectorNumElements();
13392
13393 return SITargetLowering::shouldExpandVectorDynExt(
13394 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
13395}
13396
13397SDValue SITargetLowering::performExtractVectorEltCombine(
13398 SDNode *N, DAGCombinerInfo &DCI) const {
13399 SDValue Vec = N->getOperand(0);
13400 SelectionDAG &DAG = DCI.DAG;
13401
13402 EVT VecVT = Vec.getValueType();
13403 EVT VecEltVT = VecVT.getVectorElementType();
13404 EVT ResVT = N->getValueType(0);
13405
13406 unsigned VecSize = VecVT.getSizeInBits();
13407 unsigned VecEltSize = VecEltVT.getSizeInBits();
13408
13409 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
13410 allUsesHaveSourceMods(N)) {
13411 SDLoc SL(N);
13412 SDValue Idx = N->getOperand(1);
13413 SDValue Elt =
13414 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
13415 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
13416 }
13417
13418 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
13419 // =>
13420 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
13421 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
13422 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
13423 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13424 SDLoc SL(N);
13425 SDValue Idx = N->getOperand(1);
13426 unsigned Opc = Vec.getOpcode();
13427
13428 switch(Opc) {
13429 default:
13430 break;
13431 // TODO: Support other binary operations.
13432 case ISD::FADD:
13433 case ISD::FSUB:
13434 case ISD::FMUL:
13435 case ISD::ADD:
13436 case ISD::UMIN:
13437 case ISD::UMAX:
13438 case ISD::SMIN:
13439 case ISD::SMAX:
13440 case ISD::FMAXNUM:
13441 case ISD::FMINNUM:
13442 case ISD::FMAXNUM_IEEE:
13443 case ISD::FMINNUM_IEEE:
13444 case ISD::FMAXIMUM:
13445 case ISD::FMINIMUM: {
13446 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13447 Vec.getOperand(0), Idx);
13448 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
13449 Vec.getOperand(1), Idx);
13450
13451 DCI.AddToWorklist(Elt0.getNode());
13452 DCI.AddToWorklist(Elt1.getNode());
13453 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
13454 }
13455 }
13456 }
13457
13458 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
13459 if (shouldExpandVectorDynExt(N)) {
13460 SDLoc SL(N);
13461 SDValue Idx = N->getOperand(1);
13462 SDValue V;
13463 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13464 SDValue IC = DAG.getVectorIdxConstant(I, SL);
13465 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
13466 if (I == 0)
13467 V = Elt;
13468 else
13469 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
13470 }
13471 return V;
13472 }
13473
13474 if (!DCI.isBeforeLegalize())
13475 return SDValue();
13476
13477 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
13478 // elements. This exposes more load reduction opportunities by replacing
13479 // multiple small extract_vector_elements with a single 32-bit extract.
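// For example, extracting element 5 of a loaded v8i8 becomes: view the
// vector as 32-bit words, extract word 1 (bit index 40 / 32), shift right
// by the leftover 8 bits, and truncate back to i8.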
13480 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
13481 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
13482 VecSize > 32 && VecSize % 32 == 0 && Idx) {
13483 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
13484
13485 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
13486 unsigned EltIdx = BitIndex / 32;
13487 unsigned LeftoverBitIdx = BitIndex % 32;
13488 SDLoc SL(N);
13489
13490 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
13491 DCI.AddToWorklist(Cast.getNode());
13492
13493 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
13494 DAG.getConstant(EltIdx, SL, MVT::i32));
13495 DCI.AddToWorklist(Elt.getNode());
13496 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
13497 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
13498 DCI.AddToWorklist(Srl.getNode());
13499
13500 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
13501 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
13502 DCI.AddToWorklist(Trunc.getNode());
13503
13504 if (VecEltVT == ResVT) {
13505 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
13506 }
13507
13508 assert(ResVT.isScalarInteger());
13509 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
13510 }
13511
13512 return SDValue();
13513}
13514
13515SDValue
13516SITargetLowering::performInsertVectorEltCombine(SDNode *N,
13517 DAGCombinerInfo &DCI) const {
13518 SDValue Vec = N->getOperand(0);
13519 SDValue Idx = N->getOperand(2);
13520 EVT VecVT = Vec.getValueType();
13521 EVT EltVT = VecVT.getVectorElementType();
13522
13523 // INSERT_VECTOR_ELT (<n x e>, var-idx)
13524 // => BUILD_VECTOR n x select (e, const-idx)
13525 if (!shouldExpandVectorDynExt(N))
13526 return SDValue();
13527
13528 SelectionDAG &DAG = DCI.DAG;
13529 SDLoc SL(N);
13530 SDValue Ins = N->getOperand(1);
13531 EVT IdxVT = Idx.getValueType();
13532
13533 SmallVector<SDValue, 16> Ops;
13534 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
13535 SDValue IC = DAG.getConstant(I, SL, IdxVT);
13536 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
13537 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
13538 Ops.push_back(V);
13539 }
13540
13541 return DAG.getBuildVector(VecVT, SL, Ops);
13542}
13543
13544/// Return the source of an fp_extend from f16 to f32, or a converted FP
13545/// constant.
13546static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
13547 if (Src.getOpcode() == ISD::FP_EXTEND &&
13548 Src.getOperand(0).getValueType() == MVT::f16) {
13549 return Src.getOperand(0);
13550 }
13551
13552 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13553 APFloat Val = CFP->getValueAPF();
13554 bool LosesInfo = true;
13555 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
13556 if (!LosesInfo)
13557 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
13558 }
13559
13560 return SDValue();
13561}
13562
13563SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
13564 DAGCombinerInfo &DCI) const {
13565 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
13566 "combine only useful on gfx8");
13567
13568 SDValue TruncSrc = N->getOperand(0);
13569 EVT VT = N->getValueType(0);
13570 if (VT != MVT::f16)
13571 return SDValue();
13572
13573 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
13574 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
13575 return SDValue();
13576
13577 SelectionDAG &DAG = DCI.DAG;
13578 SDLoc SL(N);
13579
13580 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
13581 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
13582 // casting back.
13583
13584 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
13585 // fmin(fmax(a, b), fmax(fmin(a, b), c))
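// Quick check of the identity with a = 1.0, b = 3.0, c = 2.0:
// fmin(fmax(1, 3), fmax(fmin(1, 3), 2)) = fmin(3, 2) = 2, the median.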
13586 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
13587 if (!A)
13588 return SDValue();
13589
13590 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
13591 if (!B)
13592 return SDValue();
13593
13594 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
13595 if (!C)
13596 return SDValue();
13597
13598 // This changes signaling nan behavior. If an input is a signaling nan, it
13599 // would have been quieted by the fpext originally. We don't care because
13600 // these are unconstrained ops. If we needed to insert quieting canonicalizes
13601 // we would be worse off than just doing the promotion.
13602 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
13603 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
13604 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
13605 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
13606}
13607
13608unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
13609 const SDNode *N0,
13610 const SDNode *N1) const {
13611 EVT VT = N0->getValueType(0);
13612
13613 // Only do this if we are not trying to support denormals. v_mad_f32 does not
13614 // support denormals ever.
13615 if (((VT == MVT::f32 &&
13616 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
13617 (VT == MVT::f16 && Subtarget->hasMadF16() &&
13618 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
13619 isOperationLegal(ISD::FMAD, VT))
13620 return ISD::FMAD;
13621
13622 const TargetOptions &Options = DAG.getTarget().Options;
13623 if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
13624 (N0->getFlags().hasAllowContract() &&
13625 N1->getFlags().hasAllowContract())) &&
13626 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
13627 return ISD::FMA;
13628 }
13629
13630 return 0;
13631}
13632
13633// For a reassociatable opcode perform:
13634// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
13635SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
13636 SelectionDAG &DAG) const {
13637 EVT VT = N->getValueType(0);
13638 if (VT != MVT::i32 && VT != MVT::i64)
13639 return SDValue();
13640
13641 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
13642 return SDValue();
13643
13644 unsigned Opc = N->getOpcode();
13645 SDValue Op0 = N->getOperand(0);
13646 SDValue Op1 = N->getOperand(1);
13647
13648 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
13649 return SDValue();
13650
13651 if (Op0->isDivergent())
13652 std::swap(Op0, Op1);
13653
13654 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
13655 return SDValue();
13656
13657 SDValue Op2 = Op1.getOperand(1);
13658 Op1 = Op1.getOperand(0);
13659 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
13660 return SDValue();
13661
13662 if (Op1->isDivergent())
13663 std::swap(Op1, Op2);
13664
13665 SDLoc SL(N);
13666 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
13667 return DAG.getNode(Opc, SL, VT, Add1, Op2);
13668}
13669
13670static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
13671 EVT VT,
13672 SDValue N0, SDValue N1, SDValue N2,
13673 bool Signed) {
13674 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
13675 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
13676 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
13677 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
13678}
13679
13680// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
13681// multiplies, if any.
13682//
13683// Full 64-bit multiplies that feed into an addition are lowered here instead
13684// of using the generic expansion. The generic expansion ends up with
13685// a tree of ADD nodes that prevents us from using the "add" part of the
13686// MAD instruction. The expansion produced here results in a chain of ADDs
13687// instead of a tree.
13688SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
13689 DAGCombinerInfo &DCI) const {
13690 assert(N->getOpcode() == ISD::ADD);
13691
13692 SelectionDAG &DAG = DCI.DAG;
13693 EVT VT = N->getValueType(0);
13694 SDLoc SL(N);
13695 SDValue LHS = N->getOperand(0);
13696 SDValue RHS = N->getOperand(1);
13697
13698 if (VT.isVector())
13699 return SDValue();
13700
13701 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
13702 // result in scalar registers for uniform values.
13703 if (!N->isDivergent() && Subtarget->hasSMulHi())
13704 return SDValue();
13705
13706 unsigned NumBits = VT.getScalarSizeInBits();
13707 if (NumBits <= 32 || NumBits > 64)
13708 return SDValue();
13709
13710 if (LHS.getOpcode() != ISD::MUL) {
13711 assert(RHS.getOpcode() == ISD::MUL);
13712 std::swap(LHS, RHS);
13713 }
13714
13715 // Avoid the fold if it would unduly increase the number of multiplies due to
13716 // multiple uses, except on hardware with full-rate multiply-add (which is
13717 // part of full-rate 64-bit ops).
13718 if (!Subtarget->hasFullRate64Ops()) {
13719 unsigned NumUsers = 0;
13720 for (SDNode *Use : LHS->uses()) {
13721 // There is a use that does not feed into addition, so the multiply can't
13722 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
13723 if (Use->getOpcode() != ISD::ADD)
13724 return SDValue();
13725
13726 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
13727 // MUL + 3xADD + 3xADDC over 3xMAD.
13728 ++NumUsers;
13729 if (NumUsers >= 3)
13730 return SDValue();
13731 }
13732 }
13733
13734 SDValue MulLHS = LHS.getOperand(0);
13735 SDValue MulRHS = LHS.getOperand(1);
13736 SDValue AddRHS = RHS;
13737
13738 // Always check whether operands are small unsigned values, since that
13739 // knowledge is useful in more cases. Check for small signed values only if
13740 // doing so can unlock a shorter code sequence.
13741 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
13742 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
13743
13744 bool MulSignedLo = false;
13745 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
13746 MulSignedLo = numBitsSigned(MulLHS, DAG) <= 32 &&
13747 numBitsSigned(MulRHS, DAG) <= 32;
13748 }
13749
13750 // The operands and final result all have the same number of bits. If
13751 // operands need to be extended, they can be extended with garbage. The
13752 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
13753 // truncated away in the end.
13754 if (VT != MVT::i64) {
13755 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
13756 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
13757 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
13758 }
13759
13760 // The basic code generated is conceptually straightforward. Pseudo code:
13761 //
13762 // accum = mad_64_32 lhs.lo, rhs.lo, accum
13763 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
13764 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
13765 //
13766 // The second and third lines are optional, depending on whether the factors
13767 // are {sign,zero}-extended or not.
13768 //
13769 // The actual DAG is noisier than the pseudo code, but only due to
13770 // instructions that disassemble values into low and high parts, and
13771 // assemble the final result.
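// For example, when both factors are known to fit in 32 unsigned bits, only
// the first line is needed and the whole fold becomes a single mad_u64_u32.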
13772 SDValue One = DAG.getConstant(1, SL, MVT::i32);
13773
13774 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
13775 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
13776 SDValue Accum =
13777 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
13778
13779 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
13780 SDValue AccumLo, AccumHi;
13781 std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
13782
13783 if (!MulLHSUnsigned32) {
13784 auto MulLHSHi =
13785 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
13786 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
13787 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13788 }
13789
13790 if (!MulRHSUnsigned32) {
13791 auto MulRHSHi =
13792 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
13793 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
13794 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
13795 }
13796
13797 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
13798 Accum = DAG.getBitcast(MVT::i64, Accum);
13799 }
13800
13801 if (VT != MVT::i64)
13802 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
13803 return Accum;
13804}
13805
13806// Collect the ultimate src of each of the mul node's operands, and confirm
13807// each operand is only 8 bits wide.
13808static std::optional<ByteProvider<SDValue>>
13809handleMulOperand(const SDValue &MulOperand) {
13810 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
13811 if (!Byte0 || Byte0->isConstantZero()) {
13812 return std::nullopt;
13813 }
13814 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
13815 if (Byte1 && !Byte1->isConstantZero()) {
13816 return std::nullopt;
13817 }
13818 return Byte0;
13819}
13820
13821static unsigned addPermMasks(unsigned First, unsigned Second) {
13822 unsigned FirstCs = First & 0x0c0c0c0c;
13823 unsigned SecondCs = Second & 0x0c0c0c0c;
13824 unsigned FirstNoCs = First & ~0x0c0c0c0c;
13825 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
13826
13827 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
13828 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
13829 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
13830 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
13831
13832 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
13833}
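// Example: addPermMasks(0x0c0c0c04, 0x070c0c0c) yields 0x070c0c04: each byte
// keeps whichever selector is not the 0x0c zero code, and stays 0x0c only
// where both inputs had it.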
13834
13835struct DotSrc {
13836 SDValue SrcOp;
13837 int64_t PermMask;
13838 int64_t DWordOffset;
13839};
13840
13841static void placeSources(ByteProvider<SDValue> &Src0,
13842 ByteProvider<SDValue> &Src1,
13843 SmallVectorImpl<DotSrc> &Src0s,
13844 SmallVectorImpl<DotSrc> &Src1s, int Step) {
13845
13846 assert(Src0.Src.has_value() && Src1.Src.has_value());
13847 // Src0s and Src1s are empty, just place arbitrarily.
13848 if (Step == 0) {
13849 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
13850 Src0.SrcOffset / 4});
13851 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
13852 Src1.SrcOffset / 4});
13853 return;
13854 }
13855
13856 for (int BPI = 0; BPI < 2; BPI++) {
13857 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
13858 if (BPI == 1) {
13859 BPP = {Src1, Src0};
13860 }
13861 unsigned ZeroMask = 0x0c0c0c0c;
13862 unsigned FMask = 0xFF << (8 * (3 - Step));
13863
13864 unsigned FirstMask =
13865 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13866 unsigned SecondMask =
13867 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13868 // Attempt to find the Src vector which contains our SDValue; if found, add
13869 // our perm mask to the existing one. If we are unable to find a match for
13870 // the first SDValue, attempt to find a match for the second.
13871 int FirstGroup = -1;
13872 for (int I = 0; I < 2; I++) {
13873 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
13874 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
13875 return IterElt.SrcOp == *BPP.first.Src &&
13876 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
13877 };
13878
13879 auto Match = llvm::find_if(Srcs, MatchesFirst);
13880 if (Match != Srcs.end()) {
13881 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
13882 FirstGroup = I;
13883 break;
13884 }
13885 }
13886 if (FirstGroup != -1) {
13887 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
13888 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
13889 return IterElt.SrcOp == *BPP.second.Src &&
13890 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
13891 };
13892 auto Match = llvm::find_if(Srcs, MatchesSecond);
13893 if (Match != Srcs.end()) {
13894 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
13895 } else
13896 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
13897 return;
13898 }
13899 }
13900
13901 // If we have made it here, then we could not find a match in Src0s or Src1s
13902 // for either Src0 or Src1, so just place them arbitrarily.
13903
13904 unsigned ZeroMask = 0x0c0c0c0c;
13905 unsigned FMask = 0xFF << (8 * (3 - Step));
13906
13907 Src0s.push_back(
13908 {*Src0.Src,
13909 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13910 Src1.SrcOffset / 4});
13911 Src1s.push_back(
13912 {*Src1.Src,
13913 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13914 Src1.SrcOffset / 4});
13915
13916 return;
13917}
13918
13919static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
13920 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
13921 bool IsAny) {
13922
13923 // If we just have one source, just permute it accordingly.
13924 if (Srcs.size() == 1) {
13925 auto Elt = Srcs.begin();
13926 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
13927
13928 // v_perm will produce the original value
13929 if (Elt->PermMask == 0x3020100)
13930 return EltOp;
13931
13932 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13933 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
13934 }
13935
13936 auto FirstElt = Srcs.begin();
13937 auto SecondElt = std::next(FirstElt);
13938
13939 SmallVector<SDValue, 2> Perms;
13940
13941 // If we have multiple sources in the chain, combine them via perms (using
13942 // calculated perm mask) and Ors.
13943 while (true) {
13944 auto FirstMask = FirstElt->PermMask;
13945 auto SecondMask = SecondElt->PermMask;
13946
13947 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
13948 unsigned FirstPlusFour = FirstMask | 0x04040404;
13949 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
13950 // original 0x0C.
13951 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
13952
13953 auto PermMask = addPermMasks(FirstMask, SecondMask);
13954 auto FirstVal =
13955 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13956 auto SecondVal =
13957 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
13958
13959 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
13960 SecondVal,
13961 DAG.getConstant(PermMask, SL, MVT::i32)));
13962
13963 FirstElt = std::next(SecondElt);
13964 if (FirstElt == Srcs.end())
13965 break;
13966
13967 SecondElt = std::next(FirstElt);
13968 // If we only have a FirstElt, then just combine that into the cumulative
13969 // source node.
13970 if (SecondElt == Srcs.end()) {
13971 auto EltOp =
13972 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
13973
13974 Perms.push_back(
13975 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
13976 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
13977 break;
13978 }
13979 }
13980
13981 assert(Perms.size() == 1 || Perms.size() == 2);
13982 return Perms.size() == 2
13983 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
13984 : Perms[0];
13985}
13986
13987static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
13988 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
13989 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
13990 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
13991 EntryMask += ZeroMask;
13992 }
13993}
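// Example with ChainLength == 2: a mask of 0x04050c0c built for a length-4
// chain becomes (0x04050c0c >> 16) + 0x0c0c0000 = 0x0c0c0405, i.e. the two
// live byte selectors move to the low bytes and the unused high bytes become
// the 0x0c zero code.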
13994
13995static bool isMul(const SDValue Op) {
13996 auto Opcode = Op.getOpcode();
13997
13998 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
13999 Opcode == AMDGPUISD::MUL_I24);
14000}
14001
14002static std::optional<bool>
14003checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
14004 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
14005 const SDValue &S1Op, const SelectionDAG &DAG) {
14006 // If both ops are i8s (pre legalize-dag), then the signedness semantics
14007 // of the dot4 is irrelevant.
14008 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
14009 return false;
14010
14011 auto Known0 = DAG.computeKnownBits(S0Op, 0);
14012 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
14013 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
14014 auto Known1 = DAG.computeKnownBits(S1Op, 0);
14015 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
14016 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
14017
14018 assert(!(S0IsUnsigned && S0IsSigned));
14019 assert(!(S1IsUnsigned && S1IsSigned));
14020
14021 // There are 9 possible permutations of
14022 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
14023
14024 // In two permutations, the sign bits are known to be the same for both Ops,
14025 // so simply return Signed / Unsigned corresponding to the MSB
14026
14027 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
14028 return S0IsSigned;
14029
14030 // In another two permutations, the sign bits are known to be opposite. In
14031 // this case return std::nullopt to indicate a bad match.
14032
14033 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
14034 return std::nullopt;
14035
14036 // In the remaining five permutations, we don't know the value of the sign
14037 // bit for at least one Op. Since we have a valid ByteProvider, we know that
14038 // the upper bits must be extension bits. Thus, the only ways for the sign
14039 // bit to be unknown is if it was sign extended from unknown value, or if it
14040 // was any extended. In either case, it is correct to use the signed
14041 // version of the signedness semantics of dot4
14042
14043 // In two of such permutations, we know the sign bit is set for
14044 // one op, and the other is unknown. It is okay to use the signed version of
14045 // dot4.
14046 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
14047 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
14048 return true;
14049
14050 // In one such permutation, we don't know either of the sign bits. It is okay
14051 // to use the signed version of dot4.
14052 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
14053 return true;
14054
14055 // In two of such permutations, we know the sign bit is unset for
14056 // one op, and the other is unknown. Return std::nullopt to indicate a
14057 // bad match.
14058 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
14059 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
14060 return std::nullopt;
14061
14062 llvm_unreachable("Fully covered condition");
14063}
14064
14065SDValue SITargetLowering::performAddCombine(SDNode *N,
14066 DAGCombinerInfo &DCI) const {
14067 SelectionDAG &DAG = DCI.DAG;
14068 EVT VT = N->getValueType(0);
14069 SDLoc SL(N);
14070 SDValue LHS = N->getOperand(0);
14071 SDValue RHS = N->getOperand(1);
14072
14073 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
14074 if (Subtarget->hasMad64_32()) {
14075 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
14076 return Folded;
14077 }
14078 }
14079
14080 if (SDValue V = reassociateScalarOps(N, DAG)) {
14081 return V;
14082 }
14083
14084 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
14085 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
14086 SDValue TempNode(N, 0);
14087 std::optional<bool> IsSigned;
14088 SmallVector<DotSrc, 4> Src0s;
14089 SmallVector<DotSrc, 4> Src1s;
14090 SmallVector<SDValue, 4> Src2s;
14091
14092 // Match the v_dot4 tree, while collecting src nodes.
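// The expected shape is a chain of adds in which one operand of each add is
// a mul and the other continues the chain, e.g.
// add(mul(a0,b0), add(mul(a1,b1), add(mul(a2,b2), acc))); the loop below
// peels off one mul per iteration.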
14093 int ChainLength = 0;
14094 for (int I = 0; I < 4; I++) {
14095 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
14096 if (MulIdx == -1)
14097 break;
14098 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14099 if (!Src0)
14100 break;
14101 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14102 if (!Src1)
14103 break;
14104
14105 auto IterIsSigned = checkDot4MulSignedness(
14106 TempNode->getOperand(MulIdx), *Src0, *Src1,
14107 TempNode->getOperand(MulIdx)->getOperand(0),
14108 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14109 if (!IterIsSigned)
14110 break;
14111 if (!IsSigned)
14112 IsSigned = *IterIsSigned;
14113 if (*IterIsSigned != *IsSigned)
14114 break;
14115 placeSources(*Src0, *Src1, Src0s, Src1s, I);
14116 auto AddIdx = 1 - MulIdx;
14117 // Allow the special case where add (add (mul24, 0), mul24) became ->
14118 // add (mul24, mul24).
14119 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
14120 Src2s.push_back(TempNode->getOperand(AddIdx));
14121 auto Src0 =
14122 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
14123 if (!Src0)
14124 break;
14125 auto Src1 =
14126 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
14127 if (!Src1)
14128 break;
14129 auto IterIsSigned = checkDot4MulSignedness(
14130 TempNode->getOperand(AddIdx), *Src0, *Src1,
14131 TempNode->getOperand(AddIdx)->getOperand(0),
14132 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14133 if (!IterIsSigned)
14134 break;
14135 assert(IsSigned);
14136 if (*IterIsSigned != *IsSigned)
14137 break;
14138 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
14139 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
14140 ChainLength = I + 2;
14141 break;
14142 }
14143
14144 TempNode = TempNode->getOperand(AddIdx);
14145 Src2s.push_back(TempNode);
14146 ChainLength = I + 1;
14147 if (TempNode->getNumOperands() < 2)
14148 break;
14149 LHS = TempNode->getOperand(0);
14150 RHS = TempNode->getOperand(1);
14151 }
14152
14153 if (ChainLength < 2)
14154 return SDValue();
14155
14156 // Masks were constructed with assumption that we would find a chain of
14157 // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
14158 // 0x0c) so they do not affect dot calculation.
14159 if (ChainLength < 4) {
14160 fixMasks(Src0s, ChainLength);
14161 fixMasks(Src1s, ChainLength);
14162 }
14163
14164 SDValue Src0, Src1;
14165
14166 // If we are just using a single source for both, and have permuted the
14167 // bytes consistently, we can just use the sources without permuting
14168 // (commutation).
14169 bool UseOriginalSrc = false;
14170 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
14171 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
14172 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
14173 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
14174 SmallVector<unsigned, 4> SrcBytes;
14175 auto Src0Mask = Src0s.begin()->PermMask;
14176 SrcBytes.push_back(Src0Mask & 0xFF000000);
14177 bool UniqueEntries = true;
14178 for (auto I = 1; I < 4; I++) {
14179 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
14180
14181 if (is_contained(SrcBytes, NextByte)) {
14182 UniqueEntries = false;
14183 break;
14184 }
14185 SrcBytes.push_back(NextByte);
14186 }
14187
14188 if (UniqueEntries) {
14189 UseOriginalSrc = true;
14190
14191 auto FirstElt = Src0s.begin();
14192 auto FirstEltOp =
14193 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
14194
14195 auto SecondElt = Src1s.begin();
14196 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
14197 SecondElt->DWordOffset);
14198
14199 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
14200 MVT::getIntegerVT(32));
14201 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
14202 MVT::getIntegerVT(32));
14203 }
14204 }
14205
14206 if (!UseOriginalSrc) {
14207 Src0 = resolveSources(DAG, SL, Src0s, false, true);
14208 Src1 = resolveSources(DAG, SL, Src1s, false, true);
14209 }
14210
14211 assert(IsSigned);
14212 SDValue Src2 =
14213 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14214
14215 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
14216 : Intrinsic::amdgcn_udot4,
14217 SL, MVT::i64);
14218
14219 assert(!VT.isVector());
14220 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
14221 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
14222
14223 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
14224 }
14225
14226 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14227 return SDValue();
14228
14229 // add x, zext (setcc) => uaddo_carry x, 0, setcc
14230 // add x, sext (setcc) => usubo_carry x, 0, setcc
14231 unsigned Opc = LHS.getOpcode();
14232 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
14233 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
14234 std::swap(RHS, LHS);
14235
14236 Opc = RHS.getOpcode();
14237 switch (Opc) {
14238 default: break;
14239 case ISD::ZERO_EXTEND:
14240 case ISD::SIGN_EXTEND:
14241 case ISD::ANY_EXTEND: {
14242 auto Cond = RHS.getOperand(0);
14243 // If this won't be a real VOPC output, we would still need to insert an
14244 // extra instruction anyway.
14245 if (!isBoolSGPR(Cond))
14246 break;
14247 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14248 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14249 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
14250 return DAG.getNode(Opc, SL, VTList, Args);
14251 }
14252 case ISD::UADDO_CARRY: {
14253 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
14254 if (!isNullConstant(RHS.getOperand(1)))
14255 break;
14256 SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
14257 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
14258 }
14259 }
14260 return SDValue();
14261}
14262
14263SDValue SITargetLowering::performSubCombine(SDNode *N,
14264 DAGCombinerInfo &DCI) const {
14265 SelectionDAG &DAG = DCI.DAG;
14266 EVT VT = N->getValueType(0);
14267
14268 if (VT != MVT::i32)
14269 return SDValue();
14270
14271 SDLoc SL(N);
14272 SDValue LHS = N->getOperand(0);
14273 SDValue RHS = N->getOperand(1);
14274
14275 // sub x, zext (setcc) => usubo_carry x, 0, setcc
14276 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
14277 unsigned Opc = RHS.getOpcode();
14278 switch (Opc) {
14279 default: break;
14280 case ISD::ZERO_EXTEND:
14281 case ISD::SIGN_EXTEND:
14282 case ISD::ANY_EXTEND: {
14283 auto Cond = RHS.getOperand(0);
14284 // If this won't be a real VOPC output, we would still need to insert an
14285 // extra instruction anyway.
14286 if (!isBoolSGPR(Cond))
14287 break;
14288 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
14289 SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
14290 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
14291 return DAG.getNode(Opc, SL, VTList, Args);
14292 }
14293 }
14294
14295 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
14296 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
14297 if (!isNullConstant(LHS.getOperand(1)))
14298 return SDValue();
14299 SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
14300 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
14301 }
14302 return SDValue();
14303}
14304
14305SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
14306 DAGCombinerInfo &DCI) const {
14307
14308 if (N->getValueType(0) != MVT::i32)
14309 return SDValue();
14310
14311 if (!isNullConstant(N->getOperand(1)))
14312 return SDValue();
14313
14314 SelectionDAG &DAG = DCI.DAG;
14315 SDValue LHS = N->getOperand(0);
14316
14317 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
14318 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
14319 unsigned LHSOpc = LHS.getOpcode();
14320 unsigned Opc = N->getOpcode();
14321 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
14322 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
14323 SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
14324 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
14325 }
14326 return SDValue();
14327}
14328
14329SDValue SITargetLowering::performFAddCombine(SDNode *N,
14330 DAGCombinerInfo &DCI) const {
14331 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14332 return SDValue();
14333
14334 SelectionDAG &DAG = DCI.DAG;
14335 EVT VT = N->getValueType(0);
14336
14337 SDLoc SL(N);
14338 SDValue LHS = N->getOperand(0);
14339 SDValue RHS = N->getOperand(1);
14340
14341 // These should really be instruction patterns, but writing patterns with
14342 // source modifiers is a pain.
14343
14344 // fadd (fadd (a, a), b) -> mad 2.0, a, b
14345 if (LHS.getOpcode() == ISD::FADD) {
14346 SDValue A = LHS.getOperand(0);
14347 if (A == LHS.getOperand(1)) {
14348 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14349 if (FusedOp != 0) {
14350 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14351 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
14352 }
14353 }
14354 }
14355
14356 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
14357 if (RHS.getOpcode() == ISD::FADD) {
14358 SDValue A = RHS.getOperand(0);
14359 if (A == RHS.getOperand(1)) {
14360 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14361 if (FusedOp != 0) {
14362 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14363 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
14364 }
14365 }
14366 }
14367
14368 return SDValue();
14369}
14370
14371SDValue SITargetLowering::performFSubCombine(SDNode *N,
14372 DAGCombinerInfo &DCI) const {
14373 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
14374 return SDValue();
14375
14376 SelectionDAG &DAG = DCI.DAG;
14377 SDLoc SL(N);
14378 EVT VT = N->getValueType(0);
14379 assert(!VT.isVector());
14380
14381 // Try to get the fneg to fold into the source modifier. This undoes generic
14382 // DAG combines and folds them into the mad.
14383 //
14384 // Only do this if we are not trying to support denormals. v_mad_f32 does
14385 // not support denormals ever.
14386 SDValue LHS = N->getOperand(0);
14387 SDValue RHS = N->getOperand(1);
14388 if (LHS.getOpcode() == ISD::FADD) {
14389 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
14390 SDValue A = LHS.getOperand(0);
14391 if (A == LHS.getOperand(1)) {
14392 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
14393 if (FusedOp != 0){
14394 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
14395 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
14396
14397 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
14398 }
14399 }
14400 }
14401
14402 if (RHS.getOpcode() == ISD::FADD) {
14403 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
14404
14405 SDValue A = RHS.getOperand(0);
14406 if (A == RHS.getOperand(1)) {
14407 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
14408 if (FusedOp != 0){
14409 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
14410 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
14411 }
14412 }
14413 }
14414
14415 return SDValue();
14416}
14417
14418SDValue SITargetLowering::performFDivCombine(SDNode *N,
14419 DAGCombinerInfo &DCI) const {
14420 SelectionDAG &DAG = DCI.DAG;
14421 SDLoc SL(N);
14422 EVT VT = N->getValueType(0);
14423 if (VT != MVT::f16 || !Subtarget->has16BitInsts())
14424 return SDValue();
14425
14426 SDValue LHS = N->getOperand(0);
14427 SDValue RHS = N->getOperand(1);
14428
14429 SDNodeFlags Flags = N->getFlags();
14430 SDNodeFlags RHSFlags = RHS->getFlags();
14431 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
14432 !RHS->hasOneUse())
14433 return SDValue();
14434
14435 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
14436 bool IsNegative = false;
14437 if (CLHS->isExactlyValue(1.0) ||
14438 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14439 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
14440 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
14441 if (RHS.getOpcode() == ISD::FSQRT) {
14442 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
14443 SDValue Rsq =
14444 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
14445 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
14446 }
14447 }
14448 }
14449
14450 return SDValue();
14451}
14452
14453SDValue SITargetLowering::performFMACombine(SDNode *N,
14454 DAGCombinerInfo &DCI) const {
14455 SelectionDAG &DAG = DCI.DAG;
14456 EVT VT = N->getValueType(0);
14457 SDLoc SL(N);
14458
14459 if (!Subtarget->hasDot7Insts() || VT != MVT::f32)
14460 return SDValue();
14461
14462 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
14463 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
14464 SDValue Op1 = N->getOperand(0);
14465 SDValue Op2 = N->getOperand(1);
14466 SDValue FMA = N->getOperand(2);
14467
14468 if (FMA.getOpcode() != ISD::FMA ||
14469 Op1.getOpcode() != ISD::FP_EXTEND ||
14470 Op2.getOpcode() != ISD::FP_EXTEND)
14471 return SDValue();
14472
14473 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
14474 // regardless of the denorm mode setting. Therefore,
14475 // unsafe-fp-math/fp-contract is sufficient to allow generating fdot2.
14476 const TargetOptions &Options = DAG.getTarget().Options;
14477 if (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
14478 (N->getFlags().hasAllowContract() &&
14479 FMA->getFlags().hasAllowContract())) {
14480 Op1 = Op1.getOperand(0);
14481 Op2 = Op2.getOperand(0);
14482 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14483 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14484 return SDValue();
14485
14486 SDValue Vec1 = Op1.getOperand(0);
14487 SDValue Idx1 = Op1.getOperand(1);
14488 SDValue Vec2 = Op2.getOperand(0);
14489
14490 SDValue FMAOp1 = FMA.getOperand(0);
14491 SDValue FMAOp2 = FMA.getOperand(1);
14492 SDValue FMAAcc = FMA.getOperand(2);
14493
14494 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
14495 FMAOp2.getOpcode() != ISD::FP_EXTEND)
14496 return SDValue();
14497
14498 FMAOp1 = FMAOp1.getOperand(0);
14499 FMAOp2 = FMAOp2.getOperand(0);
14500 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
14501 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
14502 return SDValue();
14503
14504 SDValue Vec3 = FMAOp1.getOperand(0);
14505 SDValue Vec4 = FMAOp2.getOperand(0);
14506 SDValue Idx2 = FMAOp1.getOperand(1);
14507
14508 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
14509 // Idx1 and Idx2 cannot be the same.
14510 Idx1 == Idx2)
14511 return SDValue();
14512
14513 if (Vec1 == Vec2 || Vec3 == Vec4)
14514 return SDValue();
14515
14516 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
14517 return SDValue();
14518
14519 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
14520 (Vec1 == Vec4 && Vec2 == Vec3)) {
14521 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
14522 DAG.getTargetConstant(0, SL, MVT::i1));
14523 }
14524 }
14525 return SDValue();
14526}
14527
14528SDValue SITargetLowering::performSetCCCombine(SDNode *N,
14529 DAGCombinerInfo &DCI) const {
14530 SelectionDAG &DAG = DCI.DAG;
14531 SDLoc SL(N);
14532
14533 SDValue LHS = N->getOperand(0);
14534 SDValue RHS = N->getOperand(1);
14535 EVT VT = LHS.getValueType();
14536 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
14537
14538 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
14539 if (!CRHS) {
14540 CRHS = dyn_cast<ConstantSDNode>(LHS);
14541 if (CRHS) {
14542 std::swap(LHS, RHS);
14543 CC = getSetCCSwappedOperands(CC);
14544 }
14545 }
14546
14547 if (CRHS) {
14548 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
14549 isBoolSGPR(LHS.getOperand(0))) {
14550 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
14551 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
14552 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
14553 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
14554 if ((CRHS->isAllOnes() &&
14555 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
14556 (CRHS->isZero() &&
14557 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
14558 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14559 DAG.getConstant(-1, SL, MVT::i1));
14560 if ((CRHS->isAllOnes() &&
14561 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
14562 (CRHS->isZero() &&
14563 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
14564 return LHS.getOperand(0);
14565 }
14566
14567 const APInt &CRHSVal = CRHS->getAPIntValue();
14568 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
14569 LHS.getOpcode() == ISD::SELECT &&
14570 isa<ConstantSDNode>(LHS.getOperand(1)) &&
14571 isa<ConstantSDNode>(LHS.getOperand(2)) &&
14572 LHS.getConstantOperandVal(1) != LHS.getConstantOperandVal(2) &&
14573 isBoolSGPR(LHS.getOperand(0))) {
14574 // Given CT != FT:
14575 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
14576 // setcc (select cc, CT, CF), CF, ne => cc
14577 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
14578 // setcc (select cc, CT, CF), CT, eq => cc
14579 const APInt &CT = LHS.getConstantOperandAPInt(1);
14580 const APInt &CF = LHS.getConstantOperandAPInt(2);
14581
14582 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
14583 (CT == CRHSVal && CC == ISD::SETNE))
14584 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
14585 DAG.getConstant(-1, SL, MVT::i1));
14586 if ((CF == CRHSVal && CC == ISD::SETNE) ||
14587 (CT == CRHSVal && CC == ISD::SETEQ))
14588 return LHS.getOperand(0);
14589 }
14590 }
14591
14592 if (VT != MVT::f32 && VT != MVT::f64 &&
14593 (!Subtarget->has16BitInsts() || VT != MVT::f16))
14594 return SDValue();
14595
14596 // Match isinf/isfinite pattern
14597 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
14598 // (fcmp one (fabs x), inf) -> (fp_class x,
14599 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
14600 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) && LHS.getOpcode() == ISD::FABS) {
14601 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
14602 if (!CRHS)
14603 return SDValue();
14604
14605 const APFloat &APF = CRHS->getValueAPF();
14606 if (APF.isInfinity() && !APF.isNegative()) {
14607 const unsigned IsInfMask = SIInstrFlags::P_INFINITY |
14608 SIInstrFlags::N_INFINITY;
14609 const unsigned IsFiniteMask = SIInstrFlags::N_ZERO |
14610 SIInstrFlags::P_ZERO |
14611 SIInstrFlags::N_NORMAL |
14612 SIInstrFlags::P_NORMAL |
14613 SIInstrFlags::N_SUBNORMAL |
14614 SIInstrFlags::P_SUBNORMAL;
14615 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
14616 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
14617 DAG.getConstant(Mask, SL, MVT::i32));
14618 }
14619 }
14620
14621 return SDValue();
14622}
14623
14624SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
14625 DAGCombinerInfo &DCI) const {
14626 SelectionDAG &DAG = DCI.DAG;
14627 SDLoc SL(N);
14628 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
14629
14630 SDValue Src = N->getOperand(0);
14631 SDValue Shift = N->getOperand(0);
14632
14633 // TODO: Extend type shouldn't matter (assuming legal types).
14634 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
14635 Shift = Shift.getOperand(0);
14636
14637 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
14638 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
14639 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
14640 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
14641 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
14642 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
14643 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
14644 SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0),
14645 SDLoc(Shift.getOperand(0)), MVT::i32);
14646
14647 unsigned ShiftOffset = 8 * Offset;
14648 if (Shift.getOpcode() == ISD::SHL)
14649 ShiftOffset -= C->getZExtValue();
14650 else
14651 ShiftOffset += C->getZExtValue();
14652
14653 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
14654 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
14655 MVT::f32, Shifted);
14656 }
14657 }
14658 }
14659
14660 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14661 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
14662 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
14663 // We simplified Src. If this node is not dead, visit it again so it is
14664 // folded properly.
14665 if (N->getOpcode() != ISD::DELETED_NODE)
14666 DCI.AddToWorklist(N);
14667 return SDValue(N, 0);
14668 }
14669
14670 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
14671 if (SDValue DemandedSrc =
14672 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
14673 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
14674
14675 return SDValue();
14676}
14677
14678SDValue SITargetLowering::performClampCombine(SDNode *N,
14679 DAGCombinerInfo &DCI) const {
14680 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
14681 if (!CSrc)
14682 return SDValue();
14683
14684 const MachineFunction &MF = DCI.DAG.getMachineFunction();
14685 const APFloat &F = CSrc->getValueAPF();
14686 APFloat Zero = APFloat::getZero(F.getSemantics());
14687 if (F < Zero ||
14688 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
14689 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
14690 }
14691
14692 APFloat One(F.getSemantics(), "1.0");
14693 if (F > One)
14694 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
14695
14696 return SDValue(CSrc, 0);
14697}
14698
14699
14700SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
14701 DAGCombinerInfo &DCI) const {
14702 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
14703 return SDValue();
14704 switch (N->getOpcode()) {
14705 case ISD::ADD:
14706 return performAddCombine(N, DCI);
14707 case ISD::SUB:
14708 return performSubCombine(N, DCI);
14709 case ISD::UADDO_CARRY:
14710 case ISD::USUBO_CARRY:
14711 return performAddCarrySubCarryCombine(N, DCI);
14712 case ISD::FADD:
14713 return performFAddCombine(N, DCI);
14714 case ISD::FSUB:
14715 return performFSubCombine(N, DCI);
14716 case ISD::FDIV:
14717 return performFDivCombine(N, DCI);
14718 case ISD::SETCC:
14719 return performSetCCCombine(N, DCI);
14720 case ISD::FMAXNUM:
14721 case ISD::FMINNUM:
14722 case ISD::FMAXNUM_IEEE:
14723 case ISD::FMINNUM_IEEE:
14724 case ISD::FMAXIMUM:
14725 case ISD::FMINIMUM:
14726 case ISD::SMAX:
14727 case ISD::SMIN:
14728 case ISD::UMAX:
14729 case ISD::UMIN:
14730 case AMDGPUISD::FMIN_LEGACY:
14731 case AMDGPUISD::FMAX_LEGACY:
14732 return performMinMaxCombine(N, DCI);
14733 case ISD::FMA:
14734 return performFMACombine(N, DCI);
14735 case ISD::AND:
14736 return performAndCombine(N, DCI);
14737 case ISD::OR:
14738 return performOrCombine(N, DCI);
14739 case ISD::FSHR: {
14740 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14741 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
14742 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14743 return matchPERM(N, DCI);
14744 }
14745 break;
14746 }
14747 case ISD::XOR:
14748 return performXorCombine(N, DCI);
14749 case ISD::ZERO_EXTEND:
14750 return performZeroExtendCombine(N, DCI);
14751 case ISD::SIGN_EXTEND_INREG:
14752 return performSignExtendInRegCombine(N , DCI);
14753 case AMDGPUISD::FP_CLASS:
14754 return performClassCombine(N, DCI);
14755 case ISD::FCANONICALIZE:
14756 return performFCanonicalizeCombine(N, DCI);
14757 case AMDGPUISD::RCP:
14758 return performRcpCombine(N, DCI);
14759 case ISD::FLDEXP:
14760 case AMDGPUISD::FRACT:
14761 case AMDGPUISD::RSQ:
14762 case AMDGPUISD::RCP_LEGACY:
14763 case AMDGPUISD::RCP_IFLAG:
14764 case AMDGPUISD::RSQ_CLAMP: {
14765 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
14766 SDValue Src = N->getOperand(0);
14767 if (Src.isUndef())
14768 return Src;
14769 break;
14770 }
14771 case ISD::SINT_TO_FP:
14772 case ISD::UINT_TO_FP:
14773 return performUCharToFloatCombine(N, DCI);
14774 case ISD::FCOPYSIGN:
14775 return performFCopySignCombine(N, DCI);
14776 case AMDGPUISD::CVT_F32_UBYTE0:
14777 case AMDGPUISD::CVT_F32_UBYTE1:
14778 case AMDGPUISD::CVT_F32_UBYTE2:
14779 case AMDGPUISD::CVT_F32_UBYTE3:
14780 return performCvtF32UByteNCombine(N, DCI);
14781 case AMDGPUISD::FMED3:
14782 return performFMed3Combine(N, DCI);
14783 case AMDGPUISD::CVT_PKRTZ_F16_F32:
14784 return performCvtPkRTZCombine(N, DCI);
14785 case AMDGPUISD::CLAMP:
14786 return performClampCombine(N, DCI);
14787 case ISD::SCALAR_TO_VECTOR: {
14788 SelectionDAG &DAG = DCI.DAG;
14789 EVT VT = N->getValueType(0);
14790
14791 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
14792 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
14793 SDLoc SL(N);
14794 SDValue Src = N->getOperand(0);
14795 EVT EltVT = Src.getValueType();
14796 if (EltVT != MVT::i16)
14797 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
14798
14799 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
14800 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
14801 }
14802
14803 break;
14804 }
14805 case ISD::EXTRACT_VECTOR_ELT:
14806 return performExtractVectorEltCombine(N, DCI);
14807 case ISD::INSERT_VECTOR_ELT:
14808 return performInsertVectorEltCombine(N, DCI);
14809 case ISD::FP_ROUND:
14810 return performFPRoundCombine(N, DCI);
14811 case ISD::LOAD: {
14812 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
14813 return Widened;
14814 [[fallthrough]];
14815 }
14816 default: {
14817 if (!DCI.isBeforeLegalize()) {
14818 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
14819 return performMemSDNodeCombine(MemNode, DCI);
14820 }
14821
14822 break;
14823 }
14824 }
14825
14826 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
14827}
14828
14829/// Helper function for adjustWritemask
14830static unsigned SubIdx2Lane(unsigned Idx) {
14831 switch (Idx) {
14832 default: return ~0u;
14833 case AMDGPU::sub0: return 0;
14834 case AMDGPU::sub1: return 1;
14835 case AMDGPU::sub2: return 2;
14836 case AMDGPU::sub3: return 3;
14837 case AMDGPU::sub4: return 4; // Possible with TFE/LWE
14838 }
14839}
14840
14841/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
14842SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
14843 SelectionDAG &DAG) const {
14844 unsigned Opcode = Node->getMachineOpcode();
14845
14846 // Subtract 1 because the vdata output is not a MachineSDNode operand.
14847 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
14848 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
14849 return Node; // not implemented for D16
14850
14851 SDNode *Users[5] = { nullptr };
14852 unsigned Lane = 0;
14853 unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
14854 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
14855 unsigned NewDmask = 0;
14856 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
14857 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
14858 bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
14859 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
14860 ? true
14861 : false;
14862 unsigned TFCLane = 0;
14863 bool HasChain = Node->getNumValues() > 1;
14864
14865 if (OldDmask == 0) {
14866 // These are folded out, but on the chance it happens don't assert.
14867 return Node;
14868 }
14869
14870 unsigned OldBitsSet = llvm::popcount(OldDmask);
14871 // Work out which is the TFE/LWE lane if that is enabled.
14872 if (UsesTFC) {
14873 TFCLane = OldBitsSet;
14874 }
14875
14876 // Try to figure out the used register components
14877 for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
14878 I != E; ++I) {
14879
14880 // Don't look at users of the chain.
14881 if (I.getUse().getResNo() != 0)
14882 continue;
14883
14884 // Abort if we can't understand the usage
14885 if (!I->isMachineOpcode() ||
14886 I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
14887 return Node;
14888
14889 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
14890 // Note that subregs are packed, i.e. Lane==0 is the first bit set
14891 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
14892 // set, etc.
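// For example, with OldDmask == 0b1010 only two components are returned:
// sub0 (Lane 0) maps to dmask bit 1, the first set bit, and sub1 (Lane 1)
// maps to dmask bit 3.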
14893 Lane = SubIdx2Lane(I->getConstantOperandVal(1));
14894 if (Lane == ~0u)
14895 return Node;
14896
14897 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
14898 if (UsesTFC && Lane == TFCLane) {
14899 Users[Lane] = *I;
14900 } else {
14901 // Set which texture component corresponds to the lane.
14902 unsigned Comp;
14903 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
14904 Comp = llvm::countr_zero(Dmask);
14905 Dmask &= ~(1 << Comp);
14906 }
14907
14908 // Abort if we have more than one user per component.
14909 if (Users[Lane])
14910 return Node;
14911
14912 Users[Lane] = *I;
14913 NewDmask |= 1 << Comp;
14914 }
14915 }
14916
14917 // Don't allow 0 dmask, as hardware assumes one channel enabled.
14918 bool NoChannels = !NewDmask;
14919 if (NoChannels) {
14920 if (!UsesTFC) {
14921 // No uses of the result and not using TFC. Then do nothing.
14922 return Node;
14923 }
14924 // If the original dmask has one channel - then nothing to do
14925 if (OldBitsSet == 1)
14926 return Node;
14927 // Use an arbitrary dmask - required for the instruction to work
14928 NewDmask = 1;
14929 }
14930 // Abort if there's no change
14931 if (NewDmask == OldDmask)
14932 return Node;
14933
14934 unsigned BitsSet = llvm::popcount(NewDmask);
14935
14936 // Check for TFE or LWE - increase the number of channels by one to account
14937 // for the extra return value
14938 // This will need adjustment for D16 if this is also included in
14939 // adjustWriteMask (this function) but at present D16 are excluded.
14940 unsigned NewChannels = BitsSet + UsesTFC;
14941
14942 int NewOpcode =
14943 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
14944 assert(NewOpcode != -1 &&
14945 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
14946 "failed to find equivalent MIMG op");
14947
14948 // Adjust the writemask in the node
14949 SmallVector<SDValue, 12> Ops;
14950 Ops.insert(Ops.end(), Node->op_begin(), Node->op_begin() + DmaskIdx);
14951 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
14952 Ops.insert(Ops.end(), Node->op_begin() + DmaskIdx + 1, Node->op_end());
14953
14954 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
14955
14956 MVT ResultVT = NewChannels == 1 ?
14957 SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
14958 NewChannels == 5 ? 8 : NewChannels);
14959 SDVTList NewVTList = HasChain ?
14960 DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
14961
14962
14963 MachineSDNode *NewNode = DAG.getMachineNode(NewOpcode, SDLoc(Node),
14964 NewVTList, Ops);
14965
14966 if (HasChain) {
14967 // Update chain.
14968 DAG.setNodeMemRefs(NewNode, Node->memoperands());
14969 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
14970 }
14971
14972 if (NewChannels == 1) {
14973 assert(Node->hasNUsesOfValue(1, 0));
14974 SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
14975 SDLoc(Node), Users[Lane]->getValueType(0),
14976 SDValue(NewNode, 0));
14977 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
14978 return nullptr;
14979 }
14980
14981 // Update the users of the node with the new indices
14982 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
14983 SDNode *User = Users[i];
14984 if (!User) {
14985 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
14986 // Users[0] is still nullptr because channel 0 doesn't really have a use.
14987 if (i || !NoChannels)
14988 continue;
14989 } else {
14990 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
14991 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
14992 if (NewUser != User) {
14993 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
14994 DAG.RemoveDeadNode(User);
14995 }
14996 }
14997
14998 switch (Idx) {
14999 default: break;
15000 case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
15001 case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
15002 case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
15003 case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
15004 }
15005 }
15006
15007 DAG.RemoveDeadNode(Node);
15008 return nullptr;
15009}
15010
15012 if (Op.getOpcode() == ISD::AssertZext)
15013 Op = Op.getOperand(0);
15014
15015 return isa<FrameIndexSDNode>(Op);
15016}
15017
15018/// Legalize target independent instructions (e.g. INSERT_SUBREG)
15019/// with frame index operands.
 15020/// LLVM assumes that inputs to these instructions are registers.
15022 SelectionDAG &DAG) const {
15023 if (Node->getOpcode() == ISD::CopyToReg) {
15024 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15025 SDValue SrcVal = Node->getOperand(2);
15026
15027 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
15028 // to try understanding copies to physical registers.
15029 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
15030 SDLoc SL(Node);
15032 SDValue VReg = DAG.getRegister(
15033 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15034
15035 SDNode *Glued = Node->getGluedNode();
15036 SDValue ToVReg
15037 = DAG.getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
15038 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
15039 SDValue ToResultReg
15040 = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
15041 VReg, ToVReg.getValue(1));
15042 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
15043 DAG.RemoveDeadNode(Node);
15044 return ToResultReg.getNode();
15045 }
15046 }
15047
15049 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
15050 if (!isFrameIndexOp(Node->getOperand(i))) {
15051 Ops.push_back(Node->getOperand(i));
15052 continue;
15053 }
15054
15055 SDLoc DL(Node);
15056 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
15057 Node->getOperand(i).getValueType(),
15058 Node->getOperand(i)), 0));
15059 }
15060
15061 return DAG.UpdateNodeOperands(Node, Ops);
15062}
15063
15064/// Fold the instructions after selecting them.
15065/// Returns null if users were already updated.
15067 SelectionDAG &DAG) const {
15069 unsigned Opcode = Node->getMachineOpcode();
15070
15071 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
15072 !TII->isGather4(Opcode) &&
15073 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
15074 return adjustWritemask(Node, DAG);
15075 }
15076
15077 if (Opcode == AMDGPU::INSERT_SUBREG ||
15078 Opcode == AMDGPU::REG_SEQUENCE) {
15080 return Node;
15081 }
15082
15083 switch (Opcode) {
15084 case AMDGPU::V_DIV_SCALE_F32_e64:
15085 case AMDGPU::V_DIV_SCALE_F64_e64: {
15086 // Satisfy the operand register constraint when one of the inputs is
15087 // undefined. Ordinarily each undef value will have its own implicit_def of
15088 // a vreg, so force these to use a single register.
15089 SDValue Src0 = Node->getOperand(1);
15090 SDValue Src1 = Node->getOperand(3);
15091 SDValue Src2 = Node->getOperand(5);
15092
15093 if ((Src0.isMachineOpcode() &&
15094 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
15095 (Src0 == Src1 || Src0 == Src2))
15096 break;
15097
15098 MVT VT = Src0.getValueType().getSimpleVT();
15099 const TargetRegisterClass *RC =
15100 getRegClassFor(VT, Src0.getNode()->isDivergent());
15101
15103 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
15104
15105 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node),
15106 UndefReg, Src0, SDValue());
15107
15108 // src0 must be the same register as src1 or src2, even if the value is
15109 // undefined, so make sure we don't violate this constraint.
15110 if (Src0.isMachineOpcode() &&
15111 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
15112 if (Src1.isMachineOpcode() &&
15113 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15114 Src0 = Src1;
15115 else if (Src2.isMachineOpcode() &&
15116 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
15117 Src0 = Src2;
15118 else {
15119 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
15120 Src0 = UndefReg;
15121 Src1 = UndefReg;
15122 }
15123 } else
15124 break;
15125
15126 SmallVector<SDValue, 9> Ops(Node->op_begin(), Node->op_end());
15127 Ops[1] = Src0;
15128 Ops[3] = Src1;
15129 Ops[5] = Src2;
15130 Ops.push_back(ImpDef.getValue(1));
15131 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
15132 }
15133 default:
15134 break;
15135 }
15136
15137 return Node;
15138}
15139
15140// Any MIMG instructions that use tfe or lwe require an initialization of the
15141// result register that will be written in the case of a memory access failure.
15142// The required code is also added to tie this init code to the result of the
15143// img instruction.
15146 const SIRegisterInfo &TRI = TII->getRegisterInfo();
15147 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
15148 MachineBasicBlock &MBB = *MI.getParent();
15149
15150 int DstIdx =
15151 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
15152 unsigned InitIdx = 0;
15153
15154 if (TII->isImage(MI)) {
15155 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
15156 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
15157 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
15158
15159 if (!TFE && !LWE) // intersect_ray
15160 return;
15161
15162 unsigned TFEVal = TFE ? TFE->getImm() : 0;
15163 unsigned LWEVal = LWE ? LWE->getImm() : 0;
15164 unsigned D16Val = D16 ? D16->getImm() : 0;
15165
15166 if (!TFEVal && !LWEVal)
15167 return;
15168
 15169 // At least one of TFE or LWE is non-zero.
15170 // We have to insert a suitable initialization of the result value and
15171 // tie this to the dest of the image instruction.
15172
15173 // Calculate which dword we have to initialize to 0.
15174 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
15175
 15176 // Check that the dmask operand is found.
15177 assert(MO_Dmask && "Expected dmask operand in instruction");
15178
15179 unsigned dmask = MO_Dmask->getImm();
15180 // Determine the number of active lanes taking into account the
15181 // Gather4 special case
15182 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
15183
15184 bool Packed = !Subtarget->hasUnpackedD16VMem();
15185
15186 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
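    // Worked example (added for exposition, not in the original source): with
    // 3 active lanes and packed D16, InitIdx = ((3 + 1) >> 1) + 1 = 3 dwords
    // (two packed data dwords plus the TFE/LWE status dword); without packed
    // D16 it is 3 + 1 = 4.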
15187
 15188 // Abandon the attempt if the dst size isn't large enough
 15189 // - this is in fact an error, but it is picked up elsewhere and
 15190 // reported correctly.
15191 uint32_t DstSize =
15192 TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15193 if (DstSize < InitIdx)
15194 return;
15195 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
15196 InitIdx = TRI.getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
15197 } else {
15198 return;
15199 }
15200
15201 const DebugLoc &DL = MI.getDebugLoc();
15202
15203 // Create a register for the initialization value.
15204 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
15205 unsigned NewDst = 0; // Final initialized value will be in here
15206
15207 // If PRTStrictNull feature is enabled (the default) then initialize
15208 // all the result registers to 0, otherwise just the error indication
15209 // register (VGPRn+1)
15210 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
15211 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
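  // Example (added for exposition, not in the original source): for a 4-channel
  // dmask plus TFE (InitIdx = 5), PRTStrictNull zero-initializes all 5 dwords
  // starting at dword 0; otherwise only dword 4, the TFE/LWE status dword, is
  // initialized.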
15212
15213 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
15214 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15215 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
15216 // Initialize dword
15217 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
15218 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
15219 .addImm(0);
15220 // Insert into the super-reg
15221 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
15222 .addReg(PrevDst)
15223 .addReg(SubReg)
15225
15226 PrevDst = NewDst;
15227 }
15228
15229 // Add as an implicit operand
15230 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
15231
15232 // Tie the just added implicit operand to the dst
15233 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
15234}
15235
15236/// Assign the register class depending on the number of
15237/// bits set in the writemask
15239 SDNode *Node) const {
15241
15242 MachineFunction *MF = MI.getParent()->getParent();
15245
15246 if (TII->isVOP3(MI.getOpcode())) {
15247 // Make sure constant bus requirements are respected.
15248 TII->legalizeOperandsVOP3(MRI, MI);
15249
15250 // Prefer VGPRs over AGPRs in mAI instructions where possible.
 15251 // This saves a chain-copy of registers and better balances register
 15252 // use between VGPRs and AGPRs, as AGPR tuples tend to be big.
15253 if (!MI.getDesc().operands().empty()) {
15254 unsigned Opc = MI.getOpcode();
15255 bool HasAGPRs = Info->mayNeedAGPRs();
15256 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15257 int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
15258 for (auto I :
15259 {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
15260 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
15261 if (I == -1)
15262 break;
15263 if ((I == Src2Idx) && (HasAGPRs))
15264 break;
15265 MachineOperand &Op = MI.getOperand(I);
15266 if (!Op.isReg() || !Op.getReg().isVirtual())
15267 continue;
15268 auto *RC = TRI->getRegClassForReg(MRI, Op.getReg());
15269 if (!TRI->hasAGPRs(RC))
15270 continue;
15271 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
15272 if (!Src || !Src->isCopy() ||
15273 !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg()))
15274 continue;
15275 auto *NewRC = TRI->getEquivalentVGPRClass(RC);
15276 // All uses of agpr64 and agpr32 can also accept vgpr except for
15277 // v_accvgpr_read, but we do not produce agpr reads during selection,
15278 // so no use checks are needed.
15279 MRI.setRegClass(Op.getReg(), NewRC);
15280 }
15281
15282 if (!HasAGPRs)
15283 return;
15284
15285 // Resolve the rest of AV operands to AGPRs.
15286 if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
15287 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15288 auto *RC = TRI->getRegClassForReg(MRI, Src2->getReg());
15289 if (TRI->isVectorSuperClass(RC)) {
15290 auto *NewRC = TRI->getEquivalentAGPRClass(RC);
15291 MRI.setRegClass(Src2->getReg(), NewRC);
15292 if (Src2->isTied())
15293 MRI.setRegClass(MI.getOperand(0).getReg(), NewRC);
15294 }
15295 }
15296 }
15297 }
15298
15299 return;
15300 }
15301
15302 if (TII->isImage(MI))
15303 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
15304}
15305
15307 uint64_t Val) {
15308 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
15309 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
15310}
15311
15313 const SDLoc &DL,
15314 SDValue Ptr) const {
15316
15317 // Build the half of the subregister with the constants before building the
15318 // full 128-bit register. If we are building multiple resource descriptors,
15319 // this will allow CSEing of the 2-component register.
15320 const SDValue Ops0[] = {
15321 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
15322 buildSMovImm32(DAG, DL, 0),
15323 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15324 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
15325 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)
15326 };
15327
15328 SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
15329 MVT::v2i32, Ops0), 0);
15330
15331 // Combine the constants and the pointer.
15332 const SDValue Ops1[] = {
15333 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15334 Ptr,
15335 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
15336 SubRegHi,
15337 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)
15338 };
15339
15340 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
15341}
15342
15343/// Return a resource descriptor with the 'Add TID' bit enabled
15344/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
15345/// of the resource descriptor) to create an offset, which is added to
15346/// the resource pointer.
15348 SDValue Ptr, uint32_t RsrcDword1,
15349 uint64_t RsrcDword2And3) const {
15350 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
15351 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
15352 if (RsrcDword1) {
15353 PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
15354 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
15355 0);
15356 }
15357
15358 SDValue DataLo = buildSMovImm32(DAG, DL,
15359 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15360 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
15361
15362 const SDValue Ops[] = {
15363 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
15364 PtrLo,
15365 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
15366 PtrHi,
15367 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
15368 DataLo,
15369 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
15370 DataHi,
15371 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)
15372 };
15373
15374 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
15375}
15376
15377//===----------------------------------------------------------------------===//
15378// SI Inline Assembly Support
15379//===----------------------------------------------------------------------===//
15380
15381std::pair<unsigned, const TargetRegisterClass *>
15383 StringRef Constraint,
15384 MVT VT) const {
15385 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
15386
15387 const TargetRegisterClass *RC = nullptr;
15388 if (Constraint.size() == 1) {
15389 const unsigned BitWidth = VT.getSizeInBits();
15390 switch (Constraint[0]) {
15391 default:
15392 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15393 case 's':
15394 case 'r':
15395 switch (BitWidth) {
15396 case 16:
15397 RC = &AMDGPU::SReg_32RegClass;
15398 break;
15399 case 64:
15400 RC = &AMDGPU::SGPR_64RegClass;
15401 break;
15402 default:
15404 if (!RC)
15405 return std::pair(0U, nullptr);
15406 break;
15407 }
15408 break;
15409 case 'v':
15410 switch (BitWidth) {
15411 case 16:
15412 RC = &AMDGPU::VGPR_32RegClass;
15413 break;
15414 default:
15415 RC = TRI->getVGPRClassForBitWidth(BitWidth);
15416 if (!RC)
15417 return std::pair(0U, nullptr);
15418 break;
15419 }
15420 break;
15421 case 'a':
15422 if (!Subtarget->hasMAIInsts())
15423 break;
15424 switch (BitWidth) {
15425 case 16:
15426 RC = &AMDGPU::AGPR_32RegClass;
15427 break;
15428 default:
15429 RC = TRI->getAGPRClassForBitWidth(BitWidth);
15430 if (!RC)
15431 return std::pair(0U, nullptr);
15432 break;
15433 }
15434 break;
15435 }
15436 // We actually support i128, i16 and f16 as inline parameters
15437 // even if they are not reported as legal
15438 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
15439 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
15440 return std::pair(0U, RC);
15441 }
15442
15443 if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
15444 StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
15445 if (RegName.consume_front("v")) {
15446 RC = &AMDGPU::VGPR_32RegClass;
15447 } else if (RegName.consume_front("s")) {
15448 RC = &AMDGPU::SGPR_32RegClass;
15449 } else if (RegName.consume_front("a")) {
15450 RC = &AMDGPU::AGPR_32RegClass;
15451 }
15452
15453 if (RC) {
15454 uint32_t Idx;
15455 if (RegName.consume_front("[")) {
15456 uint32_t End;
15457 bool Failed = RegName.consumeInteger(10, Idx);
15458 Failed |= !RegName.consume_front(":");
15459 Failed |= RegName.consumeInteger(10, End);
15460 Failed |= !RegName.consume_back("]");
15461 if (!Failed) {
15462 uint32_t Width = (End - Idx + 1) * 32;
15463 MCRegister Reg = RC->getRegister(Idx);
15465 RC = TRI->getVGPRClassForBitWidth(Width);
15466 else if (SIRegisterInfo::isSGPRClass(RC))
15467 RC = TRI->getSGPRClassForBitWidth(Width);
15468 else if (SIRegisterInfo::isAGPRClass(RC))
15469 RC = TRI->getAGPRClassForBitWidth(Width);
15470 if (RC) {
15471 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
15472 return std::pair(Reg, RC);
15473 }
15474 }
15475 } else {
15476 bool Failed = RegName.getAsInteger(10, Idx);
15477 if (!Failed && Idx < RC->getNumRegs())
15478 return std::pair(RC->getRegister(Idx), RC);
15479 }
15480 }
15481 }
15482
15483 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
15484 if (Ret.first)
15485 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
15486
15487 return Ret;
15488}
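// Illustrative examples (added for exposition, not in the original source) of
// constraints the hook above resolves, based on the cases shown:
//   "s" with a 64-bit type -> SGPR_64RegClass
//   "v" with a 64-bit type -> TRI->getVGPRClassForBitWidth(64)
//   "{s5}"                 -> (s5, SGPR_32RegClass)
//   "{v[0:1]}"             -> (the 64-bit VGPR tuple whose sub0 is v0,
//                              TRI->getVGPRClassForBitWidth(64))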
15489
15490static bool isImmConstraint(StringRef Constraint) {
15491 if (Constraint.size() == 1) {
15492 switch (Constraint[0]) {
15493 default: break;
15494 case 'I':
15495 case 'J':
15496 case 'A':
15497 case 'B':
15498 case 'C':
15499 return true;
15500 }
15501 } else if (Constraint == "DA" ||
15502 Constraint == "DB") {
15503 return true;
15504 }
15505 return false;
15506}
15507
15510 if (Constraint.size() == 1) {
15511 switch (Constraint[0]) {
15512 default: break;
15513 case 's':
15514 case 'v':
15515 case 'a':
15516 return C_RegisterClass;
15517 }
15518 }
15519 if (isImmConstraint(Constraint)) {
15520 return C_Other;
15521 }
15522 return TargetLowering::getConstraintType(Constraint);
15523}
15524
15525static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
15527 Val = Val & maskTrailingOnes<uint64_t>(Size);
15528 }
15529 return Val;
15530}
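// Example (added for exposition, not in the original source): assuming the
// masking path is taken for sub-64-bit sizes, clearUnusedBits(0xFFFFFFFFFFFF8000,
// 16) yields 0x8000, so a sign-extended 16-bit immediate is compared using only
// its low 16 bits.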
15531
15533 StringRef Constraint,
15534 std::vector<SDValue> &Ops,
15535 SelectionDAG &DAG) const {
15536 if (isImmConstraint(Constraint)) {
15537 uint64_t Val;
15538 if (getAsmOperandConstVal(Op, Val) &&
15539 checkAsmConstraintVal(Op, Constraint, Val)) {
15540 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
15541 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
15542 }
15543 } else {
15544 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
15545 }
15546}
15547
15549 unsigned Size = Op.getScalarValueSizeInBits();
15550 if (Size > 64)
15551 return false;
15552
15553 if (Size == 16 && !Subtarget->has16BitInsts())
15554 return false;
15555
15556 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
15557 Val = C->getSExtValue();
15558 return true;
15559 }
15560 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
15561 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15562 return true;
15563 }
15564 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
15565 if (Size != 16 || Op.getNumOperands() != 2)
15566 return false;
15567 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
15568 return false;
15569 if (ConstantSDNode *C = V->getConstantSplatNode()) {
15570 Val = C->getSExtValue();
15571 return true;
15572 }
15573 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
15574 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
15575 return true;
15576 }
15577 }
15578
15579 return false;
15580}
15581
15583 uint64_t Val) const {
15584 if (Constraint.size() == 1) {
15585 switch (Constraint[0]) {
15586 case 'I':
15588 case 'J':
15589 return isInt<16>(Val);
15590 case 'A':
15591 return checkAsmConstraintValA(Op, Val);
15592 case 'B':
15593 return isInt<32>(Val);
15594 case 'C':
15595 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
15597 default:
15598 break;
15599 }
15600 } else if (Constraint.size() == 2) {
15601 if (Constraint == "DA") {
15602 int64_t HiBits = static_cast<int32_t>(Val >> 32);
15603 int64_t LoBits = static_cast<int32_t>(Val);
15604 return checkAsmConstraintValA(Op, HiBits, 32) &&
15605 checkAsmConstraintValA(Op, LoBits, 32);
15606 }
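    // Illustrative note (added for exposition, not in the original source):
    // "DA" therefore accepts a 64-bit value only when each 32-bit half is
    // independently acceptable under the 'A' (inline constant) check, e.g.
    // 0x0000000200000001, whose halves are the small integers 2 and 1.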
15607 if (Constraint == "DB") {
15608 return true;
15609 }
15610 }
15611 llvm_unreachable("Invalid asm constraint");
15612}
15613
15615 unsigned MaxSize) const {
15616 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
15617 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
15618 if (Size == 16) {
15619 MVT VT = Op.getSimpleValueType();
15620 switch (VT.SimpleTy) {
15621 default:
15622 return false;
15623 case MVT::i16:
15624 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
15625 case MVT::f16:
15626 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
15627 case MVT::bf16:
15628 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
15629 case MVT::v2i16:
15630 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
15631 case MVT::v2f16:
15632 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
15633 case MVT::v2bf16:
15634 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
15635 }
15636 }
15637 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
15638 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
15639 return true;
15640 return false;
15641}
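// For reference (added for exposition, not in the original source; see the
// AMDGPU::isInlinableLiteral* helpers used above for the authoritative rules):
// 32-bit inline literals cover the integers -16..64 plus a small set of FP
// constants such as +/-0.5, +/-1.0, +/-2.0, +/-4.0 and, with HasInv2Pi,
// 1/(2*pi).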
15642
15643static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
15644 switch (UnalignedClassID) {
15645 case AMDGPU::VReg_64RegClassID:
15646 return AMDGPU::VReg_64_Align2RegClassID;
15647 case AMDGPU::VReg_96RegClassID:
15648 return AMDGPU::VReg_96_Align2RegClassID;
15649 case AMDGPU::VReg_128RegClassID:
15650 return AMDGPU::VReg_128_Align2RegClassID;
15651 case AMDGPU::VReg_160RegClassID:
15652 return AMDGPU::VReg_160_Align2RegClassID;
15653 case AMDGPU::VReg_192RegClassID:
15654 return AMDGPU::VReg_192_Align2RegClassID;
15655 case AMDGPU::VReg_224RegClassID:
15656 return AMDGPU::VReg_224_Align2RegClassID;
15657 case AMDGPU::VReg_256RegClassID:
15658 return AMDGPU::VReg_256_Align2RegClassID;
15659 case AMDGPU::VReg_288RegClassID:
15660 return AMDGPU::VReg_288_Align2RegClassID;
15661 case AMDGPU::VReg_320RegClassID:
15662 return AMDGPU::VReg_320_Align2RegClassID;
15663 case AMDGPU::VReg_352RegClassID:
15664 return AMDGPU::VReg_352_Align2RegClassID;
15665 case AMDGPU::VReg_384RegClassID:
15666 return AMDGPU::VReg_384_Align2RegClassID;
15667 case AMDGPU::VReg_512RegClassID:
15668 return AMDGPU::VReg_512_Align2RegClassID;
15669 case AMDGPU::VReg_1024RegClassID:
15670 return AMDGPU::VReg_1024_Align2RegClassID;
15671 case AMDGPU::AReg_64RegClassID:
15672 return AMDGPU::AReg_64_Align2RegClassID;
15673 case AMDGPU::AReg_96RegClassID:
15674 return AMDGPU::AReg_96_Align2RegClassID;
15675 case AMDGPU::AReg_128RegClassID:
15676 return AMDGPU::AReg_128_Align2RegClassID;
15677 case AMDGPU::AReg_160RegClassID:
15678 return AMDGPU::AReg_160_Align2RegClassID;
15679 case AMDGPU::AReg_192RegClassID:
15680 return AMDGPU::AReg_192_Align2RegClassID;
15681 case AMDGPU::AReg_256RegClassID:
15682 return AMDGPU::AReg_256_Align2RegClassID;
15683 case AMDGPU::AReg_512RegClassID:
15684 return AMDGPU::AReg_512_Align2RegClassID;
15685 case AMDGPU::AReg_1024RegClassID:
15686 return AMDGPU::AReg_1024_Align2RegClassID;
15687 default:
15688 return -1;
15689 }
15690}
15691
15692// Figure out which registers should be reserved for stack access. Only after
15693// the function is legalized do we know all of the non-spill stack objects or if
15694// calls are present.
15698 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
15699 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15700 const SIInstrInfo *TII = ST.getInstrInfo();
15701
15702 if (Info->isEntryFunction()) {
15703 // Callable functions have fixed registers used for stack access.
15705 }
15706
15707 // TODO: Move this logic to getReservedRegs()
15708 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
15709 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
15710 Register SReg = ST.isWave32()
15711 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
15712 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
15713 &AMDGPU::SGPR_64RegClass);
15714 Info->setSGPRForEXECCopy(SReg);
15715
15716 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
15717 Info->getStackPtrOffsetReg()));
15718 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
15719 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
15720
15721 // We need to worry about replacing the default register with itself in case
15722 // of MIR testcases missing the MFI.
15723 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
15724 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
15725
15726 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
15727 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
15728
15729 Info->limitOccupancy(MF);
15730
15731 if (ST.isWave32() && !MF.empty()) {
15732 for (auto &MBB : MF) {
15733 for (auto &MI : MBB) {
15734 TII->fixImplicitOperands(MI);
15735 }
15736 }
15737 }
15738
15739 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
15740 // classes if required. Ideally the register class constraints would differ
15741 // per-subtarget, but there's no easy way to achieve that right now. This is
15742 // not a problem for VGPRs because the correctly aligned VGPR class is implied
15743 // from using them as the register class for legal types.
15744 if (ST.needsAlignedVGPRs()) {
15745 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
15746 const Register Reg = Register::index2VirtReg(I);
15747 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
15748 if (!RC)
15749 continue;
15750 int NewClassID = getAlignedAGPRClassID(RC->getID());
15751 if (NewClassID != -1)
15752 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
15753 }
15754 }
15755
15757}
15758
15760 KnownBits &Known,
15761 const APInt &DemandedElts,
15762 const SelectionDAG &DAG,
15763 unsigned Depth) const {
15764 Known.resetAll();
15765 unsigned Opc = Op.getOpcode();
15766 switch (Opc) {
15768 unsigned IID = Op.getConstantOperandVal(0);
15769 switch (IID) {
15770 case Intrinsic::amdgcn_mbcnt_lo:
15771 case Intrinsic::amdgcn_mbcnt_hi: {
15772 const GCNSubtarget &ST =
15774 // These return at most the (wavefront size - 1) + src1
 15775 // As long as src1 is an immediate, we can calculate the known bits.
15776 KnownBits Src1Known = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
15777 unsigned Src1ValBits = Src1Known.countMaxActiveBits();
15778 unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
15779 // Cater for potential carry
15780 MaxActiveBits += Src1ValBits ? 1 : 0;
15781 unsigned Size = Op.getValueType().getSizeInBits();
15782 if (MaxActiveBits < Size)
15783 Known.Zero.setHighBits(Size - MaxActiveBits);
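    // Example (added for exposition, not in the original source): on a wave64
    // target (wavefront size log2 = 6) with src1 known to be zero,
    // MaxActiveBits = 6, so the top 26 bits of a 32-bit result are known zero.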
15784 return;
15785 }
15786 }
15787 break;
15788 }
15789 }
15791 Op, Known, DemandedElts, DAG, Depth);
15792}
15793
15795 const int FI, KnownBits &Known, const MachineFunction &MF) const {
15797
15798 // Set the high bits to zero based on the maximum allowed scratch size per
15799 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
15800 // calculation won't overflow, so assume the sign bit is never set.
15801 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
15802}
15803
15805 KnownBits &Known, unsigned Dim) {
15806 unsigned MaxValue =
15807 ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
15808 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
15809}
15810
15812 GISelKnownBits &KB, Register R, KnownBits &Known, const APInt &DemandedElts,
15813 const MachineRegisterInfo &MRI, unsigned Depth) const {
15814 const MachineInstr *MI = MRI.getVRegDef(R);
15815 switch (MI->getOpcode()) {
15816 case AMDGPU::G_INTRINSIC:
15817 case AMDGPU::G_INTRINSIC_CONVERGENT: {
15818 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
15819 case Intrinsic::amdgcn_workitem_id_x:
15820 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
15821 break;
15822 case Intrinsic::amdgcn_workitem_id_y:
15823 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 1);
15824 break;
15825 case Intrinsic::amdgcn_workitem_id_z:
15826 knownBitsForWorkitemID(*getSubtarget(), KB, Known, 2);
15827 break;
15828 case Intrinsic::amdgcn_mbcnt_lo:
15829 case Intrinsic::amdgcn_mbcnt_hi: {
15830 // These return at most the wavefront size - 1.
15831 unsigned Size = MRI.getType(R).getSizeInBits();
15832 Known.Zero.setHighBits(Size - getSubtarget()->getWavefrontSizeLog2());
15833 break;
15834 }
15835 case Intrinsic::amdgcn_groupstaticsize: {
15836 // We can report everything over the maximum size as 0. We can't report
15837 // based on the actual size because we don't know if it's accurate or not
15838 // at any given point.
15839 Known.Zero.setHighBits(
15840 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
15841 break;
15842 }
15843 }
15844 break;
15845 }
15846 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
15847 Known.Zero.setHighBits(24);
15848 break;
15849 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
15850 Known.Zero.setHighBits(16);
15851 break;
15852 case AMDGPU::G_AMDGPU_SMED3:
15853 case AMDGPU::G_AMDGPU_UMED3: {
15854 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
15855
15856 KnownBits Known2;
15857 KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
15858 if (Known2.isUnknown())
15859 break;
15860
15861 KnownBits Known1;
15862 KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
15863 if (Known1.isUnknown())
15864 break;
15865
15866 KnownBits Known0;
15867 KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
15868 if (Known0.isUnknown())
15869 break;
15870
15871 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
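    // Note added for exposition (not in the original source): med3 always
    // returns one of its three operands, so a bit can only be known if it is
    // known identically in all three inputs; hence the intersection below.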
15872 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
15873 Known.One = Known0.One & Known1.One & Known2.One;
15874 break;
15875 }
15876 }
15877}
15878
15881 unsigned Depth) const {
15882 const MachineInstr *MI = MRI.getVRegDef(R);
15883 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
15884 // FIXME: Can this move to generic code? What about the case where the call
15885 // site specifies a lower alignment?
15886 Intrinsic::ID IID = GI->getIntrinsicID();
15888 AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
15889 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
15890 return *RetAlign;
15891 }
15892 return Align(1);
15893}
15894
15897 const Align CacheLineAlign = Align(64);
15898
 15899 // Pre-GFX10 targets did not benefit from loop alignment
15900 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
15901 getSubtarget()->hasInstFwdPrefetchBug())
15902 return PrefAlign;
15903
 15904 // On GFX10 the I$ consists of 4 x 64-byte cache lines.
 15905 // By default the prefetcher keeps one cache line behind and reads two ahead.
 15906 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
 15907 // behind and one ahead.
 15908 // Therefore we can benefit from aligning loop headers if the loop fits in 192 bytes.
 15909 // If the loop fits in 64 bytes it always spans no more than two cache lines and
 15910 // does not need an alignment.
 15911 // Otherwise, if the loop is at most 128 bytes, we do not need to modify the prefetch;
 15912 // if it is at most 192 bytes, we need two lines behind.
15913
15915 const MachineBasicBlock *Header = ML->getHeader();
15916 if (Header->getAlignment() != PrefAlign)
15917 return Header->getAlignment(); // Already processed.
15918
15919 unsigned LoopSize = 0;
15920 for (const MachineBasicBlock *MBB : ML->blocks()) {
 15921 // If an inner loop block is aligned, assume on average half of the alignment
 15922 // size is added as nops.
15923 if (MBB != Header)
15924 LoopSize += MBB->getAlignment().value() / 2;
15925
15926 for (const MachineInstr &MI : *MBB) {
15927 LoopSize += TII->getInstSizeInBytes(MI);
15928 if (LoopSize > 192)
15929 return PrefAlign;
15930 }
15931 }
15932
15933 if (LoopSize <= 64)
15934 return PrefAlign;
15935
15936 if (LoopSize <= 128)
15937 return CacheLineAlign;
15938
 15939 // If any of the parent loops is surrounded by prefetch instructions do not
 15940 // insert new ones for the inner loop, which would reset the parent's settings.
15941 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
15942 if (MachineBasicBlock *Exit = P->getExitBlock()) {
15943 auto I = Exit->getFirstNonDebugInstr();
15944 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
15945 return CacheLineAlign;
15946 }
15947 }
15948
15949 MachineBasicBlock *Pre = ML->getLoopPreheader();
15950 MachineBasicBlock *Exit = ML->getExitBlock();
15951
15952 if (Pre && Exit) {
15953 auto PreTerm = Pre->getFirstTerminator();
15954 if (PreTerm == Pre->begin() ||
15955 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
15956 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15957 .addImm(1); // prefetch 2 lines behind PC
15958
15959 auto ExitHead = Exit->getFirstNonDebugInstr();
15960 if (ExitHead == Exit->end() ||
15961 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
15962 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
15963 .addImm(2); // prefetch 1 line behind PC
15964 }
15965
15966 return CacheLineAlign;
15967}
15968
15970static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
15971 assert(N->getOpcode() == ISD::CopyFromReg);
15972 do {
15973 // Follow the chain until we find an INLINEASM node.
15974 N = N->getOperand(0).getNode();
15975 if (N->getOpcode() == ISD::INLINEASM ||
15976 N->getOpcode() == ISD::INLINEASM_BR)
15977 return true;
15978 } while (N->getOpcode() == ISD::CopyFromReg);
15979 return false;
15980}
15981
15984 UniformityInfo *UA) const {
15985 switch (N->getOpcode()) {
15986 case ISD::CopyFromReg: {
15987 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
15988 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
15989 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
15990 Register Reg = R->getReg();
15991
15992 // FIXME: Why does this need to consider isLiveIn?
15993 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
15994 return !TRI->isSGPRReg(MRI, Reg);
15995
15996 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
15997 return UA->isDivergent(V);
15998
16000 return !TRI->isSGPRReg(MRI, Reg);
16001 }
16002 case ISD::LOAD: {
16003 const LoadSDNode *L = cast<LoadSDNode>(N);
16004 unsigned AS = L->getAddressSpace();
16005 // A flat load may access private memory.
16007 }
16008 case ISD::CALLSEQ_END:
16009 return true;
16011 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
16013 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
16035 // Target-specific read-modify-write atomics are sources of divergence.
16036 return true;
16037 default:
16038 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
16039 // Generic read-modify-write atomics are sources of divergence.
16040 return A->readMem() && A->writeMem();
16041 }
16042 return false;
16043 }
16044}
16045
16047 EVT VT) const {
16048 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
16049 case MVT::f32:
16051 case MVT::f64:
16052 case MVT::f16:
16054 default:
16055 return false;
16056 }
16057}
16058
16060 LLT Ty, const MachineFunction &MF) const {
16061 switch (Ty.getScalarSizeInBits()) {
16062 case 32:
16063 return !denormalModeIsFlushAllF32(MF);
16064 case 64:
16065 case 16:
16066 return !denormalModeIsFlushAllF64F16(MF);
16067 default:
16068 return false;
16069 }
16070}
16071
16073 const SelectionDAG &DAG,
16074 bool SNaN,
16075 unsigned Depth) const {
16076 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
16077 const MachineFunction &MF = DAG.getMachineFunction();
16079
16080 if (Info->getMode().DX10Clamp)
16081 return true; // Clamped to 0.
16082 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
16083 }
16084
16086 SNaN, Depth);
16087}
16088
16089#if 0
16090// FIXME: This should be checked before unsafe fp atomics are enabled
16091// Global FP atomic instructions have a hardcoded FP mode and do not support
16092// FP32 denormals, and only support v2f16 denormals.
16093static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
16095 auto DenormMode = RMW->getParent()->getParent()->getDenormalMode(Flt);
16096 if (&Flt == &APFloat::IEEEsingle())
16097 return DenormMode == DenormalMode::getPreserveSign();
16098 return DenormMode == DenormalMode::getIEEE();
16099}
16100#endif
16101
16102// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
16103// floating point atomic instructions. May generate more efficient code,
16104// but may not respect rounding and denormal modes, and may give incorrect
16105// results for certain memory destinations.
16107 return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
16108 "true";
16109}
16110
16112 LLVMContext &Ctx = RMW->getContext();
16114 Ctx.getSyncScopeNames(SSNs);
16115 StringRef MemScope = SSNs[RMW->getSyncScopeID()].empty()
16116 ? "system"
16117 : SSNs[RMW->getSyncScopeID()];
16118
16119 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
16120 << "Hardware instruction generated for atomic "
16121 << RMW->getOperationName(RMW->getOperation())
16122 << " operation at memory scope " << MemScope;
16123}
16124
16127 unsigned AS = RMW->getPointerAddressSpace();
16128 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
16130
16131 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
16133 ORE.emit([=]() {
16134 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
16135 });
16136 return Kind;
16137 };
16138
16139 auto SSID = RMW->getSyncScopeID();
16140 bool HasSystemScope =
16141 SSID == SyncScope::System ||
16142 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
16143
16144 switch (RMW->getOperation()) {
16145 case AtomicRMWInst::Sub:
16146 case AtomicRMWInst::Or:
16147 case AtomicRMWInst::Xor: {
16148 // Atomic sub/or/xor do not work over PCI express, but atomic add
 16149 // does. InstCombine transforms these with a 0 operand into an or, so undo that.
16150 if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
16151 if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
16152 ConstVal && ConstVal->isNullValue())
16154 }
16155
16156 break;
16157 }
16158 case AtomicRMWInst::FAdd: {
16159 Type *Ty = RMW->getType();
16160
16161 // TODO: Handle REGION_ADDRESS
16162 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
16163 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
16164 // is fixed to round-to-nearest-even.
16165 //
16166 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
16167 // round-to-nearest-even.
16168 //
16169 // We ignore the rounding mode problem, even in strictfp. The C++ standard
16170 // suggests it is OK if the floating-point mode may not match the calling
16171 // thread.
16172 if (Ty->isFloatTy()) {
16175 }
16176
16177 if (Ty->isDoubleTy()) {
16178 // Ignores denormal mode, but we don't consider flushing mandatory.
16181 }
16182
16183 // TODO: Handle v2f16/v2bf16 cases for gfx940
16185 }
16186
16190
16191 // TODO: gfx940 supports v2f16 and v2bf16
16192 if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
16194
16197
16198 // Always expand system scope fp atomics.
16199 if (HasSystemScope)
16201
16202 // global and flat atomic fadd f64: gfx90a, gfx940.
16203 if (Subtarget->hasGFX90AInsts() && Ty->isDoubleTy())
16204 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16205
16206 if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16207 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16208 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16209 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16210 // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16211 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16212 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16213 }
16214
16215 // flat atomic fadd f32: gfx940, gfx11+.
16216 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16217 if (Subtarget->hasFlatAtomicFaddF32Inst())
16218 return ReportUnsafeHWInst(AtomicExpansionKind::None);
16219
 16220 // If it is in the flat address space and the type is float, we will try to
 16221 // expand it if the target supports both global and LDS atomic fadd. The
 16222 // reason we need both is that, in the expansion, we emit a check of the
 16223 // address space: if it is in the global address space, we emit the global
 16224 // atomic fadd; if it is in the shared address space, we emit the LDS atomic fadd.
16225 if (Subtarget->hasLDSFPAtomicAddF32()) {
16226 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16228 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16230 }
16231 }
16232
16234 }
16237 case AtomicRMWInst::Min:
16238 case AtomicRMWInst::Max:
16240 case AtomicRMWInst::UMax: {
16243 if (RMW->getType()->isFloatTy() &&
16246
16247 // Always expand system scope min/max atomics.
16248 if (HasSystemScope)
16250 }
16251 break;
16252 }
16253 default:
16254 break;
16255 }
16256
16258}
16259
16265}
16266
16269 return SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
16272}
16273
16279}
16280
16281const TargetRegisterClass *
16282SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
16284 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
16285 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16286 return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
16287 : &AMDGPU::SReg_32RegClass;
16288 if (!TRI->isSGPRClass(RC) && !isDivergent)
16289 return TRI->getEquivalentSGPRClass(RC);
16290 else if (TRI->isSGPRClass(RC) && isDivergent)
16291 return TRI->getEquivalentVGPRClass(RC);
16292
16293 return RC;
16294}
16295
16296// FIXME: This is a workaround for DivergenceAnalysis not understanding always
16297// uniform values (as produced by the mask results of control flow intrinsics)
16298// used outside of divergent blocks. The phi users need to also be treated as
16299// always uniform.
16300//
16301// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
16302static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
16303 unsigned WaveSize) {
16304 // FIXME: We assume we never cast the mask results of a control flow
16305 // intrinsic.
16306 // Early exit if the type won't be consistent as a compile time hack.
16307 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
16308 if (!IT || IT->getBitWidth() != WaveSize)
16309 return false;
16310
16311 if (!isa<Instruction>(V))
16312 return false;
16313 if (!Visited.insert(V).second)
16314 return false;
16315 bool Result = false;
16316 for (const auto *U : V->users()) {
16317 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16318 if (V == U->getOperand(1)) {
16319 switch (Intrinsic->getIntrinsicID()) {
16320 default:
16321 Result = false;
16322 break;
16323 case Intrinsic::amdgcn_if_break:
16324 case Intrinsic::amdgcn_if:
16325 case Intrinsic::amdgcn_else:
16326 Result = true;
16327 break;
16328 }
16329 }
16330 if (V == U->getOperand(0)) {
16331 switch (Intrinsic->getIntrinsicID()) {
16332 default:
16333 Result = false;
16334 break;
16335 case Intrinsic::amdgcn_end_cf:
16336 case Intrinsic::amdgcn_loop:
16337 Result = true;
16338 break;
16339 }
16340 }
16341 } else {
16342 Result = hasCFUser(U, Visited, WaveSize);
16343 }
16344 if (Result)
16345 break;
16346 }
16347 return Result;
16348}
16349
16351 const Value *V) const {
16352 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
16353 if (CI->isInlineAsm()) {
16354 // FIXME: This cannot give a correct answer. This should only trigger in
16355 // the case where inline asm returns mixed SGPR and VGPR results, used
16356 // outside the defining block. We don't have a specific result to
 16357 // consider, so this assumes that if any value is an SGPR, the overall
 16358 // register also needs to be an SGPR.
16359 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
16361 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
16362 for (auto &TC : TargetConstraints) {
16363 if (TC.Type == InlineAsm::isOutput) {
16366 SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
16367 if (RC && SIRI->isSGPRClass(RC))
16368 return true;
16369 }
16370 }
16371 }
16372 }
16374 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
16375}
16376
16378 SDNode::use_iterator I = N->use_begin(), E = N->use_end();
16379 for (; I != E; ++I) {
16380 if (MemSDNode *M = dyn_cast<MemSDNode>(*I)) {
16381 if (getBasePtrIndex(M) == I.getOperandNo())
16382 return true;
16383 }
16384 }
16385 return false;
16386}
16387
16389 SDValue N1) const {
16390 if (!N0.hasOneUse())
16391 return false;
16392 // Take care of the opportunity to keep N0 uniform
16393 if (N0->isDivergent() || !N1->isDivergent())
16394 return true;
16395 // Check if we have a good chance to form the memory access pattern with the
16396 // base and offset
16397 return (DAG.isBaseWithConstantOffset(N0) &&
16398 hasMemSDNodeUser(*N0->use_begin()));
16399}
16400
16402 Register N0, Register N1) const {
16403 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
16404}
16405
16408 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
16410 if (I.getMetadata("amdgpu.noclobber"))
16411 Flags |= MONoClobber;
16412 if (I.getMetadata("amdgpu.last.use"))
16413 Flags |= MOLastUse;
16414 return Flags;
16415}
16416
16418 SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI,
16419 const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const {
16420 if (User->getOpcode() != ISD::CopyToReg)
16421 return false;
16422 if (!Def->isMachineOpcode())
16423 return false;
16424 MachineSDNode *MDef = dyn_cast<MachineSDNode>(Def);
16425 if (!MDef)
16426 return false;
16427
16428 unsigned ResNo = User->getOperand(Op).getResNo();
16429 if (User->getOperand(Op)->getValueType(ResNo) != MVT::i1)
16430 return false;
16431 const MCInstrDesc &II = TII->get(MDef->getMachineOpcode());
16432 if (II.isCompare() && II.hasImplicitDefOfPhysReg(AMDGPU::SCC)) {
16433 PhysReg = AMDGPU::SCC;
16434 const TargetRegisterClass *RC =
16435 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
16436 Cost = RC->getCopyCost();
16437 return true;
16438 }
16439 return false;
16440}
16441
16444
16447 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
16448 assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
16449 "this cannot be replaced with add");
16451 return;
16452 }
16453
16454 assert(Subtarget->hasAtomicFaddInsts() &&
16455 "target should have atomic fadd instructions");
16456 assert(AI->getType()->isFloatTy() &&
16458 "generic atomicrmw expansion only supports FP32 operand in flat "
16459 "address space");
16460 assert(Op == AtomicRMWInst::FAdd && "only fadd is supported for now");
16461
16462 // Given: atomicrmw fadd ptr %addr, float %val ordering
16463 //
16464 // With this expansion we produce the following code:
16465 // [...]
16466 // br label %atomicrmw.check.shared
16467 //
16468 // atomicrmw.check.shared:
16469 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
16470 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
16471 //
16472 // atomicrmw.shared:
16473 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
16474 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
16475 // float %val ordering
16476 // br label %atomicrmw.phi
16477 //
16478 // atomicrmw.check.private:
16479 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
16480 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
16481 //
16482 // atomicrmw.private:
16483 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
16484 // %loaded.private = load float, ptr addrspace(5) %cast.private
16485 // %val.new = fadd float %loaded.private, %val
16486 // store float %val.new, ptr addrspace(5) %cast.private
16487 // br label %atomicrmw.phi
16488 //
16489 // atomicrmw.global:
16490 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
16491 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
16492 // float %val ordering
16493 // br label %atomicrmw.phi
16494 //
16495 // atomicrmw.phi:
16496 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
16497 // [ %loaded.private, %atomicrmw.private ],
16498 // [ %loaded.global, %atomicrmw.global ]
16499 // br label %atomicrmw.end
16500 //
16501 // atomicrmw.end:
16502 // [...]
16503
16504 IRBuilder<> Builder(AI);
16505 LLVMContext &Ctx = Builder.getContext();
16506
16507 BasicBlock *BB = Builder.GetInsertBlock();
16508 Function *F = BB->getParent();
16509 BasicBlock *ExitBB =
16510 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
16511 BasicBlock *CheckSharedBB =
16512 BasicBlock::Create(Ctx, "atomicrmw.check.shared", F, ExitBB);
16513 BasicBlock *SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
16514 BasicBlock *CheckPrivateBB =
16515 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
16516 BasicBlock *PrivateBB =
16517 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
16518 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
16519 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
16520
16521 Value *Val = AI->getValOperand();
16522 Type *ValTy = Val->getType();
16523 Value *Addr = AI->getPointerOperand();
16524
16525 auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr,
16526 Value *Val) -> Value * {
16527 AtomicRMWInst *OldVal =
16528 Builder.CreateAtomicRMW(AI->getOperation(), Addr, Val, AI->getAlign(),
16529 AI->getOrdering(), AI->getSyncScopeID());
16531 AI->getAllMetadata(MDs);
16532 for (auto &P : MDs)
16533 OldVal->setMetadata(P.first, P.second);
16534 return OldVal;
16535 };
16536
16537 std::prev(BB->end())->eraseFromParent();
16538 Builder.SetInsertPoint(BB);
16539 Builder.CreateBr(CheckSharedBB);
16540
16541 Builder.SetInsertPoint(CheckSharedBB);
16542 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
16543 {Addr}, nullptr, "is.shared");
16544 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16545
16546 Builder.SetInsertPoint(SharedBB);
16547 Value *CastToLocal = Builder.CreateAddrSpaceCast(
16549 Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
16550 Builder.CreateBr(PhiBB);
16551
16552 Builder.SetInsertPoint(CheckPrivateBB);
16553 CallInst *IsPrivate = Builder.CreateIntrinsic(
16554 Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
16555 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
16556
16557 Builder.SetInsertPoint(PrivateBB);
16558 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
16560 Value *LoadedPrivate =
16561 Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private");
16562 Value *NewVal = Builder.CreateFAdd(LoadedPrivate, Val, "val.new");
16563 Builder.CreateStore(NewVal, CastToPrivate);
16564 Builder.CreateBr(PhiBB);
16565
16566 Builder.SetInsertPoint(GlobalBB);
16567 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
16569 Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
16570 Builder.CreateBr(PhiBB);
16571
16572 Builder.SetInsertPoint(PhiBB);
16573 PHINode *Loaded = Builder.CreatePHI(ValTy, 3, "loaded.phi");
16574 Loaded->addIncoming(LoadedShared, SharedBB);
16575 Loaded->addIncoming(LoadedPrivate, PrivateBB);
16576 Loaded->addIncoming(LoadedGlobal, GlobalBB);
16577 Builder.CreateBr(ExitBB);
16578
16579 AI->replaceAllUsesWith(Loaded);
16580 AI->eraseFromParent();
16581}
16582
16583LoadInst *
16585 IRBuilder<> Builder(AI);
16586 auto Order = AI->getOrdering();
16587
 16588 // The optimization removes the store aspect of the atomicrmw. Therefore, the
 16589 // cache must be flushed if the atomic ordering had release semantics. This is
 16590 // not necessarily a fence; a release fence just happens to do that flush.
 16591 // Avoid replacing an atomicrmw that has release semantics.
16592 if (isReleaseOrStronger(Order))
16593 return nullptr;
16594
16595 LoadInst *LI = Builder.CreateAlignedLoad(
16596 AI->getType(), AI->getPointerOperand(), AI->getAlign());
16597 LI->setAtomic(Order, AI->getSyncScopeID());
16598 LI->copyMetadata(*AI);
16599 LI->takeName(AI);
16600 AI->replaceAllUsesWith(LI);
16601 AI->eraseFromParent();
16602 return LI;
16603}
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getIdxEn(SDValue VIndex)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
raw_pwrite_stream & OS
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
LLVM IR instance of the generic uniformity analysis.
Value * RHS
Value * LHS
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool hasMadMacF32Insts() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition: APFloat.h:988
opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition: APFloat.cpp:5196
bool isNegative() const
Definition: APFloat.h:1295
APInt bitcastToAPInt() const
Definition: APFloat.h:1210
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition: APFloat.h:1006
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition: APFloat.h:966
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition: APFloat.h:957
bool isInfinity() const
Definition: APFloat.h:1292
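A minimal sketch of how the APFloat factory helpers above are commonly used when materializing floating-point constants. The f32 semantics and the particular values are illustrative choices for this example, not taken from this file.

#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

static void apFloatSketch() {
  const fltSemantics &Sem = APFloat::IEEEsingle();
  APFloat QNaN = APFloat::getQNaN(Sem);                 // canonical quiet NaN
  APFloat NegInf = APFloat::getInf(Sem, /*Negative=*/true);
  APFloat MaxF32 = APFloat::getLargest(Sem);            // largest finite f32
  assert(NegInf.isNegative() && NegInf.isInfinity());
  APInt Bits = MaxF32.bitcastToAPInt();                 // raw 32-bit pattern
  (void)Bits;
  (void)QNaN;
}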
Class for arbitrary precision integers.
Definition: APInt.h:76
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition: APInt.h:1370
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition: APInt.h:236
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition: APInt.h:444
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition: APInt.h:1589
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition: APInt.h:274
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition: APInt.h:1215
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition: APInt.h:1199
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:539
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:684
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:867
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:760
@ Add
*p = old + v
Definition: Instructions.h:764
@ FAdd
*p = old + v
Definition: Instructions.h:785
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:778
@ Or
*p = old | v
Definition: Instructions.h:772
@ Sub
*p = old - v
Definition: Instructions.h:766
@ Xor
*p = old ^ v
Definition: Instructions.h:774
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:776
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:782
@ FMin
*p = minnum(old, v). minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:796
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:780
@ FMax
*p = maxnum(old, v). maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:792
Value * getPointerOperand()
Definition: Instructions.h:910
void setOperation(BinOp Operation)
Definition: Instructions.h:861
BinOp getOperation() const
Definition: Instructions.h:845
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
Definition: Instructions.h:901
Value * getValOperand()
Definition: Instructions.h:914
static StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
Definition: Instructions.h:887
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:918
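As an illustration of how the AtomicRMWInst accessors above are typically queried, here is a sketch in the spirit of a shouldExpandAtomicRMWInIR-style check; the address-space parameter is a hypothetical example value, and the real hook performs many more checks.

#include "llvm/IR/Instructions.h"
using namespace llvm;

static bool isNonVolatileFAddInAS(const AtomicRMWInst *RMW,
                                  unsigned ExampleAS) {
  // ExampleAS is a hypothetical address-space number used only for
  // illustration; it is not this target's actual numbering.
  return RMW->getOperation() == AtomicRMWInst::FAdd &&
         RMW->getPointerAddressSpace() == ExampleAS &&
         !RMW->isVolatile();
}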
This is an SDNode representing atomic operations.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
MemoryEffects getMemoryEffects() const
Returns memory effects of the function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
iterator end()
Definition: BasicBlock.h:443
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:199
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:570
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
BitVector & set()
Definition: BitVector.h:351
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
Definition: ByteProvider.h:30
static ByteProvider getConstantZero()
Definition: ByteProvider.h:73
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
Definition: ByteProvider.h:66
std::optional< ISelOp > Src
Definition: ByteProvider.h:57
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
bool isRegLoc() const
Register getLocReg() const
LocInfo getLocInfo() const
bool isMemLoc() const
int64_t getLocMemOffset() const
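The CCState/CCValAssign pair above is normally driven by a loop like the one sketched below, a simplified fragment of a LowerFormalArguments-style routine; AssignFn stands in for whatever CCAssignFnForCall returns and is an assumption of this example.

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static void analyzeFormalsSketch(SelectionDAG &DAG, CallingConv::ID CallConv,
                                 bool IsVarArg,
                                 const SmallVectorImpl<ISD::InputArg> &Ins,
                                 CCAssignFn *AssignFn) {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeFormalArguments(Ins, AssignFn);
  for (const CCValAssign &VA : ArgLocs) {
    if (VA.isRegLoc())
      (void)VA.getLocReg();        // argument arrives in a physical register
    else if (VA.isMemLoc())
      (void)VA.getLocMemOffset();  // argument arrives at a stack offset
  }
}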
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1742
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
Definition: InstrTypes.h:1828
bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1687
unsigned arg_size() const
Definition: InstrTypes.h:1685
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ ICMP_NE
not equal
Definition: InstrTypes.h:1015
bool isSigned() const
Definition: InstrTypes.h:1265
bool isFPPredicate() const
Definition: InstrTypes.h:1122
bool isIntPredicate() const
Definition: InstrTypes.h:1123
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:205
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition: Constant.h:41
bool isNullValue() const
Return true if this is the value that would be returned by getNullValue.
Definition: Constants.cpp:90
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:865
bool isBigEndian() const
Definition: DataLayout.h:239
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
A debug info location.
Definition: DebugLoc.h:33
Diagnostic information for unsupported feature in backend.
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Definition: DerivedTypes.h:103
Type * getParamType(unsigned i) const
Parameter type accessors.
Definition: DerivedTypes.h:135
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:202
iterator_range< arg_iterator > args()
Definition: Function.h:842
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:264
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:358
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition: Function.cpp:744
bool hasPrefetch() const
Definition: GCNSubtarget.h:895
bool hasD16Images() const
Definition: GCNSubtarget.h:690
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:468
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
Definition: GCNSubtarget.h:459
bool hasDot7Insts() const
Definition: GCNSubtarget.h:789
bool hasApertureRegs() const
Definition: GCNSubtarget.h:588
bool hasFlatInstOffsets() const
Definition: GCNSubtarget.h:618
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasDLInsts() const
Definition: GCNSubtarget.h:759
bool hasBCNT(unsigned Size) const
Definition: GCNSubtarget.h:402
bool hasMAIInsts() const
Definition: GCNSubtarget.h:809
bool hasMultiDwordFlatScratchAddressing() const
Definition: GCNSubtarget.h:670
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
Definition: GCNSubtarget.h:518
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
Definition: GCNSubtarget.h:576
const SIInstrInfo * getInstrInfo() const override
Definition: GCNSubtarget.h:257
bool hasDot1Insts() const
Definition: GCNSubtarget.h:765
bool hasAtomicFaddRtnInsts() const
Definition: GCNSubtarget.h:831
Align getStackAlignment() const
Definition: GCNSubtarget.h:908
bool hasScalarSubwordLoads() const
Definition: GCNSubtarget.h:446
bool enableFlatScratch() const
Definition: GCNSubtarget.h:643
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
Definition: GCNSubtarget.h:614
bool supportsGetDoorbellID() const
Definition: GCNSubtarget.h:452
bool hasFlatAtomicFaddF32Inst() const
Definition: GCNSubtarget.h:847
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:269
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasMad64_32() const
Definition: GCNSubtarget.h:735
bool useDS128() const
Definition: GCNSubtarget.h:528
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
Definition: GCNSubtarget.h:448
const SIFrameLowering * getFrameLowering() const override
Definition: GCNSubtarget.h:261
bool hasUnalignedScratchAccess() const
Definition: GCNSubtarget.h:580
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
Definition: GCNSubtarget.h:418
bool hasIntClamp() const
Definition: GCNSubtarget.h:348
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
Definition: GCNSubtarget.h:999
bool hasGFX940Insts() const
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:368
bool isTrapHandlerEnabled() const
Definition: GCNSubtarget.h:592
bool hasLDSFPAtomicAddF64() const
Definition: GCNSubtarget.h:969
bool hasFlatGlobalInsts() const
Definition: GCNSubtarget.h:622
bool getScalarizeGlobalBehavior() const
Definition: GCNSubtarget.h:921
bool hasScalarSMulU64() const
Definition: GCNSubtarget.h:724
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
Definition: GCNSubtarget.h:327
bool hasShaderCyclesHiLoRegisters() const
Definition: GCNSubtarget.h:875
bool hasFFBL() const
Definition: GCNSubtarget.h:406
bool hasNSAEncoding() const
bool hasSMemRealTime() const
Definition: GCNSubtarget.h:940
bool usePRTStrictNull() const
Definition: GCNSubtarget.h:550
bool hasMed3_16() const
Definition: GCNSubtarget.h:414
bool hasMovrel() const
Definition: GCNSubtarget.h:944
bool hasBFI() const
Definition: GCNSubtarget.h:394
bool hasUnalignedBufferAccessEnabled() const
Definition: GCNSubtarget.h:568
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:335
bool hasImageGather4D16Bug() const
bool supportsMinMaxDenormModes() const
Definition: GCNSubtarget.h:513
bool hasFFBH() const
Definition: GCNSubtarget.h:410
bool hasAtomicFaddInsts() const
Definition: GCNSubtarget.h:827
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicFaddNoRtnInsts() const
Definition: GCNSubtarget.h:833
bool hasScalarDwordx3Loads() const
Definition: GCNSubtarget.h:958
bool hasLDSFPAtomicAddF32() const
Definition: GCNSubtarget.h:968
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
Definition: GCNSubtarget.h:538
bool hasDot8Insts() const
Definition: GCNSubtarget.h:793
bool hasDS96AndDS128() const
Definition: GCNSubtarget.h:533
bool useFlatForGlobal() const
Definition: GCNSubtarget.h:522
Generation getGeneration() const
Definition: GCNSubtarget.h:308
bool hasScalarAddSub64() const
Definition: GCNSubtarget.h:722
bool hasUnpackedD16VMem() const
Definition: GCNSubtarget.h:726
bool hasAddr64() const
Definition: GCNSubtarget.h:372
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
Definition: GCNSubtarget.h:422
bool hasPackedTID() const
bool hasAddNoCarry() const
Definition: GCNSubtarget.h:718
bool hasFractBug() const
Definition: GCNSubtarget.h:386
bool hasGDS() const
bool hasBFE() const
Definition: GCNSubtarget.h:390
bool hasGWSAutoReplay() const
Definition: GCNSubtarget.h:705
bool hasKernargSegmentPtr() const
bool hasPrivateSegmentBuffer() const
bool hasImplicitBufferPtr() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
Definition: GlobalValue.h:511
unsigned getAddressSpace() const
Definition: GlobalValue.h:205
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
Type * getValueType() const
Definition: GlobalValue.h:296
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1807
Value * CreateFAdd(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1533
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:175
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2397
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1120
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1790
LLVMContext & getContext() const
Definition: IRBuilder.h:176
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1803
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1854
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1114
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2132
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
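To show how the IRBuilder entry points above fit together, the sketch below casts a pointer to another address space and emits an atomicrmw fadd; the destination address space, alignment, and ordering are illustrative assumptions rather than this target's actual policy.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static AtomicRMWInst *emitCastedFAddSketch(IRBuilderBase &Builder, Value *Ptr,
                                           Value *Val, unsigned DestAS) {
  // DestAS is a hypothetical address-space number chosen by the caller.
  PointerType *DestTy = PointerType::get(Builder.getContext(), DestAS);
  Value *Cast = Builder.CreateAddrSpaceCast(Ptr, DestTy, "as.cast");
  return Builder.CreateAtomicRMW(AtomicRMWInst::FAdd, Cast, Val, Align(4),
                                 AtomicOrdering::SequentiallyConsistent,
                                 SyncScope::System);
}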
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
Definition: Instruction.h:341
const BasicBlock * getParent() const
Definition: Instruction.h:152
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
Definition: Instruction.cpp:87
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1635
void getAllMetadata(SmallVectorImpl< std::pair< unsigned, MDNode * > > &MDs) const
Get all metadata attached to this Instruction.
Definition: Instruction.h:377
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
Definition: DerivedTypes.h:40
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
constexpr unsigned getScalarSizeInBits() const
Definition: LowLevelType.h:267
constexpr bool isScalar() const
Definition: LowLevelType.h:146
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
Definition: LowLevelType.h:42
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
Definition: LowLevelType.h:57
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
Definition: LowLevelType.h:193
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
Definition: LowLevelType.h:221
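A quick illustration of the LLT helpers documented above; the address-space number and bit widths are arbitrary example values.

#include "llvm/CodeGenTypes/LowLevelType.h"
#include <cassert>
using namespace llvm;

static void lltSketch() {
  LLT S32 = LLT::scalar(32);
  LLT P1 = LLT::pointer(/*AddressSpace=*/1, /*SizeInBits=*/64);
  assert(S32.isScalar() && S32.getScalarSizeInBits() == 32);
  (void)P1.getSizeInBits();            // total size: 64 bits
  LLT S16 = S32.changeElementSize(16); // a scalar stays scalar, now 16 bits
  assert(S16.getScalarSizeInBits() == 16);
}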
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
void getSyncScopeNames(SmallVectorImpl< StringRef > &SSNs) const
getSyncScopeNames - Populates client supplied SmallVector with synchronization scope names registered...
An instruction for reading from memory.
Definition: Instructions.h:184
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:286
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:266
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Definition: MCInstrDesc.h:198
bool isCompare() const
Return true if this instruction is a comparison.
Definition: MCInstrDesc.h:341
bool hasImplicitDefOfPhysReg(unsigned Reg, const MCRegisterInfo *MRI=nullptr) const
Return true if this instruction implicitly defines the specified physical register.
Definition: MCInstrDesc.cpp:32
Wrapper class representing physical registers. Should be passed by value.
Definition: MCRegister.h:33
Metadata node.
Definition: Metadata.h:1067
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:585
bool bitsLT(MVT VT) const
Return true if this has fewer bits than VT.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the vector's number of elements is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
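A compact illustration of the MVT queries above; the chosen vector type is just an example.

#include "llvm/CodeGenTypes/MachineValueType.h"
#include <cassert>
using namespace llvm;

static void mvtSketch() {
  MVT V4F32 = MVT::getVectorVT(MVT::f32, 4);
  assert(V4F32.isVector() && V4F32.getVectorNumElements() == 4);
  assert(V4F32.getScalarType() == MVT::f32);
  (void)V4F32.getSizeInBits();   // 128 bits in total
  (void)V4F32.getStoreSize();    // 16 bytes written by a store
  MVT I16 = MVT::getIntegerVT(16);
  assert(I16.bitsLT(MVT::i32));  // i16 has fewer bits than i32
}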
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and updates PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
Align getAlignment() const
Return alignment of the basic block.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
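The MachineInstrBuilder calls above are usually chained off BuildMI, roughly as in the sketch below; the opcode and destination register are caller-supplied here and not tied to any particular SI instruction.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

static MachineInstr *buildImmMoveSketch(MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator I,
                                        const DebugLoc &DL,
                                        const TargetInstrInfo &TII,
                                        unsigned Opcode, Register DstReg,
                                        int64_t Imm) {
  // BuildMI returns a MachineInstrBuilder; addImm appends an immediate
  // operand and the builder converts implicitly to MachineInstr *.
  return BuildMI(MBB, I, DL, TII.get(Opcode), DstReg).addImm(Imm);
}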
Representation of each machine instruction.
Definition: MachineInstr.h:69
const MachineOperand & getOperand(unsigned i) const
Definition: MachineInstr.h:568
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value,.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
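For the MachineMemOperand flags listed above, a sketch of allocating a load MMO through MachineFunction::getMachineMemOperand follows; the empty MachinePointerInfo, 32-bit memory type, and 4-byte alignment are illustrative placeholders.

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

static MachineMemOperand *makeLoadMMOSketch(MachineFunction &MF) {
  return MF.getMachineMemOperand(
      MachinePointerInfo(),
      MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable,
      LLT::scalar(32), Align(4));
}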
MachineOperand class - Representation of each machine instruction operand.
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition: ModRef.h:198
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition: ModRef.h:192
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition: ModRef.h:195
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1827
Register getReg() const
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition: Register.h:84
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition: Register.h:95
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
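The SDValue/SDNode accessors above are typically combined into small matchers like the sketch below, which recognizes a left shift by a constant amount (a generic pattern, not one specific to this file).

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

static bool matchShlByConstant(SDValue V, uint64_t &ShAmt) {
  if (V.getOpcode() != ISD::SHL)
    return false;
  auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1));
  if (!Amt)
    return false;
  ShAmt = Amt->getZExtValue();  // constant shift amount, zero-extended
  return true;
}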
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined with others to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isMemOpUniform(const SDNode *N) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
Definition: SelectionDAG.h:225
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
Definition: SelectionDAG.h:722
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
Definition: SelectionDAG.h:954
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
Definition: SelectionDAG.h:551
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const Pass * getPass() const
Definition: SelectionDAG.h:470
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
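The following is a minimal illustrative sketch (not code from this file) of how getSetCC pairs with the getSelect helper listed above when lowering a comparison-driven select. The name buildUMin is hypothetical, and using MVT::i1 as the condition type is an assumption; a real target would usually query its SETCC result type instead.

// Hypothetical helper: unsigned minimum of two i32 values via SETCC + SELECT.
SDValue buildUMin(SelectionDAG &DAG, const SDLoc &DL, SDValue A, SDValue B) {
  SDValue Cmp = DAG.getSetCC(DL, MVT::i1, A, B, ISD::SETULT);
  return DAG.getSelect(DL, MVT::i32, Cmp, A, B);
}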
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
Definition: SelectionDAG.h:478
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
Definition: SelectionDAG.h:828
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
Definition: SelectionDAG.h:472
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
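As a hedged sketch of the getLoad/getStore helpers above: memory nodes carry a token chain, and the load's output chain (result value #1) must be threaded into the store. The function name copyWord and the fixed Align(4) are illustrative assumptions, not code from this file.

// Hypothetical sketch: load an i32 through Chain and store it elsewhere,
// threading the load's output chain (value #1) into the store.
SDValue copyWord(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
                 SDValue SrcPtr, SDValue DstPtr,
                 MachinePointerInfo SrcInfo, MachinePointerInfo DstInfo) {
  SDValue Val = DAG.getLoad(MVT::i32, DL, Chain, SrcPtr, SrcInfo);
  return DAG.getStore(Val.getValue(1), DL, Val, DstPtr, DstInfo, Align(4));
}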
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
Definition: SelectionDAG.h:473
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
Definition: SelectionDAG.h:773
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
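A small illustrative sketch of the general getNode builder combined with getConstant from the same listing; addImm42 is a hypothetical helper name and the i32 type is assumed.

// Hypothetical sketch: materialize (X + 42) as an i32 node.
SDValue addImm42(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
  SDValue C = DAG.getConstant(42, DL, MVT::i32);
  return DAG.getNode(ISD::ADD, DL, MVT::i32, X, C);
}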
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
Definition: SelectionDAG.h:676
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
Definition: SelectionDAG.h:469
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
Definition: SelectionDAG.h:799
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
Definition: SelectionDAG.h:845
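A minimal sketch of getBuildVector and getSplatBuildVector from the entries above; scaledIota is a hypothetical name and Scale is assumed to already be an i32 value.

// Hypothetical sketch: build <4 x i32> {0,1,2,3} and a splat of Scale,
// then multiply them element-wise.
SDValue scaledIota(SelectionDAG &DAG, const SDLoc &DL, SDValue Scale) {
  SDValue Elts[] = {DAG.getConstant(0, DL, MVT::i32),
                    DAG.getConstant(1, DL, MVT::i32),
                    DAG.getConstant(2, DL, MVT::i32),
                    DAG.getConstant(3, DL, MVT::i32)};
  SDValue Iota = DAG.getBuildVector(MVT::v4i32, DL, Elts);
  SDValue Splat = DAG.getSplatBuildVector(MVT::v4i32, DL, Scale);
  return DAG.getNode(ISD::MUL, DL, MVT::v4i32, Iota, Splat);
}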
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
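A short illustrative sketch of the getSExtOrTrunc/getZExtOrTrunc pair listed above; toI64 is a hypothetical name and the target width of i64 is an assumption.

// Hypothetical: coerce V to i64, choosing sign or zero extension by flag.
SDValue toI64(SelectionDAG &DAG, const SDLoc &DL, SDValue V, bool IsSigned) {
  return IsSigned ? DAG.getSExtOrTrunc(V, DL, MVT::i64)
                  : DAG.getZExtOrTrunc(V, DL, MVT::i64);
}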
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
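A hedged sketch showing how computeKnownBits and MaskedValueIsZero answer the same kind of question; signBitKnownZero is a hypothetical helper name.

// Hypothetical query: is the sign bit of Op known to be zero?
bool signBitKnownZero(SelectionDAG &DAG, SDValue Op) {
  KnownBits Known = DAG.computeKnownBits(Op);
  if (Known.countMinLeadingZeros() >= 1)
    return true;
  // Equivalent formulation via MaskedValueIsZero:
  unsigned BW = Op.getScalarValueSizeInBits();
  return DAG.MaskedValueIsZero(Op, APInt::getSignMask(BW));
}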
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
Definition: SelectionDAG.h:485
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
Definition: SelectionDAG.h:560
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
Definition: SelectionDAG.h:554
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
Definition: SmallPtrSet.h:427
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
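An illustrative (not normative) use of the SmallVector operations listed above; smallVectorDemo is a hypothetical function written only to show the calls.

#include "llvm/ADT/SmallVector.h"

void smallVectorDemo() {
  llvm::SmallVector<int, 8> V;      // inline storage for 8 elements
  V.push_back(1);
  V.append({2, 3, 4});              // append an initializer list
  V.resize(6);                      // grow, value-initializing new elements
  bool E = V.empty();               // false
  size_t N = V.size();              // 6
  (void)E; (void)N;
}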
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:317
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition: StringRef.h:845
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Definition: StringRef.h:257
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
Definition: StringRef.h:131
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:269
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
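A minimal sketch of the StringRef queries and StringSwitch dispatch documented above; classifyName and its string cases are hypothetical and chosen only for illustration.

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

unsigned classifyName(llvm::StringRef Name) {
  if (Name.starts_with("llvm.") && Name.ends_with(".x"))
    return 0;
  return llvm::StringSwitch<unsigned>(Name)
      .Case("alpha", 1)
      .Case("beta", 2)
      .Default(~0u);
}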
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction in which the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
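A minimal sketch of how the configuration hooks above (setOperationAction, setTruncStoreAction, setTargetDAGCombine, setSchedulingPreference) are typically driven from a target's lowering constructor. MyTargetLowering, configureSketch, and the specific opcode/type choices are hypothetical and illustrative only.

void MyTargetLowering::configureSketch() {
  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTargetDAGCombine({ISD::ADD, ISD::FADD});
  setSchedulingPreference(Sched::RegPressure);
}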
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn a load of a vector type into loads of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
OSType getOS() const
Get the parsed operating system type of this triple.
Definition: Triple.h:382
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:342
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:154
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition: Type.h:157
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition: Type.h:246
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
iterator_range< use_iterator > uses()
Definition: Value.h:376
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
constexpr bool isZero() const
Definition: TypeSize.h:156
self_iterator getIterator()
Definition: ilist_node.h:109
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: Lint.cpp:86
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:415
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the inline values reserved for floating-point constants.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:422
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
Definition: CallingConv.h:197
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ MaxID
The highest possible ID. Must be some 2^k - 1.
Definition: CallingConv.h:271
@ AMDGPU_Gfx
Used for AMD graphics targets.
Definition: CallingConv.h:232
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:249
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
Definition: CallingConv.h:245
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
Definition: CallingConv.h:194
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:751
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition: ISDOpcodes.h:237
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
Definition: ISDOpcodes.h:1129
@ CTLZ_ZERO_UNDEF
Definition: ISDOpcodes.h:724
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition: ISDOpcodes.h:44
@ SET_FPENV
Sets the current floating-point environment.
Definition: ISDOpcodes.h:1005
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition: ISDOpcodes.h:251
@ ATOMIC_LOAD_NAND
Definition: ISDOpcodes.h:1276
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition: ISDOpcodes.h:560
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:715
@ ConstantFP
Definition: ISDOpcodes.h:77
@ ATOMIC_LOAD_MAX
Definition: ISDOpcodes.h:1278
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to "store atomic" instruction.
Definition: ISDOpcodes.h:1248
@ ATOMIC_LOAD_UMIN
Definition: ISDOpcodes.h:1279
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition: ISDOpcodes.h:488
@ FMAXNUM_IEEE
Definition: ISDOpcodes.h:986
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:240
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
Definition: ISDOpcodes.h:1038
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition: ISDOpcodes.h:784
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:484
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition: ISDOpcodes.h:199
@ RETURNADDR
Definition: ISDOpcodes.h:95
@ GlobalAddress
Definition: ISDOpcodes.h:78
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
Definition: ISDOpcodes.h:1261
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:791
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition: ISDOpcodes.h:544
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:391
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition: ISDOpcodes.h:914
@ FPTRUNC_ROUND
Definition: ISDOpcodes.h:481
@ ATOMIC_LOAD_OR
Definition: ISDOpcodes.h:1274
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:904
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition: ISDOpcodes.h:230
@ ATOMIC_LOAD_XOR
Definition: ISDOpcodes.h:1275
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
Definition: ISDOpcodes.h:940
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
Definition: ISDOpcodes.h:1407
@ ATOMIC_LOAD_FADD
Definition: ISDOpcodes.h:1281
@ SET_ROUNDING
Set rounding mode.
Definition: ISDOpcodes.h:886
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:775
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition: ISDOpcodes.h:621
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
Definition: ISDOpcodes.h:1195
@ BR
Control flow instructions. These all have token chains.
Definition: ISDOpcodes.h:1054
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition: ISDOpcodes.h:723
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
Definition: ISDOpcodes.h:1228
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:995
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:931
@ BR_CC
BR_CC - Conditional branch.
Definition: ISDOpcodes.h:1084
@ ATOMIC_LOAD_MIN
Definition: ISDOpcodes.h:1277
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:501
@ IS_FPCLASS
Performs a check of a floating-point class property, as defined by IEEE-754.
Definition: ISDOpcodes.h:508
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:350
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:728
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
Definition: ISDOpcodes.h:1244
@ UNDEF
UNDEF - An undefined node.
Definition: ISDOpcodes.h:212
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition: ISDOpcodes.h:223
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition: ISDOpcodes.h:209
@ GET_ROUNDING
Returns the current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition: ISDOpcodes.h:881
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:652
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
Definition: ISDOpcodes.h:1023
@ GET_FPENV
Gets the current floating-point environment.
Definition: ISDOpcodes.h:1000
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:706
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition: ISDOpcodes.h:601
@ ATOMIC_LOAD_AND
Definition: ISDOpcodes.h:1272
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition: ISDOpcodes.h:574
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
Definition: ISDOpcodes.h:985
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition: ISDOpcodes.h:536
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition: ISDOpcodes.h:203
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:781
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
Definition: ISDOpcodes.h:1218
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition: ISDOpcodes.h:743
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
Definition: ISDOpcodes.h:1255
@ ATOMIC_LOAD_UMAX
Definition: ISDOpcodes.h:1280
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:972
@ SMULO
Same for multiplication.
Definition: ISDOpcodes.h:332
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
Definition: ISDOpcodes.h:1048
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition: ISDOpcodes.h:799
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:675
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:889
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition: ISDOpcodes.h:304
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
Definition: ISDOpcodes.h:1104
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
Definition: ISDOpcodes.h:923
@ ATOMIC_LOAD_UDEC_WRAP
Definition: ISDOpcodes.h:1286
@ ATOMIC_LOAD_ADD
Definition: ISDOpcodes.h:1270
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition: ISDOpcodes.h:466
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:991
@ ATOMIC_LOAD_SUB
Definition: ISDOpcodes.h:1271
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:837
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
Definition: ISDOpcodes.h:1189
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:471
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:681
@ TRAP
TRAP - Trapping instruction.
Definition: ISDOpcodes.h:1215
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition: ISDOpcodes.h:184
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition: ISDOpcodes.h:525
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition: ISDOpcodes.h:52
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
Definition: ISDOpcodes.h:1269
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
Definition: ISDOpcodes.h:945
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:870
@ STRICT_FLDEXP
Definition: ISDOpcodes.h:415
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition: ISDOpcodes.h:908
@ INLINEASM
INLINEASM - Represents an inline asm block.
Definition: ISDOpcodes.h:1101
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:787
@ BRCOND
BRCOND - Conditional branch.
Definition: ISDOpcodes.h:1077
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition: ISDOpcodes.h:764
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition: ISDOpcodes.h:61
@ ATOMIC_LOAD_UINC_WRAP
Definition: ISDOpcodes.h:1285
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:494
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:341
@ AssertZext
Definition: ISDOpcodes.h:62
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition: ISDOpcodes.h:192
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition: ISDOpcodes.h:516
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
Definition: ISDOpcodes.h:1530
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
Definition: ISDOpcodes.h:1510
StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
Definition: Function.cpp:1029
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
Offsets
Offsets in bytes from the start of the input buffer.
Definition: SIInstrInfo.h:1542
@ System
Synchronized with respect to all concurrently executing threads.
Definition: LLVMContext.h:57
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
constexpr double inv_pi
Definition: MathExtras.h:38
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
@ Offset
Definition: DWP.cpp:456
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition: Analysis.cpp:233
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition: MathExtras.h:228
int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition: bit.h:385
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:428
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
@ Done
Definition: Threading.h:61
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition: bit.h:317
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2073
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:372
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant bit, stopping at the first 1.
Definition: bit.h:215
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition: MathExtras.h:269
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition: SIInstrInfo.h:41
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant bit, stopping at the first 1.
Definition: bit.h:281
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition: MathExtras.h:138
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition: Analysis.cpp:199
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition: MathExtras.h:143
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ AfterLegalizeDAG
Definition: DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition: DAGCombine.h:18
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
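An illustrative snippet for a few of the math and alignment helpers referenced in this listing (isPowerOf2_32, Log2_32, divideCeil, alignTo); mathHelpersDemo is a hypothetical function and the expected results are noted in comments.

#include <cstdint>
#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"

void mathHelpersDemo() {
  bool P = llvm::isPowerOf2_32(64);                 // true
  unsigned L = llvm::Log2_32(64);                   // 6
  uint64_t C = llvm::divideCeil(10, 4);             // 3
  uint64_t A = llvm::alignTo(10, llvm::Align(8));   // 16
  (void)P; (void)L; (void)C; (void)A;
}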
DWARFExpression::Operation Op
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
Definition: VE.h:375
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition: MathExtras.h:219
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
@ DS_Warning
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition: SIInstrInfo.h:45
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition: Alignment.h:212
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t that is less than or equal to Value and is equal to Skew modulo Align.
Definition: MathExtras.h:439
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
SDValue SrcOp
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static const fltSemantics & IEEEsingle() LLVM_READNONE
Definition: APFloat.cpp:249
static constexpr roundingMode rmNearestTiesToEven
Definition: APFloat.h:230
static const fltSemantics & IEEEhalf() LLVM_READNONE
Definition: APFloat.cpp:247
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
uint64_t value() const
This is a hole in the type system and should not be abused.
Definition: Alignment.h:85
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition: SCCPSolver.h:41
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition: ValueTypes.h:34
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition: ValueTypes.h:380
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition: ValueTypes.h:120
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
Definition: ValueTypes.h:290
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition: ValueTypes.h:146
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition: ValueTypes.h:233
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-two number of elements.
Definition: ValueTypes.h:455
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:628
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition: ValueTypes.h:397
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
bool isVector() const
Return true if this is a vector value type.
Definition: ValueTypes.h:167
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition: ValueTypes.h:313
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition: ValueTypes.h:246
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:202
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:156
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition: ValueTypes.h:151
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
unsigned getOrigArgIndex() const
bool isUnknown() const
Returns true if we don't know any bits.
Definition: KnownBits.h:63
void resetAll()
Resets the known state of all bits.
Definition: KnownBits.h:71
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition: KnownBits.h:292
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition: KnownBits.h:244
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals