#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"

    "amdgpu-disable-loop-alignment",
    cl::desc("Do not align and prefetch loops"),

    "amdgpu-use-divergent-register-indexing",
    cl::desc("Use indirect register addressing for divergent indexes"),
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
      {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
       MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
       MVT::i1, MVT::v32i32},

      {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
       MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
       MVT::i1, MVT::v32i32},

      {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1},
      Expand);

      {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
       MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
       MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},

      {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
       MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
       MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},

      {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
       MVT::v3i16, MVT::v4i16, MVT::Other},

      {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64},
      Expand);
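// Value-type lists used by the constructor when registering operation actions;
// the trailing Expand/Custom/Legal literal is the action applied to every type
// in the preceding list.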
      {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
       MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
       MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
       MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
       MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
       MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
       MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
       MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
      {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},

      {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
       MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},

      {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

      {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
       MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
       MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
       MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

      {MVT::f32, MVT::f64}, Legal);

      {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
       MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
       MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {

      {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);

      {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

      {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

      {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
       MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

      {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
       MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},

  for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})

  for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})

      {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},

      {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
       MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
       MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
       MVT::v32f16, MVT::v32bf16},

      {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);

      {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

      {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
       MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8},

      {MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
       MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
       MVT::i16, MVT::i8, MVT::i128},

      {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
       MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
                                           EVT DestVT, EVT SrcVT) const {

                                           LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&

    return (ScalarVT == MVT::bf16 ? MVT::i32 : MVT::v2f16);

  return VT.isInteger() ? MVT::i32 : MVT::f32;

      return (NumElts + 1) / 2;

    return NumElts * ((Size + 31) / 32);
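// Vector type breakdown for calling conventions: 16-bit elements are packed in
// pairs (v2i16/v2f16/v2bf16 per 32-bit register), sub-dword scalars are widened
// to i16 or i32, and anything larger is split into 32-bit pieces.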
    EVT VT, EVT &IntermediateVT,
    unsigned &NumIntermediates, MVT &RegisterVT) const {

    if (ScalarVT == MVT::bf16) {
      RegisterVT = MVT::i32;
      IntermediateVT = MVT::v2bf16;
      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
      IntermediateVT = RegisterVT;
    NumIntermediates = (NumElts + 1) / 2;
    return NumIntermediates;

      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

    if (Size < 16 && Subtarget->has16BitInsts()) {
      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

    RegisterVT = MVT::i32;
    IntermediateVT = RegisterVT;
    NumIntermediates = NumElts * ((Size + 31) / 32);
    return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
  assert(MaxNumLanes != 0);

  if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

  auto *ST = dyn_cast<StructType>(Ty);
  assert(ST->getNumContainedTypes() == 2 &&
         ST->getContainedType(1)->isIntegerTy(32));

          DL.getPointerSizeInBits(AS) == 192)
          DL.getPointerSizeInBits(AS) == 160) ||
          DL.getPointerSizeInBits(AS) == 192))
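// getTgtMemIntrinsic: describe the memory behaviour of AMDGPU intrinsics
// (memory type, pointer value, read/write flags) so the DAG builder can attach
// the right machine memory operands.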
                                          unsigned IntrID) const {

  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

    if (RsrcIntr->IsImage)

    if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
      Info.ptrVal = RsrcArg;

      unsigned MaxNumLanes = 4;

      if (RsrcIntr->IsImage) {

      if (RsrcIntr->IsImage) {
        unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();

  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {

  case Intrinsic::amdgcn_buffer_atomic_fadd: {

    if (!Vol || !Vol->isZero())

  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;

  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {

  case Intrinsic::amdgcn_global_atomic_csub: {

  case Intrinsic::amdgcn_image_bvh_intersect_ray: {

  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {

  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128: {

  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {

    Info.memVT = MVT::i32;

    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)

  case Intrinsic::amdgcn_global_load_lds: {

    unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();

  case Intrinsic::amdgcn_ds_bvh_stack_rtn: {

    Info.memVT = MVT::i32;

  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();

                                            Type *&AccessTy) const {

  case Intrinsic::amdgcn_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmax:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_flat_atomic_fadd:
  case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_flat_atomic_fmax:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_csub:
  case Intrinsic::amdgcn_global_atomic_fadd:
  case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_global_atomic_fmax:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_global_load_lds:
bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,

    return AM.BaseOffs == 0 && AM.Scale == 0;

  return AM.Scale == 0 &&
                                        AM.BaseOffs, AddrSpace, FlatVariant));

    return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {

  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))

  if (AM.HasBaseReg) {

    return isLegalMUBUFAddressingMode(AM);

  if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)

                             : isLegalMUBUFAddressingMode(AM);
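// Misaligned-access checks: DS accesses of 64, 96 and 128 bits report a speed
// proportional to the access width when they meet the required alignment, and
// fall back to a 32-bit rating otherwise.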
    unsigned Size, unsigned AddrSpace, Align Alignment,

        Alignment < RequiredAlignment)

      RequiredAlignment = Align(4);

        *IsFast = (Alignment >= RequiredAlignment) ? 64
                  : (Alignment < Align(4))         ? 32

        *IsFast = (Alignment >= RequiredAlignment) ? 96
                  : (Alignment < Align(4))         ? 32

      RequiredAlignment = Align(8);

        *IsFast = (Alignment >= RequiredAlignment) ? 128
                  : (Alignment < Align(4))         ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;

    return AlignedBy4 ||

    bool AlignedBy4 = Alignment >= Align(4);
      *IsFast = AlignedBy4;

    return Alignment >= Align(4) ||

  return Size >= 32 && Alignment >= Align(4);

    unsigned *IsFast) const {
                                            Alignment, Flags, IsFast);

  if (Op.size() >= 16 &&

  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                           unsigned DestAS) const {

  return TM.isNoopAddrSpaceCast(SrcAS, DestAS);

  const MemSDNode *MemNode = cast<MemSDNode>(N);

                                      unsigned Index) const {
  std::tie(InputPtrReg, RC, ArgTy) =

      MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);

                                           const SDLoc &SL) const {

                                         const SDLoc &SL) const {

  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())

      Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(

    int64_t OffsetDiff = Offset - AlignDownOffset;

    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);

    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);

      ExtType, SL, VA.getLocVT(), Chain, FIN,

    Reg = &WorkGroupIDX;
    RC = &AMDGPU::SReg_32RegClass;
    Reg = &WorkGroupIDY;
    RC = &AMDGPU::SReg_32RegClass;
    Reg = &WorkGroupIDZ;
    RC = &AMDGPU::SReg_32RegClass;

  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {

           "vector type argument should have been split");

      bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);

             "unexpected vector split in ps argument type");

        Info->markPSInputAllocated(PSInputNum);
        Info->markPSInputEnabled(PSInputNum);
  if (Info.hasWorkItemIDX()) {
        Info.hasWorkItemIDY()) ? 0x3ff : ~0u;

  if (Info.hasWorkItemIDY()) {
    unsigned Reg = AMDGPU::VGPR1;

  if (Info.hasWorkItemIDZ()) {
    unsigned Reg = AMDGPU::VGPR2;

  if (RegIdx == ArgVGPRs.size()) {

  unsigned Reg = ArgVGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

                         unsigned NumArgRegs) {

  if (RegIdx == ArgSGPRs.size())

  unsigned Reg = ArgSGPRs[RegIdx];
  assert(Reg != AMDGPU::NoRegister);

  assert(Reg != AMDGPU::NoRegister);

  const unsigned Mask = 0x3ff;

  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);

  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);

  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;

  if (Info.hasImplicitArgPtr())

  if (Info.hasWorkGroupIDX())

  if (Info.hasWorkGroupIDY())

  if (Info.hasWorkGroupIDZ())

  if (Info.hasLDSKernelId())

    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
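// Kernel-argument preloading: walk the explicit arguments in order and assign
// user SGPRs to those marked 'inreg'; the sequence stops at the first argument
// that does not fit contiguously in the remaining user SGPRs.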
  unsigned LastExplicitArgOffset =
  bool InPreloadSequence = true;

  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    int ArgIdx = Arg.getArgNo();

    if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
                               (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];

      unsigned ArgOffset = ArgLoc.getLocMemOffset();

      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;

      if (PaddingSGPRs + NumAllocSGPRs + 1 >
        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);

          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;

  if (Info.hasLDSKernelId()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
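// System SGPRs (workgroup IDs, workgroup info, private segment wave byte
// offset) are added as live-ins and allocated after the user SGPRs.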
                                            bool IsShader) const {

  assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

  unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();

  unsigned NumRequiredSystemSGPRs =
      Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
      Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
  for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDY()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDZ()) {
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {
    unsigned PrivateSegmentWaveByteOffsetReg;

      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);

      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

         Info.getNumPreloadedSGPRs() >= 16);
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.enableFlatScratch()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {

      Info.setScratchRSrcReg(PrivateSegmentBufferReg);

      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);

      Info.setScratchRSrcReg(ReservedBufferReg);

    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);

      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
        if (!MRI.isLiveIn(Reg)) {
          Info.setStackPtrOffsetReg(Reg);

  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);

  return !Info->isEntryFunction();

  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;

    Entry->addLiveIn(*I);

    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)
        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());

           !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
           !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());

            !Info->hasWorkGroupIDZ());

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))

  } else if (IsKernel) {

    Splits.append(Ins.begin(), Ins.end());

  } else if (!IsGraphics) {

  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {

    if (IsEntryFunc && VA.isMemLoc()) {

      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {

          int64_t OffsetDiff = Offset - AlignDownOffset;

              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];

          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;

          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);

                                 TRI->getRegSizeInBits(*RC)));

            for (auto Reg : PreloadRegs) {

                                           PreloadRegs.size()),

          NewArg = convertArgType(DAG, VT, MemVT, DL, CMemVT,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);

          dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));

    } else if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

    if (AMDGPU::VGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::VGPR_32RegClass;
    else if (AMDGPU::SGPR_32RegClass.contains(Reg))
      RC = &AMDGPU::SGPR_32RegClass;

    auto &ArgUsageInfo =

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain :
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
    if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {

    SDValue Arg = OutVals[RealRVLocIdx];

  if (!Info->isEntryFunction()) {

      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))

  return DAG.getNode(Opc, DL, MVT::Other, RetOps);

  for (unsigned i = 0; i != RVLocs.size(); ++i) {

    auto &ArgUsageInfo =
    CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
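// For each implicit input the callee expects (workgroup IDs, implicit argument
// pointer, LDS kernel id, workitem IDs), match the callee's expected register
// with the caller's incoming argument; the three workitem IDs are merged into a
// single packed VGPR.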
    std::tie(OutgoingArg, ArgRC, ArgTy) =

    std::tie(IncomingArg, IncomingArgRC, Ty) =
    assert(IncomingArgRC == ArgRC);

    EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;

      InputReg = getImplicitArgPtr(DAG, DL);

      std::optional<uint32_t> Id =
      if (Id.has_value()) {

    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);

    unsigned SpecialArgOffset =

  std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =
    std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

    InputReg = InputReg.getNode() ?
    InputReg = InputReg.getNode() ?

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {

          IncomingArgX ? *IncomingArgX
                       : IncomingArgY ? *IncomingArgY : *IncomingArgZ, ~0u);

    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  if (IsChainCallConv) {

    RequestedExec = CLI.Args.back();
    assert(RequestedExec.Node && "No node for EXEC");

    assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
    CLI.Outs.pop_back();

      assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
      CLI.Outs.pop_back();

           "Haven't popped all the pieces of the EXEC mask");

  bool IsSibCall = false;

  for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)

                              "unsupported call to variadic function ");

                                "unsupported required tail call to function ");

        Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);

           "site marked musttail or on llvm.amdgcn.cs.chain");
  if (!TailCallOpt && IsTailCall)

  if (!IsSibCall || IsChainCallConv) {

    RegsToPass.emplace_back(IsChainCallConv
                                ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  MVT PtrVT = MVT::i32;

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {

      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));

      int32_t Offset = LocMemOffset;

      unsigned OpSize = Flags.isByVal() ?

                              ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
                                  Outs[i].Flags.getNonZeroByValAlign(),

            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

  for (auto &RegToPass : RegsToPass) {
                             RegToPass.second, InGlue);

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops;
  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (IsChainCallConv)
    Ops.push_back(RequestedExec.Node);

  for (auto &RegToPass : RegsToPass) {
                                  RegToPass.second.getValueType()));

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

  Ops.push_back(InGlue);

                                     DL, MVT::Glue, Token),

    return DAG.getNode(OPC, DL, NodeTys, Ops);

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
  EVT VT = Op.getValueType();

  MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();

  Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize);
  if (Alignment && *Alignment > StackAlign) {

  if (isa<ConstantSDNode>(Size))

  if (Op.getValueType() != MVT::i32)

  assert(Op.getValueType() == MVT::i32);

                               Op.getOperand(0), IntrinID, GetRoundBothImm);

  SDValue RoundModeTimesNumBits =

                                              TableEntry, EnumOffset);

  if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
        static_cast<uint32_t>(ConstMode->getZExtValue()),

    if (UseReducedTable) {

      SDValue RoundModeTimesNumBits =

      SDValue RoundModeTimesNumBits =

      NewMode = TruncTable;

                          ReadFirstLaneID, NewMode);

                              IntrinID, RoundBothImm, NewMode);

  if (Op->isDivergent())

  switch (cast<MemSDNode>(Op)->getAddressSpace()) {

  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();

  EVT DstVT = Op.getValueType();

  if (Op.getValueType() != MVT::i64)

                                 Op.getOperand(0), IntrinID, ModeHwRegImm);
                                 Op.getOperand(0), IntrinID, TrapHwRegImm);

  if (Op.getOperand(1).getValueType() != MVT::i64)

                                     ReadFirstLaneID, NewModeReg);
                                     ReadFirstLaneID, NewTrapReg);

  unsigned ModeHwReg =
  unsigned TrapHwReg =

                             IntrinID, ModeHwRegImm, NewModeReg);
                             IntrinID, TrapHwRegImm, NewTrapReg);
          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (Reg == AMDGPU::NoRegister) {

    case AMDGPU::EXEC_LO:
    case AMDGPU::EXEC_HI:
    case AMDGPU::FLAT_SCR_LO:
    case AMDGPU::FLAT_SCR_HI:

    case AMDGPU::FLAT_SCR:

    MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
static std::pair<MachineBasicBlock *, MachineBasicBlock *>

  auto Next = std::next(I);

  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);

    Src->setIsKill(false);

  Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
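// Waterfall loop for a divergent index: read the first active lane's index
// with V_READFIRSTLANE, compare it against every lane, restrict EXEC to the
// matching lanes for the indexed access, then clear those lanes and repeat
// until EXEC is exhausted.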
    unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
    unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,

  Register PhiExec = MRI.createVirtualRegister(BoolRC);
  Register NewExec = MRI.createVirtualRegister(BoolRC);
  Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  Register CondReg = MRI.createVirtualRegister(BoolRC);

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

  BuildMI(LoopBB, I, DL,
          TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                 : AMDGPU::S_AND_SAVEEXEC_B64),

  MRI.setSimpleHint(NewExec, CondReg);

  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;

      SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

  BuildMI(LoopBB, I, DL,
          TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
                                 : AMDGPU::S_XOR_B64_term), Exec)

    unsigned InitResultReg, unsigned PhiReg, int Offset,
    bool UseGPRIdxMode, Register &SGPRIdxReg) {

  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

  Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
  Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;

                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);
static std::pair<unsigned, int>

  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

    return std::pair(AMDGPU::sub0, Offset);

  assert(Idx->getReg() != AMDGPU::NoRegister);

      return Idx->getReg();

  Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
  Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)

  MI.eraseFromParent();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {

    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);

    MI.eraseFromParent();

  Register PhiReg = MRI.createVirtualRegister(VecRC);

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)

  MI.eraseFromParent();
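// Scalar expansion of the wave-reduce pseudos: walk the set bits of EXEC, read
// each active lane's value with V_READLANE_B32, fold it into the accumulator
// with the min/max opcode, clear that bit with S_BITSET0, and loop while active
// lanes remain.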
  bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));

    Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
    Register InitalValReg = MRI.createVirtualRegister(DstRegClass);

    Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
    Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
    Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);

    Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
    Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);

    bool IsWave32 = ST.isWave32();
    unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;

        (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;

    BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)

    I = ComputeLoop->end();

        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)

        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
            .addReg(TmpSReg->getOperand(0).getReg())

    unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
    auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
                   .addReg(ActiveBits->getOperand(0).getReg());
    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
                             TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
                         .addReg(FF1->getOperand(0).getReg());
    auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
                              .addReg(LaneValue->getOperand(0).getReg());

    unsigned BITSETOpc =
        IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
    auto NewActiveBits =
        BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
            .addReg(FF1->getOperand(0).getReg())
            .addReg(ActiveBits->getOperand(0).getReg());

    Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
        .addMBB(ComputeLoop);
    ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
        .addMBB(ComputeLoop);

    unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
        .addReg(NewActiveBits->getOperand(0).getReg())
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))

  MI.eraseFromParent();
  switch (MI.getOpcode()) {
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {

    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       ? AMDGPU::S_ADD_I32
                       : AMDGPU::S_SUB_I32;

    MI.eraseFromParent();
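  // 64-bit scalar add/sub: when a single 64-bit opcode is not used, split the
  // operands into 32-bit halves and combine S_ADD_U32/S_SUB_U32 for the low
  // half with S_ADDC_U32/S_SUBB_U32 for the high half.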
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {

    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);

      unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

    MI.eraseFromParent();
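  // 64-bit vector add/sub: prefer a single combined instruction when the
  // subtarget has it (hasLshlAddB64()); otherwise pair V_ADD_CO_U32/V_SUB_CO_U32
  // on the low half with the carry consumed by V_ADDC_U32/V_SUBB_U32 on the
  // high half.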
  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {

    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);

    if (IsAdd && ST.hasLshlAddB64()) {
      TII->legalizeOperands(*Add);
      MI.eraseFromParent();

    const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);

    Register CarryReg = MRI.createVirtualRegister(CarryRC);
    Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);

                                       : &AMDGPU::VReg_64RegClass;
                                       : &AMDGPU::VReg_64RegClass;

        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

    unsigned LoOpc =
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;

    TII->legalizeOperands(*LoHalf);
    TII->legalizeOperands(*HiHalf);
    MI.eraseFromParent();

  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {

    unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;

      Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)

      Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)

      Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)

    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
    assert(WaveSize == 64 || WaveSize == 32);

    if (WaveSize == 64) {
      if (ST.hasScalarCompareEq64()) {

            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
        Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)

        (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;

    MI.eraseFromParent();
  case AMDGPU::SI_INIT_M0: {
            TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .add(MI.getOperand(0));
    MI.eraseFromParent();

  case AMDGPU::GET_GROUPSTATICSIZE: {
        .add(MI.getOperand(0))
    MI.eraseFromParent();

  case AMDGPU::GET_SHADERCYCLESHILO: {

    using namespace AMDGPU::Hwreg;
    Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
    Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
    Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));

    Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        .add(MI.getOperand(0))

    MI.eraseFromParent();

  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V9:
  case AMDGPU::SI_INDIRECT_SRC_V10:
  case AMDGPU::SI_INDIRECT_SRC_V11:
  case AMDGPU::SI_INDIRECT_SRC_V12:
  case AMDGPU::SI_INDIRECT_SRC_V16:
  case AMDGPU::SI_INDIRECT_SRC_V32:

  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V9:
  case AMDGPU::SI_INDIRECT_DST_V10:
  case AMDGPU::SI_INDIRECT_DST_V11:
  case AMDGPU::SI_INDIRECT_DST_V12:
  case AMDGPU::SI_INDIRECT_DST_V16:
  case AMDGPU::SI_INDIRECT_DST_V32:

  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:

  case AMDGPU::V_CNDMASK_B64_PSEUDO: {

    Register SrcCond = MI.getOperand(3).getReg();

    Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
    const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
    Register SrcCondCopy = MRI.createVirtualRegister(CondRC);

                                       : &AMDGPU::VReg_64RegClass;
                                       : &AMDGPU::VReg_64RegClass;

        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

    MI.eraseFromParent();
  case AMDGPU::SI_BR_UNDEF: {
        .add(MI.getOperand(0));
    MI.eraseFromParent();

  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {

  case AMDGPU::SI_CALL_ISEL: {

    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);

    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);

    MI.eraseFromParent();

  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_SUB_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e32: {

    unsigned Opc = MI.getOpcode();

    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opc) == -1) {
      NeedClampOperand = true;

    if (TII->isVOP3(*I)) {

    I.add(MI.getOperand(1))
        .add(MI.getOperand(2));
    if (NeedClampOperand)

    TII->legalizeOperands(*I);

    MI.eraseFromParent();

  case AMDGPU::V_ADDC_U32_e32:
  case AMDGPU::V_SUBB_U32_e32:
  case AMDGPU::V_SUBBREV_U32_e32:
    TII->legalizeOperands(MI);

  case AMDGPU::DS_GWS_INIT:
  case AMDGPU::DS_GWS_SEMA_BR:
  case AMDGPU::DS_GWS_BARRIER:
    TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::data0);

  case AMDGPU::DS_GWS_SEMA_V:
  case AMDGPU::DS_GWS_SEMA_P:
  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:

  case AMDGPU::S_SETREG_B32: {

    const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
    const unsigned SetMask = WidthMask << Offset;

    unsigned SetDenormOp = 0;
    unsigned SetRoundOp = 0;

        SetRoundOp = AMDGPU::S_ROUND_MODE;
        SetDenormOp = AMDGPU::S_DENORM_MODE;
        SetRoundOp = AMDGPU::S_ROUND_MODE;
        SetDenormOp = AMDGPU::S_DENORM_MODE;

    if (SetRoundOp || SetDenormOp) {
      if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
        unsigned ImmVal = Def->getOperand(1).getImm();

          MI.eraseFromParent();

    MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));

  case AMDGPU::S_INVERSE_BALLOT_U32:
  case AMDGPU::S_INVERSE_BALLOT_U64: {

    const Register DstReg = MI.getOperand(0).getReg();
    Register MaskReg = MI.getOperand(1).getReg();

    const bool IsVALU = TRI->isVectorRegister(MRI, MaskReg);

      MaskReg = TII->readlaneVGPRToSGPR(MaskReg, MI, MRI);

    MI.eraseFromParent();

  case AMDGPU::ENDPGM_TRAP: {

      MI.setDesc(TII->get(AMDGPU::S_ENDPGM));

    MI.eraseFromParent();

  case AMDGPU::SIMULATED_TRAP: {
    TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
    MI.eraseFromParent();
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;

  EVT VT = N->getValueType(0);

  if (VT == MVT::f16) {

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
         VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);

  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
         VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
         VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
         VT == MVT::v32bf16);

                     : std::pair(Op0, Op0);

  switch (Op.getOpcode()) {

    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");

    EVT VT = Op.getValueType();
      return lowerFSQRTF32(Op, DAG);
      return lowerFSQRTF64(Op, DAG);
    return LowerTrig(Op, DAG);
    return LowerGlobalAddress(MFI, Op, DAG);
    return lowerINSERT_SUBVECTOR(Op, DAG);
    return lowerINSERT_VECTOR_ELT(Op, DAG);
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
    return lowerVECTOR_SHUFFLE(Op, DAG);
    return lowerSCALAR_TO_VECTOR(Op, DAG);
    return lowerBUILD_VECTOR(Op, DAG);
    return lowerFP_ROUND(Op, DAG);

    if (Op.getOperand(0)->getValueType(0) != MVT::f32)

    int RoundMode = Op.getConstantOperandVal(1);

    return DAG.getNode(Opc, DL, Op.getNode()->getVTList(), Op->getOperand(0));

    return lowerTRAP(Op, DAG);
    return lowerDEBUGTRAP(Op, DAG);
    return lowerFMINNUM_FMAXNUM(Op, DAG);
    return lowerFLDEXP(Op, DAG);
    return lowerMUL(Op, DAG);
    return lowerXMULO(Op, DAG);
    return lowerXMUL_LOHI(Op, DAG);
  EVT FittingLoadVT = LoadVT;

SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
                                              bool IsIntrinsic) const {

  EVT LoadVT = M->getValueType(0);

  EVT EquivLoadVT = LoadVT;

                              VTList, Ops, M->getMemoryVT(),
                              M->getMemOperand());

  EVT LoadVT = M->getValueType(0);

  assert(M->getNumValues() == 2 || M->getNumValues() == 3);
  bool IsTFE = M->getNumValues() == 3;

    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand());

  return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
                             M->getMemOperand(), DAG);

  SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
                                        M->getMemOperand(), DAG);

  EVT VT = N->getValueType(0);

  unsigned CondCode = N->getConstantOperandVal(3);

  EVT CmpVT = LHS.getValueType();
  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {

  EVT VT = N->getValueType(0);

  unsigned CondCode = N->getConstantOperandVal(3);

  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {

  EVT VT = N->getValueType(0);

                     Src.getOperand(1), Src.getOperand(2));

    Exec = AMDGPU::EXEC_LO;
    Exec = AMDGPU::EXEC;
  switch (N->getOpcode()) {

    unsigned IID = N->getConstantOperandVal(0);

    case Intrinsic::amdgcn_make_buffer_rsrc:
      Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));

    case Intrinsic::amdgcn_cvt_pkrtz: {

    case Intrinsic::amdgcn_cvt_pknorm_i16:
    case Intrinsic::amdgcn_cvt_pknorm_u16:
    case Intrinsic::amdgcn_cvt_pk_i16:
    case Intrinsic::amdgcn_cvt_pk_u16: {

      if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
      else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
      else if (IID == Intrinsic::amdgcn_cvt_pk_i16)

      EVT VT = N->getValueType(0);

    case Intrinsic::amdgcn_s_buffer_load: {

      EVT VT = Op.getValueType();
      assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");

      if (!Offset->isDivergent()) {

        LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);

      for (unsigned I = 0; I < Res.getNumOperands(); I++) {
        Results.push_back(Res.getOperand(I));

      Results.push_back(Res.getValue(1));

    EVT VT = N->getValueType(0);

    EVT SelectVT = NewVT;
    if (NewVT.bitsLT(MVT::i32)) {
      SelectVT = MVT::i32;

    if (NewVT != SelectVT)

    if (N->getValueType(0) != MVT::v2f16)

    if (N->getValueType(0) != MVT::v2f16)

    if (N->getValueType(0) != MVT::f16)

    if (I.getUse().get() != Value)

    if (I->getOpcode() == Opcode)

unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {

  switch (Intr->getConstantOperandVal(1)) {
  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else:
  case Intrinsic::amdgcn_loop:
  case Intrinsic::amdgcn_end_cf:

  SDNode *Intr = BRCOND.getOperand(1).getNode();

  assert(BR && "brcond missing unconditional branch user");
  Target = BR->getOperand(1);

  unsigned CFNode = isCFIntrinsic(Intr);

    Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());

  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {

                    Intr->getOperand(0));
  MVT VT = Op.getSimpleValueType();

  if (Op.getConstantOperandVal(0) != 0)

  if (Info->isEntryFunction())

  return Op.getValueType().bitsLE(VT) ?

  assert(Op.getValueType() == MVT::f16 &&
         "Do not know how to custom lower FP_ROUND for non-f16 type");

  EVT SrcVT = Src.getValueType();
  if (SrcVT != MVT::f64)

  EVT VT = Op.getValueType();

  bool IsIEEEMode = Info->getMode().IEEE;

  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||

  EVT VT = Op.getValueType();

  EVT ExpVT = Exp.getValueType();
  if (ExpVT == MVT::i16)

                     {Op.getOperand(0), Op.getOperand(1), TruncExp});

  EVT VT = Op.getValueType();

  assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");

  if (Op->isDivergent())

  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);

  if (Op0SignBits >= 33 && Op1SignBits >= 33)
        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);

  EVT VT = Op.getValueType();

    const APInt &C = RHSC->getAPIntValue();

    if (C.isPowerOf2()) {

      bool UseArithShift = isSigned && !C.isMinSignedValue();

                                    SL, VT, Result, ShiftAmt),
  if (Op->isDivergent()) {

    return lowerTrapEndpgm(Op, DAG);

             lowerTrapHsaQueuePtr(Op, DAG);

SDValue SITargetLowering::lowerTrapEndpgm(

    const SDLoc &DL, Align Alignment, ImplicitParameter Param) const {

SDValue SITargetLowering::lowerTrapHsaQueuePtr(

        loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);

    if (UserSGPR == AMDGPU::NoRegister) {

SDValue SITargetLowering::lowerTrapHsa(

                                   "debugtrap handler not supported",

SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,

                           ? AMDGPU::SRC_SHARED_BASE
                           : AMDGPU::SRC_PRIVATE_BASE;

        {SDValue(Mov, 0), DAG.getConstant(32, DL, MVT::i64)}));

    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);

  if (UserSGPR == AMDGPU::NoRegister) {

      DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);

  if (isa<FrameIndexSDNode>(Val) || isa<GlobalAddressSDNode>(Val) ||
      isa<BasicBlockSDNode>(Val))

  if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
    return ConstVal->getSExtValue() != TM.getNullPointerValue(AddrSpace);
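// Address space casts between flat and local/private pointers compare the
// source against the address space's null value and build the flat pointer
// from the 32-bit offset plus the segment aperture in the high half.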
  unsigned DestAS, SrcAS;
  bool IsNonNull = false;

  if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(0);
    DestAS = ASC->getDestAddressSpace();

           Op.getConstantOperandVal(0) ==
               Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(1);
    SrcAS = Op->getConstantOperandVal(2);
    DestAS = Op->getConstantOperandVal(3);

      unsigned NullVal = TM.getNullPointerValue(DestAS);

    SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);

      unsigned NullVal = TM.getNullPointerValue(SrcAS);

      Op.getValueType() == MVT::i64) {

      Src.getValueType() == MVT::i64)

  EVT InsVT = Ins.getValueType();

  unsigned IdxVal = Idx->getAsZExtVal();

    assert(InsNumElts % 2 == 0 && "expect legal vector types");

    EVT NewInsVT = InsNumElts == 2 ? MVT::i32
                                     MVT::i32, InsNumElts / 2);

    for (unsigned I = 0; I != InsNumElts / 2; ++I) {

      if (InsNumElts == 2) {

  for (unsigned I = 0; I != InsNumElts; ++I) {

  auto KIdx = dyn_cast<ConstantSDNode>(Idx);
  if (NumElts == 4 && EltSize == 16 && KIdx) {

    unsigned Idx = KIdx->getZExtValue();
    bool InsertLo = Idx < 2;

                               InsertLo ? LoVec : HiVec,

  if (isa<ConstantSDNode>(Idx))

  assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");

  const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);

                            DAG.getNOT(SL, BFM, IntVT), BCVec);

  EVT ResultVT = Op.getValueType();

  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))

  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {

    if (VecSize == 128) {

    } else if (VecSize == 256) {

      for (unsigned P = 0; P < 4; ++P) {

                                Parts[0], Parts[1]));
                                Parts[2], Parts[3]));

      for (unsigned P = 0; P < 8; ++P) {

                                Parts[0], Parts[1], Parts[2], Parts[3]));
                                Parts[4], Parts[5], Parts[6], Parts[7]));

    EVT IdxVT = Idx.getValueType();

  Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);

  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {

    return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);

  EVT ResultVT = Op.getValueType();

  EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16;

  int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();

      int VecIdx = Idx < SrcNumElts ? 0 : 1;
      int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;

      int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
      int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
      int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
      int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
  EVT ResultVT = Op.getValueType();

  EVT VT = Op.getValueType();

  if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
      VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {

                              {CastLo, CastHi});

  if (VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v16bf16) {

      for (unsigned P = 0; P < 4; ++P)
        Parts[P].push_back(Op.getOperand(I + P * E));

    for (unsigned P = 0; P < 4; ++P) {

  if (VT == MVT::v32i16 || VT == MVT::v32f16 || VT == MVT::v32bf16) {

      for (unsigned P = 0; P < 8; ++P)
        Parts[P].push_back(Op.getOperand(I + P * E));

    for (unsigned P = 0; P < 8; ++P) {

  assert(VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16);

  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");

  EVT PtrVT = Op.getValueType();

  assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");

  SDValue Param = lowerKernargMemParameter(

                                 "non-hsa intrinsic with hsa target",

                                 "intrinsic not supported on subtarget",

  unsigned NumElts = Elts.size();

  if (NumElts <= 12) {

  for (unsigned i = 0; i < Elts.size(); ++i) {

  for (unsigned i = Elts.size(); i < NumElts; ++i)
    VecElts[i] = DAG.getUNDEF(MVT::f32);
  EVT SrcVT = Src.getValueType();

                               bool Unpacked, bool IsD16, int DMaskPop,
                               int NumVDataDwords, bool IsAtomicPacked16Bit,

  EVT ReqRetVT = ResultTypes[0];

  int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
                          ? (ReqRetNumElts + 1) / 2

  int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ? DMaskPop
                                                      : (DMaskPop + 1) / 2;

  MVT DataDwordVT = NumDataDwords == 1 ?

  MVT MaskPopVT = MaskPopDwords == 1 ?

  if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {

  if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
                          NumDataDwords - MaskPopDwords);

  EVT LegalReqRetVT = ReqRetVT;

    if (!Data.getValueType().isInteger())
                         Data.getValueType().changeTypeToInteger(), Data);

  if (Result->getNumValues() == 1)

                        SDValue *LWE, bool &IsTexFail) {

  auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
7669 unsigned DimIdx,
unsigned EndIdx,
7670 unsigned NumGradients) {
7672 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
7680 if (((
I + 1) >= EndIdx) ||
7681 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
7682 I == DimIdx + NumGradients - 1))) {
7683 if (
Addr.getValueType() != MVT::i16)
7704 unsigned IntrOpcode =
Intr->BaseOpcode;
7716 bool AdjustRetType =
false;
7717 bool IsAtomicPacked16Bit =
false;
7720 const unsigned ArgOffset = WithChain ? 2 : 1;
7723 unsigned DMaskLanes = 0;
7725 if (BaseOpcode->Atomic) {
7726 VData =
Op.getOperand(2);
7728 IsAtomicPacked16Bit =
7729 (
Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7730 Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7733 if (BaseOpcode->AtomicX2) {
7740 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
7741 DMask = Is64Bit ? 0xf : 0x3;
7742 NumVDataDwords = Is64Bit ? 4 : 2;
7744 DMask = Is64Bit ? 0x3 : 0x1;
7745 NumVDataDwords = Is64Bit ? 2 : 1;
7748 DMask =
Op->getConstantOperandVal(ArgOffset +
Intr->DMaskIndex);
7751 if (BaseOpcode->Store) {
7752 VData =
Op.getOperand(2);
7760 VData = handleD16VData(VData, DAG,
true);
7777 (!LoadVT.
isVector() && DMaskLanes > 1))
7785 NumVDataDwords = (DMaskLanes + 1) / 2;
7787 NumVDataDwords = DMaskLanes;
7789 AdjustRetType =
true;
7793 unsigned VAddrEnd = ArgOffset +
Intr->VAddrEnd;
7798 Op.getOperand(ArgOffset +
Intr->GradientStart).getSimpleValueType();
7800 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
7801 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
7803 VAddrVT =
Op.getOperand(ArgOffset +
Intr->CoordStart).getSimpleValueType();
7805 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
7806 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
7809 for (
unsigned I =
Intr->VAddrStart; I < Intr->GradientStart;
I++) {
7810 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
7811 assert(
I ==
Intr->BiasIndex &&
"Got unexpected 16-bit extra argument");
7816 {
Op.getOperand(ArgOffset +
I), DAG.
getUNDEF(MVT::f16)});
7820 "Bias needs to be converted to 16 bit in A16 mode");
  if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {

    LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
                         "require 16 bit args for both gradients and addresses");

    if (!ST->hasA16()) {
      LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
                           "support 16 bit addresses\n");

  if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {

    IntrOpcode = G16MappingInfo->G16;

                                 ArgOffset + Intr->GradientStart,
                                 ArgOffset + Intr->CoordStart,
                                 Intr->NumGradients);
      for (unsigned I = ArgOffset + Intr->GradientStart;
           I < ArgOffset + Intr->CoordStart; I++)

                               ArgOffset + Intr->CoordStart, VAddrEnd,

    for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
  const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
  const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
  const bool UseNSA = ST->hasNSAEncoding() &&
                      VAddrs.size() >= ST->getNSAThreshold(MF) &&
                      (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
  const bool UsePartialNSA =
      UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;

  if (UsePartialNSA) {
                    ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
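    // Illustrative sketch, not part of the original source: how many addresses
    // keep their own VGPR operand under the NSA decision made above. With
    // partial NSA only the first NSAMaxSize - 1 addresses stay separate and
    // the remainder is packed into one contiguous vector operand. All names
    // below are hypothetical.
    [[maybe_unused]] auto numSeparateVAddrsSketch =
        [](bool UseNSAEnc, bool UsePartial, unsigned NumVAddrs,
           unsigned MaxSize) {
          if (!UseNSAEnc)
            return 0u;          // everything goes into one contiguous operand
          if (UsePartial)
            return MaxSize - 1; // the tail is merged into the last operand
          return NumVAddrs;     // fully non-sequential addressing
        };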
  if (!BaseOpcode->Sampler) {

        Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);

    Unorm = UnormConst ? True : False;

  SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
  bool IsTexFail = false;
  if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
  if (AdjustRetType) {

    if (DMaskLanes == 0 && !BaseOpcode->Store) {

      if (isa<MemSDNode>(Op))

    EVT NewVT = NumVDataDwords > 1 ?

    ResultTypes[0] = NewVT;
    if (ResultTypes.size() == 3) {

      ResultTypes.erase(&ResultTypes[1]);

  unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
7966 if (BaseOpcode->Atomic)
7973 if (BaseOpcode->Store || BaseOpcode->Atomic)
7975 if (UsePartialNSA) {
7984 if (BaseOpcode->Sampler)
7989 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
7993 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
8001 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
8005 if (BaseOpcode->HasD16)
8007 if (isa<MemSDNode>(
Op))
8010 int NumVAddrDwords =
8016 NumVDataDwords, NumVAddrDwords);
8017 }
else if (IsGFX11Plus) {
8019 UseNSA ? AMDGPU::MIMGEncGfx11NSA
8020 : AMDGPU::MIMGEncGfx11Default,
8021 NumVDataDwords, NumVAddrDwords);
8022 }
else if (IsGFX10Plus) {
8024 UseNSA ? AMDGPU::MIMGEncGfx10NSA
8025 : AMDGPU::MIMGEncGfx10Default,
8026 NumVDataDwords, NumVAddrDwords);
8030 NumVDataDwords, NumVAddrDwords);
8033 "requested image instruction is not supported on this GPU");
8038 NumVDataDwords, NumVAddrDwords);
8041 NumVDataDwords, NumVAddrDwords);
  if (auto MemOp = dyn_cast<MemSDNode>(Op)) {

  if (BaseOpcode->AtomicX2) {

  if (BaseOpcode->Store)

                            NumVDataDwords, IsAtomicPacked16Bit, DL);
  if (!Offset->isDivergent()) {

    return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
  unsigned NumLoads = 1;

  if (NumElts == 8 || NumElts == 16) {
    NumLoads = NumElts / 4;

  setBufferOffsets(Offset, DAG, &Ops[3],
                   NumLoads > 1 ? Align(16 * NumLoads) : Align(4));

  for (unsigned i = 0; i < NumLoads; ++i) {

  if (NumElts == 8 || NumElts == 16)
  EVT VT = Op.getValueType();

  unsigned IntrinsicID = Op.getConstantOperandVal(0);

  switch (IntrinsicID) {
  case Intrinsic::amdgcn_implicit_buffer_ptr: {

    return getPreloadedValue(DAG, *MFI, VT,

  case Intrinsic::amdgcn_dispatch_ptr:
  case Intrinsic::amdgcn_queue_ptr: {

          MF.getFunction(), "unsupported hsa intrinsic without hsa target",

    auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
                     ? AMDGPUFunctionArgInfo::DISPATCH_PTR
                     : AMDGPUFunctionArgInfo::QUEUE_PTR;
    return getPreloadedValue(DAG, *MFI, VT, RegID);

  case Intrinsic::amdgcn_implicitarg_ptr: {

      return getImplicitArgPtr(DAG, DL);
    return getPreloadedValue(DAG, *MFI, VT,
8233 case Intrinsic::amdgcn_kernarg_segment_ptr: {
8239 return getPreloadedValue(DAG, *MFI, VT,
8242 case Intrinsic::amdgcn_dispatch_id: {
8245 case Intrinsic::amdgcn_rcp:
8247 case Intrinsic::amdgcn_rsq:
8249 case Intrinsic::amdgcn_rsq_legacy:
8253 case Intrinsic::amdgcn_rcp_legacy:
8257 case Intrinsic::amdgcn_rsq_clamp: {
  case Intrinsic::r600_read_ngroups_x:
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
  case Intrinsic::r600_read_ngroups_y:
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
  case Intrinsic::r600_read_ngroups_z:
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
  case Intrinsic::r600_read_global_size_x:
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
  case Intrinsic::r600_read_global_size_y:
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
  case Intrinsic::r600_read_global_size_z:
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
  case Intrinsic::r600_read_local_size_x:
    return lowerImplicitZextParam(DAG, Op, MVT::i16,
  case Intrinsic::r600_read_local_size_y:
    return lowerImplicitZextParam(DAG, Op, MVT::i16,
  case Intrinsic::r600_read_local_size_z:
    return lowerImplicitZextParam(DAG, Op, MVT::i16,
  case Intrinsic::amdgcn_workgroup_id_x:
    return getPreloadedValue(DAG, *MFI, VT,
  case Intrinsic::amdgcn_workgroup_id_y:
    return getPreloadedValue(DAG, *MFI, VT,
  case Intrinsic::amdgcn_workgroup_id_z:
    return getPreloadedValue(DAG, *MFI, VT,
  case Intrinsic::amdgcn_wave_id:
    return lowerWaveID(DAG, Op);
  case Intrinsic::amdgcn_lds_kernel_id: {
      return getLDSKernelId(DAG, DL);
    return getPreloadedValue(DAG, *MFI, VT,
  case Intrinsic::amdgcn_workitem_id_x:
    return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
  case Intrinsic::amdgcn_workitem_id_y:
    return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
  case Intrinsic::amdgcn_workitem_id_z:
    return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
8354 case Intrinsic::amdgcn_wavefrontsize:
  case Intrinsic::amdgcn_s_buffer_load: {
    unsigned CPol = Op.getConstantOperandVal(3);

    return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3),
  case Intrinsic::amdgcn_fdiv_fast:
    return lowerFDIV_FAST(Op, DAG);
8370 case Intrinsic::amdgcn_sin:
8373 case Intrinsic::amdgcn_cos:
8376 case Intrinsic::amdgcn_mul_u24:
8378 case Intrinsic::amdgcn_mul_i24:
8381 case Intrinsic::amdgcn_log_clamp: {
8387 case Intrinsic::amdgcn_fract:
8390 case Intrinsic::amdgcn_class:
8392 Op.getOperand(1),
Op.getOperand(2));
8393 case Intrinsic::amdgcn_div_fmas:
8395 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3),
8398 case Intrinsic::amdgcn_div_fixup:
8400 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
8402 case Intrinsic::amdgcn_div_scale: {
8415 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
8418 Denominator, Numerator);
8420 case Intrinsic::amdgcn_icmp: {
8422 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
8423 Op.getConstantOperandVal(2) == 0 &&
8428 case Intrinsic::amdgcn_fcmp: {
8431 case Intrinsic::amdgcn_ballot:
8433 case Intrinsic::amdgcn_fmed3:
8435 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
8436 case Intrinsic::amdgcn_fdot2:
8438 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3),
8440 case Intrinsic::amdgcn_fmul_legacy:
8442 Op.getOperand(1),
Op.getOperand(2));
8443 case Intrinsic::amdgcn_sffbh:
8445 case Intrinsic::amdgcn_sbfe:
8447 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
8448 case Intrinsic::amdgcn_ubfe:
8450 Op.getOperand(1),
Op.getOperand(2),
Op.getOperand(3));
8451 case Intrinsic::amdgcn_cvt_pkrtz:
8452 case Intrinsic::amdgcn_cvt_pknorm_i16:
8453 case Intrinsic::amdgcn_cvt_pknorm_u16:
8454 case Intrinsic::amdgcn_cvt_pk_i16:
8455 case Intrinsic::amdgcn_cvt_pk_u16: {
8457 EVT VT =
Op.getValueType();
8460 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
8462 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
8464 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
8466 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
8472 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
8475 Op.getOperand(1),
Op.getOperand(2));
8478 case Intrinsic::amdgcn_fmad_ftz:
8480 Op.getOperand(2),
Op.getOperand(3));
8482 case Intrinsic::amdgcn_if_break:
8484 Op->getOperand(1),
Op->getOperand(2)), 0);
8486 case Intrinsic::amdgcn_groupstaticsize: {
8498 case Intrinsic::amdgcn_is_shared:
8499 case Intrinsic::amdgcn_is_private: {
    unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
                      ? AMDGPUAS::LOCAL_ADDRESS
                      : AMDGPUAS::PRIVATE_ADDRESS;

    SDValue Aperture = getSegmentAperture(AS, SL, DAG);
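    // Illustrative sketch, not from the original source: amdgcn.is_shared and
    // amdgcn.is_private reduce to comparing the high 32 bits of the flat
    // pointer against the corresponding segment aperture base (the value
    // produced by getSegmentAperture above). Names are hypothetical.
    [[maybe_unused]] auto isInApertureSketch = [](uint64_t FlatPtr,
                                                  uint32_t ApertureBase) {
      return static_cast<uint32_t>(FlatPtr >> 32) == ApertureBase;
    };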
8511 case Intrinsic::amdgcn_perm:
8513 Op.getOperand(2),
Op.getOperand(3));
8514 case Intrinsic::amdgcn_reloc_constant: {
8518 auto RelocSymbol = cast<GlobalVariable>(
8524 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8525 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8526 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8527 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8528 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8529 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8530 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8531 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8532 if (
Op.getOperand(4).getValueType() == MVT::i32)
8538 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
8539 Op.getOperand(3), IndexKeyi32);
8541 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8542 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8543 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8544 if (
Op.getOperand(6).getValueType() == MVT::i32)
8550 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
8551 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
8552 IndexKeyi32, Op.getOperand(7)});
  case Intrinsic::amdgcn_addrspacecast_nonnull:
    return lowerADDRSPACECAST(Op, DAG);

      return lowerImage(Op, ImageDimIntr, DAG, false);

      return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);

                                                 unsigned NewOpcode) const {

  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);

  auto *M = cast<MemSDNode>(Op);

                                 M->getMemOperand());

                                                 unsigned NewOpcode) const {

  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);

  auto *M = cast<MemSDNode>(Op);

                                 M->getMemOperand());
  unsigned IntrID = Op.getConstantOperandVal(1);

  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {

    unsigned IndexOperand = M->getConstantOperandVal(7);
    unsigned WaveRelease = M->getConstantOperandVal(8);
    unsigned WaveDone = M->getConstantOperandVal(9);

    unsigned OrderedCountIndex = IndexOperand & 0x3f;
    IndexOperand &= ~0x3f;
    unsigned CountDw = 0;

      CountDw = (IndexOperand >> 24) & 0xf;
      IndexOperand &= ~(0xf << 24);

      if (CountDw < 1 || CountDw > 4) {
            "ds_ordered_count: dword count must be between 1 and 4");

    if (WaveDone && !WaveRelease)

    unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
    unsigned ShaderType =
    unsigned Offset0 = OrderedCountIndex << 2;
    unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);

      Offset1 |= (CountDw - 1) << 6;

      Offset1 |= ShaderType << 2;

    unsigned Offset = Offset0 | (Offset1 << 8);

                                   M->getVTList(), Ops, M->getMemoryVT(),
                                   M->getMemOperand());
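    // Illustrative sketch, not from the original source: the ds_ordered_count
    // offset field assembled above, on plain integers. The two availability
    // flags stand in for the subtarget checks that gate the dword-count and
    // shader-type fields and are hypothetical.
    [[maybe_unused]] auto encodeOrderedOffsetSketch =
        [](unsigned OrderedCountIndex, unsigned WaveRelease, unsigned WaveDone,
           unsigned Instruction, unsigned ShaderType, unsigned CountDw,
           bool HasCountField, bool HasShaderTypeField) {
          unsigned Offset0 = OrderedCountIndex << 2;
          unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
          if (HasCountField)
            Offset1 |= (CountDw - 1) << 6; // dword count, biased by one
          if (HasShaderTypeField)
            Offset1 |= ShaderType << 2;
          return Offset0 | (Offset1 << 8); // Offset1 occupies the high bits
        };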
8696 case Intrinsic::amdgcn_ds_fadd: {
8700 case Intrinsic::amdgcn_ds_fadd:
8706 M->getOperand(0),
M->getOperand(2),
M->getOperand(3),
8707 M->getMemOperand());
8709 case Intrinsic::amdgcn_ds_fmin:
8710 case Intrinsic::amdgcn_ds_fmax: {
8714 case Intrinsic::amdgcn_ds_fmin:
8717 case Intrinsic::amdgcn_ds_fmax:
8730 M->getMemoryVT(),
M->getMemOperand());
  case Intrinsic::amdgcn_buffer_load:
  case Intrinsic::amdgcn_buffer_load_format: {
    unsigned Glc = Op.getConstantOperandVal(5);
    unsigned Slc = Op.getConstantOperandVal(6);

    setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);

    unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?

    EVT VT = Op.getValueType();

    auto *M = cast<MemSDNode>(Op);
    EVT LoadVT = Op.getValueType();

      return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops,
                                        M->getMemOperand());

    return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
                               M->getMemOperand(), DAG);
8769 case Intrinsic::amdgcn_raw_buffer_load:
8770 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8771 case Intrinsic::amdgcn_raw_buffer_load_format:
8772 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
8773 const bool IsFormat =
8774 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
8775 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
8777 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
8778 auto Offsets = splitBufferOffsets(
Op.getOperand(3), DAG);
8791 auto *
M = cast<MemSDNode>(
Op);
8792 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
8794 case Intrinsic::amdgcn_struct_buffer_load:
8795 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8796 case Intrinsic::amdgcn_struct_buffer_load_format:
8797 case Intrinsic::amdgcn_struct_ptr_buffer_load_format: {
8798 const bool IsFormat =
8799 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
8800 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
8802 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
8803 auto Offsets = splitBufferOffsets(
Op.getOperand(4), DAG);
8816 return lowerIntrinsicLoad(cast<MemSDNode>(
Op), IsFormat, DAG, Ops);
8818 case Intrinsic::amdgcn_tbuffer_load: {
8820 EVT LoadVT =
Op.getValueType();
8823 unsigned Dfmt =
Op.getConstantOperandVal(7);
8824 unsigned Nfmt =
Op.getConstantOperandVal(8);
8825 unsigned Glc =
Op.getConstantOperandVal(9);
8826 unsigned Slc =
Op.getConstantOperandVal(10);
8844 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
8847 case Intrinsic::amdgcn_raw_tbuffer_load:
8848 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
8850 EVT LoadVT =
Op.getValueType();
8851 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
8852 auto Offsets = splitBufferOffsets(
Op.getOperand(3), DAG);
8871 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
8874 case Intrinsic::amdgcn_struct_tbuffer_load:
8875 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
8877 EVT LoadVT =
Op.getValueType();
8878 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
8879 auto Offsets = splitBufferOffsets(
Op.getOperand(4), DAG);
8898 Op->getVTList(), Ops, LoadVT,
M->getMemOperand(),
8901 case Intrinsic::amdgcn_buffer_atomic_swap:
8902 case Intrinsic::amdgcn_buffer_atomic_add:
8903 case Intrinsic::amdgcn_buffer_atomic_sub:
8904 case Intrinsic::amdgcn_buffer_atomic_csub:
8905 case Intrinsic::amdgcn_buffer_atomic_smin:
8906 case Intrinsic::amdgcn_buffer_atomic_umin:
8907 case Intrinsic::amdgcn_buffer_atomic_smax:
8908 case Intrinsic::amdgcn_buffer_atomic_umax:
8909 case Intrinsic::amdgcn_buffer_atomic_and:
8910 case Intrinsic::amdgcn_buffer_atomic_or:
8911 case Intrinsic::amdgcn_buffer_atomic_xor:
8912 case Intrinsic::amdgcn_buffer_atomic_fadd: {
8913 unsigned Slc =
Op.getConstantOperandVal(6);
8926 setBufferOffsets(
Op.getOperand(5), DAG, &Ops[4]);
8928 EVT VT =
Op.getValueType();
8930 auto *
M = cast<MemSDNode>(
Op);
8931 unsigned Opcode = 0;
8934 case Intrinsic::amdgcn_buffer_atomic_swap:
8937 case Intrinsic::amdgcn_buffer_atomic_add:
8940 case Intrinsic::amdgcn_buffer_atomic_sub:
8943 case Intrinsic::amdgcn_buffer_atomic_csub:
8946 case Intrinsic::amdgcn_buffer_atomic_smin:
8949 case Intrinsic::amdgcn_buffer_atomic_umin:
8952 case Intrinsic::amdgcn_buffer_atomic_smax:
8955 case Intrinsic::amdgcn_buffer_atomic_umax:
8958 case Intrinsic::amdgcn_buffer_atomic_and:
8961 case Intrinsic::amdgcn_buffer_atomic_or:
8964 case Intrinsic::amdgcn_buffer_atomic_xor:
8967 case Intrinsic::amdgcn_buffer_atomic_fadd:
8975 M->getMemOperand());
8977 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8978 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8980 case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
8981 return lowerRawBufferAtomicIntrin(
Op, DAG,
8983 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8984 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8986 case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
8987 return lowerStructBufferAtomicIntrin(
Op, DAG,
8989 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8990 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8992 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8993 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8995 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8996 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8998 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8999 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
9001 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
9002 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
9004 case Intrinsic::amdgcn_raw_buffer_atomic_add:
9005 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
9007 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
9008 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
9010 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
9011 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
9013 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
9014 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
9016 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
9017 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
9019 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
9020 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
9022 case Intrinsic::amdgcn_raw_buffer_atomic_and:
9023 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
9025 case Intrinsic::amdgcn_raw_buffer_atomic_or:
9026 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
9028 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
9029 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
9031 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
9032 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
9034 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
9035 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
9037 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
9038 return lowerRawBufferAtomicIntrin(
Op, DAG,
9040 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
9041 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
9042 return lowerStructBufferAtomicIntrin(
Op, DAG,
9044 case Intrinsic::amdgcn_struct_buffer_atomic_add:
9045 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
9047 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
9048 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
9050 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
9051 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
9052 return lowerStructBufferAtomicIntrin(
Op, DAG,
9054 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
9055 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
9056 return lowerStructBufferAtomicIntrin(
Op, DAG,
9058 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
9059 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
9060 return lowerStructBufferAtomicIntrin(
Op, DAG,
9062 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
9063 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
9064 return lowerStructBufferAtomicIntrin(
Op, DAG,
9066 case Intrinsic::amdgcn_struct_buffer_atomic_and:
9067 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
9069 case Intrinsic::amdgcn_struct_buffer_atomic_or:
9070 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
9072 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
9073 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
9075 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
9076 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
9078 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
9079 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
9081 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
9082 return lowerStructBufferAtomicIntrin(
Op, DAG,
9085 case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
9086 unsigned Slc =
Op.getConstantOperandVal(7);
9100 setBufferOffsets(
Op.getOperand(6), DAG, &Ops[5]);
9102 EVT VT =
Op.getValueType();
9103 auto *
M = cast<MemSDNode>(
Op);
9106 Op->getVTList(), Ops, VT,
M->getMemOperand());
9108 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
9109 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
9110 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(4), DAG);
9111 auto Offsets = splitBufferOffsets(
Op.getOperand(5), DAG);
9125 EVT VT =
Op.getValueType();
9126 auto *
M = cast<MemSDNode>(
Op);
9129 Op->getVTList(), Ops, VT,
M->getMemOperand());
9131 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
9132 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
9133 SDValue Rsrc = bufferRsrcPtrToVector(
Op->getOperand(4), DAG);
9134 auto Offsets = splitBufferOffsets(
Op.getOperand(6), DAG);
9148 EVT VT =
Op.getValueType();
9149 auto *
M = cast<MemSDNode>(
Op);
9152 Op->getVTList(), Ops, VT,
M->getMemOperand());
  case Intrinsic::amdgcn_image_bvh_intersect_ray: {

    SDValue NodePtr = M->getOperand(2);
    SDValue RayExtent = M->getOperand(3);
    SDValue RayOrigin = M->getOperand(4);

    SDValue RayInvDir = M->getOperand(6);

    const unsigned NumVDataDwords = 4;
    const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
    const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;

    const unsigned BaseOpcodes[2][2] = {
        {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
        {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
         AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};

                           IsGFX12Plus ? AMDGPU::MIMGEncGfx12
                           : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                                       : AMDGPU::MIMGEncGfx10NSA,
                           NumVDataDwords, NumVAddrDwords);

                           IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                                   : AMDGPU::MIMGEncGfx10Default,
                           NumVDataDwords, NumVAddrDwords);
    auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {

      if (Lanes[0].getValueSizeInBits() == 32) {
        for (unsigned I = 0; I < 3; ++I)

                                     {Lanes[0], Lanes[1]})));

                                     {Elt0, Lanes[0]})));

                                     {Lanes[1], Lanes[2]})));

    if (UseNSA && IsGFX11Plus) {

      for (unsigned I = 0; I < 3; ++I) {

                                     {DirLanes[I], InvDirLanes[I]})));

      packLanes(RayOrigin, true);
      packLanes(RayDir, true);
      packLanes(RayInvDir, false);
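      // Illustrative sketch, not from the original source: what packLanes does
      // with 16-bit lanes. Pairs of halves share one dword; the "unaligned"
      // form leaves the low half of the first dword free so the next operand
      // can start there. Hypothetical plain-integer model (undef halves shown
      // as 0).
      [[maybe_unused]] auto packHalvesSketch =
          [](const uint16_t Lane[3], bool IsAligned, uint32_t Out[2]) {
            if (IsAligned) {
              Out[0] = uint32_t(Lane[0]) | (uint32_t(Lane[1]) << 16); // {x, y}
              Out[1] = uint32_t(Lane[2]);                             // {z, undef}
            } else {
              Out[0] = uint32_t(Lane[0]) << 16;                       // {undef, x}
              Out[1] = uint32_t(Lane[1]) | (uint32_t(Lane[2]) << 16); // {y, z}
            }
          };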
9266 if (NumVAddrDwords > 12) {
9286 case Intrinsic::amdgcn_global_atomic_fmin:
9287 case Intrinsic::amdgcn_global_atomic_fmax:
9288 case Intrinsic::amdgcn_global_atomic_fmin_num:
9289 case Intrinsic::amdgcn_global_atomic_fmax_num:
9290 case Intrinsic::amdgcn_flat_atomic_fmin:
9291 case Intrinsic::amdgcn_flat_atomic_fmax:
9292 case Intrinsic::amdgcn_flat_atomic_fmin_num:
9293 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9300 unsigned Opcode = 0;
9302 case Intrinsic::amdgcn_global_atomic_fmin:
9303 case Intrinsic::amdgcn_global_atomic_fmin_num:
9304 case Intrinsic::amdgcn_flat_atomic_fmin:
9305 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
9309 case Intrinsic::amdgcn_global_atomic_fmax:
9310 case Intrinsic::amdgcn_global_atomic_fmax_num:
9311 case Intrinsic::amdgcn_flat_atomic_fmax:
9312 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
9320 M->getVTList(), Ops,
M->getMemoryVT(),
9321 M->getMemOperand());
9323 case Intrinsic::amdgcn_s_get_barrier_state: {
9327 bool IsInlinableBarID =
false;
9330 if (isa<ConstantSDNode>(
Op->getOperand(2))) {
9331 BarID = cast<ConstantSDNode>(
Op->getOperand(2))->getSExtValue();
9335 if (IsInlinableBarID) {
9336 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
9340 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
9352 return lowerImage(
Op, ImageDimIntr, DAG,
true);
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,

  bool IsTFE = VTList.NumVTs == 3;

    unsigned NumOpDWords = NumValueDWords + 1;

    SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
                                     OpDWordsVT, OpDWordsMMO, DAG);

      (VT == MVT::v3i32 || VT == MVT::v3f32)) {

                                  WidenedMemVT, WidenedMMO);
                                           bool ImageStore) const {

  for (unsigned I = 0; I < Elts.size() / 2; I += 1) {

    if ((NumElements % 2) == 1) {

      unsigned I = Elts.size() / 2;

  if (NumElements == 3) {
  unsigned IntrinsicID = Op.getConstantOperandVal(1);

  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp_compr: {

                                "intrinsic not supported on subtarget",
                                DL.getDebugLoc());

    unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;

  case Intrinsic::amdgcn_s_barrier: {

      unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
      if (WGSize <= ST.getWavefrontSize())

                                        Op.getOperand(0)), 0);

    if (ST.hasSplitBarriers()) {

                              MVT::Other, K, Op.getOperand(0)),
9550 case Intrinsic::amdgcn_tbuffer_store: {
9554 VData = handleD16VData(VData, DAG);
9555 unsigned Dfmt =
Op.getConstantOperandVal(8);
9556 unsigned Nfmt =
Op.getConstantOperandVal(9);
9557 unsigned Glc =
Op.getConstantOperandVal(10);
9558 unsigned Slc =
Op.getConstantOperandVal(11);
9576 M->getMemoryVT(),
M->getMemOperand());
9579 case Intrinsic::amdgcn_struct_tbuffer_store:
9580 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
9584 VData = handleD16VData(VData, DAG);
9585 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9586 auto Offsets = splitBufferOffsets(
Op.getOperand(5), DAG);
9604 M->getMemoryVT(),
M->getMemOperand());
9607 case Intrinsic::amdgcn_raw_tbuffer_store:
9608 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
9612 VData = handleD16VData(VData, DAG);
9613 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9614 auto Offsets = splitBufferOffsets(
Op.getOperand(4), DAG);
9632 M->getMemoryVT(),
M->getMemOperand());
9635 case Intrinsic::amdgcn_buffer_store:
9636 case Intrinsic::amdgcn_buffer_store_format: {
9640 VData = handleD16VData(VData, DAG);
9641 unsigned Glc =
Op.getConstantOperandVal(6);
9642 unsigned Slc =
Op.getConstantOperandVal(7);
9655 setBufferOffsets(
Op.getOperand(5), DAG, &Ops[4]);
9657 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
9664 if (VDataType == MVT::i8 || VDataType == MVT::i16)
9665 return handleByteShortBufferStores(DAG, VDataType,
DL, Ops, M);
9668 M->getMemoryVT(),
M->getMemOperand());
9671 case Intrinsic::amdgcn_raw_buffer_store:
9672 case Intrinsic::amdgcn_raw_ptr_buffer_store:
9673 case Intrinsic::amdgcn_raw_buffer_store_format:
9674 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
9675 const bool IsFormat =
9676 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
9677 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
9684 VData = handleD16VData(VData, DAG);
9694 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9695 auto Offsets = splitBufferOffsets(
Op.getOperand(4), DAG);
9715 return handleByteShortBufferStores(DAG, VDataVT,
DL, Ops, M);
9718 M->getMemoryVT(),
M->getMemOperand());
9721 case Intrinsic::amdgcn_struct_buffer_store:
9722 case Intrinsic::amdgcn_struct_ptr_buffer_store:
9723 case Intrinsic::amdgcn_struct_buffer_store_format:
9724 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
9725 const bool IsFormat =
9726 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
9727 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
9735 VData = handleD16VData(VData, DAG);
9745 auto Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
9746 auto Offsets = splitBufferOffsets(
Op.getOperand(5), DAG);
9767 return handleByteShortBufferStores(DAG, VDataType,
DL, Ops, M);
9770 M->getMemoryVT(),
M->getMemOperand());
9772 case Intrinsic::amdgcn_raw_buffer_load_lds:
9773 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
9774 case Intrinsic::amdgcn_struct_buffer_load_lds:
9775 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
9779 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
9780 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
9781 unsigned OpOffset = HasVIndex ? 1 : 0;
9782 SDValue VOffset =
Op.getOperand(5 + OpOffset);
9784 unsigned Size =
Op->getConstantOperandVal(4);
9790 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
9791 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
9792 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
9793 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
9796 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
9797 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
9798 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
9799 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
9802 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
9803 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
9804 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
9805 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
9813 if (HasVIndex && HasVOffset)
9819 else if (HasVOffset)
9822 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
9826 unsigned Aux =
Op.getConstantOperandVal(8 + OpOffset);
9834 auto *
M = cast<MemSDNode>(
Op);
9861 case Intrinsic::amdgcn_global_load_lds: {
9863 unsigned Size =
Op->getConstantOperandVal(4);
9868 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
9871 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
9874 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
9878 auto *
M = cast<MemSDNode>(
Op);
9891 if (
LHS->isDivergent())
9895 RHS.getOperand(0).getValueType() == MVT::i32) {
9898 VOffset =
RHS.getOperand(0);
9903 if (!
Addr->isDivergent()) {
9919 LoadPtrI.
Offset =
Op->getConstantOperandVal(5);
9939 case Intrinsic::amdgcn_end_cf:
9941 Op->getOperand(2), Chain), 0);
9942 case Intrinsic::amdgcn_s_barrier_init:
9943 case Intrinsic::amdgcn_s_barrier_join:
9944 case Intrinsic::amdgcn_s_wakeup_barrier: {
9949 bool IsInlinableBarID =
false;
9952 if (isa<ConstantSDNode>(BarOp)) {
9953 BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
9957 if (IsInlinableBarID) {
9958 switch (IntrinsicID) {
9961 case Intrinsic::amdgcn_s_barrier_init:
9962 Opc = AMDGPU::S_BARRIER_INIT_IMM;
9964 case Intrinsic::amdgcn_s_barrier_join:
9965 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
9967 case Intrinsic::amdgcn_s_wakeup_barrier:
9968 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
9975 switch (IntrinsicID) {
9978 case Intrinsic::amdgcn_s_barrier_init:
9979 Opc = AMDGPU::S_BARRIER_INIT_M0;
9981 case Intrinsic::amdgcn_s_barrier_join:
9982 Opc = AMDGPU::S_BARRIER_JOIN_M0;
9984 case Intrinsic::amdgcn_s_wakeup_barrier:
9985 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
9990 if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
9996 if (!IsInlinableBarID) {
10001 Op.getOperand(2), M0Val),
10005 }
else if (!IsInlinableBarID) {
10015 return lowerImage(
Op, ImageDimIntr, DAG,
true);
std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(

  if ((C1 = dyn_cast<ConstantSDNode>(N0)))

    C1 = cast<ConstantSDNode>(N0.getOperand(1));

    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;

      auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);

      SDValue Ops[] = { N0, OverflowVal };
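  // Illustrative sketch, not from the original source: the immediate/overflow
  // split above on plain integers. Only offsets up to MaxImm fit the
  // instruction's immediate field; the rest spills into a register operand.
  // Names are hypothetical.
  [[maybe_unused]] auto splitImmOffsetSketch =
      [](uint32_t Offset, uint32_t MaxImm, uint32_t &ImmOffset,
         uint32_t &Overflow) {
        ImmOffset = Offset;
        Overflow = ImmOffset & ~MaxImm;
        ImmOffset -= Overflow;
        if (int32_t(Overflow) < 0) {
          // Very large offsets: push everything into the register operand.
          Overflow += ImmOffset;
          ImmOffset = 0;
        }
      };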
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,

                                        Align Alignment) const {

  if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {

    if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {

    int Offset = cast<ConstantSDNode>(N1)->getSExtValue();

        TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,

    return MaybePointer;

  SDValue NumRecords = Op->getOperand(3);

  auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);

  std::optional<uint32_t> ConstStride = std::nullopt;
  if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
    ConstStride = ConstNode->getZExtValue();

  SDValue NewHighHalf = Masked;
  if (!ConstStride || *ConstStride != 0) {

      ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);

    NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);

                              NewHighHalf, NumRecords, Flags);
10168SITargetLowering::handleByteShortBufferLoads(
SelectionDAG &DAG,
EVT LoadVT,
10189 if (VDataType == MVT::f16)
10193 Ops[1] = BufferStoreExt;
10198 M->getMemOperand());
10223SDValue SITargetLowering::widenLoad(
LoadSDNode *Ld, DAGCombinerInfo &DCI)
const {
10239 if ((MemVT.
isSimple() && !DCI.isAfterLegalizeDAG()) ||
10246 "unexpected vector extload");
10259 "unexpected fp extload");
10277 DCI.AddToWorklist(Cvt.
getNode());
10282 DCI.AddToWorklist(Cvt.
getNode());
10293 if (
Info.isEntryFunction())
10294 return Info.getUserSGPRInfo().hasFlatScratchInit();
10302 EVT MemVT =
Load->getMemoryVT();
10315 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
10318 BasePtr, RealMemVT, MMO);
10348 assert(
Op.getValueType().getVectorElementType() == MVT::i32 &&
10349 "Custom lowering for non-i32 vectors hasn't been implemented.");
10352 unsigned AS =
Load->getAddressSpace();
10371 if (!
Op->isDivergent() && Alignment >=
Align(4) && NumElements < 32) {
10388 Alignment >=
Align(4) && NumElements < 32) {
10403 if (NumElements > 4)
10423 if (NumElements > 2)
10428 if (NumElements > 4)
10440 auto Flags =
Load->getMemOperand()->getFlags();
10442 Load->getAlign(), Flags, &
Fast) &&
10451 MemVT, *
Load->getMemOperand())) {
10461 EVT VT =
Op.getValueType();
10498 EVT VT =
Op.getValueType();
10501 bool AllowInaccurateRcp =
Flags.hasApproximateFuncs() ||
10508 if (!AllowInaccurateRcp && VT != MVT::f16)
10511 if (CLHS->isExactlyValue(1.0)) {
10528 if (CLHS->isExactlyValue(-1.0)) {
10537 if (!AllowInaccurateRcp && (VT != MVT::f16 || !
Flags.hasAllowReciprocal()))
  EVT VT = Op.getValueType();

  bool AllowInaccurateDiv = Flags.hasApproximateFuncs() ||

  if (!AllowInaccurateDiv)

  return DAG.getNode(Opcode, SL, VT, A, B, Flags);

  return DAG.getNode(Opcode, SL, VTList,

  return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);

  return DAG.getNode(Opcode, SL, VTList,
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  const APFloat K0Val(0x1p+96f);

  const APFloat K1Val(0x1p-32f);

  assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
  uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
  uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);

  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  Flags.setNoFPExcept(true);
                                  DenominatorScaled, Flags);

                                  DenominatorScaled, Flags);

  using namespace AMDGPU::Hwreg;
  const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);

  const bool HasDynamicDenormals =

  if (!PreservesDenormals) {

    if (HasDynamicDenormals) {

      SavedDenormMode = SDValue(GetReg, 0);

    const SDValue EnableDenormValue =

      EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
                                        {EnableDenormValue, BitField, Glue});

                               ApproxRcp, One, NegDivScale0, Flags);

                             ApproxRcp, Fma0, Flags);

                           Fma1, Fma1, Flags);

                               NumeratorScaled, Mul, Flags);

                               Fma2, Fma1, Mul, Fma2, Flags);

                               NumeratorScaled, Fma3, Flags);

  if (!PreservesDenormals) {

                                          Fma4.getValue(1), DisableDenormValue,

    assert(HasDynamicDenormals == (bool)SavedDenormMode);
    const SDValue DisableDenormValue =
        HasDynamicDenormals

          AMDGPU::S_SETREG_B32, SL, MVT::Other,

                             {Fma4, Fma1, Fma3, Scale}, Flags);
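  // Illustrative sketch, not from the original source: the FMA chain above is
  // a standard Newton-Raphson refinement of the hardware reciprocal estimate.
  // Shown with separate multiply/add for clarity; the real sequence uses fused
  // FMAs on the scaled numerator/denominator and feeds the final remainder to
  // V_DIV_FMAS/V_DIV_FIXUP.
  [[maybe_unused]] auto refineQuotientSketch = [](float n, float d, float r) {
    float e0 = -d * r + 1.0f; // error of the reciprocal estimate
    float r1 = r * e0 + r;    // refined reciprocal
    float q0 = r1 * n;        // first quotient estimate
    float e1 = -d * q0 + n;   // remainder
    float q1 = e1 * r1 + q0;  // refined quotient
    return q1;
  };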
  if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
    return FastLowered;

                                  NegDivScale0, Mul, DivScale1);

                             Fma4, Fma3, Mul, Scale);

  EVT VT = Op.getValueType();

  if (VT == MVT::f32)
    return LowerFDIV32(Op, DAG);

  if (VT == MVT::f64)
    return LowerFDIV64(Op, DAG);

  if (VT == MVT::f16)
    return LowerFDIV16(Op, DAG);
10906 EVT ResultExpVT =
Op->getValueType(1);
10907 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
10937 if (VT == MVT::i1) {
10940 Store->getBasePtr(), MVT::i1,
Store->getMemOperand());
10944 Store->getValue().getValueType().getScalarType() == MVT::i32);
10946 unsigned AS =
Store->getAddressSpace();
10965 if (NumElements > 4)
10972 VT, *
Store->getMemOperand()))
10981 if (NumElements > 2)
10985 if (NumElements > 4 ||
10994 auto Flags =
Store->getMemOperand()->getFlags();
11029 MVT VT =
Op.getValueType().getSimpleVT();
11198 EVT VT =
Op.getValueType();
11215 switch (
Op.getOpcode()) {
11241 EVT VT =
Op.getValueType();
11257 DAGCombinerInfo &DCI)
const {
11258 EVT VT =
N->getValueType(0);
11260 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
11267 EVT SrcVT = Src.getValueType();
11273 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
11276 DCI.AddToWorklist(Cvt.
getNode());
11279 if (ScalarVT != MVT::f32) {
11291 DAGCombinerInfo &DCI)
const {
11292 SDValue MagnitudeOp =
N->getOperand(0);
11293 SDValue SignOp =
N->getOperand(1);
11351 unsigned AddrSpace,
11353 DAGCombinerInfo &DCI)
const {
11383 AM.HasBaseReg =
true;
11384 AM.BaseOffs =
Offset.getSExtValue();
11389 EVT VT =
N->getValueType(0);
11395 Flags.setNoUnsignedWrap(
N->getFlags().hasNoUnsignedWrap() &&
11406 switch (
N->getOpcode()) {
11417 DAGCombinerInfo &DCI)
const {
11426 SDValue NewPtr = performSHLPtrCombine(
Ptr.getNode(),
N->getAddressSpace(),
11427 N->getMemoryVT(), DCI);
11431 NewOps[PtrIdx] = NewPtr;
  return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
         (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||

SDValue SITargetLowering::splitBinaryBitConstantOp(
    DAGCombinerInfo &DCI,

  if (V.getValueType() != MVT::i1)

  switch (V.getOpcode()) {

  if (!(C & 0x000000ff)) ZeroByteMask |= 0x000000ff;
  if (!(C & 0x0000ff00)) ZeroByteMask |= 0x0000ff00;
  if (!(C & 0x00ff0000)) ZeroByteMask |= 0x00ff0000;
  if (!(C & 0xff000000)) ZeroByteMask |= 0xff000000;
  uint32_t NonZeroByteMask = ~ZeroByteMask;
  if ((NonZeroByteMask & C) != NonZeroByteMask)

  assert(V.getValueSizeInBits() == 32);

  if (V.getNumOperands() != 2)

  switch (V.getOpcode()) {

    return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);

    return (0x03020100 & ~ConstMask) | ConstMask;

    return uint32_t((0x030201000c0c0c0cull << C) >> 32);

    return uint32_t(0x0c0c0c0c03020100ull >> C);
11553 DAGCombinerInfo &DCI)
const {
11554 if (DCI.isBeforeLegalize())
11558 EVT VT =
N->getValueType(0);
11564 if (VT == MVT::i64 && CRHS) {
11570 if (CRHS && VT == MVT::i32) {
11579 if (
auto *CShift = dyn_cast<ConstantSDNode>(
LHS->getOperand(1))) {
11580 unsigned Shift = CShift->getZExtValue();
11582 unsigned Offset = NB + Shift;
11583 if ((
Offset & (Bits - 1)) == 0) {
11586 LHS->getOperand(0),
11601 isa<ConstantSDNode>(
LHS.getOperand(2))) {
11607 Sel = (
LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
11622 if (
Y.getOpcode() !=
ISD::FABS ||
Y.getOperand(0) !=
X ||
11627 if (
X !=
LHS.getOperand(1))
11665 (
RHS.getOperand(0) ==
LHS.getOperand(0) &&
11666 LHS.getOperand(0) ==
LHS.getOperand(1))) {
11669 Mask->getZExtValue() & ~OrdMask :
11670 Mask->getZExtValue() & OrdMask;
  if (VT == MVT::i32 &&

      N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {

    if (LHSMask != ~0u && RHSMask != ~0u) {

      if (LHSMask > RHSMask) {

      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;

      if (!(LHSUsedLanes & RHSUsedLanes) &&
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {

        for (unsigned I = 0; I < 32; I += 8) {
          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
            Mask &= (0x0c << I) & 0xffffffff;

        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
                           RHS.getOperand(0),
                           DAG.getConstant(Mask, DL, MVT::i32));
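        // Illustrative sketch, not from the original source: the V_PERM_B32
        // byte selection that the 0x0c-based mask arithmetic above relies on.
        // Selectors 0..3 pick bytes of the second source, 4..7 bytes of the
        // first, and 0x0c produces a zero byte (other special selectors are
        // omitted here). Names are hypothetical.
        [[maybe_unused]] auto permBytesSketch = [](uint32_t Src0, uint32_t Src1,
                                                   uint32_t Selector) {
          uint64_t Bytes = (uint64_t(Src0) << 32) | Src1;
          uint32_t Result = 0;
          for (unsigned Lane = 0; Lane < 4; ++Lane) {
            uint32_t Sel = (Selector >> (8 * Lane)) & 0xff;
            uint32_t Byte = 0; // 0x0c and other special selectors -> 0 here
            if (Sel <= 7)
              Byte = uint32_t((Bytes >> (8 * Sel)) & 0xff);
            Result |= Byte << (8 * Lane);
          }
          return Result;
        };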
11779static const std::optional<ByteProvider<SDValue>>
11781 unsigned Depth = 0) {
11784 return std::nullopt;
11786 if (
Op.getValueSizeInBits() < 8)
11787 return std::nullopt;
11789 if (
Op.getValueType().isVector())
11792 switch (
Op->getOpcode()) {
11803 auto *VTSign = cast<VTSDNode>(
Op->getOperand(1));
11804 NarrowVT = VTSign->getVT();
11807 return std::nullopt;
11810 if (SrcIndex >= NarrowByteWidth)
11811 return std::nullopt;
11817 auto ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
11819 return std::nullopt;
11821 uint64_t BitShift = ShiftOp->getZExtValue();
11823 if (BitShift % 8 != 0)
11824 return std::nullopt;
11826 SrcIndex += BitShift / 8;
11844static const std::optional<ByteProvider<SDValue>>
11846 unsigned StartingIndex = 0) {
11850 return std::nullopt;
11852 unsigned BitWidth =
Op.getScalarValueSizeInBits();
11854 return std::nullopt;
11856 return std::nullopt;
11858 bool IsVec =
Op.getValueType().isVector();
11859 switch (
Op.getOpcode()) {
11862 return std::nullopt;
11867 return std::nullopt;
11871 return std::nullopt;
11874 if (!
LHS->isConstantZero() && !
RHS->isConstantZero())
11875 return std::nullopt;
11876 if (!
LHS ||
LHS->isConstantZero())
11878 if (!
RHS ||
RHS->isConstantZero())
11880 return std::nullopt;
11885 return std::nullopt;
11887 auto BitMaskOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
11889 return std::nullopt;
11891 uint32_t BitMask = BitMaskOp->getZExtValue();
11895 if ((IndexMask & BitMask) != IndexMask) {
11898 if (IndexMask & BitMask)
11899 return std::nullopt;
11908 return std::nullopt;
11911 auto ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(2));
11912 if (!ShiftOp ||
Op.getValueType().isVector())
11913 return std::nullopt;
11915 uint64_t BitsProvided =
Op.getValueSizeInBits();
11916 if (BitsProvided % 8 != 0)
11917 return std::nullopt;
11919 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
11921 return std::nullopt;
11923 uint64_t ConcatSizeInBytes = BitsProvided / 4;
11924 uint64_t ByteShift = BitShift / 8;
11926 uint64_t NewIndex = (
Index + ByteShift) % ConcatSizeInBytes;
11927 uint64_t BytesProvided = BitsProvided / 8;
11928 SDValue NextOp =
Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
11929 NewIndex %= BytesProvided;
11936 return std::nullopt;
11938 auto ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
11940 return std::nullopt;
11942 uint64_t BitShift = ShiftOp->getZExtValue();
11944 return std::nullopt;
11946 auto BitsProvided =
Op.getScalarValueSizeInBits();
11947 if (BitsProvided % 8 != 0)
11948 return std::nullopt;
11950 uint64_t BytesProvided = BitsProvided / 8;
11951 uint64_t ByteShift = BitShift / 8;
11956 return BytesProvided - ByteShift >
Index
11964 return std::nullopt;
11966 auto ShiftOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
11968 return std::nullopt;
11970 uint64_t BitShift = ShiftOp->getZExtValue();
11971 if (BitShift % 8 != 0)
11972 return std::nullopt;
11973 uint64_t ByteShift = BitShift / 8;
11979 return Index < ByteShift
11982 Depth + 1, StartingIndex);
11991 return std::nullopt;
11998 auto *VTSign = cast<VTSDNode>(
Op->getOperand(1));
11999 NarrowBitWidth = VTSign->getVT().getSizeInBits();
12001 if (NarrowBitWidth % 8 != 0)
12002 return std::nullopt;
12003 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12005 if (
Index >= NarrowByteWidth)
12007 ? std::optional<ByteProvider<SDValue>>(
12015 return std::nullopt;
12019 if (NarrowByteWidth >=
Index) {
12024 return std::nullopt;
12031 return std::nullopt;
12035 auto L = cast<LoadSDNode>(
Op.getNode());
12037 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
12038 if (NarrowBitWidth % 8 != 0)
12039 return std::nullopt;
12040 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
12045 if (
Index >= NarrowByteWidth) {
12047 ? std::optional<ByteProvider<SDValue>>(
12052 if (NarrowByteWidth >
Index) {
12056 return std::nullopt;
12061 return std::nullopt;
12064 Depth + 1, StartingIndex);
12068 auto IdxOp = dyn_cast<ConstantSDNode>(
Op->getOperand(1));
12070 return std::nullopt;
12071 auto VecIdx = IdxOp->getZExtValue();
12072 auto ScalarSize =
Op.getScalarValueSizeInBits();
12073 if (ScalarSize != 32) {
12074 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 +
Index;
12078 StartingIndex,
Index);
12083 return std::nullopt;
12085 auto PermMask = dyn_cast<ConstantSDNode>(
Op->getOperand(2));
12087 return std::nullopt;
12090 (PermMask->getZExtValue() & (0xFF << (
Index * 8))) >> (
Index * 8);
12091 if (IdxMask > 0x07 && IdxMask != 0x0c)
12092 return std::nullopt;
12094 auto NextOp =
Op.getOperand(IdxMask > 0x03 ? 0 : 1);
12095 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
12097 return IdxMask != 0x0c ?
calculateSrcByte(NextOp, StartingIndex, NextIndex)
12103 return std::nullopt;
12118 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
12122 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
12125 auto MemVT = L->getMemoryVT();
12128 return L->getMemoryVT().getSizeInBits() == 16;
12138 int Low8 = Mask & 0xff;
12139 int Hi8 = (Mask & 0xff00) >> 8;
12141 assert(Low8 < 8 && Hi8 < 8);
12143 bool IsConsecutive = (Hi8 - Low8 == 1);
12148 bool Is16Aligned = !(Low8 % 2);
12150 return IsConsecutive && Is16Aligned;
12158 int Low16 = PermMask & 0xffff;
12159 int Hi16 = (PermMask & 0xffff0000) >> 16;
12169 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
12171 if (!OtherOpIs16Bit)
12179 unsigned DWordOffset) {
12182 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
12184 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
12189 if (Src.getValueType().isVector()) {
12190 auto ScalarTySize = Src.getScalarValueSizeInBits();
12191 auto ScalarTy = Src.getValueType().getScalarType();
12192 if (ScalarTySize == 32) {
12196 if (ScalarTySize > 32) {
12199 DAG.
getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
12200 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
12207 assert(ScalarTySize < 32);
12208 auto NumElements =
TypeSize / ScalarTySize;
12209 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
12210 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
12211 auto NumElementsIn32 = 32 / ScalarTySize;
12212 auto NumAvailElements = DWordOffset < Trunc32Elements
12214 : NumElements - NormalizedTrunc;
12227 auto ShiftVal = 32 * DWordOffset;
12235 [[maybe_unused]]
EVT VT =
N->getValueType(0);
12240 for (
int i = 0; i < 4; i++) {
12242 std::optional<ByteProvider<SDValue>>
P =
12245 if (!
P ||
P->isConstantZero())
12250 if (PermNodes.
size() != 4)
12253 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
12254 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
12256 for (
size_t i = 0; i < PermNodes.
size(); i++) {
12257 auto PermOp = PermNodes[i];
12260 int SrcByteAdjust = 4;
12264 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
12265 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
12267 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
12268 ((PermOp.SrcOffset / 4) != SecondSrc->second))
12272 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
12273 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
12276 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
12278 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
12281 SDValue Op = *PermNodes[FirstSrc.first].Src;
12283 assert(
Op.getValueSizeInBits() == 32);
12287 int Low16 = PermMask & 0xffff;
12288 int Hi16 = (PermMask & 0xffff0000) >> 16;
12290 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
12291 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
12294 if (WellFormedLow && WellFormedHi)
12298 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src :
Op;
12307 assert(
Op.getValueType().isByteSized() &&
12325 DAGCombinerInfo &DCI)
const {
12330 EVT VT =
N->getValueType(0);
12331 if (VT == MVT::i1) {
12336 if (Src !=
RHS.getOperand(0))
12341 if (!CLHS || !CRHS)
12345 static const uint32_t MaxMask = 0x3ff;
12359 isa<ConstantSDNode>(
LHS.getOperand(2))) {
12364 Sel |=
LHS.getConstantOperandVal(2);
12373 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
12377 auto usesCombinedOperand = [](
SDNode *OrUse) {
12380 !OrUse->getValueType(0).isVector())
12384 for (
auto VUse : OrUse->uses()) {
12385 if (!VUse->getValueType(0).isVector())
12392 if (VUse->getOpcode() == VectorwiseOp)
12398 if (!
any_of(
N->uses(), usesCombinedOperand))
12404 if (LHSMask != ~0u && RHSMask != ~0u) {
12407 if (LHSMask > RHSMask) {
12414 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12415 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
12418 if (!(LHSUsedLanes & RHSUsedLanes) &&
12421 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
12423 LHSMask &= ~RHSUsedLanes;
12424 RHSMask &= ~LHSUsedLanes;
12426 LHSMask |= LHSUsedLanes & 0x04040404;
12432 LHS.getOperand(0),
RHS.getOperand(0),
12436 if (LHSMask == ~0u || RHSMask == ~0u) {
12442 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
12457 if (SrcVT == MVT::i32) {
12463 DCI.AddToWorklist(LowOr.
getNode());
12464 DCI.AddToWorklist(HiBits.
getNode());
12472 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(
N->getOperand(1));
12476 N->getOperand(0), CRHS))
12484 DAGCombinerInfo &DCI)
const {
12485 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
12494 EVT VT =
N->getValueType(0);
12495 if (CRHS && VT == MVT::i64) {
12517 LHS->getOperand(0), FNegLHS, FNegRHS);
12526 DAGCombinerInfo &DCI)
const {
12531 EVT VT =
N->getValueType(0);
12532 if (VT != MVT::i32)
12536 if (Src.getValueType() != MVT::i16)
12543SITargetLowering::performSignExtendInRegCombine(
SDNode *
N,
12544 DAGCombinerInfo &DCI)
const {
12546 auto *VTSign = cast<VTSDNode>(
N->getOperand(1));
12551 VTSign->getVT() == MVT::i8) ||
12553 VTSign->getVT() == MVT::i16))) {
12555 "s_buffer_load_{u8, i8} are supported "
12556 "in GFX12 (or newer) architectures.");
12557 EVT VT = Src.getValueType();
12562 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
12568 auto *
M = cast<MemSDNode>(Src);
12569 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
12570 Opc,
DL, ResList, Ops,
M->getMemoryVT(),
M->getMemOperand());
12574 VTSign->getVT() == MVT::i8) ||
12576 VTSign->getVT() == MVT::i16)) &&
12578 auto *
M = cast<MemSDNode>(Src);
12590 SDVTList ResList = DCI.DAG.getVTList(MVT::i32,
12591 Src.getOperand(0).getValueType());
12594 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc,
SDLoc(
N),
12596 Ops,
M->getMemoryVT(),
12597 M->getMemOperand());
12598 return DCI.DAG.getMergeValues({BufferLoadSignExt,
12605 DAGCombinerInfo &DCI)
const {
12613 if (
N->getOperand(0).isUndef())
12620 DAGCombinerInfo &DCI)
const {
12621 EVT VT =
N->getValueType(0);
12625 return DCI.DAG.getConstantFP(
12648 unsigned Opcode =
Op.getOpcode();
12652 if (
auto *CFP = dyn_cast<ConstantFPSDNode>(
Op)) {
12653 const auto &
F = CFP->getValueAPF();
12654 if (
F.isNaN() &&
F.isSignaling())
12656 if (!
F.isDenormal())
12719 if (
Op.getValueType() == MVT::i32) {
12724 if (
auto *
RHS = dyn_cast<ConstantSDNode>(
Op.getOperand(1))) {
12725 if (
RHS->getZExtValue() == 0xffff0000) {
12735 return Op.getValueType().getScalarType() != MVT::f16;
12803 if (
Op.getValueType() == MVT::i16) {
12814 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
12816 switch (IntrinsicID) {
12817 case Intrinsic::amdgcn_cvt_pkrtz:
12818 case Intrinsic::amdgcn_cubeid:
12819 case Intrinsic::amdgcn_frexp_mant:
12820 case Intrinsic::amdgcn_fdot2:
12821 case Intrinsic::amdgcn_rcp:
12822 case Intrinsic::amdgcn_rsq:
12823 case Intrinsic::amdgcn_rsq_clamp:
12824 case Intrinsic::amdgcn_rcp_legacy:
12825 case Intrinsic::amdgcn_rsq_legacy:
12826 case Intrinsic::amdgcn_trig_preop:
12827 case Intrinsic::amdgcn_log:
12828 case Intrinsic::amdgcn_exp2:
12829 case Intrinsic::amdgcn_sqrt:
12850 unsigned Opcode =
MI->getOpcode();
12852 if (Opcode == AMDGPU::G_FCANONICALIZE)
12855 std::optional<FPValueAndVReg> FCR;
12858 if (FCR->Value.isSignaling())
12860 if (!FCR->Value.isDenormal())
12871 case AMDGPU::G_FADD:
12872 case AMDGPU::G_FSUB:
12873 case AMDGPU::G_FMUL:
12874 case AMDGPU::G_FCEIL:
12875 case AMDGPU::G_FFLOOR:
12876 case AMDGPU::G_FRINT:
12877 case AMDGPU::G_FNEARBYINT:
12878 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
12879 case AMDGPU::G_INTRINSIC_TRUNC:
12880 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
12881 case AMDGPU::G_FMA:
12882 case AMDGPU::G_FMAD:
12883 case AMDGPU::G_FSQRT:
12884 case AMDGPU::G_FDIV:
12885 case AMDGPU::G_FREM:
12886 case AMDGPU::G_FPOW:
12887 case AMDGPU::G_FPEXT:
12888 case AMDGPU::G_FLOG:
12889 case AMDGPU::G_FLOG2:
12890 case AMDGPU::G_FLOG10:
12891 case AMDGPU::G_FPTRUNC:
12892 case AMDGPU::G_AMDGPU_RCP_IFLAG:
12893 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
12894 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
12895 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
12896 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
12898 case AMDGPU::G_FNEG:
12899 case AMDGPU::G_FABS:
12900 case AMDGPU::G_FCOPYSIGN:
12902 case AMDGPU::G_FMINNUM:
12903 case AMDGPU::G_FMAXNUM:
12904 case AMDGPU::G_FMINNUM_IEEE:
12905 case AMDGPU::G_FMAXNUM_IEEE:
12906 case AMDGPU::G_FMINIMUM:
12907 case AMDGPU::G_FMAXIMUM: {
12915 case AMDGPU::G_BUILD_VECTOR:
12920 case AMDGPU::G_INTRINSIC:
12921 case AMDGPU::G_INTRINSIC_CONVERGENT:
12923 case Intrinsic::amdgcn_fmul_legacy:
12924 case Intrinsic::amdgcn_fmad_ftz:
12925 case Intrinsic::amdgcn_sqrt:
12926 case Intrinsic::amdgcn_fmed3:
12927 case Intrinsic::amdgcn_sin:
12928 case Intrinsic::amdgcn_cos:
12929 case Intrinsic::amdgcn_log:
12930 case Intrinsic::amdgcn_exp2:
12931 case Intrinsic::amdgcn_log_clamp:
12932 case Intrinsic::amdgcn_rcp:
12933 case Intrinsic::amdgcn_rcp_legacy:
12934 case Intrinsic::amdgcn_rsq:
12935 case Intrinsic::amdgcn_rsq_clamp:
12936 case Intrinsic::amdgcn_rsq_legacy:
12937 case Intrinsic::amdgcn_div_scale:
12938 case Intrinsic::amdgcn_div_fmas:
12939 case Intrinsic::amdgcn_div_fixup:
12940 case Intrinsic::amdgcn_fract:
12941 case Intrinsic::amdgcn_cvt_pkrtz:
12942 case Intrinsic::amdgcn_cubeid:
12943 case Intrinsic::amdgcn_cubema:
12944 case Intrinsic::amdgcn_cubesc:
12945 case Intrinsic::amdgcn_cubetc:
12946 case Intrinsic::amdgcn_frexp_mant:
12947 case Intrinsic::amdgcn_fdot2:
12948 case Intrinsic::amdgcn_trig_preop:
12963SDValue SITargetLowering::getCanonicalConstantFP(
12966 if (
C.isDenormal()) {
12980 if (
C.isSignaling()) {
12999 return Op.isUndef() || isa<ConstantFPSDNode>(
Op);
13002SDValue SITargetLowering::performFCanonicalizeCombine(
13004 DAGCombinerInfo &DCI)
const {
13007 EVT VT =
N->getValueType(0);
13016 EVT VT =
N->getValueType(0);
13017 return getCanonicalConstantFP(DAG,
SDLoc(
N), VT, CFP->getValueAPF());
13033 EVT EltVT =
Lo.getValueType();
13036 for (
unsigned I = 0;
I != 2; ++
I) {
13039 NewElts[
I] = getCanonicalConstantFP(DAG, SL, EltVT,
13040 CFP->getValueAPF());
13041 }
else if (
Op.isUndef()) {
13053 if (isa<ConstantFPSDNode>(NewElts[1]))
13054 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
13059 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
13110 if (!MinK || !MaxK)
  if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
    return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
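  // Illustrative sketch, not from the original source: the clamp identity the
  // med3 matcher above exploits. For lo <= hi, min(max(x, lo), hi) is the
  // median of {x, lo, hi}, so the two-instruction clamp folds into a single
  // med3. Scalar model with hypothetical names:
  [[maybe_unused]] auto clampAsMed3Sketch = [](int32_t X, int32_t Lo,
                                               int32_t Hi) {
    int32_t MaxXLo = X > Lo ? X : Lo; // max(x, lo)
    return MaxXLo < Hi ? MaxXLo : Hi; // min(max(x, lo), hi) == med3(x, lo, hi)
  };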
13166 if (
Info->getMode().DX10Clamp) {
13175 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->
hasMed3_16())) {
13197 DAGCombinerInfo &DCI)
const {
13200 EVT VT =
N->getValueType(0);
13201 unsigned Opc =
N->getOpcode();
13210 (VT == MVT::i32 || VT == MVT::f32 ||
13211 ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->
hasMin3Max3_16()))) {
13218 N->getValueType(0),
13231 N->getValueType(0),
13241 if (
SDValue Med3 = performIntMed3ImmCombine(
13246 if (
SDValue Med3 = performIntMed3ImmCombine(
13252 if (
SDValue Med3 = performIntMed3ImmCombine(
13257 if (
SDValue Med3 = performIntMed3ImmCombine(
13267 (VT == MVT::f32 || VT == MVT::f64 ||
13271 if (
SDValue Res = performFPMed3ImmCombine(DAG,
SDLoc(
N), Op0, Op1))
13282 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
13283 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
13292 DAGCombinerInfo &DCI)
const {
13293 EVT VT =
N->getValueType(0);
13316 if (
Info->getMode().DX10Clamp) {
13319 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13322 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
13325 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
13336 DAGCombinerInfo &DCI)
const {
13340 return DCI.DAG.getUNDEF(
N->getValueType(0));
13348 bool IsDivergentIdx,
13353 unsigned VecSize = EltSize * NumElem;
13356 if (VecSize <= 64 && EltSize < 32)
13365 if (IsDivergentIdx)
13369 unsigned NumInsts = NumElem +
13370 ((EltSize + 31) / 32) * NumElem ;
13375 return NumInsts <= 16;
13379 return NumInsts <= 15;
13384 if (isa<ConstantSDNode>(
Idx))
13397SDValue SITargetLowering::performExtractVectorEltCombine(
13398 SDNode *
N, DAGCombinerInfo &DCI)
const {
13404 EVT ResVT =
N->getValueType(0);
13423 if (Vec.
hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
13451 DCI.AddToWorklist(Elt0.
getNode());
13452 DCI.AddToWorklist(Elt1.
getNode());
13474 if (!DCI.isBeforeLegalize())
13480 auto *
Idx = dyn_cast<ConstantSDNode>(
N->getOperand(1));
13481 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.
isByteSized() &&
13482 VecSize > 32 && VecSize % 32 == 0 &&
Idx) {
13485 unsigned BitIndex =
Idx->getZExtValue() * VecEltSize;
13486 unsigned EltIdx = BitIndex / 32;
13487 unsigned LeftoverBitIdx = BitIndex % 32;
13491 DCI.AddToWorklist(Cast.
getNode());
13495 DCI.AddToWorklist(Elt.
getNode());
13498 DCI.AddToWorklist(Srl.
getNode());
13502 DCI.AddToWorklist(Trunc.
getNode());
13504 if (VecEltVT == ResVT) {
13516SITargetLowering::performInsertVectorEltCombine(
SDNode *
N,
13517 DAGCombinerInfo &DCI)
const {
13531 EVT IdxVT =
Idx.getValueType();
13548 Src.getOperand(0).getValueType() == MVT::f16) {
13549 return Src.getOperand(0);
13552 if (
auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
13553 APFloat Val = CFP->getValueAPF();
13554 bool LosesInfo =
true;
13564 DAGCombinerInfo &DCI)
const {
13566 "combine only useful on gfx8");
13568 SDValue TruncSrc =
N->getOperand(0);
13569 EVT VT =
N->getValueType(0);
13570 if (VT != MVT::f16)
13608unsigned SITargetLowering::getFusedOpcode(
const SelectionDAG &DAG,
13610 const SDNode *N1)
const {
13615 if (((VT == MVT::f32 &&
13617 (VT == MVT::f16 && Subtarget->
hasMadF16() &&
13637 EVT VT =
N->getValueType(0);
13638 if (VT != MVT::i32 && VT != MVT::i64)
13644 unsigned Opc =
N->getOpcode();
13667 return DAG.
getNode(Opc, SL, VT, Add1, Op2);
13689 DAGCombinerInfo &DCI)
const {
13693 EVT VT =
N->getValueType(0);
13703 if (!
N->isDivergent() && Subtarget->
hasSMulHi())
13707 if (NumBits <= 32 || NumBits > 64)
13719 unsigned NumUsers = 0;
  bool MulSignedLo = false;
  if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
  if (VT != MVT::i64) {
      getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
  if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
    std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
    if (!MulLHSUnsigned32) {
    if (!MulRHSUnsigned32) {
  if (VT != MVT::i64)
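// Editor's note: illustrative sketch (not the DAG code above) of the
// arithmetic a mad64_32 node stands for: a full 32x32 -> 64-bit multiply of
// the low halves plus a 64-bit accumulator, in unsigned and signed
// flavours. The hardware op's carry-out is ignored here, and the signed
// sketch assumes the C++ addition does not overflow.
#include <cstdint>

uint64_t madU64U32(uint32_t A, uint32_t B, uint64_t Accum) {
  return uint64_t(A) * uint64_t(B) + Accum;  // unsigned 32x32 + 64
}

int64_t madI64I32(int32_t A, int32_t B, int64_t Accum) {
  return int64_t(A) * int64_t(B) + Accum;    // signed 32x32 + 64
}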
static std::optional<ByteProvider<SDValue>>
  if (!Byte0 || Byte0->isConstantZero()) {
    return std::nullopt;
  if (Byte1 && !Byte1->isConstantZero()) {
    return std::nullopt;
  unsigned FirstCs = First & 0x0c0c0c0c;
  unsigned SecondCs = Second & 0x0c0c0c0c;
  unsigned FirstNoCs = First & ~0x0c0c0c0c;
  unsigned SecondNoCs = Second & ~0x0c0c0c0c;
  assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
  assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
  assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
  assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
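// Editor's note: illustrative sketch of the mask merge above. Each byte of
// a v_perm_b32 selector that equals 0x0c stands for "constant zero byte".
// Merging two selectors keeps the real selector wherever one side has the
// zero marker, and keeps 0x0c only where both sides do. Like the asserts
// above, this assumes every byte position carries the marker bits in at
// least one of the two masks. The function name is mine.
#include <cstdint>

uint32_t mergePermMasks(uint32_t First, uint32_t Second) {
  uint32_t FirstCs    = First  & 0x0c0c0c0c;   // zero-marker bytes of First
  uint32_t SecondCs   = Second & 0x0c0c0c0c;   // zero-marker bytes of Second
  uint32_t FirstNoCs  = First  & ~0x0c0c0c0c;  // real selector bits of First
  uint32_t SecondNoCs = Second & ~0x0c0c0c0c;  // real selector bits of Second
  return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
}
// e.g. mergePermMasks(0x0c0c0100, 0x07060c0c) == 0x07060100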
13856 for (
int BPI = 0; BPI < 2; BPI++) {
13859 BPP = {Src1, Src0};
13861 unsigned ZeroMask = 0x0c0c0c0c;
13862 unsigned FMask = 0xFF << (8 * (3 - Step));
13864 unsigned FirstMask =
13865 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13866 unsigned SecondMask =
13867 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
13871 int FirstGroup = -1;
13872 for (
int I = 0;
I < 2;
I++) {
13874 auto MatchesFirst = [&BPP](
DotSrc &IterElt) {
13875 return IterElt.SrcOp == *BPP.first.Src &&
13876 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
13886 if (FirstGroup != -1) {
13888 auto MatchesSecond = [&BPP](
DotSrc &IterElt) {
13889 return IterElt.SrcOp == *BPP.second.Src &&
13890 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
13896 Srcs.
push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
13904 unsigned ZeroMask = 0x0c0c0c0c;
13905 unsigned FMask = 0xFF << (8 * (3 - Step));
13909 ((Src0.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13913 ((Src1.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
13924 if (Srcs.
size() == 1) {
13925 auto Elt = Srcs.
begin();
13929 if (Elt->PermMask == 0x3020100)
13936 auto FirstElt = Srcs.
begin();
13937 auto SecondElt = std::next(FirstElt);
13944 auto FirstMask = FirstElt->PermMask;
13945 auto SecondMask = SecondElt->PermMask;
13947 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
13948 unsigned FirstPlusFour = FirstMask | 0x04040404;
13951 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
13963 FirstElt = std::next(SecondElt);
13964 if (FirstElt == Srcs.
end())
13967 SecondElt = std::next(FirstElt);
13970 if (SecondElt == Srcs.
end()) {
13976 DAG.
getConstant(FirstElt->PermMask, SL, MVT::i32)));
13982 return Perms.
size() == 2
13988 for (
auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
13989 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
13990 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
13991 EntryMask += ZeroMask;
13996 auto Opcode =
Op.getOpcode();
static std::optional<bool>
  bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
  bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
  assert(!(S0IsUnsigned && S0IsSigned));
  assert(!(S1IsUnsigned && S1IsSigned));
  if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
  if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
    return std::nullopt;
  if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
  if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
  if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
      ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
    return std::nullopt;
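// Editor's note: illustrative sketch of the sign analysis above, simplified
// and with my own names. An operand is treated as provably "unsigned" if
// its sign bit is known to be zero (leading zeros > 0) and provably
// "signed" if its sign bit is known to be one (leading ones > 0); the dot4
// combine only fires when both multiplicands agree. The real code has
// additional fallbacks for mixed known/unknown cases that this sketch
// conservatively rejects.
#include <cstdint>
#include <optional>

enum class Signedness { Unsigned, Signed, Unknown };

Signedness classify(uint8_t KnownZeroBits, uint8_t KnownOneBits) {
  if (KnownZeroBits & 0x80) return Signedness::Unsigned; // sign bit known 0
  if (KnownOneBits & 0x80)  return Signedness::Signed;   // sign bit known 1
  return Signedness::Unknown;
}

// true for a signed dot, false for unsigned, nullopt if the operands clash.
std::optional<bool> agreeOnSignedness(Signedness A, Signedness B) {
  if (A == Signedness::Unknown || B == Signedness::Unknown || A != B)
    return std::nullopt;
  return A == Signedness::Signed;
}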
14066 DAGCombinerInfo &DCI)
const {
14068 EVT VT =
N->getValueType(0);
14075 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
14080 if (
SDValue V = reassociateScalarOps(
N, DAG)) {
14087 std::optional<bool> IsSigned;
14093 int ChainLength = 0;
14094 for (
int I = 0;
I < 4;
I++) {
14095 auto MulIdx =
isMul(LHS) ? 0 :
isMul(RHS) ? 1 : -1;
14098 auto Src0 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
14101 auto Src1 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
14106 TempNode->getOperand(MulIdx), *Src0, *Src1,
14107 TempNode->getOperand(MulIdx)->getOperand(0),
14108 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
14112 IsSigned = *IterIsSigned;
14113 if (*IterIsSigned != *IsSigned)
14116 auto AddIdx = 1 - MulIdx;
14119 if (
I == 2 &&
isMul(TempNode->getOperand(AddIdx))) {
14120 Src2s.
push_back(TempNode->getOperand(AddIdx));
14130 TempNode->getOperand(AddIdx), *Src0, *Src1,
14131 TempNode->getOperand(AddIdx)->getOperand(0),
14132 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
14136 if (*IterIsSigned != *IsSigned)
14140 ChainLength =
I + 2;
14144 TempNode = TempNode->getOperand(AddIdx);
14146 ChainLength =
I + 1;
14147 if (TempNode->getNumOperands() < 2)
14149 LHS = TempNode->getOperand(0);
14150 RHS = TempNode->getOperand(1);
14153 if (ChainLength < 2)
14159 if (ChainLength < 4) {
14169 bool UseOriginalSrc =
false;
14170 if (ChainLength == 4 && Src0s.
size() == 1 && Src1s.
size() == 1 &&
14171 Src0s.
begin()->PermMask == Src1s.
begin()->PermMask &&
14172 Src0s.
begin()->SrcOp.getValueSizeInBits() >= 32 &&
14173 Src1s.
begin()->SrcOp.getValueSizeInBits() >= 32) {
14175 auto Src0Mask = Src0s.
begin()->PermMask;
14176 SrcBytes.
push_back(Src0Mask & 0xFF000000);
14177 bool UniqueEntries =
true;
14178 for (
auto I = 1;
I < 4;
I++) {
14179 auto NextByte = Src0Mask & (0xFF << ((3 -
I) * 8));
14182 UniqueEntries =
false;
14188 if (UniqueEntries) {
14189 UseOriginalSrc =
true;
14191 auto FirstElt = Src0s.
begin();
14195 auto SecondElt = Src1s.
begin();
14197 SecondElt->DWordOffset);
14206 if (!UseOriginalSrc) {
14213 DAG.
getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
14216 : Intrinsic::amdgcn_udot4,
14226 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
14231 unsigned Opc =
LHS.getOpcode();
14236 Opc =
RHS.getOpcode();
14242 auto Cond =
RHS.getOperand(0);
14250 return DAG.
getNode(Opc, SL, VTList, Args);
14264 DAGCombinerInfo &DCI)
const {
14266 EVT VT =
N->getValueType(0);
14268 if (VT != MVT::i32)
14277 unsigned Opc =
RHS.getOpcode();
14283 auto Cond =
RHS.getOperand(0);
14291 return DAG.
getNode(Opc, SL, VTList, Args);
14305SDValue SITargetLowering::performAddCarrySubCarryCombine(
SDNode *
N,
14306 DAGCombinerInfo &DCI)
const {
14308 if (
N->getValueType(0) != MVT::i32)
14319 unsigned LHSOpc =
LHS.getOpcode();
14320 unsigned Opc =
N->getOpcode();
                                           DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  if (A == LHS.getOperand(1)) {
    unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
    if (FusedOp != 0) {
      return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
  if (A == RHS.getOperand(1)) {
    unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
    if (FusedOp != 0) {
      return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
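// Editor's note: illustrative sketch of the value-level identity behind the
// combine above: (a + a) + b can be re-expressed as fma(a, 2.0, b). The
// listing only performs this when getFusedOpcode reports that a fused
// multiply-add (FMA/FMAD) is legal and profitable for the type.
#include <cmath>

float fuseDoubledAdd(float A, float B) {
  // Original form: (A + A) + B. Since A + A is exact, both forms round
  // 2*A + B exactly once, so the results agree.
  return std::fma(A, 2.0f, B);
}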
14372 DAGCombinerInfo &DCI)
const {
14378 EVT VT =
N->getValueType(0);
14391 if (
A ==
LHS.getOperand(1)) {
14392 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
14397 return DAG.
getNode(FusedOp, SL, VT,
A, Two, NegRHS);
14406 if (
A ==
RHS.getOperand(1)) {
14407 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
14410 return DAG.
getNode(FusedOp, SL, VT,
A, NegTwo, LHS);
14419 DAGCombinerInfo &DCI)
const {
14422 EVT VT =
N->getValueType(0);
14436 bool IsNegative =
false;
14437 if (CLHS->isExactlyValue(1.0) ||
14438 (IsNegative = CLHS->isExactlyValue(-1.0))) {
14454 DAGCombinerInfo &DCI)
const {
14456 EVT VT =
N->getValueType(0);
14478 (
N->getFlags().hasAllowContract() &&
14479 FMA->getFlags().hasAllowContract())) {
14513 if (Vec1 == Vec2 || Vec3 == Vec4)
14519 if ((Vec1 == Vec3 && Vec2 == Vec4) ||
14520 (Vec1 == Vec4 && Vec2 == Vec3)) {
14529 DAGCombinerInfo &DCI)
const {
14535 EVT VT =
LHS.getValueType();
14538 auto CRHS = dyn_cast<ConstantSDNode>(RHS);
14540 CRHS = dyn_cast<ConstantSDNode>(LHS);
14564 return LHS.getOperand(0);
14570 isa<ConstantSDNode>(
LHS.getOperand(1)) &&
14571 isa<ConstantSDNode>(
LHS.getOperand(2)) &&
14572 LHS.getConstantOperandVal(1) !=
LHS.getConstantOperandVal(2) &&
14579 const APInt &CT =
LHS.getConstantOperandAPInt(1);
14580 const APInt &CF =
LHS.getConstantOperandAPInt(2);
14588 return LHS.getOperand(0);
14592 if (VT != MVT::f32 && VT != MVT::f64 &&
14625 DAGCombinerInfo &DCI)
const {
  if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
    unsigned ShiftOffset = 8 * Offset;
      ShiftOffset -= C->getZExtValue();
      ShiftOffset += C->getZExtValue();
    if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
                         MVT::f32, Shifted);
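// Editor's note: illustrative sketch of the byte-index rewrite above
// (names are mine): converting byte N of (X >> C) is the same as converting
// byte N + C/8 of X whenever C is a multiple of 8 and the adjusted index
// still falls inside the dword.
#include <cassert>
#include <cstdint>

float cvtF32UByte(uint32_t X, unsigned ByteIdx) {
  assert(ByteIdx < 4);
  return float((X >> (8 * ByteIdx)) & 0xFF);
}

float cvtF32UByteOfSrl(uint32_t X, unsigned C, unsigned ByteIdx) {
  assert(C % 8 == 0 && ByteIdx + C / 8 < 4);
  // cvtF32UByte(X >> C, ByteIdx) == cvtF32UByte(X, ByteIdx + C / 8)
  return cvtF32UByte(X, ByteIdx + C / 8);
}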
14666 DCI.AddToWorklist(
N);
14673 return DAG.
getNode(
N->getOpcode(), SL, MVT::f32, DemandedSrc);
14679 DAGCombinerInfo &DCI)
const {
14689 return DCI.DAG.getConstantFP(Zero,
SDLoc(
N),
N->getValueType(0));
14692 APFloat One(
F.getSemantics(),
"1.0");
14694 return DCI.DAG.getConstantFP(One,
SDLoc(
N),
N->getValueType(0));
  switch (N->getOpcode()) {
    return performAddCombine(N, DCI);
    return performSubCombine(N, DCI);
    return performAddCarrySubCarryCombine(N, DCI);
    return performFAddCombine(N, DCI);
    return performFSubCombine(N, DCI);
    return performFDivCombine(N, DCI);
    return performSetCCCombine(N, DCI);
    return performMinMaxCombine(N, DCI);
    return performFMACombine(N, DCI);
    return performAndCombine(N, DCI);
    return performOrCombine(N, DCI);
    if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
        TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
    return performXorCombine(N, DCI);
    return performZeroExtendCombine(N, DCI);
    return performSignExtendInRegCombine(N, DCI);
    return performClassCombine(N, DCI);
    return performFCanonicalizeCombine(N, DCI);
    return performRcpCombine(N, DCI);
    return performUCharToFloatCombine(N, DCI);
    return performFCopySignCombine(N, DCI);
    return performCvtF32UByteNCombine(N, DCI);
    return performFMed3Combine(N, DCI);
    return performCvtPkRTZCombine(N, DCI);
    return performClampCombine(N, DCI);
    EVT VT = N->getValueType(0);
    if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
    EVT EltVT = Src.getValueType();
    if (EltVT != MVT::i16)
    return performExtractVectorEltCombine(N, DCI);
    return performInsertVectorEltCombine(N, DCI);
    return performFPRoundCombine(N, DCI);
    if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
    if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
      return performMemSDNodeCombine(MemNode, DCI);
  default: return ~0u;
  case AMDGPU::sub0: return 0;
  case AMDGPU::sub1: return 1;
  case AMDGPU::sub2: return 2;
  case AMDGPU::sub3: return 3;
  case AMDGPU::sub4: return 4;
  unsigned Opcode = Node->getMachineOpcode();
  if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
  unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
  unsigned NewDmask = 0;
  bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
                  (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
  unsigned TFCLane = 0;
  bool HasChain = Node->getNumValues() > 1;
  if (OldDmask == 0) {
    TFCLane = OldBitsSet;
    if (I.getUse().getResNo() != 0)
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
    if (UsesTFC && Lane == TFCLane) {
    for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
      Dmask &= ~(1 << Comp);
    NewDmask |= 1 << Comp;
  bool NoChannels = !NewDmask;
  if (OldBitsSet == 1)
  if (NewDmask == OldDmask)
  unsigned NewChannels = BitsSet + UsesTFC;
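// Editor's note: illustrative sketch of the dmask rewrite above. Result
// lane i of the image op corresponds to the i-th set bit of the old dmask;
// the new dmask keeps only the components whose result lanes are actually
// extracted. UsedLanes is my name for a bitmask of consumed result lanes.
unsigned adjustDmask(unsigned OldDmask, unsigned UsedLanes) {
  unsigned NewDmask = 0;
  unsigned Lane = 0;
  for (unsigned Comp = 0; Comp < 4; ++Comp) {
    if (!(OldDmask & (1u << Comp)))
      continue;                 // component was not requested originally
    if (UsedLanes & (1u << Lane))
      NewDmask |= 1u << Comp;   // keep: this result lane is consumed
    ++Lane;                     // next set bit maps to the next result lane
  }
  return NewDmask;
}
// e.g. adjustDmask(/*OldDmask=*/0xF, /*UsedLanes=*/0b0101) == 0b0101
//      adjustDmask(/*OldDmask=*/0xE, /*UsedLanes=*/0b0001) == 0b0010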
14944 assert(NewOpcode != -1 &&
14945 NewOpcode !=
static_cast<int>(
Node->getMachineOpcode()) &&
14946 "failed to find equivalent MIMG op");
14954 MVT SVT =
Node->getValueType(0).getVectorElementType().getSimpleVT();
14956 MVT ResultVT = NewChannels == 1 ?
14958 NewChannels == 5 ? 8 : NewChannels);
14972 if (NewChannels == 1) {
14982 for (
unsigned i = 0,
Idx = AMDGPU::sub0; i < 5; ++i) {
14987 if (i || !NoChannels)
14992 if (NewUser !=
User) {
15000 case AMDGPU::sub0:
Idx = AMDGPU::sub1;
break;
15001 case AMDGPU::sub1:
Idx = AMDGPU::sub2;
break;
15002 case AMDGPU::sub2:
Idx = AMDGPU::sub3;
break;
15003 case AMDGPU::sub3:
Idx = AMDGPU::sub4;
break;
15013 Op =
Op.getOperand(0);
15015 return isa<FrameIndexSDNode>(
Op);
15024 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
15025 SDValue SrcVal = Node->getOperand(2);
15033 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
15035 SDNode *Glued = Node->getGluedNode();
15037 = DAG.
getCopyToReg(Node->getOperand(0), SL, VReg, SrcVal,
15044 return ToResultReg.
getNode();
15049 for (
unsigned i = 0; i < Node->getNumOperands(); ++i) {
15057 Node->getOperand(i).getValueType(),
15058 Node->getOperand(i)), 0));
15069 unsigned Opcode = Node->getMachineOpcode();
15071 if (
TII->isImage(Opcode) && !
TII->get(Opcode).mayStore() &&
15072 !
TII->isGather4(Opcode) &&
15074 return adjustWritemask(Node, DAG);
15077 if (Opcode == AMDGPU::INSERT_SUBREG ||
15078 Opcode == AMDGPU::REG_SEQUENCE) {
15084 case AMDGPU::V_DIV_SCALE_F32_e64:
15085 case AMDGPU::V_DIV_SCALE_F64_e64: {
15089 SDValue Src0 = Node->getOperand(1);
15090 SDValue Src1 = Node->getOperand(3);
15091 SDValue Src2 = Node->getOperand(5);
15095 (Src0 == Src1 || Src0 == Src2))
15152 unsigned InitIdx = 0;
15154 if (
TII->isImage(
MI)) {
15162 unsigned TFEVal = TFE ? TFE->
getImm() : 0;
15163 unsigned LWEVal = LWE ? LWE->
getImm() : 0;
15164 unsigned D16Val = D16 ? D16->getImm() : 0;
15166 if (!TFEVal && !LWEVal)
15177 assert(MO_Dmask &&
"Expected dmask operand in instruction");
15179 unsigned dmask = MO_Dmask->
getImm();
15186 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
15192 TRI.getRegSizeInBits(*
TII->getOpRegClass(
MI, DstIdx)) / 32;
15193 if (DstSize < InitIdx)
15196 InitIdx =
TRI.getRegSizeInBits(*
TII->getOpRegClass(
MI, DstIdx)) / 32;
15204 Register PrevDst =
MRI.cloneVirtualRegister(
MI.getOperand(DstIdx).getReg());
15205 unsigned NewDst = 0;
15214 for (; SizeLeft; SizeLeft--, CurrIdx++) {
15215 NewDst =
MRI.createVirtualRegister(
TII->getOpRegClass(
MI, DstIdx));
15233 MI.tieOperands(DstIdx,
MI.getNumOperands() - 1);
15246 if (
TII->isVOP3(
MI.getOpcode())) {
15248 TII->legalizeOperandsVOP3(
MRI,
MI);
15253 if (!
MI.getDesc().operands().empty()) {
15254 unsigned Opc =
MI.getOpcode();
15255 bool HasAGPRs =
Info->mayNeedAGPRs();
15263 if ((
I == Src2Idx) && (HasAGPRs))
15266 if (!
Op.isReg() || !
Op.getReg().isVirtual())
15268 auto *RC =
TRI->getRegClassForReg(
MRI,
Op.getReg());
15269 if (!
TRI->hasAGPRs(RC))
15271 auto *Src =
MRI.getUniqueVRegDef(
Op.getReg());
15272 if (!Src || !Src->isCopy() ||
15273 !
TRI->isSGPRReg(
MRI, Src->getOperand(1).getReg()))
15275 auto *NewRC =
TRI->getEquivalentVGPRClass(RC);
15279 MRI.setRegClass(
Op.getReg(), NewRC);
15286 if (
auto *Src2 =
TII->getNamedOperand(
MI, AMDGPU::OpName::src2)) {
15287 if (Src2->isReg() && Src2->getReg().isVirtual()) {
15288 auto *RC =
TRI->getRegClassForReg(
MRI, Src2->getReg());
15289 if (
TRI->isVectorSuperClass(RC)) {
15290 auto *NewRC =
TRI->getEquivalentAGPRClass(RC);
15291 MRI.setRegClass(Src2->getReg(), NewRC);
15292 if (Src2->isTied())
15293 MRI.setRegClass(
MI.getOperand(0).getReg(), NewRC);
15302 if (
TII->isImage(
MI))
15303 TII->enforceOperandRCAlignment(
MI, AMDGPU::OpName::vaddr);
15329 MVT::v2i32, Ops0), 0);
15359 RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
15381std::pair<unsigned, const TargetRegisterClass *>
15388 if (Constraint.
size() == 1) {
15390 switch (Constraint[0]) {
15397 RC = &AMDGPU::SReg_32RegClass;
15400 RC = &AMDGPU::SGPR_64RegClass;
15405 return std::pair(0U,
nullptr);
15412 RC = &AMDGPU::VGPR_32RegClass;
15417 return std::pair(0U,
nullptr);
15426 RC = &AMDGPU::AGPR_32RegClass;
15431 return std::pair(0U,
nullptr);
15440 return std::pair(0U, RC);
  if (RegName.consume_front("v")) {
    RC = &AMDGPU::VGPR_32RegClass;
  } else if (RegName.consume_front("s")) {
    RC = &AMDGPU::SGPR_32RegClass;
  } else if (RegName.consume_front("a")) {
    RC = &AMDGPU::AGPR_32RegClass;
  if (RegName.consume_front("[")) {
    RC = TRI->getVGPRClassForBitWidth(Width);
    RC = TRI->getSGPRClassForBitWidth(Width);
    RC = TRI->getAGPRClassForBitWidth(Width);
    Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
    return std::pair(Reg, RC);
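// Editor's note: illustrative sketch (my own simplification, not the code
// above) of the register-name forms handled here: "v", "s" or "a" followed
// by either a single index ("v5") or a range ("s[8:11]"); the width of a
// range decides which register class is chosen.
#include <cstdio>
#include <string>

struct ParsedReg {
  char Bank;        // 'v', 's' or 'a'
  unsigned First;   // first 32-bit register index
  unsigned NumRegs; // 1 for "v5", hi - lo + 1 for "v[lo:hi]"
};

bool parseAsmReg(const std::string &Name, ParsedReg &Out) {
  if (Name.empty() || (Name[0] != 'v' && Name[0] != 's' && Name[0] != 'a'))
    return false;
  Out.Bank = Name[0];
  unsigned Lo = 0, Hi = 0;
  if (std::sscanf(Name.c_str() + 1, "[%u:%u]", &Lo, &Hi) == 2 && Hi >= Lo) {
    Out.First = Lo;
    Out.NumRegs = Hi - Lo + 1;
    return true;
  }
  if (std::sscanf(Name.c_str() + 1, "%u", &Lo) == 1) {
    Out.First = Lo;
    Out.NumRegs = 1;
    return true;
  }
  return false;
}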
15477 if (!
Failed && Idx < RC->getNumRegs())
15485 Ret.second =
TRI->getPhysRegBaseClass(Ret.first);
15491 if (Constraint.
size() == 1) {
15492 switch (Constraint[0]) {
15501 }
else if (Constraint ==
"DA" ||
15502 Constraint ==
"DB") {
15510 if (Constraint.
size() == 1) {
15511 switch (Constraint[0]) {
15527 Val = Val & maskTrailingOnes<uint64_t>(
Size);
15534 std::vector<SDValue> &Ops,
15549 unsigned Size =
Op.getScalarValueSizeInBits();
15557 Val =
C->getSExtValue();
15561 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
15567 if (
Op.getOperand(0).isUndef() ||
Op.getOperand(1).isUndef())
15570 Val =
C->getSExtValue();
15574 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
15584 if (Constraint.
size() == 1) {
15585 switch (Constraint[0]) {
15589 return isInt<16>(Val);
15593 return isInt<32>(Val);
15600 }
else if (Constraint.
size() == 2) {
15601 if (Constraint ==
"DA") {
15602 int64_t HiBits =
static_cast<int32_t
>(Val >> 32);
15603 int64_t LoBits =
static_cast<int32_t
>(Val);
15607 if (Constraint ==
"DB") {
15615 unsigned MaxSize)
const {
15616 unsigned Size = std::min<unsigned>(
Op.getScalarValueSizeInBits(), MaxSize);
15619 MVT VT =
Op.getSimpleValueType();
15644 switch (UnalignedClassID) {
15645 case AMDGPU::VReg_64RegClassID:
15646 return AMDGPU::VReg_64_Align2RegClassID;
15647 case AMDGPU::VReg_96RegClassID:
15648 return AMDGPU::VReg_96_Align2RegClassID;
15649 case AMDGPU::VReg_128RegClassID:
15650 return AMDGPU::VReg_128_Align2RegClassID;
15651 case AMDGPU::VReg_160RegClassID:
15652 return AMDGPU::VReg_160_Align2RegClassID;
15653 case AMDGPU::VReg_192RegClassID:
15654 return AMDGPU::VReg_192_Align2RegClassID;
15655 case AMDGPU::VReg_224RegClassID:
15656 return AMDGPU::VReg_224_Align2RegClassID;
15657 case AMDGPU::VReg_256RegClassID:
15658 return AMDGPU::VReg_256_Align2RegClassID;
15659 case AMDGPU::VReg_288RegClassID:
15660 return AMDGPU::VReg_288_Align2RegClassID;
15661 case AMDGPU::VReg_320RegClassID:
15662 return AMDGPU::VReg_320_Align2RegClassID;
15663 case AMDGPU::VReg_352RegClassID:
15664 return AMDGPU::VReg_352_Align2RegClassID;
15665 case AMDGPU::VReg_384RegClassID:
15666 return AMDGPU::VReg_384_Align2RegClassID;
15667 case AMDGPU::VReg_512RegClassID:
15668 return AMDGPU::VReg_512_Align2RegClassID;
15669 case AMDGPU::VReg_1024RegClassID:
15670 return AMDGPU::VReg_1024_Align2RegClassID;
15671 case AMDGPU::AReg_64RegClassID:
15672 return AMDGPU::AReg_64_Align2RegClassID;
15673 case AMDGPU::AReg_96RegClassID:
15674 return AMDGPU::AReg_96_Align2RegClassID;
15675 case AMDGPU::AReg_128RegClassID:
15676 return AMDGPU::AReg_128_Align2RegClassID;
15677 case AMDGPU::AReg_160RegClassID:
15678 return AMDGPU::AReg_160_Align2RegClassID;
15679 case AMDGPU::AReg_192RegClassID:
15680 return AMDGPU::AReg_192_Align2RegClassID;
15681 case AMDGPU::AReg_256RegClassID:
15682 return AMDGPU::AReg_256_Align2RegClassID;
15683 case AMDGPU::AReg_512RegClassID:
15684 return AMDGPU::AReg_512_Align2RegClassID;
15685 case AMDGPU::AReg_1024RegClassID:
15686 return AMDGPU::AReg_1024_Align2RegClassID;
15702 if (
Info->isEntryFunction()) {
15709 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
15711 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
15712 :
TRI->getAlignedHighSGPRForRC(MF, 2,
15713 &AMDGPU::SGPR_64RegClass);
15714 Info->setSGPRForEXECCopy(SReg);
15717 Info->getStackPtrOffsetReg()));
15718 if (
Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
15719 MRI.replaceRegWith(AMDGPU::SP_REG,
Info->getStackPtrOffsetReg());
15723 if (
Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
15724 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG,
Info->getScratchRSrcReg());
15726 if (
Info->getFrameOffsetReg() != AMDGPU::FP_REG)
15727 MRI.replaceRegWith(AMDGPU::FP_REG,
Info->getFrameOffsetReg());
15729 Info->limitOccupancy(MF);
15731 if (ST.isWave32() && !MF.
empty()) {
15732 for (
auto &
MBB : MF) {
15733 for (
auto &
MI :
MBB) {
15734 TII->fixImplicitOperands(
MI);
15744 if (ST.needsAlignedVGPRs()) {
15745 for (
unsigned I = 0, E =
MRI.getNumVirtRegs();
I != E; ++
I) {
15751 if (NewClassID != -1)
15752 MRI.setRegClass(Reg,
TRI->getRegClass(NewClassID));
15761 const APInt &DemandedElts,
15763 unsigned Depth)
const {
15765 unsigned Opc =
Op.getOpcode();
15768 unsigned IID =
Op.getConstantOperandVal(0);
15770 case Intrinsic::amdgcn_mbcnt_lo:
15771 case Intrinsic::amdgcn_mbcnt_hi: {
15778 unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
15780 MaxActiveBits += Src1ValBits ? 1 : 0;
15781 unsigned Size =
Op.getValueType().getSizeInBits();
15782 if (MaxActiveBits <
Size)
15791 Op, Known, DemandedElts, DAG,
Depth);
15806 unsigned MaxValue =
15815 switch (
MI->getOpcode()) {
15816 case AMDGPU::G_INTRINSIC:
15817 case AMDGPU::G_INTRINSIC_CONVERGENT: {
15819 case Intrinsic::amdgcn_workitem_id_x:
15822 case Intrinsic::amdgcn_workitem_id_y:
15825 case Intrinsic::amdgcn_workitem_id_z:
15828 case Intrinsic::amdgcn_mbcnt_lo:
15829 case Intrinsic::amdgcn_mbcnt_hi: {
15831 unsigned Size =
MRI.getType(R).getSizeInBits();
15835 case Intrinsic::amdgcn_groupstaticsize: {
15846 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
15849 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
15852 case AMDGPU::G_AMDGPU_SMED3:
15853 case AMDGPU::G_AMDGPU_UMED3: {
15854 auto [Dst, Src0, Src1, Src2] =
MI->getFirst4Regs();
15881 unsigned Depth)
const {
15883 if (
auto *GI = dyn_cast<GIntrinsic>(
MI)) {
15889 if (
MaybeAlign RetAlign = Attrs.getRetAlignment())
  if (Header->getAlignment() != PrefAlign)
    return Header->getAlignment();
  unsigned LoopSize = 0;
    LoopSize += TII->getInstSizeInBytes(MI);
  if (LoopSize > 192)
  if (LoopSize <= 64)
  if (LoopSize <= 128)
    return CacheLineAlign;
  auto I = Exit->getFirstNonDebugInstr();
  if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
    return CacheLineAlign;
  if (PreTerm == Pre->begin() ||
      std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
  auto ExitHead = Exit->getFirstNonDebugInstr();
  if (ExitHead == Exit->end() ||
      ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
  return CacheLineAlign;
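// Editor's note: illustrative sketch of the loop-alignment decision above,
// using plain byte values instead of Align objects (names are mine). Small
// loops keep the requested default alignment; loops of up to three 64-byte
// cache lines get cache-line alignment so S_INST_PREFETCH can cover the
// rest; loops larger than 192 bytes are left at the default.
unsigned prefLoopAlignBytes(unsigned LoopSizeBytes, unsigned DefaultAlign) {
  const unsigned CacheLine = 64;  // instruction cache line size used above
  if (LoopSizeBytes > 192)
    return DefaultAlign;          // too large to benefit from prefetching
  if (LoopSizeBytes <= 64)
    return DefaultAlign;          // already fits in a single cache line
  return CacheLine;               // 65..192 bytes: align and prefetch
}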
15974 N =
N->getOperand(0).getNode();
15985 switch (
N->getOpcode()) {
15993 if (Reg.isPhysical() ||
MRI.isLiveIn(Reg))
15994 return !
TRI->isSGPRReg(
MRI, Reg);
16000 return !
TRI->isSGPRReg(
MRI, Reg);
16004 unsigned AS = L->getAddressSpace();
16038 if (
auto *
A = dyn_cast<AtomicSDNode>(
N)) {
16040 return A->readMem() &&
A->writeMem();
16075 unsigned Depth)
const {
16080 if (
Info->getMode().DX10Clamp)
16093static bool fpModeMatchesGlobalFPAtomicMode(
const AtomicRMWInst *RMW) {
16107 return F->getFnAttribute(
"amdgpu-unsafe-fp-atomics").getValueAsString() !=
16120 <<
"Hardware instruction generated for atomic "
16122 <<
" operation at memory scope " << MemScope;
16140 bool HasSystemScope =
16199 if (HasSystemScope)
16248 if (HasSystemScope)
16285 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
16287 : &AMDGPU::SReg_32RegClass;
16288 if (!
TRI->isSGPRClass(RC) && !isDivergent)
16289 return TRI->getEquivalentSGPRClass(RC);
16290 else if (
TRI->isSGPRClass(RC) && isDivergent)
16291 return TRI->getEquivalentVGPRClass(RC);
16303 unsigned WaveSize) {
16308 if (!
IT ||
IT->getBitWidth() != WaveSize)
16311 if (!isa<Instruction>(V))
16313 if (!Visited.
insert(V).second)
16315 bool Result =
false;
16316 for (
const auto *U : V->users()) {
16317 if (
const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
16318 if (V == U->getOperand(1)) {
16319 switch (Intrinsic->getIntrinsicID()) {
16323 case Intrinsic::amdgcn_if_break:
16324 case Intrinsic::amdgcn_if:
16325 case Intrinsic::amdgcn_else:
16330 if (V == U->getOperand(0)) {
16331 switch (Intrinsic->getIntrinsicID()) {
16335 case Intrinsic::amdgcn_end_cf:
16336 case Intrinsic::amdgcn_loop:
16342 Result =
hasCFUser(U, Visited, WaveSize);
16351 const Value *V)
const {
16352 if (
const CallInst *CI = dyn_cast<CallInst>(V)) {
16353 if (CI->isInlineAsm()) {
16362 for (
auto &TC : TargetConstraints) {
16366 SIRI, TC.ConstraintCode, TC.ConstraintVT).second;
16379 for (;
I != E; ++
I) {
16380 if (
MemSDNode *M = dyn_cast<MemSDNode>(*
I)) {
16403 return MRI.hasOneNonDBGUse(N0);
16410 if (
I.getMetadata(
"amdgpu.noclobber"))
16412 if (
I.getMetadata(
"amdgpu.last.use"))
16422 if (!Def->isMachineOpcode())
16433 PhysReg = AMDGPU::SCC;
16435 TRI->getMinimalPhysRegClass(PhysReg, Def->getSimpleValueType(ResNo));
16449 "this cannot be replaced with add");
16455 "target should have atomic fadd instructions");
16458 "generic atomicrmw expansion only supports FP32 operand in flat "
16532 for (
auto &
P : MDs)
16543 {
Addr},
nullptr,
"is.shared");
16544 Builder.
CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
16549 Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
16554 Intrinsic::amdgcn_is_private, {}, {
Addr},
nullptr,
"is.private");
16560 Value *LoadedPrivate =
16561 Builder.
CreateLoad(ValTy, CastToPrivate,
"loaded.private");
16569 Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
static bool isMul(MachineInstr *MI)
static unsigned getIntrinsicID(const SDNode *N)
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
static bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static EVT memVTFromLoadIntrData(Type *Ty, unsigned MaxNumLanes)
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static bool isClampZeroToOne(SDValue A, SDValue B)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
bool unsafeFPAtomicsDisabled(Function *F)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB, KnownBits &Known, unsigned Dim)
static LLVM_ATTRIBUTE_UNUSED bool isCopyFromRegOfInlineAsm(const SDNode *N)
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getIdxEn(SDValue VIndex)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static constexpr int Concat[]
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
void setFuncArgInfo(const Function &F, const AMDGPUFunctionArgInfo &ArgInfo)
static bool isUniformMMO(const MachineMemOperand *MMO)
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
uint32_t getLDSSize() const
void setUsesDynamicLDS(bool DynLDS)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
bool isEntryFunction() const
bool hasMadMacF32Insts() const
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-generated.
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool hasMadMixInsts() const
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool isAmdHsaOrMesa(const Function &F) const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasInv2PiInlineImm() const
bool hasVOP3PInsts() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
bool hasD16Images() const
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool hasDot7Insts() const
bool hasApertureRegs() const
bool hasFlatInstOffsets() const
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasGFX90AInsts() const
bool hasBCNT(unsigned Size) const
bool hasMultiDwordFlatScratchAddressing() const
bool hasArchitectedSGPRs() const
bool hasDenormModeInst() const
bool hasPrivEnabledTrap2NopBug() const
bool hasUnalignedDSAccessEnabled() const
const SIInstrInfo * getInstrInfo() const override
bool hasDot1Insts() const
bool hasAtomicFaddRtnInsts() const
Align getStackAlignment() const
bool hasScalarSubwordLoads() const
bool enableFlatScratch() const
bool hasDwordx3LoadStores() const
bool hasFlatScrRegister() const
bool supportsGetDoorbellID() const
bool hasFlatAtomicFaddF32Inst() const
bool hasKernargPreload() const
const SIRegisterInfo * getRegisterInfo() const override
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const
bool hasLDSMisalignedBug() const
bool hasUserSGPRInit16Bug() const
TrapHandlerAbi getTrapHandlerAbi() const
const SIFrameLowering * getFrameLowering() const override
bool hasUnalignedScratchAccess() const
bool hasRestrictedSOffset() const
bool hasMin3Max3_16() const
bool hasGFX10_AEncoding() const
bool hasPackedFP32Ops() const
bool hasGFX940Insts() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
bool hasLDSFPAtomicAddF64() const
bool hasFlatGlobalInsts() const
bool getScalarizeGlobalBehavior() const
bool hasScalarSMulU64() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasShaderCyclesHiLoRegisters() const
bool hasNSAEncoding() const
bool hasSMemRealTime() const
bool usePRTStrictNull() const
bool hasUnalignedBufferAccessEnabled() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasImageGather4D16Bug() const
bool supportsMinMaxDenormModes() const
bool hasAtomicFaddInsts() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasAtomicFaddNoRtnInsts() const
bool hasScalarDwordx3Loads() const
bool hasLDSFPAtomicAddF32() const
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDot8Insts() const
bool hasDS96AndDS128() const
bool useFlatForGlobal() const
Generation getGeneration() const
bool hasScalarAddSub64() const
bool hasUnpackedD16VMem() const
bool hasIEEEMinMax() const
bool hasFmaMixInsts() const
bool hasPackedTID() const
bool hasAddNoCarry() const
bool hasGWSAutoReplay() const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
virtual void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
const MachineFunction & getMachineFunction() const
int64_t getOffset() const
unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Value * CreateFAdd(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
BasicBlock::iterator GetInsertPoint() const
BasicBlock * GetInsertBlock() const
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
LLVMContext & getContext() const
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
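Taken together, the IRBuilder entry points listed above are the building blocks of an IR-level atomic expansion. The following is a minimal, hypothetical sketch (not the actual AMDGPU expansion) of how SetInsertPoint, CreateAlignedLoad, CreatePHI, CreateFAdd, CreateBr and CreateCondBr compose into a load + compare-exchange retry loop; every name in it is illustrative.
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;
// Sketch only: expand 'atomicrmw fadd' into a load + cmpxchg retry loop.
static void expandFAddWithCmpXchgLoop(AtomicRMWInst *AI) {
  IRBuilder<> Builder(AI);
  Value *Ptr = AI->getPointerOperand();
  Value *Incr = AI->getValOperand();
  Type *FPTy = Incr->getType();
  Type *IntTy =
      Builder.getIntNTy(FPTy->getPrimitiveSizeInBits().getFixedValue());
  BasicBlock *EntryBB = AI->getParent();
  Function *F = EntryBB->getParent();
  // Everything from the atomicrmw onward moves into the exit block.
  BasicBlock *ExitBB = EntryBB->splitBasicBlock(AI->getIterator(), "fadd.exit");
  BasicBlock *LoopBB =
      BasicBlock::Create(F->getContext(), "fadd.loop", F, ExitBB);
  // Redirect the fall-through branch created by splitBasicBlock into the loop.
  EntryBB->getTerminator()->eraseFromParent();
  Builder.SetInsertPoint(EntryBB);
  LoadInst *Init = Builder.CreateAlignedLoad(FPTy, Ptr, AI->getAlign(), "init");
  Builder.CreateBr(LoopBB);
  Builder.SetInsertPoint(LoopBB);
  PHINode *Loaded = Builder.CreatePHI(FPTy, 2, "loaded");
  Loaded->addIncoming(Init, EntryBB);
  Value *Sum = Builder.CreateFAdd(Loaded, Incr, "sum");
  // cmpxchg operates on integers, so round-trip through a bitcast.
  Value *Expected = Builder.CreateBitCast(Loaded, IntTy);
  Value *Desired = Builder.CreateBitCast(Sum, IntTy);
  AtomicCmpXchgInst *CAS = Builder.CreateAtomicCmpXchg(
      Ptr, Expected, Desired, AI->getAlign(), AI->getOrdering(),
      AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()),
      AI->getSyncScopeID());
  Value *OldInt = Builder.CreateExtractValue(CAS, 0, "old");
  Value *Success = Builder.CreateExtractValue(CAS, 1, "ok");
  Value *Old = Builder.CreateBitCast(OldInt, FPTy);
  Loaded->addIncoming(Old, LoopBB);
  // Retry until the compare-exchange succeeds.
  Builder.CreateCondBr(Success, ExitBB, LoopBB);
  AI->replaceAllUsesWith(Old);
  AI->eraseFromParent();
}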
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
const BasicBlock * getParent() const
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
void getAllMetadata(SmallVectorImpl< std::pair< unsigned, MDNode * > > &MDs) const
Get all metadata attached to this Instruction.
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
void getSyncScopeNames(SmallVectorImpl< StringRef > &SSNs) const
getSyncScopeNames - Populates client supplied SmallVector with synchronization scope names registered...
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
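The sync-scope and LoadInst entries above are typically used together when an idempotent RMW is rewritten as a fenced atomic load. A small sketch under that assumption (the scope name "agent" follows AMDGPU convention; the helper itself is hypothetical):
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;
// Sketch: create an atomic load at a named synchronization scope.
static LoadInst *emitAtomicLoadAtAgentScope(IRBuilder<> &Builder, Type *Ty,
                                            Value *Ptr, Align Alignment,
                                            AtomicOrdering Ordering) {
  LLVMContext &Ctx = Builder.getContext();
  SyncScope::ID AgentSSID = Ctx.getOrInsertSyncScopeID("agent");
  LoadInst *LI = Builder.CreateAlignedLoad(Ty, Ptr, Alignment, "atomic.load");
  LI->setAtomic(Ordering, AgentSSID);
  return LI;
}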
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
bool isCompare() const
Return true if this instruction is a comparison.
bool hasImplicitDefOfPhysReg(unsigned Reg, const MCRegisterInfo *MRI=nullptr) const
Return true if this instruction implicitly defines the specified physical register.
Wrapper class representing physical registers. Should be passed by value.
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
bool bitsLT(MVT VT) const
Return true if this has less bits than VT.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
void removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs=false)
Remove successor from the successors list of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before 'Where'.
Align getAlignment() const
Return alignment of the basic block.
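The MachineBasicBlock calls above are the usual ingredients of the control-flow surgery done inside a custom inserter. A hedged sketch of that pattern (the block names and the loop shape are illustrative, not the actual SITargetLowering code):
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include <iterator>
using namespace llvm;
// Sketch: split BB around MI into BB -> LoopBB -> RemainderBB.
static MachineBasicBlock *splitBlockAroundMI(MachineInstr &MI,
                                             MachineBasicBlock *BB) {
  MachineFunction *MF = BB->getParent();
  const BasicBlock *LLVMBB = BB->getBasicBlock();
  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock(LLVMBB);
  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock(LLVMBB);
  MF->insert(std::next(BB->getIterator()), LoopBB);
  MF->insert(std::next(LoopBB->getIterator()), RemainderBB);
  // Everything after MI belongs to the remainder block, which also takes
  // over BB's original successors (and fixes up PHIs in them).
  RemainderBB->splice(RemainderBB->begin(), BB,
                      std::next(MachineBasicBlock::iterator(MI)), BB->end());
  RemainderBB->transferSuccessorsAndUpdatePHIs(BB);
  BB->addSuccessor(LoopBB);
  LoopBB->addSuccessor(LoopBB);       // back-edge of the loop
  LoopBB->addSuccessor(RemainderBB);  // loop exit
  return RemainderBB;
}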
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual register for it.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
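The MachineInstrBuilder helpers above are chained off BuildMI when an instruction is re-emitted during expansion. A short, purely illustrative sketch (opcodes and registers are parameters, not real AMDGPU encodings):
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;
// Sketch: rebuild MI under a new opcode and branch to a continuation block.
static void rebuildWithBranch(MachineBasicBlock &MBB, MachineInstr &MI,
                              const TargetInstrInfo *TII, unsigned NewOpc,
                              unsigned BranchOpc, Register Dst,
                              MachineBasicBlock *Target) {
  const DebugLoc &DL = MI.getDebugLoc();
  // Re-emit MI under NewOpc, reusing its operand #1, adding a zero
  // immediate, and preserving its memory operands.
  BuildMI(MBB, MI, DL, TII->get(NewOpc), Dst)
      .add(MI.getOperand(1))
      .addImm(0)
      .cloneMemRefs(MI);
  // Branch to the block that continues execution.
  BuildMI(MBB, MI, DL, TII->get(BranchOpc)).addMBB(Target);
  MI.eraseFromParent();
}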
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
const MachinePointerInfo & getPointerInfo() const
Flags getFlags() const
Return the raw flags of the source value.
AAMDNodes getAAInfo() const
Return the AA tags for the memory reference.
Align getBaseAlign() const
Return the minimum known alignment in bytes of the base address, without the offset.
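The flags and accessors above come together when a new MachineMemOperand is allocated for a lowered memory operation. A minimal sketch, assuming the caller already derived the pointer info and alignment (the helper name and the 32-bit LLT are illustrative):
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;
// Sketch: MMO for a 32-bit dereferenceable, invariant load.
static MachineMemOperand *makeInvariantLoadMMO(MachineFunction &MF,
                                               MachinePointerInfo PtrInfo,
                                               Align Alignment) {
  auto MMOFlags = MachineMemOperand::MOLoad |
                  MachineMemOperand::MODereferenceable |
                  MachineMemOperand::MOInvariant;
  return MF.getMachineMemOperand(PtrInfo, MMOFlags, LLT::scalar(32),
                                 Alignment);
}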
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
A Module instance is used to store all the information related to an LLVM module.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
AnalysisType & getAnalysis() const
getAnalysis<AnalysisType>() - This function is used by subclasses to get to the analysis information ...
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation functions.
const DebugLoc & getDebugLoc() const
This class provides iterator support for SDUse operands that use a specific SDNode.
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getMachineOpcode() const
This may only be called if isMachineOpcode returns true.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
use_iterator use_begin() const
Provide iteration support to walk over all uses of an SDNode.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
static use_iterator use_end()
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
bool isLegalFLATOffset(int64_t Offset, unsigned AddrSpace, uint64_t FlatVariant) const
Returns if Offset is legal for the subtarget as the offset to a FLAT encoded instruction.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op, const TargetRegisterInfo *TRI, const TargetInstrInfo *TII, unsigned &PhysReg, int &Cost) const override
Allows the target to handle physreg-carried dependency in target-specific way.
EVT getOptimalMemOpType(const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the node can be combined with other operations to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::v5i32 because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth=5) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isMemOpUniform(const SDNode *N) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
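As a rough illustration of what such an override decides, here is a hypothetical sketch of the general shape (the subtarget query is a made-up boolean; the real SITargetLowering logic also considers address spaces, denormal modes and fine-grained memory):
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;
// Sketch: expand FP-add RMWs in IR when no native instruction exists.
TargetLowering::AtomicExpansionKind
exampleShouldExpandAtomicRMW(const AtomicRMWInst *RMW, bool HasNativeFAdd) {
  if (RMW->getOperation() == AtomicRMWInst::FAdd && !HasNativeFAdd)
    return TargetLowering::AtomicExpansionKind::CmpXChg; // expand in IR
  return TargetLowering::AtomicExpansionKind::None;      // keep the atomicrmw
}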
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns whether Op is known to never be a signaling NaN.
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform a atomicrmw expansion using a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
void computeKnownBitsForTargetInstr(GISelKnownBits &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
bool getAddrModeArguments(IntrinsicInst *, SmallVectorImpl< Value * > &, Type *&) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the address.
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
bool isKnownNeverSNaN(SDValue Op, unsigned Depth=0) const
SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS)
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const Pass * getPass() const
SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
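GetSplitDestVTs and SplitVectorOperand are the core of the standard "split a wide vector op in half" pattern used throughout vector lowering. A small sketch of that pattern under the assumption that the element count splits evenly (the helper itself is illustrative, not the in-tree splitBinaryVectorOp):
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Sketch: lower a wide binary vector op as two half-width ops.
static SDValue splitWideBinaryOp(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VT);
  auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
  auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
  SDValue Lo = DAG.getNode(Op.getOpcode(), DL, LoVT, Lo0, Lo1, Op->getFlags());
  SDValue Hi = DAG.getNode(Op.getOpcode(), DL, HiVT, Hi0, Hi1, Op->getFlags());
  // Stitch the halves back together into the original type.
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}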
SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, bool isTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), AAResults *AA=nullptr)
SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getRegister(unsigned Reg, EVT VT)
void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
static const fltSemantics & EVTToAPFloatSemantics(EVT VT)
Returns an APFloat semantics tag appropriate for the given type.
const TargetMachine & getTarget() const
SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, unsigned Reg, SDValue N)
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond)
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
SDValue getValueType(EVT)
SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
bool isKnownNeverNaN(SDValue Op, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, unsigned Reg, EVT VT)
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
SDValue getRegisterMask(const uint32_t *RegMask)
SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getCondCode(ISD::CondCode Cond)
bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL, bool LegalTypes=true)
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=0, const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
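A brief sketch of how getMemIntrinsicNode is typically invoked when a chained target intrinsic needs to be visible to the DAG as a memory operation; the VT list, memory type, pointer info and operands are assumed to come from the surrounding lowering code:
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Sketch: wrap a chained intrinsic in a MemIntrinsicNode.
static SDValue buildMemIntrinsic(SelectionDAG &DAG, const SDLoc &DL,
                                 SDVTList VTList, ArrayRef<SDValue> Ops,
                                 EVT MemVT, MachinePointerInfo PtrInfo,
                                 Align Alignment) {
  return DAG.getMemIntrinsicNode(
      ISD::INTRINSIC_W_CHAIN, DL, VTList, Ops, MemVT, PtrInfo, Alignment,
      MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
}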
SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
iterator insert(iterator I, T &&Elt)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
constexpr size_t size() const
size - Get the string size.
constexpr const char * data() const
data - Get a pointer to the start of the string (which may not be null terminated).
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
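StringSwitch is the idiom used to resolve textual names, for example in a getRegisterByName-style hook. A self-contained sketch with a made-up mapping (the enum and the names are purely illustrative):
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;
enum class NamedReg { None, StackPtr, FramePtr, ExecLo };
// Sketch: map a register name string to an enum value.
static NamedReg parseRegName(StringRef Name) {
  return StringSwitch<NamedReg>(Name)
      .Case("sp", NamedReg::StackPtr)
      .Case("fp", NamedReg::FramePtr)
      .Case("exec_lo", NamedReg::ExecLo)
      .Default(NamedReg::None);
}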
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligned on entry to a function.
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to make them valid.
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layout.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the data layout.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the target instruction selector can accept natively.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "look through" ops that don't contri...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command line.
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
int getCopyCost() const
Return the cost of copying a value between two registers in this class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
const fltSemantics & getFltSemantics() const
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
static IntegerType * getInt32Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
A Use represents the edge between a Value definition and its users.
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< use_iterator > uses()
void takeName(Value *V)
Transfer the name from V to this value.
constexpr bool isZero() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
@ BUFFER_ATOMIC_COND_SUB_U32
@ TBUFFER_LOAD_FORMAT_D16
@ BUFFER_ATOMIC_FADD_BF16
@ TBUFFER_STORE_FORMAT_D16
@ BUFFER_STORE_FORMAT_D16
@ CLAMP
CLAMP value between 0.0 and 1.0.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
LLVM_READONLY int getGlobalSaddrOp(uint16_t Opcode)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
LLVM_READONLY int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isChainCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, uint64_t NamedIdx)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
bool isShader(CallingConv::ID cc)
bool isGFX10Plus(const MCSubtargetInfo &STI)
LLVM_READONLY int getVOPe64(uint16_t Opcode)
@ TowardZeroF32_TowardNegativeF64
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
bool isGraphics(CallingConv::ID cc)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, ptr, val) This corresponds to the "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SET_ROUNDING
Set rounding mode.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to the "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
@ SMULO
Same overflow-reporting scheme as the addition/subtraction nodes, but for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
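For instance, swapping the operands of an unsigned less-than yields an unsigned greater-than; a tiny sketch (header choice is an assumption):
#include "llvm/CodeGen/ISDOpcodes.h"   // assumed header for ISD::CondCode helpers
#include <cassert>
static void condCodeSwapExample() {
  // (X u< Y) holds exactly when (Y u> X) holds.
  llvm::ISD::CondCode Swapped =
      llvm::ISD::getSetCCSwappedOperands(llvm::ISD::SETULT);
  assert(Swapped == llvm::ISD::SETUGT);
  (void)Swapped;
}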
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
AttributeList getAttributes(LLVMContext &C, ID id)
Return the attributes for an intrinsic.
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Define
Register definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ System
Synchronized with respect to all concurrently executing threads.
Reg
All possible values of the reg field in the ModR/M byte.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
int popcount(T Value) noexcept
Count the number of set bits in a value.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
testing::Matcher< const detail::ErrorHolder & > Failed()
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count the number of 0s from the least significant bit to the most significant bit, stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
bool isReleaseOrStronger(AtomicOrdering AO)
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
int countl_zero(T Val)
Count the number of 0s from the most significant bit to the least significant bit, stopping at the first 1.
bool isBoolSGPR(SDValue V)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
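A small sketch (helper names invented) of the integer helpers listed here, e.g. splitting a 64-bit immediate into the 32-bit halves that 32-bit instructions operate on, and rounding a bit count up to dwords:
#include "llvm/Support/MathExtras.h"
#include <cstdint>
// How many 32-bit dwords are needed to hold a value of BitSize bits.
static unsigned dwordsFor(unsigned BitSize) {
  return llvm::divideCeil(BitSize, 32);
}
// Split a 64-bit immediate into its low and high 32-bit halves.
static void splitImm64(uint64_t Imm, uint32_t &Lo, uint32_t &Hi) {
  Lo = llvm::Lo_32(Imm);
  Hi = llvm::Hi_32(Imm);
}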
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
unsigned getUndefRegState(bool B)
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
unsigned M0(unsigned Val)
int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
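A minimal sketch of the range helpers above (from llvm/ADT/STLExtras.h); the container contents are illustrative only:
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
static void rangeHelpersDemo() {
  llvm::SmallVector<int, 8> Vals = {1, 2, 3, 4};
  bool HasEven = llvm::any_of(Vals, [](int V) { return (V & 1) == 0; });
  bool HasThree = llvm::is_contained(Vals, 3);
  llvm::SmallVector<int, 8> Tail;
  llvm::append_range(Tail, llvm::drop_begin(Vals)); // copies {2, 3, 4}
  (void)HasEven;
  (void)HasThree;
}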
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value that is Skew mod Align.
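A short sketch of the alignment helpers above; the offsets are illustrative:
#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
static void alignDemo() {
  llvm::Align StackAlign(16);
  uint64_t Offset = 36;
  uint64_t Padded = llvm::alignTo(Offset, StackAlign);            // 48
  uint64_t Rounded = llvm::alignDown(Offset, StackAlign.value()); // 32
  llvm::Align AtOffset = llvm::commonAlignment(StackAlign, Offset); // Align(4)
  (void)Padded;
  (void)Rounded;
  (void)AtOffset;
}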
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
static const fltSemantics & IEEEsingle() LLVM_READNONE
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf() LLVM_READNONE
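A minimal sketch (not from the source) of using the semantics and rounding-mode objects listed above to test whether an f32 constant converts to half precision without loss:
#include "llvm/ADT/APFloat.h"
static bool fitsInHalfLosslessly(float F) {
  llvm::APFloat Val(F); // constructed with IEEEsingle semantics
  bool LosesInfo = false;
  Val.convert(llvm::APFloat::IEEEhalf(),
              llvm::APFloat::rmNearestTiesToEven, &LosesInfo);
  return !LosesInfo;
}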
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
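A hedged sketch of how the ArgDescriptor factories above record where a preloaded value lives; the helper name, mask value, and include path are assumptions:
#include "AMDGPUArgumentUsageInfo.h"   // assumed include for ArgDescriptor
#include "llvm/CodeGen/Register.h"
static llvm::ArgDescriptor describePreloadedValue(llvm::Register PhysReg,
                                                  bool InRegister,
                                                  unsigned StackOffset) {
  if (InRegister)
    // 10-bit mask, e.g. for a packed work-item ID component (illustrative).
    return llvm::ArgDescriptor::createRegister(PhysReg, /*Mask=*/0x3ffu);
  return llvm::ArgDescriptor::createStack(StackOffset);
}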
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
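A minimal sketch of the DenormalMode helpers above: choosing flush-to-zero (preserve-sign) versus full IEEE handling; the condition and function name are hypothetical.
#include "llvm/ADT/FloatingPointMode.h"
static llvm::DenormalMode pickF32DenormalMode(bool AllowFlushToZero) {
  return AllowFlushToZero ? llvm::DenormalMode::getPreserveSign()
                          : llvm::DenormalMode::getIEEE();
}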
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the vector's number of elements is a power of 2.
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
bool isInteger() const
Return true if this is an integer or a vector integer type.
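A minimal sketch of the EVT queries listed above, building a vector type and checking its shape; the context reference is assumed to be available in the surrounding lowering code:
#include "llvm/CodeGen/ValueTypes.h"
#include <cassert>
static llvm::EVT makeI32Vector(llvm::LLVMContext &Ctx, unsigned NumElts) {
  llvm::EVT EltVT = llvm::EVT::getIntegerVT(Ctx, 32);
  llvm::EVT VecVT = llvm::EVT::getVectorVT(Ctx, EltVT, NumElts);
  assert(VecVT.isVector() && VecVT.getVectorNumElements() == NumElts);
  assert(VecVT.getScalarType() == EltVT);
  return VecVT;
}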
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
bool isUnknown() const
Returns true if we don't know any bits.
void resetAll()
Resets the known state of all bits.
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
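A small sketch of the KnownBits queries above, deciding whether a value is provably representable in 32 bits; in real code the KnownBits input would come from a known-bits analysis.
#include "llvm/Support/KnownBits.h"
static bool fitsInLow32(const llvm::KnownBits &Known) {
  if (Known.isUnknown())
    return false;                          // nothing is known about any bit
  return Known.countMaxActiveBits() <= 32; // at most 32 significant bits
}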
This class contains a discriminated union of information about pointers in memory operands,...
static MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
int64_t Offset
Offset - This is an offset from the base Value*.
PointerUnion< const Value *, const PseudoSourceValue * > V
This is the IR pointer value for the access, or it is null if unknown.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
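A hedged sketch of how the MachinePointerInfo factories above feed a memory operand for a spill slot; the size, alignment, and helper name are illustrative:
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/Support/Alignment.h"
static llvm::MachineMemOperand *spillSlotMMO(llvm::MachineFunction &MF, int FI) {
  llvm::MachinePointerInfo PtrInfo =
      llvm::MachinePointerInfo::getFixedStack(MF, FI);
  return MF.getMachineMemOperand(PtrInfo, llvm::MachineMemOperand::MOStore,
                                 /*Size=*/4, llvm::Align(4));
}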
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
bool DX10Clamp
Used by the vector ALU to force DX10-style treatment of NaNs: when set, clamp NaN to zero; otherwise,...
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const