#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-legalinfo"

using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
using namespace MIPatternMatch;

    "amdgpu-global-isel-new-legality",
    cl::desc("Use GlobalISel desired legality, rather than try to use "
             "rules compatible with selection patterns"),
  const LLT Ty = Query.Types[TypeIdx];

  EltSize > 1 && EltSize < 32 &&

  const LLT Ty = Query.Types[TypeIdx];

  const LLT Ty = Query.Types[TypeIdx];

  const LLT Ty = Query.Types[TypeIdx];
  return std::pair(TypeIdx,

  const LLT Ty = Query.Types[TypeIdx];
  unsigned Pieces = (Size + 63) / 64;

  const LLT Ty = Query.Types[TypeIdx];
  const int NextMul32 = (Size + 31) / 32;
  const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;

  const LLT Ty = Query.Types[TypeIdx];
  assert(EltSize == 32 || EltSize == 64);
  for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
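// Note on the arithmetic above: (Size + 63) / 64 and (Size + 31) / 32 are
// the usual round-up-division idiom. For example, a 96-bit type needs
// (96 + 63) / 64 = 2 64-bit pieces, and (96 + 31) / 32 = 3 is its size in
// dwords, so NewNumElts re-derives an element count that fills whole dwords.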
  const LLT Ty = Query.Types[TypeIdx];

  const LLT Ty = Query.Types[TypeIdx];

  const LLT QueryTy = Query.Types[TypeIdx];

  const LLT QueryTy = Query.Types[TypeIdx];

  const LLT QueryTy = Query.Types[TypeIdx];

  return EltSize == 16 || EltSize % 32 == 0;

  return EltSize == 32 || EltSize == 64 ||
         EltSize == 128 || EltSize == 256;

  LLT Ty = Query.Types[TypeIdx];

  const LLT QueryTy = Query.Types[TypeIdx];

  const LLT Ty = Query.Types[TypeIdx];
         Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();

                                    bool IsLoad, bool IsAtomic) {
    return ST.enableFlatScratch() ? 128 : 32;

    return ST.useDS128() ? 128 : 64;

    return IsLoad ? 512 : 128;

    return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
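// Taken together, the returns above encode the per-address-space access
// width limits used by the load/store rules below: private scratch allows
// 128-bit accesses only with flat scratch (or multi-dword addressing /
// atomics), LDS allows 128 bits only with useDS128(), and loads may be up
// to 512 bits where stores are capped at 128.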
  const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;

  unsigned AS = Query.Types[1].getAddressSpace();

  if (IsLoad && MemSize < Size)
    MemSize = std::max(MemSize, Align);

                      AtomicOrdering::NotAtomic))

  if (!ST.hasDwordx3LoadStores())

  if (AlignBits < MemSize) {
                                Align(AlignBits / 8)))

  return EltSize != 32 && EltSize != 64;

  if (Size != MemSizeInBits)

                              uint64_t AlignInBits, unsigned AddrSpace,

  if (SizeInBits == 96 && ST.hasDwordx3LoadStores())

  if (AlignInBits < RoundedSize)

      RoundedSize, AddrSpace, Align(AlignInBits / 8),

  if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)

                               Query.Types[1].getAddressSpace(), Opcode);
  const unsigned NumParts = PointerTy.getSizeInBits() / 32;

    Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
    std::array<Register, 4> VectorElems;
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    for (unsigned I = 0; I < NumParts; ++I)
          B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
    B.buildMergeValues(MO, VectorElems);

  Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
  B.buildIntToPtr(MO, Scalar);

  const unsigned NumParts = PointerTy.getSizeInBits() / 32;
    auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
    for (unsigned I = 0; I < NumParts; ++I)

    return B.buildBuildVector(VectorTy, PointerParts).getReg(0);

  Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
  return B.buildBitcast(VectorTy, Scalar).getReg(0);
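// These two helpers are inverses of each other: a pointer wider than 64
// bits (e.g. a 160-bit buffer fat pointer) round-trips through a vector of
// s32 parts via unmerge/build_vector, while a pointer that fits a scalar
// round-trips through ptrtoint plus bitcast.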
  using namespace TargetOpcode;

  auto GetAddrSpacePtr = [&TM](unsigned AS) {

  const LLT BufferStridedPtr =

  const LLT CodePtr = FlatPtr;

  const std::initializer_list<LLT> AddrSpaces64 = {
    GlobalPtr, ConstantPtr, FlatPtr
  };

  const std::initializer_list<LLT> AddrSpaces32 = {
    LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
  };

  const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};

  const std::initializer_list<LLT> FPTypesBase = {

  const std::initializer_list<LLT> FPTypes16 = {

  const std::initializer_list<LLT> FPTypesPK16 = {
      .clampMaxNumElementsStrict(0, S16, 2)

      .clampMaxNumElementsStrict(0, S16, 2)

      .clampMaxNumElementsStrict(0, S16, 2)

      .clampMaxNumElementsStrict(0, S16, 2)

      .minScalarOrElt(0, S16)

      .widenScalarToNextMultipleOf(0, 32)

      .widenScalarToNextMultipleOf(0, 32)

      .widenScalarToNextMultipleOf(0, 32);

      .minScalarOrElt(0, S32)

      {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})

      .clampMaxNumElements(0, S8, 2)

      {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})

                 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})

      .clampScalar(0, S16, S64);

      {G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
       G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})

       G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});

      .legalFor(FPTypesPK16)

      .clampScalar(0, S16, S64);

      .clampScalar(0, S32, S64);

      .clampScalar(0, S32, S64);

      .clampScalar(0, S32, S64)
      .clampScalar(1, S32, S32)

      .clampScalar(1, S32, S32)

    FMad.customFor({S32, S16});

    FMad.customFor({S32});

    FMad.customFor({S16});

  FRem.minScalar(0, S32)

      .clampMaxNumElements(0, S16, 2)

      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(1, 32);
  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)

  getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S16, S64)

    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)

    getActionDefinitionsBuilder(
        {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
        .clampScalar(0, S32, S64)

  getActionDefinitionsBuilder(G_PTR_ADD)
      .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
      .scalarSameSizeAs(1, 0);

  getActionDefinitionsBuilder(G_PTRMASK)
      .scalarSameSizeAs(1, 0)

      getActionDefinitionsBuilder(G_ICMP)

          .legalForCartesianProduct(
              {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
          .legalForCartesianProduct(
              {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
  if (ST.has16BitInsts()) {
    CmpBuilder.legalFor({{S1, S16}});

      .widenScalarToNextPow2(1)
      .clampScalar(1, S32, S64)

      getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
          {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);

  if (ST.hasSALUFloatInsts())
    FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});

      .widenScalarToNextPow2(1)
      .clampScalar(1, S32, S64)

  auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
  if (ST.has16BitInsts())
    ExpOps.customFor({{S32}, {S16}});

    ExpOps.customFor({S32});
  ExpOps.clampScalar(0, MinScalarFPTy, S32)

  getActionDefinitionsBuilder(G_FPOWI)
      .clampScalar(0, MinScalarFPTy, S32)
  auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
  Log2Ops.customFor({S32});
  if (ST.has16BitInsts())
    Log2Ops.legalFor({S16});

    Log2Ops.customFor({S16});
  Log2Ops.scalarize(0)

      getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
  LogOps.customFor({S32, S16});
  LogOps.clampScalar(0, MinScalarFPTy, S32)

  getActionDefinitionsBuilder(G_CTPOP)
      .clampScalar(0, S32, S32)
      .widenScalarToNextPow2(1, 32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32);

  if (ST.has16BitInsts())
    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypes16)
        .widenScalarToNextPow2(1)

    getActionDefinitionsBuilder(G_IS_FPCLASS)
        .legalForCartesianProduct({S1}, FPTypesBase)
        .lowerFor({S1, S16})
        .widenScalarToNextPow2(1)

  getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32)

  getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
      .clampScalar(0, S32, S32)
      .clampScalar(1, S32, S64)
      .widenScalarToNextPow2(0, 32)
      .widenScalarToNextPow2(1, 32);
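// Example of how these rules compose: an s16 G_CTLZ has its s16 result
// clamped up to s32 (clampScalar(0, S32, S32)) and its s16 input widened to
// 32 bits (widenScalarToNextPow2(1, 32) plus the clamp on type index 1), so
// the selected instruction always operates on 32-bit registers.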
  getActionDefinitionsBuilder(G_BITREVERSE)
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0);

  if (ST.has16BitInsts()) {
    getActionDefinitionsBuilder(G_BSWAP)
        .clampMaxNumElementsStrict(0, S16, 2)
        .widenScalarToNextPow2(0)
        .clampScalar(0, S16, S32)

    if (ST.hasVOP3PInsts()) {
      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
          .clampMaxNumElements(0, S16, 2)
          .widenScalarToNextPow2(0)

      getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
          .widenScalarToNextPow2(0)

    getActionDefinitionsBuilder(G_BSWAP)
        .widenScalarToNextPow2(0)

    getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
        .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder(G_INTTOPTR)
      .legalForCartesianProduct(AddrSpaces64, {S64})
      .legalForCartesianProduct(AddrSpaces32, {S32})

  getActionDefinitionsBuilder(G_PTRTOINT)
      .legalForCartesianProduct(AddrSpaces64, {S64})
      .legalForCartesianProduct(AddrSpaces32, {S32})

  getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
  const auto needToSplitMemOp = [=](const LegalityQuery &Query,
                                    bool IsLoad) -> bool {

    unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

    unsigned NumRegs = (MemSize + 31) / 32;

      if (!ST.hasDwordx3LoadStores())

  unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
  unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
  unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;

  for (unsigned Op : {G_LOAD, G_STORE}) {
    const bool IsStore = Op == G_STORE;

    auto &Actions = getActionDefinitionsBuilder(Op);

    Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
                                      {S64, GlobalPtr, S64, GlobalAlign32},
                                      {S32, GlobalPtr, S8, GlobalAlign8},
                                      {S32, GlobalPtr, S16, GlobalAlign16},
                                      {S32, LocalPtr, S32, 32},
                                      {S64, LocalPtr, S64, 32},
                                      {S32, LocalPtr, S8, 8},
                                      {S32, LocalPtr, S16, 16},
                                      {S32, PrivatePtr, S32, 32},
                                      {S32, PrivatePtr, S8, 8},
                                      {S32, PrivatePtr, S16, 16},
                                      {S32, ConstantPtr, S32, GlobalAlign32},
                                      {S64, ConstantPtr, S64, GlobalAlign32},
                                      {V2S32, ConstantPtr, V2S32, GlobalAlign32}});

    Actions.unsupportedIf(
        typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));

    Actions.customIf(typeIs(1, Constant32Ptr));

          return !Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);

        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {

          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();

          if (DstSize > MemSize)

          if (MemSize > MaxSize)

          return Query.Types[0].isVector() &&
                 needToSplitMemOp(Query, Op == G_LOAD);

        [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {

          unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
          if (MemSize > MaxSize) {

            if (MaxSize % EltSize == 0) {

            unsigned NumPieces = MemSize / MaxSize;

            if (NumPieces == 1 || NumPieces >= NumElts ||
                NumElts % NumPieces != 0)
              return std::pair(0, EltTy);

            return std::pair(0, EltTy);

          return std::pair(0, EltTy);
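// Sketch of the fragmented mutation above: when a vector load or store is
// wider than the address space allows, it is split into NumPieces smaller
// vectors when the element size divides MaxSize evenly, and otherwise falls
// back to fully scalarizing (the std::pair(0, EltTy) cases), letting later
// rules re-legalize each piece.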
        .widenScalarToNextPow2(0)

  auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
                       .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
                                                  {S32, GlobalPtr, S16, 2 * 8},
                                                  {S32, LocalPtr, S8, 8},
                                                  {S32, LocalPtr, S16, 16},
                                                  {S32, PrivatePtr, S8, 8},
                                                  {S32, PrivatePtr, S16, 16},
                                                  {S32, ConstantPtr, S8, 8},
                                                  {S32, ConstantPtr, S16, 2 * 8}})

  if (ST.hasFlatAddressSpace()) {
    ExtLoads.legalForTypesWithMemDesc(
        {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});

  ExtLoads.customIf(typeIs(1, Constant32Ptr));

  ExtLoads.clampScalar(0, S32, S32)
      .widenScalarToNextPow2(0)

  auto &Atomics = getActionDefinitionsBuilder(
      {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
       G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
       G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
       G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
      .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
                 {S64, GlobalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
  if (ST.hasFlatAddressSpace()) {
    Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});

  auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
  if (ST.hasLDSFPAtomicAddF32()) {
    Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
    if (ST.hasLdsAtomicAddF64())
      Atomic.legalFor({{S64, LocalPtr}});
    if (ST.hasAtomicDsPkAdd16Insts())
      Atomic.legalFor({{V2S16, LocalPtr}});

  if (ST.hasAtomicFaddInsts())
    Atomic.legalFor({{S32, GlobalPtr}});
  if (ST.hasFlatAtomicFaddF32Inst())
    Atomic.legalFor({{S32, FlatPtr}});

  if (ST.hasGFX90AInsts()) {

  getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
      .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
                  {S32, FlatPtr}, {S64, FlatPtr}})
      .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
                 {S32, RegionPtr}, {S64, RegionPtr}});
  getActionDefinitionsBuilder(G_SELECT)
                 LocalPtr, FlatPtr, PrivatePtr,

      .clampScalar(0, S16, S64)

      .clampMaxNumElements(0, S32, 2)
      .clampMaxNumElements(0, LocalPtr, 2)
      .clampMaxNumElements(0, PrivatePtr, 2)

      .widenScalarToNextPow2(0)

  auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})

  if (ST.has16BitInsts()) {
    if (ST.hasVOP3PInsts()) {
          .clampMaxNumElements(0, S16, 2);

      Shifts.legalFor({{S16, S16}});

    Shifts.widenScalarIf(
          const LLT AmountTy = Query.Types[1];

    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 16);
    Shifts.clampScalar(0, S16, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})

    Shifts.clampScalar(1, S32, S32);
    Shifts.widenScalarToNextPow2(0, 32);
    Shifts.clampScalar(0, S32, S64);

    getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})

  Shifts.scalarize(0);
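// Shift legalization summary: the shift amount (type index 1) is clamped to
// s32 on both paths; with 16-bit instructions the value operand may stay at
// s16 ({{S16, S16}} is legal), while without them everything is widened to
// at least 32 bits before selection.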
  for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
    unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
    unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
    unsigned IdxTypeIdx = 2;

    getActionDefinitionsBuilder(Op)

          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];
          const LLT IdxTy = Query.Types[IdxTypeIdx];

          const bool isLegalVecType =

          return (EltSize == 32 || EltSize == 64) &&

          const LLT EltTy = Query.Types[EltTypeIdx];
          const LLT VecTy = Query.Types[VecTypeIdx];

          const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;

        .clampScalar(EltTypeIdx, S32, S64)
        .clampScalar(VecTypeIdx, S32, S64)
        .clampScalar(IdxTypeIdx, S32, S32)
        .clampMaxNumElements(VecTypeIdx, S32, 32)

  getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)

        const LLT &EltTy = Query.Types[1].getElementType();
        return Query.Types[0] != EltTy;

  for (unsigned Op : {G_EXTRACT, G_INSERT}) {
    unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
    unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;

    getActionDefinitionsBuilder(Op)

          const LLT BigTy = Query.Types[BigTyIdx];

          const LLT BigTy = Query.Types[BigTyIdx];
          const LLT LitTy = Query.Types[LitTyIdx];

          const LLT BigTy = Query.Types[BigTyIdx];

          const LLT LitTy = Query.Types[LitTyIdx];

        .widenScalarToNextPow2(BigTyIdx, 32);
  auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)

  if (ST.hasScalarPackInsts()) {
        .minScalarOrElt(0, S16)

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)

    BuildVector.customFor({V2S16, S16});
    BuildVector.minScalarOrElt(0, S32);

    getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)

  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
      .clampMaxNumElements(0, S32, 32)
      .clampMaxNumElements(1, S16, 2)
      .clampMaxNumElements(0, S16, 64);

  getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();

  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;

    auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
      const LLT Ty = Query.Types[TypeIdx];

    auto &Builder = getActionDefinitionsBuilder(Op)

          const LLT BigTy = Query.Types[BigTyIdx];

        .widenScalarToNextPow2(LitTyIdx, 16)

        .clampScalar(LitTyIdx, S32, S512)
        .widenScalarToNextPow2(LitTyIdx, 32)

            [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },

            [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },

    if (Op == G_MERGE_VALUES) {
      Builder.widenScalarIf(

            const LLT Ty = Query.Types[LitTyIdx];

    Builder.widenScalarIf(

          const LLT Ty = Query.Types[BigTyIdx];

          const LLT &Ty = Query.Types[BigTyIdx];

          if (NewSizeInBits >= 256) {
            if (RoundedTo < NewSizeInBits)
              NewSizeInBits = RoundedTo;

          return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));

  auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
                        .legalFor({{S32}, {S64}});

  if (ST.hasVOP3PInsts()) {
    SextInReg.lowerFor({{V2S16}})
        .clampMaxNumElementsStrict(0, S16, 2);
  } else if (ST.has16BitInsts()) {
    SextInReg.lowerFor({{S32}, {S64}, {S16}});

    SextInReg.lowerFor({{S32}, {S64}});

      .clampScalar(0, S32, S64)

  getActionDefinitionsBuilder({G_ROTR, G_ROTL})

  getActionDefinitionsBuilder(G_FSHR)
      .clampMaxNumElementsStrict(0, S16, 2)

  if (ST.hasVOP3PInsts()) {
    getActionDefinitionsBuilder(G_FSHL)
        .clampMaxNumElementsStrict(0, S16, 2)

    getActionDefinitionsBuilder(G_FSHL)

  getActionDefinitionsBuilder(G_READCYCLECOUNTER)

  getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});

  getActionDefinitionsBuilder(G_FENCE)

  getActionDefinitionsBuilder({G_SMULO, G_UMULO})

  getActionDefinitionsBuilder({G_SBFX, G_UBFX})
      .clampScalar(1, S32, S32)
      .clampScalar(0, S32, S64)
      .widenScalarToNextPow2(0)

  getActionDefinitionsBuilder(
       G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
       G_READ_REGISTER, G_WRITE_REGISTER,

  if (ST.hasIEEEMinMax()) {
    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
        .legalFor(FPTypesPK16)
        .clampMaxNumElements(0, S16, 2)

    getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();

  getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})

  getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();

  getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
                               G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
                               G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})

  getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();

  getLegacyLegalizerInfo().computeTables();
  switch (MI.getOpcode()) {
  case TargetOpcode::G_ADDRSPACE_CAST:
  case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
  case TargetOpcode::G_FCEIL:
  case TargetOpcode::G_FREM:
  case TargetOpcode::G_INTRINSIC_TRUNC:
  case TargetOpcode::G_SITOFP:
  case TargetOpcode::G_UITOFP:
  case TargetOpcode::G_FPTOSI:
  case TargetOpcode::G_FPTOUI:
  case TargetOpcode::G_FMINNUM:
  case TargetOpcode::G_FMAXNUM:
  case TargetOpcode::G_FMINNUM_IEEE:
  case TargetOpcode::G_FMAXNUM_IEEE:
  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
  case TargetOpcode::G_INSERT_VECTOR_ELT:
  case TargetOpcode::G_FSIN:
  case TargetOpcode::G_FCOS:
  case TargetOpcode::G_GLOBAL_VALUE:
  case TargetOpcode::G_LOAD:
  case TargetOpcode::G_SEXTLOAD:
  case TargetOpcode::G_ZEXTLOAD:
  case TargetOpcode::G_STORE:
  case TargetOpcode::G_FMAD:
  case TargetOpcode::G_FDIV:
  case TargetOpcode::G_FFREXP:
  case TargetOpcode::G_FSQRT:
  case TargetOpcode::G_UDIV:
  case TargetOpcode::G_UREM:
  case TargetOpcode::G_UDIVREM:
  case TargetOpcode::G_SDIV:
  case TargetOpcode::G_SREM:
  case TargetOpcode::G_SDIVREM:
  case TargetOpcode::G_ATOMIC_CMPXCHG:
  case TargetOpcode::G_FLOG2:
  case TargetOpcode::G_FLOG:
  case TargetOpcode::G_FLOG10:
  case TargetOpcode::G_FEXP2:
  case TargetOpcode::G_FEXP:
  case TargetOpcode::G_FEXP10:
  case TargetOpcode::G_FPOW:
  case TargetOpcode::G_FFLOOR:
  case TargetOpcode::G_BUILD_VECTOR:
  case TargetOpcode::G_BUILD_VECTOR_TRUNC:
  case TargetOpcode::G_MUL:
  case TargetOpcode::G_CTLZ:
  case TargetOpcode::G_CTTZ:
  case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
  case TargetOpcode::G_STACKSAVE:
  case TargetOpcode::G_GET_FPENV:
  case TargetOpcode::G_SET_FPENV:
  case TargetOpcode::G_TRAP:
  case TargetOpcode::G_DEBUGTRAP:
  if (ST.hasApertureRegs()) {
            ? AMDGPU::SRC_SHARED_BASE
            : AMDGPU::SRC_PRIVATE_BASE;

    MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
    B.buildInstr(AMDGPU::S_MOV_B64, {Dst}, {Register(ApertureRegNo)});
    return B.buildUnmerge(S32, Dst).getReg(1);

  Register LoadAddr = MRI.createGenericVirtualRegister(

        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);

    Register KernargPtrReg = MRI.createGenericVirtualRegister(

    B.buildPtrAdd(LoadAddr, KernargPtrReg,

    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  Register QueuePtr = MRI.createGenericVirtualRegister(

  B.buildPtrAdd(LoadAddr, QueuePtr,
                B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
  return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);

  switch (Def->getOpcode()) {
  case AMDGPU::G_FRAME_INDEX:
  case AMDGPU::G_GLOBAL_VALUE:
  case AMDGPU::G_BLOCK_ADDR:

  case AMDGPU::G_CONSTANT: {
    const ConstantInt *CI = Def->getOperand(1).getCImm();

  assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
         (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
                                     Intrinsic::amdgcn_addrspacecast_nonnull));

  Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
                                     : MI.getOperand(1).getReg();
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
    MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));

      B.buildExtract(Dst, Src, 0);
      MI.eraseFromParent();

    unsigned NullVal = TM.getNullPointerValue(DestAS);

    auto SegmentNull = B.buildConstant(DstTy, NullVal);
    auto FlatNull = B.buildConstant(SrcTy, 0);

    auto PtrLo32 = B.buildExtract(DstTy, Src, 0);

    B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
    MI.eraseFromParent();

    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);

    auto BuildPtr = B.buildMergeLikeInstr(DstTy, {SrcAsInt, ApertureReg});

      B.buildCopy(Dst, BuildPtr);
      MI.eraseFromParent();

    auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
    auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));

                              SegmentNull.getReg(0));

    B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
    MI.eraseFromParent();

    B.buildExtract(Dst, Src, 0);
    MI.eraseFromParent();

    auto PtrLo = B.buildPtrToInt(S32, Src);
    auto HighAddr = B.buildConstant(S32, AddrHiVal);
    B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
    MI.eraseFromParent();

      MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());

  Ctx.diagnose(InvalidAddrSpaceCast);

  MI.eraseFromParent();
  LLT Ty = MRI.getType(Src);

  auto C1 = B.buildFConstant(Ty, C1Val);
  auto CopySign = B.buildFCopysign(Ty, C1, Src);

  auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
  auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);

  auto C2 = B.buildFConstant(Ty, C2Val);
  auto Fabs = B.buildFAbs(Ty, Src);

  B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
  MI.eraseFromParent();

  auto Trunc = B.buildIntrinsicTrunc(S64, Src);

  const auto Zero = B.buildFConstant(S64, 0.0);
  const auto One = B.buildFConstant(S64, 1.0);

  auto And = B.buildAnd(S1, Lt0, NeTrunc);
  auto Add = B.buildSelect(S64, And, One, Zero);

  B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
  MI.eraseFromParent();

  Register Src0Reg = MI.getOperand(1).getReg();
  Register Src1Reg = MI.getOperand(2).getReg();
  auto Flags = MI.getFlags();
  LLT Ty = MRI.getType(DstReg);

  auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
  auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
  auto Neg = B.buildFNeg(Ty, Trunc, Flags);
  B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
  MI.eraseFromParent();

  const unsigned FractBits = 52;
  const unsigned ExpBits = 11;

  auto Const0 = B.buildConstant(S32, FractBits - 32);
  auto Const1 = B.buildConstant(S32, ExpBits);

  auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
                     .addUse(Const0.getReg(0))
                     .addUse(Const1.getReg(0));

  return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
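// The ubfe above extracts the 11 exponent bits of an IEEE-754 double from
// its high dword (they start at bit 52 - 32 = 20 of the high word), and
// subtracting the bias 1023 yields the unbiased exponent.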
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);

  const unsigned FractBits = 52;

  const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
  auto SignBit = B.buildAnd(S32, Hi, SignBitMask);

  const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);

  const auto Zero32 = B.buildConstant(S32, 0);

  auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});

  auto Shr = B.buildAShr(S64, FractMask, Exp);
  auto Not = B.buildNot(S64, Shr);
  auto Tmp0 = B.buildAnd(S64, Src, Not);

  auto FiftyOne = B.buildConstant(S32, FractBits - 1);

  auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
  B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
  MI.eraseFromParent();
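// Classic bit-twiddling f64 trunc: for exponents in [0, 51] the fraction
// bits below (52 - exponent) are masked off (FractMask >> Exp, inverted);
// for exponent < 0 the result is just the sign bit (+/-0.0 via SignBit64);
// for exponent > 51 the value is already integral and passes through.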
  auto Unmerge = B.buildUnmerge({S32, S32}, Src);
  auto ThirtyTwo = B.buildConstant(S32, 32);

  if (MRI.getType(Dst) == S64) {
    auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
                        : B.buildUITOFP(S64, Unmerge.getReg(1));

    auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
    auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);

    B.buildFAdd(Dst, LdExp, CvtLo);
    MI.eraseFromParent();

  auto One = B.buildConstant(S32, 1);

    auto ThirtyOne = B.buildConstant(S32, 31);
    auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
    auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
    auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
    auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
                  .addUse(Unmerge.getReg(1));
    auto LS2 = B.buildSub(S32, LS, One);
    ShAmt = B.buildUMin(S32, LS2, MaxShAmt);

    ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));

  auto Norm = B.buildShl(S64, Src, ShAmt);
  auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
  auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
  auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
  auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
  auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
  B.buildFLdexp(Dst, FVal, Scale);
  MI.eraseFromParent();
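// For s64 -> f32 the input is first normalized: a shift amount is chosen
// (CTLZ of the high word, or amdgcn_sffbh for the signed case) so the
// significant bits land in the high word, the low word is folded in as a
// sticky bit (Adjust), and ldexp(FVal, 32 - ShAmt) undoes the scaling.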
  const LLT SrcLT = MRI.getType(Src);

  unsigned Flags = MI.getFlags();

  auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);

    Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
    Trunc = B.buildFAbs(S32, Trunc, Flags);

    K0 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(0x3df0000000000000)));
    K1 = B.buildFConstant(
        S64, llvm::bit_cast<double>(UINT64_C(0xc1f0000000000000)));

    K0 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(0x2f800000)));
    K1 = B.buildFConstant(
        S32, llvm::bit_cast<float>(UINT32_C(0xcf800000)));

  auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
  auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
  auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);

                     : B.buildFPTOUI(S32, FloorMul);
  auto Lo = B.buildFPTOUI(S32, Fma);

    Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});

    B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),

    B.buildMergeLikeInstr(Dst, {Lo, Hi});

  MI.eraseFromParent();
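// K0/K1 are 2^-32 and -2^32 in the source format, so Hi = floor(x * 2^-32)
// and Lo = fma(Hi, -2^32, x) recovers the low 32 bits exactly; the signed
// path strips the sign first and restores it at the end with the
// xor-with-sign-mask and subtract idiom.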
  const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
                        MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE;

  LLT VecTy = MRI.getType(Vec);

    auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
    auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
    B.buildIntToPtr(Dst, IntElt);

    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =

  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

    auto Unmerge = B.buildUnmerge(EltTy, Vec);
    B.buildCopy(Dst, Unmerge.getReg(IdxVal));

  MI.eraseFromParent();

  LLT VecTy = MRI.getType(Vec);

    auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
    auto IntIns = B.buildPtrToInt(IntTy, Ins);
    auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
    B.buildIntToPtr(Dst, IntVecDest);
    MI.eraseFromParent();

  std::optional<ValueAndVReg> MaybeIdxVal =

  const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();

  if (IdxVal < NumElts) {
    for (unsigned i = 0; i < NumElts; ++i)
      SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
    B.buildUnmerge(SrcRegs, Vec);

    SrcRegs[IdxVal] = MI.getOperand(2).getReg();
    B.buildMergeLikeInstr(Dst, SrcRegs);

  MI.eraseFromParent();

  LLT Ty = MRI.getType(DstReg);
  unsigned Flags = MI.getFlags();

    auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
    TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
                  .addUse(MulVal.getReg(0))

    TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);

      Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;

  MI.eraseFromParent();
                                       unsigned GAFlags) const {
  assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");

      B.getMRI()->createGenericVirtualRegister(ConstPtrTy);

  if (!B.getMRI()->getRegClassOrNull(PCReg))
    B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);

    B.buildExtract(DstReg, PCReg, 0);

  Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
                        : MRI.createGenericVirtualRegister(S32);

  if (!MRI.getRegClassOrNull(AddrLo))
    MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);

  B.buildInstr(AMDGPU::S_MOV_B32)

  if (RequiresHighHalf) {
           "Must provide a 64-bit pointer type!");

    MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);

    B.buildInstr(AMDGPU::S_MOV_B32)

      if (!MRI.getRegClassOrNull(AddrDst))
        MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);

      B.buildMergeValues(AddrDst, {AddrLo, AddrHi});

      if (AddrDst != DstReg)
        B.buildCast(DstReg, AddrDst);
  } else if (AddrLo != DstReg) {
    B.buildCast(DstReg, AddrLo);

  LLT Ty = MRI.getType(DstReg);

        GV->getName() != "llvm.amdgcn.module.lds") {
          Fn, "local memory global used by non-kernel function",
          MI.getDebugLoc(),

      B.buildUndef(DstReg);
      MI.eraseFromParent();

    if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {

      auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
      B.buildIntToPtr(DstReg, Sz);
      MI.eraseFromParent();

                                                   *cast<GlobalVariable>(GV)));
    MI.eraseFromParent();

    MI.eraseFromParent();

    MI.eraseFromParent();

    MI.eraseFromParent();

  Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);

    auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
    B.buildExtract(DstReg, Load, 0);

    B.buildLoad(DstReg, GOTAddr, *GOTMMO);

  MI.eraseFromParent();
  LLT PtrTy = MRI.getType(PtrReg);

    auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
    MI.getOperand(1).setReg(Cast.getReg(0));

  if (MI.getOpcode() != AMDGPU::G_LOAD)

  LLT ValTy = MRI.getType(ValReg);

    if (WideMemSize == ValSize) {

      MI.setMemRefs(MF, {WideMMO});

    if (ValSize > WideMemSize)

      WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
      B.buildTrunc(ValReg, WideLoad).getReg(0);

        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildExtract(ValReg, WideLoad, 0);

        WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
        B.buildDeleteTrailingVectorElements(ValReg, WideLoad);

  MI.eraseFromParent();

  Register DataReg = MI.getOperand(0).getReg();
  LLT DataTy = MRI.getType(DataReg);

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

         "this should not have been custom lowered");

  LLT ValTy = MRI.getType(CmpVal);

  Register PackedVal = B.buildBuildVector(VecTy, {NewVal, CmpVal}).getReg(0);

  B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
      .setMemRefs(MI.memoperands());

  MI.eraseFromParent();

  case TargetOpcode::G_INTRINSIC: {
    case Intrinsic::amdgcn_frexp_mant:

  case TargetOpcode::G_FFREXP: {

  case TargetOpcode::G_FPEXT: {
std::pair<Register, Register>
                                                  unsigned Flags) const {

  auto SmallestNormal = B.buildFConstant(
  auto IsLtSmallestNormal =

  auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
  auto One = B.buildFConstant(F32, 1.0);
      B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
  auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);

  return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};

  LLT Ty = B.getMRI()->getType(Dst);
  unsigned Flags = MI.getFlags();

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);

    MI.eraseFromParent();

    B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})

    MI.eraseFromParent();

  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
                  .addUse(ScaledInput)

  auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
  auto Zero = B.buildFConstant(Ty, 0.0);
      B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
  B.buildFSub(Dst, Log2, ResultOffset, Flags);

  MI.eraseFromParent();

  auto FMul = B.buildFMul(Ty, X, Y, Flags);
  return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);

  const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
  assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);

  unsigned Flags = MI.getFlags();
  const LLT Ty = MRI.getType(X);

      TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {

      Register LogVal = MRI.createGenericVirtualRegister(F32);
      auto PromoteSrc = B.buildFPExt(F32, X);

      B.buildFPTrunc(Dst, LogVal);

    MI.eraseFromParent();

      B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);

    const float c_log10 = 0x1.344134p-2f;
    const float cc_log10 = 0x1.09f79ep-26f;

    const float c_log = 0x1.62e42ep-1f;
    const float cc_log = 0x1.efa39ep-25f;

    auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
    auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);

    R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
    auto NegR = B.buildFNeg(Ty, R, Flags);
    auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
    auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
    R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);

    const float ch_log10 = 0x1.344000p-2f;
    const float ct_log10 = 0x1.3509f6p-18f;

    const float ch_log = 0x1.62e000p-1f;
    const float ct_log = 0x1.0bfbe8p-15f;

    auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
    auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto YH = B.buildAnd(Ty, Y, MaskConst);
    auto YT = B.buildFSub(Ty, Y, YH, Flags);
    auto YTCT = B.buildFMul(Ty, YT, CT, Flags);

        getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
    R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);

  const bool IsFiniteOnly =

  if (!IsFiniteOnly) {

    auto Fabs = B.buildFAbs(Ty, Y);

    R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);

    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
    auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
    B.buildFSub(Dst, R, Shift, Flags);

    B.buildCopy(Dst, R);

  MI.eraseFromParent();
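// The c/cc (and ch/ct) pairs split ln(2), or log10(2) for G_FLOG10, into a
// head and a low-order tail so R = log2(x) * C can be refined with fma in
// extended precision; the ch/ct variant first masks y to its top mantissa
// bits (0xfffff000) for targets without fast fma. The fast path earlier
// skips all of this when afn/unsafe math allows a plain multiply.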
                                       unsigned Flags) const {
  const double Log2BaseInverted =

  LLT Ty = B.getMRI()->getType(Dst);

    auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})

    auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
    auto Zero = B.buildFConstant(Ty, 0.0);
        B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
    auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);

      B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);

      auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
      B.buildFAdd(Dst, Mul, ResultOffset, Flags);

                         ? B.buildFLog2(Ty, Src, Flags)
                         : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})

  auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
  B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);

  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Ext = B.buildFPExt(F32, Src, Flags);
    auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
                    .addUse(Ext.getReg(0))
    B.buildFPTrunc(Dst, Log2, Flags);

    MI.eraseFromParent();

    MI.eraseFromParent();

  auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
                                  RangeCheckConst, Flags);

  auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);
  auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
  auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(AddInput.getReg(0))

  auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
  auto One = B.buildFConstant(Ty, 1.0);
  auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
  B.buildFMul(Dst, Exp2, ResultScale, Flags);
  MI.eraseFromParent();

  LLT Ty = B.getMRI()->getType(Dst);

    auto Mul = B.buildFMul(Ty, X, Log2E, Flags);

          .addUse(Mul.getReg(0))

      B.buildFExp2(Dst, Mul.getReg(0), Flags);

  auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);

  auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
  auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
  auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);

  auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(ExpInput.getReg(0))

  auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
  auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
  B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
  const unsigned Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;

    MI.eraseFromParent();

    auto Ext = B.buildFPExt(F32, X, Flags);
    Register Lowered = MRI.createGenericVirtualRegister(F32);

    B.buildFPTrunc(Dst, Lowered, Flags);
    MI.eraseFromParent();

    MI.eraseFromParent();

  const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;

    const float cc_exp = 0x1.4ae0bep-26f;
    const float c_exp10 = 0x1.a934f0p+1f;
    const float cc_exp10 = 0x1.2f346ep-24f;

    auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
    PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
    auto NegPH = B.buildFNeg(Ty, PH, Flags);
    auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);

    auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
    PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);

    const float ch_exp = 0x1.714000p+0f;
    const float cl_exp = 0x1.47652ap-12f;

    const float ch_exp10 = 0x1.a92000p+1f;
    const float cl_exp10 = 0x1.4f0978p-11f;

    auto MaskConst = B.buildConstant(Ty, 0xfffff000);
    auto XH = B.buildAnd(Ty, X, MaskConst);
    auto XL = B.buildFSub(Ty, X, XH, Flags);

    auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
    PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);

    auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
    auto XLCL = B.buildFMul(Ty, XL, CL, Flags);

        getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
    PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);

  auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);

  auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
  auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);

  auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
                  .addUse(A.getReg(0))

  auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);

  auto UnderflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
  auto Zero = B.buildFConstant(Ty, 0.0);

  R = B.buildSelect(Ty, Underflow, Zero, R);

  auto OverflowCheckConst =
      B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);

  R = B.buildSelect(Ty, Overflow, Inf, R, Flags);

  B.buildCopy(Dst, R);
  MI.eraseFromParent();
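// exp/exp10 are reduced to exp2: x * log2(e) (or log2(10)) is split into an
// integral part E (rounded to nearest even) plus a small residue A, so the
// result is ldexp(exp2(A), E); the c/cc and ch/cl constant pairs carry the
// multiplication in extended precision, and the final selects clamp the
// result to 0 on underflow and +inf on overflow.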
  unsigned Flags = MI.getFlags();
  LLT Ty = B.getMRI()->getType(Dst);

    auto Log = B.buildFLog2(F32, Src0, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Log.getReg(0))
    B.buildFExp2(Dst, Mul, Flags);
  } else if (Ty == F16) {
    auto Log = B.buildFLog2(F16, Src0, Flags);
    auto Ext0 = B.buildFPExt(F32, Log, Flags);
    auto Ext1 = B.buildFPExt(F32, Src1, Flags);
    auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
                   .addUse(Ext0.getReg(0))
                   .addUse(Ext1.getReg(0))
    B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);

  MI.eraseFromParent();

    ModSrc = SrcFNeg->getOperand(1).getReg();

    ModSrc = SrcFAbs->getOperand(1).getReg();

    ModSrc = SrcFAbs->getOperand(1).getReg();

  Register OrigSrc = MI.getOperand(1).getReg();
  unsigned Flags = MI.getFlags();

         "this should not have been custom lowered");

  auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})

      B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));

  Register Min = MRI.createGenericVirtualRegister(F64);

    B.buildFMinNumIEEE(Min, Fract, Const, Flags);

    B.buildFMinNum(Min, Fract, Const, Flags);

    CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);

  auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
  B.buildFAdd(Dst, OrigSrc, NegFract, Flags);

  MI.eraseFromParent();

  if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {

    Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
    Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);

  auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
  B.buildBitcast(Dst, Merge);

  MI.eraseFromParent();
                                        bool UsePartialMad64_32,
                                        bool SeparateOddAlignedProducts) const {

  auto getZero32 = [&]() -> Register {
      Zero32 = B.buildConstant(S32, 0).getReg(0);

  auto getZero64 = [&]() -> Register {
      Zero64 = B.buildConstant(S64, 0).getReg(0);

  for (unsigned i = 0; i < Src0.size(); ++i) {

    if (CarryIn.empty())

    bool HaveCarryOut = true;

    if (CarryIn.size() == 1) {
        LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);

        CarryAccum = getZero32();

      CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
      for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
            B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])

        LocalAccum = getZero32();
        HaveCarryOut = false;

        B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
    LocalAccum = Add.getReg(0);

  auto buildMadChain =
    assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
           (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));

    if (LocalAccum.size() == 1 &&
        (!UsePartialMad64_32 || !CarryIn.empty())) {

        unsigned j1 = DstIndex - j0;
        if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {

        auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);

          LocalAccum[0] = Mul.getReg(0);

          if (CarryIn.empty()) {
            LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);

                B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())

      } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));

    if (j0 <= DstIndex) {
      bool HaveSmallAccum = false;

      if (LocalAccum[0]) {
        if (LocalAccum.size() == 1) {
          Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
          HaveSmallAccum = true;
        } else if (LocalAccum[1]) {
          Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
          HaveSmallAccum = false;

          Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
          HaveSmallAccum = true;

        assert(LocalAccum.size() == 1 || !LocalAccum[1]);
        HaveSmallAccum = true;

        unsigned j1 = DstIndex - j0;
        if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {

        auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
                                {Src0[j0], Src1[j1], Tmp});
        Tmp = Mad.getReg(0);
        if (!HaveSmallAccum)
          CarryOut.push_back(Mad.getReg(1));
        HaveSmallAccum = false;

      } while (j0 <= DstIndex);

      auto Unmerge = B.buildUnmerge(S32, Tmp);
      LocalAccum[0] = Unmerge.getReg(0);
      if (LocalAccum.size() > 1)
        LocalAccum[1] = Unmerge.getReg(1);

  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
    Carry OddCarryIn = std::move(OddCarry);
    Carry EvenCarryIn = std::move(EvenCarry);

    if (2 * i < Accum.size()) {
      auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
      EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);

      if (!SeparateOddAlignedProducts) {
        auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

        bool IsHighest = 2 * i >= Accum.size();

        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);

            Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);

            Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);

          Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],

        Accum[2 * i - 1] = Lo->getOperand(0).getReg();

          auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
                                 Lo->getOperand(1).getReg());
          Accum[2 * i] = Hi.getReg(0);
          SeparateOddCarry = Hi.getReg(1);

      if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
        EvenCarryIn.push_back(CarryOut);

      if (2 * i < Accum.size()) {
        if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
          OddCarry.push_back(CarryOut);

  assert(MI.getOpcode() == TargetOpcode::G_MUL);

  LLT Ty = MRI.getType(DstReg);

  unsigned NumParts = Size / 32;

  for (unsigned i = 0; i < NumParts; ++i) {

  B.buildUnmerge(Src0Parts, Src0);
  B.buildUnmerge(Src1Parts, Src1);

  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
                SeparateOddAlignedProducts);

  B.buildMergeLikeInstr(DstReg, AccumRegs);
  MI.eraseFromParent();
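// buildMultiply is grade-school multiplication on 32-bit limbs: every
// partial product Src0[j0] * Src1[j1] with j0 + j1 == DstIndex accumulates
// into Accum[DstIndex], using G_AMDGPU_MAD_U64_U32 (v_mad_u64_u32) to fold
// the 64-bit accumulate where profitable, with carries threaded separately
// through the even- and odd-aligned chains. A 64x64->64 multiply, for
// example, uses two source limbs and two accumulator limbs.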
  LLT DstTy = MRI.getType(Dst);
  LLT SrcTy = MRI.getType(Src);

  unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
                        ? AMDGPU::G_AMDGPU_FFBH_U32
                        : AMDGPU::G_AMDGPU_FFBL_B32;
  auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});

  MI.eraseFromParent();

  if (MI.getOpcode() != TargetOpcode::G_XOR)

  return ConstVal && *ConstVal == -1;

  Register CondDef = MI.getOperand(0).getReg();
  if (!MRI.hasOneNonDBGUse(CondDef))

    if (!MRI.hasOneNonDBGUse(NegatedCond))

    UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);

  if (Next == Parent->end()) {

    UncondBrTarget = &*NextMBB;

    if (Next->getOpcode() != AMDGPU::G_BR)

                                  *ArgRC, B.getDebugLoc(), ArgTy);

    const unsigned Mask = Arg->getMask();
    const unsigned Shift = llvm::countr_zero<unsigned>(Mask);

      auto ShiftAmt = B.buildConstant(S32, Shift);
      AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);

    B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));

    B.buildCopy(DstReg, LiveIn);

    Arg = &WorkGroupIDX;
    ArgRC = &AMDGPU::SReg_32RegClass;

    Arg = &WorkGroupIDY;
    ArgRC = &AMDGPU::SReg_32RegClass;

    Arg = &WorkGroupIDZ;
    ArgRC = &AMDGPU::SReg_32RegClass;

      B.buildConstant(DstReg, 0);

      B.buildUndef(DstReg);

  if (!Arg->isRegister() || !Arg->getRegister().isValid())

  MI.eraseFromParent();

  B.buildConstant(MI.getOperand(0).getReg(), C);
  MI.eraseFromParent();

    B.buildUndef(DstReg);
    MI.eraseFromParent();

  if (Arg->isMasked()) {

  MI.eraseFromParent();

  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);

  return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
                                            Align Alignment) const {
         "unexpected kernarg parameter type");

  B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),

  MI.eraseFromParent();

  LLT DstTy = MRI.getType(Dst);

  auto FloatY = B.buildUITOFP(S32, Y);
  auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
  auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
  auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
  auto Z = B.buildFPTOUI(S32, ScaledY);

  auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
  auto NegYZ = B.buildMul(S32, NegY, Z);
  Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));

  auto Q = B.buildUMulH(S32, X, Z);
  auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));

  auto One = B.buildConstant(S32, 1);

  Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);

    B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
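// 32-bit unsigned division via float reciprocal: 0x4f7ffffe is a float just
// below 2^32, so Z ~= floor((2^32 - eps) / Y). The refinement step
// Z += umulh(Z, -Y * Z) sharpens that estimate, and the trailing selects
// correct the quotient and remainder where the estimate was one short.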
  auto Unmerge = B.buildUnmerge(S32, Val);

  auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
  auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));

  auto Mad = B.buildFMAD(
      B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);

  auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
  auto Mul1 = B.buildFMul(
      S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));

  auto Mul2 = B.buildFMul(
      S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
  auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);

  auto Mad2 = B.buildFMAD(
      S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),

  auto ResultLo = B.buildFPTOUI(S32, Mad2);
  auto ResultHi = B.buildFPTOUI(S32, Trunc);

  return {ResultLo.getReg(0), ResultHi.getReg(0)};
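// Magic floats for the 64-bit reciprocal estimate: 0x4f800000 is 2^32
// (folds the high word in via fmad), 0x5f7ffffc sits just below 2^64, and
// 0x2f800000 / 0xcf800000 are 2^-32 / -2^32, splitting the wide reciprocal
// across the (ResultLo, ResultHi) register pair.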
  auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});

  auto Zero64 = B.buildConstant(S64, 0);
  auto NegDenom = B.buildSub(S64, Zero64, Denom);

  auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
  auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);

  auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
  Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
  Register MulHi1_Hi = UnmergeMulHi1.getReg(1);

  auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
  auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
  auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});

  auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
  auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
  auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
  Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
  Register MulHi2_Hi = UnmergeMulHi2.getReg(1);

  auto Zero32 = B.buildConstant(S32, 0);
  auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
  auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});

  auto UnmergeNumer = B.buildUnmerge(S32, Numer);
  Register NumerLo = UnmergeNumer.getReg(0);
  Register NumerHi = UnmergeNumer.getReg(1);

  auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
  auto Mul3 = B.buildMul(S64, Denom, MulHi3);
  auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
  Register Mul3_Lo = UnmergeMul3.getReg(0);
  Register Mul3_Hi = UnmergeMul3.getReg(1);
  auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
  auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
  auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
  auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});

  auto UnmergeDenom = B.buildUnmerge(S32, Denom);
  Register DenomLo = UnmergeDenom.getReg(0);
  Register DenomHi = UnmergeDenom.getReg(1);

  auto C1 = B.buildSExt(S32, CmpHi);

  auto C2 = B.buildSExt(S32, CmpLo);

  auto C3 = B.buildSelect(S32, CmpEq, C2, C1);

  auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
  auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
  auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
  auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});

  auto One64 = B.buildConstant(S64, 1);
  auto Add3 = B.buildAdd(S64, MulHi3, One64);

  auto C6 = B.buildSelect(

  auto Add4 = B.buildAdd(S64, Add3, One64);
  auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);

  auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
  auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
  auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});

  auto Sel1 = B.buildSelect(

  auto Sel2 = B.buildSelect(
  switch (MI.getOpcode()) {

  case AMDGPU::G_UDIV: {
    DstDivReg = MI.getOperand(0).getReg();

  case AMDGPU::G_UREM: {
    DstRemReg = MI.getOperand(0).getReg();

  case AMDGPU::G_UDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
  Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
  Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  MI.eraseFromParent();

  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
  if (Ty != S32 && Ty != S64)

  const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();

  auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
  auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);

  LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);

  LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
  RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);

  Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
  switch (MI.getOpcode()) {

  case AMDGPU::G_SDIV: {
    DstDivReg = MI.getOperand(0).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);

  case AMDGPU::G_SREM: {
    DstRemReg = MI.getOperand(0).getReg();
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);

  case AMDGPU::G_SDIVREM: {
    DstDivReg = MI.getOperand(0).getReg();
    DstRemReg = MI.getOperand(1).getReg();
    TmpDivReg = MRI.createGenericVirtualRegister(Ty);
    TmpRemReg = MRI.createGenericVirtualRegister(Ty);

    auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
    auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
    B.buildSub(DstDivReg, SignXor, Sign);

    auto Sign = LHSign.getReg(0);
    auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
    B.buildSub(DstRemReg, SignXor, Sign);

  MI.eraseFromParent();
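// Signed div/rem reduce to the unsigned helpers: LHSign/RHSign are all-ones
// or all-zeros sign masks (ashr by bit-width - 1), (x + sign) ^ sign
// computes |x| in two's complement, and the same xor/sub pattern afterwards
// restores the result sign (quotient: sign(LHS) ^ sign(RHS); remainder:
// sign(LHS)).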
4723 LLT ResTy =
MRI.getType(Res);
4730 if (!AllowInaccurateRcp && ResTy !=
LLT::scalar(16))
4741 if (CLHS->isExactlyValue(1.0)) {
4742 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4746 MI.eraseFromParent();
4751 if (CLHS->isExactlyValue(-1.0)) {
4752 auto FNeg =
B.buildFNeg(ResTy,
RHS, Flags);
4753 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
4754 .addUse(FNeg.getReg(0))
4757 MI.eraseFromParent();
4764 if (!AllowInaccurateRcp && (ResTy !=
LLT::scalar(16) ||
4769 auto RCP =
B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
4772 B.buildFMul(Res,
LHS, RCP, Flags);
4774 MI.eraseFromParent();
  LLT ResTy = MRI.getType(Res);

  if (!AllowInaccurateRcp)
    return false;

  auto NegY = B.buildFNeg(ResTy, Y);
  auto One = B.buildFConstant(ResTy, 1.0);

  auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})

  // First refinement step: R = R + R * (1 - Y * R).
  auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp0, R, R);

  // Second refinement step.
  auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
  R = B.buildFMA(ResTy, Tmp1, R, R);

  auto Ret = B.buildFMul(ResTy, X, R);
  auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);

  B.buildFMA(Res, Tmp2, R, Ret);
  MI.eraseFromParent();
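
// f16 division below is legalized by widening: both operands are extended
// to f32, the quotient is formed with the f32 rcp, truncated back to f16,
// and then run through amdgcn_div_fixup, which handles the special cases
// (infinities, zeros, nans) that the raw rcp sequence gets wrong.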
  auto LHSExt = B.buildFPExt(S32, LHS, Flags);
  auto RHSExt = B.buildFPExt(S32, RHS, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(RHSExt.getReg(0))

  auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
  auto RDst = B.buildFPTrunc(S16, QUOT, Flags);

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(RDst.getReg(0))

  MI.eraseFromParent();
  unsigned SPDenormMode =

  if (ST.hasDenormModeInst()) {
    uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();

    uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
    B.buildInstr(AMDGPU::S_DENORM_MODE)
        .addImm(NewDenormModeValue);
  } else {
    B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
        .addImm(SPDenormMode)
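
// Note on the encoding just above: in the denorm-mode field the two low
// bits select the f32 (SP) denormal mode and the next two bits the
// f64/f16 (DP) mode, so SPDenormMode | (DPDenormModeDefault << 2)
// reassembles the full four-bit immediate for S_DENORM_MODE.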
  auto One = B.buildFConstant(S32, 1.0f);

  auto DenominatorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

  auto NumeratorScaled =
      B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})

  auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                       .addUse(DenominatorScaled.getReg(0))

  auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);

  const bool HasDynamicDenormals =

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
      B.buildInstr(AMDGPU::S_GETREG_B32)
          .addDef(SavedSPDenormMode)

  auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
  auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
  auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
  auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
  auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
  auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);

  if (!PreservesDenormals) {
    if (HasDynamicDenormals) {
      assert(SavedSPDenormMode);
      B.buildInstr(AMDGPU::S_SETREG_B32)
          .addReg(SavedSPDenormMode)

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma1.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(NumeratorScaled.getReg(1))

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
      .addUse(Fmas.getReg(0))

  MI.eraseFromParent();
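
// The f64 expansion below follows the same div_scale / div_fmas /
// div_fixup recipe as the f32 case, with an extra refinement step. The
// Unmerge/Xor block appears to be a workaround for subtargets where
// div_scale's condition output is not usable: the scale condition for
// div_fmas is reconstructed by comparing the high halves of the scaled
// values against the original operands.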
  auto One = B.buildFConstant(S64, 1.0);

  auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

  auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);

  auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
                 .addUse(DivScale0.getReg(0))

  auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
  auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
  auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);

  auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})

  auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
  auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
  auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);

    auto NumUnmerge = B.buildUnmerge(S32, LHS);
    auto DenUnmerge = B.buildUnmerge(S32, RHS);
    auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
    auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);

        Scale1Unmerge.getReg(1));
        Scale0Unmerge.getReg(1));
    Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);

    Scale = DivScale1.getReg(1);

  auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
                  .addUse(Fma4.getReg(0))
                  .addUse(Fma3.getReg(0))
                  .addUse(Mul.getReg(0))

  B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
      .addUse(Fmas.getReg(0))

  MI.eraseFromParent();
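
// llvm.frexp below maps onto amdgcn_frexp_mant / amdgcn_frexp_exp. The
// selects on IsFinite give inf/nan inputs the usual libm frexp behavior:
// the mantissa result passes the input through unchanged and the exponent
// is forced to zero.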
  LLT Ty = MRI.getType(Res0);

  auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})

  auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})

  auto Fabs = B.buildFAbs(Ty, Val);

  auto Zero = B.buildConstant(InstrExpTy, 0);
  Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
  Mant = B.buildSelect(Ty, IsFinite, Mant, Val);

  B.buildCopy(Res0, Mant);
  B.buildSExtOrTrunc(Res1, Exp);

  MI.eraseFromParent();
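
// fdiv_fast below pre-scales the denominator so the f32 rcp never sees an
// argument near the top of the exponent range. Worked through: with
// Sel = (|rhs| > 2^96) ? 2^-32 : 1.0, the result computed is
// Sel * (lhs * rcp(rhs * Sel)), and algebraically
// Sel * lhs / (rhs * Sel) == lhs / rhs.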
  auto Abs = B.buildFAbs(S32, RHS, Flags);

  auto C0 = B.buildFConstant(S32, 0x1p+96f);
  auto C1 = B.buildFConstant(S32, 0x1p-32f);
  auto C2 = B.buildFConstant(S32, 1.0f);

  auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);

  auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);

  auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
                 .addUse(Mul0.getReg(0))

  auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);

  B.buildFMul(Res, Sel, Mul1, Flags);

  MI.eraseFromParent();
  unsigned Flags = MI.getFlags();

  auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
  auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
                  .addUse(Ext.getReg(0))

  B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
  MI.eraseFromParent();
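
// The f32 sqrt expansion below rescales inputs in the denormal range. The
// arithmetic behind the two constants: for x < 2^-96,
// sqrt(x * 2^32) == sqrt(x) * 2^16, so scaling the input by 0x1.0p+32 and
// the result by 0x1.0p-16 recovers sqrt(x) without operating on a value
// that would be flushed to zero.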
  const unsigned Flags = MI.getFlags();

  MI.eraseFromParent();

  auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);

  auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
  auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
  auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);

  Register SqrtS = MRI.createGenericVirtualRegister(F32);
        .addUse(SqrtX.getReg(0))

    auto NegOne = B.buildConstant(I32, -1);
    auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);

    auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
    auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);

    auto PosOne = B.buildConstant(I32, 1);
    auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);

    auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
    auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);

    auto Zero = B.buildFConstant(F32, 0.0f);

        B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);

        B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);

        B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
    B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);

    auto Half = B.buildFConstant(F32, 0.5f);
    auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
    auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
    auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
    SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
    SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
    auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
    auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
    SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);

  auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);

  auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);

  SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);

  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);

  MI.eraseFromParent();
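
// The f64 variant below plays the same scaling trick with ldexp: scaling
// the input by 2^256 scales the square root by 2^128, which the final
// ldexp by -128 undoes. The core is an rsq seed refined by FMA-based
// Newton-Raphson steps for both 1/(2*sqrt(x)) (the SqrtH chain) and
// sqrt(x) itself (the SqrtS chain).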
  assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");

  unsigned Flags = MI.getFlags();

  auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);

  auto ZeroInt = B.buildConstant(S32, 0);

  auto ScaleUpFactor = B.buildConstant(S32, 256);
  auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
  auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);

      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));

  auto Half = B.buildFConstant(F64, 0.5);
  auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
  auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);

  auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
  auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);

  auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
  auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);

  auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
  auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);

  auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);

  auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
  auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);

  auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);

  auto ScaleDownFactor = B.buildConstant(S32, -128);
  auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
  SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);

  B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);

  MI.eraseFromParent();
  LLT Ty = MRI.getType(MI.getOperand(0).getReg());

  auto Flags = MI.getFlags();

  LLT Ty = MRI.getType(Dst);

  auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})

  auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags)
                          : B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);

  if (UseIEEE)
    B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
  else
    B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
  MI.eraseFromParent();
  case Intrinsic::amdgcn_ds_fadd:
    return AMDGPU::G_ATOMICRMW_FADD;
  case Intrinsic::amdgcn_ds_fmin:
    return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
  case Intrinsic::amdgcn_ds_fmax:
    return AMDGPU::G_AMDGPU_ATOMIC_FMAX;

  for (int I = 6; I > 3; --I)
    MI.removeOperand(I);

  MI.removeOperand(1);
  LLT DstTy = MRI.getType(DstReg);

  Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);

  B.buildPtrAdd(DstReg, KernargPtrReg,
                B.buildConstant(IdxTy, Offset).getReg(0));
  Register Pointer = MI.getOperand(2).getReg();

  Register NumRecords = MI.getOperand(4).getReg();

  B.setInsertPt(B.getMBB(), ++B.getInsertPt());
  auto Unmerge = B.buildUnmerge(S32, Pointer);
  Register LowHalf = Unmerge.getReg(0);
  Register HighHalf = Unmerge.getReg(1);

  auto AndMask = B.buildConstant(S32, 0x0000ffff);
  auto Masked = B.buildAnd(S32, HighHalf, AndMask);

  std::optional<ValueAndVReg> StrideConst =

  if (!StrideConst || !StrideConst->Value.isZero()) {

      uint32_t StrideVal = StrideConst->Value.getZExtValue();
      uint32_t ShiftedStrideVal = StrideVal << 16;
      ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);

      auto ExtStride = B.buildAnyExt(S32, Stride);
      auto ShiftConst = B.buildConstant(S32, 16);
      ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);

    NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);

  B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
  MI.eraseFromParent();
  MI.eraseFromParent();

  std::optional<uint32_t> KnownSize =

  if (KnownSize.has_value())
    B.buildConstant(DstReg, *KnownSize);

  MI.eraseFromParent();

    unsigned AddrSpace) const {

  auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());

  MI.eraseFromParent();
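
// splitBufferOffsets below splits a combined buffer offset into a base
// register plus an immediate that fits the MUBUF/MTBUF encoding: the bits
// of ImmOffset above MaxImm are peeled off into Overflow and folded back
// into the register operand, leaving only an encodable immediate behind.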
std::pair<Register, unsigned>

  std::tie(BaseReg, ImmOffset) =

  if (MRI.getType(BaseReg).isPointer())
    BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);

  unsigned Overflow = ImmOffset & ~MaxImm;
  ImmOffset -= Overflow;
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;

  if (Overflow != 0) {
      BaseReg = B.buildConstant(S32, Overflow).getReg(0);

      auto OverflowVal = B.buildConstant(S32, Overflow);
      BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);

    BaseReg = B.buildConstant(S32, 0).getReg(0);

  return std::pair(BaseReg, ImmOffset);
    bool ImageStore) const {

  LLT StoreVT = MRI.getType(Reg);

    auto Unmerge = B.buildUnmerge(S16, Reg);

    for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
      WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));

      Reg = B.buildBitcast(S32, Reg).getReg(0);

      PackedRegs.resize(2, B.buildUndef(S32).getReg(0));

      auto Unmerge = B.buildUnmerge(S16, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)

      PackedRegs.resize(6, B.buildUndef(S16).getReg(0));

      auto Unmerge = B.buildUnmerge(S32, Reg);
      for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)

      PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
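
// handleD16VData above exists because some subtargets use an "unpacked"
// register layout for 16-bit image data: each s16 element gets its own
// dword (the AnyExt loop) rather than packing two elements per dword, and
// odd-sized payloads are padded with undef up to a register-sized vector.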
  LLT Ty = MRI->getType(VData);

    bool IsFormat) const {

  LLT Ty = MRI.getType(VData);

  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);

  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;

    VIndex = MI.getOperand(3).getReg();

    VIndex = B.buildConstant(S32, 0).getReg(0);

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

    Format = MI.getOperand(5 + OpOffset).getImm();

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16
                : AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
  } else if (IsFormat) {
    Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16
                : AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;

      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;

      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;

      Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;

  auto MIB = B.buildInstr(Opc)

  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();

    unsigned ImmOffset, unsigned Format,

  auto MIB = B.buildInstr(Opc)

  MIB.addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);
    bool IsTyped) const {

  assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);

  bool IsTFE = MI.getNumExplicitDefs() == 2;

    StatusDst = MI.getOperand(1).getReg();

  Register RSrc = MI.getOperand(2 + OpOffset).getReg();

  const unsigned NumVIndexOps = IsTyped ? 8 : 7;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;

    VIndex = MI.getOperand(3 + OpOffset).getReg();

    VIndex = B.buildConstant(S32, 0).getReg(0);

  Register VOffset = MI.getOperand(3 + OpOffset).getReg();
  Register SOffset = MI.getOperand(4 + OpOffset).getReg();

    Format = MI.getOperand(5 + OpOffset).getImm();

  unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();

  LLT Ty = MRI.getType(Dst);

    Dst = MI.getOperand(0).getReg();

  const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);

    Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16
                : AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
  } else if (IsFormat) {

      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;

      Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
                  : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;

      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;

      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;

      Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;

    unsigned NumLoadDWords = NumValueDWords + 1;

    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    if (NumValueDWords == 1) {
      B.buildUnmerge({Dst, StatusDst}, LoadDstReg);

      for (unsigned I = 0; I != NumValueDWords; ++I)
        LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));

      B.buildUnmerge(LoadElts, LoadDstReg);

      B.buildMergeLikeInstr(Dst, LoadElts);

    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(Dst, LoadDstReg);
  } else if (Unpacked && IsD16 && Ty.isVector()) {

    Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
    buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
                    Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());

    auto Unmerge = B.buildUnmerge(S32, LoadDstReg);

    for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
      Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));

    B.buildMergeLikeInstr(Dst, Repack);

                    AuxiliaryData, MMO, IsTyped, HasVIndex, B);

  MI.eraseFromParent();
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
  const bool IsCmpSwap =
      IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
      IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;

    CmpVal = MI.getOperand(3).getReg();

  Register RSrc = MI.getOperand(3 + OpOffset).getReg();
  const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;

  const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;

    VIndex = MI.getOperand(4 + OpOffset).getReg();

    VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);

  Register VOffset = MI.getOperand(4 + OpOffset).getReg();
  Register SOffset = MI.getOperand(5 + OpOffset).getReg();
  unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();

      .addImm(AuxiliaryData)
      .addImm(HasVIndex ? -1 : 0)
      .addMemOperand(MMO);

  MI.eraseFromParent();
    bool IsA16, bool IsG16) {

  auto EndIdx = Intr->VAddrEnd;

  for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {

    if ((I < Intr->GradientStart) ||
        (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
        (I >= Intr->CoordStart && !IsA16)) {
      if ((I < Intr->GradientStart) && IsA16 &&
          (B.getMRI()->getType(AddrReg) == S16)) {
        assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");

            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})

               "Bias needs to be converted to 16 bit in A16 mode");

        AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);

      if (((I + 1) >= EndIdx) ||
          ((Intr->NumGradients / 2) % 2 == 1 &&
           (I == static_cast<unsigned>(Intr->GradientStart +
                                       (Intr->NumGradients / 2) - 1) ||
            I == static_cast<unsigned>(Intr->GradientStart +
                                       Intr->NumGradients - 1))) ||
          !MI.getOperand(ArgOffset + I + 1).isReg()) {

            B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})

            V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
    int DimIdx, int NumVAddrs) {

  for (int I = 0; I != NumVAddrs; ++I) {

    if (SrcOp.isReg()) {

  int NumAddrRegs = AddrRegs.size();
  if (NumAddrRegs != 1) {

    MI.getOperand(DimIdx).setReg(VAddr.getReg(0));

  for (int I = 1; I != NumVAddrs; ++I) {

    MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
  const unsigned NumDefs = MI.getNumExplicitDefs();
  const unsigned ArgOffset = NumDefs + 1;
  bool IsTFE = NumDefs == 2;

  Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
  LLT Ty = MRI->getType(VData);

  const bool IsAtomicPacked16Bit =
      (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
       BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);

      MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());

      MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());

  const bool IsA16 = AddrTy == S16;

  if (!BaseOpcode->Atomic) {
    DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
  } else if (DMask != 0) {
  } else if (!IsTFE && !BaseOpcode->Store) {
    B.buildUndef(MI.getOperand(0));
    MI.eraseFromParent();

  const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
                                     : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
  const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
                                    : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
  unsigned NewOpcode = NumDefs == 0 ? StoreOpcode : LoadOpcode;

  MI.setDesc(B.getTII().get(NewOpcode));

  if (IsTFE && DMask == 0) {

    MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);

  if (BaseOpcode->Atomic) {

    LLT Ty = MRI->getType(VData0);

    if (Ty.isVector() && !IsAtomicPacked16Bit)

      auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
      MI.getOperand(2).setReg(Concat.getReg(0));
      MI.getOperand(3).setReg(AMDGPU::NoRegister);

  unsigned CorrectedNumVAddrs = Intr->NumVAddrs;

  if (IsA16 && !ST.hasA16()) {

  if (IsA16 || IsG16) {

        (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;

    if (UsePartialNSA) {

      auto Concat = B.buildConcatVectors(
          PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
      PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
      PackedRegs.resize(NSAMaxSize);
    } else if (!UseNSA && PackedRegs.size() > 1) {

      auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
      PackedRegs[0] = Concat.getReg(0);

    const unsigned NumPacked = PackedRegs.size();
    for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {

      if (!SrcOp.isReg()) {

      if (I - Intr->VAddrStart < NumPacked)
        SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);

        SrcOp.setReg(AMDGPU::NoRegister);

        (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
    const bool UsePartialNSA =
        UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;

    if (UsePartialNSA) {

                               ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
                               Intr->NumVAddrs - NSAMaxSize + 1);
    } else if (!UseNSA && Intr->NumVAddrs > 1) {

  if (BaseOpcode->Store) {

    if (RepackedReg != VData) {
      MI.getOperand(1).setReg(RepackedReg);

    if (NumElts < DMaskLanes)

    if (NumElts > 4 || DMaskLanes > 4)

    const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
    const LLT AdjustedTy =

    unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
    unsigned RoundedSize = 32 * RoundedElts;

      RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;

    if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))

    B.setInsertPt(*MI.getParent(), ++MI.getIterator());

    const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
    const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;

    Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);

    MI.getOperand(0).setReg(NewResultReg);

      Dst1Reg = MI.getOperand(1).getReg();
      if (MRI->getType(Dst1Reg) != S32)

      MI.removeOperand(1);

        B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);

    const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;

    if (ResultNumRegs == 1) {

      ResultRegs[0] = NewResultReg;

      for (int I = 0; I != NumDataRegs; ++I)
        ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
      B.buildUnmerge(ResultRegs, NewResultReg);

      ResultRegs.resize(NumDataRegs);

      B.buildTrunc(DstReg, ResultRegs[0]);

      B.buildBitcast(DstReg, ResultRegs[0]);

      Reg = B.buildBitcast(V2S16, Reg).getReg(0);

      Reg = B.buildTrunc(S16, Reg).getReg(0);

  auto padWithUndef = [&](LLT Ty, int NumElts) {

    Register Undef = B.buildUndef(Ty).getReg(0);
    for (int I = 0; I != NumElts; ++I)

  LLT ResTy = MRI->getType(ResultRegs[0]);

    padWithUndef(ResTy, NumElts - ResultRegs.size());
    B.buildBuildVector(DstReg, ResultRegs);

    if (ResultRegs.size() == 1) {
      NewResultReg = ResultRegs[0];
    } else if (ResultRegs.size() == 2) {

      NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);

    if (MRI->getType(DstReg).getNumElements() <
        MRI->getType(NewResultReg).getNumElements()) {
      B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);

      B.buildPadVectorWithUndefElements(DstReg, NewResultReg);

  padWithUndef(ResTy, RegsToCover - ResultRegs.size());
  B.buildConcatVectors(DstReg, ResultRegs);
  Register OrigDst = MI.getOperand(0).getReg();

  LLT Ty = B.getMRI()->getType(OrigDst);

    Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
                    : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;

    Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));

    Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;

  B.setInsertPt(B.getMBB(), MI);

  B.setInsertPt(B.getMBB(), MI);

  MI.setDesc(B.getTII().get(Opc));
  MI.removeOperand(1);

  const unsigned MemSize = (Size + 7) / 8;
  const Align MemAlign(std::min(MemSize, 4u));

  MI.addMemOperand(MF, MMO);
  if (Dst != OrigDst) {
    MI.getOperand(0).setReg(Dst);
    B.setInsertPt(B.getMBB(), ++B.getInsertPt());
    B.buildTrunc(OrigDst, Dst);
  MI.eraseFromParent();

  BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))

  BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))

  MI.eraseFromParent();

  Register SGPR01(AMDGPU::SGPR0_SGPR1);

  Register KernargPtrReg = MRI.createGenericVirtualRegister(

  Register LoadAddr = MRI.createGenericVirtualRegister(

  B.buildPtrAdd(LoadAddr, KernargPtrReg,

  Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
  B.buildCopy(SGPR01, Temp);
  B.buildInstr(AMDGPU::S_TRAP)

  MI.eraseFromParent();

  B.buildCopy(SGPR01, LiveIn);
  B.buildInstr(AMDGPU::S_TRAP)

  MI.eraseFromParent();

  MI.eraseFromParent();

  B.buildInstr(AMDGPU::S_TRAP)

  MI.eraseFromParent();

        "debugtrap handler not supported",

  LLVMContext &Ctx = B.getMF().getFunction().getContext();

  B.buildInstr(AMDGPU::S_TRAP)

  MI.eraseFromParent();
  Register NodePtr = MI.getOperand(2).getReg();
  Register RayExtent = MI.getOperand(3).getReg();
  Register RayOrigin = MI.getOperand(4).getReg();

  Register RayInvDir = MI.getOperand(6).getReg();

        "intrinsic not supported on subtarget",

    B.getMF().getFunction().getContext().diagnose(BadIntrin);

  const bool IsA16 =
      MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
  const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
  const unsigned NumVDataDwords = 4;
  const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;

  const unsigned BaseOpcodes[2][2] = {
      {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
      {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
       AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};

        IsGFX12Plus ? AMDGPU::MIMGEncGfx12
        : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                    : AMDGPU::MIMGEncGfx10NSA,
        NumVDataDwords, NumVAddrDwords);

        IsGFX11 ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
        NumVDataDwords, NumVAddrDwords);

  if (UseNSA && IsGFX11Plus) {

      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
      auto Merged = B.buildMergeLikeInstr(
          V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});

    packLanes(RayOrigin);

      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
      auto MergedDir = B.buildMergeLikeInstr(

              S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
                                                 UnmergeRayDir.getReg(0)}))

              S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
                                                 UnmergeRayDir.getReg(1)}))

              S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
                                                 UnmergeRayDir.getReg(2)}))

      packLanes(RayInvDir);

      auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);

      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);

    packLanes(RayOrigin);

      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);

      B.buildMergeLikeInstr(R1,
                            {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
      B.buildMergeLikeInstr(
          R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
      B.buildMergeLikeInstr(
          R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});

      packLanes(RayInvDir);

    Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);

  auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)

                 .addImm(IsA16 ? 1 : 0)

  MI.eraseFromParent();
  int RoundMode = MI.getOperand(2).getImm();

    Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;

    Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;

      .addDef(MI.getOperand(0).getReg())
      .addUse(MI.getOperand(1).getReg());

  MI.eraseFromParent();

  B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
  MI.eraseFromParent();
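
// The wave ID below is read out of the TTMP8 hardware register: the ubfx
// extracts an unsigned 5-bit field starting at bit 25, i.e. the result is
// (TTMP8 >> 25) & 0x1f.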
  auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
  auto LSB = B.buildConstant(S32, 25);
  auto Width = B.buildConstant(S32, 5);
  B.buildUbfx(DstReg, TTMP8, LSB, Width);
  MI.eraseFromParent();

  if (MRI.getType(Src) != S64)
    return false;

  B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},

  B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},

  B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
  MI.eraseFromParent();

  if (MRI.getType(Src) != S64)
    return false;

  auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));

      .addReg(Unmerge.getReg(0));

      .addReg(Unmerge.getReg(1));
  MI.eraseFromParent();
  auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();

  case Intrinsic::amdgcn_if:
  case Intrinsic::amdgcn_else: {

    bool Negated = false;

      std::swap(CondBrTarget, UncondBrTarget);

    B.setInsertPt(B.getMBB(), BrCond->getIterator());
    if (IntrID == Intrinsic::amdgcn_if) {
      B.buildInstr(AMDGPU::SI_IF)
          .addMBB(UncondBrTarget);
    } else {
      B.buildInstr(AMDGPU::SI_ELSE)
          .addMBB(UncondBrTarget);

      B.buildBr(*CondBrTarget);

    MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
    MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
    MI.eraseFromParent();
    BrCond->eraseFromParent();

  case Intrinsic::amdgcn_loop: {

    bool Negated = false;

      std::swap(CondBrTarget, UncondBrTarget);

    B.setInsertPt(B.getMBB(), BrCond->getIterator());
    B.buildInstr(AMDGPU::SI_LOOP)
        .addMBB(UncondBrTarget);

      B.buildBr(*CondBrTarget);

    MI.eraseFromParent();
    BrCond->eraseFromParent();
    MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
  case Intrinsic::amdgcn_addrspacecast_nonnull:

  case Intrinsic::amdgcn_make_buffer_rsrc:

  case Intrinsic::amdgcn_kernarg_segment_ptr:

      B.buildConstant(MI.getOperand(0).getReg(), 0);
      MI.eraseFromParent();

  case Intrinsic::amdgcn_implicitarg_ptr:

  case Intrinsic::amdgcn_workitem_id_x:

  case Intrinsic::amdgcn_workitem_id_y:

  case Intrinsic::amdgcn_workitem_id_z:

  case Intrinsic::amdgcn_workgroup_id_x:

  case Intrinsic::amdgcn_workgroup_id_y:

  case Intrinsic::amdgcn_workgroup_id_z:

  case Intrinsic::amdgcn_wave_id:

  case Intrinsic::amdgcn_lds_kernel_id:

  case Intrinsic::amdgcn_dispatch_ptr:

  case Intrinsic::amdgcn_queue_ptr:

  case Intrinsic::amdgcn_implicit_buffer_ptr:

  case Intrinsic::amdgcn_dispatch_id:

  case Intrinsic::r600_read_ngroups_x:

  case Intrinsic::r600_read_ngroups_y:

  case Intrinsic::r600_read_ngroups_z:

  case Intrinsic::r600_read_local_size_x:

  case Intrinsic::r600_read_local_size_y:

  case Intrinsic::r600_read_local_size_z:

  case Intrinsic::r600_read_global_size_x:

  case Intrinsic::r600_read_global_size_y:

  case Intrinsic::r600_read_global_size_z:

  case Intrinsic::amdgcn_fdiv_fast:

  case Intrinsic::amdgcn_is_shared:

  case Intrinsic::amdgcn_is_private:

  case Intrinsic::amdgcn_wavefrontsize: {

    MI.eraseFromParent();

  case Intrinsic::amdgcn_s_buffer_load:

  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:

  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format:

  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store:

  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:

  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:

  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load:

  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16:

  case Intrinsic::amdgcn_rsq_clamp:

  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:

  case Intrinsic::amdgcn_image_bvh_intersect_ray:

  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {

    MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));

  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {

    MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0));

  case Intrinsic::amdgcn_fmed3: {

    MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
    MI.removeOperand(1);
unsigned const MachineRegisterInfo * MRI
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
static constexpr unsigned FPEnvModeBitField
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterType(LLT Ty)
static bool isRegisterVectorElementType(LLT EltTy)
static bool isRegisterSize(unsigned Size)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
static std::initializer_list< LLT > AllS32Vectors
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
static std::initializer_list< LLT > AllS16Vectors
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
static bool isRegisterClassType(LLT Ty)
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
static constexpr unsigned FPEnvTrapBitField
static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx)
static constexpr unsigned MaxRegisterSize
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static const LLT MaxScalar
static bool hasBufferRsrcWorkaround(const LLT Ty)
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
static std::initializer_list< LLT > AllS64Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
static std::initializer_list< LLT > AllScalarTypes
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID)
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
static bool isRegisterVectorType(LLT Ty)
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
unsigned const TargetRegisterInfo * TRI
static unsigned getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
#define FP_DENORM_FLUSH_NONE
Interface definition for SIInstrInfo.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static constexpr int Concat[]
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool IsTyped, bool IsFormat) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, bool IsFormat) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool IsFormat, bool IsTyped) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
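The legalize* helpers above are reached through the legalizeCustom override. A minimal sketch of that dispatch pattern follows; MyLegalizerInfo is a hypothetical subclass used purely for illustration, and the real AMDGPU implementation handles many more opcodes.
// Hypothetical sketch of a custom-legalization dispatch.
bool MyLegalizerInfo::legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI,
                                     LostDebugLocObserver &LocObserver) const {
  MachineIRBuilder &B = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *B.getMRI();
  switch (MI.getOpcode()) {
  case TargetOpcode::G_FCEIL:
    return legalizeFceil(MI, MRI, B);
  case TargetOpcode::G_ADDRSPACE_CAST:
    return legalizeAddrSpaceCast(MI, MRI, B);
  case TargetOpcode::G_SITOFP:
    return legalizeITOFP(MI, MRI, B, /*Signed=*/true);
  default:
    return false; // Not handled here; legalization fails.
  }
}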
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isEntryFunction() const
bool isModuleEntryFunction() const
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
bool has16BitInsts() const
bool hasFastFMAF32() const
bool hasTrigReducedRange() const
unsigned getWavefrontSize() const
bool hasVOP3PInsts() const
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
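These APFloat factories supply the limit constants that FP legalization compares against (clamping, infinity checks, and the like). A small illustrative use, assuming a MachineIRBuilder B and a 32-bit scalar LLT S32:
// Illustration only: building f32 limit constants from the factories above.
const fltSemantics &Sem = APFloat::IEEEsingle();
APFloat MaxF32 = APFloat::getLargest(Sem);
APFloat InfF32 = APFloat::getInf(Sem, /*Negative=*/false);
APFloat SmallestNormF32 = APFloat::getSmallestNormalized(Sem);
auto MaxCst = B.buildFConstant(S32, MaxF32); // becomes a G_FCONSTANT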
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
size_t size() const
size - Get the array size.
@ ICMP_SLT
signed less than
@ FCMP_OLT
0 1 0 0 True if ordered and less than
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
@ ICMP_UGE
unsigned greater or equal
@ ICMP_SGT
signed greater than
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
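These predicate enumerators are the Pred arguments to G_ICMP/G_FCMP. A hedged pair of one-liners, where S1, LHS, RHS, FLHS, and FRHS are placeholder types and registers:
LLT S1 = LLT::scalar(1);
auto SignedLT = B.buildICmp(CmpInst::ICMP_SLT, S1, LHS, RHS);    // signed <
auto OrderedLT = B.buildFCmp(CmpInst::FCMP_OLT, S1, FLHS, FRHS); // ordered <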
This is the shared class of boolean and integer constants.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the type of this constant.
This class represents an Operation in the Expression.
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
bool hasImageStoreD16Bug() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasArchitectedSGPRs() const
bool hasPrivEnabledTrap2NopBug() const
const SIInstrInfo * getInstrInfo() const override
bool hasScalarSubwordLoads() const
bool supportsGetDoorbellID() const
TrapHandlerAbi getTrapHandlerAbi() const
bool hasGFX10_AEncoding() const
const SITargetLowering * getTargetLowering() const override
bool hasPackedFP32Ops() const
bool hasFullRate64Ops() const
bool isTrapHandlerEnabled() const
unsigned getNSAThreshold(const MachineFunction &MF) const
bool hasScalarSMulU64() const
bool hasNSAEncoding() const
unsigned getNSAMaxSize(bool HasSampler=false) const
bool hasScalarDwordx3Loads() const
Generation getGeneration() const
bool hasScalarAddSub64() const
bool hasUnpackedD16VMem() const
bool hasAddNoCarry() const
bool hasPartialNSAEncoding() const
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
KnownBits getKnownBits(Register R)
Simple wrapper observer that takes several observers, and calls each one for each event.
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
static constexpr LLT float64()
Get a 64-bit IEEE double value.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
constexpr bool isPointerVector() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
constexpr ElementCount getElementCount() const
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
static constexpr LLT float16()
Get a 16-bit IEEE half value.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr bool isPointerOrPointerVector() const
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr LLT getScalarType() const
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
static constexpr LLT float32()
Get a 32-bit IEEE float value.
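Taken together, the LLT helpers above form a small algebra over machine value types. An illustrative (not AMDGPU-specific) snippet:
// Constructing and deriving low-level types with the API above.
LLT S32 = LLT::scalar(32);
LLT V4S16 = LLT::fixed_vector(4, 16);     // <4 x s16>, 64 bits total
LLT V4S32 = V4S16.changeElementSize(32);  // <4 x s32>, 128 bits total
LLT Elt = V4S32.getElementType();         // s32
LLT P1 = LLT::pointer(1, 64);             // 64-bit pointer in addrspace(1)
assert(V4S32.isVector() && Elt.isScalar() && P1.isPointer());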
This is an important class for using LLVM in a threaded context.
void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1).
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
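The LegalizeRuleSet methods above chain into per-opcode rule pipelines. A hedged, simplified example of the style; the rules the AMDGPU legalizer actually installs for any opcode are considerably more involved:
// Illustrative rule chain (not the real AMDGPU rules for G_AND).
const LLT S32 = LLT::scalar(32), S64 = LLT::scalar(64);
const LLT V2S16 = LLT::fixed_vector(2, 16);
getActionDefinitionsBuilder(TargetOpcode::G_AND)
    .legalFor({S32, S64, V2S16})
    .clampScalar(0, S32, S64)     // force scalars into [s32, s64]
    .widenScalarToNextPow2(0)     // e.g. s48 -> s64
    .scalarize(0);                // break up any remaining vectors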
LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with additional vector elements.
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from CastTy.
LegalizeResult lowerFMad(MachineInstr &MI)
GISelKnownBits * getKnownBits() const
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's type to WideTy, truncating it back with the TruncOpcode.
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
const LLVMTargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do so.
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
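A sketch of allocating such a memory operand through the MachineFunction factory above, for an invariant, dereferenceable 64-bit load; the pointer info and alignment are placeholders:
MachineMemOperand *MMO = MF.getMachineMemOperand(
    MachinePointerInfo::getGOT(MF),
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
        MachineMemOperand::MOInvariant,
    LLT::scalar(64), Align(8));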
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers, including vreg register classes, use/def chains for registers, etc.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
MutableArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Wrapper class representing virtual and physical registers.
constexpr bool isValid() const
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
static constexpr bool isPhysicalRegister(unsigned Reg)
Return true if the specified register number is in the physical register namespace.
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
MachineBasicBlock * insertSimulatedTrap(MachineRegisterInfo &MRI, MachineBasicBlock &MBB, MachineInstr &MI, const DebugLoc &DL) const
Build instructions that simulate the behavior of an s_trap 2 instruction for hardware (namely,...
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which interpolation parameters to load.
bool hasWorkGroupIDZ() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void truncate(size_type N)
Like resize, but requires that N is less than size().
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save and restore.
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command line.
The instances of the Type class are immutable: once they are created, they are never changed.
A Use represents the edge between a Value definition and its users.
StringRef getName() const
Return a constant reference to the value's name.
constexpr ScalarTy getFixedValue() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
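Pointer LLTs are built per address space; the widths for the buffer address spaces follow directly from the descriptions above (160-bit fat pointers, 128-bit resources, 192-bit strided pointers), and the 64/32-bit widths for global and local memory are the usual AMDGPU sizes:
LLT GlobalPtr = LLT::pointer(AMDGPUAS::GLOBAL_ADDRESS, 64);
LLT LocalPtr  = LLT::pointer(AMDGPUAS::LOCAL_ADDRESS, 32);
LLT BufFatPtr = LLT::pointer(AMDGPUAS::BUFFER_FAT_POINTER, 160);
LLT BufRsrc   = LLT::pointer(AMDGPUAS::BUFFER_RESOURCE, 128);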
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE bool isKernel(CallingConv::ID CC)
bool isGFX11(const MCSubtargetInfo &STI)
bool isCompute(CallingConv::ID cc)
unsigned getAMDHSACodeObjectVersion(const Module &M)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelKnownBits *KnownBits=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
@ C
The default llvm calling convention, compatible with C.
LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the given size.
LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than the second type index.
LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than the second type index.
LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the given size.
LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size)
True if the total bitwidth of the specified type index is Size bits.
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
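Predicates decide when a rule fires and mutations decide which type to move to. A hedged sketch pairing the two; padding <3 x s16> out to <4 x s16> is shown purely as an illustration of the moreElementsIf/changeTo combination:
getActionDefinitionsBuilder(TargetOpcode::G_XOR)
    .legalFor({LLT::scalar(32), LLT::fixed_vector(2, 16)})
    .moreElementsIf(
        LegalityPredicates::typeIs(0, LLT::fixed_vector(3, 16)),
        LegalizeMutations::changeTo(0, LLT::fixed_vector(4, 16)))
    .widenScalarToNextPow2(0, /*MinSize=*/32)
    .scalarize(0);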
@ Implicit
Not emitted register (e.g. carry, or temporary result).
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by a single def instruction that is Opcode.
int popcount(T Value) noexcept
Count the number of set bits in a value.
const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT that fits in int64_t, returns it.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
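These bit-math helpers carry much of the size arithmetic in legalization rules (rounding sizes up to dwords or powers of two). A quick numeric illustration:
assert(Log2_32_Ceil(24) == 5);    // 2^5 = 32 is the first power of two >= 24
assert(divideCeil(70, 32) == 3);  // 70 bits occupy three 32-bit dwords
assert(PowerOf2Ceil(48) == 64);
assert(llvm::bit_width(48u) == 6);       // 48 = 0b110000 needs 6 bits
assert(isPowerOf2_32(64) && !isPowerOf2_32(48));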
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
@ Mul
Product of integers.
@ TowardPositive
roundTowardPositive.
@ TowardNegative
roundTowardNegative.
void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT, returns its APInt value and def register.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
unsigned Log2(Align A)
Returns the log2 of the alignment.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
std::function< bool(const LegalityQuery &)> LegalityPredicate
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
static constexpr uint64_t encode(Fields... Values)
MIMGBaseOpcode BaseOpcode
static const fltSemantics & IEEEsingle() LLVM_READNONE
static const fltSemantics & IEEEdouble() LLVM_READNONE
This struct is a compact representation of a valid (non-zero power of two) alignment.
uint64_t value() const
This is a hole in the type system and should not be abused.
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environment.
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
The LegalityQuery object bundles together all the information that's needed to decide whether a given operation is legal or not.
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.
static MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions.
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs per IEEE 754-2008.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
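A hedged sketch of how these mode queries typically gate an FP expansion; MF stands in for the current MachineFunction:
DenormalMode F32Mode = MF.getDenormalMode(APFloat::IEEEsingle());
if (F32Mode == DenormalMode::getIEEE()) {
  // f32 denormals are fully honored; flush-based fast paths are unsafe.
} else if (F32Mode == DenormalMode::getPreserveSign()) {
  // Denormals flush to a correctly signed zero; cheaper expansions may apply.
}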