//===-- NVPTXISelDAGToDAG.cpp - A dag to dag inst selector for NVPTX ------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines an instruction selector for the NVPTX target.
//
//===----------------------------------------------------------------------===//

#include "NVPTXISelDAGToDAG.h"
#include "NVPTXUtilities.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

#define DEBUG_TYPE "nvptx-isel"
#define PASS_NAME "NVPTX DAG->DAG Pattern Instruction Selection"

static cl::opt<bool>
    EnableRsqrtOpt("nvptx-rsqrt-approx-opt", cl::init(true), cl::Hidden,
                   cl::desc("Enable reciprocal sqrt optimization"));
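
// Since this is a standard cl::opt, the approximation can be toggled from the
// llc command line, e.g.:
//   llc -march=nvptx64 -nvptx-rsqrt-approx-opt=false input.ll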

/// createNVPTXISelDag - This pass converts a legalized DAG into a
/// NVPTX-specific DAG, ready for instruction scheduling.
FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
                                       llvm::CodeGenOptLevel OptLevel) {
  return new NVPTXDAGToDAGISelLegacy(TM, OptLevel);
}

NVPTXDAGToDAGISelLegacy::NVPTXDAGToDAGISelLegacy(NVPTXTargetMachine &tm,
                                                 CodeGenOptLevel OptLevel)
    : SelectionDAGISelLegacy(
          ID, std::make_unique<NVPTXDAGToDAGISel>(tm, OptLevel)) {}

char NVPTXDAGToDAGISelLegacy::ID = 0;

INITIALIZE_PASS(NVPTXDAGToDAGISelLegacy, DEBUG_TYPE, PASS_NAME, false, false)

NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
                                     CodeGenOptLevel OptLevel)
    : SelectionDAGISel(tm, OptLevel), TM(tm) {
  doMulWide = (OptLevel > CodeGenOptLevel::None);
}

bool NVPTXDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
  Subtarget = &MF.getSubtarget<NVPTXSubtarget>();
  return SelectionDAGISel::runOnMachineFunction(MF);
}

int NVPTXDAGToDAGISel::getDivF32Level() const {
  return Subtarget->getTargetLowering()->getDivF32Level();
}

bool NVPTXDAGToDAGISel::usePrecSqrtF32() const {
  return Subtarget->getTargetLowering()->usePrecSqrtF32();
}

bool NVPTXDAGToDAGISel::useF32FTZ() const {
  return Subtarget->getTargetLowering()->useF32FTZ(*MF);
}

bool NVPTXDAGToDAGISel::allowFMA() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowFMA(*MF, OptLevel);
}

bool NVPTXDAGToDAGISel::allowUnsafeFPMath() const {
  const NVPTXTargetLowering *TL = Subtarget->getTargetLowering();
  return TL->allowUnsafeFPMath(*MF);
}

bool NVPTXDAGToDAGISel::doRsqrtOpt() const { return EnableRsqrtOpt; }

/// Select - Select instructions not customized! Used for
/// expanded, promoted and normal instructions.
void NVPTXDAGToDAGISel::Select(SDNode *N) {

  if (N->isMachineOpcode()) {
    N->setNodeId(-1);
    return; // Already selected.
  }

  switch (N->getOpcode()) {
  case ISD::LOAD:
  case ISD::ATOMIC_LOAD:
    if (tryLoad(N))
      return;
    break;
  case ISD::STORE:
  case ISD::ATOMIC_STORE:
    if (tryStore(N))
      return;
    break;
  case ISD::EXTRACT_VECTOR_ELT:
    if (tryEXTRACT_VECTOR_ELEMENT(N))
      return;
    break;
  case NVPTXISD::SETP_F16X2:
    SelectSETP_F16X2(N);
    return;
  case NVPTXISD::SETP_BF16X2:
    SelectSETP_BF16X2(N);
    return;
  case NVPTXISD::LoadV2:
  case NVPTXISD::LoadV4:
    if (tryLoadVector(N))
      return;
    break;
  case NVPTXISD::LDGV2:
  case NVPTXISD::LDGV4:
  case NVPTXISD::LDUV2:
  case NVPTXISD::LDUV4:
    if (tryLDGLDU(N))
      return;
    break;
  case NVPTXISD::StoreV2:
  case NVPTXISD::StoreV4:
    if (tryStoreVector(N))
      return;
    break;
  case NVPTXISD::LoadParam:
  case NVPTXISD::LoadParamV2:
  case NVPTXISD::LoadParamV4:
    if (tryLoadParam(N))
      return;
    break;
  case NVPTXISD::StoreRetval:
  case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
    if (tryStoreRetval(N))
      return;
    break;
  case NVPTXISD::StoreParam:
  case NVPTXISD::StoreParamV2:
  case NVPTXISD::StoreParamV4:
  case NVPTXISD::StoreParamS32:
  case NVPTXISD::StoreParamU32:
    if (tryStoreParam(N))
      return;
    break;
  case ISD::INTRINSIC_WO_CHAIN:
    if (tryIntrinsicNoChain(N))
      return;
    break;
  case ISD::INTRINSIC_W_CHAIN:
    if (tryIntrinsicChain(N))
      return;
    break;
  // A long run of NVPTXISD::Tex*/Tld4* texture opcode case labels (elided
  // in this listing) all funnel into the texture handler:
    if (tryTextureIntrinsic(N))
      return;
    break;
  // Likewise, the NVPTXISD::Suld* surface opcode case labels (also elided)
  // all funnel into the surface handler:
    if (trySurfaceIntrinsic(N))
      return;
    break;
  case ISD::AND:
  case ISD::SRA:
  case ISD::SRL:
    // Try to select BFE
    if (tryBFE(N))
      return;
    break;
  case ISD::ADDRSPACECAST:
    SelectAddrSpaceCast(N);
    return;
  case ISD::ConstantFP:
    if (tryConstantFP(N))
      return;
    break;
  default:
    break;
  }
  SelectCode(N);
}

bool NVPTXDAGToDAGISel::tryIntrinsicChain(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(1);
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_ldg_global_f:
  case Intrinsic::nvvm_ldg_global_i:
  case Intrinsic::nvvm_ldg_global_p:
  case Intrinsic::nvvm_ldu_global_f:
  case Intrinsic::nvvm_ldu_global_i:
  case Intrinsic::nvvm_ldu_global_p:
    return tryLDGLDU(N);
  }
}

// There's no way to specify FP16 and BF16 immediates in .(b)f16 ops, so we
// have to load them into an .(b)f16 register first.
bool NVPTXDAGToDAGISel::tryConstantFP(SDNode *N) {
  if (N->getValueType(0) != MVT::f16 && N->getValueType(0) != MVT::bf16)
    return false;
  SDValue Val = CurDAG->getTargetConstantFP(
      cast<ConstantFPSDNode>(N)->getValueAPF(), SDLoc(N), N->getValueType(0));
  SDNode *LoadConstF16 = CurDAG->getMachineNode(
      (N->getValueType(0) == MVT::f16 ? NVPTX::LOAD_CONST_F16
                                      : NVPTX::LOAD_CONST_BF16),
      SDLoc(N), N->getValueType(0), Val);
  ReplaceNode(N, LoadConstF16);
  return true;
}

// Map an ISD::CondCode value to the appropriate CmpMode expected by
// NVPTXInstPrinter::printCmpMode().
static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
  using NVPTX::PTXCmpMode::CmpMode;
  unsigned PTXCmpMode = [](ISD::CondCode CC) {
    switch (CC) {
    default:
      llvm_unreachable("Unexpected condition code.");
    case ISD::SETOEQ:
      return CmpMode::EQ;
    case ISD::SETOGT:
      return CmpMode::GT;
    case ISD::SETOGE:
      return CmpMode::GE;
    case ISD::SETOLT:
      return CmpMode::LT;
    case ISD::SETOLE:
      return CmpMode::LE;
    case ISD::SETONE:
      return CmpMode::NE;
    case ISD::SETO:
      return CmpMode::NUM;
    case ISD::SETUO:
      return CmpMode::NotANumber;
    case ISD::SETUEQ:
      return CmpMode::EQU;
    case ISD::SETUGT:
      return CmpMode::GTU;
    case ISD::SETUGE:
      return CmpMode::GEU;
    case ISD::SETULT:
      return CmpMode::LTU;
    case ISD::SETULE:
      return CmpMode::LEU;
    case ISD::SETUNE:
      return CmpMode::NEU;
    case ISD::SETEQ:
      return CmpMode::EQ;
    case ISD::SETGT:
      return CmpMode::GT;
    case ISD::SETGE:
      return CmpMode::GE;
    case ISD::SETLT:
      return CmpMode::LT;
    case ISD::SETLE:
      return CmpMode::LE;
    case ISD::SETNE:
      return CmpMode::NE;
    }
  }(CondCode.get());

  if (FTZ)
    PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;

  return PTXCmpMode;
}
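
// Worked example (illustrative): ISD::SETOGT with FTZ enabled yields
// CmpMode::GT | FTZ_FLAG, which printCmpMode() renders as the ".gt.ftz"
// modifier of the resulting setp instruction.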

bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
  unsigned PTXCmpMode =
      getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
  SDLoc DL(N);
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
      N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
  ReplaceNode(N, SetP);
  return true;
}

bool NVPTXDAGToDAGISel::SelectSETP_BF16X2(SDNode *N) {
  unsigned PTXCmpMode =
      getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
  SDLoc DL(N);
  SDNode *SetP = CurDAG->getMachineNode(
      NVPTX::SETP_bf16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
      N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
  ReplaceNode(N, SetP);
  return true;
}

// Find all instances of extract_vector_elt that use this v2f16 vector
// and coalesce them into a scattering move instruction.
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
  SDValue Vector = N->getOperand(0);

  // We only care about 16x2 as it's the only real vector type we
  // need to deal with.
  MVT VT = Vector.getSimpleValueType();
  if (!Isv2x16VT(VT))
    return false;
  // Find and record all uses of this vector that extract element 0 or 1.
  SmallVector<SDNode *, 4> E0, E1;
  for (auto *U : Vector.getNode()->uses()) {
    if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      continue;
    if (U->getOperand(0) != Vector)
      continue;
    if (const ConstantSDNode *IdxConst =
            dyn_cast<ConstantSDNode>(U->getOperand(1))) {
      if (IdxConst->getZExtValue() == 0)
        E0.push_back(U);
      else if (IdxConst->getZExtValue() == 1)
        E1.push_back(U);
      else
        llvm_unreachable("Invalid vector index.");
    }
  }

  // There's no point scattering f16x2 if we only ever access one
  // element of it.
  if (E0.empty() || E1.empty())
    return false;

  // Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
  // into f16,f16 SplitF16x2(V)
  MVT EltVT = VT.getVectorElementType();
  SDNode *ScatterOp =
      CurDAG->getMachineNode(NVPTX::I32toV2I16, SDLoc(N), EltVT, EltVT, Vector);
  for (auto *Node : E0)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
  for (auto *Node : E1)
    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));

  return true;
}
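
// Illustrative effect: when both lanes of a v2f16 value are extracted, the
// two extract_vector_elt nodes collapse into one NVPTX::I32toV2I16 split (a
// mov.b32 {lo, hi}-style unpack) instead of two independent extraction moves.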

static unsigned int getCodeAddrSpace(MemSDNode *N) {
  const Value *Src = N->getMemOperand()->getValue();

  if (!Src)
    return NVPTX::PTXLdStInstCode::GENERIC;

  if (auto *PT = dyn_cast<PointerType>(Src->getType())) {
    switch (PT->getAddressSpace()) {
    case llvm::ADDRESS_SPACE_LOCAL: return NVPTX::PTXLdStInstCode::LOCAL;
    case llvm::ADDRESS_SPACE_GLOBAL: return NVPTX::PTXLdStInstCode::GLOBAL;
    case llvm::ADDRESS_SPACE_SHARED: return NVPTX::PTXLdStInstCode::SHARED;
    case llvm::ADDRESS_SPACE_GENERIC: return NVPTX::PTXLdStInstCode::GENERIC;
    case llvm::ADDRESS_SPACE_PARAM: return NVPTX::PTXLdStInstCode::PARAM;
    case llvm::ADDRESS_SPACE_CONST: return NVPTX::PTXLdStInstCode::CONSTANT;
    default: break;
    }
  }
  return NVPTX::PTXLdStInstCode::GENERIC;
}

static bool canLowerToLDG(MemSDNode *N, const NVPTXSubtarget &Subtarget,
                          unsigned CodeAddrSpace, MachineFunction *F) {
  // We use ldg (i.e. ld.global.nc) for invariant loads from the global address
  // space.
  //
  // We have two ways of identifying invariant loads: Loads may be explicitly
  // marked as invariant, or we may infer them to be invariant.
  //
  // We currently infer invariance for loads from
  //  - constant global variables, and
  //  - kernel function pointer params that are noalias (i.e. __restrict) and
  //    never written to.
  //
  // TODO: Perform a more powerful invariance analysis (ideally IPO, and ideally
  // not during the SelectionDAG phase).
  //
  // TODO: Infer invariance only at -O2. We still want to use ldg at -O0 for
  // explicitly invariant loads because these are how clang tells us to use ldg
  // when the user uses a builtin.
  if (!Subtarget.hasLDG() || CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL)
    return false;

  if (N->isInvariant())
    return true;

  bool IsKernelFn = isKernelFunction(F->getFunction());

  // We use getUnderlyingObjects() here instead of getUnderlyingObject() mainly
  // because the former looks through phi nodes while the latter does not. We
  // need to look through phi nodes to handle pointer induction variables.
  SmallVector<const Value *, 8> Objs;
  getUnderlyingObjects(N->getMemOperand()->getValue(), Objs);

  return all_of(Objs, [&](const Value *V) {
    if (auto *A = dyn_cast<const Argument>(V))
      return IsKernelFn && A->onlyReadsMemory() && A->hasNoAliasAttr();
    if (auto *GV = dyn_cast<const GlobalVariable>(V))
      return GV->isConstant();
    return false;
  });
}
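
// Example of a load this accepts: inside a kernel, IR such as
//   %v = load float, ptr addrspace(1) %p, !invariant.load !0
// qualifies and is emitted as ld.global.nc.f32 rather than plain
// ld.global.f32.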

bool NVPTXDAGToDAGISel::tryIntrinsicNoChain(SDNode *N) {
  unsigned IID = N->getConstantOperandVal(0);
  switch (IID) {
  default:
    return false;
  case Intrinsic::nvvm_texsurf_handle_internal:
    SelectTexSurfHandle(N);
    return true;
  }
}

void NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
  // Op 0 is the intrinsic ID
  SDValue Wrapper = N->getOperand(1);
  SDValue GlobalVal = Wrapper.getOperand(0);
  ReplaceNode(N, CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N),
                                        MVT::i64, GlobalVal));
}

void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
  SDValue Src = N->getOperand(0);
  AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
  unsigned SrcAddrSpace = CastN->getSrcAddressSpace();
  unsigned DstAddrSpace = CastN->getDestAddressSpace();
  assert(SrcAddrSpace != DstAddrSpace &&
         "addrspacecast must be between different address spaces");

  if (DstAddrSpace == ADDRESS_SPACE_GENERIC) {
    // Specific to generic
    unsigned Opc;
    switch (SrcAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32
                                ? NVPTX::cvta_shared_6432
                                : NVPTX::cvta_shared_64)
                         : NVPTX::cvta_shared;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32
                                ? NVPTX::cvta_const_6432
                                : NVPTX::cvta_const_64)
                         : NVPTX::cvta_const;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32
                                ? NVPTX::cvta_local_6432
                                : NVPTX::cvta_local_64)
                         : NVPTX::cvta_local;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  } else {
    // Generic to specific
    if (SrcAddrSpace != 0)
      report_fatal_error("Cannot cast between two non-generic address spaces");
    unsigned Opc;
    switch (DstAddrSpace) {
    default: report_fatal_error("Bad address space in addrspacecast");
    case ADDRESS_SPACE_GLOBAL:
      Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global;
      break;
    case ADDRESS_SPACE_SHARED:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32
                                ? NVPTX::cvta_to_shared_3264
                                : NVPTX::cvta_to_shared_64)
                         : NVPTX::cvta_to_shared;
      break;
    case ADDRESS_SPACE_CONST:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32
                                ? NVPTX::cvta_to_const_3264
                                : NVPTX::cvta_to_const_64)
                         : NVPTX::cvta_to_const;
      break;
    case ADDRESS_SPACE_LOCAL:
      Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32
                                ? NVPTX::cvta_to_local_3264
                                : NVPTX::cvta_to_local_64)
                         : NVPTX::cvta_to_local;
      break;
    case ADDRESS_SPACE_PARAM:
      Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
                         : NVPTX::nvvm_ptr_gen_to_param;
      break;
    }
    ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0),
                                          Src));
    return;
  }
}
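
// Illustrative PTX: casting shared -> generic on a 64-bit target selects
// NVPTX::cvta_shared_64, i.e. "cvta.shared.u64 %rd1, %rd2;", while the
// generic -> shared direction selects NVPTX::cvta_to_shared_64, i.e.
// "cvta.to.shared.u64 %rd1, %rd2;".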

// Helper function template to reduce the amount of boilerplate code for
// opcode selection.
static std::optional<unsigned>
pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
                unsigned Opcode_i16, unsigned Opcode_i32,
                std::optional<unsigned> Opcode_i64, unsigned Opcode_f32,
                std::optional<unsigned> Opcode_f64) {
  switch (VT) {
  case MVT::i1:
  case MVT::i8:
    return Opcode_i8;
  case MVT::i16:
    return Opcode_i16;
  case MVT::i32:
    return Opcode_i32;
  case MVT::i64:
    return Opcode_i64;
  case MVT::f16:
  case MVT::bf16:
    return Opcode_i16;
  case MVT::v2f16:
  case MVT::v2bf16:
  case MVT::v2i16:
  case MVT::v4i8:
    return Opcode_i32;
  case MVT::f32:
    return Opcode_f32;
  case MVT::f64:
    return Opcode_f64;
  default:
    return std::nullopt;
  }
}
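
// Usage sketch: a 16-bit FP scalar shares the 16-bit integer opcode because
// f16/bf16 values live in untyped 16-bit registers, e.g.
//   pickOpcodeForVT(MVT::f16, LD_i8_avar, LD_i16_avar, ...) == LD_i16_avar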

static int getLdStRegType(EVT VT) {
  if (VT.isFloatingPoint())
    switch (VT.getSimpleVT().SimpleTy) {
    case MVT::f16:
    case MVT::bf16:
    case MVT::v2f16:
    case MVT::v2bf16:
      return NVPTX::PTXLdStInstCode::Untyped;
    default:
      return NVPTX::PTXLdStInstCode::Float;
    }
  else
    return NVPTX::PTXLdStInstCode::Unsigned;
}

bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *LD = cast<MemSDNode>(N);
  assert(LD->readMem() && "Expected load");
  LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
  EVT LoadedVT = LD->getMemoryVT();
  SDNode *NVPTXLD = nullptr;

  // do not support pre/post inc/dec
  if (PlainLoad && PlainLoad->isIndexed())
    return false;

  if (!LoadedVT.isSimple())
    return false;

  AtomicOrdering Ordering = LD->getSuccessOrdering();
  // In order to lower atomic loads with stronger guarantees we would need to
  // use load.acquire or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(LD);
  if (canLowerToLDG(LD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(LD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = LD->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Type Setting: fromType + fromTypeWidth
  //
  // Signed   : ISD::SEXTLOAD
  // Unsigned : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //            type is integer
  // Float    : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT SimpleVT = LoadedVT.getSimpleVT();
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int fromType;

  // Vector Setting
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
  if (SimpleVT.isVector()) {
    assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) &&
           "Unexpected vector type");
    // v2f16/v2bf16/v2i16 is loaded using ld.b32
    fromTypeWidth = 32;
  }

  if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
    fromType = NVPTX::PTXLdStInstCode::Signed;
  else
    fromType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG
  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;

  if (SelectDirectAddr(N1, Addr)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar,
                             NVPTX::LD_i32_avar, NVPTX::LD_i64_avar,
                             NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Addr, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
                             NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
                             NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else if (PointerSize == 64 ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                               : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
                          NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64,
                          NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari,
                               NVPTX::LD_i32_ari, NVPTX::LD_i64_ari,
                               NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
                          NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64,
                          NVPTX::LD_f32_areg_64, NVPTX::LD_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg,
                               NVPTX::LD_i32_areg, NVPTX::LD_i64_areg,
                               NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(CodeAddrSpace, dl),
                      getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                      getI32Imm(fromTypeWidth, dl), N1, Chain };
    NVPTXLD = CurDAG->getMachineNode(*Opcode, dl, TargetVT, MVT::Other, Ops);
  }

  if (!NVPTXLD)
    return false;

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXLD), {MemRef});

  ReplaceNode(N, NVPTXLD);
  return true;
}
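
// Illustrative result: a plain generic i32 load with a register+immediate
// address on a 64-bit target selects NVPTX::LD_i32_ari_64 and prints as
// something like "ld.u32 %r1, [%rd1+4];".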

bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT LoadedVT = MemSD->getMemoryVT();

  if (!LoadedVT.isSimple())
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (canLowerToLDG(MemSD, *Subtarget, CodeAddrSpace, MF)) {
    return tryLDGLDU(N);
  }

  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Vector Setting
  MVT SimpleVT = LoadedVT.getSimpleVT();

  // Type Setting: fromType + fromTypeWidth
  //
  // Signed   : ISD::SEXTLOAD
  // Unsigned : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
  //            type is integer
  // Float    : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
  MVT ScalarVT = SimpleVT.getScalarType();
  // Read at least 8 bits (predicates are stored as 8-bit values)
  unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
  unsigned int FromType;
  // The last operand holds the original LoadSDNode::getExtensionType() value
  unsigned ExtensionType = cast<ConstantSDNode>(
      N->getOperand(N->getNumOperands() - 1))->getZExtValue();
  if (ExtensionType == ISD::SEXTLOAD)
    FromType = NVPTX::PTXLdStInstCode::Signed;
  else
    FromType = getLdStRegType(ScalarVT);

  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::LoadV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    break;
  case NVPTXISD::LoadV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    break;
  default:
    return false;
  }

  EVT EltVT = N->getValueType(0);

  // v8x16 is a special case. PTX doesn't have an ld.v8.x16
  // instruction. Instead, we split the vector into v2x16 chunks and
  // load them with ld.v4.b32.
  if (Isv2x16VT(EltVT)) {
    assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
    EltVT = MVT::i32;
    VecType = NVPTX::PTXLdStInstCode::V2;
    FromTypeWidth = 32;
  }

  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
                               NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
                               NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
                          NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar,
                          std::nullopt, NVPTX::LDV_f32_v4_avar, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::LoadV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
                               NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
                               NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
      break;
    case NVPTXISD::LoadV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
                          NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi,
                          std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                 : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                            NVPTX::LDV_i8_v2_ari_64, NVPTX::LDV_i16_v2_ari_64,
                            NVPTX::LDV_i32_v2_ari_64, NVPTX::LDV_i64_v2_ari_64,
                            NVPTX::LDV_f32_v2_ari_64, NVPTX::LDV_f64_v2_ari_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
            NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, std::nullopt,
            NVPTX::LDV_f32_v4_ari_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
                                 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
                                 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
                            NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari,
                            std::nullopt, NVPTX::LDV_f32_v4_ari, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };

    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  } else {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
            NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
            NVPTX::LDV_f64_v2_areg_64);
        break;
      case NVPTXISD::LoadV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
            NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, std::nullopt,
            NVPTX::LDV_f32_v4_areg_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::LoadV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
                            NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
                            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f32_v2_areg,
                            NVPTX::LDV_f64_v2_areg);
        break;
      case NVPTXISD::LoadV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
                            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg,
                            std::nullopt, NVPTX::LDV_f32_v4_areg, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                      getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                      getI32Imm(FromTypeWidth, DL), Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
  }

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(LD), {MemRef});

  ReplaceNode(N, LD);
  return true;
}
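
// Illustrative result: a NVPTXISD::LoadV4 of four f32 elements from global
// memory selects one of the LDV_f32_v4_* opcodes and prints as, e.g.,
//   ld.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];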

bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {

  SDValue Chain = N->getOperand(0);
  SDValue Op1;
  MemSDNode *Mem;
  bool IsLDG = true;

  // If this is an LDG intrinsic, the address is the third operand. If it's an
  // LDG/LDU SD node (from custom vector handling), then it's the second
  // operand.
  if (N->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
    Op1 = N->getOperand(2);
    Mem = cast<MemIntrinsicSDNode>(N);
    unsigned IID = N->getConstantOperandVal(1);
    switch (IID) {
    default:
      return false;
    case Intrinsic::nvvm_ldg_global_f:
    case Intrinsic::nvvm_ldg_global_i:
    case Intrinsic::nvvm_ldg_global_p:
      IsLDG = true;
      break;
    case Intrinsic::nvvm_ldu_global_f:
    case Intrinsic::nvvm_ldu_global_i:
    case Intrinsic::nvvm_ldu_global_p:
      IsLDG = false;
      break;
    }
  } else {
    Op1 = N->getOperand(1);
    Mem = cast<MemSDNode>(N);
  }

  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *LD;
  SDValue Base, Offset, Addr;
  EVT OrigType = N->getValueType(0);

  EVT EltVT = Mem->getMemoryVT();
  unsigned NumElts = 1;
  if (EltVT.isVector()) {
    NumElts = EltVT.getVectorNumElements();
    EltVT = EltVT.getVectorElementType();
    // vectors of 16-bit types are loaded/stored as multiples of v2x16 elements.
    if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) ||
        (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) ||
        (EltVT == MVT::i16 && OrigType == MVT::v2i16)) {
      assert(NumElts % 2 == 0 && "Vector must have even number of elements");
      EltVT = OrigType;
      NumElts /= 2;
    } else if (OrigType == MVT::v4i8) {
      EltVT = OrigType;
      NumElts = 1;
    }
  }

  // Build the "promoted" result VTList for the load. If we are really loading
  // i8s, then the return type will be promoted to i16 since we do not expose
  // 8-bit registers in NVPTX.
  EVT NodeVT = (EltVT == MVT::i8) ? MVT::i16 : EltVT;
  SmallVector<EVT, 5> InstVTs;
  for (unsigned i = 0; i != NumElts; ++i) {
    InstVTs.push_back(NodeVT);
  }
  InstVTs.push_back(MVT::Other);
  SDVTList InstVTList = CurDAG->getVTList(InstVTs);

  if (SelectDirectAddr(Op1, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case ISD::LOAD:
    case ISD::INTRINSIC_W_CHAIN:
      if (IsLDG)
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
                                 NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
      else
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
                                 NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
      break;
    case NVPTXISD::LoadV2:
    case NVPTXISD::LDGV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
                               NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
      break;
    case NVPTXISD::LDUV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
                               NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
      break;
    case NVPTXISD::LoadV4:
    case NVPTXISD::LDGV4:
      Opcode = pickOpcodeForVT(
          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
          NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
          NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
      break;
    case NVPTXISD::LDUV4:
      Opcode = pickOpcodeForVT(
          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
          NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
          NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
          NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
      break;
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { Addr, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                          : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = {Base, Offset, Chain};
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  } else {
    if (TM.is64Bit()) {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case ISD::LOAD:
      case ISD::INTRINSIC_W_CHAIN:
        if (IsLDG)
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
                                   NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
        else
          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
                                   NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
        break;
      case NVPTXISD::LoadV2:
      case NVPTXISD::LDGV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
        break;
      case NVPTXISD::LDUV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
        break;
      case NVPTXISD::LoadV4:
      case NVPTXISD::LDGV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
            NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
            NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, std::nullopt,
            NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, std::nullopt);
        break;
      case NVPTXISD::LDUV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
            NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
            NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, std::nullopt,
            NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, std::nullopt);
        break;
      }
    }
    if (!Opcode)
      return false;
    SDValue Ops[] = { Op1, Chain };
    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
  }

  // For automatic generation of LDG (through SelectLoad[Vector], not the
  // intrinsics), we may have an extending load like:
  //
  //   i32,ch = load<LD1[%data1(addrspace=1)], zext from i8> t0, t7, undef:i64
  //
  // In this case, the matching logic above will select a load for the original
  // memory type (in this case, i8) and our types will not match (the node needs
  // to return an i32 in this case). Our LDG/LDU nodes do not support the
  // concept of sign-/zero-extension, so emulate it here by adding an explicit
  // CVT instruction. Ptxas should clean up any redundancies here.

  LoadSDNode *LdNode = dyn_cast<LoadSDNode>(N);

  if (OrigType != EltVT &&
      (LdNode || (OrigType.isFloatingPoint() && EltVT.isFloatingPoint()))) {
    // We have an extending-load. The instruction we selected operates on the
    // smaller type, but the SDNode we are replacing has the larger type. We
    // need to emit a CVT to make the types match.
    unsigned CvtOpc =
        GetConvertOpcode(OrigType.getSimpleVT(), EltVT.getSimpleVT(), LdNode);

    // For each output value, apply the manual sign/zero-extension and make
    // sure all users of the load go through that CVT.
    for (unsigned i = 0; i != NumElts; ++i) {
      SDValue Res(LD, i);
      SDValue OrigVal(N, i);

      SDNode *CvtNode =
          CurDAG->getMachineNode(CvtOpc, DL, OrigType, Res,
                                 CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE,
                                                           DL, MVT::i32));
      ReplaceUses(OrigVal, SDValue(CvtNode, 0));
    }
  }

  ReplaceNode(N, LD);
  return true;
}
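
// Illustrative result: an invariant f32 load from the global space arrives
// here via canLowerToLDG() and becomes the non-coherent form, e.g.
//   ld.global.nc.f32 %f1, [%rd1];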

bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
  SDLoc dl(N);
  MemSDNode *ST = cast<MemSDNode>(N);
  assert(ST->writeMem() && "Expected store");
  StoreSDNode *PlainStore = dyn_cast<StoreSDNode>(N);
  AtomicSDNode *AtomicStore = dyn_cast<AtomicSDNode>(N);
  assert((PlainStore || AtomicStore) && "Expected store");
  EVT StoreVT = ST->getMemoryVT();
  SDNode *NVPTXST = nullptr;

  // do not support pre/post inc/dec
  if (PlainStore && PlainStore->isIndexed())
    return false;

  if (!StoreVT.isSimple())
    return false;

  AtomicOrdering Ordering = ST->getSuccessOrdering();
  // In order to lower atomic stores with stronger guarantees we would need to
  // use store.release or insert fences. However these features were only added
  // with PTX ISA 6.0 / sm_70.
  // TODO: Check if we can actually use the new instructions and implement them.
  if (isStrongerThanMonotonic(Ordering))
    return false;

  // Address Space Setting
  unsigned int CodeAddrSpace = getCodeAddrSpace(ST);
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(ST->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  // - .volatile has the same memory synchronization semantics as .relaxed.sys
  bool isVolatile = ST->isVolatile() || Ordering == AtomicOrdering::Monotonic;
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    isVolatile = false;

  // Vector Setting
  MVT SimpleVT = StoreVT.getSimpleVT();
  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  //
  MVT ScalarVT = SimpleVT.getScalarType();
  unsigned toTypeWidth = ScalarVT.getSizeInBits();
  if (SimpleVT.isVector()) {
    assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) &&
           "Unexpected vector type");
    // v2x16 is stored using st.b32
    toTypeWidth = 32;
  }

  unsigned int toType = getLdStRegType(ScalarVT);

  // Create the machine instruction DAG
  SDValue Chain = ST->getChain();
  SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
  SDValue BasePtr = ST->getBasePtr();
  SDValue Addr;
  SDValue Offset, Base;
  std::optional<unsigned> Opcode;
  MVT::SimpleValueType SourceVT =
      Value.getNode()->getSimpleValueType(0).SimpleTy;

  if (SelectDirectAddr(BasePtr, Addr)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
                             NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
                             NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Addr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRsi64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRsi(BasePtr.getNode(), BasePtr, Base, Offset)) {
    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
                             NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
                             NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else if (PointerSize == 64
                 ? SelectADDRri64(BasePtr.getNode(), BasePtr, Base, Offset)
                 : SelectADDRri(BasePtr.getNode(), BasePtr, Base, Offset)) {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
                          NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64,
                          NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
                               NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
                               NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
    if (!Opcode)
      return false;

    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     Base,
                     Offset,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  } else {
    if (PointerSize == 64)
      Opcode =
          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
                          NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
                          NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
    else
      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
                               NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
                               NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
    if (!Opcode)
      return false;
    SDValue Ops[] = {Value,
                     getI32Imm(isVolatile, dl),
                     getI32Imm(CodeAddrSpace, dl),
                     getI32Imm(vecType, dl),
                     getI32Imm(toType, dl),
                     getI32Imm(toTypeWidth, dl),
                     BasePtr,
                     Chain};
    NVPTXST = CurDAG->getMachineNode(*Opcode, dl, MVT::Other, Ops);
  }

  if (!NVPTXST)
    return false;

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(NVPTXST), {MemRef});
  ReplaceNode(N, NVPTXST);
  return true;
}
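
// Illustrative result: a plain i32 store to a shared-space pointer selects
// one of the ST_i32_* opcodes and prints as, e.g.,
//   st.shared.u32 [%rd1], %r1;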

bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
  SDValue Chain = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  SDValue Addr, Offset, Base;
  std::optional<unsigned> Opcode;
  SDLoc DL(N);
  SDNode *ST;
  EVT EltVT = Op1.getValueType();
  MemSDNode *MemSD = cast<MemSDNode>(N);
  EVT StoreVT = MemSD->getMemoryVT();

  // Address Space Setting
  unsigned CodeAddrSpace = getCodeAddrSpace(MemSD);
  if (CodeAddrSpace == NVPTX::PTXLdStInstCode::CONSTANT) {
    report_fatal_error("Cannot store to pointer that points to constant "
                       "memory space");
  }
  unsigned int PointerSize =
      CurDAG->getDataLayout().getPointerSizeInBits(MemSD->getAddressSpace());

  // Volatile Setting
  // - .volatile is only available for .global and .shared
  bool IsVolatile = MemSD->isVolatile();
  if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::SHARED &&
      CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
    IsVolatile = false;

  // Type Setting: toType + toTypeWidth
  // - for integer type, always use 'u'
  assert(StoreVT.isSimple() && "Store value is not simple");
  MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
  unsigned ToTypeWidth = ScalarVT.getSizeInBits();
  unsigned ToType = getLdStRegType(ScalarVT);

  SmallVector<SDValue, 12> StOps;
  SDValue N2;
  unsigned VecType;

  switch (N->getOpcode()) {
  case NVPTXISD::StoreV2:
    VecType = NVPTX::PTXLdStInstCode::V2;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    N2 = N->getOperand(3);
    break;
  case NVPTXISD::StoreV4:
    VecType = NVPTX::PTXLdStInstCode::V4;
    StOps.push_back(N->getOperand(1));
    StOps.push_back(N->getOperand(2));
    StOps.push_back(N->getOperand(3));
    StOps.push_back(N->getOperand(4));
    N2 = N->getOperand(5);
    break;
  default:
    return false;
  }

  // v8x16 is a special case. PTX doesn't have an st.v8.x16
  // instruction. Instead, we split the vector into v2x16 chunks and
  // store them with st.v4.b32.
  if (Isv2x16VT(EltVT)) {
    assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected store opcode.");
    EltVT = MVT::i32;
    VecType = NVPTX::PTXLdStInstCode::V2;
    ToTypeWidth = 32;
  }

  StOps.push_back(getI32Imm(IsVolatile, DL));
  StOps.push_back(getI32Imm(CodeAddrSpace, DL));
  StOps.push_back(getI32Imm(VecType, DL));
  StOps.push_back(getI32Imm(ToType, DL));
  StOps.push_back(getI32Imm(ToTypeWidth, DL));

  if (SelectDirectAddr(N2, Addr)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::StoreV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
                               NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
                               NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
      break;
    case NVPTXISD::StoreV4:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v4_avar, NVPTX::STV_i16_v4_avar,
                               NVPTX::STV_i32_v4_avar, std::nullopt,
                               NVPTX::STV_f32_v4_avar, std::nullopt);
      break;
    }
    StOps.push_back(Addr);
  } else if (PointerSize == 64 ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
                               : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
    switch (N->getOpcode()) {
    default:
      return false;
    case NVPTXISD::StoreV2:
      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                               NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
                               NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
                               NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
      break;
    case NVPTXISD::StoreV4:
      Opcode =
          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
                          NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi,
                          std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
      break;
    }
    StOps.push_back(Base);
    StOps.push_back(Offset);
  } else if (PointerSize == 64 ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
                               : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                            NVPTX::STV_i8_v2_ari_64, NVPTX::STV_i16_v2_ari_64,
                            NVPTX::STV_i32_v2_ari_64, NVPTX::STV_i64_v2_ari_64,
                            NVPTX::STV_f32_v2_ari_64, NVPTX::STV_f64_v2_ari_64);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
            NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, std::nullopt,
            NVPTX::STV_f32_v4_ari_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
                                 NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
                                 NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
                                 NVPTX::STV_i8_v4_ari, NVPTX::STV_i16_v4_ari,
                                 NVPTX::STV_i32_v4_ari, std::nullopt,
                                 NVPTX::STV_f32_v4_ari, std::nullopt);
        break;
      }
    }
    StOps.push_back(Base);
    StOps.push_back(Offset);
  } else {
    if (PointerSize == 64) {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
            NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
            NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
            NVPTX::STV_f64_v2_areg_64);
        break;
      case NVPTXISD::StoreV4:
        Opcode = pickOpcodeForVT(
            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
            NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, std::nullopt,
            NVPTX::STV_f32_v4_areg_64, std::nullopt);
        break;
      }
    } else {
      switch (N->getOpcode()) {
      default:
        return false;
      case NVPTXISD::StoreV2:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
                            NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
                            NVPTX::STV_i64_v2_areg, NVPTX::STV_f32_v2_areg,
                            NVPTX::STV_f64_v2_areg);
        break;
      case NVPTXISD::StoreV4:
        Opcode =
            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
                            NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg,
                            std::nullopt, NVPTX::STV_f32_v4_areg, std::nullopt);
        break;
      }
    }
    StOps.push_back(N2);
  }

  if (!Opcode)
    return false;

  StOps.push_back(Chain);

  ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, StOps);

  MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
  CurDAG->setNodeMemRefs(cast<MachineSDNode>(ST), {MemRef});

  ReplaceNode(N, ST);
  return true;
}
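
// Illustrative result: a NVPTXISD::StoreV2 of two i32 values to global
// memory selects one of the STV_i32_v2_* opcodes and prints as, e.g.,
//   st.global.v2.u32 [%rd1], {%r1, %r2};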

bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
  SDValue Chain = Node->getOperand(0);
  SDValue Offset = Node->getOperand(2);
  SDValue Glue = Node->getOperand(3);
  SDLoc DL(Node);
  MemSDNode *Mem = cast<MemSDNode>(Node);

  unsigned VecSize;
  switch (Node->getOpcode()) {
  default:
    return false;
  case NVPTXISD::LoadParam:
    VecSize = 1;
    break;
  case NVPTXISD::LoadParamV2:
    VecSize = 2;
    break;
  case NVPTXISD::LoadParamV4:
    VecSize = 4;
    break;
  }

  EVT EltVT = Node->getValueType(0);
  EVT MemVT = Mem->getMemoryVT();

  std::optional<unsigned> Opcode;

  switch (VecSize) {
  default:
    return false;
  case 1:
    Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
                             NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
                             NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
                             NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
    break;
  case 2:
    Opcode =
        pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
                        NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
                        NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F32,
                        NVPTX::LoadParamMemV2F64);
    break;
  case 4:
    Opcode =
        pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
                        NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32,
                        std::nullopt, NVPTX::LoadParamMemV4F32, std::nullopt);
    break;
  }
  if (!Opcode)
    return false;

  SDVTList VTs;
  if (VecSize == 1) {
    VTs = CurDAG->getVTList(EltVT, MVT::Other, MVT::Glue);
  } else if (VecSize == 2) {
    VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
  } else {
    EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
    VTs = CurDAG->getVTList(EVTs);
  }

  unsigned OffsetVal = Offset->getAsZExtVal();

  SmallVector<SDValue, 8> Ops;
  Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
  Ops.push_back(Chain);
  Ops.push_back(Glue);

  ReplaceNode(Node, CurDAG->getMachineNode(*Opcode, DL, VTs, Ops));
  return true;
}
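
// Illustrative result: reading back a scalar i32 call result selects
// NVPTX::LoadParamMemI32, which prints as roughly
//   ld.param.b32 %r1, [retval0+0];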
2107
2108bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
2109 SDLoc DL(N);
2110 SDValue Chain = N->getOperand(0);
2111 SDValue Offset = N->getOperand(1);
2112 unsigned OffsetVal = Offset->getAsZExtVal();
2113 MemSDNode *Mem = cast<MemSDNode>(N);
2114
2115 // How many elements do we have?
2116 unsigned NumElts = 1;
2117 switch (N->getOpcode()) {
2118 default:
2119 return false;
2120 case NVPTXISD::StoreRetval:
2121 NumElts = 1;
2122 break;
2123 case NVPTXISD::StoreRetvalV2:
2124 NumElts = 2;
2125 break;
2126 case NVPTXISD::StoreRetvalV4:
2127 NumElts = 4;
2128 break;
2129 }
2130
2131 // Build vector of operands
2132 SmallVector<SDValue, 6> Ops;
2133 for (unsigned i = 0; i < NumElts; ++i)
2134 Ops.push_back(N->getOperand(i + 2));
2135 Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2136 Ops.push_back(Chain);
2137
2138 // Determine target opcode
2139 // If we have an i1, use an 8-bit store. The lowering code in
2140 // NVPTXISelLowering will have already emitted an upcast.
2141 std::optional<unsigned> Opcode = 0;
2142 switch (NumElts) {
2143 default:
2144 return false;
2145 case 1:
2146 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2147 NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
2148 NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
2149 NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
2150 if (Opcode == NVPTX::StoreRetvalI8) {
2151 // Fine tune the opcode depending on the size of the operand.
2152 // This helps to avoid creating redundant COPY instructions in
2153 // InstrEmitter::AddRegisterOperand().
2154 switch (Ops[0].getSimpleValueType().SimpleTy) {
2155 default:
2156 break;
2157 case MVT::i32:
2158 Opcode = NVPTX::StoreRetvalI8TruncI32;
2159 break;
2160 case MVT::i64:
2161 Opcode = NVPTX::StoreRetvalI8TruncI64;
2162 break;
2163 }
2164 }
2165 break;
2166 case 2:
2167 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2168 NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
2169 NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
2170 NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
2171 break;
2172 case 4:
2173 Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
2174 NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
2175 NVPTX::StoreRetvalV4I32, std::nullopt,
2176 NVPTX::StoreRetvalV4F32, std::nullopt);
2177 break;
2178 }
2179 if (!Opcode)
2180 return false;
2181
2182 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
2183 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2184 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2185
2186 ReplaceNode(N, Ret);
2187 return true;
2188}
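// Example of the opcode fine-tuning above: an i1 return value is widened to
// i32 by lowering but still stored with an i8 memory type, so pickOpcodeForVT
// first yields StoreRetvalI8 and the register operand's type then refines it:
//
//   // Ops[0].getSimpleValueType() == MVT::i32
//   //   => Opcode becomes NVPTX::StoreRetvalI8TruncI32, avoiding a redundant
//   //      COPY in InstrEmitter::AddRegisterOperand().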
2189
2190 // Helpers for constructing an opcode (e.g. NVPTX::StoreParamV4F32_iiri)
2191#define getOpcV2H(ty, opKind0, opKind1) \
2192 NVPTX::StoreParamV2##ty##_##opKind0##opKind1
2193
2194#define getOpcV2H1(ty, opKind0, isImm1) \
2195 (isImm1) ? getOpcV2H(ty, opKind0, i) : getOpcV2H(ty, opKind0, r)
2196
2197#define getOpcodeForVectorStParamV2(ty, isimm) \
2198 (isimm[0]) ? getOpcV2H1(ty, i, isimm[1]) : getOpcV2H1(ty, r, isimm[1])
2199
2200#define getOpcV4H(ty, opKind0, opKind1, opKind2, opKind3) \
2201 NVPTX::StoreParamV4##ty##_##opKind0##opKind1##opKind2##opKind3
2202
2203#define getOpcV4H3(ty, opKind0, opKind1, opKind2, isImm3) \
2204 (isImm3) ? getOpcV4H(ty, opKind0, opKind1, opKind2, i) \
2205 : getOpcV4H(ty, opKind0, opKind1, opKind2, r)
2206
2207#define getOpcV4H2(ty, opKind0, opKind1, isImm2, isImm3) \
2208 (isImm2) ? getOpcV4H3(ty, opKind0, opKind1, i, isImm3) \
2209 : getOpcV4H3(ty, opKind0, opKind1, r, isImm3)
2210
2211#define getOpcV4H1(ty, opKind0, isImm1, isImm2, isImm3) \
2212 (isImm1) ? getOpcV4H2(ty, opKind0, i, isImm2, isImm3) \
2213 : getOpcV4H2(ty, opKind0, r, isImm2, isImm3)
2214
2215#define getOpcodeForVectorStParamV4(ty, isimm) \
2216 (isimm[0]) ? getOpcV4H1(ty, i, isimm[1], isimm[2], isimm[3]) \
2217 : getOpcV4H1(ty, r, isimm[1], isimm[2], isimm[3])
2218
2219#define getOpcodeForVectorStParam(n, ty, isimm) \
2220 (n == 2) ? getOpcodeForVectorStParamV2(ty, isimm) \
2221 : getOpcodeForVectorStParamV4(ty, isimm)
2222
2223 static unsigned pickOpcodeForVectorStParam(SmallVector<SDValue, 8> &Ops,
2224 unsigned NumElts,
2225 MVT::SimpleValueType MemTy,
2226 SelectionDAG *CurDAG, SDLoc DL) {
2227 // Determine which inputs are registers and which are immediates, and make
2228 // new operators with constant values for the immediates
2229 SmallVector<bool, 4> IsImm(NumElts, false);
2230 for (unsigned i = 0; i < NumElts; i++) {
2231 IsImm[i] = (isa<ConstantSDNode>(Ops[i]) || isa<ConstantFPSDNode>(Ops[i]));
2232 if (IsImm[i]) {
2233 SDValue Imm = Ops[i];
2234 if (MemTy == MVT::f32 || MemTy == MVT::f64) {
2235 const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
2236 const ConstantFP *CF = ConstImm->getConstantFPValue();
2237 Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
2238 } else {
2239 const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
2240 const ConstantInt *CI = ConstImm->getConstantIntValue();
2241 Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
2242 }
2243 Ops[i] = Imm;
2244 }
2245 }
2246
2247 // Get opcode for MemTy, size, and register/immediate operand ordering
2248 switch (MemTy) {
2249 case MVT::i8:
2250 return getOpcodeForVectorStParam(NumElts, I8, IsImm);
2251 case MVT::i16:
2252 return getOpcodeForVectorStParam(NumElts, I16, IsImm);
2253 case MVT::i32:
2254 return getOpcodeForVectorStParam(NumElts, I32, IsImm);
2255 case MVT::i64:
2256 assert(NumElts == 2 && "MVT too large for NumElts > 2");
2257 return getOpcodeForVectorStParamV2(I64, IsImm);
2258 case MVT::f32:
2259 return getOpcodeForVectorStParam(NumElts, F32, IsImm);
2260 case MVT::f64:
2261 assert(NumElts == 2 && "MVT too large for NumElts > 2");
2262 return getOpcodeForVectorStParamV2(F64, IsImm);
2263
2264 // These cases don't support immediates; just use the all-register version
2265 // and generate moves.
2266 case MVT::i1:
2267 return (NumElts == 2) ? NVPTX::StoreParamV2I8_rr
2268 : NVPTX::StoreParamV4I8_rrrr;
2269 case MVT::f16:
2270 case MVT::bf16:
2271 return (NumElts == 2) ? NVPTX::StoreParamV2I16_rr
2272 : NVPTX::StoreParamV4I16_rrrr;
2273 case MVT::v2f16:
2274 case MVT::v2bf16:
2275 case MVT::v2i16:
2276 case MVT::v4i8:
2277 return (NumElts == 2) ? NVPTX::StoreParamV2I32_rr
2278 : NVPTX::StoreParamV4I32_rrrr;
2279 default:
2280 llvm_unreachable("Cannot select st.param for unknown MemTy");
2281 }
2282}
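// Usage sketch for the helper above: a v2 f32 st.param whose first element is
// the constant 3.0f and whose second is a register gets the mixed
// immediate/register opcode, with Ops[0] rewritten to a target constant:
//
//   unsigned Opc = pickOpcodeForVectorStParam(Ops, 2, MVT::f32, CurDAG, DL);
//   // Opc == NVPTX::StoreParamV2F32_ir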
2283
2284bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
2285 SDLoc DL(N);
2286 SDValue Chain = N->getOperand(0);
2287 SDValue Param = N->getOperand(1);
2288 unsigned ParamVal = Param->getAsZExtVal();
2289 SDValue Offset = N->getOperand(2);
2290 unsigned OffsetVal = Offset->getAsZExtVal();
2291 MemSDNode *Mem = cast<MemSDNode>(N);
2292 SDValue Glue = N->getOperand(N->getNumOperands() - 1);
2293
2294 // How many elements do we have?
2295 unsigned NumElts;
2296 switch (N->getOpcode()) {
2297 default:
2298 llvm_unreachable("Unexpected opcode");
2299 case NVPTXISD::StoreParamU32:
2300 case NVPTXISD::StoreParamS32:
2301 case NVPTXISD::StoreParam:
2302 NumElts = 1;
2303 break;
2304 case NVPTXISD::StoreParamV2:
2305 NumElts = 2;
2306 break;
2307 case NVPTXISD::StoreParamV4:
2308 NumElts = 4;
2309 break;
2310 }
2311
2312 // Build vector of operands
2313 SmallVector<SDValue, 8> Ops;
2314 for (unsigned i = 0; i < NumElts; ++i)
2315 Ops.push_back(N->getOperand(i + 3));
2316 Ops.push_back(CurDAG->getTargetConstant(ParamVal, DL, MVT::i32));
2317 Ops.push_back(CurDAG->getTargetConstant(OffsetVal, DL, MVT::i32));
2318 Ops.push_back(Chain);
2319 Ops.push_back(Glue);
2320
2321 // Determine target opcode
2322 // If we have an i1, use an 8-bit store. The lowering code in
2323 // NVPTXISelLowering will have already emitted an upcast.
2324 std::optional<unsigned> Opcode;
2325 switch (N->getOpcode()) {
2326 default:
2327 switch (NumElts) {
2328 default:
2329 llvm_unreachable("Unexpected NumElts");
2330 case 1: {
2331 MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
2332 SDValue Imm = Ops[0];
2333 if (MemTy != MVT::f16 && MemTy != MVT::v2f16 &&
2334 (isa<ConstantSDNode>(Imm) || isa<ConstantFPSDNode>(Imm))) {
2335 // Convert immediate to target constant
2336 if (MemTy == MVT::f32 || MemTy == MVT::f64) {
2337 const ConstantFPSDNode *ConstImm = cast<ConstantFPSDNode>(Imm);
2338 const ConstantFP *CF = ConstImm->getConstantFPValue();
2339 Imm = CurDAG->getTargetConstantFP(*CF, DL, Imm->getValueType(0));
2340 } else {
2341 const ConstantSDNode *ConstImm = cast<ConstantSDNode>(Imm);
2342 const ConstantInt *CI = ConstImm->getConstantIntValue();
2343 Imm = CurDAG->getTargetConstant(*CI, DL, Imm->getValueType(0));
2344 }
2345 Ops[0] = Imm;
2346 // Use immediate version of store param
2347 Opcode = pickOpcodeForVT(MemTy, NVPTX::StoreParamI8_i,
2348 NVPTX::StoreParamI16_i, NVPTX::StoreParamI32_i,
2349 NVPTX::StoreParamI64_i, NVPTX::StoreParamF32_i,
2350 NVPTX::StoreParamF64_i);
2351 } else
2352 Opcode =
2353 pickOpcodeForVT(MemTy,
2354 NVPTX::StoreParamI8_r, NVPTX::StoreParamI16_r,
2355 NVPTX::StoreParamI32_r, NVPTX::StoreParamI64_r,
2356 NVPTX::StoreParamF32_r, NVPTX::StoreParamF64_r);
2357 if (Opcode == NVPTX::StoreParamI8_r) {
2358 // Fine tune the opcode depending on the size of the operand.
2359 // This helps to avoid creating redundant COPY instructions in
2360 // InstrEmitter::AddRegisterOperand().
2361 switch (Ops[0].getSimpleValueType().SimpleTy) {
2362 default:
2363 break;
2364 case MVT::i32:
2365 Opcode = NVPTX::StoreParamI8TruncI32_r;
2366 break;
2367 case MVT::i64:
2368 Opcode = NVPTX::StoreParamI8TruncI64_r;
2369 break;
2370 }
2371 }
2372 break;
2373 }
2374 case 2:
2375 case 4: {
2376 MVT::SimpleValueType MemTy = Mem->getMemoryVT().getSimpleVT().SimpleTy;
2377 Opcode = pickOpcodeForVectorStParam(Ops, NumElts, MemTy, CurDAG, DL);
2378 break;
2379 }
2380 }
2381 break;
2382 // Special case: if we have a sign-extend/zero-extend node, insert the
2383 // conversion instruction first, and use that as the value operand to
2384 // the selected StoreParam node.
2385 case NVPTXISD::StoreParamU32: {
2386 Opcode = NVPTX::StoreParamI32_r;
2387 SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2388 MVT::i32);
2389 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u32_u16, DL,
2390 MVT::i32, Ops[0], CvtNone);
2391 Ops[0] = SDValue(Cvt, 0);
2392 break;
2393 }
2394 case NVPTXISD::StoreParamS32: {
2395 Opcode = NVPTX::StoreParamI32_r;
2396 SDValue CvtNone = CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL,
2397 MVT::i32);
2398 SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_s32_s16, DL,
2399 MVT::i32, Ops[0], CvtNone);
2400 Ops[0] = SDValue(Cvt, 0);
2401 break;
2402 }
2403 }
2404
2405 SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
2406 SDNode *Ret = CurDAG->getMachineNode(*Opcode, DL, RetVTs, Ops);
2407 MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
2408 CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ret), {MemRef});
2409
2410 ReplaceNode(N, Ret);
2411 return true;
2412}
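// Example of the extension special case above: a StoreParamU32 whose value
// operand is an i16 first zero-extends through cvt.u32.u16 and then stores
// the widened value:
//
//   // %tmp = CVT_u32_u16 %v, CvtNONE
//   // StoreParamI32_r %tmp, <param>, <offset>, chain, glue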
2413
2414bool NVPTXDAGToDAGISel::tryTextureIntrinsic(SDNode *N) {
2415 unsigned Opc = 0;
2416
2417 switch (N->getOpcode()) {
2418 default: return false;
2420 Opc = NVPTX::TEX_1D_F32_S32_RR;
2421 break;
2423 Opc = NVPTX::TEX_1D_F32_F32_RR;
2424 break;
2426 Opc = NVPTX::TEX_1D_F32_F32_LEVEL_RR;
2427 break;
2429 Opc = NVPTX::TEX_1D_F32_F32_GRAD_RR;
2430 break;
2432 Opc = NVPTX::TEX_1D_S32_S32_RR;
2433 break;
2435 Opc = NVPTX::TEX_1D_S32_F32_RR;
2436 break;
2438 Opc = NVPTX::TEX_1D_S32_F32_LEVEL_RR;
2439 break;
2441 Opc = NVPTX::TEX_1D_S32_F32_GRAD_RR;
2442 break;
2444 Opc = NVPTX::TEX_1D_U32_S32_RR;
2445 break;
2447 Opc = NVPTX::TEX_1D_U32_F32_RR;
2448 break;
2450 Opc = NVPTX::TEX_1D_U32_F32_LEVEL_RR;
2451 break;
2453 Opc = NVPTX::TEX_1D_U32_F32_GRAD_RR;
2454 break;
2456 Opc = NVPTX::TEX_1D_ARRAY_F32_S32_RR;
2457 break;
2459 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_RR;
2460 break;
2462 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL_RR;
2463 break;
2465 Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD_RR;
2466 break;
2468 Opc = NVPTX::TEX_1D_ARRAY_S32_S32_RR;
2469 break;
2471 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_RR;
2472 break;
2474 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_LEVEL_RR;
2475 break;
2477 Opc = NVPTX::TEX_1D_ARRAY_S32_F32_GRAD_RR;
2478 break;
2480 Opc = NVPTX::TEX_1D_ARRAY_U32_S32_RR;
2481 break;
2483 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_RR;
2484 break;
2486 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_LEVEL_RR;
2487 break;
2489 Opc = NVPTX::TEX_1D_ARRAY_U32_F32_GRAD_RR;
2490 break;
2492 Opc = NVPTX::TEX_2D_F32_S32_RR;
2493 break;
2495 Opc = NVPTX::TEX_2D_F32_F32_RR;
2496 break;
2498 Opc = NVPTX::TEX_2D_F32_F32_LEVEL_RR;
2499 break;
2501 Opc = NVPTX::TEX_2D_F32_F32_GRAD_RR;
2502 break;
2504 Opc = NVPTX::TEX_2D_S32_S32_RR;
2505 break;
2507 Opc = NVPTX::TEX_2D_S32_F32_RR;
2508 break;
2510 Opc = NVPTX::TEX_2D_S32_F32_LEVEL_RR;
2511 break;
2513 Opc = NVPTX::TEX_2D_S32_F32_GRAD_RR;
2514 break;
2516 Opc = NVPTX::TEX_2D_U32_S32_RR;
2517 break;
2519 Opc = NVPTX::TEX_2D_U32_F32_RR;
2520 break;
2522 Opc = NVPTX::TEX_2D_U32_F32_LEVEL_RR;
2523 break;
2525 Opc = NVPTX::TEX_2D_U32_F32_GRAD_RR;
2526 break;
2528 Opc = NVPTX::TEX_2D_ARRAY_F32_S32_RR;
2529 break;
2531 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_RR;
2532 break;
2534 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL_RR;
2535 break;
2537 Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD_RR;
2538 break;
2540 Opc = NVPTX::TEX_2D_ARRAY_S32_S32_RR;
2541 break;
2543 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_RR;
2544 break;
2546 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_LEVEL_RR;
2547 break;
2549 Opc = NVPTX::TEX_2D_ARRAY_S32_F32_GRAD_RR;
2550 break;
2552 Opc = NVPTX::TEX_2D_ARRAY_U32_S32_RR;
2553 break;
2555 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_RR;
2556 break;
2558 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_LEVEL_RR;
2559 break;
2561 Opc = NVPTX::TEX_2D_ARRAY_U32_F32_GRAD_RR;
2562 break;
2564 Opc = NVPTX::TEX_3D_F32_S32_RR;
2565 break;
2567 Opc = NVPTX::TEX_3D_F32_F32_RR;
2568 break;
2570 Opc = NVPTX::TEX_3D_F32_F32_LEVEL_RR;
2571 break;
2573 Opc = NVPTX::TEX_3D_F32_F32_GRAD_RR;
2574 break;
2576 Opc = NVPTX::TEX_3D_S32_S32_RR;
2577 break;
2579 Opc = NVPTX::TEX_3D_S32_F32_RR;
2580 break;
2582 Opc = NVPTX::TEX_3D_S32_F32_LEVEL_RR;
2583 break;
2585 Opc = NVPTX::TEX_3D_S32_F32_GRAD_RR;
2586 break;
2588 Opc = NVPTX::TEX_3D_U32_S32_RR;
2589 break;
2591 Opc = NVPTX::TEX_3D_U32_F32_RR;
2592 break;
2594 Opc = NVPTX::TEX_3D_U32_F32_LEVEL_RR;
2595 break;
2597 Opc = NVPTX::TEX_3D_U32_F32_GRAD_RR;
2598 break;
2600 Opc = NVPTX::TEX_CUBE_F32_F32_RR;
2601 break;
2603 Opc = NVPTX::TEX_CUBE_F32_F32_LEVEL_RR;
2604 break;
2606 Opc = NVPTX::TEX_CUBE_S32_F32_RR;
2607 break;
2609 Opc = NVPTX::TEX_CUBE_S32_F32_LEVEL_RR;
2610 break;
2612 Opc = NVPTX::TEX_CUBE_U32_F32_RR;
2613 break;
2615 Opc = NVPTX::TEX_CUBE_U32_F32_LEVEL_RR;
2616 break;
2618 Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_RR;
2619 break;
2621 Opc = NVPTX::TEX_CUBE_ARRAY_F32_F32_LEVEL_RR;
2622 break;
2624 Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_RR;
2625 break;
2627 Opc = NVPTX::TEX_CUBE_ARRAY_S32_F32_LEVEL_RR;
2628 break;
2630 Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_RR;
2631 break;
2633 Opc = NVPTX::TEX_CUBE_ARRAY_U32_F32_LEVEL_RR;
2634 break;
2636 Opc = NVPTX::TLD4_R_2D_F32_F32_RR;
2637 break;
2639 Opc = NVPTX::TLD4_G_2D_F32_F32_RR;
2640 break;
2642 Opc = NVPTX::TLD4_B_2D_F32_F32_RR;
2643 break;
2645 Opc = NVPTX::TLD4_A_2D_F32_F32_RR;
2646 break;
2648 Opc = NVPTX::TLD4_R_2D_S32_F32_RR;
2649 break;
2651 Opc = NVPTX::TLD4_G_2D_S32_F32_RR;
2652 break;
2654 Opc = NVPTX::TLD4_B_2D_S32_F32_RR;
2655 break;
2657 Opc = NVPTX::TLD4_A_2D_S32_F32_RR;
2658 break;
2660 Opc = NVPTX::TLD4_R_2D_U32_F32_RR;
2661 break;
2663 Opc = NVPTX::TLD4_G_2D_U32_F32_RR;
2664 break;
2666 Opc = NVPTX::TLD4_B_2D_U32_F32_RR;
2667 break;
2669 Opc = NVPTX::TLD4_A_2D_U32_F32_RR;
2670 break;
2672 Opc = NVPTX::TEX_UNIFIED_1D_F32_S32_R;
2673 break;
2675 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_R;
2676 break;
2678 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_LEVEL_R;
2679 break;
2681 Opc = NVPTX::TEX_UNIFIED_1D_F32_F32_GRAD_R;
2682 break;
2684 Opc = NVPTX::TEX_UNIFIED_1D_S32_S32_R;
2685 break;
2687 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_R;
2688 break;
2690 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_LEVEL_R;
2691 break;
2693 Opc = NVPTX::TEX_UNIFIED_1D_S32_F32_GRAD_R;
2694 break;
2696 Opc = NVPTX::TEX_UNIFIED_1D_U32_S32_R;
2697 break;
2699 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_R;
2700 break;
2702 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_LEVEL_R;
2703 break;
2705 Opc = NVPTX::TEX_UNIFIED_1D_U32_F32_GRAD_R;
2706 break;
2708 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_S32_R;
2709 break;
2711 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_R;
2712 break;
2714 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_LEVEL_R;
2715 break;
2717 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_F32_F32_GRAD_R;
2718 break;
2720 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_S32_R;
2721 break;
2723 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_R;
2724 break;
2726 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_LEVEL_R;
2727 break;
2729 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_S32_F32_GRAD_R;
2730 break;
2732 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_S32_R;
2733 break;
2735 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_R;
2736 break;
2738 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_LEVEL_R;
2739 break;
2741 Opc = NVPTX::TEX_UNIFIED_1D_ARRAY_U32_F32_GRAD_R;
2742 break;
2744 Opc = NVPTX::TEX_UNIFIED_2D_F32_S32_R;
2745 break;
2747 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_R;
2748 break;
2750 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_LEVEL_R;
2751 break;
2753 Opc = NVPTX::TEX_UNIFIED_2D_F32_F32_GRAD_R;
2754 break;
2756 Opc = NVPTX::TEX_UNIFIED_2D_S32_S32_R;
2757 break;
2759 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_R;
2760 break;
2762 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_LEVEL_R;
2763 break;
2765 Opc = NVPTX::TEX_UNIFIED_2D_S32_F32_GRAD_R;
2766 break;
2768 Opc = NVPTX::TEX_UNIFIED_2D_U32_S32_R;
2769 break;
2771 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_R;
2772 break;
2774 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_LEVEL_R;
2775 break;
2777 Opc = NVPTX::TEX_UNIFIED_2D_U32_F32_GRAD_R;
2778 break;
2780 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_S32_R;
2781 break;
2783 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_R;
2784 break;
2786 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_LEVEL_R;
2787 break;
2789 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_F32_F32_GRAD_R;
2790 break;
2792 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_S32_R;
2793 break;
2795 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_R;
2796 break;
2798 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_LEVEL_R;
2799 break;
2801 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_S32_F32_GRAD_R;
2802 break;
2804 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_S32_R;
2805 break;
2807 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_R;
2808 break;
2810 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_LEVEL_R;
2811 break;
2813 Opc = NVPTX::TEX_UNIFIED_2D_ARRAY_U32_F32_GRAD_R;
2814 break;
2816 Opc = NVPTX::TEX_UNIFIED_3D_F32_S32_R;
2817 break;
2819 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_R;
2820 break;
2822 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_LEVEL_R;
2823 break;
2825 Opc = NVPTX::TEX_UNIFIED_3D_F32_F32_GRAD_R;
2826 break;
2828 Opc = NVPTX::TEX_UNIFIED_3D_S32_S32_R;
2829 break;
2831 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_R;
2832 break;
2834 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_LEVEL_R;
2835 break;
2837 Opc = NVPTX::TEX_UNIFIED_3D_S32_F32_GRAD_R;
2838 break;
2840 Opc = NVPTX::TEX_UNIFIED_3D_U32_S32_R;
2841 break;
2843 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_R;
2844 break;
2846 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_LEVEL_R;
2847 break;
2849 Opc = NVPTX::TEX_UNIFIED_3D_U32_F32_GRAD_R;
2850 break;
2852 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_R;
2853 break;
2855 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_LEVEL_R;
2856 break;
2858 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_R;
2859 break;
2861 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_LEVEL_R;
2862 break;
2864 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_R;
2865 break;
2867 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_LEVEL_R;
2868 break;
2870 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_R;
2871 break;
2873 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_LEVEL_R;
2874 break;
2876 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_R;
2877 break;
2879 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_LEVEL_R;
2880 break;
2882 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_R;
2883 break;
2885 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_LEVEL_R;
2886 break;
2888 Opc = NVPTX::TLD4_UNIFIED_R_2D_F32_F32_R;
2889 break;
2891 Opc = NVPTX::TLD4_UNIFIED_G_2D_F32_F32_R;
2892 break;
2894 Opc = NVPTX::TLD4_UNIFIED_B_2D_F32_F32_R;
2895 break;
2897 Opc = NVPTX::TLD4_UNIFIED_A_2D_F32_F32_R;
2898 break;
2900 Opc = NVPTX::TLD4_UNIFIED_R_2D_S32_F32_R;
2901 break;
2903 Opc = NVPTX::TLD4_UNIFIED_G_2D_S32_F32_R;
2904 break;
2906 Opc = NVPTX::TLD4_UNIFIED_B_2D_S32_F32_R;
2907 break;
2909 Opc = NVPTX::TLD4_UNIFIED_A_2D_S32_F32_R;
2910 break;
2912 Opc = NVPTX::TLD4_UNIFIED_R_2D_U32_F32_R;
2913 break;
2915 Opc = NVPTX::TLD4_UNIFIED_G_2D_U32_F32_R;
2916 break;
2918 Opc = NVPTX::TLD4_UNIFIED_B_2D_U32_F32_R;
2919 break;
2921 Opc = NVPTX::TLD4_UNIFIED_A_2D_U32_F32_R;
2922 break;
2924 Opc = NVPTX::TEX_UNIFIED_CUBE_F32_F32_GRAD_R;
2925 break;
2927 Opc = NVPTX::TEX_UNIFIED_CUBE_S32_F32_GRAD_R;
2928 break;
2930 Opc = NVPTX::TEX_UNIFIED_CUBE_U32_F32_GRAD_R;
2931 break;
2933 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_F32_F32_GRAD_R;
2934 break;
2936 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_S32_F32_GRAD_R;
2937 break;
2939 Opc = NVPTX::TEX_UNIFIED_CUBE_ARRAY_U32_F32_GRAD_R;
2940 break;
2941 }
2942
2943 // Copy over operands
2944 SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
2945 Ops.push_back(N->getOperand(0)); // Move chain to the back.
2946
2947 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
2948 return true;
2949}
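// Note on the operand shuffle above: the intrinsic node carries its chain as
// operand 0, but the texture machine instructions expect it last, so a node
// shaped (chain, texref, coords...) is selected with the operand list
// (texref, coords..., chain).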
2950
2951bool NVPTXDAGToDAGISel::trySurfaceIntrinsic(SDNode *N) {
2952 unsigned Opc = 0;
2953 switch (N->getOpcode()) {
2954 default: return false;
2956 Opc = NVPTX::SULD_1D_I8_CLAMP_R;
2957 break;
2959 Opc = NVPTX::SULD_1D_I16_CLAMP_R;
2960 break;
2962 Opc = NVPTX::SULD_1D_I32_CLAMP_R;
2963 break;
2965 Opc = NVPTX::SULD_1D_I64_CLAMP_R;
2966 break;
2968 Opc = NVPTX::SULD_1D_V2I8_CLAMP_R;
2969 break;
2971 Opc = NVPTX::SULD_1D_V2I16_CLAMP_R;
2972 break;
2974 Opc = NVPTX::SULD_1D_V2I32_CLAMP_R;
2975 break;
2977 Opc = NVPTX::SULD_1D_V2I64_CLAMP_R;
2978 break;
2980 Opc = NVPTX::SULD_1D_V4I8_CLAMP_R;
2981 break;
2983 Opc = NVPTX::SULD_1D_V4I16_CLAMP_R;
2984 break;
2986 Opc = NVPTX::SULD_1D_V4I32_CLAMP_R;
2987 break;
2989 Opc = NVPTX::SULD_1D_ARRAY_I8_CLAMP_R;
2990 break;
2992 Opc = NVPTX::SULD_1D_ARRAY_I16_CLAMP_R;
2993 break;
2995 Opc = NVPTX::SULD_1D_ARRAY_I32_CLAMP_R;
2996 break;
2998 Opc = NVPTX::SULD_1D_ARRAY_I64_CLAMP_R;
2999 break;
3001 Opc = NVPTX::SULD_1D_ARRAY_V2I8_CLAMP_R;
3002 break;
3004 Opc = NVPTX::SULD_1D_ARRAY_V2I16_CLAMP_R;
3005 break;
3007 Opc = NVPTX::SULD_1D_ARRAY_V2I32_CLAMP_R;
3008 break;
3010 Opc = NVPTX::SULD_1D_ARRAY_V2I64_CLAMP_R;
3011 break;
3013 Opc = NVPTX::SULD_1D_ARRAY_V4I8_CLAMP_R;
3014 break;
3016 Opc = NVPTX::SULD_1D_ARRAY_V4I16_CLAMP_R;
3017 break;
3019 Opc = NVPTX::SULD_1D_ARRAY_V4I32_CLAMP_R;
3020 break;
3022 Opc = NVPTX::SULD_2D_I8_CLAMP_R;
3023 break;
3025 Opc = NVPTX::SULD_2D_I16_CLAMP_R;
3026 break;
3028 Opc = NVPTX::SULD_2D_I32_CLAMP_R;
3029 break;
3031 Opc = NVPTX::SULD_2D_I64_CLAMP_R;
3032 break;
3034 Opc = NVPTX::SULD_2D_V2I8_CLAMP_R;
3035 break;
3037 Opc = NVPTX::SULD_2D_V2I16_CLAMP_R;
3038 break;
3040 Opc = NVPTX::SULD_2D_V2I32_CLAMP_R;
3041 break;
3043 Opc = NVPTX::SULD_2D_V2I64_CLAMP_R;
3044 break;
3046 Opc = NVPTX::SULD_2D_V4I8_CLAMP_R;
3047 break;
3049 Opc = NVPTX::SULD_2D_V4I16_CLAMP_R;
3050 break;
3052 Opc = NVPTX::SULD_2D_V4I32_CLAMP_R;
3053 break;
3055 Opc = NVPTX::SULD_2D_ARRAY_I8_CLAMP_R;
3056 break;
3058 Opc = NVPTX::SULD_2D_ARRAY_I16_CLAMP_R;
3059 break;
3061 Opc = NVPTX::SULD_2D_ARRAY_I32_CLAMP_R;
3062 break;
3064 Opc = NVPTX::SULD_2D_ARRAY_I64_CLAMP_R;
3065 break;
3067 Opc = NVPTX::SULD_2D_ARRAY_V2I8_CLAMP_R;
3068 break;
3070 Opc = NVPTX::SULD_2D_ARRAY_V2I16_CLAMP_R;
3071 break;
3073 Opc = NVPTX::SULD_2D_ARRAY_V2I32_CLAMP_R;
3074 break;
3076 Opc = NVPTX::SULD_2D_ARRAY_V2I64_CLAMP_R;
3077 break;
3079 Opc = NVPTX::SULD_2D_ARRAY_V4I8_CLAMP_R;
3080 break;
3082 Opc = NVPTX::SULD_2D_ARRAY_V4I16_CLAMP_R;
3083 break;
3085 Opc = NVPTX::SULD_2D_ARRAY_V4I32_CLAMP_R;
3086 break;
3088 Opc = NVPTX::SULD_3D_I8_CLAMP_R;
3089 break;
3091 Opc = NVPTX::SULD_3D_I16_CLAMP_R;
3092 break;
3094 Opc = NVPTX::SULD_3D_I32_CLAMP_R;
3095 break;
3097 Opc = NVPTX::SULD_3D_I64_CLAMP_R;
3098 break;
3100 Opc = NVPTX::SULD_3D_V2I8_CLAMP_R;
3101 break;
3103 Opc = NVPTX::SULD_3D_V2I16_CLAMP_R;
3104 break;
3106 Opc = NVPTX::SULD_3D_V2I32_CLAMP_R;
3107 break;
3109 Opc = NVPTX::SULD_3D_V2I64_CLAMP_R;
3110 break;
3112 Opc = NVPTX::SULD_3D_V4I8_CLAMP_R;
3113 break;
3115 Opc = NVPTX::SULD_3D_V4I16_CLAMP_R;
3116 break;
3118 Opc = NVPTX::SULD_3D_V4I32_CLAMP_R;
3119 break;
3121 Opc = NVPTX::SULD_1D_I8_TRAP_R;
3122 break;
3124 Opc = NVPTX::SULD_1D_I16_TRAP_R;
3125 break;
3127 Opc = NVPTX::SULD_1D_I32_TRAP_R;
3128 break;
3130 Opc = NVPTX::SULD_1D_I64_TRAP_R;
3131 break;
3133 Opc = NVPTX::SULD_1D_V2I8_TRAP_R;
3134 break;
3136 Opc = NVPTX::SULD_1D_V2I16_TRAP_R;
3137 break;
3139 Opc = NVPTX::SULD_1D_V2I32_TRAP_R;
3140 break;
3142 Opc = NVPTX::SULD_1D_V2I64_TRAP_R;
3143 break;
3145 Opc = NVPTX::SULD_1D_V4I8_TRAP_R;
3146 break;
3148 Opc = NVPTX::SULD_1D_V4I16_TRAP_R;
3149 break;
3151 Opc = NVPTX::SULD_1D_V4I32_TRAP_R;
3152 break;
3154 Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP_R;
3155 break;
3157 Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP_R;
3158 break;
3160 Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP_R;
3161 break;
3163 Opc = NVPTX::SULD_1D_ARRAY_I64_TRAP_R;
3164 break;
3166 Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP_R;
3167 break;
3169 Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP_R;
3170 break;
3172 Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP_R;
3173 break;
3175 Opc = NVPTX::SULD_1D_ARRAY_V2I64_TRAP_R;
3176 break;
3178 Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP_R;
3179 break;
3181 Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP_R;
3182 break;
3184 Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP_R;
3185 break;
3187 Opc = NVPTX::SULD_2D_I8_TRAP_R;
3188 break;
3190 Opc = NVPTX::SULD_2D_I16_TRAP_R;
3191 break;
3193 Opc = NVPTX::SULD_2D_I32_TRAP_R;
3194 break;
3196 Opc = NVPTX::SULD_2D_I64_TRAP_R;
3197 break;
3199 Opc = NVPTX::SULD_2D_V2I8_TRAP_R;
3200 break;
3202 Opc = NVPTX::SULD_2D_V2I16_TRAP_R;
3203 break;
3205 Opc = NVPTX::SULD_2D_V2I32_TRAP_R;
3206 break;
3208 Opc = NVPTX::SULD_2D_V2I64_TRAP_R;
3209 break;
3211 Opc = NVPTX::SULD_2D_V4I8_TRAP_R;
3212 break;
3214 Opc = NVPTX::SULD_2D_V4I16_TRAP_R;
3215 break;
3217 Opc = NVPTX::SULD_2D_V4I32_TRAP_R;
3218 break;
3220 Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP_R;
3221 break;
3223 Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP_R;
3224 break;
3226 Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP_R;
3227 break;
3229 Opc = NVPTX::SULD_2D_ARRAY_I64_TRAP_R;
3230 break;
3232 Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP_R;
3233 break;
3235 Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP_R;
3236 break;
3238 Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP_R;
3239 break;
3241 Opc = NVPTX::SULD_2D_ARRAY_V2I64_TRAP_R;
3242 break;
3244 Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP_R;
3245 break;
3247 Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP_R;
3248 break;
3250 Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP_R;
3251 break;
3253 Opc = NVPTX::SULD_3D_I8_TRAP_R;
3254 break;
3256 Opc = NVPTX::SULD_3D_I16_TRAP_R;
3257 break;
3259 Opc = NVPTX::SULD_3D_I32_TRAP_R;
3260 break;
3262 Opc = NVPTX::SULD_3D_I64_TRAP_R;
3263 break;
3265 Opc = NVPTX::SULD_3D_V2I8_TRAP_R;
3266 break;
3268 Opc = NVPTX::SULD_3D_V2I16_TRAP_R;
3269 break;
3271 Opc = NVPTX::SULD_3D_V2I32_TRAP_R;
3272 break;
3274 Opc = NVPTX::SULD_3D_V2I64_TRAP_R;
3275 break;
3277 Opc = NVPTX::SULD_3D_V4I8_TRAP_R;
3278 break;
3280 Opc = NVPTX::SULD_3D_V4I16_TRAP_R;
3281 break;
3283 Opc = NVPTX::SULD_3D_V4I32_TRAP_R;
3284 break;
3286 Opc = NVPTX::SULD_1D_I8_ZERO_R;
3287 break;
3289 Opc = NVPTX::SULD_1D_I16_ZERO_R;
3290 break;
3292 Opc = NVPTX::SULD_1D_I32_ZERO_R;
3293 break;
3295 Opc = NVPTX::SULD_1D_I64_ZERO_R;
3296 break;
3298 Opc = NVPTX::SULD_1D_V2I8_ZERO_R;
3299 break;
3301 Opc = NVPTX::SULD_1D_V2I16_ZERO_R;
3302 break;
3304 Opc = NVPTX::SULD_1D_V2I32_ZERO_R;
3305 break;
3307 Opc = NVPTX::SULD_1D_V2I64_ZERO_R;
3308 break;
3310 Opc = NVPTX::SULD_1D_V4I8_ZERO_R;
3311 break;
3313 Opc = NVPTX::SULD_1D_V4I16_ZERO_R;
3314 break;
3316 Opc = NVPTX::SULD_1D_V4I32_ZERO_R;
3317 break;
3319 Opc = NVPTX::SULD_1D_ARRAY_I8_ZERO_R;
3320 break;
3322 Opc = NVPTX::SULD_1D_ARRAY_I16_ZERO_R;
3323 break;
3325 Opc = NVPTX::SULD_1D_ARRAY_I32_ZERO_R;
3326 break;
3328 Opc = NVPTX::SULD_1D_ARRAY_I64_ZERO_R;
3329 break;
3331 Opc = NVPTX::SULD_1D_ARRAY_V2I8_ZERO_R;
3332 break;
3334 Opc = NVPTX::SULD_1D_ARRAY_V2I16_ZERO_R;
3335 break;
3337 Opc = NVPTX::SULD_1D_ARRAY_V2I32_ZERO_R;
3338 break;
3340 Opc = NVPTX::SULD_1D_ARRAY_V2I64_ZERO_R;
3341 break;
3343 Opc = NVPTX::SULD_1D_ARRAY_V4I8_ZERO_R;
3344 break;
3346 Opc = NVPTX::SULD_1D_ARRAY_V4I16_ZERO_R;
3347 break;
3349 Opc = NVPTX::SULD_1D_ARRAY_V4I32_ZERO_R;
3350 break;
3352 Opc = NVPTX::SULD_2D_I8_ZERO_R;
3353 break;
3355 Opc = NVPTX::SULD_2D_I16_ZERO_R;
3356 break;
3358 Opc = NVPTX::SULD_2D_I32_ZERO_R;
3359 break;
3361 Opc = NVPTX::SULD_2D_I64_ZERO_R;
3362 break;
3364 Opc = NVPTX::SULD_2D_V2I8_ZERO_R;
3365 break;
3367 Opc = NVPTX::SULD_2D_V2I16_ZERO_R;
3368 break;
3370 Opc = NVPTX::SULD_2D_V2I32_ZERO_R;
3371 break;
3373 Opc = NVPTX::SULD_2D_V2I64_ZERO_R;
3374 break;
3376 Opc = NVPTX::SULD_2D_V4I8_ZERO_R;
3377 break;
3379 Opc = NVPTX::SULD_2D_V4I16_ZERO_R;
3380 break;
3382 Opc = NVPTX::SULD_2D_V4I32_ZERO_R;
3383 break;
3385 Opc = NVPTX::SULD_2D_ARRAY_I8_ZERO_R;
3386 break;
3388 Opc = NVPTX::SULD_2D_ARRAY_I16_ZERO_R;
3389 break;
3391 Opc = NVPTX::SULD_2D_ARRAY_I32_ZERO_R;
3392 break;
3394 Opc = NVPTX::SULD_2D_ARRAY_I64_ZERO_R;
3395 break;
3397 Opc = NVPTX::SULD_2D_ARRAY_V2I8_ZERO_R;
3398 break;
3400 Opc = NVPTX::SULD_2D_ARRAY_V2I16_ZERO_R;
3401 break;
3403 Opc = NVPTX::SULD_2D_ARRAY_V2I32_ZERO_R;
3404 break;
3406 Opc = NVPTX::SULD_2D_ARRAY_V2I64_ZERO_R;
3407 break;
3409 Opc = NVPTX::SULD_2D_ARRAY_V4I8_ZERO_R;
3410 break;
3412 Opc = NVPTX::SULD_2D_ARRAY_V4I16_ZERO_R;
3413 break;
3415 Opc = NVPTX::SULD_2D_ARRAY_V4I32_ZERO_R;
3416 break;
3418 Opc = NVPTX::SULD_3D_I8_ZERO_R;
3419 break;
3421 Opc = NVPTX::SULD_3D_I16_ZERO_R;
3422 break;
3424 Opc = NVPTX::SULD_3D_I32_ZERO_R;
3425 break;
3427 Opc = NVPTX::SULD_3D_I64_ZERO_R;
3428 break;
3430 Opc = NVPTX::SULD_3D_V2I8_ZERO_R;
3431 break;
3433 Opc = NVPTX::SULD_3D_V2I16_ZERO_R;
3434 break;
3436 Opc = NVPTX::SULD_3D_V2I32_ZERO_R;
3437 break;
3439 Opc = NVPTX::SULD_3D_V2I64_ZERO_R;
3440 break;
3442 Opc = NVPTX::SULD_3D_V4I8_ZERO_R;
3443 break;
3445 Opc = NVPTX::SULD_3D_V4I16_ZERO_R;
3446 break;
3448 Opc = NVPTX::SULD_3D_V4I32_ZERO_R;
3449 break;
3450 }
3451
3452 // Copy over operands
3453 SmallVector<SDValue, 8> Ops(drop_begin(N->ops()));
3454 Ops.push_back(N->getOperand(0)); // Move chain to the back.
3455
3456 ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops));
3457 return true;
3458}
3459
3460
3461 /// tryBFE - Look for instruction sequences that can be made more efficient
3462 /// by using the 'bfe' (bit-field extract) PTX instruction
3463bool NVPTXDAGToDAGISel::tryBFE(SDNode *N) {
3464 SDLoc DL(N);
3465 SDValue LHS = N->getOperand(0);
3466 SDValue RHS = N->getOperand(1);
3467 SDValue Len;
3468 SDValue Start;
3469 SDValue Val;
3470 bool IsSigned = false;
3471
3472 if (N->getOpcode() == ISD::AND) {
3473 // Canonicalize the operands
3474 // We want 'and %val, %mask'
3475 if (isa<ConstantSDNode>(LHS) && !isa<ConstantSDNode>(RHS)) {
3476 std::swap(LHS, RHS);
3477 }
3478
3479 ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS);
3480 if (!Mask) {
3481 // We need a constant mask on the RHS of the AND
3482 return false;
3483 }
3484
3485 // Extract the mask bits
3486 uint64_t MaskVal = Mask->getZExtValue();
3487 if (!isMask_64(MaskVal)) {
3488 // We *could* handle shifted masks here, but doing so would require an
3489 // 'and' operation to fix up the low-order bits so we would trade
3490 // shr+and for bfe+and, which has the same throughput
3491 return false;
3492 }
3493
3494 // How many bits are in our mask?
3495 int64_t NumBits = countr_one(MaskVal);
3496 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3497
3498 if (LHS.getOpcode() == ISD::SRL || LHS.getOpcode() == ISD::SRA) {
3499 // We have a 'srl/and' pair, extract the effective start bit and length
3500 Val = LHS.getNode()->getOperand(0);
3501 Start = LHS.getNode()->getOperand(1);
3502 ConstantSDNode *StartConst = dyn_cast<ConstantSDNode>(Start);
3503 if (StartConst) {
3504 uint64_t StartVal = StartConst->getZExtValue();
3505 // How many "good" bits do we have left? "good" is defined here as bits
3506 // that exist in the original value, not shifted in.
3507 int64_t GoodBits = Start.getValueSizeInBits() - StartVal;
3508 if (NumBits > GoodBits) {
3509 // Do not handle the case where bits have been shifted in. In theory
3510 // we could handle this, but the cost is likely higher than just
3511 // emitting the srl/and pair.
3512 return false;
3513 }
3514 Start = CurDAG->getTargetConstant(StartVal, DL, MVT::i32);
3515 } else {
3516 // Do not handle the case where the shift amount (can be zero if no srl
3517 // was found) is not constant. We could handle this case, but it would
3518 // require run-time logic that would be more expensive than just
3519 // emitting the srl/and pair.
3520 return false;
3521 }
3522 } else {
3523 // Do not handle the case where the LHS of the and is not a shift. While
3524 // it would be trivial to handle this case, it would just transform
3525 // 'and' -> 'bfe', but 'and' has higher throughput.
3526 return false;
3527 }
3528 } else if (N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) {
3529 if (LHS->getOpcode() == ISD::AND) {
3530 ConstantSDNode *ShiftCnst = dyn_cast<ConstantSDNode>(RHS);
3531 if (!ShiftCnst) {
3532 // Shift amount must be constant
3533 return false;
3534 }
3535
3536 uint64_t ShiftAmt = ShiftCnst->getZExtValue();
3537
3538 SDValue AndLHS = LHS->getOperand(0);
3539 SDValue AndRHS = LHS->getOperand(1);
3540
3541 // Canonicalize the AND to have the mask on the RHS
3542 if (isa<ConstantSDNode>(AndLHS)) {
3543 std::swap(AndLHS, AndRHS);
3544 }
3545
3546 ConstantSDNode *MaskCnst = dyn_cast<ConstantSDNode>(AndRHS);
3547 if (!MaskCnst) {
3548 // Mask must be constant
3549 return false;
3550 }
3551
3552 uint64_t MaskVal = MaskCnst->getZExtValue();
3553 uint64_t NumZeros;
3554 uint64_t NumBits;
3555 if (isMask_64(MaskVal)) {
3556 NumZeros = 0;
3557 // The number of bits in the result bitfield will be the number of
3558 // trailing ones (the AND) minus the number of bits we shift off
3559 NumBits = llvm::countr_one(MaskVal) - ShiftAmt;
3560 } else if (isShiftedMask_64(MaskVal)) {
3561 NumZeros = llvm::countr_zero(MaskVal);
3562 unsigned NumOnes = llvm::countr_one(MaskVal >> NumZeros);
3563 // The number of bits in the result bitfield will be the number of
3564 // trailing zeros plus the number of set bits in the mask minus the
3565 // number of bits we shift off
3566 NumBits = NumZeros + NumOnes - ShiftAmt;
3567 } else {
3568 // This is not a mask we can handle
3569 return false;
3570 }
3571
3572 if (ShiftAmt < NumZeros) {
3573 // Handling this case would require extra logic that would make this
3574 // transformation non-profitable
3575 return false;
3576 }
3577
3578 Val = AndLHS;
3579 Start = CurDAG->getTargetConstant(ShiftAmt, DL, MVT::i32);
3580 Len = CurDAG->getTargetConstant(NumBits, DL, MVT::i32);
3581 } else if (LHS->getOpcode() == ISD::SHL) {
3582 // Here, we have a pattern like:
3583 //
3584 // (sra (shl val, NN), MM)
3585 // or
3586 // (srl (shl val, NN), MM)
3587 //
3588 // If MM >= NN, we can efficiently optimize this with bfe
3589 Val = LHS->getOperand(0);
3590
3591 SDValue ShlRHS = LHS->getOperand(1);
3592 ConstantSDNode *ShlCnst = dyn_cast<ConstantSDNode>(ShlRHS);
3593 if (!ShlCnst) {
3594 // Shift amount must be constant
3595 return false;
3596 }
3597 uint64_t InnerShiftAmt = ShlCnst->getZExtValue();
3598
3599 SDValue ShrRHS = RHS;
3600 ConstantSDNode *ShrCnst = dyn_cast<ConstantSDNode>(ShrRHS);
3601 if (!ShrCnst) {
3602 // Shift amount must be constant
3603 return false;
3604 }
3605 uint64_t OuterShiftAmt = ShrCnst->getZExtValue();
3606
3607 // To avoid extra codegen and be profitable, we need Outer >= Inner
3608 if (OuterShiftAmt < InnerShiftAmt) {
3609 return false;
3610 }
3611
3612 // If the outer shift is more than the type size, we have no bitfield to
3613 // extract (since we also check that the inner shift is <= the outer shift,
3614 // this also implies that the inner shift is < the type size)
3615 if (OuterShiftAmt >= Val.getValueSizeInBits()) {
3616 return false;
3617 }
3618
3619 Start = CurDAG->getTargetConstant(OuterShiftAmt - InnerShiftAmt, DL,
3620 MVT::i32);
3621 Len = CurDAG->getTargetConstant(Val.getValueSizeInBits() - OuterShiftAmt,
3622 DL, MVT::i32);
3623
3624 if (N->getOpcode() == ISD::SRA) {
3625 // If we have an arithmetic right shift, we need to use the signed bfe
3626 // variant
3627 IsSigned = true;
3628 }
3629 } else {
3630 // No can do...
3631 return false;
3632 }
3633 } else {
3634 // No can do...
3635 return false;
3636 }
3637
3638
3639 unsigned Opc;
3640 // For the BFE operations formed here from "and" and "srl", use the unsigned
3641 // variants; only the "sra (shl ...)" pattern requires the signed variants.
3642 if (Val.getValueType() == MVT::i32) {
3643 if (IsSigned) {
3644 Opc = NVPTX::BFE_S32rii;
3645 } else {
3646 Opc = NVPTX::BFE_U32rii;
3647 }
3648 } else if (Val.getValueType() == MVT::i64) {
3649 if (IsSigned) {
3650 Opc = NVPTX::BFE_S64rii;
3651 } else {
3652 Opc = NVPTX::BFE_U64rii;
3653 }
3654 } else {
3655 // We cannot handle this type
3656 return false;
3657 }
3658
3659 SDValue Ops[] = {
3660 Val, Start, Len
3661 };
3662
3663 ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getVTList(), Ops));
3664 return true;
3665}
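// Worked example for the matcher above: extracting bits [8, 16) of an i32
// arrives as (and (srl %x, 8), 255). The mask contributes Len = 8 trailing
// ones, the shift contributes Start = 8, and GoodBits = 32 - 8 = 24 >= 8, so
// the pair is rewritten to:
//
//   // bfe.u32 %r, %x, 8, 8;   (NVPTX::BFE_U32rii)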
3666
3667// SelectDirectAddr - Match a direct address for DAG.
3668// A direct address could be a globaladdress or externalsymbol.
3669bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
3670 // Return true if TGA or ES.
3671 if (N.getOpcode() == ISD::TargetGlobalAddress ||
3672 N.getOpcode() == ISD::TargetExternalSymbol) {
3673 Address = N;
3674 return true;
3675 }
3676 if (N.getOpcode() == NVPTXISD::Wrapper) {
3677 Address = N.getOperand(0);
3678 return true;
3679 }
3680 // addrspacecast(MoveParam(arg_symbol) to addrspace(PARAM)) -> arg_symbol
3681 if (AddrSpaceCastSDNode *CastN = dyn_cast<AddrSpaceCastSDNode>(N)) {
3682 if (CastN->getSrcAddressSpace() == ADDRESS_SPACE_GENERIC &&
3683 CastN->getDestAddressSpace() == ADDRESS_SPACE_PARAM &&
3684 CastN->getOperand(0).getOpcode() == NVPTXISD::MoveParam)
3685 return SelectDirectAddr(CastN->getOperand(0).getOperand(0), Address);
3686 }
3687 return false;
3688}
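// For illustration: a global address typically reaches this matcher as
// NVPTXISD::Wrapper(TargetGlobalAddress @g), so the wrapper is peeled and @g
// itself becomes the direct address operand of the selected instruction.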
3689
3690// symbol+offset
3691bool NVPTXDAGToDAGISel::SelectADDRsi_imp(
3692 SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3693 if (Addr.getOpcode() == ISD::ADD) {
3694 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3695 SDValue base = Addr.getOperand(0);
3696 if (SelectDirectAddr(base, Base)) {
3697 Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3698 mvt);
3699 return true;
3700 }
3701 }
3702 }
3703 return false;
3704}
3705
3706// symbol+offset
3707bool NVPTXDAGToDAGISel::SelectADDRsi(SDNode *OpNode, SDValue Addr,
3708 SDValue &Base, SDValue &Offset) {
3709 return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i32);
3710}
3711
3712// symbol+offset
3713bool NVPTXDAGToDAGISel::SelectADDRsi64(SDNode *OpNode, SDValue Addr,
3714 SDValue &Base, SDValue &Offset) {
3715 return SelectADDRsi_imp(OpNode, Addr, Base, Offset, MVT::i64);
3716}
3717
3718// register+offset
3719bool NVPTXDAGToDAGISel::SelectADDRri_imp(
3720 SDNode *OpNode, SDValue Addr, SDValue &Base, SDValue &Offset, MVT mvt) {
3721 if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
3722 Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3723 Offset = CurDAG->getTargetConstant(0, SDLoc(OpNode), mvt);
3724 return true;
3725 }
3726 if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
3727 Addr.getOpcode() == ISD::TargetGlobalAddress)
3728 return false; // direct calls.
3729
3730 if (Addr.getOpcode() == ISD::ADD) {
3731 if (SelectDirectAddr(Addr.getOperand(0), Addr)) {
3732 return false;
3733 }
3734 if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
3735 if (FrameIndexSDNode *FIN =
3736 dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
3737 // Constant offset from frame ref.
3738 Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), mvt);
3739 else
3740 Base = Addr.getOperand(0);
3741 Offset = CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(OpNode),
3742 mvt);
3743 return true;
3744 }
3745 }
3746 return false;
3747}
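// Matching sketch for the register+offset form above:
//
//   // Addr = (add %r, 16)    ->  Base = %r, Offset = 16
//   // Addr = FrameIndex #0   ->  Base = TargetFrameIndex #0, Offset = 0
//   // Addr = (add @sym, 16)  ->  rejected here so that the symbol+offset
//   //                            form (SelectADDRsi) can match instead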
3748
3749// register+offset
3750bool NVPTXDAGToDAGISel::SelectADDRri(SDNode *OpNode, SDValue Addr,
3751 SDValue &Base, SDValue &Offset) {
3752 return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i32);
3753}
3754
3755// register+offset
3756bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
3757 SDValue &Base, SDValue &Offset) {
3758 return SelectADDRri_imp(OpNode, Addr, Base, Offset, MVT::i64);
3759}
3760
3761bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
3762 unsigned int spN) const {
3763 const Value *Src = nullptr;
3764 if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
3765 if (spN == 0 && mN->getMemOperand()->getPseudoValue())
3766 return true;
3767 Src = mN->getMemOperand()->getValue();
3768 }
3769 if (!Src)
3770 return false;
3771 if (auto *PT = dyn_cast<PointerType>(Src->getType()))
3772 return (PT->getAddressSpace() == spN);
3773 return false;
3774}
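// Usage sketch: instruction patterns can use this predicate to gate
// address-space-specific forms, e.g. ChkMemSDNodeAddressSpace(N,
// ADDRESS_SPACE_GLOBAL) holds only when the memory operand's underlying
// pointer is a global-space pointer (pseudo source values are accepted for
// the generic space, spN == 0).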
3775
3776/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
3777/// inline asm expressions.
3778 bool NVPTXDAGToDAGISel::SelectInlineAsmMemoryOperand(
3779 const SDValue &Op, InlineAsm::ConstraintCode ConstraintID,
3780 std::vector<SDValue> &OutOps) {
3781 SDValue Op0, Op1;
3782 switch (ConstraintID) {
3783 default:
3784 return true;
3785 case InlineAsm::ConstraintCode::m: // memory
3786 if (SelectDirectAddr(Op, Op0)) {
3787 OutOps.push_back(Op0);
3788 OutOps.push_back(CurDAG->getTargetConstant(0, SDLoc(Op), MVT::i32));
3789 return false;
3790 }
3791 if (SelectADDRri(Op.getNode(), Op, Op0, Op1)) {
3792 OutOps.push_back(Op0);
3793 OutOps.push_back(Op1);
3794 return false;
3795 }
3796 break;
3797 }
3798 return true;
3799}
3800
3801/// GetConvertOpcode - Returns the CVT_ instruction opcode that implements a
3802/// conversion from \p SrcTy to \p DestTy.
3803unsigned NVPTXDAGToDAGISel::GetConvertOpcode(MVT DestTy, MVT SrcTy,
3804 LoadSDNode *LdNode) {
3805 bool IsSigned = LdNode && LdNode->getExtensionType() == ISD::SEXTLOAD;
3806 switch (SrcTy.SimpleTy) {
3807 default:
3808 llvm_unreachable("Unhandled source type");
3809 case MVT::i8:
3810 switch (DestTy.SimpleTy) {
3811 default:
3812 llvm_unreachable("Unhandled dest type");
3813 case MVT::i16:
3814 return IsSigned ? NVPTX::CVT_s16_s8 : NVPTX::CVT_u16_u8;
3815 case MVT::i32:
3816 return IsSigned ? NVPTX::CVT_s32_s8 : NVPTX::CVT_u32_u8;
3817 case MVT::i64:
3818 return IsSigned ? NVPTX::CVT_s64_s8 : NVPTX::CVT_u64_u8;
3819 }
3820 case MVT::i16:
3821 switch (DestTy.SimpleTy) {
3822 default:
3823 llvm_unreachable("Unhandled dest type");
3824 case MVT::i8:
3825 return IsSigned ? NVPTX::CVT_s8_s16 : NVPTX::CVT_u8_u16;
3826 case MVT::i32:
3827 return IsSigned ? NVPTX::CVT_s32_s16 : NVPTX::CVT_u32_u16;
3828 case MVT::i64:
3829 return IsSigned ? NVPTX::CVT_s64_s16 : NVPTX::CVT_u64_u16;
3830 }
3831 case MVT::i32:
3832 switch (DestTy.SimpleTy) {
3833 default:
3834 llvm_unreachable("Unhandled dest type");
3835 case MVT::i8:
3836 return IsSigned ? NVPTX::CVT_s8_s32 : NVPTX::CVT_u8_u32;
3837 case MVT::i16:
3838 return IsSigned ? NVPTX::CVT_s16_s32 : NVPTX::CVT_u16_u32;
3839 case MVT::i64:
3840 return IsSigned ? NVPTX::CVT_s64_s32 : NVPTX::CVT_u64_u32;
3841 }
3842 case MVT::i64:
3843 switch (DestTy.SimpleTy) {
3844 default:
3845 llvm_unreachable("Unhandled dest type");
3846 case MVT::i8:
3847 return IsSigned ? NVPTX::CVT_s8_s64 : NVPTX::CVT_u8_u64;
3848 case MVT::i16:
3849 return IsSigned ? NVPTX::CVT_s16_s64 : NVPTX::CVT_u16_u64;
3850 case MVT::i32:
3851 return IsSigned ? NVPTX::CVT_s32_s64 : NVPTX::CVT_u32_u64;
3852 }
3853 case MVT::f16:
3854 switch (DestTy.SimpleTy) {
3855 default:
3856 llvm_unreachable("Unhandled dest type");
3857 case MVT::f32:
3858 return NVPTX::CVT_f32_f16;
3859 case MVT::f64:
3860 return NVPTX::CVT_f64_f16;
3861 }
3862 }
3863}
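// Usage sketch for the conversion table above: widening the result of a
// sign-extending i16 load to i32 picks the signed form, otherwise the
// unsigned one:
//
//   // LdNode->getExtensionType() == ISD::SEXTLOAD
//   unsigned Cvt = GetConvertOpcode(MVT::i32, MVT::i16, LdNode);
//   // Cvt == NVPTX::CVT_s32_s16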