llvm.org GIT mirror — llvm, commit a0540d3
AMDGPU: Start defining a calling convention. Partially implement the callee side for arguments and return values. byval doesn't work properly, and most likely sret or other on-stack return values don't either. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@303308 91177308-0d34-0410-b5e6-96231b3b80d8 Matt Arsenault, 2 years ago
29 changed file(s) with 1857 addition(s) and 132 deletion(s).
3737 unsigned VReg) const override;
3838 bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
3939 ArrayRef<unsigned> VRegs) const override;
40 CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
40 static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
41 static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
4142 };
4243 } // End of namespace llvm;
4344 #endif
1212
1313 // Inversion of CCIfInReg
1414 class CCIfNotInReg : CCIf<"!ArgFlags.isInReg()", A> {}
15 class CCIfExtend<CCAction A>
16 : CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;
1517
1618 // Calling convention for SI
1719 def CC_SI : CallingConv<[
5153 ]>>>
5254 ]>;
5355
54 def RetCC_SI : CallingConv<[
56 def RetCC_SI_Shader : CallingConv<[
5557 CCIfType<[i32] , CCAssignToReg<[
5658 SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
5759 SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
98100 CCCustom<"allocateKernArg">
99101 ]>;
100102
103 def CSR_AMDGPU_VGPRs_24_255 : CalleeSavedRegs<
104 (sequence "VGPR%u", 24, 255)
105 >;
106
107 def CSR_AMDGPU_VGPRs_32_255 : CalleeSavedRegs<
108 (sequence "VGPR%u", 32, 255)
109 >;
110
111 def CSR_AMDGPU_SGPRs_32_103 : CalleeSavedRegs<
112 (sequence "SGPR%u", 32, 103)
113 >;
114
115 def CSR_AMDGPU_HighRegs : CalleeSavedRegs<
116 (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_103)
117 >;
118
119 // Calling convention for leaf functions
120 def CC_AMDGPU_Func : CallingConv<[
121 CCIfByVal<CCPassByVal<4, 4>>,
122 CCIfType<[i1], CCPromoteToType<i32>>,
123 CCIfType<[i1, i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
124 CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[
125 VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
126 VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
127 VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
128 VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
129 CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>,
130 CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
131 CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
132 CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
133 CCIfType<[v8i32, v8f32], CCAssignToStack<32, 4>>,
134 CCIfType<[v16i32, v16f32], CCAssignToStack<64, 4>>
135 ]>;
136
137 // Calling convention for leaf functions
138 def RetCC_AMDGPU_Func : CallingConv<[
139 CCIfType<[i1], CCPromoteToType<i32>>,
140 CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
141 CCIfType<[i32, f32, i16, f16, v2i16, v2f16], CCAssignToReg<[
142 VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
143 VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
144 VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
145 VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
146 CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>
147 ]>;
148
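The new CC_AMDGPU_Func and RetCC_AMDGPU_Func lists above are evaluated top to bottom by the functions TableGen generates from them: byval arguments fall through to the stack, i1 is promoted to i32, small extended integers are widened, the first 32 VGPRs are handed out, and anything left over goes to the stack with 4-byte alignment. As a rough illustration (not part of this patch; the helper name is made up), a lowering typically drives one of these generated assign functions through CCState:

// Illustrative sketch only: drive a TableGen-generated assign function such as
// CC_AMDGPU_Func over the incoming arguments. CCState tries the rules above in
// order and records one CCValAssign (register or stack slot) per value.
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"

using namespace llvm;

static void analyzeIncomingArgsSketch(MachineFunction &MF, CallingConv::ID CC,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                      CCAssignFn *AssignFn,
                                      SmallVectorImpl<CCValAssign> &ArgLocs) {
  CCState CCInfo(CC, /*IsVarArg=*/false, MF, ArgLocs,
                 MF.getFunction()->getContext());
  // Each argument either lands in one of VGPR0-VGPR31 or falls through to the
  // CCAssignToStack rules, producing a register or memory CCValAssign.
  CCInfo.AnalyzeFormalArguments(Ins, AssignFn);
}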
101149 def CC_AMDGPU : CallingConv<[
102150 CCIf<"static_cast"
103151 "(State.getMachineFunction().getSubtarget()).getGeneration() >="
6969 // Up to SGPR0-SGPR39
7070 return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
7171 &AMDGPU::SGPR_64RegClass, 20);
72 }
73 default:
74 return false;
75 }
76 }
77
78 // Allocate up to VGPR31.
79 //
80 // TODO: Since there are no VGPR alignment requirements would it be better to
81 // split into individual scalar registers?
82 static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
83 CCValAssign::LocInfo LocInfo,
84 ISD::ArgFlagsTy ArgFlags, CCState &State) {
85 switch (LocVT.SimpleTy) {
86 case MVT::i64:
87 case MVT::f64:
88 case MVT::v2i32:
89 case MVT::v2f32: {
90 return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
91 &AMDGPU::VReg_64RegClass, 31);
92 }
93 case MVT::v4i32:
94 case MVT::v4f32:
95 case MVT::v2i64:
96 case MVT::v2f64: {
97 return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
98 &AMDGPU::VReg_128RegClass, 29);
99 }
100 case MVT::v8i32:
101 case MVT::v8f32: {
102 return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
103 &AMDGPU::VReg_256RegClass, 25);
104
105 }
106 case MVT::v16i32:
107 case MVT::v16f32: {
108 return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
109 &AMDGPU::VReg_512RegClass, 17);
110
72111 }
73112 default:
74113 return false;
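allocateVGPRTuple simply picks the VGPR tuple register class that matches the value's width and defers to allocateCCRegs, which is defined earlier in this file and not shown in the hunk. A hedged sketch of what such a helper usually looks like, assuming only the standard CCState hooks (the name and exact shape are illustrative, not the patch's code):

// Hedged sketch of an allocateCCRegs-style helper: try the first NumRegs
// registers of a class and record the assignment if one is still free.
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/Target/TargetRegisterInfo.h"

using namespace llvm;

static bool allocateCCRegsSketch(unsigned ValNo, MVT ValVT, MVT LocVT,
                                 CCValAssign::LocInfo LocInfo,
                                 ISD::ArgFlagsTy ArgFlags, CCState &State,
                                 const TargetRegisterClass *RC,
                                 unsigned NumRegs) {
  // Hand CCState the first NumRegs registers of RC; it returns the first one
  // that is still unallocated, or 0 if they are all taken.
  ArrayRef<MCPhysReg> Regs = makeArrayRef(RC->begin(), NumRegs);
  if (unsigned Reg = State.AllocateReg(Regs)) {
    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
    return true; // Value assigned; no further rules are evaluated.
  }
  return false;
}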
772811 //===---------------------------------------------------------------------===//
773812
774813 CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
775 bool IsVarArg) const {
776 return CC_AMDGPU;
814 bool IsVarArg) {
815 switch (CC) {
816 case CallingConv::AMDGPU_KERNEL:
817 case CallingConv::SPIR_KERNEL:
818 return CC_AMDGPU_Kernel;
819 case CallingConv::AMDGPU_VS:
820 case CallingConv::AMDGPU_GS:
821 case CallingConv::AMDGPU_PS:
822 case CallingConv::AMDGPU_CS:
823 case CallingConv::AMDGPU_HS:
824 return CC_AMDGPU;
825 case CallingConv::C:
826 case CallingConv::Fast:
827 return CC_AMDGPU_Func;
828 default:
829 report_fatal_error("Unsupported calling convention.");
830 }
831 }
832
833 CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
834 bool IsVarArg) {
835 switch (CC) {
836 case CallingConv::AMDGPU_KERNEL:
837 case CallingConv::SPIR_KERNEL:
838 return CC_AMDGPU_Kernel;
839 case CallingConv::AMDGPU_VS:
840 case CallingConv::AMDGPU_GS:
841 case CallingConv::AMDGPU_PS:
842 case CallingConv::AMDGPU_CS:
843 case CallingConv::AMDGPU_HS:
844 return RetCC_SI_Shader;
845 case CallingConv::C:
846 case CallingConv::Fast:
847 return RetCC_AMDGPU_Func;
848 default:
849 report_fatal_error("Unsupported calling convention.");
850 }
777851 }
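CCAssignFnForCall and CCAssignFnForReturn now route each calling convention to the matching TableGen-generated function: kernels keep CC_AMDGPU_Kernel, graphics shaders keep CC_AMDGPU and RetCC_SI_Shader, and plain C/Fast calls use the new CC_AMDGPU_Func and RetCC_AMDGPU_Func. A minimal usage sketch, mirroring the CanLowerReturn override added further down in this patch (the helper name is hypothetical):

// Hedged usage sketch: pick the return-value assign function for a calling
// convention and check whether every output value fits in registers.
#include "llvm/CodeGen/CallingConvLower.h"

using namespace llvm;

static bool returnFitsInRegsSketch(CallingConv::ID CC, MachineFunction &MF,
                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                                   LLVMContext &Ctx) {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CC, /*IsVarArg=*/false, MF, RVLocs, Ctx);
  // CheckReturn runs the RetCC_* rules without committing any state and
  // reports whether every output value found a register.
  return CCInfo.CheckReturn(Outs, AMDGPUCallLowering::CCAssignFnForReturn(
                                      CC, /*IsVarArg=*/false));
}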
778852
779853 /// The SelectionDAGBuilder will automatically promote function arguments
873947 }
874948 }
875949
876 void AMDGPUTargetLowering::AnalyzeReturn(CCState &State,
877 const SmallVectorImpl<ISD::OutputArg> &Outs) const {
878
879 State.AnalyzeReturn(Outs, RetCC_SI);
880 }
881
882 SDValue
883 AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
884 bool isVarArg,
885 const SmallVectorImpl<ISD::OutputArg> &Outs,
886 const SmallVectorImpl<SDValue> &OutVals,
887 const SDLoc &DL, SelectionDAG &DAG) const {
950 SDValue AMDGPUTargetLowering::LowerReturn(
951 SDValue Chain, CallingConv::ID CallConv,
952 bool isVarArg,
953 const SmallVectorImpl<ISD::OutputArg> &Outs,
954 const SmallVectorImpl<SDValue> &OutVals,
955 const SDLoc &DL, SelectionDAG &DAG) const {
956 // FIXME: Fails for r600 tests
957 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
958 // "wave terminate should not have return values");
888959 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
889960 }
890961
895966 /// Selects the correct CCAssignFn for a given CallingConvention value.
896967 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
897968 bool IsVarArg) {
898 switch (CC) {
899 case CallingConv::C:
900 case CallingConv::AMDGPU_KERNEL:
901 case CallingConv::SPIR_KERNEL:
902 return CC_AMDGPU_Kernel;
903 case CallingConv::AMDGPU_VS:
904 case CallingConv::AMDGPU_HS:
905 case CallingConv::AMDGPU_GS:
906 case CallingConv::AMDGPU_PS:
907 case CallingConv::AMDGPU_CS:
908 return CC_AMDGPU;
909 default:
910 report_fatal_error("Unsupported calling convention.");
911 }
969 return AMDGPUCallLowering::CCAssignFnForCall(CC, IsVarArg);
970 }
971
972 CCAssignFn *AMDGPUTargetLowering::CCAssignFnForReturn(CallingConv::ID CC,
973 bool IsVarArg) {
974 return AMDGPUCallLowering::CCAssignFnForReturn(CC, IsVarArg);
912975 }
913976
914977 SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
114114 SmallVectorImpl<SDValue> &Results) const;
115115 void analyzeFormalArgumentsCompute(CCState &State,
116116 const SmallVectorImpl<ISD::InputArg> &Ins) const;
117 void AnalyzeReturn(CCState &State,
118 const SmallVectorImpl<ISD::OutputArg> &Outs) const;
119
120117 public:
121118 AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
122119
163160 bool isCheapToSpeculateCtlz() const override;
164161
165162 static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
163 static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
164
166165 SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
167166 const SmallVectorImpl<ISD::OutputArg> &Outs,
168167 const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
379379 def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone,
380380 [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
381381
382 def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
382 def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
383383 [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
384384 >;
125125 }
126126
127127 void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
128
129 int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(MI->getOpcode());
130
128 unsigned Opcode = MI->getOpcode();
129
130 // FIXME: Should be able to handle this with emitPseudoExpansionLowering. We
131 // need to select it to the subtarget specific version, and there's no way to
132 // do that with a single pseudo source operation.
133 if (Opcode == AMDGPU::S_SETPC_B64_return)
134 Opcode = AMDGPU::S_SETPC_B64;
135
136 int MCOpcode = ST.getInstrInfo()->pseudoToMCOpcode(Opcode);
131137 if (MCOpcode == -1) {
132138 LLVMContext &C = MI->getParent()->getParent()->getFunction()->getContext();
133139 C.emitError("AMDGPUMCInstLower::lower - Pseudo instruction doesn't have "
1111
1212 using namespace llvm;
1313
14 static bool isEntryFunctionCC(CallingConv::ID CC) {
15 switch (CC) {
16 case CallingConv::AMDGPU_KERNEL:
17 case CallingConv::SPIR_KERNEL:
18 case CallingConv::AMDGPU_VS:
19 case CallingConv::AMDGPU_HS:
20 case CallingConv::AMDGPU_GS:
21 case CallingConv::AMDGPU_PS:
22 case CallingConv::AMDGPU_CS:
23 return true;
24 default:
25 return false;
26 }
27 }
28
2914 AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
3015 MachineFunctionInfo(),
3116 LocalMemoryObjects(),
3318 MaxKernArgAlign(0),
3419 LDSSize(0),
3520 ABIArgOffset(0),
36 IsEntryFunction(isEntryFunctionCC(MF.getFunction()->getCallingConv())),
21 IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction()->getCallingConv())),
3722 NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) {
3823 // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
3924 // except reserved size is not correctly aligned.
1313
1414 #include "AMDGPURegisterInfo.h"
1515 #include "AMDGPUTargetMachine.h"
16 #include "SIRegisterInfo.h"
1617
1718 using namespace llvm;
1819
2223 // Function handling callbacks - Functions are a seldom used feature of GPUs, so
2324 // they are not supported at this time.
2425 //===----------------------------------------------------------------------===//
25
26 // Dummy to not crash RegisterClassInfo.
27 static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister;
28
29 const MCPhysReg *AMDGPURegisterInfo::getCalleeSavedRegs(
30 const MachineFunction *) const {
31 return &CalleeSavedReg;
32 }
33
34 unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const {
35 return AMDGPU::NoRegister;
36 }
3726
3827 unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
3928 static const unsigned SubRegs[] = {
4938
5039 #define GET_REGINFO_TARGET_DESC
5140 #include "AMDGPUGenRegisterInfo.inc"
41
42
43 // Forced to be here by one .inc
44 const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
45 const MachineFunction *MF) const {
46 CallingConv::ID CC = MF->getFunction()->getCallingConv();
47 switch (CC) {
48 case CallingConv::C:
49 case CallingConv::Fast:
50 return CSR_AMDGPU_HighRegs_SaveList;
51 default: {
52 // Dummy to not crash RegisterClassInfo.
53 static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
54 return &NoCalleeSavedReg;
55 }
56 }
57 }
58
59 const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
60 CallingConv::ID CC) const {
61 switch (CC) {
62 case CallingConv::C:
63 case CallingConv::Fast:
64 return CSR_AMDGPU_HighRegs_RegMask;
65 default:
66 return nullptr;
67 }
68 }
69
70 unsigned SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
71 return AMDGPU::NoRegister;
72 }
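With these overrides, callable functions (C and Fast calling conventions) get the CSR_AMDGPU_HighRegs save list and the matching call-preserved register mask, while every other calling convention keeps the old "no callee-saved registers" behaviour. For context, a hedged sketch of how such a mask is normally attached when a call node is built; this patch does not yet emit calls, so the helper below is purely illustrative:

// Illustrative sketch: a call-preserved register mask is attached to the call
// node as a register-mask operand so the register allocator knows which
// registers survive the call.
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Target/TargetRegisterInfo.h"

using namespace llvm;

static SDValue addCallPreservedMaskSketch(SelectionDAG &DAG,
                                          const TargetRegisterInfo &TRI,
                                          CallingConv::ID CC,
                                          SmallVectorImpl<SDValue> &Ops) {
  const MachineFunction &MF = DAG.getMachineFunction();
  const uint32_t *Mask = TRI.getCallPreservedMask(MF, CC);
  assert(Mask && "missing call-preserved mask for calling convention");
  // The mask becomes one more operand of the eventual call node.
  SDValue MaskOp = DAG.getRegisterMask(Mask);
  Ops.push_back(MaskOp);
  return MaskOp;
}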
2929 /// \returns the sub reg enum value for the given \p Channel
3030 /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
3131 unsigned getSubRegFromChannel(unsigned Channel) const;
32
33 const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override;
34 unsigned getFrameRegister(const MachineFunction &MF) const override;
3532 };
3633
3734 } // End namespace llvm
5555 return Reserved;
5656 }
5757
58 // Dummy to not crash RegisterClassInfo.
59 static const MCPhysReg CalleeSavedReg = AMDGPU::NoRegister;
60
61 const MCPhysReg *R600RegisterInfo::getCalleeSavedRegs(
62 const MachineFunction *) const {
63 return &CalleeSavedReg;
64 }
65
66 unsigned R600RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
67 return AMDGPU::NoRegister;
68 }
69
5870 unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const {
5971 return this->getEncodingValue(reg) >> HW_CHAN_SHIFT;
6072 }
2626 R600RegisterInfo();
2727
2828 BitVector getReservedRegs(const MachineFunction &MF) const override;
29 const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
30 unsigned getFrameRegister(const MachineFunction &MF) const override;
2931
3032 /// \brief get the HW encoding for a register's channel.
3133 unsigned getHWRegChan(unsigned reg) const;
188188 // ----
189189 // 13 (+1)
190190 unsigned ReservedRegCount = 13;
191 if (SPReg != AMDGPU::NoRegister)
192 ++ReservedRegCount;
193191
194192 if (AllSGPRs.size() < ReservedRegCount)
195193 return std::make_pair(ScratchWaveOffsetReg, SPReg);
207205 MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
208206 MFI->setScratchWaveOffsetReg(Reg);
209207 ScratchWaveOffsetReg = Reg;
210 } else {
211 if (SPReg == AMDGPU::NoRegister)
212 break;
213
214 MRI.replaceRegWith(SPReg, Reg);
215 MFI->setStackPtrOffsetReg(Reg);
216 SPReg = Reg;
217208 break;
218209 }
219210 }
222213 return std::make_pair(ScratchWaveOffsetReg, SPReg);
223214 }
224215
225 void SIFrameLowering::emitPrologue(MachineFunction &MF,
226 MachineBasicBlock &MBB) const {
216 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
217 MachineBasicBlock &MBB) const {
227218 // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
228219 // specified.
229220 const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
423414 }
424415 }
425416
417 void SIFrameLowering::emitPrologue(MachineFunction &MF,
418 MachineBasicBlock &MBB) const {
419 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
420 if (MFI->isEntryFunction())
421 emitEntryFunctionPrologue(MF, MBB);
422 }
423
426424 void SIFrameLowering::emitEpilogue(MachineFunction &MF,
427425 MachineBasicBlock &MBB) const {
428426
2525 AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
2626 ~SIFrameLowering() override = default;
2727
28 void emitEntryFunctionPrologue(MachineFunction &MF,
29 MachineBasicBlock &MBB) const;
2830 void emitPrologue(MachineFunction &MF,
2931 MachineBasicBlock &MBB) const override;
3032 void emitEpilogue(MachineFunction &MF,
913913 return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
914914 }
915915
916 SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
917 const SDLoc &SL, SDValue Chain,
918 const ISD::InputArg &Arg) const {
919 MachineFunction &MF = DAG.getMachineFunction();
920 MachineFrameInfo &MFI = MF.getFrameInfo();
921
922 if (Arg.Flags.isByVal()) {
923 unsigned Size = Arg.Flags.getByValSize();
924 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
925 return DAG.getFrameIndex(FrameIdx, MVT::i32);
926 }
927
928 unsigned ArgOffset = VA.getLocMemOffset();
929 unsigned ArgSize = VA.getValVT().getStoreSize();
930
931 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
932
933 // Create load nodes to retrieve arguments from the stack.
934 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
935 SDValue ArgValue;
936
937 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
938 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
939 MVT MemVT = VA.getValVT();
940
941 switch (VA.getLocInfo()) {
942 default:
943 break;
944 case CCValAssign::BCvt:
945 MemVT = VA.getLocVT();
946 break;
947 case CCValAssign::SExt:
948 ExtType = ISD::SEXTLOAD;
949 break;
950 case CCValAssign::ZExt:
951 ExtType = ISD::ZEXTLOAD;
952 break;
953 case CCValAssign::AExt:
954 ExtType = ISD::EXTLOAD;
955 break;
956 }
957
958 ArgValue = DAG.getExtLoad(
959 ExtType, SL, VA.getLocVT(), Chain, FIN,
960 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
961 MemVT);
962 return ArgValue;
963 }
964
916965 static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
917966 CallingConv::ID CallConv,
918967 ArrayRef<ISD::InputArg> Ins,
10931142 static void reservePrivateMemoryRegs(const TargetMachine &TM,
10941143 MachineFunction &MF,
10951144 const SIRegisterInfo &TRI,
1096 SIMachineFunctionInfo &Info) {
1145 SIMachineFunctionInfo &Info,
1146 bool NeedSP) {
10971147 // Now that we've figured out where the scratch register inputs are, see if
10981148 // we should reserve the arguments and use them directly.
1099 bool HasStackObjects = MF.getFrameInfo().hasStackObjects();
1149 MachineFrameInfo &MFI = MF.getFrameInfo();
1150 bool HasStackObjects = MFI.hasStackObjects();
11001151
11011152 // Record that we know we have non-spill stack objects so we don't need to
11021153 // check all stack objects later.
11531204 = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
11541205 Info.setScratchWaveOffsetReg(ReservedOffsetReg);
11551206 }
1207 }
1208
1209 if (NeedSP) {
1210 unsigned ReservedStackPtrOffsetReg = TRI.reservedStackPtrOffsetReg(MF);
1211 Info.setStackPtrOffsetReg(ReservedStackPtrOffsetReg);
1212
1213 assert(Info.getStackPtrOffsetReg() != Info.getFrameOffsetReg());
1214 assert(!TRI.isSubRegister(Info.getScratchRSrcReg(),
1215 Info.getStackPtrOffsetReg()));
11561216 }
11571217 }
11581218
12221282 !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
12231283 !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
12241284 !Info->hasWorkItemIDZ());
1285 } else if (IsKernel) {
1286 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
12251287 } else {
1226 assert(!IsKernel || (Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()));
1288 Splits.append(Ins.begin(), Ins.end());
12271289 }
12281290
12291291 if (IsEntryFunc) {
12771339
12781340 InVals.push_back(Arg);
12791341 continue;
1280 }
1281
1282 if (VA.isMemLoc())
1283 report_fatal_error("memloc not supported with calling convention");
1342 } else if (!IsEntryFunc && VA.isMemLoc()) {
1343 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
1344 InVals.push_back(Val);
1345 if (!Arg.Flags.isByVal())
1346 Chains.push_back(Val.getValue(1));
1347 continue;
1348 }
12841349
12851350 assert(VA.isRegLoc() && "Parameter must be in a register!");
12861351
12901355 Reg = MF.addLiveIn(Reg, RC);
12911356 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
12921357
1293 if (Arg.VT.isVector()) {
1358 if (IsShader && Arg.VT.isVector()) {
12941359 // Build a vector from the registers
12951360 Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
12961361 unsigned NumElements = ParamType->getVectorNumElements();
13161381 InVals.push_back(Val);
13171382 }
13181383
1384 const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
1385
1386 // TODO: Could maybe omit SP if only tail calls?
1387 bool NeedSP = FrameInfo.hasCalls() || FrameInfo.hasVarSizedObjects();
1388
13191389 // Start adding system SGPRs.
1320 if (IsEntryFunc)
1390 if (IsEntryFunc) {
13211391 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
1322
1323 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
1392 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info, NeedSP);
1393 } else {
1394 CCInfo.AllocateReg(Info->getScratchRSrcReg());
1395 CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
1396 CCInfo.AllocateReg(Info->getFrameOffsetReg());
1397
1398 if (NeedSP) {
1399 unsigned StackPtrReg = findFirstFreeSGPR(CCInfo);
1400 CCInfo.AllocateReg(StackPtrReg);
1401 Info->setStackPtrOffsetReg(StackPtrReg);
1402 }
1403 }
13241404
13251405 return Chains.empty() ? Chain :
13261406 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
1407 }
1408
1409 // TODO: If return values can't fit in registers, we should return as many as
1410 // possible in registers before passing on stack.
1411 bool SITargetLowering::CanLowerReturn(
1412 CallingConv::ID CallConv,
1413 MachineFunction &MF, bool IsVarArg,
1414 const SmallVectorImpl<ISD::OutputArg> &Outs,
1415 LLVMContext &Context) const {
1416 // Replacing returns with sret/stack usage doesn't make sense for shaders.
1417 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
1418 // for shaders. Vector types should be explicitly handled by CC.
1419 if (AMDGPU::isEntryFunctionCC(CallConv))
1420 return true;
1421
1422 SmallVector<CCValAssign, 16> RVLocs;
1423 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
1424 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
13271425 }
13281426
13291427 SDValue
13351433 MachineFunction &MF = DAG.getMachineFunction();
13361434 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13371435
1338 if (!AMDGPU::isShader(CallConv))
1436 if (AMDGPU::isKernel(CallConv)) {
13391437 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
13401438 OutVals, DL, DAG);
1439 }
1440
1441 bool IsShader = AMDGPU::isShader(CallConv);
13411442
13421443 Info->setIfReturnsVoid(Outs.size() == 0);
1444 bool IsWaveEnd = Info->returnsVoid() && IsShader;
13431445
13441446 SmallVector<ISD::OutputArg, 48> Splits;
13451447 SmallVector<SDValue, 48> SplitVals;
13481450 for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
13491451 const ISD::OutputArg &Out = Outs[i];
13501452
1351 if (Out.VT.isVector()) {
1453 if (IsShader && Out.VT.isVector()) {
13521454 MVT VT = Out.VT.getVectorElementType();
13531455 ISD::OutputArg NewOut = Out;
13541456 NewOut.Flags.setSplit();
13791481 *DAG.getContext());
13801482
13811483 // Analyze outgoing return values.
1382 AnalyzeReturn(CCInfo, Splits);
1484 CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
13831485
13841486 SDValue Flag;
13851487 SmallVector<SDValue, 48> RetOps;
13861488 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
1489
1490 // Add return address for callable functions.
1491 if (!Info->isEntryFunction()) {
1492 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
1493 SDValue ReturnAddrReg = CreateLiveInRegister(
1494 DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
1495
1496 // FIXME: Should be able to use a vreg here, but need a way to prevent it
1497 // from being allocated to a CSR.
1498
1499 SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
1500 MVT::i64);
1501
1502 Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
1503 Flag = Chain.getValue(1);
1504
1505 RetOps.push_back(PhysReturnAddrReg);
1506 }
13871507
13881508 // Copy the result values into the output registers.
13891509 for (unsigned i = 0, realRVLocIdx = 0;
13911511 ++i, ++realRVLocIdx) {
13921512 CCValAssign &VA = RVLocs[i];
13931513 assert(VA.isRegLoc() && "Can only return in registers!");
1514 // TODO: Partially return in registers if return values don't fit.
13941515
13951516 SDValue Arg = SplitVals[realRVLocIdx];
13961517
13971518 // Copied from other backends.
13981519 switch (VA.getLocInfo()) {
1399 default: llvm_unreachable("Unknown loc info!");
14001520 case CCValAssign::Full:
14011521 break;
14021522 case CCValAssign::BCvt:
14031523 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
14041524 break;
1525 case CCValAssign::SExt:
1526 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
1527 break;
1528 case CCValAssign::ZExt:
1529 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
1530 break;
1531 case CCValAssign::AExt:
1532 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
1533 break;
1534 default:
1535 llvm_unreachable("Unknown loc info!");
14051536 }
14061537
14071538 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
14081539 Flag = Chain.getValue(1);
14091540 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
14101541 }
1542
1543 // FIXME: Does sret work properly?
14111544
14121545 // Update chain and glue.
14131546 RetOps[0] = Chain;
14141547 if (Flag.getNode())
14151548 RetOps.push_back(Flag);
14161549
1417 unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN_TO_EPILOG;
1550 unsigned Opc = AMDGPUISD::ENDPGM;
1551 if (!IsWaveEnd)
1552 Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
14181553 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
14191554 }
14201555
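The rewritten SITargetLowering::LowerReturn above now chooses between three terminators: ENDPGM when a shader returns nothing (the wave simply ends), RETURN_TO_EPILOG for shaders that still have exports to emit, and RET_FLAG for callable functions, which is later selected to S_SETPC_B64_return. Restated as a standalone sketch for clarity (not code from the patch):

// Hedged restatement of the opcode choice at the end of the new LowerReturn,
// pulled out as a free function purely for illustration.
#include "AMDGPUISelLowering.h" // assumed local include providing AMDGPUISD

static unsigned pickReturnOpcodeSketch(bool IsShader, bool ReturnsVoid) {
  // A shader that returns nothing simply ends the wave.
  if (IsShader && ReturnsVoid)
    return AMDGPUISD::ENDPGM;
  // Shaders with outputs fall through to the export epilogue; callable
  // functions get a real return that is selected to S_SETPC_B64_return.
  return IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
}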
2626 const SDLoc &SL, SDValue Chain,
2727 uint64_t Offset, bool Signed,
2828 const ISD::InputArg *Arg = nullptr) const;
29
30 SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
31 const SDLoc &SL, SDValue Chain,
32 const ISD::InputArg &Arg) const;
2933
3034 SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
3135 SelectionDAG &DAG) const override;
176180 const SDLoc &DL, SelectionDAG &DAG,
177181 SmallVectorImpl<SDValue> &InVals) const override;
178182
179 SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
183 bool CanLowerReturn(CallingConv::ID CallConv,
184 MachineFunction &MF, bool isVarArg,
185 const SmallVectorImpl<ISD::OutputArg> &Outs,
186 LLVMContext &Context) const override;
187
188 SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
180189 const SmallVectorImpl<ISD::OutputArg> &Outs,
181190 const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
182191 SelectionDAG &DAG) const override;
7979 FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
8080 WavesPerEU = ST.getWavesPerEU(*F);
8181
82 // Non-entry functions have no special inputs for now.
83 // TODO: Return early for non-entry CCs.
82 if (!isEntryFunction()) {
83 // Non-entry functions have no special inputs for now, other than registers
84 // required for scratch access.
85 ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
86 ScratchWaveOffsetReg = AMDGPU::SGPR4;
87 FrameOffsetReg = AMDGPU::SGPR5;
88 return;
89 }
8490
8591 CallingConv::ID CC = F->getCallingConv();
86 if (CC == CallingConv::AMDGPU_PS)
87 PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
88
89 if (AMDGPU::isKernel(CC)) {
92 if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
9093 KernargSegmentPtr = true;
9194 WorkGroupIDX = true;
9295 WorkItemIDX = true;
96 } else if (CC == CallingConv::AMDGPU_PS) {
97 PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
9398 }
9499
95100 if (ST.debuggerEmitPrologue()) {
119124
120125 const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
121126 bool MaySpill = ST.isVGPRSpillingEnabled(*F);
122 bool HasStackObjects = FrameInfo.hasStackObjects();
127 bool HasStackObjects = FrameInfo.hasStackObjects() || FrameInfo.hasCalls();
123128
124129 if (HasStackObjects || MaySpill) {
125130 PrivateSegmentWaveByteOffset = true;
387387 void setScratchWaveOffsetReg(unsigned Reg) {
388388 assert(Reg != AMDGPU::NoRegister && "Should never be unset");
389389 ScratchWaveOffsetReg = Reg;
390
391 // FIXME: Only for entry functions.
392 FrameOffsetReg = ScratchWaveOffsetReg;
390 if (isEntryFunction())
391 FrameOffsetReg = ScratchWaveOffsetReg;
393392 }
394393
395394 unsigned getQueuePtrUserSGPR() const {
116116 return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
117117 }
118118
119 unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
120 const MachineFunction &MF) const {
121
122 const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
123 unsigned RegCount = ST.getMaxNumSGPRs(MF);
119 static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
124120 unsigned Reg;
125121
126122 // Try to place it in a hole after PrivateSegmentBufferReg.
133129 // wave offset before it.
134130 Reg = RegCount - 5;
135131 }
132
133 return Reg;
134 }
135
136 unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
137 const MachineFunction &MF) const {
138 const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
139 unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
136140 return AMDGPU::SGPR_32RegClass.getRegister(Reg);
141 }
142
143 unsigned SIRegisterInfo::reservedStackPtrOffsetReg(
144 const MachineFunction &MF) const {
145 return AMDGPU::SGPR32;
137146 }
138147
139148 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
197206 assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
198207 }
199208
209 unsigned StackPtrReg = MFI->getStackPtrOffsetReg();
210 if (StackPtrReg != AMDGPU::NoRegister) {
211 reserveRegisterTuples(Reserved, StackPtrReg);
212 assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
213 }
214
215 unsigned FrameReg = MFI->getFrameOffsetReg();
216 if (FrameReg != AMDGPU::NoRegister) {
217 reserveRegisterTuples(Reserved, FrameReg);
218 assert(!isSubRegister(ScratchRSrcReg, FrameReg));
219 }
220
200221 return Reserved;
201222 }
202223
203224 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
204 return Fn.getFrameInfo().hasStackObjects();
205 }
206
207 bool
208 SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
225 const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
226 if (Info->isEntryFunction()) {
227 const MachineFrameInfo &MFI = Fn.getFrameInfo();
228 return MFI.hasStackObjects() || MFI.hasCalls();
229 }
230
231 // May need scavenger for dealing with callee saved registers.
232 return true;
233 }
234
235 bool SIRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) const {
209236 return MF.getFrameInfo().hasStackObjects();
210237 }
211238
1616
1717 #include "AMDGPURegisterInfo.h"
1818 #include "SIDefines.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
1920 #include "llvm/CodeGen/MachineRegisterInfo.h"
2021
2122 namespace llvm {
5657 unsigned reservedPrivateSegmentWaveByteOffsetReg(
5758 const MachineFunction &MF) const;
5859
60 unsigned reservedStackPtrOffsetReg(const MachineFunction &MF) const;
61
5962 BitVector getReservedRegs(const MachineFunction &MF) const override;
63
64 const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
65 const uint32_t *getCallPreservedMask(const MachineFunction &MF,
66 CallingConv::ID) const override;
67
68 unsigned getFrameRegister(const MachineFunction &MF) const override;
6069
6170 bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
6271
227236
228237 const int *getRegUnitPressureSets(unsigned RegUnit) const override;
229238
239 unsigned getReturnAddressReg(const MachineFunction &MF) const {
240 // Not a callee saved register.
241 return AMDGPU::SGPR30_SGPR31;
242 }
243
230244 private:
231245 void buildSpillLoadStore(MachineBasicBlock::iterator MI,
232246 unsigned LoadStoreOp,
185185 def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">;
186186 def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64">;
187187
188 let isTerminator = 1, isBarrier = 1,
189 isBranch = 1, isIndirectBranch = 1 in {
188 let isTerminator = 1, isBarrier = 1, SchedRW = [WriteBranch] in {
189
190 let isBranch = 1, isIndirectBranch = 1 in {
190191 def S_SETPC_B64 : SOP1_1 <"s_setpc_b64">;
191 }
192 def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64">;
192 } // End isBranch = 1, isIndirectBranch = 1
193
194 let isReturn = 1 in {
195 // Define variant marked as return rather than branch.
196 def S_SETPC_B64_return : SOP1_1<"", [(AMDGPUret_flag i64:$src0)]>;
197 }
198 } // End isTerminator = 1, isBarrier = 1
199
200 let isCall = 1 in {
201 def S_SWAPPC_B64 : SOP1_64 <"s_swappc_b64"
202 >;
203 }
204
193205 def S_RFE_B64 : SOP1_1 <"s_rfe_b64">;
194206
195207 let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC] in {
517517 }
518518
519519 bool isEntryFunctionCC(CallingConv::ID CC) {
520 return true;
520 switch (CC) {
521 case CallingConv::AMDGPU_KERNEL:
522 case CallingConv::SPIR_KERNEL:
523 case CallingConv::AMDGPU_VS:
524 case CallingConv::AMDGPU_GS:
525 case CallingConv::AMDGPU_PS:
526 case CallingConv::AMDGPU_CS:
527 case CallingConv::AMDGPU_HS:
528 return true;
529 default:
530 return false;
531 }
521532 }
522533
523534 bool isSI(const MCSubtargetInfo &STI) {
261261 LLVM_READNONE
262262 inline bool isKernel(CallingConv::ID CC) {
263263 switch (CC) {
264 case CallingConv::C:
265264 case CallingConv::AMDGPU_KERNEL:
266265 case CallingConv::SPIR_KERNEL:
267266 return true;
55 ; Tests for add.
66 ; CHECK: name: addi32
77 ; CHECK: {{%[0-9]+}}(s32) = G_ADD
8 define i32 @addi32(i32 %arg1, i32 %arg2) {
8 define amdgpu_kernel void @addi32(i32 %arg1, i32 %arg2) {
99 %res = add i32 %arg1, %arg2
10 ret i32 %res
10 store i32 %res, i32 addrspace(1)* undef
11 ret void
1112 }
0 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
1
2 ; Test that non-entry function frame indices are expanded properly to
3 ; give an index relative to the scratch wave offset register
4
5 ; Materialize into a mov. Make sure there isn't an unnecessary copy.
6 ; GCN-LABEL: {{^}}func_mov_fi_i32:
7 ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8 ; GCN: s_sub_u32 vcc_hi, s5, s4
9 ; GCN-NEXT: s_lshr_b32 vcc_hi, vcc_hi, 6
10 ; GCN-NEXT: v_add_i32_e64 v0, vcc, vcc_hi, 4
11 ; GCN-NOT: v_mov
12 ; GCN: ds_write_b32 v0, v0
13 define void @func_mov_fi_i32() #0 {
14 %alloca = alloca i32
15 store volatile i32* %alloca, i32* addrspace(3)* undef
16 ret void
17 }
18
19 ; Materialize into an add of a constant offset from the FI.
20 ; FIXME: Should be able to merge adds
21
22 ; GCN-LABEL: {{^}}func_add_constant_to_fi_i32:
23 ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
24 ; GCN: s_sub_u32 s6, s5, s4
25 ; GCN-NEXT: s_lshr_b32 s6, s6, 6
26 ; GCN-NEXT: v_add_i32_e64 v0, s{{\[[0-9]+:[0-9]+\]}}, s6, 4
27 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0
28 ; GCN-NOT: v_mov
29 ; GCN: ds_write_b32 v0, v0
30 define void @func_add_constant_to_fi_i32() #0 {
31 %alloca = alloca [2 x i32], align 4
32 %gep0 = getelementptr inbounds [2 x i32], [2 x i32]* %alloca, i32 0, i32 1
33 store volatile i32* %gep0, i32* addrspace(3)* undef
34 ret void
35 }
36
37 ; A user of the materialized frame index can't be meaningfully folded
38 ; into.
39
40 ; GCN-LABEL: {{^}}func_other_fi_user_i32:
41 ; GCN: s_sub_u32 vcc_hi, s5, s4
42 ; GCN-NEXT: s_lshr_b32 vcc_hi, vcc_hi, 6
43 ; GCN-NEXT: v_add_i32_e64 v0, vcc, vcc_hi, 4
44 ; GCN-NEXT: v_mul_lo_i32 v0, v0, 9
45 ; GCN-NOT: v_mov
46 ; GCN: ds_write_b32 v0, v0
47 define void @func_other_fi_user_i32() #0 {
48 %alloca = alloca [2 x i32], align 4
49 %ptrtoint = ptrtoint [2 x i32]* %alloca to i32
50 %mul = mul i32 %ptrtoint, 9
51 store volatile i32 %mul, i32 addrspace(3)* undef
52 ret void
53 }
54
55 ; GCN-LABEL: {{^}}func_store_private_arg_i32_ptr:
56 ; GCN: v_mov_b32_e32 v1, 15{{$}}
57 ; GCN: buffer_store_dword v1, v0, s[0:3], s4 offen{{$}}
58 define void @func_store_private_arg_i32_ptr(i32* %ptr) #0 {
59 store volatile i32 15, i32* %ptr
60 ret void
61 }
62
63 ; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr:
64 ; GCN: s_waitcnt
65 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], s4 offen{{$}}
66 define void @func_load_private_arg_i32_ptr(i32* %ptr) #0 {
67 %val = load volatile i32, i32* %ptr
68 ret void
69 }
70
71 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr:
72 ; GCN: s_waitcnt
73 ; GCN-NEXT: s_sub_u32 s6, s5, s4
74 ; GCN-NEXT: v_lshr_b32_e64 v0, s6, 6
75 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 4, v0
76 ; GCN-NOT: v_mov
77 ; GCN: ds_write_b32 v0, v0
78 define void @void_func_byval_struct_i8_i32_ptr({ i8, i32 }* byval %arg0) #0 {
79 %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0
80 %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1
81 %load1 = load i32, i32* %gep1
82 store volatile i32* %gep1, i32* addrspace(3)* undef
83 ret void
84 }
85
86 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_value:
87 ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
88 ; GCN-NEXT: buffer_load_ubyte v0, off, s[0:3], s5
90 ; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s5 offset:4
90 define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 }* byval %arg0) #0 {
91 %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0
92 %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1
93 %load0 = load i8, i8* %gep0
94 %load1 = load i32, i32* %gep1
95 store volatile i8 %load0, i8 addrspace(3)* undef
96 store volatile i32 %load1, i32 addrspace(3)* undef
97 ret void
98 }
99
100 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block:
101 ; GCN: s_sub_u32 s8, s5, s4
102 ; GCN: v_lshr_b32_e64 v1, s8, 6
103 ; GCN: s_and_saveexec_b64
104
105 ; GCN: v_add_i32_e32 v0, vcc, 4, v1
106 ; GCN: buffer_load_dword v1, v1, s[0:3], s4 offen offset:4
107 ; GCN: ds_write_b32
108 define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 }* byval %arg0, i32 %arg2) #0 {
109 %cmp = icmp eq i32 %arg2, 0
110 br i1 %cmp, label %bb, label %ret
111
112 bb:
113 %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0
114 %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1
115 %load1 = load volatile i32, i32* %gep1
116 store volatile i32* %gep1, i32* addrspace(3)* undef
117 br label %ret
118
119 ret:
120 ret void
121 }
122
123 attributes #0 = { nounwind }
0 ; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
3
4 ; GCN-LABEL: {{^}}void_func_i1:
5 ; GCN: v_and_b32_e32 v0, 1, v0
6 ; GCN: buffer_store_byte v0, off
7 define void @void_func_i1(i1 %arg0) #0 {
8 store i1 %arg0, i1 addrspace(1)* undef
9 ret void
10 }
11
12 ; GCN-LABEL: {{^}}void_func_i1_zeroext:
13 ; GCN: s_waitcnt
14 ; GCN-NEXT: v_or_b32_e32 v0, 12, v0
15 ; GCN-NOT: v0
16 ; GCN: buffer_store_dword v0, off
17 define void @void_func_i1_zeroext(i1 zeroext %arg0) #0 {
18 %ext = zext i1 %arg0 to i32
19 %add = add i32 %ext, 12
20 store i32 %add, i32 addrspace(1)* undef
21 ret void
22 }
23
24 ; GCN-LABEL: {{^}}void_func_i1_signext:
25 ; GCN: s_waitcnt
26 ; GCN-NEXT: v_add_i32_e32 v0, vcc, 12, v0
27 ; GCN-NOT: v0
28 ; GCN: buffer_store_dword v0, off
29 define void @void_func_i1_signext(i1 signext %arg0) #0 {
30 %ext = sext i1 %arg0 to i32
31 %add = add i32 %ext, 12
32 store i32 %add, i32 addrspace(1)* undef
33 ret void
34 }
35
36 ; GCN-LABEL: {{^}}void_func_i8:
37 ; GCN-NOT: v0
38 ; GCN: buffer_store_byte v0, off
39 define void @void_func_i8(i8 %arg0) #0 {
40 store i8 %arg0, i8 addrspace(1)* undef
41 ret void
42 }
43
44 ; GCN-LABEL: {{^}}void_func_i8_zeroext:
45 ; GCN-NOT: and_b32
46 ; GCN: v_add_i32_e32 v0, vcc, 12, v0
47 define void @void_func_i8_zeroext(i8 zeroext %arg0) #0 {
48 %ext = zext i8 %arg0 to i32
49 %add = add i32 %ext, 12
50 store i32 %add, i32 addrspace(1)* undef
51 ret void
52 }
53
54 ; GCN-LABEL: {{^}}void_func_i8_signext:
55 ; GCN-NOT: v_bfe_i32
56 ; GCN: v_add_i32_e32 v0, vcc, 12, v0
57 define void @void_func_i8_signext(i8 signext %arg0) #0 {
58 %ext = sext i8 %arg0 to i32
59 %add = add i32 %ext, 12
60 store i32 %add, i32 addrspace(1)* undef
61 ret void
62 }
63
64 ; GCN-LABEL: {{^}}void_func_i16:
65 ; GCN: buffer_store_short v0, off
66 define void @void_func_i16(i16 %arg0) #0 {
67 store i16 %arg0, i16 addrspace(1)* undef
68 ret void
69 }
70
71 ; GCN-LABEL: {{^}}void_func_i16_zeroext:
72 ; GCN-NOT: v0
73 ; GCN: v_add_i32_e32 v0, vcc, 12, v0
74 define void @void_func_i16_zeroext(i16 zeroext %arg0) #0 {
75 %ext = zext i16 %arg0 to i32
76 %add = add i32 %ext, 12
77 store i32 %add, i32 addrspace(1)* undef
78 ret void
79 }
80
81 ; GCN-LABEL: {{^}}void_func_i16_signext:
82 ; GCN-NOT: v0
83 ; GCN: v_add_i32_e32 v0, vcc, 12, v0
84 define void @void_func_i16_signext(i16 signext %arg0) #0 {
85 %ext = sext i16 %arg0 to i32
86 %add = add i32 %ext, 12
87 store i32 %add, i32 addrspace(1)* undef
88 ret void
89 }
90
91 ; GCN-LABEL: {{^}}void_func_i32:
92 ; GCN-NOT: v0
93 ; GCN: buffer_store_dword v0, off
94 define void @void_func_i32(i32 %arg0) #0 {
95 store i32 %arg0, i32 addrspace(1)* undef
96 ret void
97 }
98
99 ; GCN-LABEL: {{^}}void_func_i64:
100 ; GCN-NOT: v[0:1]
101 ; GCN-NOT: v0
102 ; GCN-NOT: v1
103 ; GCN: buffer_store_dwordx2 v[0:1], off
104 define void @void_func_i64(i64 %arg0) #0 {
105 store i64 %arg0, i64 addrspace(1)* undef
106 ret void
107 }
108
109 ; GCN-LABEL: {{^}}void_func_f16:
110 ; VI-NOT: v0
111 ; CI: v_cvt_f16_f32_e32 v0, v0
112 ; GCN: buffer_store_short v0, off
113 define void @void_func_f16(half %arg0) #0 {
114 store half %arg0, half addrspace(1)* undef
115 ret void
116 }
117
118 ; GCN-LABEL: {{^}}void_func_f32:
119 ; GCN-NOT: v0
120 ; GCN: buffer_store_dword v0, off
121 define void @void_func_f32(float %arg0) #0 {
122 store float %arg0, float addrspace(1)* undef
123 ret void
124 }
125
126 ; GCN-LABEL: {{^}}void_func_f64:
127 ; GCN-NOT: v[0:1]
128 ; GCN-NOT: v0
129 ; GCN-NOT: v1
130 ; GCN: buffer_store_dwordx2 v[0:1], off
131 define void @void_func_f64(double %arg0) #0 {
132 store double %arg0, double addrspace(1)* undef
133 ret void
134 }
135
136 ; GCN-LABEL: {{^}}void_func_v2i32:
137 ; GCN-NOT: v[0:1]
138 ; GCN-NOT: v0
139 ; GCN-NOT: v1
140 ; GCN: buffer_store_dwordx2 v[0:1], off
141 define void @void_func_v2i32(<2 x i32> %arg0) #0 {
142 store <2 x i32> %arg0, <2 x i32> addrspace(1)* undef
143 ret void
144 }
145
146 ; GCN-LABEL: {{^}}void_func_v3i32:
147 ; GCN-DAG: buffer_store_dword v2, off
148 ; GCN-DAG: buffer_store_dwordx2 v[0:1], off
149 define void @void_func_v3i32(<3 x i32> %arg0) #0 {
150 store <3 x i32> %arg0, <3 x i32> addrspace(1)* undef
151 ret void
152 }
153
154 ; GCN-LABEL: {{^}}void_func_v4i32:
155 ; GCN: buffer_store_dwordx4 v[0:3], off
156 define void @void_func_v4i32(<4 x i32> %arg0) #0 {
157 store <4 x i32> %arg0, <4 x i32> addrspace(1)* undef
158 ret void
159 }
160
161 ; GCN-LABEL: {{^}}void_func_v5i32:
162 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
163 ; GCN-DAG: buffer_store_dword v4, off
164 define void @void_func_v5i32(<5 x i32> %arg0) #0 {
165 store <5 x i32> %arg0, <5 x i32> addrspace(1)* undef
166 ret void
167 }
168
169 ; GCN-LABEL: {{^}}void_func_v8i32:
170 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
171 ; GCN-DAG: buffer_store_dwordx4 v[4:7], off
172 define void @void_func_v8i32(<8 x i32> %arg0) #0 {
173 store <8 x i32> %arg0, <8 x i32> addrspace(1)* undef
174 ret void
175 }
176
177 ; GCN-LABEL: {{^}}void_func_v16i32:
178 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
179 ; GCN-DAG: buffer_store_dwordx4 v[4:7], off
180 ; GCN-DAG: buffer_store_dwordx4 v[8:11], off
181 ; GCN-DAG: buffer_store_dwordx4 v[12:15], off
182 define void @void_func_v16i32(<16 x i32> %arg0) #0 {
183 store <16 x i32> %arg0, <16 x i32> addrspace(1)* undef
184 ret void
185 }
186
187 ; GCN-LABEL: {{^}}void_func_v32i32:
188 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
189 ; GCN-DAG: buffer_store_dwordx4 v[4:7], off
190 ; GCN-DAG: buffer_store_dwordx4 v[8:11], off
191 ; GCN-DAG: buffer_store_dwordx4 v[12:15], off
192 ; GCN-DAG: buffer_store_dwordx4 v[16:19], off
193 ; GCN-DAG: buffer_store_dwordx4 v[20:23], off
194 ; GCN-DAG: buffer_store_dwordx4 v[24:27], off
195 ; GCN-DAG: buffer_store_dwordx4 v[28:31], off
196 define void @void_func_v32i32(<32 x i32> %arg0) #0 {
197 store <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
198 ret void
199 }
200
201 ; 1 over register limit
202 ; GCN-LABEL: {{^}}void_func_v33i32:
203 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
204 ; GCN-DAG: buffer_store_dwordx4 v[4:7], off
205 ; GCN-DAG: buffer_store_dwordx4 v[8:11], off
206 ; GCN-DAG: buffer_store_dwordx4 v[12:15], off
207 ; GCN-DAG: buffer_load_dword [[STACKLOAD:v[0-9]+]], off, s[0:3], s5
208 ; GCN-DAG: buffer_store_dwordx4 v[16:19], off
209 ; GCN-DAG: buffer_store_dwordx4 v[20:23], off
210 ; GCN-DAG: buffer_store_dwordx4 v[24:27], off
211 ; GCN-DAG: buffer_store_dwordx4 v[28:31], off
212 ; GCN: buffer_store_dword [[STACKLOAD]], off
213 define void @void_func_v33i32(<33 x i32> %arg0) #0 {
214 store <33 x i32> %arg0, <33 x i32> addrspace(1)* undef
215 ret void
216 }
217
218 ; GCN-LABEL: {{^}}void_func_v2i64:
219 ; GCN: buffer_store_dwordx4 v[0:3], off
220 define void @void_func_v2i64(<2 x i64> %arg0) #0 {
221 store <2 x i64> %arg0, <2 x i64> addrspace(1)* undef
222 ret void
223 }
224
225 ; GCN-LABEL: {{^}}void_func_v3i64:
226 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
227 ; GCN-DAG: buffer_store_dwordx2 v[4:5], off
228 define void @void_func_v3i64(<3 x i64> %arg0) #0 {
229 store <3 x i64> %arg0, <3 x i64> addrspace(1)* undef
230 ret void
231 }
232
233 ; GCN-LABEL: {{^}}void_func_v4i64:
234 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
235 ; GCN-DAG: buffer_store_dwordx4 v[4:7], off
236 define void @void_func_v4i64(<4 x i64> %arg0) #0 {
237 store <4 x i64> %arg0, <4 x i64> addrspace(1)* undef
238 ret void
239 }
240
241 ; GCN-LABEL: {{^}}void_func_v5i64:
242 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
243 ; GCN-DAG: buffer_store_dwordx4 v[4:7], off
244 ; GCN-DAG: buffer_store_dwordx2 v[8:9], off
245 define void @void_func_v5i64(<5 x i64> %arg0) #0 {
246 store <5 x i64> %arg0, <5 x i64> addrspace(1)* undef
247 ret void
248 }
249
250 ; GCN-LABEL: {{^}}void_func_v8i64:
251 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
252 ; GCN-DAG: buffer_store_dwordx4 v[4:7], off
253 ; GCN-DAG: buffer_store_dwordx4 v[8:11], off
254 ; GCN-DAG: buffer_store_dwordx4 v[12:15], off
255 define void @void_func_v8i64(<8 x i64> %arg0) #0 {
256 store <8 x i64> %arg0, <8 x i64> addrspace(1)* undef
257 ret void
258 }
259
260 ; GCN-LABEL: {{^}}void_func_v16i64:
261 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
262 ; GCN-DAG: buffer_store_dwordx4 v[4:7], off
263 ; GCN-DAG: buffer_store_dwordx4 v[8:11], off
264 ; GCN-DAG: buffer_store_dwordx4 v[12:15], off
265 ; GCN-DAG: buffer_store_dwordx4 v[16:19], off
266 ; GCN-DAG: buffer_store_dwordx4 v[20:23], off
267 ; GCN-DAG: buffer_store_dwordx4 v[24:27], off
268 ; GCN-DAG: buffer_store_dwordx4 v[28:31], off
269 define void @void_func_v16i64(<16 x i64> %arg0) #0 {
270 store <16 x i64> %arg0, <16 x i64> addrspace(1)* undef
271 ret void
272 }
273
274 ; GCN-LABEL: {{^}}void_func_v2i16:
275 ; GFX9-NOT: v0
276 ; GFX9: buffer_store_dword v0, off
277 define void @void_func_v2i16(<2 x i16> %arg0) #0 {
278 store <2 x i16> %arg0, <2 x i16> addrspace(1)* undef
279 ret void
280 }
281
282 ; GCN-LABEL: {{^}}void_func_v3i16:
283 ; GCN-DAG: buffer_store_dword v0, off
284 ; GCN-DAG: buffer_store_short v2, off
285 define void @void_func_v3i16(<3 x i16> %arg0) #0 {
286 store <3 x i16> %arg0, <3 x i16> addrspace(1)* undef
287 ret void
288 }
289
290 ; GCN-LABEL: {{^}}void_func_v4i16:
291 ; GFX9-NOT: v0
292 ; GFX9-NOT: v1
293 ; GFX9: buffer_store_dwordx2 v[0:1], off
294 define void @void_func_v4i16(<4 x i16> %arg0) #0 {
295 store <4 x i16> %arg0, <4 x i16> addrspace(1)* undef
296 ret void
297 }
298
299 ; GCN-LABEL: {{^}}void_func_v5i16:
300 ; GCN-DAG: buffer_store_short v4, off,
301 ; GCN-DAG: buffer_store_dwordx2 v[1:2], off
302 define void @void_func_v5i16(<5 x i16> %arg0) #0 {
303 store <5 x i16> %arg0, <5 x i16> addrspace(1)* undef
304 ret void
305 }
306
307 ; GCN-LABEL: {{^}}void_func_v8i16:
308 ; GFX9-DAG: buffer_store_dwordx4 v[0:3], off
309 define void @void_func_v8i16(<8 x i16> %arg0) #0 {
310 store <8 x i16> %arg0, <8 x i16> addrspace(1)* undef
311 ret void
312 }
313
314 ; GCN-LABEL: {{^}}void_func_v16i16:
315 ; GFX9-DAG: buffer_store_dwordx4 v[0:3], off
316 ; GFX9-DAG: buffer_store_dwordx4 v[4:7], off
317 define void @void_func_v16i16(<16 x i16> %arg0) #0 {
318 store <16 x i16> %arg0, <16 x i16> addrspace(1)* undef
319 ret void
320 }
321
322 ; GCN-LABEL: {{^}}void_func_v2f32:
323 ; GCN-NOT: v[0:1]
324 ; GCN-NOT: v0
325 ; GCN-NOT: v1
326 ; GCN: buffer_store_dwordx2 v[0:1], off
327 define void @void_func_v2f32(<2 x float> %arg0) #0 {
328 store <2 x float> %arg0, <2 x float> addrspace(1)* undef
329 ret void
330 }
331
332 ; GCN-LABEL: {{^}}void_func_v3f32:
333 ; GCN-DAG: buffer_store_dword v2, off
334 ; GCN-DAG: buffer_store_dwordx2 v[0:1], off
335 define void @void_func_v3f32(<3 x float> %arg0) #0 {
336 store <3 x float> %arg0, <3 x float> addrspace(1)* undef
337 ret void
338 }
339
340 ; GCN-LABEL: {{^}}void_func_v4f32:
341 ; GCN: buffer_store_dwordx4 v[0:3], off
342 define void @void_func_v4f32(<4 x float> %arg0) #0 {
343 store <4 x float> %arg0, <4 x float> addrspace(1)* undef
344 ret void
345 }
346
347 ; GCN-LABEL: {{^}}void_func_v8f32:
348 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
349 ; GCN-DAG: buffer_store_dwordx4 v[4:7], off
350 define void @void_func_v8f32(<8 x float> %arg0) #0 {
351 store <8 x float> %arg0, <8 x float> addrspace(1)* undef
352 ret void
353 }
354
355 ; GCN-LABEL: {{^}}void_func_v16f32:
356 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
357 ; GCN-DAG: buffer_store_dwordx4 v[4:7], off
358 ; GCN-DAG: buffer_store_dwordx4 v[8:11], off
359 ; GCN-DAG: buffer_store_dwordx4 v[12:15], off
360 define void @void_func_v16f32(<16 x float> %arg0) #0 {
361 store <16 x float> %arg0, <16 x float> addrspace(1)* undef
362 ret void
363 }
364
365 ; GCN-LABEL: {{^}}void_func_v2f64:
366 ; GCN: buffer_store_dwordx4 v[0:3], off
367 define void @void_func_v2f64(<2 x double> %arg0) #0 {
368 store <2 x double> %arg0, <2 x double> addrspace(1)* undef
369 ret void
370 }
371
372 ; GCN-LABEL: {{^}}void_func_v3f64:
373 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
374 ; GCN-DAG: buffer_store_dwordx2 v[4:5], off
375 define void @void_func_v3f64(<3 x double> %arg0) #0 {
376 store <3 x double> %arg0, <3 x double> addrspace(1)* undef
377 ret void
378 }
379
380 ; GCN-LABEL: {{^}}void_func_v4f64:
381 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
382 ; GCN-DAG: buffer_store_dwordx4 v[4:7], off
383 define void @void_func_v4f64(<4 x double> %arg0) #0 {
384 store <4 x double> %arg0, <4 x double> addrspace(1)* undef
385 ret void
386 }
387
388 ; GCN-LABEL: {{^}}void_func_v8f64:
389 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
390 ; GCN-DAG: buffer_store_dwordx4 v[4:7], off
391 ; GCN-DAG: buffer_store_dwordx4 v[8:11], off
392 ; GCN-DAG: buffer_store_dwordx4 v[12:15], off
393 define void @void_func_v8f64(<8 x double> %arg0) #0 {
394 store <8 x double> %arg0, <8 x double> addrspace(1)* undef
395 ret void
396 }
397
398 ; GCN-LABEL: {{^}}void_func_v16f64:
399 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
400 ; GCN-DAG: buffer_store_dwordx4 v[4:7], off
401 ; GCN-DAG: buffer_store_dwordx4 v[8:11], off
402 ; GCN-DAG: buffer_store_dwordx4 v[12:15], off
403 ; GCN-DAG: buffer_store_dwordx4 v[16:19], off
404 ; GCN-DAG: buffer_store_dwordx4 v[20:23], off
405 ; GCN-DAG: buffer_store_dwordx4 v[24:27], off
406 ; GCN-DAG: buffer_store_dwordx4 v[28:31], off
407 define void @void_func_v16f64(<16 x double> %arg0) #0 {
408 store <16 x double> %arg0, <16 x double> addrspace(1)* undef
409 ret void
410 }
411
412 ; GCN-LABEL: {{^}}void_func_v2f16:
413 ; GFX9-NOT: v0
414 ; GFX9: buffer_store_dword v0, off
415 define void @void_func_v2f16(<2 x half> %arg0) #0 {
416 store <2 x half> %arg0, <2 x half> addrspace(1)* undef
417 ret void
418 }
419
420 ; GCN-LABEL: {{^}}void_func_v3f16:
421 ; GFX9-NOT: v0
422 ; GCN-DAG: buffer_store_dword v0, off
423 ; GCN-DAG: buffer_store_short v2, off
424 define void @void_func_v3f16(<3 x half> %arg0) #0 {
425 store <3 x half> %arg0, <3 x half> addrspace(1)* undef
426 ret void
427 }
428
429 ; GCN-LABEL: {{^}}void_func_v4f16:
430 ; GFX9-NOT: v0
431 ; GFX9-NOT: v1
432 ; GFX9-NOT: v[0:1]
433 ; GFX9: buffer_store_dwordx2 v[0:1], off
434 define void @void_func_v4f16(<4 x half> %arg0) #0 {
435 store <4 x half> %arg0, <4 x half> addrspace(1)* undef
436 ret void
437 }
438
439 ; GCN-LABEL: {{^}}void_func_v8f16:
440 ; GFX9-NOT: v0
441 ; GFX9-NOT: v1
442 ; GFX9: buffer_store_dwordx4 v[0:3], off
443 define void @void_func_v8f16(<8 x half> %arg0) #0 {
444 store <8 x half> %arg0, <8 x half> addrspace(1)* undef
445 ret void
446 }
447
448 ; GCN-LABEL: {{^}}void_func_v16f16:
449 ; GFX9-NOT: v0
450 ; GFX9-NOT: v1
451 ; GFX9-DAG: buffer_store_dwordx4 v[0:3], off
452 ; GFX9-DAG: buffer_store_dwordx4 v[4:7], off
453 define void @void_func_v16f16(<16 x half> %arg0) #0 {
454 store <16 x half> %arg0, <16 x half> addrspace(1)* undef
455 ret void
456 }
457
458 ; Make sure there is no alignment requirement for passed vgprs.
459 ; GCN-LABEL: {{^}}void_func_i32_i64_i32:
460 ; GCN-NOT: v0
461 ; GCN: buffer_store_dword v0, off
462 ; GCN: buffer_store_dwordx2 v[1:2]
463 ; GCN: buffer_store_dword v3
464 define void @void_func_i32_i64_i32(i32 %arg0, i64 %arg1, i32 %arg2) #0 {
465 store volatile i32 %arg0, i32 addrspace(1)* undef
466 store volatile i64 %arg1, i64 addrspace(1)* undef
467 store volatile i32 %arg2, i32 addrspace(1)* undef
468 ret void
469 }
470
471 ; GCN-LABEL: {{^}}void_func_struct_i32:
472 ; GCN-NOT: v0
473 ; GCN: buffer_store_dword v0, off
474 define void @void_func_struct_i32({ i32 } %arg0) #0 {
475 store { i32 } %arg0, { i32 } addrspace(1)* undef
476 ret void
477 }
478
479 ; GCN-LABEL: {{^}}void_func_struct_i8_i32:
480 ; GCN-DAG: buffer_store_byte v0, off
481 ; GCN-DAG: buffer_store_dword v1, off
482 define void @void_func_struct_i8_i32({ i8, i32 } %arg0) #0 {
483 store { i8, i32 } %arg0, { i8, i32 } addrspace(1)* undef
484 ret void
485 }
486
487 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32:
488 ; GCN-DAG: buffer_load_ubyte v[[ELT0:[0-9]+]], off, s[0:3], s5{{$}}
489 ; GCN-DAG: buffer_load_dword v[[ELT1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
490 ; GCN-DAG: buffer_store_dword v[[ELT1]]
491 ; GCN-DAG: buffer_store_byte v[[ELT0]]
492 define void @void_func_byval_struct_i8_i32({ i8, i32 }* byval %arg0) #0 {
493 %arg0.load = load { i8, i32 }, { i8, i32 }* %arg0
494 store { i8, i32 } %arg0.load, { i8, i32 } addrspace(1)* undef
495 ret void
496 }
497
498 ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_x2:
499 ; GCN: buffer_load_ubyte v[[ELT0_0:[0-9]+]], off, s[0:3], s5{{$}}
500 ; GCN: buffer_load_dword v[[ELT1_0:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
501 ; GCN: buffer_load_ubyte v[[ELT0_1:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
502 ; GCN: buffer_load_dword v[[ELT1_1:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
503
504 ; GCN: ds_write_b32 v0, v0
505 ; GCN: s_setpc_b64
506 define void @void_func_byval_struct_i8_i32_x2({ i8, i32 }* byval %arg0, { i8, i32 }* byval %arg1, i32 %arg2) #0 {
507 %arg0.load = load volatile { i8, i32 }, { i8, i32 }* %arg0
508 %arg1.load = load volatile { i8, i32 }, { i8, i32 }* %arg1
509 store volatile { i8, i32 } %arg0.load, { i8, i32 } addrspace(1)* undef
510 store volatile { i8, i32 } %arg1.load, { i8, i32 } addrspace(1)* undef
511 store volatile i32 %arg2, i32 addrspace(3)* undef
512 ret void
513 }
514
515 ; GCN-LABEL: {{^}}void_func_byval_i32_byval_i64:
516 ; GCN-DAG: buffer_load_dword v[[ARG0_LOAD:[0-9]+]], off, s[0:3], s5{{$}}
517 ; GCN-DAG: buffer_load_dword v[[ARG1_LOAD0:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
518 ; GCN-DAG: buffer_load_dword v[[ARG1_LOAD1:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
519 ; GCN-DAG: buffer_store_dword v[[ARG0_LOAD]], off
520 ; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[ARG1_LOAD0]]:[[ARG1_LOAD1]]{{\]}}, off
521 define void @void_func_byval_i32_byval_i64(i32* byval %arg0, i64* byval %arg1) #0 {
522 %arg0.load = load i32, i32* %arg0
523 %arg1.load = load i64, i64* %arg1
524 store i32 %arg0.load, i32 addrspace(1)* undef
525 store i64 %arg1.load, i64 addrspace(1)* undef
526 ret void
527 }
528
529 ; GCN-LABEL: {{^}}void_func_v32i32_i32_i64:
530 ; GCN-DAG: buffer_store_dwordx4 v[0:3], off
531 ; GCN-DAG: buffer_store_dwordx4 v[4:7], off
532 ; GCN-DAG: buffer_store_dwordx4 v[8:11], off
533 ; GCN-DAG: buffer_store_dwordx4 v[12:15], off
534 ; GCN-DAG: buffer_store_dwordx4 v[16:19], off
535 ; GCN-DAG: buffer_store_dwordx4 v[20:23], off
536 ; GCN-DAG: buffer_store_dwordx4 v[24:27], off
537 ; GCN-DAG: buffer_store_dwordx4 v[28:31], off
538 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1:[0-9]+]], off, s[0:3], s5{{$}}
539 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:4
540 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:8
541
542 ; GCN: buffer_store_dword v[[LOAD_ARG1]]
543 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off
544 define void @void_func_v32i32_i32_i64(<32 x i32> %arg0, i32 %arg1, i64 %arg2) #0 {
545 store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
546 store volatile i32 %arg1, i32 addrspace(1)* undef
547 store volatile i64 %arg2, i64 addrspace(1)* undef
548 ret void
549 }
550
551 ; FIXME: Different ext load types on CI vs. VI
552 ; GCN-LABEL: {{^}}void_func_v32i32_i1_i8_i16:
553 ; GCN-DAG: buffer_load_ubyte [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s5{{$}}
554 ; VI-DAG: buffer_load_ushort [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
555 ; VI-DAG: buffer_load_ushort [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s5 offset:8{{$}}
556 ; VI-DAG: buffer_load_ushort [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s5 offset:12{{$}}
557
558 ; CI-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
559 ; CI-DAG: buffer_load_dword [[LOAD_ARG3:v[0-9]+]], off, s[0:3], s5 offset:8{{$}}
560 ; CI-DAG: buffer_load_dword [[LOAD_ARG4:v[0-9]+]], off, s[0:3], s5 offset:12{{$}}
561
562 ; GCN-DAG: v_and_b32_e32 [[TRUNC_ARG1_I1:v[0-9]+]], 1, [[LOAD_ARG1]]
563 ; CI-DAG: v_cvt_f16_f32_e32 [[CVT_ARG4:v[0-9]+]], [[LOAD_ARG4]]
564
565 ; GCN: buffer_store_byte [[TRUNC_ARG1_I1]], off
566 ; GCN: buffer_store_byte [[LOAD_ARG2]], off
567 ; GCN: buffer_store_short [[LOAD_ARG3]], off
568 ; VI: buffer_store_short [[LOAD_ARG4]], off
569
570 ; CI: buffer_store_short [[CVT_ARG4]], off
571 define void @void_func_v32i32_i1_i8_i16(<32 x i32> %arg0, i1 %arg1, i8 %arg2, i16 %arg3, half %arg4) #0 {
572 store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
573 store volatile i1 %arg1, i1 addrspace(1)* undef
574 store volatile i8 %arg2, i8 addrspace(1)* undef
575 store volatile i16 %arg3, i16 addrspace(1)* undef
576 store volatile half %arg4, half addrspace(1)* undef
577 ret void
578 }
579
580 ; GCN-LABEL: {{^}}void_func_v32i32_v2i32_v2f32:
581 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}}
582 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
583 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
584 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
585
586 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_1]]{{\]}}, off
587 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_1]]{{\]}}, off
588 define void @void_func_v32i32_v2i32_v2f32(<32 x i32> %arg0, <2 x i32> %arg1, <2 x float> %arg2) #0 {
589 store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
590 store volatile <2 x i32> %arg1, <2 x i32> addrspace(1)* undef
591 store volatile <2 x float> %arg2, <2 x float> addrspace(1)* undef
592 ret void
593 }
594
595 ; GCN-LABEL: {{^}}void_func_v32i32_v2i16_v2f16:
596 ; GFX9-DAG: buffer_load_dword [[LOAD_ARG1:v[0-9]+]], off, s[0:3], s5{{$}}
597 ; GFX9-DAG: buffer_load_dword [[LOAD_ARG2:v[0-9]+]], off, s[0:3], s5 offset:4{{$}}
598 ; GFX9: buffer_store_dword [[LOAD_ARG1]], off
599 ; GFX9: buffer_store_short [[LOAD_ARG2]], off
600 define void @void_func_v32i32_v2i16_v2f16(<32 x i32> %arg0, <2 x i16> %arg1, <2 x half> %arg2) #0 {
601 store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
602 store volatile <2 x i16> %arg1, <2 x i16> addrspace(1)* undef
603 store volatile <2 x half> %arg2, <2 x half> addrspace(1)* undef
604 ret void
605 }
606
607 ; GCN-LABEL: {{^}}void_func_v32i32_v2i64_v2f64:
608 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}}
609 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
610 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
611 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
612
613 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:16{{$}}
614 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:20{{$}}
615 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:24{{$}}
616 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:28{{$}}
617
618 ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off
619 ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off
620 define void @void_func_v32i32_v2i64_v2f64(<32 x i32> %arg0, <2 x i64> %arg1, <2 x double> %arg2) #0 {
621 store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
622 store volatile <2 x i64> %arg1, <2 x i64> addrspace(1)* undef
623 store volatile <2 x double> %arg2, <2 x double> addrspace(1)* undef
624 ret void
625 }
626
627 ; GCN-LABEL: {{^}}void_func_v32i32_v4i32_v4f32:
628 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}}
629 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
630 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
631 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
632
633 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:16{{$}}
634 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:20{{$}}
635 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:24{{$}}
636 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:28{{$}}
637
638 ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off
639 ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off
640 define void @void_func_v32i32_v4i32_v4f32(<32 x i32> %arg0, <4 x i32> %arg1, <4 x float> %arg2) #0 {
641 store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
642 store volatile <4 x i32> %arg1, <4 x i32> addrspace(1)* undef
643 store volatile <4 x float> %arg2, <4 x float> addrspace(1)* undef
644 ret void
645 }
646
647 ; GCN-LABEL: {{^}}void_func_v32i32_v8i32_v8f32:
648 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}}
649 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
650 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
651 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
652 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s5 offset:16{{$}}
653 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s5 offset:20{{$}}
654 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s5 offset:24{{$}}
655 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s5 offset:28{{$}}
656
657 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:32{{$}}
658 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:36{{$}}
659 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:40{{$}}
660 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:44{{$}}
661 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s5 offset:48{{$}}
662 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s5 offset:52{{$}}
663 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s5 offset:56{{$}}
664 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s5 offset:60{{$}}
665
666 ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_4]]:[[LOAD_ARG1_7]]{{\]}}, off
667 ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG1_0]]:[[LOAD_ARG1_3]]{{\]}}, off
668 ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_4]]:[[LOAD_ARG2_7]]{{\]}}, off
669 ; GCN: buffer_store_dwordx4 v{{\[}}[[LOAD_ARG2_0]]:[[LOAD_ARG2_3]]{{\]}}, off
670 define void @void_func_v32i32_v8i32_v8f32(<32 x i32> %arg0, <8 x i32> %arg1, <8 x float> %arg2) #0 {
671 store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
672 store volatile <8 x i32> %arg1, <8 x i32> addrspace(1)* undef
673 store volatile <8 x float> %arg2, <8 x float> addrspace(1)* undef
674 ret void
675 }
676
677 ; GCN-LABEL: {{^}}void_func_v32i32_v16i32_v16f32:
678 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_0:[0-9]+]], off, s[0:3], s5{{$}}
679 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_1:[0-9]+]], off, s[0:3], s5 offset:4{{$}}
680 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_2:[0-9]+]], off, s[0:3], s5 offset:8{{$}}
681 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_3:[0-9]+]], off, s[0:3], s5 offset:12{{$}}
682 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_4:[0-9]+]], off, s[0:3], s5 offset:16{{$}}
683 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_5:[0-9]+]], off, s[0:3], s5 offset:20{{$}}
684 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_6:[0-9]+]], off, s[0:3], s5 offset:24{{$}}
685 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_7:[0-9]+]], off, s[0:3], s5 offset:28{{$}}
686 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_8:[0-9]+]], off, s[0:3], s5 offset:32{{$}}
687 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_9:[0-9]+]], off, s[0:3], s5 offset:36{{$}}
688 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_10:[0-9]+]], off, s[0:3], s5 offset:40{{$}}
689 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_11:[0-9]+]], off, s[0:3], s5 offset:44{{$}}
690 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_12:[0-9]+]], off, s[0:3], s5 offset:48{{$}}
691 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_13:[0-9]+]], off, s[0:3], s5 offset:52{{$}}
692 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_14:[0-9]+]], off, s[0:3], s5 offset:56{{$}}
693 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG1_15:[0-9]+]], off, s[0:3], s5 offset:60{{$}}
694
695 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_0:[0-9]+]], off, s[0:3], s5 offset:64{{$}}
696 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_1:[0-9]+]], off, s[0:3], s5 offset:68{{$}}
697 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_2:[0-9]+]], off, s[0:3], s5 offset:72{{$}}
698 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_3:[0-9]+]], off, s[0:3], s5 offset:76{{$}}
699 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_4:[0-9]+]], off, s[0:3], s5 offset:80{{$}}
700 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_5:[0-9]+]], off, s[0:3], s5 offset:84{{$}}
701 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_6:[0-9]+]], off, s[0:3], s5 offset:88{{$}}
702 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_7:[0-9]+]], off, s[0:3], s5 offset:92{{$}}
703 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_8:[0-9]+]], off, s[0:3], s5 offset:96{{$}}
704 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_9:[0-9]+]], off, s[0:3], s5 offset:100{{$}}
705 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_10:[0-9]+]], off, s[0:3], s5 offset:104{{$}}
706 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_11:[0-9]+]], off, s[0:3], s5 offset:108{{$}}
707 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_12:[0-9]+]], off, s[0:3], s5 offset:112{{$}}
708 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_13:[0-9]+]], off, s[0:3], s5 offset:116{{$}}
709 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_14:[0-9]+]], off, s[0:3], s5 offset:120{{$}}
710 ; GCN-DAG: buffer_load_dword v[[LOAD_ARG2_15:[0-9]+]], off, s[0:3], s5 offset:124{{$}}
711 define void @void_func_v32i32_v16i32_v16f32(<32 x i32> %arg0, <16 x i32> %arg1, <16 x float> %arg2) #0 {
712 store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
713 store volatile <16 x i32> %arg1, <16 x i32> addrspace(1)* undef
714 store volatile <16 x float> %arg2, <16 x float> addrspace(1)* undef
715 ret void
716 }
717
718 ; Check there is no crash.
719 ; GCN-LABEL: {{^}}void_func_v16i8:
720 define void @void_func_v16i8(<16 x i8> %arg0) #0 {
721 store volatile <16 x i8> %arg0, <16 x i8> addrspace(1)* undef
722 ret void
723 }
724
725 ; Check there is no crash.
726 ; GCN-LABEL: {{^}}void_func_v32i32_v16i8:
727 define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
728 store volatile <32 x i32> %arg0, <32 x i32> addrspace(1)* undef
729 store volatile <16 x i8> %arg1, <16 x i8> addrspace(1)* undef
730 ret void
731 }
732
733 attributes #0 = { nounwind }
0 ; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s
3
4 ; GCN-LABEL: {{^}}i1_func_void:
5 ; GCN: buffer_load_ubyte v0, off
6 ; GCN-NEXT: s_waitcnt
7 ; GCN-NEXT: s_setpc_b64
8 define i1 @i1_func_void() #0 {
9 %val = load i1, i1 addrspace(1)* undef
10 ret i1 %val
11 }
12
13 ; FIXME: Missing and?
14 ; GCN-LABEL: {{^}}i1_zeroext_func_void:
15 ; GCN: buffer_load_ubyte v0, off
16 ; GCN-NEXT: s_waitcnt vmcnt(0)
17 ; GCN-NEXT: s_setpc_b64
18 define zeroext i1 @i1_zeroext_func_void() #0 {
19 %val = load i1, i1 addrspace(1)* undef
20 ret i1 %val
21 }
22
23 ; GCN-LABEL: {{^}}i1_signext_func_void:
24 ; GCN: buffer_load_ubyte v0, off
25 ; GCN-NEXT: s_waitcnt vmcnt(0)
26 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 1{{$}}
27 ; GCN-NEXT: s_setpc_b64
28 define signext i1 @i1_signext_func_void() #0 {
29 %val = load i1, i1 addrspace(1)* undef
30 ret i1 %val
31 }
32
33 ; GCN-LABEL: {{^}}i8_func_void:
34 ; GCN: buffer_load_ubyte v0, off
35 ; GCN-NEXT: s_waitcnt vmcnt(0)
36 ; GCN-NEXT: s_setpc_b64
37 define i8 @i8_func_void() #0 {
38 %val = load i8, i8 addrspace(1)* undef
39 ret i8 %val
40 }
41
42 ; GCN-LABEL: {{^}}i8_zeroext_func_void:
43 ; GCN: buffer_load_ubyte v0, off
44 ; GCN-NEXT: s_waitcnt vmcnt(0)
45 ; GCN-NEXT: s_setpc_b64
46 define zeroext i8 @i8_zeroext_func_void() #0 {
47 %val = load i8, i8 addrspace(1)* undef
48 ret i8 %val
49 }
50
51 ; GCN-LABEL: {{^}}i8_signext_func_void:
52 ; GCN: buffer_load_sbyte v0, off
53 ; GCN-NEXT: s_waitcnt vmcnt(0)
54 ; GCN-NEXT: s_setpc_b64
55 define signext i8 @i8_signext_func_void() #0 {
56 %val = load i8, i8 addrspace(1)* undef
57 ret i8 %val
58 }
59
60 ; GCN-LABEL: {{^}}i16_func_void:
61 ; GCN: buffer_load_ushort v0, off
62 ; GCN-NEXT: s_waitcnt vmcnt(0)
63 ; GCN-NEXT: s_setpc_b64
64 define i16 @i16_func_void() #0 {
65 %val = load i16, i16 addrspace(1)* undef
66 ret i16 %val
67 }
68
69 ; GCN-LABEL: {{^}}i16_zeroext_func_void:
70 ; GCN: buffer_load_ushort v0, off
71 ; GCN-NEXT: s_waitcnt vmcnt(0)
72 ; GCN-NEXT: s_setpc_b64
73 define zeroext i16 @i16_zeroext_func_void() #0 {
74 %val = load i16, i16 addrspace(1)* undef
75 ret i16 %val
76 }
77
78 ; GCN-LABEL: {{^}}i16_signext_func_void:
79 ; GCN: buffer_load_sshort v0, off
80 ; GCN-NEXT: s_waitcnt vmcnt(0)
81 ; GCN-NEXT: s_setpc_b64
82 define signext i16 @i16_signext_func_void() #0 {
83 %val = load i16, i16 addrspace(1)* undef
84 ret i16 %val
85 }
86
87 ; GCN-LABEL: {{^}}i32_func_void:
88 ; GCN: buffer_load_dword v0, off
89 ; GCN-NEXT: s_waitcnt vmcnt(0)
90 ; GCN-NEXT: s_setpc_b64
91 define i32 @i32_func_void() #0 {
92 %val = load i32, i32 addrspace(1)* undef
93 ret i32 %val
94 }
95
96 ; GCN-LABEL: {{^}}i64_func_void:
97 ; GCN: buffer_load_dwordx2 v[0:1], off
98 ; GCN-NEXT: s_waitcnt vmcnt(0)
99 ; GCN-NEXT: s_setpc_b64
100 define i64 @i64_func_void() #0 {
101 %val = load i64, i64 addrspace(1)* undef
102 ret i64 %val
103 }
104
105 ; GCN-LABEL: {{^}}f32_func_void:
106 ; GCN: buffer_load_dword v0, off, s[8:11], 0
107 ; GCN-NEXT: s_waitcnt vmcnt(0)
108 ; GCN-NEXT: s_setpc_b64
109 define float @f32_func_void() #0 {
110 %val = load float, float addrspace(1)* undef
111 ret float %val
112 }
113
114 ; GCN-LABEL: {{^}}f64_func_void:
115 ; GCN: buffer_load_dwordx2 v[0:1], off
116 ; GCN-NEXT: s_waitcnt vmcnt(0)
117 ; GCN-NEXT: s_setpc_b64
118 define double @f64_func_void() #0 {
119 %val = load double, double addrspace(1)* undef
120 ret double %val
121 }
122
123 ; GCN-LABEL: {{^}}v2i32_func_void:
124 ; GCN: buffer_load_dwordx2 v[0:1], off
125 ; GCN-NEXT: s_waitcnt vmcnt(0)
126 ; GCN-NEXT: s_setpc_b64
127 define <2 x i32> @v2i32_func_void() #0 {
128 %val = load <2 x i32>, <2 x i32> addrspace(1)* undef
129 ret <2 x i32> %val
130 }
131
132 ; GCN-LABEL: {{^}}v3i32_func_void:
133 ; GCN: buffer_load_dwordx4 v[0:3], off
134 ; GCN-NEXT: s_waitcnt vmcnt(0)
135 ; GCN-NEXT: s_setpc_b64
136 define <3 x i32> @v3i32_func_void() #0 {
137 %val = load <3 x i32>, <3 x i32> addrspace(1)* undef
138 ret <3 x i32> %val
139 }
140
141 ; GCN-LABEL: {{^}}v4i32_func_void:
142 ; GCN: buffer_load_dwordx4 v[0:3], off
143 ; GCN-NEXT: s_waitcnt vmcnt(0)
144 ; GCN-NEXT: s_setpc_b64
145 define <4 x i32> @v4i32_func_void() #0 {
146 %val = load <4 x i32>, <4 x i32> addrspace(1)* undef
147 ret <4 x i32> %val
148 }
149
150 ; GCN-LABEL: {{^}}v5i32_func_void:
151 ; GCN-DAG: buffer_load_dword v4, off
152 ; GCN-DAG: buffer_load_dwordx4 v[0:3], off
153 ; GCN: s_waitcnt vmcnt(0)
154 ; GCN-NEXT: s_setpc_b64
155 define <5 x i32> @v5i32_func_void() #0 {
156 %val = load volatile <5 x i32>, <5 x i32> addrspace(1)* undef
157 ret <5 x i32> %val
158 }
159
160 ; GCN-LABEL: {{^}}v8i32_func_void:
161 ; GCN-DAG: buffer_load_dwordx4 v[0:3], off
162 ; GCN-DAG: buffer_load_dwordx4 v[4:7], off
163 ; GCN: s_waitcnt vmcnt(0)
164 ; GCN-NEXT: s_setpc_b64
165 define <8 x i32> @v8i32_func_void() #0 {
166 %ptr = load volatile <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(2)* undef
167 %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
168 ret <8 x i32> %val
169 }
170
171 ; GCN-LABEL: {{^}}v16i32_func_void:
172 ; GCN-DAG: buffer_load_dwordx4 v[0:3], off
173 ; GCN-DAG: buffer_load_dwordx4 v[4:7], off
174 ; GCN-DAG: buffer_load_dwordx4 v[8:11], off
175 ; GCN-DAG: buffer_load_dwordx4 v[12:15], off
176 ; GCN: s_waitcnt vmcnt(0)
177 ; GCN-NEXT: s_setpc_b64
178 define <16 x i32> @v16i32_func_void() #0 {
179 %ptr = load volatile <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(2)* undef
180 %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr
181 ret <16 x i32> %val
182 }
183
184 ; GCN-LABEL: {{^}}v32i32_func_void:
185 ; GCN-DAG: buffer_load_dwordx4 v[0:3], off
186 ; GCN-DAG: buffer_load_dwordx4 v[4:7], off
187 ; GCN-DAG: buffer_load_dwordx4 v[8:11], off
188 ; GCN-DAG: buffer_load_dwordx4 v[12:15], off
189 ; GCN-DAG: buffer_load_dwordx4 v[16:19], off
190 ; GCN-DAG: buffer_load_dwordx4 v[20:23], off
191 ; GCN-DAG: buffer_load_dwordx4 v[24:27], off
192 ; GCN-DAG: buffer_load_dwordx4 v[28:31], off
193 ; GCN: s_waitcnt vmcnt(0)
194 ; GCN-NEXT: s_setpc_b64
195 define <32 x i32> @v32i32_func_void() #0 {
196 %ptr = load volatile <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef
197 %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr
198 ret <32 x i32> %val
199 }
200
201 ; GCN-LABEL: {{^}}v2i64_func_void:
202 ; GCN: buffer_load_dwordx4 v[0:3], off
203 ; GCN-NEXT: s_waitcnt vmcnt(0)
204 ; GCN-NEXT: s_setpc_b64
205 define <2 x i64> @v2i64_func_void() #0 {
206 %val = load <2 x i64>, <2 x i64> addrspace(1)* undef
207 ret <2 x i64> %val
208 }
209
210 ; GCN-LABEL: {{^}}v3i64_func_void:
211 ; GCN-DAG: buffer_load_dwordx4 v[0:3], off
212 ; GCN-DAG: buffer_load_dwordx4 v[4:7], off
213 ; GCN: s_waitcnt vmcnt(0)
214 ; GCN-NEXT: s_setpc_b64
215 define <3 x i64> @v3i64_func_void() #0 {
216 %ptr = load volatile <3 x i64> addrspace(1)*, <3 x i64> addrspace(1)* addrspace(2)* undef
217 %val = load <3 x i64>, <3 x i64> addrspace(1)* %ptr
218 ret <3 x i64> %val
219 }
220
221 ; GCN-LABEL: {{^}}v4i64_func_void:
222 ; GCN: buffer_load_dwordx4 v[0:3], off
223 ; GCN: buffer_load_dwordx4 v[4:7], off
224 ; GCN-NEXT: s_waitcnt vmcnt(0)
225 ; GCN-NEXT: s_setpc_b64
226 define <4 x i64> @v4i64_func_void() #0 {
227 %ptr = load volatile <4 x i64> addrspace(1)*, <4 x i64> addrspace(1)* addrspace(2)* undef
228 %val = load <4 x i64>, <4 x i64> addrspace(1)* %ptr
229 ret <4 x i64> %val
230 }
231
232 ; GCN-LABEL: {{^}}v5i64_func_void:
233 ; GCN-DAG: buffer_load_dwordx4 v[0:3], off
234 ; GCN-DAG: buffer_load_dwordx4 v[4:7], off
235 ; GCN-DAG: buffer_load_dwordx4 v[8:11], off
236 ; GCN: s_waitcnt vmcnt(0)
237 ; GCN-NEXT: s_setpc_b64
238 define <5 x i64> @v5i64_func_void() #0 {
239 %ptr = load volatile <5 x i64> addrspace(1)*, <5 x i64> addrspace(1)* addrspace(2)* undef
240 %val = load <5 x i64>, <5 x i64> addrspace(1)* %ptr
241 ret <5 x i64> %val
242 }
243
244 ; GCN-LABEL: {{^}}v8i64_func_void:
245 ; GCN-DAG: buffer_load_dwordx4 v[0:3], off
246 ; GCN-DAG: buffer_load_dwordx4 v[4:7], off
247 ; GCN-DAG: buffer_load_dwordx4 v[8:11], off
248 ; GCN-DAG: buffer_load_dwordx4 v[12:15], off
249 ; GCN: s_waitcnt vmcnt(0)
250 ; GCN-NEXT: s_setpc_b64
251 define <8 x i64> @v8i64_func_void() #0 {
252 %ptr = load volatile <8 x i64> addrspace(1)*, <8 x i64> addrspace(1)* addrspace(2)* undef
253 %val = load <8 x i64>, <8 x i64> addrspace(1)* %ptr
254 ret <8 x i64> %val
255 }
256
257 ; GCN-LABEL: {{^}}v16i64_func_void:
258 ; GCN-DAG: buffer_load_dwordx4 v[0:3], off
259 ; GCN-DAG: buffer_load_dwordx4 v[4:7], off
260 ; GCN-DAG: buffer_load_dwordx4 v[8:11], off
261 ; GCN-DAG: buffer_load_dwordx4 v[12:15], off
262 ; GCN-DAG: buffer_load_dwordx4 v[16:19], off
263 ; GCN-DAG: buffer_load_dwordx4 v[20:23], off
264 ; GCN-DAG: buffer_load_dwordx4 v[24:27], off
265 ; GCN-DAG: buffer_load_dwordx4 v[28:31], off
266 ; GCN: s_waitcnt vmcnt(0)
267 ; GCN-NEXT: s_setpc_b64
268 define <16 x i64> @v16i64_func_void() #0 {
269 %ptr = load volatile <16 x i64> addrspace(1)*, <16 x i64> addrspace(1)* addrspace(2)* undef
270 %val = load <16 x i64>, <16 x i64> addrspace(1)* %ptr
271 ret <16 x i64> %val
272 }
273
274 ; GCN-LABEL: {{^}}v2i16_func_void:
275 ; GFX9: buffer_load_dword v0, off
276 ; GFX9-NEXT: s_waitcnt vmcnt(0)
277 ; GFX9-NEXT: s_setpc_b64
278 define <2 x i16> @v2i16_func_void() #0 {
279 %val = load <2 x i16>, <2 x i16> addrspace(1)* undef
280 ret <2 x i16> %val
281 }
282
283 ; GCN-LABEL: {{^}}v3i16_func_void:
284 ; GFX9: buffer_load_dwordx2 v[0:1], off
285 ; GFX9-NEXT: s_waitcnt vmcnt(0)
286 ; GFX9-NEXT: s_setpc_b64
287 define <3 x i16> @v3i16_func_void() #0 {
288 %val = load <3 x i16>, <3 x i16> addrspace(1)* undef
289 ret <3 x i16> %val
290 }
291
292 ; GCN-LABEL: {{^}}v4i16_func_void:
293 ; GFX9: buffer_load_dwordx2 v[0:1], off
294 ; GFX9-NEXT: s_waitcnt vmcnt(0)
295 ; GFX9-NEXT: s_setpc_b64
296 define <4 x i16> @v4i16_func_void() #0 {
297 %val = load <4 x i16>, <4 x i16> addrspace(1)* undef
298 ret <4 x i16> %val
299 }
300
301 ; FIXME: Should not scalarize
302 ; GCN-LABEL: {{^}}v5i16_func_void:
303 ; GFX9: buffer_load_dwordx2 v[0:1]
304 ; GFX9: buffer_load_ushort v4
305 ; GFX9: v_lshrrev_b32_e32 v3, 16, v1
306 ; GFX9: v_mov_b32_e32 v2, v1
307 ; GFX9: v_lshrrev_b32_e32 v3, 16, v0
308 ; GCN: s_setpc_b64
309 define <5 x i16> @v5i16_func_void() #0 {
310 %ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(2)* undef
311 %val = load <5 x i16>, <5 x i16> addrspace(1)* %ptr
312 ret <5 x i16> %val
313 }
314
315 ; GCN-LABEL: {{^}}v8i16_func_void:
316 ; GFX9-DAG: buffer_load_dwordx4 v[0:3], off
317 ; GFX9: s_waitcnt vmcnt(0)
318 ; GFX9-NEXT: s_setpc_b64
319 define <8 x i16> @v8i16_func_void() #0 {
320 %ptr = load volatile <8 x i16> addrspace(1)*, <8 x i16> addrspace(1)* addrspace(2)* undef
321 %val = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
322 ret <8 x i16> %val
323 }
324
325 ; GCN-LABEL: {{^}}v16i16_func_void:
326 ; GFX9: buffer_load_dwordx4 v[0:3], off
327 ; GFX9: buffer_load_dwordx4 v[4:7], off
328 ; GFX9: s_waitcnt vmcnt(0)
329 ; GFX9-NEXT: s_setpc_b64
330 define <16 x i16> @v16i16_func_void() #0 {
331 %ptr = load volatile <16 x i16> addrspace(1)*, <16 x i16> addrspace(1)* addrspace(2)* undef
332 %val = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
333 ret <16 x i16> %val
334 }
335
336 ; FIXME: Should pack
337 ; GCN-LABEL: {{^}}v16i8_func_void:
338 ; GCN-DAG: v12
339 ; GCN-DAG: v13
340 ; GCN-DAG: v14
341 ; GCN-DAG: v15
342 define <16 x i8> @v16i8_func_void() #0 {
343 %ptr = load volatile <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(2)* undef
344 %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
345 ret <16 x i8> %val
346 }
347
348 ; FIXME: Should pack
349 ; GCN-LABEL: {{^}}v4i8_func_void:
350 ; GCN: buffer_load_dword v0
351 ; GCN-DAG: v_lshrrev_b32_e32 v2, 16, v0
352 ; GCN-DAG: v_lshrrev_b32_e32 v3, 24, v0
353 ; CI-DAG: v_bfe_u32 v1, v0, 8, 8
354 ; VI-DAG: v_lshrrev_b16_e32 v1, 8, v0
355 ; GCN: s_setpc_b64
356 define <4 x i8> @v4i8_func_void() #0 {
357 %ptr = load volatile <4 x i8> addrspace(1)*, <4 x i8> addrspace(1)* addrspace(2)* undef
358 %val = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
359 ret <4 x i8> %val
360 }
361
362 ; GCN-LABEL: {{^}}struct_i8_i32_func_void:
363 ; GCN-DAG: buffer_load_dword v1
364 ; GCN-DAG: buffer_load_ubyte v0
365 ; GCN: s_waitcnt vmcnt(0)
366 ; GCN-NEXT: s_setpc_b64
367 define { i8, i32 } @struct_i8_i32_func_void() #0 {
368 %val = load { i8, i32 }, { i8, i32 } addrspace(1)* undef
369 ret { i8, i32 } %val
370 }
371
372 ; GCN-LABEL: {{^}}void_func_sret_struct_i8_i32:
373 ; GCN: buffer_load_ubyte [[VAL0:v[0-9]+]]
374 ; GCN: buffer_load_dword [[VAL1:v[0-9]+]]
375 ; GCN: buffer_store_byte [[VAL0]], v0, s[0:3], s4 offen{{$}}
376 ; GCN: buffer_store_dword [[VAL1]], v0, s[0:3], s4 offen offset:4{{$}}
377 define void @void_func_sret_struct_i8_i32({ i8, i32 }* sret %arg0) #0 {
378 %val0 = load volatile i8, i8 addrspace(1)* undef
379 %val1 = load volatile i32, i32 addrspace(1)* undef
380 %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 0
381 %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %arg0, i32 0, i32 1
382 store i8 %val0, i8* %gep0
383 store i32 %val1, i32* %gep1
384 ret void
385 }
386
387 ; GCN-LABEL: {{^}}v33i32_func_void:
388 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}}
389 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4{{$}}
390 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:8{{$}}
391 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:12{{$}}
392 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:16{{$}}
393 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:20{{$}}
394 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:24{{$}}
395 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:28{{$}}
396 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:32{{$}}
397 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:36{{$}}
398 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:40{{$}}
399 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:44{{$}}
400 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:48{{$}}
401 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:52{{$}}
402 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:56{{$}}
403 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:60{{$}}
404 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:64{{$}}
405 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:68{{$}}
406 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:72{{$}}
407 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:76{{$}}
408 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:80{{$}}
409 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:84{{$}}
410 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:88{{$}}
411 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:92{{$}}
412 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:96{{$}}
413 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:100{{$}}
414 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:104{{$}}
415 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:108{{$}}
416 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:112{{$}}
417 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:116{{$}}
418 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:120{{$}}
419 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:124{{$}}
420 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}}
421 ; GCN: s_waitcnt vmcnt(0)
422 ; GCN-NEXT: s_setpc_b64
423 define <33 x i32> @v33i32_func_void() #0 {
424 %ptr = load volatile <33 x i32> addrspace(1)*, <33 x i32> addrspace(1)* addrspace(2)* undef
425 %val = load <33 x i32>, <33 x i32> addrspace(1)* %ptr
426 ret <33 x i32> %val
427 }
428
429 ; GCN-LABEL: {{^}}struct_v32i32_i32_func_void:
430 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}}
431 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4{{$}}
432 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:8{{$}}
433 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:12{{$}}
434 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:16{{$}}
435 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:20{{$}}
436 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:24{{$}}
437 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:28{{$}}
438 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:32{{$}}
439 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:36{{$}}
440 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:40{{$}}
441 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:44{{$}}
442 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:48{{$}}
443 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:52{{$}}
444 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:56{{$}}
445 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:60{{$}}
446 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:64{{$}}
447 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:68{{$}}
448 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:72{{$}}
449 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:76{{$}}
450 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:80{{$}}
451 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:84{{$}}
452 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:88{{$}}
453 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:92{{$}}
454 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:96{{$}}
455 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:100{{$}}
456 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:104{{$}}
457 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:108{{$}}
458 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:112{{$}}
459 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:116{{$}}
460 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:120{{$}}
461 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:124{{$}}
462 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}}
463 ; GCN: s_waitcnt vmcnt(0)
464 ; GCN-NEXT: s_setpc_b64
465 define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
466 %ptr = load volatile { <32 x i32>, i32 } addrspace(1)*, { <32 x i32>, i32 } addrspace(1)* addrspace(2)* undef
467 %val = load { <32 x i32>, i32 }, { <32 x i32>, i32 } addrspace(1)* %ptr
468 ret { <32 x i32>, i32 } %val
469 }
470
471 ; GCN-LABEL: {{^}}struct_i32_v32i32_func_void:
472 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}}
473 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}}
474 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:132{{$}}
475 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:136{{$}}
476 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:140{{$}}
477 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:144{{$}}
478 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:148{{$}}
479 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:152{{$}}
480 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:156{{$}}
481 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:160{{$}}
482 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:164{{$}}
483 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:168{{$}}
484 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:172{{$}}
485 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:176{{$}}
486 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:180{{$}}
487 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:184{{$}}
488 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:188{{$}}
489 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:192{{$}}
490 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:196{{$}}
491 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:200{{$}}
492 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:204{{$}}
493 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:208{{$}}
494 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:212{{$}}
495 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:216{{$}}
496 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:220{{$}}
497 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:224{{$}}
498 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:228{{$}}
499 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:232{{$}}
500 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:236{{$}}
501 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:240{{$}}
502 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:244{{$}}
503 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:248{{$}}
504 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:252{{$}}
505 ; GCN: s_waitcnt vmcnt(0)
506 ; GCN-NEXT: s_setpc_b64
507 define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
508 %ptr = load volatile { i32, <32 x i32> } addrspace(1)*, { i32, <32 x i32> } addrspace(1)* addrspace(2)* undef
509 %val = load { i32, <32 x i32> }, { i32, <32 x i32> } addrspace(1)* %ptr
510 ret { i32, <32 x i32> } %val
511 }
512
513 attributes #0 = { nounwind }
2626
2727 ; ELF: Symbol {
2828 ; ELF: Name: simple
29 ; ELF: Size: 44
29 ; ELF: Size: 48
3030 ; ELF: Type: Function (0x2)
3131 ; ELF: }
3232
4040 ; HSA: .p2align 2
4141 ; HSA: {{^}}simple:
4242 ; HSA-NOT: amd_kernel_code_t
43
44 ; FIXME: Check this isn't a kernarg load when calling convention implemented.
45 ; XHSA-NOT: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
43 ; HSA-NOT: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
4644
4745 ; Make sure we are setting the ATC bit:
48 ; HSA-CI: s_mov_b32 s[[HI:[0-9]]], 0x100f000
46 ; HSA-CI: s_mov_b32 s[[HI:[0-9]+]], 0x100f000
4947 ; On VI+ we also need to set MTYPE = 2
50 ; HSA-VI: s_mov_b32 s[[HI:[0-9]]], 0x1100f000
48 ; HSA-VI: s_mov_b32 s[[HI:[0-9]+]], 0x1100f000
5149 ; Make sure we generate flat store for HSA
5250 ; HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
5351
5553 ; HSA: .size simple, .Lfunc_end0-simple
5654 ; HSA: ; Function info:
5755 ; HSA-NOT: COMPUTE_PGM_RSRC2
58 define void @simple(i32 addrspace(1)* %out) {
56 define void @simple(i32 addrspace(1)* addrspace(2)* %ptr.out) {
5957 entry:
58 %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out
6059 store i32 0, i32 addrspace(1)* %out
6160 ret void
6261 }
190190 ; CHECK: v_mov_b32_e32 v0, s0
191191 ; CHECK: v_mov_b32_e32 v1, s1
192192 ; CHECK: use v[0:1]
193 define void @i64_imm_input_phys_vgpr() {
193 define amdgpu_kernel void @i64_imm_input_phys_vgpr() {
194194 entry:
195195 call void asm sideeffect "; use $0 ", "{VGPR0_VGPR1}"(i64 123456)
196196 ret void
0 # RUN: llc -o - %s -mtriple=amdgcn--amdhsa -verify-machineinstrs -run-pass=greedy,virtregrewriter | FileCheck %s
1 --- |
2
3 define amdgpu_kernel void @func0() {
4 ret void
5 }
6
7 ...
8
19 ---
210 # We should not detect any interference between v0/v1 here and only allocate
311 # sgpr0-sgpr3.