llvm.org GIT mirror llvm / 72f82e4
Merging r293000: ------------------------------------------------------------------------ r293000 | thomas.stellard | 2017-01-24 17:25:13 -0800 (Tue, 24 Jan 2017) | 15 lines AMDGPU add support for spilling to a user sgpr pointed buffers Summary: This lets you select which sort of spilling you want, either s[0:1] or 64-bit loads from s[0:1]. Patch By: Dave Airlie Reviewers: nhaehnle, arsenm, tstellarAMD Reviewed By: arsenm Subscribers: mareko, llvm-commits, kzhuravl, wdng, yaxunl, tony-tye Differential Revision: https://reviews.llvm.org/D25428 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_40@293240 91177308-0d34-0410-b5e6-96231b3b80d8 Tom Stellard 2 years ago
10 changed file(s) with 124 addition(s) and 36 deletion(s). Raw diff Collapse all Expand all
9898 def int_amdgcn_dispatch_id :
9999 GCCBuiltin<"__builtin_amdgcn_dispatch_id">,
100100 Intrinsic<[llvm_i64_ty], [], [IntrNoMem]>;
101
102 def int_amdgcn_implicit_buffer_ptr :
103 GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
104 Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [], [IntrNoMem]>;
101105
102106 //===----------------------------------------------------------------------===//
103107 // Instruction Intrinsics
139139 void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
140140 const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
141141 SIProgramInfo KernelInfo;
142 if (STM.isAmdCodeObjectV2()) {
142 if (STM.isAmdCodeObjectV2(*MF)) {
143143 getSIProgramInfo(KernelInfo, *MF);
144144 EmitAmdKernelCodeT(*MF, KernelInfo);
145145 }
148148 void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
149149 const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
150150 const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
151 if (MFI->isKernel() && STM.isAmdCodeObjectV2()) {
151 if (MFI->isKernel() && STM.isAmdCodeObjectV2(*MF)) {
152152 AMDGPUTargetStreamer *TS =
153153 static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
154154 SmallString<128> SymbolName;
778778
779779 // FIXME: Should use getKernArgSize
780780 header.kernarg_segment_byte_size =
781 STM.getKernArgSegmentSize(MFI->getABIArgOffset());
781 STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset());
782782 header.wavefront_sgpr_count = KernelInfo.NumSGPR;
783783 header.workitem_vgpr_count = KernelInfo.NumVGPR;
784784 header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
296296 return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
297297 }
298298
299 unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const {
300 unsigned ImplicitBytes = getImplicitArgNumBytes();
299 unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
300 unsigned ExplicitArgBytes) const {
301 unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
301302 if (ImplicitBytes == 0)
302303 return ExplicitArgBytes;
303304
310310 return EnableXNACK;
311311 }
312312
313 bool isAmdCodeObjectV2() const {
314 return isAmdHsaOS() || isMesa3DOS();
313 bool isMesaKernel(const MachineFunction &MF) const {
314 return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction()->getCallingConv());
315 }
316
317 // Covers VS/PS/CS graphics shaders
318 bool isMesaGfxShader(const MachineFunction &MF) const {
319 return isMesa3DOS() && AMDGPU::isShader(MF.getFunction()->getCallingConv());
320 }
321
322 bool isAmdCodeObjectV2(const MachineFunction &MF) const {
323 return isAmdHsaOS() || isMesaKernel(MF);
315324 }
316325
317326 /// \brief Returns the offset in bytes from the start of the input buffer
318327 /// of the first explicit kernel argument.
319 unsigned getExplicitKernelArgOffset() const {
320 return isAmdCodeObjectV2() ? 0 : 36;
328 unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
329 return isAmdCodeObjectV2(MF) ? 0 : 36;
321330 }
322331
323332 unsigned getAlignmentForImplicitArgPtr() const {
324333 return isAmdHsaOS() ? 8 : 4;
325334 }
326335
327 unsigned getImplicitArgNumBytes() const {
328 if (isMesa3DOS())
336 unsigned getImplicitArgNumBytes(const MachineFunction &MF) const {
337 if (isMesaKernel(MF))
329338 return 16;
330339 if (isAmdHsaOS() && isOpenCLEnv())
331340 return 32;
584593 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
585594 }
586595
587 unsigned getKernArgSegmentSize(unsigned ExplictArgBytes) const;
596 unsigned getKernArgSegmentSize(const MachineFunction &MF, unsigned ExplictArgBytes) const;
588597
589598 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
590599 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
15811581
15821582 unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
15831583 unsigned PartOffset = VA.getLocMemOffset();
1584 unsigned Offset = Subtarget->getExplicitKernelArgOffset() + VA.getLocMemOffset();
1584 unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) + VA.getLocMemOffset();
15851585
15861586 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
15871587 SDValue Arg = DAG.getLoad(
236236
237237
238238 unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
239 if (ST.isAmdCodeObjectV2()) {
239 if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) {
240240 PreloadedPrivateBufferReg = TRI->getPreloadedValue(
241241 MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
242242 }
254254 }
255255
256256 if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
257 assert(ST.isAmdCodeObjectV2());
257 assert(ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF));
258258 MRI.addLiveIn(PreloadedPrivateBufferReg);
259259 MBB.addLiveIn(PreloadedPrivateBufferReg);
260260 }
279279
280280 bool CopyBuffer = ResourceRegUsed &&
281281 PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
282 ST.isAmdCodeObjectV2(MF) &&
282283 ScratchRsrcReg != PreloadedPrivateBufferReg;
283284
284285 // This needs to be careful of the copying order to avoid overwriting one of
302303 .addReg(PreloadedPrivateBufferReg, RegState::Kill);
303304 }
304305
305 if (ResourceRegUsed && PreloadedPrivateBufferReg == AMDGPU::NoRegister) {
306 assert(!ST.isAmdCodeObjectV2());
306 if (ResourceRegUsed && (ST.isMesaGfxShader(MF) || (PreloadedPrivateBufferReg == AMDGPU::NoRegister))) {
307 assert(!ST.isAmdCodeObjectV2(MF));
307308 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
308309
309 unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
310 unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
311310 unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
312311 unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
313312
314313 // Use relocations to get the pointer, and setup the other bits manually.
315314 uint64_t Rsrc23 = TII->getScratchRsrcWords23();
316 BuildMI(MBB, I, DL, SMovB32, Rsrc0)
317 .addExternalSymbol("SCRATCH_RSRC_DWORD0")
318 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
319
320 BuildMI(MBB, I, DL, SMovB32, Rsrc1)
321 .addExternalSymbol("SCRATCH_RSRC_DWORD1")
322 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
315
316 if (MFI->hasPrivateMemoryInputPtr()) {
317 unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
318
319 if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
320 const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
321
322 BuildMI(MBB, I, DL, Mov64, Rsrc01)
323 .addReg(PreloadedPrivateBufferReg)
324 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
325 } else {
326 const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
327
328 PointerType *PtrTy =
329 PointerType::get(Type::getInt64Ty(MF.getFunction()->getContext()),
330 AMDGPUAS::CONSTANT_ADDRESS);
331 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
332 auto MMO = MF.getMachineMemOperand(PtrInfo,
333 MachineMemOperand::MOLoad |
334 MachineMemOperand::MOInvariant |
335 MachineMemOperand::MODereferenceable,
336 0, 0);
337 BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
338 .addReg(PreloadedPrivateBufferReg)
339 .addImm(0) // offset
340 .addImm(0) // glc
341 .addMemOperand(MMO)
342 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
343 }
344 } else {
345 unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
346 unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
347
348 BuildMI(MBB, I, DL, SMovB32, Rsrc0)
349 .addExternalSymbol("SCRATCH_RSRC_DWORD0")
350 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
351
352 BuildMI(MBB, I, DL, SMovB32, Rsrc1)
353 .addExternalSymbol("SCRATCH_RSRC_DWORD1")
354 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
355
356 }
323357
324358 BuildMI(MBB, I, DL, SMovB32, Rsrc2)
325359 .addImm(Rsrc23 & 0xffffffff)
841841 if (!AMDGPU::isShader(CallConv)) {
842842 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
843843 } else {
844 assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() &&
844 assert(!Info->hasDispatchPtr() &&
845845 !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
846846 !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
847847 !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
849849 !Info->hasWorkItemIDZ());
850850 }
851851
852 if (Info->hasPrivateMemoryInputPtr()) {
853 unsigned PrivateMemoryPtrReg = Info->addPrivateMemoryPtr(*TRI);
854 MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SReg_64RegClass);
855 CCInfo.AllocateReg(PrivateMemoryPtrReg);
856 }
857
852858 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
853859 if (Info->hasPrivateSegmentBuffer()) {
854860 unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
907913 if (VA.isMemLoc()) {
908914 VT = Ins[i].VT;
909915 EVT MemVT = VA.getLocVT();
910 const unsigned Offset = Subtarget->getExplicitKernelArgOffset() +
916 const unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) +
911917 VA.getLocMemOffset();
912918 // The first 36 bytes of the input buffer contains information about
913919 // thread group and global sizes.
10321038 if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
10331039 HasStackObjects = true;
10341040
1035 if (ST.isAmdCodeObjectV2()) {
1041 if (ST.isAmdCodeObjectV2(MF)) {
10361042 if (HasStackObjects) {
10371043 // If we have stack objects, we unquestionably need the private buffer
10381044 // resource. For the Code Object V2 ABI, this will be the first 4 user
23612367 // TODO: Should this propagate fast-math-flags?
23622368
23632369 switch (IntrinsicID) {
2370 case Intrinsic::amdgcn_implicit_buffer_ptr: {
2371 unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
2372 return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
2373 }
23642374 case Intrinsic::amdgcn_dispatch_ptr:
23652375 case Intrinsic::amdgcn_queue_ptr: {
2366 if (!Subtarget->isAmdCodeObjectV2()) {
2376 if (!Subtarget->isAmdCodeObjectV2(MF)) {
23672377 DiagnosticInfoUnsupported BadIntrin(
23682378 *MF.getFunction(), "unsupported hsa intrinsic without hsa target",
23692379 DL.getDebugLoc());
7676 PrivateSegmentWaveByteOffset(false),
7777 WorkItemIDX(false),
7878 WorkItemIDY(false),
79 WorkItemIDZ(false) {
79 WorkItemIDZ(false),
80 PrivateMemoryInputPtr(false) {
8081 const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
8182 const Function *F = MF.getFunction();
8283
113114 if (HasStackObjects || MaySpill)
114115 PrivateSegmentWaveByteOffset = true;
115116
116 if (ST.isAmdCodeObjectV2()) {
117 if (ST.isAmdCodeObjectV2(MF)) {
117118 if (HasStackObjects || MaySpill)
118119 PrivateSegmentBuffer = true;
119120
125126
126127 if (F->hasFnAttribute("amdgpu-dispatch-id"))
127128 DispatchID = true;
129 } else if (ST.isMesaGfxShader(MF)) {
130 if (HasStackObjects || MaySpill)
131 PrivateMemoryInputPtr = true;
128132 }
129133
130134 // We don't need to worry about accessing spills with flat instructions.
179183 getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
180184 NumUserSGPRs += 2;
181185 return FlatScratchInitUserSGPR;
186 }
187
188 unsigned SIMachineFunctionInfo::addPrivateMemoryPtr(const SIRegisterInfo &TRI) {
189 PrivateMemoryPtrUserSGPR = TRI.getMatchingSuperReg(
190 getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
191 NumUserSGPRs += 2;
192 return PrivateMemoryPtrUserSGPR;
182193 }
183194
184195 SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg (
8383 unsigned ScratchRSrcReg;
8484 unsigned ScratchWaveOffsetReg;
8585
86 // Input registers for non-HSA ABI
87 unsigned PrivateMemoryPtrUserSGPR;
88
8689 // Input registers setup for the HSA ABI.
8790 // User SGPRs in allocation order.
8891 unsigned PrivateSegmentBufferUserSGPR;
162165 bool WorkItemIDY : 1;
163166 bool WorkItemIDZ : 1;
164167
168 // Private memory buffer
169 // Compute directly in sgpr[0:1]
170 // Other shaders indirect 64-bits at sgpr[0:1]
171 bool PrivateMemoryInputPtr : 1;
172
165173 MCPhysReg getNextUserSGPR() const {
166174 assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
167175 return AMDGPU::SGPR0 + NumUserSGPRs;
197205 unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
198206 unsigned addDispatchID(const SIRegisterInfo &TRI);
199207 unsigned addFlatScratchInit(const SIRegisterInfo &TRI);
208 unsigned addPrivateMemoryPtr(const SIRegisterInfo &TRI);
200209
201210 // Add system SGPRs.
202211 unsigned addWorkGroupIDX() {
301310 return WorkItemIDZ;
302311 }
303312
313 bool hasPrivateMemoryInputPtr() const {
314 return PrivateMemoryInputPtr;
315 }
316
304317 unsigned getNumUserSGPRs() const {
305318 return NumUserSGPRs;
306319 }
335348
336349 unsigned getQueuePtrUserSGPR() const {
337350 return QueuePtrUserSGPR;
351 }
352
353 unsigned getPrivateMemoryPtrUserSGPR() const {
354 return PrivateMemoryPtrUserSGPR;
338355 }
339356
340357 bool hasSpilledSGPRs() const {
11071107 case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
11081108 return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
11091109 case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
1110 assert(ST.isAmdCodeObjectV2() &&
1111 "Non-CodeObjectV2 ABI currently uses relocations");
1112 assert(MFI->hasPrivateSegmentBuffer());
1113 return MFI->PrivateSegmentBufferUserSGPR;
1110 if (ST.isAmdCodeObjectV2(MF)) {
1111 assert(MFI->hasPrivateSegmentBuffer());
1112 return MFI->PrivateSegmentBufferUserSGPR;
1113 }
1114 assert(MFI->hasPrivateMemoryInputPtr());
1115 return MFI->PrivateMemoryPtrUserSGPR;
11141116 case SIRegisterInfo::KERNARG_SEGMENT_PTR:
11151117 assert(MFI->hasKernargSegmentPtr());
11161118 return MFI->KernargSegmentPtrUserSGPR;