llvm.org GIT mirror llvm / b4989f0
Merging r260658: ------------------------------------------------------------------------ r260658 | Matthew.Arsenault | 2016-02-11 22:31:30 -0800 (Thu, 11 Feb 2016) | 12 lines AMDGPU: Set flat_scratch from flat_scratch_init reg This was hardcoded to the static private size, but this would be missing the offset and additional size for someday when we have dynamic sizing. Also stops always initializing flat_scratch even when unused. In the future we should stop emitting this unless flat instructions are used to access private memory. For example this will initialize it almost always on VI because flat is used for global access. ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@271684 91177308-0d34-0410-b5e6-96231b3b80d8 Tom Stellard 3 years ago
14 changed file(s) with 152 addition(s) and 165 deletion(s). Raw diff Collapse all Expand all
2020
2121 static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
2222 const MachineFrameInfo *FrameInfo) {
23 if (!FuncInfo->hasSpilledSGPRs())
24 return false;
25
26 if (FuncInfo->hasSpilledVGPRs())
27 return false;
28
29 for (int I = FrameInfo->getObjectIndexBegin(),
30 E = FrameInfo->getObjectIndexEnd(); I != E; ++I) {
31 if (!FrameInfo->isSpillSlotObjectIndex(I))
32 return false;
33 }
34
35 return true;
23 return FuncInfo->hasSpilledSGPRs() &&
24 (!FuncInfo->hasSpilledVGPRs() && !FuncInfo->hasNonSpillStackObjects());
3625 }
3726
3827 static ArrayRef getAllSGPR128() {
6655 static_cast(MF.getSubtarget().getInstrInfo());
6756 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
6857 const AMDGPUSubtarget &ST = MF.getSubtarget();
58 MachineRegisterInfo &MRI = MF.getRegInfo();
59 MachineBasicBlock::iterator I = MBB.begin();
6960
7061 // We need to insert initialization of the scratch resource descriptor.
7162 unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
8172 if (ST.isAmdHsaOS()) {
8273 PreloadedPrivateBufferReg = TRI->getPreloadedValue(
8374 MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
75 }
76
77 if (MFI->hasFlatScratchInit()) {
78 // We don't need this if we only have spills since there is no user facing
79 // scratch.
80
81 // TODO: If we know we don't have flat instructions earlier, we can omit
82 // this from the input registers.
83 //
84 // TODO: We only need to know if we access scratch space through a flat
85 // pointer. Because we only detect if flat instructions are used at all,
86 // this will be used more often than necessary on VI.
87
88 DebugLoc DL;
89
90 unsigned FlatScratchInitReg
91 = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);
92
93 MRI.addLiveIn(FlatScratchInitReg);
94 MBB.addLiveIn(FlatScratchInitReg);
95
96 // Copy the size in bytes.
97 unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
98 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::FLAT_SCR_LO)
99 .addReg(FlatScrInitHi, RegState::Kill);
100
101 unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
102
103 // Add wave offset in bytes to private base offset.
104 // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
105 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
106 .addReg(FlatScrInitLo)
107 .addReg(ScratchWaveOffsetReg);
108
109 // Convert offset to 256-byte units.
110 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
111 .addReg(FlatScrInitLo, RegState::Kill)
112 .addImm(8);
84113 }
85114
86115 // If we reserved the original input registers, we don't need to copy to the
95124
96125 // We added live-ins during argument lowering, but since they were not used
97126 // they were deleted. We're adding the uses now, so add them back.
98 MachineRegisterInfo &MRI = MF.getRegInfo();
99127 MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
100128 MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
101129
159187 assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));
160188
161189 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
162 MachineBasicBlock::iterator I = MBB.begin();
163190 DebugLoc DL;
164191
165192 if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
684684 CCInfo.AllocateReg(InputPtrReg);
685685 }
686686
687 if (Info->hasFlatScratchInit()) {
688 unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
689 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass);
690 CCInfo.AllocateReg(FlatScratchInitReg);
691 }
692
687693 AnalyzeFormalArguments(CCInfo, Splits);
688694
689695 SmallVector Chains;
811817
812818 // Now that we've figured out where the scratch register inputs are, see if
813819 // we should reserve the arguments and use them directly.
814
815820 bool HasStackObjects = MF.getFrameInfo()->hasStackObjects();
821 // Record that we know we have non-spill stack objects so we don't need to
822 // check all stack objects later.
823 if (HasStackObjects)
824 Info->setHasNonSpillStackObjects(true);
816825
817826 if (ST.isAmdHsaOS()) {
818827 // TODO: Assume we will spill without optimizations.
571571 AMDGPU::EXEC).addReg(AMDGPU::EXEC);
572572 }
573573
574 // FIXME: This seems inappropriate to do here.
575574 if (NeedFlat && MFI->IsKernel) {
576 // Insert the prologue initializing the SGPRs pointing to the scratch space
577 // for flat accesses.
578 const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
579
580575 // TODO: What to use with function calls?
581
582 // FIXME: This is reporting stack size that is used in a scratch buffer
583 // rather than registers as well.
584 uint64_t StackSizeBytes = FrameInfo->getStackSize();
585
586 int IndirectBegin
587 = static_cast(TII)->getIndirectIndexBegin(MF);
588 // Convert register index to 256-byte unit.
589 uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);
590
591 assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff &&
592 "Stack limits should be smaller than 16-bits");
593
594 // Initialize the flat scratch register pair.
595 // TODO: Can we use one s_mov_b64 here?
596
597 // Offset is in units of 256-bytes.
598 MachineBasicBlock &MBB = MF.front();
599 DebugLoc NoDL;
600 MachineBasicBlock::iterator Start = MBB.getFirstNonPHI();
601 const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32);
602
603 assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes));
604
605 BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO)
606 .addImm(StackOffset);
607
608 // Documentation says size is "per-thread scratch size in bytes"
609 BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI)
610 .addImm(StackSizeBytes);
576 // We will need to initialize the flat scratch register pair.
577 if (NeedFlat)
578 MFI->setHasFlatInstructions(true);
611579 }
612580
613581 return true;
5353 NumSystemSGPRs(0),
5454 HasSpilledSGPRs(false),
5555 HasSpilledVGPRs(false),
56 HasNonSpillStackObjects(false),
57 HasFlatInstructions(false),
5658 PrivateSegmentBuffer(false),
5759 DispatchPtr(false),
5860 QueuePtr(false),
9294 if (F->hasFnAttribute("amdgpu-work-item-id-z"))
9395 WorkItemIDZ = true;
9496
95 bool MaySpill = ST.isVGPRSpillingEnabled(this);
96 bool HasStackObjects = FrameInfo->hasStackObjects();
97
98 if (HasStackObjects || MaySpill)
99 PrivateSegmentWaveByteOffset = true;
100
101 if (ST.isAmdHsaOS()) {
102 if (HasStackObjects || MaySpill)
103 PrivateSegmentBuffer = true;
104
105 if (F->hasFnAttribute("amdgpu-dispatch-ptr"))
106 DispatchPtr = true;
107 }
108
10997 // X, XY, and XYZ are the only supported combinations, so make sure Y is
11098 // enabled if Z is.
11199 if (WorkItemIDZ)
112100 WorkItemIDY = true;
101
102 bool MaySpill = ST.isVGPRSpillingEnabled(this);
103 bool HasStackObjects = FrameInfo->hasStackObjects();
104
105 if (HasStackObjects || MaySpill)
106 PrivateSegmentWaveByteOffset = true;
107
108 if (ST.isAmdHsaOS()) {
109 if (HasStackObjects || MaySpill)
110 PrivateSegmentBuffer = true;
111
112 if (F->hasFnAttribute("amdgpu-dispatch-ptr"))
113 DispatchPtr = true;
114 }
115
116 // We don't need to worry about accessing spills with flat instructions.
117 // TODO: On VI where we must use flat for global, we should be able to omit
118 // this if it is never used for generic access.
119 if (HasStackObjects && ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS &&
120 ST.isAmdHsaOS())
121 FlatScratchInit = true;
113122 }
114123
115124 unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
139148 getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
140149 NumUserSGPRs += 2;
141150 return KernargSegmentPtrUserSGPR;
151 }
152
153 unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
154 FlatScratchInitUserSGPR = TRI.getMatchingSuperReg(
155 getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
156 NumUserSGPRs += 2;
157 return FlatScratchInitUserSGPR;
142158 }
143159
144160 SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
7272 private:
7373 bool HasSpilledSGPRs;
7474 bool HasSpilledVGPRs;
75 bool HasNonSpillStackObjects;
76 bool HasFlatInstructions;
7577
7678 // Feature bits required for inputs passed in user SGPRs.
7779 bool PrivateSegmentBuffer : 1;
128130 unsigned addDispatchPtr(const SIRegisterInfo &TRI);
129131 unsigned addQueuePtr(const SIRegisterInfo &TRI);
130132 unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
133 unsigned addFlatScratchInit(const SIRegisterInfo &TRI);
131134
132135 // Add system SGPRs.
133136 unsigned addWorkGroupIDX() {
276279 HasSpilledVGPRs = Spill;
277280 }
278281
282 bool hasNonSpillStackObjects() const {
283 return HasNonSpillStackObjects;
284 }
285
286 void setHasNonSpillStackObjects(bool StackObject = true) {
287 HasNonSpillStackObjects = StackObject;
288 }
289
290 bool hasFlatInstructions() const {
291 return HasFlatInstructions;
292 }
293
294 void setHasFlatInstructions(bool UseFlat = true) {
295 HasFlatInstructions = UseFlat;
296 }
297
279298 unsigned getPSInputAddr() const {
280299 return PSInputAddr;
281300 }
648648 case SIRegisterInfo::KERNARG_SEGMENT_PTR:
649649 assert(MFI->hasKernargSegmentPtr());
650650 return MFI->KernargSegmentPtrUserSGPR;
651 case SIRegisterInfo::DISPATCH_ID:
652 llvm_unreachable("unimplemented");
653 case SIRegisterInfo::FLAT_SCRATCH_INIT:
654 assert(MFI->hasFlatScratchInit());
655 return MFI->FlatScratchInitUserSGPR;
651656 case SIRegisterInfo::DISPATCH_PTR:
652657 assert(MFI->hasDispatchPtr());
653658 return MFI->DispatchPtrUserSGPR;
120120
121121 enum PreloadedValue {
122122 // SGPRS:
123 PRIVATE_SEGMENT_BUFFER = 0,
123 PRIVATE_SEGMENT_BUFFER = 0,
124124 DISPATCH_PTR = 1,
125125 QUEUE_PTR = 2,
126126 KERNARG_SEGMENT_PTR = 3,
127 DISPATCH_ID = 4,
128 FLAT_SCRATCH_INIT = 5,
127129 WORKGROUP_ID_X = 10,
128130 WORKGROUP_ID_Y = 11,
129131 WORKGROUP_ID_Z = 12,
189189 }
190190
191191 ; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32:
192 ; VI-DAG: s_movk_i32 flat_scratch_lo, 0x0
193 ; VI-DAG: s_movk_i32 flat_scratch_hi, 0x0
194192 ; GCN: s_and_saveexec_b64
195193 ; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
196194 ; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
None ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck -check-prefix=HSA-DEFAULT %s
1 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global | FileCheck -check-prefix=HSA-NODEFAULT %s
2 ; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri | FileCheck -check-prefix=NOHSA-DEFAULT %s
3 ; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri -mattr=+flat-for-global | FileCheck -check-prefix=NOHSA-NODEFAULT %s
0 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-DEFAULT -check-prefix=ALL %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-NODEFAULT -check-prefix=ALL %s
2 ; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=NOHSA-DEFAULT -check-prefix=ALL %s
3 ; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=NOHSA-NODEFAULT -check-prefix=ALL %s
44
5
6 ; There are no stack objects even though flat is used by default, so
7 ; flat_scratch_init should be disabled.
8
9 ; ALL-LABEL: {{^}}test:
10 ; HSA: .amd_kernel_code_t
11 ; HSA: enable_sgpr_flat_scratch_init = 0
12 ; HSA: .end_amd_kernel_code_t
13
14 ; ALL-NOT: flat_scr
515
616 ; HSA-DEFAULT: flat_store_dword
717 ; HSA-NODEFAULT: buffer_store_dword
18
819 ; NOHSA-DEFAULT: buffer_store_dword
920 ; NOHSA-NODEFAULT: flat_store_dword
1021 define void @test(i32 addrspace(1)* %out) {
2323
2424 ; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset:
2525 ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
26 ; VI: s_movk_i32 flat_scratch_lo, 0x0
27 ; VI: s_movk_i32 flat_scratch_hi, 0x0
2826 ; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
2927
3028 define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
3735
3836 ; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64_offset:
3937 ; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
40 ; VI: s_movk_i32 flat_scratch_lo, 0x0
41 ; VI: s_movk_i32 flat_scratch_hi, 0x0
4238 ; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
4339 ; GCN: buffer_store_dword [[RET]]
4440 define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
7066
7167 ; FUNC-LABEL: {{^}}atomic_add_i32_addr64:
7268 ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
73 ; VI: s_movk_i32 flat_scratch_lo, 0x0
74 ; VI: s_movk_i32 flat_scratch_hi, 0x0
7569 ; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
7670 define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
7771 entry:
8276
8377 ; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64:
8478 ; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
85 ; VI: s_movk_i32 flat_scratch_lo, 0x0
86 ; VI: s_movk_i32 flat_scratch_hi, 0x0
8779 ; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
8880 ; GCN: buffer_store_dword [[RET]]
8981 define void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
116108
117109 ; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset:
118110 ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
119 ; VI: s_movk_i32 flat_scratch_lo, 0x0
120 ; VI: s_movk_i32 flat_scratch_hi, 0x0
121111 ; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
122112 define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
123113 entry:
129119
130120 ; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64_offset:
131121 ; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
132 ; VI: s_movk_i32 flat_scratch_lo, 0x0
133 ; VI: s_movk_i32 flat_scratch_hi, 0x0
134122 ; VI: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
135123 ; GCN: buffer_store_dword [[RET]]
136124 define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
162150
163151 ; FUNC-LABEL: {{^}}atomic_and_i32_addr64:
164152 ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
165 ; VI: s_movk_i32 flat_scratch_lo, 0x0
166 ; VI: s_movk_i32 flat_scratch_hi, 0x0
167153 ; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
168154 define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
169155 entry:
174160
175161 ; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64:
176162 ; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
177 ; VI: s_movk_i32 flat_scratch_lo, 0x0
178 ; VI: s_movk_i32 flat_scratch_hi, 0x0
179163 ; VI: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
180164 ; GCN: buffer_store_dword [[RET]]
181165 define void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
208192
209193 ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset:
210194 ; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
211 ; VI: s_movk_i32 flat_scratch_lo, 0x0
212 ; VI: s_movk_i32 flat_scratch_hi, 0x0
213195 ; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
214196 define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
215197 entry:
221203
222204 ; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset:
223205 ; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
224 ; VI: s_movk_i32 flat_scratch_lo, 0x0
225 ; VI: s_movk_i32 flat_scratch_hi, 0x0
226206 ; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
227207 ; GCN: buffer_store_dword [[RET]]
228208 define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
254234
255235 ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64:
256236 ; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
257 ; VI: s_movk_i32 flat_scratch_lo, 0x0
258 ; VI: s_movk_i32 flat_scratch_hi, 0x0
259237 ; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
260238 define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
261239 entry:
266244
267245 ; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64:
268246 ; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
269 ; VI: s_movk_i32 flat_scratch_lo, 0x0
270 ; VI: s_movk_i32 flat_scratch_hi, 0x0
271247 ; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
272248 ; GCN: buffer_store_dword [[RET]]
273249 define void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
300276
301277 ; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset:
302278 ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
303 ; VI: s_movk_i32 flat_scratch_lo, 0x0
304 ; VI: s_movk_i32 flat_scratch_hi, 0x0
305279 ; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
306280 define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
307281 entry:
313287
314288 ; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64_offset:
315289 ; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
316 ; VI: s_movk_i32 flat_scratch_lo, 0x0
317 ; VI: s_movk_i32 flat_scratch_hi, 0x0
318290 ; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
319291 ; GCN: buffer_store_dword [[RET]]
320292 define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
346318
347319 ; FUNC-LABEL: {{^}}atomic_max_i32_addr64:
348320 ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
349 ; VI: s_movk_i32 flat_scratch_lo, 0x0
350 ; VI: s_movk_i32 flat_scratch_hi, 0x0
351321 ; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
352322 define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
353323 entry:
358328
359329 ; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64:
360330 ; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
361 ; VI: s_movk_i32 flat_scratch_lo, 0x0
362 ; VI: s_movk_i32 flat_scratch_hi, 0x0
363331 ; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
364332 ; GCN: buffer_store_dword [[RET]]
365333 define void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
392360
393361 ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset:
394362 ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
395 ; VI: s_movk_i32 flat_scratch_lo, 0x0
396 ; VI: s_movk_i32 flat_scratch_hi, 0x0
397363 ; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
398364 define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
399365 entry:
405371
406372 ; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset:
407373 ; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
408 ; VI: s_movk_i32 flat_scratch_lo, 0x0
409 ; VI: s_movk_i32 flat_scratch_hi, 0x0
410374 ; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
411375 ; GCN: buffer_store_dword [[RET]]
412376 define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
438402
439403 ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64:
440404 ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
441 ; VI: s_movk_i32 flat_scratch_lo, 0x0
442 ; VI: s_movk_i32 flat_scratch_hi, 0x0
443405 ; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
444406 define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
445407 entry:
450412
451413 ; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64:
452414 ; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
453 ; VI: s_movk_i32 flat_scratch_lo, 0x0
454 ; VI: s_movk_i32 flat_scratch_hi, 0x0
455415 ; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
456416 ; GCN: buffer_store_dword [[RET]]
457417 define void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
484444
485445 ; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset:
486446 ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
487 ; VI: s_movk_i32 flat_scratch_lo, 0x0
488 ; VI: s_movk_i32 flat_scratch_hi, 0x0
489447 ; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
490448 define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
491449 entry:
497455
498456 ; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64_offset:
499457 ; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
500 ; VI: s_movk_i32 flat_scratch_lo, 0x0
501 ; VI: s_movk_i32 flat_scratch_hi, 0x0
502458 ; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
503459 ; GCN: buffer_store_dword [[RET]]
504460 define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
530486
531487 ; FUNC-LABEL: {{^}}atomic_min_i32_addr64:
532488 ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
533 ; VI: s_movk_i32 flat_scratch_lo, 0x0
534 ; VI: s_movk_i32 flat_scratch_hi, 0x0
535489 ; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
536490 define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
537491 entry:
542496
543497 ; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64:
544498 ; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
545 ; VI: s_movk_i32 flat_scratch_lo, 0x0
546 ; VI: s_movk_i32 flat_scratch_hi, 0x0
547499 ; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
548500 ; GCN: buffer_store_dword [[RET]]
549501 define void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
576528
577529 ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset:
578530 ; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
579 ; VI: s_movk_i32 flat_scratch_lo, 0x0
580 ; VI: s_movk_i32 flat_scratch_hi, 0x0
581531 ; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
582532 define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
583533 entry:
589539
590540 ; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset:
591541 ; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
592 ; VI: s_movk_i32 flat_scratch_lo, 0x0
593 ; VI: s_movk_i32 flat_scratch_hi, 0x0
594542 ; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
595543 ; GCN: buffer_store_dword [[RET]]
596544 define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
622570
623571 ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64:
624572 ; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
625 ; VI: s_movk_i32 flat_scratch_lo, 0x0
626 ; VI: s_movk_i32 flat_scratch_hi, 0x0
627573 ; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
628574 define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
629575 entry:
634580
635581 ; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64:
636582 ; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
637 ; VI: s_movk_i32 flat_scratch_lo, 0x0
638 ; VI: s_movk_i32 flat_scratch_hi, 0x0
639583 ; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
640584 ; GCN: buffer_store_dword [[RET]]
641585 define void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
668612
669613 ; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset:
670614 ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
671 ; VI: s_movk_i32 flat_scratch_lo, 0x0
672 ; VI: s_movk_i32 flat_scratch_hi, 0x0
673615 ; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
674616 define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
675617 entry:
681623
682624 ; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64_offset:
683625 ; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
684 ; VI: s_movk_i32 flat_scratch_lo, 0x0
685 ; VI: s_movk_i32 flat_scratch_hi, 0x0
686626 ; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
687627 ; GCN: buffer_store_dword [[RET]]
688628 define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
714654
715655 ; FUNC-LABEL: {{^}}atomic_or_i32_addr64:
716656 ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
717 ; VI: s_movk_i32 flat_scratch_lo, 0x0
718 ; VI: s_movk_i32 flat_scratch_hi, 0x0
719657 ; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
720658 define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
721659 entry:
726664
727665 ; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64:
728666 ; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
729 ; VI: s_movk_i32 flat_scratch_lo, 0x0
730 ; VI: s_movk_i32 flat_scratch_hi, 0x0
731667 ; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
732668 ; GCN: buffer_store_dword [[RET]]
733669 define void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
770706
771707 ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset:
772708 ; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
773 ; VI: s_movk_i32 flat_scratch_lo, 0x0
774 ; VI: s_movk_i32 flat_scratch_hi, 0x0
775709 ; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
776710 ; GCN: buffer_store_dword [[RET]]
777711 define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
803737
804738 ; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64:
805739 ; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
806 ; VI: s_movk_i32 flat_scratch_lo, 0x0
807 ; VI: s_movk_i32 flat_scratch_hi, 0x0
808740 ; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
809741 define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
810742 entry:
815747
816748 ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64:
817749 ; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
818 ; VI: s_movk_i32 flat_scratch_lo, 0x0
819 ; VI: s_movk_i32 flat_scratch_hi, 0x0
820750 ; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
821751 ; GCN: buffer_store_dword [[RET]]
822752 define void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
849779
850780 ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset:
851781 ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
852 ; VI: s_movk_i32 flat_scratch_lo, 0x0
853 ; VI: s_movk_i32 flat_scratch_hi, 0x0
854782 ; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
855783 define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
856784 entry:
862790
863791 ; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset:
864792 ; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
865 ; VI: s_movk_i32 flat_scratch_lo, 0x0
866 ; VI: s_movk_i32 flat_scratch_hi, 0x0
867793 ; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
868794 ; GCN: buffer_store_dword [[RET]]
869795 define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
895821
896822 ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64:
897823 ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
898 ; VI: s_movk_i32 flat_scratch_lo, 0x0
899 ; VI: s_movk_i32 flat_scratch_hi, 0x0
900824 ; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
901825 define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
902826 entry:
907831
908832 ; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64:
909833 ; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
910 ; VI: s_movk_i32 flat_scratch_lo, 0x0
911 ; VI: s_movk_i32 flat_scratch_hi, 0x0
912834 ; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
913835 ; GCN: buffer_store_dword [[RET]]
914836 define void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
2727
2828 ; ELF: Symbol {
2929 ; ELF: Name: simple
30 ; ELF: Size: 296
30 ; ELF: Size: 288
3131 ; ELF: Type: AMDGPU_HSA_KERNEL (0xA)
3232 ; ELF: }
3333
1616 ; GCNHSA: .amd_kernel_code_t
1717
1818 ; GCNHSA: compute_pgm_rsrc2_scratch_en = 1
19 ; GCNHSA: compute_pgm_rsrc2_user_sgpr = 6
19 ; GCNHSA: compute_pgm_rsrc2_user_sgpr = 8
2020 ; GCNHSA: compute_pgm_rsrc2_tgid_x_en = 1
2121 ; GCNHSA: compute_pgm_rsrc2_tgid_y_en = 0
2222 ; GCNHSA: compute_pgm_rsrc2_tgid_z_en = 0
2828 ; GCNHSA: enable_sgpr_queue_ptr = 0
2929 ; GCNHSA: enable_sgpr_kernarg_segment_ptr = 1
3030 ; GCNHSA: enable_sgpr_dispatch_id = 0
31 ; GCNHSA: enable_sgpr_flat_scratch_init = 0
31 ; GCNHSA: enable_sgpr_flat_scratch_init = 1
3232 ; GCNHSA: enable_sgpr_private_segment_size = 0
3333 ; GCNHSA: enable_sgpr_grid_workgroup_count_x = 0
3434 ; GCNHSA: enable_sgpr_grid_workgroup_count_y = 0
3838 ; GCNHSA: .end_amd_kernel_code_t
3939
4040
41 ; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen
42 ; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen
41 ; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen
42 ; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen
4343
4444 ; Scratch size = alloca size + emergency stack slot
4545 ; ALL: ; ScratchSize: 32772
3232 ; by 4 bytes.
3333 ; HSA-ALLOCA: workitem_private_segment_byte_size = 24
3434 ; HSA-ALLOCA: .end_amd_kernel_code_t
35
36 ; HSA-ALLOCA: s_mov_b32 flat_scratch_lo, s7
37 ; HSA-ALLOCA: s_add_u32 s6, s6, s9
38 ; HSA-ALLOCA: s_lshr_b32 flat_scratch_hi, s6, 8
3539
3640 ; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
3741 ; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
None ; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
2 ; XUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA %s
3 ; XUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA %s
0 ; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=SIMESA %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=VIMESA %s
2 ; RUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s
3 ; RUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA -check-prefix=HSA %s
44
55 ; This ends up using all 256 registers and requires register
66 ; scavenging which will fail to find an unused register.
1616
1717 ; GCN-LABEL: {{^}}spill_vgpr_compute:
1818
19 ; GCN: s_mov_b32 s16, s3
20 ; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
21 ; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
22 ; GCN-NEXT: s_mov_b32 s14, -1
23 ; SI-NEXT: s_mov_b32 s15, 0x98f000
24 ; VI-NEXT: s_mov_b32 s15, 0x980000
19 ; HSA: enable_sgpr_private_segment_buffer = 1
20 ; HSA: enable_sgpr_flat_scratch_init = 0
21 ; HSA: workitem_private_segment_byte_size = 1024
22
23 ; GCN-NOT: flat_scr
24
25 ; GCNMESA: s_mov_b32 s16, s3
26 ; GCNMESA: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
27 ; GCNMESA-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
28 ; GCNMESA-NEXT: s_mov_b32 s14, -1
29 ; SIMESA-NEXT: s_mov_b32 s15, 0x98f000
30 ; VIMESA-NEXT: s_mov_b32 s15, 0x980000
2531
2632
2733 ; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill