llvm.org GIT mirror — llvm / 510a2b9

AMDGPU: allow specifying a workgroup size that needs to fit in a compute unit

Summary:
For GL_ARB_compute_shader we need to support workgroup sizes of at least 1024.
However, if we want to allow large workgroup sizes, we may need to use fewer
registers, as we have to run more waves per SIMD.

This patch adds an attribute to specify the maximum work group size the
compiled program needs to support. It defaults to 256, as that has no wave
restrictions.

Reducing the number of registers available is done similarly to how the
registers were reserved for chips with the SGPR init bug.

Reviewers: mareko, arsenm, tstellarAMD, nhaehnle

Subscribers: FireBurn, kerberizer, llvm-commits, arsenm

Differential Revision: http://reviews.llvm.org/D18340

Patch By: Bas Nieuwenhuizen

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@266337 91177308-0d34-0410-b5e6-96231b3b80d8

Committed by Tom Stellard, 4 years ago.

9 changed files with 220 additions and 69 deletions.
include/llvm/Support/MathExtras.h
@@ -620,6 +620,13 @@
   return (Value + Align - 1 - Skew) / Align * Align + Skew;
 }
 
+/// Returns the largest uint64_t less than or equal to \p Value that is
+/// \p Skew mod \p Align. \p Align must be non-zero.
+inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
+  Skew %= Align;
+  return (Value - Skew) / Align * Align + Skew;
+}
+
 /// Returns the offset to the next integer (mod 2**64) that is greater than
 /// or equal to \p Value and is a multiple of \p Align. \p Align must be
 /// non-zero.
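As a quick check of the new helper's semantics alongside the existing alignTo (a standalone sketch; the two function bodies are copied from the hunk above so it compiles on its own):

#include <cassert>
#include <cstdint>

// Copied from the hunk above so the example is self-contained.
inline uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
  Skew %= Align;
  return (Value + Align - 1 - Skew) / Align * Align + Skew;
}

inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
  Skew %= Align;
  return (Value - Skew) / Align * Align + Skew;
}

int main() {
  assert(alignDown(200, 16) == 192); // largest multiple of 16 <= 200
  assert(alignDown(192, 16) == 192); // already-aligned values are unchanged
  assert(alignDown(7, 4, 1) == 5);   // largest value <= 7 that is 1 (mod 4)
  assert(alignTo(1024, 256) == 1024);
  assert(alignTo(1025, 256) == 1280);
  return 0;
}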
lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -495,10 +495,12 @@
 
   DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
 
-  // FIXME: This is the maximum work group size. We should try to get
-  // value from the reqd_work_group_size function attribute if it is
-  // available.
-  unsigned WorkGroupSize = 256;
+  const Function &ContainingFunction = *I.getParent()->getParent();
+
+  // FIXME: We should also try to get this value from the reqd_work_group_size
+  // function attribute if it is available.
+  unsigned WorkGroupSize = AMDGPU::getMaximumWorkGroupSize(ContainingFunction);
+
   int AllocaSize =
       WorkGroupSize * Mod->getDataLayout().getTypeAllocSize(AllocaTy);
 
@@ -519,7 +521,7 @@
 
   Function *F = I.getParent()->getParent();
 
-  Type *GVTy = ArrayType::get(I.getAllocatedType(), 256);
+  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
   GlobalVariable *GV = new GlobalVariable(
       *Mod, GVTy, false, GlobalValue::InternalLinkage,
       UndefValue::get(GVTy),
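The effect of wiring the attribute in here: the pass now materializes one LDS array row per potential work item instead of always 256. A CPU-side model of the rewritten alloca (a minimal sketch; `tid` stands in for the linearized workitem id the pass derives from the workitem intrinsics):

#include <cassert>
#include <cstdint>

// Before the pass: each invocation has its own [5 x i32] stack object.
// After: one [WorkGroupSize x [5 x i32]] LDS array, one row per invocation.
constexpr unsigned WorkGroupSize = 1024; // "amdgpu-max-work-group-size"="1024"

static int32_t LDS[WorkGroupSize][5]; // models @foo.stack in addrspace(3)

int32_t *privateStack(unsigned tid) {
  assert(tid < WorkGroupSize); // a larger launch would make rows alias
  return LDS[tid];             // replaces this invocation's alloca
}

int main() {
  privateStack(0)[2] = 42;         // invocation 0 writes its own row
  assert(privateStack(0)[2] == 42);
  assert(privateStack(1)[2] == 0); // invocation 1 sees only its own row
  return 0;
}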
lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -47,6 +47,7 @@
     PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
     PSInputAddr(0),
     ReturnsVoid(true),
+    MaximumWorkGroupSize(0),
     LDSWaveSpillSize(0),
     PSInputEna(0),
     NumUserSGPRs(0),
@@ -122,6 +123,11 @@
   if (HasStackObjects && ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS &&
       ST.isAmdHsaOS())
     FlatScratchInit = true;
+
+  if (AMDGPU::isCompute(F->getCallingConv()))
+    MaximumWorkGroupSize = AMDGPU::getMaximumWorkGroupSize(*F);
+  else
+    MaximumWorkGroupSize = ST.getWavefrontSize();
 }
 
 unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
@@ -201,10 +207,5 @@
 
 unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize(
   const MachineFunction &MF) const {
-  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
-  // FIXME: We should get this information from kernel attributes if it
-  // is available.
-  if (AMDGPU::isCompute(MF.getFunction()->getCallingConv()))
-    return 256;
-  return ST.getWavefrontSize();
-}
+  return MaximumWorkGroupSize;
+}
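Taken together with the constructor change above, the limit is now computed once per function: compute kernels read the attribute (default 256), graphics shaders fall back to the wavefront size. A minimal model of that selection (standalone sketch with the calling-convention and subtarget queries reduced to parameters):

#include <cassert>

// Mirrors: isCompute(CC) ? getMaximumWorkGroupSize(F) : WavefrontSize.
static unsigned maxWorkGroupSize(bool IsComputeCC, unsigned AttrValue,
                                 unsigned WavefrontSize) {
  return IsComputeCC ? AttrValue : WavefrontSize;
}

int main() {
  assert(maxWorkGroupSize(true, 1024, 64) == 1024); // compute, attribute set
  assert(maxWorkGroupSize(true, 256, 64) == 256);   // compute, default
  assert(maxWorkGroupSize(false, 1024, 64) == 64);  // graphics shader
  return 0;
}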
lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -59,6 +59,8 @@
   unsigned PSInputAddr;
   bool ReturnsVoid;
 
+  unsigned MaximumWorkGroupSize;
+
 public:
   // FIXME: Make private
   unsigned LDSWaveSpillSize;
lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -22,6 +22,53 @@
 
 using namespace llvm;
 
+static unsigned getMaxWaveCountPerSIMD(const MachineFunction &MF) {
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  unsigned SIMDPerCU = 4;
+
+  unsigned MaxInvocationsPerWave = SIMDPerCU * ST.getWavefrontSize();
+  return alignTo(MFI.getMaximumWorkGroupSize(MF), MaxInvocationsPerWave) /
+         MaxInvocationsPerWave;
+}
+
+static unsigned getMaxWorkGroupSGPRCount(const MachineFunction &MF) {
+  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
+
+  unsigned TotalSGPRCountPerSIMD, AddressableSGPRCount, SGPRUsageAlignment;
+  unsigned ReservedSGPRCount;
+
+  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+    TotalSGPRCountPerSIMD = 800;
+    AddressableSGPRCount = 102;
+    SGPRUsageAlignment = 16;
+    ReservedSGPRCount = 6; // VCC, FLAT_SCRATCH, XNACK
+  } else {
+    TotalSGPRCountPerSIMD = 512;
+    AddressableSGPRCount = 104;
+    SGPRUsageAlignment = 8;
+    ReservedSGPRCount = 2; // VCC
+  }
+
+  unsigned MaxSGPRCount = (TotalSGPRCountPerSIMD / MaxWaveCountPerSIMD);
+  MaxSGPRCount = alignDown(MaxSGPRCount, SGPRUsageAlignment);
+
+  if (ST.hasSGPRInitBug())
+    MaxSGPRCount = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+
+  return std::min(MaxSGPRCount - ReservedSGPRCount, AddressableSGPRCount);
+}
+
+static unsigned getMaxWorkGroupVGPRCount(const MachineFunction &MF) {
+  unsigned MaxWaveCountPerSIMD = getMaxWaveCountPerSIMD(MF);
+  unsigned TotalVGPRCountPerSIMD = 256;
+  unsigned VGPRUsageAlignment = 4;
+
+  return alignDown(TotalVGPRCountPerSIMD / MaxWaveCountPerSIMD,
+                   VGPRUsageAlignment);
+}
+
 static bool hasPressureSet(const int *PSets, unsigned PSetID) {
   for (unsigned i = 0; PSets[i] != -1; ++i) {
     if (PSets[i] == (int)PSetID)
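Concretely, for a VI part (4 SIMDs per CU, wavefront size 64) with "amdgpu-max-work-group-size"="1024" — the case the tonga test at the end of this commit exercises — the three helpers above work out as follows (a standalone sketch with the subtarget constants inlined):

#include <algorithm>
#include <cassert>
#include <cstdint>

static uint64_t alignTo(uint64_t V, uint64_t A) { return (V + A - 1) / A * A; }
static uint64_t alignDown(uint64_t V, uint64_t A) { return V / A * A; }

int main() {
  // VI values from getMaxWorkGroupSGPRCount above.
  const unsigned WavefrontSize = 64, SIMDPerCU = 4;
  const unsigned TotalSGPRPerSIMD = 800, AddressableSGPRs = 102;
  const unsigned SGPRAlign = 16, ReservedSGPRs = 6; // VCC, FLAT_SCRATCH, XNACK
  const unsigned TotalVGPRPerSIMD = 256, VGPRAlign = 4;
  const unsigned MaxWorkGroupSize = 1024;

  // A 1024-item group is 16 wavefronts; spread over 4 SIMDs, 4 waves must
  // be resident on each SIMD at once.
  const unsigned PerCU = SIMDPerCU * WavefrontSize;
  unsigned WavesPerSIMD = alignTo(MaxWorkGroupSize, PerCU) / PerCU;
  assert(WavesPerSIMD == 4);

  // SGPRs: 800 / 4 = 200, aligned down to 192, minus 6 reserved = 186,
  // but the addressable limit of 102 is what actually binds.
  unsigned SGPRs = std::min(
      unsigned(alignDown(TotalSGPRPerSIMD / WavesPerSIMD, SGPRAlign)) -
          ReservedSGPRs,
      AddressableSGPRs);
  assert(SGPRs == 102);

  // VGPRs: 256 / 4 = 64. This is the cap the tonga test's "NumVgprs: 63"
  // has to stay under.
  unsigned VGPRs = alignDown(TotalVGPRPerSIMD / WavesPerSIMD, VGPRAlign);
  assert(VGPRs == 64);
  return 0;
}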
@@ -70,38 +117,27 @@
 
 unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
   const MachineFunction &MF) const {
-  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
-  if (ST.hasSGPRInitBug()) {
-    // Leave space for flat_scr, xnack_mask, vcc, and alignment
-    unsigned BaseIdx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 8 - 4;
-    unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
-    return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
-  }
-
-  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
-    // 96/97 need to be reserved for flat_scr, 98/99 for xnack_mask, and
-    // 100/101 for vcc. This is the next sgpr128 down.
-    return AMDGPU::SGPR92_SGPR93_SGPR94_SGPR95;
-  }
-
-  return AMDGPU::SGPR96_SGPR97_SGPR98_SGPR99;
+  unsigned BaseIdx = alignDown(getMaxWorkGroupSGPRCount(MF), 4) - 4;
+  unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
+  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
 }
 
 unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
   const MachineFunction &MF) const {
-  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
-  if (ST.hasSGPRInitBug()) {
-    unsigned Idx = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6 - 1;
-    return AMDGPU::SGPR_32RegClass.getRegister(Idx);
-  }
-
-  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
-    // Next register before reservations for flat_scr, xnack_mask, vcc,
-    // and scratch resource.
-    return AMDGPU::SGPR91;
-  }
-
-  return AMDGPU::SGPR95;
+  unsigned RegCount = getMaxWorkGroupSGPRCount(MF);
+  unsigned Reg;
+
+  // Try to place it in a hole after PrivateSegmentBufferReg.
+  if (RegCount & 3) {
+    // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
+    // alignment constraints, so we have a hole where we can put the wave
+    // offset.
+    Reg = RegCount - 1;
+  } else {
+    // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the
+    // wave offset before it.
+    Reg = RegCount - 5;
+  }
+  return AMDGPU::SGPR_32RegClass.getRegister(Reg);
 }
 
 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
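The two rewritten helpers above interact: the scratch resource descriptor needs a 4-aligned quad of SGPRs at the top of the budget, and the wave byte offset slots either into the hole above that quad or just below it. A sketch of just the index arithmetic (hypothetical `place` helper; register numbers only):

#include <cassert>

struct Placement {
  unsigned BufferBase; // first SGPR of the 4-aligned resource descriptor
  unsigned WaveOffset; // SGPR holding the scratch wave byte offset
};

// Mirrors the arithmetic in the two functions above for a given SGPR budget.
static Placement place(unsigned RegCount) {
  Placement P;
  P.BufferBase = RegCount / 4 * 4 - 4; // alignDown(RegCount, 4) - 4
  // Unaligned budget: the SGPRs above the descriptor form a hole that can
  // hold the offset. Aligned budget: put it immediately below the quad.
  P.WaveOffset = (RegCount & 3) ? RegCount - 1 : RegCount - 5;
  return P;
}

int main() {
  // Budget 102 (VI): descriptor in s[96:99], offset in the hole at s101.
  assert(place(102).BufferBase == 96 && place(102).WaveOffset == 101);
  // Budget 96 (4-aligned): descriptor in s[92:95], offset just below at s91.
  assert(place(96).BufferBase == 92 && place(96).WaveOffset == 91);
  return 0;
}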
@@ -123,35 +159,20 @@
   reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
   reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
 
-  // Reserve the last 2 registers so we will always have at least 2 more that
-  // will physically contain VCC.
-  reserveRegisterTuples(Reserved, AMDGPU::SGPR102_SGPR103);
-
-  const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
-
-  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
-    // SI/CI have 104 SGPRs. VI has 102. We need to shift down the reservation
-    // for VCC/XNACK_MASK/FLAT_SCR.
-    //
-    // TODO: The SGPRs that alias to XNACK_MASK could be used as general purpose
-    // SGPRs when the XNACK feature is not used. This is currently not done
-    // because the code that counts SGPRs cannot account for such holes.
-    reserveRegisterTuples(Reserved, AMDGPU::SGPR96_SGPR97);
-    reserveRegisterTuples(Reserved, AMDGPU::SGPR98_SGPR99);
-    reserveRegisterTuples(Reserved, AMDGPU::SGPR100_SGPR101);
-  }
-
-  // Tonga and Iceland can only allocate a fixed number of SGPRs due
-  // to a hw bug.
-  if (ST.hasSGPRInitBug()) {
-    unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
-    // Reserve some SGPRs for FLAT_SCRATCH, XNACK_MASK, and VCC (6 SGPRs).
-    unsigned Limit = AMDGPUSubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG - 6;
-
-    for (unsigned i = Limit; i < NumSGPRs; ++i) {
-      unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
-      reserveRegisterTuples(Reserved, Reg);
-    }
+  unsigned MaxWorkGroupSGPRCount = getMaxWorkGroupSGPRCount(MF);
+  unsigned MaxWorkGroupVGPRCount = getMaxWorkGroupVGPRCount(MF);
+
+  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
+  unsigned NumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
+  for (unsigned i = MaxWorkGroupSGPRCount; i < NumSGPRs; ++i) {
+    unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
+    reserveRegisterTuples(Reserved, Reg);
+  }
+
+
+  for (unsigned i = MaxWorkGroupVGPRCount; i < NumVGPRs; ++i) {
+    unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
+    reserveRegisterTuples(Reserved, Reg);
   }
 
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
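With those budgets in hand, getReservedRegs no longer hard-codes tuples at the top of the file; it blocks off everything from the budget upward. For the 1024-item VI example this reserves s102..s103 and v64..v255 (a sketch; the class sizes of 104 and 256 are assumptions matching the 32-bit SGPR and VGPR register classes on these targets):

#include <cstdio>

int main() {
  const unsigned NumSGPRs = 104, NumVGPRs = 256; // SGPR_32 / VGPR_32 sizes
  const unsigned MaxWorkGroupSGPRCount = 102;    // from the example above
  const unsigned MaxWorkGroupVGPRCount = 64;     // 4 waves per SIMD on VI

  // Mirrors the two loops above: everything at or past the budget is
  // reserved so the allocator never hands it out.
  for (unsigned i = MaxWorkGroupSGPRCount; i < NumSGPRs; ++i)
    std::printf("reserve s%u\n", i); // prints s102, s103
  for (unsigned i = MaxWorkGroupVGPRCount; i < NumVGPRs; ++i)
    std::printf("reserve v%u\n", i); // prints v64 ... v255
  return 0;
}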
lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -123,6 +123,10 @@
   return Result;
 }
 
+unsigned getMaximumWorkGroupSize(const Function &F) {
+  return getIntegerAttribute(F, "amdgpu-max-work-group-size", 256);
+}
+
 unsigned getInitialPSInputAddr(const Function &F) {
   return getIntegerAttribute(F, "InitialPSInputAddr", 0);
 }
lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -44,6 +44,7 @@
 bool isGlobalSegment(const GlobalValue *GV);
 bool isReadOnlySegment(const GlobalValue *GV);
 
+unsigned getMaximumWorkGroupSize(const Function &F);
 unsigned getInitialPSInputAddr(const Function &F);
 
 bool isShader(CallingConv::ID cc);
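For reference, reading the new attribute back goes through the same string-attribute machinery as "InitialPSInputAddr". A rough equivalent in terms of the public IR API (a sketch, not the in-tree helper: getIntegerAttribute also emits a diagnostic on malformed values, which is omitted here):

#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Function.h"

using namespace llvm;

// Sketch of the lookup done by getMaximumWorkGroupSize above.
static unsigned readMaxWorkGroupSize(const Function &F) {
  unsigned Result = 256; // the no-wave-restrictions default
  Attribute A = F.getFnAttribute("amdgpu-max-work-group-size");
  unsigned Value;
  if (A.isStringAttribute() && !A.getValueAsString().getAsInteger(0, Value))
    Result = Value; // only overwrite on a successful parse
  return Result;
}

// A front end opts a kernel in with a single string attribute:
//   F->addFnAttr("amdgpu-max-work-group-size", "1024");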
new file: test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll

; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck %s

; CHECK: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4

define void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
  %stack = alloca [5 x i32], align 4
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; CHECK: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4

define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
entry:
  %stack = alloca [5 x i32], align 4
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

; CHECK: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4

define void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
entry:
  %stack = alloca [5 x i32], align 4
  %0 = load i32, i32 addrspace(1)* %in, align 4
  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
  store i32 4, i32* %arrayidx1, align 4
  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
  %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
  store i32 5, i32* %arrayidx3, align 4
  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
  %2 = load i32, i32* %arrayidx10, align 4
  store i32 %2, i32 addrspace(1)* %out, align 4
  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
  %3 = load i32, i32* %arrayidx12
  %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
  store i32 %3, i32 addrspace(1)* %arrayidx13
  ret void
}

attributes #0 = { nounwind "amdgpu-max-work-group-size"="63" }
attributes #1 = { nounwind "amdgpu-max-work-group-size"="256" }
attributes #2 = { nounwind "amdgpu-max-work-group-size"="1600" }
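A note on why all three functions above are expected to promote: the LDS cost is one 20-byte [5 x i32] row per work item, and even the 1600-item case fits a 32 KiB local-memory budget (the 32 KiB figure is the usual LDS size on these chips and is an assumption here, not something the test encodes). As a compile-time check:

#include <cstdint>

constexpr uint64_t AllocaBytes = 5 * sizeof(int32_t); // one [5 x i32] row

static_assert(63 * AllocaBytes == 1260, "small group");
static_assert(256 * AllocaBytes == 5120, "default group");
static_assert(1600 * AllocaBytes == 32000, "largest group in the test");
static_assert(1600 * AllocaBytes <= 32768, "fits the assumed 32 KiB LDS");

int main() { return 0; }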
new file: test/CodeGen/AMDGPU/large-work-group-registers.ll

; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck %s

; CHECK: NumVgprs: 63
define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, [16 x <8 x i32>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, <3 x i32> inreg, <3 x i32> inreg, <3 x i32>) #0 {
main_body:
  %8 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %4, i64 0, i64 8
  %9 = load <4 x i32>, <4 x i32> addrspace(2)* %8, align 16, !tbaa !0
  %10 = extractelement <3 x i32> %7, i32 0
  %11 = extractelement <3 x i32> %7, i32 1
  %12 = mul i32 %10, %11
  %bc = bitcast <3 x i32> %7 to <3 x float>
  %13 = extractelement <3 x float> %bc, i32 1
  %14 = insertelement <512 x float> undef, float %13, i32 %12
  call void @llvm.amdgcn.s.barrier()
  %15 = extractelement <3 x i32> %6, i32 0
  %16 = extractelement <3 x i32> %7, i32 0
  %17 = shl i32 %15, 5
  %18 = add i32 %17, %16
  %19 = shl i32 %18, 4
  %20 = extractelement <3 x i32> %7, i32 1
  %21 = shl i32 %20, 2
  %22 = sext i32 %21 to i64
  %23 = getelementptr i8, i8 addrspace(3)* null, i64 %22
  %24 = bitcast i8 addrspace(3)* %23 to i32 addrspace(3)*
  %25 = load i32, i32 addrspace(3)* %24, align 4
  %26 = extractelement <512 x float> %14, i32 %25
  %27 = insertelement <4 x float> undef, float %26, i32 0
  call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %27, <4 x i32> %9, i32 0, i32 %19, i1 false, i1 false)
  ret void
}

declare void @llvm.amdgcn.s.barrier() #1

declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2

attributes #0 = { "amdgpu-max-work-group-size"="1024" }
attributes #1 = { convergent nounwind }
attributes #2 = { nounwind }

!0 = !{!1, !1, i64 0, i32 1}
!1 = !{!"const", null}