AMDGPU: Add pass to optimize reqd_work_group_size

Eliminate loads from the dispatch packet when they will have a known value. Also pattern match the code used by the library to handle partial workgroup dispatches, which isn't necessary if reqd_work_group_size is used.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@332771 91177308-0d34-0410-b5e6-96231b3b80d8
Matt Arsenault, 1 year, 9 months ago
5 changed files with 781 additions and 0 deletions.
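For context, the "partial workgroup" handling the pass pattern matches is the usual device-library implementation of OpenCL's get_local_size: the last workgroup along a dimension may be partial, so the library clamps against what remains of the grid. A minimal C-style sketch of that idiom (illustrative only, not the actual ROCm device-library source):

    // Clamp the possibly-partial last workgroup in one dimension.
    // grid_size and group_size come from the HSA dispatch packet; group_id is
    // the workgroup id intrinsic for the same dimension.
    static unsigned local_size(unsigned grid_size, unsigned group_size,
                               unsigned group_id) {
      unsigned r = grid_size - group_id * group_size; // work items left in this dimension
      return r < group_size ? r : group_size;         // partial last group clamps
    }

With reqd_work_group_size the group_size is a compile-time constant, and with uniform-work-group-size the select always yields group_size, so the whole computation folds away; that is exactly what the pass below exploits.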
7272 void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
7373 extern char &AMDGPULowerIntrinsicsID;
7474
75 ModulePass *createAMDGPULowerKernelAttributesPass();
76 void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
77 extern char &AMDGPULowerKernelAttributesID;
78
7579 void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
7680 extern char &AMDGPURewriteOutArgumentsID;
7781
0 //===-- AMDGPULowerKernelAttributes.cpp ------------------------------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This pass attempts to make use of reqd_work_group_size metadata
10 /// to eliminate loads from the dispatch packet and to constant fold OpenCL
11 /// get_local_size-like functions.
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include "AMDGPU.h"
16 #include "AMDGPUTargetMachine.h"
17 #include "llvm/Analysis/ValueTracking.h"
18 #include "llvm/CodeGen/Passes.h"
19 #include "llvm/CodeGen/TargetPassConfig.h"
20 #include "llvm/IR/Constants.h"
21 #include "llvm/IR/Function.h"
22 #include "llvm/IR/Instructions.h"
23 #include "llvm/IR/PatternMatch.h"
24 #include "llvm/Pass.h"
25
26 #define DEBUG_TYPE "amdgpu-lower-kernel-attributes"
27
28 using namespace llvm;
29
30 namespace {
31
32 // Field offsets in hsa_kernel_dispatch_packet_t.
33 enum DispatchPackedOffsets {
34 WORKGROUP_SIZE_X = 4,
35 WORKGROUP_SIZE_Y = 6,
36 WORKGROUP_SIZE_Z = 8,
37
38 GRID_SIZE_X = 12,
39 GRID_SIZE_Y = 16,
40 GRID_SIZE_Z = 20
41 };
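// For reference, the offsets above correspond to the following fields of the
// HSA hsa_kernel_dispatch_packet_t (abridged; byte offsets shown; see the HSA
// runtime headers for the authoritative definition):
//   uint16_t workgroup_size_x;  // offset 4
//   uint16_t workgroup_size_y;  // offset 6
//   uint16_t workgroup_size_z;  // offset 8
//   uint32_t grid_size_x;       // offset 12
//   uint32_t grid_size_y;       // offset 16
//   uint32_t grid_size_z;       // offset 20
// This is why the workgroup sizes are matched as 2-byte loads and the grid
// sizes as 4-byte loads below.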
42
43 class AMDGPULowerKernelAttributes : public ModulePass {
44 Module *Mod = nullptr;
45
46 public:
47 static char ID;
48
49 AMDGPULowerKernelAttributes() : ModulePass(ID) {}
50
51 bool processUse(CallInst *CI);
52
53 bool doInitialization(Module &M) override;
54 bool runOnModule(Module &M) override;
55
56 StringRef getPassName() const override {
57 return "AMDGPU Kernel Attributes";
58 }
59
60 void getAnalysisUsage(AnalysisUsage &AU) const override {
61 AU.setPreservesAll();
62 }
63 };
64
65 } // end anonymous namespace
66
67 bool AMDGPULowerKernelAttributes::doInitialization(Module &M) {
68 Mod = &M;
69 return false;
70 }
71
72 bool AMDGPULowerKernelAttributes::processUse(CallInst *CI) {
73 Function *F = CI->getParent()->getParent();
74
75 auto MD = F->getMetadata("reqd_work_group_size");
76 const bool HasReqdWorkGroupSize = MD && MD->getNumOperands() == 3;
77
78 const bool HasUniformWorkGroupSize =
79 F->getFnAttribute("uniform-work-group-size").getValueAsString() == "true";
80
81 if (!HasReqdWorkGroupSize && !HasUniformWorkGroupSize)
82 return false;
83
84 Value *WorkGroupSizeX = nullptr;
85 Value *WorkGroupSizeY = nullptr;
86 Value *WorkGroupSizeZ = nullptr;
87
88 Value *GridSizeX = nullptr;
89 Value *GridSizeY = nullptr;
90 Value *GridSizeZ = nullptr;
91
92 const DataLayout &DL = Mod->getDataLayout();
93
94 // We expect to see several GEP users, cast to the appropriate type and
95 // loaded.
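// For example (register names illustrative; this is the shape the tests below
// exercise), a workgroup-size access looks like:
//   %gep = getelementptr i8, i8 addrspace(4)* %dispatch.ptr, i64 4
//   %bc  = bitcast i8 addrspace(4)* %gep to i16 addrspace(4)*
//   %val = load i16, i16 addrspace(4)* %bc, align 4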
96 for (User *U : CI->users()) {
97 if (!U->hasOneUse())
98 continue;
99
100 int64_t Offset = 0;
101 if (GetPointerBaseWithConstantOffset(U, Offset, DL) != CI)
102 continue;
103
104 auto *BCI = dyn_cast<BitCastInst>(*U->user_begin());
105 if (!BCI || !BCI->hasOneUse())
106 continue;
107
108 auto *Load = dyn_cast<LoadInst>(*BCI->user_begin());
109 if (!Load || !Load->isSimple())
110 continue;
111
112 unsigned LoadSize = DL.getTypeStoreSize(Load->getType());
113
114 // TODO: Handle merged loads.
115 switch (Offset) {
116 case WORKGROUP_SIZE_X:
117 if (LoadSize == 2)
118 WorkGroupSizeX = Load;
119 break;
120 case WORKGROUP_SIZE_Y:
121 if (LoadSize == 2)
122 WorkGroupSizeY = Load;
123 break;
124 case WORKGROUP_SIZE_Z:
125 if (LoadSize == 2)
126 WorkGroupSizeZ = Load;
127 break;
128 case GRID_SIZE_X:
129 if (LoadSize == 4)
130 GridSizeX = Load;
131 break;
132 case GRID_SIZE_Y:
133 if (LoadSize == 4)
134 GridSizeY = Load;
135 break;
136 case GRID_SIZE_Z:
137 if (LoadSize == 4)
138 GridSizeZ = Load;
139 break;
140 default:
141 break;
142 }
143 }
144
145 // Pattern match the code used to handle partial workgroup dispatches in the
146 // library implementation of get_local_size, so the entire function can be
147 // constant folded with a known group size.
148 //
149 // uint r = grid_size - group_id * group_size;
150 // get_local_size = (r < group_size) ? r : group_size;
151 //
152 // If we have uniform-work-group-size (which is the default in OpenCL 1.2),
153 // the grid_size is required to be a multiple of group_size. In this case:
154 //
155 // grid_size - (group_id * group_size) < group_size
156 // ->
157 // grid_size < group_size + (group_id * group_size)
158 //
159 // (grid_size / group_size) < 1 + group_id
160 //
161 // grid_size / group_size is at least 1, so we can conclude the select
162 // condition is false (except for group_id == 0, where the select result is
163 // the same).
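//
// Worked example (illustrative numbers): with group_size = 8 and a uniform
// grid_size = 64, group_id = 3 gives r = 64 - 3 * 8 = 40, and 40 < 8 is
// false, so the select returns group_size; for group_id = 0, r = 64 and the
// select again returns group_size (8).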
164
165 bool MadeChange = false;
166 Value *WorkGroupSizes[3] = { WorkGroupSizeX, WorkGroupSizeY, WorkGroupSizeZ };
167 Value *GridSizes[3] = { GridSizeX, GridSizeY, GridSizeZ };
168
169 for (int I = 0; HasUniformWorkGroupSize && I < 3; ++I) {
170 Value *GroupSize = WorkGroupSizes[I];
171 Value *GridSize = GridSizes[I];
172 if (!GroupSize || !GridSize)
173 continue;
174
175 for (User *U : GroupSize->users()) {
176 auto *ZextGroupSize = dyn_cast<ZExtInst>(U);
177 if (!ZextGroupSize)
178 continue;
179
180 for (User *ZextUser : ZextGroupSize->users()) {
181 auto *SI = dyn_cast<SelectInst>(ZextUser);
182 if (!SI)
183 continue;
184
185 using namespace llvm::PatternMatch;
186 auto GroupIDIntrin = I == 0 ?
187 m_Intrinsic<Intrinsic::amdgcn_workgroup_id_x>() :
188 (I == 1 ? m_Intrinsic<Intrinsic::amdgcn_workgroup_id_y>() :
189 m_Intrinsic<Intrinsic::amdgcn_workgroup_id_z>());
190
191 auto SubExpr = m_Sub(m_Specific(GridSize),
192 m_Mul(GroupIDIntrin, m_Specific(ZextGroupSize)));
193
194 ICmpInst::Predicate Pred;
195 if (match(SI,
196 m_Select(m_ICmp(Pred, SubExpr, m_Specific(ZextGroupSize)),
197 SubExpr,
198 m_Specific(ZextGroupSize))) &&
199 Pred == ICmpInst::ICMP_ULT) {
200 if (HasReqdWorkGroupSize) {
201 ConstantInt *KnownSize
202 = mdconst::extract<ConstantInt>(MD->getOperand(I));
203 SI->replaceAllUsesWith(ConstantExpr::getIntegerCast(KnownSize,
204 SI->getType(),
205 false));
206 } else {
207 SI->replaceAllUsesWith(ZextGroupSize);
208 }
209
210 MadeChange = true;
211 }
212 }
213 }
214 }
215
216 if (!HasReqdWorkGroupSize)
217 return MadeChange;
218
219 // Eliminate any other loads we can from the dispatch packet.
220 for (int I = 0; I < 3; ++I) {
221 Value *GroupSize = WorkGroupSizes[I];
222 if (!GroupSize)
223 continue;
224
225 ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
226 GroupSize->replaceAllUsesWith(
227 ConstantExpr::getIntegerCast(KnownSize,
228 GroupSize->getType(),
229 false));
230 MadeChange = true;
231 }
232
233 return MadeChange;
234 }
235
236 // TODO: Move makeLIDRangeMetadata usage into here. We don't seem to get the
237 // TargetPassConfig for the subtarget here.
238 bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
239 StringRef DispatchPtrName
240 = Intrinsic::getName(Intrinsic::amdgcn_dispatch_ptr);
241
242 Function *DispatchPtr = Mod->getFunction(DispatchPtrName);
243 if (!DispatchPtr) // Dispatch ptr not used.
244 return false;
245
246 bool MadeChange = false;
247
248 SmallPtrSet<Instruction *, 4> HandledUses;
249 for (auto *U : DispatchPtr->users()) {
250 CallInst *CI = cast<CallInst>(U);
251 if (HandledUses.insert(CI).second) {
252 if (processUse(CI))
253 MadeChange = true;
254 }
255 }
256
257 return MadeChange;
258 }
259
260 INITIALIZE_PASS_BEGIN(AMDGPULowerKernelAttributes, DEBUG_TYPE,
261 "AMDGPU Kernel Attributes", false, false)
262 INITIALIZE_PASS_END(AMDGPULowerKernelAttributes, DEBUG_TYPE,
263 "AMDGPU Kernel Attributes", false, false)
264
265 char AMDGPULowerKernelAttributes::ID = 0;
266
267 ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
268 return new AMDGPULowerKernelAttributes();
269 }
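The TargetMachine change below wires the pass into the default optimization pipeline. For standalone experimentation, a minimal sketch of driving the pass through the legacy pass manager (the helper name runLowerKernelAttributes is hypothetical; it assumes the AMDGPU target is linked in and the module carries reqd_work_group_size metadata):

    #include "AMDGPU.h"
    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"

    // Sketch only: run the new pass over an already-parsed module.
    static bool runLowerKernelAttributes(llvm::Module &M) {
      llvm::legacy::PassManager PM;
      PM.add(llvm::createAMDGPULowerKernelAttributesPass());
      return PM.run(M); // true if the module was modified
    }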
160160 initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
161161 initializeAMDGPUAnnotateUniformValuesPass(*PR);
162162 initializeAMDGPUArgumentUsageInfoPass(*PR);
163 initializeAMDGPULowerKernelAttributesPass(*PR);
163164 initializeAMDGPULowerIntrinsicsPass(*PR);
164165 initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
165166 initializeAMDGPUPromoteAllocaPass(*PR);
402403 // Add infer address spaces pass to the opt pipeline after inlining
403404 // but before SROA to increase SROA opportunities.
404405 PM.add(createInferAddressSpacesPass());
406
407 // This should run after inlining to have any chance of doing anything,
408 // and before other cleanup optimizations.
409 PM.add(createAMDGPULowerKernelAttributesPass());
405410 });
406411 }
407412
3838 AMDGPULibCalls.cpp
3939 AMDGPULibFunc.cpp
4040 AMDGPULowerIntrinsics.cpp
41 AMDGPULowerKernelAttributes.cpp
4142 AMDGPUMachineCFGStructurizer.cpp
4243 AMDGPUMachineFunction.cpp
4344 AMDGPUMachineModuleInfo.cpp
0 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -amdgpu-lower-kernel-attributes -instcombine %s | FileCheck -enable-var-scope %s
1
2 ; CHECK-LABEL: @invalid_reqd_work_group_size(
3 ; CHECK: load i16,
4 define amdgpu_kernel void @invalid_reqd_work_group_size(i16 addrspace(1)* %out) #0 !reqd_work_group_size !1 {
5 %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
6 %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
7 %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
8 %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
9 store i16 %group.size.x, i16 addrspace(1)* %out
10 ret void
11 }
12
13 ; CHECK-LABEL: @volatile_load_group_size_x(
14 ; CHECK: load volatile i16,
15 define amdgpu_kernel void @volatile_load_group_size_x(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
16 %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
17 %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
18 %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
19 %group.size.x = load volatile i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
20 store i16 %group.size.x, i16 addrspace(1)* %out
21 ret void
22 }
23
24 ; CHECK-LABEL: @load_group_size_x(
25 ; CHECK-NEXT: store i16 8,
26 define amdgpu_kernel void @load_group_size_x(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
27 %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
28 %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
29 %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
30 %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
31 store i16 %group.size.x, i16 addrspace(1)* %out
32 ret void
33 }
34
35 ; CHECK-LABEL: @load_group_size_y(
36 ; CHECK-NEXT: store i16 16,
37 define amdgpu_kernel void @load_group_size_y(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
38 %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
39 %gep.group.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 6
40 %gep.group.size.y.bc = bitcast i8 addrspace(4)* %gep.group.size.y to i16 addrspace(4)*
41 %group.size.y = load i16, i16 addrspace(4)* %gep.group.size.y.bc, align 4
42 store i16 %group.size.y, i16 addrspace(1)* %out
43 ret void
44 }
45
46 ; CHECK-LABEL: @load_group_size_z(
47 ; CHECK-NEXT: store i16 2,
48 define amdgpu_kernel void @load_group_size_z(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
49 %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
50 %gep.group.size.z = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 8
51 %gep.group.size.z.bc = bitcast i8 addrspace(4)* %gep.group.size.z to i16 addrspace(4)*
52 %group.size.z = load i16, i16 addrspace(4)* %gep.group.size.z.bc, align 4
53 store i16 %group.size.z, i16 addrspace(1)* %out
54 ret void
55 }
56
57 ; Metadata uses i64 instead of i32
58 ; CHECK-LABEL: @load_group_size_x_reqd_work_group_size_i64(
59 ; CHECK-NEXT: store i16 8,
60 define amdgpu_kernel void @load_group_size_x_reqd_work_group_size_i64(i16 addrspace(1)* %out) #0 !reqd_work_group_size !2 {
61 %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
62 %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
63 %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
64 %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
65 store i16 %group.size.x, i16 addrspace(1)* %out
66 ret void
67 }
68
69 ; Metadata uses i16 instead of i32
70 ; CHECK-LABEL: @load_group_size_x_reqd_work_group_size_i16(
71 ; CHECK-NEXT: store i16 8,
72 define amdgpu_kernel void @load_group_size_x_reqd_work_group_size_i16(i16 addrspace(1)* %out) #0 !reqd_work_group_size !3 {
73 %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
74 %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
75 %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
76 %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
77 store i16 %group.size.x, i16 addrspace(1)* %out
78 ret void
79 }
80
81 ; CHECK-LABEL: @use_local_size_x_8_16_2(
82 ; CHECK-NEXT: store i64 8,
83 define amdgpu_kernel void @use_local_size_x_8_16_2(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
84 %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
85 %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
86 %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
87 %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
88 %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
89 %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
90 %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
91 %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
92 %group.size.x.zext = zext i16 %group.size.x to i32
93 %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
94 %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
95 %cmp = icmp ult i32 %sub, %group.size.x.zext
96 %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
97 %zext = zext i32 %select to i64
98 store i64 %zext, i64 addrspace(1)* %out
99 ret void
100 }
101
102 ; CHECK-LABEL: @use_local_size_y_8_16_2(
103 ; CHECK-NEXT: store i64 16,
104 define amdgpu_kernel void @use_local_size_y_8_16_2(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
105 %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
106 %gep.group.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 6
107 %gep.group.size.y.bc = bitcast i8 addrspace(4)* %gep.group.size.y to i16 addrspace(4)*
108 %group.size.y = load i16, i16 addrspace(4)* %gep.group.size.y.bc, align 4
109 %gep.grid.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 16
110 %gep.grid.size.y.bc = bitcast i8 addrspace(4)* %gep.grid.size.y to i32 addrspace(4)*
111 %grid.size.y = load i32, i32 addrspace(4)* %gep.grid.size.y.bc, align 4
112 %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
113 %group.size.y.zext = zext i16 %group.size.y to i32
114 %group.id_x_group.size.y = mul i32 %group.id, %group.size.y.zext
115 %sub = sub i32 %grid.size.y, %group.id_x_group.size.y
116 %cmp = icmp ult i32 %sub, %group.size.y.zext
117 %select = select i1 %cmp, i32 %sub, i32 %group.size.y.zext
118 %zext = zext i32 %select to i64
119 store i64 %zext, i64 addrspace(1)* %out
120 ret void
121 }
122
123 ; CHECK-LABEL: @use_local_size_z_8_16_2(
124 ; CHECK-NEXT: store i64 2,
125 define amdgpu_kernel void @use_local_size_z_8_16_2(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
126 %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
127 %gep.group.size.z = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 8
128 %gep.group.size.z.bc = bitcast i8 addrspace(4)* %gep.group.size.z to i16 addrspace(4)*
129 %group.size.z = load i16, i16 addrspace(4)* %gep.group.size.z.bc, align 4
130 %gep.grid.size.z = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 20
131 %gep.grid.size.z.bc = bitcast i8 addrspace(4)* %gep.grid.size.z to i32 addrspace(4)*
132 %grid.size.z = load i32, i32 addrspace(4)* %gep.grid.size.z.bc, align 4
133 %group.id = tail call i32 @llvm.amdgcn.workgroup.id.z()
134 %group.size.z.zext = zext i16 %group.size.z to i32
135 %group.id_x_group.size.z = mul i32 %group.id, %group.size.z.zext
136 %sub = sub i32 %grid.size.z, %group.id_x_group.size.z
137 %cmp = icmp ult i32 %sub, %group.size.z.zext
138 %select = select i1 %cmp, i32 %sub, i32 %group.size.z.zext
139 %zext = zext i32 %select to i64
140 store i64 %zext, i64 addrspace(1)* %out
141 ret void
142 }
143
144 ; The select simplification is invalid here, but we can still eliminate the
145 ; load of the group size.
146
147 ; CHECK-LABEL: @local_size_x_8_16_2_wrong_group_id(
148 ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
149 ; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3
150 define amdgpu_kernel void @local_size_x_8_16_2_wrong_group_id(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
151 %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
152 %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
153 %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
154 %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
155 %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
156 %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
157 %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
158 %group.id = tail call i32 @llvm.amdgcn.workgroup.id.y()
159 %group.size.x.zext = zext i16 %group.size.x to i32
160 %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
161 %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
162 %cmp = icmp ult i32 %sub, %group.size.x.zext
163 %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
164 %zext = zext i32 %select to i64
165 store i64 %zext, i64 addrspace(1)* %out
166 ret void
167 }
168
169 ; CHECK-LABEL: @local_size_x_8_16_2_wrong_grid_size(
170 ; CHECK: %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
171 ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
172 ; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3
173 ; CHECK: %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
174 define amdgpu_kernel void @local_size_x_8_16_2_wrong_grid_size(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
175 %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
176 %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
177 %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
178 %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
179 %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 16
180 %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
181 %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
182 %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
183 %group.size.x.zext = zext i16 %group.size.x to i32
184 %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
185 %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
186 %cmp = icmp ult i32 %sub, %group.size.x.zext
187 %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
188 %zext = zext i32 %select to i64
189 store i64 %zext, i64 addrspace(1)* %out
190 ret void
191 }
192
193 ; CHECK-LABEL: @local_size_x_8_16_2_wrong_cmp_type(
194 ; CHECK: %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
195 ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
196 ; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3
197 ; CHECK: %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
198 ; CHECK: %cmp = icmp slt i32 %sub, 8
199 ; CHECK: %select = select i1 %cmp, i32 %sub, i32 8
200 define amdgpu_kernel void @local_size_x_8_16_2_wrong_cmp_type(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
201 %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
202 %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
203 %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
204 %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
205 %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
206 %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
207 %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
208 %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
209 %group.size.x.zext = zext i16 %group.size.x to i32
210 %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
211 %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
212 %cmp = icmp slt i32 %sub, %group.size.x.zext
213 %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
214 %zext = zext i32 %select to i64
215 store i64 %zext, i64 addrspace(1)* %out
216 ret void
217 }
218
219 ; CHECK-LABEL: @local_size_x_8_16_2_wrong_select(
220 ; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3
221 ; CHECK: %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
222 ; CHECK: %1 = icmp ugt i32 %sub, 8
223 ; CHECK: %select = select i1 %1, i32 %sub, i32 8
224 ; CHECK: %zext = zext i32 %select to i64
225 define amdgpu_kernel void @local_size_x_8_16_2_wrong_select(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
226 %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
227 %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
228 %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
229 %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
230 %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
231 %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
232 %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
233 %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
234 %group.size.x.zext = zext i16 %group.size.x to i32
235 %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
236 %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
237 %cmp = icmp ult i32 %sub, %group.size.x.zext
238 %select = select i1 %cmp, i32 %group.size.x.zext, i32 %sub
239 %zext = zext i32 %select to i64
240 store i64 %zext, i64 addrspace(1)* %out
241 ret void
242 }
243
244 ; CHECK-LABEL: @use_local_size_x_8_16_2_wrong_grid_load_size(
245 ; CHECK: %grid.size.x = load i16, i16 addrspace(4)* %gep.grid.size.x.bc, align 4
246 ; CHECK: %grid.size.x.zext = zext i16 %grid.size.x to i32
247 ; CHECK: %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
248 ; CHECK: %group.id_x_group.size.x = shl i32 %group.id, 3
249 ; CHECK: %sub = sub i32 %grid.size.x.zext, %group.id_x_group.size.x
250 define amdgpu_kernel void @use_local_size_x_8_16_2_wrong_grid_load_size(i64 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
251 %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
252 %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
253 %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
254 %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
255 %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
256 %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i16 addrspace(4)*
257 %grid.size.x = load i16, i16 addrspace(4)* %gep.grid.size.x.bc, align 4
258 %grid.size.x.zext = zext i16 %grid.size.x to i32
259 %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
260 %group.size.x.zext = zext i16 %group.size.x to i32
261 %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
262 %sub = sub i32 %grid.size.x.zext, %group.id_x_group.size.x
263 %cmp = icmp ult i32 %sub, %group.size.x.zext
264 %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
265 %zext = zext i32 %select to i64
266 store i64 %zext, i64 addrspace(1)* %out
267 ret void
268 }
269
270 ; CHECK-LABEL: @func_group_size_x(
271 ; CHECK-NEXT: ret i32 8
272 define i32 @func_group_size_x(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
273 %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
274 %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
275 %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
276 %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
277 %zext = zext i16 %group.size.x to i32
278 ret i32 %zext
279 }
280
281 ; CHECK-LABEL: @__ockl_get_local_size_reqd_size(
282 ; CHECK: %group.size = phi i32 [ 2, %bb17 ], [ 16, %bb9 ], [ 8, %bb1 ], [ 1, %bb ]
283 define i64 @__ockl_get_local_size_reqd_size(i32 %arg) #1 !reqd_work_group_size !0 {
284 bb:
285 %tmp = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2
286 switch i32 %arg, label %bb25 [
287 i32 0, label %bb1
288 i32 1, label %bb9
289 i32 2, label %bb17
290 ]
291
292 bb1: ; preds = %bb
293 %tmp2 = tail call i32 @llvm.amdgcn.workgroup.id.x()
294 %tmp3 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 12
295 %tmp4 = bitcast i8 addrspace(4)* %tmp3 to i32 addrspace(4)*
296 %tmp5 = load i32, i32 addrspace(4)* %tmp4, align 4
297 %tmp6 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 4
298 %tmp7 = bitcast i8 addrspace(4)* %tmp6 to i16 addrspace(4)*
299 %tmp8 = load i16, i16 addrspace(4)* %tmp7, align 4
300 br label %bb25
301
302 bb9: ; preds = %bb
303 %tmp10 = tail call i32 @llvm.amdgcn.workgroup.id.y()
304 %tmp11 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 16
305 %tmp12 = bitcast i8 addrspace(4)* %tmp11 to i32 addrspace(4)*
306 %tmp13 = load i32, i32 addrspace(4)* %tmp12, align 8
307 %tmp14 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 6
308 %tmp15 = bitcast i8 addrspace(4)* %tmp14 to i16 addrspace(4)*
309 %tmp16 = load i16, i16 addrspace(4)* %tmp15, align 2
310 br label %bb25
311
312 bb17: ; preds = %bb
313 %tmp18 = tail call i32 @llvm.amdgcn.workgroup.id.z()
314 %tmp19 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 20
315 %tmp20 = bitcast i8 addrspace(4)* %tmp19 to i32 addrspace(4)*
316 %tmp21 = load i32, i32 addrspace(4)* %tmp20, align 4
317 %tmp22 = getelementptr inbounds i8, i8 addrspace(4)* %tmp, i64 8
318 %tmp23 = bitcast i8 addrspace(4)* %tmp22 to i16 addrspace(4)*
319 %tmp24 = load i16, i16 addrspace(4)* %tmp23, align 8
320 br label %bb25
321
322 bb25: ; preds = %bb17, %bb9, %bb1, %bb
323 %tmp26 = phi i32 [ %tmp21, %bb17 ], [ %tmp13, %bb9 ], [ %tmp5, %bb1 ], [ 0, %bb ]
324 %group.size = phi i16 [ %tmp24, %bb17 ], [ %tmp16, %bb9 ], [ %tmp8, %bb1 ], [ 1, %bb ]
325 %tmp28 = phi i32 [ %tmp18, %bb17 ], [ %tmp10, %bb9 ], [ %tmp2, %bb1 ], [ 0, %bb ]
326 %tmp29 = zext i16 %group.size to i32
327 %tmp30 = mul i32 %tmp28, %tmp29
328 %tmp31 = sub i32 %tmp26, %tmp30
329 %tmp32 = icmp ult i32 %tmp31, %tmp29
330 %tmp33 = select i1 %tmp32, i32 %tmp31, i32 %tmp29
331 %tmp34 = zext i32 %tmp33 to i64
332 ret i64 %tmp34
333 }
334
335 ; CHECK-LABEL: @all_local_size(
336 ; CHECK-NEXT: store volatile i64 8, i64 addrspace(1)* %out, align 4
337 ; CHECK-NEXT: store volatile i64 16, i64 addrspace(1)* %out, align 4
338 ; CHECK-NEXT: store volatile i64 2, i64 addrspace(1)* %out, align 4
339 define amdgpu_kernel void @all_local_size(i64 addrspace(1)* nocapture readnone %out) #0 !reqd_work_group_size !0 {
340 %tmp.i = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
341 %tmp2.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #0
342 %tmp3.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 12
343 %tmp4.i = bitcast i8 addrspace(4)* %tmp3.i to i32 addrspace(4)*
344 %tmp5.i = load i32, i32 addrspace(4)* %tmp4.i, align 4
345 %tmp6.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 4
346 %tmp7.i = bitcast i8 addrspace(4)* %tmp6.i to i16 addrspace(4)*
347 %tmp8.i = load i16, i16 addrspace(4)* %tmp7.i, align 4
348 %tmp29.i = zext i16 %tmp8.i to i32
349 %tmp30.i = mul i32 %tmp2.i, %tmp29.i
350 %tmp31.i = sub i32 %tmp5.i, %tmp30.i
351 %tmp32.i = icmp ult i32 %tmp31.i, %tmp29.i
352 %tmp33.i = select i1 %tmp32.i, i32 %tmp31.i, i32 %tmp29.i
353 %tmp34.i = zext i32 %tmp33.i to i64
354 %tmp10.i = tail call i32 @llvm.amdgcn.workgroup.id.y() #0
355 %tmp11.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 16
356 %tmp12.i = bitcast i8 addrspace(4)* %tmp11.i to i32 addrspace(4)*
357 %tmp13.i = load i32, i32 addrspace(4)* %tmp12.i, align 8
358 %tmp14.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 6
359 %tmp15.i = bitcast i8 addrspace(4)* %tmp14.i to i16 addrspace(4)*
360 %tmp16.i = load i16, i16 addrspace(4)* %tmp15.i, align 2
361 %tmp29.i9 = zext i16 %tmp16.i to i32
362 %tmp30.i10 = mul i32 %tmp10.i, %tmp29.i9
363 %tmp31.i11 = sub i32 %tmp13.i, %tmp30.i10
364 %tmp32.i12 = icmp ult i32 %tmp31.i11, %tmp29.i9
365 %tmp33.i13 = select i1 %tmp32.i12, i32 %tmp31.i11, i32 %tmp29.i9
366 %tmp34.i14 = zext i32 %tmp33.i13 to i64
367 %tmp18.i = tail call i32 @llvm.amdgcn.workgroup.id.z() #0
368 %tmp19.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 20
369 %tmp20.i = bitcast i8 addrspace(4)* %tmp19.i to i32 addrspace(4)*
370 %tmp21.i = load i32, i32 addrspace(4)* %tmp20.i, align 4
371 %tmp22.i = getelementptr inbounds i8, i8 addrspace(4)* %tmp.i, i64 8
372 %tmp23.i = bitcast i8 addrspace(4)* %tmp22.i to i16 addrspace(4)*
373 %tmp24.i = load i16, i16 addrspace(4)* %tmp23.i, align 8
374 %tmp29.i2 = zext i16 %tmp24.i to i32
375 %tmp30.i3 = mul i32 %tmp18.i, %tmp29.i2
376 %tmp31.i4 = sub i32 %tmp21.i, %tmp30.i3
377 %tmp32.i5 = icmp ult i32 %tmp31.i4, %tmp29.i2
378 %tmp33.i6 = select i1 %tmp32.i5, i32 %tmp31.i4, i32 %tmp29.i2
379 %tmp34.i7 = zext i32 %tmp33.i6 to i64
380 store volatile i64 %tmp34.i, i64 addrspace(1)* %out, align 4
381 store volatile i64 %tmp34.i14, i64 addrspace(1)* %out, align 4
382 store volatile i64 %tmp34.i7, i64 addrspace(1)* %out, align 4
383 ret void
384 }
385
386 ; TODO: Should be able to handle this, but not much reason to.
387 ; CHECK-LABEL: @partial_load_group_size_x(
388 ; CHECK-NEXT: %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
389 ; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
390 ; CHECK-NEXT: %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 1
391 ; CHECK-NEXT: store i8 %group.size.x.lo, i8 addrspace(1)* %out, align 1
392 define amdgpu_kernel void @partial_load_group_size_x(i8 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
393 %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
394 %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
395 %group.size.x.lo = load i8, i8 addrspace(4)* %gep.group.size.x, align 1
396 store i8 %group.size.x.lo, i8 addrspace(1)* %out
397 ret void
398 }
399
400 ; TODO: Should be able to handle this
401 ; CHECK-LABEL: @load_group_size_xy_i32(
402 ; CHECK: %group.size.xy = load i32,
403 ; CHECK: store i32 %group.size.xy
404 define amdgpu_kernel void @load_group_size_xy_i32(i32 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
405 %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
406 %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
407 %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i32 addrspace(4)*
408 %group.size.xy = load i32, i32 addrspace(4)* %gep.group.size.x.bc, align 4
409 store i32 %group.size.xy, i32 addrspace(1)* %out
410 ret void
411 }
412
413 ; CHECK-LABEL: @load_group_size_x_y_multiple_dispatch_ptr(
414 ; CHECK-NEXT: store volatile i16 8, i16 addrspace(1)* %out, align 2
415 ; CHECK-NEXT: store volatile i16 16, i16 addrspace(1)* %out, align 2
416 define amdgpu_kernel void @load_group_size_x_y_multiple_dispatch_ptr(i16 addrspace(1)* %out) #0 !reqd_work_group_size !0 {
417 %dispatch.ptr0 = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
418 %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr0, i64 4
419 %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
420 %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
421 store volatile i16 %group.size.x, i16 addrspace(1)* %out
422
423 %dispatch.ptr1 = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
424 %gep.group.size.y = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr1, i64 6
425 %gep.group.size.y.bc = bitcast i8 addrspace(4)* %gep.group.size.y to i16 addrspace(4)*
426 %group.size.y = load i16, i16 addrspace(4)* %gep.group.size.y.bc, align 4
427 store volatile i16 %group.size.y, i16 addrspace(1)* %out
428
429 ret void
430 }
431
432 ; CHECK-LABEL: @use_local_size_x_uniform_work_group_size(
433 ; CHECK-NEXT: %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
434 ; CHECK-NEXT: %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
435 ; CHECK-NEXT: %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
436 ; CHECK-NEXT: %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
437 ; CHECK-NEXT: %zext = zext i16 %group.size.x to i64
438 ; CHECK-NEXT: store i64 %zext, i64 addrspace(1)* %out, align 4
439 define amdgpu_kernel void @use_local_size_x_uniform_work_group_size(i64 addrspace(1)* %out) #2 {
440 %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
441 %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
442 %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
443 %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
444 %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
445 %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
446 %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
447 %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
448 %group.size.x.zext = zext i16 %group.size.x to i32
449 %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
450 %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
451 %cmp = icmp ult i32 %sub, %group.size.x.zext
452 %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
453 %zext = zext i32 %select to i64
454 store i64 %zext, i64 addrspace(1)* %out
455 ret void
456 }
457
458 ; CHECK-LABEL: @use_local_size_x_uniform_work_group_size_false(
459 ; CHECK: icmp ult
460 ; CHECK: select
461 define amdgpu_kernel void @use_local_size_x_uniform_work_group_size_false(i64 addrspace(1)* %out) #3 {
462 %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
463 %gep.group.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 4
464 %gep.group.size.x.bc = bitcast i8 addrspace(4)* %gep.group.size.x to i16 addrspace(4)*
465 %group.size.x = load i16, i16 addrspace(4)* %gep.group.size.x.bc, align 4
466 %gep.grid.size.x = getelementptr inbounds i8, i8 addrspace(4)* %dispatch.ptr, i64 12
467 %gep.grid.size.x.bc = bitcast i8 addrspace(4)* %gep.grid.size.x to i32 addrspace(4)*
468 %grid.size.x = load i32, i32 addrspace(4)* %gep.grid.size.x.bc, align 4
469 %group.id = tail call i32 @llvm.amdgcn.workgroup.id.x()
470 %group.size.x.zext = zext i16 %group.size.x to i32
471 %group.id_x_group.size.x = mul i32 %group.id, %group.size.x.zext
472 %sub = sub i32 %grid.size.x, %group.id_x_group.size.x
473 %cmp = icmp ult i32 %sub, %group.size.x.zext
474 %select = select i1 %cmp, i32 %sub, i32 %group.size.x.zext
475 %zext = zext i32 %select to i64
476 store i64 %zext, i64 addrspace(1)* %out
477 ret void
478 }
479
480 ; CHECK-LABEL: @no_use_dispatch_ptr(
481 ; CHECK-NEXT: ret void
482 define amdgpu_kernel void @no_use_dispatch_ptr() {
483 %dispatch.ptr = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
484 ret void
485 }
486
487 declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1
488 declare i32 @llvm.amdgcn.workgroup.id.x() #1
489 declare i32 @llvm.amdgcn.workgroup.id.y() #1
490 declare i32 @llvm.amdgcn.workgroup.id.z() #1
491
492 attributes #0 = { nounwind "uniform-work-group-size"="true" }
493 attributes #1 = { nounwind readnone speculatable }
494 attributes #2 = { nounwind "uniform-work-group-size"="true" }
495 attributes #3 = { nounwind "uniform-work-group-size"="false" }
496
497 !0 = !{i32 8, i32 16, i32 2}
498 !1 = !{i32 8, i32 16}
499 !2 = !{i64 8, i64 16, i64 2}
500 !3 = !{i16 8, i16 16, i16 2}