llvm.org GIT mirror llvm / a52756b
[AMDGPU] Emit metadata for hidden arguments for kernel enqueue Identifies kernels which perform device-side kernel enqueues and emits metadata for the associated hidden kernel arguments. Such kernels are marked with the calls-enqueue-kernel function attribute by the AMDGPUOpenCLEnqueueKernelLowering pass, and later on the hidden kernel argument metadata HiddenDefaultQueue and HiddenCompletionAction are emitted for them. Differential Revision: https://reviews.llvm.org/D39255 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@316907 91177308-0d34-0410-b5e6-96231b3b80d8 Yaxun Liu 1 year, 9 months ago
6 changed file(s) with 214 addition(s) and 9 deletion(s). Raw diff Collapse all Expand all
10381038 passed in the kernarg.
10391039
10401040 "HiddenCompletionAction"
1041 *TBD*
1042
1043 .. TODO
1044 Add description.
1041 A global address space pointer
1042 to help link enqueued kernels into
1043 the ancestor tree for determining
1044 when the parent kernel has finished.
10451045
10461046 "ValueType" string Required Kernel argument value type. Only
10471047 present if "ValueKind" is
2424 // linkage does not work since optimization passes will try to replace loads
2525 // of the global variable with its initialization value.
2626 //
27 // It also identifies the kernels that directly or indirectly enqueue kernels
28 // and adds "calls-enqueue-kernel" function attribute to them, which will
29 // be used to determine whether to emit runtime metadata for the kernel
30 // enqueue related hidden kernel arguments.
31 //
2732 //===----------------------------------------------------------------------===//
2833
2934 #include "AMDGPU.h"
35 #include "llvm/ADT/DenseSet.h"
3036 #include "llvm/ADT/StringRef.h"
3137 #include "llvm/IR/Constants.h"
38 #include "llvm/IR/Instructions.h"
3239 #include "llvm/IR/Module.h"
40 #include "llvm/IR/User.h"
3341 #include "llvm/Pass.h"
3442 #include "llvm/Support/Debug.h"
3543 #include "llvm/Support/raw_ostream.h"
6573 return new AMDGPUOpenCLEnqueuedBlockLowering();
6674 }
6775
76 /// Collect direct or indirect callers of \p F and save them
77 /// to \p Callers.
78 static void collectCallers(Function *F, DenseSet &Callers) {
79 for (auto U : F->users()) {
80 if (auto *CI = dyn_cast(&*U)) { // cast target type is not visible here; presumably a call instruction (TODO confirm)
81 auto *Caller = CI->getParent()->getParent(); // presumably block -> enclosing function (TODO confirm)
82 if (Callers.count(Caller)) // already recorded: skip, so the recursion below is not repeated for it
83 continue;
84 Callers.insert(Caller);
85 collectCallers(Caller, Callers); // recurse to pick up the callers of this caller
86 }
87 }
88 }
89
6890 bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
91 DenseSet Callers;
6992 auto &C = M.getContext();
7093 auto AS = AMDGPU::getAMDGPUAS(M);
7194 bool Changed = false;
90113 AddrCast->replaceAllUsesWith(NewPtr);
91114 F.addFnAttr("runtime-handle", RuntimeHandle);
92115 F.setLinkage(GlobalValue::ExternalLinkage);
116
117 // Collect direct or indirect callers of enqueue_kernel.
118 for (auto U : NewPtr->users()) {
119 if (auto *I = dyn_cast(&*U)) {
120 auto *F = I->getParent()->getParent();
121 Callers.insert(F);
122 collectCallers(F, Callers);
123 }
124 }
93125 Changed = true;
94126 }
95127 }
128
129 for (auto F : Callers) {
130 if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL)
131 continue;
132 F->addFnAttr("calls-enqueue-kernel");
133 }
96134 return Changed;
97135 }
265265 emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY);
266266 emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ);
267267
268 if (!Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
269 return;
270
271268 auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
272269 AMDGPUASI.GLOBAL_ADDRESS);
273 emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer);
270 auto CallsPrintf = Func.getParent()->getNamedMetadata("llvm.printf.fmts");
271 if (CallsPrintf)
272 emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer);
273 if (Func.hasFnAttribute("calls-enqueue-kernel")) {
274 if (!CallsPrintf) {
275 // Emit a dummy argument so that the remaining hidden arguments
276 // have a fixed position relative to the first hidden argument.
277 // This is to facilitate library code to access hidden arguments.
278 emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
279 }
280 emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenDefaultQueue);
281 emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenCompletionAction);
282 }
274283 }
275284
276285 void MetadataStreamer::emitKernelArg(const Argument &Arg) {
88 %struct.ndrange_t = type { i32 }
99 %opencl.queue_t = type opaque
1010
11 define amdgpu_kernel void @test(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
11 ; CHECK: define amdgpu_kernel void @non_caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr !kernel_arg_addr_space
12 define amdgpu_kernel void @non_caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
13 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
14 ret void
15 }
16
17 ; CHECK: define amdgpu_kernel void @caller_indirect(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr #[[AT_CALLER:[0-9]+]]
18 define amdgpu_kernel void @caller_indirect(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
19 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
20 call void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d)
21 ret void
22 }
23
24 ; CHECK: define amdgpu_kernel void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr #[[AT_CALLER]]
25 define amdgpu_kernel void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
1226 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
1327 entry:
1428 %block = alloca <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, align 8
7690 ret void
7791 }
7892
93 ; CHECK: attributes #[[AT_CALLER]] = { "calls-enqueue-kernel" }
7994 ; CHECK: attributes #[[AT1]] = {{.*}}"runtime-handle"="__test_block_invoke_kernel_runtime_handle"
8095 ; CHECK: attributes #[[AT2]] = {{.*}}"runtime-handle"="__test_block_invoke_2_kernel_runtime_handle"
8196
0 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
1 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
2
3 ; CHECK: ---
4 ; CHECK: Version: [ 1, 0 ]
5 ; CHECK-NOT: Printf:
6 ; CHECK: Kernels:
7
8 ; CHECK: - Name: test_non_enqueue_kernel_caller
9 ; CHECK-NEXT: SymbolName: 'test_non_enqueue_kernel_caller@kd'
10 ; CHECK-NEXT: Language: OpenCL C
11 ; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
12 ; CHECK-NEXT: Args:
13 ; CHECK-NEXT: - TypeName: char
14 ; CHECK-NEXT: Size: 1
15 ; CHECK-NEXT: Align: 1
16 ; CHECK-NEXT: ValueKind: ByValue
17 ; CHECK-NEXT: ValueType: I8
18 ; CHECK-NEXT: AccQual: Default
19 ; CHECK-NEXT: - Size: 8
20 ; CHECK-NEXT: Align: 8
21 ; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
22 ; CHECK-NEXT: ValueType: I64
23 ; CHECK-NEXT: - Size: 8
24 ; CHECK-NEXT: Align: 8
25 ; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
26 ; CHECK-NEXT: ValueType: I64
27 ; CHECK-NEXT: - Size: 8
28 ; CHECK-NEXT: Align: 8
29 ; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
30 ; CHECK-NEXT: ValueType: I64
31 ; CHECK-NOT: ValueKind: HiddenNone
32 ; CHECK-NOT: ValueKind: HiddenDefaultQueue
33 ; CHECK-NOT: ValueKind: HiddenCompletionAction
34 define amdgpu_kernel void @test_non_enqueue_kernel_caller(i8 %a)
35 !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
36 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 {
37 ret void
38 }
39
40 ; CHECK: - Name: test_enqueue_kernel_caller
41 ; CHECK-NEXT: SymbolName: 'test_enqueue_kernel_caller@kd'
42 ; CHECK-NEXT: Language: OpenCL C
43 ; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
44 ; CHECK-NEXT: Args:
45 ; CHECK-NEXT: - TypeName: char
46 ; CHECK-NEXT: Size: 1
47 ; CHECK-NEXT: Align: 1
48 ; CHECK-NEXT: ValueKind: ByValue
49 ; CHECK-NEXT: ValueType: I8
50 ; CHECK-NEXT: AccQual: Default
51 ; CHECK-NEXT: - Size: 8
52 ; CHECK-NEXT: Align: 8
53 ; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
54 ; CHECK-NEXT: ValueType: I64
55 ; CHECK-NEXT: - Size: 8
56 ; CHECK-NEXT: Align: 8
57 ; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
58 ; CHECK-NEXT: ValueType: I64
59 ; CHECK-NEXT: - Size: 8
60 ; CHECK-NEXT: Align: 8
61 ; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
62 ; CHECK-NEXT: ValueType: I64
63 ; CHECK-NEXT: - Size: 8
64 ; CHECK-NEXT: Align: 8
65 ; CHECK-NEXT: ValueKind: HiddenNone
66 ; CHECK-NEXT: ValueType: I8
67 ; CHECK-NEXT: AddrSpaceQual: Global
68 ; CHECK-NEXT: - Size: 8
69 ; CHECK-NEXT: Align: 8
70 ; CHECK-NEXT: ValueKind: HiddenDefaultQueue
71 ; CHECK-NEXT: ValueType: I8
72 ; CHECK-NEXT: AddrSpaceQual: Global
73 ; CHECK-NEXT: - Size: 8
74 ; CHECK-NEXT: Align: 8
75 ; CHECK-NEXT: ValueKind: HiddenCompletionAction
76 ; CHECK-NEXT: ValueType: I8
77 ; CHECK-NEXT: AddrSpaceQual: Global
78 define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #0
79 !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
80 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 {
81 ret void
82 }
83
84 attributes #0 = { "calls-enqueue-kernel" }
85
86 !1 = !{i32 0}
87 !2 = !{!"none"}
88 !3 = !{!"char"}
89 !4 = !{!""}
90
91 !opencl.ocl.version = !{!90}
92 !90 = !{i32 2, i32 0}
93
94
95 ; PARSER: AMDGPU HSA Metadata Parser Test: PASS
5050 ; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
5151 ; CHECK-NEXT: ValueType: I8
5252 ; CHECK-NEXT: AddrSpaceQual: Global
53 ; CHECK-NOT: ValueKind: HiddenDefaultQueue
54 ; CHECK-NOT: ValueKind: HiddenCompletionAction
5355 define amdgpu_kernel void @test_char(i8 %a)
5456 !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9
5557 !kernel_arg_base_type !9 !kernel_arg_type_qual !4 {
12661268 ret void
12671269 }
12681270
1271 ; CHECK: - Name: test_enqueue_kernel_caller
1272 ; CHECK-NEXT: SymbolName: 'test_enqueue_kernel_caller@kd'
1273 ; CHECK-NEXT: Language: OpenCL C
1274 ; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
1275 ; CHECK-NEXT: Args:
1276 ; CHECK-NEXT: - TypeName: char
1277 ; CHECK-NEXT: Size: 1
1278 ; CHECK-NEXT: Align: 1
1279 ; CHECK-NEXT: ValueKind: ByValue
1280 ; CHECK-NEXT: ValueType: I8
1281 ; CHECK-NEXT: AccQual: Default
1282 ; CHECK-NEXT: - Size: 8
1283 ; CHECK-NEXT: Align: 8
1284 ; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
1285 ; CHECK-NEXT: ValueType: I64
1286 ; CHECK-NEXT: - Size: 8
1287 ; CHECK-NEXT: Align: 8
1288 ; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
1289 ; CHECK-NEXT: ValueType: I64
1290 ; CHECK-NEXT: - Size: 8
1291 ; CHECK-NEXT: Align: 8
1292 ; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
1293 ; CHECK-NEXT: ValueType: I64
1294 ; CHECK-NEXT: - Size: 8
1295 ; CHECK-NEXT: Align: 8
1296 ; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
1297 ; CHECK-NEXT: ValueType: I8
1298 ; CHECK-NEXT: AddrSpaceQual: Global
1299 ; CHECK-NEXT: - Size: 8
1300 ; CHECK-NEXT: Align: 8
1301 ; CHECK-NEXT: ValueKind: HiddenDefaultQueue
1302 ; CHECK-NEXT: ValueType: I8
1303 ; CHECK-NEXT: AddrSpaceQual: Global
1304 ; CHECK-NEXT: - Size: 8
1305 ; CHECK-NEXT: Align: 8
1306 ; CHECK-NEXT: ValueKind: HiddenCompletionAction
1307 ; CHECK-NEXT: ValueType: I8
1308 ; CHECK-NEXT: AddrSpaceQual: Global
1309 define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #1
1310 !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9
1311 !kernel_arg_base_type !9 !kernel_arg_type_qual !4 {
1312 ret void
1313 }
1314
12691315 attributes #0 = { "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
1316 attributes #1 = { "calls-enqueue-kernel" }
12701317
12711318 !llvm.printf.fmts = !{!100, !101}
12721319