commit 2930e5c: [AMDGPU] Change constant addr space to 4
Author: Yaxun Liu

Differential Revision: https://reviews.llvm.org/D43170
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@325030 91177308-0d34-0410-b5e6-96231b3b80d8

90 changed files with 1356 additions and 1365 deletions.
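At the IR level the patch is a mechanical renumbering: constant pointers move from addrspace(2) to addrspace(4), region moves the other way, and the memory semantics stay the same. A minimal before/after sketch (the kernel name and body are illustrative, not taken from the patch):

    ; before this commit: constant memory is address space 2
    define amdgpu_kernel void @read_const(i32 addrspace(2)* %p, i32 addrspace(1)* %out) {
      %v = load i32, i32 addrspace(2)* %p
      store i32 %v, i32 addrspace(1)* %out
      ret void
    }

    ; after this commit: the same kernel with constant memory as address space 4
    define amdgpu_kernel void @read_const(i32 addrspace(4)* %p, i32 addrspace(1)* %out) {
      %v = load i32, i32 addrspace(4)* %p
      store i32 %v, i32 addrspace(1)* %out
      ret void
    }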
269269 .. table:: Address Space Mapping
270270    :name: amdgpu-address-space-mapping-table
271271
272    ================== ================= =================
272    ================== =================
273273 LLVM Address Space Memory Space
274    ------------------ -----------------------------------
275    \                  Current Default   Future Default
276    ================== ================= =================
277    0                  Generic (Flat)    Generic (Flat)
278    1                  Global            Global
279    2                  Constant          Region (GDS)
280    3                  Local (group/LDS) Local (group/LDS)
281    4                  Region (GDS)      Constant
282    5                  Private (Scratch) Private (Scratch)
283    6                  Constant 32-bit   Constant 32-bit
284    ================== ================= =================
285
286    Current Default
287      This is the current default address space mapping used for all languages.
288      This will shortly be deprecated.
289
290    Future Default
291      This will shortly be the only address space mapping for all languages using
292      AMDGPU backend.
274    ================== =================
275    0                  Generic (Flat)
276    1                  Global
277    2                  Region (GDS)
278    3                  Local (group/LDS)
279    4                  Constant
280    5                  Private (Scratch)
281    6                  Constant 32-bit
282    ================== =================
293283
294284 .. _amdgpu-memory-scopes:
295285
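Under the new mapping, each segment a kernel touches has a fixed number. A hypothetical module exercising several of them at once, following the table's new numbering (all names here are illustrative, not from the patch):

    @lds = internal addrspace(3) global i32 undef, align 4    ; local (group/LDS)

    define amdgpu_kernel void @spaces(i32 addrspace(1)* %g, i32 addrspace(4)* %c) {
      %priv = alloca i32, align 4, addrspace(5)               ; private (scratch)
      %cv = load i32, i32 addrspace(4)* %c                    ; constant
      %gv = load i32, i32 addrspace(1)* %g                    ; global
      %sum = add i32 %cv, %gv
      store i32 %sum, i32 addrspace(5)* %priv
      store i32 %sum, i32 addrspace(3)* @lds                  ; local
      ret void
    }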
8282
8383 def int_amdgcn_dispatch_ptr :
8484 GCCBuiltin<"__builtin_amdgcn_dispatch_ptr">,
85 Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
85 Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
8686 [IntrNoMem, IntrSpeculatable]>;
8787
8888 def int_amdgcn_queue_ptr :
8989 GCCBuiltin<"__builtin_amdgcn_queue_ptr">,
90 Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
90 Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
9191 [IntrNoMem, IntrSpeculatable]>;
9292
9393 def int_amdgcn_kernarg_segment_ptr :
9494 GCCBuiltin<"__builtin_amdgcn_kernarg_segment_ptr">,
95 Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
95 Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
9696 [IntrNoMem, IntrSpeculatable]>;
9797
9898 def int_amdgcn_implicitarg_ptr :
9999 GCCBuiltin<"__builtin_amdgcn_implicitarg_ptr">,
100 Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
100 Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
101101 [IntrNoMem, IntrSpeculatable]>;
102102
103103 def int_amdgcn_groupstaticsize :
110110
111111 def int_amdgcn_implicit_buffer_ptr :
112112 GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
113 Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [],
113 Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 4>], [],
114114 [IntrNoMem, IntrSpeculatable]>;
115115
116116 // Set EXEC to the 64-bit value given.
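With these return types updated, IR that walks the dispatch packet sees addrspace(4) pointers. A small usage sketch; the byte offset of grid_size_x inside hsa_kernel_dispatch_packet_t is an assumption here, not something this diff encodes:

    declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()

    define amdgpu_kernel void @read_grid_size_x(i32 addrspace(1)* %out) {
      %dp = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
      ; assumed offset of the grid_size_x field in the dispatch packet
      %gep = getelementptr inbounds i8, i8 addrspace(4)* %dp, i64 12
      %bc = bitcast i8 addrspace(4)* %gep to i32 addrspace(4)*
      %v = load i32, i32 addrspace(4)* %bc, align 4
      store i32 %v, i32 addrspace(1)* %out
      ret void
    }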
221221 MAX_COMMON_ADDRESS = 5,
222222
223223 GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
224 CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2)
224 CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2)
225225 LOCAL_ADDRESS = 3, ///< Address space for local memory.
226226
227227 CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory
6060 /* Region */ {NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, MayAlias}
6161 };
6262 static const AliasResult ASAliasRulesGenIsZero[6][6] = {
63 /* Flat Global Constant Group Region Private */
63 /* Flat Global Region Group Constant Private */
6464 /* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
6565 /* Global */ {MayAlias, MayAlias, NoAlias , NoAlias , NoAlias , NoAlias},
6666 /* Constant */ {MayAlias, NoAlias , MayAlias, NoAlias , NoAlias, NoAlias},
7171 assert(AS.MAX_COMMON_ADDRESS <= 5);
7272 if (AS.FLAT_ADDRESS == 0) {
7373 assert(AS.GLOBAL_ADDRESS == 1 &&
74 AS.REGION_ADDRESS == 4 &&
74 AS.REGION_ADDRESS == 2 &&
7575 AS.LOCAL_ADDRESS == 3 &&
76 AS.CONSTANT_ADDRESS == 2 &&
76 AS.CONSTANT_ADDRESS == 4 &&
7777 AS.PRIVATE_ADDRESS == 5);
7878 ASAliasRules = &ASAliasRulesGenIsZero;
7979 } else {
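The matrix entries themselves do not change; the column labels and the asserts move with the renumbering. As rendered here, constant and global accesses still get NoAlias, which is what lets a pass disambiguate a pair like this (a hypothetical function, not from the patch):

    define i32 @constant_global_noalias(i32 addrspace(4)* %c, i32 addrspace(1)* %g) {
      store i32 0, i32 addrspace(1)* %g
      ; NoAlias per the table above, so this load may move past the store
      %v = load i32, i32 addrspace(4)* %c
      ret i32 %v
    }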
115115
116116 if (Info->hasKernargSegmentPtr()) {
117117 unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
118 const LLT P2 = LLT::pointer(2, 64);
118 const LLT P2 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
119119 unsigned VReg = MRI.createGenericVirtualRegister(P2);
120120 MRI.addLiveIn(InputPtrReg, VReg);
121121 MIRBuilder.getMBB().addLiveIn(InputPtrReg);
1111 /// \todo This should be generated by TableGen.
1212 //===----------------------------------------------------------------------===//
1313
14 #include "AMDGPU.h"
1415 #include "AMDGPULegalizerInfo.h"
1516 #include "llvm/CodeGen/TargetOpcodes.h"
1617 #include "llvm/CodeGen/ValueTypes.h"
2829 const LLT V2S16 = LLT::vector(2, 16);
2930 const LLT S32 = LLT::scalar(32);
3031 const LLT S64 = LLT::scalar(64);
31 const LLT P1 = LLT::pointer(1, 64);
32 const LLT P2 = LLT::pointer(2, 64);
32 const LLT P1 = LLT::pointer(AMDGPUAS::GLOBAL_ADDRESS, 64);
33 const LLT P2 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
3334
3435 setAction({G_ADD, S32}, Legal);
3536 setAction({G_AND, S32}, Legal);
265265
266266 // 32-bit private, local, and region pointers. 64-bit global, constant and
267267 // flat.
268 return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-p6:32:32"
268 return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
269269 "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
270270 "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
271271 }
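Spliced together, the new layout gives a module-level string in which p4 (constant) is a 64-bit pointer and p2 (region) is 32-bit; this is exactly the concatenation of the literals above:

    target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"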
928928 AMDGPUAS AS;
929929 AS.FLAT_ADDRESS = 0;
930930 AS.PRIVATE_ADDRESS = 5;
931 AS.REGION_ADDRESS = 4;
931 AS.REGION_ADDRESS = 2;
932932 return AS;
933933 }
934934
44 # REQUIRES: global-isel
55
66 --- |
7 define amdgpu_kernel void @smrd_imm(i32 addrspace(2)* %const0) { ret void }
7 define amdgpu_kernel void @smrd_imm(i32 addrspace(4)* %const0) { ret void }
88 ...
99 ---
1010
9090 bb.0:
9191 liveins: $sgpr0_sgpr1
9292
93 %0:sgpr(p2) = COPY $sgpr0_sgpr1
93 %0:sgpr(p4) = COPY $sgpr0_sgpr1
9494
9595 %1:sgpr(s64) = G_CONSTANT i64 4
96 %2:sgpr(p2) = G_GEP %0, %1
96 %2:sgpr(p4) = G_GEP %0, %1
9797 %3:sgpr(s32) = G_LOAD %2 :: (load 4 from %ir.const0)
9898 $sgpr0 = COPY %3
9999
100100 %4:sgpr(s64) = G_CONSTANT i64 1020
101 %5:sgpr(p2) = G_GEP %0, %4
101 %5:sgpr(p4) = G_GEP %0, %4
102102 %6:sgpr(s32) = G_LOAD %5 :: (load 4 from %ir.const0)
103103 $sgpr0 = COPY %6
104104
105105 %7:sgpr(s64) = G_CONSTANT i64 1024
106 %8:sgpr(p2) = G_GEP %0, %7
106 %8:sgpr(p4) = G_GEP %0, %7
107107 %9:sgpr(s32) = G_LOAD %8 :: (load 4 from %ir.const0)
108108 $sgpr0 = COPY %9
109109
110110 %10:sgpr(s64) = G_CONSTANT i64 1048572
111 %11:sgpr(p2) = G_GEP %0, %10
111 %11:sgpr(p4) = G_GEP %0, %10
112112 %12:sgpr(s32) = G_LOAD %11 :: (load 4 from %ir.const0)
113113 $sgpr0 = COPY %12
114114
115115 %13:sgpr(s64) = G_CONSTANT i64 1048576
116 %14:sgpr(p2) = G_GEP %0, %13
116 %14:sgpr(p4) = G_GEP %0, %13
117117 %15:sgpr(s32) = G_LOAD %14 :: (load 4 from %ir.const0)
118118 $sgpr0 = COPY %15
119119
120120 %16:sgpr(s64) = G_CONSTANT i64 17179869180
121 %17:sgpr(p2) = G_GEP %0, %16
121 %17:sgpr(p4) = G_GEP %0, %16
122122 %18:sgpr(s32) = G_LOAD %17 :: (load 4 from %ir.const0)
123123 $sgpr0 = COPY %18
124124
125125 %19:sgpr(s64) = G_CONSTANT i64 17179869184
126 %20:sgpr(p2) = G_GEP %0, %19
126 %20:sgpr(p4) = G_GEP %0, %19
127127 %21:sgpr(s32) = G_LOAD %20 :: (load 4 from %ir.const0)
128128 $sgpr0 = COPY %21
129129
130130 %22:sgpr(s64) = G_CONSTANT i64 4294967292
131 %23:sgpr(p2) = G_GEP %0, %22
131 %23:sgpr(p4) = G_GEP %0, %22
132132 %24:sgpr(s32) = G_LOAD %23 :: (load 4 from %ir.const0)
133133 $sgpr0 = COPY %24
134134
135135 %25:sgpr(s64) = G_CONSTANT i64 4294967296
136 %26:sgpr(p2) = G_GEP %0, %25
136 %26:sgpr(p4) = G_GEP %0, %25
137137 %27:sgpr(s32) = G_LOAD %26 :: (load 4 from %ir.const0)
138138 $sgpr0 = COPY %27
139139
1717 }
1818
1919 ; CHECK-LABEL: name: test_ptr2_byval
20 ; CHECK: [[S01:%[0-9]+]]:_(p2) = COPY $sgpr0_sgpr1
20 ; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1
2121 ; CHECK: G_LOAD [[S01]]
22 define amdgpu_vs void @test_ptr2_byval(i32 addrspace(2)* byval %arg0) {
23 %tmp0 = load volatile i32, i32 addrspace(2)* %arg0
22 define amdgpu_vs void @test_ptr2_byval(i32 addrspace(4)* byval %arg0) {
23 %tmp0 = load volatile i32, i32 addrspace(4)* %arg0
2424 ret void
2525 }
2626
2727 ; CHECK-LABEL: name: test_ptr2_inreg
28 ; CHECK: [[S01:%[0-9]+]]:_(p2) = COPY $sgpr0_sgpr1
28 ; CHECK: [[S01:%[0-9]+]]:_(p4) = COPY $sgpr0_sgpr1
2929 ; CHECK: G_LOAD [[S01]]
30 define amdgpu_vs void @test_ptr2_inreg(i32 addrspace(2)* inreg %arg0) {
31 %tmp0 = load volatile i32, i32 addrspace(2)* %arg0
30 define amdgpu_vs void @test_ptr2_inreg(i32 addrspace(4)* inreg %arg0) {
31 %tmp0 = load volatile i32, i32 addrspace(4)* %arg0
3232 ret void
3333 }
3434
3535 ; CHECK-LABEL: name: test_sgpr_alignment0
3636 ; CHECK: [[S0:%[0-9]+]]:_(s32) = COPY $sgpr0
37 ; CHECK: [[S23:%[0-9]+]]:_(p2) = COPY $sgpr2_sgpr3
37 ; CHECK: [[S23:%[0-9]+]]:_(p4) = COPY $sgpr2_sgpr3
3838 ; CHECK: G_LOAD [[S23]]
3939 ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.exp), %{{[0-9]+}}(s32), %{{[0-9]+}}(s32), [[S0]]
40 define amdgpu_vs void @test_sgpr_alignment0(float inreg %arg0, i32 addrspace(2)* inreg %arg1) {
41 %tmp0 = load volatile i32, i32 addrspace(2)* %arg1
40 define amdgpu_vs void @test_sgpr_alignment0(float inreg %arg0, i32 addrspace(4)* inreg %arg1) {
41 %tmp0 = load volatile i32, i32 addrspace(4)* %arg1
4242 call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %arg0, float undef, float undef, float undef, i1 false, i1 false) #0
4343 ret void
4444 }
22 # REQUIRES: global-isel
33
44 --- |
5 define amdgpu_kernel void @load_constant(i32 addrspace(2)* %ptr0) { ret void }
5 define amdgpu_kernel void @load_constant(i32 addrspace(4)* %ptr0) { ret void }
66 define amdgpu_kernel void @load_global_uniform(i32 addrspace(1)* %ptr1) {
77 %tmp0 = load i32, i32 addrspace(1)* %ptr1
88 ret void
2929 body: |
3030 bb.0:
3131 liveins: $sgpr0_sgpr1
32 %0:_(p2) = COPY $sgpr0_sgpr1
32 %0:_(p4) = COPY $sgpr0_sgpr1
3333 %1:_(s32) = G_LOAD %0 :: (load 4 from %ir.ptr0)
3434 ...
3535
88 ; GCN-LABEL: {{^}}smrd0:
99 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
1010 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
11 define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
11 define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
1212 entry:
13 %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
14 %1 = load i32, i32 addrspace(2)* %0
13 %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 1
14 %1 = load i32, i32 addrspace(4)* %0
1515 store i32 %1, i32 addrspace(1)* %out
1616 ret void
1717 }
2020 ; GCN-LABEL: {{^}}smrd1:
2121 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
2222 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
23 define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
23 define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
2424 entry:
25 %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255
26 %1 = load i32, i32 addrspace(2)* %0
25 %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 255
26 %1 = load i32, i32 addrspace(4)* %0
2727 store i32 %1, i32 addrspace(1)* %out
2828 ret void
2929 }
3535 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
3636 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
3737 ; GCN: s_endpgm
38 define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
38 define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
3939 entry:
40 %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256
41 %1 = load i32, i32 addrspace(2)* %0
40 %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 256
41 %1 = load i32, i32 addrspace(4)* %0
4242 store i32 %1, i32 addrspace(1)* %out
4343 ret void
4444 }
5050 ; XSI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b
5151 ; TODO: Add VI checks
5252 ; XGCN: s_endpgm
53 define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
53 define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
5454 entry:
55 %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32
56 %1 = load i32, i32 addrspace(2)* %0
55 %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 4294967296 ; 2 ^ 32
56 %1 = load i32, i32 addrspace(4)* %0
5757 store i32 %1, i32 addrspace(1)* %out
5858 ret void
5959 }
6464 ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
6565 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
6666 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
67 define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
67 define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
6868 entry:
69 %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143
70 %1 = load i32, i32 addrspace(2)* %0
69 %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 262143
70 %1 = load i32, i32 addrspace(4)* %0
7171 store i32 %1, i32 addrspace(1)* %out
7272 ret void
7373 }
7878 ; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
7979 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
8080 ; GCN: s_endpgm
81 define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
81 define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
8282 entry:
83 %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144
84 %1 = load i32, i32 addrspace(2)* %0
83 %0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 262144
84 %1 = load i32, i32 addrspace(4)* %0
8585 store i32 %1, i32 addrspace(1)* %out
8686 ret void
8787 }
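The paired SICI/VI check lines in these smrd tests encode one offset two ways: SI/CI s_load_dword immediates count dwords, while VI immediates count bytes. Worked through for smrd1's getelementptr index of 255 i32 elements:

    byte offset  = 255 * 4  = 1020 = 0x3fc   (the VI check)
    dword offset = 1020 / 4 = 255  = 0xff    (the SICI check)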
2626
2727 ; VI: s_add_i32
2828 ; VI: s_add_i32
29 define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 {
30 %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
31 %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1
29 define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0, <2 x i16> addrspace(4)* %in1) #1 {
30 %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
31 %b = load <2 x i16>, <2 x i16> addrspace(4)* %in1
3232 %add = add <2 x i16> %a, %b
3333 store <2 x i16> %add, <2 x i16> addrspace(1)* %out
3434 ret void
4040
4141 ; VI: s_add_i32
4242 ; VI: s_add_i32
43 define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 {
44 %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
43 define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in0) #1 {
44 %a = load <2 x i16>, <2 x i16> addrspace(4)* %in0
4545 %add = add <2 x i16> %a, %a
4646 store <2 x i16> %add, <2 x i16> addrspace(1)* %out
4747 ret void
9999 ; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
100100 ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
101101 ; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
102 define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
103 %stof = addrspacecast i32 addrspace(2)* %ptr to i32*
102 define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(4)* %ptr) #0 {
103 %stof = addrspacecast i32 addrspace(4)* %ptr to i32*
104104 %ld = load volatile i32, i32* %stof
105105 ret void
106106 }
159159 ; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
160160 ; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0
161161 define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #0 {
162 %ftos = addrspacecast i32* %ptr to i32 addrspace(2)*
163 load volatile i32, i32 addrspace(2)* %ftos
162 %ftos = addrspacecast i32* %ptr to i32 addrspace(4)*
163 load volatile i32, i32 addrspace(4)* %ftos
164164 ret void
165165 }
166166
33 ; This test just checks that the compiler doesn't crash.
44
55 ; FUNC-LABEL: {{^}}v32i8_to_v8i32:
6 define amdgpu_ps float @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 {
6 define amdgpu_ps float @v32i8_to_v8i32(<32 x i8> addrspace(4)* inreg) #0 {
77 entry:
8 %1 = load <32 x i8>, <32 x i8> addrspace(2)* %0
8 %1 = load <32 x i8>, <32 x i8> addrspace(4)* %0
99 %2 = bitcast <32 x i8> %1 to <8 x i32>
1010 %3 = extractelement <8 x i32> %2, i32 1
1111 %4 = icmp ne i32 %3, 0
4747 ; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
4848
4949
50 ; HSAOPT: [[DISPATCH_PTR:%[0-9]+]] = call noalias nonnull dereferenceable(64) i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
51 ; HSAOPT: [[CAST_DISPATCH_PTR:%[0-9]+]] = bitcast i8 addrspace(2)* [[DISPATCH_PTR]] to i32 addrspace(2)*
52 ; HSAOPT: [[GEP0:%[0-9]+]] = getelementptr inbounds i32, i32 addrspace(2)* [[CAST_DISPATCH_PTR]], i64 1
53 ; HSAOPT: [[LDXY:%[0-9]+]] = load i32, i32 addrspace(2)* [[GEP0]], align 4, !invariant.load !0
54 ; HSAOPT: [[GEP1:%[0-9]+]] = getelementptr inbounds i32, i32 addrspace(2)* [[CAST_DISPATCH_PTR]], i64 2
55 ; HSAOPT: [[LDZU:%[0-9]+]] = load i32, i32 addrspace(2)* [[GEP1]], align 4, !range !1, !invariant.load !0
50 ; HSAOPT: [[DISPATCH_PTR:%[0-9]+]] = call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
51 ; HSAOPT: [[CAST_DISPATCH_PTR:%[0-9]+]] = bitcast i8 addrspace(4)* [[DISPATCH_PTR]] to i32 addrspace(4)*
52 ; HSAOPT: [[GEP0:%[0-9]+]] = getelementptr inbounds i32, i32 addrspace(4)* [[CAST_DISPATCH_PTR]], i64 1
53 ; HSAOPT: [[LDXY:%[0-9]+]] = load i32, i32 addrspace(4)* [[GEP0]], align 4, !invariant.load !0
54 ; HSAOPT: [[GEP1:%[0-9]+]] = getelementptr inbounds i32, i32 addrspace(4)* [[CAST_DISPATCH_PTR]], i64 2
55 ; HSAOPT: [[LDZU:%[0-9]+]] = load i32, i32 addrspace(4)* [[GEP1]], align 4, !range !1, !invariant.load !0
5656 ; HSAOPT: [[EXTRACTY:%[0-9]+]] = lshr i32 [[LDXY]], 16
5757
5858 ; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.x(), !range !2
77 declare i32 @llvm.amdgcn.workitem.id.y() #0
88 declare i32 @llvm.amdgcn.workitem.id.z() #0
99
10 declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
11 declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
12 declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
13 declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #0
10 declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
11 declare i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
12 declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
13 declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
1414 declare i64 @llvm.amdgcn.dispatch.id() #0
1515
1616 ; HSA: define void @use_workitem_id_x() #1 {
5757
5858 ; HSA: define void @use_dispatch_ptr() #7 {
5959 define void @use_dispatch_ptr() #1 {
60 %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
61 store volatile i8 addrspace(2)* %dispatch.ptr, i8 addrspace(2)* addrspace(1)* undef
60 %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
61 store volatile i8 addrspace(4)* %dispatch.ptr, i8 addrspace(4)* addrspace(1)* undef
6262 ret void
6363 }
6464
6565 ; HSA: define void @use_queue_ptr() #8 {
6666 define void @use_queue_ptr() #1 {
67 %queue.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
68 store volatile i8 addrspace(2)* %queue.ptr, i8 addrspace(2)* addrspace(1)* undef
67 %queue.ptr = call i8 addrspace(4)* @llvm.amdgcn.queue.ptr()
68 store volatile i8 addrspace(4)* %queue.ptr, i8 addrspace(4)* addrspace(1)* undef
6969 ret void
7070 }
7171
185185
186186 ; HSA: define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #8 {
187187 define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
188 %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
189 store volatile i32 0, i32 addrspace(4)* %stof
188 %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(2)*
189 store volatile i32 0, i32 addrspace(2)* %stof
190190 ret void
191191 }
192192
193193 ; HSA: define void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* %ptr) #12 {
194194 define void @use_group_to_flat_addrspacecast_gfx9(i32 addrspace(3)* %ptr) #2 {
195 %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
196 store volatile i32 0, i32 addrspace(4)* %stof
195 %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(2)*
196 store volatile i32 0, i32 addrspace(2)* %stof
197197 ret void
198198 }
199199
200200 ; HSA: define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* %ptr) #13 {
201201 define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(i32 addrspace(3)* %ptr) #2 {
202 %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
203 store volatile i32 0, i32 addrspace(4)* %stof
202 %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(2)*
203 store volatile i32 0, i32 addrspace(2)* %stof
204204 call void @func_indirect_use_queue_ptr()
205205 ret void
206206 }
225225
226226 ; HSA: define void @use_kernarg_segment_ptr() #14 {
227227 define void @use_kernarg_segment_ptr() #1 {
228 %kernarg.segment.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
229 store volatile i8 addrspace(2)* %kernarg.segment.ptr, i8 addrspace(2)* addrspace(1)* undef
228 %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
229 store volatile i8 addrspace(4)* %kernarg.segment.ptr, i8 addrspace(4)* addrspace(1)* undef
230230 ret void
231231 }
232232
238238
239239 ; HSA: define amdgpu_kernel void @kern_use_implicitarg_ptr() #15 {
240240 define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 {
241 %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
242 store volatile i8 addrspace(2)* %implicitarg.ptr, i8 addrspace(2)* addrspace(1)* undef
241 %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
242 store volatile i8 addrspace(4)* %implicitarg.ptr, i8 addrspace(4)* addrspace(1)* undef
243243 ret void
244244 }
245245
246246 ; HSA: define void @use_implicitarg_ptr() #15 {
247247 define void @use_implicitarg_ptr() #1 {
248 %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
249 store volatile i8 addrspace(2)* %implicitarg.ptr, i8 addrspace(2)* addrspace(1)* undef
248 %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
249 store volatile i8 addrspace(4)* %implicitarg.ptr, i8 addrspace(4)* addrspace(1)* undef
250250 ret void
251251 }
252252
77 declare i32 @llvm.amdgcn.workitem.id.y() #0
88 declare i32 @llvm.amdgcn.workitem.id.z() #0
99
10 declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
11 declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
12 declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
10 declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
11 declare i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
12 declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
1313
1414 ; HSA: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
1515 define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
148148
149149 ; HSA: define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 {
150150 define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
151 %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
152 %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
153 %val = load i32, i32 addrspace(2)* %bc
151 %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
152 %bc = bitcast i8 addrspace(4)* %dispatch.ptr to i32 addrspace(4)*
153 %val = load i32, i32 addrspace(4)* %bc
154154 store i32 %val, i32 addrspace(1)* %ptr
155155 ret void
156156 }
157157
158158 ; HSA: define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 {
159159 define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 {
160 %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
161 %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
162 %val = load i32, i32 addrspace(2)* %bc
160 %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.queue.ptr()
161 %bc = bitcast i8 addrspace(4)* %dispatch.ptr to i32 addrspace(4)*
162 %val = load i32, i32 addrspace(4)* %bc
163163 store i32 %val, i32 addrspace(1)* %ptr
164164 ret void
165165 }
166166
167167 ; HSA: define amdgpu_kernel void @use_kernarg_segment_ptr(i32 addrspace(1)* %ptr) #12 {
168168 define amdgpu_kernel void @use_kernarg_segment_ptr(i32 addrspace(1)* %ptr) #1 {
169 %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
170 %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
171 %val = load i32, i32 addrspace(2)* %bc
169 %dispatch.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
170 %bc = bitcast i8 addrspace(4)* %dispatch.ptr to i32 addrspace(4)*
171 %val = load i32, i32 addrspace(4)* %bc
172172 store i32 %val, i32 addrspace(1)* %ptr
173173 ret void
174174 }
209209 ret void
210210 }
211211
212 ; HSA: define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
213 define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
214 %stof = addrspacecast i32 addrspace(2)* %ptr to i32*
212 ; HSA: define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(4)* %ptr) #1 {
213 define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(4)* %ptr) #1 {
214 %stof = addrspacecast i32 addrspace(4)* %ptr to i32*
215215 %ld = load volatile i32, i32* %stof
216216 ret void
217217 }
225225
226226 ; HSA: define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #1 {
227227 define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #1 {
228 %ftos = addrspacecast i32* %ptr to i32 addrspace(2)*
229 %ld = load volatile i32, i32 addrspace(2)* %ftos
228 %ftos = addrspacecast i32* %ptr to i32 addrspace(4)*
229 %ld = load volatile i32, i32 addrspace(4)* %ftos
230230 ret void
231231 }
232232
357357 br i1 %cmp0, label %bb2, label %bb1
358358
359359 bb1:
360 %val = load volatile i32, i32 addrspace(2)* undef
360 %val = load volatile i32, i32 addrspace(4)* undef
361361 %cmp1 = icmp eq i32 %val, 3
362362 br i1 %cmp1, label %bb3, label %bb2
363363
344344 ; GCN: s_waitcnt
345345 ; GCN-NEXT: s_swappc_b64
346346 define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
347 %ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(2)* undef
347 %ptr = load <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(4)* undef
348348 %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
349349 call void @external_void_func_v8i32(<8 x i32> %val)
350350 ret void
358358 ; GCN: s_waitcnt
359359 ; GCN-NEXT: s_swappc_b64
360360 define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
361 %ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(2)* undef
361 %ptr = load <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(4)* undef
362362 %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr
363363 call void @external_void_func_v16i32(<16 x i32> %val)
364364 ret void
376376 ; GCN: s_waitcnt
377377 ; GCN-NEXT: s_swappc_b64
378378 define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
379 %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef
379 %ptr = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
380380 %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr
381381 call void @external_void_func_v32i32(<32 x i32> %val)
382382 ret void
404404 ; GCN: s_swappc_b64
405405 ; GCN-NEXT: s_endpgm
406406 define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
407 %ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef
407 %ptr0 = load <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
408408 %val0 = load <32 x i32>, <32 x i32> addrspace(1)* %ptr0
409409 %val1 = load i32, i32 addrspace(1)* undef
410410 call void @external_void_func_v32i32_i32(<32 x i32> %val0, i32 %val1)
429429 ; GCN: s_waitcnt vmcnt(0)
430430 ; GCN-NEXT: s_swappc_b64
431431 define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
432 %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(2)* undef
432 %ptr0 = load { i8, i32 } addrspace(1)*, { i8, i32 } addrspace(1)* addrspace(4)* undef
433433 %val = load { i8, i32 }, { i8, i32 } addrspace(1)* %ptr0
434434 call void @external_void_func_struct_i8_i32({ i8, i32 } %val)
435435 ret void
515515
516516 ; GCN-LABEL: {{^}}test_call_external_void_func_v16i8:
517517 define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
518 %ptr = load <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(2)* undef
518 %ptr = load <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(4)* undef
519519 %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
520520 call void @external_void_func_v16i8(<16 x i8> %val)
521521 ret void
33 ; GCN-LABEL: {{^}}use_dispatch_ptr:
44 ; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
55 define void @use_dispatch_ptr() #1 {
6 %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
7 %header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
8 %value = load volatile i32, i32 addrspace(2)* %header_ptr
6 %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
7 %header_ptr = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)*
8 %value = load volatile i32, i32 addrspace(4)* %header_ptr
99 ret void
1010 }
1111
2020 ; GCN-LABEL: {{^}}use_queue_ptr:
2121 ; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
2222 define void @use_queue_ptr() #1 {
23 %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
24 %header_ptr = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
25 %value = load volatile i32, i32 addrspace(2)* %header_ptr
23 %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
24 %header_ptr = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
25 %value = load volatile i32, i32 addrspace(4)* %header_ptr
2626 ret void
2727 }
2828
6161 ; GCN-LABEL: {{^}}use_kernarg_segment_ptr:
6262 ; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0
6363 define void @use_kernarg_segment_ptr() #1 {
64 %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
65 %header_ptr = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)*
66 %value = load volatile i32, i32 addrspace(2)* %header_ptr
64 %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
65 %header_ptr = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)*
66 %value = load volatile i32, i32 addrspace(4)* %header_ptr
6767 ret void
6868 }
6969
434434 %alloca = alloca i32, align 4, addrspace(5)
435435 store volatile i32 0, i32 addrspace(5)* %alloca
436436
437 %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
438 %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
439 %val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc
440
441 %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
442 %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
443 %val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc
444
445 %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
446 %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)*
447 %val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc
437 %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
438 %dispatch_ptr.bc = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)*
439 %val0 = load volatile i32, i32 addrspace(4)* %dispatch_ptr.bc
440
441 %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
442 %queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
443 %val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc
444
445 %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
446 %kernarg_segment_ptr.bc = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)*
447 %val2 = load volatile i32, i32 addrspace(4)* %kernarg_segment_ptr.bc
448448
449449 %val3 = call i64 @llvm.amdgcn.dispatch.id()
450450 call void asm sideeffect "; use $0", "s"(i64 %val3)
514514 %alloca = alloca i32, align 4, addrspace(5)
515515 store volatile i32 0, i32 addrspace(5)* %alloca
516516
517 %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
518 %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
519 %val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc
520
521 %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
522 %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
523 %val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc
524
525 %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
526 %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)*
527 %val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc
517 %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
518 %dispatch_ptr.bc = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)*
519 %val0 = load volatile i32, i32 addrspace(4)* %dispatch_ptr.bc
520
521 %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
522 %queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
523 %val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc
524
525 %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
526 %kernarg_segment_ptr.bc = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)*
527 %val2 = load volatile i32, i32 addrspace(4)* %kernarg_segment_ptr.bc
528528
529529 %val3 = call i64 @llvm.amdgcn.dispatch.id()
530530 call void asm sideeffect "; use $0", "s"(i64 %val3)
572572
573573 store volatile i32 0, i32 addrspace(5)* %alloca
574574
575 %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
576 %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
577 %val0 = load volatile i32, i32 addrspace(2)* %dispatch_ptr.bc
578
579 %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
580 %queue_ptr.bc = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
581 %val1 = load volatile i32, i32 addrspace(2)* %queue_ptr.bc
582
583 %kernarg_segment_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
584 %kernarg_segment_ptr.bc = bitcast i8 addrspace(2)* %kernarg_segment_ptr to i32 addrspace(2)*
585 %val2 = load volatile i32, i32 addrspace(2)* %kernarg_segment_ptr.bc
575 %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
576 %dispatch_ptr.bc = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)*
577 %val0 = load volatile i32, i32 addrspace(4)* %dispatch_ptr.bc
578
579 %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
580 %queue_ptr.bc = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
581 %val1 = load volatile i32, i32 addrspace(4)* %queue_ptr.bc
582
583 %kernarg_segment_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
584 %kernarg_segment_ptr.bc = bitcast i8 addrspace(4)* %kernarg_segment_ptr to i32 addrspace(4)*
585 %val2 = load volatile i32, i32 addrspace(4)* %kernarg_segment_ptr.bc
586586
587587 %val3 = call i64 @llvm.amdgcn.dispatch.id()
588588 call void asm sideeffect "; use $0", "s"(i64 %val3)
602602 declare i32 @llvm.amdgcn.workgroup.id.x() #0
603603 declare i32 @llvm.amdgcn.workgroup.id.y() #0
604604 declare i32 @llvm.amdgcn.workgroup.id.z() #0
605 declare noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
606 declare noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
605 declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
606 declare noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
607607 declare i64 @llvm.amdgcn.dispatch.id() #0
608 declare noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
608 declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
609609
610610 attributes #0 = { nounwind readnone speculatable }
611611 attributes #1 = { nounwind noinline }
8686 entry:
8787 %out.gep = getelementptr i32, i32* %out, i64 999999
8888 %in.gep = getelementptr i32, i32* %in, i64 7
89 %cast = addrspacecast i32* %in.gep to i32 addrspace(2)*
89 %cast = addrspacecast i32* %in.gep to i32 addrspace(4)*
9090 %tmp0 = icmp eq i32 %cond, 0
9191 br i1 %tmp0, label %endif, label %if
9292
9393 if:
94 %tmp1 = load i32, i32 addrspace(2)* %cast
94 %tmp1 = load i32, i32 addrspace(4)* %cast
9595 br label %endif
9696
9797 endif:
267267 }
268268
269269 ; OPT-LABEL: @test_sink_constant_small_offset_i32
270 ; OPT-NOT: getelementptr i32, i32 addrspace(2)*
270 ; OPT-NOT: getelementptr i32, i32 addrspace(4)*
271271 ; OPT: br i1
272272
273273 ; GCN-LABEL: {{^}}test_sink_constant_small_offset_i32:
274274 ; GCN: s_and_saveexec_b64
275275 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}}
276276 ; GCN: s_or_b64 exec, exec
277 define amdgpu_kernel void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
278 entry:
279 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
280 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 7
281 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
282 %tmp0 = icmp eq i32 %tid, 0
283 br i1 %tmp0, label %endif, label %if
284
285 if:
286 %tmp1 = load i32, i32 addrspace(2)* %in.gep
277 define amdgpu_kernel void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
278 entry:
279 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
280 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
281 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
282 %tmp0 = icmp eq i32 %tid, 0
283 br i1 %tmp0, label %endif, label %if
284
285 if:
286 %tmp1 = load i32, i32 addrspace(4)* %in.gep
287287 br label %endif
288288
289289 endif:
296296 }
297297
298298 ; OPT-LABEL: @test_sink_constant_max_8_bit_offset_i32
299 ; OPT-NOT: getelementptr i32, i32 addrspace(2)*
299 ; OPT-NOT: getelementptr i32, i32 addrspace(4)*
300300 ; OPT: br i1
301301
302302 ; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_i32:
303303 ; GCN: s_and_saveexec_b64
304304 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}}
305305 ; GCN: s_or_b64 exec, exec
306 define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
307 entry:
308 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
309 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 255
310 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
311 %tmp0 = icmp eq i32 %tid, 0
312 br i1 %tmp0, label %endif, label %if
313
314 if:
315 %tmp1 = load i32, i32 addrspace(2)* %in.gep
306 define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
307 entry:
308 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
309 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 255
310 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
311 %tmp0 = icmp eq i32 %tid, 0
312 br i1 %tmp0, label %endif, label %if
313
314 if:
315 %tmp1 = load i32, i32 addrspace(4)* %in.gep
316316 br label %endif
317317
318318 endif:
325325 }
326326
327327 ; OPT-LABEL: @test_sink_constant_max_8_bit_offset_p1_i32
328 ; OPT-SI: getelementptr i32, i32 addrspace(2)*
329 ; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)*
330 ; OPT-VI-NOT: getelementptr i32, i32 addrspace(2)*
328 ; OPT-SI: getelementptr i32, i32 addrspace(4)*
329 ; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)*
330 ; OPT-VI-NOT: getelementptr i32, i32 addrspace(4)*
331331 ; OPT: br i1
332332
333333 ; GCN-LABEL: {{^}}test_sink_constant_max_8_bit_offset_p1_i32:
336336
337337 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
338338 ; GCN: s_or_b64 exec, exec
339 define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
340 entry:
341 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
342 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 256
343 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
344 %tmp0 = icmp eq i32 %tid, 0
345 br i1 %tmp0, label %endif, label %if
346
347 if:
348 %tmp1 = load i32, i32 addrspace(2)* %in.gep
339 define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
340 entry:
341 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
342 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 256
343 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
344 %tmp0 = icmp eq i32 %tid, 0
345 br i1 %tmp0, label %endif, label %if
346
347 if:
348 %tmp1 = load i32, i32 addrspace(4)* %in.gep
349349 br label %endif
350350
351351 endif:
358358 }
359359
360360 ; OPT-LABEL: @test_sink_constant_max_32_bit_offset_i32
361 ; OPT-SI: getelementptr i32, i32 addrspace(2)*
362 ; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)*
361 ; OPT-SI: getelementptr i32, i32 addrspace(4)*
362 ; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)*
363363 ; OPT: br i1
364364
365365 ; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_i32:
368368 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
369369 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
370370 ; GCN: s_or_b64 exec, exec
371 define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
372 entry:
373 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
374 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 4294967295
375 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
376 %tmp0 = icmp eq i32 %tid, 0
377 br i1 %tmp0, label %endif, label %if
378
379 if:
380 %tmp1 = load i32, i32 addrspace(2)* %in.gep
371 define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
372 entry:
373 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
374 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 4294967295
375 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
376 %tmp0 = icmp eq i32 %tid, 0
377 br i1 %tmp0, label %endif, label %if
378
379 if:
380 %tmp1 = load i32, i32 addrspace(4)* %in.gep
381381 br label %endif
382382
383383 endif:
390390 }
391391
392392 ; OPT-LABEL: @test_sink_constant_max_32_bit_offset_p1_i32
393 ; OPT: getelementptr i32, i32 addrspace(2)*
393 ; OPT: getelementptr i32, i32 addrspace(4)*
394394 ; OPT: br i1
395395
396396 ; GCN-LABEL: {{^}}test_sink_constant_max_32_bit_offset_p1_i32:
399399 ; GCN: s_addc_u32
400400 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
401401 ; GCN: s_or_b64 exec, exec
402 define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
403 entry:
404 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
405 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 17179869181
406 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
407 %tmp0 = icmp eq i32 %tid, 0
408 br i1 %tmp0, label %endif, label %if
409
410 if:
411 %tmp1 = load i32, i32 addrspace(2)* %in.gep
402 define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
403 entry:
404 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
405 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 17179869181
406 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
407 %tmp0 = icmp eq i32 %tid, 0
408 br i1 %tmp0, label %endif, label %if
409
410 if:
411 %tmp1 = load i32, i32 addrspace(4)* %in.gep
412412 br label %endif
413413
414414 endif:
429429 ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}}
430430
431431 ; GCN: s_or_b64 exec, exec
432 define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
433 entry:
434 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
435 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262143
436 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
437 %tmp0 = icmp eq i32 %tid, 0
438 br i1 %tmp0, label %endif, label %if
439
440 if:
441 %tmp1 = load i32, i32 addrspace(2)* %in.gep
432 define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
433 entry:
434 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
435 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 262143
436 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
437 %tmp0 = icmp eq i32 %tid, 0
438 br i1 %tmp0, label %endif, label %if
439
440 if:
441 %tmp1 = load i32, i32 addrspace(4)* %in.gep
442442 br label %endif
443443
444444 endif:
451451 }
452452
453453 ; OPT-LABEL: @test_sink_constant_max_20_bit_byte_offset_p1_i32
454 ; OPT-SI: getelementptr i32, i32 addrspace(2)*
455 ; OPT-CI-NOT: getelementptr i32, i32 addrspace(2)*
456 ; OPT-VI: getelementptr i32, i32 addrspace(2)*
454 ; OPT-SI: getelementptr i32, i32 addrspace(4)*
455 ; OPT-CI-NOT: getelementptr i32, i32 addrspace(4)*
456 ; OPT-VI: getelementptr i32, i32 addrspace(4)*
457457 ; OPT: br i1
458458
459459 ; GCN-LABEL: {{^}}test_sink_constant_max_20_bit_byte_offset_p1_i32:
467467 ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
468468
469469 ; GCN: s_or_b64 exec, exec
470 define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
471 entry:
472 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
473 %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262144
474 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
475 %tmp0 = icmp eq i32 %tid, 0
476 br i1 %tmp0, label %endif, label %if
477
478 if:
479 %tmp1 = load i32, i32 addrspace(2)* %in.gep
470 define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) {
471 entry:
472 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
473 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 262144
474 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
475 %tmp0 = icmp eq i32 %tid, 0
476 br i1 %tmp0, label %endif, label %if
477
478 if:
479 %tmp1 = load i32, i32 addrspace(4)* %in.gep
480480 br label %endif
481481
482482 endif:
523523 ; OPT: br i1 %tmp0,
524524 ; OPT: if:
525525 ; OPT: getelementptr i8, {{.*}} 4095
526 define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
526 define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(4)* %in) {
527527 entry:
528528 %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
529 %in.gep = getelementptr i8, i8 addrspace(2)* %in, i64 4095
530 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
531 %tmp0 = icmp eq i32 %tid, 0
532 br i1 %tmp0, label %endif, label %if
533
534 if:
535 %bitcast = bitcast i8 addrspace(2)* %in.gep to i32 addrspace(2)*
536 %tmp1 = load i32, i32 addrspace(2)* %bitcast, align 1
529 %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 4095
530 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
531 %tmp0 = icmp eq i32 %tid, 0
532 br i1 %tmp0, label %endif, label %if
533
534 if:
535 %bitcast = bitcast i8 addrspace(4)* %in.gep to i32 addrspace(4)*
536 %tmp1 = load i32, i32 addrspace(4)* %bitcast, align 1
537537 br label %endif
538538
539539 endif:
3131 ; GCN: v_add_f64
3232 ; GCN: v_cndmask_b32_e32
3333 ; GCN: v_cndmask_b32_e32
34 define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(2)* %in) #0 {
34 define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(4)* %in) #0 {
3535 entry:
36 %v = load double, double addrspace(2)* %in
36 %v = load double, double addrspace(4)* %in
3737 %cc = fcmp oeq double %v, 1.000000e+00
3838 br i1 %cc, label %if, label %endif
3939
186186
187187 ; GCN: [[ENDIF]]:
188188 ; GCN: buffer_store_dword
189 define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(2)* %in, float %cnd) #0 {
190 entry:
191 %v = load i32, i32 addrspace(2)* %in
189 define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(4)* %in, float %cnd) #0 {
190 entry:
191 %v = load i32, i32 addrspace(4)* %in
192192 %cc = fcmp oeq float %cnd, 1.000000e+00
193193 br i1 %cc, label %if, label %endif
194194
205205
206206 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load:
207207 ; GCN: v_cndmask_b32
208 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(2)* %in) #0 {
209 entry:
210 %v = load float, float addrspace(2)* %in
208 define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(4)* %in) #0 {
209 entry:
210 %v = load float, float addrspace(4)* %in
211211 %cc = fcmp oeq float %v, 1.000000e+00
212212 br i1 %cc, label %if, label %endif
213213
247247 ; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]]
248248 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
249249 ; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[ADD]], [[VAL]]
250 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(2)* %in, i32 %cond) #0 {
251 entry:
252 %v = load i32, i32 addrspace(2)* %in
250 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(4)* %in, i32 %cond) #0 {
251 entry:
252 %v = load i32, i32 addrspace(4)* %in
253253 %cc = icmp eq i32 %cond, 1
254254 br i1 %cc, label %if, label %endif
255255
294294 ; GCN: s_addc_u32
295295 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
296296 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
297 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(2)* %in, i32 %cond) #0 {
298 entry:
299 %v = load i64, i64 addrspace(2)* %in
297 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(4)* %in, i32 %cond) #0 {
298 entry:
299 %v = load i64, i64 addrspace(4)* %in
300300 %cc = icmp eq i32 %cond, 1
301301 br i1 %cc, label %if, label %endif
302302
319319 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
320320 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
321321 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
322 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(2)* %in, i32 %cond) #0 {
323 entry:
324 %v = load <3 x i32>, <3 x i32> addrspace(2)* %in
322 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(4)* %in, i32 %cond) #0 {
323 entry:
324 %v = load <3 x i32>, <3 x i32> addrspace(4)* %in
325325 %cc = icmp eq i32 %cond, 1
326326 br i1 %cc, label %if, label %endif
327327
344344 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
345345 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
346346 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
347 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(2)* %in, i32 %cond) #0 {
348 entry:
349 %v = load <4 x i32>, <4 x i32> addrspace(2)* %in
347 define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(4)* %in, i32 %cond) #0 {
348 entry:
349 %v = load <4 x i32>, <4 x i32> addrspace(4)* %in
350350 %cc = icmp eq i32 %cond, 1
351351 br i1 %cc, label %if, label %endif
352352
77 ; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
88 ; GCN-DAG: buffer_store_short [[VELT0]]
99 ; GCN-DAG: buffer_store_short [[VELT1]]
10 define amdgpu_kernel void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
11 %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
10 define amdgpu_kernel void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
11 %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
1212 %p0 = extractelement <2 x half> %vec, i32 0
1313 %p1 = extractelement <2 x half> %vec, i32 1
1414 %out1 = getelementptr half, half addrspace(1)* %out, i32 10
2525 ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
2626 ; GCN: buffer_store_short [[VELT1]]
2727 ; GCN: ScratchSize: 0
28 define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 %idx) #0 {
29 %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
28 define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr, i32 %idx) #0 {
29 %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
3030 %elt = extractelement <2 x half> %vec, i32 %idx
3131 store half %elt, half addrspace(1)* %out, align 2
3232 ret void
4444 ; SI: buffer_store_short [[ELT]]
4545 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]]
4646 ; GCN: ScratchSize: 0{{$}}
47 define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
47 define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
4848 %tid = call i32 @llvm.amdgcn.workitem.id.x()
4949 %tid.ext = sext i32 %tid to i64
5050 %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
5151 %out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
52 %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
52 %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
5353 %idx = load i32, i32 addrspace(1)* %gep
5454 %elt = extractelement <2 x half> %vec, i32 %idx
5555 store half %elt, half addrspace(1)* %out.gep, align 2
88 ; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
99 ; GCN-DAG: buffer_store_short [[VELT0]]
1010 ; GCN-DAG: buffer_store_short [[VELT1]]
11 define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
12 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
11 define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
12 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
1313 %p0 = extractelement <2 x i16> %vec, i32 0
1414 %p1 = extractelement <2 x i16> %vec, i32 1
1515 %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10
2626 ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
2727 ; GCN: buffer_store_short [[VELT1]]
2828 ; GCN: ScratchSize: 0
29 define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %idx) #0 {
30 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
29 define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %idx) #0 {
30 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
3131 %elt = extractelement <2 x i16> %vec, i32 %idx
3232 store i16 %elt, i16 addrspace(1)* %out, align 2
3333 ret void
4444 ; SI: buffer_store_short [[ELT]]
4545 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]]
4646 ; GCN: ScratchSize: 0{{$}}
47 define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
47 define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
4848 %tid = call i32 @llvm.amdgcn.workitem.id.x()
4949 %tid.ext = sext i32 %tid to i64
5050 %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
5151 %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
5252 %idx = load volatile i32, i32 addrspace(1)* %gep
53 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
53 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
5454 %elt = extractelement <2 x i16> %vec, i32 %idx
5555 store i16 %elt, i16 addrspace(1)* %out.gep, align 2
5656 ret void
0 ; RUN: llc -mtriple=amdgcn-amd-amdhsa-amdgiz -mcpu=gfx803 -enable-si-insert-waitcnts=1 -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
11 ; RUN: llvm-as -data-layout=A5 < %s | llc -mtriple=amdgcn-amd-amdhsa-amdgiz -mcpu=gfx803 -enable-si-insert-waitcnts=1 -verify-machineinstrs | FileCheck --check-prefix=GCN %s
22
3 declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
4 declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
3 declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
4 declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
55 declare i32 @llvm.amdgcn.workitem.id.x()
66 declare i32 @llvm.amdgcn.workgroup.id.x()
77 declare void @llvm.amdgcn.s.barrier()
3333 fence syncscope("workgroup") acquire
3434 %8 = load i32, i32 addrspace(3)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(3)* @test_local.temp, i64 0, i64 0), align 4
3535 %9 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %2, align 4
36 %10 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
36 %10 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
3737 %11 = call i32 @llvm.amdgcn.workitem.id.x()
3838 %12 = call i32 @llvm.amdgcn.workgroup.id.x()
39 %13 = getelementptr inbounds i8, i8 addrspace(2)* %10, i64 4
40 %14 = bitcast i8 addrspace(2)* %13 to i16 addrspace(2)*
41 %15 = load i16, i16 addrspace(2)* %14, align 4
39 %13 = getelementptr inbounds i8, i8 addrspace(4)* %10, i64 4
40 %14 = bitcast i8 addrspace(4)* %13 to i16 addrspace(4)*
41 %15 = load i16, i16 addrspace(4)* %14, align 4
4242 %16 = zext i16 %15 to i32
4343 %17 = mul i32 %12, %16
4444 %18 = add i32 %17, %11
45 %19 = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
45 %19 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
4646 %20 = zext i32 %18 to i64
47 %21 = bitcast i8 addrspace(2)* %19 to i64 addrspace(2)*
48 %22 = load i64, i64 addrspace(2)* %21, align 8
47 %21 = bitcast i8 addrspace(4)* %19 to i64 addrspace(4)*
48 %22 = load i64, i64 addrspace(4)* %21, align 8
4949 %23 = add i64 %22, %20
5050 %24 = getelementptr inbounds i32, i32 addrspace(1)* %9, i64 %23
5151 store i32 %8, i32 addrspace(1)* %24, align 4
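Every hunk in this file repeats one idiom around the pointers that are now in addrspace(4). A minimal sketch of it, assuming the HSA dispatch-packet layout (workgroup_size_x is the i16 at byte offset 4 of hsa_kernel_dispatch_packet_t) and that the first i64 of the implicit arguments carries the global work offset; the function name is made up:

declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.workgroup.id.x()

define i64 @global_id_x_sketch() {
  %disp = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
  %wgs.gep = getelementptr inbounds i8, i8 addrspace(4)* %disp, i64 4
  %wgs.ptr = bitcast i8 addrspace(4)* %wgs.gep to i16 addrspace(4)*
  %wgs = load i16, i16 addrspace(4)* %wgs.ptr, align 4      ; workgroup_size_x
  %wgs.ext = zext i16 %wgs to i32
  %wgid = call i32 @llvm.amdgcn.workgroup.id.x()
  %wtid = call i32 @llvm.amdgcn.workitem.id.x()
  %base = mul i32 %wgid, %wgs.ext
  %flat = add i32 %base, %wtid                              ; flat id within the grid
  %flat.ext = zext i32 %flat to i64
  %impl = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
  %off.ptr = bitcast i8 addrspace(4)* %impl to i64 addrspace(4)*
  %off = load i64, i64 addrspace(4)* %off.ptr, align 8      ; global offset x (assumed)
  %gid = add i64 %off, %flat.ext
  ret i64 %gid
}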
6767 ;
6868 %5 = load i32, i32 addrspace(5)* %3, align 4
6969 %6 = sext i32 %5 to i64
70 %7 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
70 %7 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
7171 %8 = call i32 @llvm.amdgcn.workitem.id.x()
7272 %9 = call i32 @llvm.amdgcn.workgroup.id.x()
73 %10 = getelementptr inbounds i8, i8 addrspace(2)* %7, i64 4
74 %11 = bitcast i8 addrspace(2)* %10 to i16 addrspace(2)*
75 %12 = load i16, i16 addrspace(2)* %11, align 4
73 %10 = getelementptr inbounds i8, i8 addrspace(4)* %7, i64 4
74 %11 = bitcast i8 addrspace(4)* %10 to i16 addrspace(4)*
75 %12 = load i16, i16 addrspace(4)* %11, align 4
7676 %13 = zext i16 %12 to i32
7777 %14 = mul i32 %9, %13
7878 %15 = add i32 %14, %8
79 %16 = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
79 %16 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
8080 %17 = zext i32 %15 to i64
81 %18 = bitcast i8 addrspace(2)* %16 to i64 addrspace(2)*
82 %19 = load i64, i64 addrspace(2)* %18, align 8
81 %18 = bitcast i8 addrspace(4)* %16 to i64 addrspace(4)*
82 %19 = load i64, i64 addrspace(4)* %18, align 8
8383 %20 = add i64 %19, %17
8484 %21 = icmp ult i64 %6, %20
8585 br i1 %21, label %22, label %61
8686
8787 ;
88 %23 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
88 %23 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
8989 %24 = call i32 @llvm.amdgcn.workitem.id.x()
9090 %25 = call i32 @llvm.amdgcn.workgroup.id.x()
91 %26 = getelementptr inbounds i8, i8 addrspace(2)* %23, i64 4
92 %27 = bitcast i8 addrspace(2)* %26 to i16 addrspace(2)*
93 %28 = load i16, i16 addrspace(2)* %27, align 4
91 %26 = getelementptr inbounds i8, i8 addrspace(4)* %23, i64 4
92 %27 = bitcast i8 addrspace(4)* %26 to i16 addrspace(4)*
93 %28 = load i16, i16 addrspace(4)* %27, align 4
9494 %29 = zext i16 %28 to i32
9595 %30 = mul i32 %25, %29
9696 %31 = add i32 %30, %24
97 %32 = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
97 %32 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
9898 %33 = zext i32 %31 to i64
99 %34 = bitcast i8 addrspace(2)* %32 to i64 addrspace(2)*
100 %35 = load i64, i64 addrspace(2)* %34, align 8
99 %34 = bitcast i8 addrspace(4)* %32 to i64 addrspace(4)*
100 %35 = load i64, i64 addrspace(4)* %34, align 8
101101 %36 = add i64 %35, %33
102102 %37 = add i64 %36, 2184
103103 %38 = trunc i64 %37 to i32
104104 %39 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %2, align 4
105105 %40 = load i32, i32 addrspace(5)* %3, align 4
106106 %41 = sext i32 %40 to i64
107 %42 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
107 %42 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
108108 %43 = call i32 @llvm.amdgcn.workitem.id.x()
109109 %44 = call i32 @llvm.amdgcn.workgroup.id.x()
110 %45 = getelementptr inbounds i8, i8 addrspace(2)* %42, i64 4
111 %46 = bitcast i8 addrspace(2)* %45 to i16 addrspace(2)*
112 %47 = load i16, i16 addrspace(2)* %46, align 4
110 %45 = getelementptr inbounds i8, i8 addrspace(4)* %42, i64 4
111 %46 = bitcast i8 addrspace(4)* %45 to i16 addrspace(4)*
112 %47 = load i16, i16 addrspace(4)* %46, align 4
113113 %48 = zext i16 %47 to i32
114114 %49 = mul i32 %44, %48
115115 %50 = add i32 %49, %43
116 %51 = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
116 %51 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
117117 %52 = zext i32 %50 to i64
118 %53 = bitcast i8 addrspace(2)* %51 to i64 addrspace(2)*
119 %54 = load i64, i64 addrspace(2)* %53, align 8
118 %53 = bitcast i8 addrspace(4)* %51 to i64 addrspace(4)*
119 %54 = load i64, i64 addrspace(4)* %53, align 8
120120 %55 = add i64 %54, %52
121121 %56 = add i64 %41, %55
122122 %57 = getelementptr inbounds i32, i32 addrspace(1)* %39, i64 %56
146146 %2 = alloca i32 addrspace(1)*, align 4, addrspace(5)
147147 store i32 addrspace(1)* %0, i32 addrspace(1)* addrspace(5)* %2, align 4
148148 %3 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %2, align 4
149 %4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
149 %4 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
150150 %5 = call i32 @llvm.amdgcn.workitem.id.x()
151151 %6 = call i32 @llvm.amdgcn.workgroup.id.x()
152 %7 = getelementptr inbounds i8, i8 addrspace(2)* %4, i64 4
153 %8 = bitcast i8 addrspace(2)* %7 to i16 addrspace(2)*
154 %9 = load i16, i16 addrspace(2)* %8, align 4
152 %7 = getelementptr inbounds i8, i8 addrspace(4)* %4, i64 4
153 %8 = bitcast i8 addrspace(4)* %7 to i16 addrspace(4)*
154 %9 = load i16, i16 addrspace(4)* %8, align 4
155155 %10 = zext i16 %9 to i32
156156 %11 = mul i32 %6, %10
157157 %12 = add i32 %11, %5
158 %13 = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
158 %13 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
159159 %14 = zext i32 %12 to i64
160 %15 = bitcast i8 addrspace(2)* %13 to i64 addrspace(2)*
161 %16 = load i64, i64 addrspace(2)* %15, align 8
160 %15 = bitcast i8 addrspace(4)* %13 to i64 addrspace(4)*
161 %16 = load i64, i64 addrspace(4)* %15, align 8
162162 %17 = add i64 %16, %14
163163 %18 = getelementptr inbounds i32, i32 addrspace(1)* %3, i64 %17
164164 store i32 1, i32 addrspace(1)* %18, align 4
177177 fence syncscope("workgroup") acquire
178178 %24 = load i32, i32 addrspace(3)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(3)* @test_global_local.temp, i64 0, i64 0), align 4
179179 %25 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %2, align 4
180 %26 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
180 %26 = call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr()
181181 %27 = call i32 @llvm.amdgcn.workitem.id.x()
182182 %28 = call i32 @llvm.amdgcn.workgroup.id.x()
183 %29 = getelementptr inbounds i8, i8 addrspace(2)* %26, i64 4
184 %30 = bitcast i8 addrspace(2)* %29 to i16 addrspace(2)*
185 %31 = load i16, i16 addrspace(2)* %30, align 4
183 %29 = getelementptr inbounds i8, i8 addrspace(4)* %26, i64 4
184 %30 = bitcast i8 addrspace(4)* %29 to i16 addrspace(4)*
185 %31 = load i16, i16 addrspace(4)* %30, align 4
186186 %32 = zext i16 %31 to i32
187187 %33 = mul i32 %28, %32
188188 %34 = add i32 %33, %27
189 %35 = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
189 %35 = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
190190 %36 = zext i32 %34 to i64
191 %37 = bitcast i8 addrspace(2)* %35 to i64 addrspace(2)*
192 %38 = load i64, i64 addrspace(2)* %37, align 8
191 %37 = bitcast i8 addrspace(4)* %35 to i64 addrspace(4)*
192 %38 = load i64, i64 addrspace(4)* %37, align 8
193193 %39 = add i64 %38, %36
194194 %40 = getelementptr inbounds i32, i32 addrspace(1)* %25, i64 %39
195195 store i32 %24, i32 addrspace(1)* %40, align 4
163163 ; GCN: s_waitcnt vmcnt(0)
164164 ; GCN-NEXT: s_setpc_b64
165165 define <8 x i32> @v8i32_func_void() #0 {
166 %ptr = load volatile <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(2)* undef
166 %ptr = load volatile <8 x i32> addrspace(1)*, <8 x i32> addrspace(1)* addrspace(4)* undef
167167 %val = load <8 x i32>, <8 x i32> addrspace(1)* %ptr
168168 ret <8 x i32> %val
169169 }
176176 ; GCN: s_waitcnt vmcnt(0)
177177 ; GCN-NEXT: s_setpc_b64
178178 define <16 x i32> @v16i32_func_void() #0 {
179 %ptr = load volatile <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(2)* undef
179 %ptr = load volatile <16 x i32> addrspace(1)*, <16 x i32> addrspace(1)* addrspace(4)* undef
180180 %val = load <16 x i32>, <16 x i32> addrspace(1)* %ptr
181181 ret <16 x i32> %val
182182 }
193193 ; GCN: s_waitcnt vmcnt(0)
194194 ; GCN-NEXT: s_setpc_b64
195195 define <32 x i32> @v32i32_func_void() #0 {
196 %ptr = load volatile <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(2)* undef
196 %ptr = load volatile <32 x i32> addrspace(1)*, <32 x i32> addrspace(1)* addrspace(4)* undef
197197 %val = load <32 x i32>, <32 x i32> addrspace(1)* %ptr
198198 ret <32 x i32> %val
199199 }
213213 ; GCN: s_waitcnt vmcnt(0)
214214 ; GCN-NEXT: s_setpc_b64
215215 define <3 x i64> @v3i64_func_void() #0 {
216 %ptr = load volatile <3 x i64> addrspace(1)*, <3 x i64> addrspace(1)* addrspace(2)* undef
216 %ptr = load volatile <3 x i64> addrspace(1)*, <3 x i64> addrspace(1)* addrspace(4)* undef
217217 %val = load <3 x i64>, <3 x i64> addrspace(1)* %ptr
218218 ret <3 x i64> %val
219219 }
224224 ; GCN-NEXT: s_waitcnt vmcnt(0)
225225 ; GCN-NEXT: s_setpc_b64
226226 define <4 x i64> @v4i64_func_void() #0 {
227 %ptr = load volatile <4 x i64> addrspace(1)*, <4 x i64> addrspace(1)* addrspace(2)* undef
227 %ptr = load volatile <4 x i64> addrspace(1)*, <4 x i64> addrspace(1)* addrspace(4)* undef
228228 %val = load <4 x i64>, <4 x i64> addrspace(1)* %ptr
229229 ret <4 x i64> %val
230230 }
236236 ; GCN: s_waitcnt vmcnt(0)
237237 ; GCN-NEXT: s_setpc_b64
238238 define <5 x i64> @v5i64_func_void() #0 {
239 %ptr = load volatile <5 x i64> addrspace(1)*, <5 x i64> addrspace(1)* addrspace(2)* undef
239 %ptr = load volatile <5 x i64> addrspace(1)*, <5 x i64> addrspace(1)* addrspace(4)* undef
240240 %val = load <5 x i64>, <5 x i64> addrspace(1)* %ptr
241241 ret <5 x i64> %val
242242 }
249249 ; GCN: s_waitcnt vmcnt(0)
250250 ; GCN-NEXT: s_setpc_b64
251251 define <8 x i64> @v8i64_func_void() #0 {
252 %ptr = load volatile <8 x i64> addrspace(1)*, <8 x i64> addrspace(1)* addrspace(2)* undef
252 %ptr = load volatile <8 x i64> addrspace(1)*, <8 x i64> addrspace(1)* addrspace(4)* undef
253253 %val = load <8 x i64>, <8 x i64> addrspace(1)* %ptr
254254 ret <8 x i64> %val
255255 }
266266 ; GCN: s_waitcnt vmcnt(0)
267267 ; GCN-NEXT: s_setpc_b64
268268 define <16 x i64> @v16i64_func_void() #0 {
269 %ptr = load volatile <16 x i64> addrspace(1)*, <16 x i64> addrspace(1)* addrspace(2)* undef
269 %ptr = load volatile <16 x i64> addrspace(1)*, <16 x i64> addrspace(1)* addrspace(4)* undef
270270 %val = load <16 x i64>, <16 x i64> addrspace(1)* %ptr
271271 ret <16 x i64> %val
272272 }
308308 ; GFX9: v_lshrrev_b32_e32 v1, 16, v0
309309 ; GCN: s_setpc_b64
310310 define <5 x i16> @v5i16_func_void() #0 {
311 %ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(2)* undef
311 %ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(4)* undef
312312 %val = load <5 x i16>, <5 x i16> addrspace(1)* %ptr
313313 ret <5 x i16> %val
314314 }
318318 ; GFX9: s_waitcnt vmcnt(0)
319319 ; GFX9-NEXT: s_setpc_b64
320320 define <8 x i16> @v8i16_func_void() #0 {
321 %ptr = load volatile <8 x i16> addrspace(1)*, <8 x i16> addrspace(1)* addrspace(2)* undef
321 %ptr = load volatile <8 x i16> addrspace(1)*, <8 x i16> addrspace(1)* addrspace(4)* undef
322322 %val = load <8 x i16>, <8 x i16> addrspace(1)* %ptr
323323 ret <8 x i16> %val
324324 }
329329 ; GFX9: s_waitcnt vmcnt(0)
330330 ; GFX9-NEXT: s_setpc_b64
331331 define <16 x i16> @v16i16_func_void() #0 {
332 %ptr = load volatile <16 x i16> addrspace(1)*, <16 x i16> addrspace(1)* addrspace(2)* undef
332 %ptr = load volatile <16 x i16> addrspace(1)*, <16 x i16> addrspace(1)* addrspace(4)* undef
333333 %val = load <16 x i16>, <16 x i16> addrspace(1)* %ptr
334334 ret <16 x i16> %val
335335 }
341341 ; GCN-DAG: v14
342342 ; GCN-DAG: v15
343343 define <16 x i8> @v16i8_func_void() #0 {
344 %ptr = load volatile <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(2)* undef
344 %ptr = load volatile <16 x i8> addrspace(1)*, <16 x i8> addrspace(1)* addrspace(4)* undef
345345 %val = load <16 x i8>, <16 x i8> addrspace(1)* %ptr
346346 ret <16 x i8> %val
347347 }
355355 ; GFX89-DAG: v_lshrrev_b16_e32 v1, 8, v0
356356 ; GCN: s_setpc_b64
357357 define <4 x i8> @v4i8_func_void() #0 {
358 %ptr = load volatile <4 x i8> addrspace(1)*, <4 x i8> addrspace(1)* addrspace(2)* undef
358 %ptr = load volatile <4 x i8> addrspace(1)*, <4 x i8> addrspace(1)* addrspace(4)* undef
359359 %val = load <4 x i8>, <4 x i8> addrspace(1)* %ptr
360360 ret <4 x i8> %val
361361 }
426426 ; GFX9: s_waitcnt vmcnt(0)
427427 ; GFX9-NEXT: s_setpc_b64
428428 define <33 x i32> @v33i32_func_void() #0 {
429 %ptr = load volatile <33 x i32> addrspace(1)*, <33 x i32> addrspace(1)* addrspace(2)* undef
429 %ptr = load volatile <33 x i32> addrspace(1)*, <33 x i32> addrspace(1)* addrspace(4)* undef
430430 %val = load <33 x i32>, <33 x i32> addrspace(1)* %ptr
431431 ret <33 x i32> %val
432432 }
468468 ; GFX9: s_waitcnt vmcnt(0)
469469 ; GFX9-NEXT: s_setpc_b64
470470 define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
471 %ptr = load volatile { <32 x i32>, i32 } addrspace(1)*, { <32 x i32>, i32 } addrspace(1)* addrspace(2)* undef
471 %ptr = load volatile { <32 x i32>, i32 } addrspace(1)*, { <32 x i32>, i32 } addrspace(1)* addrspace(4)* undef
472472 %val = load { <32 x i32>, i32 }, { <32 x i32>, i32 } addrspace(1)* %ptr
473473 ret { <32 x i32>, i32 } %val
474474 }
510510 ; GFX9: s_waitcnt vmcnt(0)
511511 ; GFX9-NEXT: s_setpc_b64
512512 define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
513 %ptr = load volatile { i32, <32 x i32> } addrspace(1)*, { i32, <32 x i32> } addrspace(1)* addrspace(2)* undef
513 %ptr = load volatile { i32, <32 x i32> } addrspace(1)*, { i32, <32 x i32> } addrspace(1)* addrspace(4)* undef
514514 %val = load { i32, <32 x i32> }, { i32, <32 x i32> } addrspace(1)* %ptr
515515 ret { i32, <32 x i32> } %val
516516 }
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NOHSA %s
11 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=HSA %s
22
3 @private1 = private unnamed_addr addrspace(2) constant [4 x float] [float 0.0, float 1.0, float 2.0, float 3.0]
4 @private2 = private unnamed_addr addrspace(2) constant [4 x float] [float 4.0, float 5.0, float 6.0, float 7.0]
5 @available_externally = available_externally addrspace(2) global [256 x i32] zeroinitializer
3 @private1 = private unnamed_addr addrspace(4) constant [4 x float] [float 0.0, float 1.0, float 2.0, float 3.0]
4 @private2 = private unnamed_addr addrspace(4) constant [4 x float] [float 4.0, float 5.0, float 6.0, float 7.0]
5 @available_externally = available_externally addrspace(4) global [256 x i32] zeroinitializer
66
77 ; GCN-LABEL: {{^}}private_test:
88 ; GCN: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}}
2626 ; HSA: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], private2@rel32@hi+4
2727
2828 define amdgpu_kernel void @private_test(i32 %index, float addrspace(1)* %out) {
29 %ptr = getelementptr [4 x float], [4 x float] addrspace(2) * @private1, i32 0, i32 %index
30 %val = load float, float addrspace(2)* %ptr
29 %ptr = getelementptr [4 x float], [4 x float] addrspace(4) * @private1, i32 0, i32 %index
30 %val = load float, float addrspace(4)* %ptr
3131 store volatile float %val, float addrspace(1)* %out
32 %ptr2 = getelementptr [4 x float], [4 x float] addrspace(2) * @private2, i32 0, i32 %index
33 %val2 = load float, float addrspace(2)* %ptr2
32 %ptr2 = getelementptr [4 x float], [4 x float] addrspace(4) * @private2, i32 0, i32 %index
33 %val2 = load float, float addrspace(4)* %ptr2
3434 store volatile float %val2, float addrspace(1)* %out
3535 ret void
3636 }
4040 ; HSA: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], available_externally@gotpcrel32@lo+4
4141 ; HSA: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], available_externally@gotpcrel32@hi+4
4242 define amdgpu_kernel void @available_externally_test(i32 addrspace(1)* %out) {
43 %ptr = getelementptr [256 x i32], [256 x i32] addrspace(2)* @available_externally, i32 0, i32 1
44 %val = load i32, i32 addrspace(2)* %ptr
43 %ptr = getelementptr [256 x i32], [256 x i32] addrspace(4)* @available_externally, i32 0, i32 1
44 %val = load i32, i32 addrspace(4)* %ptr
4545 store i32 %val, i32 addrspace(1)* %out
4646 ret void
4747 }
33 ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
44
55
6 @b = internal addrspace(2) constant [1 x i16] [ i16 7 ], align 2
6 @b = internal addrspace(4) constant [1 x i16] [ i16 7 ], align 2
77
8 @float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.0, float 1.0, float 2.0, float 3.0, float 4.0], align 4
8 @float_gv = internal unnamed_addr addrspace(4) constant [5 x float] [float 0.0, float 1.0, float 2.0, float 3.0, float 4.0], align 4
99
1010 ; FUNC-LABEL: {{^}}float:
1111 ; GCN: s_load_dword
1616 ; EG-NOT: MOV
1717 define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) {
1818 entry:
19 %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
20 %1 = load float, float addrspace(2)* %0
19 %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(4)* @float_gv, i32 0, i32 %index
20 %1 = load float, float addrspace(4)* %0
2121 store float %1, float addrspace(1)* %out
2222 ret void
2323 }
2424
25 @i32_gv = internal unnamed_addr addrspace(2) constant [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4], align 4
25 @i32_gv = internal unnamed_addr addrspace(4) constant [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4], align 4
2626
2727 ; FUNC-LABEL: {{^}}i32:
2828
3434 ; EG-NOT: MOV
3535 define amdgpu_kernel void @i32(i32 addrspace(1)* %out, i32 %index) {
3636 entry:
37 %0 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(2)* @i32_gv, i32 0, i32 %index
38 %1 = load i32, i32 addrspace(2)* %0
37 %0 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(4)* @i32_gv, i32 0, i32 %index
38 %1 = load i32, i32 addrspace(4)* %0
3939 store i32 %1, i32 addrspace(1)* %out
4040 ret void
4141 }
4343
4444 %struct.foo = type { float, [5 x i32] }
4545
46 @struct_foo_gv = internal unnamed_addr addrspace(2) constant [1 x %struct.foo] [ %struct.foo { float 16.0, [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4] } ]
46 @struct_foo_gv = internal unnamed_addr addrspace(4) constant [1 x %struct.foo] [ %struct.foo { float 16.0, [5 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4] } ]
4747
4848 ; FUNC-LABEL: {{^}}struct_foo_gv_load:
4949 ; GCN: s_load_dword
5353 ; EG-NOT: MOVA_INT
5454 ; EG-NOT: MOV
5555 define amdgpu_kernel void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
56 %gep = getelementptr inbounds [1 x %struct.foo], [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index
57 %load = load i32, i32 addrspace(2)* %gep, align 4
56 %gep = getelementptr inbounds [1 x %struct.foo], [1 x %struct.foo] addrspace(4)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index
57 %load = load i32, i32 addrspace(4)* %gep, align 4
5858 store i32 %load, i32 addrspace(1)* %out, align 4
5959 ret void
6060 }
6161
62 @array_v1_gv = internal addrspace(2) constant [4 x <1 x i32>] [ <1 x i32> <i32 1>,
62 @array_v1_gv = internal addrspace(4) constant [4 x <1 x i32>] [ <1 x i32> <i32 1>,
6363 <1 x i32> <i32 2>,
6464 <1 x i32> <i32 3>,
6565 <1 x i32> <i32 4> ]
7272 ; EG-NOT: MOVA_INT
7373 ; EG-NOT: MOV
7474 define amdgpu_kernel void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
75 %gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index
76 %load = load <1 x i32>, <1 x i32> addrspace(2)* %gep, align 4
75 %gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(4)* @array_v1_gv, i32 0, i32 %index
76 %load = load <1 x i32>, <1 x i32> addrspace(4)* %gep, align 4
7777 store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4
7878 ret void
7979 }
8989 br i1 %0, label %if, label %else
9090
9191 if:
92 %1 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
93 %2 = load float, float addrspace(2)* %1
92 %1 = getelementptr inbounds [5 x float], [5 x float] addrspace(4)* @float_gv, i32 0, i32 %index
93 %2 = load float, float addrspace(4)* %1
9494 store float %2, float addrspace(1)* %out
9595 br label %endif
9696
99
1010 ; HSA: .globl simple_align16
1111 ; HSA: .p2align 5
12 define void @simple_align16(i32 addrspace(1)* addrspace(2)* %ptr.out) align 32 {
12 define void @simple_align16(i32 addrspace(1)* addrspace(4)* %ptr.out) align 32 {
1313 entry:
14 %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out
14 %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %ptr.out
1515 store i32 0, i32 addrspace(1)* %out
1616 ret void
1717 }
5050 ; HSA: .size simple, .Lfunc_end0-simple
5151 ; HSA: ; Function info:
5252 ; HSA-NOT: COMPUTE_PGM_RSRC2
53 define void @simple(i32 addrspace(1)* addrspace(2)* %ptr.out) {
53 define void @simple(i32 addrspace(1)* addrspace(4)* %ptr.out) {
5454 entry:
55 %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out
55 %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %ptr.out
5656 store i32 0, i32 addrspace(1)* %out
5757 ret void
5858 }
6060 ; Ignore explicit alignment that is too low.
6161 ; HSA: .globl simple_align2
6262 ; HSA: .p2align 2
63 define void @simple_align2(i32 addrspace(1)* addrspace(2)* %ptr.out) align 2 {
63 define void @simple_align2(i32 addrspace(1)* addrspace(4)* %ptr.out) align 2 {
6464 entry:
65 %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(2)* %ptr.out
65 %out = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(4)* %ptr.out
6666 store i32 0, i32 addrspace(1)* %out
6767 ret void
6868 }
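A note on the .p2align checks in this file: the directive's operand is the log2 of the byte alignment, and requested alignments below the 4-byte instruction minimum are raised to it (the "too low" comment above). Spelled out:

  align 32 on @simple_align16 -> .p2align 5   ; 2^5 = 32 bytes
  align 2  on @simple_align2  -> .p2align 2   ; raised to 2^2 = 4 bytes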
580580 ; CHECK-NEXT: ValueType: I8
581581 ; CHECK-NEXT: AddrSpaceQual: Global
582582 define amdgpu_kernel void @test_addr_space(i32 addrspace(1)* %g,
583 i32 addrspace(2)* %c,
583 i32 addrspace(4)* %c,
584584 i32 addrspace(3)* %l)
585585 !kernel_arg_addr_space !50 !kernel_arg_access_qual !23 !kernel_arg_type !51
586586 !kernel_arg_base_type !51 !kernel_arg_type_qual !25 {
1919 %.0.vec.insert = insertelement <2 x i32> undef, i32 %arg2, i32 0
2020 %.4.vec.insert = shufflevector <2 x i32> %.0.vec.insert, <2 x i32> %tmp6, <2 x i32> <i32 0, i32 3>
2121 %tmp7 = bitcast <2 x i32> %.4.vec.insert to i64
22 %tmp8 = inttoptr i64 %tmp7 to [4294967295 x i8] addrspace(2)*
22 %tmp8 = inttoptr i64 %tmp7 to [4294967295 x i8] addrspace(4)*
2323 %tmp9 = add <3 x i32> %arg3, %arg5
24 %tmp10 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(2)* %tmp8, i64 0, i64 32
25 %tmp11 = bitcast i8 addrspace(2)* %tmp10 to <8 x i32> addrspace(2)*, !amdgpu.uniform !0
26 %tmp12 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp11, align 16
24 %tmp10 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(4)* %tmp8, i64 0, i64 32
25 %tmp11 = bitcast i8 addrspace(4)* %tmp10 to <8 x i32> addrspace(4)*, !amdgpu.uniform !0
26 %tmp12 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp11, align 16
2727 %tmp13 = shufflevector <3 x i32> %tmp9, <3 x i32> undef, <2 x i32> <i32 0, i32 1>
2828 %tmp14 = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %tmp13, <8 x i32> %tmp12, i32 15, i1 false, i1 false, i1 false, i1 false) #0
29 %tmp15 = inttoptr i64 %tmp7 to <8 x i32> addrspace(2)*
30 %tmp16 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp15, align 16
29 %tmp15 = inttoptr i64 %tmp7 to <8 x i32> addrspace(4)*
30 %tmp16 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp15, align 16
3131 call void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float> %tmp14, <2 x i32> %tmp13, <8 x i32> %tmp16, i32 15, i1 false, i1 false, i1 false, i1 false) #0
32 %tmp17 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp15, align 16
32 %tmp17 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp15, align 16
3333 %tmp18 = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %tmp13, <8 x i32> %tmp17, i32 15, i1 false, i1 false, i1 false, i1 false) #0
34 %tmp19 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(2)* %tmp8, i64 0, i64 64
35 %tmp20 = bitcast i8 addrspace(2)* %tmp19 to <8 x i32> addrspace(2)*, !amdgpu.uniform !0
36 %tmp21 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp20, align 16
34 %tmp19 = getelementptr [4294967295 x i8], [4294967295 x i8] addrspace(4)* %tmp8, i64 0, i64 64
35 %tmp20 = bitcast i8 addrspace(4)* %tmp19 to <8 x i32> addrspace(4)*, !amdgpu.uniform !0
36 %tmp21 = load <8 x i32>, <8 x i32> addrspace(4)* %tmp20, align 16
3737 call void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float> %tmp18, <2 x i32> %tmp13, <8 x i32> %tmp21, i32 15, i1 false, i1 false, i1 false, i1 false) #0
3838 ret void
3939 }
99
1010 ; GFX9-NOT: lshr
1111 ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, 0x3e7, [[VEC]]
12 define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
13 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
12 define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
13 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
1414 %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
1515 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
1616 ret void
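For the GFX9 s_pack checks in this file: per my reading of the GFX9 ISA (the diff itself does not define these), s_pack_ll_b32_b16 packs the low halves of both sources, s_pack_lh_b32_b16 the low half of src0 with the high half of src1, and s_pack_hh_b32_b16 the high halves of both. The lh form as IR, which is why a single instruction implements an insert at element 0:

define i32 @s_pack_lh_b32_b16_sketch(i32 %s0, i32 %s1) {
  %lo = and i32 %s0, 65535       ; low 16 bits of src0
  %hi = and i32 %s1, -65536      ; high 16 bits of src1 (0xffff0000)
  %d = or i32 %lo, %hi
  ret i32 %d
}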
2727 ; GFX9-NOT: [[ELT0]]
2828 ; GFX9-NOT: [[VEC]]
2929 ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT0]], [[VEC]]
30 define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
31 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
30 define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 {
31 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
3232 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
3333 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
3434 ret void
4747 ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
4848 ; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
4949 ; GFX9-DAG: ; use [[ELT1]]
50 define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
51 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
50 define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 {
51 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
5252 %elt1 = extractelement <2 x i16> %vec, i32 1
5353 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
5454 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
6767 ; GFX9-NOT: [[ELT0]]
6868 ; GFX9-NOT: [[VEC]]
6969 ; GFX9: s_pack_hh_b32_b16 s{{[0-9]+}}, [[ELT_ARG]], [[VEC]]
70 define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
71 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
70 define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
71 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
7272 %elt.hi = lshr i32 %elt.arg, 16
7373 %elt = trunc i32 %elt.hi to i16
7474 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
8787 ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[ELT_ARG]], 16
8888 ; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT1]], [[VEC]]
8989 ; GFX9: ; use [[ELT1]]
90 define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
91 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
90 define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
91 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
9292 %elt.hi = lshr i32 %elt.arg, 16
9393 %elt = trunc i32 %elt.hi to i16
9494 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
112112 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]]
113113 ; GFX9: ; use [[ELT_HI]]
114114 ; GFX9: ; use [[VEC_HI]]
115 define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
116 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
115 define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
116 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
117117 %elt.hi = lshr i32 %elt.arg, 16
118118 %elt = trunc i32 %elt.hi to i16
119119 %vec.hi = extractelement <2 x i16> %vec, i32 1
136136 ; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x3e70000
137137
138138 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x3e7
139 define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
140 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
139 define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr) #0 {
140 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
141141 %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
142142 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
143143 ret void
152152
153153 ; GCN-NOT: shlr
154154 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1]]
155 define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
156 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
155 define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 {
156 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
157157 %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
158158 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
159159 ret void
166166
167167 ; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
168168 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, 0x4500, [[ELT1]]
169 define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
170 %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
169 define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
170 %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
171171 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
172172 store <2 x half> %vecins, <2 x half> addrspace(1)* %out
173173 ret void
181181 ; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x45000000
182182
183183 ; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x4500
184 define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
185 %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
184 define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(4)* %vec.ptr) #0 {
185 %vec = load <2 x half>, <2 x half> addrspace(4)* %vec.ptr
186186 %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
187187 store <2 x half> %vecins, <2 x half> addrspace(1)* %out
188188 ret void
398398 ; GCN-DAG: s_lshl_b32 [[MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
399399 ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VVEC]]
400400 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
401 define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(2)* %idx.ptr) #0 {
402 %idx = load volatile i32, i32 addrspace(2)* %idx.ptr
403 %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
401 define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 addrspace(4)* %idx.ptr) #0 {
402 %idx = load volatile i32, i32 addrspace(4)* %idx.ptr
403 %vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
404404 %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
405405 store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
406406 ret void
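The dynamic case above cannot use s_pack, so the checks expect a bitfield insert: per my reading of the ISA, v_bfi_b32 d, s0, s1, s2 computes (s0 & s1) | (~s0 & s2), with the mask 0xffff shifted by the scaled index selecting which half is replaced. As an IR sketch (the constant 65537 * 999 replicates 999 into both halves; function name made up):

define <2 x i16> @dynamic_insert_sketch(<2 x i16> %vec, i32 %idx) {
  %scaled = shl i32 %idx, 4                ; bit offset = idx * 16
  %mask = shl i32 65535, %scaled           ; 0xffff << scaled
  %k = mul i32 65537, 999                  ; 0x3e703e7: 999 in both halves
  %vec.i32 = bitcast <2 x i16> %vec to i32
  %not.mask = xor i32 %mask, -1
  %sel.new = and i32 %mask, %k             ; bits taken from the new element
  %sel.old = and i32 %not.mask, %vec.i32   ; bits kept from the old vector
  %res.i32 = or i32 %sel.new, %sel.old     ; v_bfi_b32: (s0 & s1) | (~s0 & s2)
  %res = bitcast i32 %res.i32 to <2 x i16>
  ret <2 x i16> %res
}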
2121 ; GCN: s_load_dwordx2 s{{\[}}[[SPTR_LO:[0-9]+]]:[[SPTR_HI:[0-9]+]]{{\]}}
2222 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b
2323 ; GCN: buffer_store_dword [[K]], off, s{{\[}}[[SPTR_LO]]:
24 define amdgpu_kernel void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(2)* dereferenceable(4096) nonnull %in) #0 {
25 %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(2)* %in, !invariant.load !0
24 define amdgpu_kernel void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(4)* dereferenceable(4096) nonnull %in) #0 {
25 %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(4)* %in, !invariant.load !0
2626 %ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1
2727 store i16 123, i16 addrspace(1)* %ptr, align 4
2828 store i16 456, i16 addrspace(1)* %ptr.1
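A quick sanity check of the 0x1c8007b immediate in the checks above, assuming the two i16 stores merge into one little-endian dword:

  i16 123 at the low address = 0x007b
  i16 456 at address + 2     = 0x01c8
  (0x01c8 << 16) | 0x007b    = 0x01c8007b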
1313 ; CHECK: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding
1414 ; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, [[K]] idxen offen offset:65535 glc slc
1515
16 define amdgpu_vs void @main([17 x <4 x i32>] addrspace(2)* byval %arg, [32 x <4 x i32>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <4 x i32>] addrspace(2)* byval %arg3, [17 x <4 x i32>] addrspace(2)* inreg %arg4, [17 x <4 x i32>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) {
16 define amdgpu_vs void @main([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <32 x i8>] addrspace(4)* byval %arg2, [2 x <4 x i32>] addrspace(4)* byval %arg3, [17 x <4 x i32>] addrspace(4)* inreg %arg4, [17 x <4 x i32>] addrspace(4)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) {
1717 main_body:
18 %tmp = getelementptr [2 x <4 x i32>], [2 x <4 x i32>] addrspace(2)* %arg3, i64 0, i32 1
19 %tmp10 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp, !tbaa !0
18 %tmp = getelementptr [2 x <4 x i32>], [2 x <4 x i32>] addrspace(4)* %arg3, i64 0, i32 1
19 %tmp10 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
2020 %tmp11 = shl i32 %arg6, 2
2121 %tmp12 = call i32 @llvm.SI.buffer.load.dword.i32.i32(<4 x i32> %tmp10, i32 0, i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0)
2222 %tmp13 = bitcast i32 %tmp12 to float
66 ; GCN: enable_sgpr_dispatch_ptr = 1
77 ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
88 define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
9 %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
10 %header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
11 %value = load i32, i32 addrspace(2)* %header_ptr
9 %dispatch_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
10 %header_ptr = bitcast i8 addrspace(4)* %dispatch_ptr to i32 addrspace(4)*
11 %value = load i32, i32 addrspace(4)* %header_ptr
1212 store i32 %value, i32 addrspace(1)* %out
1313 ret void
1414 }
1515
16 declare noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
16 declare noalias i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #0
1717
1818 attributes #0 = { readnone }
11
22 ; ERROR: in function test_kernel{{.*}}: non-hsa intrinsic with hsa target
33 define amdgpu_kernel void @test_kernel(i32 addrspace(1)* %out) #1 {
4 %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
5 %header_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)*
6 %value = load i32, i32 addrspace(2)* %header_ptr
4 %implicit_buffer_ptr = call i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr()
5 %header_ptr = bitcast i8 addrspace(4)* %implicit_buffer_ptr to i32 addrspace(4)*
6 %value = load i32, i32 addrspace(4)* %header_ptr
77 store i32 %value, i32 addrspace(1)* %out
88 ret void
99 }
1010
1111 ; ERROR: in function test_func{{.*}}: non-hsa intrinsic with hsa target
1212 define void @test_func(i32 addrspace(1)* %out) #1 {
13 %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
14 %header_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)*
15 %value = load i32, i32 addrspace(2)* %header_ptr
13 %implicit_buffer_ptr = call i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr()
14 %header_ptr = bitcast i8 addrspace(4)* %implicit_buffer_ptr to i32 addrspace(4)*
15 %value = load i32, i32 addrspace(4)* %header_ptr
1616 store i32 %value, i32 addrspace(1)* %out
1717 ret void
1818 }
1919
20 declare i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() #0
20 declare i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr() #0
2121
2222 attributes #0 = { nounwind readnone speculatable }
2323 attributes #1 = { nounwind }
99 define amdgpu_ps i32 @test_ps() #1 {
1010 %alloca = alloca i32, addrspace(5)
1111 store volatile i32 0, i32 addrspace(5)* %alloca
12 %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
13 %buffer_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)*
14 %value = load volatile i32, i32 addrspace(2)* %buffer_ptr
12 %implicit_buffer_ptr = call i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr()
13 %buffer_ptr = bitcast i8 addrspace(4)* %implicit_buffer_ptr to i32 addrspace(4)*
14 %value = load volatile i32, i32 addrspace(4)* %buffer_ptr
1515 ret i32 %value
1616 }
1717
2222 define amdgpu_cs i32 @test_cs() #1 {
2323 %alloca = alloca i32, addrspace(5)
2424 store volatile i32 0, i32 addrspace(5)* %alloca
25 %implicit_buffer_ptr = call i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr()
26 %buffer_ptr = bitcast i8 addrspace(2)* %implicit_buffer_ptr to i32 addrspace(2)*
27 %value = load volatile i32, i32 addrspace(2)* %buffer_ptr
25 %implicit_buffer_ptr = call i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr()
26 %buffer_ptr = bitcast i8 addrspace(4)* %implicit_buffer_ptr to i32 addrspace(4)*
27 %value = load volatile i32, i32 addrspace(4)* %buffer_ptr
2828 ret i32 %value
2929 }
3030
31 declare i8 addrspace(2)* @llvm.amdgcn.implicit.buffer.ptr() #0
31 declare i8 addrspace(4)* @llvm.amdgcn.implicit.buffer.ptr() #0
3232
3333 attributes #0 = { nounwind readnone speculatable }
3434 attributes #1 = { nounwind }
1010
1111 ; HSA: s_load_dword s0, s[4:5], 0x0
1212 define amdgpu_kernel void @kernel_implicitarg_ptr_empty() #0 {
13 %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
14 %cast = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)*
15 %load = load volatile i32, i32 addrspace(2)* %cast
13 %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
14 %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
15 %load = load volatile i32, i32 addrspace(4)* %cast
1616 ret void
1717 }
1818
2525
2626 ; HSA: s_load_dword s0, s[4:5], 0x1c
2727 define amdgpu_kernel void @kernel_implicitarg_ptr([112 x i8]) #0 {
28 %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
29 %cast = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)*
30 %load = load volatile i32, i32 addrspace(2)* %cast
28 %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
29 %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
30 %load = load volatile i32, i32 addrspace(4)* %cast
3131 ret void
3232 }
3333
3737 ; GCN-NEXT: s_waitcnt
3838 ; GCN-NEXT: s_setpc_b64
3939 define void @func_implicitarg_ptr() #1 {
40 %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
41 %cast = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)*
42 %load = load volatile i32, i32 addrspace(2)* %cast
40 %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
41 %cast = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
42 %load = load volatile i32, i32 addrspace(4)* %cast
4343 ret void
4444 }
4545
8585 ; GCN: s_load_dword s{{[0-9]+}}, s[6:7], 0x0{{$}}
8686 ; GCN: s_load_dword s{{[0-9]+}}, s[8:9], 0x0{{$}}
8787 define void @func_kernarg_implicitarg_ptr() #1 {
88 %kernarg.segment.ptr = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
89 %implicitarg.ptr = call i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
90 %cast.kernarg.segment.ptr = bitcast i8 addrspace(2)* %kernarg.segment.ptr to i32 addrspace(2)*
91 %cast.implicitarg = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)*
92 %load0 = load volatile i32, i32 addrspace(2)* %cast.kernarg.segment.ptr
93 %load1 = load volatile i32, i32 addrspace(2)* %cast.implicitarg
88 %kernarg.segment.ptr = call i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
89 %implicitarg.ptr = call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
90 %cast.kernarg.segment.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
91 %cast.implicitarg = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
92 %load0 = load volatile i32, i32 addrspace(4)* %cast.kernarg.segment.ptr
93 %load1 = load volatile i32, i32 addrspace(4)* %cast.implicitarg
9494 ret void
9595 }
9696
105105 ret void
106106 }
107107
108 declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #2
109 declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #2
108 declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #2
109 declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #2
110110
111111 attributes #0 = { nounwind noinline }
112112 attributes #1 = { nounwind noinline }
1010
1111 ; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0xa
1212 define amdgpu_kernel void @test(i32 addrspace(1)* %out) #1 {
13 %kernarg.segment.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
14 %header.ptr = bitcast i8 addrspace(2)* %kernarg.segment.ptr to i32 addrspace(2)*
15 %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10
16 %value = load i32, i32 addrspace(2)* %gep
13 %kernarg.segment.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
14 %header.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
15 %gep = getelementptr i32, i32 addrspace(4)* %header.ptr, i64 10
16 %value = load i32, i32 addrspace(4)* %gep
1717 store i32 %value, i32 addrspace(1)* %out
1818 ret void
1919 }
2222 ; 10 + 9 (36 prepended implicit bytes) + 2 (out pointer) = 21 = 0x15 (worked through after this function)
2323 ; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0x15
2424 define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 {
25 %implicitarg.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
26 %header.ptr = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)*
27 %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10
28 %value = load i32, i32 addrspace(2)* %gep
25 %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
26 %header.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
27 %gep = getelementptr i32, i32 addrspace(4)* %header.ptr, i64 10
28 %value = load i32, i32 addrspace(4)* %gep
2929 store i32 %value, i32 addrspace(1)* %out
3030 ret void
3131 }
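The worked version of the offset comment above, assuming the dword-scaled SMRD offset encoding these OS-UNKNOWN checks use:

  36 prepended implicit bytes     = 9 dwords
  i32 addrspace(1)* %out kernarg  = 2 dwords
  getelementptr index of 10       = 10 dwords
  9 + 2 + 10 = 21 dwords          = 0x15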
4141 ; MESA: buffer_store_dword [[V_VAL]]
4242 ; HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[V_VAL]]
4343 define amdgpu_kernel void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x i8> %in) #1 {
44 %implicitarg.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
45 %arg.ptr = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)*
46 %val = load i32, i32 addrspace(2)* %arg.ptr
44 %implicitarg.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr()
45 %arg.ptr = bitcast i8 addrspace(4)* %implicitarg.ptr to i32 addrspace(4)*
46 %val = load i32, i32 addrspace(4)* %arg.ptr
4747 store i32 %val, i32 addrspace(1)* %out
4848 ret void
4949 }
5252 ; HSA: enable_sgpr_kernarg_segment_ptr = 1
5353 ; HSA: s_load_dword s{{[0-9]+}}, s[4:5]
5454 define amdgpu_kernel void @test_no_kernargs() #1 {
55 %kernarg.segment.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
56 %header.ptr = bitcast i8 addrspace(2)* %kernarg.segment.ptr to i32 addrspace(2)*
57 %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10
58 %value = load i32, i32 addrspace(2)* %gep
55 %kernarg.segment.ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
56 %header.ptr = bitcast i8 addrspace(4)* %kernarg.segment.ptr to i32 addrspace(4)*
57 %gep = getelementptr i32, i32 addrspace(4)* %header.ptr, i64 10
58 %value = load i32, i32 addrspace(4)* %gep
5959 store volatile i32 %value, i32 addrspace(1)* undef
6060 ret void
6161 }
6262
63 declare i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr() #0
64 declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #0
63 declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #0
64 declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #0
6565
6666 attributes #0 = { nounwind readnone }
6767 attributes #1 = { nounwind }
66 ; GCN: enable_sgpr_queue_ptr = 1
77 ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
88 define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
9 %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
10 %header_ptr = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
11 %value = load i32, i32 addrspace(2)* %header_ptr
9 %queue_ptr = call noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
10 %header_ptr = bitcast i8 addrspace(4)* %queue_ptr to i32 addrspace(4)*
11 %value = load i32, i32 addrspace(4)* %header_ptr
1212 store i32 %value, i32 addrspace(1)* %out
1313 ret void
1414 }
1515
16 declare noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
16 declare noalias i8 addrspace(4)* @llvm.amdgcn.queue.ptr() #0
1717
1818 attributes #0 = { nounwind readnone }
22
33 declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture, i32, i1) nounwind
44 declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i1) nounwind
5 declare void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(2)* nocapture, i64, i1) nounwind
5 declare void @llvm.memcpy.p1i8.p4i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(4)* nocapture, i64, i1) nounwind
66
77
88 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align1:
327327 }
328328
329329 ; Test shouldConvertConstantLoadToIntImm
330 @hello.align4 = private unnamed_addr addrspace(2) constant [16 x i8] c"constant string\00", align 4
331 @hello.align1 = private unnamed_addr addrspace(2) constant [16 x i8] c"constant string\00", align 1
330 @hello.align4 = private unnamed_addr addrspace(4) constant [16 x i8] c"constant string\00", align 4
331 @hello.align1 = private unnamed_addr addrspace(4) constant [16 x i8] c"constant string\00", align 1
332332
333333 ; FUNC-LABEL: {{^}}test_memcpy_const_string_align4:
334334 ; SI: s_getpc_b64
340340 ; SI-DAG: buffer_store_dwordx4
341341 ; SI-DAG: buffer_store_dwordx4
342342 define amdgpu_kernel void @test_memcpy_const_string_align4(i8 addrspace(1)* noalias %out) nounwind {
343 %str = bitcast [16 x i8] addrspace(2)* @hello.align4 to i8 addrspace(2)*
344 call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* align 4 %out, i8 addrspace(2)* align 4 %str, i64 32, i1 false)
343 %str = bitcast [16 x i8] addrspace(4)* @hello.align4 to i8 addrspace(4)*
344 call void @llvm.memcpy.p1i8.p4i8.i64(i8 addrspace(1)* align 4 %out, i8 addrspace(4)* align 4 %str, i64 32, i1 false)
345345 ret void
346346 }
347347
365365 ; SI: buffer_store_byte
366366 ; SI: buffer_store_byte
367367 define amdgpu_kernel void @test_memcpy_const_string_align1(i8 addrspace(1)* noalias %out) nounwind {
368 %str = bitcast [16 x i8] addrspace(2)* @hello.align1 to i8 addrspace(2)*
369 call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* %out, i8 addrspace(2)* %str, i64 32, i1 false)
370 ret void
371 }
368 %str = bitcast [16 x i8] addrspace(4)* @hello.align1 to i8 addrspace(4)*
369 call void @llvm.memcpy.p1i8.p4i8.i64(i8 addrspace(1)* %out, i8 addrspace(4)* %str, i64 32, i1 false)
370 ret void
371 }
55 ; GCN: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}]
66 ; GCN-NOHSA: buffer_store_dwordx2
77 ; GCN-HSA: flat_store_dwordx2
8 define amdgpu_kernel void @constant_load_f64(double addrspace(1)* %out, double addrspace(2)* %in) #0 {
9 %ld = load double, double addrspace(2)* %in
8 define amdgpu_kernel void @constant_load_f64(double addrspace(1)* %out, double addrspace(4)* %in) #0 {
9 %ld = load double, double addrspace(4)* %in
1010 store double %ld, double addrspace(1)* %out
1111 ret void
1212 }
88
99 ; EG: VTX_READ_8
1010 ; EG: AND_INT
11 define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
12 %load = load i1, i1 addrspace(2)* %in
11 define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(4)* nocapture %in) #0 {
12 %load = load i1, i1 addrspace(4)* %in
1313 store i1 %load, i1 addrspace(1)* %out
1414 ret void
1515 }
1616
1717 ; FUNC-LABEL: {{^}}constant_load_v2i1:
18 define amdgpu_kernel void @constant_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
19 %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
18 define amdgpu_kernel void @constant_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(4)* nocapture %in) #0 {
19 %load = load <2 x i1>, <2 x i1> addrspace(4)* %in
2020 store <2 x i1> %load, <2 x i1> addrspace(1)* %out
2121 ret void
2222 }
2323
2424 ; FUNC-LABEL: {{^}}constant_load_v3i1:
25 define amdgpu_kernel void @constant_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
26 %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
25 define amdgpu_kernel void @constant_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(4)* nocapture %in) #0 {
26 %load = load <3 x i1>, <3 x i1> addrspace(4)* %in
2727 store <3 x i1> %load, <3 x i1> addrspace(1)* %out
2828 ret void
2929 }
3030
3131 ; FUNC-LABEL: {{^}}constant_load_v4i1:
32 define amdgpu_kernel void @constant_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
33 %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
32 define amdgpu_kernel void @constant_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(4)* nocapture %in) #0 {
33 %load = load <4 x i1>, <4 x i1> addrspace(4)* %in
3434 store <4 x i1> %load, <4 x i1> addrspace(1)* %out
3535 ret void
3636 }
3737
3838 ; FUNC-LABEL: {{^}}constant_load_v8i1:
39 define amdgpu_kernel void @constant_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
40 %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
39 define amdgpu_kernel void @constant_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(4)* nocapture %in) #0 {
40 %load = load <8 x i1>, <8 x i1> addrspace(4)* %in
4141 store <8 x i1> %load, <8 x i1> addrspace(1)* %out
4242 ret void
4343 }
4444
4545 ; FUNC-LABEL: {{^}}constant_load_v16i1:
46 define amdgpu_kernel void @constant_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
47 %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
46 define amdgpu_kernel void @constant_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(4)* nocapture %in) #0 {
47 %load = load <16 x i1>, <16 x i1> addrspace(4)* %in
4848 store <16 x i1> %load, <16 x i1> addrspace(1)* %out
4949 ret void
5050 }
5151
5252 ; FUNC-LABEL: {{^}}constant_load_v32i1:
53 define amdgpu_kernel void @constant_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
54 %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
53 define amdgpu_kernel void @constant_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(4)* nocapture %in) #0 {
54 %load = load <32 x i1>, <32 x i1> addrspace(4)* %in
5555 store <32 x i1> %load, <32 x i1> addrspace(1)* %out
5656 ret void
5757 }
5858
5959 ; FUNC-LABEL: {{^}}constant_load_v64i1:
60 define amdgpu_kernel void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
61 %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
60 define amdgpu_kernel void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(4)* nocapture %in) #0 {
61 %load = load <64 x i1>, <64 x i1> addrspace(4)* %in
6262 store <64 x i1> %load, <64 x i1> addrspace(1)* %out
6363 ret void
6464 }
6666 ; FUNC-LABEL: {{^}}constant_zextload_i1_to_i32:
6767 ; GCN: buffer_load_ubyte
6868 ; GCN: buffer_store_dword
69 define amdgpu_kernel void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
70 %a = load i1, i1 addrspace(2)* %in
69 define amdgpu_kernel void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(4)* nocapture %in) #0 {
70 %a = load i1, i1 addrspace(4)* %in
7171 %ext = zext i1 %a to i32
7272 store i32 %ext, i32 addrspace(1)* %out
7373 ret void
8080
8181 ; EG: VTX_READ_8
8282 ; EG: BFE_INT
83 define amdgpu_kernel void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
84 %a = load i1, i1 addrspace(2)* %in
83 define amdgpu_kernel void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(4)* nocapture %in) #0 {
84 %a = load i1, i1 addrspace(4)* %in
8585 %ext = sext i1 %a to i32
8686 store i32 %ext, i32 addrspace(1)* %out
8787 ret void
8888 }
8989
9090 ; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i32:
91 define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
92 %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
91 define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(4)* nocapture %in) #0 {
92 %load = load <1 x i1>, <1 x i1> addrspace(4)* %in
9393 %ext = zext <1 x i1> %load to <1 x i32>
9494 store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
9595 ret void
9696 }
9797
9898 ; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i32:
99 define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
100 %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
99 define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(4)* nocapture %in) #0 {
100 %load = load <1 x i1>, <1 x i1> addrspace(4)* %in
101101 %ext = sext <1 x i1> %load to <1 x i32>
102102 store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
103103 ret void
104104 }
105105
106106 ; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i32:
107 define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
108 %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
107 define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(4)* nocapture %in) #0 {
108 %load = load <2 x i1>, <2 x i1> addrspace(4)* %in
109109 %ext = zext <2 x i1> %load to <2 x i32>
110110 store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
111111 ret void
112112 }
113113
114114 ; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i32:
115 define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
116 %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
115 define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(4)* nocapture %in) #0 {
116 %load = load <2 x i1>, <2 x i1> addrspace(4)* %in
117117 %ext = sext <2 x i1> %load to <2 x i32>
118118 store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
119119 ret void
120120 }
121121
122122 ; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i32:
123 define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
124 %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
123 define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(4)* nocapture %in) #0 {
124 %load = load <3 x i1>, <3 x i1> addrspace(4)* %in
125125 %ext = zext <3 x i1> %load to <3 x i32>
126126 store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
127127 ret void
128128 }
129129
130130 ; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i32:
131 define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
132 %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
131 define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(4)* nocapture %in) #0 {
132 %load = load <3 x i1>, <3 x i1> addrspace(4)* %in
133133 %ext = sext <3 x i1> %load to <3 x i32>
134134 store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
135135 ret void
136136 }
137137
138138 ; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i32:
139 define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
140 %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
139 define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(4)* nocapture %in) #0 {
140 %load = load <4 x i1>, <4 x i1> addrspace(4)* %in
141141 %ext = zext <4 x i1> %load to <4 x i32>
142142 store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
143143 ret void
144144 }
145145
146146 ; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i32:
147 define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
148 %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
147 define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(4)* nocapture %in) #0 {
148 %load = load <4 x i1>, <4 x i1> addrspace(4)* %in
149149 %ext = sext <4 x i1> %load to <4 x i32>
150150 store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
151151 ret void
152152 }
153153
154154 ; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i32:
155 define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
156 %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
155 define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(4)* nocapture %in) #0 {
156 %load = load <8 x i1>, <8 x i1> addrspace(4)* %in
157157 %ext = zext <8 x i1> %load to <8 x i32>
158158 store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
159159 ret void
160160 }
161161
162162 ; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i32:
163 define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
164 %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
163 define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(4)* nocapture %in) #0 {
164 %load = load <8 x i1>, <8 x i1> addrspace(4)* %in
165165 %ext = sext <8 x i1> %load to <8 x i32>
166166 store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
167167 ret void
168168 }
169169
170170 ; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i32:
171 define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
172 %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
171 define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(4)* nocapture %in) #0 {
172 %load = load <16 x i1>, <16 x i1> addrspace(4)* %in
173173 %ext = zext <16 x i1> %load to <16 x i32>
174174 store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
175175 ret void
176176 }
177177
178178 ; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i32:
179 define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
180 %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
179 define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(4)* nocapture %in) #0 {
180 %load = load <16 x i1>, <16 x i1> addrspace(4)* %in
181181 %ext = sext <16 x i1> %load to <16 x i32>
182182 store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
183183 ret void
184184 }
185185
186186 ; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i32:
187 define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
188 %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
187 define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(4)* nocapture %in) #0 {
188 %load = load <32 x i1>, <32 x i1> addrspace(4)* %in
189189 %ext = zext <32 x i1> %load to <32 x i32>
190190 store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
191191 ret void
192192 }
193193
194194 ; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i32:
195 define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
196 %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
195 define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(4)* nocapture %in) #0 {
196 %load = load <32 x i1>, <32 x i1> addrspace(4)* %in
197197 %ext = sext <32 x i1> %load to <32 x i32>
198198 store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
199199 ret void
200200 }
201201
202202 ; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i32:
203 define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
204 %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
203 define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(4)* nocapture %in) #0 {
204 %load = load <64 x i1>, <64 x i1> addrspace(4)* %in
205205 %ext = zext <64 x i1> %load to <64 x i32>
206206 store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
207207 ret void
208208 }
209209
210210 ; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i32:
211 define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
212 %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
211 define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(4)* nocapture %in) #0 {
212 %load = load <64 x i1>, <64 x i1> addrspace(4)* %in
213213 %ext = sext <64 x i1> %load to <64 x i32>
214214 store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
215215 ret void
220220 ; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
221221 ; GCN-DAG: v_and_b32_e32 {{v[0-9]+}}, 1, [[LOAD]]
222222 ; GCN: buffer_store_dwordx2
223 define amdgpu_kernel void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
224 %a = load i1, i1 addrspace(2)* %in
223 define amdgpu_kernel void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(4)* nocapture %in) #0 {
224 %a = load i1, i1 addrspace(4)* %in
225225 %ext = zext i1 %a to i64
226226 store i64 %ext, i64 addrspace(1)* %out
227227 ret void
232232 ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
233233 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
234234 ; GCN: buffer_store_dwordx2
235 define amdgpu_kernel void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
236 %a = load i1, i1 addrspace(2)* %in
235 define amdgpu_kernel void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(4)* nocapture %in) #0 {
236 %a = load i1, i1 addrspace(4)* %in
237237 %ext = sext i1 %a to i64
238238 store i64 %ext, i64 addrspace(1)* %out
239239 ret void
240240 }
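; The pair of kernels above, read with their GCN checks, spell out how an i1
; widens to i64 as two 32-bit halves. A minimal comment sketch of that split
; (value names here are assumed, not test content):
;   zext:  %low  = and i32 %load, 1       ; v_and_b32 ... 1, [[LOAD]]
;          %high = 0                      ; v_mov_b32 ... 0
;   sext:  %low  = bfe_i32 %load, 0, 1    ; v_bfe_i32 ..., 0, 1
;          %high = ashr i32 %low, 31      ; v_ashrrev_i32 ... 31, [[BFE]]
; and both halves leave together via buffer_store_dwordx2.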
241241
242242 ; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i64:
243 define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
244 %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
243 define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(4)* nocapture %in) #0 {
244 %load = load <1 x i1>, <1 x i1> addrspace(4)* %in
245245 %ext = zext <1 x i1> %load to <1 x i64>
246246 store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
247247 ret void
248248 }
249249
250250 ; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i64:
251 define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
252 %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
251 define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(4)* nocapture %in) #0 {
252 %load = load <1 x i1>, <1 x i1> addrspace(4)* %in
253253 %ext = sext <1 x i1> %load to <1 x i64>
254254 store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
255255 ret void
256256 }
257257
258258 ; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i64:
259 define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
260 %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
259 define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(4)* nocapture %in) #0 {
260 %load = load <2 x i1>, <2 x i1> addrspace(4)* %in
261261 %ext = zext <2 x i1> %load to <2 x i64>
262262 store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
263263 ret void
264264 }
265265
266266 ; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i64:
267 define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
268 %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
267 define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(4)* nocapture %in) #0 {
268 %load = load <2 x i1>, <2 x i1> addrspace(4)* %in
269269 %ext = sext <2 x i1> %load to <2 x i64>
270270 store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
271271 ret void
272272 }
273273
274274 ; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i64:
275 define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
276 %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
275 define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(4)* nocapture %in) #0 {
276 %load = load <3 x i1>, <3 x i1> addrspace(4)* %in
277277 %ext = zext <3 x i1> %load to <3 x i64>
278278 store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
279279 ret void
280280 }
281281
282282 ; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i64:
283 define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
284 %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
283 define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(4)* nocapture %in) #0 {
284 %load = load <3 x i1>, <3 x i1> addrspace(4)* %in
285285 %ext = sext <3 x i1> %load to <3 x i64>
286286 store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
287287 ret void
288288 }
289289
290290 ; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i64:
291 define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
292 %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
291 define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(4)* nocapture %in) #0 {
292 %load = load <4 x i1>, <4 x i1> addrspace(4)* %in
293293 %ext = zext <4 x i1> %load to <4 x i64>
294294 store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
295295 ret void
296296 }
297297
298298 ; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i64:
299 define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
300 %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
299 define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(4)* nocapture %in) #0 {
300 %load = load <4 x i1>, <4 x i1> addrspace(4)* %in
301301 %ext = sext <4 x i1> %load to <4 x i64>
302302 store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
303303 ret void
304304 }
305305
306306 ; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i64:
307 define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
308 %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
307 define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(4)* nocapture %in) #0 {
308 %load = load <8 x i1>, <8 x i1> addrspace(4)* %in
309309 %ext = zext <8 x i1> %load to <8 x i64>
310310 store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
311311 ret void
312312 }
313313
314314 ; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i64:
315 define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
316 %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
315 define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(4)* nocapture %in) #0 {
316 %load = load <8 x i1>, <8 x i1> addrspace(4)* %in
317317 %ext = sext <8 x i1> %load to <8 x i64>
318318 store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
319319 ret void
320320 }
321321
322322 ; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i64:
323 define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
324 %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
323 define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(4)* nocapture %in) #0 {
324 %load = load <16 x i1>, <16 x i1> addrspace(4)* %in
325325 %ext = zext <16 x i1> %load to <16 x i64>
326326 store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
327327 ret void
328328 }
329329
330330 ; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i64:
331 define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
332 %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
331 define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(4)* nocapture %in) #0 {
332 %load = load <16 x i1>, <16 x i1> addrspace(4)* %in
333333 %ext = sext <16 x i1> %load to <16 x i64>
334334 store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
335335 ret void
336336 }
337337
338338 ; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i64:
339 define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
340 %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
339 define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(4)* nocapture %in) #0 {
340 %load = load <32 x i1>, <32 x i1> addrspace(4)* %in
341341 %ext = zext <32 x i1> %load to <32 x i64>
342342 store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
343343 ret void
344344 }
345345
346346 ; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i64:
347 define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
348 %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
347 define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(4)* nocapture %in) #0 {
348 %load = load <32 x i1>, <32 x i1> addrspace(4)* %in
349349 %ext = sext <32 x i1> %load to <32 x i64>
350350 store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
351351 ret void
352352 }
353353
354354 ; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i64:
355 define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
356 %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
355 define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(4)* nocapture %in) #0 {
356 %load = load <64 x i1>, <64 x i1> addrspace(4)* %in
357357 %ext = zext <64 x i1> %load to <64 x i64>
358358 store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
359359 ret void
360360 }
361361
362362 ; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i64:
363 define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
364 %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
363 define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(4)* nocapture %in) #0 {
364 %load = load <64 x i1>, <64 x i1> addrspace(4)* %in
365365 %ext = sext <64 x i1> %load to <64 x i64>
366366 store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
367367 ret void
77 ; GCN-HSA: flat_load_ushort
88
99 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
10 define amdgpu_kernel void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(2)* %in) {
11 entry:
12 %ld = load i16, i16 addrspace(2)* %in
10 define amdgpu_kernel void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(4)* %in) {
11 entry:
12 %ld = load i16, i16 addrspace(4)* %in
1313 store i16 %ld, i16 addrspace(1)* %out
1414 ret void
1515 }
1818 ; GCN: s_load_dword s
1919
2020 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
21 define amdgpu_kernel void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) {
22 entry:
23 %ld = load <2 x i16>, <2 x i16> addrspace(2)* %in
21 define amdgpu_kernel void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) {
22 entry:
23 %ld = load <2 x i16>, <2 x i16> addrspace(4)* %in
2424 store <2 x i16> %ld, <2 x i16> addrspace(1)* %out
2525 ret void
2626 }
3030
3131 ; EG-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
3232 ; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1
33 define amdgpu_kernel void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
34 entry:
35 %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
33 define amdgpu_kernel void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) {
34 entry:
35 %ld = load <3 x i16>, <3 x i16> addrspace(4)* %in
3636 store <3 x i16> %ld, <3 x i16> addrspace(1)* %out
3737 ret void
3838 }
4141 ; GCN: s_load_dwordx2
4242
4343 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
44 define amdgpu_kernel void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) {
45 entry:
46 %ld = load <4 x i16>, <4 x i16> addrspace(2)* %in
44 define amdgpu_kernel void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) {
45 entry:
46 %ld = load <4 x i16>, <4 x i16> addrspace(4)* %in
4747 store <4 x i16> %ld, <4 x i16> addrspace(1)* %out
4848 ret void
4949 }
5252 ; GCN: s_load_dwordx4
5353
5454 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
55 define amdgpu_kernel void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) {
56 entry:
57 %ld = load <8 x i16>, <8 x i16> addrspace(2)* %in
55 define amdgpu_kernel void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) {
56 entry:
57 %ld = load <8 x i16>, <8 x i16> addrspace(4)* %in
5858 store <8 x i16> %ld, <8 x i16> addrspace(1)* %out
5959 ret void
6060 }
6464
6565 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
6666 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
67 define amdgpu_kernel void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) {
68 entry:
69 %ld = load <16 x i16>, <16 x i16> addrspace(2)* %in
67 define amdgpu_kernel void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) {
68 entry:
69 %ld = load <16 x i16>, <16 x i16> addrspace(4)* %in
7070 store <16 x i16> %ld, <16 x i16> addrspace(1)* %out
7171 ret void
7272 }
7979 ; GCN-HSA: flat_store_dword
8080
8181 ; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1
82 define amdgpu_kernel void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
83 %a = load i16, i16 addrspace(2)* %in
82 define amdgpu_kernel void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
83 %a = load i16, i16 addrspace(4)* %in
8484 %ext = zext i16 %a to i32
8585 store i32 %ext, i32 addrspace(1)* %out
8686 ret void
9696 ; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
9797 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
9898 ; EG: 16
99 define amdgpu_kernel void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
100 %a = load i16, i16 addrspace(2)* %in
99 define amdgpu_kernel void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
100 %a = load i16, i16 addrspace(4)* %in
101101 %ext = sext i16 %a to i32
102102 store i32 %ext, i32 addrspace(1)* %out
103103 ret void
108108 ; GCN-HSA: flat_load_ushort
109109
110110 ; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1
111 define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
112 %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
111 define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
112 %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
113113 %ext = zext <1 x i16> %load to <1 x i32>
114114 store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
115115 ret void
122122 ; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
123123 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
124124 ; EG: 16
125 define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
126 %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
125 define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
126 %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
127127 %ext = sext <1 x i16> %load to <1 x i32>
128128 store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
129129 ret void
139139 ; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal
140140 ; EG: 16
141141 ; EG: 16
142 define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
143 %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
142 define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
143 %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
144144 %ext = zext <2 x i16> %load to <2 x i32>
145145 store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
146146 ret void
159159 ; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV\.[XYZW]}}, 0.0, literal
160160 ; EG-DAG: 16
161161 ; EG-DAG: 16
162 define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
163 %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
162 define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
163 %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
164164 %ext = sext <2 x i16> %load to <2 x i32>
165165 store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
166166 ret void
182182 ; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, literal
183183 ; EG-DAG: 65535
184184 ; EG-DAG: 65535
185 define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
186 entry:
187 %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
185 define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) {
186 entry:
187 %ld = load <3 x i16>, <3 x i16> addrspace(4)* %in
188188 %ext = zext <3 x i16> %ld to <3 x i32>
189189 store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
190190 ret void
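; The AND_INT/65535 pairs checked above are the per-lane form of the zero
; extension: each extended i16 lane is masked rather than shifted. A sketch
; with an assumed lane name:
;   %ext.lane = and i32 %lane, 65535      ; AND_INT ..., literal 65535
; which is why AND_INT paired with the 65535 literal shows up in the checks.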
203203 ; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal
204204 ; EG-DAG: 16
205205 ; EG-DAG: 16
206 define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
207 entry:
208 %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
206 define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(4)* %in) {
207 entry:
208 %ld = load <3 x i16>, <3 x i16> addrspace(4)* %in
209209 %ext = sext <3 x i16> %ld to <3 x i32>
210210 store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
211211 ret void
228228 ; EG-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{T[0-9]\.[XYZW]}}, literal
229229 ; EG-DAG: 65535
230230 ; EG-DAG: 65535
231 define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
232 %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
231 define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
232 %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
233233 %ext = zext <4 x i16> %load to <4 x i32>
234234 store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
235235 ret void
253253 ; EG-DAG: 16
254254 ; EG-DAG: 16
255255 ; EG-DAG: 16
256 define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
257 %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
256 define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
257 %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
258258 %ext = sext <4 x i16> %load to <4 x i32>
259259 store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
260260 ret void
287287 ; EG-DAG: 65535
288288 ; EG-DAG: 65535
289289 ; EG-DAG: 65535
290 define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
291 %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
290 define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
291 %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
292292 %ext = zext <8 x i16> %load to <8 x i32>
293293 store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
294294 ret void
321321 ; EG-DAG: 16
322322 ; EG-DAG: 16
323323 ; EG-DAG: 16
324 define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
325 %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
324 define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
325 %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
326326 %ext = sext <8 x i16> %load to <8 x i32>
327327 store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
328328 ret void
336336 ; v16i16 is naturally 32 byte aligned
337337 ; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 0, #1
338338 ; EG-DAG: VTX_READ_128 [[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 16, #1
339 define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
340 %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
339 define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
340 %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
341341 %ext = zext <16 x i16> %load to <16 x i32>
342342 store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
343343 ret void
351351 ; v16i16 is naturally 32 byte aligned
352352 ; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 0, #1
353353 ; EG-DAG: VTX_READ_128 [[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 16, #1
354 define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
355 %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
354 define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
355 %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
356356 %ext = sext <16 x i16> %load to <16 x i32>
357357 store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
358358 ret void
368368 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
369369 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1
370370 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1
371 define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
372 %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
371 define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
372 %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
373373 %ext = zext <32 x i16> %load to <32 x i32>
374374 store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
375375 ret void
384384 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
385385 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1
386386 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1
387 define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
388 %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
387 define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
388 %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
389389 %ext = sext <32 x i16> %load to <32 x i32>
390390 store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
391391 ret void
403403 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1
404404 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1
405405 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 112, #1
406 define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
407 %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
406 define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
407 %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
408408 %ext = zext <64 x i16> %load to <64 x i32>
409409 store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
410410 ret void
420420 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1
421421 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1
422422 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 112, #1
423 define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
424 %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
423 define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
424 %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
425425 %ext = sext <64 x i16> %load to <64 x i32>
426426 store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
427427 ret void
437437
438438 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
439439 ; EG: MOV {{.*}}, 0.0
440 define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
441 %a = load i16, i16 addrspace(2)* %in
440 define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
441 %a = load i16, i16 addrspace(4)* %in
442442 %ext = zext i16 %a to i64
443443 store i64 %ext, i64 addrspace(1)* %out
444444 ret void
463463 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
464464 ; TODO: These could be expanded earlier using ASHR 15
465465 ; EG: 31
466 define amdgpu_kernel void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
467 %a = load i16, i16 addrspace(2)* %in
466 define amdgpu_kernel void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(4)* %in) #0 {
467 %a = load i16, i16 addrspace(4)* %in
468468 %ext = sext i16 %a to i64
469469 store i64 %ext, i64 addrspace(1)* %out
470470 ret void
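; The ASHR-by-31 checked above forms the sign word of the 64-bit result: the
; value is first sign-extended to 32 bits, then its sign bit is replicated.
; A sketch of the split, with assumed value names:
;   %s32 = sext i16 %a to i32       ; BFE_INT-style 16-bit sign extension
;   %hi  = ashr i32 %s32, 31        ; ASHR ... literal 31
; The TODO notes the same sign word could be formed earlier, via an
; arithmetic shift by 15 of the 16-bit value itself.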
474474
475475 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
476476 ; EG: MOV {{.*}}, 0.0
477 define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
478 %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
477 define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
478 %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
479479 %ext = zext <1 x i16> %load to <1 x i64>
480480 store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
481481 ret void
487487 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
488488 ; TODO: These could be expanded earlier using ASHR 15
489489 ; EG: 31
490 define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
491 %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
490 define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(4)* %in) #0 {
491 %load = load <1 x i16>, <1 x i16> addrspace(4)* %in
492492 %ext = sext <1 x i16> %load to <1 x i64>
493493 store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
494494 ret void
497497 ; FUNC-LABEL: {{^}}constant_zextload_v2i16_to_v2i64:
498498
499499 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
500 define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
501 %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
500 define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
501 %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
502502 %ext = zext <2 x i16> %load to <2 x i64>
503503 store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
504504 ret void
507507 ; FUNC-LABEL: {{^}}constant_sextload_v2i16_to_v2i64:
508508
509509 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
510 define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
511 %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
510 define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(4)* %in) #0 {
511 %load = load <2 x i16>, <2 x i16> addrspace(4)* %in
512512 %ext = sext <2 x i16> %load to <2 x i64>
513513 store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
514514 ret void
517517 ; FUNC-LABEL: {{^}}constant_zextload_v4i16_to_v4i64:
518518
519519 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
520 define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
521 %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
520 define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
521 %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
522522 %ext = zext <4 x i16> %load to <4 x i64>
523523 store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
524524 ret void
527527 ; FUNC-LABEL: {{^}}constant_sextload_v4i16_to_v4i64:
528528
529529 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
530 define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
531 %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
530 define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(4)* %in) #0 {
531 %load = load <4 x i16>, <4 x i16> addrspace(4)* %in
532532 %ext = sext <4 x i16> %load to <4 x i64>
533533 store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
534534 ret void
537537 ; FUNC-LABEL: {{^}}constant_zextload_v8i16_to_v8i64:
538538
539539 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
540 define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
541 %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
540 define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
541 %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
542542 %ext = zext <8 x i16> %load to <8 x i64>
543543 store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
544544 ret void
547547 ; FUNC-LABEL: {{^}}constant_sextload_v8i16_to_v8i64:
548548
549549 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
550 define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
551 %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
550 define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(4)* %in) #0 {
551 %load = load <8 x i16>, <8 x i16> addrspace(4)* %in
552552 %ext = sext <8 x i16> %load to <8 x i64>
553553 store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
554554 ret void
558558
559559 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
560560 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
561 define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
562 %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
561 define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
562 %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
563563 %ext = zext <16 x i16> %load to <16 x i64>
564564 store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
565565 ret void
569569
570570 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
571571 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
572 define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
573 %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
572 define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(4)* %in) #0 {
573 %load = load <16 x i16>, <16 x i16> addrspace(4)* %in
574574 %ext = sext <16 x i16> %load to <16 x i64>
575575 store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
576576 ret void
582582 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
583583 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
584584 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
585 define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
586 %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
585 define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
586 %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
587587 %ext = zext <32 x i16> %load to <32 x i64>
588588 store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
589589 ret void
595595 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
596596 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
597597 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
598 define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
599 %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
598 define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(4)* %in) #0 {
599 %load = load <32 x i16>, <32 x i16> addrspace(4)* %in
600600 %ext = sext <32 x i16> %load to <32 x i64>
601601 store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
602602 ret void
605605 ; These trigger undefined register machine verifier errors
606606
607607 ; ; XFUNC-LABEL: {{^}}constant_zextload_v64i16_to_v64i64:
608 ; define amdgpu_kernel void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
609 ; %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
608 ; define amdgpu_kernel void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
609 ; %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
610610 ; %ext = zext <64 x i16> %load to <64 x i64>
611611 ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
612612 ; ret void
613613 ; }
614614
615615 ; ; XFUNC-LABEL: {{^}}constant_sextload_v64i16_to_v64i64:
616 ; define amdgpu_kernel void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
617 ; %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
616 ; define amdgpu_kernel void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(4)* %in) #0 {
617 ; %load = load <64 x i16>, <64 x i16> addrspace(4)* %in
618618 ; %ext = sext <64 x i16> %load to <64 x i64>
619619 ; store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
620620 ; ret void
66 ; GCN: s_load_dword s{{[0-9]+}}
77
88 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
9 define amdgpu_kernel void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
10 entry:
11 %ld = load i32, i32 addrspace(2)* %in
9 define amdgpu_kernel void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(4)* %in) #0 {
10 entry:
11 %ld = load i32, i32 addrspace(4)* %in
1212 store i32 %ld, i32 addrspace(1)* %out
1313 ret void
1414 }
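; As the unchanged GCN and EG check lines above indicate, a load from the
; constant address space still selects the scalar path on GCN (s_load_dword)
; and the vertex-fetch path on R600/EG (VTX_READ_32); renumbering the address
; space from 2 to 4 does not disturb either selection.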
1717 ; GCN: s_load_dwordx2
1818
1919 ; EG: VTX_READ_64
20 define amdgpu_kernel void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
21 entry:
22 %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in
20 define amdgpu_kernel void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(4)* %in) #0 {
21 entry:
22 %ld = load <2 x i32>, <2 x i32> addrspace(4)* %in