llvm.org GIT mirror, llvm, commit a2386c3

[AMDGPU] Switch to the new addr space mapping by default

This requires a corresponding clang change.

Differential Revision: https://reviews.llvm.org/D40955
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@324101 91177308-0d34-0410-b5e6-96231b3b80d8

Yaxun Liu, 1 year, 9 months ago
104 changed files with 3726 additions and 3746 deletions.
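For orientation, summarizing the hunks that follow: the patch makes the "new" AMDGPU address-space mapping the default instead of gating it behind the amdgiz/amdgizcl environment names. Under the new mapping, flat pointers live in address space 0 and private (scratch) pointers in address space 5, so most of the diff is test churn rewriting i32 addrspace(4)* (old flat) to plain i32*, and plain i32* (old private) to i32 addrspace(5)*. A minimal IR sketch of the new default, following the same patterns as the updated tests (the kernel name is hypothetical, not from the patch):

; Allocas are in addrspace(5) (private); a bare pointer is a flat pointer.
define amdgpu_kernel void @example() {
  %buf = alloca i32, align 4, addrspace(5)
  %flat = addrspacecast i32 addrspace(5)* %buf to i32*
  store volatile i32 1, i32* %flat
  ret void
}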
@@ -259 +259 @@
 static StringRef computeDataLayout(const Triple &TT) {
   if (TT.getArch() == Triple::r600) {
     // 32-bit pointers.
-    if (TT.getEnvironmentName() == "amdgiz" ||
-        TT.getEnvironmentName() == "amdgizcl")
     return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
            "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
-    return "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
-           "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
   }

   // 32-bit private, local, and region pointers. 64-bit global, constant and
   // flat.
-  if (TT.getEnvironmentName() == "amdgiz" ||
-      TT.getEnvironmentName() == "amdgizcl")
   return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32"
          "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
          "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
-  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
-         "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
-         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
 }

 LLVM_READNONE
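A side note on the strings above: the trailing A5 datalayout component declares that allocas are created in address space 5 (private). This matches the -data-layout=A5 flags added to the opt RUN lines in the tests below, presumably because those opt invocations have no TargetMachine to supply the layout, so the alloca address space must be stated explicitly. A minimal sketch of a standalone module under that assumption (not taken from the patch):

target datalayout = "A5"   ; allocas default to address space 5

define void @f() {
  %x = alloca i32, align 4, addrspace(5)   ; consistent with the A5 component
  ret void
}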
@@ -934 +934 @@
 namespace AMDGPU {

 AMDGPUAS getAMDGPUAS(Triple T) {
-  auto Env = T.getEnvironmentName();
   AMDGPUAS AS;
-  if (Env == "amdgiz" || Env == "amdgizcl") {
-    AS.FLAT_ADDRESS = 0;
-    AS.PRIVATE_ADDRESS = 5;
-    AS.REGION_ADDRESS = 4;
-  }
-  else {
-    AS.FLAT_ADDRESS = 4;
-    AS.PRIVATE_ADDRESS = 0;
-    AS.REGION_ADDRESS = 5;
-  }
+  AS.FLAT_ADDRESS = 0;
+  AS.PRIVATE_ADDRESS = 5;
+  AS.REGION_ADDRESS = 4;
   return AS;
 }

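Spelled out as a table, the hunk above changes the default numeric mapping as follows:

  address space      old default   new default
  FLAT_ADDRESS       4             0
  PRIVATE_ADDRESS    0             5
  REGION_ADDRESS     5             4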
@@ -0 +0 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck %s

 ; CHECK: 'addrspacecast_global_to_flat'
-; CHECK: estimated cost of 0 for {{.*}} addrspacecast i8 addrspace(1)* %ptr to i8 addrspace(4)*
-define i8 addrspace(4)* @addrspacecast_global_to_flat(i8 addrspace(1)* %ptr) #0 {
-  %cast = addrspacecast i8 addrspace(1)* %ptr to i8 addrspace(4)*
-  ret i8 addrspace(4)* %cast
+; CHECK: estimated cost of 0 for {{.*}} addrspacecast i8 addrspace(1)* %ptr to i8*
+define i8* @addrspacecast_global_to_flat(i8 addrspace(1)* %ptr) #0 {
+  %cast = addrspacecast i8 addrspace(1)* %ptr to i8*
+  ret i8* %cast
 }

 ; CHECK: 'addrspacecast_global_to_flat_v2'
-; CHECK: estimated cost of 0 for {{.*}} addrspacecast <2 x i8 addrspace(1)*> %ptr to <2 x i8 addrspace(4)*>
-define <2 x i8 addrspace(4)*> @addrspacecast_global_to_flat_v2(<2 x i8 addrspace(1)*> %ptr) #0 {
-  %cast = addrspacecast <2 x i8 addrspace(1)*> %ptr to <2 x i8 addrspace(4)*>
-  ret <2 x i8 addrspace(4)*> %cast
+; CHECK: estimated cost of 0 for {{.*}} addrspacecast <2 x i8 addrspace(1)*> %ptr to <2 x i8*>
+define <2 x i8*> @addrspacecast_global_to_flat_v2(<2 x i8 addrspace(1)*> %ptr) #0 {
+  %cast = addrspacecast <2 x i8 addrspace(1)*> %ptr to <2 x i8*>
+  ret <2 x i8*> %cast
 }

 ; CHECK: 'addrspacecast_global_to_flat_v32'
-; CHECK: estimated cost of 0 for {{.*}} addrspacecast <32 x i8 addrspace(1)*> %ptr to <32 x i8 addrspace(4)*>
-define <32 x i8 addrspace(4)*> @addrspacecast_global_to_flat_v32(<32 x i8 addrspace(1)*> %ptr) #0 {
-  %cast = addrspacecast <32 x i8 addrspace(1)*> %ptr to <32 x i8 addrspace(4)*>
-  ret <32 x i8 addrspace(4)*> %cast
+; CHECK: estimated cost of 0 for {{.*}} addrspacecast <32 x i8 addrspace(1)*> %ptr to <32 x i8*>
+define <32 x i8*> @addrspacecast_global_to_flat_v32(<32 x i8 addrspace(1)*> %ptr) #0 {
+  %cast = addrspacecast <32 x i8 addrspace(1)*> %ptr to <32 x i8*>
+  ret <32 x i8*> %cast
 }

 ; CHECK: 'addrspacecast_local_to_flat'
-; CHECK: estimated cost of 1 for {{.*}} addrspacecast i8 addrspace(3)* %ptr to i8 addrspace(4)*
-define i8 addrspace(4)* @addrspacecast_local_to_flat(i8 addrspace(3)* %ptr) #0 {
-  %cast = addrspacecast i8 addrspace(3)* %ptr to i8 addrspace(4)*
-  ret i8 addrspace(4)* %cast
+; CHECK: estimated cost of 1 for {{.*}} addrspacecast i8 addrspace(3)* %ptr to i8*
+define i8* @addrspacecast_local_to_flat(i8 addrspace(3)* %ptr) #0 {
+  %cast = addrspacecast i8 addrspace(3)* %ptr to i8*
+  ret i8* %cast
 }

 ; CHECK: 'addrspacecast_local_to_flat_v2'
-; CHECK: estimated cost of 2 for {{.*}} addrspacecast <2 x i8 addrspace(3)*> %ptr to <2 x i8 addrspace(4)*>
-define <2 x i8 addrspace(4)*> @addrspacecast_local_to_flat_v2(<2 x i8 addrspace(3)*> %ptr) #0 {
-  %cast = addrspacecast <2 x i8 addrspace(3)*> %ptr to <2 x i8 addrspace(4)*>
-  ret <2 x i8 addrspace(4)*> %cast
+; CHECK: estimated cost of 2 for {{.*}} addrspacecast <2 x i8 addrspace(3)*> %ptr to <2 x i8*>
+define <2 x i8*> @addrspacecast_local_to_flat_v2(<2 x i8 addrspace(3)*> %ptr) #0 {
+  %cast = addrspacecast <2 x i8 addrspace(3)*> %ptr to <2 x i8*>
+  ret <2 x i8*> %cast
 }

 ; CHECK: 'addrspacecast_local_to_flat_v32'
-; CHECK: estimated cost of 32 for {{.*}} addrspacecast <32 x i8 addrspace(3)*> %ptr to <32 x i8 addrspace(4)*>
-define <32 x i8 addrspace(4)*> @addrspacecast_local_to_flat_v32(<32 x i8 addrspace(3)*> %ptr) #0 {
-  %cast = addrspacecast <32 x i8 addrspace(3)*> %ptr to <32 x i8 addrspace(4)*>
-  ret <32 x i8 addrspace(4)*> %cast
+; CHECK: estimated cost of 32 for {{.*}} addrspacecast <32 x i8 addrspace(3)*> %ptr to <32 x i8*>
+define <32 x i8*> @addrspacecast_local_to_flat_v32(<32 x i8 addrspace(3)*> %ptr) #0 {
+  %cast = addrspacecast <32 x i8 addrspace(3)*> %ptr to <32 x i8*>
+  ret <32 x i8*> %cast
 }

 attributes #0 = { nounwind readnone }
@@ -3 +3 @@
 ; CHECK-NEXT: s_nop 0
 ; CHECK-NEXT: ;;#ASMEND

-define void @foo(i32* %ptr) {
+define void @foo(i32 addrspace(5)* %ptr) {
   %tmp = tail call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm "s_nop 0", "=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,=v,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65"(i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2)
   %tmp2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } %tmp, 0
-  store i32 %tmp2, i32* %ptr, align 4
+  store i32 %tmp2, i32 addrspace(5)* %ptr, align 4
   ret void
 }
@@ -34 +34 @@
 ; CI: NumSgprs: {{[0-9][0-9]+}}
 ; GFX9: NumSgprs: {{[0-9]+}}
 define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
-  %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
-  store volatile i32 7, i32 addrspace(4)* %stof
+  %stof = addrspacecast i32 addrspace(3)* %ptr to i32*
+  store volatile i32 7, i32* %stof
   ret void
 }

@@ -72 +72 @@

 ; CI: NumSgprs: {{[0-9][0-9]+}}
 ; GFX9: NumSgprs: {{[0-9]+}}
-define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
-  %stof = addrspacecast i32* %ptr to i32 addrspace(4)*
-  store volatile i32 7, i32 addrspace(4)* %stof
+define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32 addrspace(5)* %ptr) #0 {
+  %stof = addrspacecast i32 addrspace(5)* %ptr to i32*
+  store volatile i32 7, i32* %stof
   ret void
 }

@@ -88 +88 @@
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
 ; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
 define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
-  %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
-  store volatile i32 7, i32 addrspace(4)* %stof
+  %stof = addrspacecast i32 addrspace(1)* %ptr to i32*
+  store volatile i32 7, i32* %stof
   ret void
 }

@@ -100 +100 @@
 ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
 ; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
 define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
-  %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
-  %ld = load volatile i32, i32 addrspace(4)* %stof
+  %stof = addrspacecast i32 addrspace(2)* %ptr to i32*
+  %ld = load volatile i32, i32* %stof
   ret void
 }

@@ -116 +116 @@
 ; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
 ; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
-define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
-  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
+define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32* %ptr) #0 {
+  %ftos = addrspacecast i32* %ptr to i32 addrspace(3)*
   store volatile i32 0, i32 addrspace(3)* %ftos
   ret void
 }

@@ -133 +133 @@
 ; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], 0, v[[VPTR_LO]]
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
 ; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
-define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
-  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
-  store volatile i32 0, i32* %ftos
+define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32* %ptr) #0 {
+  %ftos = addrspacecast i32* %ptr to i32 addrspace(5)*
+  store volatile i32 0, i32 addrspace(5)* %ftos
   ret void
 }

@@ -147 +147 @@
 ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
 ; HSA: {{flat|global}}_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
-define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
-  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
+define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32* %ptr) #0 {
+  %ftos = addrspacecast i32* %ptr to i32 addrspace(1)*
   store volatile i32 0, i32 addrspace(1)* %ftos
   ret void
 }

@@ -158 +158 @@

 ; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
 ; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0
-define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 {
-  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
+define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #0 {
+  %ftos = addrspacecast i32* %ptr to i32 addrspace(2)*
   load volatile i32, i32 addrspace(2)* %ftos
   ret void
 }
@@ -177 +177 @@
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
 ; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
 define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 {
-  %cast = addrspacecast i32 addrspace(3)* null to i32 addrspace(4)*
-  store volatile i32 7, i32 addrspace(4)* %cast
+  %cast = addrspacecast i32 addrspace(3)* null to i32*
+  store volatile i32 7, i32* %cast
   ret void
 }

@@ -187 +187 @@
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
 ; HSA: ds_write_b32 [[PTR]], [[K]]
 define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 {
-  %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(3)*
+  %cast = addrspacecast i32* null to i32 addrspace(3)*
   store volatile i32 7, i32 addrspace(3)* %cast
   ret void
 }

@@ -198 +198 @@
 ; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
 define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 {
-  %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32 addrspace(4)*
-  store volatile i32 7, i32 addrspace(4)* %cast
+  %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32*
+  store volatile i32 7, i32* %cast
   ret void
 }

@@ -208 +208 @@
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
 ; HSA: ds_write_b32 [[PTR]], [[K]]
 define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
-  %cast = addrspacecast i32 addrspace(4)* inttoptr (i64 -1 to i32 addrspace(4)*) to i32 addrspace(3)*
+  %cast = addrspacecast i32* inttoptr (i64 -1 to i32*) to i32 addrspace(3)*
   store volatile i32 7, i32 addrspace(3)* %cast
   ret void
 }

@@ -223 +223 @@
 ; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; HSA: {{flat|global}}_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
 define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
-  %cast = addrspacecast i32* null to i32 addrspace(4)*
-  store volatile i32 7, i32 addrspace(4)* %cast
+  %cast = addrspacecast i32 addrspace(5)* null to i32*
+  store volatile i32 7, i32* %cast
   ret void
 }

@@ -232 +232 @@
 ; HSA: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
 ; HSA: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
 define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
-  %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(0)*
-  store volatile i32 7, i32* %cast
+  %cast = addrspacecast i32* null to i32 addrspace(5)*
+  store volatile i32 7, i32 addrspace(5)* %cast
   ret void
 }

@@ -249 +249 @@
   br i1 %cmp, label %local, label %global

 local:
-  %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)*
+  %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32*
   br label %end

 global:
-  %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
+  %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32*
   br label %end

 end:
-  %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ]
-  store volatile i32 %x, i32 addrspace(4)* %fptr, align 4
-  ; %val = load i32, i32 addrspace(4)* %fptr, align 4
+  %fptr = phi i32* [ %flat_local, %local ], [ %flat_global, %global ]
+  store volatile i32 %x, i32* %fptr, align 4
+  ; %val = load i32, i32* %fptr, align 4
   ; store i32 %val, i32 addrspace(1)* %out, align 4
   ret void
 }

@@ -277 +277 @@
 ; HSA: s_barrier
 ; HSA: {{flat|global}}_load_dword
 define amdgpu_kernel void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
-  %alloca = alloca i32, i32 9, align 4
+  %alloca = alloca i32, i32 9, align 4, addrspace(5)
   %x = call i32 @llvm.amdgcn.workitem.id.x() #2
-  %pptr = getelementptr i32, i32* %alloca, i32 %x
-  %fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
-  store volatile i32 %x, i32 addrspace(4)* %fptr
+  %pptr = getelementptr i32, i32 addrspace(5)* %alloca, i32 %x
+  %fptr = addrspacecast i32 addrspace(5)* %pptr to i32*
+  store volatile i32 %x, i32* %fptr
   ; Dummy call
   call void @llvm.amdgcn.s.barrier() #1
-  %reload = load volatile i32, i32 addrspace(4)* %fptr, align 4
+  %reload = load volatile i32, i32* %fptr, align 4
   store volatile i32 %reload, i32 addrspace(1)* %out, align 4
   ret void
 }
@@ -16 +16 @@
 ; GCN: buffer_store_dword [[RESULT]]
 define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
 entry:
-  %0 = alloca [2 x i32]
-  %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0
-  %2 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 1
-  store i32 0, i32* %1
-  store i32 1, i32* %2
-  %3 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 %in
-  %4 = load i32, i32* %3
+  %0 = alloca [2 x i32], addrspace(5)
+  %1 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 0
+  %2 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 1
+  store i32 0, i32 addrspace(5)* %1
+  store i32 1, i32 addrspace(5)* %2
+  %3 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %0, i32 0, i32 %in
+  %4 = load i32, i32 addrspace(5)* %3
   %5 = call i32 @llvm.amdgcn.workitem.id.x()
   %6 = add i32 %4, %5
   store i32 %6, i32 addrspace(1)* %out
@@ -0 +0 @@
+; RUN: opt -mtriple=amdgcn-- -O3 -aa-eval -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s
 ; RUN: opt -mtriple=r600-- -O3 -aa-eval -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s

-; CHECK: NoAlias: i8 addrspace(1)* %p1, i8* %p
+; CHECK: NoAlias: i8 addrspace(1)* %p1, i8 addrspace(5)* %p

-define void @test(i8* %p, i8 addrspace(1)* %p1) {
+define void @test(i8 addrspace(5)* %p, i8 addrspace(1)* %p1) {
   ret void
 }

@@ -0 +0 @@
-; RUN: opt -mtriple=amdgcn--amdhsa -O3 -S -amdgpu-function-calls -inline-threshold=1 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INL1 %s
-; RUN: opt -mtriple=amdgcn--amdhsa -O3 -S -amdgpu-function-calls < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INLDEF %s
+; RUN: opt -mtriple=amdgcn--amdhsa -data-layout=A5 -O3 -S -amdgpu-function-calls -inline-threshold=1 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INL1 %s
+; RUN: opt -mtriple=amdgcn--amdhsa -data-layout=A5 -O3 -S -amdgpu-function-calls < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-INLDEF %s

 define coldcc float @foo(float %x, float %y) {
 entry:
@@ -9 +9 @@
   ret float %cond
 }

-define coldcc void @foo_private_ptr(float* nocapture %p) {
+define coldcc void @foo_private_ptr(float addrspace(5)* nocapture %p) {
 entry:
-  %tmp1 = load float, float* %p, align 4
+  %tmp1 = load float, float addrspace(5)* %p, align 4
   %cmp = fcmp ogt float %tmp1, 1.000000e+00
   br i1 %cmp, label %if.then, label %if.end

 if.then: ; preds = %entry
   %div = fdiv float 1.000000e+00, %tmp1
-  store float %div, float* %p, align 4
+  store float %div, float addrspace(5)* %p, align 4
   br label %if.end

 if.end: ; preds = %if.then, %entry
   ret void
 }

-define coldcc void @foo_private_ptr2(float* nocapture %p1, float* nocapture %p2) {
+define coldcc void @foo_private_ptr2(float addrspace(5)* nocapture %p1, float addrspace(5)* nocapture %p2) {
 entry:
-  %tmp1 = load float, float* %p1, align 4
+  %tmp1 = load float, float addrspace(5)* %p1, align 4
   %cmp = fcmp ogt float %tmp1, 1.000000e+00
   br i1 %cmp, label %if.then, label %if.end

 if.then: ; preds = %entry
   %div = fdiv float 2.000000e+00, %tmp1
-  store float %div, float* %p2, align 4
+  store float %div, float addrspace(5)* %p2, align 4
   br label %if.end

 if.end: ; preds = %if.then, %entry
@@ -45 +45 @@
   ret float %call
 }

-define void @foo_noinline(float* nocapture %p) #0 {
+define void @foo_noinline(float addrspace(5)* nocapture %p) #0 {
 entry:
-  %tmp1 = load float, float* %p, align 4
+  %tmp1 = load float, float addrspace(5)* %p, align 4
   %mul = fmul float %tmp1, 2.000000e+00
-  store float %mul, float* %p, align 4
+  store float %mul, float addrspace(5)* %p, align 4
   ret void
 }

@@ -62 +62 @@
 ; GCN: tail call float @_Z3sinf(
 define amdgpu_kernel void @test_inliner(float addrspace(1)* nocapture %a, i32 %n) {
 entry:
-  %pvt_arr = alloca [64 x float], align 4
+  %pvt_arr = alloca [64 x float], align 4, addrspace(5)
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i32 %tid
   %tmp2 = load float, float addrspace(1)* %arrayidx, align 4
@@ -71 +71 @@
   %tmp5 = load float, float addrspace(1)* %arrayidx2, align 4
   %c1 = tail call coldcc float @foo(float %tmp2, float %tmp5)
   %or = or i32 %tid, %n
-  %arrayidx5 = getelementptr inbounds [64 x float], [64 x float]* %pvt_arr, i32 0, i32 %or
-  store float %c1, float* %arrayidx5, align 4
-  %arrayidx7 = getelementptr inbounds [64 x float], [64 x float]* %pvt_arr, i32 0, i32 %or
-  call coldcc void @foo_private_ptr(float* %arrayidx7)
-  %arrayidx8 = getelementptr inbounds [64 x float], [64 x float]* %pvt_arr, i32 0, i32 1
-  %arrayidx9 = getelementptr inbounds [64 x float], [64 x float]* %pvt_arr, i32 0, i32 2
-  call coldcc void @foo_private_ptr2(float* %arrayidx8, float* %arrayidx9)
-  call void @foo_noinline(float* %arrayidx7)
+  %arrayidx5 = getelementptr inbounds [64 x float], [64 x float] addrspace(5)* %pvt_arr, i32 0, i32 %or
+  store float %c1, float addrspace(5)* %arrayidx5, align 4
+  %arrayidx7 = getelementptr inbounds [64 x float], [64 x float] addrspace(5)* %pvt_arr, i32 0, i32 %or
+  call coldcc void @foo_private_ptr(float addrspace(5)* %arrayidx7)
+  %arrayidx8 = getelementptr inbounds [64 x float], [64 x float] addrspace(5)* %pvt_arr, i32 0, i32 1
+  %arrayidx9 = getelementptr inbounds [64 x float], [64 x float] addrspace(5)* %pvt_arr, i32 0, i32 2
+  call coldcc void @foo_private_ptr2(float addrspace(5)* %arrayidx8, float addrspace(5)* %arrayidx9)
+  call void @foo_noinline(float addrspace(5)* %arrayidx7)
   %and = and i32 %tid, %n
-  %arrayidx11 = getelementptr inbounds [64 x float], [64 x float]* %pvt_arr, i32 0, i32 %and
-  %tmp12 = load float, float* %arrayidx11, align 4
+  %arrayidx11 = getelementptr inbounds [64 x float], [64 x float] addrspace(5)* %pvt_arr, i32 0, i32 %and
+  %tmp12 = load float, float addrspace(5)* %arrayidx11, align 4
   %c2 = call coldcc float @sin_wrapper(float %tmp12)
-  store float %c2, float* %arrayidx7, align 4
+  store float %c2, float addrspace(5)* %arrayidx7, align 4
   %xor = xor i32 %tid, %n
-  %arrayidx16 = getelementptr inbounds [64 x float], [64 x float]* %pvt_arr, i32 0, i32 %xor
-  %tmp16 = load float, float* %arrayidx16, align 4
+  %arrayidx16 = getelementptr inbounds [64 x float], [64 x float] addrspace(5)* %pvt_arr, i32 0, i32 %xor
+  %tmp16 = load float, float addrspace(5)* %arrayidx16, align 4
   store float %tmp16, float addrspace(1)* %arrayidx, align 4
   ret void
 }
@@ -95 +95 @@
 ; GCN: %div.i{{[0-9]*}} = fdiv float 2.000000e+00, %tmp1.i
 define amdgpu_kernel void @test_inliner_multi_pvt_ptr(float addrspace(1)* nocapture %a, i32 %n, float %v) {
 entry:
-  %pvt_arr1 = alloca [32 x float], align 4
-  %pvt_arr2 = alloca [32 x float], align 4
+  %pvt_arr1 = alloca [32 x float], align 4, addrspace(5)
+  %pvt_arr2 = alloca [32 x float], align 4, addrspace(5)
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i32 %tid
   %or = or i32 %tid, %n
-  %arrayidx4 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr1, i32 0, i32 %or
-  %arrayidx5 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr2, i32 0, i32 %or
-  store float %v, float* %arrayidx4, align 4
-  store float %v, float* %arrayidx5, align 4
-  %arrayidx8 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr1, i32 0, i32 1
-  %arrayidx9 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr2, i32 0, i32 2
-  call coldcc void @foo_private_ptr2(float* %arrayidx8, float* %arrayidx9)
+  %arrayidx4 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr1, i32 0, i32 %or
+  %arrayidx5 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr2, i32 0, i32 %or
+  store float %v, float addrspace(5)* %arrayidx4, align 4
+  store float %v, float addrspace(5)* %arrayidx5, align 4
+  %arrayidx8 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr1, i32 0, i32 1
+  %arrayidx9 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr2, i32 0, i32 2
+  call coldcc void @foo_private_ptr2(float addrspace(5)* %arrayidx8, float addrspace(5)* %arrayidx9)
   %xor = xor i32 %tid, %n
-  %arrayidx15 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr1, i32 0, i32 %xor
-  %arrayidx16 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr2, i32 0, i32 %xor
-  %tmp15 = load float, float* %arrayidx15, align 4
-  %tmp16 = load float, float* %arrayidx16, align 4
+  %arrayidx15 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr1, i32 0, i32 %xor
+  %arrayidx16 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr2, i32 0, i32 %xor
+  %tmp15 = load float, float addrspace(5)* %arrayidx15, align 4
+  %tmp16 = load float, float addrspace(5)* %arrayidx16, align 4
   %tmp17 = fadd float %tmp15, %tmp16
   store float %tmp17, float addrspace(1)* %arrayidx, align 4
   ret void
@@ -122 +122 @@
 ; GCN-INLDEF: %div.i{{[0-9]*}} = fdiv float 2.000000e+00, %tmp1.i
 define amdgpu_kernel void @test_inliner_multi_pvt_ptr_cutoff(float addrspace(1)* nocapture %a, i32 %n, float %v) {
 entry:
-  %pvt_arr1 = alloca [32 x float], align 4
-  %pvt_arr2 = alloca [33 x float], align 4
+  %pvt_arr1 = alloca [32 x float], align 4, addrspace(5)
+  %pvt_arr2 = alloca [33 x float], align 4, addrspace(5)
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
   %arrayidx = getelementptr inbounds float, float addrspace(1)* %a, i32 %tid
   %or = or i32 %tid, %n
-  %arrayidx4 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr1, i32 0, i32 %or
-  %arrayidx5 = getelementptr inbounds [33 x float], [33 x float]* %pvt_arr2, i32 0, i32 %or
-  store float %v, float* %arrayidx4, align 4
-  store float %v, float* %arrayidx5, align 4
-  %arrayidx8 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr1, i32 0, i32 1
-  %arrayidx9 = getelementptr inbounds [33 x float], [33 x float]* %pvt_arr2, i32 0, i32 2
-  call coldcc void @foo_private_ptr2(float* %arrayidx8, float* %arrayidx9)
+  %arrayidx4 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr1, i32 0, i32 %or
+  %arrayidx5 = getelementptr inbounds [33 x float], [33 x float] addrspace(5)* %pvt_arr2, i32 0, i32 %or
+  store float %v, float addrspace(5)* %arrayidx4, align 4
+  store float %v, float addrspace(5)* %arrayidx5, align 4
+  %arrayidx8 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr1, i32 0, i32 1
+  %arrayidx9 = getelementptr inbounds [33 x float], [33 x float] addrspace(5)* %pvt_arr2, i32 0, i32 2
+  call coldcc void @foo_private_ptr2(float addrspace(5)* %arrayidx8, float addrspace(5)* %arrayidx9)
   %xor = xor i32 %tid, %n
-  %arrayidx15 = getelementptr inbounds [32 x float], [32 x float]* %pvt_arr1, i32 0, i32 %xor
-  %arrayidx16 = getelementptr inbounds [33 x float], [33 x float]* %pvt_arr2, i32 0, i32 %xor
-  %tmp15 = load float, float* %arrayidx15, align 4
-  %tmp16 = load float, float* %arrayidx16, align 4
+  %arrayidx15 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %pvt_arr1, i32 0, i32 %xor
+  %arrayidx16 = getelementptr inbounds [33 x float], [33 x float] addrspace(5)* %pvt_arr2, i32 0, i32 %xor
+  %tmp15 = load float, float addrspace(5)* %arrayidx15, align 4
+  %tmp16 = load float, float addrspace(5)* %arrayidx16, align 4
   %tmp17 = fadd float %tmp15, %tmp16
   store float %tmp17, float addrspace(1)* %arrayidx, align 4
   ret void
@@ -4 +4 @@
 ; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s

-; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -enable-var-scope -check-prefix=HSAOPT -check-prefix=OPT %s
-; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -enable-var-scope -check-prefix=NOHSAOPT -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -data-layout=A5 -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -enable-var-scope -check-prefix=HSAOPT -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -data-layout=A5 -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -enable-var-scope -check-prefix=NOHSAOPT -check-prefix=OPT %s

 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC

@@ -79 +79 @@
 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !1
 define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
-  %stack = alloca [5 x i32], align 4
+  %stack = alloca [5 x i32], align 4, addrspace(5)
   %0 = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
-  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
+  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
   %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
-  store i32 5, i32* %arrayidx3, align 4
-  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
-  %2 = load i32, i32* %arrayidx10, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
+  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
+  %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
   store i32 %2, i32 addrspace(1)* %out, align 4
-  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
-  %3 = load i32, i32* %arrayidx12
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
+  %3 = load i32, i32 addrspace(5)* %arrayidx12
   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
   store i32 %3, i32 addrspace(1)* %arrayidx13
   ret void
@@ -101 +101 @@
 ; OPT: getelementptr inbounds [256 x [8 x i32]], [256 x [8 x i32]] addrspace(3)* @high_alignment.stack, i32 0, i32 %{{[0-9]+}}
 define amdgpu_kernel void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
-  %stack = alloca [8 x i32], align 16
+  %stack = alloca [8 x i32], align 16, addrspace(5)
   %0 = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr inbounds [8 x i32], [8 x i32]* %stack, i32 0, i32 %0
-  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx1 = getelementptr inbounds [8 x i32], [8 x i32] addrspace(5)* %stack, i32 0, i32 %0
+  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
   %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [8 x i32], [8 x i32]* %stack, i32 0, i32 %1
-  store i32 5, i32* %arrayidx3, align 4
-  %arrayidx10 = getelementptr inbounds [8 x i32], [8 x i32]* %stack, i32 0, i32 0
-  %2 = load i32, i32* %arrayidx10, align 4
+  %arrayidx3 = getelementptr inbounds [8 x i32], [8 x i32] addrspace(5)* %stack, i32 0, i32 %1
+  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [8 x i32], [8 x i32] addrspace(5)* %stack, i32 0, i32 0
+  %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
   store i32 %2, i32 addrspace(1)* %out, align 4
-  %arrayidx12 = getelementptr inbounds [8 x i32], [8 x i32]* %stack, i32 0, i32 1
-  %3 = load i32, i32* %arrayidx12
+  %arrayidx12 = getelementptr inbounds [8 x i32], [8 x i32] addrspace(5)* %stack, i32 0, i32 1
+  %3 = load i32, i32 addrspace(5)* %arrayidx12
   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
   store i32 %3, i32 addrspace(1)* %arrayidx13
   ret void
@@ -126 +126 @@
 ; SI-NOT: ds_write
 define amdgpu_kernel void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
-  %stack = alloca [5 x i32], align 4
+  %stack = alloca [5 x i32], align 4, addrspace(5)
   %0 = load i32, i32 addrspace(1)* %in, align 4
-  %arrayidx1 = getelementptr [5 x i32], [5 x i32]* %stack, i32 0, i32 %0
-  store i32 4, i32* %arrayidx1, align 4
+  %arrayidx1 = getelementptr [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %0
+  store i32 4, i32 addrspace(5)* %arrayidx1, align 4
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
   %1 = load i32, i32 addrspace(1)* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %1
-  store i32 5, i32* %arrayidx3, align 4
-  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 0
-  %2 = load i32, i32* %arrayidx10, align 4
+  %arrayidx3 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 %1
+  store i32 5, i32 addrspace(5)* %arrayidx3, align 4
+  %arrayidx10 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 0
+  %2 = load i32, i32 addrspace(5)* %arrayidx10, align 4
   store i32 %2, i32 addrspace(1)* %out, align 4
-  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 1
-  %3 = load i32, i32* %arrayidx12
+  %arrayidx12 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(5)* %stack, i32 0, i32 1
+  %3 = load i32, i32 addrspace(5)* %arrayidx12
   %arrayidx13 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 1
   store i32 %3, i32 addrspace(1)* %arrayidx13
   ret void
@@ -161 +161 @@

 define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 {
 entry:
-  %a = alloca %struct.point
-  %b = alloca %struct.point
-  %a.x.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0
-  %a.y.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 1
-  %b.x.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0
-  %b.y.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 1
-  store i32 0, i32* %a.x.ptr
-  store i32 1, i32* %a.y.ptr
-  store i32 2, i32* %b.x.ptr
-  store i32 3, i32* %b.y.ptr
-  %a.indirect.ptr = getelementptr %struct.point, %struct.point* %a, i32 0, i32 0
-  %b.indirect.ptr = getelementptr %struct.point, %struct.point* %b, i32 0, i32 0
-  %a.indirect = load i32, i32* %a.indirect.ptr
-  %b.indirect = load i32, i32* %b.indirect.ptr
+  %a = alloca %struct.point, addrspace(5)
+  %b = alloca %struct.point, addrspace(5)
+  %a.x.ptr = getelementptr %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 0
+  %a.y.ptr = getelementptr %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 1
+  %b.x.ptr = getelementptr %struct.point, %struct.point addrspace(5)* %b, i32 0, i32 0
+  %b.y.ptr = getelementptr %struct.point, %struct.point addrspace(5)* %b, i32 0, i32 1
+  store i32 0, i32 addrspace(5)* %a.x.ptr
+  store i32 1, i32 addrspace(5)* %a.y.ptr
+  store i32 2, i32 addrspace(5)* %b.x.ptr
+  store i32 3, i32 addrspace(5)* %b.y.ptr
+  %a.indirect.ptr = getelementptr %struct.point, %struct.point addrspace(5)* %a, i32 0, i32 0
+  %b.indirect.ptr = getelementptr %struct.point, %struct.point addrspace(5)* %b, i32 0, i32 0
+  %a.indirect = load i32, i32 addrspace(5)* %a.indirect.ptr
+  %b.indirect = load i32, i32 addrspace(5)* %b.indirect.ptr
   %0 = add i32 %a.indirect, %b.indirect
   store i32 %0, i32 addrspace(1)* %out
   ret void
@@ -190 +190 @@

 define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 entry:
-  %prv_array_const = alloca [2 x i32]
-  %prv_array = alloca [2 x i32]
+  %prv_array_const = alloca [2 x i32], addrspace(5)
+  %prv_array = alloca [2 x i32], addrspace(5)
   %a = load i32, i32 addrspace(1)* %in
   %b_src_ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
   %b = load i32, i32 addrspace(1)* %b_src_ptr
-  %a_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
-  store i32 %a, i32* %a_dst_ptr
-  %b_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 1
-  store i32 %b, i32* %b_dst_ptr
+  %a_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array_const, i32 0, i32 0
+  store i32 %a, i32 addrspace(5)* %a_dst_ptr
+  %b_dst_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array_const, i32 0, i32 1
+  store i32 %b, i32 addrspace(5)* %b_dst_ptr
   br label %for.body

 for.body:
   %inc = phi i32 [0, %entry], [%count, %for.body]
-  %x_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array_const, i32 0, i32 0
-  %x = load i32, i32* %x_ptr
-  %y_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
-  %y = load i32, i32* %y_ptr
+  %x_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array_const, i32 0, i32 0
+  %x = load i32, i32 addrspace(5)* %x_ptr
+  %y_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array, i32 0, i32 0
+  %y = load i32, i32 addrspace(5)* %y_ptr
   %xy = add i32 %x, %y
-  store i32 %xy, i32* %y_ptr
+  store i32 %xy, i32 addrspace(5)* %y_ptr
   %count = add i32 %inc, 1
   %done = icmp eq i32 %count, 4095
   br i1 %done, label %for.end, label %for.body

 for.end:
-  %value_ptr = getelementptr inbounds [2 x i32], [2 x i32]* %prv_array, i32 0, i32 0
-  %value = load i32, i32* %value_ptr
+  %value_ptr = getelementptr inbounds [2 x i32], [2 x i32] addrspace(5)* %prv_array, i32 0, i32 0
+  %value = load i32, i32 addrspace(5)* %value_ptr
   store i32 %value, i32 addrspace(1)* %out
   ret void
 }
@@ -234 +234 @@
 ; SI-PROMOTE: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[SCALED_IDX]], 16
 define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
-  %0 = alloca [2 x i16]
-  %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0
-  %2 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 1
-  store i16 0, i16* %1
-  store i16 1, i16* %2
-  %3 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 %index
-  %4 = load i16, i16* %3
+  %0 = alloca [2 x i16], addrspace(5)
+  %1 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 0
+  %2 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 1
+  store i16 0, i16 addrspace(5)* %1
+  store i16 1, i16 addrspace(5)* %2
+  %3 = getelementptr inbounds [2 x i16], [2 x i16] addrspace(5)* %0, i32 0, i32 %index
+  %4 = load i16, i16 addrspace(5)* %3
   %5 = sext i16 %4 to i32
   store i32 %5, i32 addrspace(1)* %out
   ret void
@@ -257 +257 @@
 ; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: [0x05,0x00,0x60,0xe0
 define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
-  %0 = alloca [2 x i8]
-  %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0
-  %2 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 1
-  store i8 0, i8* %1
-  store i8 1, i8* %2
-  %3 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 %index
-  %4 = load i8, i8* %3
+  %0 = alloca [2 x i8], addrspace(5)
+  %1 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 0
+  %2 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 1
+  store i8 0, i8 addrspace(5)* %1
+  store i8 1, i8 addrspace(5)* %2
+  %3 = getelementptr inbounds [2 x i8], [2 x i8] addrspace(5)* %0, i32 0, i32 %index
+  %4 = load i8, i8 addrspace(5)* %3
   %5 = sext i8 %4 to i32
   store i32 %5, i32 addrspace(1)* %out
   ret void
@@ -280 +280 @@
 ; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ;
 define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
 entry:
-  %0 = alloca [3 x i8], align 1
-  %1 = alloca [2 x i8], align 1
-  %2 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 0
-  %3 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 1
-  %4 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 2
-  %5 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 0
-  %6 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 1
-  store i8 0, i8* %2
-  store i8 1, i8* %3
-  store i8 2, i8* %4
-  store i8 1, i8* %5
-  store i8 0, i8* %6
-  %7 = getelementptr [3 x i8], [3 x i8]* %0, i32 0, i32 %in
-  %8 = getelementptr [2 x i8], [2 x i8]* %1, i32 0, i32 %in
-  %9 = load i8, i8* %7
-  %10 = load i8, i8* %8
+  %0 = alloca [3 x i8], align 1, addrspace(5)
+  %1 = alloca [2 x i8], align 1, addrspace(5)
+  %2 = getelementptr [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 0
+  %3 = getelementptr [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 1
+  %4 = getelementptr [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 2
+  %5 = getelementptr [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 0
+  %6 = getelementptr [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 1
+  store i8 0, i8 addrspace(5)* %2
+  store i8 1, i8 addrspace(5)* %3
+  store i8 2, i8 addrspace(5)* %4
+  store i8 1, i8 addrspace(5)* %5
+  store i8 0, i8 addrspace(5)* %6
+  %7 = getelementptr [3 x i8], [3 x i8] addrspace(5)* %0, i32 0, i32 %in
+  %8 = getelementptr [2 x i8], [2 x i8] addrspace(5)* %1, i32 0, i32 %in
+  %9 = load i8, i8 addrspace(5)* %7
+  %10 = load i8, i8 addrspace(5)* %8
   %11 = add i8 %9, %10
   %12 = sext i8 %11 to i32
   store i32 %12, i32 addrspace(1)* %out
@@ -304 +304 @@

 define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
-  %alloca = alloca [2 x [2 x i8]]
-  %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
-  %gep1 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 1
-  store i8 0, i8* %gep0
-  store i8 1, i8* %gep1
-  %gep2 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 %index
-  %load = load i8, i8* %gep2
+  %alloca = alloca [2 x [2 x i8]], addrspace(5)
+  %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 0
+  %gep1 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 1
+  store i8 0, i8 addrspace(5)* %gep0
+  store i8 1, i8 addrspace(5)* %gep1
+  %gep2 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index
+  %load = load i8, i8 addrspace(5)* %gep2
   %sext = sext i8 %load to i32
   store i32 %sext, i32 addrspace(1)* %out
   ret void
@@ -318 +318 @@

 define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
-  %alloca = alloca [2 x [2 x i32]]
-  %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
-  %gep1 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 1
-  store i32 0, i32* %gep0
-  store i32 1, i32* %gep1
-  %gep2 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 %index
-  %load = load i32, i32* %gep2
+  %alloca = alloca [2 x [2 x i32]], addrspace(5)
+  %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0
+  %gep1 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1
+  store i32 0, i32 addrspace(5)* %gep0
+  store i32 1, i32 addrspace(5)* %gep1
+  %gep2 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index
+  %load = load i32, i32 addrspace(5)* %gep2
   store i32 %load, i32 addrspace(1)* %out
   ret void
 }

 define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
 entry:
-  %alloca = alloca [2 x [2 x i64]]
-  %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
-  %gep1 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 1
-  store i64 0, i64* %gep0
-  store i64 1, i64* %gep1
-  %gep2 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 %index
-  %load = load i64, i64* %gep2
+  %alloca = alloca [2 x [2 x i64]], addrspace(5)
+  %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 0
+  %gep1 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 1
+  store i64 0, i64 addrspace(5)* %gep0
+  store i64 1, i64 addrspace(5)* %gep1
+  %gep2 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index
+  %load = load i64, i64 addrspace(5)* %gep2
   store i64 %load, i64 addrspace(1)* %out
   ret void
 }
@@ -346 +346 @@

 define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
-  %alloca = alloca [2 x [2 x %struct.pair32]]
-  %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
-  %gep1 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 1, i32 1
-  store i32 0, i32* %gep0
-  store i32 1, i32* %gep1
-  %gep2 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 %index, i32 0
-  %load = load i32, i32* %gep2
+  %alloca = alloca [2 x [2 x %struct.pair32]], addrspace(5)
+  %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 0, i32 1
+  %gep1 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 1, i32 1
+  store i32 0, i32 addrspace(5)* %gep0
+  store i32 1, i32 addrspace(5)* %gep1
+  %gep2 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]] addrspace(5)* %alloca, i32 0, i32 0, i32 %index, i32 0
+  %load = load i32, i32 addrspace(5)* %gep2
   store i32 %load, i32 addrspace(1)* %out
   ret void
 }

 define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
-  %alloca = alloca [2 x %struct.pair32]
-  %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
-  %gep1 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 1, i32 0
-  store i32 0, i32* %gep0
-  store i32 1, i32* %gep1
-  %gep2 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 %index, i32 0
-  %load = load i32, i32* %gep2
+  %alloca = alloca [2 x %struct.pair32], addrspace(5)
+  %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 0, i32 1
+  %gep1 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 1, i32 0
+  store i32 0, i32 addrspace(5)* %gep0
+  store i32 1, i32 addrspace(5)* %gep1
+  %gep2 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32] addrspace(5)* %alloca, i32 0, i32 %index, i32 0
+  %load = load i32, i32 addrspace(5)* %gep2
   store i32 %load, i32 addrspace(1)* %out
   ret void
 }

 define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
 entry:
-  %tmp = alloca [2 x i32]
-  %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
-  %tmp2 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 1
-  store i32 0, i32* %tmp1
-  store i32 1, i32* %tmp2
+  %tmp = alloca [2 x i32], addrspace(5)
+  %tmp1 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 0
+  %tmp2 = getelementptr [2 x i32], [2 x i32] addrspace(5)* %tmp, i32 0, i32 1
+  store i32 0, i32 addrspace(5)* %tmp1
+  store i32 1, i32 addrspace(5)* %tmp2
   %cmp = icmp eq i32 %in, 0
-  %sel = select i1 %cmp, i32* %tmp1, i32* %tmp2
-  %load = load i32, i32* %sel
+  %sel = select i1 %cmp, i32 addrspace(5)* %tmp1, i32 addrspace(5)* %tmp2
+  %load = load i32, i32 addrspace(5)* %sel
   store i32 %load, i32 addrspace(1)* %out
   ret void
 }
@@ -393 +393 @@
 ; SI: v_add_{{[iu]}}32_e32 [[ADD_OFFSET:v[0-9]+]], vcc, 5,
 ; SI: buffer_load_dword v{{[0-9]+}}, [[ADD_OFFSET:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ;
 define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
-  %alloca = alloca [16 x i32]
-  %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
-  store i32 5, i32* %tmp0
-  %tmp1 = ptrtoint [16 x i32]* %alloca to i32
+  %alloca = alloca [16 x i32], addrspace(5)
+  %tmp0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a
+  store i32 5, i32 addrspace(5)* %tmp0
+  %tmp1 = ptrtoint [16 x i32] addrspace(5)* %alloca to i32
   %tmp2 = add i32 %tmp1, 5
-  %tmp3 = inttoptr i32 %tmp2 to i32*
-  %tmp4 = getelementptr i32, i32* %tmp3, i32 %b
-  %tmp5 = load i32, i32* %tmp4
+  %tmp3 = inttoptr i32 %tmp2 to i32 addrspace(5)*
+  %tmp4 = getelementptr i32, i32 addrspace(5)* %tmp3, i32 %b
+  %tmp5 = load i32, i32 addrspace(5)* %tmp4
   store i32 %tmp5, i32 addrspace(1)* %out
   ret void
 }
@@ -410 +410 @@
 ; OPT: load i32 addrspace(1)*, i32 addrspace(1)* addrspace(3)* %{{[0-9]+}}, align 4
 define amdgpu_kernel void @pointer_typed_alloca(i32 addrspace(1)* %A) {
 entry:
-  %A.addr = alloca i32 addrspace(1)*, align 4
-  store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
-  %ld0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4
+  %A.addr = alloca i32 addrspace(1)*, align 4, addrspace(5)
+  store i32 addrspace(1)* %A, i32 addrspace(1)* addrspace(5)* %A.addr, align 4
+  %ld0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %A.addr, align 4
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %ld0, i32 0
   store i32 1, i32 addrspace(1)* %arrayidx, align 4
-  %ld1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4
+  %ld1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %A.addr, align 4
   %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %ld1, i32 1
   store i32 2, i32 addrspace(1)* %arrayidx1, align 4
-  %ld2 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4
+  %ld2 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %A.addr, align 4
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %ld2, i32 2
   store i32 3, i32 addrspace(1)* %arrayidx2, align 4
   ret void
@@ -461 +461 @@
 ; SI: buffer_load_dword

 define amdgpu_kernel void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) {
-  %alloca = alloca [2 x <16 x i32>]
-  %tmp0 = getelementptr [2 x <16 x i32>], [2 x <16 x i32>]* %alloca, i32 0, i32 %a
-  %tmp5 = load <16 x i32>, <16 x i32>* %tmp0
+  %alloca = alloca [2 x <16 x i32>], addrspace(5)
+  %tmp0 = getelementptr [2 x <16 x i32>], [2 x <16 x i32>] addrspace(5)* %alloca, i32 0, i32 %a
+  %tmp5 = load <16 x i32>, <16 x i32> addrspace(5)* %tmp0
   store <16 x i32> %tmp5, <16 x i32> addrspace(1)* %out
   ret void
 }
@@ -505 +505 @@
 ; SI: buffer_load_dword

 define amdgpu_kernel void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) {
-  %alloca = alloca [2 x <16 x float>]
-  %tmp0 = getelementptr [2 x <16 x float>], [2 x <16 x float>]* %alloca, i32 0, i32 %a
-  %tmp5 = load <16 x float>, <16 x float>* %tmp0
+  %alloca = alloca [2 x <16 x float>], addrspace(5)
+  %tmp0 = getelementptr [2 x <16 x float>], [2 x <16 x float>] addrspace(5)* %alloca, i32 0, i32 %a
+  %tmp5 = load <16 x float>, <16 x float> addrspace(5)* %tmp0
   store <16 x float> %tmp5, <16 x float> addrspace(1)* %out
   ret void
 }
@@ -521 +521 @@
 ; SI: buffer_load_dword

 define amdgpu_kernel void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) {
-  %alloca = alloca [16 x <2 x float>]
-  %tmp0 = getelementptr [16 x <2 x float>], [16 x <2 x float>]* %alloca, i32 0, i32 %a
-  %tmp5 = load <2 x float>, <2 x float>* %tmp0
+  %alloca = alloca [16 x <2 x float>], addrspace(5)
+  %tmp0 = getelementptr [16 x <2 x float>], [16 x <2 x float>] addrspace(5)* %alloca, i32 0, i32 %a
+  %tmp5 = load <2 x float>, <2 x float> addrspace(5)* %tmp0
   store <2 x float> %tmp5, <2 x float> addrspace(1)* %out
   ret void
 }
@@ -533 +533 @@
 ; OPT: load [0 x i32], [0 x i32] addrspace(3)*
 define amdgpu_kernel void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) {
 entry:
-  %tmp = alloca [0 x i32]
-  store [0 x i32] [], [0 x i32]* %tmp
-  %load = load [0 x i32], [0 x i32]* %tmp
+  %tmp = alloca [0 x i32], addrspace(5)
+  store [0 x i32] [], [0 x i32] addrspace(5)* %tmp
+  %load = load [0 x i32], [0 x i32] addrspace(5)* %tmp
   store [0 x i32] %load, [0 x i32] addrspace(1)* %out
   ret void
 }
@@ -545 +545 @@
 ; OPT: load [1 x i32], [1 x i32] addrspace(3)*
 define amdgpu_kernel void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) {
 entry:
-  %tmp = alloca [1 x i32]
-  store [1 x i32] [i32 0], [1 x i32]* %tmp
-  %load = load [1 x i32], [1 x i32]* %tmp
+  %tmp = alloca [1 x i32], addrspace(5)
+  store [1 x i32] [i32 0], [1 x i32] addrspace(5)* %tmp
+  %load = load [1 x i32], [1 x i32] addrspace(5)* %tmp
   store [1 x i32] %load, [1 x i32] addrspace(1)* %out
   ret void
 }
@@ -16 +16 @@
 ; PAL: s_load_dwordx4 s{{\[}}[[SCRATCHDESC:[0-9]+]]:{{[0-9]+]}}, s{{\[}}[[GITPTR]]:
 ; PAL: buffer_store{{.*}}, s{{\[}}[[SCRATCHDESC]]:

-define amdgpu_kernel void @scratch(<2 x i32> %in, i32 %idx, i32* %out) {
+define amdgpu_kernel void @scratch(<2 x i32> %in, i32 %idx, i32 addrspace(5)* %out) {
 entry:
-  %v = alloca [2 x i32]
-  %vv = bitcast [2 x i32]* %v to <2 x i32>*
-  store <2 x i32> %in, <2 x i32>* %vv
-  %e = getelementptr [2 x i32], [2 x i32]* %v, i32 0, i32 %idx
-  %x = load i32, i32* %e
-  store i32 %x, i32* %out
+  %v = alloca [2 x i32], addrspace(5)
+  %vv = bitcast [2 x i32] addrspace(5)* %v to <2 x i32> addrspace(5)*
+  store <2 x i32> %in, <2 x i32> addrspace(5)* %vv
+  %e = getelementptr [2 x i32], [2 x i32] addrspace(5)* %v, i32 0, i32 %idx
+  %x = load i32, i32 addrspace(5)* %e
+  store i32 %x, i32 addrspace(5)* %out
   ret void
 }

@@ -40 +40 @@
 ; PAL: s_load_dwordx4 s{{\[}}[[SCRATCHDESC:[0-9]+]]:{{[0-9]+]}}, s{{\[}}[[GITPTR]]:
 ; PAL: buffer_store{{.*}}, s{{\[}}[[SCRATCHDESC]]:

-define amdgpu_kernel void @scratch2(<2 x i32> %in, i32 %idx, i32* %out) #0 {
+define amdgpu_kernel void @scratch2(<2 x i32> %in, i32 %idx, i32 addrspace(5)* %out) #0 {
 entry:
-  %v = alloca [2 x i32]
-  %vv = bitcast [2 x i32]* %v to <2 x i32>*
-  store <2 x i32> %in, <2 x i32>* %vv
-  %e = getelementptr [2 x i32], [2 x i32]* %v, i32 0, i32 %idx
-  %x = load i32, i32* %e
-  store i32 %x, i32* %out
+  %v = alloca [2 x i32], addrspace(5)
+  %vv = bitcast [2 x i32] addrspace(5)* %v to <2 x i32> addrspace(5)*
+  store <2 x i32> %in, <2 x i32> addrspace(5)* %vv
+  %e = getelementptr [2 x i32], [2 x i32] addrspace(5)* %v, i32 0, i32 %idx
+  %x = load i32, i32 addrspace(5)* %e
+  store i32 %x, i32 addrspace(5)* %out
   ret void
 }

@@ -175 +175 @@

 ; HSA: define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 {
 define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
-  %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
-  store volatile i32 0, i32 addrspace(4)* %stof
-  ret void
-}
-
-; HSA: define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #11 {
-define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #1 {
-  %stof = addrspacecast i32* %ptr to i32 addrspace(4)*
-  store volatile i32 0, i32 addrspace(4)* %stof
-  ret void
-}
-
-; HSA: define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
+  %stof = addrspacecast i32 addrspace(3)* %ptr to i32*
+  store volatile i32 0, i32* %stof
+  ret void
+}
+
+; HSA: define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32 addrspace(5)* %ptr) #11 {
+define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32 addrspace(5)* %ptr) #1 {
+  %stof = addrspacecast i32 addrspace(5)* %ptr to i32*
+  store volatile i32 0, i32* %stof
+  ret void
+}
+
+; HSA: define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32* %ptr) #1 {
+define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32* %ptr) #1 {
+  %ftos = addrspacecast i32* %ptr to i32 addrspace(3)*
   store volatile i32 0, i32 addrspace(3)* %ftos
   ret void
 }

-; HSA: define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
-  store volatile i32 0, i32* %ftos
+; HSA: define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32* %ptr) #1 {
+define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32* %ptr) #1 {
+  %ftos = addrspacecast i32* %ptr to i32 addrspace(5)*
+  store volatile i32 0, i32 addrspace(5)* %ftos
   ret void
 }

 ; No-op addrspacecast should not use queue ptr
 ; HSA: define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
 define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
-  %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
-  store volatile i32 0, i32 addrspace(4)* %stof
+  %stof = addrspacecast i32 addrspace(1)* %ptr to i32*
+  store volatile i32 0, i32* %stof
   ret void
 }

 ; HSA: define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
 define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
-  %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
-  %ld = load volatile i32, i32 addrspace(4)* %stof
-  ret void
-}
-
-; HSA: define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
+  %stof = addrspacecast i32 addrspace(2)* %ptr to i32*
+  %ld = load volatile i32, i32* %stof
+  ret void
+}
+
+; HSA: define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32* %ptr) #1 {
+define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32* %ptr) #1 {
+  %ftos = addrspacecast i32* %ptr to i32 addrspace(1)*
   store volatile i32 0, i32 addrspace(1)* %ftos
   ret void
 }

-; HSA: define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-  %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
226 ; HSA: define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #1 {
227 define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32* %ptr) #1 {
228 %ftos = addrspacecast i32* %ptr to i32 addrspace(2)*
229229 %ld = load volatile i32, i32 addrspace(2)* %ftos
230230 ret void
231231 }
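The rewrites in this file follow directly from the new numbering: flat becomes address space 0 (so flat pointers are spelled with no qualifier), global stays 1, constant stays 2, group/LDS stays 3, and private moves from 0 to 5. A compact illustration of the casts exercised above (function name hypothetical):

define void @cast_cheat_sheet(i32 addrspace(1)* %g, i32 addrspace(3)* %l, i32 addrspace(5)* %p) {
  %g.flat = addrspacecast i32 addrspace(1)* %g to i32*   ; global -> flat: no-op, no queue ptr needed
  %l.flat = addrspacecast i32 addrspace(3)* %l to i32*   ; group -> flat: needs the aperture
  %p.flat = addrspacecast i32 addrspace(5)* %p to i32*   ; private -> flat: needs the aperture
  ret void
}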
1919 ; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
2020 ; alloca to a vector. It currently fails because it does not know how
2121 ; to interpret:
22 ; getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 1, i32 %b
22 ; getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 1, i32 %b
2323
2424 ; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 64
2525 ; SI-PROMOTE: ds_write_b32 [[PTRREG]]
2626 define amdgpu_kernel void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 {
27 %alloca = alloca [16 x i32], align 16
27 %alloca = alloca [16 x i32], align 16, addrspace(5)
2828 %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0);
2929 %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
3030 %a_ptr = getelementptr inbounds i32, i32 addrspace(1)* %inA, i32 %tid
3232 %a = load i32, i32 addrspace(1)* %a_ptr, !range !0
3333 %b = load i32, i32 addrspace(1)* %b_ptr, !range !0
3434 %result = add i32 %a, %b
35 %alloca_ptr = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 1, i32 %b
36 store i32 %result, i32* %alloca_ptr, align 4
35 %alloca_ptr = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 1, i32 %b
36 store i32 %result, i32 addrspace(5)* %alloca_ptr, align 4
3737 ; Dummy call
3838 call void @llvm.amdgcn.s.barrier()
39 %reload = load i32, i32* %alloca_ptr, align 4, !range !0
39 %reload = load i32, i32 addrspace(5)* %alloca_ptr, align 4, !range !0
4040 %out_ptr = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
4141 store i32 %reload, i32 addrspace(1)* %out_ptr, align 4
4242 ret void
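The unhandled case is specifically the non-zero first GEP index, which addresses past the end of the [16 x i32] object; a zero first index, as sketched below, stays inside the object and is the form the pass does understand:

  ; handled: first index 0 addresses an element within the alloca
  %elt = getelementptr inbounds [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %b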
1313 ; GCN-NOT: s32
1414 ; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s5 offset:20{{$}}
1515 ; GCN-NOT: s32
16 define void @void_func_byval_struct(%struct.ByValStruct* byval noalias nocapture align 4 %arg0, %struct.ByValStruct* byval noalias nocapture align 4 %arg1) #1 {
17 entry:
18 %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0
19 %tmp = load volatile i32, i32* %arrayidx, align 4
16 define void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 {
17 entry:
18 %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
19 %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 4
2020 %add = add nsw i32 %tmp, 1
21 store volatile i32 %add, i32* %arrayidx, align 4
22 %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0
23 %tmp1 = load volatile i32, i32* %arrayidx2, align 4
21 store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 4
22 %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
23 %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 4
2424 %add3 = add nsw i32 %tmp1, 2
25 store volatile i32 %add3, i32* %arrayidx2, align 4
25 store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 4
2626 store volatile i32 9, i32 addrspace(1)* null, align 4
2727 ret void
2828 }
5353 ; GCN: buffer_load_dword v33,
5454 ; GCN: s_sub_u32 s32, s32, 0xb00{{$}}
5555 ; GCN: s_setpc_b64
56 define void @void_func_byval_struct_non_leaf(%struct.ByValStruct* byval noalias nocapture align 4 %arg0, %struct.ByValStruct* byval noalias nocapture align 4 %arg1) #1 {
57 entry:
58 %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0
59 %tmp = load volatile i32, i32* %arrayidx, align 4
56 define void @void_func_byval_struct_non_leaf(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 {
57 entry:
58 %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
59 %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 4
6060 %add = add nsw i32 %tmp, 1
61 store volatile i32 %add, i32* %arrayidx, align 4
62 %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0
63 %tmp1 = load volatile i32, i32* %arrayidx2, align 4
61 store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 4
62 %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
63 %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 4
6464 %add3 = add nsw i32 %tmp1, 2
6565 call void @external_void_func_void()
66 store volatile i32 %add3, i32* %arrayidx2, align 4
66 store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 4
6767 store volatile i32 9, i32 addrspace(1)* null, align 4
6868 ret void
6969 }
113113 ; GCN-NEXT: s_setpc_b64
114114 define void @call_void_func_byval_struct_func() #0 {
115115 entry:
116 %arg0 = alloca %struct.ByValStruct, align 4
117 %arg1 = alloca %struct.ByValStruct, align 4
118 %tmp = bitcast %struct.ByValStruct* %arg0 to i8*
119 call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp)
120 %tmp1 = bitcast %struct.ByValStruct* %arg1 to i8*
121 call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp1)
122 %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0
123 store volatile i32 9, i32* %arrayidx, align 4
124 %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0
125 store volatile i32 13, i32* %arrayidx2, align 4
126 call void @void_func_byval_struct(%struct.ByValStruct* byval nonnull align 4 %arg0, %struct.ByValStruct* byval nonnull align 4 %arg1)
127 call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp1)
128 call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp)
116 %arg0 = alloca %struct.ByValStruct, align 4, addrspace(5)
117 %arg1 = alloca %struct.ByValStruct, align 4, addrspace(5)
118 %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)*
119 call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp)
120 %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)*
121 call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1)
122 %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
123 store volatile i32 9, i32 addrspace(5)* %arrayidx, align 4
124 %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
125 store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 4
126 call void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg1)
127 call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1)
128 call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp)
129129 ret void
130130 }
131131
166166 ; GCN: s_endpgm
167167 define amdgpu_kernel void @call_void_func_byval_struct_kernel() #0 {
168168 entry:
169 %arg0 = alloca %struct.ByValStruct, align 4
170 %arg1 = alloca %struct.ByValStruct, align 4
171 %tmp = bitcast %struct.ByValStruct* %arg0 to i8*
172 call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp)
173 %tmp1 = bitcast %struct.ByValStruct* %arg1 to i8*
174 call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp1)
175 %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0
176 store volatile i32 9, i32* %arrayidx, align 4
177 %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0
178 store volatile i32 13, i32* %arrayidx2, align 4
179 call void @void_func_byval_struct(%struct.ByValStruct* byval nonnull align 4 %arg0, %struct.ByValStruct* byval nonnull align 4 %arg1)
180 call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp1)
181 call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp)
169 %arg0 = alloca %struct.ByValStruct, align 4, addrspace(5)
170 %arg1 = alloca %struct.ByValStruct, align 4, addrspace(5)
171 %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)*
172 call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp)
173 %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)*
174 call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1)
175 %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
176 store volatile i32 9, i32 addrspace(5)* %arrayidx, align 4
177 %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
178 store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 4
179 call void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg1)
180 call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1)
181 call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp)
182182 ret void
183183 }
184184
185185 ; GCN-LABEL: {{^}}call_void_func_byval_struct_kernel_no_frame_pointer_elim:
186186 define amdgpu_kernel void @call_void_func_byval_struct_kernel_no_frame_pointer_elim() #2 {
187187 entry:
188 %arg0 = alloca %struct.ByValStruct, align 4
189 %arg1 = alloca %struct.ByValStruct, align 4
190 %tmp = bitcast %struct.ByValStruct* %arg0 to i8*
191 call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp)
192 %tmp1 = bitcast %struct.ByValStruct* %arg1 to i8*
193 call void @llvm.lifetime.start.p0i8(i64 32, i8* %tmp1)
194 %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg0, i32 0, i32 0, i32 0
195 store volatile i32 9, i32* %arrayidx, align 4
196 %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct* %arg1, i32 0, i32 0, i32 0
197 store volatile i32 13, i32* %arrayidx2, align 4
198 call void @void_func_byval_struct(%struct.ByValStruct* byval nonnull align 4 %arg0, %struct.ByValStruct* byval nonnull align 4 %arg1)
199 call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp1)
200 call void @llvm.lifetime.end.p0i8(i64 32, i8* %tmp)
188 %arg0 = alloca %struct.ByValStruct, align 4, addrspace(5)
189 %arg1 = alloca %struct.ByValStruct, align 4, addrspace(5)
190 %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)*
191 call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp)
192 %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)*
193 call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1)
194 %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0
195 store volatile i32 9, i32 addrspace(5)* %arrayidx, align 4
196 %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0
197 store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 4
198 call void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg1)
199 call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1)
200 call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp)
201201 ret void
202202 }
203203
204204 declare void @external_void_func_void() #0
205205
206 declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #3
207 declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #3
206 declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture) #3
207 declare void @llvm.lifetime.end.p5i8(i64, i8 addrspace(5)* nocapture) #3
208208
209209 attributes #0 = { nounwind }
210210 attributes #1 = { noinline norecurse nounwind }
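The lifetime intrinsics are overloaded on their pointer operand, so moving allocas into addrspace(5) renames the declarations from .p0i8 to .p5i8 as above; beyond the pointer types, nothing changes at the call sites. A self-contained sketch (function name hypothetical):

declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture)
declare void @llvm.lifetime.end.p5i8(i64, i8 addrspace(5)* nocapture)

define void @lifetime_sketch() {
  %obj = alloca [32 x i8], align 4, addrspace(5)
  %p = bitcast [32 x i8] addrspace(5)* %obj to i8 addrspace(5)*
  call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %p)
  call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %p)
  ret void
}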
3737
3838 ; Structs
3939 declare void @external_void_func_struct_i8_i32({ i8, i32 }) #0
40 declare void @external_void_func_byval_struct_i8_i32({ i8, i32 }* byval) #0
41 declare void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 }* sret, { i8, i32 }* byval) #0
40 declare void @external_void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* byval) #0
41 declare void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* sret, { i8, i32 } addrspace(5)* byval) #0
4242
4343 declare void @external_void_func_v16i8(<16 x i8>) #0
4444
464464 ; GCN-NEXT: s_swappc_b64
465465 ; GCN-NOT: [[SP]]
466466 define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 {
467 %val = alloca { i8, i32 }, align 4
468 %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %val, i32 0, i32 0
469 %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %val, i32 0, i32 1
470 store i8 3, i8* %gep0
471 store i32 8, i32* %gep1
472 call void @external_void_func_byval_struct_i8_i32({ i8, i32 }* %val)
467 %val = alloca { i8, i32 }, align 4, addrspace(5)
468 %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %val, i32 0, i32 0
469 %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %val, i32 0, i32 1
470 store i8 3, i8 addrspace(5)* %gep0
471 store i32 8, i32 addrspace(5)* %gep1
472 call void @external_void_func_byval_struct_i8_i32({ i8, i32 } addrspace(5)* %val)
473473 ret void
474474 }
475475
496496 ; GCN: buffer_store_byte [[LOAD_OUT_VAL0]], off
497497 ; GCN: buffer_store_dword [[LOAD_OUT_VAL1]], off
498498 define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32(i32) #0 {
499 %in.val = alloca { i8, i32 }, align 4
500 %out.val = alloca { i8, i32 }, align 4
501 %in.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %in.val, i32 0, i32 0
502 %in.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %in.val, i32 0, i32 1
503 store i8 3, i8* %in.gep0
504 store i32 8, i32* %in.gep1
505 call void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 }* %out.val, { i8, i32 }* %in.val)
506 %out.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %out.val, i32 0, i32 0
507 %out.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %out.val, i32 0, i32 1
508 %out.val0 = load i8, i8* %out.gep0
509 %out.val1 = load i32, i32* %out.gep1
499 %in.val = alloca { i8, i32 }, align 4, addrspace(5)
500 %out.val = alloca { i8, i32 }, align 4, addrspace(5)
501 %in.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %in.val, i32 0, i32 0
502 %in.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %in.val, i32 0, i32 1
503 store i8 3, i8 addrspace(5)* %in.gep0
504 store i32 8, i32 addrspace(5)* %in.gep1
505 call void @external_void_func_sret_struct_i8_i32_byval_struct_i8_i32({ i8, i32 } addrspace(5)* %out.val, { i8, i32 } addrspace(5)* %in.val)
506 %out.gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %out.val, i32 0, i32 0
507 %out.gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %out.val, i32 0, i32 1
508 %out.val0 = load i8, i8 addrspace(5)* %out.gep0
509 %out.val1 = load i32, i32 addrspace(5)* %out.gep1
510510
511511 store volatile i8 %out.val0, i8 addrspace(1)* undef
512512 store volatile i32 %out.val1, i32 addrspace(1)* undef
131131 ; GCN-LABEL: {{^}}use_stack0:
132132 ; GCN: ScratchSize: 2052
133133 define void @use_stack0() #1 {
134 %alloca = alloca [512 x i32], align 4
135 call void asm sideeffect "; use $0", "v"([512 x i32]* %alloca) #0
134 %alloca = alloca [512 x i32], align 4, addrspace(5)
135 call void asm sideeffect "; use $0", "v"([512 x i32] addrspace(5)* %alloca) #0
136136 ret void
137137 }
138138
139139 ; GCN-LABEL: {{^}}use_stack1:
140140 ; GCN: ScratchSize: 404
141141 define void @use_stack1() #1 {
142 %alloca = alloca [100 x i32], align 4
143 call void asm sideeffect "; use $0", "v"([100 x i32]* %alloca) #0
142 %alloca = alloca [100 x i32], align 4, addrspace(5)
143 call void asm sideeffect "; use $0", "v"([100 x i32] addrspace(5)* %alloca) #0
144144 ret void
145145 }
146146
147147 ; GCN-LABEL: {{^}}indirect_use_stack:
148148 ; GCN: ScratchSize: 2124
149149 define void @indirect_use_stack() #1 {
150 %alloca = alloca [16 x i32], align 4
151 call void asm sideeffect "; use $0", "v"([16 x i32]* %alloca) #0
150 %alloca = alloca [16 x i32], align 4, addrspace(5)
151 call void asm sideeffect "; use $0", "v"([16 x i32] addrspace(5)* %alloca) #0
152152 call void @use_stack0()
153153 ret void
154154 }
200200 ; GCN-LABEL: {{^}}direct_recursion_use_stack:
201201 ; GCN: ScratchSize: 2056
202202 define void @direct_recursion_use_stack(i32 %val) #2 {
203 %alloca = alloca [512 x i32], align 4
204 call void asm sideeffect "; use $0", "v"([512 x i32]* %alloca) #0
203 %alloca = alloca [512 x i32], align 4, addrspace(5)
204 call void asm sideeffect "; use $0", "v"([512 x i32] addrspace(5)* %alloca) #0
205205 %cmp = icmp eq i32 %val, 0
206206 br i1 %cmp, label %ret, label %call
207207
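The ScratchSize figures can be sanity-checked against the allocas; the few bytes beyond the raw object sizes appear to be fixed per-function overhead (an assumption, not stated in the tests):

  512 * 4 = 2048 bytes       -> ScratchSize 2052 for @use_stack0
  100 * 4 =  400 bytes       -> ScratchSize  404 for @use_stack1
  16 * 4 + 2052 = 2116 bytes -> ScratchSize 2124 for @indirect_use_stack (own alloca plus callee and call overhead)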
2727 ; GCN-NEXT: s_waitcnt
2828 ; GCN-NEXT: s_setpc_b64
2929 define void @callee_with_stack() #0 {
30 %alloca = alloca i32
31 store volatile i32 0, i32* %alloca
30 %alloca = alloca i32, addrspace(5)
31 store volatile i32 0, i32 addrspace(5)* %alloca
3232 ret void
3333 }
3434
5656 ; GCN: s_waitcnt
5757 ; GCN-NEXT: s_setpc_b64
5858 define void @callee_with_stack_and_call() #0 {
59 %alloca = alloca i32
60 store volatile i32 0, i32* %alloca
59 %alloca = alloca i32, addrspace(5)
60 store volatile i32 0, i32 addrspace(5)* %alloca
6161 call void @external_void_func_void()
6262 ret void
6363 }
4242 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE_LOAD]]
4343 ; GCN: {{flat|global}}_store_dword v{{\[[0-9]+}}:[[HI]]{{\]}}
4444 define void @use_queue_ptr_addrspacecast() #1 {
45 %asc = addrspacecast i32 addrspace(3)* inttoptr (i32 16 to i32 addrspace(3)*) to i32 addrspace(4)*
46 store volatile i32 0, i32 addrspace(4)* %asc
45 %asc = addrspacecast i32 addrspace(3)* inttoptr (i32 16 to i32 addrspace(3)*) to i32*
46 store volatile i32 0, i32* %asc
4747 ret void
4848 }
4949
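The APERTURE_LOAD checked above is why this cast is not free: a group-to-flat addrspacecast pairs the 32-bit LDS offset (low half) with the shared-memory aperture base (high half), and on HSA that aperture is read via the queue pointer, which is why this function requires it. The triggering pattern, reduced:

  %asc = addrspacecast i32 addrspace(3)* %lds to i32*   ; low 32 bits: LDS offset, high 32 bits: aperture base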
112112 ; GCN: ; use s6
113113 ; GCN: s_setpc_b64
114114 define void @use_stack_workgroup_id_x() #1 {
115 %alloca = alloca i32
116 store volatile i32 0, i32* %alloca
115 %alloca = alloca i32, addrspace(5)
116 store volatile i32 0, i32 addrspace(5)* %alloca
117117 %val = call i32 @llvm.amdgcn.workgroup.id.x()
118118 call void asm sideeffect "; use $0", "s"(i32 %val)
119119 ret void
431431 ; GCN: ; use s15
432432 ; GCN: ; use s16
433433 define void @use_every_sgpr_input() #1 {
434 %alloca = alloca i32, align 4
435 store volatile i32 0, i32* %alloca
434 %alloca = alloca i32, align 4, addrspace(5)
435 store volatile i32 0, i32 addrspace(5)* %alloca
436436
437437 %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
438438 %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
511511 ; GCN-DAG: s_mov_b32 s8, s16
512512 ; GCN: s_swappc_b64
513513 define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 {
514 %alloca = alloca i32, align 4
515 store volatile i32 0, i32* %alloca
514 %alloca = alloca i32, align 4, addrspace(5)
515 store volatile i32 0, i32 addrspace(5)* %alloca
516516
517517 %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
518518 %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
567567 ; GCN: ; use [[SAVE_Y]]
568568 ; GCN: ; use [[SAVE_Z]]
569569 define void @func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill() #1 {
570 %alloca = alloca i32, align 4
570 %alloca = alloca i32, align 4, addrspace(5)
571571 call void @use_workgroup_id_xyz()
572572
573 store volatile i32 0, i32* %alloca
573 store volatile i32 0, i32 addrspace(5)* %alloca
574574
575575 %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
576576 %dispatch_ptr.bc = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
367367 i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
368368 i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,
369369 i32 %arg16, i32 %arg17, i32 %arg18, i32 %arg19, i32 %arg20, i32 %arg21, i32 %arg22, i32 %arg23,
370 i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31, i32* byval %arg32) #1 {
370 i32 %arg24, i32 %arg25, i32 %arg26, i32 %arg27, i32 %arg28, i32 %arg29, i32 %arg30, i32 %arg31, i32 addrspace(5)* byval %arg32) #1 {
371371 %val = call i32 @llvm.amdgcn.workitem.id.x()
372372 store volatile i32 %val, i32 addrspace(1)* undef
373373
406406 store volatile i32 %arg29, i32 addrspace(1)* undef
407407 store volatile i32 %arg30, i32 addrspace(1)* undef
408408 store volatile i32 %arg31, i32 addrspace(1)* undef
409 %private = load volatile i32, i32* %arg32
409 %private = load volatile i32, i32 addrspace(5)* %arg32
410410 ret void
411411 }
412412
434434 ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
435435 ; GCN: s_swappc_b64
436436 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 {
437 %alloca = alloca i32, align 4
438 store volatile i32 999, i32* %alloca
437 %alloca = alloca i32, align 4, addrspace(5)
438 store volatile i32 999, i32 addrspace(5)* %alloca
439439 call void @too_many_args_use_workitem_id_x_byval(
440440 i32 10, i32 20, i32 30, i32 40,
441441 i32 50, i32 60, i32 70, i32 80,
445445 i32 210, i32 220, i32 230, i32 240,
446446 i32 250, i32 260, i32 270, i32 280,
447447 i32 290, i32 300, i32 310, i32 320,
448 i32* %alloca)
448 i32 addrspace(5)* %alloca)
449449 ret void
450450 }
451451
459459 ; GCN: v_mov_b32_e32 [[RELOAD_BYVAL]],
460460 ; GCN: s_swappc_b64
461461 define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
462 %alloca = alloca i32, align 4
463 store volatile i32 999, i32* %alloca
462 %alloca = alloca i32, align 4, addrspace(5)
463 store volatile i32 999, i32 addrspace(5)* %alloca
464464 call void @too_many_args_use_workitem_id_x_byval(
465465 i32 10, i32 20, i32 30, i32 40,
466466 i32 50, i32 60, i32 70, i32 80,
470470 i32 210, i32 220, i32 230, i32 240,
471471 i32 250, i32 260, i32 270, i32 280,
472472 i32 290, i32 300, i32 310, i32 320,
473 i32* %alloca)
473 i32 addrspace(5)* %alloca)
474474 ret void
475475 }
476476
44 ; GCN: buffer_store_dword [[FI]]
55 define amdgpu_kernel void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 {
66 entry:
7 %b = alloca i8
8 call void @llvm.lifetime.start.p0i8(i64 1, i8* %b)
9 store volatile i8* %b, i8* addrspace(1)* undef
10 call void @llvm.lifetime.end.p0i8(i64 1, i8* %b)
7 %b = alloca i8, addrspace(5)
8 call void @llvm.lifetime.start.p5i8(i64 1, i8 addrspace(5)* %b)
9 store volatile i8 addrspace(5)* %b, i8 addrspace(5)* addrspace(1)* undef
10 call void @llvm.lifetime.end.p5i8(i64 1, i8 addrspace(5)* %b)
1111 ret void
1212 }
1313
1717 ; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 4{{$}}
1818 ; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
1919 ; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]]
20 define amdgpu_kernel void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
21 %tmp = alloca float
22 store float 4.0, float* %tmp
23 store float* %tmp, float* addrspace(3)* %ptr
20 define amdgpu_kernel void @stored_fi_to_lds(float addrspace(5)* addrspace(3)* %ptr) #0 {
21 %tmp = alloca float, addrspace(5)
22 store float 4.0, float addrspace(5)* %tmp
23 store float addrspace(5)* %tmp, float addrspace(5)* addrspace(3)* %ptr
2424 ret void
2525 }
2626
3737
3838 ; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
3939 ; GCN: ds_write_b32 [[VLDSPTR]], [[FI1]]
40 define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 {
41 %tmp0 = alloca float
42 %tmp1 = alloca float
43 store float 4.0, float* %tmp0
44 store float 4.0, float* %tmp1
45 store volatile float* %tmp0, float* addrspace(3)* %ptr
46 store volatile float* %tmp1, float* addrspace(3)* %ptr
40 define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(float addrspace(5)* addrspace(3)* %ptr) #0 {
41 %tmp0 = alloca float, addrspace(5)
42 %tmp1 = alloca float, addrspace(5)
43 store float 4.0, float addrspace(5)* %tmp0
44 store float 4.0, float addrspace(5)* %tmp1
45 store volatile float addrspace(5)* %tmp0, float addrspace(5)* addrspace(3)* %ptr
46 store volatile float addrspace(5)* %tmp1, float addrspace(5)* addrspace(3)* %ptr
4747 ret void
4848 }
4949
5454 ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}}
5555 ; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
5656 define amdgpu_kernel void @stored_fi_to_self() #0 {
57 %tmp = alloca i32*
57 %tmp = alloca i32 addrspace(5)*, addrspace(5)
5858
5959 ; Avoid optimizing everything out
60 store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp
61 %bitcast = bitcast i32** %tmp to i32*
62 store volatile i32* %bitcast, i32** %tmp
60 store volatile i32 addrspace(5)* inttoptr (i32 1234 to i32 addrspace(5)*), i32 addrspace(5)* addrspace(5)* %tmp
61 %bitcast = bitcast i32 addrspace(5)* addrspace(5)* %tmp to i32 addrspace(5)*
62 store volatile i32 addrspace(5)* %bitcast, i32 addrspace(5)* addrspace(5)* %tmp
6363 ret void
6464 }
6565
7373 ; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x804{{$}}
7474 ; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}}
7575 define amdgpu_kernel void @stored_fi_to_self_offset() #0 {
76 %tmp0 = alloca [512 x i32]
77 %tmp1 = alloca i32*
76 %tmp0 = alloca [512 x i32], addrspace(5)
77 %tmp1 = alloca i32 addrspace(5)*, addrspace(5)
7878
7979 ; Avoid optimizing everything out
80 %tmp0.cast = bitcast [512 x i32]* %tmp0 to i32*
81 store volatile i32 32, i32* %tmp0.cast
82
83 store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp1
84
85 %bitcast = bitcast i32** %tmp1 to i32*
86 store volatile i32* %bitcast, i32** %tmp1
80 %tmp0.cast = bitcast [512 x i32] addrspace(5)* %tmp0 to i32 addrspace(5)*
81 store volatile i32 32, i32 addrspace(5)* %tmp0.cast
82
83 store volatile i32 addrspace(5)* inttoptr (i32 1234 to i32 addrspace(5)*), i32 addrspace(5)* addrspace(5)* %tmp1
84
85 %bitcast = bitcast i32 addrspace(5)* addrspace(5)* %tmp1 to i32 addrspace(5)*
86 store volatile i32 addrspace(5)* %bitcast, i32 addrspace(5)* addrspace(5)* %tmp1
8787 ret void
8888 }
8989
9898 ; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}}
9999 ; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
100100 define amdgpu_kernel void @stored_fi_to_fi() #0 {
101 %tmp0 = alloca i32*
102 %tmp1 = alloca i32*
103 %tmp2 = alloca i32*
104 store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp0
105 store volatile i32* inttoptr (i32 5678 to i32*), i32** %tmp1
106 store volatile i32* inttoptr (i32 9999 to i32*), i32** %tmp2
107
108 %bitcast1 = bitcast i32** %tmp1 to i32*
109 %bitcast2 = bitcast i32** %tmp2 to i32* ; at offset 8
110
111 store volatile i32* %bitcast1, i32** %tmp2 ; store offset 4 at offset 8
112 store volatile i32* %bitcast2, i32** %tmp1 ; store offset 8 at offset 4
101 %tmp0 = alloca i32 addrspace(5)*, addrspace(5)
102 %tmp1 = alloca i32 addrspace(5)*, addrspace(5)
103 %tmp2 = alloca i32 addrspace(5)*, addrspace(5)
104 store volatile i32 addrspace(5)* inttoptr (i32 1234 to i32 addrspace(5)*), i32 addrspace(5)* addrspace(5)* %tmp0
105 store volatile i32 addrspace(5)* inttoptr (i32 5678 to i32 addrspace(5)*), i32 addrspace(5)* addrspace(5)* %tmp1
106 store volatile i32 addrspace(5)* inttoptr (i32 9999 to i32 addrspace(5)*), i32 addrspace(5)* addrspace(5)* %tmp2
107
108 %bitcast1 = bitcast i32 addrspace(5)* addrspace(5)* %tmp1 to i32 addrspace(5)*
109 %bitcast2 = bitcast i32 addrspace(5)* addrspace(5)* %tmp2 to i32 addrspace(5)* ; at offset 8
110
111 store volatile i32 addrspace(5)* %bitcast1, i32 addrspace(5)* addrspace(5)* %tmp2 ; store offset 4 at offset 8
112 store volatile i32 addrspace(5)* %bitcast2, i32 addrspace(5)* addrspace(5)* %tmp1 ; store offset 8 at offset 4
113113 ret void
114114 }
115115
117117 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
118118 ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
119119 ; GCN: buffer_store_dword [[FI]]
120 define amdgpu_kernel void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 {
121 %tmp = alloca float
122 store float 0.0, float* %tmp
123 store float* %tmp, float* addrspace(1)* %ptr
120 define amdgpu_kernel void @stored_fi_to_global(float addrspace(5)* addrspace(1)* %ptr) #0 {
121 %tmp = alloca float, addrspace(5)
122 store float 0.0, float addrspace(5)* %tmp
123 store float addrspace(5)* %tmp, float addrspace(5)* addrspace(1)* %ptr
124124 ret void
125125 }
126126
135135
136136 ; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}}
137137 ; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
138 define amdgpu_kernel void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
139 %tmp0 = alloca float
140 %tmp1 = alloca float
141 %tmp2 = alloca float
142 store volatile float 0.0, float* %tmp0
143 store volatile float 0.0, float* %tmp1
144 store volatile float 0.0, float* %tmp2
145 store volatile float* %tmp1, float* addrspace(1)* %ptr
146 store volatile float* %tmp2, float* addrspace(1)* %ptr
138 define amdgpu_kernel void @stored_fi_to_global_2_small_objects(float addrspace(5)* addrspace(1)* %ptr) #0 {
139 %tmp0 = alloca float, addrspace(5)
140 %tmp1 = alloca float, addrspace(5)
141 %tmp2 = alloca float, addrspace(5)
142 store volatile float 0.0, float addrspace(5)* %tmp0
143 store volatile float 0.0, float addrspace(5)* %tmp1
144 store volatile float 0.0, float addrspace(5)* %tmp2
145 store volatile float addrspace(5)* %tmp1, float addrspace(5)* addrspace(1)* %ptr
146 store volatile float addrspace(5)* %tmp2, float addrspace(5)* addrspace(1)* %ptr
147147 ret void
148148 }
149149
162162 ; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_1]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
163163
164164 ; GCN: buffer_store_dword [[BASE_1_OFF_2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
165 define amdgpu_kernel void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
166 %tmp0 = alloca [4096 x i32]
167 %tmp1 = alloca [4096 x i32]
168 %gep0.tmp0 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 0
169 store volatile i32 0, i32* %gep0.tmp0
170 %gep1.tmp0 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 4095
171 store volatile i32 999, i32* %gep1.tmp0
172 %gep0.tmp1 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 14
173 store i32* %gep0.tmp1, i32* addrspace(1)* %ptr
174 ret void
175 }
176
177 @g1 = external addrspace(1) global i32*
165 define amdgpu_kernel void @stored_fi_to_global_huge_frame_offset(i32 addrspace(5)* addrspace(1)* %ptr) #0 {
166 %tmp0 = alloca [4096 x i32], addrspace(5)
167 %tmp1 = alloca [4096 x i32], addrspace(5)
168 %gep0.tmp0 = getelementptr [4096 x i32], [4096 x i32] addrspace(5)* %tmp0, i32 0, i32 0
169 store volatile i32 0, i32 addrspace(5)* %gep0.tmp0
170 %gep1.tmp0 = getelementptr [4096 x i32], [4096 x i32] addrspace(5)* %tmp0, i32 0, i32 4095
171 store volatile i32 999, i32 addrspace(5)* %gep1.tmp0
172 %gep0.tmp1 = getelementptr [4096 x i32], [4096 x i32] addrspace(5)* %tmp0, i32 0, i32 14
173 store i32 addrspace(5)* %gep0.tmp1, i32 addrspace(5)* addrspace(1)* %ptr
174 ret void
175 }
176
177 @g1 = external addrspace(1) global i32 addrspace(5)*
178178
179179 ; This was leaving a dead node around resulting in failing to select
180180 ; on the leftover AssertZext's ValueType operand.
187187 ; GCN: buffer_store_dword [[FI]]
188188 define amdgpu_kernel void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 {
189189 entry:
190 %b = alloca i32, align 4
191 %tmp1 = load volatile i32*, i32* addrspace(1)* @g1, align 4
192 %arrayidx = getelementptr inbounds i32, i32* %tmp1, i32 %idx
193 %tmp2 = load i32, i32* %arrayidx, align 4
194 store volatile i32* %b, i32* addrspace(1)* undef
195 ret void
196 }
197
198 declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
199 declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
190 %b = alloca i32, align 4, addrspace(5)
191 %tmp1 = load volatile i32 addrspace(5)*, i32 addrspace(5)* addrspace(1)* @g1, align 4
192 %arrayidx = getelementptr inbounds i32, i32 addrspace(5)* %tmp1, i32 %idx
193 %tmp2 = load i32, i32 addrspace(5)* %arrayidx, align 4
194 store volatile i32 addrspace(5)* %b, i32 addrspace(5)* addrspace(1)* undef
195 ret void
196 }
197
198 declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture) #1
199 declare void @llvm.lifetime.end.p5i8(i64, i8 addrspace(5)* nocapture) #1
200200
201201 attributes #0 = { nounwind }
202202 attributes #1 = { argmemonly nounwind }
55 ; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
66
77 ; OPT-LABEL: @test_no_sink_flat_small_offset_i32(
8 ; OPT-CIVI: getelementptr i32, i32 addrspace(4)* %in
8 ; OPT-CIVI: getelementptr i32, i32* %in
99 ; OPT-CIVI: br i1
1010 ; OPT-CIVI-NOT: ptrtoint
1111
1212 ; OPT-GFX9: br
13 ; OPT-GFX9: %sunkaddr = getelementptr i8, i8 addrspace(4)* %0, i64 28
14 ; OPT-GFX9: %1 = bitcast i8 addrspace(4)* %sunkaddr to i32 addrspace(4)*
15 ; OPT-GFX9: load i32, i32 addrspace(4)* %1
13 ; OPT-GFX9: %sunkaddr = getelementptr i8, i8* %0, i64 28
14 ; OPT-GFX9: %1 = bitcast i8* %sunkaddr to i32*
15 ; OPT-GFX9: load i32, i32* %1
1616
1717 ; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32:
1818 ; GCN: flat_load_dword
1919 ; GCN: {{^}}BB0_2:
20 define amdgpu_kernel void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
21 entry:
22 %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
23 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
20 define amdgpu_kernel void @test_no_sink_flat_small_offset_i32(i32* %out, i32* %in, i32 %cond) {
21 entry:
22 %out.gep = getelementptr i32, i32* %out, i64 999999
23 %in.gep = getelementptr i32, i32* %in, i64 7
2424 %tmp0 = icmp eq i32 %cond, 0
2525 br i1 %tmp0, label %endif, label %if
2626
2727 if:
28 %tmp1 = load i32, i32 addrspace(4)* %in.gep
28 %tmp1 = load i32, i32* %in.gep
2929 br label %endif
3030
3131 endif:
3232 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
33 store i32 %x, i32 addrspace(4)* %out.gep
33 store i32 %x, i32* %out.gep
3434 br label %done
3535
3636 done:
3838 }
3939
4040 ; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32(
41 ; OPT: getelementptr i32, i32 addrspace(4)* %out,
41 ; OPT: getelementptr i32, i32* %out,
4242 ; OPT-CI-NOT: getelementptr
4343 ; OPT: br i1
4444
4949
5050 ; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_global_i32:
5151 ; CI: buffer_load_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
52 define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
53 entry:
54 %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
55 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
56 %cast = addrspacecast i32 addrspace(4)* %in.gep to i32 addrspace(1)*
52 define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_global_i32(i32* %out, i32* %in, i32 %cond) {
53 entry:
54 %out.gep = getelementptr i32, i32* %out, i64 999999
55 %in.gep = getelementptr i32, i32* %in, i64 7
56 %cast = addrspacecast i32* %in.gep to i32 addrspace(1)*
5757 %tmp0 = icmp eq i32 %cond, 0
5858 br i1 %tmp0, label %endif, label %if
5959
6363
6464 endif:
6565 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
66 store i32 %x, i32 addrspace(4)* %out.gep
66 store i32 %x, i32* %out.gep
6767 br label %done
6868
6969 done:
7171 }
7272
7373 ; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_constant_i32(
74 ; OPT: getelementptr i32, i32 addrspace(4)* %out,
74 ; OPT: getelementptr i32, i32* %out,
7575 ; OPT-CI-NOT: getelementptr
7676 ; OPT: br i1
7777
8282
8383 ; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_constant_i32:
8484 ; CI: s_load_dword {{s[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
85 define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
86 entry:
87 %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
88 %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
89 %cast = addrspacecast i32 addrspace(4)* %in.gep to i32 addrspace(2)*
85 define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32* %out, i32* %in, i32 %cond) {
86 entry:
87 %out.gep = getelementptr i32, i32* %out, i64 999999
88 %in.gep = getelementptr i32, i32* %in, i64 7
89 %cast = addrspacecast i32* %in.gep to i32 addrspace(2)*
9090 %tmp0 = icmp eq i32 %cond, 0
9191 br i1 %tmp0, label %endif, label %if
9292
9696
9797 endif:
9898 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
99 store i32 %x, i32 addrspace(4)* %out.gep
99 store i32 %x, i32* %out.gep
100100 br label %done
101101
102102 done:
104104 }
105105
106106 ; OPT-LABEL: @test_sink_flat_small_max_flat_offset(
107 ; OPT-CIVI: %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 4095
107 ; OPT-CIVI: %in.gep = getelementptr i8, i8* %in, i64 4095
108108 ; OPT-CIVI: br
109109 ; OPT-CIVI-NOT: getelementptr
110 ; OPT-CIVI: load i8, i8 addrspace(4)* %in.gep
110 ; OPT-CIVI: load i8, i8* %in.gep
111111
112112 ; OPT-GFX9: br
113 ; OPT-GFX9: %sunkaddr = getelementptr i8, i8 addrspace(4)* %in, i64 4095
114 ; OPT-GFX9: load i8, i8 addrspace(4)* %sunkaddr
113 ; OPT-GFX9: %sunkaddr = getelementptr i8, i8* %in, i64 4095
114 ; OPT-GFX9: load i8, i8* %sunkaddr
115115
116116 ; GCN-LABEL: {{^}}test_sink_flat_small_max_flat_offset:
117117 ; GFX9: flat_load_sbyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
118118 ; CIVI: flat_load_sbyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]$}}
119 define amdgpu_kernel void @test_sink_flat_small_max_flat_offset(i32 addrspace(4)* %out, i8 addrspace(4)* %in) #1 {
120 entry:
121 %out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 1024
122 %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 4095
119 define amdgpu_kernel void @test_sink_flat_small_max_flat_offset(i32* %out, i8* %in) #1 {
120 entry:
121 %out.gep = getelementptr i32, i32* %out, i32 1024
122 %in.gep = getelementptr i8, i8* %in, i64 4095
123123 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
124124 %tmp0 = icmp eq i32 %tid, 0
125125 br i1 %tmp0, label %endif, label %if
126126
127127 if:
128 %tmp1 = load i8, i8 addrspace(4)* %in.gep
128 %tmp1 = load i8, i8* %in.gep
129129 %tmp2 = sext i8 %tmp1 to i32
130130 br label %endif
131131
132132 endif:
133133 %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
134 store i32 %x, i32 addrspace(4)* %out.gep
134 store i32 %x, i32* %out.gep
135135 br label %done
136136
137137 done:
139139 }
140140
141141 ; OPT-LABEL: @test_sink_flat_small_max_plus_1_flat_offset(
142 ; OPT: %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 4096
142 ; OPT: %in.gep = getelementptr i8, i8* %in, i64 4096
143143 ; OPT: br
144144 ; OPT-NOT: getelementptr
145 ; OPT: load i8, i8 addrspace(4)* %in.gep
145 ; OPT: load i8, i8* %in.gep
146146
147147 ; GCN-LABEL: {{^}}test_sink_flat_small_max_plus_1_flat_offset:
148148 ; GCN: flat_load_sbyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]$}}
149 define amdgpu_kernel void @test_sink_flat_small_max_plus_1_flat_offset(i32 addrspace(4)* %out, i8 addrspace(4)* %in) #1 {
150 entry:
151 %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 99999
152 %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 4096
149 define amdgpu_kernel void @test_sink_flat_small_max_plus_1_flat_offset(i32* %out, i8* %in) #1 {
150 entry:
151 %out.gep = getelementptr i32, i32* %out, i64 99999
152 %in.gep = getelementptr i8, i8* %in, i64 4096
153153 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
154154 %tmp0 = icmp eq i32 %tid, 0
155155 br i1 %tmp0, label %endif, label %if
156156
157157 if:
158 %tmp1 = load i8, i8 addrspace(4)* %in.gep
158 %tmp1 = load i8, i8* %in.gep
159159 %tmp2 = sext i8 %tmp1 to i32
160160 br label %endif
161161
162162 endif:
163163 %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
164 store i32 %x, i32 addrspace(4)* %out.gep
164 store i32 %x, i32* %out.gep
165165 br label %done
166166
167167 done:
169169 }
170170
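Taken together, these checks encode the flat immediate-offset rules: GFX9 can fold an offset of up to 4095 into flat_load/flat_store, so CodeGenPrepare sinks the GEP into the use block there, while CI/VI have no flat immediate offset and leave the GEP where it was; 4096 is out of range even on GFX9 (the _plus_1 test above), and a register offset never folds (the test below). Reduced to the two interesting GEPs:

  %ok = getelementptr i8, i8* %in, i64 4095   ; GFX9: folds, flat_load_sbyte ... offset:4095
  %no = getelementptr i8, i8* %in, i64 4096   ; out of range: address materialized separately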
171171 ; OPT-LABEL: @test_no_sink_flat_reg_offset(
172 ; OPT: %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 %reg
172 ; OPT: %in.gep = getelementptr i8, i8* %in, i64 %reg
173173 ; OPT: br
174174
175175 ; OPT-NOT: getelementptr
176 ; OPT: load i8, i8 addrspace(4)* %in.gep
176 ; OPT: load i8, i8* %in.gep
177177
178178 ; GCN-LABEL: {{^}}test_no_sink_flat_reg_offset:
179179 ; GCN: flat_load_sbyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]$}}
180 define amdgpu_kernel void @test_no_sink_flat_reg_offset(i32 addrspace(4)* %out, i8 addrspace(4)* %in, i64 %reg) #1 {
181 entry:
182 %out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 1024
183 %in.gep = getelementptr i8, i8 addrspace(4)* %in, i64 %reg
180 define amdgpu_kernel void @test_no_sink_flat_reg_offset(i32* %out, i8* %in, i64 %reg) #1 {
181 entry:
182 %out.gep = getelementptr i32, i32* %out, i32 1024
183 %in.gep = getelementptr i8, i8* %in, i64 %reg
184184 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
185185 %tmp0 = icmp eq i32 %tid, 0
186186 br i1 %tmp0, label %endif, label %if
187187
188188 if:
189 %tmp1 = load i8, i8 addrspace(4)* %in.gep
189 %tmp1 = load i8, i8* %in.gep
190190 %tmp2 = sext i8 %tmp1 to i32
191191 br label %endif
192192
193193 endif:
194194 %x = phi i32 [ %tmp2, %if ], [ 0, %entry ]
195 store i32 %x, i32 addrspace(4)* %out.gep
195 store i32 %x, i32* %out.gep
196196 br label %done
197197
198198 done:
66 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s
77 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
88
9 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
9 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"
1010
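The only change to this datalayout is the trailing -A5, which declares address space 5 as the alloca address space; with it in effect, an alloca written without an explicit addrspace is created in addrspace(5) automatically. A minimal sketch (datalayout trimmed for illustration):

target datalayout = "e-p:32:32-p5:32:32-A5"
define void @f() {
  %x = alloca i32   ; with A5 in force, %x has type i32 addrspace(5)*
  ret void
}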
1111 ; OPT-LABEL: @test_sink_global_small_offset_i32(
1212 ; OPT-CI-NOT: getelementptr i32, i32 addrspace(1)* %in
136136 ; GCN: {{^}}BB4_2:
137137 define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
138138 entry:
139 %alloca = alloca [512 x i32], align 4
139 %alloca = alloca [512 x i32], align 4, addrspace(5)
140140 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
141141 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
142142 %add.arg = add i32 %arg, 8
143 %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1022
144 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
145 %tmp0 = icmp eq i32 %tid, 0
146 br i1 %tmp0, label %endif, label %if
147
148 if:
149 store volatile i32 123, i32* %alloca.gep
150 %tmp1 = load volatile i32, i32* %alloca.gep
143 %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1022
144 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
145 %tmp0 = icmp eq i32 %tid, 0
146 br i1 %tmp0, label %endif, label %if
147
148 if:
149 store volatile i32 123, i32 addrspace(5)* %alloca.gep
150 %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
151151 br label %endif
152152
153153 endif:
154154 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
155155 store i32 %x, i32 addrspace(1)* %out.gep.0
156 %load = load volatile i32, i32* %alloca.gep
156 %load = load volatile i32, i32 addrspace(5)* %alloca.gep
157157 store i32 %load, i32 addrspace(1)* %out.gep.1
158158 br label %done
159159
177177
178178 define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
179179 entry:
180 %alloca = alloca [512 x i32], align 4
180 %alloca = alloca [512 x i32], align 4, addrspace(5)
181181 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
182182 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
183183 %add.arg = add i32 %arg, 8
184 %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1023
185 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
186 %tmp0 = icmp eq i32 %tid, 0
187 br i1 %tmp0, label %endif, label %if
188
189 if:
190 store volatile i32 123, i32* %alloca.gep
191 %tmp1 = load volatile i32, i32* %alloca.gep
184 %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1023
185 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
186 %tmp0 = icmp eq i32 %tid, 0
187 br i1 %tmp0, label %endif, label %if
188
189 if:
190 store volatile i32 123, i32 addrspace(5)* %alloca.gep
191 %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
192192 br label %endif
193193
194194 endif:
195195 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
196196 store i32 %x, i32 addrspace(1)* %out.gep.0
197 %load = load volatile i32, i32* %alloca.gep
197 %load = load volatile i32, i32 addrspace(5)* %alloca.gep
198198 store i32 %load, i32 addrspace(1)* %out.gep.1
199199 br label %done
200200
203203 }
204204
205205 ; OPT-LABEL: @test_no_sink_scratch_large_offset_i32(
206 ; OPT: %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024
206 ; OPT: %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1024
207207 ; OPT: br i1
208208 ; OPT-NOT: ptrtoint
209209
214214 ; GCN: {{^BB[0-9]+}}_2:
215215 define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
216216 entry:
217 %alloca = alloca [512 x i32], align 4
217 %alloca = alloca [512 x i32], align 4, addrspace(5)
218218 %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
219219 %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
220220 %add.arg = add i32 %arg, 8
221 %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1024
222 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
223 %tmp0 = icmp eq i32 %tid, 0
224 br i1 %tmp0, label %endif, label %if
225
226 if:
227 store volatile i32 123, i32* %alloca.gep
228 %tmp1 = load volatile i32, i32* %alloca.gep
221 %alloca.gep = getelementptr [512 x i32], [512 x i32] addrspace(5)* %alloca, i32 0, i32 1024
222 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
223 %tmp0 = icmp eq i32 %tid, 0
224 br i1 %tmp0, label %endif, label %if
225
226 if:
227 store volatile i32 123, i32 addrspace(5)* %alloca.gep
228 %tmp1 = load volatile i32, i32 addrspace(5)* %alloca.gep
229229 br label %endif
230230
231231 endif:
232232 %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
233233 store i32 %x, i32 addrspace(1)* %out.gep.0
234 %load = load volatile i32, i32* %alloca.gep
234 %load = load volatile i32, i32 addrspace(5)* %alloca.gep
235235 store i32 %load, i32 addrspace(1)* %out.gep.1
236236 br label %done
237237
236236 br i1 %tmp3, label %bb4, label %bb10
237237
238238 bb4: ; preds = %bb2
239 %tmp6 = load float, float* undef
239 %tmp6 = load float, float addrspace(5)* undef
240240 %tmp7 = fcmp olt float %tmp6, 0.0
241241 br i1 %tmp7, label %bb8, label %Flow
242242
256256 br label %bb1
257257
258258 bb12: ; preds = %bb10
259 store volatile <4 x float> %tmp11, <4 x float>* undef, align 16
259 store volatile <4 x float> %tmp11, <4 x float> addrspace(5)* undef, align 16
260260 ret void
261261 }
262262
702702 ; GCN: v_cmp_eq_u32_e32 vcc, v{{[0-9]+}}, [[FI]]
703703 define amdgpu_kernel void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 {
704704 entry:
705 %stack0 = alloca i32
706 %ptr0 = load volatile i32*, i32* addrspace(1)* undef
707 %eq = icmp eq i32* %ptr0, %stack0
705 %stack0 = alloca i32, addrspace(5)
706 %ptr0 = load volatile i32 addrspace(5)*, i32 addrspace(5)* addrspace(1)* undef
707 %eq = icmp eq i32 addrspace(5)* %ptr0, %stack0
708708 %ext = zext i1 %eq to i32
709709 store volatile i32 %ext, i32 addrspace(1)* %out
710710 ret void
77 ; CHECK-LABEL: {{^}}copy_to_reg_frameindex:
88 define amdgpu_kernel void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
99 entry:
10 %alloca = alloca [16 x i32]
10 %alloca = alloca [16 x i32], addrspace(5)
1111 br label %loop
1212
1313 loop:
1414 %inc = phi i32 [0, %entry], [%inc.i, %loop]
15 %ptr = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %inc
16 store i32 %inc, i32* %ptr
15 %ptr = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %inc
16 store i32 %inc, i32 addrspace(5)* %ptr
1717 %inc.i = add i32 %inc, 1
1818 %cnd = icmp uge i32 %inc.i, 16
1919 br i1 %cnd, label %done, label %loop
2020
2121 done:
22 %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 0
23 %tmp1 = load i32, i32* %tmp0
22 %tmp0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 0
23 %tmp1 = load i32, i32 addrspace(5)* %tmp0
2424 store i32 %tmp1, i32 addrspace(1)* %out
2525 ret void
2626 }
44 ; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
55 define amdgpu_kernel void @load_i8_sext_private(i32 addrspace(1)* %out) {
66 entry:
7 %tmp0 = alloca i8
8 %tmp1 = load i8, i8* %tmp0
7 %tmp0 = alloca i8, addrspace(5)
8 %tmp1 = load i8, i8 addrspace(5)* %tmp0
99 %tmp2 = sext i8 %tmp1 to i32
1010 store i32 %tmp2, i32 addrspace(1)* %out
1111 ret void
1515 ; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
1616 define amdgpu_kernel void @load_i8_zext_private(i32 addrspace(1)* %out) {
1717 entry:
18 %tmp0 = alloca i8
19 %tmp1 = load i8, i8* %tmp0
18 %tmp0 = alloca i8, addrspace(5)
19 %tmp1 = load i8, i8 addrspace(5)* %tmp0
2020 %tmp2 = zext i8 %tmp1 to i32
2121 store i32 %tmp2, i32 addrspace(1)* %out
2222 ret void
2626 ; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
2727 define amdgpu_kernel void @load_i16_sext_private(i32 addrspace(1)* %out) {
2828 entry:
29 %tmp0 = alloca i16
30 %tmp1 = load i16, i16* %tmp0
29 %tmp0 = alloca i16, addrspace(5)
30 %tmp1 = load i16, i16 addrspace(5)* %tmp0
3131 %tmp2 = sext i16 %tmp1 to i32
3232 store i32 %tmp2, i32 addrspace(1)* %out
3333 ret void
3737 ; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
3838 define amdgpu_kernel void @load_i16_zext_private(i32 addrspace(1)* %out) {
3939 entry:
40 %tmp0 = alloca i16
41 %tmp1 = load volatile i16, i16* %tmp0
40 %tmp0 = alloca i16, addrspace(5)
41 %tmp1 = load volatile i16, i16 addrspace(5)* %tmp0
4242 %tmp2 = zext i16 %tmp1 to i32
4343 store i32 %tmp2, i32 addrspace(1)* %out
4444 ret void
1818 ; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
1919 ; CHECK: flat_store_dword v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}, v[[DATA]]
2020 define amdgpu_kernel void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
21 %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
22 store volatile i32 %x, i32 addrspace(4)* %fptr, align 4
21 %fptr = addrspacecast i32 addrspace(1)* %gptr to i32*
22 store volatile i32 %x, i32* %fptr, align 4
2323 ret void
2424 }
2525
2626 ; CHECK-LABEL: {{^}}store_flat_i64:
2727 ; CHECK: flat_store_dwordx2
2828 define amdgpu_kernel void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
29 %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
30 store volatile i64 %x, i64 addrspace(4)* %fptr, align 8
29 %fptr = addrspacecast i64 addrspace(1)* %gptr to i64*
30 store volatile i64 %x, i64* %fptr, align 8
3131 ret void
3232 }
3333
3434 ; CHECK-LABEL: {{^}}store_flat_v4i32:
3535 ; CHECK: flat_store_dwordx4
3636 define amdgpu_kernel void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
37 %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
38 store volatile <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16
37 %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32>*
38 store volatile <4 x i32> %x, <4 x i32>* %fptr, align 16
3939 ret void
4040 }
4141
4242 ; CHECK-LABEL: {{^}}store_flat_trunc_i16:
4343 ; CHECK: flat_store_short
4444 define amdgpu_kernel void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
45 %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
45 %fptr = addrspacecast i16 addrspace(1)* %gptr to i16*
4646 %y = trunc i32 %x to i16
47 store volatile i16 %y, i16 addrspace(4)* %fptr, align 2
47 store volatile i16 %y, i16* %fptr, align 2
4848 ret void
4949 }
5050
5151 ; CHECK-LABEL: {{^}}store_flat_trunc_i8:
5252 ; CHECK: flat_store_byte
5353 define amdgpu_kernel void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
54 %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
54 %fptr = addrspacecast i8 addrspace(1)* %gptr to i8*
5555 %y = trunc i32 %x to i8
56 store volatile i8 %y, i8 addrspace(4)* %fptr, align 2
56 store volatile i8 %y, i8* %fptr, align 2
5757 ret void
5858 }
5959
6262 ; CHECK-LABEL: load_flat_i32:
6363 ; CHECK: flat_load_dword
6464 define amdgpu_kernel void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
65 %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
66 %fload = load volatile i32, i32 addrspace(4)* %fptr, align 4
65 %fptr = addrspacecast i32 addrspace(1)* %gptr to i32*
66 %fload = load volatile i32, i32* %fptr, align 4
6767 store i32 %fload, i32 addrspace(1)* %out, align 4
6868 ret void
6969 }
7171 ; CHECK-LABEL: load_flat_i64:
7272 ; CHECK: flat_load_dwordx2
7373 define amdgpu_kernel void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
74 %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
75 %fload = load volatile i64, i64 addrspace(4)* %fptr, align 8
74 %fptr = addrspacecast i64 addrspace(1)* %gptr to i64*
75 %fload = load volatile i64, i64* %fptr, align 8
7676 store i64 %fload, i64 addrspace(1)* %out, align 8
7777 ret void
7878 }
8080 ; CHECK-LABEL: load_flat_v4i32:
8181 ; CHECK: flat_load_dwordx4
8282 define amdgpu_kernel void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
83 %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
84 %fload = load volatile <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 32
83 %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32>*
84 %fload = load volatile <4 x i32>, <4 x i32>* %fptr, align 32
8585 store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
8686 ret void
8787 }
8989 ; CHECK-LABEL: sextload_flat_i8:
9090 ; CHECK: flat_load_sbyte
9191 define amdgpu_kernel void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
92 %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
93 %fload = load volatile i8, i8 addrspace(4)* %fptr, align 4
92 %fptr = addrspacecast i8 addrspace(1)* %gptr to i8*
93 %fload = load volatile i8, i8* %fptr, align 4
9494 %ext = sext i8 %fload to i32
9595 store i32 %ext, i32 addrspace(1)* %out, align 4
9696 ret void
9999 ; CHECK-LABEL: zextload_flat_i8:
100100 ; CHECK: flat_load_ubyte
101101 define amdgpu_kernel void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
102 %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
103 %fload = load volatile i8, i8 addrspace(4)* %fptr, align 4
102 %fptr = addrspacecast i8 addrspace(1)* %gptr to i8*
103 %fload = load volatile i8, i8* %fptr, align 4
104104 %ext = zext i8 %fload to i32
105105 store i32 %ext, i32 addrspace(1)* %out, align 4
106106 ret void
109109 ; CHECK-LABEL: sextload_flat_i16:
110110 ; CHECK: flat_load_sshort
111111 define amdgpu_kernel void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
112 %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
113 %fload = load volatile i16, i16 addrspace(4)* %fptr, align 4
112 %fptr = addrspacecast i16 addrspace(1)* %gptr to i16*
113 %fload = load volatile i16, i16* %fptr, align 4
114114 %ext = sext i16 %fload to i32
115115 store i32 %ext, i32 addrspace(1)* %out, align 4
116116 ret void
119119 ; CHECK-LABEL: zextload_flat_i16:
120120 ; CHECK: flat_load_ushort
121121 define amdgpu_kernel void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
122 %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
123 %fload = load volatile i16, i16 addrspace(4)* %fptr, align 4
122 %fptr = addrspacecast i16 addrspace(1)* %gptr to i16*
123 %fload = load volatile i16, i16* %fptr, align 4
124124 %ext = zext i16 %fload to i32
125125 store i32 %ext, i32 addrspace(1)* %out, align 4
126126 ret void
132132 ; CHECK: flat_load_ubyte
133133 ; CHECK: flat_load_ubyte
134134 define amdgpu_kernel void @flat_scratch_unaligned_load() {
135 %scratch = alloca i32
136 %fptr = addrspacecast i32* %scratch to i32 addrspace(4)*
137 %ld = load volatile i32, i32 addrspace(4)* %fptr, align 1
135 %scratch = alloca i32, addrspace(5)
136 %fptr = addrspacecast i32 addrspace(5)* %scratch to i32*
137 %ld = load volatile i32, i32* %fptr, align 1
138138 ret void
139139 }
140140
144144 ; CHECK: flat_store_byte
145145 ; CHECK: flat_store_byte
146146 define amdgpu_kernel void @flat_scratch_unaligned_store() {
147 %scratch = alloca i32
148 %fptr = addrspacecast i32* %scratch to i32 addrspace(4)*
149 store volatile i32 0, i32 addrspace(4)* %fptr, align 1
147 %scratch = alloca i32, addrspace(5)
148 %fptr = addrspacecast i32 addrspace(5)* %scratch to i32*
149 store volatile i32 0, i32* %fptr, align 1
150150 ret void
151151 }
152152
155155 ; HSA: flat_load_dword
156156 ; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr
157157 define amdgpu_kernel void @flat_scratch_multidword_load() {
158 %scratch = alloca <2 x i32>
159 %fptr = addrspacecast <2 x i32>* %scratch to <2 x i32> addrspace(4)*
160 %ld = load volatile <2 x i32>, <2 x i32> addrspace(4)* %fptr
158 %scratch = alloca <2 x i32>, addrspace(5)
159 %fptr = addrspacecast <2 x i32> addrspace(5)* %scratch to <2 x i32>*
160 %ld = load volatile <2 x i32>, <2 x i32>* %fptr
161161 ret void
162162 }
163163
166166 ; HSA: flat_store_dword
167167 ; FIXME: These tests are broken for os = mesa3d, because it doesn't initialize flat_scr
168168 define amdgpu_kernel void @flat_scratch_multidword_store() {
169 %scratch = alloca <2 x i32>
170 %fptr = addrspacecast <2 x i32>* %scratch to <2 x i32> addrspace(4)*
171 store volatile <2 x i32> zeroinitializer, <2 x i32> addrspace(4)* %fptr
169 %scratch = alloca <2 x i32>, addrspace(5)
170 %fptr = addrspacecast <2 x i32> addrspace(5)* %scratch to <2 x i32>*
171 store volatile <2 x i32> zeroinitializer, <2 x i32>* %fptr
172172 ret void
173173 }
174174
175175 ; CHECK-LABEL: {{^}}store_flat_i8_max_offset:
176176 ; CIVI: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
177177 ; GFX9: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:4095{{$}}
178 define amdgpu_kernel void @store_flat_i8_max_offset(i8 addrspace(4)* %fptr, i8 %x) #0 {
179 %fptr.offset = getelementptr inbounds i8, i8 addrspace(4)* %fptr, i64 4095
180 store volatile i8 %x, i8 addrspace(4)* %fptr.offset
178 define amdgpu_kernel void @store_flat_i8_max_offset(i8* %fptr, i8 %x) #0 {
179 %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4095
180 store volatile i8 %x, i8* %fptr.offset
181181 ret void
182182 }
183183
184184 ; CHECK-LABEL: {{^}}store_flat_i8_max_offset_p1:
185185 ; CHECK: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
186 define amdgpu_kernel void @store_flat_i8_max_offset_p1(i8 addrspace(4)* %fptr, i8 %x) #0 {
187 %fptr.offset = getelementptr inbounds i8, i8 addrspace(4)* %fptr, i64 4096
188 store volatile i8 %x, i8 addrspace(4)* %fptr.offset
186 define amdgpu_kernel void @store_flat_i8_max_offset_p1(i8* %fptr, i8 %x) #0 {
187 %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4096
188 store volatile i8 %x, i8* %fptr.offset
189189 ret void
190190 }
191191
192192 ; CHECK-LABEL: {{^}}store_flat_i8_neg_offset:
193193 ; CHECK: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
194 define amdgpu_kernel void @store_flat_i8_neg_offset(i8 addrspace(4)* %fptr, i8 %x) #0 {
195 %fptr.offset = getelementptr inbounds i8, i8 addrspace(4)* %fptr, i64 -2
196 store volatile i8 %x, i8 addrspace(4)* %fptr.offset
194 define amdgpu_kernel void @store_flat_i8_neg_offset(i8* %fptr, i8 %x) #0 {
195 %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2
196 store volatile i8 %x, i8* %fptr.offset
197197 ret void
198198 }
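
The three store tests above pin down the boundary of the GFX9 immediate offset on flat stores: 4095 folds into the instruction, while 4096 and negative offsets do not. A compact sketch covering both sides of the boundary (hypothetical function name; per the checks above, CIVI never folds the offset):

; offset 4095 folds on GFX9 (offset:4095); 4096 and negative offsets
; fall back to plain address arithmetic on all targets.
define amdgpu_kernel void @flat_offset_fold_sketch(i8* %p, i8 %x) {
  %in.range = getelementptr inbounds i8, i8* %p, i64 4095
  store volatile i8 %x, i8* %in.range
  %past = getelementptr inbounds i8, i8* %p, i64 4096
  store volatile i8 %x, i8* %past
  ret void
}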
199199
200200 ; CHECK-LABEL: {{^}}load_flat_i8_max_offset:
201201 ; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
202202 ; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4095{{$}}
203 define amdgpu_kernel void @load_flat_i8_max_offset(i8 addrspace(4)* %fptr) #0 {
204 %fptr.offset = getelementptr inbounds i8, i8 addrspace(4)* %fptr, i64 4095
205 %val = load volatile i8, i8 addrspace(4)* %fptr.offset
203 define amdgpu_kernel void @load_flat_i8_max_offset(i8* %fptr) #0 {
204 %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4095
205 %val = load volatile i8, i8* %fptr.offset
206206 ret void
207207 }
208208
209209 ; CHECK-LABEL: {{^}}load_flat_i8_max_offset_p1:
210210 ; CHECK: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
211 define amdgpu_kernel void @load_flat_i8_max_offset_p1(i8 addrspace(4)* %fptr) #0 {
212 %fptr.offset = getelementptr inbounds i8, i8 addrspace(4)* %fptr, i64 4096
213 %val = load volatile i8, i8 addrspace(4)* %fptr.offset
211 define amdgpu_kernel void @load_flat_i8_max_offset_p1(i8* %fptr) #0 {
212 %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 4096
213 %val = load volatile i8, i8* %fptr.offset
214214 ret void
215215 }
216216
217217 ; CHECK-LABEL: {{^}}load_flat_i8_neg_offset:
218218 ; CHECK: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
219 define amdgpu_kernel void @load_flat_i8_neg_offset(i8 addrspace(4)* %fptr) #0 {
220 %fptr.offset = getelementptr inbounds i8, i8 addrspace(4)* %fptr, i64 -2
221 %val = load volatile i8, i8 addrspace(4)* %fptr.offset
219 define amdgpu_kernel void @load_flat_i8_neg_offset(i8* %fptr) #0 {
220 %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2
221 %val = load volatile i8, i8* %fptr.offset
222222 ret void
223223 }
224224
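Throughout this file, flat pointers are now the unqualified default (address space 0), so casts from global (1) and private (5) produce plain i32*/i8* values that still select to flat_load/flat_store. A minimal sketch of both cast directions in one kernel (hypothetical name, assuming the flat = 0, private = 5 numbering used by these tests):

; global -> flat and private -> flat casts both yield addrspace(0)
; pointers; loads and stores through them select to flat instructions.
define amdgpu_kernel void @flat_cast_sketch(i32 addrspace(1)* %g) {
entry:
  %scratch = alloca i32, addrspace(5)
  %gf = addrspacecast i32 addrspace(1)* %g to i32*
  %pf = addrspacecast i32 addrspace(5)* %scratch to i32*
  %v = load volatile i32, i32* %gf
  store volatile i32 %v, i32* %pf
  ret void
}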
3737 ; NOHSA-NOADDR64: flat_store_dword
3838 define amdgpu_kernel void @test_addr64(i32 addrspace(1)* %out) {
3939 entry:
40 %out.addr = alloca i32 addrspace(1)*, align 4
40 %out.addr = alloca i32 addrspace(1)*, align 4, addrspace(5)
4141
42 store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 4
43 %ld0 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 4
42 store i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(5)* %out.addr, align 4
43 %ld0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %out.addr, align 4
4444
4545 %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %ld0, i32 0
4646 store i32 1, i32 addrspace(1)* %arrayidx, align 4
4747
48 %ld1 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 4
48 %ld1 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %out.addr, align 4
4949 %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %ld1, i32 1
5050 store i32 2, i32 addrspace(1)* %arrayidx1, align 4
5151
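The addr64 test shows the two-level qualification a spilled pointer now needs: the stack slot itself is private (addrspace(5)) while the value it holds remains a global (addrspace(1)) pointer. A stripped-down sketch of that shape (hypothetical function name):

; an addrspace(1) pointer stored in an addrspace(5) stack slot
define amdgpu_kernel void @ptr_slot_sketch(i32 addrspace(1)* %out) {
entry:
  %slot = alloca i32 addrspace(1)*, align 4, addrspace(5)
  store i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(5)* %slot, align 4
  %p = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(5)* %slot, align 4
  store i32 1, i32 addrspace(1)* %p, align 4
  ret void
}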
44 ; GCN-LABEL: {{^}}atomic_add_i32_offset:
55 ; CIVI: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
66 ; GFX9: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
7 define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(4)* %out, i32 %in) {
8 entry:
9 %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
10 %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
7 define amdgpu_kernel void @atomic_add_i32_offset(i32* %out, i32 %in) {
8 entry:
9 %gep = getelementptr i32, i32* %out, i32 4
10 %val = atomicrmw volatile add i32* %gep, i32 %in seq_cst
1111 ret void
1212 }
1313
1414 ; GCN-LABEL: {{^}}atomic_add_i32_max_offset:
1515 ; CIVI: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
1616 ; GFX9: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset:4092{{$}}
17 define amdgpu_kernel void @atomic_add_i32_max_offset(i32 addrspace(4)* %out, i32 %in) {
18 entry:
19 %gep = getelementptr i32, i32 addrspace(4)* %out, i32 1023
20 %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
17 define amdgpu_kernel void @atomic_add_i32_max_offset(i32* %out, i32 %in) {
18 entry:
19 %gep = getelementptr i32, i32* %out, i32 1023
20 %val = atomicrmw volatile add i32* %gep, i32 %in seq_cst
2121 ret void
2222 }
2323
2424 ; GCN-LABEL: {{^}}atomic_add_i32_max_offset_p1:
2525 ; GCN: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
26 define amdgpu_kernel void @atomic_add_i32_max_offset_p1(i32 addrspace(4)* %out, i32 %in) {
27 entry:
28 %gep = getelementptr i32, i32 addrspace(4)* %out, i32 1024
29 %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
26 define amdgpu_kernel void @atomic_add_i32_max_offset_p1(i32* %out, i32 %in) {
27 entry:
28 %gep = getelementptr i32, i32* %out, i32 1024
29 %val = atomicrmw volatile add i32* %gep, i32 %in seq_cst
3030 ret void
3131 }
3232
3434 ; CIVI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
3535 ; GFX9: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
3636 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
37 define amdgpu_kernel void @atomic_add_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
38 entry:
39 %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
40 %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
41 store i32 %val, i32 addrspace(4)* %out2
37 define amdgpu_kernel void @atomic_add_i32_ret_offset(i32* %out, i32* %out2, i32 %in) {
38 entry:
39 %gep = getelementptr i32, i32* %out, i32 4
40 %val = atomicrmw volatile add i32* %gep, i32 %in seq_cst
41 store i32 %val, i32* %out2
4242 ret void
4343 }
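
The _ret variants in this file differ from the plain ones only in consuming the pre-op value: per the checks, the returning form selects with glc and the result is stored out with flat_store_dword. A minimal sketch of the returning shape on a flat (now unqualified) pointer, hypothetical name:

; atomicrmw whose result is used -> flat_atomic_add ... glc, followed
; by a flat_store_dword of the old value; unused results drop the glc.
define amdgpu_kernel void @atomic_add_ret_sketch(i32* %out, i32* %out2, i32 %in) {
entry:
  %old = atomicrmw volatile add i32* %out, i32 %in seq_cst
  store i32 %old, i32* %out2
  ret void
}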
4444
4545 ; GCN-LABEL: {{^}}atomic_add_i32_addr64_offset:
4646 ; CIVI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
4747 ; GFX9: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
48 define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
49 entry:
50 %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
51 %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
52 %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
48 define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32* %out, i32 %in, i64 %index) {
49 entry:
50 %ptr = getelementptr i32, i32* %out, i64 %index
51 %gep = getelementptr i32, i32* %ptr, i32 4
52 %val = atomicrmw volatile add i32* %gep, i32 %in seq_cst
5353 ret void
5454 }
5555
5757 ; CIVI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
5858 ; GFX9: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
5959 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
60 define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
61 entry:
62 %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
63 %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
64 %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
65 store i32 %val, i32 addrspace(4)* %out2
60 define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) {
61 entry:
62 %ptr = getelementptr i32, i32* %out, i64 %index
63 %gep = getelementptr i32, i32* %ptr, i32 4
64 %val = atomicrmw volatile add i32* %gep, i32 %in seq_cst
65 store i32 %val, i32* %out2
6666 ret void
6767 }
6868
6969 ; GCN-LABEL: {{^}}atomic_add_i32:
7070 ; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
71 define amdgpu_kernel void @atomic_add_i32(i32 addrspace(4)* %out, i32 %in) {
72 entry:
73 %val = atomicrmw volatile add i32 addrspace(4)* %out, i32 %in seq_cst
71 define amdgpu_kernel void @atomic_add_i32(i32* %out, i32 %in) {
72 entry:
73 %val = atomicrmw volatile add i32* %out, i32 %in seq_cst
7474 ret void
7575 }
7676
7777 ; GCN-LABEL: {{^}}atomic_add_i32_ret:
7878 ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
7979 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
80 define amdgpu_kernel void @atomic_add_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
81 entry:
82 %val = atomicrmw volatile add i32 addrspace(4)* %out, i32 %in seq_cst
83 store i32 %val, i32 addrspace(4)* %out2
80 define amdgpu_kernel void @atomic_add_i32_ret(i32* %out, i32* %out2, i32 %in) {
81 entry:
82 %val = atomicrmw volatile add i32* %out, i32 %in seq_cst
83 store i32 %val, i32* %out2
8484 ret void
8585 }
8686
8787 ; GCN-LABEL: {{^}}atomic_add_i32_addr64:
8888 ; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
89 define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
90 entry:
91 %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
92 %val = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %in seq_cst
89 define amdgpu_kernel void @atomic_add_i32_addr64(i32* %out, i32 %in, i64 %index) {
90 entry:
91 %ptr = getelementptr i32, i32* %out, i64 %index
92 %val = atomicrmw volatile add i32* %ptr, i32 %in seq_cst
9393 ret void
9494 }
9595
9696 ; GCN-LABEL: {{^}}atomic_add_i32_ret_addr64:
9797 ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
9898 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
99 define amdgpu_kernel void @atomic_add_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
100 entry:
101 %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
102 %val = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %in seq_cst
103 store i32 %val, i32 addrspace(4)* %out2
99 define amdgpu_kernel void @atomic_add_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) {
100 entry:
101 %ptr = getelementptr i32, i32* %out, i64 %index
102 %val = atomicrmw volatile add i32* %ptr, i32 %in seq_cst
103 store i32 %val, i32* %out2
104104 ret void
105105 }
106106
107107 ; GCN-LABEL: {{^}}atomic_and_i32_offset:
108108 ; CIVI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
109109 ; GFX9: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
110 define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(4)* %out, i32 %in) {
111 entry:
112 %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
113 %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst
110 define amdgpu_kernel void @atomic_and_i32_offset(i32* %out, i32 %in) {
111 entry:
112 %gep = getelementptr i32, i32* %out, i32 4
113 %val = atomicrmw volatile and i32* %gep, i32 %in seq_cst
114114 ret void
115115 }
116116
118118 ; CIVI: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
119119 ; GFX9: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
120120 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
121 define amdgpu_kernel void @atomic_and_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
122 entry:
123 %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
124 %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst
125 store i32 %val, i32 addrspace(4)* %out2
121 define amdgpu_kernel void @atomic_and_i32_ret_offset(i32* %out, i32* %out2, i32 %in) {
122 entry:
123 %gep = getelementptr i32, i32* %out, i32 4
124 %val = atomicrmw volatile and i32* %gep, i32 %in seq_cst
125 store i32 %val, i32* %out2
126126 ret void
127127 }
128128
129129 ; GCN-LABEL: {{^}}atomic_and_i32_addr64_offset:
130130 ; CIVI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
131131 ; GFX9: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
132 define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
133 entry:
134 %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
135 %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
136 %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst
132 define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32* %out, i32 %in, i64 %index) {
133 entry:
134 %ptr = getelementptr i32, i32* %out, i64 %index
135 %gep = getelementptr i32, i32* %ptr, i32 4
136 %val = atomicrmw volatile and i32* %gep, i32 %in seq_cst
137137 ret void
138138 }
139139
141141 ; CIVI: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
142142 ; GFX9: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
143143 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
144 define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
145 entry:
146 %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
147 %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
148 %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst
149 store i32 %val, i32 addrspace(4)* %out2
144 define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) {
145 entry:
146 %ptr = getelementptr i32, i32* %out, i64 %index
147 %gep = getelementptr i32, i32* %ptr, i32 4
148 %val = atomicrmw volatile and i32* %gep, i32 %in seq_cst
149 store i32 %val, i32* %out2
150150 ret void
151151 }
152152
153153 ; GCN-LABEL: {{^}}atomic_and_i32:
154154 ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
155 define amdgpu_kernel void @atomic_and_i32(i32 addrspace(4)* %out, i32 %in) {
156 entry:
157 %val = atomicrmw volatile and i32 addrspace(4)* %out, i32 %in seq_cst
155 define amdgpu_kernel void @atomic_and_i32(i32* %out, i32 %in) {
156 entry:
157 %val = atomicrmw volatile and i32* %out, i32 %in seq_cst
158158 ret void
159159 }
160160
161161 ; GCN-LABEL: {{^}}atomic_and_i32_ret:
162162 ; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
163163 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
164 define amdgpu_kernel void @atomic_and_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
165 entry:
166 %val = atomicrmw volatile and i32 addrspace(4)* %out, i32 %in seq_cst
167 store i32 %val, i32 addrspace(4)* %out2
164 define amdgpu_kernel void @atomic_and_i32_ret(i32* %out, i32* %out2, i32 %in) {
165 entry:
166 %val = atomicrmw volatile and i32* %out, i32 %in seq_cst
167 store i32 %val, i32* %out2
168168 ret void
169169 }
170170
171171 ; GCN-LABEL: {{^}}atomic_and_i32_addr64:
172172 ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
173 define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
174 entry:
175 %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
176 %val = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %in seq_cst
173 define amdgpu_kernel void @atomic_and_i32_addr64(i32* %out, i32 %in, i64 %index) {
174 entry:
175 %ptr = getelementptr i32, i32* %out, i64 %index
176 %val = atomicrmw volatile and i32* %ptr, i32 %in seq_cst
177177 ret void
178178 }
179179
180180 ; GCN-LABEL: {{^}}atomic_and_i32_ret_addr64:
181181 ; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
182182 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
183 define amdgpu_kernel void @atomic_and_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
184 entry:
185 %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
186 %val = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %in seq_cst
187 store i32 %val, i32 addrspace(4)* %out2
183 define amdgpu_kernel void @atomic_and_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) {
184 entry:
185 %ptr = getelementptr i32, i32* %out, i64 %index
186 %val = atomicrmw volatile and i32* %ptr, i32 %in seq_cst
187 store i32 %val, i32* %out2
188188 ret void
189189 }
190190
191191 ; GCN-LABEL: {{^}}atomic_sub_i32_offset:
192192 ; CIVI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
193193 ; GFX9: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
194 define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(4)* %out, i32 %in) {
195 entry:
196 %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
197 %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst
194 define amdgpu_kernel void @atomic_sub_i32_offset(i32* %out, i32 %in) {
195 entry:
196 %gep = getelementptr i32, i32* %out, i32 4
197 %val = atomicrmw volatile sub i32* %gep, i32 %in seq_cst
198198 ret void
199199 }
200200
202202 ; CIVI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
203203 ; GFX9: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
204204 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
205 define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
206 entry:
207 %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
208 %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst
209 store i32 %val, i32 addrspace(4)* %out2
205 define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32* %out, i32* %out2, i32 %in) {
206 entry:
207 %gep = getelementptr i32, i32* %out, i32 4
208 %val = atomicrmw volatile sub i32* %gep, i32 %in seq_cst
209 store i32 %val, i32* %out2
210210 ret void
211211 }
212212
213213 ; GCN-LABEL: {{^}}atomic_sub_i32_addr64_offset:
214214 ; CIVI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
215215 ; GFX9: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
216 define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
217 entry:
218 %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
219 %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
220 %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst
216 define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32* %out, i32 %in, i64 %index) {
217 entry:
218 %ptr = getelementptr i32, i32* %out, i64 %index
219 %gep = getelementptr i32, i32* %ptr, i32 4
220 %val = atomicrmw volatile sub i32* %gep, i32 %in seq_cst
221221 ret void
222222 }
223223
225225 ; CIVI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
226226 ; GFX9: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
227227 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
228 define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
229 entry:
230 %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
231 %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
232 %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst
233 store i32 %val, i32 addrspace(4)* %out2
228 define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) {
229 entry:
230 %ptr = getelementptr i32, i32* %out, i64 %index
231 %gep = getelementptr i32, i32* %ptr, i32 4
232 %val = atomicrmw volatile sub i32* %gep, i32 %in seq_cst
233 store i32 %val, i32* %out2
234234 ret void
235235 }
236236
237237 ; GCN-LABEL: {{^}}atomic_sub_i32:
238238 ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
239 define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(4)* %out, i32 %in) {
240 entry:
241 %val = atomicrmw volatile sub i32 addrspace(4)* %out, i32 %in seq_cst
239 define amdgpu_kernel void @atomic_sub_i32(i32* %out, i32 %in) {
240 entry:
241 %val = atomicrmw volatile sub i32* %out, i32 %in seq_cst
242242 ret void
243243 }
244244
245245 ; GCN-LABEL: {{^}}atomic_sub_i32_ret:
246246 ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
247247 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
248 define amdgpu_kernel void @atomic_sub_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
249 entry:
250 %val = atomicrmw volatile sub i32 addrspace(4)* %out, i32 %in seq_cst
251 store i32 %val, i32 addrspace(4)* %out2
248 define amdgpu_kernel void @atomic_sub_i32_ret(i32* %out, i32* %out2, i32 %in) {
249 entry:
250 %val = atomicrmw volatile sub i32* %out, i32 %in seq_cst
251 store i32 %val, i32* %out2
252252 ret void
253253 }
254254
255255 ; GCN-LABEL: {{^}}atomic_sub_i32_addr64:
256256 ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
257 define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
258 entry:
259 %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
260 %val = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %in seq_cst
257 define amdgpu_kernel void @atomic_sub_i32_addr64(i32* %out, i32 %in, i64 %index) {
258 entry:
259 %ptr = getelementptr i32, i32* %out, i64 %index
260 %val = atomicrmw volatile sub i32* %ptr, i32 %in seq_cst
261261 ret void
262262 }
263263
264264 ; GCN-LABEL: {{^}}atomic_sub_i32_ret_addr64:
265265 ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
266266 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
267 define amdgpu_kernel void @atomic_sub_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
268 entry:
269 %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
270 %val = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %in seq_cst
271 store i32 %val, i32 addrspace(4)* %out2
267 define amdgpu_kernel void @atomic_sub_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) {
268 entry:
269 %ptr = getelementptr i32, i32* %out, i64 %index
270 %val = atomicrmw volatile sub i32* %ptr, i32 %in seq_cst
271 store i32 %val, i32* %out2
272272 ret void
273273 }
274274
275275 ; GCN-LABEL: {{^}}atomic_max_i32_offset:
276276 ; CIVI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
277277 ; GFX9: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
278 define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(4)* %out, i32 %in) {
279 entry:
280 %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
281 %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst
278 define amdgpu_kernel void @atomic_max_i32_offset(i32* %out, i32 %in) {
279 entry:
280 %gep = getelementptr i32, i32* %out, i32 4
281 %val = atomicrmw volatile max i32* %gep, i32 %in seq_cst
282282 ret void
283283 }
284284
286286 ; CIVI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
287287 ; GFX9: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
288288 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
289 define amdgpu_kernel void @atomic_max_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
290 entry:
291 %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
292 %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst
293 store i32 %val, i32 addrspace(4)* %out2
289 define amdgpu_kernel void @atomic_max_i32_ret_offset(i32* %out, i32* %out2, i32 %in) {
290 entry:
291 %gep = getelementptr i32, i32* %out, i32 4
292 %val = atomicrmw volatile max i32* %gep, i32 %in seq_cst
293 store i32 %val, i32* %out2
294294 ret void
295295 }
296296
297297 ; GCN-LABEL: {{^}}atomic_max_i32_addr64_offset:
298298 ; CIVI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
299299 ; GFX9: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
300 define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
301 entry:
302 %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
303 %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
304 %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst
300 define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32* %out, i32 %in, i64 %index) {
301 entry:
302 %ptr = getelementptr i32, i32* %out, i64 %index
303 %gep = getelementptr i32, i32* %ptr, i32 4
304 %val = atomicrmw volatile max i32* %gep, i32 %in seq_cst
305305 ret void
306306 }
307307
309309 ; CIVI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
310310 ; GFX9: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
311311 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
312 define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
313 entry:
314 %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
315 %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
316 %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst
317 store i32 %val, i32 addrspace(4)* %out2
312 define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) {
313 entry:
314 %ptr = getelementptr i32, i32* %out, i64 %index
315 %gep = getelementptr i32, i32* %ptr, i32 4
316 %val = atomicrmw volatile max i32* %gep, i32 %in seq_cst
317 store i32 %val, i32* %out2
318318 ret void
319319 }
320320
321321 ; GCN-LABEL: {{^}}atomic_max_i32:
322322 ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
323 define amdgpu_kernel void @atomic_max_i32(i32 addrspace(4)* %out, i32 %in) {
324 entry:
325 %val = atomicrmw volatile max i32 addrspace(4)* %out, i32 %in seq_cst
323 define amdgpu_kernel void @atomic_max_i32(i32* %out, i32 %in) {
324 entry:
325 %val = atomicrmw volatile max i32* %out, i32 %in seq_cst
326326 ret void
327327 }
328328
329329 ; GCN-LABEL: {{^}}atomic_max_i32_ret:
330330 ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
331331 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
332 define amdgpu_kernel void @atomic_max_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
333 entry:
334 %val = atomicrmw volatile max i32 addrspace(4)* %out, i32 %in seq_cst
335 store i32 %val, i32 addrspace(4)* %out2
332 define amdgpu_kernel void @atomic_max_i32_ret(i32* %out, i32* %out2, i32 %in) {
333 entry:
334 %val = atomicrmw volatile max i32* %out, i32 %in seq_cst
335 store i32 %val, i32* %out2
336336 ret void
337337 }
338338
339339 ; GCN-LABEL: {{^}}atomic_max_i32_addr64:
340340 ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
341 define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
342 entry:
343 %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
344 %val = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %in seq_cst
341 define amdgpu_kernel void @atomic_max_i32_addr64(i32* %out, i32 %in, i64 %index) {
342 entry:
343 %ptr = getelementptr i32, i32* %out, i64 %index
344 %val = atomicrmw volatile max i32* %ptr, i32 %in seq_cst
345345 ret void
346346 }
347347
348348 ; GCN-LABEL: {{^}}atomic_max_i32_ret_addr64:
349349 ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
350350 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
351 define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
352 entry:
353 %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
354 %val = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %in seq_cst
355 store i32 %val, i32 addrspace(4)* %out2
351 define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) {
352 entry:
353 %ptr = getelementptr i32, i32* %out, i64 %index
354 %val = atomicrmw volatile max i32* %ptr, i32 %in seq_cst
355 store i32 %val, i32* %out2
356356 ret void
357357 }
358358
359359 ; GCN-LABEL: {{^}}atomic_umax_i32_offset:
360360 ; CIVI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
361361 ; GFX9: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
362 define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(4)* %out, i32 %in) {
363 entry:
364 %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
365 %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst
362 define amdgpu_kernel void @atomic_umax_i32_offset(i32* %out, i32 %in) {
363 entry:
364 %gep = getelementptr i32, i32* %out, i32 4
365 %val = atomicrmw volatile umax i32* %gep, i32 %in seq_cst
366366 ret void
367367 }
368368
370370 ; CIVI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
371371 ; GFX9: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
372372 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
373 define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
374 entry:
375 %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
376 %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst
377 store i32 %val, i32 addrspace(4)* %out2
373 define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32* %out, i32* %out2, i32 %in) {
374 entry:
375 %gep = getelementptr i32, i32* %out, i32 4
376 %val = atomicrmw volatile umax i32* %gep, i32 %in seq_cst
377 store i32 %val, i32* %out2
378378 ret void
379379 }
380380
381381 ; GCN-LABEL: {{^}}atomic_umax_i32_addr64_offset:
382382 ; CIVI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
383383 ; GFX9: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
384 define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
385 entry:
386 %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
387 %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
388 %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst
384 define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32* %out, i32 %in, i64 %index) {
385 entry:
386 %ptr = getelementptr i32, i32* %out, i64 %index
387 %gep = getelementptr i32, i32* %ptr, i32 4
388 %val = atomicrmw volatile umax i32* %gep, i32 %in seq_cst
389389 ret void
390390 }
391391
393393 ; CIVI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
394394 ; GFX9: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
395395 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
396 define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
397 entry:
398 %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
399 %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
400 %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst
401 store i32 %val, i32 addrspace(4)* %out2
396 define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32* %out, i32* %out2, i32 %in, i64 %index) {
397 entry:
398 %ptr = getelementptr i32, i32* %out, i64 %index
399 %gep = getelementptr i32, i32* %ptr, i32 4
400 %val = atomicrmw volatile umax i32* %gep, i32 %in seq_cst
401 store i32 %val, i32* %out2
402402 ret void
403403 }
404404
405405 ; GCN-LABEL: {{^}}atomic_umax_i32:
406406 ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
407 define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(4)* %out, i32 %in) {
408 entry:
409 %val = atomicrmw volatile umax i32 addrspace(4)* %out, i32 %in seq_cst
407 define amdgpu_kernel void @atomic_umax_i32(i32* %out, i32 %in) {
408 entry:
409 %val = atomicrmw volatile umax i32* %out, i32 %in seq_cst
410410 ret void
411411 }
412412
413413 ; GCN-LABEL: {{^}}atomic_umax_i32_ret:
414414 ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
415415 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
416 define amdgpu_kernel void @atomic_umax_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
417 entry:
418 %val = atomicrmw volatile umax i32 addrspace(4)* %out, i32 %in seq_cst
419 store i32 %val, i32 addrspace(4)* %out2
416 define amdgpu_kernel void @atomic_umax_i32_ret(i32* %out, i32* %out2, i32 %in) {
417 entry:
418 %val = atomicrmw volatile umax i32* %out, i32 %in seq_cst
419 store i32 %val, i32* %out2
420420 ret void
421421 }
422422
423423 ; GCN-LABEL: {{^}}atomic_umax_i32_addr64:
424424 ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
425 define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
426 entry:
427 %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
428 %val = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %in seq_cst
425 define amdgpu_kernel void @atomic_umax_i32_addr64(i32* %out, i32 %in, i64 %index) {
426 entry:
427 %ptr = getelementptr i32, i32* %out, i64 %index
428 %val = atomicrmw volatile umax i32* %ptr, i32 %in seq_cst
429429 ret void
430430 }
431431
432432 ; GCN-LABEL: {{^}}atomic_umax_i32_ret_addr64:
433433 ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
434434 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
435 define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
436 entry:
437 %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
438 %val = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %in seq_cst
439 store i32 %val, i32 addrspace(4)* %out2
435 define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32* %out, i32* %out2, i32 %in, i64 %index) {
436 entry:
437 %ptr = getelementptr i32, i32* %out, i64 %index
438 %val = atomicrmw volatile umax i32* %ptr, i32 %in seq_cst
439 store i32 %val, i32* %out2
440440 ret void
441441 }
442442
443443 ; GCN-LABEL: {{^}}atomic_min_i32_offset:
444444 ; CIVI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
445445 ; GFX9: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
446 define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(4)* %out, i32 %in) {
447 entry:
448 %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
449 %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst
446 define amdgpu_kernel void @atomic_min_i32_offset(i32* %out, i32 %in) {
447 entry:
448 %gep = getelementptr i32, i32* %out, i32 4
449 %val = atomicrmw volatile min i32* %gep, i32 %in seq_cst
450450 ret void
451451 }
452452
454454 ; CIVI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
455455 ; GFX9: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
456456 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
457 define amdgpu_kernel void @atomic_min_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
458 entry:
459 %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
460 %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst
461 store i32 %val, i32 addrspace(4)* %out2
457 define amdgpu_kernel void @atomic_min_i32_ret_offset(i32* %out, i32* %out2, i32 %in) {
458 entry:
459 %gep = getelementptr i32, i32* %out, i32 4
460 %val = atomicrmw volatile min i32* %gep, i32 %in seq_cst
461 store i32 %val, i32* %out2
462462 ret void
463463 }
464464
465465 ; GCN-LABEL: {{^}}atomic_min_i32_addr64_offset:
466466 ; CIVI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
467467 ; GFX9: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16{{$}}
468 define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
469 entry:
470 %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
471 %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
472 %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst
468 define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32* %out, i32 %in, i64 %index) {
469 entry:
470 %ptr = getelementptr i32, i32* %out, i64 %index
471 %gep = getelementptr i32, i32* %ptr, i32 4
472 %val = atomicrmw volatile min i32* %gep, i32 %in seq_cst
473473 ret void
474474 }
475475
477477 ; CIVI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
478478 ; GFX9: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} offset:16 glc{{$}}
479479 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
480 define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32 addrspace(4)