llvm.org GIT mirror llvm / 0f9ec97
[AMDGPU] Switch scalarize global loads ON by default Differential revision: https://reviews.llvm.org/D34407 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@307026 91177308-0d34-0410-b5e6-96231b3b80d8 Alexander Timofeev 2 years ago
141 changed file(s) with 799 addition(s) and 569 deletion(s). Raw diff Collapse all Expand all
8484 static cl::opt<bool> ScalarizeGlobal(
8585 "amdgpu-scalarize-global-loads",
8686 cl::desc("Enable global load scalarization"),
87 cl::init(false),
87 cl::init(true),
8888 cl::Hidden);
8989
9090 // Option to run internalize pass.
44 ;FUNC-LABEL: {{^}}test1:
55 ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
66
7 ;SI: v_add_i32_e32 [[REG:v[0-9]+]], vcc, {{v[0-9]+, v[0-9]+}}
8 ;SI-NOT: [[REG]]
9 ;SI: buffer_store_dword [[REG]],
7 ;SI: s_add_i32 s[[REG:[0-9]+]], {{s[0-9]+, s[0-9]+}}
8 ;SI: v_mov_b32_e32 v[[REG]], s[[REG]]
9 ;SI: buffer_store_dword v[[REG]],
1010 define amdgpu_kernel void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
1111 %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
1212 %a = load i32, i32 addrspace(1)* %in
2020 ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
2121 ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
2222
23 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
24 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
23 ;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
24 ;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
2525
2626 define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
2727 %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
3838 ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
3939 ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
4040
41 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
42 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
43 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
44 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
41 ;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
42 ;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
43 ;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
44 ;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
4545
4646 define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
4747 %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
1818
1919 ; Check that the SGPR add operand is correctly moved to a VGPR.
2020 ; GCN-LABEL: {{^}}sgpr_operand:
21 ; GCN: v_add_i32
22 ; GCN: v_addc_u32
23 ; GCN: v_addc_u32
24 ; GCN: v_addc_u32
21 ; GCN: s_add_u32
22 ; GCN: s_addc_u32
23 ; GCN: s_addc_u32
24 ; GCN: s_addc_u32
2525 define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
2626 %foo = load i128, i128 addrspace(1)* %in, align 8
2727 %result = add i128 %foo, %a
3030 }
3131
3232 ; GCN-LABEL: {{^}}sgpr_operand_reversed:
33 ; GCN: v_add_i32
34 ; GCN: v_addc_u32
35 ; GCN: v_addc_u32
36 ; GCN: v_addc_u32
33 ; GCN: s_add_u32
34 ; GCN: s_addc_u32
35 ; GCN: s_addc_u32
36 ; GCN: s_addc_u32
3737 define amdgpu_kernel void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
3838 %foo = load i128, i128 addrspace(1)* %in, align 8
3939 %result = add i128 %a, %foo
1818
1919 ; Check that the SGPR add operand is correctly moved to a VGPR.
2020 ; SI-LABEL: {{^}}sgpr_operand:
21 ; SI: v_add_i32
22 ; SI: v_addc_u32
21 ; SI: s_add_u32
22 ; SI: s_addc_u32
2323 define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
2424 %foo = load i64, i64 addrspace(1)* %in, align 8
2525 %result = add i64 %foo, %a
3131 ; SGPR as other operand.
3232 ;
3333 ; SI-LABEL: {{^}}sgpr_operand_reversed:
34 ; SI: v_add_i32
35 ; SI: v_addc_u32
34 ; SI: s_add_u32
35 ; SI: s_addc_u32
3636 define amdgpu_kernel void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
3737 %foo = load i64, i64 addrspace(1)* %in, align 8
3838 %result = add i64 %a, %foo
11 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
22
33 ; FUNC-LABEL: {{^}}v_and_i64_br:
4 ; SI: v_and_b32
5 ; SI: v_and_b32
4 ; SI: s_and_b64
65 define amdgpu_kernel void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
76 entry:
87 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
77 ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
88 ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
99
10 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
11 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
10 ; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
11 ; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
1212
1313 define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
1414 %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
2525 ; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
2626 ; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
2727
28 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
29 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
30 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
31 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
28
29 ; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
30 ; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
31 ; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
32 ; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
3233
3334 define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
3435 %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
135136 ; FUNC-LABEL: {{^}}v_and_constant_i32
136137 ; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}}
137138 define amdgpu_kernel void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
138 %a = load i32, i32 addrspace(1)* %aptr, align 4
139 %tid = call i32 @llvm.r600.read.tidig.x() #0
140 %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
141 %a = load i32, i32 addrspace(1)* %gep, align 4
139142 %and = and i32 %a, 1234567
140143 store i32 %and, i32 addrspace(1)* %out, align 4
141144 ret void
144147 ; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32
145148 ; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}}
146149 define amdgpu_kernel void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
147 %a = load i32, i32 addrspace(1)* %aptr, align 4
150 %tid = call i32 @llvm.r600.read.tidig.x() #0
151 %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
152 %a = load i32, i32 addrspace(1)* %gep, align 4
148153 %and = and i32 %a, 64
149154 store i32 %and, i32 addrspace(1)* %out, align 4
150155 ret void
153158 ; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32
154159 ; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}}
155160 define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
156 %a = load i32, i32 addrspace(1)* %aptr, align 4
161 %tid = call i32 @llvm.r600.read.tidig.x() #0
162 %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
163 %a = load i32, i32 addrspace(1)* %gep, align 4
157164 %and = and i32 %a, -16
158165 store i32 %and, i32 addrspace(1)* %out, align 4
159166 ret void
238245 ; SI: v_and_b32
239246 ; SI: v_and_b32
240247 define amdgpu_kernel void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
241 %a = load i64, i64 addrspace(1)* %aptr, align 8
242 %b = load i64, i64 addrspace(1)* %bptr, align 8
248 %tid = call i32 @llvm.r600.read.tidig.x() #0
249 %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
250 %a = load i64, i64 addrspace(1)* %gep.a, align 8
251 %gep.b = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
252 %b = load i64, i64 addrspace(1)* %gep.b, align 8
243253 %and = and i64 %a, %b
244254 store i64 %and, i64 addrspace(1)* %out, align 8
245255 ret void
250260 ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, {{v[0-9]+}}
251261 ; SI: buffer_store_dwordx2
252262 define amdgpu_kernel void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
253 %a = load i64, i64 addrspace(1)* %aptr, align 8
263 %tid = call i32 @llvm.r600.read.tidig.x() #0
264 %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
265 %a = load i64, i64 addrspace(1)* %gep.a, align 8
254266 %and = and i64 %a, 1231231234567
255267 store i64 %and, i64 addrspace(1)* %out, align 8
256268 ret void
298310 }
299311
300312 ; FUNC-LABEL: {{^}}v_and_i64_32_bit_constant:
301 ; SI: buffer_load_dword [[VAL:v[0-9]+]]
313 ; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
302314 ; SI-NOT: and
303315 ; SI: v_and_b32_e32 {{v[0-9]+}}, 0x12d687, [[VAL]]
304316 ; SI-NOT: and
305317 ; SI: buffer_store_dwordx2
306318 define amdgpu_kernel void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
307 %a = load i64, i64 addrspace(1)* %aptr, align 8
319 %tid = call i32 @llvm.r600.read.tidig.x() #0
320 %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
321 %a = load i64, i64 addrspace(1)* %gep.a, align 8
308322 %and = and i64 %a, 1234567
309323 store i64 %and, i64 addrspace(1)* %out, align 8
310324 ret void
311325 }
312326
313327 ; FUNC-LABEL: {{^}}v_and_inline_imm_i64:
314 ; SI: buffer_load_dword v{{[0-9]+}}
328 ; SI: {{buffer|flat}}_load_dword v{{[0-9]+}}
315329 ; SI-NOT: and
316330 ; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}}
317331 ; SI-NOT: and
318332 ; SI: buffer_store_dwordx2
319333 define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
320 %a = load i64, i64 addrspace(1)* %aptr, align 8
334 %tid = call i32 @llvm.r600.read.tidig.x() #0
335 %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
336 %a = load i64, i64 addrspace(1)* %gep.a, align 8
321337 %and = and i64 %a, 64
322338 store i64 %and, i64 addrspace(1)* %out, align 8
323339 ret void
325341
326342 ; FIXME: Should be able to reduce load width
327343 ; FUNC-LABEL: {{^}}v_and_inline_neg_imm_i64:
328 ; SI: buffer_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
344 ; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
329345 ; SI-NOT: and
330346 ; SI: v_and_b32_e32 v[[VAL_LO]], -8, v[[VAL_LO]]
331347 ; SI-NOT: and
332348 ; SI: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
333349 define amdgpu_kernel void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
334 %a = load i64, i64 addrspace(1)* %aptr, align 8
350 %tid = call i32 @llvm.r600.read.tidig.x() #0
351 %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
352 %a = load i64, i64 addrspace(1)* %gep.a, align 8
335353 %and = and i64 %a, -8
336354 store i64 %and, i64 addrspace(1)* %out, align 8
337355 ret void
548566 store i64 %and, i64 addrspace(1)* %out, align 8
549567 ret void
550568 }
551
552569 attributes #0 = { nounwind readnone }
11 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
22
33 ; GCN-LABEL: {{^}}any_extend_vector_inreg_v16i8_to_v4i32:
4 ; GCN: {{buffer|flat}}_load_dwordx4
5 ; GCN-DAG: {{buffer|flat}}_load_dwordx4
6 ; GCN-DAG: {{buffer|flat}}_load_dword
4 ; GCN: s_load_dwordx4
5 ; GCN-DAG: s_load_dwordx4
6 ; GCN-DAG: s_load_dword
77
88 ; GCN: {{buffer|flat}}_store_byte
99 ; GCN: {{buffer|flat}}_store_byte
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
11 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
22 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
3
4 declare i32 @llvm.amdgcn.workitem.id.x() #1
35
46 declare i16 @llvm.bitreverse.i16(i16) #1
57 declare i32 @llvm.bitreverse.i32(i32) #1
4143 }
4244
4345 ; FUNC-LABEL: {{^}}v_brev_i32:
44 ; SI: buffer_load_dword [[VAL:v[0-9]+]],
46 ; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
4547 ; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
4648 ; SI: buffer_store_dword [[RESULT]],
4749 ; SI: s_endpgm
4850 define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
49 %val = load i32, i32 addrspace(1)* %valptr
51 %tid = call i32 @llvm.amdgcn.workitem.id.x()
52 %gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
53 %val = load i32, i32 addrspace(1)* %gep
5054 %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
5155 store i32 %brev, i32 addrspace(1)* %out
5256 ret void
6569 ; SI: v_bfrev_b32_e32
6670 ; SI: v_bfrev_b32_e32
6771 define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 {
68 %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr
72 %tid = call i32 @llvm.amdgcn.workitem.id.x()
73 %gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
74 %val = load <2 x i32>, <2 x i32> addrspace(1)* %gep
6975 %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
7076 store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
7177 ret void
8187 ; FUNC-LABEL: {{^}}v_brev_i64:
8288 ; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0
8389 define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
84 %val = load i64, i64 addrspace(1)* %valptr
90 %tid = call i32 @llvm.amdgcn.workitem.id.x()
91 %gep = getelementptr i64, i64 addrspace(1)* %valptr, i32 %tid
92 %val = load i64, i64 addrspace(1)* %gep
8593 %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
8694 store i64 %brev, i64 addrspace(1)* %out
8795 ret void
96104
97105 ; FUNC-LABEL: {{^}}v_brev_v2i64:
98106 define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 {
99 %val = load <2 x i64>, <2 x i64> addrspace(1)* %valptr
107 %tid = call i32 @llvm.amdgcn.workitem.id.x()
108 %gep = getelementptr <2 x i64> , <2 x i64> addrspace(1)* %valptr, i32 %tid
109 %val = load <2 x i64>, <2 x i64> addrspace(1)* %gep
100110 %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
101111 store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
102112 ret void
99 declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone
1010
1111 ; FUNC-LABEL: @test_bswap_i32
12 ; SI: buffer_load_dword [[VAL:v[0-9]+]]
12 ; SI: s_load_dword [[VAL:s[0-9]+]]
1313 ; SI-DAG: v_alignbit_b32 [[TMP0:v[0-9]+]], [[VAL]], [[VAL]], 8
1414 ; SI-DAG: v_alignbit_b32 [[TMP1:v[0-9]+]], [[VAL]], [[VAL]], 24
1515 ; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xff00ff
0 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-CIVI %s
11 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-CIVI %s
22 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s
3 ; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
4 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
5 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
3 ; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
4 ; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
5 ; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
66
77 ; OPT-LABEL: @test_no_sink_flat_small_offset_i32(
88 ; OPT-CIVI: getelementptr i32, i32 addrspace(4)* %in
3939
4040 ; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32(
4141 ; OPT: getelementptr i32, i32 addrspace(4)* %out,
42 ; OPT-CI-NOT: getelementptr
42 ; rOPT-CI-NOT: getelementptr
4343 ; OPT: br i1
4444
4545 ; OPT-CI: addrspacecast
0 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s
11 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s
22 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s
3 ; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
4 ; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
5 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
3 ; RUN: llc -march=amdgcn -mcpu=tahiti -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
4 ; RUN: llc -march=amdgcn -mcpu=bonaire -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
5 ; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
66
77 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
88
1111 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
1212 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
1313 ; It's probably OK if this is slightly higher:
14 ; CHECK: ; NumVgprs: 8
14 ; CHECK: ; NumVgprs: 4
1515 define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) {
1616 entry:
1717 %cmpflag = icmp eq i32 %flag, 1
44 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
55
66 ; FUNC-LABEL: {{^}}test_copy_v4i8:
7 ; GCN: buffer_load_dword [[REG:v[0-9]+]]
7 ; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
88 ; GCN: buffer_store_dword [[REG]]
99 ; GCN: s_endpgm
1010 define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
11 %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
11 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
12 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
13 %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
1214 store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
1315 ret void
1416 }
1517
1618 ; FUNC-LABEL: {{^}}test_copy_v4i8_x2:
17 ; GCN: buffer_load_dword [[REG:v[0-9]+]]
19 ; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
1820 ; GCN: buffer_store_dword [[REG]]
1921 ; GCN: buffer_store_dword [[REG]]
2022 ; GCN: s_endpgm
2123 define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
22 %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
24 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
25 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
26 %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
2327 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
2428 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
2529 ret void
2630 }
2731
2832 ; FUNC-LABEL: {{^}}test_copy_v4i8_x3:
29 ; GCN: buffer_load_dword [[REG:v[0-9]+]]
33 ; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
3034 ; GCN: buffer_store_dword [[REG]]
3135 ; GCN: buffer_store_dword [[REG]]
3236 ; GCN: buffer_store_dword [[REG]]
3337 ; GCN: s_endpgm
3438 define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
35 %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
39 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
40 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
41 %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
3642 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
3743 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
3844 store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
4046 }
4147
4248 ; FUNC-LABEL: {{^}}test_copy_v4i8_x4:
43 ; GCN: buffer_load_dword [[REG:v[0-9]+]]
49 ; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
4450 ; GCN: buffer_store_dword [[REG]]
4551 ; GCN: buffer_store_dword [[REG]]
4652 ; GCN: buffer_store_dword [[REG]]
4753 ; GCN: buffer_store_dword [[REG]]
4854 ; GCN: s_endpgm
4955 define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
50 %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
56 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
57 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
58 %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
5159 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
5260 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
5361 store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
5664 }
5765
5866 ; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use:
59 ; GCN: buffer_load_dword
67 ; GCN: {{buffer|flat}}_load_dword
6068 ; GCN-DAG: v_lshrrev_b32
6169 ; GCN: v_and_b32
6270 ; GCN: v_or_b32
6573
6674 ; GCN: s_endpgm
6775 define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
68 %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
76 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
77 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
78 %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
6979 %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
7080 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
7181 store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
96106 }
97107
98108 ; FUNC-LABEL: {{^}}test_copy_v3i8_align4:
99 ; GCN: buffer_load_dword
109 ; GCN: {{buffer|flat}}_load_dword
100110 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
101111 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
102112 ; GCN: s_endpgm
103113 define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
104 %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
114 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
115 %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid.x
116 %val = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
105117 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
106118 ret void
107119 }
108120
109121 ; FUNC-LABEL: {{^}}test_copy_v3i8_align2:
110 ; GCN-DAG: buffer_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
111 ; GCN-DAG: buffer_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
122 ; GCN-DAG: {{buffer|flat}}_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
123 ; GCN-DAG: {{buffer|flat}}_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
112124 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
113125 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
114126 ; GCN: s_endpgm
119131 }
120132
121133 ; FUNC-LABEL: {{^}}test_copy_v3i8_align1:
122 ; GCN: buffer_load_ubyte
123 ; GCN: buffer_load_ubyte
124 ; GCN: buffer_load_ubyte
134 ; GCN: {{buffer|flat}}_load_ubyte
135 ; GCN: {{buffer|flat}}_load_ubyte
136 ; GCN: {{buffer|flat}}_load_ubyte
125137
126138 ; GCN: buffer_store_byte
127139 ; GCN: buffer_store_byte
134146 }
135147
136148 ; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load:
137 ; GCN: buffer_load_ubyte
138 ; GCN: buffer_load_ubyte
139 ; GCN: buffer_load_ubyte
140 ; GCN: buffer_load_ubyte
149 ; GCN: {{buffer|flat}}_load_ubyte
150 ; GCN: {{buffer|flat}}_load_ubyte
151 ; GCN: {{buffer|flat}}_load_ubyte
152 ; GCN: {{buffer|flat}}_load_ubyte
141153 ; GCN: buffer_store_dword
142154 ; GCN: s_endpgm
143155 define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
147159 }
148160
149161 ; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_store:
150 ; GCN: buffer_load_ubyte
151 ; GCN: buffer_load_ubyte
152 ; GCN: buffer_load_ubyte
153 ; GCN: buffer_load_ubyte
162 ; GCN: {{buffer|flat}}_load_ubyte
163 ; GCN: {{buffer|flat}}_load_ubyte
164 ; GCN: {{buffer|flat}}_load_ubyte
165 ; GCN: {{buffer|flat}}_load_ubyte
154166 ; GCN: buffer_store_byte
155167 ; GCN: buffer_store_byte
156168 ; GCN: buffer_store_byte
3333 }
3434
3535 ; FUNC-LABEL: {{^}}v_ctlz_i32:
36 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
36 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
3737 ; GCN-DAG: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]]
3838 ; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[CTLZ]]
3939 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], 32, [[CTLZ]], vcc
4343 ; EG: FFBH_UINT
4444 ; EG: CNDE_INT
4545 define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
46 %val = load i32, i32 addrspace(1)* %valptr, align 4
46 %tid = call i32 @llvm.r600.read.tidig.x()
47 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
48 %val = load i32, i32 addrspace(1)* %in.gep, align 4
4749 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
4850 store i32 %ctlz, i32 addrspace(1)* %out, align 4
4951 ret void
5052 }
5153
5254 ; FUNC-LABEL: {{^}}v_ctlz_v2i32:
53 ; GCN: buffer_load_dwordx2
55 ; GCN: {{buffer|flat}}_load_dwordx2
5456 ; GCN: v_ffbh_u32_e32
5557 ; GCN: v_ffbh_u32_e32
5658 ; GCN: buffer_store_dwordx2
6163 ; EG: FFBH_UINT
6264 ; EG: CNDE_INT
6365 define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
64 %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
66 %tid = call i32 @llvm.r600.read.tidig.x()
67 %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
68 %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
6569 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
6670 store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
6771 ret void
6872 }
6973
7074 ; FUNC-LABEL: {{^}}v_ctlz_v4i32:
71 ; GCN: buffer_load_dwordx4
75 ; GCN: {{buffer|flat}}_load_dwordx4
7276 ; GCN: v_ffbh_u32_e32
7377 ; GCN: v_ffbh_u32_e32
7478 ; GCN: v_ffbh_u32_e32
8993 ; EG-DAG: FFBH_UINT
9094 ; EG-DAG: CNDE_INT
9195 define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
92 %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
96 %tid = call i32 @llvm.r600.read.tidig.x()
97 %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
98 %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
9399 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
94100 store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
95101 ret void
96102 }
97103
98104 ; FUNC-LABEL: {{^}}v_ctlz_i8:
99 ; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
105 ; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
100106 ; SI-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
101107 ; VI-DAG: v_ffbh_u32_sdwa [[RESULT:v[0-9]+]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
102108 ; GCN: buffer_store_byte [[RESULT]],
167173 }
168174
169175 ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_neg1:
170 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
176 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
171177 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
172178 ; GCN: buffer_store_dword [[RESULT]],
173179 ; GCN: s_endpgm
174 define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
175 %val = load i32, i32 addrspace(1)* %valptr
180 define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
181 %tid = call i32 @llvm.r600.read.tidig.x()
182 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
183 %val = load i32, i32 addrspace(1)* %in.gep
176184 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
177185 %cmp = icmp eq i32 %val, 0
178186 %sel = select i1 %cmp, i32 -1, i32 %ctlz
181189 }
182190
183191 ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_neg1:
184 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
192 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
185193 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
186194 ; GCN: buffer_store_dword [[RESULT]],
187195 ; GCN: s_endpgm
188196 define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
189 %val = load i32, i32 addrspace(1)* %valptr
197 %tid = call i32 @llvm.r600.read.tidig.x()
198 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
199 %val = load i32, i32 addrspace(1)* %in.gep
190200 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
191201 %cmp = icmp ne i32 %val, 0
192202 %sel = select i1 %cmp, i32 %ctlz, i32 -1
196206
197207 ; TODO: Should be able to eliminate select here as well.
198208 ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_bitwidth:
199 ; GCN: buffer_load_dword
209 ; GCN: {{buffer|flat}}_load_dword
200210 ; GCN: v_ffbh_u32_e32
201211 ; GCN: v_cmp
202212 ; GCN: v_cndmask
203213 ; GCN: s_endpgm
204214 define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
205 %val = load i32, i32 addrspace(1)* %valptr
215 %tid = call i32 @llvm.r600.read.tidig.x()
216 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
217 %val = load i32, i32 addrspace(1)* %in.gep
206218 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
207219 %cmp = icmp eq i32 %ctlz, 32
208220 %sel = select i1 %cmp, i32 -1, i32 %ctlz
211223 }
212224
213225 ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_bitwidth:
214 ; GCN: buffer_load_dword
226 ; GCN: {{buffer|flat}}_load_dword
215227 ; GCN: v_ffbh_u32_e32
216228 ; GCN: v_cmp
217229 ; GCN: v_cndmask
218230 ; GCN: s_endpgm
219231 define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
220 %val = load i32, i32 addrspace(1)* %valptr
232 %tid = call i32 @llvm.r600.read.tidig.x()
233 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
234 %val = load i32, i32 addrspace(1)* %in.gep
221235 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
222236 %cmp = icmp ne i32 %ctlz, 32
223237 %sel = select i1 %cmp, i32 %ctlz, i32 -1
241255 }
242256
243257 ; FUNC-LABEL: {{^}}v_ctlz_i16_sel_eq_neg1:
244 ; SI: buffer_load_ushort [[VAL:v[0-9]+]],
258 ; SI: {{buffer|flat}}_load_ushort [[VAL:v[0-9]+]],
245259 ; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
246260 ; SI: buffer_store_short [[FFBH]],
247261 define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
2828 }
2929
3030 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32:
31 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
31 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
3232 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
3333 ; GCN: buffer_store_dword [[RESULT]],
3434 ; GCN: s_endpgm
3535 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
3636 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
3737 define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
38 %val = load i32, i32 addrspace(1)* %valptr, align 4
38 %tid = call i32 @llvm.r600.read.tidig.x()
39 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
40 %val = load i32, i32 addrspace(1)* %in.gep, align 4
3941 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
4042 store i32 %ctlz, i32 addrspace(1)* %out, align 4
4143 ret void
4244 }
4345
4446 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v2i32:
45 ; GCN: buffer_load_dwordx2
47 ; GCN: {{buffer|flat}}_load_dwordx2
4648 ; GCN: v_ffbh_u32_e32
4749 ; GCN: v_ffbh_u32_e32
4850 ; GCN: buffer_store_dwordx2
5153 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
5254 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
5355 define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
54 %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
56 %tid = call i32 @llvm.r600.read.tidig.x()
57 %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
58 %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
5559 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
5660 store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
5761 ret void
5862 }
5963
6064 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v4i32:
61 ; GCN: buffer_load_dwordx4
65 ; GCN: {{buffer|flat}}_load_dwordx4
6266 ; GCN: v_ffbh_u32_e32
6367 ; GCN: v_ffbh_u32_e32
6468 ; GCN: v_ffbh_u32_e32
7175 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
7276 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
7377 define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
74 %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
78 %tid = call i32 @llvm.r600.read.tidig.x()
79 %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
80 %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
7581 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
7682 store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
7783 ret void
7884 }
7985
8086 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i8:
81 ; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
87 ; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
8288 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
8389 ; GCN: buffer_store_byte [[RESULT]],
8490 define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
85 %val = load i8, i8 addrspace(1)* %valptr
91 %tid = call i32 @llvm.r600.read.tidig.x()
92 %in.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
93 %val = load i8, i8 addrspace(1)* %in.gep
8694 %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
8795 store i8 %ctlz, i8 addrspace(1)* %out
8896 ret void
144152 }
145153
146154 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1:
147 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
155 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
148156 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
149157 ; GCN: buffer_store_dword [[RESULT]],
150 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
151 %val = load i32, i32 addrspace(1)* %valptr
158 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
159 %tid = call i32 @llvm.r600.read.tidig.x()
160 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
161 %val = load i32, i32 addrspace(1)* %in.gep
152162 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
153163 %cmp = icmp eq i32 %val, 0
154164 %sel = select i1 %cmp, i32 -1, i32 %ctlz
157167 }
158168
159169 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_neg1:
160 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
170 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
161171 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
162172 ; GCN: buffer_store_dword [[RESULT]],
163173 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
164 %val = load i32, i32 addrspace(1)* %valptr
174 %tid = call i32 @llvm.r600.read.tidig.x()
175 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
176 %val = load i32, i32 addrspace(1)* %in.gep
165177 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
166178 %cmp = icmp ne i32 %val, 0
167179 %sel = select i1 %cmp, i32 %ctlz, i32 -1
185197 }
186198
187199 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
188 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
200 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
189201 ; GCN-DAG: v_ffbh_u32_e32 [[RESULT0:v[0-9]+]], [[VAL]]
190202 ; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, [[VAL]]
191203 ; GCN-DAG: v_cndmask_b32_e64 [[RESULT1:v[0-9]+]], 0, 1, vcc
192204 ; GCN-DAG: buffer_store_dword [[RESULT0]]
193205 ; GCN-DAG: buffer_store_byte [[RESULT1]]
194206 ; GCN: s_endpgm
195 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
196 %val = load i32, i32 addrspace(1)* %valptr
207 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
208 %tid = call i32 @llvm.r600.read.tidig.x()
209 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
210 %val = load i32, i32 addrspace(1)* %in.gep
197211 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
198212 %cmp = icmp eq i32 %val, 0
199213 %sel = select i1 %cmp, i32 -1, i32 %ctlz
204218
205219 ; Selected on wrong constant
206220 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_0:
207 ; GCN: buffer_load_dword
208 ; GCN: v_ffbh_u32_e32
209 ; GCN: v_cmp
210 ; GCN: v_cndmask
211 ; GCN: buffer_store_dword
212 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
213 %val = load i32, i32 addrspace(1)* %valptr
221 ; GCN: {{buffer|flat}}_load_dword
222 ; GCN: v_ffbh_u32_e32
223 ; GCN: v_cmp
224 ; GCN: v_cndmask
225 ; GCN: buffer_store_dword
226 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
227 %tid = call i32 @llvm.r600.read.tidig.x()
228 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
229 %val = load i32, i32 addrspace(1)* %in.gep
214230 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
215231 %cmp = icmp eq i32 %val, 0
216232 %sel = select i1 %cmp, i32 0, i32 %ctlz
220236
221237 ; Selected on wrong constant
222238 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_0:
223 ; GCN: buffer_load_dword
239 ; GCN: {{buffer|flat}}_load_dword
224240 ; GCN: v_ffbh_u32_e32
225241 ; GCN: v_cmp
226242 ; GCN: v_cndmask
227243 ; GCN: buffer_store_dword
228244 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
229 %val = load i32, i32 addrspace(1)* %valptr
245 %tid = call i32 @llvm.r600.read.tidig.x()
246 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
247 %val = load i32, i32 addrspace(1)* %in.gep
230248 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
231249 %cmp = icmp ne i32 %val, 0
232250 %sel = select i1 %cmp, i32 %ctlz, i32 0
236254
237255 ; Compare on wrong constant
238256 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
239 ; GCN: buffer_load_dword
240 ; GCN: v_ffbh_u32_e32
241 ; GCN: v_cmp
242 ; GCN: v_cndmask
243 ; GCN: buffer_store_dword
244 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
245 %val = load i32, i32 addrspace(1)* %valptr
257 ; GCN: {{buffer|flat}}_load_dword
258 ; GCN: v_ffbh_u32_e32
259 ; GCN: v_cmp
260 ; GCN: v_cndmask
261 ; GCN: buffer_store_dword
262 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
263 %tid = call i32 @llvm.r600.read.tidig.x()
264 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
265 %val = load i32, i32 addrspace(1)* %in.gep
246266 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
247267 %cmp = icmp eq i32 %val, 1
248268 %sel = select i1 %cmp, i32 0, i32 %ctlz
252272
253273 ; Selected on wrong constant
254274 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
255 ; GCN: buffer_load_dword
275 ; GCN: {{buffer|flat}}_load_dword
256276 ; GCN: v_ffbh_u32_e32
257277 ; GCN: v_cmp
258278 ; GCN: v_cndmask
259279 ; GCN: buffer_store_dword
260280 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
261 %val = load i32, i32 addrspace(1)* %valptr
281 %tid = call i32 @llvm.r600.read.tidig.x()
282 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
283 %val = load i32, i32 addrspace(1)* %in.gep
262284 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
263285 %cmp = icmp ne i32 %val, 1
264286 %sel = select i1 %cmp, i32 %ctlz, i32 0
77 declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone
88 declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone
99
10 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
11
1012 ; FUNC-LABEL: {{^}}s_ctpop_i32:
1113 ; GCN: s_load_dword [[SVAL:s[0-9]+]],
1214 ; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]]
2325
2426 ; XXX - Why 0 in register?
2527 ; FUNC-LABEL: {{^}}v_ctpop_i32:
26 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
28 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
2729 ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 0
2830 ; GCN: buffer_store_dword [[RESULT]],
2931 ; GCN: s_endpgm
3032
3133 ; EG: BCNT_INT
3234 define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
33 %val = load i32, i32 addrspace(1)* %in, align 4
35 %tid = call i32 @llvm.r600.read.tidig.x()
36 %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
37 %val = load i32, i32 addrspace(1)* %in.gep, align 4
3438 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
3539 store i32 %ctpop, i32 addrspace(1)* %out, align 4
3640 ret void
3741 }
3842
3943 ; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
40 ; GCN: buffer_load_dword [[VAL1:v[0-9]+]],
41 ; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
44 ; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]],
45 ; GCN: {{buffer|flat}}_load_dword [[VAL1:v[0-9]+]],
4246 ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
4347 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
4448 ; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
4852 ; EG: BCNT_INT
4953 ; EG: BCNT_INT
5054 define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
51 %val0 = load i32, i32 addrspace(1)* %in0, align 4
52 %val1 = load i32, i32 addrspace(1)* %in1, align 4
55 %tid = call i32 @llvm.r600.read.tidig.x()
56 %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %tid
57 %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %tid
58 %val0 = load i32, i32 addrspace(1)* %in0.gep, align 4
59 %val1 = load i32, i32 addrspace(1)* %in1.gep, align 4
5360 %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
5461 %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone
5562 %add = add i32 %ctpop0, %ctpop1
5865 }
5966
6067 ; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32:
61 ; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
68 ; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]],
6269 ; GCN: s_waitcnt
6370 ; GCN-NEXT: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
6471 ; GCN: buffer_store_dword [[RESULT]],
6572 ; GCN: s_endpgm
66 define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
67 %val0 = load i32, i32 addrspace(1)* %in0, align 4
68 %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
69 %add = add i32 %ctpop0, %sval
73 define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %sval) nounwind {
74 %tid = call i32 @llvm.r600.read.tidig.x()
75 %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
76 %val = load i32, i32 addrspace(1)* %in.gep, align 4
77 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
78 %add = add i32 %ctpop, %sval
7079 store i32 %add, i32 addrspace(1)* %out, align 4
7180 ret void
7281 }
7988 ; EG: BCNT_INT
8089 ; EG: BCNT_INT
8190 define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
82 %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
91 %tid = call i32 @llvm.r600.read.tidig.x()
92 %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid
93 %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
8394 %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone
8495 store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8
8596 ret void
97108 ; EG: BCNT_INT
98109 ; EG: BCNT_INT
99110 define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
100 %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
111 %tid = call i32 @llvm.r600.read.tidig.x()
112 %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid
113 %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
101114 %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone
102115 store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16
103116 ret void
123136 ; EG: BCNT_INT
124137 ; EG: BCNT_INT
125138 define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
126 %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32
139 %tid = call i32 @llvm.r600.read.tidig.x()
140 %in.gep = getelementptr <8 x i32>, <8 x i32> addrspace(1)* %in, i32 %tid
141 %val = load <8 x i32>, <8 x i32> addrspace(1)* %in.gep, align 32
127142 %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone
128143 store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32
129144 ret void
165180 ; EG: BCNT_INT
166181 ; EG: BCNT_INT
167182 define amdgpu_kernel void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
168 %val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32
183 %tid = call i32 @llvm.r600.read.tidig.x()
184 %in.gep = getelementptr <16 x i32>, <16 x i32> addrspace(1)* %in, i32 %tid
185 %val = load <16 x i32>, <16 x i32> addrspace(1)* %in.gep, align 32
169186 %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone
170187 store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32
171188 ret void
172189 }
173190
174191 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant:
175 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
192 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
176193 ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4
177194 ; GCN: buffer_store_dword [[RESULT]],
178195 ; GCN: s_endpgm
179196
180197 ; EG: BCNT_INT
181198 define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
182 %val = load i32, i32 addrspace(1)* %in, align 4
199 %tid = call i32 @llvm.r600.read.tidig.x()
200 %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
201 %val = load i32, i32 addrspace(1)* %in.gep, align 4
183202 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
184203 %add = add i32 %ctpop, 4
185204 store i32 %add, i32 addrspace(1)* %out, align 4
187206 }
188207
189208 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv:
190 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
209 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
191210 ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4
192211 ; GCN: buffer_store_dword [[RESULT]],
193212 ; GCN: s_endpgm
194213
195214 ; EG: BCNT_INT
196215 define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
197 %val = load i32, i32 addrspace(1)* %in, align 4
216 %tid = call i32 @llvm.r600.read.tidig.x()
217 %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
218 %val = load i32, i32 addrspace(1)* %in.gep, align 4
198219 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
199220 %add = add i32 4, %ctpop
200221 store i32 %add, i32 addrspace(1)* %out, align 4
202223 }
203224
204225 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal:
205 ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
226 ; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
206227 ; GCN-DAG: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
207228 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
208229 ; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
209230 ; GCN: buffer_store_dword [[RESULT]],
210231 ; GCN: s_endpgm
211232 define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
212 %val = load i32, i32 addrspace(1)* %in, align 4
233 %tid = call i32 @llvm.r600.read.tidig.x()
234 %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
235 %val = load i32, i32 addrspace(1)* %in.gep, align 4
213236 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
214237 %add = add i32 %ctpop, 99999
215238 store i32 %add, i32 addrspace(1)* %out, align 4
217240 }
218241
219242 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_var:
220 ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
243 ; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
221244 ; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
222245 ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
223246 ; GCN: buffer_store_dword [[RESULT]],
225248
226249 ; EG: BCNT_INT
227250 define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
228 %val = load i32, i32 addrspace(1)* %in, align 4
251 %tid = call i32 @llvm.r600.read.tidig.x()
252 %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
253 %val = load i32, i32 addrspace(1)* %in.gep, align 4
229254 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
230255 %add = add i32 %ctpop, %const
231256 store i32 %add, i32 addrspace(1)* %out, align 4
233258 }
234259
235260 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv:
236 ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
261 ; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
237262 ; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
238263 ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
239264 ; GCN: buffer_store_dword [[RESULT]],
241266
242267 ; EG: BCNT_INT
243268 define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
244 %val = load i32, i32 addrspace(1)* %in, align 4
269 %tid = call i32 @llvm.r600.read.tidig.x()
270 %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
271 %val = load i32, i32 addrspace(1)* %in.gep, align 4
245272 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
246273 %add = add i32 %const, %ctpop
247274 store i32 %add, i32 addrspace(1)* %out, align 4
249276 }
250277
251278 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_vvar_inv:
252 ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], {{0$}}
253 ; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
254 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
279 ; SI: buffer_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
280 ; SI: buffer_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
281 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAR]], [[VAL]]
282 ; VI: flat_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
283 ; VI: flat_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
255284 ; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
256285 ; GCN: buffer_store_dword [[RESULT]],
257286 ; GCN: s_endpgm
258287
259288 ; EG: BCNT_INT
260289 define amdgpu_kernel void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
261 %val = load i32, i32 addrspace(1)* %in, align 4
262 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
263 %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4
290 %tid = call i32 @llvm.r600.read.tidig.x()
291 %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
292 %val = load i32, i32 addrspace(1)* %in.gep, align 4
293 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
294 %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 %tid
264295 %const = load i32, i32 addrspace(1)* %gep, align 4
265296 %add = add i32 %const, %ctpop
266297 store i32 %add, i32 addrspace(1)* %out, align 4
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
11 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
2
3 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
24
35 declare i64 @llvm.ctpop.i64(i64) nounwind readnone
46 declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
2426 }
2527
2628 ; FUNC-LABEL: {{^}}v_ctpop_i64:
27 ; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
29 ; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
2830 ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
2931 ; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
3032 ; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
3133 ; GCN: buffer_store_dword [[RESULT]],
3234 ; GCN: s_endpgm
3335 define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
34 %val = load i64, i64 addrspace(1)* %in, align 8
36 %tid = call i32 @llvm.r600.read.tidig.x()
37 %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
38 %val = load i64, i64 addrspace(1)* %in.gep, align 8
3539 %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
3640 %truncctpop = trunc i64 %ctpop to i32
3741 store i32 %truncctpop, i32 addrspace(1)* %out, align 4
3943 }
4044
4145 ; FUNC-LABEL: {{^}}v_ctpop_i64_user:
42 ; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
46 ; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
4347 ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
4448 ; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
4549 ; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
4852 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
4953 ; GCN: s_endpgm
5054 define amdgpu_kernel void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind {
51 %val = load i64, i64 addrspace(1)* %in, align 8
55 %tid = call i32 @llvm.r600.read.tidig.x()
56 %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
57 %val = load i64, i64 addrspace(1)* %in.gep, align 8
5258 %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
5359 %or = or i64 %ctpop, %s.val
5460 store i64 %or, i64 addrspace(1)* %out
8692 ; GCN: v_bcnt_u32_b32
8793 ; GCN: s_endpgm
8894 define amdgpu_kernel void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
89 %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
95 %tid = call i32 @llvm.r600.read.tidig.x()
96 %in.gep = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i32 %tid
97 %val = load <2 x i64>, <2 x i64> addrspace(1)* %in.gep, align 16
9098 %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
9199 %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
92100 store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
104112 ; GCN: v_bcnt_u32_b32
105113 ; GCN: s_endpgm
106114 define amdgpu_kernel void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
107 %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
115 %tid = call i32 @llvm.r600.read.tidig.x()
116 %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
117 %val = load <4 x i64>, <4 x i64> addrspace(1)* %in.gep, align 32
108118 %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
109119 %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
110120 store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
168178 ; FIXME: Should not have extra add
169179
170180 ; FUNC-LABEL: {{^}}v_ctpop_i128:
171 ; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
181 ; SI: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
182 ; VI: flat_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}
172183
173184 ; GCN-DAG: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT0:v[0-9]+]], v{{[0-9]+}}, 0
174185 ; GCN-DAG: v_bcnt_u32_b32{{(_e32)*(_e64)*}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]]
181192 ; GCN: buffer_store_dword [[RESULT]],
182193 ; GCN: s_endpgm
183194 define amdgpu_kernel void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind {
184 %val = load i128, i128 addrspace(1)* %in, align 8
195 %tid = call i32 @llvm.r600.read.tidig.x()
196 %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %tid
197 %val = load i128, i128 addrspace(1)* %in.gep, align 8
185198 %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
186199 %truncctpop = trunc i128 %ctpop to i32
187200 store i32 %truncctpop, i32 addrspace(1)* %out, align 4
44 declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
55 declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
66 declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
7 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
78
89 ; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32:
910 ; SI: s_load_dword [[VAL:s[0-9]+]],
2021 }
2122
2223 ; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32:
23 ; SI: buffer_load_dword [[VAL:v[0-9]+]],
24 ; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
2425 ; SI: v_ffbl_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
2526 ; SI: buffer_store_dword [[RESULT]],
2627 ; SI: s_endpgm
2728 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
2829 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
2930 define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
30 %val = load i32, i32 addrspace(1)* %valptr, align 4
31 %tid = call i32 @llvm.r600.read.tidig.x()
32 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
33 %val = load i32, i32 addrspace(1)* %in.gep, align 4
3134 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
3235 store i32 %cttz, i32 addrspace(1)* %out, align 4
3336 ret void
3437 }
3538
3639 ; FUNC-LABEL: {{^}}v_cttz_zero_undef_v2i32:
37 ; SI: buffer_load_dwordx2
40 ; SI: {{buffer|flat}}_load_dwordx2
3841 ; SI: v_ffbl_b32_e32
3942 ; SI: v_ffbl_b32_e32
4043 ; SI: buffer_store_dwordx2
4346 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
4447 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
4548 define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
46 %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
49 %tid = call i32 @llvm.r600.read.tidig.x()
50 %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
51 %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
4752 %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
4853 store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
4954 ret void
5055 }
5156
5257 ; FUNC-LABEL: {{^}}v_cttz_zero_undef_v4i32:
53 ; SI: buffer_load_dwordx4
58 ; SI: {{buffer|flat}}_load_dwordx4
5459 ; SI: v_ffbl_b32_e32
5560 ; SI: v_ffbl_b32_e32
5661 ; SI: v_ffbl_b32_e32
6368 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
6469 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
6570 define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
66 %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
71 %tid = call i32 @llvm.r600.read.tidig.x()
72 %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
73 %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
6774 %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
6875 store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16
6976 ret void
44 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
55
66 ; GCN-LABEL: {{^}}load_i8_to_f32:
7 ; GCN: buffer_load_ubyte [[LOADREG:v[0-9]+]],
7 ; GCN: {{buffer|flat}}_load_ubyte [[LOADREG:v[0-9]+]],
88 ; GCN-NOT: bfe
99 ; GCN-NOT: lshr
1010 ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
1111 ; GCN: buffer_store_dword [[CONV]],
1212 define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
13 %load = load i8, i8 addrspace(1)* %in, align 1
13 %tid = call i32 @llvm.amdgcn.workitem.id.x()
14 %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
15 %load = load i8, i8 addrspace(1)* %gep, align 1
1416 %cvt = uitofp i8 %load to float
1517 store float %cvt, float addrspace(1)* %out, align 4
1618 ret void
1719 }
1820
1921 ; GCN-LABEL: {{^}}load_v2i8_to_v2f32:
20 ; GCN: buffer_load_ushort [[LD:v[0-9]+]]
22 ; GCN: {{buffer|flat}}_load_ushort [[LD:v[0-9]+]]
2123 ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]]
2224 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]]
2325 ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
2426 define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
25 %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2
27 %tid = call i32 @llvm.amdgcn.workitem.id.x()
28 %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid
29 %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2
2630 %cvt = uitofp <2 x i8> %load to <2 x float>
2731 store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
2832 ret void
2933 }
3034
3135 ; GCN-LABEL: {{^}}load_v3i8_to_v3f32:
32 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
36 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
3337 ; GCN-NOT: v_cvt_f32_ubyte3_e32
3438 ; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]]
3539 ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
3640 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
3741 ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
3842 define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
39 %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
43 %tid = call i32 @llvm.amdgcn.workitem.id.x()
44 %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
45 %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
4046 %cvt = uitofp <3 x i8> %load to <3 x float>
4147 store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
4248 ret void
4349 }
4450
4551 ; GCN-LABEL: {{^}}load_v4i8_to_v4f32:
46 ; GCN: buffer_load_dword [[LOADREG:v[0-9]+]]
52 ; GCN: {{buffer|flat}}_load_dword [[LOADREG:v[0-9]+]]
4753 ; GCN-NOT: bfe
4854 ; GCN-NOT: lshr
4955 ; GCN-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
5258 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
5359 ; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
5460 define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
55 %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
61 %tid = call i32 @llvm.amdgcn.workitem.id.x()
62 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
63 %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
5664 %cvt = uitofp <4 x i8> %load to <4 x float>
5765 store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
5866 ret void
6371
6472 ; FIXME: Packing bytes
6573 ; GCN-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
66 ; GCN: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
67 ; GCN: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
68 ; GCN: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
69 ; GCN: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
74 ; GCN: {{buffer|flat}}_load_ubyte [[LOADREG3:v[0-9]+]]
75 ; GCN: {{buffer|flat}}_load_ubyte [[LOADREG2:v[0-9]+]]
76 ; GCN: {{buffer|flat}}_load_ubyte [[LOADREG1:v[0-9]+]]
77 ; GCN: {{buffer|flat}}_load_ubyte [[LOADREG0:v[0-9]+]]
7078 ; GCN-DAG: v_lshlrev_b32
7179 ; GCN-DAG: v_or_b32
7280 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
7684
7785 ; GCN: buffer_store_dwordx4
7886 define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
79 %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
87 %tid = call i32 @llvm.amdgcn.workitem.id.x()
88 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
89 %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
8090 %cvt = uitofp <4 x i8> %load to <4 x float>
8191 store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
8292 ret void
123133 ; GCN-LABEL: {{^}}load_v7i8_to_v7f32:
124134 ; GCN: s_endpgm
125135 define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
126 %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1
136 %tid = call i32 @llvm.amdgcn.workitem.id.x()
137 %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid
138 %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1
127139 %cvt = uitofp <7 x i8> %load to <7 x float>
128140 store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
129141 ret void
130142 }
131143
132144 ; GCN-LABEL: {{^}}load_v8i8_to_v8f32:
133 ; GCN: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
145 ; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
134146 ; GCN-NOT: bfe
135147 ; GCN-NOT: lshr
136148 ; GCN-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]]
146158 ; GCN: buffer_store_dwordx4
147159 ; GCN: buffer_store_dwordx4
148160 define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
149 %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8
161 %tid = call i32 @llvm.amdgcn.workitem.id.x()
162 %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid
163 %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8
150164 %cvt = uitofp <8 x i8> %load to <8 x float>
151165 store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
152166 ret void
153167 }
154168
155169 ; GCN-LABEL: {{^}}i8_zext_inreg_i32_to_f32:
156 ; GCN: buffer_load_dword [[LOADREG:v[0-9]+]],
170 ; GCN: {{buffer|flat}}_load_dword [[LOADREG:v[0-9]+]],
157171 ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]]
158172 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
159173 ; GCN: buffer_store_dword [[CONV]],
160174 define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
161 %load = load i32, i32 addrspace(1)* %in, align 4
175 %tid = call i32 @llvm.amdgcn.workitem.id.x()
176 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
177 %load = load i32, i32 addrspace(1)* %gep, align 4
162178 %add = add i32 %load, 2
163179 %inreg = and i32 %add, 255
164180 %cvt = uitofp i32 %inreg to float
168184
169185 ; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
170186 define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
171 %load = load i32, i32 addrspace(1)* %in, align 4
187 %tid = call i32 @llvm.amdgcn.workitem.id.x()
188 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
189 %load = load i32, i32 addrspace(1)* %gep, align 4
172190 %inreg = and i32 %load, 65280
173191 %shr = lshr i32 %inreg, 8
174192 %cvt = uitofp i32 %shr to float
180198 ; them so it shouldn't really matter.
181199 ; GCN-LABEL: {{^}}i8_zext_i32_to_f32:
182200 define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
183 %load = load i8, i8 addrspace(1)* %in, align 1
201 %tid = call i32 @llvm.amdgcn.workitem.id.x()
202 %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
203 %load = load i8, i8 addrspace(1)* %gep, align 1
184204 %ext = zext i8 %load to i32
185205 %cvt = uitofp i32 %ext to float
186206 store float %cvt, float addrspace(1)* %out, align 4
189209
190210 ; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32:
191211 define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
192 %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
212 %tid = call i32 @llvm.amdgcn.workitem.id.x()
213 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
214 %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
193215 %ext = zext <4 x i8> %load to <4 x i32>
194216 %cvt = uitofp <4 x i32> %ext to <4 x float>
195217 store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
197219 }
198220
199221 ; GCN-LABEL: {{^}}extract_byte0_to_f32:
200 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
222 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
201223 ; GCN-NOT: [[VAL]]
202224 ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]]
203225 ; GCN: buffer_store_dword [[CONV]]
204226 define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
205 %val = load i32, i32 addrspace(1)* %in
227 %tid = call i32 @llvm.amdgcn.workitem.id.x()
228 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
229 %val = load i32, i32 addrspace(1)* %gep
206230 %and = and i32 %val, 255
207231 %cvt = uitofp i32 %and to float
208232 store float %cvt, float addrspace(1)* %out
210234 }
211235
212236 ; GCN-LABEL: {{^}}extract_byte1_to_f32:
213 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
237 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
214238 ; GCN-NOT: [[VAL]]
215239 ; GCN: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]]
216240 ; GCN: buffer_store_dword [[CONV]]
217241 define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
218 %val = load i32, i32 addrspace(1)* %in
242 %tid = call i32 @llvm.amdgcn.workitem.id.x()
243 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
244 %val = load i32, i32 addrspace(1)* %gep
219245 %srl = lshr i32 %val, 8
220246 %and = and i32 %srl, 255
221247 %cvt = uitofp i32 %and to float
224250 }
225251
226252 ; GCN-LABEL: {{^}}extract_byte2_to_f32:
227 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
253 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
228254 ; GCN-NOT: [[VAL]]
229255 ; GCN: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]]
230256 ; GCN: buffer_store_dword [[CONV]]
231257 define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
232 %val = load i32, i32 addrspace(1)* %in
258 %tid = call i32 @llvm.amdgcn.workitem.id.x()
259 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
260 %val = load i32, i32 addrspace(1)* %gep
233261 %srl = lshr i32 %val, 16
234262 %and = and i32 %srl, 255
235263 %cvt = uitofp i32 %and to float
238266 }
239267
240268 ; GCN-LABEL: {{^}}extract_byte3_to_f32:
241 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
269 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
242270 ; GCN-NOT: [[VAL]]
243271 ; GCN: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]]
244272 ; GCN: buffer_store_dword [[CONV]]
245273 define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
246 %val = load i32, i32 addrspace(1)* %in
274 %tid = call i32 @llvm.amdgcn.workitem.id.x()
275 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
276 %val = load i32, i32 addrspace(1)* %gep
247277 %srl = lshr i32 %val, 24
248278 %and = and i32 %srl, 255
249279 %cvt = uitofp i32 %and to float
None ; RUN: llc -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
11
22 ; FIXME: Most of these cases don't trigger because of broken cost
33 ; heuristics. Should not need -stress-early-ifcvt
None ; RUN: llc -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
11 ; XUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
22
33 ; FIXME: This leaves behind a now-unnecessary 'and' with exec
0 ; RUN: llc -march=amdgcn -enable-no-signed-zeros-fp-math=0 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SAFE %s
11 ; RUN: llc -march=amdgcn -enable-no-signed-zeros-fp-math=1 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-UNSAFE %s
22 ; RUN: llc -march=amdgcn -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-UNSAFE %s
3
4 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
35
46 ; Test that the -enable-no-signed-zeros-fp-math flag works
57
911
1012 ; GCN-UNSAFE-NOT: xor
1113 define amdgpu_kernel void @fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
12 %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
13 %a = load float, float addrspace(1)* %in, align 4
14 %tid = call i32 @llvm.amdgcn.workitem.id.x()
15 %add = add i32 %tid, 1
16 %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
17 %b_ptr = getelementptr float, float addrspace(1)* %in, i32 %add
18 %a = load float, float addrspace(1)* %gep, align 4
1419 %b = load float, float addrspace(1)* %b_ptr, align 4
1520 %result = fsub float %a, %b
1621 %neg.result = fsub float -0.0, %result
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
1
2 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
13
24 ; Make sure the add and load are reduced to 32-bits even with the
35 ; bitcast to vector.
79 ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, [[B]], [[A]]
810 ; GCN: buffer_store_dword [[ADD]]
911 define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
10 %a = load i64, i64 addrspace(1)* %in
12 %tid = call i32 @llvm.amdgcn.workitem.id.x()
13 %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
14 %a = load i64, i64 addrspace(1)* %gep
1115 %add = add i64 %a, %b
1216 %val.bc = bitcast i64 %add to <2 x i32>
1317 %extract = extractelement <2 x i32> %val.bc, i32 0
2024 ; GCN: v_add_f64
2125 ; GCN: buffer_store_dword v
2226 define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) {
23 %a = load double, double addrspace(1)* %in
27 %tid = call i32 @llvm.amdgcn.workitem.id.x()
28 %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
29 %a = load double, double addrspace(1)* %gep
2430 %add = fadd double %a, %b
2531 %val.bc = bitcast double %add to <2 x i32>
2632 %extract = extractelement <2 x i32> %val.bc, i32 0
3339 ; GCN: v_add_i32
3440 ; GCN: buffer_store_dword
3541 define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
36 %a = load i64, i64 addrspace(1)* %in
42 %tid = call i32 @llvm.amdgcn.workitem.id.x()
43 %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
44 %a = load i64, i64 addrspace(1)* %gep
3745 %add = add i64 %a, %b
3846 %val.bc = bitcast i64 %add to <2 x float>
3947 %extract = extractelement <2 x float> %val.bc, i32 0
133133 ; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]]
134134 ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], v{{[0-9]+$}}
135135 define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
136 %val = load <2 x half>, <2 x half> addrspace(1)* %in
136 %tid = call i32 @llvm.amdgcn.workitem.id.x()
137 %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
138 %val = load <2 x half>, <2 x half> addrspace(1)* %gep
137139 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
138140 %fmul = fmul <2 x half> %fabs, %val
139141 store <2 x half> %fmul, <2 x half> addrspace(1)* %out
11 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 ; GCN-LABEL: {{^}}fadd_f16
4 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
5 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
4 ; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
5 ; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]]
66 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
77 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
88 ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
2323 }
2424
2525 ; GCN-LABEL: {{^}}fadd_f16_imm_a
26 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
26 ; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]]
2727 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
2828 ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], 1.0, v[[B_F32]]
2929 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
4141 }
4242
4343 ; GCN-LABEL: {{^}}fadd_f16_imm_b
44 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
44 ; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
4545 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
4646 ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], 2.0, v[[A_F32]]
4747 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
5959 }
6060
6161 ; GCN-LABEL: {{^}}fadd_v2f16:
62 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
63 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
62 ; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
63 ; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
6464
6565 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
6666 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
6969
7070 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
7171 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
72 ; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
73 ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
72 ; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
73 ; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
7474 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
7575 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
7676 ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
8787 <2 x half> addrspace(1)* %a,
8888 <2 x half> addrspace(1)* %b) {
8989 entry:
90 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
91 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
90 %tid = call i32 @llvm.amdgcn.workitem.id.x()
91 %gep.a = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %a, i32 %tid
92 %gep.b = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %b, i32 %tid
93 %a.val = load <2 x half>, <2 x half> addrspace(1)* %gep.a
94 %b.val = load <2 x half>, <2 x half> addrspace(1)* %gep.b
9295 %r.val = fadd <2 x half> %a.val, %b.val
9396 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
9497 ret void
9598 }
9699
97100 ; GCN-LABEL: {{^}}fadd_v2f16_imm_a:
98 ; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
101 ; GCN-DAG: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
99102 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
100103 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
101104 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
117120 <2 x half> addrspace(1)* %r,
118121 <2 x half> addrspace(1)* %b) {
119122 entry:
120 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
123 %tid = call i32 @llvm.amdgcn.workitem.id.x()
124 %gep.b = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %b, i32 %tid
125 %b.val = load <2 x half>, <2 x half> addrspace(1)* %gep.b
121126 %r.val = fadd <2 x half> , %b.val
122127 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
123128 ret void
124129 }
125130
126131 ; GCN-LABEL: {{^}}fadd_v2f16_imm_b:
127 ; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
132 ; GCN-DAG: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
128133 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
129134 ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
130135 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
146151 <2 x half> addrspace(1)* %r,
147152 <2 x half> addrspace(1)* %a) {
148153 entry:
149 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
154 %tid = call i32 @llvm.amdgcn.workitem.id.x()
155 %gep.a = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %a, i32 %tid
156 %a.val = load <2 x half>, <2 x half> addrspace(1)* %gep.a
150157 %r.val = fadd <2 x half> %a.val,
151158 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
152159 ret void
153160 }
161
162 declare i32 @llvm.amdgcn.workitem.id.x() #1
163
164 attributes #0 = { nounwind }
165 attributes #1 = { nounwind readnone }
44 ; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}
55 define amdgpu_kernel void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
66 double addrspace(1)* %in2) {
7 %r0 = load double, double addrspace(1)* %in1
8 %r1 = load double, double addrspace(1)* %in2
7 %tid = call i32 @llvm.amdgcn.workitem.id.x()
8 %gep1 = getelementptr inbounds double, double addrspace(1)* %in1, i32 %tid
9 %gep2 = getelementptr inbounds double, double addrspace(1)* %in2, i32 %tid
10 %r0 = load double, double addrspace(1)* %gep1
11 %r1 = load double, double addrspace(1)* %gep2
912 %r2 = fadd double %r0, %r1
1013 store double %r2, double addrspace(1)* %out
1114 ret void
4144 store <2 x double> %r2, <2 x double> addrspace(1)* %out
4245 ret void
4346 }
47
48 declare i32 @llvm.amdgcn.workitem.id.x() #1
49
50 attributes #0 = { nounwind }
51 attributes #1 = { nounwind readnone }
44 declare half @llvm.canonicalize.f16(half) #0
55 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #0
66 declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
7 declare i32 @llvm.amdgcn.workitem.id.x() #0
8
79
810 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f16:
911 ; GCN: v_mul_f16_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
212214 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}}
213215 ; GFX9: buffer_store_dword [[REG]]
214216 define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
215 %val = load <2 x half>, <2 x half> addrspace(1)* %out
217 %tid = call i32 @llvm.amdgcn.workitem.id.x()
218 %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
219 %val = load <2 x half>, <2 x half> addrspace(1)* %gep
216220 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
217221 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
218222 ret void
232236 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]]{{$}}
233237 ; GCN: buffer_store_dword
234238 define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
235 %val = load <2 x half>, <2 x half> addrspace(1)* %out
239 %tid = call i32 @llvm.amdgcn.workitem.id.x()
240 %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
241 %val = load <2 x half>, <2 x half> addrspace(1)* %gep
236242 %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
237243 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs)
238244 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
250256 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]] neg_lo:[0,1] neg_hi:[0,1]{{$}}
251257 ; GCN: buffer_store_dword
252258 define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
253 %val = load <2 x half>, <2 x half> addrspace(1)* %out
259 %tid = call i32 @llvm.amdgcn.workitem.id.x()
260 %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
261 %val = load <2 x half>, <2 x half> addrspace(1)* %gep
254262 %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
255263 %val.fabs.fneg = fsub <2 x half> , %val.fabs
256264 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs.fneg)
269277 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}}
270278 ; GFX9: buffer_store_dword [[REG]]
271279 define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
272 %val = load <2 x half>, <2 x half> addrspace(1)* %out
280 %tid = call i32 @llvm.amdgcn.workitem.id.x()
281 %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
282 %val = load <2 x half>, <2 x half> addrspace(1)* %gep
273283 %fneg.val = fsub <2 x half> , %val
274284 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %fneg.val)
275285 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
11
22 declare float @llvm.fabs.f32(float) #0
33 declare float @llvm.canonicalize.f32(float) #0
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 ; GCN-LABEL: {{^}}fcmp_f16_lt
44 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
11 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s
22
33 ; CHECK-LABEL: {{^}}flt_f64:
4 ; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
4 ; CHECK: v_cmp_nge_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
55 define amdgpu_kernel void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
66 double addrspace(1)* %in2) {
77 %r0 = load double, double addrspace(1)* %in1
1313 }
1414
1515 ; CHECK-LABEL: {{^}}fle_f64:
16 ; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
16 ; CHECK: v_cmp_ngt_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
1717 define amdgpu_kernel void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
1818 double addrspace(1)* %in2) {
1919 %r0 = load double, double addrspace(1)* %in1
2525 }
2626
2727 ; CHECK-LABEL: {{^}}fgt_f64:
28 ; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
28 ; CHECK: v_cmp_nle_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
2929 define amdgpu_kernel void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
3030 double addrspace(1)* %in2) {
3131 %r0 = load double, double addrspace(1)* %in1
3737 }
3838
3939 ; CHECK-LABEL: {{^}}fge_f64:
40 ; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
40 ; CHECK: v_cmp_nlt_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
4141 define amdgpu_kernel void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
4242 double addrspace(1)* %in2) {
4343 %r0 = load double, double addrspace(1)* %in1
4949 }
5050
5151 ; CHECK-LABEL: {{^}}fne_f64:
52 ; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
52 ; CHECK: v_cmp_neq_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
5353 define amdgpu_kernel void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
5454 double addrspace(1)* %in2) {
5555 %r0 = load double, double addrspace(1)* %in1
6161 }
6262
6363 ; CHECK-LABEL: {{^}}feq_f64:
64 ; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
64 ; CHECK: v_cmp_nlg_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
6565 define amdgpu_kernel void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
6666 double addrspace(1)* %in2) {
6767 %r0 = load double, double addrspace(1)* %in1
55 ; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0
66
77 define amdgpu_kernel void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
8 %r1 = load double, double addrspace(1)* %in
8 %tid = call i32 @llvm.amdgcn.workitem.id.x()
9 %gep = getelementptr inbounds double, double addrspace(1)* %in, i32 %tid
10 %r1 = load double, double addrspace(1)* %gep
911 %r2 = fadd double %r1, 5.000000e+00
1012 store double %r2, double addrspace(1)* %out
1113 ret void
1214 }
15
16 declare i32 @llvm.amdgcn.workitem.id.x() #1
17
18 attributes #0 = { nounwind }
19 attributes #1 = { nounwind readnone }
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX8 %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX8 %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
33
44 declare half @llvm.copysign.f16(half, half)
55 declare float @llvm.copysign.f32(float, float)
88 declare <3 x half> @llvm.copysign.v3f16(<3 x half>, <3 x half>)
99 declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>)
1010
11 declare i32 @llvm.amdgcn.workitem.id.x()
12
1113 ; GCN-LABEL: {{^}}test_copysign_f16:
12 ; SI: buffer_load_ushort v[[SIGN:[0-9]+]]
13 ; SI: buffer_load_ushort v[[MAG:[0-9]+]]
14 ; SI: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
15 ; SI: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
1416 ; SI: s_brev_b32 s[[CONST:[0-9]+]], -2
1517 ; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
1618 ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
1719 ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_F32]]
1820 ; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]]
19 ; GFX89: buffer_load_ushort v[[SIGN:[0-9]+]]
20 ; GFX89: buffer_load_ushort v[[MAG:[0-9]+]]
21 ; GFX89: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
22 ; GFX89: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
2123 ; GFX89: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
2224 ; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN]]
2325 ; GCN: buffer_store_short v[[OUT]]
3537 }
3638
3739 ; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f16_sign_f32:
38 ; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
39 ; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]]
40 ; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
41 ; GCN-DAG: {{buffer|flat}}_load_dword v[[SIGN:[0-9]+]]
4042 ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
4143 ; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]]
4244 ; GCN: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_EXT]], v[[SIGN]]
4749 half addrspace(1)* %arg_mag,
4850 float addrspace(1)* %arg_sign) {
4951 entry:
50 %mag = load half, half addrspace(1)* %arg_mag
52 %tid = call i32 @llvm.amdgcn.workitem.id.x()
53 %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
54 %mag = load half, half addrspace(1)* %arg_mag_gep
5155 %mag.ext = fpext half %mag to float
52 %sign = load float, float addrspace(1)* %arg_sign
56 %arg_sign_gep = getelementptr float, float addrspace(1)* %arg_sign, i32 %tid
57 %sign = load float, float addrspace(1)* %arg_sign_gep
5358 %out = call float @llvm.copysign.f32(float %mag.ext, float %sign)
5459 store float %out, float addrspace(1)* %arg_out
5560 ret void
5661 }
5762
5863 ; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f16_sign_f64:
59 ; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
60 ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
64 ; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
65 ; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
6166 ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
6267 ; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]]
6368 ; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[MAG_EXT_LO:[0-9]+]]:[[MAG_EXT_HI:[0-9]+]]{{\]}}, v[[MAG_EXT]]
6974 half addrspace(1)* %arg_mag,
7075 double addrspace(1)* %arg_sign) {
7176 entry:
72 %mag = load half, half addrspace(1)* %arg_mag
77 %tid = call i32 @llvm.amdgcn.workitem.id.x()
78 %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
79 %mag = load half, half addrspace(1)* %arg_mag_gep
7380 %mag.ext = fpext half %mag to double
74 %sign = load double, double addrspace(1)* %arg_sign
81 %arg_sign_gep = getelementptr double, double addrspace(1)* %arg_sign, i32 %tid
82 %sign = load double, double addrspace(1)* %arg_sign_gep
7583 %out = call double @llvm.copysign.f64(double %mag.ext, double %sign)
7684 store double %out, double addrspace(1)* %arg_out
7785 ret void
7886 }
7987
8088 ; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f32_sign_f16:
81 ; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]]
82 ; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
89 ; GCN-DAG: {{buffer|flat}}_load_dword v[[MAG:[0-9]+]]
90 ; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
8391 ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
8492 ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
8593 ; SI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_F32]]
92100 float addrspace(1)* %arg_mag,
93101 half addrspace(1)* %arg_sign) {
94102 entry:
95 %mag = load float, float addrspace(1)* %arg_mag
96 %sign = load half, half addrspace(1)* %arg_sign
103 %tid = call i32 @llvm.amdgcn.workitem.id.x()
104 %arg_mag_gep = getelementptr float, float addrspace(1)* %arg_mag, i32 %tid
105 %mag = load float, float addrspace(1)* %arg_mag_gep
106 %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
107 %sign = load half, half addrspace(1)* %arg_sign_gep
97108 %sign.ext = fpext half %sign to float
98109 %out = call float @llvm.copysign.f32(float %mag, float %sign.ext)
99110 store float %out, float addrspace(1)* %arg_out
101112 }
102113
103114 ; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f64_sign_f16:
104 ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]{{\]}}
105 ; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
115 ; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]{{\]}}
116 ; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
106117 ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
107118 ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
108119 ; SI: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_HI]], v[[SIGN_F32]]
115126 double addrspace(1)* %arg_mag,
116127 half addrspace(1)* %arg_sign) {
117128 entry:
118 %mag = load double, double addrspace(1)* %arg_mag
119 %sign = load half, half addrspace(1)* %arg_sign
129 %tid = call i32 @llvm.amdgcn.workitem.id.x()
130 %arg_mag_gep = getelementptr double, double addrspace(1)* %arg_mag, i32 %tid
131 %mag = load double, double addrspace(1)* %arg_mag_gep
132 %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
133 %sign = load half, half addrspace(1)* %arg_sign_gep
120134 %sign.ext = fpext half %sign to double
121135 %out = call double @llvm.copysign.f64(double %mag, double %sign.ext)
122136 store double %out, double addrspace(1)* %arg_out
124138 }
125139
126140 ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f32:
127 ; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
128 ; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]]
141 ; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
142 ; GCN-DAG: {{buffer|flat}}_load_dword v[[SIGN:[0-9]+]]
129143 ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
130144 ; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
131145 ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN]]
140154 half addrspace(1)* %arg_mag,
141155 float addrspace(1)* %arg_sign) {
142156 entry:
143 %mag = load half, half addrspace(1)* %arg_mag
144 %sign = load float, float addrspace(1)* %arg_sign
157 %tid = call i32 @llvm.amdgcn.workitem.id.x()
158 %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
159 %mag = load half, half addrspace(1)* %arg_mag_gep
160 %arg_sign_gep = getelementptr float, float addrspace(1)* %arg_sign, i32 %tid
161 %sign = load float, float addrspace(1)* %arg_sign_gep
145162 %sign.trunc = fptrunc float %sign to half
146163 %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
147164 store half %out, half addrspace(1)* %arg_out
149166 }
150167
151168 ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f64:
152 ; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
153 ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
169 ; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
170 ; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
154171 ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
155172 ; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
156173 ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_HI]]
165182 half addrspace(1)* %arg_mag,
166183 double addrspace(1)* %arg_sign) {
167184 entry:
185 %tid = call i32 @llvm.amdgcn.workitem.id.x()
186 %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
168187 %mag = load half, half addrspace(1)* %arg_mag
169 %sign = load double, double addrspace(1)* %arg_sign
188 %arg_sign_gep = getelementptr double, double addrspace(1)* %arg_sign, i32 %tid
189 %sign = load double, double addrspace(1)* %arg_sign_gep
170190 %sign.trunc = fptrunc double %sign to half
171191 %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
172192 store half %out, half addrspace(1)* %arg_out
174194 }
175195
176196 ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f32_sign_f16:
177 ; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]]
178 ; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
197 ; GCN-DAG: {{buffer|flat}}_load_dword v[[MAG:[0-9]+]]
198 ; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
179199 ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
180200 ; SI-DAG: v_cvt_f16_f32_e32 v[[MAG_TRUNC:[0-9]+]], v[[MAG]]
181201 ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
192212 float addrspace(1)* %arg_mag,
193213 half addrspace(1)* %arg_sign) {
194214 entry:
195 %mag = load float, float addrspace(1)* %arg_mag
215 %tid = call i32 @llvm.amdgcn.workitem.id.x()
216 %arg_mag_gep = getelementptr float, float addrspace(1)* %arg_mag, i32 %tid
217 %mag = load float, float addrspace(1)* %arg_mag_gep
196218 %mag.trunc = fptrunc float %mag to half
197 %sign = load half, half addrspace(1)* %arg_sign
219 %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
220 %sign = load half, half addrspace(1)* %arg_sign_gep
198221 %out = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
199222 store half %out, half addrspace(1)* %arg_out
200223 ret void
None ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
33
44 ; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
55 ; beneficial even without fp32 denormals, but they do require no-infs-fp-math
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
22
33 declare double @llvm.fma.f64(double, double, double) nounwind readnone
44 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
None ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
22
33 declare float @llvm.fma.f32(float, float, float) nounwind readnone
44 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 ; GCN-LABEL: {{^}}fmul_f16
44 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
None ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
22
33 ; FUNC-LABEL: {{^}}fmul_f64:
44 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
None ; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s
1 ; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s
2 ; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s
3 ; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s
4
5 ; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s
6 ; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
7 ; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s
8 ; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s
4
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s
6 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
7 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s
8 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s
99
1010 ; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow.
1111
None ; RUN: llc -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
1 ; RUN: llc -march=amdgcn -mcpu=verde -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
2 ; RUN: llc -march=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
3 ; RUN: llc -march=amdgcn -mcpu=verde -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
4 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI %s
5 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI %s
66
77 ; GCN-LABEL: {{^}}fmuladd_f64:
88 ; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
None ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
1 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
44
5 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
6 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
7 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
8 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
6 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
7 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
8 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
99
1010 declare i32 @llvm.amdgcn.workitem.id.x() #1
1111 declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
33
44 ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
55 ; SI-NOT: and
None ; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s
33
44 ; FIXME: Should be able to do scalar op
55 ; GCN-LABEL: {{^}}s_fneg_f16:
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
2 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
33
44 declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
55
None ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
33
44 ; GCN-LABEL: {{^}}fpext_f16_to_f32
55 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
None ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 ; GCN-LABEL: {{^}}fptosi_f16_to_i16
44 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
None ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 ; GCN-LABEL: {{^}}fptoui_f16_to_i16
44 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
None ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
33
44 ; GCN-LABEL: {{^}}fptrunc_f32_to_f16:
55 ; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
33
4 ; RUN: llc -march=amdgcn -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
5 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=VI-UNSAFE -check-prefix=FUNC %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=VI-UNSAFE -check-prefix=FUNC %s
66
77 declare double @llvm.fabs.f64(double) #0
88 declare double @llvm.floor.f64(double) #0
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=CI %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s
3 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
4 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=CI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
55
66 declare float @llvm.fabs.f32(float) #0
77 declare float @llvm.floor.f32(float) #0
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
33
44 ; FUNC-LABEL: {{^}}frem_f32:
55 ; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}}
None ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
22
33 ; FUNC-LABEL: {{^}}v_safe_fsqrt_f64:
44 ; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
None ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
2 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
33
44
55 ; Run with unsafe-fp-math to make sure nothing tries to turn this into 1 / rsqrt(x)
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI -check-prefix=SIVI %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI -check-prefix=SIVI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
33
44 ; GCN-LABEL: {{^}}fsub_f16:
55 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
33
44 ; FUNC-LABEL: {{^}}v_fsub_f32:
55 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
None ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
22
33 declare double @llvm.fabs.f64(double) #0
44
None ; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
33
44 declare double @llvm.trunc.f64(double) nounwind readnone
55 declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone
None ; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
22 ; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
33 ; FIXME: cypress is broken because the bigger testcases spill and it's not implemented
44
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 ; half args should be promoted to float for SI and lower.
44
None ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 ; Use a 64-bit value with lo bits that can be represented as an inline constant
44 ; GCN-LABEL: {{^}}i64_imm_inline_lo:
None ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
33 ; FIXME: Merge into imm.ll
44
55 ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16:
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
44
55 ; Tests for indirect addressing on SI, which is implemented using dynamic
66 ; indexing of vectors.
None ; RUN: llc -march=amdgcn -amdgpu-load-store-vectorizer=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -amdgpu-load-store-vectorizer=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
11
22 ; GatherAllAliases gives up on trying to analyze cases where the
33 ; pointer may have been loaded from an aliased store, so make sure
None ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
11
22 declare half @llvm.fabs.f16(half %a)
33 declare i1 @llvm.amdgcn.class.f16(half %a, i32 %b)
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
11
22 declare i1 @llvm.amdgcn.class.f32(float, i32) #1
33 declare i1 @llvm.amdgcn.class.f64(double, i32) #1
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s
11 ; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=VI %s
22
33 ; FIXME: Enable for VI.
None ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
11
22 declare half @llvm.amdgcn.ldexp.f16(half %a, i32 %b)
33
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 ; GCN-LABEL: {{^}}bfe_i32_arg_arg_arg:
44 ; GCN: v_bfe_i32
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
22
33 declare i32 @llvm.amdgcn.sffbh.i32(i32) #1
44
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
22
33 declare double @llvm.amdgcn.trig.preop.f64(double, i32) nounwind readnone
44
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 ; GCN-LABEL: {{^}}bfe_u32_arg_arg_arg:
44 ; GCN: v_bfe_u32
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 declare half @llvm.ceil.f16(half %a)
44 declare <2 x half> @llvm.ceil.v2f16(<2 x half> %a)
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 declare half @llvm.cos.f16(half %a)
44 declare <2 x half> @llvm.cos.v2f16(<2 x half> %a)
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 declare half @llvm.exp2.f16(half %a)
44 declare <2 x half> @llvm.exp2.v2f16(<2 x half> %a)
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 declare half @llvm.floor.f16(half %a)
44 declare <2 x half> @llvm.floor.v2f16(<2 x half> %a)
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 declare half @llvm.fma.f16(half %a, half %b, half %c)
44 declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
None ; RUN: llc -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
2 ; RUN: llc -march=amdgcn -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s
3 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
44
55 declare half @llvm.fmuladd.f16(half %a, half %b, half %c)
66 declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 declare half @llvm.log2.f16(half %a)
44 declare <2 x half> @llvm.log2.v2f16(<2 x half> %a)
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 declare half @llvm.maxnum.f16(half %a, half %b)
44 declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 declare half @llvm.minnum.f16(half %a, half %b)
44 declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=GFX89 %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=GFX89 %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
33
44 declare half @llvm.rint.f16(half %a)
55 declare <2 x half> @llvm.rint.v2f16(<2 x half> %a)
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 declare half @llvm.sin.f16(half %a)
44 declare <2 x half> @llvm.sin.v2f16(<2 x half> %a)
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 declare half @llvm.sqrt.f16(half %a)
44 declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 declare half @llvm.trunc.f16(half %a)
44 declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a)
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
1 ; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
33
4 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
5 ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
66
77 ; FUNC-LABEL: {{^}}global_load_f32:
88 ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
1 ; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
33
44 ; FUNC-LABEL: {{^}}global_load_f64:
55 ; GCN-NOHSA: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
3 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s
4 ; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s
55
66 ; FIXME: r600 is broken because the bigger testcases spill and it's not implemented
77
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
3 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
44
55
66 ; FUNC-LABEL: {{^}}global_load_i32:
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
33
4 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
5 ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
66
77 ; FUNC-LABEL: {{^}}global_load_i64:
88 ; GCN-NOHSA: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s
3 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
4 ; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
55
66
77 ; FUNC-LABEL: {{^}}global_load_i8:
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
3 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=EG -check-prefix=FUNC %s
4 ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=EG -check-prefix=FUNC %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s
55
66 ; FUNC-LABEL: {{^}}load_i24:
77 ; SI: {{flat|buffer}}_load_ubyte
None ; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
22
33
44 ; FUNC-LABEL: {{^}}v_test_imax_sge_i32:
None ; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
22
33 ; This test is mostly to test DAG store merging, so disable the vectorizer.
44 ; Run with devices with different unaligned load restrictions.
None ; RUN: llc -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
11
22 declare i32 @llvm.amdgcn.workitem.id.x() readnone
33
None ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC
33
44 ; mul24 and mad24 are affected
55
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
11
22 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
33
None ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
33
44
55 ; FUNC-LABEL: {{^}}or_v2i32:
None ; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
11
22 declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) #0
33 declare void @llvm.invariant.end.p0i8({}*, i64, i8* nocapture) #0
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
33
44 ; GCN-LABEL: {{^}}reduce_i64_load_align_4_width_to_i32:
55 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
None ; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s
22
33 ; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store:
44 ; SI: buffer_load_dwordx4
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
22
33 ; BOTH-LABEL: {{^}}s_rotl_i64:
44 ; BOTH-DAG: s_lshl_b64
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
22
33 ; BOTH-LABEL: {{^}}s_rotr_i64:
44 ; BOTH-DAG: s_sub_i32
None ; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s
22
33 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
44 declare float @llvm.sqrt.f32(float) nounwind readnone
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
22
33 ; SI-LABEL: {{^}}s_movk_i32_k0:
44 ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}}
None ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs< %s
33
44 declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
55 declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
None ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s
2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s
33
44 declare i32 @llvm.amdgcn.workitem.id.x() #0
55 declare i32 @llvm.amdgcn.workitem.id.y() #0
None ; RUN: llc -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 ; XXX - Why the packing?
44 ; GCN-LABEL: {{^}}scalar_to_vector_v2i32:
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
11
22 ; FIXME: This currently doesn't do a great job of clustering the
33 ; loads, which end up with extra moves between them. Right now, it
None ; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
1 ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
22
33 ; When a frame index offset is more than 12-bits, make sure we don't store
44 ; it in mubuf's offset field.
None ; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
33
44 ; The code generated by sdiv is long and complex and may frequently change.
55 ; The goal of this test is to make sure the ISel doesn't fail.
None ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s
33
44 ; GCN-LABEL: {{^}}add_shr_i32:
55 ; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
None ; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
2 ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
33
44 ; Test expansion of scalar selects on vectors.
55 ; Evergreen not enabled since it seems to be having problems with doubles.
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 ; GCN-LABEL: {{^}}select_f16:
44 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
3 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
44
55 ; FIXME: i16 promotion pass ruins the scalar cases when legal.
66 ; FIXME: r600 fails verifier
None ; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
22
33 ; Copy VGPR -> SGPR used twice as an instruction operand, which is then
44 ; used in an REG_SEQUENCE that also needs to be handled.
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
0 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
22
33 ; CHECK-LABEL: {{^}}phi1:
44 ; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0