llvm.org GIT mirror llvm / 0a25612
Revert r307026, "[AMDGPU] Switch scalarize global loads ON by default" It broke a testcase. Failing Tests (1): LLVM :: CodeGen/AMDGPU/alignbit-pat.ll git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@307054 91177308-0d34-0410-b5e6-96231b3b80d8 NAKAMURA Takumi 2 years ago
141 changed file(s) with 570 addition(s) and 800 deletion(s). Raw diff Collapse all Expand all
8484 static cl::opt<bool> ScalarizeGlobal(
8585 "amdgpu-scalarize-global-loads",
8686 cl::desc("Enable global load scalarization"),
87 cl::init(true),
87 cl::init(false),
8888 cl::Hidden);
8989
9090 // Option to run internalize pass.
44 ;FUNC-LABEL: {{^}}test1:
55 ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
66
7 ;SI: s_add_i32 s[[REG:[0-9]+]], {{s[0-9]+, s[0-9]+}}
8 ;SI: v_mov_b32_e32 v[[REG]], s[[REG]]
9 ;SI: buffer_store_dword v[[REG]],
7 ;SI: v_add_i32_e32 [[REG:v[0-9]+]], vcc, {{v[0-9]+, v[0-9]+}}
8 ;SI-NOT: [[REG]]
9 ;SI: buffer_store_dword [[REG]],
1010 define amdgpu_kernel void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
1111 %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
1212 %a = load i32, i32 addrspace(1)* %in
2020 ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
2121 ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
2222
23 ;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
24 ;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
23 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
24 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
2525
2626 define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
2727 %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
3838 ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
3939 ;EG: ADD_INT {{[* ]*}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
4040
41 ;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
42 ;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
43 ;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
44 ;SI: s_add_i32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
41 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
42 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
43 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
44 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
4545
4646 define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
4747 %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
1818
1919 ; Check that the SGPR add operand is correctly moved to a VGPR.
2020 ; GCN-LABEL: {{^}}sgpr_operand:
21 ; GCN: s_add_u32
22 ; GCN: s_addc_u32
23 ; GCN: s_addc_u32
24 ; GCN: s_addc_u32
21 ; GCN: v_add_i32
22 ; GCN: v_addc_u32
23 ; GCN: v_addc_u32
24 ; GCN: v_addc_u32
2525 define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
2626 %foo = load i128, i128 addrspace(1)* %in, align 8
2727 %result = add i128 %foo, %a
3030 }
3131
3232 ; GCN-LABEL: {{^}}sgpr_operand_reversed:
33 ; GCN: s_add_u32
34 ; GCN: s_addc_u32
35 ; GCN: s_addc_u32
36 ; GCN: s_addc_u32
33 ; GCN: v_add_i32
34 ; GCN: v_addc_u32
35 ; GCN: v_addc_u32
36 ; GCN: v_addc_u32
3737 define amdgpu_kernel void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
3838 %foo = load i128, i128 addrspace(1)* %in, align 8
3939 %result = add i128 %a, %foo
1818
1919 ; Check that the SGPR add operand is correctly moved to a VGPR.
2020 ; SI-LABEL: {{^}}sgpr_operand:
21 ; SI: s_add_u32
22 ; SI: s_addc_u32
21 ; SI: v_add_i32
22 ; SI: v_addc_u32
2323 define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
2424 %foo = load i64, i64 addrspace(1)* %in, align 8
2525 %result = add i64 %foo, %a
3131 ; SGPR as other operand.
3232 ;
3333 ; SI-LABEL: {{^}}sgpr_operand_reversed:
34 ; SI: s_add_u32
35 ; SI: s_addc_u32
34 ; SI: v_add_i32
35 ; SI: v_addc_u32
3636 define amdgpu_kernel void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
3737 %foo = load i64, i64 addrspace(1)* %in, align 8
3838 %result = add i64 %a, %foo
11 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
22
33 ; FUNC-LABEL: {{^}}v_and_i64_br:
4 ; SI: s_and_b64
4 ; SI: v_and_b32
5 ; SI: v_and_b32
56 define amdgpu_kernel void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
67 entry:
78 %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
77 ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
88 ; EG: AND_INT {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
99
10 ; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
11 ; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
10 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
11 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
1212
1313 define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
1414 %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
2525 ; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
2626 ; EG: AND_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
2727
28
29 ; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
30 ; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
31 ; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
32 ; SI: s_and_b32 s{{[0-9]+, s[0-9]+, s[0-9]+}}
28 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
29 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
30 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
31 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
3332
3433 define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
3534 %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
136135 ; FUNC-LABEL: {{^}}v_and_constant_i32
137136 ; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}}
138137 define amdgpu_kernel void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
139 %tid = call i32 @llvm.r600.read.tidig.x() #0
140 %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
141 %a = load i32, i32 addrspace(1)* %gep, align 4
138 %a = load i32, i32 addrspace(1)* %aptr, align 4
142139 %and = and i32 %a, 1234567
143140 store i32 %and, i32 addrspace(1)* %out, align 4
144141 ret void
147144 ; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32
148145 ; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}}
149146 define amdgpu_kernel void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
150 %tid = call i32 @llvm.r600.read.tidig.x() #0
151 %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
152 %a = load i32, i32 addrspace(1)* %gep, align 4
147 %a = load i32, i32 addrspace(1)* %aptr, align 4
153148 %and = and i32 %a, 64
154149 store i32 %and, i32 addrspace(1)* %out, align 4
155150 ret void
158153 ; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32
159154 ; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}}
160155 define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
161 %tid = call i32 @llvm.r600.read.tidig.x() #0
162 %gep = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
163 %a = load i32, i32 addrspace(1)* %gep, align 4
156 %a = load i32, i32 addrspace(1)* %aptr, align 4
164157 %and = and i32 %a, -16
165158 store i32 %and, i32 addrspace(1)* %out, align 4
166159 ret void
245238 ; SI: v_and_b32
246239 ; SI: v_and_b32
247240 define amdgpu_kernel void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
248 %tid = call i32 @llvm.r600.read.tidig.x() #0
249 %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
250 %a = load i64, i64 addrspace(1)* %gep.a, align 8
251 %gep.b = getelementptr i64, i64 addrspace(1)* %bptr, i32 %tid
252 %b = load i64, i64 addrspace(1)* %gep.b, align 8
241 %a = load i64, i64 addrspace(1)* %aptr, align 8
242 %b = load i64, i64 addrspace(1)* %bptr, align 8
253243 %and = and i64 %a, %b
254244 store i64 %and, i64 addrspace(1)* %out, align 8
255245 ret void
260250 ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, {{v[0-9]+}}
261251 ; SI: buffer_store_dwordx2
262252 define amdgpu_kernel void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
263 %tid = call i32 @llvm.r600.read.tidig.x() #0
264 %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
265 %a = load i64, i64 addrspace(1)* %gep.a, align 8
253 %a = load i64, i64 addrspace(1)* %aptr, align 8
266254 %and = and i64 %a, 1231231234567
267255 store i64 %and, i64 addrspace(1)* %out, align 8
268256 ret void
310298 }
311299
312300 ; FUNC-LABEL: {{^}}v_and_i64_32_bit_constant:
313 ; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
301 ; SI: buffer_load_dword [[VAL:v[0-9]+]]
314302 ; SI-NOT: and
315303 ; SI: v_and_b32_e32 {{v[0-9]+}}, 0x12d687, [[VAL]]
316304 ; SI-NOT: and
317305 ; SI: buffer_store_dwordx2
318306 define amdgpu_kernel void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
319 %tid = call i32 @llvm.r600.read.tidig.x() #0
320 %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
321 %a = load i64, i64 addrspace(1)* %gep.a, align 8
307 %a = load i64, i64 addrspace(1)* %aptr, align 8
322308 %and = and i64 %a, 1234567
323309 store i64 %and, i64 addrspace(1)* %out, align 8
324310 ret void
325311 }
326312
327313 ; FUNC-LABEL: {{^}}v_and_inline_imm_i64:
328 ; SI: {{buffer|flat}}_load_dword v{{[0-9]+}}
314 ; SI: buffer_load_dword v{{[0-9]+}}
329315 ; SI-NOT: and
330316 ; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}}
331317 ; SI-NOT: and
332318 ; SI: buffer_store_dwordx2
333319 define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
334 %tid = call i32 @llvm.r600.read.tidig.x() #0
335 %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
336 %a = load i64, i64 addrspace(1)* %gep.a, align 8
320 %a = load i64, i64 addrspace(1)* %aptr, align 8
337321 %and = and i64 %a, 64
338322 store i64 %and, i64 addrspace(1)* %out, align 8
339323 ret void
341325
342326 ; FIXME: Should be able to reduce load width
343327 ; FUNC-LABEL: {{^}}v_and_inline_neg_imm_i64:
344 ; SI: {{buffer|flat}}_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
328 ; SI: buffer_load_dwordx2 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
345329 ; SI-NOT: and
346330 ; SI: v_and_b32_e32 v[[VAL_LO]], -8, v[[VAL_LO]]
347331 ; SI-NOT: and
348332 ; SI: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
349333 define amdgpu_kernel void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
350 %tid = call i32 @llvm.r600.read.tidig.x() #0
351 %gep.a = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
352 %a = load i64, i64 addrspace(1)* %gep.a, align 8
334 %a = load i64, i64 addrspace(1)* %aptr, align 8
353335 %and = and i64 %a, -8
354336 store i64 %and, i64 addrspace(1)* %out, align 8
355337 ret void
566548 store i64 %and, i64 addrspace(1)* %out, align 8
567549 ret void
568550 }
551
569552 attributes #0 = { nounwind readnone }
11 ; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
22
33 ; GCN-LABEL: {{^}}any_extend_vector_inreg_v16i8_to_v4i32:
4 ; GCN: s_load_dwordx4
5 ; GCN-DAG: s_load_dwordx4
6 ; GCN-DAG: s_load_dword
4 ; GCN: {{buffer|flat}}_load_dwordx4
5 ; GCN-DAG: {{buffer|flat}}_load_dwordx4
6 ; GCN-DAG: {{buffer|flat}}_load_dword
77
88 ; GCN: {{buffer|flat}}_store_byte
99 ; GCN: {{buffer|flat}}_store_byte
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
11 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
22 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
3
4 declare i32 @llvm.amdgcn.workitem.id.x() #1
53
64 declare i16 @llvm.bitreverse.i16(i16) #1
75 declare i32 @llvm.bitreverse.i32(i32) #1
4341 }
4442
4543 ; FUNC-LABEL: {{^}}v_brev_i32:
46 ; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
44 ; SI: buffer_load_dword [[VAL:v[0-9]+]],
4745 ; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
4846 ; SI: buffer_store_dword [[RESULT]],
4947 ; SI: s_endpgm
5048 define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
51 %tid = call i32 @llvm.amdgcn.workitem.id.x()
52 %gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
53 %val = load i32, i32 addrspace(1)* %gep
49 %val = load i32, i32 addrspace(1)* %valptr
5450 %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
5551 store i32 %brev, i32 addrspace(1)* %out
5652 ret void
6965 ; SI: v_bfrev_b32_e32
7066 ; SI: v_bfrev_b32_e32
7167 define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 {
72 %tid = call i32 @llvm.amdgcn.workitem.id.x()
73 %gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
74 %val = load <2 x i32>, <2 x i32> addrspace(1)* %gep
68 %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr
7569 %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
7670 store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
7771 ret void
8781 ; FUNC-LABEL: {{^}}v_brev_i64:
8882 ; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0
8983 define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
90 %tid = call i32 @llvm.amdgcn.workitem.id.x()
91 %gep = getelementptr i64, i64 addrspace(1)* %valptr, i32 %tid
92 %val = load i64, i64 addrspace(1)* %gep
84 %val = load i64, i64 addrspace(1)* %valptr
9385 %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
9486 store i64 %brev, i64 addrspace(1)* %out
9587 ret void
10496
10597 ; FUNC-LABEL: {{^}}v_brev_v2i64:
10698 define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 {
107 %tid = call i32 @llvm.amdgcn.workitem.id.x()
108 %gep = getelementptr <2 x i64> , <2 x i64> addrspace(1)* %valptr, i32 %tid
109 %val = load <2 x i64>, <2 x i64> addrspace(1)* %gep
99 %val = load <2 x i64>, <2 x i64> addrspace(1)* %valptr
110100 %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
111101 store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
112102 ret void
99 declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone
1010
1111 ; FUNC-LABEL: @test_bswap_i32
12 ; SI: s_load_dword [[VAL:s[0-9]+]]
12 ; SI: buffer_load_dword [[VAL:v[0-9]+]]
1313 ; SI-DAG: v_alignbit_b32 [[TMP0:v[0-9]+]], [[VAL]], [[VAL]], 8
1414 ; SI-DAG: v_alignbit_b32 [[TMP1:v[0-9]+]], [[VAL]], [[VAL]], 24
1515 ; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0xff00ff
0 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI -check-prefix=OPT-CIVI %s
11 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI -check-prefix=OPT-CIVI %s
22 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-GFX9 %s
3 ; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
4 ; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
5 ; RUN: llc -march=amdgcn -amdgpu-scalarize-global-loads=false -mcpu=gfx900 -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
3 ; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
4 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
5 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
66
77 ; OPT-LABEL: @test_no_sink_flat_small_offset_i32(
88 ; OPT-CIVI: getelementptr i32, i32 addrspace(4)* %in
3939
4040 ; OPT-LABEL: @test_sink_noop_addrspacecast_flat_to_global_i32(
4141 ; OPT: getelementptr i32, i32 addrspace(4)* %out,
42 ; rOPT-CI-NOT: getelementptr
42 ; OPT-CI-NOT: getelementptr
4343 ; OPT: br i1
4444
4545 ; OPT-CI: addrspacecast
0 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s
11 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s
22 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s
3 ; RUN: llc -march=amdgcn -mcpu=tahiti -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
4 ; RUN: llc -march=amdgcn -mcpu=bonaire -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
5 ; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-scalarize-global-loads=false -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
3 ; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
4 ; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
5 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
66
77 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
88
1111 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
1212 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
1313 ; It's probably OK if this is slightly higher:
14 ; CHECK: ; NumVgprs: 4
14 ; CHECK: ; NumVgprs: 8
1515 define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) {
1616 entry:
1717 %cmpflag = icmp eq i32 %flag, 1
44 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
55
66 ; FUNC-LABEL: {{^}}test_copy_v4i8:
7 ; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
7 ; GCN: buffer_load_dword [[REG:v[0-9]+]]
88 ; GCN: buffer_store_dword [[REG]]
99 ; GCN: s_endpgm
1010 define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
11 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
12 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
13 %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
11 %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
1412 store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
1513 ret void
1614 }
1715
1816 ; FUNC-LABEL: {{^}}test_copy_v4i8_x2:
19 ; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
17 ; GCN: buffer_load_dword [[REG:v[0-9]+]]
2018 ; GCN: buffer_store_dword [[REG]]
2119 ; GCN: buffer_store_dword [[REG]]
2220 ; GCN: s_endpgm
2321 define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
24 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
25 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
26 %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
22 %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
2723 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
2824 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
2925 ret void
3026 }
3127
3228 ; FUNC-LABEL: {{^}}test_copy_v4i8_x3:
33 ; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
29 ; GCN: buffer_load_dword [[REG:v[0-9]+]]
3430 ; GCN: buffer_store_dword [[REG]]
3531 ; GCN: buffer_store_dword [[REG]]
3632 ; GCN: buffer_store_dword [[REG]]
3733 ; GCN: s_endpgm
3834 define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
39 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
40 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
41 %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
35 %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
4236 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
4337 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
4438 store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
4640 }
4741
4842 ; FUNC-LABEL: {{^}}test_copy_v4i8_x4:
49 ; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
43 ; GCN: buffer_load_dword [[REG:v[0-9]+]]
5044 ; GCN: buffer_store_dword [[REG]]
5145 ; GCN: buffer_store_dword [[REG]]
5246 ; GCN: buffer_store_dword [[REG]]
5347 ; GCN: buffer_store_dword [[REG]]
5448 ; GCN: s_endpgm
5549 define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
56 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
57 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
58 %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
50 %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
5951 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
6052 store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
6153 store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
6456 }
6557
6658 ; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use:
67 ; GCN: {{buffer|flat}}_load_dword
59 ; GCN: buffer_load_dword
6860 ; GCN-DAG: v_lshrrev_b32
6961 ; GCN: v_and_b32
7062 ; GCN: v_or_b32
7365
7466 ; GCN: s_endpgm
7567 define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
76 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
77 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
78 %val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
68 %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
7969 %add = add <4 x i8> %val,
8070 store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
8171 store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
10696 }
10797
10898 ; FUNC-LABEL: {{^}}test_copy_v3i8_align4:
109 ; GCN: {{buffer|flat}}_load_dword
99 ; GCN: buffer_load_dword
110100 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
111101 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
112102 ; GCN: s_endpgm
113103 define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
114 %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
115 %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid.x
116 %val = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
104 %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
117105 store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
118106 ret void
119107 }
120108
121109 ; FUNC-LABEL: {{^}}test_copy_v3i8_align2:
122 ; GCN-DAG: {{buffer|flat}}_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
123 ; GCN-DAG: {{buffer|flat}}_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
110 ; GCN-DAG: buffer_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
111 ; GCN-DAG: buffer_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
124112 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
125113 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
126114 ; GCN: s_endpgm
131119 }
132120
133121 ; FUNC-LABEL: {{^}}test_copy_v3i8_align1:
134 ; GCN: {{buffer|flat}}_load_ubyte
135 ; GCN: {{buffer|flat}}_load_ubyte
136 ; GCN: {{buffer|flat}}_load_ubyte
122 ; GCN: buffer_load_ubyte
123 ; GCN: buffer_load_ubyte
124 ; GCN: buffer_load_ubyte
137125
138126 ; GCN: buffer_store_byte
139127 ; GCN: buffer_store_byte
146134 }
147135
148136 ; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load:
149 ; GCN: {{buffer|flat}}_load_ubyte
150 ; GCN: {{buffer|flat}}_load_ubyte
151 ; GCN: {{buffer|flat}}_load_ubyte
152 ; GCN: {{buffer|flat}}_load_ubyte
137 ; GCN: buffer_load_ubyte
138 ; GCN: buffer_load_ubyte
139 ; GCN: buffer_load_ubyte
140 ; GCN: buffer_load_ubyte
153141 ; GCN: buffer_store_dword
154142 ; GCN: s_endpgm
155143 define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
159147 }
160148
161149 ; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_store:
162 ; GCN: {{buffer|flat}}_load_ubyte
163 ; GCN: {{buffer|flat}}_load_ubyte
164 ; GCN: {{buffer|flat}}_load_ubyte
165 ; GCN: {{buffer|flat}}_load_ubyte
150 ; GCN: buffer_load_ubyte
151 ; GCN: buffer_load_ubyte
152 ; GCN: buffer_load_ubyte
153 ; GCN: buffer_load_ubyte
166154 ; GCN: buffer_store_byte
167155 ; GCN: buffer_store_byte
168156 ; GCN: buffer_store_byte
3333 }
3434
3535 ; FUNC-LABEL: {{^}}v_ctlz_i32:
36 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
36 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
3737 ; GCN-DAG: v_ffbh_u32_e32 [[CTLZ:v[0-9]+]], [[VAL]]
3838 ; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[CTLZ]]
3939 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], 32, [[CTLZ]], vcc
4343 ; EG: FFBH_UINT
4444 ; EG: CNDE_INT
4545 define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
46 %tid = call i32 @llvm.r600.read.tidig.x()
47 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
48 %val = load i32, i32 addrspace(1)* %in.gep, align 4
46 %val = load i32, i32 addrspace(1)* %valptr, align 4
4947 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
5048 store i32 %ctlz, i32 addrspace(1)* %out, align 4
5149 ret void
5250 }
5351
5452 ; FUNC-LABEL: {{^}}v_ctlz_v2i32:
55 ; GCN: {{buffer|flat}}_load_dwordx2
53 ; GCN: buffer_load_dwordx2
5654 ; GCN: v_ffbh_u32_e32
5755 ; GCN: v_ffbh_u32_e32
5856 ; GCN: buffer_store_dwordx2
6361 ; EG: FFBH_UINT
6462 ; EG: CNDE_INT
6563 define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
66 %tid = call i32 @llvm.r600.read.tidig.x()
67 %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
68 %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
64 %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
6965 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
7066 store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
7167 ret void
7268 }
7369
7470 ; FUNC-LABEL: {{^}}v_ctlz_v4i32:
75 ; GCN: {{buffer|flat}}_load_dwordx4
71 ; GCN: buffer_load_dwordx4
7672 ; GCN: v_ffbh_u32_e32
7773 ; GCN: v_ffbh_u32_e32
7874 ; GCN: v_ffbh_u32_e32
9389 ; EG-DAG: FFBH_UINT
9490 ; EG-DAG: CNDE_INT
9591 define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
96 %tid = call i32 @llvm.r600.read.tidig.x()
97 %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
98 %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
92 %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
9993 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
10094 store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
10195 ret void
10296 }
10397
10498 ; FUNC-LABEL: {{^}}v_ctlz_i8:
105 ; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
99 ; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
106100 ; SI-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
107101 ; VI-DAG: v_ffbh_u32_sdwa [[RESULT:v[0-9]+]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
108102 ; GCN: buffer_store_byte [[RESULT]],
173167 }
174168
175169 ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_neg1:
176 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
170 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
177171 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
178172 ; GCN: buffer_store_dword [[RESULT]],
179173 ; GCN: s_endpgm
180 define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
181 %tid = call i32 @llvm.r600.read.tidig.x()
182 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
183 %val = load i32, i32 addrspace(1)* %in.gep
174 define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
175 %val = load i32, i32 addrspace(1)* %valptr
184176 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
185177 %cmp = icmp eq i32 %val, 0
186178 %sel = select i1 %cmp, i32 -1, i32 %ctlz
189181 }
190182
191183 ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_neg1:
192 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
184 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
193185 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
194186 ; GCN: buffer_store_dword [[RESULT]],
195187 ; GCN: s_endpgm
196188 define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
197 %tid = call i32 @llvm.r600.read.tidig.x()
198 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
199 %val = load i32, i32 addrspace(1)* %in.gep
189 %val = load i32, i32 addrspace(1)* %valptr
200190 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
201191 %cmp = icmp ne i32 %val, 0
202192 %sel = select i1 %cmp, i32 %ctlz, i32 -1
206196
207197 ; TODO: Should be able to eliminate select here as well.
208198 ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_eq_bitwidth:
209 ; GCN: {{buffer|flat}}_load_dword
199 ; GCN: buffer_load_dword
210200 ; GCN: v_ffbh_u32_e32
211201 ; GCN: v_cmp
212202 ; GCN: v_cndmask
213203 ; GCN: s_endpgm
214204 define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
215 %tid = call i32 @llvm.r600.read.tidig.x()
216 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
217 %val = load i32, i32 addrspace(1)* %in.gep
205 %val = load i32, i32 addrspace(1)* %valptr
218206 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
219207 %cmp = icmp eq i32 %ctlz, 32
220208 %sel = select i1 %cmp, i32 -1, i32 %ctlz
223211 }
224212
225213 ; FUNC-LABEL: {{^}}v_ctlz_i32_sel_ne_bitwidth:
226 ; GCN: {{buffer|flat}}_load_dword
214 ; GCN: buffer_load_dword
227215 ; GCN: v_ffbh_u32_e32
228216 ; GCN: v_cmp
229217 ; GCN: v_cndmask
230218 ; GCN: s_endpgm
231219 define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
232 %tid = call i32 @llvm.r600.read.tidig.x()
233 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
234 %val = load i32, i32 addrspace(1)* %in.gep
220 %val = load i32, i32 addrspace(1)* %valptr
235221 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
236222 %cmp = icmp ne i32 %ctlz, 32
237223 %sel = select i1 %cmp, i32 %ctlz, i32 -1
255241 }
256242
257243 ; FUNC-LABEL: {{^}}v_ctlz_i16_sel_eq_neg1:
258 ; SI: {{buffer|flat}}_load_ushort [[VAL:v[0-9]+]],
244 ; SI: buffer_load_ushort [[VAL:v[0-9]+]],
259245 ; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
260246 ; SI: buffer_store_short [[FFBH]],
261247 define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
2828 }
2929
3030 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32:
31 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
31 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
3232 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
3333 ; GCN: buffer_store_dword [[RESULT]],
3434 ; GCN: s_endpgm
3535 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
3636 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
3737 define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
38 %tid = call i32 @llvm.r600.read.tidig.x()
39 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
40 %val = load i32, i32 addrspace(1)* %in.gep, align 4
38 %val = load i32, i32 addrspace(1)* %valptr, align 4
4139 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
4240 store i32 %ctlz, i32 addrspace(1)* %out, align 4
4341 ret void
4442 }
4543
4644 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v2i32:
47 ; GCN: {{buffer|flat}}_load_dwordx2
45 ; GCN: buffer_load_dwordx2
4846 ; GCN: v_ffbh_u32_e32
4947 ; GCN: v_ffbh_u32_e32
5048 ; GCN: buffer_store_dwordx2
5351 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
5452 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
5553 define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
56 %tid = call i32 @llvm.r600.read.tidig.x()
57 %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
58 %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
54 %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
5955 %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
6056 store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
6157 ret void
6258 }
6359
6460 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_v4i32:
65 ; GCN: {{buffer|flat}}_load_dwordx4
61 ; GCN: buffer_load_dwordx4
6662 ; GCN: v_ffbh_u32_e32
6763 ; GCN: v_ffbh_u32_e32
6864 ; GCN: v_ffbh_u32_e32
7571 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
7672 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
7773 define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
78 %tid = call i32 @llvm.r600.read.tidig.x()
79 %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
80 %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
74 %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
8175 %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
8276 store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
8377 ret void
8478 }
8579
8680 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i8:
87 ; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
81 ; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
8882 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
8983 ; GCN: buffer_store_byte [[RESULT]],
9084 define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
91 %tid = call i32 @llvm.r600.read.tidig.x()
92 %in.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
93 %val = load i8, i8 addrspace(1)* %in.gep
85 %val = load i8, i8 addrspace(1)* %valptr
9486 %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
9587 store i8 %ctlz, i8 addrspace(1)* %out
9688 ret void
152144 }
153145
154146 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1:
155 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
147 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
156148 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
157149 ; GCN: buffer_store_dword [[RESULT]],
158 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
159 %tid = call i32 @llvm.r600.read.tidig.x()
160 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
161 %val = load i32, i32 addrspace(1)* %in.gep
150 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
151 %val = load i32, i32 addrspace(1)* %valptr
162152 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
163153 %cmp = icmp eq i32 %val, 0
164154 %sel = select i1 %cmp, i32 -1, i32 %ctlz
167157 }
168158
169159 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_neg1:
170 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
160 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
171161 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
172162 ; GCN: buffer_store_dword [[RESULT]],
173163 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
174 %tid = call i32 @llvm.r600.read.tidig.x()
175 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
176 %val = load i32, i32 addrspace(1)* %in.gep
164 %val = load i32, i32 addrspace(1)* %valptr
177165 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
178166 %cmp = icmp ne i32 %val, 0
179167 %sel = select i1 %cmp, i32 %ctlz, i32 -1
197185 }
198186
199187 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
200 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
188 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
201189 ; GCN-DAG: v_ffbh_u32_e32 [[RESULT0:v[0-9]+]], [[VAL]]
202190 ; GCN-DAG: v_cmp_eq_u32_e32 vcc, 0, [[VAL]]
203191 ; GCN-DAG: v_cndmask_b32_e64 [[RESULT1:v[0-9]+]], 0, 1, vcc
204192 ; GCN-DAG: buffer_store_dword [[RESULT0]]
205193 ; GCN-DAG: buffer_store_byte [[RESULT1]]
206194 ; GCN: s_endpgm
207 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
208 %tid = call i32 @llvm.r600.read.tidig.x()
209 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
210 %val = load i32, i32 addrspace(1)* %in.gep
195 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
196 %val = load i32, i32 addrspace(1)* %valptr
211197 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
212198 %cmp = icmp eq i32 %val, 0
213199 %sel = select i1 %cmp, i32 -1, i32 %ctlz
218204
219205 ; Selected on wrong constant
220206 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_0:
221 ; GCN: {{buffer|flat}}_load_dword
222 ; GCN: v_ffbh_u32_e32
223 ; GCN: v_cmp
224 ; GCN: v_cndmask
225 ; GCN: buffer_store_dword
226 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
227 %tid = call i32 @llvm.r600.read.tidig.x()
228 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
229 %val = load i32, i32 addrspace(1)* %in.gep
207 ; GCN: buffer_load_dword
208 ; GCN: v_ffbh_u32_e32
209 ; GCN: v_cmp
210 ; GCN: v_cndmask
211 ; GCN: buffer_store_dword
212 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
213 %val = load i32, i32 addrspace(1)* %valptr
230214 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
231215 %cmp = icmp eq i32 %val, 0
232216 %sel = select i1 %cmp, i32 0, i32 %ctlz
236220
237221 ; Selected on wrong constant
238222 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_0:
239 ; GCN: {{buffer|flat}}_load_dword
223 ; GCN: buffer_load_dword
240224 ; GCN: v_ffbh_u32_e32
241225 ; GCN: v_cmp
242226 ; GCN: v_cndmask
243227 ; GCN: buffer_store_dword
244228 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
245 %tid = call i32 @llvm.r600.read.tidig.x()
246 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
247 %val = load i32, i32 addrspace(1)* %in.gep
229 %val = load i32, i32 addrspace(1)* %valptr
248230 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
249231 %cmp = icmp ne i32 %val, 0
250232 %sel = select i1 %cmp, i32 %ctlz, i32 0
254236
255237 ; Compare on wrong constant
256238 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
257 ; GCN: {{buffer|flat}}_load_dword
258 ; GCN: v_ffbh_u32_e32
259 ; GCN: v_cmp
260 ; GCN: v_cndmask
261 ; GCN: buffer_store_dword
262 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
263 %tid = call i32 @llvm.r600.read.tidig.x()
264 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
265 %val = load i32, i32 addrspace(1)* %in.gep
239 ; GCN: buffer_load_dword
240 ; GCN: v_ffbh_u32_e32
241 ; GCN: v_cmp
242 ; GCN: v_cndmask
243 ; GCN: buffer_store_dword
244 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
245 %val = load i32, i32 addrspace(1)* %valptr
266246 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
267247 %cmp = icmp eq i32 %val, 1
268248 %sel = select i1 %cmp, i32 0, i32 %ctlz
272252
273253 ; Selected on wrong constant
274254 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
275 ; GCN: {{buffer|flat}}_load_dword
255 ; GCN: buffer_load_dword
276256 ; GCN: v_ffbh_u32_e32
277257 ; GCN: v_cmp
278258 ; GCN: v_cndmask
279259 ; GCN: buffer_store_dword
280260 define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
281 %tid = call i32 @llvm.r600.read.tidig.x()
282 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
283 %val = load i32, i32 addrspace(1)* %in.gep
261 %val = load i32, i32 addrspace(1)* %valptr
284262 %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
285263 %cmp = icmp ne i32 %val, 1
286264 %sel = select i1 %cmp, i32 %ctlz, i32 0
77 declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>) nounwind readnone
88 declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone
99
10 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
11
1210 ; FUNC-LABEL: {{^}}s_ctpop_i32:
1311 ; GCN: s_load_dword [[SVAL:s[0-9]+]],
1412 ; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[SVAL]]
2523
2624 ; XXX - Why 0 in register?
2725 ; FUNC-LABEL: {{^}}v_ctpop_i32:
28 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
26 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
2927 ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 0
3028 ; GCN: buffer_store_dword [[RESULT]],
3129 ; GCN: s_endpgm
3230
3331 ; EG: BCNT_INT
3432 define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
35 %tid = call i32 @llvm.r600.read.tidig.x()
36 %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
37 %val = load i32, i32 addrspace(1)* %in.gep, align 4
33 %val = load i32, i32 addrspace(1)* %in, align 4
3834 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
3935 store i32 %ctpop, i32 addrspace(1)* %out, align 4
4036 ret void
4137 }
4238
4339 ; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
44 ; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]],
45 ; GCN: {{buffer|flat}}_load_dword [[VAL1:v[0-9]+]],
40 ; GCN: buffer_load_dword [[VAL1:v[0-9]+]],
41 ; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
4642 ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
4743 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
4844 ; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
5248 ; EG: BCNT_INT
5349 ; EG: BCNT_INT
5450 define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
55 %tid = call i32 @llvm.r600.read.tidig.x()
56 %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %tid
57 %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %tid
58 %val0 = load i32, i32 addrspace(1)* %in0.gep, align 4
59 %val1 = load i32, i32 addrspace(1)* %in1.gep, align 4
51 %val0 = load i32, i32 addrspace(1)* %in0, align 4
52 %val1 = load i32, i32 addrspace(1)* %in1, align 4
6053 %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
6154 %ctpop1 = call i32 @llvm.ctpop.i32(i32 %val1) nounwind readnone
6255 %add = add i32 %ctpop0, %ctpop1
6558 }
6659
6760 ; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32:
68 ; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]],
61 ; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
6962 ; GCN: s_waitcnt
7063 ; GCN-NEXT: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
7164 ; GCN: buffer_store_dword [[RESULT]],
7265 ; GCN: s_endpgm
73 define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %sval) nounwind {
74 %tid = call i32 @llvm.r600.read.tidig.x()
75 %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
76 %val = load i32, i32 addrspace(1)* %in.gep, align 4
77 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
78 %add = add i32 %ctpop, %sval
66 define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
67 %val0 = load i32, i32 addrspace(1)* %in0, align 4
68 %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
69 %add = add i32 %ctpop0, %sval
7970 store i32 %add, i32 addrspace(1)* %out, align 4
8071 ret void
8172 }
8879 ; EG: BCNT_INT
8980 ; EG: BCNT_INT
9081 define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
91 %tid = call i32 @llvm.r600.read.tidig.x()
92 %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 %tid
93 %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
82 %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
9483 %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone
9584 store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8
9685 ret void
10897 ; EG: BCNT_INT
10998 ; EG: BCNT_INT
11099 define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
111 %tid = call i32 @llvm.r600.read.tidig.x()
112 %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid
113 %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
100 %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
114101 %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone
115102 store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16
116103 ret void
136123 ; EG: BCNT_INT
137124 ; EG: BCNT_INT
138125 define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
139 %tid = call i32 @llvm.r600.read.tidig.x()
140 %in.gep = getelementptr <8 x i32>, <8 x i32> addrspace(1)* %in, i32 %tid
141 %val = load <8 x i32>, <8 x i32> addrspace(1)* %in.gep, align 32
126 %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32
142127 %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone
143128 store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32
144129 ret void
180165 ; EG: BCNT_INT
181166 ; EG: BCNT_INT
182167 define amdgpu_kernel void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
183 %tid = call i32 @llvm.r600.read.tidig.x()
184 %in.gep = getelementptr <16 x i32>, <16 x i32> addrspace(1)* %in, i32 %tid
185 %val = load <16 x i32>, <16 x i32> addrspace(1)* %in.gep, align 32
168 %val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32
186169 %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone
187170 store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32
188171 ret void
189172 }
190173
191174 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant:
192 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
175 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
193176 ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4
194177 ; GCN: buffer_store_dword [[RESULT]],
195178 ; GCN: s_endpgm
196179
197180 ; EG: BCNT_INT
198181 define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
199 %tid = call i32 @llvm.r600.read.tidig.x()
200 %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
201 %val = load i32, i32 addrspace(1)* %in.gep, align 4
182 %val = load i32, i32 addrspace(1)* %in, align 4
202183 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
203184 %add = add i32 %ctpop, 4
204185 store i32 %add, i32 addrspace(1)* %out, align 4
206187 }
207188
208189 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv:
209 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
190 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
210191 ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4
211192 ; GCN: buffer_store_dword [[RESULT]],
212193 ; GCN: s_endpgm
213194
214195 ; EG: BCNT_INT
215196 define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
216 %tid = call i32 @llvm.r600.read.tidig.x()
217 %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
218 %val = load i32, i32 addrspace(1)* %in.gep, align 4
197 %val = load i32, i32 addrspace(1)* %in, align 4
219198 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
220199 %add = add i32 4, %ctpop
221200 store i32 %add, i32 addrspace(1)* %out, align 4
223202 }
224203
225204 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal:
226 ; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
205 ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
227206 ; GCN-DAG: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
228207 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
229208 ; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
230209 ; GCN: buffer_store_dword [[RESULT]],
231210 ; GCN: s_endpgm
232211 define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
233 %tid = call i32 @llvm.r600.read.tidig.x()
234 %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
235 %val = load i32, i32 addrspace(1)* %in.gep, align 4
212 %val = load i32, i32 addrspace(1)* %in, align 4
236213 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
237214 %add = add i32 %ctpop, 99999
238215 store i32 %add, i32 addrspace(1)* %out, align 4
240217 }
241218
242219 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_var:
243 ; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
220 ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
244221 ; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
245222 ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
246223 ; GCN: buffer_store_dword [[RESULT]],
248225
249226 ; EG: BCNT_INT
250227 define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
251 %tid = call i32 @llvm.r600.read.tidig.x()
252 %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
253 %val = load i32, i32 addrspace(1)* %in.gep, align 4
228 %val = load i32, i32 addrspace(1)* %in, align 4
254229 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
255230 %add = add i32 %ctpop, %const
256231 store i32 %add, i32 addrspace(1)* %out, align 4
258233 }
259234
260235 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv:
261 ; GCN-DAG: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
236 ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
262237 ; GCN-DAG: s_load_dword [[VAR:s[0-9]+]],
263238 ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
264239 ; GCN: buffer_store_dword [[RESULT]],
266241
267242 ; EG: BCNT_INT
268243 define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
269 %tid = call i32 @llvm.r600.read.tidig.x()
270 %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
271 %val = load i32, i32 addrspace(1)* %in.gep, align 4
244 %val = load i32, i32 addrspace(1)* %in, align 4
272245 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
273246 %add = add i32 %const, %ctpop
274247 store i32 %add, i32 addrspace(1)* %out, align 4
276249 }
277250
278251 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_vvar_inv:
279 ; SI: buffer_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
280 ; SI: buffer_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
281 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAR]], [[VAL]]
282 ; VI: flat_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
283 ; VI: flat_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
252 ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], {{0$}}
253 ; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16
254 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
284255 ; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
285256 ; GCN: buffer_store_dword [[RESULT]],
286257 ; GCN: s_endpgm
287258
288259 ; EG: BCNT_INT
289260 define amdgpu_kernel void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
290 %tid = call i32 @llvm.r600.read.tidig.x()
291 %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
292 %val = load i32, i32 addrspace(1)* %in.gep, align 4
293 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
294 %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 %tid
261 %val = load i32, i32 addrspace(1)* %in, align 4
262 %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
263 %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4
295264 %const = load i32, i32 addrspace(1)* %gep, align 4
296265 %add = add i32 %const, %ctpop
297266 store i32 %add, i32 addrspace(1)* %out, align 4
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
11 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
2
3 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
42
53 declare i64 @llvm.ctpop.i64(i64) nounwind readnone
64 declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
2624 }
2725
2826 ; FUNC-LABEL: {{^}}v_ctpop_i64:
29 ; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
27 ; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
3028 ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
3129 ; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
3230 ; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
3331 ; GCN: buffer_store_dword [[RESULT]],
3432 ; GCN: s_endpgm
3533 define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
36 %tid = call i32 @llvm.r600.read.tidig.x()
37 %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
38 %val = load i64, i64 addrspace(1)* %in.gep, align 8
34 %val = load i64, i64 addrspace(1)* %in, align 8
3935 %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
4036 %truncctpop = trunc i64 %ctpop to i32
4137 store i32 %truncctpop, i32 addrspace(1)* %out, align 4
4339 }
4440
4541 ; FUNC-LABEL: {{^}}v_ctpop_i64_user:
46 ; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
42 ; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}},
4743 ; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0
4844 ; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
4945 ; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
5248 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
5349 ; GCN: s_endpgm
5450 define amdgpu_kernel void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind {
55 %tid = call i32 @llvm.r600.read.tidig.x()
56 %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
57 %val = load i64, i64 addrspace(1)* %in.gep, align 8
51 %val = load i64, i64 addrspace(1)* %in, align 8
5852 %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
5953 %or = or i64 %ctpop, %s.val
6054 store i64 %or, i64 addrspace(1)* %out
9286 ; GCN: v_bcnt_u32_b32
9387 ; GCN: s_endpgm
9488 define amdgpu_kernel void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
95 %tid = call i32 @llvm.r600.read.tidig.x()
96 %in.gep = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i32 %tid
97 %val = load <2 x i64>, <2 x i64> addrspace(1)* %in.gep, align 16
89 %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
9890 %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
9991 %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
10092 store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
112104 ; GCN: v_bcnt_u32_b32
113105 ; GCN: s_endpgm
114106 define amdgpu_kernel void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
115 %tid = call i32 @llvm.r600.read.tidig.x()
116 %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
117 %val = load <4 x i64>, <4 x i64> addrspace(1)* %in.gep, align 32
107 %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
118108 %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
119109 %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
120110 store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
178168 ; FIXME: Should not have extra add
179169
180170 ; FUNC-LABEL: {{^}}v_ctpop_i128:
181 ; SI: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
182 ; VI: flat_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}
171 ; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
183172
184173 ; GCN-DAG: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT0:v[0-9]+]], v{{[0-9]+}}, 0
185174 ; GCN-DAG: v_bcnt_u32_b32{{(_e32)*(_e64)*}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]]
192181 ; GCN: buffer_store_dword [[RESULT]],
193182 ; GCN: s_endpgm
194183 define amdgpu_kernel void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind {
195 %tid = call i32 @llvm.r600.read.tidig.x()
196 %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %tid
197 %val = load i128, i128 addrspace(1)* %in.gep, align 8
184 %val = load i128, i128 addrspace(1)* %in, align 8
198185 %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
199186 %truncctpop = trunc i128 %ctpop to i32
200187 store i32 %truncctpop, i32 addrspace(1)* %out, align 4
44 declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
55 declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
66 declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
7 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
87
98 ; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32:
109 ; SI: s_load_dword [[VAL:s[0-9]+]],
2120 }
2221
2322 ; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32:
24 ; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
23 ; SI: buffer_load_dword [[VAL:v[0-9]+]],
2524 ; SI: v_ffbl_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
2625 ; SI: buffer_store_dword [[RESULT]],
2726 ; SI: s_endpgm
2827 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
2928 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
3029 define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
31 %tid = call i32 @llvm.r600.read.tidig.x()
32 %in.gep = getelementptr i32, i32 addrspace(1)* %valptr, i32 %tid
33 %val = load i32, i32 addrspace(1)* %in.gep, align 4
30 %val = load i32, i32 addrspace(1)* %valptr, align 4
3431 %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
3532 store i32 %cttz, i32 addrspace(1)* %out, align 4
3633 ret void
3734 }
3835
3936 ; FUNC-LABEL: {{^}}v_cttz_zero_undef_v2i32:
40 ; SI: {{buffer|flat}}_load_dwordx2
37 ; SI: buffer_load_dwordx2
4138 ; SI: v_ffbl_b32_e32
4239 ; SI: v_ffbl_b32_e32
4340 ; SI: buffer_store_dwordx2
4643 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
4744 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
4845 define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
49 %tid = call i32 @llvm.r600.read.tidig.x()
50 %in.gep = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %tid
51 %val = load <2 x i32>, <2 x i32> addrspace(1)* %in.gep, align 8
46 %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
5247 %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
5348 store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
5449 ret void
5550 }
5651
5752 ; FUNC-LABEL: {{^}}v_cttz_zero_undef_v4i32:
58 ; SI: {{buffer|flat}}_load_dwordx4
53 ; SI: buffer_load_dwordx4
5954 ; SI: v_ffbl_b32_e32
6055 ; SI: v_ffbl_b32_e32
6156 ; SI: v_ffbl_b32_e32
6863 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
6964 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
7065 define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
71 %tid = call i32 @llvm.r600.read.tidig.x()
72 %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %tid
73 %val = load <4 x i32>, <4 x i32> addrspace(1)* %in.gep, align 16
66 %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
7467 %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
7568 store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16
7669 ret void
44 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
55
66 ; GCN-LABEL: {{^}}load_i8_to_f32:
7 ; GCN: {{buffer|flat}}_load_ubyte [[LOADREG:v[0-9]+]],
7 ; GCN: buffer_load_ubyte [[LOADREG:v[0-9]+]],
88 ; GCN-NOT: bfe
99 ; GCN-NOT: lshr
1010 ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
1111 ; GCN: buffer_store_dword [[CONV]],
1212 define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
13 %tid = call i32 @llvm.amdgcn.workitem.id.x()
14 %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
15 %load = load i8, i8 addrspace(1)* %gep, align 1
13 %load = load i8, i8 addrspace(1)* %in, align 1
1614 %cvt = uitofp i8 %load to float
1715 store float %cvt, float addrspace(1)* %out, align 4
1816 ret void
1917 }
2018
2119 ; GCN-LABEL: {{^}}load_v2i8_to_v2f32:
22 ; GCN: {{buffer|flat}}_load_ushort [[LD:v[0-9]+]]
20 ; GCN: buffer_load_ushort [[LD:v[0-9]+]]
2321 ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]]
2422 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]]
2523 ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
2624 define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
27 %tid = call i32 @llvm.amdgcn.workitem.id.x()
28 %gep = getelementptr <2 x i8>, <2 x i8> addrspace(1)* %in, i32 %tid
29 %load = load <2 x i8>, <2 x i8> addrspace(1)* %gep, align 2
25 %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2
3026 %cvt = uitofp <2 x i8> %load to <2 x float>
3127 store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
3228 ret void
3329 }
3430
3531 ; GCN-LABEL: {{^}}load_v3i8_to_v3f32:
36 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
32 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
3733 ; GCN-NOT: v_cvt_f32_ubyte3_e32
3834 ; GCN-DAG: v_cvt_f32_ubyte2_e32 v{{[0-9]+}}, [[VAL]]
3935 ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
4036 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
4137 ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
4238 define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
43 %tid = call i32 @llvm.amdgcn.workitem.id.x()
44 %gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid
45 %load = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
39 %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
4640 %cvt = uitofp <3 x i8> %load to <3 x float>
4741 store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
4842 ret void
4943 }
5044
5145 ; GCN-LABEL: {{^}}load_v4i8_to_v4f32:
52 ; GCN: {{buffer|flat}}_load_dword [[LOADREG:v[0-9]+]]
46 ; GCN: buffer_load_dword [[LOADREG:v[0-9]+]]
5347 ; GCN-NOT: bfe
5448 ; GCN-NOT: lshr
5549 ; GCN-DAG: v_cvt_f32_ubyte3_e32 v[[HIRESULT:[0-9]+]], [[LOADREG]]
5852 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
5953 ; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
6054 define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
61 %tid = call i32 @llvm.amdgcn.workitem.id.x()
62 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
63 %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
55 %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
6456 %cvt = uitofp <4 x i8> %load to <4 x float>
6557 store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
6658 ret void
7163
7264 ; FIXME: Packing bytes
7365 ; GCN-LABEL: {{^}}load_v4i8_to_v4f32_unaligned:
74 ; GCN: {{buffer|flat}}_load_ubyte [[LOADREG3:v[0-9]+]]
75 ; GCN: {{buffer|flat}}_load_ubyte [[LOADREG2:v[0-9]+]]
76 ; GCN: {{buffer|flat}}_load_ubyte [[LOADREG1:v[0-9]+]]
77 ; GCN: {{buffer|flat}}_load_ubyte [[LOADREG0:v[0-9]+]]
66 ; GCN: buffer_load_ubyte [[LOADREG3:v[0-9]+]]
67 ; GCN: buffer_load_ubyte [[LOADREG2:v[0-9]+]]
68 ; GCN: buffer_load_ubyte [[LOADREG1:v[0-9]+]]
69 ; GCN: buffer_load_ubyte [[LOADREG0:v[0-9]+]]
7870 ; GCN-DAG: v_lshlrev_b32
7971 ; GCN-DAG: v_or_b32
8072 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]],
8476
8577 ; GCN: buffer_store_dwordx4
8678 define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
87 %tid = call i32 @llvm.amdgcn.workitem.id.x()
88 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
89 %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
79 %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
9080 %cvt = uitofp <4 x i8> %load to <4 x float>
9181 store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
9282 ret void
133123 ; GCN-LABEL: {{^}}load_v7i8_to_v7f32:
134124 ; GCN: s_endpgm
135125 define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
136 %tid = call i32 @llvm.amdgcn.workitem.id.x()
137 %gep = getelementptr <7 x i8>, <7 x i8> addrspace(1)* %in, i32 %tid
138 %load = load <7 x i8>, <7 x i8> addrspace(1)* %gep, align 1
126 %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1
139127 %cvt = uitofp <7 x i8> %load to <7 x float>
140128 store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
141129 ret void
142130 }
143131
144132 ; GCN-LABEL: {{^}}load_v8i8_to_v8f32:
145 ; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
133 ; GCN: buffer_load_dwordx2 v{{\[}}[[LOLOAD:[0-9]+]]:[[HILOAD:[0-9]+]]{{\]}},
146134 ; GCN-NOT: bfe
147135 ; GCN-NOT: lshr
148136 ; GCN-DAG: v_cvt_f32_ubyte3_e32 v{{[0-9]+}}, v[[LOLOAD]]
158146 ; GCN: buffer_store_dwordx4
159147 ; GCN: buffer_store_dwordx4
160148 define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
161 %tid = call i32 @llvm.amdgcn.workitem.id.x()
162 %gep = getelementptr <8 x i8>, <8 x i8> addrspace(1)* %in, i32 %tid
163 %load = load <8 x i8>, <8 x i8> addrspace(1)* %gep, align 8
149 %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8
164150 %cvt = uitofp <8 x i8> %load to <8 x float>
165151 store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
166152 ret void
167153 }
168154
169155 ; GCN-LABEL: {{^}}i8_zext_inreg_i32_to_f32:
170 ; GCN: {{buffer|flat}}_load_dword [[LOADREG:v[0-9]+]],
156 ; GCN: buffer_load_dword [[LOADREG:v[0-9]+]],
171157 ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]]
172158 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
173159 ; GCN: buffer_store_dword [[CONV]],
174160 define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
175 %tid = call i32 @llvm.amdgcn.workitem.id.x()
176 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
177 %load = load i32, i32 addrspace(1)* %gep, align 4
161 %load = load i32, i32 addrspace(1)* %in, align 4
178162 %add = add i32 %load, 2
179163 %inreg = and i32 %add, 255
180164 %cvt = uitofp i32 %inreg to float
184168
185169 ; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
186170 define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
187 %tid = call i32 @llvm.amdgcn.workitem.id.x()
188 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
189 %load = load i32, i32 addrspace(1)* %gep, align 4
171 %load = load i32, i32 addrspace(1)* %in, align 4
190172 %inreg = and i32 %load, 65280
191173 %shr = lshr i32 %inreg, 8
192174 %cvt = uitofp i32 %shr to float
198180 ; them so it shouldn't really matter.
199181 ; GCN-LABEL: {{^}}i8_zext_i32_to_f32:
200182 define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
201 %tid = call i32 @llvm.amdgcn.workitem.id.x()
202 %gep = getelementptr i8, i8 addrspace(1)* %in, i32 %tid
203 %load = load i8, i8 addrspace(1)* %gep, align 1
183 %load = load i8, i8 addrspace(1)* %in, align 1
204184 %ext = zext i8 %load to i32
205185 %cvt = uitofp i32 %ext to float
206186 store float %cvt, float addrspace(1)* %out, align 4
209189
210190 ; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32:
211191 define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
212 %tid = call i32 @llvm.amdgcn.workitem.id.x()
213 %gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid
214 %load = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 1
192 %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
215193 %ext = zext <4 x i8> %load to <4 x i32>
216194 %cvt = uitofp <4 x i32> %ext to <4 x float>
217195 store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
219197 }
220198
221199 ; GCN-LABEL: {{^}}extract_byte0_to_f32:
222 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
200 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
223201 ; GCN-NOT: [[VAL]]
224202 ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]]
225203 ; GCN: buffer_store_dword [[CONV]]
226204 define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
227 %tid = call i32 @llvm.amdgcn.workitem.id.x()
228 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
229 %val = load i32, i32 addrspace(1)* %gep
205 %val = load i32, i32 addrspace(1)* %in
230206 %and = and i32 %val, 255
231207 %cvt = uitofp i32 %and to float
232208 store float %cvt, float addrspace(1)* %out
234210 }
235211
236212 ; GCN-LABEL: {{^}}extract_byte1_to_f32:
237 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
213 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
238214 ; GCN-NOT: [[VAL]]
239215 ; GCN: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]]
240216 ; GCN: buffer_store_dword [[CONV]]
241217 define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
242 %tid = call i32 @llvm.amdgcn.workitem.id.x()
243 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
244 %val = load i32, i32 addrspace(1)* %gep
218 %val = load i32, i32 addrspace(1)* %in
245219 %srl = lshr i32 %val, 8
246220 %and = and i32 %srl, 255
247221 %cvt = uitofp i32 %and to float
250224 }
251225
252226 ; GCN-LABEL: {{^}}extract_byte2_to_f32:
253 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
227 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
254228 ; GCN-NOT: [[VAL]]
255229 ; GCN: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]]
256230 ; GCN: buffer_store_dword [[CONV]]
257231 define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
258 %tid = call i32 @llvm.amdgcn.workitem.id.x()
259 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
260 %val = load i32, i32 addrspace(1)* %gep
232 %val = load i32, i32 addrspace(1)* %in
261233 %srl = lshr i32 %val, 16
262234 %and = and i32 %srl, 255
263235 %cvt = uitofp i32 %and to float
266238 }
267239
268240 ; GCN-LABEL: {{^}}extract_byte3_to_f32:
269 ; GCN: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]]
241 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
270242 ; GCN-NOT: [[VAL]]
271243 ; GCN: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]]
272244 ; GCN: buffer_store_dword [[CONV]]
273245 define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
274 %tid = call i32 @llvm.amdgcn.workitem.id.x()
275 %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
276 %val = load i32, i32 addrspace(1)* %gep
246 %val = load i32, i32 addrspace(1)* %in
277247 %srl = lshr i32 %val, 24
278248 %and = and i32 %srl, 255
279249 %cvt = uitofp i32 %and to float
None ; RUN: llc -amdgpu-scalarize-global-loads=false -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
0 ; RUN: llc -stress-early-ifcvt -amdgpu-early-ifcvt=1 -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
11
22 ; FIXME: Most of these cases that don't trigger because of broken cost
33 ; heuristics. Should not need -stress-early-ifcvt
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
0 ; RUN: llc -march=amdgcn -mcpu=verde -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
11 ; XUN: llc -march=amdgcn -mcpu=tonga -amdgpu-early-ifcvt=1 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
22
33 ; FIXME: This leaves behind a now unnecessary and with exec
0 ; RUN: llc -march=amdgcn -enable-no-signed-zeros-fp-math=0 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SAFE %s
11 ; RUN: llc -march=amdgcn -enable-no-signed-zeros-fp-math=1 < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-UNSAFE %s
22 ; RUN: llc -march=amdgcn -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-UNSAFE %s
3
4 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
53
64 ; Test that the -enable-no-signed-zeros-fp-math flag works
75
119
1210 ; GCN-UNSAFE-NOT: xor
1311 define amdgpu_kernel void @fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
14 %tid = call i32 @llvm.amdgcn.workitem.id.x()
15 %add = add i32 %tid, 1
16 %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
17 %b_ptr = getelementptr float, float addrspace(1)* %in, i32 %add
18 %a = load float, float addrspace(1)* %gep, align 4
12 %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
13 %a = load float, float addrspace(1)* %in, align 4
1914 %b = load float, float addrspace(1)* %b_ptr, align 4
2015 %result = fsub float %a, %b
2116 %neg.result = fsub float -0.0, %result
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
1
2 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
31
42 ; Make sure the add and load are reduced to 32-bits even with the
53 ; bitcast to vector.
97 ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, [[B]], [[A]]
108 ; GCN: buffer_store_dword [[ADD]]
119 define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
12 %tid = call i32 @llvm.amdgcn.workitem.id.x()
13 %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
14 %a = load i64, i64 addrspace(1)* %gep
10 %a = load i64, i64 addrspace(1)* %in
1511 %add = add i64 %a, %b
1612 %val.bc = bitcast i64 %add to <2 x i32>
1713 %extract = extractelement <2 x i32> %val.bc, i32 0
2420 ; GCN: v_add_f64
2521 ; GCN: buffer_store_dword v
2622 define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) {
27 %tid = call i32 @llvm.amdgcn.workitem.id.x()
28 %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
29 %a = load double, double addrspace(1)* %gep
23 %a = load double, double addrspace(1)* %in
3024 %add = fadd double %a, %b
3125 %val.bc = bitcast double %add to <2 x i32>
3226 %extract = extractelement <2 x i32> %val.bc, i32 0
3933 ; GCN: v_add_i32
4034 ; GCN: buffer_store_dword
4135 define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
42 %tid = call i32 @llvm.amdgcn.workitem.id.x()
43 %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
44 %a = load i64, i64 addrspace(1)* %gep
36 %a = load i64, i64 addrspace(1)* %in
4537 %add = add i64 %a, %b
4638 %val.bc = bitcast i64 %add to <2 x float>
4739 %extract = extractelement <2 x float> %val.bc, i32 0
133133 ; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]]
134134 ; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], v{{[0-9]+$}}
135135 define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
136 %tid = call i32 @llvm.amdgcn.workitem.id.x()
137 %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
138 %val = load <2 x half>, <2 x half> addrspace(1)* %gep
136 %val = load <2 x half>, <2 x half> addrspace(1)* %in
139137 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
140138 %fmul = fmul <2 x half> %fabs, %val
141139 store <2 x half> %fmul, <2 x half> addrspace(1)* %out
11 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 ; GCN-LABEL: {{^}}fadd_f16
4 ; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
5 ; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]]
4 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
5 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
66 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
77 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
88 ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
2323 }
2424
2525 ; GCN-LABEL: {{^}}fadd_f16_imm_a
26 ; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]]
26 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
2727 ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
2828 ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], 1.0, v[[B_F32]]
2929 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
4141 }
4242
4343 ; GCN-LABEL: {{^}}fadd_f16_imm_b
44 ; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
44 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
4545 ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
4646 ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], 2.0, v[[A_F32]]
4747 ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
5959 }
6060
6161 ; GCN-LABEL: {{^}}fadd_v2f16:
62 ; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
63 ; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
62 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
63 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
6464
6565 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
6666 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
6969
7070 ; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
7171 ; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
72 ; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
73 ; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
72 ; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
73 ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
7474 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
7575 ; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
7676 ; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
8787 <2 x half> addrspace(1)* %a,
8888 <2 x half> addrspace(1)* %b) {
8989 entry:
90 %tid = call i32 @llvm.amdgcn.workitem.id.x()
91 %gep.a = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %a, i32 %tid
92 %gep.b = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %b, i32 %tid
93 %a.val = load <2 x half>, <2 x half> addrspace(1)* %gep.a
94 %b.val = load <2 x half>, <2 x half> addrspace(1)* %gep.b
90 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
91 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
9592 %r.val = fadd <2 x half> %a.val, %b.val
9693 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
9794 ret void
9895 }
9996
10097 ; GCN-LABEL: {{^}}fadd_v2f16_imm_a:
101 ; GCN-DAG: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
98 ; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
10299 ; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
103100 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
104101 ; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
120117 <2 x half> addrspace(1)* %r,
121118 <2 x half> addrspace(1)* %b) {
122119 entry:
123 %tid = call i32 @llvm.amdgcn.workitem.id.x()
124 %gep.b = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %b, i32 %tid
125 %b.val = load <2 x half>, <2 x half> addrspace(1)* %gep.b
120 %b.val = load <2 x half>, <2 x half> addrspace(1)* %b
126121 %r.val = fadd <2 x half> , %b.val
127122 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
128123 ret void
129124 }
130125
131126 ; GCN-LABEL: {{^}}fadd_v2f16_imm_b:
132 ; GCN-DAG: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
127 ; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
133128 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
134129 ; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
135130 ; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
151146 <2 x half> addrspace(1)* %r,
152147 <2 x half> addrspace(1)* %a) {
153148 entry:
154 %tid = call i32 @llvm.amdgcn.workitem.id.x()
155 %gep.a = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %a, i32 %tid
156 %a.val = load <2 x half>, <2 x half> addrspace(1)* %gep.a
149 %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
157150 %r.val = fadd <2 x half> %a.val,
158151 store <2 x half> %r.val, <2 x half> addrspace(1)* %r
159152 ret void
160153 }
161
162 declare i32 @llvm.amdgcn.workitem.id.x() #1
163
164 attributes #0 = { nounwind }
165 attributes #1 = { nounwind readnone }
44 ; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}
55 define amdgpu_kernel void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
66 double addrspace(1)* %in2) {
7 %tid = call i32 @llvm.amdgcn.workitem.id.x()
8 %gep1 = getelementptr inbounds double, double addrspace(1)* %in1, i32 %tid
9 %gep2 = getelementptr inbounds double, double addrspace(1)* %in2, i32 %tid
10 %r0 = load double, double addrspace(1)* %gep1
11 %r1 = load double, double addrspace(1)* %gep2
7 %r0 = load double, double addrspace(1)* %in1
8 %r1 = load double, double addrspace(1)* %in2
129 %r2 = fadd double %r0, %r1
1310 store double %r2, double addrspace(1)* %out
1411 ret void
4441 store <2 x double> %r2, <2 x double> addrspace(1)* %out
4542 ret void
4643 }
47
48 declare i32 @llvm.amdgcn.workitem.id.x() #1
49
50 attributes #0 = { nounwind }
51 attributes #1 = { nounwind readnone }
44 declare half @llvm.canonicalize.f16(half) #0
55 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #0
66 declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
7 declare i32 @llvm.amdgcn.workitem.id.x() #0
8
97
108 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f16:
119 ; GCN: v_mul_f16_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
214212 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}}
215213 ; GFX9: buffer_store_dword [[REG]]
216214 define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
217 %tid = call i32 @llvm.amdgcn.workitem.id.x()
218 %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
219 %val = load <2 x half>, <2 x half> addrspace(1)* %gep
215 %val = load <2 x half>, <2 x half> addrspace(1)* %out
220216 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
221217 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
222218 ret void
236232 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]]{{$}}
237233 ; GCN: buffer_store_dword
238234 define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
239 %tid = call i32 @llvm.amdgcn.workitem.id.x()
240 %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
241 %val = load <2 x half>, <2 x half> addrspace(1)* %gep
235 %val = load <2 x half>, <2 x half> addrspace(1)* %out
242236 %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
243237 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs)
244238 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
256250 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]] neg_lo:[0,1] neg_hi:[0,1]{{$}}
257251 ; GCN: buffer_store_dword
258252 define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
259 %tid = call i32 @llvm.amdgcn.workitem.id.x()
260 %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
261 %val = load <2 x half>, <2 x half> addrspace(1)* %gep
253 %val = load <2 x half>, <2 x half> addrspace(1)* %out
262254 %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
263255 %val.fabs.fneg = fsub <2 x half> , %val.fabs
264256 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs.fneg)
277269 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}}
278270 ; GFX9: buffer_store_dword [[REG]]
279271 define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
280 %tid = call i32 @llvm.amdgcn.workitem.id.x()
281 %gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
282 %val = load <2 x half>, <2 x half> addrspace(1)* %gep
272 %val = load <2 x half>, <2 x half> addrspace(1)* %out
283273 %fneg.val = fsub <2 x half> , %val
284274 %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %fneg.val)
285275 store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
11
22 declare float @llvm.fabs.f32(float) #0
33 declare float @llvm.canonicalize.f32(float) #0
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 ; GCN-LABEL: {{^}}fcmp_f16_lt
44 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
11 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s
22
33 ; CHECK-LABEL: {{^}}flt_f64:
4 ; CHECK: v_cmp_nge_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
4 ; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
55 define amdgpu_kernel void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
66 double addrspace(1)* %in2) {
77 %r0 = load double, double addrspace(1)* %in1
1313 }
1414
1515 ; CHECK-LABEL: {{^}}fle_f64:
16 ; CHECK: v_cmp_ngt_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
16 ; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
1717 define amdgpu_kernel void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
1818 double addrspace(1)* %in2) {
1919 %r0 = load double, double addrspace(1)* %in1
2525 }
2626
2727 ; CHECK-LABEL: {{^}}fgt_f64:
28 ; CHECK: v_cmp_nle_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
28 ; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
2929 define amdgpu_kernel void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
3030 double addrspace(1)* %in2) {
3131 %r0 = load double, double addrspace(1)* %in1
3737 }
3838
3939 ; CHECK-LABEL: {{^}}fge_f64:
40 ; CHECK: v_cmp_nlt_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
40 ; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
4141 define amdgpu_kernel void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
4242 double addrspace(1)* %in2) {
4343 %r0 = load double, double addrspace(1)* %in1
4949 }
5050
5151 ; CHECK-LABEL: {{^}}fne_f64:
52 ; CHECK: v_cmp_neq_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
52 ; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
5353 define amdgpu_kernel void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
5454 double addrspace(1)* %in2) {
5555 %r0 = load double, double addrspace(1)* %in1
6161 }
6262
6363 ; CHECK-LABEL: {{^}}feq_f64:
64 ; CHECK: v_cmp_nlg_f64_e32 vcc, {{s[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
64 ; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
6565 define amdgpu_kernel void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
6666 double addrspace(1)* %in2) {
6767 %r0 = load double, double addrspace(1)* %in1
55 ; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0
66
77 define amdgpu_kernel void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
8 %tid = call i32 @llvm.amdgcn.workitem.id.x()
9 %gep = getelementptr inbounds double, double addrspace(1)* %in, i32 %tid
10 %r1 = load double, double addrspace(1)* %gep
8 %r1 = load double, double addrspace(1)* %in
119 %r2 = fadd double %r1, 5.000000e+00
1210 store double %r2, double addrspace(1)* %out
1311 ret void
1412 }
15
16 declare i32 @llvm.amdgcn.workitem.id.x() #1
17
18 attributes #0 = { nounwind }
19 attributes #1 = { nounwind readnone }
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX8 %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX8 %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
33
44 declare half @llvm.copysign.f16(half, half)
55 declare float @llvm.copysign.f32(float, float)
88 declare <3 x half> @llvm.copysign.v3f16(<3 x half>, <3 x half>)
99 declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>)
1010
11 declare i32 @llvm.amdgcn.workitem.id.x()
12
1311 ; GCN-LABEL: {{^}}test_copysign_f16:
14 ; SI: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
15 ; SI: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
12 ; SI: buffer_load_ushort v[[SIGN:[0-9]+]]
13 ; SI: buffer_load_ushort v[[MAG:[0-9]+]]
1614 ; SI: s_brev_b32 s[[CONST:[0-9]+]], -2
1715 ; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
1816 ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
1917 ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_F32]]
2018 ; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]]
21 ; GFX89: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
22 ; GFX89: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
19 ; GFX89: buffer_load_ushort v[[SIGN:[0-9]+]]
20 ; GFX89: buffer_load_ushort v[[MAG:[0-9]+]]
2321 ; GFX89: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
2422 ; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN]]
2523 ; GCN: buffer_store_short v[[OUT]]
3735 }
3836
3937 ; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f16_sign_f32:
40 ; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
41 ; GCN-DAG: {{buffer|flat}}_load_dword v[[SIGN:[0-9]+]]
38 ; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
39 ; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]]
4240 ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
4341 ; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]]
4442 ; GCN: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_EXT]], v[[SIGN]]
4947 half addrspace(1)* %arg_mag,
5048 float addrspace(1)* %arg_sign) {
5149 entry:
52 %tid = call i32 @llvm.amdgcn.workitem.id.x()
53 %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
54 %mag = load half, half addrspace(1)* %arg_mag_gep
50 %mag = load half, half addrspace(1)* %arg_mag
5551 %mag.ext = fpext half %mag to float
56 %arg_sign_gep = getelementptr float, float addrspace(1)* %arg_sign, i32 %tid
57 %sign = load float, float addrspace(1)* %arg_sign_gep
52 %sign = load float, float addrspace(1)* %arg_sign
5853 %out = call float @llvm.copysign.f32(float %mag.ext, float %sign)
5954 store float %out, float addrspace(1)* %arg_out
6055 ret void
6156 }
6257
6358 ; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f16_sign_f64:
64 ; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
65 ; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
59 ; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
60 ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
6661 ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
6762 ; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]]
6863 ; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[MAG_EXT_LO:[0-9]+]]:[[MAG_EXT_HI:[0-9]+]]{{\]}}, v[[MAG_EXT]]
7469 half addrspace(1)* %arg_mag,
7570 double addrspace(1)* %arg_sign) {
7671 entry:
77 %tid = call i32 @llvm.amdgcn.workitem.id.x()
78 %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
79 %mag = load half, half addrspace(1)* %arg_mag_gep
72 %mag = load half, half addrspace(1)* %arg_mag
8073 %mag.ext = fpext half %mag to double
81 %arg_sign_gep = getelementptr double, double addrspace(1)* %arg_sign, i32 %tid
82 %sign = load double, double addrspace(1)* %arg_sign_gep
74 %sign = load double, double addrspace(1)* %arg_sign
8375 %out = call double @llvm.copysign.f64(double %mag.ext, double %sign)
8476 store double %out, double addrspace(1)* %arg_out
8577 ret void
8678 }
8779
8880 ; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f32_sign_f16:
89 ; GCN-DAG: {{buffer|flat}}_load_dword v[[MAG:[0-9]+]]
90 ; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
81 ; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]]
82 ; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
9183 ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
9284 ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
9385 ; SI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_F32]]
10092 float addrspace(1)* %arg_mag,
10193 half addrspace(1)* %arg_sign) {
10294 entry:
103 %tid = call i32 @llvm.amdgcn.workitem.id.x()
104 %arg_mag_gep = getelementptr float, float addrspace(1)* %arg_mag, i32 %tid
105 %mag = load float, float addrspace(1)* %arg_mag_gep
106 %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
107 %sign = load half, half addrspace(1)* %arg_sign_gep
95 %mag = load float, float addrspace(1)* %arg_mag
96 %sign = load half, half addrspace(1)* %arg_sign
10897 %sign.ext = fpext half %sign to float
10998 %out = call float @llvm.copysign.f32(float %mag, float %sign.ext)
11099 store float %out, float addrspace(1)* %arg_out
112101 }
113102
114103 ; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f64_sign_f16:
115 ; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]{{\]}}
116 ; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
104 ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]{{\]}}
105 ; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
117106 ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
118107 ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
119108 ; SI: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_HI]], v[[SIGN_F32]]
126115 double addrspace(1)* %arg_mag,
127116 half addrspace(1)* %arg_sign) {
128117 entry:
129 %tid = call i32 @llvm.amdgcn.workitem.id.x()
130 %arg_mag_gep = getelementptr double, double addrspace(1)* %arg_mag, i32 %tid
131 %mag = load double, double addrspace(1)* %arg_mag_gep
132 %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
133 %sign = load half, half addrspace(1)* %arg_sign_gep
118 %mag = load double, double addrspace(1)* %arg_mag
119 %sign = load half, half addrspace(1)* %arg_sign
134120 %sign.ext = fpext half %sign to double
135121 %out = call double @llvm.copysign.f64(double %mag, double %sign.ext)
136122 store double %out, double addrspace(1)* %arg_out
138124 }
139125
140126 ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f32:
141 ; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
142 ; GCN-DAG: {{buffer|flat}}_load_dword v[[SIGN:[0-9]+]]
127 ; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
128 ; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]]
143129 ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
144130 ; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
145131 ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN]]
154140 half addrspace(1)* %arg_mag,
155141 float addrspace(1)* %arg_sign) {
156142 entry:
157 %tid = call i32 @llvm.amdgcn.workitem.id.x()
158 %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
159 %mag = load half, half addrspace(1)* %arg_mag_gep
160 %arg_sign_gep = getelementptr float, float addrspace(1)* %arg_sign, i32 %tid
161 %sign = load float, float addrspace(1)* %arg_sign_gep
143 %mag = load half, half addrspace(1)* %arg_mag
144 %sign = load float, float addrspace(1)* %arg_sign
162145 %sign.trunc = fptrunc float %sign to half
163146 %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
164147 store half %out, half addrspace(1)* %arg_out
166149 }
167150
168151 ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f64:
169 ; GCN-DAG: {{buffer|flat}}_load_ushort v[[MAG:[0-9]+]]
170 ; GCN-DAG: {{buffer|flat}}_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
152 ; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
153 ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
171154 ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
172155 ; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
173156 ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_HI]]
182165 half addrspace(1)* %arg_mag,
183166 double addrspace(1)* %arg_sign) {
184167 entry:
185 %tid = call i32 @llvm.amdgcn.workitem.id.x()
186 %arg_mag_gep = getelementptr half, half addrspace(1)* %arg_mag, i32 %tid
187 %mag = load half, half addrspace(1)* %arg_mag
188 %arg_sign_gep = getelementptr double, double addrspace(1)* %arg_sign, i32 %tid
189 %sign = load double, double addrspace(1)* %arg_sign_gep
168 %mag = load half, half addrspace(1)* %arg_mag
169 %sign = load double, double addrspace(1)* %arg_sign
190170 %sign.trunc = fptrunc double %sign to half
191171 %out = call half @llvm.copysign.f16(half %mag, half %sign.trunc)
192172 store half %out, half addrspace(1)* %arg_out
194174 }
195175
196176 ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f32_sign_f16:
197 ; GCN-DAG: {{buffer|flat}}_load_dword v[[MAG:[0-9]+]]
198 ; GCN-DAG: {{buffer|flat}}_load_ushort v[[SIGN:[0-9]+]]
177 ; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]]
178 ; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
199179 ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
200180 ; SI-DAG: v_cvt_f16_f32_e32 v[[MAG_TRUNC:[0-9]+]], v[[MAG]]
201181 ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
212192 float addrspace(1)* %arg_mag,
213193 half addrspace(1)* %arg_sign) {
214194 entry:
215 %tid = call i32 @llvm.amdgcn.workitem.id.x()
216 %arg_mag_gep = getelementptr float, float addrspace(1)* %arg_mag, i32 %tid
217 %mag = load float, float addrspace(1)* %arg_mag_gep
195 %mag = load float, float addrspace(1)* %arg_mag
218196 %mag.trunc = fptrunc float %mag to half
219 %arg_sign_gep = getelementptr half, half addrspace(1)* %arg_sign, i32 %tid
220 %sign = load half, half addrspace(1)* %arg_sign_gep
197 %sign = load half, half addrspace(1)* %arg_sign
221198 %out = call half @llvm.copysign.f16(half %mag.trunc, half %sign)
222199 store half %out, half addrspace(1)* %arg_out
223200 ret void
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
33
44 ; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
55 ; beneficial even without fp32 denormals, but they do require no-infs-fp-math
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
22
33 declare double @llvm.fma.f64(double, double, double) nounwind readnone
44 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
22
33 declare float @llvm.fma.f32(float, float, float) nounwind readnone
44 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) nounwind readnone
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 ; GCN-LABEL: {{^}}fmul_f16
44 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
0 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
22
33 ; FUNC-LABEL: {{^}}fmul_f64:
44 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
None ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s
4
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s
6 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
7 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s
8 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s
0 ; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s
1 ; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s
2 ; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s
3 ; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s
4
5 ; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s
6 ; RUN: llc -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s
7 ; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-FLUSH-CONTRACT,GCN-FLUSH,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s
8 ; RUN: llc -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s
99
1010 ; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow.
1111
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI %s
0 ; RUN: llc -march=amdgcn -mcpu=tahiti -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
1 ; RUN: llc -march=amdgcn -mcpu=verde -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
2 ; RUN: llc -march=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
3 ; RUN: llc -march=amdgcn -mcpu=verde -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
4 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI %s
5 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI %s
66
77 ; GCN-LABEL: {{^}}fmuladd_f64:
88 ; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
0 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
1 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
44
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
6 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
7 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
8 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
5 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
6 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
7 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
8 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
99
1010 declare i32 @llvm.amdgcn.workitem.id.x() #1
1111 declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
33
44 ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
55 ; SI-NOT: and
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s
0 ; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s
33
44 ; FIXME: Should be able to do scalar op
55 ; GCN-LABEL: {{^}}s_fneg_f16:
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
2 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
33
44 declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
55
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
33
44 ; GCN-LABEL: {{^}}fpext_f16_to_f32
55 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 ; GCN-LABEL: {{^}}fptosi_f16_to_i16
44 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 ; GCN-LABEL: {{^}}fptoui_f16_to_i16
44 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
33
44 ; GCN-LABEL: {{^}}fptrunc_f32_to_f16:
55 ; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
33
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=VI-UNSAFE -check-prefix=FUNC %s
4 ; RUN: llc -march=amdgcn -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
5 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=VI-UNSAFE -check-prefix=FUNC %s
66
77 declare double @llvm.fabs.f64(double) #0
88 declare double @llvm.floor.f64(double) #0
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=CI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=CI %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s
3 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
4 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
55
66 declare float @llvm.fabs.f32(float) #0
77 declare float @llvm.floor.f32(float) #0
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
33
44 ; FUNC-LABEL: {{^}}frem_f32:
55 ; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}}
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
22
33 ; FUNC-LABEL: {{^}}v_safe_fsqrt_f64:
44 ; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
2 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
33
44
55 ; Run with unsafe-fp-math to make sure nothing tries to turn this into 1 / rsqrt(x)
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI -check-prefix=SIVI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI -check-prefix=SIVI %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
33
44 ; GCN-LABEL: {{^}}fsub_f16:
55 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
33
44 ; FUNC-LABEL: {{^}}v_fsub_f32:
55 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
0 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
22
33 declare double @llvm.fabs.f64(double) #0
44
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
33
44 declare double @llvm.trunc.f64(double) nounwind readnone
55 declare <2 x double> @llvm.trunc.v2f64(<2 x double>) nounwind readnone
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
22 ; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
33 ; FIXME: cypress is broken because the bigger testcases spill and it's not implemented
44
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 ; half args should be promoted to float for SI and lower.
44
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 ; Use a 64-bit value with lo bits that can be represented as an inline constant
44 ; GCN-LABEL: {{^}}i64_imm_inline_lo:
None ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
0 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
33 ; FIXME: Merge into imm.ll
44
55 ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16:
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
44
55 ; Tests for indirect addressing on SI, which is implemented using dynamic
66 ; indexing of vectors.
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -amdgpu-load-store-vectorizer=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
0 ; RUN: llc -march=amdgcn -amdgpu-load-store-vectorizer=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
11
22 ; GatherAllAliases gives up on trying to analyze cases where the
33 ; pointer may have been loaded from an aliased store, so make sure
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
11
22 declare half @llvm.fabs.f16(half %a)
33 declare i1 @llvm.amdgcn.class.f16(half %a, i32 %b)
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
11
22 declare i1 @llvm.amdgcn.class.f32(float, i32) #1
33 declare i1 @llvm.amdgcn.class.f64(double, i32) #1
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s
11 ; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=VI %s
22
33 ; FIXME: Enable for VI.
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
11
22 declare half @llvm.amdgcn.ldexp.f16(half %a, i32 %b)
33
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 ; GCN-LABEL: {{^}}bfe_i32_arg_arg_arg:
44 ; GCN: v_bfe_i32
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
22
33 declare i32 @llvm.amdgcn.sffbh.i32(i32) #1
44
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
22
33 declare double @llvm.amdgcn.trig.preop.f64(double, i32) nounwind readnone
44
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 ; GCN-LABEL: {{^}}bfe_u32_arg_arg_arg:
44 ; GCN: v_bfe_u32
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 declare half @llvm.ceil.f16(half %a)
44 declare <2 x half> @llvm.ceil.v2f16(<2 x half> %a)
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 declare half @llvm.cos.f16(half %a)
44 declare <2 x half> @llvm.cos.v2f16(<2 x half> %a)
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 declare half @llvm.exp2.f16(half %a)
44 declare <2 x half> @llvm.exp2.v2f16(<2 x half> %a)
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 declare half @llvm.floor.f16(half %a)
44 declare <2 x half> @llvm.floor.v2f16(<2 x half> %a)
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 declare half @llvm.fma.f16(half %a, half %b, half %c)
44 declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
0 ; RUN: llc -march=amdgcn -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
2 ; RUN: llc -march=amdgcn -mattr=+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s
3 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
44
55 declare half @llvm.fmuladd.f16(half %a, half %b, half %c)
66 declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 declare half @llvm.log2.f16(half %a)
44 declare <2 x half> @llvm.log2.v2f16(<2 x half> %a)
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 declare half @llvm.maxnum.f16(half %a, half %b)
44 declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 declare half @llvm.minnum.f16(half %a, half %b)
44 declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=GFX89 %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=GFX89 %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
33
44 declare half @llvm.rint.f16(half %a)
55 declare <2 x half> @llvm.rint.v2f16(<2 x half> %a)
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 declare half @llvm.sin.f16(half %a)
44 declare <2 x half> @llvm.sin.v2f16(<2 x half> %a)
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 declare half @llvm.sqrt.f16(half %a)
44 declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 declare half @llvm.trunc.f16(half %a)
44 declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a)
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
1 ; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
33
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
4 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
5 ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
66
77 ; FUNC-LABEL: {{^}}global_load_f32:
88 ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
1 ; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
33
44 ; FUNC-LABEL: {{^}}global_load_f64:
55 ; GCN-NOHSA: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
3 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s
4 ; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s
55
66 ; FIXME: r600 is broken because the bigger testcases spill and it's not implemented
77
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
3 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
44
55
66 ; FUNC-LABEL: {{^}}global_load_i32:
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
33
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
5 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
4 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
5 ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
66
77 ; FUNC-LABEL: {{^}}global_load_i64:
88 ; GCN-NOHSA: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s
1 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s
3 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
4 ; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
55
66
77 ; FUNC-LABEL: {{^}}global_load_i8:
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=EG -check-prefix=FUNC %s
4 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
3 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=EG -check-prefix=FUNC %s
4 ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s
55
66 ; FUNC-LABEL: {{^}}load_i24:
77 ; SI: {{flat|buffer}}_load_ubyte
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
22
33
44 ; FUNC-LABEL: {{^}}v_test_imax_sge_i32:
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
22
33 ; This test is mostly to test DAG store merging, so disable the vectorizer.
44 ; Run with devices with different unaligned load restrictions.
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
0 ; RUN: llc -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck %s
11
22 declare i32 @llvm.amdgcn.workitem.id.x() readnone
33
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC
0 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC
33
44 ; mul24 and mad24 are affected
55
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
11
22 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
33
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
33
44
55 ; FUNC-LABEL: {{^}}or_v2i32:
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
0 ; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
11
22 declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) #0
33 declare void @llvm.invariant.end.p0i8({}*, i64, i8* nocapture) #0
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
33
44 ; GCN-LABEL: {{^}}reduce_i64_load_align_4_width_to_i32:
55 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s
0 ; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s
22
33 ; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store:
44 ; SI: buffer_load_dwordx4
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
22
33 ; BOTH-LABEL: {{^}}s_rotl_i64:
44 ; BOTH-DAG: s_lshl_b64
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
22
33 ; BOTH-LABEL: {{^}}s_rotr_i64:
44 ; BOTH-DAG: s_sub_i32
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s
0 ; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s
22
33 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
44 declare float @llvm.sqrt.f32(float) nounwind readnone
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
22
33 ; SI-LABEL: {{^}}s_movk_i32_k0:
44 ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}}
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress -verify-machineinstrs< %s
0 ; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s
33
44 declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
55 declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s
0 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=CI %s
2 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI --check-prefix=GCN-HSA %s
33
44 declare i32 @llvm.amdgcn.workitem.id.x() #0
55 declare i32 @llvm.amdgcn.workitem.id.y() #0
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 ; XXX - Why the packing?
44 ; GCN-LABEL: {{^}}scalar_to_vector_v2i32:
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI %s
11
22 ; FIXME: This currently doesn't do a great job of clustering the
33 ; loads, which end up with extra moves between them. Right now, it
None ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
0 ; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
1 ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
22
33 ; When a frame index offset is more than 12-bits, make sure we don't store
44 ; it in mubuf's offset field.
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
33
44 ; The code generated by sdiv is long and complex and may frequently change.
55 ; The goal of this test is to make sure the ISel doesn't fail.
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s
0 ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s
33
44 ; GCN-LABEL: {{^}}add_shr_i32:
55 ; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
None ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
0 ; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
2 ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
33
44 ; Test expansion of scalar selects on vectors.
55 ; Evergreen not enabled since it seems to be having problems with doubles.
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
22
33 ; GCN-LABEL: {{^}}select_f16:
44 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
2 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
3 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-packed-inlinable-literals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
3 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
44
55 ; FIXME: i16 promotion pass ruins the scalar cases when legal.
66 ; FIXME: r600 fails verifier
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
22
33 ; Copy VGPR -> SGPR used twice as an instruction operand, which is then
44 ; used in an REG_SEQUENCE that also needs to be handled.
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
1 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
1 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
22
33 ; CHECK-LABEL: {{^}}phi1:
44 ; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
11
22 ; Extract the high bit of the 1st quarter
33 ; GCN-LABEL: {{^}}v_uextract_bit_31_i128:
None ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
0 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s