llvm.org GIT mirror llvm / 559cd2e
Merging r275870: ------------------------------------------------------------------------ r275870 | arsenm | 2016-07-18 11:34:59 -0700 (Mon, 18 Jul 2016) | 1 line AMDGPU/R600: Replace barrier intrinsics ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_39@275896 91177308-0d34-0410-b5e6-96231b3b80d8 Hans Wennborg 4 years ago
13 changed file(s) with 247 addition(s) and 239 deletion(s). Raw diff Collapse all Expand all
4242
4343 def int_r600_read_workdim : AMDGPUReadPreloadRegisterIntrinsic;
4444
45 def int_r600_group_barrier : GCCBuiltin<"__builtin_r600_group_barrier">,
46 Intrinsic<[], [], [IntrConvergent]>;
4547
4648 // AS 7 is PARAM_I_ADDRESS, used for kernel arguments
4749 def int_r600_implicitarg_ptr :
2929 [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]
3030 >;
3131
32 // Deprecated in favor of llvm.amdgcn.s.barrier
33 def int_AMDGPU_barrier_local : Intrinsic<[], [], [IntrConvergent]>;
34 def int_AMDGPU_barrier_global : Intrinsic<[], [], [IntrConvergent]>;
35
3632 // Deprecated in favor of llvm.amdgcn.read.workdim
3733 def int_AMDGPU_read_workdim : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>;
3834 }
393393 def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>;
394394
395395 def GROUP_BARRIER : InstR600 <
396 (outs), (ins), " GROUP_BARRIER", [(int_AMDGPU_barrier_local), (int_AMDGPU_barrier_global)], AnyALU>,
396 (outs), (ins), " GROUP_BARRIER", [(int_r600_group_barrier)], AnyALU>,
397397 R600ALU_Word0,
398398 R600ALU_Word1_OP2 <0x54> {
399399
422422 let ALUInst = 1;
423423 }
424424
425 def : Pat <
426 (int_AMDGPU_barrier_global),
427 (GROUP_BARRIER)
428 >;
429
430425 //===----------------------------------------------------------------------===//
431426 // LDS Instructions
432427 //===----------------------------------------------------------------------===//
24522452 (S_WAITCNT (as_i16imm $simm16))
24532453 >;
24542454
2455 // FIXME: These should be removed eventually
2456 def : Pat <
2457 (int_AMDGPU_barrier_global),
2458 (S_BARRIER)
2459 >;
2460
2461 def : Pat <
2462 (int_AMDGPU_barrier_local),
2463 (S_BARRIER)
2464 >;
2465
24662455 //===----------------------------------------------------------------------===//
24672456 // VOP1 Patterns
24682457 //===----------------------------------------------------------------------===//
None ; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s
0 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck %s
11 ;
22 ; This test checks that the lds input queue is empty at the end of
33 ; the ALU clause.
1313 entry:
1414 %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
1515 %1 = load i32, i32 addrspace(3)* %0
16 call void @llvm.AMDGPU.barrier.local()
16 call void @llvm.r600.group.barrier()
1717
1818 ; This will start a new clause for the vertex fetch
1919 %2 = load i32, i32 addrspace(1)* %in
2222 ret void
2323 }
2424
25 declare void @llvm.AMDGPU.barrier.local()
25 declare void @llvm.r600.group.barrier() nounwind convergent
2626
2727 ; The machine scheduler does not do proper alias analysis and assumes that
2828 ; loads from global values (Note that a global value is different than a
+0
-30
test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.global.ll less more
None ; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
2
3 ; FUNC-LABEL: {{^}}test_barrier_global:
4 ; EG: GROUP_BARRIER
5 ; SI: buffer_store_dword
6 ; SI: s_waitcnt
7 ; SI: s_barrier
8
9 define void @test_barrier_global(i32 addrspace(1)* %out) {
10 entry:
11 %0 = call i32 @llvm.r600.read.tidig.x()
12 %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0
13 store i32 %0, i32 addrspace(1)* %1
14 call void @llvm.AMDGPU.barrier.global()
15 %2 = call i32 @llvm.r600.read.local.size.x()
16 %3 = sub i32 %2, 1
17 %4 = sub i32 %3, %0
18 %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4
19 %6 = load i32, i32 addrspace(1)* %5
20 store i32 %6, i32 addrspace(1)* %1
21 ret void
22 }
23
24 declare void @llvm.AMDGPU.barrier.global()
25
26 declare i32 @llvm.r600.read.tidig.x() #0
27 declare i32 @llvm.r600.read.local.size.x() #0
28
29 attributes #0 = { readnone }
+0
-31
test/CodeGen/AMDGPU/llvm.AMDGPU.barrier.local.ll less more
None ; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
2
3 ; FUNC-LABEL: {{^}}test_barrier_local:
4 ; EG: GROUP_BARRIER
5
6 ; SI: buffer_store_dword
7 ; SI: s_waitcnt
8 ; SI: s_barrier
9
10 define void @test_barrier_local(i32 addrspace(1)* %out) {
11 entry:
12 %0 = call i32 @llvm.r600.read.tidig.x()
13 %1 = getelementptr i32, i32 addrspace(1)* %out, i32 %0
14 store i32 %0, i32 addrspace(1)* %1
15 call void @llvm.AMDGPU.barrier.local()
16 %2 = call i32 @llvm.r600.read.local.size.x()
17 %3 = sub i32 %2, 1
18 %4 = sub i32 %3, %0
19 %5 = getelementptr i32, i32 addrspace(1)* %out, i32 %4
20 %6 = load i32, i32 addrspace(1)* %5
21 store i32 %6, i32 addrspace(1)* %1
22 ret void
23 }
24
25 declare void @llvm.AMDGPU.barrier.local()
26
27 declare i32 @llvm.r600.read.tidig.x() #0
28 declare i32 @llvm.r600.read.local.size.x() #0
29
30 attributes #0 = { readnone }
0 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
1
2 ; EG-LABEL: {{^}}test_group_barrier:
3 ; EG: GROUP_BARRIER
4 define void @test_group_barrier(i32 addrspace(1)* %out) #0 {
5 entry:
6 %tmp = call i32 @llvm.r600.read.tidig.x()
7 %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp
8 store i32 %tmp, i32 addrspace(1)* %tmp1
9 call void @llvm.r600.group.barrier()
10 %tmp2 = call i32 @llvm.r600.read.local.size.x()
11 %tmp3 = sub i32 %tmp2, 1
12 %tmp4 = sub i32 %tmp3, %tmp
13 %tmp5 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp4
14 %tmp6 = load i32, i32 addrspace(1)* %tmp5
15 store i32 %tmp6, i32 addrspace(1)* %tmp1
16 ret void
17 }
18
19 ; Function Attrs: convergent nounwind
20 declare void @llvm.r600.group.barrier() #1
21
22 ; Function Attrs: nounwind readnone
23 declare i32 @llvm.r600.read.tidig.x() #2
24
25 ; Function Attrs: nounwind readnone
26 declare i32 @llvm.r600.read.local.size.x() #2
27
28 attributes #0 = { nounwind }
29 attributes #1 = { convergent nounwind }
30 attributes #2 = { nounwind readnone }
+0
-80
test/CodeGen/AMDGPU/local-memory-two-objects.ll less more
None ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
3
4 @local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
5 @local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
6
7
8 ; Check that the LDS size emitted correctly
9 ; EG: .long 166120
10 ; EG-NEXT: .long 8
11 ; GCN: .long 47180
12 ; GCN-NEXT: .long 32900
13
14
15 ; FUNC-LABEL: {{^}}local_memory_two_objects:
16
17 ; We would like to check the lds writes are using different
18 ; addresses, but due to variations in the scheduler, we can't do
19 ; this consistently on evergreen GPUs.
20 ; EG: LDS_WRITE
21 ; EG: LDS_WRITE
22
23 ; GROUP_BARRIER must be the last instruction in a clause
24 ; EG: GROUP_BARRIER
25 ; EG-NEXT: ALU clause
26
27 ; Make sure the lds reads are using different addresses, at different
28 ; constant offsets.
29 ; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
30 ; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
31
32
33 ; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
34 ; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*}} offset:16
35 ; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*$}}
36
37
38 ; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]]
39
40 ; SI-DAG: ds_write_b32 [[ADDRW]],
41 ; SI-DAG: ds_write_b32 [[ADDRW_OFF]],
42
43 ; GCN: s_barrier
44
45 ; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
46 ; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
47
48 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
49 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
50
51 ; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
52 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
53
54 define void @local_memory_two_objects(i32 addrspace(1)* %out) {
55 entry:
56 %x.i = call i32 @llvm.r600.read.tidig.x() #0
57 %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
58 store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
59 %mul = shl nsw i32 %x.i, 1
60 %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
61 store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
62 %sub = sub nsw i32 3, %x.i
63 call void @llvm.AMDGPU.barrier.local()
64 %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
65 %0 = load i32, i32 addrspace(3)* %arrayidx2, align 4
66 %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
67 store i32 %0, i32 addrspace(1)* %arrayidx3, align 4
68 %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
69 %1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
70 %add = add nsw i32 %x.i, 4
71 %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
72 store i32 %1, i32 addrspace(1)* %arrayidx5, align 4
73 ret void
74 }
75
76 declare i32 @llvm.r600.read.tidig.x() #0
77 declare void @llvm.AMDGPU.barrier.local()
78
79 attributes #0 = { readnone }
0 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
2
3 @local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
4
5 ; Check that the LDS size is emitted correctly
6 ; SI: .long 47180
7 ; SI-NEXT: .long 65668
8 ; CI: .long 47180
9 ; CI-NEXT: .long 32900
10
11 ; GCN-LABEL: {{^}}local_memory:
12
13 ; GCN-NOT: s_wqm_b64
14 ; GCN: ds_write_b32
15
16 ; GCN: s_barrier
17
18 ; GCN: ds_read_b32 {{v[0-9]+}},
19 define void @local_memory(i32 addrspace(1)* %out) #0 {
20 entry:
21 %y.i = call i32 @llvm.amdgcn.workitem.id.x() #1
22 %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
23 store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
24 %add = add nsw i32 %y.i, 1
25 %cmp = icmp eq i32 %add, 16
26 %.add = select i1 %cmp, i32 0, i32 %add
27 call void @llvm.amdgcn.s.barrier()
28 %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
29 %tmp = load i32, i32 addrspace(3)* %arrayidx1, align 4
30 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
31 store i32 %tmp, i32 addrspace(1)* %arrayidx2, align 4
32 ret void
33 }
34
35 @local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
36 @local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
37
38 ; Check that the LDS size is emitted correctly
39 ; EG: .long 166120
40 ; EG-NEXT: .long 8
41 ; GCN: .long 47180
42 ; GCN-NEXT: .long 32900
43
44 ; GCN-LABEL: {{^}}local_memory_two_objects:
45 ; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
46 ; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*}} offset:16
47 ; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*$}}
48
49 ; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]]
50
51 ; SI-DAG: ds_write_b32 [[ADDRW]],
52 ; SI-DAG: ds_write_b32 [[ADDRW_OFF]],
53
54 ; GCN: s_barrier
55
56 ; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
57 ; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
58
59 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
60 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
61
62 ; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
63 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
64 define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
65 entry:
66 %x.i = call i32 @llvm.amdgcn.workitem.id.x()
67 %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
68 store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
69 %mul = shl nsw i32 %x.i, 1
70 %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
71 store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
72 %sub = sub nsw i32 3, %x.i
73 call void @llvm.amdgcn.s.barrier()
74 %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
75 %tmp = load i32, i32 addrspace(3)* %arrayidx2, align 4
76 %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
77 store i32 %tmp, i32 addrspace(1)* %arrayidx3, align 4
78 %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
79 %tmp1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
80 %add = add nsw i32 %x.i, 4
81 %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
82 store i32 %tmp1, i32 addrspace(1)* %arrayidx5, align 4
83 ret void
84 }
85
86 declare i32 @llvm.amdgcn.workitem.id.x() #1
87 declare void @llvm.amdgcn.s.barrier() #2
88
89 attributes #0 = { nounwind }
90 attributes #1 = { nounwind readnone }
91 attributes #2 = { convergent nounwind }
None ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
0 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
1 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
22 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
33
44 @local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
55
6
7 ; Check that the LDS size emitted correctly
8 ; EG: .long 166120
9 ; EG-NEXT: .long 128
10 ; SI: .long 47180
11 ; SI-NEXT: .long 65668
12 ; CI: .long 47180
13 ; CI-NEXT: .long 32900
14
15 ; FUNC-LABEL: {{^}}local_memory:
16
17 ; EG: LDS_WRITE
18 ; SI-NOT: s_wqm_b64
19 ; SI: ds_write_b32
20
21 ; GROUP_BARRIER must be the last instruction in a clause
22 ; EG: GROUP_BARRIER
23 ; EG-NEXT: ALU clause
24 ; SI: s_barrier
25
26 ; EG: LDS_READ_RET
27 ; SI: ds_read_b32 {{v[0-9]+}},
28
29 define void @local_memory(i32 addrspace(1)* %out) {
30 entry:
31 %y.i = call i32 @llvm.r600.read.tidig.x() #0
32 %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
33 store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
34 %add = add nsw i32 %y.i, 1
35 %cmp = icmp eq i32 %add, 16
36 %.add = select i1 %cmp, i32 0, i32 %add
37 call void @llvm.AMDGPU.barrier.local()
38 %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
39 %0 = load i32, i32 addrspace(3)* %arrayidx1, align 4
40 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
41 store i32 %0, i32 addrspace(1)* %arrayidx2, align 4
42 ret void
43 }
44
456 @lds = addrspace(3) global [512 x i32] undef, align 4
467
47 ; On SI we need to make sure that the base offset is a register and not
48 ; an immediate.
8 ; On SI we need to make sure that the base offset is a register and
9 ; not an immediate.
10
4911 ; FUNC-LABEL: {{^}}load_i32_local_const_ptr:
5012 ; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
5113 ; GCN: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4
14
5215 ; R600: LDS_READ_RET
53 define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
16 define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
5417 entry:
5518 %tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1
5619 %tmp1 = load i32, i32 addrspace(3)* %tmp0
6629 ; R600: LDS_READ_RET
6730 ; GCN-DAG: ds_read_b32
6831 ; GCN-DAG: ds_read2_b32
69 define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) {
32 define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
7033 %scalar = load i32, i32 addrspace(3)* %in
7134 %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)*
7235 %vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2
7740 ret void
7841 }
7942
80 declare i32 @llvm.r600.read.tidig.x() #0
81 declare void @llvm.AMDGPU.barrier.local()
82
83 attributes #0 = { readnone }
43 attributes #0 = { nounwind }
0 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
1
2 @local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
3
4 ; Check that the LDS size is emitted correctly
5 ; EG: .long 166120
6 ; EG-NEXT: .long 128
7
8 ; FUNC-LABEL: {{^}}local_memory:
9
10 ; EG: LDS_WRITE
11
12 ; GROUP_BARRIER must be the last instruction in a clause
13 ; EG: GROUP_BARRIER
14 ; EG-NEXT: ALU clause
15
16 ; EG: LDS_READ_RET
17 define void @local_memory(i32 addrspace(1)* %out) #0 {
18 entry:
19 %y.i = call i32 @llvm.r600.read.tidig.x() #1
20 %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
21 store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
22 %add = add nsw i32 %y.i, 1
23 %cmp = icmp eq i32 %add, 16
24 %.add = select i1 %cmp, i32 0, i32 %add
25 call void @llvm.r600.group.barrier()
26 %arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
27 %tmp = load i32, i32 addrspace(3)* %arrayidx1, align 4
28 %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
29 store i32 %tmp, i32 addrspace(1)* %arrayidx2, align 4
30 ret void
31 }
32
33 @local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
34 @local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
35
36 ; Check that the LDS size is emitted correctly
37 ; EG: .long 166120
38 ; EG-NEXT: .long 8
39 ; GCN: .long 47180
40 ; GCN-NEXT: .long 32900
41
42 ; FUNC-LABEL: {{^}}local_memory_two_objects:
43
44 ; We would like to check the lds writes are using different
45 ; addresses, but due to variations in the scheduler, we can't do
46 ; this consistently on evergreen GPUs.
47 ; EG: LDS_WRITE
48 ; EG: LDS_WRITE
49
50 ; GROUP_BARRIER must be the last instruction in a clause
51 ; EG: GROUP_BARRIER
52 ; EG-NEXT: ALU clause
53
54 ; Make sure the lds reads are using different addresses, at different
55 ; constant offsets.
56 ; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
57 ; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
58
59 define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
60 entry:
61 %x.i = call i32 @llvm.r600.read.tidig.x() #1
62 %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
63 store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
64 %mul = shl nsw i32 %x.i, 1
65 %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
66 store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
67 %sub = sub nsw i32 3, %x.i
68 call void @llvm.r600.group.barrier()
69 %arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
70 %tmp = load i32, i32 addrspace(3)* %arrayidx2, align 4
71 %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
72 store i32 %tmp, i32 addrspace(1)* %arrayidx3, align 4
73 %arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
74 %tmp1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
75 %add = add nsw i32 %x.i, 4
76 %arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
77 store i32 %tmp1, i32 addrspace(1)* %arrayidx5, align 4
78 ret void
79 }
80
81 declare i32 @llvm.r600.read.tidig.x() #1
82 declare void @llvm.r600.group.barrier() #2
83
84 attributes #0 = { nounwind }
85 attributes #1 = { nounwind readnone }
86 attributes #2 = { convergent nounwind }
None ; XFAIL: *
1 ; REQUIRES: asserts
2 ; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI
3 ; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI
0 ; RUN: llc -O0 -march=amdgcn -verify-machineinstrs -mattr=+vgpr-spilling < %s | FileCheck -check-prefix=GCN %s
1 ; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+vgpr-spilling < %s | FileCheck -check-prefix=GCN %s
42
5 declare void @llvm.AMDGPU.barrier.local() nounwind convergent
3 declare void @llvm.amdgcn.s.barrier() nounwind convergent
64
7
8 ; SI-LABEL: {{^}}main(
5 ; GCN-LABEL: {{^}}main:
96 define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
107 main_body:
118 %0 = extractelement <4 x float> %reg1, i32 0
3835 %temp3.0 = phi float [ 0.000000e+00, %main_body ], [ %101, %Flow2 ]
3936 %15 = extractelement <4 x float> %reg1, i32 1
4037 %16 = extractelement <4 x float> %reg1, i32 3
41 %17 = load <4 x float>, <4 x float> addrspace(9)* null
38 %17 = load <4 x float>, <4 x float> addrspace(2)* null
4239 %18 = extractelement <4 x float> %17, i32 0
4340 %19 = fmul float %18, %0
44 %20 = load <4 x float>, <4 x float> addrspace(9)* null
41 %20 = load <4 x float>, <4 x float> addrspace(2)* null
4542 %21 = extractelement <4 x float> %20, i32 1
4643 %22 = fmul float %21, %0
47 %23 = load <4 x float>, <4 x float> addrspace(9)* null
44 %23 = load <4 x float>, <4 x float> addrspace(2)* null
4845 %24 = extractelement <4 x float> %23, i32 2
4946 %25 = fmul float %24, %0
50 %26 = load <4 x float>, <4 x float> addrspace(9)* null
47 %26 = load <4 x float>, <4 x float> addrspace(2)* null
5148 %27 = extractelement <4 x float> %26, i32 3
5249 %28 = fmul float %27, %0
53 %29 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
50 %29 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
5451 %30 = extractelement <4 x float> %29, i32 0
5552 %31 = fmul float %30, %15
5653 %32 = fadd float %31, %19
57 %33 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
54 %33 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
5855 %34 = extractelement <4 x float> %33, i32 1
5956 %35 = fmul float %34, %15
6057 %36 = fadd float %35, %22
61 %37 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
58 %37 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
6259 %38 = extractelement <4 x float> %37, i32 2
6360 %39 = fmul float %38, %15
6461 %40 = fadd float %39, %25
65 %41 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
62 %41 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 1)
6663 %42 = extractelement <4 x float> %41, i32 3
6764 %43 = fmul float %42, %15
6865 %44 = fadd float %43, %28
69 %45 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
66 %45 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
7067 %46 = extractelement <4 x float> %45, i32 0
7168 %47 = fmul float %46, %1
7269 %48 = fadd float %47, %32
73 %49 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
70 %49 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
7471 %50 = extractelement <4 x float> %49, i32 1
7572 %51 = fmul float %50, %1
7673 %52 = fadd float %51, %36
77 %53 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
74 %53 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
7875 %54 = extractelement <4 x float> %53, i32 2
7976 %55 = fmul float %54, %1
8077 %56 = fadd float %55, %40
81 %57 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
78 %57 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 2)
8279 %58 = extractelement <4 x float> %57, i32 3
8380 %59 = fmul float %58, %1
8481 %60 = fadd float %59, %44
85 %61 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
82 %61 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
8683 %62 = extractelement <4 x float> %61, i32 0
8784 %63 = fmul float %62, %16
8885 %64 = fadd float %63, %48
89 %65 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
86 %65 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
9087 %66 = extractelement <4 x float> %65, i32 1
9188 %67 = fmul float %66, %16
9289 %68 = fadd float %67, %52
93 %69 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
90 %69 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
9491 %70 = extractelement <4 x float> %69, i32 2
9592 %71 = fmul float %70, %16
9693 %72 = fadd float %71, %56
97 %73 = load <4 x float>, <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
94 %73 = load <4 x float>, <4 x float> addrspace(2)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(2)* null, i64 0, i32 3)
9895 %74 = extractelement <4 x float> %73, i32 3
9996 %75 = fmul float %74, %16
10097 %76 = fadd float %75, %60
10299 %78 = insertelement <4 x float> %77, float %68, i32 1
103100 %79 = insertelement <4 x float> %78, float %72, i32 2
104101 %80 = insertelement <4 x float> %79, float %76, i32 3
105 call void @llvm.AMDGPU.barrier.local()
102 call void @llvm.amdgcn.s.barrier()
106103 %81 = insertelement <4 x float> undef, float %temp.0, i32 0
107104 %82 = insertelement <4 x float> %81, float %temp1.0, i32 1
108105 %83 = insertelement <4 x float> %82, float %temp2.0, i32 2
109106 %84 = insertelement <4 x float> %83, float %temp3.0, i32 3
110 call void @llvm.AMDGPU.barrier.local()
107 call void @llvm.amdgcn.s.barrier()
111108 ret void
112109
113110 LOOP: ; preds = %main_body, %Flow