llvm.org GIT mirror llvm / faf7289
AMDGPU/SI: Enable load-store-opt by default.
Summary: Enable load-store-opt by default, and update LIT tests.
Reviewers: arsenm
Differential Revision: http://reviews.llvm.org/D20694
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@270894 91177308-0d34-0410-b5e6-96231b3b80d8
Changpeng Fang, 3 years ago
26 changed file(s) with 80 addition(s) and 93 deletion(s). Raw diff Collapse all Expand all
5555 // for SI has the unhelpful behavior that it unsets everything else if you
5656 // disable it.
5757
58 SmallString<256> FullFS("+promote-alloca,+fp64-denormals,");
58 SmallString<256> FullFS("+promote-alloca,+fp64-denormals,+load-store-opt,");
5959 if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
6060 FullFS += "+flat-for-global,";
6161 FullFS += FS;
77 ; CHECK-LABEL: {{^}}do_as_ptr_calcs:
88 ; CHECK: s_load_dword [[SREG1:s[0-9]+]],
99 ; CHECK: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
10 ; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:12
11 ; CHECK-DAG: ds_read_b32 v{{[0-9]+}}, [[VREG1]] offset:20
10 ; CHECK-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[VREG1]] offset0:3 offset1:5
1211 define void @do_as_ptr_calcs(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
1312 entry:
1413 %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
6161 ; GCN: buffer_load_dword [[VAL0:v[0-9]+]],
6262 ; GCN: s_waitcnt
6363 ; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
64 ; GCN-NEXT: buffer_store_dword [[RESULT]],
64 ; GCN: buffer_store_dword [[RESULT]],
6565 ; GCN: s_endpgm
6666 define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
6767 %val0 = load i32, i32 addrspace(1)* %in0, align 4
202202 }
203203
204204 ; FUNC-LABEL: {{^}}v_ctpop_i32_add_literal:
205 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
206 ; GCN: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
205 ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]],
206 ; GCN-DAG: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f
207207 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
208208 ; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
209209 ; GCN: buffer_store_dword [[RESULT]],
66 ; GCN-LABEL: {{^}}reschedule_global_load_lds_store:
77 ; GCN: buffer_load_dword
88 ; GCN: buffer_load_dword
9 ; GCN: ds_write_b32
10 ; GCN: ds_write_b32
9 ; GCN: ds_write2_b32
1110 ; GCN: s_endpgm
1211 define void @reschedule_global_load_lds_store(i32 addrspace(1)* noalias %gptr0, i32 addrspace(1)* noalias %gptr1, i32 addrspace(3)* noalias %lptr, i32 %c) #0 {
1312 entry:
1212 ; CI: v_ceil_f64_e32
1313 ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
1414 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
15 ; SI-DAG: s_addk_i32 [[SEXP]], 0xfc01
16 ; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP]]
15 ; SI-DAG: s_add_i32 [[SEXP1:s[0-9]+]], [[SEXP]], 0xfffffc01
16 ; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP1]]
1717 ; SI-DAG: s_not_b64
1818 ; SI-DAG: s_and_b64
1919 ; SI-DAG: cmp_gt_i32
1010 ; SI: buffer_store_dword [[RESULT]],
1111 ; SI: s_endpgm
1212 define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
13 %a = load float, float addrspace(1)* %aptr, align 4
14 %b = load float, float addrspace(1)* %bptr, align 4
15 %c = load float, float addrspace(1)* %cptr, align 4
13 %a = load volatile float, float addrspace(1)* %aptr, align 4
14 %b = load volatile float, float addrspace(1)* %bptr, align 4
15 %c = load volatile float, float addrspace(1)* %cptr, align 4
1616 %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
1717 %f1 = call float @llvm.maxnum.f32(float %f0, float %c) nounwind readnone
1818 store float %f1, float addrspace(1)* %out, align 4
2828 ; SI: buffer_store_dword [[RESULT]],
2929 ; SI: s_endpgm
3030 define void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
31 %a = load float, float addrspace(1)* %aptr, align 4
32 %b = load float, float addrspace(1)* %bptr, align 4
33 %c = load float, float addrspace(1)* %cptr, align 4
31 %a = load volatile float, float addrspace(1)* %aptr, align 4
32 %b = load volatile float, float addrspace(1)* %bptr, align 4
33 %c = load volatile float, float addrspace(1)* %cptr, align 4
3434 %f0 = call float @llvm.maxnum.f32(float %a, float %b) nounwind readnone
3535 %f1 = call float @llvm.maxnum.f32(float %c, float %f0) nounwind readnone
3636 store float %f1, float addrspace(1)* %out, align 4
1111 ; SI: buffer_store_dword [[RESULT]],
1212 ; SI: s_endpgm
1313 define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
14 %a = load float, float addrspace(1)* %aptr, align 4
15 %b = load float, float addrspace(1)* %bptr, align 4
16 %c = load float, float addrspace(1)* %cptr, align 4
14 %a = load volatile float, float addrspace(1)* %aptr, align 4
15 %b = load volatile float, float addrspace(1)* %bptr, align 4
16 %c = load volatile float, float addrspace(1)* %cptr, align 4
1717 %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
1818 %f1 = call float @llvm.minnum.f32(float %f0, float %c) nounwind readnone
1919 store float %f1, float addrspace(1)* %out, align 4
2929 ; SI: buffer_store_dword [[RESULT]],
3030 ; SI: s_endpgm
3131 define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
32 %a = load float, float addrspace(1)* %aptr, align 4
33 %b = load float, float addrspace(1)* %bptr, align 4
34 %c = load float, float addrspace(1)* %cptr, align 4
32 %a = load volatile float, float addrspace(1)* %aptr, align 4
33 %b = load volatile float, float addrspace(1)* %bptr, align 4
34 %c = load volatile float, float addrspace(1)* %cptr, align 4
3535 %f0 = call float @llvm.minnum.f32(float %a, float %b) nounwind readnone
3636 %f1 = call float @llvm.minnum.f32(float %c, float %f0) nounwind readnone
3737 store float %f1, float addrspace(1)* %out, align 4
152152 ; }
153153
154154 ; FUNC-LABEL: {{^}}zextload_global_i1_to_i64:
155 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]],
156 ; SI: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
155 ; SI-DAG: buffer_load_ubyte [[LOAD:v[0-9]+]],
156 ; SI-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
157157 ; SI: buffer_store_dwordx2
158158 define void @zextload_global_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
159159 %a = load i1, i1 addrspace(1)* %in
153153 }
154154
155155 ; FUNC-LABEL: {{^}}zextload_global_i16_to_i64:
156 ; SI: buffer_load_ushort v[[LO:[0-9]+]],
157 ; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
156 ; SI-DAG: buffer_load_ushort v[[LO:[0-9]+]],
157 ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
158158 ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
159159 define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
160160 %a = load i16, i16 addrspace(1)* %in
22 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
33
44 ; FUNC-LABEL: {{^}}zextload_global_i32_to_i64:
5 ; SI: buffer_load_dword v[[LO:[0-9]+]],
6 ; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
5 ; SI-DAG: buffer_load_dword v[[LO:[0-9]+]],
6 ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
77 ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
88 define void @zextload_global_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
99 %a = load i32, i32 addrspace(1)* %in
150150 ; }
151151
152152 ; FUNC-LABEL: {{^}}zextload_global_i8_to_i64:
153 ; SI: buffer_load_ubyte v[[LO:[0-9]+]],
154 ; SI: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
153 ; SI-DAG: buffer_load_ubyte v[[LO:[0-9]+]],
154 ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
155155 ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
156156 define void @zextload_global_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
157157 %a = load i8, i8 addrspace(1)* %in
0 ; RUN: llc -march=amdgcn -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA16 -check-prefix=SI %s
11 ; RUN: llc -march=amdgcn -mattr=-promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA4 -check-prefix=SI %s
22 ; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA16 -check-prefix=SI %s
4 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=CI-ALLOCA16 -check-prefix=SI %s
4 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=CI-PROMOTE -check-prefix=SI %s
55
66 declare void @llvm.amdgcn.s.barrier() #0
77
1717
1818 ; SI-PROMOTE: ds_write_b64
1919 ; SI-PROMOTE: ds_read_b64
20 ; CI-PROMOTE: ds_write_b64
21 ; CI-PROMOTE: ds_read_b64
2022 define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 {
2123 %val = load double, double addrspace(1)* %in, align 8
2224 %array = alloca [16 x double], align 8
4648 ; SI-PROMOTE: ds_write_b64
4749 ; SI-PROMOTE: ds_read_b64
4850 ; SI-PROMOTE: ds_read_b64
51 ; CI-PROMOTE: ds_write2_b64
52 ; CI-PROMOTE: ds_read2_b64
4953 define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 {
5054 %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16
5155 %array = alloca [8 x <2 x double>], align 16
7074
7175 ; SI-PROMOTE: ds_write_b64
7276 ; SI-PROMOTE: ds_read_b64
77 ; CI-PROMOTE: ds_write_b64
78 ; CI-PROMOTE: ds_read_b64
7379 define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) #1 {
7480 %val = load i64, i64 addrspace(1)* %in, align 8
7581 %array = alloca [8 x i64], align 8
100106 ; SI-PROMOTE: ds_write_b64
101107 ; SI-PROMOTE: ds_read_b64
102108 ; SI-PROMOTE: ds_read_b64
109 ; CI-PROMOTE: ds_write2_b64
110 ; CI-PROMOTE: ds_read2_b64
103111 define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 {
104112 %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
105113 %array = alloca [8 x <2 x i64>], align 16
77
88 ; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}]
99 ; TODO: this constant should be folded:
10 ; VI-DAG: s_mov_b32 s[[ALLBITS:[0-9]+]], -1
10 ; VI-DAG: s_mov_b32 s[[LOW1:[0-9]+]], -1
1111 ; VI-DAG: s_mov_b32 s[[HIGH1:[0-9]+]], 0x7fefffff
1212 ; VI-DAG: s_mov_b32 s[[HIGH2:[0-9]+]], 0xffefffff
13 ; VI-DAG: s_mov_b32 s[[LOW1:[0-9]+]], s[[ALLBITS]]
14 ; VI: s_mov_b32 s[[LOW2:[0-9]+]], s[[ALLBITS]]
15 ; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
16 ; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]]
13 ; VI-DAG: s_mov_b32 s[[LOW2:[0-9]+]], s[[LOW1]]
14 ; VI-DAG: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
15 ; VI-DAG: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]]
1716
1817 define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind {
1918 %rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone
2424 ; SI: v_rsq_clamp_f64_e32
2525
2626 ; TODO: this constant should be folded:
27 ; VI-DAG: s_mov_b32 s[[ALLBITS:[0-9]+]], -1
27 ; VI-DAG: s_mov_b32 s[[LOW1:[0-9]+]], -1
2828 ; VI-DAG: s_mov_b32 s[[HIGH1:[0-9]+]], 0x7fefffff
2929 ; VI-DAG: s_mov_b32 s[[HIGH2:[0-9]+]], 0xffefffff
30 ; VI-DAG: s_mov_b32 s[[LOW1:[0-9]+]], s[[ALLBITS]]
3130 ; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}]
32 ; VI: s_mov_b32 s[[LOW2:[0-9]+]], s[[ALLBITS]]
33 ; VI: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
34 ; VI: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]]
31 ; VI-DAG: s_mov_b32 s[[LOW2:[0-9]+]], s[[LOW1]]
32 ; VI-DAG: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
33 ; VI-DAG: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW2]]:[[HIGH2]]]
3534 define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 {
3635 %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
3736 store double %rsq_clamp, double addrspace(1)* %out
152152 ; FIXME: Use 64-bit ops
153153 ; FUNC-LABEL: {{^}}test_small_memcpy_i64_lds_to_lds_align8:
154154
155 ; SI: ds_read_b64
156 ; SI: ds_read_b64
157 ; SI: ds_read_b64
158 ; SI: ds_read_b64
159
160 ; SI: ds_write_b64
161 ; SI: ds_write_b64
162 ; SI: ds_write_b64
163 ; SI: ds_write_b64
155 ; SI: ds_read2_b64
156 ; SI: ds_read2_b64
157
158 ; SI: ds_write2_b64
159 ; SI: ds_write2_b64
164160
165161 ; SI-DAG: s_endpgm
166162 define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
8484 }
8585
8686 ; FUNC-LABEL: {{^}}global_zextload_i1_to_i64:
87 ; SI: buffer_load_ubyte
88 ; SI: v_mov_b32_e32 {{v[0-9]+}}, 0
87 ; SI-DAG: buffer_load_ubyte
88 ; SI-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0
8989 ; SI: buffer_store_dwordx2
9090 ; SI: s_endpgm
9191 define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
121121
122122 ; BOTH-LABEL: {{^}}local_v2i64_store:
123123 ; BOTH-NOT: ADD
124 ; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:112
125 ; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:120
124 ; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:15 offset1:14
126125 ; BOTH: s_endpgm
127126 define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
128127 %gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7
132131
133132 ; BOTH-LABEL: {{^}}local_v2i64_store_0_offset:
134133 ; BOTH-NOT: ADD
135 ; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
136 ; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8
134 ; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:1
137135 ; BOTH: s_endpgm
138136 define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
139137 store <2 x i64> , <2 x i64> addrspace(3)* %out, align 16
142140
143141 ; BOTH-LABEL: {{^}}local_v4i64_store:
144142 ; BOTH-NOT: ADD
145 ; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:224
146 ; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:232
147 ; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:240
148 ; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:248
143 ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:31 offset1:30
144 ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:29 offset1:28
149145 ; BOTH: s_endpgm
150146 define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
151147 %gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7
155151
156152 ; BOTH-LABEL: {{^}}local_v4i64_store_0_offset:
157153 ; BOTH-NOT: ADD
158 ; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
159 ; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:8
160 ; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:16
161 ; BOTH-DAG: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:24
154 ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:3 offset1:2
155 ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:1
162156 ; BOTH: s_endpgm
163157 define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
164158 store <4 x i64> , <4 x i64> addrspace(3)* %out, align 16
3131 ; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
3232 ; SI: v_add_i32_e32 [[SIPTR:v[0-9]+]], vcc, 16, v{{[0-9]+}}
3333 ; SI: ds_read_b32 {{v[0-9]+}}, [[SIPTR]]
34 ; CI-DAG: ds_read_b32 {{v[0-9]+}}, [[ADDRR:v[0-9]+]] offset:16
35 ; CI-DAG: ds_read_b32 {{v[0-9]+}}, [[ADDRR]]
36
34 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:4
3735 define void @local_memory_two_objects(i32 addrspace(1)* %out) {
3836 entry:
3937 %x.i = call i32 @llvm.r600.read.tidig.x() #0
66
77 ; FUNC-LABEL: {{^}}missing_store_reduced:
88 ; SI: ds_read_b64
9 ; SI: buffer_store_dword
10 ; SI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
9 ; SI-DAG: buffer_store_dword
10 ; SI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
1111 ; SI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
1212 ; SI: s_load_dword
1313 ; SI: s_nop 2
1515 }
1616
1717 ; SI-LABEL: {{^}}no_reorder_scalarized_v2f64_local_load_store:
18 ; SI: ds_read_b64
19 ; SI: ds_read_b64
20 ; SI: ds_write_b64
21 ; SI: ds_write_b64
18 ; SI: ds_read2_b64
19 ; SI: ds_write2_b64
2220 ; SI: s_endpgm
2321 define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind {
2422 %tmp1 = load <2 x double>, <2 x double> addrspace(3)* %x, align 16
3333 ; working.
3434
3535 ; FUNC-LABEL: {{^}}slow_sdiv_i32_3435:
36 ; SI: buffer_load_dword [[VAL:v[0-9]+]],
37 ; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b
36 ; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]],
37 ; SI-DAG: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b
3838 ; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[MAGIC]], [[VAL]]
3939 ; SI: v_add_i32
4040 ; SI: v_lshrrev_b32
3939 }
4040
4141 ; GCN-LABEL: {{^}}lshr_i64_32:
42 ; GCN: buffer_load_dword v[[LO:[0-9]+]]
43 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
42 ; GCN-DAG: buffer_load_dword v[[LO:[0-9]+]]
43 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
4444 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
4545 define void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
4646 %val = load i64, i64 addrspace(1)* %in
8080 }
8181
8282 ; GCN-LABEL: {{^}}shl_i64_const_32:
83 ; GCN: buffer_load_dword v[[HI:[0-9]+]]
84 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
83 ; GCN-DAG: buffer_load_dword v[[HI:[0-9]+]]
84 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
8585 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
8686 define void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
8787 %val = load i64, i64 addrspace(1)* %in
99 @stored_global_ptr = addrspace(3) global i32 addrspace(1)* undef, align 8
1010
1111 ; FUNC-LABEL: @reorder_local_load_global_store_local_load
12 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
13 ; CI-NEXT: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
12 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3
1413 ; CI: buffer_store_dword
1514 define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
1615 %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
7069 }
7170
7271 ; FUNC-LABEL: @reorder_constant_load_global_store_constant_load
73 ; CI: buffer_store_dword
74 ; CI: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
72 ; CI-DAG: buffer_store_dword
73 ; CI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
7574 ; CI: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
7675 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
7776 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3
155154 }
156155
157156 ; FUNC-LABEL: @reorder_local_offsets
158 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
159 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
157 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:100 offset1:102
160158 ; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
161159 ; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
162160 ; CI: buffer_store_dword
286286 ; CM: LDS_WRITE
287287 ; CM: LDS_WRITE
288288
289 ; SI: ds_write_b64
290 ; SI: ds_write_b64
289 ; SI: ds_write2_b64
291290 define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
292291 entry:
293292 store <4 x i32> %in, <4 x i32> addrspace(3)* %out
4141 }
4242
4343 ; GCN-LABEL: {{^}}test_use_s_v_s:
44 ; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
45 ; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
46
4447 ; GCN: buffer_load_dword [[VA0:v[0-9]+]]
4548 ; GCN-NOT: v_mov_b32
4649 ; GCN: buffer_load_dword [[VA1:v[0-9]+]]
47
48 ; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
49 ; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
5050
5151 ; GCN-NOT: v_mov_b32
5252 ; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
6363 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]]
6464 ; SI: buffer_store_byte [[RESULT]]
6565 define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
66 %a = load i1, i1 addrspace(1)* %in0
67 %b = load i1, i1 addrspace(1)* %in1
66 %a = load volatile i1, i1 addrspace(1)* %in0
67 %b = load volatile i1, i1 addrspace(1)* %in1
6868 %xor = xor i1 %a, %b
6969 store i1 %xor, i1 addrspace(1)* %out
7070 ret void