llvm.org GIT mirror llvm / 743f98a
AMDGPU: Generate check lines. Checking all the instructions will help catch LICM changes when passes are reordered. Also switch to using gfx9, since global stores make the relevant instructions more obvious. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@369855 91177308-0d34-0410-b5e6-96231b3b80d8 (Matt Arsenault, 28 days ago)
1 changed file(s) with 345 addition(s) and 106 deletion(s). Raw diff Collapse all Expand all
None ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
1
2 ; GCN-LABEL: {{^}}udiv32_invariant_denom:
3 ; GCN: v_cvt_f32_u32
4 ; GCN: v_rcp_iflag_f32
5 ; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x4f800000,
6 ; GCN: v_cvt_u32_f32_e32
7 ; GCN-DAG: v_mul_hi_u32
8 ; GCN-DAG: v_mul_lo_u32
9 ; GCN-DAG: v_sub_i32_e32
10 ; GCN-DAG: v_cmp_eq_u32_e64
11 ; GCN-DAG: v_cndmask_b32_e64
12 ; GCN-DAG: v_mul_hi_u32
13 ; GCN-DAG: v_add_i32_e32
14 ; GCN-DAG: v_subrev_i32_e32
15 ; GCN-DAG: v_cndmask_b32_e64
16 ; GCN: [[LOOP:BB[0-9_]+]]:
17 ; GCN-NOT: v_rcp
18 ; GCN: s_cbranch_scc0 [[LOOP]]
19 ; GCN: s_endpgm
0 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
2
203 define amdgpu_kernel void @udiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
4 ; GFX9-LABEL: udiv32_invariant_denom:
5 ; GFX9: ; %bb.0: ; %bb
6 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
7 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
8 ; GFX9-NEXT: s_mov_b32 s5, 0
9 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
10 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s4
11 ; GFX9-NEXT: s_sub_i32 s8, 0, s4
12 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
13 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
14 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
15 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, s4
16 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, s4
17 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v1
18 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
19 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
20 ; GFX9-NEXT: v_mul_hi_u32 v1, v1, v0
21 ; GFX9-NEXT: v_add_u32_e32 v2, v0, v1
22 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1
23 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
25 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
26 ; GFX9-NEXT: BB0_1: ; %bb3
27 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
28 ; GFX9-NEXT: v_mul_lo_u32 v3, s8, v2
29 ; GFX9-NEXT: v_mul_lo_u32 v4, v2, s4
30 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v2
31 ; GFX9-NEXT: v_add_u32_e32 v5, -1, v2
32 ; GFX9-NEXT: v_add_u32_e32 v3, s5, v3
33 ; GFX9-NEXT: v_cmp_ge_u32_e32 vcc, s5, v4
34 ; GFX9-NEXT: v_cmp_le_u32_e64 s[0:1], s4, v3
35 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
36 ; GFX9-NEXT: s_and_b64 s[0:1], s[0:1], vcc
37 ; GFX9-NEXT: s_add_i32 s5, s5, 1
38 ; GFX9-NEXT: v_mov_b32_e32 v4, s7
39 ; GFX9-NEXT: s_add_u32 s6, s6, 4
40 ; GFX9-NEXT: s_addc_u32 s7, s7, 0
41 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v2, v6, s[0:1]
42 ; GFX9-NEXT: v_add_co_u32_e64 v1, s[2:3], v1, v0
43 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v6, vcc
44 ; GFX9-NEXT: v_addc_co_u32_e64 v2, s[0:1], 0, v2, s[2:3]
45 ; GFX9-NEXT: s_cmpk_eq_i32 s5, 0x400
46 ; GFX9-NEXT: global_store_dword v[3:4], v5, off
47 ; GFX9-NEXT: s_cbranch_scc0 BB0_1
48 ; GFX9-NEXT: ; %bb.2: ; %bb2
49 ; GFX9-NEXT: s_endpgm
2150 bb:
2251 br label %bb3
2352
3564 br i1 %tmp8, label %bb2, label %bb3
3665 }
3766
38 ; GCN-LABEL: {{^}}urem32_invariant_denom:
39 ; GCN: v_cvt_f32_u32
40 ; GCN: v_rcp_iflag_f32
41 ; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x4f800000,
42 ; GCN: v_cvt_u32_f32_e32
43 ; GCN-DAG: v_mul_hi_u32
44 ; GCN-DAG: v_mul_lo_u32
45 ; GCN-DAG: v_sub_i32_e32
46 ; GCN-DAG: v_cmp_eq_u32_e64
47 ; GCN-DAG: v_cndmask_b32_e64
48 ; GCN-DAG: v_mul_hi_u32
49 ; GCN-DAG: v_add_i32_e32
50 ; GCN-DAG: v_subrev_i32_e32
51 ; GCN-DAG: v_cndmask_b32_e64
52 ; GCN: [[LOOP:BB[0-9_]+]]:
53 ; GCN-NOT: v_rcp
54 ; GCN: s_cbranch_scc0 [[LOOP]]
55 ; GCN: s_endpgm
5667 define amdgpu_kernel void @urem32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
68 ; GFX9-LABEL: urem32_invariant_denom:
69 ; GFX9: ; %bb.0: ; %bb
70 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
71 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
72 ; GFX9-NEXT: s_mov_b32 s3, 0
73 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
74 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
75 ; GFX9-NEXT: s_sub_i32 s6, 0, s2
76 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
77 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
78 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
79 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, s2
80 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, s2
81 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v1
82 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
83 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
84 ; GFX9-NEXT: v_mul_hi_u32 v1, v1, v0
85 ; GFX9-NEXT: v_add_u32_e32 v2, v0, v1
86 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1
87 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
88 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
89 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
90 ; GFX9-NEXT: BB1_1: ; %bb3
91 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
92 ; GFX9-NEXT: v_mul_lo_u32 v5, s6, v2
93 ; GFX9-NEXT: v_sub_u32_e32 v6, 1, v2
94 ; GFX9-NEXT: v_not_b32_e32 v7, v2
95 ; GFX9-NEXT: v_mul_lo_u32 v8, v2, s2
96 ; GFX9-NEXT: v_mul_lo_u32 v6, s2, v6
97 ; GFX9-NEXT: v_mul_lo_u32 v7, s2, v7
98 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v0
99 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
100 ; GFX9-NEXT: v_add_u32_e32 v5, s3, v5
101 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v5
102 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s3, v8
103 ; GFX9-NEXT: v_mov_b32_e32 v3, s4
104 ; GFX9-NEXT: v_add_u32_e32 v6, s3, v6
105 ; GFX9-NEXT: v_add_u32_e32 v7, s3, v7
106 ; GFX9-NEXT: s_and_b64 vcc, vcc, s[0:1]
107 ; GFX9-NEXT: s_add_i32 s3, s3, 1
108 ; GFX9-NEXT: v_mov_b32_e32 v4, s5
109 ; GFX9-NEXT: s_add_u32 s4, s4, 4
110 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc
111 ; GFX9-NEXT: s_addc_u32 s5, s5, 0
112 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[0:1]
113 ; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400
114 ; GFX9-NEXT: global_store_dword v[3:4], v5, off
115 ; GFX9-NEXT: s_cbranch_scc0 BB1_1
116 ; GFX9-NEXT: ; %bb.2: ; %bb2
117 ; GFX9-NEXT: s_endpgm
57118 bb:
58119 br label %bb3
59120
71132 br i1 %tmp8, label %bb2, label %bb3
72133 }
73134
74 ; GCN-LABEL: {{^}}sdiv32_invariant_denom:
75 ; GCN: v_cvt_f32_u32
76 ; GCN: v_rcp_iflag_f32
77 ; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x4f800000,
78 ; GCN: v_cvt_u32_f32_e32
79 ; GCN-DAG: v_mul_hi_u32
80 ; GCN-DAG: v_mul_lo_u32
81 ; GCN-DAG: v_sub_i32_e32
82 ; GCN-DAG: v_cmp_eq_u32_e64
83 ; GCN-DAG: v_cndmask_b32_e64
84 ; GCN-DAG: v_mul_hi_u32
85 ; GCN-DAG: v_add_i32_e32
86 ; GCN-DAG: v_subrev_i32_e32
87 ; GCN-DAG: v_cndmask_b32_e64
88 ; GCN: [[LOOP:BB[0-9_]+]]:
89 ; GCN-NOT: v_rcp
90 ; GCN: s_cbranch_scc0 [[LOOP]]
91 ; GCN: s_endpgm
92135 define amdgpu_kernel void @sdiv32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
136 ; GFX9-LABEL: sdiv32_invariant_denom:
137 ; GFX9: ; %bb.0: ; %bb
138 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
139 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24
140 ; GFX9-NEXT: s_mov_b32 s8, 0
141 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
142 ; GFX9-NEXT: s_ashr_i32 s4, s2, 31
143 ; GFX9-NEXT: s_add_i32 s2, s2, s4
144 ; GFX9-NEXT: s_xor_b32 s5, s2, s4
145 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s5
146 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
147 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
148 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
149 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, s5
150 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, s5
151 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v1
152 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
153 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
154 ; GFX9-NEXT: v_mul_hi_u32 v1, v1, v0
155 ; GFX9-NEXT: v_add_u32_e32 v2, v0, v1
156 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1
157 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
158 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
159 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
160 ; GFX9-NEXT: BB2_1: ; %bb3
161 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
162 ; GFX9-NEXT: v_mul_lo_u32 v5, s5, v2
163 ; GFX9-NEXT: v_add_u32_e32 v6, 1, v2
164 ; GFX9-NEXT: v_mov_b32_e32 v3, s6
165 ; GFX9-NEXT: v_add_u32_e32 v7, -1, v2
166 ; GFX9-NEXT: v_sub_u32_e32 v8, s8, v5
167 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s8, v5
168 ; GFX9-NEXT: v_cmp_le_u32_e64 s[2:3], s5, v8
169 ; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], s[0:1]
170 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, v6, s[2:3]
171 ; GFX9-NEXT: s_add_i32 s8, s8, 1
172 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[0:1]
173 ; GFX9-NEXT: v_mov_b32_e32 v4, s7
174 ; GFX9-NEXT: s_add_u32 s6, s6, 4
175 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v0
176 ; GFX9-NEXT: v_xor_b32_e32 v5, s4, v5
177 ; GFX9-NEXT: s_addc_u32 s7, s7, 0
178 ; GFX9-NEXT: v_subrev_u32_e32 v5, s4, v5
179 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
180 ; GFX9-NEXT: s_cmpk_eq_i32 s8, 0x400
181 ; GFX9-NEXT: global_store_dword v[3:4], v5, off
182 ; GFX9-NEXT: s_cbranch_scc0 BB2_1
183 ; GFX9-NEXT: ; %bb.2: ; %bb2
184 ; GFX9-NEXT: s_endpgm
93185 bb:
94186 br label %bb3
95187
107199 br i1 %tmp8, label %bb2, label %bb3
108200 }
109201
110 ; GCN-LABEL: {{^}}srem32_invariant_denom:
111 ; GCN: v_cvt_f32_u32
112 ; GCN: v_rcp_iflag_f32
113 ; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x4f800000,
114 ; GCN: v_cvt_u32_f32_e32
115 ; GCN-DAG: v_mul_hi_u32
116 ; GCN-DAG: v_mul_lo_u32
117 ; GCN-DAG: v_sub_i32_e32
118 ; GCN-DAG: v_cmp_eq_u32_e64
119 ; GCN-DAG: v_cndmask_b32_e64
120 ; GCN-DAG: v_mul_hi_u32
121 ; GCN-DAG: v_add_i32_e32
122 ; GCN-DAG: v_subrev_i32_e32
123 ; GCN-DAG: v_cndmask_b32_e64
124 ; GCN: [[LOOP:BB[0-9_]+]]:
125 ; GCN-NOT: v_rcp
126 ; GCN: s_cbranch_scc0 [[LOOP]]
127 ; GCN: s_endpgm
128202 define amdgpu_kernel void @srem32_invariant_denom(i32 addrspace(1)* nocapture %arg, i32 %arg1) {
203 ; GFX9-LABEL: srem32_invariant_denom:
204 ; GFX9: ; %bb.0: ; %bb
205 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
206 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
207 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
208 ; GFX9-NEXT: s_ashr_i32 s3, s2, 31
209 ; GFX9-NEXT: s_add_i32 s2, s2, s3
210 ; GFX9-NEXT: s_xor_b32 s2, s2, s3
211 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2
212 ; GFX9-NEXT: s_mov_b32 s3, 0
213 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
214 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f800000, v0
215 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
216 ; GFX9-NEXT: v_mul_lo_u32 v1, v0, s2
217 ; GFX9-NEXT: v_mul_hi_u32 v2, v0, s2
218 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v1
219 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
220 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
221 ; GFX9-NEXT: v_mul_hi_u32 v1, v1, v0
222 ; GFX9-NEXT: v_add_u32_e32 v2, v0, v1
223 ; GFX9-NEXT: v_sub_u32_e32 v0, v0, v1
224 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
225 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
226 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
227 ; GFX9-NEXT: BB3_1: ; %bb3
228 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
229 ; GFX9-NEXT: v_mul_lo_u32 v5, s2, v2
230 ; GFX9-NEXT: v_sub_u32_e32 v6, 1, v2
231 ; GFX9-NEXT: v_not_b32_e32 v7, v2
232 ; GFX9-NEXT: v_mul_lo_u32 v6, s2, v6
233 ; GFX9-NEXT: v_mul_lo_u32 v7, s2, v7
234 ; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v0
235 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
236 ; GFX9-NEXT: v_sub_u32_e32 v8, s3, v5
237 ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s2, v8
238 ; GFX9-NEXT: v_cmp_ge_u32_e64 s[0:1], s3, v5
239 ; GFX9-NEXT: v_mov_b32_e32 v3, s4
240 ; GFX9-NEXT: v_add_u32_e32 v6, s3, v6
241 ; GFX9-NEXT: v_add_u32_e32 v7, s3, v7
242 ; GFX9-NEXT: s_and_b64 vcc, vcc, s[0:1]
243 ; GFX9-NEXT: s_add_i32 s3, s3, 1
244 ; GFX9-NEXT: v_mov_b32_e32 v4, s5
245 ; GFX9-NEXT: s_add_u32 s4, s4, 4
246 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v8, v7, vcc
247 ; GFX9-NEXT: s_addc_u32 s5, s5, 0
248 ; GFX9-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[0:1]
249 ; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400
250 ; GFX9-NEXT: global_store_dword v[3:4], v5, off
251 ; GFX9-NEXT: s_cbranch_scc0 BB3_1
252 ; GFX9-NEXT: ; %bb.2: ; %bb2
253 ; GFX9-NEXT: s_endpgm
129254 bb:
130255 br label %bb3
131256
143268 br i1 %tmp8, label %bb2, label %bb3
144269 }
145270
146 ; GCN-LABEL: {{^}}udiv16_invariant_denom:
147 ; GCN: v_cvt_f32_u32
148 ; GCN: v_rcp_iflag_f32
149 ; GCN: [[LOOP:BB[0-9_]+]]:
150 ; GCN-NOT: v_rcp
151 ; GCN: s_cbranch_scc0 [[LOOP]]
152 ; GCN: s_endpgm
153271 define amdgpu_kernel void @udiv16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
272 ; GFX9-LABEL: udiv16_invariant_denom:
273 ; GFX9: ; %bb.0: ; %bb
274 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c
275 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
276 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
277 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
278 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
279 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
280 ; GFX9-NEXT: s_and_b32 s3, s3, s2
281 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
282 ; GFX9-NEXT: s_movk_i32 s3, 0x400
283 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
284 ; GFX9-NEXT: BB4_1: ; %bb3
285 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
286 ; GFX9-NEXT: v_and_b32_e32 v2, s2, v4
287 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v2
288 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3]
289 ; GFX9-NEXT: v_mov_b32_e32 v2, s5
290 ; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5
291 ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v2, v6, s[0:1]
292 ; GFX9-NEXT: v_mul_f32_e32 v2, v7, v1
293 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2
294 ; GFX9-NEXT: v_cvt_u32_f32_e32 v8, v2
295 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4
296 ; GFX9-NEXT: v_mad_f32 v2, -v2, v0, v7
297 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v2|, v0
298 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v4
299 ; GFX9-NEXT: v_addc_co_u32_e64 v2, s[0:1], 0, v8, s[0:1]
300 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc
301 ; GFX9-NEXT: global_store_short v[5:6], v2, off
302 ; GFX9-NEXT: s_cbranch_vccz BB4_1
303 ; GFX9-NEXT: ; %bb.2: ; %bb2
304 ; GFX9-NEXT: s_endpgm
154305 bb:
155306 br label %bb3
156307
168319 br i1 %tmp8, label %bb2, label %bb3
169320 }
170321
171 ; GCN-LABEL: {{^}}urem16_invariant_denom:
172 ; GCN: v_cvt_f32_u32
173 ; GCN: v_rcp_iflag_f32
174 ; GCN: [[LOOP:BB[0-9_]+]]:
175 ; GCN-NOT: v_rcp
176 ; GCN: s_cbranch_scc0 [[LOOP]]
177 ; GCN: s_endpgm
178322 define amdgpu_kernel void @urem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
323 ; GFX9-LABEL: urem16_invariant_denom:
324 ; GFX9: ; %bb.0: ; %bb
325 ; GFX9-NEXT: s_load_dword s3, s[0:1], 0x2c
326 ; GFX9-NEXT: s_mov_b32 s2, 0xffff
327 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
328 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
329 ; GFX9-NEXT: s_movk_i32 s6, 0x400
330 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
331 ; GFX9-NEXT: s_and_b32 s3, s3, s2
332 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
333 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
334 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
335 ; GFX9-NEXT: BB5_1: ; %bb3
336 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
337 ; GFX9-NEXT: v_and_b32_e32 v2, s2, v4
338 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v2
339 ; GFX9-NEXT: v_lshlrev_b64 v[5:6], 1, v[2:3]
340 ; GFX9-NEXT: v_mov_b32_e32 v8, s5
341 ; GFX9-NEXT: v_add_co_u32_e64 v5, s[0:1], s4, v5
342 ; GFX9-NEXT: v_addc_co_u32_e64 v6, s[0:1], v8, v6, s[0:1]
343 ; GFX9-NEXT: v_mul_f32_e32 v8, v7, v1
344 ; GFX9-NEXT: v_trunc_f32_e32 v8, v8
345 ; GFX9-NEXT: v_cvt_u32_f32_e32 v9, v8
346 ; GFX9-NEXT: v_mad_f32 v7, -v8, v0, v7
347 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v7|, v0
348 ; GFX9-NEXT: v_add_u16_e32 v4, 1, v4
349 ; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], 0, v9, s[0:1]
350 ; GFX9-NEXT: v_mul_lo_u32 v7, v7, s3
351 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s6, v4
352 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc
353 ; GFX9-NEXT: v_sub_u32_e32 v2, v2, v7
354 ; GFX9-NEXT: global_store_short v[5:6], v2, off
355 ; GFX9-NEXT: s_cbranch_vccz BB5_1
356 ; GFX9-NEXT: ; %bb.2: ; %bb2
357 ; GFX9-NEXT: s_endpgm
179358 bb:
180359 br label %bb3
181360
193372 br i1 %tmp8, label %bb2, label %bb3
194373 }
195374
196 ; GCN-LABEL: {{^}}sdiv16_invariant_denom:
197 ; GCN-DAG: s_sext_i32_i16
198 ; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0x7fffffff
199 ; GCN-DAG: v_cvt_f32_i32
200 ; GCN-DAG: v_rcp_iflag_f32
201 ; GCN: [[LOOP:BB[0-9_]+]]:
202 ; GCN-NOT: v_rcp
203 ; GCN: s_cbranch_scc0 [[LOOP]]
204 ; GCN: s_endpgm
205375 define amdgpu_kernel void @sdiv16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
376 ; GFX9-LABEL: sdiv16_invariant_denom:
377 ; GFX9: ; %bb.0: ; %bb
378 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
379 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
380 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
381 ; GFX9-NEXT: s_movk_i32 s3, 0x400
382 ; GFX9-NEXT: v_mov_b32_e32 v5, 0
383 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
384 ; GFX9-NEXT: s_sext_i32_i16 s2, s2
385 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
386 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
387 ; GFX9-NEXT: v_and_b32_e32 v2, 0x7fffffff, v0
388 ; GFX9-NEXT: BB6_1: ; %bb3
389 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
390 ; GFX9-NEXT: v_bfe_i32 v6, v5, 0, 16
391 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v5
392 ; GFX9-NEXT: v_cvt_f32_i32_e32 v10, v6
393 ; GFX9-NEXT: v_xor_b32_e32 v9, s2, v6
394 ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[3:4]
395 ; GFX9-NEXT: v_mov_b32_e32 v8, s5
396 ; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], s4, v6
397 ; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], v8, v7, s[0:1]
398 ; GFX9-NEXT: v_mul_f32_e32 v8, v10, v1
399 ; GFX9-NEXT: v_trunc_f32_e32 v8, v8
400 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 30, v9
401 ; GFX9-NEXT: v_cvt_i32_f32_e32 v9, v8
402 ; GFX9-NEXT: v_mad_f32 v8, -v8, v0, v10
403 ; GFX9-NEXT: v_add_u16_e32 v5, 1, v5
404 ; GFX9-NEXT: v_or_b32_e32 v3, 1, v3
405 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v8|, v2
406 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v5
407 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1]
408 ; GFX9-NEXT: v_add_u32_e32 v3, v9, v3
409 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc
410 ; GFX9-NEXT: global_store_short v[6:7], v3, off
411 ; GFX9-NEXT: s_cbranch_vccz BB6_1
412 ; GFX9-NEXT: ; %bb.2: ; %bb2
413 ; GFX9-NEXT: s_endpgm
206414 bb:
207415 br label %bb3
208416
220428 br i1 %tmp8, label %bb2, label %bb3
221429 }
222430
223 ; GCN-LABEL: {{^}}srem16_invariant_denom:
224 ; GCN-DAG: s_sext_i32_i16
225 ; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0x7fffffff
226 ; GCN-DAG: v_cvt_f32_i32
227 ; GCN-DAG: v_rcp_iflag_f32
228 ; GCN: [[LOOP:BB[0-9_]+]]:
229 ; GCN-NOT: v_rcp
230 ; GCN: s_cbranch_scc0 [[LOOP]]
231 ; GCN: s_endpgm
232431 define amdgpu_kernel void @srem16_invariant_denom(i16 addrspace(1)* nocapture %arg, i16 %arg1) {
432 ; GFX9-LABEL: srem16_invariant_denom:
433 ; GFX9: ; %bb.0: ; %bb
434 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c
435 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
436 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
437 ; GFX9-NEXT: s_movk_i32 s3, 0x400
438 ; GFX9-NEXT: v_mov_b32_e32 v5, 0
439 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
440 ; GFX9-NEXT: s_sext_i32_i16 s2, s2
441 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2
442 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
443 ; GFX9-NEXT: v_and_b32_e32 v2, 0x7fffffff, v0
444 ; GFX9-NEXT: BB7_1: ; %bb3
445 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
446 ; GFX9-NEXT: v_bfe_i32 v8, v5, 0, 16
447 ; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v5
448 ; GFX9-NEXT: v_cvt_f32_i32_e32 v11, v8
449 ; GFX9-NEXT: v_lshlrev_b64 v[6:7], 1, v[3:4]
450 ; GFX9-NEXT: v_mov_b32_e32 v9, s5
451 ; GFX9-NEXT: v_add_co_u32_e64 v6, s[0:1], s4, v6
452 ; GFX9-NEXT: v_addc_co_u32_e64 v7, s[0:1], v9, v7, s[0:1]
453 ; GFX9-NEXT: v_mul_f32_e32 v9, v11, v1
454 ; GFX9-NEXT: v_xor_b32_e32 v10, s2, v8
455 ; GFX9-NEXT: v_trunc_f32_e32 v9, v9
456 ; GFX9-NEXT: v_ashrrev_i32_e32 v3, 30, v10
457 ; GFX9-NEXT: v_cvt_i32_f32_e32 v10, v9
458 ; GFX9-NEXT: v_mad_f32 v9, -v9, v0, v11
459 ; GFX9-NEXT: v_or_b32_e32 v3, 1, v3
460 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[0:1], |v9|, v2
461 ; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[0:1]
462 ; GFX9-NEXT: v_add_u32_e32 v3, v10, v3
463 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2
464 ; GFX9-NEXT: v_add_u16_e32 v5, 1, v5
465 ; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, s3, v5
466 ; GFX9-NEXT: s_and_b64 vcc, exec, vcc
467 ; GFX9-NEXT: v_sub_u32_e32 v3, v8, v3
468 ; GFX9-NEXT: global_store_short v[6:7], v3, off
469 ; GFX9-NEXT: s_cbranch_vccz BB7_1
470 ; GFX9-NEXT: ; %bb.2: ; %bb2
471 ; GFX9-NEXT: s_endpgm
233472 bb:
234473 br label %bb3
235474