llvm.org GIT mirror llvm / ddb10d2
[AMDGPU] Optimize SI_IF lowering for simple if regions

Currently SI_IF results in an s_and_saveexec_b64 followed by s_xor_b64. The xor is used to extract only the changed bits. In the case of a simple if region, where the only use of that value is in the SI_END_CF that restores the old exec mask, we can omit the xor and instead or the exec mask with the original exec value saved by the s_and_saveexec_b64.

Differential Revision: https://reviews.llvm.org/D35861

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@309185 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Stanislav Mekhanoshin
14 changed files with 29 additions and 40 deletions.
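In codegen terms the change looks roughly like the sketch below. This is a minimal illustration, not output from this commit: the SGPR pair s[0:1] and the block layout are assumed for the example; only the instruction pattern reflects the lowering described above.

; Before: SI_IF keeps only the bits cleared from exec.
s_and_saveexec_b64 s[0:1], vcc   ; s[0:1] = old exec, exec &= vcc
s_xor_b64 s[0:1], exec, s[0:1]   ; s[0:1] = bits cleared from exec
; ... divergent "then" block ...
s_or_b64 exec, exec, s[0:1]      ; SI_END_CF: add the cleared bits back

; After, for a simple if whose saved value is only used by SI_END_CF:
s_and_saveexec_b64 s[0:1], vcc   ; s[0:1] = full original exec
; ... divergent "then" block ...
s_or_b64 exec, exec, s[0:1]      ; or with the full saved mask restores exec

Either way the final exec equals the mask before the if; the simple-if form just drops the s_xor_b64, which is why the tests below lose one s_xor_b64 check line each.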
@@ -148,9 +148,19 @@
   MachineOperand &ImpDefSCC = MI.getOperand(4);
   assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
 
+  // If there is only one use of save exec register and that use is SI_END_CF,
+  // we can optimize SI_IF by returning the full saved exec mask instead of
+  // just cleared bits.
+  bool SimpleIf = false;
+  auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
+  SimpleIf = U != MRI->use_instr_nodbg_end() &&
+             std::next(U) == MRI->use_instr_nodbg_end() &&
+             U->getOpcode() == AMDGPU::SI_END_CF;
+
   // Add an implicit def of exec to discourage scheduling VALU after this which
   // will interfere with trying to form s_and_saveexec_b64 later.
-  unsigned CopyReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  unsigned CopyReg = SimpleIf ? SaveExecReg
+                     : MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
   MachineInstr *CopyExec =
     BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
     .addReg(AMDGPU::EXEC)

@@ -165,11 +175,14 @@
     .addReg(Cond.getReg());
   setImpSCCDefDead(*And, true);
 
-  MachineInstr *Xor =
-    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
-    .addReg(Tmp)
-    .addReg(CopyReg);
-  setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
+  MachineInstr *Xor = nullptr;
+  if (!SimpleIf) {
+    Xor =
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
+      .addReg(Tmp)
+      .addReg(CopyReg);
+    setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
+  }
 
   // Use a copy that is a terminator to get correct spill code placement it with
   // fast regalloc.

@@ -193,7 +206,8 @@
   // register.
   LIS->ReplaceMachineInstrInMaps(MI, *And);
 
-  LIS->InsertMachineInstrInMaps(*Xor);
+  if (!SimpleIf)
+    LIS->InsertMachineInstrInMaps(*Xor);
   LIS->InsertMachineInstrInMaps(*SetExec);
   LIS->InsertMachineInstrInMaps(*NewBr);
 

@@ -206,7 +220,8 @@
   LIS->removeInterval(SaveExecReg);
   LIS->createAndComputeVirtRegInterval(SaveExecReg);
   LIS->createAndComputeVirtRegInterval(Tmp);
-  LIS->createAndComputeVirtRegInterval(CopyReg);
+  if (!SimpleIf)
+    LIS->createAndComputeVirtRegInterval(CopyReg);
 }
 
 void SILowerControlFlow::emitElse(MachineInstr &MI) {
@@ -13,7 +13,6 @@
 ; GCN-DAG: v_cmp_lt_f32_e32 vcc,
 ; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[OTHERCC]]
 ; GCN: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[AND]]
-; GCN: s_xor_b64 {{s\[[0-9]+:[0-9]+\]}}, exec, [[SAVED]]
 ; GCN: ; mask branch [[BB5:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %bb4

@@ -140,7 +140,6 @@
 ; GCN: buffer_load_dword
 ; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 ; GCN: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN: s_xor_b64 [[SAVE]], exec, [[SAVE]]
 
 ; GCN: v_nop_e64
 ; GCN: v_nop_e64

@@ -384,7 +383,6 @@
 ; GCN-LABEL: {{^}}uniform_inside_divergent:
 ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
 ; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN-NEXT: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
 ; GCN-NEXT: ; mask branch [[ENDIF:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: s_cbranch_execnz [[IF:BB[0-9]+_[0-9]+]]
 

@@ -435,7 +433,6 @@
 ; GCN-LABEL: {{^}}analyze_mask_branch:
 ; GCN: v_cmp_lt_f32_e32 vcc
 ; GCN-NEXT: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN-NEXT: s_xor_b64 [[MASK]], exec, [[MASK]]
 ; GCN-NEXT: ; mask branch [[RET:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: s_cbranch_execz [[BRANCH_SKIP:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: s_branch [[LOOP_BODY:BB[0-9]+_[0-9]+]]

@@ -19,7 +19,6 @@
 ; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], s{{[0-9]+}}, v0
 ; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec
 ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, [[CMP0]]
-; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}
 
 ; Spill saved exec
 ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]

@@ -100,7 +99,6 @@
 
 ; GCN: s_mov_b64 s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, exec
 ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, [[CMP0]]
-; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}
 
 ; Spill load
 ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
@@ -3,11 +3,9 @@
 ; SI-LABEL: {{^}}br_i1_phi:
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; SI: s_and_saveexec_b64
-; SI: s_xor_b64
 ; SI: v_mov_b32_e32 [[REG]], -1{{$}}
 ; SI: v_cmp_ne_u32_e32 vcc, 0, [[REG]]
 ; SI: s_and_saveexec_b64
-; SI: s_xor_b64
 ; SI: s_endpgm
 define amdgpu_kernel void @br_i1_phi(i32 %arg) {
 bb:

@@ -136,7 +136,6 @@
 ; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:
 ; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
-; SI: s_xor_b64 [[SAVE]], exec, [[SAVE]]
 
 ; SI: buffer_load_dword [[LOAD:v[0-9]+]]
 ; SI: v_cmp_ne_u32_e32 vcc, 0, [[LOAD]]

@@ -80,7 +80,6 @@
 ; GCN-NEXT: s_or_b64 exec, exec
 ; GCN: v_cmp_ne_u32_e32 vcc, 0
 ; GCN-NEXT: s_and_saveexec_b64
-; GCN-NEXT: s_xor_b64
 
 ; GCN: ; %exit0
 ; GCN: buffer_store_dword

@@ -10,7 +10,6 @@
 ; GCN-NEXT: ; %else
 
 ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
 ; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
 
 ; GCN: BB{{[0-9]+_[0-9]+}}: ; %unreachable.bb

@@ -59,7 +58,6 @@
 
 ; GCN: ; BB#{{[0-9]+}}: ; %else
 ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
 ; GCN-NEXT: ; mask branch [[FLOW1:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: ; %unreachable.bb
@@ -2,7 +2,6 @@
 ; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator:
 ; GCN: v_cmp_eq_u32
 ; GCN: s_and_saveexec_b64
-; GCN: s_xor_b64
 ; GCN: ; mask branch [[RET:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %unreachable

@@ -30,7 +29,6 @@
 ; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator_swap_block_order:
 ; GCN: v_cmp_ne_u32
 ; GCN: s_and_saveexec_b64
-; GCN: s_xor_b64
 ; GCN: ; mask branch [[RETURN:BB[0-9]+_[0-9]+]]
 
 ; GCN-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %unreachable

@@ -201,7 +201,6 @@
 ; CHECK-LABEL: {{^}}test_kill_divergent_loop:
 ; CHECK: v_cmp_eq_u32_e32 vcc, 0, v0
 ; CHECK-NEXT: s_and_saveexec_b64 [[SAVEEXEC:s\[[0-9]+:[0-9]+\]]], vcc
-; CHECK-NEXT: s_xor_b64 [[SAVEEXEC]], exec, [[SAVEEXEC]]
 ; CHECK-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
 ; CHECK-NEXT: s_cbranch_execz [[EXIT]]
 

@@ -336,7 +335,6 @@
 ; CHECK-LABEL: {{^}}if_after_kill_block:
 ; CHECK: ; BB#0:
 ; CHECK: s_and_saveexec_b64
-; CHECK: s_xor_b64
 ; CHECK-NEXT: mask branch [[BB4:BB[0-9]+_[0-9]+]]
 
 ; CHECK: v_cmpx_le_f32_e32 vcc, 0,

@@ -346,7 +344,6 @@
 
 ; CHECK: v_cmp_neq_f32_e32 vcc, 0,
 ; CHECK: s_and_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc
-; CHECK: s_xor_b64 s{{\[[0-9]+:[0-9]+\]}}, exec
 ; CHECK: mask branch [[END:BB[0-9]+_[0-9]+]]
 ; CHECK-NOT: branch
 

@@ -9,7 +9,6 @@
 ; CHECK: v_mbcnt_lo_u32_b32_e64
 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
 ; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc
-; CHECK-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
 ; BB0_1:
 ; CHECK: s_load_dword s0, s[0:1], 0xa
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
@@ -302,7 +302,6 @@
 ; GCN-LABEL: {{^}}uniform_inside_divergent:
 ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
 ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
 ; GCN: s_cmp_lg_u32 {{s[0-9]+}}, 0
 ; GCN: s_cbranch_scc0 [[IF_UNIFORM_LABEL:[A-Z0-9_a-z]+]]
 ; GCN: s_endpgm

@@ -334,7 +333,6 @@
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
 ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[ONE]]
 define amdgpu_kernel void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) {

@@ -359,7 +357,6 @@
 ; GCN-LABEL: {{^}}divergent_if_uniform_if:
 ; GCN: v_cmp_eq_u32_e32 vcc, 0, v0
 ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
-; GCN: s_xor_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[ONE]]
 ; GCN: s_or_b64 exec, exec, [[MASK]]

@@ -4,7 +4,6 @@
 ; CHECK-LABEL: {{^}}test1:
 ; CHECK: v_cmp_ne_u32_e32 vcc, 0
 ; CHECK: s_and_saveexec_b64
-; CHECK-NEXT: s_xor_b64
 ; CHECK-NEXT: ; mask branch
 ; CHECK-NEXT: s_cbranch_execz BB{{[0-9]+_[0-9]+}}
 ; CHECK-NEXT: BB{{[0-9]+_[0-9]+}}: ; %loop_body.preheader

@@ -34,7 +33,6 @@
 
 ; CHECK-LABEL: {{^}}test2:
 ; CHECK: s_and_saveexec_b64
-; CHECK-NEXT: s_xor_b64
 ; CHECK-NEXT: ; mask branch
 ; CHECK-NEXT: s_cbranch_execz
 define amdgpu_kernel void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) {
@@ -17,7 +17,6 @@
 ; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
 ; SI: v_mov_b32_e32 v{{[0-9]}}, -1
 ; SI: s_and_saveexec_b64
-; SI-NEXT: s_xor_b64
 ; SI-NEXT: ; mask branch
 
 ; v_mov should be after exec modification

@@ -65,8 +64,7 @@
 ; SI-LABEL: {{^}}simple_test_v_if:
 ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
-; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
-; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
+; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
 
 ; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
 ; SI: buffer_store_dword

@@ -93,8 +91,7 @@
 ; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret:
 ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
-; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
-; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
+; SI-NEXT: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
 
 ; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
 ; SI: buffer_store_dword

@@ -159,8 +156,8 @@
 ; SI-LABEL: {{^}}simple_test_v_loop:
 ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
-; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
-; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
+; SI-NEXT: ; mask branch
+; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
 
 ; SI: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
 

@@ -201,8 +198,8 @@
 ; SI: buffer_load_dword [[VBOUND:v[0-9]+]]
 ; SI: v_cmp_lt_i32_e32 vcc
 ; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
-; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]]
-; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
+; SI-NEXT: ; mask branch
+; SI-NEXT: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
 
 ; Initialize inner condition to false
 ; SI: BB{{[0-9]+_[0-9]+}}: ; %bb10.preheader