llvm.org GIT mirror llvm / b794d6e
AMDGPU: Do not combine loads/store across physreg defs Summary: Since this pass operates on machine SSA form, this should only really affect M0 in practice. Fixes various piglit variable-indexing/vs-varying-array-mat4-index-* Change-Id: Ib2a1dc3a8d7b08225a8da49a86f533faa0986aa8 Fixes: r317751 ("AMDGPU: Merge S_BUFFER_LOAD_DWORD_IMM into x2, x4") Reviewers: arsenm, mareko, rampitec Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits Differential Revision: https://reviews.llvm.org/D40343 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@325677 91177308-0d34-0410-b5e6-96231b3b80d8 Nicolai Haehnle 2 years ago
4 changed file(s) with 102 addition(s) and 16 deletion(s). Raw diff Collapse all Expand all
227227 return true;
228228 }
229229
230 static bool
231 hasPhysRegDef(MachineInstr &MI) {
232 for (const MachineOperand &Def : MI.defs()) {
233 if (Def.isReg() &&
234 TargetRegisterInfo::isPhysicalRegister(Def.getReg()))
235 return true;
236 }
237 return false;
238 }
239
230240 bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
231241 // XXX - Would the same offset be OK? Is there any reason this would happen or
232242 // be useful?
349359 return false;
350360 }
351361
362 if (hasPhysRegDef(*MBBI)) {
363 // We could re-order this instruction in theory, but it would require
364 // tracking physreg defs and uses. This should only affect M0 in
365 // practice.
366 return false;
367 }
368
352369 if (MBBI->mayLoadOrStore() &&
353370 (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
354371 !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))) {
436453 // down past this instruction.
437454 // check if we can move I across MBBI and if we can move all I's users
438455 if (!memAccessesCanBeReordered(*CI.I, *MBBI, TII, AA) ||
439 !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA))
456 !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, TII, AA) ||
457 hasPhysRegDef(*MBBI))
440458 break;
441459 }
442460 return false;
612612 ret void
613613 }
614614
; Check that the two LDS reads are NOT combined into a single ds_read2
; across the call: the CHECK lines pin the order ds_read_b32 /
; s_swappc_b64 / ds_read_b32, i.e. the second load must stay after the
; call (which defines physical registers) rather than be merged up.
; GCN-LABEL: ds_read_call_read:
; GCN: ds_read_b32
; GCN: s_swappc_b64
; GCN: ds_read_b32
define amdgpu_kernel void @ds_read_call_read(i32 addrspace(1)* %out, i32 addrspace(3)* %arg) {
  %x = call i32 @llvm.amdgcn.workitem.id.x()
  ; Two adjacent LDS dwords — a classic ds_read2_b32 merge candidate.
  %arrayidx0 = getelementptr i32, i32 addrspace(3)* %arg, i32 %x
  %arrayidx1 = getelementptr i32, i32 addrspace(3)* %arrayidx0, i32 1
  %v0 = load i32, i32 addrspace(3)* %arrayidx0, align 4
  ; The call sits between the two loads and must block the merge.
  call void @void_func_void()
  %v1 = load i32, i32 addrspace(3)* %arrayidx1, align 4
  %r = add i32 %v0, %v1
  store i32 %r, i32 addrspace(1)* %out, align 4
  ret void
}
630
631 declare void @void_func_void() #3
632
615633 declare i32 @llvm.amdgcn.workgroup.id.x() #1
616634 declare i32 @llvm.amdgcn.workgroup.id.y() #1
617635 declare i32 @llvm.amdgcn.workitem.id.x() #1
622640 attributes #0 = { nounwind }
623641 attributes #1 = { nounwind readnone speculatable }
624642 attributes #2 = { convergent nounwind }
643 attributes #3 = { nounwind noinline }
159159
160160 ; SI won't merge ds memory operations, because of the signed offset bug, so
161161 ; we only have check lines for VI.
162 ; VI-LABEL: v_interp_readnone:
163 ; VI: s_mov_b32 m0, 0
164 ; VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
165 ; VI-DAG: v_interp_mov_f32 v{{[0-9]+}}, p0, attr0.x{{$}}
166 ; VI: s_mov_b32 m0, -1{{$}}
167 ; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
define amdgpu_ps void @v_interp_readnone(float addrspace(3)* %lds) #0 {
bb:
  ; Two LDS stores 4 dwords apart with an interp (which reads M0) between
  ; them; the VI CHECK lines above expect them combined into one
  ; ds_write2_b32 ... offset1:4.
  store float 0.000000e+00, float addrspace(3)* %lds
  %tmp1 = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 0)
  %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
  store float 0.000000e+00, float addrspace(3)* %tmp2
  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
  ret void
}
162 ;
163 ; TODO: VI won't merge them either, because we are conservative about moving
164 ; instructions past changes to physregs.
165 ;
166 ; TODO-VI-LABEL: v_interp_readnone:
167 ; TODO-VI: s_mov_b32 m0, 0
168 ; TODO-VI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
169 ; TODO-VI-DAG: v_interp_mov_f32 v{{[0-9]+}}, p0, attr0.x{{$}}
170 ; TODO-VI: s_mov_b32 m0, -1{{$}}
171 ; TODO-VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
172 ;define amdgpu_ps void @v_interp_readnone(float addrspace(3)* %lds) #0 {
173 ;bb:
174 ; store float 0.000000e+00, float addrspace(3)* %lds
175 ; %tmp1 = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 0)
176 ; %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
177 ; store float 0.000000e+00, float addrspace(3)* %tmp2
178 ; call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
179 ; ret void
180 ;}
177181
178182 ; Test that v_interp_p1 uses different source and destination registers
179183 ; on 16 bank LDS chips.
229229 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) #0
230230 call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) #0
231231 ret void
232 }
233
234 ; GCN-LABEL: {{^}}smrd_imm_nomerge_m0:
235 ;
236 ; In principle we could merge the loads here as well, but it would require
237 ; careful tracking of physical registers since both v_interp* and v_movrel*
238 ; instructions (or gpr idx mode) use M0.
239 ;
240 ; GCN: s_buffer_load_dword
241 ; GCN: s_buffer_load_dword
; The two @llvm.SI.load.const calls must stay as two separate
; s_buffer_load_dword instructions (see the GCN CHECKs above): the
; interp intrinsics and the variable-index extractelements between them
; involve the physical register M0, which the load/store optimizer
; refuses to reorder across.
define amdgpu_ps float @smrd_imm_nomerge_m0(<4 x i32> inreg %desc, i32 inreg %prim, float %u, float %v) #0 {
main_body:
  %idx1.f = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 0)
  %idx1 = bitcast float %idx1.f to i32

  %v0.x1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 0, i32 0, i32 %prim)
  %v0.x = call nsz float @llvm.amdgcn.interp.p2(float %v0.x1, float %v, i32 0, i32 0, i32 %prim)
  %v0.y1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 0, i32 1, i32 %prim)
  %v0.y = call nsz float @llvm.amdgcn.interp.p2(float %v0.y1, float %v, i32 0, i32 1, i32 %prim)
  %v0.z1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 0, i32 2, i32 %prim)
  %v0.z = call nsz float @llvm.amdgcn.interp.p2(float %v0.z1, float %v, i32 0, i32 2, i32 %prim)
  %v0.tmp0 = insertelement <3 x float> undef, float %v0.x, i32 0
  %v0.tmp1 = insertelement <3 x float> %v0.tmp0, float %v0.y, i32 1
  %v0 = insertelement <3 x float> %v0.tmp1, float %v0.z, i32 2
  %a = extractelement <3 x float> %v0, i32 %idx1

  %v1.x1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 1, i32 0, i32 %prim)
  %v1.x = call nsz float @llvm.amdgcn.interp.p2(float %v1.x1, float %v, i32 1, i32 0, i32 %prim)
  %v1.y1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 1, i32 1, i32 %prim)
  %v1.y = call nsz float @llvm.amdgcn.interp.p2(float %v1.y1, float %v, i32 1, i32 1, i32 %prim)
  %v1.z1 = call nsz float @llvm.amdgcn.interp.p1(float %u, i32 1, i32 2, i32 %prim)
  %v1.z = call nsz float @llvm.amdgcn.interp.p2(float %v1.z1, float %v, i32 1, i32 2, i32 %prim)
  ; Build %v1 from the %v1.* interp results. (The previous version built
  ; it from %v0.x/%v0.y/%v0.z via %v0.tmp0/%v0.tmp1 — a copy-paste slip
  ; that made all six %v1.* interp calls dead and forced %b == %a.)
  %v1.tmp0 = insertelement <3 x float> undef, float %v1.x, i32 0
  %v1.tmp1 = insertelement <3 x float> %v1.tmp0, float %v1.y, i32 1
  %v1 = insertelement <3 x float> %v1.tmp1, float %v1.z, i32 2

  %b = extractelement <3 x float> %v1, i32 %idx1
  %c = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 4)

  %res.tmp = fadd float %a, %b
  %res = fadd float %res.tmp, %c
  ret float %res
}
233275
234276 ; GCN-LABEL: {{^}}smrd_vgpr_merged:
288330
289331 declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
290332 declare float @llvm.SI.load.const.v4i32(<4 x i32>, i32) #1
333 declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
334 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
291335
292336 attributes #0 = { nounwind }
293337 attributes #1 = { nounwind readnone }
338 attributes #2 = { nounwind readnone speculatable }
294339
295340 !0 = !{}