llvm.org GIT mirror llvm / c300b1a
[AMDGPU] Implement llvm.amdgcn.set.inactive intrinsic Summary: This intrinsic lets us set inactive lanes to an identity value when implementing wavefront reductions. In combination with Whole Wavefront Mode, it lets inactive lanes be skipped over as required by GLSL/Vulkan. Lowering the intrinsic needs to happen post-RA so that RA knows that the destination isn't completely overwritten due to the EXEC shenanigans, so we need another pseudo-instruction to represent the un-lowered intrinsic. Reviewers: tstellar, arsenm Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D34719 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@310088 91177308-0d34-0410-b5e6-96231b3b80d8 Connor Abbott 2 years ago
6 changed file(s) with 150 addition(s) and 3 deletion(s). Raw diff Collapse all Expand all
755755 [LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
756756 >;
757757
// Given a value, copies it while setting all the inactive lanes to a given
// value. Note that OpenGL helper lanes are considered active, so if the
// program ever uses WQM, then the instruction and the first source will be
// computed in WQM.
// Marked IntrConvergent because the result depends on which lanes are
// currently inactive, i.e. on the EXEC mask at the call site.
def int_amdgcn_set_inactive :
  Intrinsic<[llvm_anyint_ty],
            [LLVMMatchType<0>, // value to be copied
             LLVMMatchType<0>], // value for the inactive lanes to take
            [IntrNoMem, IntrConvergent]>;
767
758768 //===----------------------------------------------------------------------===//
759769 // CI+ Intrinsics
760770 //===----------------------------------------------------------------------===//
10951095 .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
10961096 .addReg(Dst, RegState::Implicit | RegState::Define);
10971097 }
1098 MI.eraseFromParent();
1099 break;
1100 }
  case AMDGPU::V_SET_INACTIVE_B32: {
    // Temporarily invert EXEC so that only the previously *inactive* lanes
    // execute, write the inactive-lane value (operand 2) into the
    // destination for those lanes, then invert EXEC back. The active lanes
    // keep the tied source value (the pseudo has a "$src = $vdst"
    // constraint), so no move is needed for them.
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
      .addReg(AMDGPU::EXEC);
    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), MI.getOperand(0).getReg())
      .add(MI.getOperand(2));
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
      .addReg(AMDGPU::EXEC);
    MI.eraseFromParent();
    break;
  }
  case AMDGPU::V_SET_INACTIVE_B64: {
    // 64-bit variant of V_SET_INACTIVE_B32: invert EXEC, write the
    // inactive-lane value, invert EXEC back. The move is itself a pseudo
    // (V_MOV_B64_PSEUDO), so expand it recursively right away — we are
    // already inside post-RA pseudo expansion and it must not survive.
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
      .addReg(AMDGPU::EXEC);
    MachineInstr *Copy = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO),
                                 MI.getOperand(0).getReg())
      .add(MI.getOperand(2));
    expandPostRAPseudo(*Copy);
    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOT_B64), AMDGPU::EXEC)
      .addReg(AMDGPU::EXEC);
    MI.eraseFromParent();
    break;
  }
134134 let hasSideEffects = 0;
135135 let mayLoad = 0;
136136 let mayStore = 0;
137 }
138
// Pseudos selected from int_amdgcn_set_inactive. Lowered post-RA by
// inverting the exec mask, overwriting the (now-enabled) inactive lanes of
// $vdst with $inactive, and restoring exec afterwards. The "$src = $vdst"
// tie tells the register allocator that the active lanes of the destination
// are NOT fully overwritten, which is why lowering must wait until post-RA.
def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
  (ins VGPR_32: $src, VSrc_b32:$inactive),
  [(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
  let Constraints = "$src = $vdst";
}

def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
  (ins VReg_64: $src, VSrc_b64:$inactive),
  [(set i64:$vdst, (int_amdgcn_set_inactive i64:$src, i64:$inactive))]> {
  let Constraints = "$src = $vdst";
}
138152
139153 let usesCustomInserter = 1, SALU = 1 in {
                                std::vector<WorkItem> &Worklist) {
303303 char GlobalFlags = 0;
304304 bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
305306
306307 // We need to visit the basic blocks in reverse post-order so that we visit
307308 // defs before uses, in particular so that we don't accidentally mark an
340341 GlobalFlags |= StateWWM;
341342 LowerToCopyInstrs.push_back(&MI);
342343 continue;
344 } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
345 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
346 III.Disabled = StateWWM;
347 MachineOperand &Inactive = MI.getOperand(2);
348 if (Inactive.isReg()) {
349 if (Inactive.isUndef()) {
350 LowerToCopyInstrs.push_back(&MI);
351 } else {
352 unsigned Reg = Inactive.getReg();
353 if (TargetRegisterInfo::isVirtualRegister(Reg)) {
354 for (MachineInstr &DefMI : MRI->def_instructions(Reg))
355 markInstruction(DefMI, StateWWM, Worklist);
356 }
357 }
358 }
359 SetInactiveInstrs.push_back(&MI);
360 continue;
343361 } else if (TII->isDisableWQM(MI)) {
344362 BBI.Needs |= StateExact;
345363 if (!(BBI.InNeeds & StateExact)) {
379397 }
380398 }
381399
  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
403 if (GlobalFlags & StateWQM) {
404 for (MachineInstr *MI : SetInactiveInstrs)
405 markInstruction(*MI, StateWQM, Worklist);
406 }
407
382408 return GlobalFlags;
383409 }
384410
798824 }
799825
800826 void SIWholeQuadMode::lowerCopyInstrs() {
801 for (MachineInstr *MI : LowerToCopyInstrs)
827 for (MachineInstr *MI : LowerToCopyInstrs) {
828 for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
829 MI->RemoveOperand(i);
802830 MI->setDesc(TII->get(AMDGPU::COPY));
831 }
803832 }
804833
805834 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s

; Tests for @llvm.amdgcn.set.inactive lowering: the inactive-lane value is
; written while EXEC is temporarily inverted (s_not_b64 exec, exec).

; GCN-LABEL: {{^}}set_inactive:
; GCN: s_not_b64 exec, exec
; GCN: v_mov_b32_e32 {{v[0-9]+}}, 42
; GCN: s_not_b64 exec, exec
define amdgpu_kernel void @set_inactive(i32 addrspace(1)* %out, i32 %in) {
  %tmp = call i32 @llvm.amdgcn.set.inactive.i32(i32 %in, i32 42) #0
  store i32 %tmp, i32 addrspace(1)* %out
  ret void
}

; The 64-bit variant writes both halves (two v_mov_b32) under inverted EXEC.
; GCN-LABEL: {{^}}set_inactive_64:
; GCN: s_not_b64 exec, exec
; GCN: v_mov_b32_e32 {{v[0-9]+}}, 0
; GCN: v_mov_b32_e32 {{v[0-9]+}}, 0
; GCN: s_not_b64 exec, exec
define amdgpu_kernel void @set_inactive_64(i64 addrspace(1)* %out, i64 %in) {
  %tmp = call i64 @llvm.amdgcn.set.inactive.i64(i64 %in, i64 0) #0
  store i64 %tmp, i64 addrspace(1)* %out
  ret void
}

declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64) #0

attributes #0 = { convergent readnone }
253253 endif:
254254 %out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
255255 ret float %out.1
256 }
257
; Check that @llvm.amdgcn.set.inactive disables WWM.
; (The set.inactive expansion runs in Exact mode — plain s_not_b64 around the
; v_mov — while the add feeding @llvm.amdgcn.wwm runs under s_or_saveexec.)
;
;CHECK-LABEL: {{^}}test_set_inactive1:
;CHECK: buffer_load_dword
;CHECK: s_not_b64 exec, exec
;CHECK: v_mov_b32_e32
;CHECK: s_not_b64 exec, exec
;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
;CHECK: v_add_i32_e32
define amdgpu_ps void @test_set_inactive1(i32 inreg %idx) {
main_body:
  %src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
  %src.0 = bitcast float %src to i32
  %src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
  %out = add i32 %src.1, %src.1
  %out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
  %out.1 = bitcast i32 %out.0 to float
  call void @llvm.amdgcn.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
  ret void
}
278
; Check that enabling WQM anywhere enables WQM for the set.inactive source.
; (Because the inactive operand is undef, the call lowers to a plain copy,
; and the WQM needed by @llvm.amdgcn.wqm must cover both loads.)
;
;CHECK-LABEL: {{^}}test_set_inactive2:
;CHECK: s_wqm_b64 exec, exec
;CHECK: buffer_load_dword
;CHECK: buffer_load_dword
define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
main_body:
  %src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
  %src1.0 = bitcast float %src1 to i32
  %src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef)
  %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
  %src0.0 = bitcast float %src0 to i32
  %src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0)
  %out = add i32 %src0.1, %src1.1
  %out.0 = bitcast i32 %out to float
  call void @llvm.amdgcn.buffer.store.f32(float %out.0, <4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
  ret void
}
257298
258299 ; Check a case of one branch of an if-else requiring WQM, the other requiring
512553 ; CHECK: s_wqm_b64 exec, exec
513554 ; CHECK: v_add_f32_e32 v0,
514555 ; CHECK: s_and_b64 exec, exec, [[ORIG]]
; Flattened diff residue kept both the old (#4) and new (#5) define lines;
; this is the post-change version (the commit renumbered attribute groups).
define amdgpu_ps float @test_prolog_1(float %a, float %b) #5 {
main_body:
  %s = fadd float %a, %b
  ret float %s
}
declare i32 @llvm.amdgcn.wqm.i32(i32) #3
declare float @llvm.amdgcn.wwm.f32(float) #3
declare i32 @llvm.amdgcn.wwm.i32(i32) #3
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3

attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }
; set.inactive must be convergent: its result depends on the EXEC mask.
attributes #4 = { nounwind readnone convergent }
attributes #5 = { "amdgpu-ps-wqm-outputs" }