llvm.org GIT mirror llvm / d404fac
Merging r277500: ------------------------------------------------------------------------ r277500 | nha | 2016-08-02 12:17:37 -0700 (Tue, 02 Aug 2016) | 17 lines AMDGPU: Track physical registers in SIWholeQuadMode Summary: There are cases where uniform branch conditions are computed in VGPRs, and we didn't correctly mark those as WQM. The stray change in basic-branch.ll is because invoking the LiveIntervals analysis leads to the detection of a dead register that would otherwise not be seen at -O0. This is a candidate for the 3.9 branch, as it fixes a possible hang. Reviewers: arsenm, tstellarAMD, mareko Subscribers: arsenm, llvm-commits, kzhuravl Differential Revision: https://reviews.llvm.org/D22673 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_39@277619 91177308-0d34-0410-b5e6-96231b3b80d8 Hans Wennborg 4 years ago
3 changed file(s) with 94 addition(s) and 30 deletion(s). Raw diff Collapse all Expand all
9393 const SIInstrInfo *TII;
9494 const SIRegisterInfo *TRI;
9595 MachineRegisterInfo *MRI;
96 LiveIntervals *LIS;
9697
9798 DenseMap Instructions;
9899 DenseMap Blocks;
99100 SmallVector ExecExports;
100101 SmallVector LiveMaskQueries;
101102
103 void markInstruction(MachineInstr &MI, char Flag,
104 std::vector &Worklist);
102105 char scanInstructions(MachineFunction &MF, std::vector &Worklist);
103106 void propagateInstruction(MachineInstr &MI, std::vector &Worklist);
104107 void propagateBlock(MachineBasicBlock &MBB, std::vector &Worklist);
125128 }
126129
127130 void getAnalysisUsage(AnalysisUsage &AU) const override {
131 AU.addRequired<LiveIntervals>();
128132 AU.setPreservesCFG();
129133 MachineFunctionPass::getAnalysisUsage(AU);
130134 }
134138
135139 char SIWholeQuadMode::ID = 0;
136140
137 INITIALIZE_PASS(SIWholeQuadMode, DEBUG_TYPE,
138 "SI Whole Quad Mode", false, false)
141 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
142 false)
143 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
144 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
145 false)
139146
140147 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
141148
142149 FunctionPass *llvm::createSIWholeQuadModePass() {
143150 return new SIWholeQuadMode;
151 }
152
153 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
154 std::vector &Worklist) {
155 InstrInfo &II = Instructions[&MI];
156
157 assert(Flag == StateWQM || Flag == StateExact);
158
159 // Ignore if the instruction is already marked. The typical case is that we
160 // mark an instruction WQM multiple times, but for atomics it can happen that
161 // Flag is StateWQM, but Needs is already set to StateExact. In this case,
162 // letting the atomic run in StateExact is correct as per the relevant specs.
163 if (II.Needs)
164 return;
165
166 II.Needs = Flag;
167 Worklist.push_back(&MI);
144168 }
145169
146170 // Scan instructions to determine which ones require an Exact execmask and
191215 continue;
192216 }
193217
194 Instructions[&MI].Needs = Flags;
195 Worklist.push_back(&MI);
218 markInstruction(MI, Flags, Worklist);
196219 GlobalFlags |= Flags;
197220 }
198221
248271 if (!Use.isReg() || !Use.isUse())
249272 continue;
250273
251 // At this point, physical registers appear as inputs or outputs
252 // and following them makes no sense (and would in fact be incorrect
253 // when the same VGPR is used as both an output and an input that leads
254 // to a NeedsWQM instruction).
255 //
256 // Note: VCC appears e.g. in 64-bit addition with carry - theoretically we
257 // have to trace this, in practice it happens for 64-bit computations like
258 // pointers where both dwords are followed already anyway.
259 if (!TargetRegisterInfo::isVirtualRegister(Use.getReg()))
274 unsigned Reg = Use.getReg();
275
276 // Handle physical registers that we need to track; this is mostly relevant
277 // for VCC, which can appear as the (implicit) input of a uniform branch,
278 // e.g. when a loop counter is stored in a VGPR.
279 if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
280 if (Reg == AMDGPU::EXEC)
281 continue;
282
283 for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
284 LiveRange &LR = LIS->getRegUnit(*RegUnit);
285 const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
286 if (!Value)
287 continue;
288
289 // Since we're in machine SSA, we do not need to track physical
290 // registers across basic blocks.
291 if (Value->isPHIDef())
292 continue;
293
294 markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,
295 Worklist);
296 }
297
260298 continue;
261
262 for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) {
263 InstrInfo &DefII = Instructions[&DefMI];
264
265 // Obviously skip if DefMI is already flagged as NeedWQM.
266 //
267 // The instruction might also be flagged as NeedExact. This happens when
268 // the result of an atomic is used in a WQM computation. In this case,
269 // the atomic must not run for helper pixels and the WQM result is
270 // undefined.
271 if (DefII.Needs != 0)
272 continue;
273
274 DefII.Needs = StateWQM;
275 Worklist.push_back(&DefMI);
276 }
299 }
300
301 for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
302 markInstruction(DefMI, StateWQM, Worklist);
277303 }
278304 }
279305
467493 TII = ST.getInstrInfo();
468494 TRI = &TII->getRegisterInfo();
469495 MRI = &MF.getRegInfo();
496 LIS = &getAnalysis<LiveIntervals>();
470497
471498 char GlobalFlags = analyzeFunction(MF);
472499 if (!(GlobalFlags & StateWQM)) {
33 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s
44
55 ; GCN-LABEL: {{^}}test_branch:
6 ; GCNNOOPT: v_writelane_b32
76 ; GCNNOOPT: v_writelane_b32
87 ; GCNNOOPT: v_writelane_b32
98 ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
349349 ret float %s
350350 }
351351
352 ; CHECK-LABEL: {{^}}test_loop_vcc:
353 ; CHECK-NEXT: ; %entry
354 ; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
355 ; CHECK: s_wqm_b64 exec, exec
356 ; CHECK: s_and_b64 exec, exec, [[LIVE]]
357 ; CHECK: image_store
358 ; CHECK: s_wqm_b64 exec, exec
359 ; CHECK: v_mov_b32_e32 [[CTR:v[0-9]+]], -2
360 ; CHECK: s_branch [[LOOPHDR:BB[0-9]+_[0-9]+]]
361
362 ; CHECK: [[LOOPHDR]]: ; %loop
363 ; CHECK: v_add_i32_e32 [[CTR]], vcc, 2, [[CTR]]
364 ; CHECK: v_cmp_lt_i32_e32 vcc, 7, [[CTR]]
365 ; CHECK: s_cbranch_vccz
366 ; CHECK: ; %break
367
368 ; CHECK: ; return
369 define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
370 entry:
371 call void @llvm.amdgcn.image.store.v4i32(<4 x float> %in, <4 x i32> undef, <8 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0)
372 br label %loop
373
374 loop:
375 %ctr.iv = phi i32 [ 0, %entry ], [ %ctr.next, %body ]
376 %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
377 %cc = icmp sgt i32 %ctr.iv, 7
378 br i1 %cc, label %break, label %body
379
380 body:
381 %c.i = bitcast <4 x float> %c.iv to <4 x i32>
382 %c.next = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
383 %ctr.next = add i32 %ctr.iv, 2
384 br label %loop
385
386 break:
387 ret <4 x float> %c.iv
388 }
389
352390 declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
353391
354392 declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2