llvm.org GIT mirror llvm / 07b40ea
[SDA] Don't stop divergence propagation at the IPD. Summary: This fixes B42473 and B42706. This patch makes the SDA propagate branch divergence until the end of the RPO traversal. Before, the SyncDependenceAnalysis propagated divergence only until the IPD in RPO order. RPO is incompatible with post dominance in the presence of loops. This made the SDA crash because blocks were missed in the propagation. Reviewers: foad, nhaehnle Reviewed By: foad Subscribers: jvesely, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D65274 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@372223 91177308-0d34-0410-b5e6-96231b3b80d8 Jay Foad 1 year, 1 month ago
2 changed file(s) with 138 addition(s) and 36 deletion(s). Raw diff Collapse all Expand all
217217 template
218218 std::unique_ptr
219219 computeJoinPoints(const BasicBlock &RootBlock,
220 SuccessorIterable NodeSuccessors, const Loop *ParentLoop, const BasicBlock * PdBoundBlock) {
220 SuccessorIterable NodeSuccessors, const Loop *ParentLoop) {
221221 assert(JoinBlocks);
222
223 LLVM_DEBUG(dbgs() << "SDA:computeJoinPoints. Parent loop: " << (ParentLoop ? ParentLoop->getName() : "") << "\n" );
222224
223225 // bootstrap with branch targets
224226 for (const auto *SuccBlock : NodeSuccessors) {
227229 if (ParentLoop && !ParentLoop->contains(SuccBlock)) {
228230 // immediate loop exit from node.
229231 ReachedLoopExits.insert(SuccBlock);
230 continue;
231232 } else {
232233 // regular successor
233234 PendingUpdates.insert(SuccBlock);
234235 }
235236 }
236237
238 LLVM_DEBUG(
239 dbgs() << "SDA: rpo order:\n";
240 for (const auto * RpoBlock : FuncRPOT) {
241 dbgs() << "- " << RpoBlock->getName() << "\n";
242 }
243 );
244
237245 auto ItBeginRPO = FuncRPOT.begin();
238246
239247 // skip until term (TODO RPOT won't let us start at @term directly)
244252
245253 // propagate definitions at the immediate successors of the node in RPO
246254 auto ItBlockRPO = ItBeginRPO;
247 while (++ItBlockRPO != ItEndRPO && *ItBlockRPO != PdBoundBlock) {
255 while ((++ItBlockRPO != ItEndRPO) &&
256 !PendingUpdates.empty()) {
248257 const auto *Block = *ItBlockRPO;
249
250 // skip @block if not pending update
258 LLVM_DEBUG(dbgs() << "SDA::joins. visiting " << Block->getName() << "\n");
259
260 // skip Block if not pending update
251261 auto ItPending = PendingUpdates.find(Block);
252262 if (ItPending == PendingUpdates.end())
253263 continue;
254264 PendingUpdates.erase(ItPending);
255265
256 // propagate definition at @block to its successors
266 // propagate definition at Block to its successors
257267 auto ItDef = DefMap.find(Block);
258268 const auto *DefBlock = ItDef->second;
259269 assert(DefBlock);
276286 }
277287 }
278288 }
289
290 LLVM_DEBUG(dbgs() << "SDA::joins. After propagation:\n"; printDefs(dbgs()));
279291
280292 // We need to know the definition at the parent loop header to decide
281293 // whether the definition at the header is different from the definition at
291303 // |
292304 // proper exit from both loops
293305 //
294 // D post-dominates B as it is the only proper exit from the "A loop".
295 // If C has a divergent branch, propagation will therefore stop at D.
296 // That implies that B will never receive a definition.
297 // But that definition can only be the same as at D (D itself in this case)
298 // because all paths to anywhere have to pass through D.
299 //
300 const BasicBlock *ParentLoopHeader =
301 ParentLoop ? ParentLoop->getHeader() : nullptr;
302 if (ParentLoop && ParentLoop->contains(PdBoundBlock)) {
303 DefMap[ParentLoopHeader] = DefMap[PdBoundBlock];
304 }
305
306306 // analyze reached loop exits
307307 if (!ReachedLoopExits.empty()) {
308 const BasicBlock *ParentLoopHeader =
309 ParentLoop ? ParentLoop->getHeader() : nullptr;
310
308311 assert(ParentLoop);
309 const auto *HeaderDefBlock = DefMap[ParentLoopHeader];
312 auto ItHeaderDef = DefMap.find(ParentLoopHeader);
313 const auto *HeaderDefBlock = (ItHeaderDef == DefMap.end()) ? nullptr : ItHeaderDef->second;
314
310315 LLVM_DEBUG(printDefs(dbgs()));
311 assert(HeaderDefBlock && "no definition in header of carrying loop");
316 assert(HeaderDefBlock && "no definition at header of carrying loop");
312317
313318 for (const auto *ExitBlock : ReachedLoopExits) {
314319 auto ItExitDef = DefMap.find(ExitBlock);
338343 return *ItCached->second;
339344 }
340345
341 // don't propagate beyond the immediate post dominator of the loop
342 const auto *PdNode = PDT.getNode(const_cast(Loop.getHeader()));
343 const auto *IpdNode = PdNode->getIDom();
344 const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
345 while (PdBoundBlock && Loop.contains(PdBoundBlock)) {
346 IpdNode = IpdNode->getIDom();
347 PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
348 }
349
350346 // compute all join points
351347 DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI};
352348 auto JoinBlocks = Propagator.computeJoinPoints(
353 *Loop.getHeader(), LoopExits, Loop.getParentLoop(), PdBoundBlock);
349 *Loop.getHeader(), LoopExits, Loop.getParentLoop());
354350
355351 auto ItInserted = CachedLoopExitJoins.emplace(&Loop, std::move(JoinBlocks));
356352 assert(ItInserted.second);
369365 if (ItCached != CachedBranchJoins.end())
370366 return *ItCached->second;
371367
372 // don't propagate beyond the immediate post dominator of the branch
373 const auto *PdNode = PDT.getNode(const_cast(Term.getParent()));
374 const auto *IpdNode = PdNode->getIDom();
375 const auto *PdBoundBlock = IpdNode ? IpdNode->getBlock() : nullptr;
376
377368 // compute all join points
378369 DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI};
379370 const auto &TermBlock = *Term.getParent();
380371 auto JoinBlocks = Propagator.computeJoinPoints(
381 TermBlock, successors(Term.getParent()), LI.getLoopFor(&TermBlock), PdBoundBlock);
372 TermBlock, successors(Term.getParent()), LI.getLoopFor(&TermBlock));
382373
383374 auto ItInserted = CachedBranchJoins.emplace(&Term, std::move(JoinBlocks));
384375 assert(ItInserted.second);
0 ; RUN: opt -mtriple amdgcn-unknown-amdhsa -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
1
2 declare i32 @gf2(i32)
3 declare i32 @gf1(i32)
4
5 define void @tw1(i32 addrspace(4)* noalias nocapture readonly %A, i32 addrspace(4)* noalias nocapture %B) local_unnamed_addr #2 {
6 ; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'tw1':
7 ; CHECK: DIVERGENT: i32 addrspace(4)* %A
8 ; CHECK: DIVERGENT: i32 addrspace(4)* %B
9 entry:
10 ; CHECK: DIVERGENT: %call = tail call i32 @gf2(i32 0) #0
11 ; CHECK: DIVERGENT: %cmp = icmp ult i32 %call, 16
12 ; CHECK: DIVERGENT: br i1 %cmp, label %if.then, label %new_exit
13 %call = tail call i32 @gf2(i32 0) #3
14 %cmp = icmp ult i32 %call, 16
15 br i1 %cmp, label %if.then, label %new_exit
16
17 if.then:
18 ; CHECK: DIVERGENT: %call1 = tail call i32 @gf1(i32 0) #0
19 ; CHECK: DIVERGENT: %arrayidx = getelementptr inbounds i32, i32 addrspace(4)* %A, i32 %call1
20 ; CHECK: DIVERGENT: %0 = load i32, i32 addrspace(4)* %arrayidx, align 4
21 ; CHECK: DIVERGENT: %cmp225 = icmp sgt i32 %0, 0
22 ; CHECK: DIVERGENT: %arrayidx10 = getelementptr inbounds i32, i32 addrspace(4)* %B, i32 %call1
23 ; CHECK: DIVERGENT: br i1 %cmp225, label %while.body.preheader, label %if.then.while.end_crit_edge
24 %call1 = tail call i32 @gf1(i32 0) #4
25 %arrayidx = getelementptr inbounds i32, i32 addrspace(4)* %A, i32 %call1
26 %0 = load i32, i32 addrspace(4)* %arrayidx, align 4
27 %cmp225 = icmp sgt i32 %0, 0
28 %arrayidx10 = getelementptr inbounds i32, i32 addrspace(4)* %B, i32 %call1
29 br i1 %cmp225, label %while.body.preheader, label %if.then.while.end_crit_edge
30
31 while.body.preheader:
32 br label %while.body
33
34 if.then.while.end_crit_edge:
35 ; CHECK: DIVERGENT: %.pre = load i32, i32 addrspace(4)* %arrayidx10, align 4
36 %.pre = load i32, i32 addrspace(4)* %arrayidx10, align 4
37 br label %while.end
38
39 while.body:
40 ; CHECK-NOT: DIVERGENT: %i.026 = phi i32 [ %inc, %if.end.while.body_crit_edge ], [ 0, %while.body.preheader ]
41 ; CHECK: DIVERGENT: %call3 = tail call i32 @gf1(i32 0) #0
42 ; CHECK: DIVERGENT: %cmp4 = icmp ult i32 %call3, 10
43 ; CHECK: DIVERGENT: %arrayidx6 = getelementptr inbounds i32, i32 addrspace(4)* %A, i32 %i.026
44 ; CHECK: DIVERGENT: %1 = load i32, i32 addrspace(4)* %arrayidx6, align 4
45 ; CHECK: DIVERGENT: br i1 %cmp4, label %if.then5, label %if.else
46 %i.026 = phi i32 [ %inc, %if.end.while.body_crit_edge ], [ 0, %while.body.preheader ]
47 %call3 = tail call i32 @gf1(i32 0) #4
48 %cmp4 = icmp ult i32 %call3, 10
49 %arrayidx6 = getelementptr inbounds i32, i32 addrspace(4)* %A, i32 %i.026
50 %1 = load i32, i32 addrspace(4)* %arrayidx6, align 4
51 br i1 %cmp4, label %if.then5, label %if.else
52
53 if.then5:
54 ; CHECK: DIVERGENT: %mul = shl i32 %1, 1
55 ; CHECK: DIVERGENT: %2 = load i32, i32 addrspace(4)* %arrayidx10, align 4
56 ; CHECK: DIVERGENT: %add = add nsw i32 %2, %mul
57 %mul = shl i32 %1, 1
58 %2 = load i32, i32 addrspace(4)* %arrayidx10, align 4
59 %add = add nsw i32 %2, %mul
60 br label %if.end
61
62 if.else:
63 ; CHECK: DIVERGENT: %mul9 = shl i32 %1, 2
64 ; CHECK: DIVERGENT: %3 = load i32, i32 addrspace(4)* %arrayidx10, align 4
65 ; CHECK: DIVERGENT: %add11 = add nsw i32 %3, %mul9
66 %mul9 = shl i32 %1, 2
67 %3 = load i32, i32 addrspace(4)* %arrayidx10, align 4
68 %add11 = add nsw i32 %3, %mul9
69 br label %if.end
70
71 if.end:
72 ; CHECK: DIVERGENT: %storemerge = phi i32 [ %add11, %if.else ], [ %add, %if.then5 ]
73 ; CHECK: DIVERGENT: store i32 %storemerge, i32 addrspace(4)* %arrayidx10, align 4
74 ; CHECK-NOT: DIVERGENT: %inc = add nuw nsw i32 %i.026, 1
75 ; CHECK: DIVERGENT: %exitcond = icmp ne i32 %inc, %0
76 ; CHECK: DIVERGENT: br i1 %exitcond, label %if.end.while.body_crit_edge, label %while.end.loopexit
77 %storemerge = phi i32 [ %add11, %if.else ], [ %add, %if.then5 ]
78 store i32 %storemerge, i32 addrspace(4)* %arrayidx10, align 4
79 %inc = add nuw nsw i32 %i.026, 1
80 %exitcond = icmp ne i32 %inc, %0
81 br i1 %exitcond, label %if.end.while.body_crit_edge, label %while.end.loopexit
82
83 if.end.while.body_crit_edge:
84 br label %while.body
85
86 while.end.loopexit:
87 ; CHECK: DIVERGENT: %storemerge.lcssa = phi i32 [ %storemerge, %if.end ]
88 %storemerge.lcssa = phi i32 [ %storemerge, %if.end ]
89 br label %while.end
90
91 while.end:
92 ; CHECK: DIVERGENT: %4 = phi i32 [ %.pre, %if.then.while.end_crit_edge ], [ %storemerge.lcssa, %while.end.loopexit ]
93 ; CHECK: DIVERGENT: %i.0.lcssa = phi i32 [ 0, %if.then.while.end_crit_edge ], [ %0, %while.end.loopexit ]
94 ; CHECK: DIVERGENT: %sub = sub nsw i32 %4, %i.0.lcssa
95 ; CHECK: DIVERGENT: store i32 %sub, i32 addrspace(4)* %arrayidx10, align 4
96 %4 = phi i32 [ %.pre, %if.then.while.end_crit_edge ], [ %storemerge.lcssa, %while.end.loopexit ]
97 %i.0.lcssa = phi i32 [ 0, %if.then.while.end_crit_edge ], [ %0, %while.end.loopexit ]
98 %sub = sub nsw i32 %4, %i.0.lcssa
99 store i32 %sub, i32 addrspace(4)* %arrayidx10, align 4
100 br label %new_exit
101
102 new_exit:
103 ret void
104 }
105
106 attributes #0 = { nounwind readnone }
107 attributes #1 = { nounwind readnone }
108 attributes #2 = { nounwind readnone }
109 attributes #3 = { nounwind readnone }
110 attributes #4 = { nounwind readnone }