llvm.org GIT mirror llvm / a8191fa
Merging r321751, r321806, and r321878: ------------------------------------------------------------------------ r321751 | arsenm | 2018-01-03 10:45:37 -0800 (Wed, 03 Jan 2018) | 25 lines StructurizeCFG: Fix broken backedge detection The work order was changed in r228186 from SCC order to RPO with an arbitrary sorting function. The sorting function attempted to move inner loop nodes earlier. This was apparently relying on an assumption that every block in a given loop / the same loop depth would be seen before visiting another loop. In the broken testcase, a block outside of the loop was encountered before moving onto another block in the same loop. The testcase would then structurize such that one block's unconditional successor could never be reached. Revert to plain RPO for the analysis phase. This fixes detecting edges as backedges that aren't really. The processing phase does use another visited set, and I'm unclear on whether the order there is as important. An arbitrary order doesn't work, and triggers some infinite loops. The reversed RPO list seems to work and is closer to the order that was used before, minus the arbitrary custom sorting. A few of the changed tests now produce smaller code, and a few are slightly worse looking. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r321806 | arsenm | 2018-01-04 09:23:24 -0800 (Thu, 04 Jan 2018) | 4 lines StructurizeCFG: xfail one of the testcases from r321751 It fails with -verify-region-info. This seems to be an issue with RegionInfo itself which existed before. 
------------------------------------------------------------------------ ------------------------------------------------------------------------ r321878 | arsenm | 2018-01-05 09:51:36 -0800 (Fri, 05 Jan 2018) | 4 lines RegionInfo: Use report_fatal_error instead of llvm_unreachable Otherwise when using -verify-region-info in a release build the error won't be emitted. ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_60@322686 91177308-0d34-0410-b5e6-96231b3b80d8 Hans Wennborg 1 year, 8 months ago
8 changed file(s) with 425 addition(s) and 162 deletion(s). Raw diff Collapse all Expand all
::verifyBBInRegion(BlockT *BB) const {
253253 template
254254 void RegionBase
255255 if (!contains(BB))
256 llvm_unreachable("Broken region found: enumerated BB not in region!");
256 report_fatal_error("Broken region found: enumerated BB not in region!");
257257
258258 BlockT *entry = getEntry(), *exit = getExit();
259259
260260 for (BlockT *Succ :
261261 make_range(BlockTraits::child_begin(BB), BlockTraits::child_end(BB))) {
262262 if (!contains(Succ) && exit != Succ)
263 llvm_unreachable("Broken region found: edges leaving the region must go "
264 "to the exit node!");
263 report_fatal_error("Broken region found: edges leaving the region must go "
264 "to the exit node!");
265265 }
266266
267267 if (entry != BB) {
268268 for (BlockT *Pred : make_range(InvBlockTraits::child_begin(BB),
269269 InvBlockTraits::child_end(BB))) {
270270 if (!contains(Pred))
271 llvm_unreachable("Broken region found: edges entering the region must "
272 "go to the entry node!");
271 report_fatal_error("Broken region found: edges entering the region must "
272 "go to the entry node!");
273273 }
274274 }
275275 }
556556 } else {
557557 BlockT *BB = Element->template getNodeAs();
558558 if (getRegionFor(BB) != R)
559 llvm_unreachable("BB map does not match region nesting");
559 report_fatal_error("BB map does not match region nesting");
560560 }
561561 }
562562 }
1313 #include "llvm/ADT/SmallPtrSet.h"
1414 #include "llvm/ADT/SmallVector.h"
1515 #include "llvm/Analysis/DivergenceAnalysis.h"
16 #include "llvm/Analysis/LoopInfo.h"
1716 #include "llvm/Analysis/RegionInfo.h"
1817 #include "llvm/Analysis/RegionIterator.h"
1918 #include "llvm/Analysis/RegionPass.h"
176175 Region *ParentRegion;
177176
178177 DominatorTree *DT;
179 LoopInfo *LI;
180
181 SmallVector Order;
178
179 std::deque Order;
182180 BBSet Visited;
183181
184182 BBPhiMap DeletedPhis;
203201
204202 void gatherPredicates(RegionNode *N);
205203
206 void collectInfos();
204 void analyzeNode(RegionNode *N);
207205
208206 void insertConditions(bool Loops);
209207
257255 AU.addRequired();
258256 AU.addRequiredID(LowerSwitchID);
259257 AU.addRequired();
260 AU.addRequired();
261258
262259 AU.addPreserved();
263260 RegionPass::getAnalysisUsage(AU);
291288
292289 /// \brief Build up the general order of nodes
293290 void StructurizeCFG::orderNodes() {
294 ReversePostOrderTraversal RPOT(ParentRegion);
295 SmallDenseMap LoopBlocks;
296
297 // The reverse post-order traversal of the list gives us an ordering close
298 // to what we want. The only problem with it is that sometimes backedges
299 // for outer loops will be visited before backedges for inner loops.
300 for (RegionNode *RN : RPOT) {
301 BasicBlock *BB = RN->getEntry();
302 Loop *Loop = LI->getLoopFor(BB);
303 ++LoopBlocks[Loop];
304 }
305
306 unsigned CurrentLoopDepth = 0;
307 Loop *CurrentLoop = nullptr;
308 for (auto I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
309 BasicBlock *BB = (*I)->getEntry();
310 unsigned LoopDepth = LI->getLoopDepth(BB);
311
312 if (is_contained(Order, *I))
313 continue;
314
315 if (LoopDepth < CurrentLoopDepth) {
316 // Make sure we have visited all blocks in this loop before moving back to
317 // the outer loop.
318
319 auto LoopI = I;
320 while (unsigned &BlockCount = LoopBlocks[CurrentLoop]) {
321 LoopI++;
322 BasicBlock *LoopBB = (*LoopI)->getEntry();
323 if (LI->getLoopFor(LoopBB) == CurrentLoop) {
324 --BlockCount;
325 Order.push_back(*LoopI);
326 }
327 }
328 }
329
330 CurrentLoop = LI->getLoopFor(BB);
331 if (CurrentLoop)
332 LoopBlocks[CurrentLoop]--;
333
334 CurrentLoopDepth = LoopDepth;
335 Order.push_back(*I);
336 }
337
338 // This pass originally used a post-order traversal and then operated on
339 // the list in reverse. Now that we are using a reverse post-order traversal
340 // rather than re-working the whole pass to operate on the list in order,
341 // we just reverse the list and continue to operate on it in reverse.
342 std::reverse(Order.begin(), Order.end());
291 assert(Visited.empty());
292 assert(Predicates.empty());
293 assert(Loops.empty());
294 assert(LoopPreds.empty());
295
296 // This must be RPO order for the back edge detection to work
297 for (RegionNode *RN : ReversePostOrderTraversal(ParentRegion)) {
298 // FIXME: Is there a better order to use for structurization?
299 Order.push_back(RN);
300 analyzeNode(RN);
301 }
343302 }
344303
345304 /// \brief Determine the end of the loops
465424 }
466425
467426 /// \brief Collect various loop and predicate infos
468 void StructurizeCFG::collectInfos() {
469 // Reset predicate
470 Predicates.clear();
471
472 // and loop infos
473 Loops.clear();
474 LoopPreds.clear();
475
476 // Reset the visited nodes
477 Visited.clear();
478
479 for (RegionNode *RN : reverse(Order)) {
480 DEBUG(dbgs() << "Visiting: "
481 << (RN->isSubRegion() ? "SubRegion with entry: " : "")
482 << RN->getEntry()->getName() << " Loop Depth: "
483 << LI->getLoopDepth(RN->getEntry()) << "\n");
484
485 // Analyze all the conditions leading to a node
486 gatherPredicates(RN);
487
488 // Remember that we've seen this node
489 Visited.insert(RN->getEntry());
490
491 // Find the last back edges
492 analyzeLoops(RN);
493 }
427 void StructurizeCFG::analyzeNode(RegionNode *RN) {
428 DEBUG(dbgs() << "Visiting: "
429 << (RN->isSubRegion() ? "SubRegion with entry: " : "")
430 << RN->getEntry()->getName() << '\n');
431
432 // Analyze all the conditions leading to a node
433 gatherPredicates(RN);
434
435 // Remember that we've seen this node
436 Visited.insert(RN->getEntry());
437
438 // Find the last back edges
439 analyzeLoops(RN);
494440 }
495441
496442 /// \brief Insert the missing branch conditions
663609 BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
664610 LLVMContext &Context = Func->getContext();
665611 BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
666 Order.back()->getEntry();
612 Order.front()->getEntry();
667613 BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName,
668614 Func, Insert);
669615 DT->addNewBlock(Flow, Dominator);
743689 /// Take one node from the order vector and wire it up
744690 void StructurizeCFG::wireFlow(bool ExitUseAllowed,
745691 BasicBlock *LoopEnd) {
746 RegionNode *Node = Order.pop_back_val();
692 RegionNode *Node = Order.front();
693 Order.pop_front();
747694 Visited.insert(Node->getEntry());
748695
749696 if (isPredictableTrue(Node)) {
767714
768715 PrevNode = Node;
769716 while (!Order.empty() && !Visited.count(LoopEnd) &&
770 dominatesPredicates(Entry, Order.back())) {
717 dominatesPredicates(Entry, Order.front())) {
771718 handleLoops(false, LoopEnd);
772719 }
773720
778725
779726 void StructurizeCFG::handleLoops(bool ExitUseAllowed,
780727 BasicBlock *LoopEnd) {
781 RegionNode *Node = Order.back();
728 RegionNode *Node = Order.front();
782729 BasicBlock *LoopStart = Node->getEntry();
783730
784731 if (!Loops.count(LoopStart)) {
923870 ParentRegion = R;
924871
925872 DT = &getAnalysis().getDomTree();
926 LI = &getAnalysis().getLoopInfo();
927873
928874 orderNodes();
929 collectInfos();
875
930876 createFlow();
931877 insertConditions(false);
932878 insertConditions(true);
6565
6666 ; OPT-LABEL: define amdgpu_kernel void @multi_if_break_loop(
6767 ; OPT: llvm.amdgcn.break
68 ; OPT: llvm.amdgcn.loop
68 ; OPT: llvm.amdgcn.break
6969 ; OPT: llvm.amdgcn.if.break
7070 ; OPT: llvm.amdgcn.if.break
71 ; OPT: llvm.amdgcn.loop
7172 ; OPT: llvm.amdgcn.end.cf
7273
7374 ; GCN-LABEL: {{^}}multi_if_break_loop:
123123 ; Earlier version of above, before a run of the structurizer.
124124 ; IR-LABEL: @nested_loop_conditions(
125125
126 ; IR: Flow7:
127 ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %17)
128 ; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %15)
129 ; IR-NEXT: %1 = extractvalue { i1, i64 } %0, 0
130 ; IR-NEXT: %2 = extractvalue { i1, i64 } %0, 1
131 ; IR-NEXT: br i1 %1, label %bb4.bb13_crit_edge, label %Flow8
126 ; IR: %tmp1235 = icmp slt i32 %tmp1134, 9
127 ; IR: br i1 %tmp1235, label %bb14.lr.ph, label %Flow
128
129 ; IR: bb14.lr.ph:
130 ; IR: br label %bb14
131
132 ; IR: Flow3:
133 ; IR: call void @llvm.amdgcn.end.cf(i64 %18)
134 ; IR: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %17)
135 ; IR: %1 = extractvalue { i1, i64 } %0, 0
136 ; IR: %2 = extractvalue { i1, i64 } %0, 1
137 ; IR: br i1 %1, label %bb4.bb13_crit_edge, label %Flow4
138
139 ; IR: bb4.bb13_crit_edge:
140 ; IR: br label %Flow4
141
142 ; IR: Flow4:
143 ; IR: %3 = phi i1 [ true, %bb4.bb13_crit_edge ], [ false, %Flow3 ]
144 ; IR: call void @llvm.amdgcn.end.cf(i64 %2)
145 ; IR: br label %Flow
146
147 ; IR: bb13:
148 ; IR: br label %bb31
149
150 ; IR: Flow:
151 ; IR: %4 = phi i1 [ %3, %Flow4 ], [ true, %bb ]
152 ; IR: %5 = call { i1, i64 } @llvm.amdgcn.if(i1 %4)
153 ; IR: %6 = extractvalue { i1, i64 } %5, 0
154 ; IR: %7 = extractvalue { i1, i64 } %5, 1
155 ; IR: br i1 %6, label %bb13, label %bb31
156
157 ; IR: bb14:
158 ; IR: %phi.broken = phi i64 [ %18, %Flow2 ], [ 0, %bb14.lr.ph ]
159 ; IR: %tmp1037 = phi i32 [ %tmp1033, %bb14.lr.ph ], [ %16, %Flow2 ]
160 ; IR: %tmp936 = phi <4 x i32> [ %tmp932, %bb14.lr.ph ], [ %15, %Flow2 ]
161 ; IR: %tmp15 = icmp eq i32 %tmp1037, 1
162 ; IR: %8 = xor i1 %tmp15, true
163 ; IR: %9 = call { i1, i64 } @llvm.amdgcn.if(i1 %8)
164 ; IR: %10 = extractvalue { i1, i64 } %9, 0
165 ; IR: %11 = extractvalue { i1, i64 } %9, 1
166 ; IR: br i1 %10, label %bb31.loopexit, label %Flow1
132167
133168 ; IR: Flow1:
134 ; IR-NEXT: %loop.phi = phi i64 [ %loop.phi9, %Flow6 ], [ %phi.broken, %bb14 ]
135 ; IR-NEXT: %13 = phi <4 x i32> [ %29, %Flow6 ], [ undef, %bb14 ]
136 ; IR-NEXT: %14 = phi i32 [ %30, %Flow6 ], [ undef, %bb14 ]
137 ; IR-NEXT: %15 = phi i1 [ %31, %Flow6 ], [ false, %bb14 ]
138 ; IR-NEXT: %16 = phi i1 [ false, %Flow6 ], [ %8, %bb14 ]
139 ; IR-NEXT: %17 = call i64 @llvm.amdgcn.else.break(i64 %11, i64 %loop.phi)
140 ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11)
141 ; IR-NEXT: %18 = call i1 @llvm.amdgcn.loop(i64 %17)
142 ; IR-NEXT: br i1 %18, label %Flow7, label %bb14
169 ; IR: %12 = call { i1, i64 } @llvm.amdgcn.else(i64 %11)
170 ; IR: %13 = extractvalue { i1, i64 } %12, 0
171 ; IR: %14 = extractvalue { i1, i64 } %12, 1
172 ; IR: br i1 %13, label %bb16, label %Flow2
173
174 ; IR: bb16:
175 ; IR: %tmp17 = bitcast i64 %tmp3 to <2 x i32>
176 ; IR: br label %bb18
143177
144178 ; IR: Flow2:
145 ; IR-NEXT: %loop.phi10 = phi i64 [ %loop.phi11, %Flow5 ], [ %12, %bb16 ]
146 ; IR-NEXT: %19 = phi <4 x i32> [ %29, %Flow5 ], [ undef, %bb16 ]
147 ; IR-NEXT: %20 = phi i32 [ %30, %Flow5 ], [ undef, %bb16 ]
148 ; IR-NEXT: %21 = phi i1 [ %31, %Flow5 ], [ false, %bb16 ]
149 ; IR-NEXT: %22 = phi i1 [ false, %Flow5 ], [ false, %bb16 ]
150 ; IR-NEXT: %23 = phi i1 [ false, %Flow5 ], [ %8, %bb16 ]
151 ; IR-NEXT: %24 = call { i1, i64 } @llvm.amdgcn.if(i1 %23)
152 ; IR-NEXT: %25 = extractvalue { i1, i64 } %24, 0
153 ; IR-NEXT: %26 = extractvalue { i1, i64 } %24, 1
154 ; IR-NEXT: br i1 %25, label %bb21, label %Flow3
179 ; IR: %loop.phi = phi i64 [ %21, %bb21 ], [ %phi.broken, %Flow1 ]
180 ; IR: %15 = phi <4 x i32> [ %tmp9, %bb21 ], [ undef, %Flow1 ]
181 ; IR: %16 = phi i32 [ %tmp10, %bb21 ], [ undef, %Flow1 ]
182 ; IR: %17 = phi i1 [ %20, %bb21 ], [ false, %Flow1 ]
183 ; IR: %18 = call i64 @llvm.amdgcn.else.break(i64 %14, i64 %loop.phi)
184 ; IR: call void @llvm.amdgcn.end.cf(i64 %14)
185 ; IR: %19 = call i1 @llvm.amdgcn.loop(i64 %18)
186 ; IR: br i1 %19, label %Flow3, label %bb14
187
188 ; IR: bb18:
189 ; IR: %tmp19 = load volatile i32, i32 addrspace(1)* undef
190 ; IR: %tmp20 = icmp slt i32 %tmp19, 9
191 ; IR: br i1 %tmp20, label %bb21, label %bb18
155192
156193 ; IR: bb21:
157 ; IR: %tmp12 = icmp slt i32 %tmp11, 9
158 ; IR-NEXT: %27 = xor i1 %tmp12, true
159 ; IR-NEXT: %28 = call i64 @llvm.amdgcn.if.break(i1 %27, i64 %phi.broken)
160 ; IR-NEXT: br label %Flow3
161
162 ; IR: Flow3:
163 ; IR-NEXT: %loop.phi11 = phi i64 [ %phi.broken, %bb21 ], [ %phi.broken, %Flow2 ]
164 ; IR-NEXT: %loop.phi9 = phi i64 [ %28, %bb21 ], [ %loop.phi10, %Flow2 ]
165 ; IR-NEXT: %29 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ]
166 ; IR-NEXT: %30 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ]
167 ; IR-NEXT: %31 = phi i1 [ %27, %bb21 ], [ %21, %Flow2 ]
168 ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %26)
169 ; IR-NEXT: br i1 %22, label %bb31.loopexit, label %Flow4
194 ; IR: %tmp22 = extractelement <2 x i32> %tmp17, i64 1
195 ; IR: %tmp23 = lshr i32 %tmp22, 16
196 ; IR: %tmp24 = select i1 undef, i32 undef, i32 %tmp23
197 ; IR: %tmp25 = uitofp i32 %tmp24 to float
198 ; IR: %tmp26 = fmul float %tmp25, 0x3EF0001000000000
199 ; IR: %tmp27 = fsub float %tmp26, undef
200 ; IR: %tmp28 = fcmp olt float %tmp27, 5.000000e-01
201 ; IR: %tmp29 = select i1 %tmp28, i64 1, i64 2
202 ; IR: %tmp30 = extractelement <4 x i32> %tmp936, i64 %tmp29
203 ; IR: %tmp7 = zext i32 %tmp30 to i64
204 ; IR: %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* undef, i64 %tmp7
205 ; IR: %tmp9 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp8, align 16
206 ; IR: %tmp10 = extractelement <4 x i32> %tmp9, i64 0
207 ; IR: %tmp11 = load volatile i32, i32 addrspace(1)* undef
208 ; IR: %tmp12 = icmp slt i32 %tmp11, 9
209 ; IR: %20 = xor i1 %tmp12, true
210 ; IR: %21 = call i64 @llvm.amdgcn.if.break(i1 %20, i64 %phi.broken)
211 ; IR: br label %Flow2
212
213 ; IR: bb31.loopexit:
214 ; IR: br label %Flow1
170215
171216 ; IR: bb31:
172 ; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %7)
173 ; IR-NEXT: store volatile i32 0, i32 addrspace(1)* undef
174 ; IR-NEXT: ret void
217 ; IR: call void @llvm.amdgcn.end.cf(i64 %7)
218 ; IR: store volatile i32 0, i32 addrspace(1)* undef
219 ; IR: ret void
175220
176221
177222 ; GCN-LABEL: {{^}}nested_loop_conditions:
0 ; XFAIL: *
1 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -structurizecfg -verify-region-info %s
2
3 ; FIXME: Merge into backedge-id-bug
4 ; Variant which has an issue with region construction
5
6 define amdgpu_kernel void @loop_backedge_misidentified_alt(i32 addrspace(1)* %arg0) #0 {
7 entry:
8 %tmp = load volatile <2 x i32>, <2 x i32> addrspace(1)* undef, align 16
9 %load1 = load volatile <2 x float>, <2 x float> addrspace(1)* undef
10 %tid = call i32 @llvm.amdgcn.workitem.id.x()
11 %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i32 %tid
12 %i.initial = load volatile i32, i32 addrspace(1)* %gep, align 4
13 br label %LOOP.HEADER
14
15 LOOP.HEADER:
16 %i = phi i32 [ %i.final, %END_ELSE_BLOCK ], [ %i.initial, %entry ]
17 call void asm sideeffect "s_nop 0x100b ; loop $0 ", "r,~{memory}"(i32 %i) #0
18 %tmp12 = zext i32 %i to i64
19 %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* null, i64 %tmp12
20 %tmp14 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp13, align 16
21 %tmp15 = extractelement <4 x i32> %tmp14, i64 0
22 %tmp16 = and i32 %tmp15, 65535
23 %tmp17 = icmp eq i32 %tmp16, 1
24 br i1 %tmp17, label %bb18, label %bb62
25
26 bb18:
27 %tmp19 = extractelement <2 x i32> %tmp, i64 0
28 %tmp22 = lshr i32 %tmp19, 16
29 %tmp24 = urem i32 %tmp22, 52
30 %tmp25 = mul nuw nsw i32 %tmp24, 52
31 br label %INNER_LOOP
32
33 INNER_LOOP:
34 %inner.loop.j = phi i32 [ %tmp25, %bb18 ], [ %inner.loop.j.inc, %INNER_LOOP ]
35 call void asm sideeffect "; inner loop body", ""() #0
36 %inner.loop.j.inc = add nsw i32 %inner.loop.j, 1
37 %inner.loop.cmp = icmp eq i32 %inner.loop.j, 0
38 br i1 %inner.loop.cmp, label %INNER_LOOP_BREAK, label %INNER_LOOP
39
40 INNER_LOOP_BREAK:
41 %tmp59 = extractelement <4 x i32> %tmp14, i64 2
42 call void asm sideeffect "s_nop 23 ", "~{memory}"() #0
43 br label %END_ELSE_BLOCK
44
45 bb62:
46 %load13 = icmp ult i32 %tmp16, 271
47 ;br i1 %load13, label %bb64, label %INCREMENT_I
48 ; branching directly to the return avoids the bug
49 br i1 %load13, label %RETURN, label %INCREMENT_I
50
51
52 bb64:
53 call void asm sideeffect "s_nop 42", "~{memory}"() #0
54 br label %RETURN
55
56 INCREMENT_I:
57 %inc.i = add i32 %i, 1
58 call void asm sideeffect "s_nop 0x1336 ; increment $0", "v,~{memory}"(i32 %inc.i) #0
59 br label %END_ELSE_BLOCK
60
61 END_ELSE_BLOCK:
62 %i.final = phi i32 [ %tmp59, %INNER_LOOP_BREAK ], [ %inc.i, %INCREMENT_I ]
63 call void asm sideeffect "s_nop 0x1337 ; end else block $0", "v,~{memory}"(i32 %i.final) #0
64 %cmp.end.else.block = icmp eq i32 %i.final, -1
65 br i1 %cmp.end.else.block, label %RETURN, label %LOOP.HEADER
66
67 RETURN:
68 call void asm sideeffect "s_nop 0x99 ; ClosureEval return", "~{memory}"() #0
69 store volatile <2 x float> %load1, <2 x float> addrspace(1)* undef, align 8
70 ret void
71 }
72
73 declare i32 @llvm.amdgcn.workitem.id.x() #1
74
75 attributes #0 = { convergent nounwind }
76 attributes #1 = { convergent nounwind readnone }
0 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
1 ; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -structurizecfg %s | FileCheck %s
2
3 ; StructurizeCFG::orderNodes used an arbitrary and nonsensical sorting
4 ; function which broke the basic backedge identification algorithm. It
5 ; would use RPO order, but then do a weird partial sort by the loop
6 ; depth assuming blocks are sorted by loop. However a block can appear
7 ; in between blocks of a loop that is not part of a loop, breaking the
8 ; assumption of the sort.
9 ;
10 ; The collectInfos must be done in RPO order. The actual
11 ; structurization order I think is less important, but unless the loop
12 ; headers are identified in RPO order, it finds the wrong set of back
13 ; edges.
14
15 define amdgpu_kernel void @loop_backedge_misidentified(i32 addrspace(1)* %arg0) #0 {
16 ; CHECK-LABEL: @loop_backedge_misidentified(
17 ; CHECK-NEXT: entry:
18 ; CHECK-NEXT: [[TMP:%.*]] = load volatile <2 x i32>, <2 x i32> addrspace(1)* undef, align 16
19 ; CHECK-NEXT: [[LOAD1:%.*]] = load volatile <2 x float>, <2 x float> addrspace(1)* undef
20 ; CHECK-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
21 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG0:%.*]], i32 [[TID]]
22 ; CHECK-NEXT: [[I_INITIAL:%.*]] = load volatile i32, i32 addrspace(1)* [[GEP]], align 4
23 ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
24 ; CHECK: LOOP.HEADER:
25 ; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_INITIAL]], [[ENTRY:%.*]] ], [ [[TMP10:%.*]], [[FLOW4:%.*]] ]
26 ; CHECK-NEXT: call void asm sideeffect "s_nop 0x100b
27 ; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[I]] to i64
28 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* null, i64 [[TMP12]]
29 ; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP13]], align 16
30 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP14]], i64 0
31 ; CHECK-NEXT: [[TMP16:%.*]] = and i32 [[TMP15]], 65535
32 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 1
33 ; CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[TMP17]], true
34 ; CHECK-NEXT: br i1 [[TMP0]], label [[BB62:%.*]], label [[FLOW:%.*]]
35 ; CHECK: Flow2:
36 ; CHECK-NEXT: br label [[FLOW]]
37 ; CHECK: bb18:
38 ; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP]], i64 0
39 ; CHECK-NEXT: [[TMP22:%.*]] = lshr i32 [[TMP19]], 16
40 ; CHECK-NEXT: [[TMP24:%.*]] = urem i32 [[TMP22]], 52
41 ; CHECK-NEXT: [[TMP25:%.*]] = mul nuw nsw i32 [[TMP24]], 52
42 ; CHECK-NEXT: br label [[INNER_LOOP:%.*]]
43 ; CHECK: Flow3:
44 ; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[TMP59:%.*]], [[INNER_LOOP_BREAK:%.*]] ], [ [[TMP7:%.*]], [[FLOW]] ]
45 ; CHECK-NEXT: [[TMP2:%.*]] = phi i1 [ true, [[INNER_LOOP_BREAK]] ], [ [[TMP8:%.*]], [[FLOW]] ]
46 ; CHECK-NEXT: br i1 [[TMP2]], label [[END_ELSE_BLOCK:%.*]], label [[FLOW4]]
47 ; CHECK: INNER_LOOP:
48 ; CHECK-NEXT: [[INNER_LOOP_J:%.*]] = phi i32 [ [[INNER_LOOP_J_INC:%.*]], [[INNER_LOOP]] ], [ [[TMP25]], [[BB18:%.*]] ]
49 ; CHECK-NEXT: call void asm sideeffect "
50 ; CHECK-NEXT: [[INNER_LOOP_J_INC]] = add nsw i32 [[INNER_LOOP_J]], 1
51 ; CHECK-NEXT: [[INNER_LOOP_CMP:%.*]] = icmp eq i32 [[INNER_LOOP_J]], 0
52 ; CHECK-NEXT: br i1 [[INNER_LOOP_CMP]], label [[INNER_LOOP_BREAK]], label [[INNER_LOOP]]
53 ; CHECK: INNER_LOOP_BREAK:
54 ; CHECK-NEXT: [[TMP59]] = extractelement <4 x i32> [[TMP14]], i64 2
55 ; CHECK-NEXT: call void asm sideeffect "s_nop 23 ", "~{memory}"() #0
56 ; CHECK-NEXT: br label [[FLOW3:%.*]]
57 ; CHECK: bb62:
58 ; CHECK-NEXT: [[LOAD13:%.*]] = icmp ult i32 [[TMP16]], 271
59 ; CHECK-NEXT: [[TMP3:%.*]] = xor i1 [[LOAD13]], true
60 ; CHECK-NEXT: br i1 [[TMP3]], label [[INCREMENT_I:%.*]], label [[FLOW1:%.*]]
61 ; CHECK: Flow1:
62 ; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[INC_I:%.*]], [[INCREMENT_I]] ], [ undef, [[BB62]] ]
63 ; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ true, [[INCREMENT_I]] ], [ false, [[BB62]] ]
64 ; CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ false, [[INCREMENT_I]] ], [ true, [[BB62]] ]
65 ; CHECK-NEXT: br i1 [[TMP6]], label [[BB64:%.*]], label [[FLOW2:%.*]]
66 ; CHECK: bb64:
67 ; CHECK-NEXT: call void asm sideeffect "s_nop 42", "~{memory}"() #0
68 ; CHECK-NEXT: br label [[FLOW2]]
69 ; CHECK: Flow:
70 ; CHECK-NEXT: [[TMP7]] = phi i32 [ [[TMP4]], [[FLOW2]] ], [ undef, [[LOOP_HEADER]] ]
71 ; CHECK-NEXT: [[TMP8]] = phi i1 [ [[TMP5]], [[FLOW2]] ], [ false, [[LOOP_HEADER]] ]
72 ; CHECK-NEXT: [[TMP9:%.*]] = phi i1 [ false, [[FLOW2]] ], [ true, [[LOOP_HEADER]] ]
73 ; CHECK-NEXT: br i1 [[TMP9]], label [[BB18]], label [[FLOW3]]
74 ; CHECK: INCREMENT_I:
75 ; CHECK-NEXT: [[INC_I]] = add i32 [[I]], 1
76 ; CHECK-NEXT: call void asm sideeffect "s_nop 0x1336
77 ; CHECK-NEXT: br label [[FLOW1]]
78 ; CHECK: END_ELSE_BLOCK:
79 ; CHECK-NEXT: [[I_FINAL:%.*]] = phi i32 [ [[TMP1]], [[FLOW3]] ]
80 ; CHECK-NEXT: call void asm sideeffect "s_nop 0x1337
81 ; CHECK-NEXT: [[CMP_END_ELSE_BLOCK:%.*]] = icmp eq i32 [[I_FINAL]], -1
82 ; CHECK-NEXT: br label [[FLOW4]]
83 ; CHECK: Flow4:
84 ; CHECK-NEXT: [[TMP10]] = phi i32 [ [[I_FINAL]], [[END_ELSE_BLOCK]] ], [ undef, [[FLOW3]] ]
85 ; CHECK-NEXT: [[TMP11:%.*]] = phi i1 [ [[CMP_END_ELSE_BLOCK]], [[END_ELSE_BLOCK]] ], [ true, [[FLOW3]] ]
86 ; CHECK-NEXT: br i1 [[TMP11]], label [[RETURN:%.*]], label [[LOOP_HEADER]]
87 ; CHECK: RETURN:
88 ; CHECK-NEXT: call void asm sideeffect "s_nop 0x99
89 ; CHECK-NEXT: store volatile <2 x float> [[LOAD1]], <2 x float> addrspace(1)* undef, align 8
90 ; CHECK-NEXT: ret void
91 ;
92 entry:
93 %tmp = load volatile <2 x i32>, <2 x i32> addrspace(1)* undef, align 16
94 %load1 = load volatile <2 x float>, <2 x float> addrspace(1)* undef
95 %tid = call i32 @llvm.amdgcn.workitem.id.x()
96 %gep = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i32 %tid
97 %i.initial = load volatile i32, i32 addrspace(1)* %gep, align 4
98 br label %LOOP.HEADER
99
100 LOOP.HEADER:
101 %i = phi i32 [ %i.final, %END_ELSE_BLOCK ], [ %i.initial, %entry ]
102 call void asm sideeffect "s_nop 0x100b ; loop $0 ", "r,~{memory}"(i32 %i) #0
103 %tmp12 = zext i32 %i to i64
104 %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* null, i64 %tmp12
105 %tmp14 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp13, align 16
106 %tmp15 = extractelement <4 x i32> %tmp14, i64 0
107 %tmp16 = and i32 %tmp15, 65535
108 %tmp17 = icmp eq i32 %tmp16, 1
109 br i1 %tmp17, label %bb18, label %bb62
110
111 bb18:
112 %tmp19 = extractelement <2 x i32> %tmp, i64 0
113 %tmp22 = lshr i32 %tmp19, 16
114 %tmp24 = urem i32 %tmp22, 52
115 %tmp25 = mul nuw nsw i32 %tmp24, 52
116 br label %INNER_LOOP
117
118 INNER_LOOP:
119 %inner.loop.j = phi i32 [ %tmp25, %bb18 ], [ %inner.loop.j.inc, %INNER_LOOP ]
120 call void asm sideeffect "; inner loop body", ""() #0
121 %inner.loop.j.inc = add nsw i32 %inner.loop.j, 1
122 %inner.loop.cmp = icmp eq i32 %inner.loop.j, 0
123 br i1 %inner.loop.cmp, label %INNER_LOOP_BREAK, label %INNER_LOOP
124
125 INNER_LOOP_BREAK:
126 %tmp59 = extractelement <4 x i32> %tmp14, i64 2
127 call void asm sideeffect "s_nop 23 ", "~{memory}"() #0
128 br label %END_ELSE_BLOCK
129
130 bb62:
131 %load13 = icmp ult i32 %tmp16, 271
132 br i1 %load13, label %bb64, label %INCREMENT_I
133
134 bb64:
135 call void asm sideeffect "s_nop 42", "~{memory}"() #0
136 br label %RETURN
137
138 INCREMENT_I:
139 %inc.i = add i32 %i, 1
140 call void asm sideeffect "s_nop 0x1336 ; increment $0", "v,~{memory}"(i32 %inc.i) #0
141 br label %END_ELSE_BLOCK
142
143 END_ELSE_BLOCK:
144 %i.final = phi i32 [ %tmp59, %INNER_LOOP_BREAK ], [ %inc.i, %INCREMENT_I ]
145 call void asm sideeffect "s_nop 0x1337 ; end else block $0", "v,~{memory}"(i32 %i.final) #0
146 %cmp.end.else.block = icmp eq i32 %i.final, -1
147 br i1 %cmp.end.else.block, label %RETURN, label %LOOP.HEADER
148
149 RETURN:
150 call void asm sideeffect "s_nop 0x99 ; ClosureEval return", "~{memory}"() #0
151 store volatile <2 x float> %load1, <2 x float> addrspace(1)* undef, align 8
152 ret void
153 }
154
155 ; The same function, except break to return block goes directly to the
156 ; return, which managed to hide the bug.
157 ; FIXME: Merge variant from backedge-id-bug-xfail
158
159 declare i32 @llvm.amdgcn.workitem.id.x() #1
160
161 attributes #0 = { convergent nounwind }
162 attributes #1 = { convergent nounwind readnone }
0 if not 'AMDGPU' in config.root.targets:
1 config.unsupported = True
0 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
1 ; RUN: opt -S -structurizecfg %s -o - | FileCheck %s
12
23 define void @main(float addrspace(1)* %out) {
3
4 ; CHECK: main_body:
5 ; CHECK: br label %LOOP.outer
4 ; CHECK-LABEL: @main(
5 ; CHECK-NEXT: main_body:
6 ; CHECK-NEXT: br label [[LOOP_OUTER:%.*]]
7 ; CHECK: LOOP.outer:
8 ; CHECK-NEXT: [[TEMP8_0_PH:%.*]] = phi float [ 0.000000e+00, [[MAIN_BODY:%.*]] ], [ [[TMP13:%.*]], [[FLOW3:%.*]] ]
9 ; CHECK-NEXT: [[TEMP4_0_PH:%.*]] = phi i32 [ 0, [[MAIN_BODY]] ], [ [[TMP12:%.*]], [[FLOW3]] ]
10 ; CHECK-NEXT: br label [[LOOP:%.*]]
11 ; CHECK: LOOP:
12 ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ undef, [[LOOP_OUTER]] ], [ [[TMP12]], [[FLOW:%.*]] ]
13 ; CHECK-NEXT: [[TMP1:%.*]] = phi float [ undef, [[LOOP_OUTER]] ], [ [[TMP13]], [[FLOW]] ]
14 ; CHECK-NEXT: [[TEMP4_0:%.*]] = phi i32 [ [[TEMP4_0_PH]], [[LOOP_OUTER]] ], [ [[TMP15:%.*]], [[FLOW]] ]
15 ; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TEMP4_0]], 1
16 ; CHECK-NEXT: [[TMP22:%.*]] = icmp sgt i32 [[TMP20]], 3
17 ; CHECK-NEXT: [[TMP2:%.*]] = xor i1 [[TMP22]], true
18 ; CHECK-NEXT: br i1 [[TMP2]], label [[ENDIF:%.*]], label [[FLOW]]
19 ; CHECK: Flow2:
20 ; CHECK-NEXT: [[TMP3:%.*]] = phi float [ [[TEMP8_0_PH]], [[IF29:%.*]] ], [ [[TMP9:%.*]], [[FLOW1:%.*]] ]
21 ; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[TMP20]], [[IF29]] ], [ undef, [[FLOW1]] ]
22 ; CHECK-NEXT: [[TMP5:%.*]] = phi i1 [ [[TMP32:%.*]], [[IF29]] ], [ true, [[FLOW1]] ]
23 ; CHECK-NEXT: br label [[FLOW]]
24 ; CHECK: Flow3:
25 ; CHECK-NEXT: br i1 [[TMP16:%.*]], label [[ENDLOOP:%.*]], label [[LOOP_OUTER]]
26 ; CHECK: ENDLOOP:
27 ; CHECK-NEXT: [[TEMP8_1:%.*]] = phi float [ [[TMP14:%.*]], [[FLOW3]] ]
28 ; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP20]], 3
29 ; CHECK-NEXT: [[DOT45:%.*]] = select i1 [[TMP23]], float 0.000000e+00, float 1.000000e+00
30 ; CHECK-NEXT: store float [[DOT45]], float addrspace(1)* [[OUT:%.*]]
31 ; CHECK-NEXT: ret void
32 ; CHECK: ENDIF:
33 ; CHECK-NEXT: [[TMP31:%.*]] = icmp sgt i32 [[TMP20]], 1
34 ; CHECK-NEXT: [[TMP6:%.*]] = xor i1 [[TMP31]], true
35 ; CHECK-NEXT: br i1 [[TMP6]], label [[ENDIF28:%.*]], label [[FLOW1]]
36 ; CHECK: Flow1:
37 ; CHECK-NEXT: [[TMP7:%.*]] = phi i32 [ [[TMP20]], [[ENDIF28]] ], [ [[TMP0]], [[ENDIF]] ]
38 ; CHECK-NEXT: [[TMP8:%.*]] = phi float [ [[TMP35:%.*]], [[ENDIF28]] ], [ [[TMP1]], [[ENDIF]] ]
39 ; CHECK-NEXT: [[TMP9]] = phi float [ [[TMP35]], [[ENDIF28]] ], [ [[TEMP8_0_PH]], [[ENDIF]] ]
40 ; CHECK-NEXT: [[TMP10:%.*]] = phi i1 [ [[TMP36:%.*]], [[ENDIF28]] ], [ true, [[ENDIF]] ]
41 ; CHECK-NEXT: [[TMP11:%.*]] = phi i1 [ false, [[ENDIF28]] ], [ true, [[ENDIF]] ]
42 ; CHECK-NEXT: br i1 [[TMP11]], label [[IF29]], label [[FLOW2:%.*]]
43 ; CHECK: IF29:
44 ; CHECK-NEXT: [[TMP32]] = icmp sgt i32 [[TMP20]], 2
45 ; CHECK-NEXT: br label [[FLOW2]]
46 ; CHECK: Flow:
47 ; CHECK-NEXT: [[TMP12]] = phi i32 [ [[TMP7]], [[FLOW2]] ], [ [[TMP0]], [[LOOP]] ]
48 ; CHECK-NEXT: [[TMP13]] = phi float [ [[TMP8]], [[FLOW2]] ], [ [[TMP1]], [[LOOP]] ]
49 ; CHECK-NEXT: [[TMP14]] = phi float [ [[TMP3]], [[FLOW2]] ], [ [[TEMP8_0_PH]], [[LOOP]] ]
50 ; CHECK-NEXT: [[TMP15]] = phi i32 [ [[TMP4]], [[FLOW2]] ], [ undef, [[LOOP]] ]
51 ; CHECK-NEXT: [[TMP16]] = phi i1 [ [[TMP10]], [[FLOW2]] ], [ true, [[LOOP]] ]
52 ; CHECK-NEXT: [[TMP17:%.*]] = phi i1 [ [[TMP5]], [[FLOW2]] ], [ true, [[LOOP]] ]
53 ; CHECK-NEXT: br i1 [[TMP17]], label [[FLOW3]], label [[LOOP]]
54 ; CHECK: ENDIF28:
55 ; CHECK-NEXT: [[TMP35]] = fadd float [[TEMP8_0_PH]], 1.000000e+00
56 ; CHECK-NEXT: [[TMP36]] = icmp sgt i32 [[TMP20]], 2
57 ; CHECK-NEXT: br label [[FLOW1]]
58 ;
659 main_body:
760 br label %LOOP.outer
861
9 ; CHECK: LOOP.outer:
10 ; CHECK: br label %LOOP
1162 LOOP.outer: ; preds = %ENDIF28, %main_body
1263 %temp8.0.ph = phi float [ 0.000000e+00, %main_body ], [ %tmp35, %ENDIF28 ]
1364 %temp4.0.ph = phi i32 [ 0, %main_body ], [ %tmp20, %ENDIF28 ]
1465 br label %LOOP
1566
16 ; CHECK: LOOP:
17 ; br i1 %{{[0-9]+}}, label %ENDIF, label %Flow
1867 LOOP: ; preds = %IF29, %LOOP.outer
1968 %temp4.0 = phi i32 [ %temp4.0.ph, %LOOP.outer ], [ %tmp20, %IF29 ]
2069 %tmp20 = add i32 %temp4.0, 1
2170 %tmp22 = icmp sgt i32 %tmp20, 3
2271 br i1 %tmp22, label %ENDLOOP, label %ENDIF
2372
24 ; CHECK: Flow3
25 ; CHECK: br i1 %{{[0-9]+}}, label %ENDLOOP, label %LOOP.outer
26
27 ; CHECK: ENDLOOP:
28 ; CHECK: ret void
2973 ENDLOOP: ; preds = %ENDIF28, %IF29, %LOOP
3074 %temp8.1 = phi float [ %temp8.0.ph, %LOOP ], [ %temp8.0.ph, %IF29 ], [ %tmp35, %ENDIF28 ]
3175 %tmp23 = icmp eq i32 %tmp20, 3
3377 store float %.45, float addrspace(1)* %out
3478 ret void
3579
36 ; CHECK: ENDIF:
37 ; CHECK: br i1 %tmp31, label %IF29, label %Flow1
3880 ENDIF: ; preds = %LOOP
3981 %tmp31 = icmp sgt i32 %tmp20, 1
4082 br i1 %tmp31, label %IF29, label %ENDIF28
4183
42 ; CHECK: Flow:
43 ; CHECK: br i1 %{{[0-9]+}}, label %Flow2, label %LOOP
44
45 ; CHECK: IF29:
46 ; CHECK: br label %Flow1
4784 IF29: ; preds = %ENDIF
4885 %tmp32 = icmp sgt i32 %tmp20, 2
4986 br i1 %tmp32, label %ENDLOOP, label %LOOP
5087
51 ; CHECK: Flow1:
52 ; CHECK: br label %Flow
53
54 ; CHECK: Flow2:
55 ; CHECK: br i1 %{{[0-9]+}}, label %ENDIF28, label %Flow3
56
57 ; CHECK: ENDIF28:
58 ; CHECK: br label %Flow3
5988 ENDIF28: ; preds = %ENDIF
6089 %tmp35 = fadd float %temp8.0.ph, 1.0
6190 %tmp36 = icmp sgt i32 %tmp20, 2