llvm.org GIT mirror llvm / dc1f81f
[ARM] Allow unrolling of multi-block loops. Previously, loop unrolling was only enabled for loops with a single block. This restriction has been removed and replaced by two limits: (1) at most two exiting blocks are allowed, and (2) a limit of four basic blocks applies on cores with a branch predictor. Differential Revision: https://reviews.llvm.org/D38952 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@316313 91177308-0d34-0410-b5e6-96231b3b80d8 Sam Parker 1 year, 11 months ago
2 changed file(s) with 352 addition(s) and 17 deletion(s). Raw diff Collapse all Expand all
585585 if (!ST->isMClass())
586586 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP);
587587
588 // Only enable on Thumb-2 targets for simple loops.
589 if (!ST->isThumb2() || L->getNumBlocks() != 1)
590 return;
591
592588 // Disable loop unrolling for Oz and Os.
593589 UP.OptSizeThreshold = 0;
594590 UP.PartialOptSizeThreshold = 0;
595 BasicBlock *BB = L->getLoopLatch();
596 if (BB->getParent()->optForSize())
591 if (L->getHeader()->getParent()->optForSize())
592 return;
593
594 // Only enable on Thumb-2 targets.
595 if (!ST->isThumb2())
596 return;
597
598 SmallVector ExitingBlocks;
599 L->getExitingBlocks(ExitingBlocks);
600 DEBUG(dbgs() << "Loop has:\n"
601 << "Blocks: " << L->getNumBlocks() << "\n"
602 << "Exit blocks: " << ExitingBlocks.size() << "\n");
603
604 // Only allow one exit other than the latch. This acts as an early exit
605 // as it mirrors the profitability calculation of the runtime unroller.
606 if (ExitingBlocks.size() > 2)
607 return;
608
609 // Limit the CFG of the loop body for targets with a branch predictor.
610 // Allowing 4 blocks permits if-then-else diamonds in the body.
611 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
597612 return;
598613
599614 // Scan the loop: don't unroll loops with calls as this could prevent
600615 // inlining.
601616 unsigned Cost = 0;
602 for (auto &I : *BB) {
603 if (isa(I) || isa(I)) {
604 ImmutableCallSite CS(&I);
605 if (const Function *F = CS.getCalledFunction()) {
606 if (!isLoweredToCall(F))
607 continue;
617 for (auto *BB : L->getBlocks()) {
618 for (auto &I : *BB) {
619 if (isa(I) || isa(I)) {
620 ImmutableCallSite CS(&I);
621 if (const Function *F = CS.getCalledFunction()) {
622 if (!isLoweredToCall(F))
623 continue;
624 }
625 return;
608626 }
609 return;
627 SmallVector Operands(I.value_op_begin(),
628 I.value_op_end());
629 Cost += getUserCost(&I, Operands);
610630 }
611 SmallVector Operands(I.value_op_begin(),
612 I.value_op_end());
613 Cost += getUserCost(&I, Operands);
614 }
631 }
632
633 DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
615634
616635 UP.Partial = true;
617636 UP.Runtime = true;
0 ; RUN: opt -mtriple=thumbv8m.main -mcpu=cortex-m33 -loop-unroll -S < %s -o - | FileCheck %s
1 ; RUN: opt -mtriple=thumbv7em -mcpu=cortex-m7 -loop-unroll -S < %s -o - | FileCheck %s
2
3 ;CHECK-LABEL: test_three_blocks
4 ;CHECK: for.body.epil:
5 ;CHECK: if.then.epil:
6 ;CHECK: for.inc.epil:
7 ;CHECK: for.body:
8 ;CHECK: if.then:
9 ;CHECK: for.inc:
10 ;CHECK: for.body.epil.1:
11 ;CHECK: if.then.epil.1:
12 ;CHECK: for.inc.epil.1:
13 ;CHECK: for.body.epil.2:
14 ;CHECK: if.then.epil.2:
15 ;CHECK: for.inc.epil.2:
16 ;CHECK: if.then.1:
17 ;CHECK: for.inc.1:
18 ;CHECK: if.then.2:
19 ;CHECK: for.inc.2:
20 ;CHECK: if.then.3:
21 ;CHECK: for.inc.3:
22 define void @test_three_blocks(i32* nocapture %Output,
23 i32* nocapture readonly %Condition,
24 i32* nocapture readonly %Input,
25 i32 %MaxJ) {
26 entry:
27 %cmp8 = icmp eq i32 %MaxJ, 0
28 br i1 %cmp8, label %for.cond.cleanup, label %for.body.preheader
29
30 for.body.preheader: ; preds = %entry
31 br label %for.body
32
33 for.cond.cleanup: ; preds = %for.inc, %entry
34 %temp.0.lcssa = phi i32 [ 0, %entry ], [ %temp.1, %for.inc ]
35 store i32 %temp.0.lcssa, i32* %Output, align 4
36 ret void
37
38 for.body: ; preds = %for.body.preheader, %for.inc
39 %j.010 = phi i32 [ %inc, %for.inc ], [ 0, %for.body.preheader ]
40 %temp.09 = phi i32 [ %temp.1, %for.inc ], [ 0, %for.body.preheader ]
41 %arrayidx = getelementptr inbounds i32, i32* %Condition, i32 %j.010
42 %0 = load i32, i32* %arrayidx, align 4
43 %tobool = icmp eq i32 %0, 0
44 br i1 %tobool, label %for.inc, label %if.then
45
46 if.then: ; preds = %for.body
47 %arrayidx1 = getelementptr inbounds i32, i32* %Input, i32 %j.010
48 %1 = load i32, i32* %arrayidx1, align 4
49 %add = add i32 %1, %temp.09
50 br label %for.inc
51
52 for.inc: ; preds = %for.body, %if.then
53 %temp.1 = phi i32 [ %add, %if.then ], [ %temp.09, %for.body ]
54 %inc = add nuw i32 %j.010, 1
55 %exitcond = icmp eq i32 %inc, %MaxJ
56 br i1 %exitcond, label %for.cond.cleanup, label %for.body
57 }
58
59 ;CHECK-LABEL: test_two_exits
60 ;CHECK: for.body:
61 ;CHECK: if.end:
62 ;CHECK: cleanup.loopexit:
63 ;CHECK: cleanup:
64 ;CHECK: for.body.1:
65 ;CHECK: if.end.1:
66 ;CHECK: for.body.2:
67 ;CHECK: if.end.2:
68 ;CHECK: for.body.3:
69 ;CHECK: if.end.3:
70 define void @test_two_exits(i32* nocapture %Output,
71 i32* nocapture readonly %Condition,
72 i32* nocapture readonly %Input,
73 i32 %MaxJ) {
74 entry:
75 %cmp14 = icmp eq i32 %MaxJ, 0
76 br i1 %cmp14, label %cleanup, label %for.body.preheader
77
78 for.body.preheader: ; preds = %entry
79 br label %for.body
80
81 for.body: ; preds = %for.body.preheader, %if.end
82 %j.016 = phi i32 [ %inc, %if.end ], [ 0, %for.body.preheader ]
83 %temp.015 = phi i32 [ %temp.0.add, %if.end ], [ 0, %for.body.preheader ]
84 %arrayidx = getelementptr inbounds i32, i32* %Input, i32 %j.016
85 %0 = load i32, i32* %arrayidx, align 4
86 %cmp1 = icmp ugt i32 %0, 65535
87 br i1 %cmp1, label %cleanup, label %if.end
88
89 if.end: ; preds = %for.body
90 %arrayidx2 = getelementptr inbounds i32, i32* %Condition, i32 %j.016
91 %1 = load i32, i32* %arrayidx2, align 4
92 %tobool = icmp eq i32 %1, 0
93 %add = select i1 %tobool, i32 0, i32 %0
94 %temp.0.add = add i32 %add, %temp.015
95 %inc = add nuw i32 %j.016, 1
96 %cmp = icmp ult i32 %inc, %MaxJ
97 br i1 %cmp, label %for.body, label %cleanup
98
99 cleanup: ; preds = %if.end, %for.body, %entry
100 %temp.0.lcssa = phi i32 [ 0, %entry ], [ %temp.015, %for.body ], [ %temp.0.add, %if.end ]
101 store i32 %temp.0.lcssa, i32* %Output, align 4
102 ret void
103 }
104
105 ;CHECK-LABEL: test_three_exits
106 ;CHECK-NOT: for.body.epil
107 ;CHECK-NOT: if.end.epil
108 ;CHECK-LABEL: for.body
109 ;CHECK-LABEL: if.end
110 ;CHECK-LABEL: if.end5
111 define void @test_three_exits(i32* nocapture %Output,
112 i32* nocapture readonly %Condition,
113 i32* nocapture readonly %Input,
114 i32 %MaxJ) {
115 entry:
116 %cmp20 = icmp eq i32 %MaxJ, 0
117 br i1 %cmp20, label %cleanup, label %for.body.preheader
118
119 for.body.preheader: ; preds = %entry
120 br label %for.body
121
122 for.body: ; preds = %for.body.preheader, %if.end5
123 %j.022 = phi i32 [ %inc, %if.end5 ], [ 0, %for.body.preheader ]
124 %temp.021 = phi i32 [ %temp.0.add, %if.end5 ], [ 0, %for.body.preheader ]
125 %arrayidx = getelementptr inbounds i32, i32* %Condition, i32 %j.022
126 %0 = load i32, i32* %arrayidx, align 4
127 %cmp1 = icmp ugt i32 %0, 65535
128 br i1 %cmp1, label %cleanup, label %if.end
129
130 if.end: ; preds = %for.body
131 %arrayidx2 = getelementptr inbounds i32, i32* %Input, i32 %j.022
132 %1 = load i32, i32* %arrayidx2, align 4
133 %cmp3 = icmp ugt i32 %1, 65535
134 br i1 %cmp3, label %cleanup, label %if.end5
135
136 if.end5: ; preds = %if.end
137 %tobool = icmp eq i32 %0, 0
138 %add = select i1 %tobool, i32 0, i32 %1
139 %temp.0.add = add i32 %add, %temp.021
140 %inc = add nuw i32 %j.022, 1
141 %cmp = icmp ult i32 %inc, %MaxJ
142 br i1 %cmp, label %for.body, label %cleanup
143
144 cleanup: ; preds = %if.end5, %for.body, %if.end, %entry
145 %temp.0.lcssa = phi i32 [ 0, %entry ], [ %temp.021, %if.end ], [ %temp.021, %for.body ], [ %temp.0.add, %if.end5 ]
146 store i32 %temp.0.lcssa, i32* %Output, align 4
147 ret void
148 }
149
150 ;CHECK-LABEL: test_four_blocks
151 ;CHECK: for.body.epil:
152 ;CHECK: if.else.epil:
153 ;CHECK: if.then.epil:
154 ;CHECK: for.cond.cleanup:
155 ;CHECK: for.body:
156 ;CHECK: if.then:
157 ;CHECK: for.inc:
158 ;CHECK: for.body.epil.1:
159 ;CHECK: if.else.epil.1:
160 ;CHECK: if.then.epil.1:
161 ;CHECK: for.inc.epil.1:
162 ;CHECK: for.body.epil.2:
163 ;CHECK: if.else.epil.2:
164 ;CHECK: if.then.epil.2:
165 ;CHECK: for.inc.epil.2:
166 ;CHECK: if.else.1:
167 ;CHECK: if.then.1:
168 ;CHECK: for.inc.1:
169 ;CHECK: if.else.2:
170 ;CHECK: if.then.2:
171 ;CHECK: for.inc.2:
172 ;CHECK: if.else.3:
173 ;CHECK: if.then.3:
174 ;CHECK: for.inc.3:
175 define void @test_four_blocks(i32* nocapture %Output,
176 i32* nocapture readonly %Condition,
177 i32* nocapture readonly %Input,
178 i32 %MaxJ) {
179 entry:
180 %cmp25 = icmp ugt i32 %MaxJ, 1
181 br i1 %cmp25, label %for.body.lr.ph, label %for.cond.cleanup
182
183 for.body.lr.ph: ; preds = %entry
184 %.pre = load i32, i32* %Input, align 4
185 br label %for.body
186
187 for.cond.cleanup: ; preds = %for.inc, %entry
188 %temp.0.lcssa = phi i32 [ 0, %entry ], [ %temp.1, %for.inc ]
189 store i32 %temp.0.lcssa, i32* %Output, align 4
190 ret void
191
192 for.body: ; preds = %for.inc, %for.body.lr.ph
193 %0 = phi i32 [ %.pre, %for.body.lr.ph ], [ %2, %for.inc ]
194 %j.027 = phi i32 [ 1, %for.body.lr.ph ], [ %inc, %for.inc ]
195 %temp.026 = phi i32 [ 0, %for.body.lr.ph ], [ %temp.1, %for.inc ]
196 %arrayidx = getelementptr inbounds i32, i32* %Condition, i32 %j.027
197 %1 = load i32, i32* %arrayidx, align 4
198 %cmp1 = icmp ugt i32 %1, 65535
199 %arrayidx2 = getelementptr inbounds i32, i32* %Input, i32 %j.027
200 %2 = load i32, i32* %arrayidx2, align 4
201 %cmp4 = icmp ugt i32 %2, %0
202 br i1 %cmp1, label %if.then, label %if.else
203
204 if.then: ; preds = %for.body
205 %cond = zext i1 %cmp4 to i32
206 %add = add i32 %temp.026, %cond
207 br label %for.inc
208
209 if.else: ; preds = %for.body
210 %not.cmp4 = xor i1 %cmp4, true
211 %sub = sext i1 %not.cmp4 to i32
212 %sub10.sink = add i32 %j.027, %sub
213 %arrayidx11 = getelementptr inbounds i32, i32* %Input, i32 %sub10.sink
214 %3 = load i32, i32* %arrayidx11, align 4
215 %sub13 = sub i32 %temp.026, %3
216 br label %for.inc
217
218 for.inc: ; preds = %if.then, %if.else
219 %temp.1 = phi i32 [ %add, %if.then ], [ %sub13, %if.else ]
220 %inc = add nuw i32 %j.027, 1
221 %exitcond = icmp eq i32 %inc, %MaxJ
222 br i1 %exitcond, label %for.cond.cleanup, label %for.body
223 }
224
225 ;CHECK-LABEL: test_five_blocks
226 ;CHECK-NOT: for.body.epil:
227 ;CHECK: for.body:
228 ;CHECK: if.end:
229 ;CHECK: if.else:
230 ;CHECK: for.inc:
231 ;CHECK-NOT: for.inc.1:
232 define void @test_five_blocks(i32* nocapture %Output,
233 i32* nocapture readonly %Condition,
234 i32* nocapture readonly %Input,
235 i32 %MaxJ) {
236 entry:
237 %cmp24 = icmp ugt i32 %MaxJ, 1
238 br i1 %cmp24, label %for.body.preheader, label %cleanup
239
240 for.body.preheader: ; preds = %entry
241 br label %for.body
242
243 for.body: ; preds = %for.body.preheader, %for.inc
244 %j.026 = phi i32 [ %inc, %for.inc ], [ 1, %for.body.preheader ]
245 %temp.025 = phi i32 [ %temp.1, %for.inc ], [ 0, %for.body.preheader ]
246 %arrayidx = getelementptr inbounds i32, i32* %Input, i32 %j.026
247 %0 = load i32, i32* %arrayidx, align 4
248 %add = add i32 %0, %temp.025
249 %cmp1 = icmp ugt i32 %add, 16777215
250 br i1 %cmp1, label %cleanup, label %if.end
251
252 if.end: ; preds = %for.body
253 %arrayidx2 = getelementptr inbounds i32, i32* %Condition, i32 %j.026
254 %1 = load i32, i32* %arrayidx2, align 4
255 %cmp3 = icmp ugt i32 %1, 65535
256 br i1 %cmp3, label %if.then4, label %if.else
257
258 if.then4: ; preds = %if.end
259 %sub = add i32 %j.026, -1
260 %arrayidx6 = getelementptr inbounds i32, i32* %Input, i32 %sub
261 %2 = load i32, i32* %arrayidx6, align 4
262 %cmp7 = icmp ugt i32 %0, %2
263 %cond = zext i1 %cmp7 to i32
264 %add8 = add i32 %add, %cond
265 br label %for.inc
266
267 if.else: ; preds = %if.end
268 %and = and i32 %add, %0
269 br label %for.inc
270
271 for.inc: ; preds = %if.then4, %if.else
272 %temp.1 = phi i32 [ %add8, %if.then4 ], [ %and, %if.else ]
273 %inc = add nuw i32 %j.026, 1
274 %cmp = icmp ult i32 %inc, %MaxJ
275 br i1 %cmp, label %for.body, label %cleanup
276
277 cleanup: ; preds = %for.inc, %for.body, %entry
278 %temp.2 = phi i32 [ 0, %entry ], [ %add, %for.body ], [ %temp.1, %for.inc ]
279 store i32 %temp.2, i32* %Output, align 4
280 ret void
281 }
282
283 ;CHECK-LABEL: iterate_inc
284 ;CHECK: while.body:
285 ;CHECK: while.end:
286 ;CHECK: while.body.1:
287 ;CHECK: while.body.2:
288 ;CHECK: while.body.3:
289 %struct.Node = type { %struct.Node*, i32 }
290 define void @iterate_inc(%struct.Node* %n, i32 %limit) {
291 entry:
292 %tobool5 = icmp eq %struct.Node* %n, null
293 br i1 %tobool5, label %while.end, label %land.rhs.preheader
294
295 land.rhs.preheader: ; preds = %entry
296 br label %land.rhs
297
298 land.rhs: ; preds = %land.rhs.preheader, %while.body
299 %list.addr.06 = phi %struct.Node* [ %2, %while.body ], [ %n, %land.rhs.preheader ]
300 %val = getelementptr inbounds %struct.Node, %struct.Node* %list.addr.06, i32 0, i32 1
301 %0 = load i32, i32* %val, align 4
302 %cmp = icmp slt i32 %0, %limit
303 br i1 %cmp, label %while.body, label %while.end
304
305 while.body: ; preds = %land.rhs
306 %inc = add nsw i32 %0, 1
307 store i32 %inc, i32* %val, align 4
308 %1 = bitcast %struct.Node* %list.addr.06 to %struct.Node**
309 %2 = load %struct.Node*, %struct.Node** %1, align 4
310 %tobool = icmp eq %struct.Node* %2, null
311 br i1 %tobool, label %while.end, label %land.rhs
312
313 while.end: ; preds = %land.rhs, %while.body, %entry
314 ret void
315 }