[LoopUnroll] Enable option to peel remainder loop

On some targets, the penalty of executing the runtime unrolling checks and then not executing the unrolled loop can be significantly detrimental to performance. This forces the unroll count to be chosen conservatively: keeping the count at 2 reduces the overhead and increases the chance that the unrolled body is actually executed. But being conservative leaves performance gains on the table.

This patch enables the unrolling of the remainder loop introduced by runtime unrolling. This can help reduce the overhead of mis-unrolled loops, because the cost of the not-taken branches is much less than the cost of the back-edge that would normally be executed in the remainder loop. This allows larger unroll factors to be used without suffering performance losses at smaller iteration counts.

Differential Revision: https://reviews.llvm.org/D36309

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@310824 91177308-0d34-0410-b5e6-96231b3b80d8

Sam Parker
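To make the motivation concrete, the sketch below is an illustrative source-level view (not taken from the patch; the dot-product function and the unroll factor of 4 are assumptions) of what runtime unrolling produces before this change: a guard selects between an unrolled body and a small remainder loop, so a short trip count pays for the checks and the remainder loop's back-edge without ever reaching the unrolled body.

// Illustrative C/C++ sketch of runtime unrolling by a factor of 4 as it works
// before this change; 'n' is unknown at compile time.
int dot(const int *a, const int *b, unsigned n) {
  int c = 0;
  unsigned i = 0;
  // Unrolled body: only entered while at least 4 iterations remain.
  for (; i + 4 <= n; i += 4) {
    c += a[i] * b[i];
    c += a[i + 1] * b[i + 1];
    c += a[i + 2] * b[i + 2];
    c += a[i + 3] * b[i + 3];
  }
  // Remainder loop: executes the leftover 0-3 iterations. For small n the
  // guard checks plus this loop's back-edge are pure overhead, which is the
  // penalty this patch targets.
  for (; i < n; ++i)
    c += a[i] * b[i];
  return c;
}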
6 changed files with 121 additions and 14 deletions.
   bool UpperBound;
   /// Allow peeling off loop iterations for loops with low dynamic tripcount.
   bool AllowPeeling;
+  /// Allow unrolling of all the iterations of the runtime loop remainder.
+  bool UnrollRemainder;
 };

 /// \brief Get target-customized preferences for the generic loop unrolling

 bool UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
                 bool AllowRuntime, bool AllowExpensiveTripCount,
                 bool PreserveCondBr, bool PreserveOnlyFirst,
-                unsigned TripMultiple, unsigned PeelCount, LoopInfo *LI,
-                ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
-                OptimizationRemarkEmitter *ORE, bool PreserveLCSSA);
+                unsigned TripMultiple, unsigned PeelCount, bool UnrollRemainder,
+                LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
+                AssumptionCache *AC, OptimizationRemarkEmitter *ORE,
+                bool PreserveLCSSA);

 bool UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
                                 bool AllowExpensiveTripCount,
-                                bool UseEpilogRemainder, LoopInfo *LI,
+                                bool UseEpilogRemainder, bool UnrollRemainder,
+                                LoopInfo *LI,
                                 ScalarEvolution *SE, DominatorTree *DT,
+                                AssumptionCache *AC,
+                                OptimizationRemarkEmitter *ORE,
                                 bool PreserveLCSSA);

 void computePeelCount(Loop *L, unsigned LoopSize,

     cl::desc("Allows loops to be peeled when the dynamic "
              "trip count is known to be low."));

+static cl::opt<bool> UnrollUnrollRemainder(
+    "unroll-remainder", cl::Hidden,
+    cl::desc("Allow the loop remainder to be unrolled."));
+
 // This option isn't ever intended to be enabled, it serves to allow
 // experiments to check the assumptions about when this kind of revisit is
 // necessary.

   UP.Partial = false;
   UP.Runtime = false;
   UP.AllowRemainder = true;
+  UP.UnrollRemainder = false;
   UP.AllowExpensiveTripCount = false;
   UP.Force = false;
   UP.UpperBound = false;

   UP.UpperBound = false;
   if (UnrollAllowPeeling.getNumOccurrences() > 0)
     UP.AllowPeeling = UnrollAllowPeeling;
+  if (UnrollUnrollRemainder.getNumOccurrences() > 0)
+    UP.UnrollRemainder = UnrollUnrollRemainder;

   // Apply user values provided by argument
   if (UserThreshold.hasValue()) {

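UP.UnrollRemainder therefore defaults to off, and in this patch only the hidden -unroll-remainder flag turns it on. A target could also opt in from its unrolling-preferences hook; below is a minimal sketch under that assumption, with MyTTIImpl a hypothetical target class (no target is changed by this patch).

// Hypothetical target hook (MyTTIImpl is an assumed name); it opts in to
// unrolling the runtime remainder loop by default for its loops.
void MyTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                        TTI::UnrollingPreferences &UP) {
  BaseT::getUnrollingPreferences(L, SE, UP);
  UP.Runtime = true;          // allow runtime unrolling of unknown trip counts
  UP.UnrollRemainder = true;  // fully unroll the remainder loop it introduces
}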
   // Unroll the loop.
   if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime,
                   UP.AllowExpensiveTripCount, UseUpperBound, MaxOrZero,
-                  TripMultiple, UP.PeelCount, LI, &SE, &DT, &AC, &ORE,
+                  TripMultiple, UP.PeelCount, UP.UnrollRemainder,
+                  LI, &SE, &DT, &AC, &ORE,
                   PreserveLCSSA))
     return false;

 bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
                       bool AllowRuntime, bool AllowExpensiveTripCount,
                       bool PreserveCondBr, bool PreserveOnlyFirst,
-                      unsigned TripMultiple, unsigned PeelCount, LoopInfo *LI,
+                      unsigned TripMultiple, unsigned PeelCount,
+                      bool UnrollRemainder, LoopInfo *LI,
                       ScalarEvolution *SE, DominatorTree *DT,
                       AssumptionCache *AC, OptimizationRemarkEmitter *ORE,
                       bool PreserveLCSSA) {


   if (RuntimeTripCount && TripMultiple % Count != 0 &&
       !UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount,
-                                  EpilogProfitability, LI, SE, DT,
+                                  EpilogProfitability, UnrollRemainder,
+                                  LI, SE, DT, AC, ORE,
                                   PreserveLCSSA)) {
     if (Force)
       RuntimeTripCount = false;

 /// Return the new cloned loop that is created when CreateRemainderLoop is true.
 static Loop *
 CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop,
-                const bool UseEpilogRemainder, BasicBlock *InsertTop,
+                const bool UseEpilogRemainder, const bool UnrollRemainder,
+                BasicBlock *InsertTop,
                 BasicBlock *InsertBot, BasicBlock *Preheader,
                 std::vector<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
                 ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI) {

   }

   LLVMContext &Context = NewLoop->getHeader()->getContext();
-  SmallVector<Metadata *, 4> DisableOperands;
-  DisableOperands.push_back(MDString::get(Context, "llvm.loop.unroll.disable"));
-  MDNode *DisableNode = MDNode::get(Context, DisableOperands);
-  MDs.push_back(DisableNode);
+  if (!UnrollRemainder) {
+    SmallVector<Metadata *, 4> DisableOperands;
+    DisableOperands.push_back(MDString::get(Context,
+                                            "llvm.loop.unroll.disable"));
+    MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+    MDs.push_back(DisableNode);
+  }

   MDNode *NewLoopID = MDNode::get(Context, MDs);
   // Set operand 0 to refer to the loop id itself.

 bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
                                       bool AllowExpensiveTripCount,
                                       bool UseEpilogRemainder,
+                                      bool UnrollRemainder,
                                       LoopInfo *LI, ScalarEvolution *SE,
-                                      DominatorTree *DT, bool PreserveLCSSA) {
+                                      DominatorTree *DT, AssumptionCache *AC,
+                                      OptimizationRemarkEmitter *ORE,
+                                      bool PreserveLCSSA) {
   DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
   DEBUG(L->dump());

   BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit;
   BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
   Loop *remainderLoop = CloneLoopBlocks(
-      L, ModVal, CreateRemainderLoop, UseEpilogRemainder, InsertTop, InsertBot,
+      L, ModVal, CreateRemainderLoop, UseEpilogRemainder, UnrollRemainder,
+      InsertTop, InsertBot,
       NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI);

   // Insert the cloned blocks into the function.

     formDedicatedExitBlocks(remainderLoop, DT, LI, PreserveLCSSA);
   }

+  if (remainderLoop && UnrollRemainder) {
+    UnrollLoop(remainderLoop, /*Count*/Count - 1, /*TripCount*/Count - 1,
+               /*Force*/false, /*AllowRuntime*/false,
+               /*AllowExpensiveTripCount*/false, /*PreserveCondBr*/true,
+               /*PreserveOnlyFirst*/false, /*TripMultiple*/1,
+               /*PeelCount*/0, /*UnrollRemainder*/false, LI, SE, DT, AC, ORE,
+               PreserveLCSSA);
+  }
+
   NumRuntimeUnrolled++;
   return true;
 }
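The call above fully unrolls the cloned remainder loop into at most Count-1 conditionally executed copies of the body, keeping the per-iteration exit branches. In source-level terms, the earlier dot() sketch would roughly become the following; this is an illustrative sketch rather than the pass output, and the new test below checks the actual epilogue blocks that the pass emits.

// Illustrative continuation of the dot() sketch: with -unroll-count=4 and
// -unroll-remainder the leftover 0-3 iterations become straight-line, guarded
// copies instead of a loop, so no remainder back-edge is executed.
int dot_unrolled_remainder(const int *a, const int *b, unsigned n) {
  int c = 0;
  unsigned i = 0;
  for (; i + 4 <= n; i += 4) {             // unrolled main body, factor 4
    c += a[i] * b[i];
    c += a[i + 1] * b[i + 1];
    c += a[i + 2] * b[i + 2];
    c += a[i + 3] * b[i + 3];
  }
  // Unrolled epilogue: at most Count-1 = 3 guarded iterations, no back-edge.
  if (i < n) { c += a[i] * b[i]; ++i; }
  if (i < n) { c += a[i] * b[i]; ++i; }
  if (i < n) { c += a[i] * b[i]; }
  return c;
}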
; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-count=4 -unroll-remainder -instcombine | FileCheck %s

; CHECK-LABEL: unroll
define i32 @unroll(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp9 = icmp eq i32 %N, 0
  br i1 %cmp9, label %for.cond.cleanup, label %for.body.lr.ph

for.body.lr.ph:
  %wide.trip.count = zext i32 %N to i64
  br label %for.body

for.cond.cleanup:
  %c.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
  ret i32 %c.0.lcssa

; CHECK-LABEL: for.body.lr.ph
; CHECK: [[COUNT:%[a-z.0-9]+]] = add nsw i64 %wide.trip.count, -1
; CHECK: %xtraiter = and i64 %wide.trip.count, 3
; CHECK: [[CMP:%[a-z.0-9]+]] = icmp ult i64 [[COUNT]], 3
; CHECK: br i1 [[CMP]], label %[[CLEANUP:.*]], label %for.body.lr.ph.new

; CHECK-LABEL: for.body.lr.ph.new:
; CHECK: %unroll_iter = sub nsw i64 %wide.trip.count, %xtraiter
; CHECK: br label %for.body

; CHECK: [[CLEANUP]]:
; CHECK: [[MOD:%[a-z.0-9]+]] = icmp eq i64 %xtraiter, 0
; CHECK: br i1 [[MOD]], label %[[EXIT:.*]], label %[[EPIL_PEEL0_PRE:.*]]

; CHECK: [[EPIL_PEEL0_PRE]]:
; CHECK: br label %[[EPIL_PEEL0:.*]]

; CHECK: [[EPIL_PEEL0]]:
; CHECK: [[PEEL_CMP0:%[a-z.0-9]+]] = icmp eq i64 %xtraiter, 1
; CHECK: br i1 [[PEEL_CMP0]], label %[[EPIL_EXIT:.*]], label %[[EPIL_PEEL1:.*]],

; CHECK: [[EPIL_EXIT]]:
; CHECK: br label %[[EXIT]]

; CHECK: [[EXIT]]:
; CHECK: ret i32

; CHECK-LABEL: for.body:
; CHECK: [[INDVAR0:%[a-z.0-9]+]] = phi i64 [ 0, %for.body.lr.ph
; CHECK: [[ITER:%[a-z.0-9]+]] = phi i64 [ %unroll_iter
; CHECK: or i64 [[INDVAR0]], 1
; CHECK: or i64 [[INDVAR0]], 2
; CHECK: or i64 [[INDVAR0]], 3
; CHECK: add nsw i64 [[INDVAR0]], 4
; CHECK: [[SUB:%[a-z.0-9]+]] = add i64 [[ITER]], -4
; CHECK: [[ITER_CMP:%[a-z.0-9]+]] = icmp eq i64 [[SUB]], 0
; CHECK: br i1 [[ITER_CMP]], label %[[LOOP_EXIT:.*]], label %for.body

; CHECK: [[EPIL_PEEL1]]:
; CHECK: [[PEEL_CMP1:%[a-z.0-9]+]] = icmp eq i64 %xtraiter, 2
; CHECK: br i1 [[PEEL_CMP1]], label %[[EPIL_EXIT]], label %[[EPIL_PEEL2:.*]],

; CHECK: [[EPIL_PEEL2]]:
; CHECK: br label %[[EXIT]]

for.body:
  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
  %c.010 = phi i32 [ 0, %for.body.lr.ph ], [ %add, %for.body ]
  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
  %0 = load i32, i32* %arrayidx, align 4
  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
  %1 = load i32, i32* %arrayidx2, align 4
  %mul = mul nsw i32 %1, %0
  %add = add nsw i32 %mul, %c.010
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.cond.cleanup, label %for.body
}