llvm.org GIT mirror llvm / 7bac219
[PGO] Enhance pgo counter promotion This is an incremental change to the promotion feature. There are two problems with the current behavior: 1) promotion is totally disabled for loops with multiple exiting blocks 2) a counter update can only be promoted one level up in the loop nest -- which does not help much for short trip-count inner loops inside high trip-count outer loops. Due to this limitation, we still saw very large profile count fluctuations from run to run for the affected loops, which are usually very hot. This patch adds support for promoting counters iteratively across the loop nest. It also turns on the promotion for loops with multiple exiting blocks (with a limit). For single-threaded applications, the performance impact is flat on average. For instance, dealII improves, but povray regresses. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@307863 91177308-0d34-0410-b5e6-96231b3b80d8 Xinliang David Li 3 years ago
4 changed file(s) with 284 addition(s) and 46 deletion(s). Raw diff Collapse all Expand all
111111 cl::desc("Do counter register promotion"),
112112 cl::init(false));
113113 cl::opt MaxNumOfPromotionsPerLoop(
114 cl::ZeroOrMore, "max-counter-promotions-per-loop", cl::init(10),
114 cl::ZeroOrMore, "max-counter-promotions-per-loop", cl::init(20),
115115 cl::desc("Max number counter promotions per loop to avoid"
116116 " increasing register pressure too much"));
117117
120120 MaxNumOfPromotions(cl::ZeroOrMore, "max-counter-promotions", cl::init(-1),
121121 cl::desc("Max number of allowed counter promotions"));
122122
123 cl::opt SpeculativeCounterPromotion(
124 cl::ZeroOrMore, "speculative-counter-promotion", cl::init(false),
125 cl::desc("Allow counter promotion for loops with multiple exiting blocks "
126 " or top-tested loops. "));
123 cl::opt SpeculativeCounterPromotionMaxExiting(
124 cl::ZeroOrMore, "speculative-counter-promotion-max-exiting", cl::init(3),
125 cl::desc("The max number of exiting blocks of a loop to allow "
126 " speculative counter promotion"));
127
128 cl::opt SpeculativeCounterPromotionToLoop(
129 cl::ZeroOrMore, "speculative-counter-promotion-to-loop", cl::init(false),
130 cl::desc("When the option is false, if the target block is in a loop, "
131 "the promotion will be disallowed unless the promoted counter "
132 " update can be further/iteratively promoted into an acyclic "
133 " region."));
134
135 cl::opt IterativeCounterPromotion(
136 cl::ZeroOrMore, "iterative-counter-promotion", cl::init(true),
137 cl::desc("Allow counter promotion across the whole loop nest."));
127138
128139 class InstrProfilingLegacyPass : public ModulePass {
129140 InstrProfiling InstrProf;
149160 }
150161 };
151162
163 ///
152164 /// A helper class to promote one counter RMW operation in the loop
153165 /// into register update.
154166 ///
157169 ///
158170 class PGOCounterPromoterHelper : public LoadAndStorePromoter {
159171 public:
160 PGOCounterPromoterHelper(Instruction *L, Instruction *S, SSAUpdater &SSA,
161 Value *Init, BasicBlock *PH,
162 ArrayRef ExitBlocks,
163 ArrayRef InsertPts)
172 PGOCounterPromoterHelper(
173 Instruction *L, Instruction *S, SSAUpdater &SSA, Value *Init,
174 BasicBlock *PH, ArrayRef ExitBlocks,
175 ArrayRef InsertPts,
176 DenseMap> &LoopToCands,
177 LoopInfo &LI)
164178 : LoadAndStorePromoter({L, S}, SSA), Store(S), ExitBlocks(ExitBlocks),
165 InsertPts(InsertPts) {
179 InsertPts(InsertPts), LoopToCandidates(LoopToCands), LI(LI) {
166180 assert(isa(L));
167181 assert(isa(S));
168182 SSA.AddAvailableValue(PH, Init);
169183 }
184
170185 void doExtraRewritesBeforeFinalDeletion() const override {
171186 for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
172187 BasicBlock *ExitBlock = ExitBlocks[i];
178193 Value *Addr = cast(Store)->getPointerOperand();
179194 IRBuilder<> Builder(InsertPos);
180195 if (AtomicCounterUpdatePromoted)
196 // atomic update currently can only be promoted across the current
197 // loop, not the whole loop nest.
181198 Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, LiveInValue,
182199 AtomicOrdering::SequentiallyConsistent);
183200 else {
184201 LoadInst *OldVal = Builder.CreateLoad(Addr, "pgocount.promoted");
185202 auto *NewVal = Builder.CreateAdd(OldVal, LiveInValue);
186 Builder.CreateStore(NewVal, Addr);
203 auto *NewStore = Builder.CreateStore(NewVal, Addr);
204
205 // Now update the parent loop's candidate list:
206 if (IterativeCounterPromotion) {
207 auto *TargetLoop = LI.getLoopFor(ExitBlock);
208 if (TargetLoop)
209 LoopToCandidates[TargetLoop].emplace_back(OldVal, NewStore);
210 }
187211 }
188212 }
189213 }
192216 Instruction *Store;
193217 ArrayRef ExitBlocks;
194218 ArrayRef InsertPts;
219 DenseMap> &LoopToCandidates;
220 LoopInfo &LI;
195221 };
196222
197223 /// A helper class to do register promotion for all profile counter
199225 ///
200226 class PGOCounterPromoter {
201227 public:
202 PGOCounterPromoter(ArrayRef Cands, Loop &Loop)
203 : Candidates(Cands), ExitBlocks(), InsertPts(), ParentLoop(Loop) {
228 PGOCounterPromoter(
229 DenseMap> &LoopToCands,
230 Loop &CurLoop, LoopInfo &LI)
231 : LoopToCandidates(LoopToCands), ExitBlocks(), InsertPts(), L(CurLoop),
232 LI(LI) {
204233
205234 SmallVector LoopExitBlocks;
206235 SmallPtrSet BlockSet;
207 ParentLoop.getExitBlocks(LoopExitBlocks);
236 L.getExitBlocks(LoopExitBlocks);
208237
209238 for (BasicBlock *ExitBlock : LoopExitBlocks) {
210239 if (BlockSet.insert(ExitBlock).second) {
215244 }
216245
217246 bool run(int64_t *NumPromoted) {
218 // We can't insert into a catchswitch.
219 bool HasCatchSwitch = llvm::any_of(ExitBlocks, [](BasicBlock *Exit) {
220 return isa(Exit->getTerminator());
221 });
222
223 if (HasCatchSwitch)
247 unsigned MaxProm = getMaxNumOfPromotionsInLoop(&L);
248 if (MaxProm == 0)
224249 return false;
225250
226 if (!ParentLoop.hasDedicatedExits())
227 return false;
228
229 BasicBlock *PH = ParentLoop.getLoopPreheader();
230 if (!PH)
231 return false;
232
233 BasicBlock *H = ParentLoop.getHeader();
234 bool TopTested =
235 ((ParentLoop.getBlocks().size() > 1) && ParentLoop.isLoopExiting(H));
236 if (!SpeculativeCounterPromotion &&
237 (TopTested || ParentLoop.getExitingBlock() == nullptr))
238 return false;
239
240251 unsigned Promoted = 0;
241 for (auto &Cand : Candidates) {
252 for (auto &Cand : LoopToCandidates[&L]) {
242253
243254 SmallVector NewPHIs;
244255 SSAUpdater SSA(&NewPHIs);
245256 Value *InitVal = ConstantInt::get(Cand.first->getType(), 0);
257
246258 PGOCounterPromoterHelper Promoter(Cand.first, Cand.second, SSA, InitVal,
247 PH, ExitBlocks, InsertPts);
259 L.getLoopPreheader(), ExitBlocks,
260 InsertPts, LoopToCandidates, LI);
248261 Promoter.run(SmallVector({Cand.first, Cand.second}));
249262 Promoted++;
250 if (Promoted >= MaxNumOfPromotionsPerLoop)
263 if (Promoted >= MaxProm)
251264 break;
265
252266 (*NumPromoted)++;
253267 if (MaxNumOfPromotions != -1 && *NumPromoted >= MaxNumOfPromotions)
254268 break;
255269 }
256270
257271 DEBUG(dbgs() << Promoted << " counters promoted for loop (depth="
258 << ParentLoop.getLoopDepth() << ")\n");
272 << L.getLoopDepth() << ")\n");
259273 return Promoted != 0;
260274 }
261275
262276 private:
263 ArrayRef Candidates;
277 bool allowSpeculativeCounterPromotion(Loop *LP) {
278 SmallVector ExitingBlocks;
279 L.getExitingBlocks(ExitingBlocks);
280 // Not considered speculative.
281 if (ExitingBlocks.size() == 1)
282 return true;
283 if (ExitingBlocks.size() > SpeculativeCounterPromotionMaxExiting)
284 return false;
285 return true;
286 }
287
288 // Returns the max number of Counter Promotions for LP.
289 unsigned getMaxNumOfPromotionsInLoop(Loop *LP) {
290 // We can't insert into a catchswitch.
291 SmallVector LoopExitBlocks;
292 LP->getExitBlocks(LoopExitBlocks);
293 if (llvm::any_of(LoopExitBlocks, [](BasicBlock *Exit) {
294 return isa(Exit->getTerminator());
295 }))
296 return 0;
297
298 if (!LP->hasDedicatedExits())
299 return 0;
300
301 BasicBlock *PH = LP->getLoopPreheader();
302 if (!PH)
303 return 0;
304
305 SmallVector ExitingBlocks;
306 LP->getExitingBlocks(ExitingBlocks);
307 // Not considered speculative.
308 if (ExitingBlocks.size() == 1)
309 return MaxNumOfPromotionsPerLoop;
310
311 if (ExitingBlocks.size() > SpeculativeCounterPromotionMaxExiting)
312 return 0;
313
314 // Whether the target block is in a loop does not matter:
315 if (SpeculativeCounterPromotionToLoop)
316 return MaxNumOfPromotionsPerLoop;
317
318 // Now check the target block:
319 unsigned MaxProm = MaxNumOfPromotionsPerLoop;
320 for (auto *TargetBlock : LoopExitBlocks) {
321 auto *TargetLoop = LI.getLoopFor(TargetBlock);
322 if (!TargetLoop)
323 continue;
324 unsigned MaxPromForTarget = getMaxNumOfPromotionsInLoop(TargetLoop);
325 unsigned PendingCandsInTarget = LoopToCandidates[TargetLoop].size();
326 MaxProm =
327 std::min(MaxProm, std::max(MaxPromForTarget, PendingCandsInTarget) -
328 PendingCandsInTarget);
329 }
330 return MaxProm;
331 }
332
333 DenseMap> &LoopToCandidates;
264334 SmallVector ExitBlocks;
265335 SmallVector InsertPts;
266 Loop &ParentLoop;
336 Loop &L;
337 LoopInfo &LI;
267338 };
268339
269340 } // end anonymous namespace
348419
349420 SmallVector Loops = LI.getLoopsInPreorder();
350421
351 for (auto *Loop : Loops) {
352 PGOCounterPromoter Promoter(LoopPromotionCandidates[Loop], *Loop);
422 // Do a post-order traversal of the loops so that counter updates can be
423 // iteratively hoisted outside the loop nest.
424 for (auto *Loop : llvm::reverse(Loops)) {
425 PGOCounterPromoter Promoter(LoopPromotionCandidates, *Loop, LI);
353426 Promoter.run(&TotalCountersPromoted);
354427 }
355428 }
None ; RUN: opt < %s -instrprof -do-counter-promotion=true -speculative-counter-promotion -S | FileCheck --check-prefix=PROMO %s
1 ; RUN: opt < %s --passes=instrprof -do-counter-promotion=true -speculative-counter-promotion -S | FileCheck --check-prefix=PROMO %s
0 ; RUN: opt < %s -instrprof -do-counter-promotion=true -speculative-counter-promotion-max-exiting=3 -S | FileCheck --check-prefix=PROMO %s
1 ; RUN: opt < %s --passes=instrprof -do-counter-promotion=true -speculative-counter-promotion-max-exiting=3 -S | FileCheck --check-prefix=PROMO %s
22
33 $__llvm_profile_raw_version = comdat any
44
None ; RUN: opt < %s -pgo-instr-gen -instrprof -do-counter-promotion=true -speculative-counter-promotion -S | FileCheck --check-prefix=PROMO %s
1 ; RUN: opt < %s --passes=pgo-instr-gen,instrprof -do-counter-promotion=true -speculative-counter-promotion -S | FileCheck --check-prefix=PROMO %s
0 ; RUN: opt < %s -pgo-instr-gen -instrprof -do-counter-promotion=true -speculative-counter-promotion-max-exiting=3 -S | FileCheck --check-prefix=PROMO %s
1 ; RUN: opt < %s --passes=pgo-instr-gen,instrprof -do-counter-promotion=true -speculative-counter-promotion-max-exiting=3 -S | FileCheck --check-prefix=PROMO %s
22
33 @g = common local_unnamed_addr global i32 0, align 4
44
0 ; TEST that counter updates are promoted outside the whole loop nest
1 ; RUN: opt < %s -pgo-instr-gen -instrprof -do-counter-promotion=true -S | FileCheck --check-prefix=PROMO %s
2 ; RUN: opt < %s --passes=pgo-instr-gen,instrprof -do-counter-promotion=true -S | FileCheck --check-prefix=PROMO %s
3
4 @g = common local_unnamed_addr global i32 0, align 4
5 @c = local_unnamed_addr global i32 10, align 4
6
7 ; Function Attrs: noinline norecurse nounwind uwtable
8 define void @bar() local_unnamed_addr #0 {
9 bb:
10 %tmp2 = load i32, i32* @g, align 4, !tbaa !2
11 %tmp3 = add nsw i32 %tmp2, 1
12 store i32 %tmp3, i32* @g, align 4, !tbaa !2
13 ret void
14 }
15
16 ; Function Attrs: norecurse nounwind uwtable
17 define i32 @main() local_unnamed_addr #1 {
18 bb:
19 store i32 0, i32* @g, align 4, !tbaa !2
20 %tmp = load i32, i32* @c, align 4, !tbaa !2
21 %tmp1 = icmp sgt i32 %tmp, 0
22 br i1 %tmp1, label %bb2_1, label %bb84
23
24 bb2_1:
25 br label %bb2
26
27 bb2: ; preds = %bb39, %bb
28 %tmp3 = phi i32 [ %tmp40, %bb39 ], [ %tmp, %bb2_1 ]
29 %tmp5 = phi i32 [ %tmp43, %bb39 ], [ 0, %bb2_1 ]
30 %tmp7 = icmp sgt i32 %tmp3, 0
31 br i1 %tmp7, label %bb14_1, label %bb39
32
33 bb8: ; preds = %bb39
34 ; PROMO-LABEL: bb8
35 ; PROMO: load {{.*}} @__profc_main{{.*}}
36 ; PROMO-NEXT: add
37 ; PROMO-NEXT: store {{.*}}@__profc_main{{.*}}
38 ; PROMO-NEXT: load {{.*}} @__profc_main{{.*}}
39 ; PROMO-NEXT: add
40 ; PROMO-NEXT: store {{.*}}@__profc_main{{.*}}
41 ; PROMO-NEXT: load {{.*}} @__profc_main{{.*}}
42 ; PROMO-NEXT: add
43 ; PROMO-NEXT: store {{.*}}@__profc_main{{.*}}
44 ; PROMO-NEXT: load {{.*}} @__profc_main{{.*}}
45 ; PROMO-NEXT: add
46 ; PROMO-NEXT: store {{.*}}@__profc_main{{.*}}
47 ; PROMO-NEXT: load {{.*}} @__profc_main{{.*}}
48 ; PROMO-NEXT: add
49 ; PROMO-NEXT: store {{.*}}@__profc_main{{.*}}
50
51 %tmp13 = icmp sgt i32 %tmp40, 0
52 br i1 %tmp13, label %bb45, label %bb84
53
54 bb14_1:
55 br label %bb14
56
57 bb14: ; preds = %bb29, %bb2
58 %tmp15 = phi i32 [ %tmp30, %bb29 ], [ %tmp3, %bb14_1 ]
59 %tmp16 = phi i64 [ %tmp31, %bb29 ], [ 0, %bb14_1 ]
60 %tmp17 = phi i64 [ %tmp32, %bb29 ], [ 0, %bb14_1 ]
61 %tmp18 = phi i32 [ %tmp33, %bb29 ], [ 0, %bb14_1 ]
62 %tmp19 = icmp sgt i32 %tmp15, 0
63 br i1 %tmp19, label %bb20_split, label %bb29
64
65 bb20_split:
66 br label %bb20
67
68 bb20: ; preds = %bb20, %bb14
69 %tmp21 = phi i64 [ %tmp23, %bb20 ], [ 0, %bb20_split ]
70 %tmp22 = phi i32 [ %tmp24, %bb20 ], [ 0, %bb20_split ]
71 %tmp23 = add nuw i64 %tmp21, 1
72 tail call void @bar()
73 %tmp24 = add nuw nsw i32 %tmp22, 1
74 %tmp25 = load i32, i32* @c, align 4, !tbaa !2
75 %tmp26 = icmp slt i32 %tmp24, %tmp25
76 br i1 %tmp26, label %bb20, label %bb27
77
78 bb27: ; preds = %bb20
79 %tmp28 = add i64 %tmp23, %tmp16
80 br label %bb29
81
82 bb29: ; preds = %bb27, %bb14
83 %tmp30 = phi i32 [ %tmp25, %bb27 ], [ %tmp15, %bb14 ]
84 %tmp31 = phi i64 [ %tmp28, %bb27 ], [ %tmp16, %bb14 ]
85 %tmp32 = add nuw i64 %tmp17, 1
86 %tmp33 = add nuw nsw i32 %tmp18, 1
87 %tmp34 = icmp slt i32 %tmp33, %tmp30
88 br i1 %tmp34, label %bb14, label %bb35
89
90 bb35: ; preds = %bb29
91 %tmp36 = insertelement <2 x i64> undef, i64 %tmp31, i32 0
92 br label %bb39
93
94 bb39: ; preds = %bb35, %bb2
95 %tmp40 = phi i32 [ %tmp30, %bb35 ], [ %tmp3, %bb2 ]
96 %tmp43 = add nuw nsw i32 %tmp5, 1
97 %tmp44 = icmp slt i32 %tmp43, %tmp40
98 br i1 %tmp44, label %bb2, label %bb8
99
100 bb45: ; preds = %bb67, %bb8
101 %tmp46 = phi i32 [ %tmp68, %bb67 ], [ %tmp40, %bb8 ]
102 %tmp47 = phi i64 [ %tmp69, %bb67 ], [ 0, %bb8 ]
103 %tmp48 = phi i64 [ %tmp70, %bb67 ], [ 0, %bb8 ]
104 %tmp49 = phi i32 [ %tmp71, %bb67 ], [ 0, %bb8 ]
105 %tmp50 = icmp sgt i32 %tmp46, 0
106 br i1 %tmp50, label %bb57, label %bb67
107
108 bb51: ; preds = %bb67
109 %tmp56 = icmp sgt i32 %tmp68, 0
110 br i1 %tmp56, label %bb73, label %bb84
111
112 bb57: ; preds = %bb57, %bb45
113 %tmp58 = phi i64 [ %tmp60, %bb57 ], [ 0, %bb45 ]
114 %tmp59 = phi i32 [ %tmp61, %bb57 ], [ 0, %bb45 ]
115 %tmp60 = add nuw i64 %tmp58, 1
116 tail call void @bar()
117 %tmp61 = add nuw nsw i32 %tmp59, 1
118 %tmp62 = load i32, i32* @c, align 4, !tbaa !2
119 %tmp63 = mul nsw i32 %tmp62, 10
120 %tmp64 = icmp slt i32 %tmp61, %tmp63
121 br i1 %tmp64, label %bb57, label %bb65
122
123 bb65: ; preds = %bb57
124 %tmp66 = add i64 %tmp60, %tmp47
125 br label %bb67
126
127 bb67: ; preds = %bb65, %bb45
128 %tmp68 = phi i32 [ %tmp62, %bb65 ], [ %tmp46, %bb45 ]
129 %tmp69 = phi i64 [ %tmp66, %bb65 ], [ %tmp47, %bb45 ]
130 %tmp70 = add nuw i64 %tmp48, 1
131 %tmp71 = add nuw nsw i32 %tmp49, 1
132 %tmp72 = icmp slt i32 %tmp71, %tmp68
133 br i1 %tmp72, label %bb45, label %bb51
134
135 bb73: ; preds = %bb73, %bb51
136 %tmp74 = phi i64 [ %tmp76, %bb73 ], [ 0, %bb51 ]
137 %tmp75 = phi i32 [ %tmp77, %bb73 ], [ 0, %bb51 ]
138 %tmp76 = add nuw i64 %tmp74, 1
139 tail call void @bar()
140 %tmp77 = add nuw nsw i32 %tmp75, 1
141 %tmp78 = load i32, i32* @c, align 4, !tbaa !2
142 %tmp79 = mul nsw i32 %tmp78, 100
143 %tmp80 = icmp slt i32 %tmp77, %tmp79
144 br i1 %tmp80, label %bb73, label %bb81
145
146 bb81: ; preds = %bb73
147 br label %bb84
148
149 bb84: ; preds = %bb81, %bb51, %bb8, %bb
150 ret i32 0
151 }
152
153 attributes #0 = { noinline }
154 attributes #1 = { norecurse nounwind uwtable }
155
156 !llvm.module.flags = !{!0}
157 !llvm.ident = !{!1}
158
159 !0 = !{i32 1, !"wchar_size", i32 4}
160 !1 = !{!"clang version 5.0.0 (trunk 307355)"}
161 !2 = !{!3, !3, i64 0}
162 !3 = !{!"int", !4, i64 0}
163 !4 = !{!"omnipotent char", !5, i64 0}
164 !5 = !{!"Simple C/C++ TBAA"}