llvm.org GIT mirror llvm / bc3b9e5
Implement callsite-hotness based inline cost for Sample-based PGO Summary: For sample-based PGO, using BFI to calculate callsite count is sometimes not accurate. This is because with a sampling-based approach, if a callsite resides in a hot loop deeply nested in a bunch of cold branches, the callsite's BFI frequency would be inaccurately calculated due to lack of samples in the cold branches. E.g. if (A1 && A2 && A3 && ..... && A10) { for (i=0; i < 100000000; i++) { callsite(); } } Assume that A1 to A10 are all 100% taken, and callsite has 1000 samples and thus is considered hot. Because the loop's trip count is huge, it's normal that all branches outside the loop have no samples at all. As a result, we can only use static branch probability to derive the frequency of the loop header. Assuming that the static heuristic thinks each branch is 50% taken, then the count calculated from BFI will be 1/(2^10) of the actual value. In order to get a more accurate callsite count, we directly annotate the weight on the call instruction, and directly use it when checking callsite hotness. Note that this mechanism can also be shared by instrumentation-based callsite hotness analysis. The side benefit is that it breaks the dependency from Inliner to BFI as the call count is embedded in the IR. Reviewers: davidxl, eraman, dnovillo Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D22118 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@275073 91177308-0d34-0410-b5e6-96231b3b80d8 Dehao Chen 4 years ago
5 changed file(s) with 103 addition(s) and 1 deletion(s). Raw diff Collapse all Expand all
221221 /// Returns false if no metadata or invalid metadata was found.
222222 bool extractProfMetadata(uint64_t &TrueVal, uint64_t &FalseVal);
223223
224 /// Retrieve the total raw profile weight of a branch, select or call.
225 /// Returns true on success with the sum of all weights filled in.
226 /// Returns false if no metadata or invalid metadata was found.
227 bool extractProfTotalWeight(uint64_t &TotalVal);
228
224229 /// Set the debug location information for this instruction.
225230 void setDebugLoc(DebugLoc Loc) { DbgLoc = std::move(Loc); }
226231
632632 Threshold = OptSizeThreshold;
633633 }
634634
635 bool HotCallsite = false;
636 uint64_t TotalWeight;
637 if (CS.getInstruction()->extractProfTotalWeight(TotalWeight) &&
638 PSI->isHotCount(TotalWeight))
639 HotCallsite = true;
640
635641 // Listen to the inlinehint attribute or profile based hotness information
636642 // when it would increase the threshold and the caller does not need to
637643 // minimize its size.
638644 bool InlineHint = Callee.hasFnAttribute(Attribute::InlineHint) ||
639 PSI->isHotFunction(&Callee);
645 PSI->isHotFunction(&Callee) ||
646 HotCallsite;
640647 if (InlineHint && HintThreshold > Threshold && !Caller->optForMinSize())
641648 Threshold = HintThreshold;
642649
13111311 return true;
13121312 }
13131313
1314 bool Instruction::extractProfTotalWeight(uint64_t &TotalVal) {
1315 assert((getOpcode() == Instruction::Br ||
1316 getOpcode() == Instruction::Select ||
1317 getOpcode() == Instruction::Call) &&
1318 "Looking for branch weights on something besides branch");
1319
1320 TotalVal = 0;
1321 auto *ProfileData = getMetadata(LLVMContext::MD_prof);
1322 if (!ProfileData)
1323 return false;
1324
1325 auto *ProfDataName = dyn_cast(ProfileData->getOperand(0));
1326 if (!ProfDataName || !ProfDataName->getString().equals("branch_weights"))
1327 return false;
1328
1329 TotalVal = 0;
1330 for (int i = 1; i < ProfileData->getNumOperands(); i++) {
1331 auto *V = mdconst::dyn_extract(ProfileData->getOperand(i));
1332 if (!V)
1333 return false;
1334 TotalVal += V->getValue().getZExtValue();
1335 }
1336 return true;
1337 }
1338
13141339 void Instruction::clearMetadataHashEntries() {
13151340 assert(hasMetadataHashEntry() && "Caller should check");
13161341 getContext().pImpl->InstructionMetadata.erase(this);
986986 MDBuilder MDB(Ctx);
987987 for (auto &BI : F) {
988988 BasicBlock *BB = &BI;
989
990 if (BlockWeights[BB]) {
991 for (auto &I : BB->getInstList()) {
992 if (CallInst *CI = dyn_cast(&I)) {
993 if (!dyn_cast(&I)) {
994 SmallVector Weights;
995 Weights.push_back(BlockWeights[BB]);
996 CI->setMetadata(LLVMContext::MD_prof,
997 MDB.createBranchWeights(Weights));
998 }
999 }
1000 }
1001 }
9891002 TerminatorInst *TI = BB->getTerminator();
9901003 if (TI->getNumSuccessors() == 1)
9911004 continue;
0 ; RUN: opt < %s -inline -inline-threshold=0 -inlinehint-threshold=100 -S | FileCheck %s
1
2 ; This tests that a hot callsite gets the (higher) inlinehint-threshold even
3 ; without inline hints and gets inlined because the cost is less than
4 ; inlinehint-threshold. A cold callee with identical body does not get inlined because
5 ; cost exceeds the inline-threshold.
6
7 define i32 @callee1(i32 %x) {
8 %x1 = add i32 %x, 1
9 %x2 = add i32 %x1, 1
10 %x3 = add i32 %x2, 1
11
12 ret i32 %x3
13 }
14
15 define i32 @callee2(i32 %x) {
16 ; CHECK-LABEL: @callee2(
17 %x1 = add i32 %x, 1
18 %x2 = add i32 %x1, 1
19 %x3 = add i32 %x2, 1
20
21 ret i32 %x3
22 }
23
24 define i32 @caller2(i32 %y1) {
25 ; CHECK-LABEL: @caller2(
26 ; CHECK: call i32 @callee2
27 ; CHECK-NOT: call i32 @callee1
28 ; CHECK: ret i32 %x3.i
29 %y2 = call i32 @callee2(i32 %y1), !prof !22
30 %y3 = call i32 @callee1(i32 %y2), !prof !21
31 ret i32 %y3
32 }
33
34 !llvm.module.flags = !{!1}
35 !21 = !{!"branch_weights", i64 300}
36 !22 = !{!"branch_weights", i64 1}
37
38 !1 = !{i32 1, !"ProfileSummary", !2}
39 !2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
40 !3 = !{!"ProfileFormat", !"InstrProf"}
41 !4 = !{!"TotalCount", i64 10000}
42 !5 = !{!"MaxCount", i64 1000}
43 !6 = !{!"MaxInternalCount", i64 1}
44 !7 = !{!"MaxFunctionCount", i64 1000}
45 !8 = !{!"NumCounts", i64 3}
46 !9 = !{!"NumFunctions", i64 3}
47 !10 = !{!"DetailedSummary", !11}
48 !11 = !{!12, !13, !14}
49 !12 = !{i32 10000, i64 100, i32 1}
50 !13 = !{i32 999000, i64 100, i32 1}
51 !14 = !{i32 999999, i64 1, i32 2}