llvm (llvm.org GIT mirror), commit 9f93c9d
Improve profile-guided heuristics to use estimated trip count.

Summary:
The existing heuristic uses the ratio between the function entry frequency and the loop invocation frequency to find cold loops. However, even if a loop executes frequently, vectorization is not beneficial when it has a small trip count per invocation. Conversely, even if the loop invocation frequency is much smaller than the function invocation frequency, it is still beneficial to vectorize the loop when its trip count is high.

This patch uses the estimated trip count computed from profile metadata as the primary metric for deciding whether a loop is cold. If the estimated trip count cannot be computed, it falls back to the original heuristic.

Reviewers: Ayal, mssimpso, mkuper, danielcdh, wmi, tejohnson

Reviewed By: tejohnson

Subscribers: tejohnson, mzolotukhin, llvm-commits

Differential Revision: https://reviews.llvm.org/D32451

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@305729 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Taewook Oh
4 changed file(s) with 111 addition(s) and 55 deletion(s).
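The core ingredient of the patch is getLoopEstimatedTripCount (used in the LoopVectorize.cpp hunk below), which derives an average trip count per loop invocation from the branch_weights profile metadata on the loop's latch branch: roughly the backedge weight divided by the exit weight. The following standalone C++ sketch models that idea; the helper name estimateTripCount, the rounding, and the zero-exit-weight handling here are illustrative assumptions, not the exact LLVM implementation.

#include <cstdint>
#include <iostream>
#include <optional>

// Simplified model of a profile-based trip count estimate: the latch's
// branch_weights record how often the backedge was taken versus how often
// the loop exited, so the average trip count per invocation is roughly
// backedge / exit (rounded to nearest). When the exit weight is zero we
// return no estimate, mirroring the patch's fallback to the old heuristic.
std::optional<uint64_t> estimateTripCount(uint64_t BackedgeWeight,
                                          uint64_t ExitWeight) {
  if (ExitWeight == 0)
    return std::nullopt;
  return (BackedgeWeight + ExitWeight / 2) / ExitWeight;
}

int main() {
  // Hot loop body but almost no iterations per entry: the estimate stays
  // tiny, so vectorization is skipped even though the loop is "hot".
  if (auto TC = estimateTripCount(/*BackedgeWeight=*/0, /*ExitWeight=*/100))
    std::cout << "estimated trip count: " << *TC << '\n'; // prints 0

  // Rarely entered loop with a long-running body: the estimate is large,
  // so vectorization is still worthwhile.
  if (auto TC = estimateTripCount(/*BackedgeWeight=*/10000, /*ExitWeight=*/10))
    std::cout << "estimated trip count: " << *TC << '\n'; // prints 1000
  return 0;
}

The two weight pairs above mirror the !prof metadata used by the tripcount test added at the end of this diff.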
  86    86      std::function<const LoopAccessInfo &(Loop &)> *GetLAA;
  87    87      OptimizationRemarkEmitter *ORE;
  88    88
  89          -  BlockFrequency ColdEntryFreq;
  90          -
  91    89      PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
  92    90
  93    91      // Shim for old PM.
 5701  5701   void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
 5702  5702
 5703  5703     // We should not collect Uniforms more than once per VF. Right now,
 5704  5704     // this function is called from collectUniformsAndScalars(), which
 5705  5705     // already does this check. Collecting Uniforms for VF=1 does not make any
 5706  5706     // sense.
 5707  5707
 5708  5708     assert(VF >= 2 && !Uniforms.count(VF) &&
 5709  5709            "This function should not be visited twice for the same VF");
 5710  5710
 5711  5711     // Visit the list of Uniforms. If we'll not find any uniform value, we'll
 5712  5712     // not analyze again. Uniforms.count(VF) will return 1.
 5713  5713     Uniforms[VF].clear();
 5714  5714
 5987  5987       continue;
 5988  5988
 5989  5989     Value *Ptr = getPointerOperand(&I);
 5990  5990     // We don't check wrapping here because we don't know yet if Ptr will be
 5991  5991     // part of a full group or a group with gaps. Checking wrapping for all
 5992  5992     // pointers (even those that end up in groups with no gaps) will be overly
 5993  5993     // conservative. For full groups, wrapping should be ok since if we would
 5994  5994     // wrap around the address space we would do a memory access at nullptr
 5995  5995     // even without the transformation. The wrapping checks are therefore
 5996  5996     // deferred until after we've formed the interleaved groups.
 6243  6243     Instruction *LastMember = Group->getMember(Group->getFactor() - 1);
 6244  6244     if (LastMember) {
 6245  6245       Value *LastMemberPtr = getPointerOperand(LastMember);
 6246  6246       if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
 6247  6247                         /*ShouldCheckWrap=*/true)) {
 6248  6248         DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
 6249  6249                         "last group member potentially pointer-wrapping.\n");
 6251  6251       }
 6252  6252     } else {
 6253  6253       // Case 3: A non-reversed interleaved load group with gaps: We need
 6254  6254       // to execute at least one scalar epilogue iteration. This will ensure
 6255  6255       // we don't speculatively access memory out-of-bounds. We only need
 6256  6256       // to look for a member at index factor - 1, since every group must have
 6257  6257       // a member at index zero.
 6258  6258       if (Group->isReverse()) {
 6259  6259         releaseGroup(Group);
 7788  7788
 7789  7789     // Check the loop for a trip count threshold:
 7790  7790     // do not vectorize loops with a tiny trip count.
 7791        -   const unsigned MaxTC = SE->getSmallConstantMaxTripCount(L);
 7792        -   if (MaxTC > 0u && MaxTC < TinyTripCountVectorThreshold) {
       7791  +   unsigned ExpectedTC = SE->getSmallConstantMaxTripCount(L);
       7792  +   bool HasExpectedTC = (ExpectedTC > 0);
       7793  +
       7794  +   if (!HasExpectedTC && LoopVectorizeWithBlockFrequency) {
       7795  +     auto EstimatedTC = getLoopEstimatedTripCount(L);
       7796  +     if (EstimatedTC) {
       7797  +       ExpectedTC = *EstimatedTC;
       7798  +       HasExpectedTC = true;
       7799  +     }
       7800  +   }
       7801  +
       7802  +   if (HasExpectedTC && ExpectedTC < TinyTripCountVectorThreshold) {
 7793  7803       DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
 7794  7804                    << "This loop is not worth vectorizing.");
 7795  7805       if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
 7820  7830     // optimized for size.
 7821  7831     bool OptForSize =
 7822  7832         Hints.getForce() != LoopVectorizeHints::FK_Enabled && F->optForSize();
 7823        -
 7824        -   // Compute the weighted frequency of this loop being executed and see if it
 7825        -   // is less than 20% of the function entry baseline frequency. Note that we
 7826        -   // always have a canonical loop here because we think we *can* vectorize.
 7827        -   // FIXME: This is hidden behind a flag due to pervasive problems with
 7828        -   // exactly what block frequency models.
 7829        -   if (LoopVectorizeWithBlockFrequency) {
 7830        -     BlockFrequency LoopEntryFreq = BFI->getBlockFreq(L->getLoopPreheader());
 7831        -     if (Hints.getForce() != LoopVectorizeHints::FK_Enabled &&
 7832        -         LoopEntryFreq < ColdEntryFreq)
 7833        -       OptForSize = true;
 7834        -   }
 7835  7833
 7836  7834     // Check the function attributes to see if implicit floats are allowed.
 7837  7835     // FIXME: This check doesn't seem possibly correct -- what if the loop is
 8014  8012     DB = &DB_;
 8015  8013     ORE = &ORE_;
 8016  8014
 8017        -   // Compute some weights outside of the loop over the loops. Compute this
 8018        -   // using a BranchProbability to re-use its scaling math.
 8019        -   const BranchProbability ColdProb(1, 5); // 20%
 8020        -   ColdEntryFreq = BlockFrequency(BFI->getEntryFreq()) * ColdProb;
 8021        -
 8022  8015     // Don't attempt if
 8023  8016     // 1. the target claims to have no vector registers, and
 8024  8017     // 2. interleaving won't help ILP.
 114  114     ret void
 115  115   }
 116  116
 117       - ; N is unknown, we need a tail. Can't vectorize because the loop is cold.
 118       - ;CHECK-LABEL: @example4(
 119       - ;CHECK-NOT: <4 x i32>
 120       - ;CHECK: ret void
 121       - define void @example4(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) {
 122       -   %1 = icmp eq i32 %n, 0
 123       -   br i1 %1, label %._crit_edge, label %.lr.ph, !prof !0
 124       -
 125       - .lr.ph:                                           ; preds = %0, %.lr.ph
 126       -   %.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ]
 127       -   %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ]
 128       -   %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ]
 129       -   %2 = add nsw i32 %.05, -1
 130       -   %3 = getelementptr inbounds i32, i32* %.023, i64 1
 131       -   %4 = load i32, i32* %.023, align 16
 132       -   %5 = getelementptr inbounds i32, i32* %.014, i64 1
 133       -   store i32 %4, i32* %.014, align 16
 134       -   %6 = icmp eq i32 %2, 0
 135       -   br i1 %6, label %._crit_edge, label %.lr.ph
 136       -
 137       - ._crit_edge:                                      ; preds = %.lr.ph, %0
 138       -   ret void
 139       - }
 140       -
 141       - !0 = !{!"branch_weights", i32 64, i32 4}
 142       -
 143  117   ; We can't vectorize this one because we need a runtime ptr check.
 144  118   ;CHECK-LABEL: @example23(
 145  119   ;CHECK-NOT: <4 x i32>
       0  + ; This test verifies that the loop vectorizer will not vectorize low trip count
       1  + ; loops that require runtime checks (trip count is computed with profile info).
       2  + ; REQUIRES: asserts
       3  + ; RUN: opt < %s -loop-vectorize -loop-vectorize-with-block-frequency -S | FileCheck %s
       4  +
       5  + target datalayout = "E-m:e-p:32:32-i64:32-f64:32:64-a:0:32-n32-S128"
       6  +
       7  + @tab = common global [32 x i8] zeroinitializer, align 1
       8  +
       9  + define i32 @foo_low_trip_count1(i32 %bound) {
      10  + ; Simple loop with a low trip count. Should not be vectorized.
      11  +
      12  + ; CHECK-LABEL: @foo_low_trip_count1(
      13  + ; CHECK-NOT: <{{[0-9]+}} x i8>
      14  +
      15  + entry:
      16  +   br label %for.body
      17  +
      18  + for.body:                                         ; preds = %for.body, %entry
      19  +   %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
      20  +   %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
      21  +   %0 = load i8, i8* %arrayidx, align 1
      22  +   %cmp1 = icmp eq i8 %0, 0
      23  +   %. = select i1 %cmp1, i8 2, i8 1
      24  +   store i8 %., i8* %arrayidx, align 1
      25  +   %inc = add nsw i32 %i.08, 1
      26  +   %exitcond = icmp eq i32 %i.08, %bound
      27  +   br i1 %exitcond, label %for.end, label %for.body, !prof !1
      28  +
      29  + for.end:                                          ; preds = %for.body
      30  +   ret i32 0
      31  + }
      32  +
      33  + define i32 @foo_low_trip_count2(i32 %bound) !prof !0 {
      34  + ; The loop has the same invocation count as the function, but a low trip
      35  + ; count per invocation, so it is not worth vectorizing.
      36  +
      37  + ; CHECK-LABEL: @foo_low_trip_count2(
      38  + ; CHECK-NOT: <{{[0-9]+}} x i8>
      39  +
      40  + entry:
      41  +   br label %for.body
      42  +
      43  + for.body:                                         ; preds = %for.body, %entry
      44  +   %i.08 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
      45  +   %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
      46  +   %0 = load i8, i8* %arrayidx, align 1
      47  +   %cmp1 = icmp eq i8 %0, 0
      48  +   %. = select i1 %cmp1, i8 2, i8 1
      49  +   store i8 %., i8* %arrayidx, align 1
      50  +   %inc = add nsw i32 %i.08, 1
      51  +   %exitcond = icmp eq i32 %i.08, %bound
      52  +   br i1 %exitcond, label %for.end, label %for.body, !prof !1
      53  +
      54  + for.end:                                          ; preds = %for.body
      55  +   ret i32 0
      56  + }
      57  +
      58  + define i32 @foo_low_trip_count3(i1 %cond, i32 %bound) !prof !0 {
      59  + ; The loop has a low invocation count compared to the function invocation count,
      60  + ; but a high trip count per invocation. Vectorize it.
      61  +
      62  + ; CHECK-LABEL: @foo_low_trip_count3(
      63  + ; CHECK: vector.body:
      64  +
      65  + entry:
      66  +   br i1 %cond, label %for.preheader, label %for.end, !prof !2
      67  +
      68  + for.preheader:
      69  +   br label %for.body
      70  +
      71  + for.body:                                         ; preds = %for.body, %for.preheader
      72  +   %i.08 = phi i32 [ 0, %for.preheader ], [ %inc, %for.body ]
      73  +   %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 %i.08
      74  +   %0 = load i8, i8* %arrayidx, align 1
      75  +   %cmp1 = icmp eq i8 %0, 0
      76  +   %. = select i1 %cmp1, i8 2, i8 1
      77  +   store i8 %., i8* %arrayidx, align 1
      78  +   %inc = add nsw i32 %i.08, 1
      79  +   %exitcond = icmp eq i32 %i.08, %bound
      80  +   br i1 %exitcond, label %for.end, label %for.body, !prof !3
      81  +
      82  + for.end:                                          ; preds = %for.body
      83  +   ret i32 0
      84  + }
      85  +
      86  +
      87  + !0 = !{!"function_entry_count", i64 100}
      88  + !1 = !{!"branch_weights", i32 100, i32 0}
      89  + !2 = !{!"branch_weights", i32 10, i32 90}
      90  + !3 = !{!"branch_weights", i32 10, i32 10000}
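@foo_low_trip_count3 is the case that motivates the patch: its loop is rarely entered but iterates many times per entry. The sketch below is a back-of-the-envelope reading of the metadata above; the 20% threshold comes from the removed ColdProb(1, 5) code, the trip-count formula mirrors the earlier estimateTripCount sketch, and the real pass computes these through BlockFrequencyInfo and getLoopEstimatedTripCount rather than directly from the metadata.

#include <iostream>

int main() {
  // @foo_low_trip_count3: function_entry_count = 100 (!0), the entry branch
  // enters the loop with weights 10:90 (!2), and the latch branch has
  // exit:backedge weights 10:10000 (!3).
  const double EntryCount = 100.0;
  const double LoopInvocations = EntryCount * 10.0 / (10.0 + 90.0); // ~10

  // Old heuristic: the loop is entered on only ~10% of function entries,
  // below the 20% "cold" threshold, so it would have been flagged as cold
  // and optimized for size, which in practice blocks vectorization here.
  std::cout << "loop invocation frequency: " << LoopInvocations
            << " (cold threshold: " << 0.2 * EntryCount << ")\n";

  // New heuristic: the estimated trip count per invocation is roughly
  // backedge weight / exit weight, which is large, so the loop is worth
  // vectorizing despite being rarely entered.
  const unsigned EstimatedTripCount = (10000 + 10 / 2) / 10; // ~1000
  std::cout << "estimated trip count: " << EstimatedTripCount << "\n";
  return 0;
}

Conversely, @foo_low_trip_count2 is entered on every call, but !1 gives its latch a backedge weight of 0, so its estimated trip count is essentially zero and the CHECK-NOT lines expect it to stay scalar.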