llvm.org GIT mirror llvm / f465370
Reapply "blockfreq: Rewrite BlockFrequencyInfoImpl" This reverts commit r206677, reapplying my BlockFrequencyInfo rewrite. I've done a careful audit, added some asserts, and fixed a couple of bugs (unfortunately, they were in unlikely code paths). There's a small chance that this will appease the failing bots [1][2]. (If so, great!) If not, I have a follow-up commit ready that will temporarily add -debug-only=block-freq to the two failing tests, allowing me to compare the code path between what the failing bots and what my machines (and the rest of the bots) are doing. Once I've triggered those builds, I'll revert both commits so the bots go green again. [1]: http://bb.pgr.jp/builders/ninja-x64-msvc-RA-centos6/builds/1816 [2]: http://llvm-amd64.freebsd.your.org/b/builders/clang-i386-freebsd/builds/18445 <rdar://problem/14292693> git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@206704 91177308-0d34-0410-b5e6-96231b3b80d8 Duncan P. N. Exon Smith 6 years ago
12 changed file(s) with 2992 addition(s) and 378 deletion(s). Raw diff Collapse all Expand all
66 //
77 //===----------------------------------------------------------------------===//
88 //
9 // Shared implementation of BlockFrequencyInfo for IR and Machine Instructions.
9 // Shared implementation of BlockFrequency for IR and Machine Instructions.
1010 //
1111 //===----------------------------------------------------------------------===//
1212
1515
1616 #include "llvm/ADT/DenseMap.h"
1717 #include "llvm/ADT/PostOrderIterator.h"
18 #include "llvm/CodeGen/MachineBasicBlock.h"
19 #include "llvm/CodeGen/MachineFunction.h"
2018 #include "llvm/IR/BasicBlock.h"
2119 #include "llvm/Support/BlockFrequency.h"
2220 #include "llvm/Support/BranchProbability.h"
2523 #include
2624 #include
2725
26 //===----------------------------------------------------------------------===//
27 //
28 // PositiveFloat definition.
29 //
30 // TODO: Make this private to BlockFrequencyInfoImpl or delete.
31 //
32 //===----------------------------------------------------------------------===//
2833 namespace llvm {
2934
30
35 class PositiveFloatBase {
36 public:
37 static const int32_t MaxExponent = 16383;
38 static const int32_t MinExponent = -16382;
39 static const int DefaultPrecision = 10;
40
41 static void dump(uint64_t D, int16_t E, int Width);
42 static raw_ostream &print(raw_ostream &OS, uint64_t D, int16_t E, int Width,
43 unsigned Precision);
44 static std::string toString(uint64_t D, int16_t E, int Width,
45 unsigned Precision);
46 static int countLeadingZeros32(uint32_t N) { return countLeadingZeros(N); }
47 static int countLeadingZeros64(uint64_t N) { return countLeadingZeros(N); }
48 static uint64_t getHalf(uint64_t N) { return (N >> 1) + (N & 1); }
49
50 static std::pair splitSigned(int64_t N) {
51 if (N >= 0)
52 return std::make_pair(N, false);
53 uint64_t Unsigned = N == INT64_MIN ? UINT64_C(1) << 63 : uint64_t(-N);
54 return std::make_pair(Unsigned, true);
55 }
56 static int64_t joinSigned(uint64_t U, bool IsNeg) {
57 if (U > uint64_t(INT64_MAX))
58 return IsNeg ? INT64_MIN : INT64_MAX;
59 return IsNeg ? -int64_t(U) : int64_t(U);
60 }
61
62 static int32_t extractLg(const std::pair &Lg) {
63 return Lg.first;
64 }
65 static int32_t extractLgFloor(const std::pair &Lg) {
66 return Lg.first - (Lg.second > 0);
67 }
68 static int32_t extractLgCeiling(const std::pair &Lg) {
69 return Lg.first + (Lg.second < 0);
70 }
71
72 static std::pair divide64(uint64_t L, uint64_t R);
73 static std::pair multiply64(uint64_t L, uint64_t R);
74
75 static int compare(uint64_t L, uint64_t R, int Shift) {
76 assert(Shift >= 0);
77 assert(Shift < 64);
78
79 uint64_t L_adjusted = L >> Shift;
80 if (L_adjusted < R)
81 return -1;
82 if (L_adjusted > R)
83 return 1;
84
85 return L > L_adjusted << Shift ? 1 : 0;
86 }
87 };
88
89 /// \brief Simple representation of a positive floating point.
90 ///
91 /// PositiveFloat is a positive floating point number. It uses simple
92 /// saturation arithmetic, and every operation is well-defined for every value.
93 ///
94 /// The number is split into a signed exponent and unsigned digits. The number
95 /// represented is \c getDigits()*2^getExponent(). In this way, the digits are
96 /// much like the mantissa in the x87 long double, but there is no canonical
97 /// form, so the same number can be represented by many bit representations
98 /// (it's always in "denormal" mode).
99 ///
100 /// PositiveFloat is templated on the underlying integer type for digits, which
101 /// is expected to be one of uint64_t, uint32_t, uint16_t or uint8_t.
102 ///
103 /// Unlike builtin floating point types, PositiveFloat is portable.
104 ///
105 /// Unlike APFloat, PositiveFloat does not model architecture floating point
106 /// behaviour (this should make it a little faster), and implements most
107 /// operators (this makes it usable).
108 ///
109 /// PositiveFloat is totally ordered. However, there is no canonical form, so
110 /// there are multiple representations of most scalars. E.g.:
111 ///
112 /// PositiveFloat(8u, 0) == PositiveFloat(4u, 1)
113 /// PositiveFloat(4u, 1) == PositiveFloat(2u, 2)
114 /// PositiveFloat(2u, 2) == PositiveFloat(1u, 3)
115 ///
116 /// PositiveFloat implements most arithmetic operations. Precision is kept
117 /// where possible. Uses simple saturation arithmetic, so that operations
118 /// saturate to 0.0 or getLargest() rather than under or overflowing. It has
119 /// some extra arithmetic for unit inversion. 0.0/0.0 is defined to be 0.0.
120 /// Any other division by 0.0 is defined to be getLargest().
121 ///
122 /// As a convenience for modifying the exponent, left and right shifting are
123 /// both implemented, and both interpret negative shifts as positive shifts in
124 /// the opposite direction.
125 ///
126 /// Future work might extract most of the implementation into a base class
127 /// (e.g., \c Float) that has an \c IsSigned template parameter. The initial
128 /// use case for this only needed positive semantics, but it wouldn't take much
129 /// work to extend.
130 ///
131 /// Exponents are limited to the range accepted by x87 long double. This makes
132 /// it trivial to add functionality to convert to APFloat (this is already
133 /// relied on for the implementation of printing).
134 template class PositiveFloat : PositiveFloatBase {
135 public:
136 static_assert(!std::numeric_limits::is_signed,
137 "only unsigned floats supported");
138
139 typedef DigitsT DigitsType;
140
141 private:
142 typedef std::numeric_limits DigitsLimits;
143
144 static const int Width = sizeof(DigitsType) * 8;
145 static_assert(Width <= 64, "invalid integer width for digits");
146
147 private:
148 DigitsType Digits;
149 int16_t Exponent;
150
151 public:
152 PositiveFloat() : Digits(0), Exponent(0) {}
153
154 PositiveFloat(DigitsType Digits, int16_t Exponent)
155 : Digits(Digits), Exponent(Exponent) {}
156
157 private:
158 PositiveFloat(const std::pair &X)
159 : Digits(X.first), Exponent(X.second) {}
160
161 public:
162 static PositiveFloat getZero() { return PositiveFloat(0, 0); }
163 static PositiveFloat getOne() { return PositiveFloat(1, 0); }
164 static PositiveFloat getLargest() {
165 return PositiveFloat(DigitsLimits::max(), MaxExponent);
166 }
167 static PositiveFloat getFloat(uint64_t N) { return adjustToWidth(N, 0); }
168 static PositiveFloat getInverseFloat(uint64_t N) {
169 return getFloat(N).invert();
170 }
171 static PositiveFloat getFraction(DigitsType N, DigitsType D) {
172 return getQuotient(N, D);
173 }
174
175 int16_t getExponent() const { return Exponent; }
176 DigitsType getDigits() const { return Digits; }
177
178 /// \brief Convert to the given integer type.
179 ///
180 /// Convert to \c IntT using simple saturating arithmetic, truncating if
181 /// necessary.
182 template IntT toInt() const;
183
184 bool isZero() const { return !Digits; }
185 bool isLargest() const { return *this == getLargest(); }
186 bool isOne() const {
187 if (Exponent > 0 || Exponent <= -Width)
188 return false;
189 return Digits == DigitsType(1) << -Exponent;
190 }
191
192 /// \brief The log base 2, rounded.
193 ///
194 /// Get the lg of the scalar. lg 0 is defined to be INT32_MIN.
195 int32_t lg() const { return extractLg(lgImpl()); }
196
197 /// \brief The log base 2, rounded towards INT32_MIN.
198 ///
199 /// Get the lg floor. lg 0 is defined to be INT32_MIN.
200 int32_t lgFloor() const { return extractLgFloor(lgImpl()); }
201
202 /// \brief The log base 2, rounded towards INT32_MAX.
203 ///
204 /// Get the lg ceiling. lg 0 is defined to be INT32_MIN.
205 int32_t lgCeiling() const { return extractLgCeiling(lgImpl()); }
206
207 bool operator==(const PositiveFloat &X) const { return compare(X) == 0; }
208 bool operator<(const PositiveFloat &X) const { return compare(X) < 0; }
209 bool operator!=(const PositiveFloat &X) const { return compare(X) != 0; }
210 bool operator>(const PositiveFloat &X) const { return compare(X) > 0; }
211 bool operator<=(const PositiveFloat &X) const { return compare(X) <= 0; }
212 bool operator>=(const PositiveFloat &X) const { return compare(X) >= 0; }
213
214 bool operator!() const { return isZero(); }
215
216 /// \brief Convert to a decimal representation in a string.
217 ///
218 /// Convert to a string. Uses scientific notation for very large/small
219 /// numbers. Scientific notation is used roughly for numbers outside of the
220 /// range 2^-64 through 2^64.
221 ///
222 /// \c Precision indicates the number of decimal digits of precision to use;
223 /// 0 requests the maximum available.
224 ///
225 /// As a special case to make debugging easier, if the number is small enough
226 /// to convert without scientific notation and has more than \c Precision
227 /// digits before the decimal place, it's printed accurately to the first
228 /// digit past zero. E.g., assuming 10 digits of precision:
229 ///
230 /// 98765432198.7654... => 98765432198.8
231 /// 8765432198.7654... => 8765432198.8
232 /// 765432198.7654... => 765432198.8
233 /// 65432198.7654... => 65432198.77
234 /// 5432198.7654... => 5432198.765
235 std::string toString(unsigned Precision = DefaultPrecision) {
236 return PositiveFloatBase::toString(Digits, Exponent, Width, Precision);
237 }
238
239 /// \brief Print a decimal representation.
240 ///
241 /// Print a string. See toString for documentation.
242 raw_ostream &print(raw_ostream &OS,
243 unsigned Precision = DefaultPrecision) const {
244 return PositiveFloatBase::print(OS, Digits, Exponent, Width, Precision);
245 }
246 void dump() const { return PositiveFloatBase::dump(Digits, Exponent, Width); }
247
248 PositiveFloat &operator+=(const PositiveFloat &X);
249 PositiveFloat &operator-=(const PositiveFloat &X);
250 PositiveFloat &operator*=(const PositiveFloat &X);
251 PositiveFloat &operator/=(const PositiveFloat &X);
252 PositiveFloat &operator<<=(int16_t Shift) { shiftLeft(Shift); return *this; }
253 PositiveFloat &operator>>=(int16_t Shift) { shiftRight(Shift); return *this; }
254
255 private:
256 void shiftLeft(int32_t Shift);
257 void shiftRight(int32_t Shift);
258
259 /// \brief Adjust two floats to have matching exponents.
260 ///
261 /// Adjust \c this and \c X to have matching exponents. Returns the new \c X
262 /// by value. Does nothing if \a isZero() for either.
263 ///
264 /// The value that compares smaller will lose precision, and possibly become
265 /// \a isZero().
266 PositiveFloat matchExponents(PositiveFloat X);
267
268 /// \brief Increase exponent to match another float.
269 ///
270 /// Increases \c this to have an exponent matching \c X. May decrease the
271 /// exponent of \c X in the process, and \c this may possibly become \a
272 /// isZero().
273 void increaseExponentToMatch(PositiveFloat &X, int32_t ExponentDiff);
274
275 public:
276 /// \brief Scale a large number accurately.
277 ///
278 /// Scale N (multiply it by this). Uses full precision multiplication, even
279 /// if Width is smaller than 64, so information is not lost.
280 uint64_t scale(uint64_t N) const;
281 uint64_t scaleByInverse(uint64_t N) const {
282 // TODO: implement directly, rather than relying on inverse. Inverse is
283 // expensive.
284 return inverse().scale(N);
285 }
286 int64_t scale(int64_t N) const {
287 std::pair Unsigned = splitSigned(N);
288 return joinSigned(scale(Unsigned.first), Unsigned.second);
289 }
290 int64_t scaleByInverse(int64_t N) const {
291 std::pair Unsigned = splitSigned(N);
292 return joinSigned(scaleByInverse(Unsigned.first), Unsigned.second);
293 }
294
295 int compare(const PositiveFloat &X) const;
296 int compareTo(uint64_t N) const {
297 PositiveFloat Float = getFloat(N);
298 int Compare = compare(Float);
299 if (Width == 64 || Compare != 0)
300 return Compare;
301
302 // Check for precision loss. We know *this == RoundTrip.
303 uint64_t RoundTrip = Float.template toInt();
304 return N == RoundTrip ? 0 : RoundTrip < N ? -1 : 1;
305 }
306 int compareTo(int64_t N) const { return N < 0 ? 1 : compareTo(uint64_t(N)); }
307
308 PositiveFloat &invert() { return *this = PositiveFloat::getFloat(1) / *this; }
309 PositiveFloat inverse() const { return PositiveFloat(*this).invert(); }
310
311 private:
312 static PositiveFloat getProduct(DigitsType L, DigitsType R);
313 static PositiveFloat getQuotient(DigitsType Dividend, DigitsType Divisor);
314
315 std::pair lgImpl() const;
316 static int countLeadingZerosWidth(DigitsType Digits) {
317 if (Width == 64)
318 return countLeadingZeros64(Digits);
319 if (Width == 32)
320 return countLeadingZeros32(Digits);
321 return countLeadingZeros32(Digits) + Width - 32;
322 }
323
324 static PositiveFloat adjustToWidth(uint64_t N, int32_t S) {
325 assert(S >= MinExponent);
326 assert(S <= MaxExponent);
327 if (Width == 64 || N <= DigitsLimits::max())
328 return PositiveFloat(N, S);
329
330 // Shift right.
331 int Shift = 64 - Width - countLeadingZeros64(N);
332 DigitsType Shifted = N >> Shift;
333
334 // Round.
335 assert(S + Shift <= MaxExponent);
336 return getRounded(PositiveFloat(Shifted, S + Shift),
337 N & UINT64_C(1) << (Shift - 1));
338 }
339
340 static PositiveFloat getRounded(PositiveFloat P, bool Round) {
341 if (!Round)
342 return P;
343 if (P.Digits == DigitsLimits::max())
344 // Careful of overflow in the exponent.
345 return PositiveFloat(1, P.Exponent) <<= Width;
346 return PositiveFloat(P.Digits + 1, P.Exponent);
347 }
348 };
349
350 #define POSITIVE_FLOAT_BOP(op, base) \
351 template \
352 PositiveFloat operator op(const PositiveFloat &L, \
353 const PositiveFloat &R) { \
354 return PositiveFloat(L) base R; \
355 }
356 POSITIVE_FLOAT_BOP(+, += )
357 POSITIVE_FLOAT_BOP(-, -= )
358 POSITIVE_FLOAT_BOP(*, *= )
359 POSITIVE_FLOAT_BOP(/, /= )
360 POSITIVE_FLOAT_BOP(<<, <<= )
361 POSITIVE_FLOAT_BOP(>>, >>= )
362 #undef POSITIVE_FLOAT_BOP
363
364 template
365 raw_ostream &operator<<(raw_ostream &OS, const PositiveFloat &X) {
366 return X.print(OS, 10);
367 }
368
369 #define POSITIVE_FLOAT_COMPARE_TO_TYPE(op, T1, T2) \
370 template \
371 bool operator op(const PositiveFloat &L, T1 R) { \
372 return L.compareTo(T2(R)) op 0; \
373 } \
374 template \
375 bool operator op(T1 L, const PositiveFloat &R) { \
376 return 0 op R.compareTo(T2(L)); \
377 }
378 #define POSITIVE_FLOAT_COMPARE_TO(op) \
379 POSITIVE_FLOAT_COMPARE_TO_TYPE(op, uint64_t, uint64_t) \
380 POSITIVE_FLOAT_COMPARE_TO_TYPE(op, uint32_t, uint64_t) \
381 POSITIVE_FLOAT_COMPARE_TO_TYPE(op, int64_t, int64_t) \
382 POSITIVE_FLOAT_COMPARE_TO_TYPE(op, int32_t, int64_t)
383 POSITIVE_FLOAT_COMPARE_TO(< )
384 POSITIVE_FLOAT_COMPARE_TO(> )
385 POSITIVE_FLOAT_COMPARE_TO(== )
386 POSITIVE_FLOAT_COMPARE_TO(!= )
387 POSITIVE_FLOAT_COMPARE_TO(<= )
388 POSITIVE_FLOAT_COMPARE_TO(>= )
389 #undef POSITIVE_FLOAT_COMPARE_TO
390 #undef POSITIVE_FLOAT_COMPARE_TO_TYPE
391
392 template
393 uint64_t PositiveFloat::scale(uint64_t N) const {
394 if (Width == 64 || N <= DigitsLimits::max())
395 return (getFloat(N) * *this).template toInt();
396
397 // Defer to the 64-bit version.
398 return PositiveFloat(Digits, Exponent).scale(N);
399 }
400
401 template
402 PositiveFloat PositiveFloat::getProduct(DigitsType L,
403 DigitsType R) {
404 // Check for zero.
405 if (!L || !R)
406 return getZero();
407
408 // Check for numbers that we can compute with 64-bit math.
409 if (Width <= 32 || (L <= UINT32_MAX && R <= UINT32_MAX))
410 return adjustToWidth(uint64_t(L) * uint64_t(R), 0);
411
412 // Do the full thing.
413 return PositiveFloat(multiply64(L, R));
414 }
415 template
416 PositiveFloat PositiveFloat::getQuotient(DigitsType Dividend,
417 DigitsType Divisor) {
418 // Check for zero.
419 if (!Dividend)
420 return getZero();
421 if (!Divisor)
422 return getLargest();
423
424 if (Width == 64)
425 return PositiveFloat(divide64(Dividend, Divisor));
426
427 // We can compute this with 64-bit math.
428 int Shift = countLeadingZeros64(Dividend);
429 uint64_t Shifted = uint64_t(Dividend) << Shift;
430 uint64_t Quotient = Shifted / Divisor;
431
432 // If Quotient needs to be shifted, then adjustToWidth will round.
433 if (Quotient > DigitsLimits::max())
434 return adjustToWidth(Quotient, -Shift);
435
436 // Round based on the value of the next bit.
437 return getRounded(PositiveFloat(Quotient, -Shift),
438 Shifted % Divisor >= getHalf(Divisor));
439 }
440
441 template
442 template
443 IntT PositiveFloat::toInt() const {
444 typedef std::numeric_limits Limits;
445 if (*this < 1)
446 return 0;
447 if (*this >= Limits::max())
448 return Limits::max();
449
450 IntT N = Digits;
451 if (Exponent > 0) {
452 assert(size_t(Exponent) < sizeof(IntT) * 8);
453 return N << Exponent;
454 }
455 if (Exponent < 0) {
456 assert(size_t(-Exponent) < sizeof(IntT) * 8);
457 return N >> -Exponent;
458 }
459 return N;
460 }
461
462 template
463 std::pair PositiveFloat::lgImpl() const {
464 if (isZero())
465 return std::make_pair(INT32_MIN, 0);
466
467 // Get the floor of the lg of Digits.
468 int32_t LocalFloor = Width - countLeadingZerosWidth(Digits) - 1;
469
470 // Get the floor of the lg of this.
471 int32_t Floor = Exponent + LocalFloor;
472 if (Digits == UINT64_C(1) << LocalFloor)
473 return std::make_pair(Floor, 0);
474
475 // Round based on the next digit.
476 assert(LocalFloor >= 1);
477 bool Round = Digits & UINT64_C(1) << (LocalFloor - 1);
478 return std::make_pair(Floor + Round, Round ? 1 : -1);
479 }
480
481 template
482 PositiveFloat PositiveFloat::matchExponents(PositiveFloat X) {
483 if (isZero() || X.isZero() || Exponent == X.Exponent)
484 return X;
485
486 int32_t Diff = int32_t(X.Exponent) - int32_t(Exponent);
487 if (Diff > 0)
488 increaseExponentToMatch(X, Diff);
489 else
490 X.increaseExponentToMatch(*this, -Diff);
491 return X;
492 }
493 template
494 void PositiveFloat::increaseExponentToMatch(PositiveFloat &X,
495 int32_t ExponentDiff) {
496 assert(ExponentDiff > 0);
497 if (ExponentDiff >= 2 * Width) {
498 *this = getZero();
499 return;
500 }
501
502 // Use up any leading zeros on X, and then shift this.
503 int32_t ShiftX = std::min(countLeadingZerosWidth(X.Digits), ExponentDiff);
504 assert(ShiftX < Width);
505
506 int32_t ShiftThis = ExponentDiff - ShiftX;
507 if (ShiftThis >= Width) {
508 *this = getZero();
509 return;
510 }
511
512 X.Digits <<= ShiftX;
513 X.Exponent -= ShiftX;
514 Digits >>= ShiftThis;
515 Exponent += ShiftThis;
516 return;
517 }
518
519 template
520 PositiveFloat &PositiveFloat::
521 operator+=(const PositiveFloat &X) {
522 if (isLargest() || X.isZero())
523 return *this;
524 if (isZero() || X.isLargest())
525 return *this = X;
526
527 // Normalize exponents.
528 PositiveFloat Scaled = matchExponents(X);
529
530 // Check for zero again.
531 if (isZero())
532 return *this = Scaled;
533 if (Scaled.isZero())
534 return *this;
535
536 // Compute sum.
537 DigitsType Sum = Digits + Scaled.Digits;
538 bool DidOverflow = Sum < Digits;
539 Digits = Sum;
540 if (!DidOverflow)
541 return *this;
542
543 if (Exponent == MaxExponent)
544 return *this = getLargest();
545
546 ++Exponent;
547 Digits = UINT64_C(1) << (Width - 1) | Digits >> 1;
548
549 return *this;
550 }
551 template
552 PositiveFloat &PositiveFloat::
553 operator-=(const PositiveFloat &X) {
554 if (X.isZero())
555 return *this;
556 if (*this <= X)
557 return *this = getZero();
558
559 // Normalize exponents.
560 PositiveFloat Scaled = matchExponents(X);
561 assert(Digits >= Scaled.Digits);
562
563 // Compute difference.
564 if (!Scaled.isZero()) {
565 Digits -= Scaled.Digits;
566 return *this;
567 }
568
569 // Check if X just barely lost its last bit. E.g., for 32-bit:
570 //
571 // 1*2^32 - 1*2^0 == 0xffffffff != 1*2^32
572 if (*this == PositiveFloat(1, X.lgFloor() + Width)) {
573 Digits = DigitsType(0) - 1;
574 --Exponent;
575 }
576 return *this;
577 }
578 template
579 PositiveFloat &PositiveFloat::
580 operator*=(const PositiveFloat &X) {
581 if (isZero())
582 return *this;
583 if (X.isZero())
584 return *this = X;
585
586 // Save the exponents.
587 int32_t Exponents = int32_t(Exponent) + int32_t(X.Exponent);
588
589 // Get the raw product.
590 *this = getProduct(Digits, X.Digits);
591
592 // Combine with exponents.
593 return *this <<= Exponents;
594 }
595 template
596 PositiveFloat &PositiveFloat::
597 operator/=(const PositiveFloat &X) {
598 if (isZero())
599 return *this;
600 if (X.isZero())
601 return *this = getLargest();
602
603 // Save the exponents.
604 int32_t Exponents = int32_t(Exponent) - int32_t(X.Exponent);
605
606 // Get the raw quotient.
607 *this = getQuotient(Digits, X.Digits);
608
609 // Combine with exponents.
610 return *this <<= Exponents;
611 }
612 template
613 void PositiveFloat::shiftLeft(int32_t Shift) {
614 if (!Shift || isZero())
615 return;
616 assert(Shift != INT32_MIN);
617 if (Shift < 0) {
618 shiftRight(-Shift);
619 return;
620 }
621
622 // Shift as much as we can in the exponent.
623 int32_t ExponentShift = std::min(Shift, MaxExponent - Exponent);
624 Exponent += ExponentShift;
625 if (ExponentShift == Shift)
626 return;
627
628 // Check this late, since it's rare.
629 if (isLargest())
630 return;
631
632 // Shift the digits themselves.
633 Shift -= ExponentShift;
634 if (Shift > countLeadingZerosWidth(Digits)) {
635 // Saturate.
636 *this = getLargest();
637 return;
638 }
639
640 Digits <<= Shift;
641 return;
642 }
643
644 template
645 void PositiveFloat::shiftRight(int32_t Shift) {
646 if (!Shift || isZero())
647 return;
648 assert(Shift != INT32_MIN);
649 if (Shift < 0) {
650 shiftLeft(-Shift);
651 return;
652 }
653
654 // Shift as much as we can in the exponent.
655 int32_t ExponentShift = std::min(Shift, Exponent - MinExponent);
656 Exponent -= ExponentShift;
657 if (ExponentShift == Shift)
658 return;
659
660 // Shift the digits themselves.
661 Shift -= ExponentShift;
662 if (Shift >= Width) {
663 // Saturate.
664 *this = getZero();
665 return;
666 }
667
668 Digits >>= Shift;
669 return;
670 }
671
672 template
673 int PositiveFloat::compare(const PositiveFloat &X) const {
674 // Check for zero.
675 if (isZero())
676 return X.isZero() ? 0 : -1;
677 if (X.isZero())
678 return 1;
679
680 // Check for the scale. Use lgFloor to be sure that the exponent difference
681 // is always lower than 64.
682 int32_t lgL = lgFloor(), lgR = X.lgFloor();
683 if (lgL != lgR)
684 return lgL < lgR ? -1 : 1;
685
686 // Compare digits.
687 if (Exponent < X.Exponent)
688 return PositiveFloatBase::compare(Digits, X.Digits, X.Exponent - Exponent);
689
690 return -PositiveFloatBase::compare(X.Digits, Digits, Exponent - X.Exponent);
691 }
692
693 template struct isPodLike> {
694 static const bool value = true;
695 };
696 }
697
698 //===----------------------------------------------------------------------===//
699 //
700 // BlockMass definition.
701 //
702 // TODO: Make this private to BlockFrequencyInfoImpl or delete.
703 //
704 //===----------------------------------------------------------------------===//
705 namespace llvm {
706
707 /// \brief Mass of a block.
708 ///
709 /// This class implements a sort of fixed-point fraction always between 0.0 and
710 /// 1.0. getMass() == UINT64_MAX indicates a value of 1.0.
711 ///
712 /// Masses can be added and subtracted. Simple saturation arithmetic is used,
713 /// so arithmetic operations never overflow or underflow.
714 ///
715 /// Masses can be multiplied. Multiplication treats full mass as 1.0 and uses
716 /// an inexpensive floating-point algorithm that's off-by-one (almost, but not
717 /// quite, maximum precision).
718 ///
719 /// Masses can be scaled by \a BranchProbability at maximum precision.
720 class BlockMass {
721 uint64_t Mass;
722
723 public:
724 BlockMass() : Mass(0) {}
725 explicit BlockMass(uint64_t Mass) : Mass(Mass) {}
726
727 static BlockMass getEmpty() { return BlockMass(); }
728 static BlockMass getFull() { return BlockMass(UINT64_MAX); }
729
730 uint64_t getMass() const { return Mass; }
731
732 bool isFull() const { return Mass == UINT64_MAX; }
733 bool isEmpty() const { return !Mass; }
734
735 bool operator!() const { return isEmpty(); }
736
737 /// \brief Add another mass.
738 ///
739 /// Adds another mass, saturating at \a isFull() rather than overflowing.
740 BlockMass &operator+=(const BlockMass &X) {
741 uint64_t Sum = Mass + X.Mass;
742 Mass = Sum < Mass ? UINT64_MAX : Sum;
743 return *this;
744 }
745
746 /// \brief Subtract another mass.
747 ///
748 /// Subtracts another mass, saturating at \a isEmpty() rather than
749 /// undeflowing.
750 BlockMass &operator-=(const BlockMass &X) {
751 uint64_t Diff = Mass - X.Mass;
752 Mass = Diff > Mass ? 0 : Diff;
753 return *this;
754 }
755
756 /// \brief Scale by another mass.
757 ///
758 /// The current implementation is a little imprecise, but it's relatively
759 /// fast, never overflows, and maintains the property that 1.0*1.0==1.0
760 /// (where isFull represents the number 1.0). It's an approximation of
761 /// 128-bit multiply that gets right-shifted by 64-bits.
762 ///
763 /// For a given digit size, multiplying two-digit numbers looks like:
764 ///
765 /// U1 . L1
766 /// * U2 . L2
767 /// ============
768 /// 0 . . L1*L2
769 /// + 0 . U1*L2 . 0 // (shift left once by a digit-size)
770 /// + 0 . U2*L1 . 0 // (shift left once by a digit-size)
771 /// + U1*L2 . 0 . 0 // (shift left twice by a digit-size)
772 ///
773 /// BlockMass has 64-bit numbers. Split each into two 32-bit digits, stored
774 /// 64-bit. Add 1 to the lower digits, to model isFull as 1.0; this won't
775 /// overflow, since we have 64-bit storage for each digit.
776 ///
777 /// To do this accurately, (a) multiply into two 64-bit digits, incrementing
778 /// the upper digit on overflows of the lower digit (carry), (b) subtract 1
779 /// from the lower digit, decrementing the upper digit on underflow (carry),
780 /// and (c) truncate the lower digit. For the 1.0*1.0 case, the upper digit
781 /// will be 0 at the end of step (a), and then will underflow back to isFull
782 /// (1.0) in step (b).
783 ///
784 /// Instead, the implementation does something a little faster with a small
785 /// loss of accuracy: ignore the lower 64-bit digit entirely. The loss of
786 /// accuracy is small, since the sum of the unmodelled carries is 0 or 1
787 /// (i.e., step (a) will overflow at most once, and step (b) will underflow
788 /// only if step (a) overflows).
789 ///
790 /// This is the formula we're calculating:
791 ///
792 /// U1.L1 * U2.L2 == U1 * U2 + (U1 * (L2+1))>>32 + (U2 * (L1+1))>>32
793 ///
794 /// As a demonstration of 1.0*1.0, consider two 4-bit numbers that are both
795 /// full (1111).
796 ///
797 /// U1.L1 * U2.L2 == U1 * U2 + (U1 * (L2+1))>>2 + (U2 * (L1+1))>>2
798 /// 11.11 * 11.11 == 11 * 11 + (11 * (11+1))/4 + (11 * (11+1))/4
799 /// == 1001 + (11 * 100)/4 + (11 * 100)/4
800 /// == 1001 + 1100/4 + 1100/4
801 /// == 1001 + 0011 + 0011
802 /// == 1111
803 BlockMass &operator*=(const BlockMass &X) {
804 uint64_t U1 = Mass >> 32, L1 = Mass & UINT32_MAX, U2 = X.Mass >> 32,
805 L2 = X.Mass & UINT32_MAX;
806 Mass = U1 * U2 + (U1 * (L2 + 1) >> 32) + ((L1 + 1) * U2 >> 32);
807 return *this;
808 }
809
810 /// \brief Multiply by a branch probability.
811 ///
812 /// Multiply by P. Guarantees full precision.
813 ///
814 /// This could be naively implemented by multiplying by the numerator and
815 /// dividing by the denominator, but in what order? Multiplying first can
816 /// overflow, while dividing first will lose precision (potentially, changing
817 /// a non-zero mass to zero).
818 ///
819 /// The implementation mixes the two methods. Since \a BranchProbability
820 /// uses 32-bits and \a BlockMass 64-bits, shift the mass as far to the left
821 /// as there is room, then divide by the denominator to get a quotient.
822 /// Multiplying by the numerator and right shifting gives a first
823 /// approximation.
824 ///
825 /// Calculate the error in this first approximation by calculating the
826 /// opposite mass (multiply by the opposite numerator and shift) and
827 /// subtracting both from teh original mass.
828 ///
829 /// Add to the first approximation the correct fraction of this error value.
830 /// This time, multiply first and then divide, since there is no danger of
831 /// overflow.
832 ///
833 /// \pre P represents a fraction between 0.0 and 1.0.
834 BlockMass &operator*=(const BranchProbability &P);
835
836 bool operator==(const BlockMass &X) const { return Mass == X.Mass; }
837 bool operator!=(const BlockMass &X) const { return Mass != X.Mass; }
838 bool operator<=(const BlockMass &X) const { return Mass <= X.Mass; }
839 bool operator>=(const BlockMass &X) const { return Mass >= X.Mass; }
840 bool operator<(const BlockMass &X) const { return Mass < X.Mass; }
841 bool operator>(const BlockMass &X) const { return Mass > X.Mass; }
842
843 /// \brief Convert to floating point.
844 ///
845 /// Convert to a float. \a isFull() gives 1.0, while \a isEmpty() gives
846 /// slightly above 0.0.
847 PositiveFloat toFloat() const;
848
849 void dump() const;
850 raw_ostream &print(raw_ostream &OS) const;
851 };
852
853 inline BlockMass operator+(const BlockMass &L, const BlockMass &R) {
854 return BlockMass(L) += R;
855 }
856 inline BlockMass operator-(const BlockMass &L, const BlockMass &R) {
857 return BlockMass(L) -= R;
858 }
859 inline BlockMass operator*(const BlockMass &L, const BlockMass &R) {
860 return BlockMass(L) *= R;
861 }
862 inline BlockMass operator*(const BlockMass &L, const BranchProbability &R) {
863 return BlockMass(L) *= R;
864 }
865 inline BlockMass operator*(const BranchProbability &L, const BlockMass &R) {
866 return BlockMass(R) *= L;
867 }
868
869 inline raw_ostream &operator<<(raw_ostream &OS, const BlockMass &X) {
870 return X.print(OS);
871 }
872
873 template <> struct isPodLike {
874 static const bool value = true;
875 };
876 }
877
878 //===----------------------------------------------------------------------===//
879 //
880 // BlockFrequencyInfoImpl definition.
881 //
882 //===----------------------------------------------------------------------===//
883 namespace llvm {
884
885 class BasicBlock;
31886 class BranchProbabilityInfo;
32 class BlockFrequencyInfo;
887 class Function;
888 class Loop;
889 class LoopInfo;
890 class MachineBasicBlock;
33891 class MachineBranchProbabilityInfo;
34 class MachineBlockFrequencyInfo;
892 class MachineFunction;
893 class MachineLoop;
894 class MachineLoopInfo;
895
896 /// \brief Base class for BlockFrequencyInfoImpl
897 ///
898 /// BlockFrequencyInfoImplBase has supporting data structures and some
899 /// algorithms for BlockFrequencyInfoImplBase. Only algorithms that depend on
900 /// the block type (or that call such algorithms) are skipped here.
901 ///
/// Nevertheless, the majority of the overall algorithm documentation lives with
903 /// BlockFrequencyInfoImpl. See there for details.
904 class BlockFrequencyInfoImplBase {
905 public:
906 typedef PositiveFloat Float;
907
908 /// \brief Representative of a block.
909 ///
910 /// This is a simple wrapper around an index into the reverse-post-order
911 /// traversal of the blocks.
912 ///
913 /// Unlike a block pointer, its order has meaning (location in the
914 /// topological sort) and it's class is the same regardless of block type.
915 struct BlockNode {
916 typedef uint32_t IndexType;
917 IndexType Index;
918
919 bool operator==(const BlockNode &X) const { return Index == X.Index; }
920 bool operator!=(const BlockNode &X) const { return Index != X.Index; }
921 bool operator<=(const BlockNode &X) const { return Index <= X.Index; }
922 bool operator>=(const BlockNode &X) const { return Index >= X.Index; }
923 bool operator<(const BlockNode &X) const { return Index < X.Index; }
924 bool operator>(const BlockNode &X) const { return Index > X.Index; }
925
926 BlockNode() : Index(UINT32_MAX) {}
927 BlockNode(IndexType Index) : Index(Index) {}
928
929 bool isValid() const { return Index <= getMaxIndex(); }
930 static size_t getMaxIndex() { return UINT32_MAX - 1; }
931 };
932
933 /// \brief Stats about a block itself.
934 struct FrequencyData {
935 Float Floating;
936 uint64_t Integer;
937 };
938
939 /// \brief Index of loop information.
940 struct WorkingData {
941 BlockNode ContainingLoop; ///< The block whose loop this block is inside.
942 uint32_t LoopIndex; ///< Index into PackagedLoops.
943 bool IsPackaged; ///< Has ContainingLoop been packaged up?
944 bool IsAPackage; ///< Has this block's loop been packaged up?
945 BlockMass Mass; ///< Mass distribution from the entry block.
946
947 WorkingData()
948 : LoopIndex(UINT32_MAX), IsPackaged(false), IsAPackage(false) {}
949
950 bool hasLoopHeader() const { return ContainingLoop.isValid(); }
951 bool isLoopHeader() const { return LoopIndex != UINT32_MAX; }
952 };
953
954 /// \brief Unscaled probability weight.
955 ///
956 /// Probability weight for an edge in the graph (including the
957 /// successor/target node).
958 ///
959 /// All edges in the original function are 32-bit. However, exit edges from
960 /// loop packages are taken from 64-bit exit masses, so we need 64-bits of
961 /// space in general.
962 ///
963 /// In addition to the raw weight amount, Weight stores the type of the edge
964 /// in the current context (i.e., the context of the loop being processed).
965 /// Is this a local edge within the loop, an exit from the loop, or a
966 /// backedge to the loop header?
967 struct Weight {
968 enum DistType { Local, Exit, Backedge };
969 DistType Type;
970 BlockNode TargetNode;
971 uint64_t Amount;
972 Weight() : Type(Local), Amount(0) {}
973 };
974
975 /// \brief Distribution of unscaled probability weight.
976 ///
977 /// Distribution of unscaled probability weight to a set of successors.
978 ///
979 /// This class collates the successor edge weights for later processing.
980 ///
981 /// \a DidOverflow indicates whether \a Total did overflow while adding to
982 /// the distribution. It should never overflow twice. There's no flag for
983 /// whether \a ForwardTotal overflows, since when \a Total exceeds 32-bits
984 /// they both get re-computed during \a normalize().
985 struct Distribution {
986 typedef SmallVector WeightList;
987 WeightList Weights; ///< Individual successor weights.
988 uint64_t Total; ///< Sum of all weights.
989 bool DidOverflow; ///< Whether \a Total did overflow.
990 uint32_t ForwardTotal; ///< Total excluding backedges.
991
992 Distribution() : Total(0), DidOverflow(false), ForwardTotal(0) {}
993 void addLocal(const BlockNode &Node, uint64_t Amount) {
994 add(Node, Amount, Weight::Local);
995 }
996 void addExit(const BlockNode &Node, uint64_t Amount) {
997 add(Node, Amount, Weight::Exit);
998 }
999 void addBackedge(const BlockNode &Node, uint64_t Amount) {
1000 add(Node, Amount, Weight::Backedge);
1001 }
1002
1003 /// \brief Normalize the distribution.
1004 ///
1005 /// Combines multiple edges to the same \a Weight::TargetNode and scales
1006 /// down so that \a Total fits into 32-bits.
1007 ///
1008 /// This is linear in the size of \a Weights. For the vast majority of
1009 /// cases, adjacent edge weights are combined by sorting WeightList and
1010 /// combining adjacent weights. However, for very large edge lists an
1011 /// auxiliary hash table is used.
1012 void normalize();
1013
1014 private:
1015 void add(const BlockNode &Node, uint64_t Amount, Weight::DistType Type);
1016 };
1017
1018 /// \brief Data for a packaged loop.
1019 ///
1020 /// Contains the data necessary to represent represent a loop as a node once
1021 /// it's packaged.
1022 ///
1023 /// PackagedLoopData inherits from BlockData to give the node the necessary
1024 /// stats. Further, it has a list of successors, list of members, and stores
1025 /// the backedge mass assigned to this loop.
1026 struct PackagedLoopData {
1027 typedef SmallVector, 4> ExitMap;
1028 typedef SmallVector MemberList;
1029 BlockNode Header; ///< Header.
1030 ExitMap Exits; ///< Successor edges (and weights).
1031 MemberList Members; ///< Members of the loop.
1032 BlockMass BackedgeMass; ///< Mass returned to loop header.
1033 BlockMass Mass;
1034 Float Scale;
1035
1036 PackagedLoopData(const BlockNode &Header) : Header(Header) {}
1037 };
1038
1039 /// \brief Data about each block. This is used downstream.
1040 std::vector Freqs;
1041
1042 /// \brief Loop data: see initializeLoops().
1043 std::vector Working;
1044
1045 /// \brief Indexed information about packaged loops.
1046 std::vector PackagedLoops;
1047
1048 /// \brief Create the initial loop packages.
1049 ///
1050 /// Initializes PackagedLoops using the data in Working about backedges
1051 /// and containing loops. Called by initializeLoops().
1052 ///
1053 /// \post WorkingData::LoopIndex has been initialized for every loop header
1054 /// and PackagedLoopData::Members has been initialized.
1055
1056 /// \brief Add all edges out of a packaged loop to the distribution.
1057 ///
1058 /// Adds all edges from LocalLoopHead to Dist. Calls addToDist() to add each
1059 /// successor edge.
1060 void addLoopSuccessorsToDist(const BlockNode &LoopHead,
1061 const BlockNode &LocalLoopHead,
1062 Distribution &Dist);
1063
1064 /// \brief Add an edge to the distribution.
1065 ///
1066 /// Adds an edge to Succ to Dist. If \c LoopHead.isValid(), then whether the
1067 /// edge is forward/exit/backedge is in the context of LoopHead. Otherwise,
1068 /// every edge should be a forward edge (since all the loops are packaged
1069 /// up).
1070 void addToDist(Distribution &Dist, const BlockNode &LoopHead,
1071 const BlockNode &Pred, const BlockNode &Succ, uint64_t Weight);
1072
1073 PackagedLoopData &getLoopPackage(const BlockNode &Head) {
1074 assert(Head.Index < Working.size());
1075 size_t Index = Working[Head.Index].LoopIndex;
1076 assert(Index < PackagedLoops.size());
1077 return PackagedLoops[Index];
1078 }
1079
1080 /// \brief Distribute mass according to a distribution.
1081 ///
1082 /// Distributes the mass in Source according to Dist. If LoopHead.isValid(),
1083 /// backedges and exits are stored in its entry in PackagedLoops.
1084 ///
1085 /// Mass is distributed in parallel from two copies of the source mass.
1086 ///
1087 /// The first mass (forward) represents the distribution of mass through the
1088 /// local DAG. This distribution should lose mass at loop exits and ignore
1089 /// backedges.
1090 ///
1091 /// The second mass (general) represents the behavior of the loop in the
1092 /// global context. In a given distribution from the head, how much mass
1093 /// exits, and to where? How much mass returns to the loop head?
1094 ///
1095 /// The forward mass should be split up between local successors and exits,
1096 /// but only actually distributed to the local successors. The general mass
1097 /// should be split up between all three types of successors, but distributed
1098 /// only to exits and backedges.
1099 void distributeMass(const BlockNode &Source, const BlockNode &LoopHead,
1100 Distribution &Dist);
1101
1102 /// \brief Compute the loop scale for a loop.
1103 void computeLoopScale(const BlockNode &LoopHead);
1104
1105 /// \brief Package up a loop.
1106 void packageLoop(const BlockNode &LoopHead);
1107
1108 /// \brief Finalize frequency metrics.
1109 ///
1110 /// Unwraps loop packages, calculates final frequencies, and cleans up
1111 /// no-longer-needed data structures.
1112 void finalizeMetrics();
1113
1114 /// \brief Clear all memory.
1115 void clear();
1116
1117 virtual std::string getBlockName(const BlockNode &Node) const;
1118
1119 virtual raw_ostream &print(raw_ostream &OS) const { return OS; }
1120 void dump() const { print(dbgs()); }
1121
1122 Float getFloatingBlockFreq(const BlockNode &Node) const;
1123
1124 BlockFrequency getBlockFreq(const BlockNode &Node) const;
1125
1126 raw_ostream &printBlockFreq(raw_ostream &OS, const BlockNode &Node) const;
1127 raw_ostream &printBlockFreq(raw_ostream &OS,
1128 const BlockFrequency &Freq) const;
1129
1130 uint64_t getEntryFreq() const {
1131 assert(!Freqs.empty());
1132 return Freqs[0].Integer;
1133 }
1134 /// \brief Virtual destructor.
1135 ///
1136 /// Need a virtual destructor to mask the compiler warning about
1137 /// getBlockName().
1138 virtual ~BlockFrequencyInfoImplBase() {}
1139 };
351140
361141 namespace bfi_detail {
371142 template struct TypeMap {};
391144 typedef BasicBlock BlockT;
401145 typedef Function FunctionT;
411146 typedef BranchProbabilityInfo BranchProbabilityInfoT;
1147 typedef Loop LoopT;
1148 typedef LoopInfo LoopInfoT;
421149 };
431150 template <> struct TypeMap {
441151 typedef MachineBasicBlock BlockT;
451152 typedef MachineFunction FunctionT;
461153 typedef MachineBranchProbabilityInfo BranchProbabilityInfoT;
1154 typedef MachineLoop LoopT;
1155 typedef MachineLoopInfo LoopInfoT;
471156 };
48 }
49
50 /// BlockFrequencyInfoImpl implements block frequency algorithm for IR and
51 /// Machine Instructions. Algorithm starts with value ENTRY_FREQ
52 /// for the entry block and then propagates frequencies using branch weights
53 /// from (Machine)BranchProbabilityInfo. LoopInfo is not required because
54 /// algorithm can find "backedges" by itself.
55 template
56 class BlockFrequencyInfoImpl {
1157
1158 /// \brief Get the name of a MachineBasicBlock.
1159 ///
1160 /// Get the name of a MachineBasicBlock. It's templated so that including from
1161 /// CodeGen is unnecessary (that would be a layering issue).
1162 ///
1163 /// This is used mainly for debug output. The name is similar to
1164 /// MachineBasicBlock::getFullName(), but skips the name of the function.
1165 template std::string getBlockName(const BlockT *BB) {
1166 assert(BB && "Unexpected nullptr");
1167 auto MachineName = "BB" + Twine(BB->getNumber());
1168 if (BB->getBasicBlock())
1169 return (MachineName + "[" + BB->getName() + "]").str();
1170 return MachineName.str();
1171 }
1172 /// \brief Get the name of a BasicBlock.
1173 template <> inline std::string getBlockName(const BasicBlock *BB) {
1174 assert(BB && "Unexpected nullptr");
1175 return BB->getName().str();
1176 }
1177 }
1178
1179 /// \brief Shared implementation for block frequency analysis.
1180 ///
1181 /// This is a shared implementation of BlockFrequencyInfo and
1182 /// MachineBlockFrequencyInfo, and calculates the relative frequencies of
1183 /// blocks.
1184 ///
1185 /// This algorithm leverages BlockMass and PositiveFloat to maintain precision,
1186 /// separates mass distribution from loop scaling, and dithers to eliminate
1187 /// probability mass loss.
1188 ///
1189 /// The implementation is split between BlockFrequencyInfoImpl, which knows the
1190 /// type of graph being modelled (BasicBlock vs. MachineBasicBlock), and
1191 /// BlockFrequencyInfoImplBase, which doesn't. The base class uses \a
1192 /// BlockNode, a wrapper around a uint32_t. BlockNode is numbered from 0 in
1193 /// reverse-post order. This gives two advantages: it's easy to compare the
1194 /// relative ordering of two nodes, and maps keyed on BlockT can be represented
1195 /// by vectors.
1196 ///
1197 /// This algorithm is O(V+E), unless there is irreducible control flow, in
1198 /// which case it's O(V*E) in the worst case.
1199 ///
1200 /// These are the main stages:
1201 ///
1202 /// 0. Reverse post-order traversal (\a initializeRPOT()).
1203 ///
1204 /// Run a single post-order traversal and save it (in reverse) in RPOT.
1205 /// All other stages make use of this ordering. Save a lookup from BlockT
1206 /// to BlockNode (the index into RPOT) in Nodes.
1207 ///
1208 /// 1. Loop indexing (\a initializeLoops()).
1209 ///
1210 /// Translate LoopInfo/MachineLoopInfo into a form suitable for the rest of
1211 /// the algorithm. In particular, store the immediate members of each loop
1212 /// in reverse post-order.
1213 ///
1214 /// 2. Calculate mass and scale in loops (\a computeMassInLoops()).
1215 ///
1216 /// For each loop (bottom-up), distribute mass through the DAG resulting
1217 /// from ignoring backedges and treating sub-loops as a single pseudo-node.
1218 /// Track the backedge mass distributed to the loop header, and use it to
1219 /// calculate the loop scale (number of loop iterations).
1220 ///
1221 /// Visiting loops bottom-up is a post-order traversal of loop headers.
1222 /// For each loop, immediate members that represent sub-loops will already
1223 /// have been visited and packaged into a pseudo-node.
1224 ///
1225 /// Distributing mass in a loop is a reverse-post-order traversal through
1226 /// the loop. Start by assigning full mass to the Loop header. For each
1227 /// node in the loop:
1228 ///
1229 /// - Fetch and categorize the weight distribution for its successors.
1230 /// If this is a packaged-subloop, the weight distribution is stored
1231 /// in \a PackagedLoopData::Exits. Otherwise, fetch it from
1232 /// BranchProbabilityInfo.
1233 ///
1234 /// - Each successor is categorized as \a Weight::Local, a normal
1235 /// forward edge within the current loop, \a Weight::Backedge, a
1236 /// backedge to the loop header, or \a Weight::Exit, any successor
1237 /// outside the loop. The weight, the successor, and its category
1238 /// are stored in \a Distribution. There can be multiple edges to
1239 /// each successor.
1240 ///
1241 /// - Normalize the distribution: scale weights down so that their sum
1242 /// is 32-bits, and coalesce multiple edges to the same node.
1243 ///
1244 /// - Distribute the mass accordingly, dithering to minimize mass loss,
1245 /// as described in \a distributeMass(). Mass is distributed in
1246 /// parallel in two ways: forward, and general. Local successors
1247 /// take their mass from the forward mass, while exit and backedge
1248 /// successors take their mass from the general mass. Additionally,
1249 /// exit edges use up (ignored) mass from the forward mass, and local
1250 /// edges use up (ignored) mass from the general distribution.
1251 ///
1252 /// Finally, calculate the loop scale from the accumulated backedge mass.
1253 ///
1254 /// 3. Distribute mass in the function (\a computeMassInFunction()).
1255 ///
1256 /// Finally, distribute mass through the DAG resulting from packaging all
1257 /// loops in the function. This uses the same algorithm as distributing
1258 /// mass in a loop, except that there are no exit or backedge edges.
1259 ///
1260 /// 4. Loop unpackaging and cleanup (\a finalizeMetrics()).
1261 ///
1262 /// Initialize the frequency to a floating point representation of its
1263 /// mass.
1264 ///
1265 /// Visit loops top-down (reverse post-order), scaling the loop header's
1266 /// frequency by its psuedo-node's mass and loop scale. Keep track of the
1267 /// minimum and maximum final frequencies.
1268 ///
1269 /// Using the min and max frequencies as a guide, translate floating point
1270 /// frequencies to an appropriate range in uint64_t.
1271 ///
1272 /// It has some known flaws.
1273 ///
1274 /// - Irreducible control flow isn't modelled correctly. In particular,
1275 /// LoopInfo and MachineLoopInfo ignore irreducible backedges. The main
1276 /// result is that irreducible SCCs will under-scaled. No mass is lost,
1277 /// but the computed branch weights for the loop pseudo-node will be
1278 /// incorrect.
1279 ///
1280 /// Modelling irreducible control flow exactly involves setting up and
1281 /// solving a group of infinite geometric series. Such precision is
1282 /// unlikely to be worthwhile, since most of our algorithms give up on
1283 /// irreducible control flow anyway.
1284 ///
1285 /// Nevertheless, we might find that we need to get closer. If
1286 /// LoopInfo/MachineLoopInfo flags loops with irreducible control flow
1287 /// (and/or the function as a whole), we can find the SCCs, compute an
1288 /// approximate exit frequency for the SCC as a whole, and scale up
1289 /// accordingly.
1290 ///
1291 /// - Loop scale is limited to 4096 per loop (2^12) to avoid exhausting
1292 /// BlockFrequency's 64-bit integer precision.
1293 template class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
571294 typedef typename bfi_detail::TypeMap::BlockT BlockT;
581295 typedef typename bfi_detail::TypeMap::FunctionT FunctionT;
591296 typedef typename bfi_detail::TypeMap::BranchProbabilityInfoT
601297 BranchProbabilityInfoT;
61
62 DenseMap Freqs;
63
64 BranchProbabilityInfoT *BPI;
65
66 FunctionT *Fn;
67
68 typedef GraphTraits< Inverse > GT;
69
70 static const uint64_t EntryFreq = 1 << 14;
71
72 std::string getBlockName(BasicBlock *BB) const {
73 return BB->getName().str();
74 }
75
76 std::string getBlockName(MachineBasicBlock *MBB) const {
77 std::string str;
78 raw_string_ostream ss(str);
79 ss << "BB#" << MBB->getNumber();
80
81 if (const BasicBlock *BB = MBB->getBasicBlock())
82 ss << " derived from LLVM BB " << BB->getName();
83
84 return ss.str();
85 }
86
87 void setBlockFreq(BlockT *BB, BlockFrequency Freq) {
88 Freqs[BB] = Freq;
89 DEBUG(dbgs() << "Frequency(" << getBlockName(BB) << ") = ";
90 printBlockFreq(dbgs(), Freq) << "\n");
91 }
92
93 /// getEdgeFreq - Return edge frequency based on SRC frequency and Src -> Dst
94 /// edge probability.
95 BlockFrequency getEdgeFreq(BlockT *Src, BlockT *Dst) const {
96 BranchProbability Prob = BPI->getEdgeProbability(Src, Dst);
97 return getBlockFreq(Src) * Prob;
98 }
99
100 /// incBlockFreq - Increase BB block frequency by FREQ.
101 ///
102 void incBlockFreq(BlockT *BB, BlockFrequency Freq) {
103 Freqs[BB] += Freq;
104 DEBUG(dbgs() << "Frequency(" << getBlockName(BB) << ") += ";
105 printBlockFreq(dbgs(), Freq) << " --> ";
106 printBlockFreq(dbgs(), Freqs[BB]) << "\n");
107 }
108
109 // All blocks in postorder.
110 std::vector POT;
111
112 // Map Block -> Position in reverse-postorder list.
113 DenseMap RPO;
114
115 // For each loop header, record the per-iteration probability of exiting the
116 // loop. This is the reciprocal of the expected number of loop iterations.
117 typedef DenseMap LoopExitProbMap;
118 LoopExitProbMap LoopExitProb;
119
120 // (reverse-)postorder traversal iterators.
121 typedef typename std::vector::iterator pot_iterator;
122 typedef typename std::vector::reverse_iterator rpot_iterator;
123
124 pot_iterator pot_begin() { return POT.begin(); }
125 pot_iterator pot_end() { return POT.end(); }
126
127 rpot_iterator rpot_begin() { return POT.rbegin(); }
128 rpot_iterator rpot_end() { return POT.rend(); }
129
130 rpot_iterator rpot_at(BlockT *BB) {
131 rpot_iterator I = rpot_begin();
132 unsigned idx = RPO.lookup(BB);
133 assert(idx);
134 std::advance(I, idx - 1);
135
136 assert(*I == BB);
137 return I;
138 }
139
140 /// isBackedge - Return if edge Src -> Dst is a reachable backedge.
141 ///
142 bool isBackedge(BlockT *Src, BlockT *Dst) const {
143 unsigned a = RPO.lookup(Src);
144 if (!a)
145 return false;
146 unsigned b = RPO.lookup(Dst);
147 assert(b && "Destination block should be reachable");
148 return a >= b;
149 }
150
151 /// getSingleBlockPred - return single BB block predecessor or NULL if
152 /// BB has none or more predecessors.
153 BlockT *getSingleBlockPred(BlockT *BB) {
154 typename GT::ChildIteratorType
155 PI = GraphTraits< Inverse >::child_begin(BB),
156 PE = GraphTraits< Inverse >::child_end(BB);
157
158 if (PI == PE)
159 return nullptr;
160
161 BlockT *Pred = *PI;
162
163 ++PI;
164 if (PI != PE)
165 return nullptr;
166
167 return Pred;
168 }
169
170 void doBlock(BlockT *BB, BlockT *LoopHead,
171 SmallPtrSet &BlocksInLoop) {
172
173 DEBUG(dbgs() << "doBlock(" << getBlockName(BB) << ")\n");
174 setBlockFreq(BB, 0);
175
176 if (BB == LoopHead) {
177 setBlockFreq(BB, EntryFreq);
178 return;
179 }
180
181 if (BlockT *Pred = getSingleBlockPred(BB)) {
182 if (BlocksInLoop.count(Pred))
183 setBlockFreq(BB, getEdgeFreq(Pred, BB));
184 // TODO: else? irreducible, ignore it for now.
185 return;
186 }
187
188 bool isInLoop = false;
189 bool isLoopHead = false;
190
191 for (typename GT::ChildIteratorType
192 PI = GraphTraits< Inverse >::child_begin(BB),
193 PE = GraphTraits< Inverse >::child_end(BB);
194 PI != PE; ++PI) {
195 BlockT *Pred = *PI;
196
197 if (isBackedge(Pred, BB)) {
198 isLoopHead = true;
199 } else if (BlocksInLoop.count(Pred)) {
200 incBlockFreq(BB, getEdgeFreq(Pred, BB));
201 isInLoop = true;
202 }
203 // TODO: else? irreducible.
204 }
205
206 if (!isInLoop)
207 return;
208
209 if (!isLoopHead)
210 return;
211
212 // This block is a loop header, so boost its frequency by the expected
213 // number of loop iterations. The loop blocks will be revisited so they all
214 // get this boost.
215 typename LoopExitProbMap::const_iterator I = LoopExitProb.find(BB);
216 assert(I != LoopExitProb.end() && "Loop header missing from table");
217 Freqs[BB] /= I->second;
218 DEBUG(dbgs() << "Loop header scaled to ";
219 printBlockFreq(dbgs(), Freqs[BB]) << ".\n");
220 }
221
222 /// doLoop - Propagate block frequency down through the loop.
223 void doLoop(BlockT *Head, BlockT *Tail) {
224 DEBUG(dbgs() << "doLoop(" << getBlockName(Head) << ", "
225 << getBlockName(Tail) << ")\n");
226
227 SmallPtrSet BlocksInLoop;
228
229 for (rpot_iterator I = rpot_at(Head), E = rpot_at(Tail); ; ++I) {
230 BlockT *BB = *I;
231 doBlock(BB, Head, BlocksInLoop);
232
233 BlocksInLoop.insert(BB);
234 if (I == E)
235 break;
236 }
237
238 // Compute loop's cyclic probability using backedges probabilities.
239 BlockFrequency BackFreq;
240 for (typename GT::ChildIteratorType
241 PI = GraphTraits< Inverse >::child_begin(Head),
242 PE = GraphTraits< Inverse >::child_end(Head);
243 PI != PE; ++PI) {
244 BlockT *Pred = *PI;
245 assert(Pred);
246 if (isBackedge(Pred, Head))
247 BackFreq += getEdgeFreq(Pred, Head);
248 }
249
250 // The cyclic probability is freq(BackEdges) / freq(Head), where freq(Head)
251 // only counts edges entering the loop, not the loop backedges.
252 // The probability of leaving the loop on each iteration is:
253 //
254 // ExitProb = 1 - CyclicProb
255 //
256 // The Expected number of loop iterations is:
257 //
258 // Iterations = 1 / ExitProb
259 //
260 uint64_t D = std::max(getBlockFreq(Head).getFrequency(), UINT64_C(1));
261 uint64_t N = std::max(BackFreq.getFrequency(), UINT64_C(1));
262 if (N < D)
263 N = D - N;
264 else
265 // We'd expect N < D, but rounding and saturation means that can't be
266 // guaranteed.
267 N = 1;
268
269 // Now ExitProb = N / D, make sure it fits in an i32/i32 fraction.
270 assert(N <= D);
271 if (D > UINT32_MAX) {
272 unsigned Shift = 32 - countLeadingZeros(D);
273 D >>= Shift;
274 N >>= Shift;
275 if (N == 0)
276 N = 1;
277 }
278 BranchProbability LEP = BranchProbability(N, D);
279 LoopExitProb.insert(std::make_pair(Head, LEP));
280 DEBUG(dbgs() << "LoopExitProb[" << getBlockName(Head) << "] = " << LEP
281 << " from 1 - ";
282 printBlockFreq(dbgs(), BackFreq) << " / ";
283 printBlockFreq(dbgs(), getBlockFreq(Head)) << ".\n");
284 }
285
286 friend class BlockFrequencyInfo;
287 friend class MachineBlockFrequencyInfo;
288
289 BlockFrequencyInfoImpl() { }
290
291 void doFunction(FunctionT *fn, BranchProbabilityInfoT *bpi) {
292 Fn = fn;
293 BPI = bpi;
294
295 // Clear everything.
296 RPO.clear();
297 POT.clear();
298 LoopExitProb.clear();
299 Freqs.clear();
300
301 BlockT *EntryBlock = fn->begin();
302
303 std::copy(po_begin(EntryBlock), po_end(EntryBlock), std::back_inserter(POT));
304
305 unsigned RPOidx = 0;
306 for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) {
307 BlockT *BB = *I;
308 RPO[BB] = ++RPOidx;
309 DEBUG(dbgs() << "RPO[" << getBlockName(BB) << "] = " << RPO[BB] << "\n");
310 }
311
312 // Travel over all blocks in postorder.
313 for (pot_iterator I = pot_begin(), E = pot_end(); I != E; ++I) {
314 BlockT *BB = *I;
315 BlockT *LastTail = nullptr;
316 DEBUG(dbgs() << "POT: " << getBlockName(BB) << "\n");
317
318 for (typename GT::ChildIteratorType
319 PI = GraphTraits< Inverse >::child_begin(BB),
320 PE = GraphTraits< Inverse >::child_end(BB);
321 PI != PE; ++PI) {
322
323 BlockT *Pred = *PI;
324 if (isBackedge(Pred, BB) && (!LastTail || RPO[Pred] > RPO[LastTail]))
325 LastTail = Pred;
326 }
327
328 if (LastTail)
329 doLoop(BB, LastTail);
330 }
331
332 // At the end assume the whole function as a loop, and travel over it once
333 // again.
334 doLoop(*(rpot_begin()), *(pot_begin()));
1298 typedef typename bfi_detail::TypeMap::LoopT LoopT;
1299 typedef typename bfi_detail::TypeMap::LoopInfoT LoopInfoT;
1300
1301 typedef GraphTraits Successor;
1302 typedef GraphTraits> Predecessor;
1303
1304 const BranchProbabilityInfoT *BPI;
1305 const LoopInfoT *LI;
1306 const FunctionT *F;
1307
1308 // All blocks in reverse postorder.
1309 std::vector RPOT;
1310 DenseMap Nodes;
1311
1312 typedef typename std::vector::const_iterator rpot_iterator;
1313
1314 rpot_iterator rpot_begin() const { return RPOT.begin(); }
1315 rpot_iterator rpot_end() const { return RPOT.end(); }
1316
1317 size_t getIndex(const rpot_iterator &I) const { return I - rpot_begin(); }
1318
1319 BlockNode getNode(const rpot_iterator &I) const {
1320 return BlockNode(getIndex(I));
1321 }
1322 BlockNode getNode(const BlockT *BB) const { return Nodes.lookup(BB); }
1323
1324 const BlockT *getBlock(const BlockNode &Node) const {
1325 assert(Node.Index < RPOT.size());
1326 return RPOT[Node.Index];
1327 }
1328
1329 void initializeRPOT();
1330 void initializeLoops();
1331 void runOnFunction(const FunctionT *F);
1332
1333 void propagateMassToSuccessors(const BlockNode &LoopHead,
1334 const BlockNode &Node);
1335 void computeMassInLoops();
1336 void computeMassInLoop(const BlockNode &LoopHead);
1337 void computeMassInFunction();
1338
1339 std::string getBlockName(const BlockNode &Node) const override {
1340 return bfi_detail::getBlockName(getBlock(Node));
3351341 }
3361342
3371343 public:
338
339 uint64_t getEntryFreq() { return EntryFreq; }
340
341 /// getBlockFreq - Return block frequency. Return 0 if we don't have it.
1344 const FunctionT *getFunction() const { return F; }
1345
1346 void doFunction(const FunctionT *F, const BranchProbabilityInfoT *BPI,
1347 const LoopInfoT *LI);
1348 BlockFrequencyInfoImpl() : BPI(0), LI(0), F(0) {}
1349
1350 using BlockFrequencyInfoImplBase::getEntryFreq;
3421351 BlockFrequency getBlockFreq(const BlockT *BB) const {
343 typename DenseMap::const_iterator
344 I = Freqs.find(BB);
345 if (I != Freqs.end())
346 return I->second;
347 return 0;
348 }
349
350 void print(raw_ostream &OS) const {
351 OS << "\n\n---- Block Freqs ----\n";
352 for (typename FunctionT::iterator I = Fn->begin(), E = Fn->end(); I != E;) {
353 BlockT *BB = I++;
354 OS << " " << getBlockName(BB) << " = ";
355 printBlockFreq(OS, getBlockFreq(BB)) << "\n";
356
357 for (typename GraphTraits::ChildIteratorType
358 SI = GraphTraits::child_begin(BB),
359 SE = GraphTraits::child_end(BB); SI != SE; ++SI) {
360 BlockT *Succ = *SI;
361 OS << " " << getBlockName(BB) << " -> " << getBlockName(Succ)
362 << " = "; printBlockFreq(OS, getEdgeFreq(BB, Succ)) << "\n";
363 }
364 }
365 }
366
367 void dump() const {
368 print(dbgs());
369 }
370
371 // Utility method that looks up the block frequency associated with BB and
372 // prints it to OS.
373 raw_ostream &printBlockFreq(raw_ostream &OS,
374 const BlockT *BB) {
375 return printBlockFreq(OS, getBlockFreq(BB));
376 }
377
378 raw_ostream &printBlockFreq(raw_ostream &OS,
379 const BlockFrequency &Freq) const {
380 // Convert fixed-point number to decimal.
381 uint64_t Frequency = Freq.getFrequency();
382 OS << Frequency / EntryFreq << ".";
383 uint64_t Rem = Frequency % EntryFreq;
384 uint64_t Eps = 1;
385 do {
386 Rem *= 10;
387 Eps *= 10;
388 OS << Rem / EntryFreq;
389 Rem = Rem % EntryFreq;
390 } while (Rem >= Eps/2);
1352 return BlockFrequencyInfoImplBase::getBlockFreq(getNode(BB));
1353 }
1354 Float getFloatingBlockFreq(const BlockT *BB) const {
1355 return BlockFrequencyInfoImplBase::getFloatingBlockFreq(getNode(BB));
1356 }
1357
1358 /// \brief Print the frequencies for the current function.
1359 ///
1360 /// Prints the frequencies for the blocks in the current function.
1361 ///
1362 /// Blocks are printed in the natural iteration order of the function, rather
1363 /// than reverse post-order. This provides two advantages: writing -analyze
1364 /// tests is easier (since blocks come out in source order), and even
1365 /// unreachable blocks are printed.
1366 ///
1367 /// \a BlockFrequencyInfoImplBase::print() only knows reverse post-order, so
1368 /// we need to override it here.
1369 raw_ostream &print(raw_ostream &OS) const override;
1370 using BlockFrequencyInfoImplBase::dump;
1371
1372 using BlockFrequencyInfoImplBase::printBlockFreq;
1373 raw_ostream &printBlockFreq(raw_ostream &OS, const BlockT *BB) const {
1374 return BlockFrequencyInfoImplBase::printBlockFreq(OS, getNode(BB));
1375 }
1376 };
1377
1378 template
1379 void BlockFrequencyInfoImpl::doFunction(const FunctionT *F,
1380 const BranchProbabilityInfoT *BPI,
1381 const LoopInfoT *LI) {
1382 // Save the parameters.
1383 this->BPI = BPI;
1384 this->LI = LI;
1385 this->F = F;
1386
1387 // Clean up left-over data structures.
1388 BlockFrequencyInfoImplBase::clear();
1389 RPOT.clear();
1390 Nodes.clear();
1391
1392 // Initialize.
1393 DEBUG(dbgs() << "\nblock-frequency: " << F->getName() << "\n================="
1394 << std::string(F->getName().size(), '=') << "\n");
1395 initializeRPOT();
1396 initializeLoops();
1397
1398 // Visit loops in post-order to find thelocal mass distribution, and then do
1399 // the full function.
1400 computeMassInLoops();
1401 computeMassInFunction();
1402 finalizeMetrics();
1403 }
1404
1405 template void BlockFrequencyInfoImpl::initializeRPOT() {
1406 const BlockT *Entry = F->begin();
1407 RPOT.reserve(F->size());
1408 std::copy(po_begin(Entry), po_end(Entry), std::back_inserter(RPOT));
1409 std::reverse(RPOT.begin(), RPOT.end());
1410
1411 assert(RPOT.size() - 1 <= BlockNode::getMaxIndex() &&
1412 "More nodes in function than Block Frequency Info supports");
1413
1414 DEBUG(dbgs() << "reverse-post-order-traversal\n");
1415 for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) {
1416 BlockNode Node = getNode(I);
1417 DEBUG(dbgs() << " - " << getIndex(I) << ": " << getBlockName(Node) << "\n");
1418 Nodes[*I] = Node;
1419 }
1420
1421 Working.resize(RPOT.size());
1422 Freqs.resize(RPOT.size());
1423 }
1424
1425 template void BlockFrequencyInfoImpl::initializeLoops() {
1426 DEBUG(dbgs() << "loop-detection\n");
1427 if (LI->empty())
1428 return;
1429
1430 // Visit loops top down and assign them an index.
1431 std::deque Q;
1432 Q.insert(Q.end(), LI->begin(), LI->end());
1433 while (!Q.empty()) {
1434 const LoopT *Loop = Q.front();
1435 Q.pop_front();
1436 Q.insert(Q.end(), Loop->begin(), Loop->end());
1437
1438 // Save the order this loop was visited.
1439 BlockNode Header = getNode(Loop->getHeader());
1440 assert(Header.isValid());
1441
1442 Working[Header.Index].LoopIndex = PackagedLoops.size();
1443 PackagedLoops.emplace_back(Header);
1444 DEBUG(dbgs() << " - loop = " << getBlockName(Header) << "\n");
1445 }
1446
1447 // Visit nodes in reverse post-order and add them to their deepest containing
1448 // loop.
1449 for (size_t Index = 0; Index < RPOT.size(); ++Index) {
1450 const LoopT *Loop = LI->getLoopFor(RPOT[Index]);
1451 if (!Loop)
1452 continue;
1453
1454 // If this is a loop header, find its parent loop (if any).
1455 if (Working[Index].isLoopHeader())
1456 if (!(Loop = Loop->getParentLoop()))
1457 continue;
1458
1459 // Add this node to its containing loop's member list.
1460 BlockNode Header = getNode(Loop->getHeader());
1461 assert(Header.isValid());
1462 const auto &HeaderData = Working[Header.Index];
1463 assert(HeaderData.isLoopHeader());
1464
1465 Working[Index].ContainingLoop = Header;
1466 PackagedLoops[HeaderData.LoopIndex].Members.push_back(Index);
1467 DEBUG(dbgs() << " - loop = " << getBlockName(Header)
1468 << ": member = " << getBlockName(Index) << "\n");
1469 }
1470 }
1471
1472 template void BlockFrequencyInfoImpl::computeMassInLoops() {
1473 // Visit loops with the deepest first, and the top-level loops last.
1474 for (auto L = PackagedLoops.rbegin(), LE = PackagedLoops.rend(); L != LE; ++L)
1475 computeMassInLoop(L->Header);
1476 }
1477
1478 template
1479 void BlockFrequencyInfoImpl::computeMassInLoop(const BlockNode &LoopHead) {
1480 // Compute mass in loop.
1481 DEBUG(dbgs() << "compute-mass-in-loop: " << getBlockName(LoopHead) << "\n");
1482
1483 Working[LoopHead.Index].Mass = BlockMass::getFull();
1484 propagateMassToSuccessors(LoopHead, LoopHead);
1485
1486 for (const BlockNode &M : getLoopPackage(LoopHead).Members)
1487 propagateMassToSuccessors(LoopHead, M);
1488
1489 computeLoopScale(LoopHead);
1490 packageLoop(LoopHead);
1491 }
1492
1493 template void BlockFrequencyInfoImpl::computeMassInFunction() {
1494 // Compute mass in function.
1495 DEBUG(dbgs() << "compute-mass-in-function\n");
1496 assert(!Working.empty() && "no blocks in function");
1497 assert(!Working[0].isLoopHeader() && "entry block is a loop header");
1498
1499 Working[0].Mass = BlockMass::getFull();
1500 for (rpot_iterator I = rpot_begin(), IE = rpot_end(); I != IE; ++I) {
1501 // Check for nodes that have been packaged.
1502 BlockNode Node = getNode(I);
1503 if (Working[Node.Index].hasLoopHeader())
1504 continue;
1505
1506 propagateMassToSuccessors(BlockNode(), Node);
1507 }
1508 }
1509
1510 template
1511 void
1512 BlockFrequencyInfoImpl::propagateMassToSuccessors(const BlockNode &LoopHead,
1513 const BlockNode &Node) {
1514 DEBUG(dbgs() << " - node: " << getBlockName(Node) << "\n");
1515 // Calculate probability for successors.
1516 Distribution Dist;
1517 if (Node != LoopHead && Working[Node.Index].isLoopHeader())
1518 addLoopSuccessorsToDist(LoopHead, Node, Dist);
1519 else {
1520 const BlockT *BB = getBlock(Node);
1521 for (auto SI = Successor::child_begin(BB), SE = Successor::child_end(BB);
1522 SI != SE; ++SI)
1523 // Do not dereference SI, or getEdgeWeight() is linear in the number of
1524 // successors.
1525 addToDist(Dist, LoopHead, Node, getNode(*SI), BPI->getEdgeWeight(BB, SI));
1526 }
1527
1528 // Distribute mass to successors, saving exit and backedge data in the
1529 // loop header.
1530 distributeMass(Node, LoopHead, Dist);
1531 }
1532
1533 template
1534 raw_ostream &BlockFrequencyInfoImpl::print(raw_ostream &OS) const {
1535 if (!F)
3911536 return OS;
392 }
393
394 };
395
1537 OS << "block-frequency-info: " << F->getName() << "\n";
1538 for (const BlockT &BB : *F)
1539 OS << " - " << bfi_detail::getBlockName(&BB)
1540 << ": float = " << getFloatingBlockFreq(&BB)
1541 << ", int = " << getBlockFreq(&BB).getFrequency() << "\n";
1542
1543 // Add an extra newline for readability.
1544 OS << "\n";
1545 return OS;
1546 }
3961547 }
3971548
3981549 #endif
1010 //
1111 //===----------------------------------------------------------------------===//
1212
13 #define DEBUG_TYPE "block-freq"
1314 #include "llvm/Analysis/BlockFrequencyInfo.h"
1415 #include "llvm/Analysis/BlockFrequencyInfoImpl.h"
1516 #include "llvm/Analysis/BranchProbabilityInfo.h"
105106 INITIALIZE_PASS_BEGIN(BlockFrequencyInfo, "block-freq",
106107 "Block Frequency Analysis", true, true)
107108 INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfo)
109 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
108110 INITIALIZE_PASS_END(BlockFrequencyInfo, "block-freq",
109111 "Block Frequency Analysis", true, true)
110112
119121
120122 void BlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const {
121123 AU.addRequired();
124 AU.addRequired();
122125 AU.setPreservesAll();
123126 }
124127
125128 bool BlockFrequencyInfo::runOnFunction(Function &F) {
126129 BranchProbabilityInfo &BPI = getAnalysis();
130 LoopInfo &LI = getAnalysis();
127131 if (!BFI)
128132 BFI.reset(new ImplType);
129 BFI->doFunction(&F, &BPI);
133 BFI->doFunction(&F, &BPI, &LI);
130134 #ifndef NDEBUG
131135 if (ViewBlockFreqPropagationDAG != GVDT_None)
132136 view();
157161 }
158162
159163 const Function *BlockFrequencyInfo::getFunction() const {
160 return BFI ? BFI->Fn : nullptr;
164 return BFI ? BFI->getFunction() : nullptr;
161165 }
162166
163167 raw_ostream &BlockFrequencyInfo::
0 //===- BlockFrequencyImplInfo.cpp - Block Frequency Info Implementation ---===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Loops should be simplified before this analysis.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #define DEBUG_TYPE "block-freq"
14 #include "llvm/Analysis/BlockFrequencyInfoImpl.h"
15 #include "llvm/ADT/APFloat.h"
16 #include "llvm/Support/raw_ostream.h"
17 #include
18
19 using namespace llvm;
20
21 //===----------------------------------------------------------------------===//
22 //
23 // PositiveFloat implementation.
24 //
25 //===----------------------------------------------------------------------===//
26 #ifndef _MSC_VER
27 const int32_t PositiveFloatBase::MaxExponent;
28 const int32_t PositiveFloatBase::MinExponent;
29 #endif
30
31 static void appendDigit(std::string &Str, unsigned D) {
32 assert(D < 10);
33 Str += '0' + D % 10;
34 }
35
36 static void appendNumber(std::string &Str, uint64_t N) {
37 while (N) {
38 appendDigit(Str, N % 10);
39 N /= 10;
40 }
41 }
42
43 static bool doesRoundUp(char Digit) {
44 switch (Digit) {
45 case '5':
46 case '6':
47 case '7':
48 case '8':
49 case '9':
50 return true;
51 default:
52 return false;
53 }
54 }
55
56 static std::string toStringAPFloat(uint64_t D, int E, unsigned Precision) {
57 assert(E >= PositiveFloatBase::MinExponent);
58 assert(E <= PositiveFloatBase::MaxExponent);
59
60 // Find a new E, but don't let it increase past MaxExponent.
61 int LeadingZeros = PositiveFloatBase::countLeadingZeros64(D);
62 int NewE = std::min(PositiveFloatBase::MaxExponent, E + 63 - LeadingZeros);
63 int Shift = 63 - (NewE - E);
64 assert(Shift <= LeadingZeros);
65 assert(Shift == LeadingZeros || NewE == PositiveFloatBase::MaxExponent);
66 D <<= Shift;
67 E = NewE;
68
69 // Check for a denormal.
70 unsigned AdjustedE = E + 16383;
71 if (!(D >> 63)) {
72 assert(E == PositiveFloatBase::MaxExponent);
73 AdjustedE = 0;
74 }
75
76 // Build the float and print it.
77 uint64_t RawBits[2] = {D, AdjustedE};
78 APFloat Float(APFloat::x87DoubleExtended, APInt(80, RawBits));
79 SmallVector Chars;
80 Float.toString(Chars, Precision, 0);
81 return std::string(Chars.begin(), Chars.end());
82 }
83
84 static std::string stripTrailingZeros(const std::string &Float) {
85 size_t NonZero = Float.find_last_not_of('0');
86 assert(NonZero != std::string::npos && "no . in floating point string");
87
88 if (Float[NonZero] == '.')
89 ++NonZero;
90
91 return Float.substr(0, NonZero + 1);
92 }
93
94 std::string PositiveFloatBase::toString(uint64_t D, int16_t E, int Width,
95 unsigned Precision) {
96 if (!D)
97 return "0.0";
98
99 // Canonicalize exponent and digits.
100 uint64_t Above0 = 0;
101 uint64_t Below0 = 0;
102 uint64_t Extra = 0;
103 int ExtraShift = 0;
104 if (E == 0) {
105 Above0 = D;
106 } else if (E > 0) {
107 if (int Shift = std::min(int16_t(countLeadingZeros64(D)), E)) {
108 D <<= Shift;
109 E -= Shift;
110
111 if (!E)
112 Above0 = D;
113 }
114 } else if (E > -64) {
115 Above0 = D >> -E;
116 Below0 = D << (64 + E);
117 } else if (E > -120) {
118 Below0 = D >> (-E - 64);
119 Extra = D << (128 + E);
120 ExtraShift = -64 - E;
121 }
122
123 // Fall back on APFloat for very small and very large numbers.
124 if (!Above0 && !Below0)
125 return toStringAPFloat(D, E, Precision);
126
127 // Append the digits before the decimal.
128 std::string Str;
129 size_t DigitsOut = 0;
130 if (Above0) {
131 appendNumber(Str, Above0);
132 DigitsOut = Str.size();
133 } else
134 appendDigit(Str, 0);
135 std::reverse(Str.begin(), Str.end());
136
137 // Return early if there's nothing after the decimal.
138 if (!Below0)
139 return Str + ".0";
140
141 // Append the decimal and beyond.
142 Str += '.';
143 uint64_t Error = UINT64_C(1) << (64 - Width);
144
145 // We need to shift Below0 to the right to make space for calculating
146 // digits. Save the precision we're losing in Extra.
147 Extra = (Below0 & 0xf) << 56 | (Extra >> 8);
148 Below0 >>= 4;
149 size_t SinceDot = 0;
150 size_t AfterDot = Str.size();
151 do {
152 if (ExtraShift) {
153 --ExtraShift;
154 Error *= 5;
155 } else
156 Error *= 10;
157
158 Below0 *= 10;
159 Extra *= 10;
160 Below0 += (Extra >> 60);
161 Extra = Extra & (UINT64_MAX >> 4);
162 appendDigit(Str, Below0 >> 60);
163 Below0 = Below0 & (UINT64_MAX >> 4);
164 if (DigitsOut || Str.back() != '0')
165 ++DigitsOut;
166 ++SinceDot;
167 } while (Error && (Below0 << 4 | Extra >> 60) >= Error / 2 &&
168 (!Precision || DigitsOut <= Precision || SinceDot < 2));
169
170 // Return early for maximum precision.
171 if (!Precision || DigitsOut <= Precision)
172 return stripTrailingZeros(Str);
173
174 // Find where to truncate.
175 size_t Truncate =
176 std::max(Str.size() - (DigitsOut - Precision), AfterDot + 1);
177
178 // Check if there's anything to truncate.
179 if (Truncate >= Str.size())
180 return stripTrailingZeros(Str);
181
182 bool Carry = doesRoundUp(Str[Truncate]);
183 if (!Carry)
184 return stripTrailingZeros(Str.substr(0, Truncate));
185
186 // Round with the first truncated digit.
187 for (std::string::reverse_iterator I(Str.begin() + Truncate), E = Str.rend();
188 I != E; ++I) {
189 if (*I == '.')
190 continue;
191 if (*I == '9') {
192 *I = '0';
193 continue;
194 }
195
196 ++*I;
197 Carry = false;
198 break;
199 }
200
201 // Add "1" in front if we still need to carry.
202 return stripTrailingZeros(std::string(Carry, '1') + Str.substr(0, Truncate));
203 }
204
205 raw_ostream &PositiveFloatBase::print(raw_ostream &OS, uint64_t D, int16_t E,
206 int Width, unsigned Precision) {
207 return OS << toString(D, E, Width, Precision);
208 }
209
210 void PositiveFloatBase::dump(uint64_t D, int16_t E, int Width) {
211 print(dbgs(), D, E, Width, 0) << "[" << Width << ":" << D << "*2^" << E
212 << "]";
213 }
214
215 static std::pair
216 getRoundedFloat(uint64_t N, bool ShouldRound, int64_t Shift) {
217 if (ShouldRound)
218 if (!++N)
219 // Rounding caused an overflow.
220 return std::make_pair(UINT64_C(1), Shift + 64);
221 return std::make_pair(N, Shift);
222 }
223
224 std::pair PositiveFloatBase::divide64(uint64_t Dividend,
225 uint64_t Divisor) {
226 // Input should be sanitized.
227 assert(Divisor);
228 assert(Dividend);
229
230 // Minimize size of divisor.
231 int16_t Shift = 0;
232 if (int Zeros = countTrailingZeros(Divisor)) {
233 Shift -= Zeros;
234 Divisor >>= Zeros;
235 }
236
237 // Check for powers of two.
238 if (Divisor == 1)
239 return std::make_pair(Dividend, Shift);
240
241 // Maximize size of dividend.
242 if (int Zeros = countLeadingZeros64(Dividend)) {
243 Shift -= Zeros;
244 Dividend <<= Zeros;
245 }
246
247 // Start with the result of a divide.
248 uint64_t Quotient = Dividend / Divisor;
249 Dividend %= Divisor;
250
251 // Continue building the quotient with long division.
252 //
253 // TODO: continue with largers digits.
254 while (!(Quotient >> 63) && Dividend) {
255 // Shift Dividend, and check for overflow.
256 bool IsOverflow = Dividend >> 63;
257 Dividend <<= 1;
258 --Shift;
259
260 // Divide.
261 bool DoesDivide = IsOverflow || Divisor <= Dividend;
262 Quotient = (Quotient << 1) | uint64_t(DoesDivide);
263 Dividend -= DoesDivide ? Divisor : 0;
264 }
265
266 // Round.
267 if (Dividend >= getHalf(Divisor))
268 if (!++Quotient)
269 // Rounding caused an overflow in Quotient.
270 return std::make_pair(UINT64_C(1), Shift + 64);
271
272 return getRoundedFloat(Quotient, Dividend >= getHalf(Divisor), Shift);
273 }
274
275 std::pair PositiveFloatBase::multiply64(uint64_t L,
276 uint64_t R) {
277 // Separate into two 32-bit digits (U.L).
278 uint64_t UL = L >> 32, LL = L & UINT32_MAX, UR = R >> 32, LR = R & UINT32_MAX;
279
280 // Compute cross products.
281 uint64_t P1 = UL * UR, P2 = UL * LR, P3 = LL * UR, P4 = LL * LR;
282
283 // Sum into two 64-bit digits.
284 uint64_t Upper = P1, Lower = P4;
285 auto addWithCarry = [&](uint64_t N) {
286 uint64_t NewLower = Lower + (N << 32);
287 Upper += (N >> 32) + (NewLower < Lower);
288 Lower = NewLower;
289 };
290 addWithCarry(P2);
291 addWithCarry(P3);
292
293 // Check whether the upper digit is empty.
294 if (!Upper)
295 return std::make_pair(Lower, 0);
296
297 // Shift as little as possible to maximize precision.
298 unsigned LeadingZeros = countLeadingZeros64(Upper);
299 int16_t Shift = 64 - LeadingZeros;
300 if (LeadingZeros)
301 Upper = Upper << LeadingZeros | Lower >> Shift;
302 bool ShouldRound = Shift && (Lower & UINT64_C(1) << (Shift - 1));
303 return getRoundedFloat(Upper, ShouldRound, Shift);
304 }
305
306 //===----------------------------------------------------------------------===//
307 //
308 // BlockMass implementation.
309 //
310 //===----------------------------------------------------------------------===//
311 BlockMass &BlockMass::operator*=(const BranchProbability &P) {
312 uint32_t N = P.getNumerator(), D = P.getDenominator();
313 assert(D && "divide by 0");
314 assert(N <= D && "fraction greater than 1");
315
316 // Fast path for multiplying by 1.0.
317 if (!Mass || N == D)
318 return *this;
319
320 // Get as much precision as we can.
321 int Shift = countLeadingZeros(Mass);
322 uint64_t ShiftedQuotient = (Mass << Shift) / D;
323 uint64_t Product = ShiftedQuotient * N >> Shift;
324
325 // Now check for what's lost.
326 uint64_t Left = ShiftedQuotient * (D - N) >> Shift;
327 uint64_t Lost = Mass - Product - Left;
328
329 // TODO: prove this assertion.
330 assert(Lost <= UINT32_MAX);
331
332 // Take the product plus a portion of the spoils.
333 Mass = Product + Lost * N / D;
334 return *this;
335 }
336
337 PositiveFloat BlockMass::toFloat() const {
338 if (isFull())
339 return PositiveFloat(1, 0);
340 return PositiveFloat(getMass() + 1, -64);
341 }
342
343 void BlockMass::dump() const { print(dbgs()); }
344
345 static char getHexDigit(int N) {
346 assert(N < 16);
347 if (N < 10)
348 return '0' + N;
349 return 'a' + N - 10;
350 }
351 raw_ostream &BlockMass::print(raw_ostream &OS) const {
352 for (int Digits = 0; Digits < 16; ++Digits)
353 OS << getHexDigit(Mass >> (60 - Digits * 4) & 0xf);
354 return OS;
355 }
356
357 //===----------------------------------------------------------------------===//
358 //
359 // BlockFrequencyInfoImpl implementation.
360 //
361 //===----------------------------------------------------------------------===//
362 namespace {
363
364 typedef BlockFrequencyInfoImplBase::BlockNode BlockNode;
365 typedef BlockFrequencyInfoImplBase::Distribution Distribution;
366 typedef BlockFrequencyInfoImplBase::Distribution::WeightList WeightList;
367 typedef BlockFrequencyInfoImplBase::Float Float;
368 typedef BlockFrequencyInfoImplBase::PackagedLoopData PackagedLoopData;
369 typedef BlockFrequencyInfoImplBase::Weight Weight;
370 typedef BlockFrequencyInfoImplBase::FrequencyData FrequencyData;
371
372 /// \brief Dithering mass distributer.
373 ///
374 /// This class splits up a single mass into portions by weight, dithering to
375 /// spread out error. No mass is lost. The dithering precision depends on the
376 /// precision of the product of \a BlockMass and \a BranchProbability.
377 ///
378 /// The distribution algorithm follows.
379 ///
380 /// 1. Initialize by saving the sum of the weights in \a RemWeight and the
381 /// mass to distribute in \a RemMass.
382 ///
383 /// 2. For each portion:
384 ///
385 /// 1. Construct a branch probability, P, as the portion's weight divided
386 /// by the current value of \a RemWeight.
387 /// 2. Calculate the portion's mass as \a RemMass times P.
388 /// 3. Update \a RemWeight and \a RemMass at each portion by subtracting
389 /// the current portion's weight and mass.
390 ///
391 /// Mass is distributed in two ways: full distribution and forward
392 /// distribution. The latter ignores backedges, and uses the parallel fields
393 /// \a RemForwardWeight and \a RemForwardMass.
394 struct DitheringDistributer {
395 uint32_t RemWeight;
396 uint32_t RemForwardWeight;
397
398 BlockMass RemMass;
399 BlockMass RemForwardMass;
400
401 DitheringDistributer(Distribution &Dist, const BlockMass &Mass);
402
403 BlockMass takeLocalMass(uint32_t Weight) {
404 (void)takeMass(Weight);
405 return takeForwardMass(Weight);
406 }
407 BlockMass takeExitMass(uint32_t Weight) {
408 (void)takeForwardMass(Weight);
409 return takeMass(Weight);
410 }
411 BlockMass takeBackedgeMass(uint32_t Weight) { return takeMass(Weight); }
412
413 private:
414 BlockMass takeForwardMass(uint32_t Weight);
415 BlockMass takeMass(uint32_t Weight);
416 };
417 }
418
419 DitheringDistributer::DitheringDistributer(Distribution &Dist,
420 const BlockMass &Mass) {
421 Dist.normalize();
422 RemWeight = Dist.Total;
423 RemForwardWeight = Dist.ForwardTotal;
424 RemMass = Mass;
425 RemForwardMass = Dist.ForwardTotal ? Mass : BlockMass();
426 }
427
428 BlockMass DitheringDistributer::takeForwardMass(uint32_t Weight) {
429 // Compute the amount of mass to take.
430 assert(Weight && "invalid weight");
431 assert(Weight <= RemForwardWeight);
432 BlockMass Mass = RemForwardMass * BranchProbability(Weight, RemForwardWeight);
433
434 // Decrement totals (dither).
435 RemForwardWeight -= Weight;
436 RemForwardMass -= Mass;
437 return Mass;
438 }
439 BlockMass DitheringDistributer::takeMass(uint32_t Weight) {
440 assert(Weight && "invalid weight");
441 assert(Weight <= RemWeight);
442 BlockMass Mass = RemMass * BranchProbability(Weight, RemWeight);
443
444 // Decrement totals (dither).
445 RemWeight -= Weight;
446 RemMass -= Mass;
447 return Mass;
448 }
449
450 void Distribution::add(const BlockNode &Node, uint64_t Amount,
451 Weight::DistType Type) {
452 assert(Amount && "invalid weight of 0");
453 uint64_t NewTotal = Total + Amount;
454
455 // Check for overflow. It should be impossible to overflow twice.
456 bool IsOverflow = NewTotal < Total;
457 assert(!(DidOverflow && IsOverflow) && "unexpected repeated overflow");
458 DidOverflow |= IsOverflow;
459
460 // Update the total.
461 Total = NewTotal;
462
463 // Save the weight.
464 Weight W;
465 W.TargetNode = Node;
466 W.Amount = Amount;
467 W.Type = Type;
468 Weights.push_back(W);
469
470 if (Type == Weight::Backedge)
471 return;
472
473 // Update forward total. Don't worry about overflow here, since then Total
474 // will exceed 32-bits and they'll both be recomputed in normalize().
475 ForwardTotal += Amount;
476 }
477
478 static void combineWeight(Weight &W, const Weight &OtherW) {
479 assert(OtherW.TargetNode.isValid());
480 if (!W.Amount) {
481 W = OtherW;
482 return;
483 }
484 assert(W.Type == OtherW.Type);
485 assert(W.TargetNode == OtherW.TargetNode);
486 assert(W.Amount < W.Amount + OtherW.Amount);
487 W.Amount += OtherW.Amount;
488 }
489 static void combineWeightsBySorting(WeightList &Weights) {
490 // Sort so edges to the same node are adjacent.
491 std::sort(Weights.begin(), Weights.end(),
492 [](const Weight &L,
493 const Weight &R) { return L.TargetNode < R.TargetNode; });
494
495 // Combine adjacent edges.
496 WeightList::iterator O = Weights.begin();
497 for (WeightList::const_iterator I = O, L = O, E = Weights.end(); I != E;
498 ++O, (I = L)) {
499 *O = *I;
500
501 // Find the adjacent weights to the same node.
502 for (++L; L != E && I->TargetNode == L->TargetNode; ++L)
503 combineWeight(*O, *L);
504 }
505
506 // Erase extra entries.
507 Weights.erase(O, Weights.end());
508 return;
509 }
510 static void combineWeightsByHashing(WeightList &Weights) {
511 // Collect weights into a DenseMap.
512 typedef DenseMap HashTable;
513 HashTable Combined(NextPowerOf2(2 * Weights.size()));
514 for (const Weight &W : Weights)
515 combineWeight(Combined[W.TargetNode.Index], W);
516
517 // Check whether anything changed.
518 if (Weights.size() == Combined.size())
519 return;
520
521 // Fill in the new weights.
522 Weights.clear();
523 Weights.reserve(Combined.size());
524 for (const auto &I : Combined)
525 Weights.push_back(I.second);
526 }
527 static void combineWeights(WeightList &Weights) {
528 // Use a hash table for many successors to keep this linear.
529 if (Weights.size() > 128) {
530 combineWeightsByHashing(Weights);
531 return;
532 }
533
534 combineWeightsBySorting(Weights);
535 }
536 static uint64_t shiftRightAndRound(uint64_t N, int Shift) {
537 assert(Shift >= 0);
538 assert(Shift < 64);
539 if (!Shift)
540 return N;
541 return (N >> Shift) + (UINT64_C(1) & N >> (Shift - 1));
542 }
543 void Distribution::normalize() {
544 // Early exit for termination nodes.
545 if (Weights.empty())
546 return;
547
548 // Only bother if there are multiple successors.
549 if (Weights.size() > 1)
550 combineWeights(Weights);
551
552 // Early exit when combined into a single successor.
553 if (Weights.size() == 1) {
554 Total = 1;
555 ForwardTotal = Weights.front().Type != Weight::Backedge;
556 Weights.front().Amount = 1;
557 return;
558 }
559
560 // Determine how much to shift right so that the total fits into 32-bits.
561 //
562 // If we shift at all, shift by 1 extra. Otherwise, the lower limit of 1
563 // for each weight can cause a 32-bit overflow.
564 int Shift = 0;
565 if (DidOverflow)
566 Shift = 33;
567 else if (Total > UINT32_MAX)
568 Shift = 33 - countLeadingZeros(Total);
569
570 // Early exit if nothing needs to be scaled.
571 if (!Shift)
572 return;
573
574 // Recompute the total through accumulation (rather than shifting it) so that
575 // it's accurate after shifting. ForwardTotal is dirty here anyway.
576 Total = 0;
577 ForwardTotal = 0;
578
579 // Sum the weights to each node and shift right if necessary.
580 for (Weight &W : Weights) {
581 // Scale down below UINT32_MAX. Since Shift is larger than necessary, we
582 // can round here without concern about overflow.
583 assert(W.TargetNode.isValid());
584 W.Amount = std::max(UINT64_C(1), shiftRightAndRound(W.Amount, Shift));
585 assert(W.Amount <= UINT32_MAX);
586
587 // Update the total.
588 Total += W.Amount;
589 if (W.Type == Weight::Backedge)
590 continue;
591
592 // Update the forward total.
593 ForwardTotal += W.Amount;
594 }
595 assert(Total <= UINT32_MAX);
596 }
597
598 void BlockFrequencyInfoImplBase::clear() {
599 *this = BlockFrequencyInfoImplBase();
600 }
601
602 /// \brief Clear all memory not needed downstream.
603 ///
604 /// Releases all memory not used downstream. In particular, saves Freqs.
605 static void cleanup(BlockFrequencyInfoImplBase &BFI) {
606 std::vector SavedFreqs(std::move(BFI.Freqs));
607 BFI.clear();
608 BFI.Freqs = std::move(SavedFreqs);
609 }
610
611 /// \brief Get a possibly packaged node.
612 ///
613 /// Get the node currently representing Node, which could be a containing
614 /// loop.
615 ///
616 /// This function should only be called when distributing mass. As long as
617 /// there are no irreducilbe edges to Node, then it will have complexity O(1)
618 /// in this context.
619 ///
620 /// In general, the complexity is O(L), where L is the number of loop headers
621 /// Node has been packaged into. Since this method is called in the context
622 /// of distributing mass, L will be the number of loop headers an early exit
623 /// edge jumps out of.
624 static BlockNode getPackagedNode(const BlockFrequencyInfoImplBase &BFI,
625 const BlockNode &Node) {
626 assert(Node.isValid());
627 if (!BFI.Working[Node.Index].IsPackaged)
628 return Node;
629 if (!BFI.Working[Node.Index].ContainingLoop.isValid())
630 return Node;
631 return getPackagedNode(BFI, BFI.Working[Node.Index].ContainingLoop);
632 }
633
634 /// \brief Get the appropriate mass for a possible pseudo-node loop package.
635 ///
636 /// Get appropriate mass for Node. If Node is a loop-header (whose loop has
637 /// been packaged), returns the mass of its pseudo-node. If it's a node inside
638 /// a packaged loop, it returns the loop's pseudo-node.
639 static BlockMass &getPackageMass(BlockFrequencyInfoImplBase &BFI,
640 const BlockNode &Node) {
641 assert(Node.isValid());
642 assert(!BFI.Working[Node.Index].IsPackaged);
643 if (!BFI.Working[Node.Index].IsAPackage)
644 return BFI.Working[Node.Index].Mass;
645
646 return BFI.getLoopPackage(Node).Mass;
647 }
648
649 void BlockFrequencyInfoImplBase::addToDist(Distribution &Dist,
650 const BlockNode &LoopHead,
651 const BlockNode &Pred,
652 const BlockNode &Succ,
653 uint64_t Weight) {
654 if (!Weight)
655 Weight = 1;
656
657 #ifndef NDEBUG
658 auto debugSuccessor = [&](const char *Type, const BlockNode &Resolved) {
659 dbgs() << " =>"
660 << " [" << Type << "] weight = " << Weight;
661 if (Succ != LoopHead)
662 dbgs() << ", succ = " << getBlockName(Succ);
663 if (Resolved != Succ)
664 dbgs() << ", resolved = " << getBlockName(Resolved);
665 dbgs() << "\n";
666 };
667 (void)debugSuccessor;
668 #endif
669
670 if (Succ == LoopHead) {
671 DEBUG(debugSuccessor("backedge", Succ));
672 Dist.addBackedge(LoopHead, Weight);
673 return;
674 }
675 BlockNode Resolved = getPackagedNode(*this, Succ);
676 assert(Resolved != LoopHead);
677
678 if (Working[Resolved.Index].ContainingLoop != LoopHead) {
679 DEBUG(debugSuccessor(" exit ", Resolved));
680 Dist.addExit(Resolved, Weight);
681 return;
682 }
683
684 if (!LoopHead.isValid() && Resolved < Pred) {
685 // Irreducible backedge. Skip this edge in the distribution.
686 DEBUG(debugSuccessor("skipped ", Resolved));
687 return;
688 }
689
690 DEBUG(debugSuccessor(" local ", Resolved));
691 Dist.addLocal(Resolved, Weight);
692 }
693
694 void BlockFrequencyInfoImplBase::addLoopSuccessorsToDist(
695 const BlockNode &LoopHead, const BlockNode &LocalLoopHead,
696 Distribution &Dist) {
697 PackagedLoopData &LoopPackage = getLoopPackage(LocalLoopHead);
698 const PackagedLoopData::ExitMap &Exits = LoopPackage.Exits;
699
700 // Copy the exit map into Dist.
701 for (const auto &I : Exits)
702 addToDist(Dist, LoopHead, LocalLoopHead, I.first, I.second.getMass());
703
704 // We don't need this map any more. Clear it to prevent quadratic memory
705 // usage in deeply nested loops with irreducible control flow.
706 LoopPackage.Exits.clear();
707 }
708
709 /// \brief Get the maximum allowed loop scale.
710 ///
711 /// Gives the maximum number of estimated iterations allowed for a loop.
712 /// Downstream users have trouble with very large numbers (even within
713 /// 64-bits). Perhaps they can be changed to use PositiveFloat.
714 ///
715 /// TODO: change downstream users so that this can be increased or removed.
716 static Float getMaxLoopScale() { return Float(1, 12); }
717
718 /// \brief Compute the loop scale for a loop.
719 void BlockFrequencyInfoImplBase::computeLoopScale(const BlockNode &LoopHead) {
720 // Compute loop scale.
721 DEBUG(dbgs() << "compute-loop-scale: " << getBlockName(LoopHead) << "\n");
722
723 // LoopScale == 1 / ExitMass
724 // ExitMass == HeadMass - BackedgeMass
725 PackagedLoopData &LoopPackage = getLoopPackage(LoopHead);
726 BlockMass ExitMass = BlockMass::getFull() - LoopPackage.BackedgeMass;
727
728 // Block scale stores the inverse of the scale.
729 LoopPackage.Scale = ExitMass.toFloat().inverse();
730
731 DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull()
732 << " - " << LoopPackage.BackedgeMass << ")\n"
733 << " - scale = " << LoopPackage.Scale << "\n");
734
735 if (LoopPackage.Scale > getMaxLoopScale()) {
736 LoopPackage.Scale = getMaxLoopScale();
737 DEBUG(dbgs() << " - reduced-to-max-scale: " << getMaxLoopScale() << "\n");
738 }
739 }
740
741 /// \brief Package up a loop.
742 void BlockFrequencyInfoImplBase::packageLoop(const BlockNode &LoopHead) {
743 DEBUG(dbgs() << "packaging-loop: " << getBlockName(LoopHead) << "\n");
744 Working[LoopHead.Index].IsAPackage = true;
745 for (const BlockNode &M : getLoopPackage(LoopHead).Members) {
746 DEBUG(dbgs() << " - node: " << getBlockName(M.Index) << "\n");
747 Working[M.Index].IsPackaged = true;
748 }
749 }
750
751 void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source,
752 const BlockNode &LoopHead,
753 Distribution &Dist) {
754 BlockMass Mass = getPackageMass(*this, Source);
755 DEBUG(dbgs() << " => mass: " << Mass
756 << " ( general | forward )\n");
757
758 // Distribute mass to successors as laid out in Dist.
759 DitheringDistributer D(Dist, Mass);
760
761 #ifndef NDEBUG
762 auto debugAssign = [&](const BlockNode &T, const BlockMass &M,
763 const char *Desc) {
764 dbgs() << " => assign " << M << " (" << D.RemMass << "|"
765 << D.RemForwardMass << ")";
766 if (Desc)
767 dbgs() << " [" << Desc << "]";
768 if (T.isValid())
769 dbgs() << " to " << getBlockName(T);
770 dbgs() << "\n";
771 };
772 (void)debugAssign;
773 #endif
774
775 PackagedLoopData *LoopPackage = 0;
776 if (LoopHead.isValid())
777 LoopPackage = &getLoopPackage(LoopHead);
778 for (const Weight &W : Dist.Weights) {
779 // Check for a local edge (forward and non-exit).
780 if (W.Type == Weight::Local) {
781 BlockMass Local = D.takeLocalMass(W.Amount);
782 getPackageMass(*this, W.TargetNode) += Local;
783 DEBUG(debugAssign(W.TargetNode, Local, nullptr));
784 continue;
785 }
786
787 // Backedges and exits only make sense if we're processing a loop.
788 assert(LoopPackage && "backedge or exit outside of loop");
789
790 // Check for a backedge.
791 if (W.Type == Weight::Backedge) {
792 BlockMass Back = D.takeBackedgeMass(W.Amount);
793 LoopPackage->BackedgeMass += Back;
794 DEBUG(debugAssign(BlockNode(), Back, "back"));
795 continue;
796 }
797
798 // This must be an exit.
799 assert(W.Type == Weight::Exit);
800 BlockMass Exit = D.takeExitMass(W.Amount);
801 LoopPackage->Exits.push_back(std::make_pair(W.TargetNode, Exit));
802 DEBUG(debugAssign(W.TargetNode, Exit, "exit"));
803 }
804 }
805
806 static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI,
807 const Float &Min, const Float &Max) {
808 // Scale the Factor to a size that creates integers. Ideally, integers would
809 // be scaled so that Max == UINT64_MAX so that they can be best
810 // differentiated. However, the register allocator currently deals poorly
811 // with large numbers. Instead, push Min up a little from 1 to give some
812 // room to differentiate small, unequal numbers.
813 //
814 // TODO: fix issues downstream so that ScalingFactor can be Float(1,64)/Max.
815 Float ScalingFactor = Min.inverse();
816 if ((Max / Min).lg() < 60)
817 ScalingFactor <<= 3;
818
819 // Translate the floats to integers.
820 DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max
821 << ", factor = " << ScalingFactor << "\n");
822 for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) {
823 Float Scaled = BFI.Freqs[Index].Floating * ScalingFactor;
824 BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt());
825 DEBUG(dbgs() << " - " << BFI.getBlockName(Index) << ": float = "
826 << BFI.Freqs[Index].Floating << ", scaled = " << Scaled
827 << ", int = " << BFI.Freqs[Index].Integer << "\n");
828 }
829 }
830
831 static void scaleBlockData(BlockFrequencyInfoImplBase &BFI,
832 const BlockNode &Node,
833 const PackagedLoopData &Loop) {
834 Float F = Loop.Mass.toFloat() * Loop.Scale;
835
836 Float &Current = BFI.Freqs[Node.Index].Floating;
837 Float Updated = Current * F;
838
839 DEBUG(dbgs() << " - " << BFI.getBlockName(Node) << ": " << Current << " => "
840 << Updated << "\n");
841
842 Current = Updated;
843 }
844
845 /// \brief Unwrap a loop package.
846 ///
847 /// Visits all the members of a loop, adjusting their BlockData according to
848 /// the loop's pseudo-node.
849 static void unwrapLoopPackage(BlockFrequencyInfoImplBase &BFI,
850 const BlockNode &Head) {
851 assert(Head.isValid());
852
853 PackagedLoopData &LoopPackage = BFI.getLoopPackage(Head);
854 DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getBlockName(Head)
855 << ": mass = " << LoopPackage.Mass
856 << ", scale = " << LoopPackage.Scale << "\n");
857 scaleBlockData(BFI, Head, LoopPackage);
858
859 // Propagate the head scale through the loop. Since members are visited in
860 // RPO, the head scale will be updated by the loop scale first, and then the
861 // final head scale will be used for updated the rest of the members.
862 for (const BlockNode &M : LoopPackage.Members) {
863 const FrequencyData &HeadData = BFI.Freqs[Head.Index];
864 FrequencyData &Freqs = BFI.Freqs[M.Index];
865 Float NewFreq = Freqs.Floating * HeadData.Floating;
866 DEBUG(dbgs() << " - " << BFI.getBlockName(M) << ": " << Freqs.Floating
867 << " => " << NewFreq << "\n");
868 Freqs.Floating = NewFreq;
869 }
870 }
871
/// Finalize the frequency metrics: seed each block's floating-point frequency
/// from its distributed mass, unwrap packaged loops in reverse post-order,
/// and convert the resulting floats to integer frequencies.
872 void BlockFrequencyInfoImplBase::finalizeMetrics() {
873 // Set initial frequencies from loop-local masses.
874 for (size_t Index = 0; Index < Working.size(); ++Index)
875 Freqs[Index].Floating = Working[Index].Mass.toFloat();
876
877 // Unwrap loop packages in reverse post-order, tracking min and max
878 // frequencies.
879 auto Min = Float::getLargest();
880 auto Max = Float::getZero();
881 for (size_t Index = 0; Index < Working.size(); ++Index) {
882 if (Working[Index].isLoopHeader())
883 unwrapLoopPackage(*this, BlockNode(Index));
884
885 // Track the minimum and maximum frequencies seen so far.
886 Min = std::min(Min, Freqs[Index].Floating);
887 Max = std::max(Max, Freqs[Index].Floating);
888 }
889
890 // Convert to integers.
891 convertFloatingToInteger(*this, Min, Max);
892
893 // Clean up data structures.
894 cleanup(*this);
895
896 // Print out the final stats.
897 DEBUG(dump());
898 }
899
900 BlockFrequency
901 BlockFrequencyInfoImplBase::getBlockFreq(const BlockNode &Node) const {
902 if (!Node.isValid())
903 return 0;
904 return Freqs[Node.Index].Integer;
905 }
906 Float
907 BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const {
908 if (!Node.isValid())
909 return Float::getZero();
910 return Freqs[Node.Index].Floating;
911 }
912
913 std::string
914 BlockFrequencyInfoImplBase::getBlockName(const BlockNode &Node) const {
915 return std::string();
916 }
917
918 raw_ostream &
919 BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS,
920 const BlockNode &Node) const {
921 return OS << getFloatingBlockFreq(Node);
922 }
923
924 raw_ostream &
925 BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS,
926 const BlockFrequency &Freq) const {
927 Float Block(Freq.getFrequency(), 0);
928 Float Entry(getEntryFreq(), 0);
929
930 return OS << Block / Entry;
931 }
66 Analysis.cpp
77 BasicAliasAnalysis.cpp
88 BlockFrequencyInfo.cpp
9 BlockFrequencyInfoImpl.cpp
910 BranchProbabilityInfo.cpp
1011 CFG.cpp
1112 CFGPrinter.cpp
1010 //
1111 //===----------------------------------------------------------------------===//
1212
13 #define DEBUG_TYPE "block-freq"
1314 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
1415 #include "llvm/Analysis/BlockFrequencyInfoImpl.h"
1516 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
17 #include "llvm/CodeGen/MachineFunction.h"
18 #include "llvm/CodeGen/MachineLoopInfo.h"
1619 #include "llvm/CodeGen/Passes.h"
1720 #include "llvm/InitializePasses.h"
1821 #include "llvm/Support/CommandLine.h"
111114 INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, "machine-block-freq",
112115 "Machine Block Frequency Analysis", true, true)
113116 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
117 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
114118 INITIALIZE_PASS_END(MachineBlockFrequencyInfo, "machine-block-freq",
115119 "Machine Block Frequency Analysis", true, true)
116120
126130
127131 void MachineBlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const {
128132 AU.addRequired();
133 AU.addRequired();
129134 AU.setPreservesAll();
130135 MachineFunctionPass::getAnalysisUsage(AU);
131136 }
132137
133138 bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) {
134139 MachineBranchProbabilityInfo &MBPI =
135 getAnalysis();
140 getAnalysis();
141 MachineLoopInfo &MLI = getAnalysis();
136142 if (!MBFI)
137143 MBFI.reset(new ImplType);
138 MBFI->doFunction(&F, &MBPI);
144 MBFI->doFunction(&F, &MBPI, &MLI);
139145 #ifndef NDEBUG
140146 if (ViewMachineBlockFreqPropagationDAG != GVDT_None) {
141147 view();
165171 }
166172
167173 const MachineFunction *MachineBlockFrequencyInfo::getFunction() const {
168 return MBFI ? MBFI->Fn : nullptr;
174 return MBFI ? MBFI->getFunction() : nullptr;
169175 }
170176
171177 raw_ostream &
0 ; RUN: opt < %s -analyze -block-freq | FileCheck %s
1
2 declare void @g(i32 %x)
3
4 ; CHECK-LABEL: Printing analysis {{.*}} for function 'branch_weight_0':
5 ; CHECK-NEXT: block-frequency-info: branch_weight_0
6 define void @branch_weight_0(i32 %a) {
7 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
8 entry:
9 br label %for.body
10
11 ; Check that we get 1,4 instead of 0,3.
12 ; CHECK-NEXT: for.body: float = 4.0,
13 for.body:
14 %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
15 call void @g(i32 %i)
16 %inc = add i32 %i, 1
17 %cmp = icmp ugt i32 %inc, %a
18 br i1 %cmp, label %for.end, label %for.body, !prof !0
19
20 ; CHECK-NEXT: for.end: float = 1.0, int = [[ENTRY]]
21 for.end:
22 ret void
23 }
24
25 !0 = metadata !{metadata !"branch_weights", i32 0, i32 3}
26
27 ; CHECK-LABEL: Printing analysis {{.*}} for function 'infinite_loop'
28 ; CHECK-NEXT: block-frequency-info: infinite_loop
29 define void @infinite_loop(i1 %x) {
30 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
31 entry:
32 br i1 %x, label %for.body, label %for.end, !prof !1
33
34 ; Check that the loop scale maxes out at 4096, giving 2048 here.
35 ; CHECK-NEXT: for.body: float = 2048.0,
36 for.body:
37 %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
38 call void @g(i32 %i)
39 %inc = add i32 %i, 1
40 br label %for.body
41
42 ; Check that the exit weight is half of entry, since half is lost in the
43 ; infinite loop above.
44 ; CHECK-NEXT: for.end: float = 0.5,
45 for.end:
46 ret void
47 }
48
49 !1 = metadata !{metadata !"branch_weights", i32 1, i32 1}
0 ; RUN: opt < %s -analyze -block-freq | FileCheck %s
11
22 define i32 @test1(i32 %i, i32* %a) {
3 ; CHECK: Printing analysis {{.*}} for function 'test1'
4 ; CHECK: entry = 1.0
3 ; CHECK-LABEL: Printing analysis {{.*}} for function 'test1':
4 ; CHECK-NEXT: block-frequency-info: test1
5 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
56 entry:
67 br label %body
78
89 ; Loop backedges are weighted and thus their bodies have a greater frequency.
9 ; CHECK: body = 32.0
10 ; CHECK-NEXT: body: float = 32.0,
1011 body:
1112 %iv = phi i32 [ 0, %entry ], [ %next, %body ]
1213 %base = phi i32 [ 0, %entry ], [ %sum, %body ]
1718 %exitcond = icmp eq i32 %next, %i
1819 br i1 %exitcond, label %exit, label %body
1920
20 ; CHECK: exit = 1.0
21 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
2122 exit:
2223 ret i32 %sum
2324 }
2425
2526 define i32 @test2(i32 %i, i32 %a, i32 %b) {
26 ; CHECK: Printing analysis {{.*}} for function 'test2'
27 ; CHECK: entry = 1.0
27 ; CHECK-LABEL: Printing analysis {{.*}} for function 'test2':
28 ; CHECK-NEXT: block-frequency-info: test2
29 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
2830 entry:
2931 %cond = icmp ult i32 %i, 42
3032 br i1 %cond, label %then, label %else, !prof !0
3133
3234 ; The 'then' branch is predicted more likely via branch weight metadata.
33 ; CHECK: then = 0.94116
35 ; CHECK-NEXT: then: float = 0.9411{{[0-9]*}},
3436 then:
3537 br label %exit
3638
37 ; CHECK: else = 0.05877
39 ; CHECK-NEXT: else: float = 0.05882{{[0-9]*}},
3840 else:
3941 br label %exit
4042
41 ; FIXME: It may be a bug that we don't sum back to 1.0.
42 ; CHECK: exit = 0.99993
43 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
4344 exit:
4445 %result = phi i32 [ %a, %then ], [ %b, %else ]
4546 ret i32 %result
4849 !0 = metadata !{metadata !"branch_weights", i32 64, i32 4}
4950
5051 define i32 @test3(i32 %i, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
51 ; CHECK: Printing analysis {{.*}} for function 'test3'
52 ; CHECK: entry = 1.0
52 ; CHECK-LABEL: Printing analysis {{.*}} for function 'test3':
53 ; CHECK-NEXT: block-frequency-info: test3
54 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
5355 entry:
5456 switch i32 %i, label %case_a [ i32 1, label %case_b
5557 i32 2, label %case_c
5658 i32 3, label %case_d
5759 i32 4, label %case_e ], !prof !1
5860
59 ; CHECK: case_a = 0.04998
61 ; CHECK-NEXT: case_a: float = 0.05,
6062 case_a:
6163 br label %exit
6264
63 ; CHECK: case_b = 0.04998
65 ; CHECK-NEXT: case_b: float = 0.05,
6466 case_b:
6567 br label %exit
6668
6769 ; The 'case_c' branch is predicted more likely via branch weight metadata.
68 ; CHECK: case_c = 0.79998
70 ; CHECK-NEXT: case_c: float = 0.8,
6971 case_c:
7072 br label %exit
7173
72 ; CHECK: case_d = 0.04998
74 ; CHECK-NEXT: case_d: float = 0.05,
7375 case_d:
7476 br label %exit
7577
76 ; CHECK: case_e = 0.04998
78 ; CHECK-NEXT: case_e: float = 0.05,
7779 case_e:
7880 br label %exit
7981
80 ; FIXME: It may be a bug that we don't sum back to 1.0.
81 ; CHECK: exit = 0.99993
82 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
8283 exit:
8384 %result = phi i32 [ %a, %case_a ],
8485 [ %b, %case_b ],
9091
9192 !1 = metadata !{metadata !"branch_weights", i32 4, i32 4, i32 64, i32 4, i32 4}
9293
93 ; CHECK: Printing analysis {{.*}} for function 'nested_loops'
94 ; CHECK: entry = 1.0
95 ; This test doesn't seem to be assigning sensible frequencies to nested loops.
9694 define void @nested_loops(i32 %a) {
95 ; CHECK-LABEL: Printing analysis {{.*}} for function 'nested_loops':
96 ; CHECK-NEXT: block-frequency-info: nested_loops
97 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
9798 entry:
9899 br label %for.cond1.preheader
99100
101 ; CHECK-NEXT: for.cond1.preheader: float = 4001.0,
100102 for.cond1.preheader:
101103 %x.024 = phi i32 [ 0, %entry ], [ %inc12, %for.inc11 ]
102104 br label %for.cond4.preheader
103105
106 ; CHECK-NEXT: for.cond4.preheader: float = 16008001.0,
104107 for.cond4.preheader:
105108 %y.023 = phi i32 [ 0, %for.cond1.preheader ], [ %inc9, %for.inc8 ]
106109 %add = add i32 %y.023, %x.024
107110 br label %for.body6
108111
112 ; CHECK-NEXT: for.body6: float = 64048012001.0,
109113 for.body6:
110114 %z.022 = phi i32 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ]
111115 %add7 = add i32 %add, %z.022
112 tail call void @g(i32 %add7) #2
116 tail call void @g(i32 %add7)
113117 %inc = add i32 %z.022, 1
114118 %cmp5 = icmp ugt i32 %inc, %a
115119 br i1 %cmp5, label %for.inc8, label %for.body6, !prof !2
116120
121 ; CHECK-NEXT: for.inc8: float = 16008001.0,
117122 for.inc8:
118123 %inc9 = add i32 %y.023, 1
119124 %cmp2 = icmp ugt i32 %inc9, %a
120125 br i1 %cmp2, label %for.inc11, label %for.cond4.preheader, !prof !2
121126
127 ; CHECK-NEXT: for.inc11: float = 4001.0,
122128 for.inc11:
123129 %inc12 = add i32 %x.024, 1
124130 %cmp = icmp ugt i32 %inc12, %a
125131 br i1 %cmp, label %for.end13, label %for.cond1.preheader, !prof !2
126132
133 ; CHECK-NEXT: for.end13: float = 1.0, int = [[ENTRY]]
127134 for.end13:
128135 ret void
129136 }
130137
131 declare void @g(i32) #1
138 declare void @g(i32)
132139
133140 !2 = metadata !{metadata !"branch_weights", i32 1, i32 4000}
0 ; RUN: opt < %s -analyze -block-freq | FileCheck %s
1
2 ; CHECK-LABEL: Printing analysis {{.*}} for function 'double_exit':
3 ; CHECK-NEXT: block-frequency-info: double_exit
4 define i32 @double_exit(i32 %N) {
5 ; Mass = 1
6 ; Frequency = 1
7 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
8 entry:
9 br label %outer
10
11 ; Mass = 1
12 ; Backedge mass = 1/3, exit mass = 2/3
13 ; Loop scale = 3/2
14 ; Pseudo-edges = exit
15 ; Pseudo-mass = 1
16 ; Frequency = 1*3/2*1 = 3/2
17 ; CHECK-NEXT: outer: float = 1.5,
18 outer:
19 %I.0 = phi i32 [ 0, %entry ], [ %inc6, %outer.inc ]
20 %Return.0 = phi i32 [ 0, %entry ], [ %Return.1, %outer.inc ]
21 %cmp = icmp slt i32 %I.0, %N
22 br i1 %cmp, label %inner, label %exit, !prof !2 ; 2:1
23
24 ; Mass = 1
25 ; Backedge mass = 3/5, exit mass = 2/5
26 ; Loop scale = 5/2
27 ; Pseudo-edges = outer.inc @ 1/5, exit @ 1/5
28 ; Pseudo-mass = 2/3
29 ; Frequency = 3/2*1*5/2*2/3 = 5/2
30 ; CHECK-NEXT: inner: float = 2.5,
31 inner:
32 %Return.1 = phi i32 [ %Return.0, %outer ], [ %call4, %inner.inc ]
33 %J.0 = phi i32 [ %I.0, %outer ], [ %inc, %inner.inc ]
34 %cmp2 = icmp slt i32 %J.0, %N
35 br i1 %cmp2, label %inner.body, label %outer.inc, !prof !1 ; 4:1
36
37 ; Mass = 4/5
38 ; Frequency = 5/2*4/5 = 2
39 ; CHECK-NEXT: inner.body: float = 2.0,
40 inner.body:
41 %call = call i32 @c2(i32 %I.0, i32 %J.0)
42 %tobool = icmp ne i32 %call, 0
43 br i1 %tobool, label %exit, label %inner.inc, !prof !0 ; 3:1
44
45 ; Mass = 3/5
46 ; Frequency = 5/2*3/5 = 3/2
47 ; CHECK-NEXT: inner.inc: float = 1.5,
48 inner.inc:
49 %call4 = call i32 @logic2(i32 %Return.1, i32 %I.0, i32 %J.0)
50 %inc = add nsw i32 %J.0, 1
51 br label %inner
52
53 ; Mass = 1/3
54 ; Frequency = 3/2*1/3 = 1/2
55 ; CHECK-NEXT: outer.inc: float = 0.5,
56 outer.inc:
57 %inc6 = add nsw i32 %I.0, 1
58 br label %outer
59
60 ; Mass = 1
61 ; Frequency = 1
62 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
63 exit:
64 %Return.2 = phi i32 [ %Return.1, %inner.body ], [ %Return.0, %outer ]
65 ret i32 %Return.2
66 }
67
68 !0 = metadata !{metadata !"branch_weights", i32 1, i32 3}
69 !1 = metadata !{metadata !"branch_weights", i32 4, i32 1}
70 !2 = metadata !{metadata !"branch_weights", i32 2, i32 1}
71
72 declare i32 @c2(i32, i32)
73 declare i32 @logic2(i32, i32, i32)
74
75 ; CHECK-LABEL: Printing analysis {{.*}} for function 'double_exit_in_loop':
76 ; CHECK-NEXT: block-frequency-info: double_exit_in_loop
77 define i32 @double_exit_in_loop(i32 %N) {
78 ; Mass = 1
79 ; Frequency = 1
80 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
81 entry:
82 br label %outer
83
84 ; Mass = 1
85 ; Backedge mass = 1/2, exit mass = 1/2
86 ; Loop scale = 2
87 ; Pseudo-edges = exit
88 ; Pseudo-mass = 1
89 ; Frequency = 1*2*1 = 2
90 ; CHECK-NEXT: outer: float = 2.0,
91 outer:
92 %I.0 = phi i32 [ 0, %entry ], [ %inc12, %outer.inc ]
93 %Return.0 = phi i32 [ 0, %entry ], [ %Return.3, %outer.inc ]
94 %cmp = icmp slt i32 %I.0, %N
95 br i1 %cmp, label %middle, label %exit, !prof !3 ; 1:1
96
97 ; Mass = 1
98 ; Backedge mass = 1/3, exit mass = 2/3
99 ; Loop scale = 3/2
100 ; Pseudo-edges = outer.inc
101 ; Pseudo-mass = 1/2
102 ; Frequency = 2*1*3/2*1/2 = 3/2
103 ; CHECK-NEXT: middle: float = 1.5,
104 middle:
105 %J.0 = phi i32 [ %I.0, %outer ], [ %inc9, %middle.inc ]
106 %Return.1 = phi i32 [ %Return.0, %outer ], [ %Return.2, %middle.inc ]
107 %cmp2 = icmp slt i32 %J.0, %N
108 br i1 %cmp2, label %inner, label %outer.inc, !prof !2 ; 2:1
109
110 ; Mass = 1
111 ; Backedge mass = 3/5, exit mass = 2/5
112 ; Loop scale = 5/2
113 ; Pseudo-edges = middle.inc @ 1/5, outer.inc @ 1/5
114 ; Pseudo-mass = 2/3
115 ; Frequency = 3/2*1*5/2*2/3 = 5/2
116 ; CHECK-NEXT: inner: float = 2.5,
117 inner:
118 %Return.2 = phi i32 [ %Return.1, %middle ], [ %call7, %inner.inc ]
119 %K.0 = phi i32 [ %J.0, %middle ], [ %inc, %inner.inc ]
120 %cmp5 = icmp slt i32 %K.0, %N
121 br i1 %cmp5, label %inner.body, label %middle.inc, !prof !1 ; 4:1
122
123 ; Mass = 4/5
124 ; Frequency = 5/2*4/5 = 2
125 ; CHECK-NEXT: inner.body: float = 2.0,
126 inner.body:
127 %call = call i32 @c3(i32 %I.0, i32 %J.0, i32 %K.0)
128 %tobool = icmp ne i32 %call, 0
129 br i1 %tobool, label %outer.inc, label %inner.inc, !prof !0 ; 3:1
130
131 ; Mass = 3/5
132 ; Frequency = 5/2*3/5 = 3/2
133 ; CHECK-NEXT: inner.inc: float = 1.5,
134 inner.inc:
135 %call7 = call i32 @logic3(i32 %Return.2, i32 %I.0, i32 %J.0, i32 %K.0)
136 %inc = add nsw i32 %K.0, 1
137 br label %inner
138
139 ; Mass = 1/3
140 ; Frequency = 3/2*1/3 = 1/2
141 ; CHECK-NEXT: middle.inc: float = 0.5,
142 middle.inc:
143 %inc9 = add nsw i32 %J.0, 1
144 br label %middle
145
146 ; Mass = 1/2
147 ; Frequency = 2*1/2 = 1
148 ; CHECK-NEXT: outer.inc: float = 1.0,
149 outer.inc:
150 %Return.3 = phi i32 [ %Return.2, %inner.body ], [ %Return.1, %middle ]
151 %inc12 = add nsw i32 %I.0, 1
152 br label %outer
153
154 ; Mass = 1
155 ; Frequency = 1
156 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
157 exit:
158 ret i32 %Return.0
159 }
160
161 !3 = metadata !{metadata !"branch_weights", i32 1, i32 1}
162
163 declare i32 @c3(i32, i32, i32)
164 declare i32 @logic3(i32, i32, i32, i32)
0 ; RUN: opt < %s -analyze -block-freq | FileCheck %s
1
2 ; A loop with multiple exits should be handled correctly.
3 ;
4 ; CHECK-LABEL: Printing analysis {{.*}} for function 'multiexit':
5 ; CHECK-NEXT: block-frequency-info: multiexit
6 define void @multiexit(i32 %a) {
7 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
8 entry:
9 br label %loop.1
10
11 ; CHECK-NEXT: loop.1: float = 1.333{{3*}},
12 loop.1:
13 %i = phi i32 [ 0, %entry ], [ %inc.2, %loop.2 ]
14 call void @f(i32 %i)
15 %inc.1 = add i32 %i, 1
16 %cmp.1 = icmp ugt i32 %inc.1, %a
17 br i1 %cmp.1, label %exit.1, label %loop.2, !prof !0
18
19 ; CHECK-NEXT: loop.2: float = 0.666{{6*7}},
20 loop.2:
21 call void @g(i32 %inc.1)
22 %inc.2 = add i32 %inc.1, 1
23 %cmp.2 = icmp ugt i32 %inc.2, %a
24 br i1 %cmp.2, label %exit.2, label %loop.1, !prof !1
25
26 ; CHECK-NEXT: exit.1: float = 0.666{{6*7}},
27 exit.1:
28 call void @h(i32 %inc.1)
29 br label %return
30
31 ; CHECK-NEXT: exit.2: float = 0.333{{3*}},
32 exit.2:
33 call void @i(i32 %inc.2)
34 br label %return
35
36 ; CHECK-NEXT: return: float = 1.0, int = [[ENTRY]]
37 return:
38 ret void
39 }
40
41 declare void @f(i32 %x)
42 declare void @g(i32 %x)
43 declare void @h(i32 %x)
44 declare void @i(i32 %x)
45
46 !0 = metadata !{metadata !"branch_weights", i32 3, i32 3}
47 !1 = metadata !{metadata !"branch_weights", i32 5, i32 5}
48
49 ; The current BlockFrequencyInfo algorithm doesn't handle multiple entrances
50 ; into a loop very well. The frequencies assigned to blocks in the loop are
51 ; predictable (and not absurd), but also not correct and therefore not worth
52 ; testing.
53 ;
54 ; There are two testcases below.
55 ;
56 ; For each testcase, I use a CHECK-NEXT/NOT combo like an XFAIL with the
57 ; granularity of a single check. If/when this behaviour is fixed, we'll know
58 ; about it, and the test should be updated.
59 ;
60 ; Testcase #1
61 ; ===========
62 ;
63 ; In this case c1 and c2 should have frequencies of 15/7 and 13/7,
64 ; respectively. To calculate this, consider assigning 1.0 to entry, and
65 ; distributing frequency iteratively (to infinity). At the first iteration,
66 ; entry gives 3/4 to c1 and 1/4 to c2. At every step after, c1 and c2 give 3/4
67 ; of what they have to each other. Somehow, all of it comes out to exit.
68 ;
69 ; c1 = 3/4 + 1/4*3/4 + 3/4*3^2/4^2 + 1/4*3^3/4^3 + 3/4*3^3/4^3 + ...
70 ; c2 = 1/4 + 3/4*3/4 + 1/4*3^2/4^2 + 3/4*3^3/4^3 + 1/4*3^3/4^3 + ...
71 ;
72 ; Simplify by splitting up the odd and even terms of the series and taking out
73 ; factors so that the infinite series matches:
74 ;
75 ; c1 = 3/4 *(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
76 ; + 3/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
77 ; c2 = 1/4 *(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
78 ; + 9/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
79 ;
80 ; c1 = 15/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
81 ; c2 = 13/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
82 ;
83 ; Since this geometric series sums to 16/7:
84 ;
85 ; c1 = 15/7
86 ; c2 = 13/7
87 ;
88 ; If we treat c1 and c2 as members of the same loop, the exit frequency of the
89 ; loop as a whole is 1/4, so the loop scale should be 4. Summing c1 and c2
90 ; gives 28/7, or 4.0, which is nice confirmation of the math above.
91 ;
92 ; However, assuming c1 precedes c2 in reverse post-order, the current algorithm
93 ; returns 3/4 and 13/16, respectively. LoopInfo ignores edges between loops
94 ; (and doesn't see any loops here at all), and -block-freq ignores the
95 ; irreducible edge from c2 to c1.
96 ;
97 ; CHECK-LABEL: Printing analysis {{.*}} for function 'multientry':
98 ; CHECK-NEXT: block-frequency-info: multientry
99 define void @multientry(i32 %a) {
100 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
101 entry:
102 %choose = call i32 @choose(i32 %a)
103 %compare = icmp ugt i32 %choose, %a
104 br i1 %compare, label %c1, label %c2, !prof !2
105
106 ; This is like a single-line XFAIL (see above).
107 ; CHECK-NEXT: c1:
108 ; CHECK-NOT: float = 2.142857{{[0-9]*}},
109 c1:
110 %i1 = phi i32 [ %a, %entry ], [ %i2.inc, %c2 ]
111 %i1.inc = add i32 %i1, 1
112 %choose1 = call i32 @choose(i32 %i1)
113 %compare1 = icmp ugt i32 %choose1, %a
114 br i1 %compare1, label %c2, label %exit, !prof !2
115
116 ; This is like a single-line XFAIL (see above).
117 ; CHECK-NEXT: c2:
118 ; CHECK-NOT: float = 1.857142{{[0-9]*}},
119 c2:
120 %i2 = phi i32 [ %a, %entry ], [ %i1.inc, %c1 ]
121 %i2.inc = add i32 %i2, 1
122 %choose2 = call i32 @choose(i32 %i2)
123 %compare2 = icmp ugt i32 %choose2, %a
124 br i1 %compare2, label %c1, label %exit, !prof !2
125
126 ; We still shouldn't lose any frequency.
127 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
128 exit:
129 ret void
130 }
131
132 ; Testcase #2
133 ; ===========
134 ;
135 ; In this case c1 and c2 should be treated as equals in a single loop. The
136 ; exit frequency is 1/3, so the scaling factor for the loop should be 3.0. The
137 ; loop is entered 2/3 of the time, and c1 and c2 split the total loop frequency
138 ; evenly (1/2), so they should each have frequencies of 1.0 (3.0*2/3*1/2).
139 ; Another way of computing this result is by assigning 1.0 to entry and showing
140 ; that c1 and c2 should accumulate frequencies of:
141 ;
142 ; 1/3 + 2/9 + 4/27 + 8/81 + ...
143 ; 2^0/3^1 + 2^1/3^2 + 2^2/3^3 + 2^3/3^4 + ...
144 ;
145 ; At the first step, c1 and c2 each get 1/3 of the entry. At each subsequent
146 ; step, c1 and c2 each get 1/3 of what's left in c1 and c2 combined. This
147 ; infinite series sums to 1.
148 ;
149 ; However, assuming c1 precedes c2 in reverse post-order, the current algorithm
150 ; returns 1/2 and 3/4, respectively. LoopInfo ignores edges between loops (and
151 ; treats c1 and c2 as self-loops only), and -block-freq ignores the irreducible
152 ; edge from c2 to c1.
153 ;
154 ; Below I use a CHECK-NEXT/NOT combo like an XFAIL with the granularity of a
155 ; single check. If/when this behaviour is fixed, we'll know about it, and the
156 ; test should be updated.
157 ;
158 ; CHECK-LABEL: Printing analysis {{.*}} for function 'crossloops':
159 ; CHECK-NEXT: block-frequency-info: crossloops
160 define void @crossloops(i32 %a) {
161 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
162 entry:
163 %choose = call i32 @choose(i32 %a)
164 switch i32 %choose, label %exit [ i32 1, label %c1
165 i32 2, label %c2 ], !prof !3
166
167 ; This is like a single-line XFAIL (see above).
168 ; CHECK-NEXT: c1:
169 ; CHECK-NOT: float = 1.0,
170 c1:
171 %i1 = phi i32 [ %a, %entry ], [ %i1.inc, %c1 ], [ %i2.inc, %c2 ]
172 %i1.inc = add i32 %i1, 1
173 %choose1 = call i32 @choose(i32 %i1)
174 switch i32 %choose1, label %exit [ i32 1, label %c1
175 i32 2, label %c2 ], !prof !3
176
177 ; This is like a single-line XFAIL (see above).
178 ; CHECK-NEXT: c2:
179 ; CHECK-NOT: float = 1.0,
180 c2:
181 %i2 = phi i32 [ %a, %entry ], [ %i1.inc, %c1 ], [ %i2.inc, %c2 ]
182 %i2.inc = add i32 %i2, 1
183 %choose2 = call i32 @choose(i32 %i2)
184 switch i32 %choose2, label %exit [ i32 1, label %c1
185 i32 2, label %c2 ], !prof !3
186
187 ; We still shouldn't lose any frequency.
188 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
189 exit:
190 ret void
191 }
192
193 declare i32 @choose(i32)
194
195 !2 = metadata !{metadata !"branch_weights", i32 3, i32 1}
196 !3 = metadata !{metadata !"branch_weights", i32 2, i32 2, i32 2}
0 ; RUN: opt < %s -analyze -block-freq | FileCheck %s
1
2 ; CHECK-LABEL: Printing analysis {{.*}} for function 'loop_with_branch':
3 ; CHECK-NEXT: block-frequency-info: loop_with_branch
4 define void @loop_with_branch(i32 %a) {
5 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
6 entry:
7 %skip_loop = call i1 @foo0(i32 %a)
8 br i1 %skip_loop, label %skip, label %header, !prof !0
9
10 ; CHECK-NEXT: skip: float = 0.25,
11 skip:
12 br label %exit
13
14 ; CHECK-NEXT: header: float = 4.5,
15 header:
16 %i = phi i32 [ 0, %entry ], [ %i.next, %back ]
17 %i.next = add i32 %i, 1
18 %choose = call i2 @foo1(i32 %i)
19 switch i2 %choose, label %exit [ i2 0, label %left
20 i2 1, label %right ], !prof !1
21
22 ; CHECK-NEXT: left: float = 1.5,
23 left:
24 br label %back
25