llvm.org GIT mirror llvm / f44eda4
Revert "blockfreq: Rewrite BlockFrequencyInfoImpl" This reverts commit r206704, as expected. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@206707 91177308-0d34-0410-b5e6-96231b3b80d8 Duncan P. N. Exon Smith 6 years ago
12 changed file(s) with 376 addition(s) and 2990 deletion(s). Raw diff Collapse all Expand all
66 //
77 //===----------------------------------------------------------------------===//
88 //
9 // Shared implementation of BlockFrequency for IR and Machine Instructions.
9 // Shared implementation of BlockFrequencyInfo for IR and Machine Instructions.
1010 //
1111 //===----------------------------------------------------------------------===//
1212
1515
1616 #include "llvm/ADT/DenseMap.h"
1717 #include "llvm/ADT/PostOrderIterator.h"
18 #include "llvm/CodeGen/MachineBasicBlock.h"
19 #include "llvm/CodeGen/MachineFunction.h"
1820 #include "llvm/IR/BasicBlock.h"
1921 #include "llvm/Support/BlockFrequency.h"
2022 #include "llvm/Support/BranchProbability.h"
2325 #include
2426 #include
2527
26 //===----------------------------------------------------------------------===//
27 //
28 // PositiveFloat definition.
29 //
30 // TODO: Make this private to BlockFrequencyInfoImpl or delete.
31 //
32 //===----------------------------------------------------------------------===//
3328 namespace llvm {
3429
35 class PositiveFloatBase {
36 public:
37 static const int32_t MaxExponent = 16383;
38 static const int32_t MinExponent = -16382;
39 static const int DefaultPrecision = 10;
40
41 static void dump(uint64_t D, int16_t E, int Width);
42 static raw_ostream &print(raw_ostream &OS, uint64_t D, int16_t E, int Width,
43 unsigned Precision);
44 static std::string toString(uint64_t D, int16_t E, int Width,
45 unsigned Precision);
46 static int countLeadingZeros32(uint32_t N) { return countLeadingZeros(N); }
47 static int countLeadingZeros64(uint64_t N) { return countLeadingZeros(N); }
48 static uint64_t getHalf(uint64_t N) { return (N >> 1) + (N & 1); }
49
50 static std::pair splitSigned(int64_t N) {
51 if (N >= 0)
52 return std::make_pair(N, false);
53 uint64_t Unsigned = N == INT64_MIN ? UINT64_C(1) << 63 : uint64_t(-N);
54 return std::make_pair(Unsigned, true);
55 }
56 static int64_t joinSigned(uint64_t U, bool IsNeg) {
57 if (U > uint64_t(INT64_MAX))
58 return IsNeg ? INT64_MIN : INT64_MAX;
59 return IsNeg ? -int64_t(U) : int64_t(U);
60 }
61
62 static int32_t extractLg(const std::pair &Lg) {
63 return Lg.first;
64 }
65 static int32_t extractLgFloor(const std::pair &Lg) {
66 return Lg.first - (Lg.second > 0);
67 }
68 static int32_t extractLgCeiling(const std::pair &Lg) {
69 return Lg.first + (Lg.second < 0);
70 }
71
72 static std::pair divide64(uint64_t L, uint64_t R);
73 static std::pair multiply64(uint64_t L, uint64_t R);
74
75 static int compare(uint64_t L, uint64_t R, int Shift) {
76 assert(Shift >= 0);
77 assert(Shift < 64);
78
79 uint64_t L_adjusted = L >> Shift;
80 if (L_adjusted < R)
81 return -1;
82 if (L_adjusted > R)
83 return 1;
84
85 return L > L_adjusted << Shift ? 1 : 0;
86 }
87 };
88
89 /// \brief Simple representation of a positive floating point.
90 ///
91 /// PositiveFloat is a positive floating point number. It uses simple
92 /// saturation arithmetic, and every operation is well-defined for every value.
93 ///
94 /// The number is split into a signed exponent and unsigned digits. The number
95 /// represented is \c getDigits()*2^getExponent(). In this way, the digits are
96 /// much like the mantissa in the x87 long double, but there is no canonical
97 /// form, so the same number can be represented by many bit representations
98 /// (it's always in "denormal" mode).
99 ///
100 /// PositiveFloat is templated on the underlying integer type for digits, which
101 /// is expected to be one of uint64_t, uint32_t, uint16_t or uint8_t.
102 ///
103 /// Unlike builtin floating point types, PositiveFloat is portable.
104 ///
105 /// Unlike APFloat, PositiveFloat does not model architecture floating point
106 /// behaviour (this should make it a little faster), and implements most
107 /// operators (this makes it usable).
108 ///
109 /// PositiveFloat is totally ordered. However, there is no canonical form, so
110 /// there are multiple representations of most scalars. E.g.:
111 ///
112 /// PositiveFloat(8u, 0) == PositiveFloat(4u, 1)
113 /// PositiveFloat(4u, 1) == PositiveFloat(2u, 2)
114 /// PositiveFloat(2u, 2) == PositiveFloat(1u, 3)
115 ///
116 /// PositiveFloat implements most arithmetic operations. Precision is kept
117 /// where possible. Uses simple saturation arithmetic, so that operations
118 /// saturate to 0.0 or getLargest() rather than under or overflowing. It has
119 /// some extra arithmetic for unit inversion. 0.0/0.0 is defined to be 0.0.
120 /// Any other division by 0.0 is defined to be getLargest().
121 ///
122 /// As a convenience for modifying the exponent, left and right shifting are
123 /// both implemented, and both interpret negative shifts as positive shifts in
124 /// the opposite direction.
125 ///
126 /// Future work might extract most of the implementation into a base class
127 /// (e.g., \c Float) that has an \c IsSigned template parameter. The initial
128 /// use case for this only needed positive semantics, but it wouldn't take much
129 /// work to extend.
130 ///
131 /// Exponents are limited to the range accepted by x87 long double. This makes
132 /// it trivial to add functionality to convert to APFloat (this is already
133 /// relied on for the implementation of printing).
134 template class PositiveFloat : PositiveFloatBase {
135 public:
136 static_assert(!std::numeric_limits::is_signed,
137 "only unsigned floats supported");
138
139 typedef DigitsT DigitsType;
140
141 private:
142 typedef std::numeric_limits DigitsLimits;
143
144 static const int Width = sizeof(DigitsType) * 8;
145 static_assert(Width <= 64, "invalid integer width for digits");
146
147 private:
148 DigitsType Digits;
149 int16_t Exponent;
150
151 public:
152 PositiveFloat() : Digits(0), Exponent(0) {}
153
154 PositiveFloat(DigitsType Digits, int16_t Exponent)
155 : Digits(Digits), Exponent(Exponent) {}
156
157 private:
158 PositiveFloat(const std::pair &X)
159 : Digits(X.first), Exponent(X.second) {}
160
161 public:
162 static PositiveFloat getZero() { return PositiveFloat(0, 0); }
163 static PositiveFloat getOne() { return PositiveFloat(1, 0); }
164 static PositiveFloat getLargest() {
165 return PositiveFloat(DigitsLimits::max(), MaxExponent);
166 }
167 static PositiveFloat getFloat(uint64_t N) { return adjustToWidth(N, 0); }
168 static PositiveFloat getInverseFloat(uint64_t N) {
169 return getFloat(N).invert();
170 }
171 static PositiveFloat getFraction(DigitsType N, DigitsType D) {
172 return getQuotient(N, D);
173 }
174
175 int16_t getExponent() const { return Exponent; }
176 DigitsType getDigits() const { return Digits; }
177
178 /// \brief Convert to the given integer type.
179 ///
180 /// Convert to \c IntT using simple saturating arithmetic, truncating if
181 /// necessary.
182 template IntT toInt() const;
183
184 bool isZero() const { return !Digits; }
185 bool isLargest() const { return *this == getLargest(); }
186 bool isOne() const {
187 if (Exponent > 0 || Exponent <= -Width)
188 return false;
189 return Digits == DigitsType(1) << -Exponent;
190 }
191
192 /// \brief The log base 2, rounded.
193 ///
194 /// Get the lg of the scalar. lg 0 is defined to be INT32_MIN.
195 int32_t lg() const { return extractLg(lgImpl()); }
196
197 /// \brief The log base 2, rounded towards INT32_MIN.
198 ///
199 /// Get the lg floor. lg 0 is defined to be INT32_MIN.
200 int32_t lgFloor() const { return extractLgFloor(lgImpl()); }
201
202 /// \brief The log base 2, rounded towards INT32_MAX.
203 ///
204 /// Get the lg ceiling. lg 0 is defined to be INT32_MIN.
205 int32_t lgCeiling() const { return extractLgCeiling(lgImpl()); }
206
207 bool operator==(const PositiveFloat &X) const { return compare(X) == 0; }
208 bool operator<(const PositiveFloat &X) const { return compare(X) < 0; }
209 bool operator!=(const PositiveFloat &X) const { return compare(X) != 0; }
210 bool operator>(const PositiveFloat &X) const { return compare(X) > 0; }
211 bool operator<=(const PositiveFloat &X) const { return compare(X) <= 0; }
212 bool operator>=(const PositiveFloat &X) const { return compare(X) >= 0; }
213
214 bool operator!() const { return isZero(); }
215
216 /// \brief Convert to a decimal representation in a string.
217 ///
218 /// Convert to a string. Uses scientific notation for very large/small
219 /// numbers. Scientific notation is used roughly for numbers outside of the
220 /// range 2^-64 through 2^64.
221 ///
222 /// \c Precision indicates the number of decimal digits of precision to use;
223 /// 0 requests the maximum available.
224 ///
225 /// As a special case to make debugging easier, if the number is small enough
226 /// to convert without scientific notation and has more than \c Precision
227 /// digits before the decimal place, it's printed accurately to the first
228 /// digit past zero. E.g., assuming 10 digits of precision:
229 ///
230 /// 98765432198.7654... => 98765432198.8
231 /// 8765432198.7654... => 8765432198.8
232 /// 765432198.7654... => 765432198.8
233 /// 65432198.7654... => 65432198.77
234 /// 5432198.7654... => 5432198.765
235 std::string toString(unsigned Precision = DefaultPrecision) {
236 return PositiveFloatBase::toString(Digits, Exponent, Width, Precision);
237 }
238
239 /// \brief Print a decimal representation.
240 ///
241 /// Print a string. See toString for documentation.
242 raw_ostream &print(raw_ostream &OS,
243 unsigned Precision = DefaultPrecision) const {
244 return PositiveFloatBase::print(OS, Digits, Exponent, Width, Precision);
245 }
246 void dump() const { return PositiveFloatBase::dump(Digits, Exponent, Width); }
247
248 PositiveFloat &operator+=(const PositiveFloat &X);
249 PositiveFloat &operator-=(const PositiveFloat &X);
250 PositiveFloat &operator*=(const PositiveFloat &X);
251 PositiveFloat &operator/=(const PositiveFloat &X);
252 PositiveFloat &operator<<=(int16_t Shift) { shiftLeft(Shift); return *this; }
253 PositiveFloat &operator>>=(int16_t Shift) { shiftRight(Shift); return *this; }
254
255 private:
256 void shiftLeft(int32_t Shift);
257 void shiftRight(int32_t Shift);
258
259 /// \brief Adjust two floats to have matching exponents.
260 ///
261 /// Adjust \c this and \c X to have matching exponents. Returns the new \c X
262 /// by value. Does nothing if \a isZero() for either.
263 ///
264 /// The value that compares smaller will lose precision, and possibly become
265 /// \a isZero().
266 PositiveFloat matchExponents(PositiveFloat X);
267
268 /// \brief Increase exponent to match another float.
269 ///
270 /// Increases \c this to have an exponent matching \c X. May decrease the
271 /// exponent of \c X in the process, and \c this may possibly become \a
272 /// isZero().
273 void increaseExponentToMatch(PositiveFloat &X, int32_t ExponentDiff);
274
275 public:
276 /// \brief Scale a large number accurately.
277 ///
278 /// Scale N (multiply it by this). Uses full precision multiplication, even
279 /// if Width is smaller than 64, so information is not lost.
280 uint64_t scale(uint64_t N) const;
281 uint64_t scaleByInverse(uint64_t N) const {
282 // TODO: implement directly, rather than relying on inverse. Inverse is
283 // expensive.
284 return inverse().scale(N);
285 }
286 int64_t scale(int64_t N) const {
287 std::pair Unsigned = splitSigned(N);
288 return joinSigned(scale(Unsigned.first), Unsigned.second);
289 }
290 int64_t scaleByInverse(int64_t N) const {
291 std::pair Unsigned = splitSigned(N);
292 return joinSigned(scaleByInverse(Unsigned.first), Unsigned.second);
293 }
294
295 int compare(const PositiveFloat &X) const;
296 int compareTo(uint64_t N) const {
297 PositiveFloat Float = getFloat(N);
298 int Compare = compare(Float);
299 if (Width == 64 || Compare != 0)
300 return Compare;
301
302 // Check for precision loss. We know *this == RoundTrip.
303 uint64_t RoundTrip = Float.template toInt();
304 return N == RoundTrip ? 0 : RoundTrip < N ? -1 : 1;
305 }
306 int compareTo(int64_t N) const { return N < 0 ? 1 : compareTo(uint64_t(N)); }
307
308 PositiveFloat &invert() { return *this = PositiveFloat::getFloat(1) / *this; }
309 PositiveFloat inverse() const { return PositiveFloat(*this).invert(); }
310
311 private:
312 static PositiveFloat getProduct(DigitsType L, DigitsType R);
313 static PositiveFloat getQuotient(DigitsType Dividend, DigitsType Divisor);
314
315 std::pair lgImpl() const;
316 static int countLeadingZerosWidth(DigitsType Digits) {
317 if (Width == 64)
318 return countLeadingZeros64(Digits);
319 if (Width == 32)
320 return countLeadingZeros32(Digits);
321 return countLeadingZeros32(Digits) + Width - 32;
322 }
323
324 static PositiveFloat adjustToWidth(uint64_t N, int32_t S) {
325 assert(S >= MinExponent);
326 assert(S <= MaxExponent);
327 if (Width == 64 || N <= DigitsLimits::max())
328 return PositiveFloat(N, S);
329
330 // Shift right.
331 int Shift = 64 - Width - countLeadingZeros64(N);
332 DigitsType Shifted = N >> Shift;
333
334 // Round.
335 assert(S + Shift <= MaxExponent);
336 return getRounded(PositiveFloat(Shifted, S + Shift),
337 N & UINT64_C(1) << (Shift - 1));
338 }
339
340 static PositiveFloat getRounded(PositiveFloat P, bool Round) {
341 if (!Round)
342 return P;
343 if (P.Digits == DigitsLimits::max())
344 // Careful of overflow in the exponent.
345 return PositiveFloat(1, P.Exponent) <<= Width;
346 return PositiveFloat(P.Digits + 1, P.Exponent);
347 }
348 };
349
350 #define POSITIVE_FLOAT_BOP(op, base) \
351 template \
352 PositiveFloat operator op(const PositiveFloat &L, \
353 const PositiveFloat &R) { \
354 return PositiveFloat(L) base R; \
355 }
356 POSITIVE_FLOAT_BOP(+, += )
357 POSITIVE_FLOAT_BOP(-, -= )
358 POSITIVE_FLOAT_BOP(*, *= )
359 POSITIVE_FLOAT_BOP(/, /= )
360 POSITIVE_FLOAT_BOP(<<, <<= )
361 POSITIVE_FLOAT_BOP(>>, >>= )
362 #undef POSITIVE_FLOAT_BOP
363
364 template
365 raw_ostream &operator<<(raw_ostream &OS, const PositiveFloat &X) {
366 return X.print(OS, 10);
367 }
368
369 #define POSITIVE_FLOAT_COMPARE_TO_TYPE(op, T1, T2) \
370 template \
371 bool operator op(const PositiveFloat &L, T1 R) { \
372 return L.compareTo(T2(R)) op 0; \
373 } \
374 template \
375 bool operator op(T1 L, const PositiveFloat &R) { \
376 return 0 op R.compareTo(T2(L)); \
377 }
378 #define POSITIVE_FLOAT_COMPARE_TO(op) \
379 POSITIVE_FLOAT_COMPARE_TO_TYPE(op, uint64_t, uint64_t) \
380 POSITIVE_FLOAT_COMPARE_TO_TYPE(op, uint32_t, uint64_t) \
381 POSITIVE_FLOAT_COMPARE_TO_TYPE(op, int64_t, int64_t) \
382 POSITIVE_FLOAT_COMPARE_TO_TYPE(op, int32_t, int64_t)
383 POSITIVE_FLOAT_COMPARE_TO(< )
384 POSITIVE_FLOAT_COMPARE_TO(> )
385 POSITIVE_FLOAT_COMPARE_TO(== )
386 POSITIVE_FLOAT_COMPARE_TO(!= )
387 POSITIVE_FLOAT_COMPARE_TO(<= )
388 POSITIVE_FLOAT_COMPARE_TO(>= )
389 #undef POSITIVE_FLOAT_COMPARE_TO
390 #undef POSITIVE_FLOAT_COMPARE_TO_TYPE
391
392 template
393 uint64_t PositiveFloat::scale(uint64_t N) const {
394 if (Width == 64 || N <= DigitsLimits::max())
395 return (getFloat(N) * *this).template toInt();
396
397 // Defer to the 64-bit version.
398 return PositiveFloat(Digits, Exponent).scale(N);
399 }
400
401 template
402 PositiveFloat PositiveFloat::getProduct(DigitsType L,
403 DigitsType R) {
404 // Check for zero.
405 if (!L || !R)
406 return getZero();
407
408 // Check for numbers that we can compute with 64-bit math.
409 if (Width <= 32 || (L <= UINT32_MAX && R <= UINT32_MAX))
410 return adjustToWidth(uint64_t(L) * uint64_t(R), 0);
411
412 // Do the full thing.
413 return PositiveFloat(multiply64(L, R));
414 }
415 template
416 PositiveFloat PositiveFloat::getQuotient(DigitsType Dividend,
417 DigitsType Divisor) {
418 // Check for zero.
419 if (!Dividend)
420 return getZero();
421 if (!Divisor)
422 return getLargest();
423
424 if (Width == 64)
425 return PositiveFloat(divide64(Dividend, Divisor));
426
427 // We can compute this with 64-bit math.
428 int Shift = countLeadingZeros64(Dividend);
429 uint64_t Shifted = uint64_t(Dividend) << Shift;
430 uint64_t Quotient = Shifted / Divisor;
431
432 // If Quotient needs to be shifted, then adjustToWidth will round.
433 if (Quotient > DigitsLimits::max())
434 return adjustToWidth(Quotient, -Shift);
435
436 // Round based on the value of the next bit.
437 return getRounded(PositiveFloat(Quotient, -Shift),
438 Shifted % Divisor >= getHalf(Divisor));
439 }
440
441 template
442 template
443 IntT PositiveFloat::toInt() const {
444 typedef std::numeric_limits Limits;
445 if (*this < 1)
446 return 0;
447 if (*this >= Limits::max())
448 return Limits::max();
449
450 IntT N = Digits;
451 if (Exponent > 0) {
452 assert(size_t(Exponent) < sizeof(IntT) * 8);
453 return N << Exponent;
454 }
455 if (Exponent < 0) {
456 assert(size_t(-Exponent) < sizeof(IntT) * 8);
457 return N >> -Exponent;
458 }
459 return N;
460 }
461
462 template
463 std::pair PositiveFloat::lgImpl() const {
464 if (isZero())
465 return std::make_pair(INT32_MIN, 0);
466
467 // Get the floor of the lg of Digits.
468 int32_t LocalFloor = Width - countLeadingZerosWidth(Digits) - 1;
469
470 // Get the floor of the lg of this.
471 int32_t Floor = Exponent + LocalFloor;
472 if (Digits == UINT64_C(1) << LocalFloor)
473 return std::make_pair(Floor, 0);
474
475 // Round based on the next digit.
476 assert(LocalFloor >= 1);
477 bool Round = Digits & UINT64_C(1) << (LocalFloor - 1);
478 return std::make_pair(Floor + Round, Round ? 1 : -1);
479 }
480
481 template
482 PositiveFloat PositiveFloat::matchExponents(PositiveFloat X) {
483 if (isZero() || X.isZero() || Exponent == X.Exponent)
484 return X;
485
486 int32_t Diff = int32_t(X.Exponent) - int32_t(Exponent);
487 if (Diff > 0)
488 increaseExponentToMatch(X, Diff);
489 else
490 X.increaseExponentToMatch(*this, -Diff);
491 return X;
492 }
493 template
494 void PositiveFloat::increaseExponentToMatch(PositiveFloat &X,
495 int32_t ExponentDiff) {
496 assert(ExponentDiff > 0);
497 if (ExponentDiff >= 2 * Width) {
498 *this = getZero();
499 return;
500 }
501
502 // Use up any leading zeros on X, and then shift this.
503 int32_t ShiftX = std::min(countLeadingZerosWidth(X.Digits), ExponentDiff);
504 assert(ShiftX < Width);
505
506 int32_t ShiftThis = ExponentDiff - ShiftX;
507 if (ShiftThis >= Width) {
508 *this = getZero();
509 return;
510 }
511
512 X.Digits <<= ShiftX;
513 X.Exponent -= ShiftX;
514 Digits >>= ShiftThis;
515 Exponent += ShiftThis;
516 return;
517 }
518
519 template
520 PositiveFloat &PositiveFloat::
521 operator+=(const PositiveFloat &X) {
522 if (isLargest() || X.isZero())
523 return *this;
524 if (isZero() || X.isLargest())
525 return *this = X;
526
527 // Normalize exponents.
528 PositiveFloat Scaled = matchExponents(X);
529
530 // Check for zero again.
531 if (isZero())
532 return *this = Scaled;
533 if (Scaled.isZero())
534 return *this;
535
536 // Compute sum.
537 DigitsType Sum = Digits + Scaled.Digits;
538 bool DidOverflow = Sum < Digits;
539 Digits = Sum;
540 if (!DidOverflow)
541 return *this;
542
543 if (Exponent == MaxExponent)
544 return *this = getLargest();
545
546 ++Exponent;
547 Digits = UINT64_C(1) << (Width - 1) | Digits >> 1;
548
549 return *this;
550 }
551 template
552 PositiveFloat &PositiveFloat::
553 operator-=(const PositiveFloat &X) {
554 if (X.isZero())
555 return *this;
556 if (*this <= X)
557 return *this = getZero();
558
559 // Normalize exponents.
560 PositiveFloat Scaled = matchExponents(X);
561 assert(Digits >= Scaled.Digits);
562
563 // Compute difference.
564 if (!Scaled.isZero()) {
565 Digits -= Scaled.Digits;
566 return *this;
567 }
568
569 // Check if X just barely lost its last bit. E.g., for 32-bit:
570 //
571 // 1*2^32 - 1*2^0 == 0xffffffff != 1*2^32
572 if (*this == PositiveFloat(1, X.lgFloor() + Width)) {
573 Digits = DigitsType(0) - 1;
574 --Exponent;
575 }
576 return *this;
577 }
578 template
579 PositiveFloat &PositiveFloat::
580 operator*=(const PositiveFloat &X) {
581 if (isZero())
582 return *this;
583 if (X.isZero())
584 return *this = X;
585
586 // Save the exponents.
587 int32_t Exponents = int32_t(Exponent) + int32_t(X.Exponent);
588
589 // Get the raw product.
590 *this = getProduct(Digits, X.Digits);
591
592 // Combine with exponents.
593 return *this <<= Exponents;
594 }
595 template
596 PositiveFloat &PositiveFloat::
597 operator/=(const PositiveFloat &X) {
598 if (isZero())
599 return *this;
600 if (X.isZero())
601 return *this = getLargest();
602
603 // Save the exponents.
604 int32_t Exponents = int32_t(Exponent) - int32_t(X.Exponent);
605
606 // Get the raw quotient.
607 *this = getQuotient(Digits, X.Digits);
608
609 // Combine with exponents.
610 return *this <<= Exponents;
611 }
612 template
613 void PositiveFloat::shiftLeft(int32_t Shift) {
614 if (!Shift || isZero())
615 return;
616 assert(Shift != INT32_MIN);
617 if (Shift < 0) {
618 shiftRight(-Shift);
619 return;
620 }
621
622 // Shift as much as we can in the exponent.
623 int32_t ExponentShift = std::min(Shift, MaxExponent - Exponent);
624 Exponent += ExponentShift;
625 if (ExponentShift == Shift)
626 return;
627
628 // Check this late, since it's rare.
629 if (isLargest())
630 return;
631
632 // Shift the digits themselves.
633 Shift -= ExponentShift;
634 if (Shift > countLeadingZerosWidth(Digits)) {
635 // Saturate.
636 *this = getLargest();
637 return;
638 }
639
640 Digits <<= Shift;
641 return;
642 }
643
644 template
645 void PositiveFloat::shiftRight(int32_t Shift) {
646 if (!Shift || isZero())
647 return;
648 assert(Shift != INT32_MIN);
649 if (Shift < 0) {
650 shiftLeft(-Shift);
651 return;
652 }
653
654 // Shift as much as we can in the exponent.
655 int32_t ExponentShift = std::min(Shift, Exponent - MinExponent);
656 Exponent -= ExponentShift;
657 if (ExponentShift == Shift)
658 return;
659
660 // Shift the digits themselves.
661 Shift -= ExponentShift;
662 if (Shift >= Width) {
663 // Saturate.
664 *this = getZero();
665 return;
666 }
667
668 Digits >>= Shift;
669 return;
670 }
671
672 template
673 int PositiveFloat::compare(const PositiveFloat &X) const {
674 // Check for zero.
675 if (isZero())
676 return X.isZero() ? 0 : -1;
677 if (X.isZero())
678 return 1;
679
680 // Check for the scale. Use lgFloor to be sure that the exponent difference
681 // is always lower than 64.
682 int32_t lgL = lgFloor(), lgR = X.lgFloor();
683 if (lgL != lgR)
684 return lgL < lgR ? -1 : 1;
685
686 // Compare digits.
687 if (Exponent < X.Exponent)
688 return PositiveFloatBase::compare(Digits, X.Digits, X.Exponent - Exponent);
689
690 return -PositiveFloatBase::compare(X.Digits, Digits, Exponent - X.Exponent);
691 }
692
693 template struct isPodLike> {
694 static const bool value = true;
695 };
696 }
697
698 //===----------------------------------------------------------------------===//
699 //
700 // BlockMass definition.
701 //
702 // TODO: Make this private to BlockFrequencyInfoImpl or delete.
703 //
704 //===----------------------------------------------------------------------===//
705 namespace llvm {
706
707 /// \brief Mass of a block.
708 ///
709 /// This class implements a sort of fixed-point fraction always between 0.0 and
710 /// 1.0. getMass() == UINT64_MAX indicates a value of 1.0.
711 ///
712 /// Masses can be added and subtracted. Simple saturation arithmetic is used,
713 /// so arithmetic operations never overflow or underflow.
714 ///
715 /// Masses can be multiplied. Multiplication treats full mass as 1.0 and uses
716 /// an inexpensive floating-point algorithm that's off-by-one (almost, but not
717 /// quite, maximum precision).
718 ///
719 /// Masses can be scaled by \a BranchProbability at maximum precision.
720 class BlockMass {
721 uint64_t Mass;
722
723 public:
724 BlockMass() : Mass(0) {}
725 explicit BlockMass(uint64_t Mass) : Mass(Mass) {}
726
727 static BlockMass getEmpty() { return BlockMass(); }
728 static BlockMass getFull() { return BlockMass(UINT64_MAX); }
729
730 uint64_t getMass() const { return Mass; }
731
732 bool isFull() const { return Mass == UINT64_MAX; }
733 bool isEmpty() const { return !Mass; }
734
735 bool operator!() const { return isEmpty(); }
736
737 /// \brief Add another mass.
738 ///
739 /// Adds another mass, saturating at \a isFull() rather than overflowing.
740 BlockMass &operator+=(const BlockMass &X) {
741 uint64_t Sum = Mass + X.Mass;
742 Mass = Sum < Mass ? UINT64_MAX : Sum;
743 return *this;
744 }
745
746 /// \brief Subtract another mass.
747 ///
748 /// Subtracts another mass, saturating at \a isEmpty() rather than
749 /// undeflowing.
750 BlockMass &operator-=(const BlockMass &X) {
751 uint64_t Diff = Mass - X.Mass;
752 Mass = Diff > Mass ? 0 : Diff;
753 return *this;
754 }
755
756 /// \brief Scale by another mass.
757 ///
758 /// The current implementation is a little imprecise, but it's relatively
759 /// fast, never overflows, and maintains the property that 1.0*1.0==1.0
760 /// (where isFull represents the number 1.0). It's an approximation of
761 /// 128-bit multiply that gets right-shifted by 64-bits.
762 ///
763 /// For a given digit size, multiplying two-digit numbers looks like:
764 ///
765 /// U1 . L1
766 /// * U2 . L2
767 /// ============
768 /// 0 . . L1*L2
769 /// + 0 . U1*L2 . 0 // (shift left once by a digit-size)
770 /// + 0 . U2*L1 . 0 // (shift left once by a digit-size)
771 /// + U1*L2 . 0 . 0 // (shift left twice by a digit-size)
772 ///
773 /// BlockMass has 64-bit numbers. Split each into two 32-bit digits, stored
774 /// 64-bit. Add 1 to the lower digits, to model isFull as 1.0; this won't
775 /// overflow, since we have 64-bit storage for each digit.
776 ///
777 /// To do this accurately, (a) multiply into two 64-bit digits, incrementing
778 /// the upper digit on overflows of the lower digit (carry), (b) subtract 1
779 /// from the lower digit, decrementing the upper digit on underflow (carry),
780 /// and (c) truncate the lower digit. For the 1.0*1.0 case, the upper digit
781 /// will be 0 at the end of step (a), and then will underflow back to isFull
782 /// (1.0) in step (b).
783 ///
784 /// Instead, the implementation does something a little faster with a small
785 /// loss of accuracy: ignore the lower 64-bit digit entirely. The loss of
786 /// accuracy is small, since the sum of the unmodelled carries is 0 or 1
787 /// (i.e., step (a) will overflow at most once, and step (b) will underflow
788 /// only if step (a) overflows).
789 ///
790 /// This is the formula we're calculating:
791 ///
792 /// U1.L1 * U2.L2 == U1 * U2 + (U1 * (L2+1))>>32 + (U2 * (L1+1))>>32
793 ///
794 /// As a demonstration of 1.0*1.0, consider two 4-bit numbers that are both
795 /// full (1111).
796 ///
797 /// U1.L1 * U2.L2 == U1 * U2 + (U1 * (L2+1))>>2 + (U2 * (L1+1))>>2
798 /// 11.11 * 11.11 == 11 * 11 + (11 * (11+1))/4 + (11 * (11+1))/4
799 /// == 1001 + (11 * 100)/4 + (11 * 100)/4
800 /// == 1001 + 1100/4 + 1100/4
801 /// == 1001 + 0011 + 0011
802 /// == 1111
803 BlockMass &operator*=(const BlockMass &X) {
804 uint64_t U1 = Mass >> 32, L1 = Mass & UINT32_MAX, U2 = X.Mass >> 32,
805 L2 = X.Mass & UINT32_MAX;
806 Mass = U1 * U2 + (U1 * (L2 + 1) >> 32) + ((L1 + 1) * U2 >> 32);
807 return *this;
808 }
809
810 /// \brief Multiply by a branch probability.
811 ///
812 /// Multiply by P. Guarantees full precision.
813 ///
814 /// This could be naively implemented by multiplying by the numerator and
815 /// dividing by the denominator, but in what order? Multiplying first can
816 /// overflow, while dividing first will lose precision (potentially, changing
817 /// a non-zero mass to zero).
818 ///
819 /// The implementation mixes the two methods. Since \a BranchProbability
820 /// uses 32-bits and \a BlockMass 64-bits, shift the mass as far to the left
821 /// as there is room, then divide by the denominator to get a quotient.
822 /// Multiplying by the numerator and right shifting gives a first
823 /// approximation.
824 ///
825 /// Calculate the error in this first approximation by calculating the
826 /// opposite mass (multiply by the opposite numerator and shift) and
827 /// subtracting both from teh original mass.
828 ///
829 /// Add to the first approximation the correct fraction of this error value.
830 /// This time, multiply first and then divide, since there is no danger of
831 /// overflow.
832 ///
833 /// \pre P represents a fraction between 0.0 and 1.0.
834 BlockMass &operator*=(const BranchProbability &P);
835
836 bool operator==(const BlockMass &X) const { return Mass == X.Mass; }
837 bool operator!=(const BlockMass &X) const { return Mass != X.Mass; }
838 bool operator<=(const BlockMass &X) const { return Mass <= X.Mass; }
839 bool operator>=(const BlockMass &X) const { return Mass >= X.Mass; }
840 bool operator<(const BlockMass &X) const { return Mass < X.Mass; }
841 bool operator>(const BlockMass &X) const { return Mass > X.Mass; }
842
843 /// \brief Convert to floating point.
844 ///
845 /// Convert to a float. \a isFull() gives 1.0, while \a isEmpty() gives
846 /// slightly above 0.0.
847 PositiveFloat toFloat() const;
848
849 void dump() const;
850 raw_ostream &print(raw_ostream &OS) const;
851 };
852
853 inline BlockMass operator+(const BlockMass &L, const BlockMass &R) {
854 return BlockMass(L) += R;
855 }
856 inline BlockMass operator-(const BlockMass &L, const BlockMass &R) {
857 return BlockMass(L) -= R;
858 }
859 inline BlockMass operator*(const BlockMass &L, const BlockMass &R) {
860 return BlockMass(L) *= R;
861 }
862 inline BlockMass operator*(const BlockMass &L, const BranchProbability &R) {
863 return BlockMass(L) *= R;
864 }
865 inline BlockMass operator*(const BranchProbability &L, const BlockMass &R) {
866 return BlockMass(R) *= L;
867 }
868
869 inline raw_ostream &operator<<(raw_ostream &OS, const BlockMass &X) {
870 return X.print(OS);
871 }
872
873 template <> struct isPodLike {
874 static const bool value = true;
875 };
876 }
877
878 //===----------------------------------------------------------------------===//
879 //
880 // BlockFrequencyInfoImpl definition.
881 //
882 //===----------------------------------------------------------------------===//
883 namespace llvm {
884
885 class BasicBlock;
30
88631 class BranchProbabilityInfo;
887 class Function;
888 class Loop;
889 class LoopInfo;
890 class MachineBasicBlock;
32 class BlockFrequencyInfo;
89133 class MachineBranchProbabilityInfo;
892 class MachineFunction;
893 class MachineLoop;
894 class MachineLoopInfo;
895
896 /// \brief Base class for BlockFrequencyInfoImpl
897 ///
898 /// BlockFrequencyInfoImplBase has supporting data structures and some
899 /// algorithms for BlockFrequencyInfoImpl. Only algorithms that depend on
900 /// the block type (or that call such algorithms) are skipped here.
901 ///
902 /// Nevertheless, the majority of the overall algorithm documentation lives with
903 /// BlockFrequencyInfoImpl. See there for details.
904 class BlockFrequencyInfoImplBase {
905 public:
906 typedef PositiveFloat Float;
907
908 /// \brief Representative of a block.
909 ///
910 /// This is a simple wrapper around an index into the reverse-post-order
911 /// traversal of the blocks.
912 ///
913 /// Unlike a block pointer, its order has meaning (location in the
914 /// topological sort) and its class is the same regardless of block type.
915 struct BlockNode {
916 typedef uint32_t IndexType;
917 IndexType Index;
918
919 bool operator==(const BlockNode &X) const { return Index == X.Index; }
920 bool operator!=(const BlockNode &X) const { return Index != X.Index; }
921 bool operator<=(const BlockNode &X) const { return Index <= X.Index; }
922 bool operator>=(const BlockNode &X) const { return Index >= X.Index; }
923 bool operator<(const BlockNode &X) const { return Index < X.Index; }
924 bool operator>(const BlockNode &X) const { return Index > X.Index; }
925
926 BlockNode() : Index(UINT32_MAX) {}
927 BlockNode(IndexType Index) : Index(Index) {}
928
929 bool isValid() const { return Index <= getMaxIndex(); }
930 static size_t getMaxIndex() { return UINT32_MAX - 1; }
931 };
932
933 /// \brief Stats about a block itself.
934 struct FrequencyData {
935 Float Floating;
936 uint64_t Integer;
937 };
938
939 /// \brief Index of loop information.
940 struct WorkingData {
941 BlockNode ContainingLoop; ///< The block whose loop this block is inside.
942 uint32_t LoopIndex; ///< Index into PackagedLoops.
943 bool IsPackaged; ///< Has ContainingLoop been packaged up?
944 bool IsAPackage; ///< Has this block's loop been packaged up?
945 BlockMass Mass; ///< Mass distribution from the entry block.
946
947 WorkingData()
948 : LoopIndex(UINT32_MAX), IsPackaged(false), IsAPackage(false) {}
949
950 bool hasLoopHeader() const { return ContainingLoop.isValid(); }
951 bool isLoopHeader() const { return LoopIndex != UINT32_MAX; }
952 };
953
954 /// \brief Unscaled probability weight.
955 ///
956 /// Probability weight for an edge in the graph (including the
957 /// successor/target node).
958 ///
959 /// All edges in the original function are 32-bit. However, exit edges from
960 /// loop packages are taken from 64-bit exit masses, so we need 64-bits of
961 /// space in general.
962 ///
963 /// In addition to the raw weight amount, Weight stores the type of the edge
964 /// in the current context (i.e., the context of the loop being processed).
965 /// Is this a local edge within the loop, an exit from the loop, or a
966 /// backedge to the loop header?
967 struct Weight {
968 enum DistType { Local, Exit, Backedge };
969 DistType Type;
970 BlockNode TargetNode;
971 uint64_t Amount;
972 Weight() : Type(Local), Amount(0) {}
973 };
974
975 /// \brief Distribution of unscaled probability weight.
976 ///
977 /// Distribution of unscaled probability weight to a set of successors.
978 ///
979 /// This class collates the successor edge weights for later processing.
980 ///
981 /// \a DidOverflow indicates whether \a Total did overflow while adding to
982 /// the distribution. It should never overflow twice. There's no flag for
983 /// whether \a ForwardTotal overflows, since when \a Total exceeds 32-bits
984 /// they both get re-computed during \a normalize().
985 struct Distribution {
986 typedef SmallVector WeightList;
987 WeightList Weights; ///< Individual successor weights.
988 uint64_t Total; ///< Sum of all weights.
989 bool DidOverflow; ///< Whether \a Total did overflow.
990 uint32_t ForwardTotal; ///< Total excluding backedges.
991
992 Distribution() : Total(0), DidOverflow(false), ForwardTotal(0) {}
993 void addLocal(const BlockNode &Node, uint64_t Amount) {
994 add(Node, Amount, Weight::Local);
995 }
996 void addExit(const BlockNode &Node, uint64_t Amount) {
997 add(Node, Amount, Weight::Exit);
998 }
999 void addBackedge(const BlockNode &Node, uint64_t Amount) {
1000 add(Node, Amount, Weight::Backedge);
1001 }
1002
1003 /// \brief Normalize the distribution.
1004 ///
1005 /// Combines multiple edges to the same \a Weight::TargetNode and scales
1006 /// down so that \a Total fits into 32-bits.
1007 ///
1008 /// This is linear in the size of \a Weights. For the vast majority of
1009 /// cases, adjacent edge weights are combined by sorting WeightList and
1010 /// combining adjacent weights. However, for very large edge lists an
1011 /// auxiliary hash table is used.
1012 void normalize();
1013
1014 private:
1015 void add(const BlockNode &Node, uint64_t Amount, Weight::DistType Type);
1016 };
1017
1018 /// \brief Data for a packaged loop.
1019 ///
1020 /// Contains the data necessary to represent a loop as a node once
1021 /// it's packaged.
1022 ///
1023 /// PackagedLoopData holds the mass and scale that give the node the necessary
1024 /// stats. Further, it has a list of successors, list of members, and stores
1025 /// the backedge mass assigned to this loop.
1026 struct PackagedLoopData {
1027 typedef SmallVector, 4> ExitMap;
1028 typedef SmallVector MemberList;
1029 BlockNode Header; ///< Header.
1030 ExitMap Exits; ///< Successor edges (and weights).
1031 MemberList Members; ///< Members of the loop.
1032 BlockMass BackedgeMass; ///< Mass returned to loop header.
1033 BlockMass Mass;
1034 Float Scale;
1035
1036 PackagedLoopData(const BlockNode &Header) : Header(Header) {}
1037 };
1038
1039 /// \brief Data about each block. This is used downstream.
1040 std::vector Freqs;
1041
1042 /// \brief Loop data: see initializeLoops().
1043 std::vector Working;
1044
1045 /// \brief Indexed information about packaged loops.
1046 std::vector PackagedLoops;
1047
1048 /// \brief Create the initial loop packages.
1049 ///
1050 /// Initializes PackagedLoops using the data in Working about backedges
1051 /// and containing loops. Called by initializeLoops().
1052 ///
1053 /// \post WorkingData::LoopIndex has been initialized for every loop header
1054 /// and PackagedLoopData::Members has been initialized.
1055
1056 /// \brief Add all edges out of a packaged loop to the distribution.
1057 ///
1058 /// Adds all edges from LocalLoopHead to Dist. Calls addToDist() to add each
1059 /// successor edge.
1060 void addLoopSuccessorsToDist(const BlockNode &LoopHead,
1061 const BlockNode &LocalLoopHead,
1062 Distribution &Dist);
1063
1064 /// \brief Add an edge to the distribution.
1065 ///
1066 /// Adds an edge to Succ to Dist. If \c LoopHead.isValid(), then whether the
1067 /// edge is forward/exit/backedge is in the context of LoopHead. Otherwise,
1068 /// every edge should be a forward edge (since all the loops are packaged
1069 /// up).
1070 void addToDist(Distribution &Dist, const BlockNode &LoopHead,
1071 const BlockNode &Pred, const BlockNode &Succ, uint64_t Weight);
1072
1073 PackagedLoopData &getLoopPackage(const BlockNode &Head) {
1074 assert(Head.Index < Working.size());
1075 size_t Index = Working[Head.Index].LoopIndex;
1076 assert(Index < PackagedLoops.size());
1077 return PackagedLoops[Index];
1078 }
1079
1080 /// \brief Distribute mass according to a distribution.
1081 ///
1082 /// Distributes the mass in Source according to Dist. If LoopHead.isValid(),
1083 /// backedges and exits are stored in its entry in PackagedLoops.
1084 ///
1085 /// Mass is distributed in parallel from two copies of the source mass.
1086 ///
1087 /// The first mass (forward) represents the distribution of mass through the
1088 /// local DAG. This distribution should lose mass at loop exits and ignore
1089 /// backedges.
1090 ///
1091 /// The second mass (general) represents the behavior of the loop in the
1092 /// global context. In a given distribution from the head, how much mass
1093 /// exits, and to where? How much mass returns to the loop head?
1094 ///
1095 /// The forward mass should be split up between local successors and exits,
1096 /// but only actually distributed to the local successors. The general mass
1097 /// should be split up between all three types of successors, but distributed
1098 /// only to exits and backedges.
1099 void distributeMass(const BlockNode &Source, const BlockNode &LoopHead,
1100 Distribution &Dist);
1101
1102 /// \brief Compute the loop scale for a loop.
1103 void computeLoopScale(const BlockNode &LoopHead);
1104
1105 /// \brief Package up a loop.
1106 void packageLoop(const BlockNode &LoopHead);
1107
1108 /// \brief Finalize frequency metrics.
1109 ///
1110 /// Unwraps loop packages, calculates final frequencies, and cleans up
1111 /// no-longer-needed data structures.
1112 void finalizeMetrics();
1113
1114 /// \brief Clear all memory.
1115 void clear();
1116
1117 virtual std::string getBlockName(const BlockNode &Node) const;
1118
1119 virtual raw_ostream &print(raw_ostream &OS) const { return OS; }
1120 void dump() const { print(dbgs()); }
1121
1122 Float getFloatingBlockFreq(const BlockNode &Node) const;
1123
1124 BlockFrequency getBlockFreq(const BlockNode &Node) const;
1125
1126 raw_ostream &printBlockFreq(raw_ostream &OS, const BlockNode &Node) const;
1127 raw_ostream &printBlockFreq(raw_ostream &OS,
1128 const BlockFrequency &Freq) const;
1129
1130 uint64_t getEntryFreq() const {
1131 assert(!Freqs.empty());
1132 return Freqs[0].Integer;
1133 }
1134 /// \brief Virtual destructor.
1135 ///
1136 /// Need a virtual destructor to mask the compiler warning about
1137 /// getBlockName().
1138 virtual ~BlockFrequencyInfoImplBase() {}
1139 };
34 class MachineBlockFrequencyInfo;
114035
114136 namespace bfi_detail {
114237 template struct TypeMap {};
114439 typedef BasicBlock BlockT;
114540 typedef Function FunctionT;
114641 typedef BranchProbabilityInfo BranchProbabilityInfoT;
1147 typedef Loop LoopT;
1148 typedef LoopInfo LoopInfoT;
114942 };
115043 template <> struct TypeMap {
115144 typedef MachineBasicBlock BlockT;
115245 typedef MachineFunction FunctionT;
115346 typedef MachineBranchProbabilityInfo BranchProbabilityInfoT;
1154 typedef MachineLoop LoopT;
1155 typedef MachineLoopInfo LoopInfoT;
115647 };
1157
1158 /// \brief Get the name of a MachineBasicBlock.
1159 ///
1160 /// Get the name of a MachineBasicBlock. It's templated so that including from
1161 /// CodeGen is unnecessary (that would be a layering issue).
1162 ///
1163 /// This is used mainly for debug output. The name is similar to
1164 /// MachineBasicBlock::getFullName(), but skips the name of the function.
1165 template std::string getBlockName(const BlockT *BB) {
1166 assert(BB && "Unexpected nullptr");
1167 auto MachineName = "BB" + Twine(BB->getNumber());
1168 if (BB->getBasicBlock())
1169 return (MachineName + "[" + BB->getName() + "]").str();
1170 return MachineName.str();
117148 }
1172 /// \brief Get the name of a BasicBlock.
1173 template <> inline std::string getBlockName(const BasicBlock *BB) {
1174 assert(BB && "Unexpected nullptr");
1175 return BB->getName().str();
1176 }
1177 }
1178
1179 /// \brief Shared implementation for block frequency analysis.
1180 ///
1181 /// This is a shared implementation of BlockFrequencyInfo and
1182 /// MachineBlockFrequencyInfo, and calculates the relative frequencies of
1183 /// blocks.
1184 ///
1185 /// This algorithm leverages BlockMass and PositiveFloat to maintain precision,
1186 /// separates mass distribution from loop scaling, and dithers to eliminate
1187 /// probability mass loss.
1188 ///
1189 /// The implementation is split between BlockFrequencyInfoImpl, which knows the
1190 /// type of graph being modelled (BasicBlock vs. MachineBasicBlock), and
1191 /// BlockFrequencyInfoImplBase, which doesn't. The base class uses \a
1192 /// BlockNode, a wrapper around a uint32_t. BlockNode is numbered from 0 in
1193 /// reverse-post order. This gives two advantages: it's easy to compare the
1194 /// relative ordering of two nodes, and maps keyed on BlockT can be represented
1195 /// by vectors.
1196 ///
1197 /// This algorithm is O(V+E), unless there is irreducible control flow, in
1198 /// which case it's O(V*E) in the worst case.
1199 ///
1200 /// These are the main stages:
1201 ///
1202 /// 0. Reverse post-order traversal (\a initializeRPOT()).
1203 ///
1204 /// Run a single post-order traversal and save it (in reverse) in RPOT.
1205 /// All other stages make use of this ordering. Save a lookup from BlockT
1206 /// to BlockNode (the index into RPOT) in Nodes.
1207 ///
1208 /// 1. Loop indexing (\a initializeLoops()).
1209 ///
1210 /// Translate LoopInfo/MachineLoopInfo into a form suitable for the rest of
1211 /// the algorithm. In particular, store the immediate members of each loop
1212 /// in reverse post-order.
1213 ///
1214 /// 2. Calculate mass and scale in loops (\a computeMassInLoops()).
1215 ///
1216 /// For each loop (bottom-up), distribute mass through the DAG resulting
1217 /// from ignoring backedges and treating sub-loops as a single pseudo-node.
1218 /// Track the backedge mass distributed to the loop header, and use it to
1219 /// calculate the loop scale (number of loop iterations).
1220 ///
1221 /// Visiting loops bottom-up is a post-order traversal of loop headers.
1222 /// For each loop, immediate members that represent sub-loops will already
1223 /// have been visited and packaged into a pseudo-node.
1224 ///
1225 /// Distributing mass in a loop is a reverse-post-order traversal through
1226 /// the loop. Start by assigning full mass to the Loop header. For each
1227 /// node in the loop:
1228 ///
1229 /// - Fetch and categorize the weight distribution for its successors.
1230 /// If this is a packaged-subloop, the weight distribution is stored
1231 /// in \a PackagedLoopData::Exits. Otherwise, fetch it from
1232 /// BranchProbabilityInfo.
1233 ///
1234 /// - Each successor is categorized as \a Weight::Local, a normal
1235 /// forward edge within the current loop, \a Weight::Backedge, a
1236 /// backedge to the loop header, or \a Weight::Exit, any successor
1237 /// outside the loop. The weight, the successor, and its category
1238 /// are stored in \a Distribution. There can be multiple edges to
1239 /// each successor.
1240 ///
1241 /// - Normalize the distribution: scale weights down so that their sum
1242 /// is 32-bits, and coalesce multiple edges to the same node.
1243 ///
1244 /// - Distribute the mass accordingly, dithering to minimize mass loss,
1245 /// as described in \a distributeMass(). Mass is distributed in
1246 /// parallel in two ways: forward, and general. Local successors
1247 /// take their mass from the forward mass, while exit and backedge
1248 /// successors take their mass from the general mass. Additionally,
1249 /// exit edges use up (ignored) mass from the forward mass, and local
1250 /// edges use up (ignored) mass from the general distribution.
1251 ///
1252 /// Finally, calculate the loop scale from the accumulated backedge mass.
1253 ///
1254 /// 3. Distribute mass in the function (\a computeMassInFunction()).
1255 ///
1256 /// Finally, distribute mass through the DAG resulting from packaging all
1257 /// loops in the function. This uses the same algorithm as distributing
1258 /// mass in a loop, except that there are no exit or backedge edges.
1259 ///
1260 /// 4. Loop unpackaging and cleanup (\a finalizeMetrics()).
1261 ///
1262 /// Initialize the frequency to a floating point representation of its
1263 /// mass.
1264 ///
1265 /// Visit loops top-down (reverse post-order), scaling the loop header's
1266 /// frequency by its pseudo-node's mass and loop scale. Keep track of the
1267 /// minimum and maximum final frequencies.
1268 ///
1269 /// Using the min and max frequencies as a guide, translate floating point
1270 /// frequencies to an appropriate range in uint64_t.
1271 ///
1272 /// It has some known flaws.
1273 ///
1274 /// - Irreducible control flow isn't modelled correctly. In particular,
1275 /// LoopInfo and MachineLoopInfo ignore irreducible backedges. The main
1276 /// result is that irreducible SCCs will be under-scaled. No mass is lost,
1277 /// but the computed branch weights for the loop pseudo-node will be
1278 /// incorrect.
1279 ///
1280 /// Modelling irreducible control flow exactly involves setting up and
1281 /// solving a group of infinite geometric series. Such precision is
1282 /// unlikely to be worthwhile, since most of our algorithms give up on
1283 /// irreducible control flow anyway.
1284 ///
1285 /// Nevertheless, we might find that we need to get closer. If
1286 /// LoopInfo/MachineLoopInfo flags loops with irreducible control flow
1287 /// (and/or the function as a whole), we can find the SCCs, compute an
1288 /// approximate exit frequency for the SCC as a whole, and scale up
1289 /// accordingly.
1290 ///
1291 /// - Loop scale is limited to 4096 per loop (2^12) to avoid exhausting
1292 /// BlockFrequency's 64-bit integer precision.
1293 template class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
49
50 /// BlockFrequencyInfoImpl implements block frequency algorithm for IR and
51 /// Machine Instructions. Algorithm starts with value ENTRY_FREQ
52 /// for the entry block and then propagates frequencies using branch weights
53 /// from (Machine)BranchProbabilityInfo. LoopInfo is not required because
54 /// algorithm can find "backedges" by itself.
55 template
56 class BlockFrequencyInfoImpl {
129457 typedef typename bfi_detail::TypeMap::BlockT BlockT;
129558 typedef typename bfi_detail::TypeMap::FunctionT FunctionT;
129659 typedef typename bfi_detail::TypeMap::BranchProbabilityInfoT
129760 BranchProbabilityInfoT;
1298 typedef typename bfi_detail::TypeMap::LoopT LoopT;
1299 typedef typename bfi_detail::TypeMap::LoopInfoT LoopInfoT;
1300
1301 typedef GraphTraits Successor;
1302 typedef GraphTraits> Predecessor;
1303
1304 const BranchProbabilityInfoT *BPI;
1305 const LoopInfoT *LI;
1306 const FunctionT *F;
1307
1308 // All blocks in reverse postorder.
1309 std::vector RPOT;
1310 DenseMap Nodes;
1311
1312 typedef typename std::vector::const_iterator rpot_iterator;
1313
1314 rpot_iterator rpot_begin() const { return RPOT.begin(); }
1315 rpot_iterator rpot_end() const { return RPOT.end(); }
1316
1317 size_t getIndex(const rpot_iterator &I) const { return I - rpot_begin(); }
1318
1319 BlockNode getNode(const rpot_iterator &I) const {
1320 return BlockNode(getIndex(I));
1321 }
1322 BlockNode getNode(const BlockT *BB) const { return Nodes.lookup(BB); }
1323
1324 const BlockT *getBlock(const BlockNode &Node) const {
1325 assert(Node.Index < RPOT.size());
1326 return RPOT[Node.Index];
1327 }
1328
1329 void initializeRPOT();
1330 void initializeLoops();
1331 void runOnFunction(const FunctionT *F);
1332
1333 void propagateMassToSuccessors(const BlockNode &LoopHead,
1334 const BlockNode &Node);
1335 void computeMassInLoops();
1336 void computeMassInLoop(const BlockNode &LoopHead);
1337 void computeMassInFunction();
1338
1339 std::string getBlockName(const BlockNode &Node) const override {
1340 return bfi_detail::getBlockName(getBlock(Node));
61
62 DenseMap Freqs;
63
64 BranchProbabilityInfoT *BPI;
65
66 FunctionT *Fn;
67
68 typedef GraphTraits< Inverse > GT;
69
70 static const uint64_t EntryFreq = 1 << 14;
71
72 std::string getBlockName(BasicBlock *BB) const {
73 return BB->getName().str();
74 }
75
76 std::string getBlockName(MachineBasicBlock *MBB) const {
77 std::string str;
78 raw_string_ostream ss(str);
79 ss << "BB#" << MBB->getNumber();
80
81 if (const BasicBlock *BB = MBB->getBasicBlock())
82 ss << " derived from LLVM BB " << BB->getName();
83
84 return ss.str();
85 }
86
87 void setBlockFreq(BlockT *BB, BlockFrequency Freq) {
88 Freqs[BB] = Freq;
89 DEBUG(dbgs() << "Frequency(" << getBlockName(BB) << ") = ";
90 printBlockFreq(dbgs(), Freq) << "\n");
91 }
92
93 /// getEdgeFreq - Return edge frequency based on SRC frequency and Src -> Dst
94 /// edge probability.
95 BlockFrequency getEdgeFreq(BlockT *Src, BlockT *Dst) const {
96 BranchProbability Prob = BPI->getEdgeProbability(Src, Dst);
97 return getBlockFreq(Src) * Prob;
98 }
99
100 /// incBlockFreq - Increase BB block frequency by FREQ.
101 ///
102 void incBlockFreq(BlockT *BB, BlockFrequency Freq) {
103 Freqs[BB] += Freq;
104 DEBUG(dbgs() << "Frequency(" << getBlockName(BB) << ") += ";
105 printBlockFreq(dbgs(), Freq) << " --> ";
106 printBlockFreq(dbgs(), Freqs[BB]) << "\n");
107 }
108
109 // All blocks in postorder.
110 std::vector POT;
111
112 // Map Block -> Position in reverse-postorder list.
113 DenseMap RPO;
114
115 // For each loop header, record the per-iteration probability of exiting the
116 // loop. This is the reciprocal of the expected number of loop iterations.
117 typedef DenseMap LoopExitProbMap;
118 LoopExitProbMap LoopExitProb;
119
120 // (reverse-)postorder traversal iterators.
121 typedef typename std::vector::iterator pot_iterator;
122 typedef typename std::vector::reverse_iterator rpot_iterator;
123
124 pot_iterator pot_begin() { return POT.begin(); }
125 pot_iterator pot_end() { return POT.end(); }
126
127 rpot_iterator rpot_begin() { return POT.rbegin(); }
128 rpot_iterator rpot_end() { return POT.rend(); }
129
130 rpot_iterator rpot_at(BlockT *BB) {
131 rpot_iterator I = rpot_begin();
132 unsigned idx = RPO.lookup(BB);
133 assert(idx);
134 std::advance(I, idx - 1);
135
136 assert(*I == BB);
137 return I;
138 }
139
140 /// isBackedge - Return if edge Src -> Dst is a reachable backedge.
141 ///
142 bool isBackedge(BlockT *Src, BlockT *Dst) const {
143 unsigned a = RPO.lookup(Src);
144 if (!a)
145 return false;
146 unsigned b = RPO.lookup(Dst);
147 assert(b && "Destination block should be reachable");
148 return a >= b;
149 }
150
151 /// getSingleBlockPred - return single BB block predecessor or NULL if
152 /// BB has no predecessors or more than one.
153 BlockT *getSingleBlockPred(BlockT *BB) {
154 typename GT::ChildIteratorType
155 PI = GraphTraits< Inverse >::child_begin(BB),
156 PE = GraphTraits< Inverse >::child_end(BB);
157
158 if (PI == PE)
159 return nullptr;
160
161 BlockT *Pred = *PI;
162
163 ++PI;
164 if (PI != PE)
165 return nullptr;
166
167 return Pred;
168 }
169
170 void doBlock(BlockT *BB, BlockT *LoopHead,
171 SmallPtrSet &BlocksInLoop) {
172
173 DEBUG(dbgs() << "doBlock(" << getBlockName(BB) << ")\n");
174 setBlockFreq(BB, 0);
175
176 if (BB == LoopHead) {
177 setBlockFreq(BB, EntryFreq);
178 return;
179 }
180
181 if (BlockT *Pred = getSingleBlockPred(BB)) {
182 if (BlocksInLoop.count(Pred))
183 setBlockFreq(BB, getEdgeFreq(Pred, BB));
184 // TODO: else? irreducible, ignore it for now.
185 return;
186 }
187
188 bool isInLoop = false;
189 bool isLoopHead = false;
190
191 for (typename GT::ChildIteratorType
192 PI = GraphTraits< Inverse >::child_begin(BB),
193 PE = GraphTraits< Inverse >::child_end(BB);
194 PI != PE; ++PI) {
195 BlockT *Pred = *PI;
196
197 if (isBackedge(Pred, BB)) {
198 isLoopHead = true;
199 } else if (BlocksInLoop.count(Pred)) {
200 incBlockFreq(BB, getEdgeFreq(Pred, BB));
201 isInLoop = true;
202 }
203 // TODO: else? irreducible.
204 }
205
206 if (!isInLoop)
207 return;
208
209 if (!isLoopHead)
210 return;
211
212 // This block is a loop header, so boost its frequency by the expected
213 // number of loop iterations. The loop blocks will be revisited so they all
214 // get this boost.
215 typename LoopExitProbMap::const_iterator I = LoopExitProb.find(BB);
216 assert(I != LoopExitProb.end() && "Loop header missing from table");
217 Freqs[BB] /= I->second;
218 DEBUG(dbgs() << "Loop header scaled to ";
219 printBlockFreq(dbgs(), Freqs[BB]) << ".\n");
220 }
221
222 /// doLoop - Propagate block frequency down through the loop.
223 void doLoop(BlockT *Head, BlockT *Tail) {
224 DEBUG(dbgs() << "doLoop(" << getBlockName(Head) << ", "
225 << getBlockName(Tail) << ")\n");
226
227 SmallPtrSet BlocksInLoop;
228
229 for (rpot_iterator I = rpot_at(Head), E = rpot_at(Tail); ; ++I) {
230 BlockT *BB = *I;
231 doBlock(BB, Head, BlocksInLoop);
232
233 BlocksInLoop.insert(BB);
234 if (I == E)
235 break;
236 }
237
238 // Compute loop's cyclic probability using backedges probabilities.
239 BlockFrequency BackFreq;
240 for (typename GT::ChildIteratorType
241 PI = GraphTraits< Inverse >::child_begin(Head),
242 PE = GraphTraits< Inverse >::child_end(Head);
243 PI != PE; ++PI) {
244 BlockT *Pred = *PI;
245 assert(Pred);
246 if (isBackedge(Pred, Head))
247 BackFreq += getEdgeFreq(Pred, Head);
248 }
249
250 // The cyclic probability is freq(BackEdges) / freq(Head), where freq(Head)
251 // only counts edges entering the loop, not the loop backedges.
252 // The probability of leaving the loop on each iteration is:
253 //
254 // ExitProb = 1 - CyclicProb
255 //
256 // The Expected number of loop iterations is:
257 //
258 // Iterations = 1 / ExitProb
259 //
260 uint64_t D = std::max(getBlockFreq(Head).getFrequency(), UINT64_C(1));
261 uint64_t N = std::max(BackFreq.getFrequency(), UINT64_C(1));
262 if (N < D)
263 N = D - N;
264 else
265 // We'd expect N < D, but rounding and saturation means that can't be
266 // guaranteed.
267 N = 1;
268
269 // Now ExitProb = N / D, make sure it fits in an i32/i32 fraction.
270 assert(N <= D);
271 if (D > UINT32_MAX) {
272 unsigned Shift = 32 - countLeadingZeros(D);
273 D >>= Shift;
274 N >>= Shift;
275 if (N == 0)
276 N = 1;
277 }
278 BranchProbability LEP = BranchProbability(N, D);
279 LoopExitProb.insert(std::make_pair(Head, LEP));
280 DEBUG(dbgs() << "LoopExitProb[" << getBlockName(Head) << "] = " << LEP
281 << " from 1 - ";
282 printBlockFreq(dbgs(), BackFreq) << " / ";
283 printBlockFreq(dbgs(), getBlockFreq(Head)) << ".\n");
284 }
285
286 friend class BlockFrequencyInfo;
287 friend class MachineBlockFrequencyInfo;
288
289 BlockFrequencyInfoImpl() { }
290
291 void doFunction(FunctionT *fn, BranchProbabilityInfoT *bpi) {
292 Fn = fn;
293 BPI = bpi;
294
295 // Clear everything.
296 RPO.clear();
297 POT.clear();
298 LoopExitProb.clear();
299 Freqs.clear();
300
301 BlockT *EntryBlock = fn->begin();
302
303 std::copy(po_begin(EntryBlock), po_end(EntryBlock), std::back_inserter(POT));
304
305 unsigned RPOidx = 0;
306 for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) {
307 BlockT *BB = *I;
308 RPO[BB] = ++RPOidx;
309 DEBUG(dbgs() << "RPO[" << getBlockName(BB) << "] = " << RPO[BB] << "\n");
310 }
311
312 // Travel over all blocks in postorder.
313 for (pot_iterator I = pot_begin(), E = pot_end(); I != E; ++I) {
314 BlockT *BB = *I;
315 BlockT *LastTail = nullptr;
316 DEBUG(dbgs() << "POT: " << getBlockName(BB) << "\n");
317
318 for (typename GT::ChildIteratorType
319 PI = GraphTraits< Inverse >::child_begin(BB),
320 PE = GraphTraits< Inverse >::child_end(BB);
321 PI != PE; ++PI) {
322
323 BlockT *Pred = *PI;
324 if (isBackedge(Pred, BB) && (!LastTail || RPO[Pred] > RPO[LastTail]))
325 LastTail = Pred;
326 }
327
328 if (LastTail)
329 doLoop(BB, LastTail);
330 }
331
332 // At the end assume the whole function as a loop, and travel over it once
333 // again.
334 doLoop(*(rpot_begin()), *(pot_begin()));
1341335 }
1342336
1343337 public:
1344 const FunctionT *getFunction() const { return F; }
1345
1346 void doFunction(const FunctionT *F, const BranchProbabilityInfoT *BPI,
1347 const LoopInfoT *LI);
1348 BlockFrequencyInfoImpl() : BPI(0), LI(0), F(0) {}
1349
1350 using BlockFrequencyInfoImplBase::getEntryFreq;
338
339 uint64_t getEntryFreq() { return EntryFreq; }
340
341 /// getBlockFreq - Return block frequency. Return 0 if we don't have it.
1351342 BlockFrequency getBlockFreq(const BlockT *BB) const {
1352 return BlockFrequencyInfoImplBase::getBlockFreq(getNode(BB));
1353 }
1354 Float getFloatingBlockFreq(const BlockT *BB) const {
1355 return BlockFrequencyInfoImplBase::getFloatingBlockFreq(getNode(BB));
1356 }
1357
1358 /// \brief Print the frequencies for the current function.
1359 ///
1360 /// Prints the frequencies for the blocks in the current function.
1361 ///
1362 /// Blocks are printed in the natural iteration order of the function, rather
1363 /// than reverse post-order. This provides two advantages: writing -analyze
1364 /// tests is easier (since blocks come out in source order), and even
1365 /// unreachable blocks are printed.
1366 ///
1367 /// \a BlockFrequencyInfoImplBase::print() only knows reverse post-order, so
1368 /// we need to override it here.
1369 raw_ostream &print(raw_ostream &OS) const override;
1370 using BlockFrequencyInfoImplBase::dump;
1371
1372 using BlockFrequencyInfoImplBase::printBlockFreq;
1373 raw_ostream &printBlockFreq(raw_ostream &OS, const BlockT *BB) const {
1374 return BlockFrequencyInfoImplBase::printBlockFreq(OS, getNode(BB));
1375 }
343 typename DenseMap::const_iterator
344 I = Freqs.find(BB);
345 if (I != Freqs.end())
346 return I->second;
347 return 0;
348 }
349
350 void print(raw_ostream &OS) const {
351 OS << "\n\n---- Block Freqs ----\n";
352 for (typename FunctionT::iterator I = Fn->begin(), E = Fn->end(); I != E;) {
353 BlockT *BB = I++;
354 OS << " " << getBlockName(BB) << " = ";
355 printBlockFreq(OS, getBlockFreq(BB)) << "\n";
356
357 for (typename GraphTraits::ChildIteratorType
358 SI = GraphTraits::child_begin(BB),
359 SE = GraphTraits::child_end(BB); SI != SE; ++SI) {
360 BlockT *Succ = *SI;
361 OS << " " << getBlockName(BB) << " -> " << getBlockName(Succ)
362 << " = "; printBlockFreq(OS, getEdgeFreq(BB, Succ)) << "\n";
363 }
364 }
365 }
366
367 void dump() const {
368 print(dbgs());
369 }
370
371 // Utility method that looks up the block frequency associated with BB and
372 // prints it to OS.
373 raw_ostream &printBlockFreq(raw_ostream &OS,
374 const BlockT *BB) {
375 return printBlockFreq(OS, getBlockFreq(BB));
376 }
377
378 raw_ostream &printBlockFreq(raw_ostream &OS,
379 const BlockFrequency &Freq) const {
380 // Convert fixed-point number to decimal.
381 uint64_t Frequency = Freq.getFrequency();
382 OS << Frequency / EntryFreq << ".";
383 uint64_t Rem = Frequency % EntryFreq;
384 uint64_t Eps = 1;
385 do {
386 Rem *= 10;
387 Eps *= 10;
388 OS << Rem / EntryFreq;
389 Rem = Rem % EntryFreq;
390 } while (Rem >= Eps/2);
391 return OS;
392 }
393
1376394 };
1377395
1378 template
1379 void BlockFrequencyInfoImpl::doFunction(const FunctionT *F,
1380 const BranchProbabilityInfoT *BPI,
1381 const LoopInfoT *LI) {
1382 // Save the parameters.
1383 this->BPI = BPI;
1384 this->LI = LI;
1385 this->F = F;
1386
1387 // Clean up left-over data structures.
1388 BlockFrequencyInfoImplBase::clear();
1389 RPOT.clear();
1390 Nodes.clear();
1391
1392 // Initialize.
1393 DEBUG(dbgs() << "\nblock-frequency: " << F->getName() << "\n================="
1394 << std::string(F->getName().size(), '=') << "\n");
1395 initializeRPOT();
1396 initializeLoops();
1397
1398 // Visit loops in post-order to find thelocal mass distribution, and then do
1399 // the full function.
1400 computeMassInLoops();
1401 computeMassInFunction();
1402 finalizeMetrics();
1403396 }
1404397
1405 template void BlockFrequencyInfoImpl::initializeRPOT() {
1406 const BlockT *Entry = F->begin();
1407 RPOT.reserve(F->size());
1408 std::copy(po_begin(Entry), po_end(Entry), std::back_inserter(RPOT));
1409 std::reverse(RPOT.begin(), RPOT.end());
1410
1411 assert(RPOT.size() - 1 <= BlockNode::getMaxIndex() &&
1412 "More nodes in function than Block Frequency Info supports");
1413
1414 DEBUG(dbgs() << "reverse-post-order-traversal\n");
1415 for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) {
1416 BlockNode Node = getNode(I);
1417 DEBUG(dbgs() << " - " << getIndex(I) << ": " << getBlockName(Node) << "\n");
1418 Nodes[*I] = Node;
1419 }
1420
1421 Working.resize(RPOT.size());
1422 Freqs.resize(RPOT.size());
1423 }
1424
1425 template void BlockFrequencyInfoImpl::initializeLoops() {
1426 DEBUG(dbgs() << "loop-detection\n");
1427 if (LI->empty())
1428 return;
1429
1430 // Visit loops top down and assign them an index.
1431 std::deque Q;
1432 Q.insert(Q.end(), LI->begin(), LI->end());
1433 while (!Q.empty()) {
1434 const LoopT *Loop = Q.front();
1435 Q.pop_front();
1436 Q.insert(Q.end(), Loop->begin(), Loop->end());
1437
1438 // Save the order this loop was visited.
1439 BlockNode Header = getNode(Loop->getHeader());
1440 assert(Header.isValid());
1441
1442 Working[Header.Index].LoopIndex = PackagedLoops.size();
1443 PackagedLoops.emplace_back(Header);
1444 DEBUG(dbgs() << " - loop = " << getBlockName(Header) << "\n");
1445 }
1446
1447 // Visit nodes in reverse post-order and add them to their deepest containing
1448 // loop.
1449 for (size_t Index = 0; Index < RPOT.size(); ++Index) {
1450 const LoopT *Loop = LI->getLoopFor(RPOT[Index]);
1451 if (!Loop)
1452 continue;
1453
1454 // If this is a loop header, find its parent loop (if any).
1455 if (Working[Index].isLoopHeader())
1456 if (!(Loop = Loop->getParentLoop()))
1457 continue;
1458
1459 // Add this node to its containing loop's member list.
1460 BlockNode Header = getNode(Loop->getHeader());
1461 assert(Header.isValid());
1462 const auto &HeaderData = Working[Header.Index];
1463 assert(HeaderData.isLoopHeader());
1464
1465 Working[Index].ContainingLoop = Header;
1466 PackagedLoops[HeaderData.LoopIndex].Members.push_back(Index);
1467 DEBUG(dbgs() << " - loop = " << getBlockName(Header)
1468 << ": member = " << getBlockName(Index) << "\n");
1469 }
1470 }
1471
// Run computeMassInLoop() on every packaged loop.
// PackagedLoops is filled outer-loops-first by initializeLoops(), so walking
// it in reverse handles nested loops before the loops that contain them.
1472 template void BlockFrequencyInfoImpl::computeMassInLoops() {
1473 // Visit loops with the deepest first, and the top-level loops last.
1474 for (auto L = PackagedLoops.rbegin(), LE = PackagedLoops.rend(); L != LE; ++L)
1475 computeMassInLoop(L->Header);
1476 }
1477
// Distribute mass through a single loop, then compute its scale and package
// it so that it can be treated as one node by the enclosing scope.
1478 template
1479 void BlockFrequencyInfoImpl::computeMassInLoop(const BlockNode &LoopHead) {
1480 // Compute mass in loop.
1481 DEBUG(dbgs() << "compute-mass-in-loop: " << getBlockName(LoopHead) << "\n");
1482
// Seed the header with the full mass and push it to the header's successors
// before visiting the remaining members.
1483 Working[LoopHead.Index].Mass = BlockMass::getFull();
1484 propagateMassToSuccessors(LoopHead, LoopHead);
1485
1486 for (const BlockNode &M : getLoopPackage(LoopHead).Members)
1487 propagateMassToSuccessors(LoopHead, M);
1488
1489 computeLoopScale(LoopHead);
1490 packageLoop(LoopHead);
1491 }
1492
// Distribute mass through the function's top level: the entry block (index 0)
// starts with the full mass, and blocks are visited in reverse post-order.
1493 template void BlockFrequencyInfoImpl::computeMassInFunction() {
1494 // Compute mass in function.
1495 DEBUG(dbgs() << "compute-mass-in-function\n");
1496 assert(!Working.empty() && "no blocks in function");
1497 assert(!Working[0].isLoopHeader() && "entry block is a loop header");
1498
1499 Working[0].Mass = BlockMass::getFull();
1500 for (rpot_iterator I = rpot_begin(), IE = rpot_end(); I != IE; ++I) {
1501 // Check for nodes that have been packaged.
// Blocks that belong to a loop are skipped here.
1502 BlockNode Node = getNode(I);
1503 if (Working[Node.Index].hasLoopHeader())
1504 continue;
1505
// A default-constructed BlockNode as LoopHead means "not inside any loop".
1506 propagateMassToSuccessors(BlockNode(), Node);
1507 }
1508 }
1509
// Build the successor distribution for Node and hand Node's mass out along it.
// LoopHead is the header of the loop currently being processed; callers pass
// a default-constructed BlockNode when working at function level.
1510 template
1511 void
1512 BlockFrequencyInfoImpl::propagateMassToSuccessors(const BlockNode &LoopHead,
1513 const BlockNode &Node) {
1514 DEBUG(dbgs() << " - node: " << getBlockName(Node) << "\n");
1515 // Calculate probability for successors.
1516 Distribution Dist;
// A header of some other loop contributes that loop's recorded exits instead
// of its own CFG successors.
1517 if (Node != LoopHead && Working[Node.Index].isLoopHeader())
1518 addLoopSuccessorsToDist(LoopHead, Node, Dist);
1519 else {
1520 const BlockT *BB = getBlock(Node);
1521 for (auto SI = Successor::child_begin(BB), SE = Successor::child_end(BB);
1522 SI != SE; ++SI)
1523 // Pass the iterator SI itself (not the block *SI) to getEdgeWeight();
1524 // otherwise getEdgeWeight() is linear in the number of successors.
1525 addToDist(Dist, LoopHead, Node, getNode(*SI), BPI->getEdgeWeight(BB, SI));
1526 }
1527
1528 // Distribute mass to successors, saving exit and backedge data in the
1529 // loop header.
1530 distributeMass(Node, LoopHead, Dist);
1531 }
1532
// Print one line per block of F with its floating-point and integer
// frequency. Prints nothing when no function has been analyzed (F is null).
// Returns OS so calls can be chained.
1533 template
1534 raw_ostream &BlockFrequencyInfoImpl::print(raw_ostream &OS) const {
1535 if (!F)
1536 return OS;
1537 OS << "block-frequency-info: " << F->getName() << "\n";
1538 for (const BlockT &BB : *F)
1539 OS << " - " << bfi_detail::getBlockName(&BB)
1540 << ": float = " << getFloatingBlockFreq(&BB)
1541 << ", int = " << getBlockFreq(&BB).getFrequency() << "\n";
1542
1543 // Add an extra newline for readability.
1544 OS << "\n";
1545 return OS;
1546 }
1547 }
1548
1549398 #endif
1010 //
1111 //===----------------------------------------------------------------------===//
1212
13 #define DEBUG_TYPE "block-freq"
1413 #include "llvm/Analysis/BlockFrequencyInfo.h"
1514 #include "llvm/Analysis/BlockFrequencyInfoImpl.h"
1615 #include "llvm/Analysis/BranchProbabilityInfo.h"
106105 INITIALIZE_PASS_BEGIN(BlockFrequencyInfo, "block-freq",
107106 "Block Frequency Analysis", true, true)
108107 INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfo)
109 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
110108 INITIALIZE_PASS_END(BlockFrequencyInfo, "block-freq",
111109 "Block Frequency Analysis", true, true)
112110
121119
122120 void BlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const {
123121 AU.addRequired();
124 AU.addRequired();
125122 AU.setPreservesAll();
126123 }
127124
128125 bool BlockFrequencyInfo::runOnFunction(Function &F) {
129126 BranchProbabilityInfo &BPI = getAnalysis();
130 LoopInfo &LI = getAnalysis();
131127 if (!BFI)
132128 BFI.reset(new ImplType);
133 BFI->doFunction(&F, &BPI, &LI);
129 BFI->doFunction(&F, &BPI);
134130 #ifndef NDEBUG
135131 if (ViewBlockFreqPropagationDAG != GVDT_None)
136132 view();
161157 }
162158
163159 const Function *BlockFrequencyInfo::getFunction() const {
164 return BFI ? BFI->getFunction() : nullptr;
160 return BFI ? BFI->Fn : nullptr;
165161 }
166162
167163 raw_ostream &BlockFrequencyInfo::
+0
-932
lib/Analysis/BlockFrequencyInfoImpl.cpp less more
None //===- BlockFrequencyInfoImpl.cpp - Block Frequency Info Implementation ---===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Loops should be simplified before this analysis.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #define DEBUG_TYPE "block-freq"
14 #include "llvm/Analysis/BlockFrequencyInfoImpl.h"
15 #include "llvm/ADT/APFloat.h"
16 #include "llvm/Support/raw_ostream.h"
17 #include
18
19 using namespace llvm;
20
21 //===----------------------------------------------------------------------===//
22 //
23 // PositiveFloat implementation.
24 //
25 //===----------------------------------------------------------------------===//
26 #ifndef _MSC_VER
27 const int32_t PositiveFloatBase::MaxExponent;
28 const int32_t PositiveFloatBase::MinExponent;
29 #endif
30
/// Append the decimal digit \p D (which must be less than 10) to \p Str.
static void appendDigit(std::string &Str, unsigned D) {
  assert(D < 10);
  const char Digit = static_cast<char>('0' + D % 10);
  Str.push_back(Digit);
}
35
36 static void appendNumber(std::string &Str, uint64_t N) {
37 while (N) {
38 appendDigit(Str, N % 10);
39 N /= 10;
40 }
41 }
42
/// Return true when \p Digit is one of the characters '5' through '9', i.e.
/// a first truncated digit that rounds the preceding digit up.
static bool doesRoundUp(char Digit) {
  return Digit >= '5' && Digit <= '9';
}
55
// Format the value D * 2^E by packing it into 80 raw bits, constructing an
// APFloat with x87DoubleExtended semantics from them, and calling
// APFloat::toString with the requested Precision.
// NOTE(review): the SmallVector declaration below has lost its template
// arguments in extraction -- confirm against the original file.
56 static std::string toStringAPFloat(uint64_t D, int E, unsigned Precision) {
57 assert(E >= PositiveFloatBase::MinExponent);
58 assert(E <= PositiveFloatBase::MaxExponent);
59
60 // Find a new E, but don't let it increase past MaxExponent.
// Normalizing means shifting D left until its top bit is set, and adjusting
// the exponent to match.
61 int LeadingZeros = PositiveFloatBase::countLeadingZeros64(D);
62 int NewE = std::min(PositiveFloatBase::MaxExponent, E + 63 - LeadingZeros);
63 int Shift = 63 - (NewE - E);
64 assert(Shift <= LeadingZeros);
65 assert(Shift == LeadingZeros || NewE == PositiveFloatBase::MaxExponent);
66 D <<= Shift;
67 E = NewE;
68
69 // Check for a denormal.
// 16383 is added as the exponent bias. A zero biased exponent is stored when
// the top bit of D is still clear after the shift above.
// NOTE(review): the assert ties that case to MaxExponent rather than
// MinExponent -- confirm this is intended.
70 unsigned AdjustedE = E + 16383;
71 if (!(D >> 63)) {
72 assert(E == PositiveFloatBase::MaxExponent);
73 AdjustedE = 0;
74 }
75
76 // Build the float and print it.
// RawBits[0] holds the 64 significand bits; RawBits[1] holds the exponent.
77 uint64_t RawBits[2] = {D, AdjustedE};
78 APFloat Float(APFloat::x87DoubleExtended, APInt(80, RawBits));
79 SmallVector Chars;
80 Float.toString(Chars, Precision, 0);
81 return std::string(Chars.begin(), Chars.end());
82 }
83
/// Remove trailing '0' characters from a decimal string, keeping one digit
/// after the decimal point (so "2.000" becomes "2.0", not "2.").
///
/// \param Float a decimal string; callers pass strings containing a '.'.
/// \returns the shortened copy. A string with nothing but '0' characters (or
/// an empty string) is returned unchanged when asserts are disabled.
static std::string stripTrailingZeros(const std::string &Float) {
  size_t NonZero = Float.find_last_not_of('0');
  assert(NonZero != std::string::npos && "no . in floating point string");

  // The assert vanishes in release builds; indexing Float[npos] below would
  // then read out of bounds, so bail out explicitly.
  if (NonZero == std::string::npos)
    return Float;

  // Keep one digit after the decimal point.
  if (Float[NonZero] == '.')
    ++NonZero;

  return Float.substr(0, NonZero + 1);
}
93
// Render the value D * 2^E as a decimal string.
//
// D is the digit word and E the binary exponent. Width is used only to
// derive the initial error bound (1 << (64 - Width)). Precision limits the
// number of significant digits emitted; 0 requests maximum precision.
// A zero D yields "0.0". Values with no bits in the directly handled range
// are delegated to toStringAPFloat().
94 std::string PositiveFloatBase::toString(uint64_t D, int16_t E, int Width,
95 unsigned Precision) {
96 if (!D)
97 return "0.0";
98
99 // Canonicalize exponent and digits.
// Above0 receives the integer part, Below0 the first 64 fractional bits, and
// Extra the fractional bits that do not fit into Below0.
100 uint64_t Above0 = 0;
101 uint64_t Below0 = 0;
102 uint64_t Extra = 0;
103 int ExtraShift = 0;
104 if (E == 0) {
105 Above0 = D;
106 } else if (E > 0) {
107 if (int Shift = std::min(int16_t(countLeadingZeros64(D)), E)) {
108 D <<= Shift;
109 E -= Shift;
110
111 if (!E)
112 Above0 = D;
113 }
114 } else if (E > -64) {
115 Above0 = D >> -E;
116 Below0 = D << (64 + E);
117 } else if (E > -120) {
118 Below0 = D >> (-E - 64);
119 Extra = D << (128 + E);
120 ExtraShift = -64 - E;
121 }
122
123 // Fall back on APFloat for very small and very large numbers.
124 if (!Above0 && !Below0)
125 return toStringAPFloat(D, E, Precision);
126
127 // Append the digits before the decimal.
// appendNumber() emits the least significant digit first, hence the reverse.
128 std::string Str;
129 size_t DigitsOut = 0;
130 if (Above0) {
131 appendNumber(Str, Above0);
132 DigitsOut = Str.size();
133 } else
134 appendDigit(Str, 0);
135 std::reverse(Str.begin(), Str.end());
136
137 // Return early if there's nothing after the decimal.
138 if (!Below0)
139 return Str + ".0";
140
141 // Append the decimal and beyond.
142 Str += '.';
143 uint64_t Error = UINT64_C(1) << (64 - Width);
144
145 // We need to shift Below0 to the right to make space for calculating
146 // digits. Save the precision we're losing in Extra.
147 Extra = (Below0 & 0xf) << 56 | (Extra >> 8);
148 Below0 >>= 4;
149 size_t SinceDot = 0;
150 size_t AfterDot = Str.size();
// Each iteration multiplies the fraction by 10; the next decimal digit then
// sits in the top four bits of Below0. Leading zeros after the point do not
// count towards DigitsOut until a non-zero digit has been emitted.
151 do {
152 if (ExtraShift) {
153 --ExtraShift;
154 Error *= 5;
155 } else
156 Error *= 10;
157
158 Below0 *= 10;
159 Extra *= 10;
160 Below0 += (Extra >> 60);
161 Extra = Extra & (UINT64_MAX >> 4);
162 appendDigit(Str, Below0 >> 60);
163 Below0 = Below0 & (UINT64_MAX >> 4);
164 if (DigitsOut || Str.back() != '0')
165 ++DigitsOut;
166 ++SinceDot;
167 } while (Error && (Below0 << 4 | Extra >> 60) >= Error / 2 &&
168 (!Precision || DigitsOut <= Precision || SinceDot < 2));
169
170 // Return early for maximum precision.
171 if (!Precision || DigitsOut <= Precision)
172 return stripTrailingZeros(Str);
173
174 // Find where to truncate.
// At least one digit after the decimal point is always kept.
175 size_t Truncate =
176 std::max(Str.size() - (DigitsOut - Precision), AfterDot + 1);
177
178 // Check if there's anything to truncate.
179 if (Truncate >= Str.size())
180 return stripTrailingZeros(Str);
181
182 bool Carry = doesRoundUp(Str[Truncate]);
183 if (!Carry)
184 return stripTrailingZeros(Str.substr(0, Truncate));
185
186 // Round with the first truncated digit.
// Walk backwards from the last kept digit, turning '9' into '0' until a digit
// absorbs the carry. Note that E here shadows the exponent parameter.
187 for (std::string::reverse_iterator I(Str.begin() + Truncate), E = Str.rend();
188 I != E; ++I) {
189 if (*I == '.')
190 continue;
191 if (*I == '9') {
192 *I = '0';
193 continue;
194 }
195
196 ++*I;
197 Carry = false;
198 break;
199 }
200
201 // Add "1" in front if we still need to carry.
// std::string(Carry, '1') is "1" when Carry is true and empty otherwise.
202 return stripTrailingZeros(std::string(Carry, '1') + Str.substr(0, Truncate));
203 }
204
205 raw_ostream &PositiveFloatBase::print(raw_ostream &OS, uint64_t D, int16_t E,
206 int Width, unsigned Precision) {
207 return OS << toString(D, E, Width, Precision);
208 }
209
210 void PositiveFloatBase::dump(uint64_t D, int16_t E, int Width) {
211 print(dbgs(), D, E, Width, 0) << "[" << Width << ":" << D << "*2^" << E
212 << "]";
213 }
214
/// Apply an optional round-up to a (digits, shift) pair.
///
/// \param N the digits.
/// \param ShouldRound whether to add one to \p N.
/// \param Shift the binary exponent that accompanies \p N.
/// \returns (N, Shift), or (N + 1, Shift) when rounding. If the increment
/// wraps N to zero, the result is (1, Shift + 64).
static std::pair<uint64_t, int16_t>
getRoundedFloat(uint64_t N, bool ShouldRound, int64_t Shift) {
  if (ShouldRound && !++N)
    // Rounding caused an overflow.
    return std::make_pair(UINT64_C(1), Shift + 64);
  return std::make_pair(N, Shift);
}
223
224 std::pair PositiveFloatBase::divide64(uint64_t Dividend,
225 uint64_t Divisor) {
226 // Input should be sanitized.
227 assert(Divisor);
228 assert(Dividend);
229
230 // Minimize size of divisor.
231 int16_t Shift = 0;
232 if (int Zeros = countTrailingZeros(Divisor)) {
233 Shift -= Zeros;
234 Divisor >>= Zeros;
235 }
236
237 // Check for powers of two.
238 if (Divisor == 1)
239 return std::make_pair(Dividend, Shift);
240
241 // Maximize size of dividend.
242 if (int Zeros = countLeadingZeros64(Dividend)) {
243 Shift -= Zeros;
244 Dividend <<= Zeros;
245 }
246
247 // Start with the result of a divide.
248 uint64_t Quotient = Dividend / Divisor;
249 Dividend %= Divisor;
250
251 // Continue building the quotient with long division.
252 //
253 // TODO: continue with largers digits.
254 while (!(Quotient >> 63) && Dividend) {
255 // Shift Dividend, and check for overflow.
256 bool IsOverflow = Dividend >> 63;
257 Dividend <<= 1;
258 --Shift;
259
260 // Divide.
261 bool DoesDivide = IsOverflow || Divisor <= Dividend;
262 Quotient = (Quotient << 1) | uint64_t(DoesDivide);
263 Dividend -= DoesDivide ? Divisor : 0;
264 }
265
266 // Round.
267 if (Dividend >= getHalf(Divisor))
268 if (!++Quotient)
269 // Rounding caused an overflow in Quotient.
270 return std::make_pair(UINT64_C(1), Shift + 64);
271
272 return getRoundedFloat(Quotient, Dividend >= getHalf(Divisor), Shift);
273 }
274
// Multiply two 64-bit integers and return the product as a (digits, shift)
// pair: the 128-bit product is formed from 32-bit halves, and when it does
// not fit in 64 bits the top 64 significant bits are kept and rounded.
// NOTE(review): the std::pair return type has lost its template arguments in
// extraction -- confirm against the header.
275 std::pair PositiveFloatBase::multiply64(uint64_t L,
276 uint64_t R) {
277 // Separate into two 32-bit digits (U.L).
278 uint64_t UL = L >> 32, LL = L & UINT32_MAX, UR = R >> 32, LR = R & UINT32_MAX;
279
280 // Compute cross products.
281 uint64_t P1 = UL * UR, P2 = UL * LR, P3 = LL * UR, P4 = LL * LR;
282
283 // Sum into two 64-bit digits.
// Each cross product N contributes N << 32 to the 128-bit sum: its low half
// lands in Lower (with a carry detected by wrap-around), its high half in
// Upper.
284 uint64_t Upper = P1, Lower = P4;
285 auto addWithCarry = [&](uint64_t N) {
286 uint64_t NewLower = Lower + (N << 32);
287 Upper += (N >> 32) + (NewLower < Lower);
288 Lower = NewLower;
289 };
290 addWithCarry(P2);
291 addWithCarry(P3);
292
293 // Check whether the upper digit is empty.
294 if (!Upper)
295 return std::make_pair(Lower, 0);
296
297 // Shift as little as possible to maximize precision.
// The LeadingZeros guard avoids shifting Lower by 64 bits when Upper already
// has its top bit set.
298 unsigned LeadingZeros = countLeadingZeros64(Upper);
299 int16_t Shift = 64 - LeadingZeros;
300 if (LeadingZeros)
301 Upper = Upper << LeadingZeros | Lower >> Shift;
// Round on the highest bit of Lower that was dropped.
302 bool ShouldRound = Shift && (Lower & UINT64_C(1) << (Shift - 1));
303 return getRoundedFloat(Upper, ShouldRound, Shift);
304 }
305
306 //===----------------------------------------------------------------------===//
307 //
308 // BlockMass implementation.
309 //
310 //===----------------------------------------------------------------------===//
// Scale this mass by the branch probability P = N / D (which must not exceed
// 1), using 64-bit arithmetic only. Returns *this.
311 BlockMass &BlockMass::operator*=(const BranchProbability &P) {
312 uint32_t N = P.getNumerator(), D = P.getDenominator();
313 assert(D && "divide by 0");
314 assert(N <= D && "fraction greater than 1");
315
316 // Fast path for multiplying by 1.0.
// A zero mass also stays unchanged.
317 if (!Mass || N == D)
318 return *this;
319
320 // Get as much precision as we can.
// Shift the mass fully left before dividing by D so the quotient keeps as
// many bits as possible; the shift is undone after multiplying by N.
321 int Shift = countLeadingZeros(Mass);
322 uint64_t ShiftedQuotient = (Mass << Shift) / D;
323 uint64_t Product = ShiftedQuotient * N >> Shift;
324
325 // Now check for what's lost.
// Left is the complementary share (D - N) / D; whatever neither share covers
// is truncation error.
326 uint64_t Left = ShiftedQuotient * (D - N) >> Shift;
327 uint64_t Lost = Mass - Product - Left;
328
329 // TODO: prove this assertion.
330 assert(Lost <= UINT32_MAX);
331
332 // Take the product plus a portion of the spoils.
333 Mass = Product + Lost * N / D;
334 return *this;
335 }
336
// Convert the mass to a PositiveFloat. A full mass becomes (1, 0); any other
// mass becomes (getMass() + 1, -64).
// NOTE(review): PositiveFloat has lost its template arguments in extraction.
337 PositiveFloat BlockMass::toFloat() const {
// Handled separately so the "+ 1" below is never applied to a full mass.
338 if (isFull())
339 return PositiveFloat(1, 0);
340 return PositiveFloat(getMass() + 1, -64);
341 }
342
// Debugging aid: print this mass to the debug stream.
343 void BlockMass::dump() const { print(dbgs()); }
344
/// Map a value in [0, 15] to its lowercase hexadecimal character.
static char getHexDigit(int N) {
  assert(N < 16);
  const int Base = N < 10 ? '0' : 'a' - 10;
  return static_cast<char>(Base + N);
}
351 raw_ostream &BlockMass::print(raw_ostream &OS) const {
352 for (int Digits = 0; Digits < 16; ++Digits)
353 OS << getHexDigit(Mass >> (60 - Digits * 4) & 0xf);
354 return OS;
355 }
356
357 //===----------------------------------------------------------------------===//
358 //
359 // BlockFrequencyInfoImpl implementation.
360 //
361 //===----------------------------------------------------------------------===//
// File-local shorthand for types nested in BlockFrequencyInfoImplBase, plus
// the helper that splits a mass across a weighted distribution.
362 namespace {
363
364 typedef BlockFrequencyInfoImplBase::BlockNode BlockNode;
365 typedef BlockFrequencyInfoImplBase::Distribution Distribution;
366 typedef BlockFrequencyInfoImplBase::Distribution::WeightList WeightList;
367 typedef BlockFrequencyInfoImplBase::Float Float;
368 typedef BlockFrequencyInfoImplBase::PackagedLoopData PackagedLoopData;
369 typedef BlockFrequencyInfoImplBase::Weight Weight;
370 typedef BlockFrequencyInfoImplBase::FrequencyData FrequencyData;
371
372 /// \brief Dithering mass distributer.
373 ///
374 /// This class splits up a single mass into portions by weight, dithering to
375 /// spread out error. No mass is lost. The dithering precision depends on the
376 /// precision of the product of \a BlockMass and \a BranchProbability.
377 ///
378 /// The distribution algorithm follows.
379 ///
380 /// 1. Initialize by saving the sum of the weights in \a RemWeight and the
381 /// mass to distribute in \a RemMass.
382 ///
383 /// 2. For each portion:
384 ///
385 /// 1. Construct a branch probability, P, as the portion's weight divided
386 /// by the current value of \a RemWeight.
387 /// 2. Calculate the portion's mass as \a RemMass times P.
388 /// 3. Update \a RemWeight and \a RemMass at each portion by subtracting
389 /// the current portion's weight and mass.
390 ///
391 /// Mass is distributed in two ways: full distribution and forward
392 /// distribution. The latter ignores backedges, and uses the parallel fields
393 /// \a RemForwardWeight and \a RemForwardMass.
394 struct DitheringDistributer {
// Weight not yet handed out, for the full and the forward distribution.
395 uint32_t RemWeight;
396 uint32_t RemForwardWeight;
397
// Mass not yet handed out, for the full and the forward distribution.
398 BlockMass RemMass;
399 BlockMass RemForwardMass;
400
// Normalizes Dist, then primes the remaining weights and masses from it.
401 DitheringDistributer(Distribution &Dist, const BlockMass &Mass);
402
// Local edges consume from both pools and receive the forward portion.
403 BlockMass takeLocalMass(uint32_t Weight) {
404 (void)takeMass(Weight);
405 return takeForwardMass(Weight);
406 }
// Exit edges consume from both pools and receive the full portion.
407 BlockMass takeExitMass(uint32_t Weight) {
408 (void)takeForwardMass(Weight);
409 return takeMass(Weight);
410 }
// Backedges consume from the full pool only.
411 BlockMass takeBackedgeMass(uint32_t Weight) { return takeMass(Weight); }
412
413 private:
414 BlockMass takeForwardMass(uint32_t Weight);
415 BlockMass takeMass(uint32_t Weight);
416 };
417 }
418
419 DitheringDistributer::DitheringDistributer(Distribution &Dist,
420 const BlockMass &Mass) {
421 Dist.normalize();
422 RemWeight = Dist.Total;
423 RemForwardWeight = Dist.ForwardTotal;
424 RemMass = Mass;
425 RemForwardMass = Dist.ForwardTotal ? Mass : BlockMass();
426 }
427
428 BlockMass DitheringDistributer::takeForwardMass(uint32_t Weight) {
429 // Compute the amount of mass to take.
430 assert(Weight && "invalid weight");
431 assert(Weight <= RemForwardWeight);
432 BlockMass Mass = RemForwardMass * BranchProbability(Weight, RemForwardWeight);
433
434 // Decrement totals (dither).
435 RemForwardWeight -= Weight;
436 RemForwardMass -= Mass;
437 return Mass;
438 }
439 BlockMass DitheringDistributer::takeMass(uint32_t Weight) {
440 assert(Weight && "invalid weight");
441 assert(Weight <= RemWeight);
442 BlockMass Mass = RemMass * BranchProbability(Weight, RemWeight);
443
444 // Decrement totals (dither).
445 RemWeight -= Weight;
446 RemMass -= Mass;
447 return Mass;
448 }
449
450 void Distribution::add(const BlockNode &Node, uint64_t Amount,
451 Weight::DistType Type) {
452 assert(Amount && "invalid weight of 0");
453 uint64_t NewTotal = Total + Amount;
454
455 // Check for overflow. It should be impossible to overflow twice.
456 bool IsOverflow = NewTotal < Total;
457 assert(!(DidOverflow && IsOverflow) && "unexpected repeated overflow");
458 DidOverflow |= IsOverflow;
459
460 // Update the total.
461 Total = NewTotal;
462
463 // Save the weight.
464 Weight W;
465 W.TargetNode = Node;
466 W.Amount = Amount;
467 W.Type = Type;
468 Weights.push_back(W);
469
470 if (Type == Weight::Backedge)
471 return;
472
473 // Update forward total. Don't worry about overflow here, since then Total
474 // will exceed 32-bits and they'll both be recomputed in normalize().
475 ForwardTotal += Amount;
476 }
477
478 static void combineWeight(Weight &W, const Weight &OtherW) {
479 assert(OtherW.TargetNode.isValid());
480 if (!W.Amount) {
481 W = OtherW;
482 return;
483 }
484 assert(W.Type == OtherW.Type);
485 assert(W.TargetNode == OtherW.TargetNode);
486 assert(W.Amount < W.Amount + OtherW.Amount);
487 W.Amount += OtherW.Amount;
488 }
489 static void combineWeightsBySorting(WeightList &Weights) {
490 // Sort so edges to the same node are adjacent.
491 std::sort(Weights.begin(), Weights.end(),
492 [](const Weight &L,
493 const Weight &R) { return L.TargetNode < R.TargetNode; });
494
495 // Combine adjacent edges.
496 WeightList::iterator O = Weights.begin();
497 for (WeightList::const_iterator I = O, L = O, E = Weights.end(); I != E;
498 ++O, (I = L)) {
499 *O = *I;
500
501 // Find the adjacent weights to the same node.
502 for (++L; L != E && I->TargetNode == L->TargetNode; ++L)
503 combineWeight(*O, *L);
504 }
505
506 // Erase extra entries.
507 Weights.erase(O, Weights.end());
508 return;
509 }
// Merge all weights that target the same node using a hash table keyed by the
// target's index. Leaves Weights untouched when nothing was merged.
// NOTE(review): the DenseMap typedef has lost its template arguments in
// extraction -- confirm the key/value types against the original file.
510 static void combineWeightsByHashing(WeightList &Weights) {
511 // Collect weights into a DenseMap.
512 typedef DenseMap HashTable;
513 HashTable Combined(NextPowerOf2(2 * Weights.size()));
// operator[] creates an empty entry on first use, which combineWeight() then
// overwrites with W.
514 for (const Weight &W : Weights)
515 combineWeight(Combined[W.TargetNode.Index], W);
516
517 // Check whether anything changed.
518 if (Weights.size() == Combined.size())
519 return;
520
521 // Fill in the new weights.
522 Weights.clear();
523 Weights.reserve(Combined.size());
524 for (const auto &I : Combined)
525 Weights.push_back(I.second);
526 }
527 static void combineWeights(WeightList &Weights) {
528 // Use a hash table for many successors to keep this linear.
529 if (Weights.size() > 128) {
530 combineWeightsByHashing(Weights);
531 return;
532 }
533
534 combineWeightsBySorting(Weights);
535 }
/// Shift \p N right by \p Shift bits (0 <= Shift < 64), rounding up when the
/// highest discarded bit is set.
static uint64_t shiftRightAndRound(uint64_t N, int Shift) {
  assert(Shift >= 0);
  assert(Shift < 64);
  if (Shift == 0)
    return N;

  const uint64_t Truncated = N >> Shift;
  const uint64_t RoundBit = (N >> (Shift - 1)) & UINT64_C(1);
  return Truncated + RoundBit;
}
// Bring the distribution into canonical form: merge duplicate targets and, if
// the total weight does not fit into 32 bits, scale every weight down
// (keeping each at least 1) and recompute Total and ForwardTotal.
543 void Distribution::normalize() {
544 // Early exit for termination nodes.
545 if (Weights.empty())
546 return;
547
548 // Only bother if there are multiple successors.
549 if (Weights.size() > 1)
550 combineWeights(Weights);
551
552 // Early exit when combined into a single successor.
// A lone successor receives everything, so its weight is canonicalized to 1.
553 if (Weights.size() == 1) {
554 Total = 1;
555 ForwardTotal = Weights.front().Type != Weight::Backedge;
556 Weights.front().Amount = 1;
557 return;
558 }
559
560 // Determine how much to shift right so that the total fits into 32-bits.
561 //
562 // If we shift at all, shift by 1 extra. Otherwise, the lower limit of 1
563 // for each weight can cause a 32-bit overflow.
564 int Shift = 0;
565 if (DidOverflow)
566 Shift = 33;
567 else if (Total > UINT32_MAX)
568 Shift = 33 - countLeadingZeros(Total);
569
570 // Early exit if nothing needs to be scaled.
571 if (!Shift)
572 return;
573
574 // Recompute the total through accumulation (rather than shifting it) so that
575 // it's accurate after shifting. ForwardTotal is dirty here anyway.
576 Total = 0;
577 ForwardTotal = 0;
578
579 // Sum the weights to each node and shift right if necessary.
580 for (Weight &W : Weights) {
581 // Scale down below UINT32_MAX. Since Shift is larger than necessary, we
582 // can round here without concern about overflow.
583 assert(W.TargetNode.isValid());
584 W.Amount = std::max(UINT64_C(1), shiftRightAndRound(W.Amount, Shift));
585 assert(W.Amount <= UINT32_MAX);
586
587 // Update the total.
588 Total += W.Amount;
589 if (W.Type == Weight::Backedge)
590 continue;
591
592 // Update the forward total.
593 ForwardTotal += W.Amount;
594 }
595 assert(Total <= UINT32_MAX);
596 }
597
// Reset every member by assigning a freshly value-initialized base object.
598 void BlockFrequencyInfoImplBase::clear() {
599 *this = BlockFrequencyInfoImplBase();
600 }
601
602 /// \brief Clear all memory not needed downstream.
603 ///
604 /// Releases all memory not used downstream. In particular, saves Freqs.
605 static void cleanup(BlockFrequencyInfoImplBase &BFI) {
606 std::vector SavedFreqs(std::move(BFI.Freqs));
607 BFI.clear();
608 BFI.Freqs = std::move(SavedFreqs);
609 }
610
611 /// \brief Get a possibly packaged node.
612 ///
613 /// Get the node currently representing Node, which could be a containing
614 /// loop.
615 ///
616 /// This function should only be called when distributing mass. As long as
617 /// there are no irreducilbe edges to Node, then it will have complexity O(1)
618 /// in this context.
619 ///
620 /// In general, the complexity is O(L), where L is the number of loop headers
621 /// Node has been packaged into. Since this method is called in the context
622 /// of distributing mass, L will be the number of loop headers an early exit
623 /// edge jumps out of.
624 static BlockNode getPackagedNode(const BlockFrequencyInfoImplBase &BFI,
625 const BlockNode &Node) {
626 assert(Node.isValid());
627 if (!BFI.Working[Node.Index].IsPackaged)
628 return Node;
629 if (!BFI.Working[Node.Index].ContainingLoop.isValid())
630 return Node;
631 return getPackagedNode(BFI, BFI.Working[Node.Index].ContainingLoop);
632 }
633
634 /// \brief Get the appropriate mass for a possible pseudo-node loop package.
635 ///
636 /// Get appropriate mass for Node. If Node is a loop-header (whose loop has
637 /// been packaged), returns the mass of its pseudo-node. If it's a node inside
638 /// a packaged loop, it returns the loop's pseudo-node.
639 static BlockMass &getPackageMass(BlockFrequencyInfoImplBase &BFI,
640 const BlockNode &Node) {
641 assert(Node.isValid());
642 assert(!BFI.Working[Node.Index].IsPackaged);
643 if (!BFI.Working[Node.Index].IsAPackage)
644 return BFI.Working[Node.Index].Mass;
645
646 return BFI.getLoopPackage(Node).Mass;
647 }
648
// Classify the edge Pred -> Succ relative to the loop headed by LoopHead and
// record it in Dist as a backedge, an exit or a local edge. Edges judged to
// be irreducible backedges at function level are dropped.
649 void BlockFrequencyInfoImplBase::addToDist(Distribution &Dist,
650 const BlockNode &LoopHead,
651 const BlockNode &Pred,
652 const BlockNode &Succ,
653 uint64_t Weight) {
// Distribution::add() rejects zero weights, so promote them to 1.
654 if (!Weight)
655 Weight = 1;
656
657 #ifndef NDEBUG
658 auto debugSuccessor = [&](const char *Type, const BlockNode &Resolved) {
659 dbgs() << " =>"
660 << " [" << Type << "] weight = " << Weight;
661 if (Succ != LoopHead)
662 dbgs() << ", succ = " << getBlockName(Succ);
663 if (Resolved != Succ)
664 dbgs() << ", resolved = " << getBlockName(Resolved);
665 dbgs() << "\n";
666 };
667 (void)debugSuccessor;
668 #endif
669
// An edge straight back to the header is a backedge.
670 if (Succ == LoopHead) {
671 DEBUG(debugSuccessor("backedge", Succ));
672 Dist.addBackedge(LoopHead, Weight);
673 return;
674 }
// Replace Succ by the package that currently represents it, if any.
675 BlockNode Resolved = getPackagedNode(*this, Succ);
676 assert(Resolved != LoopHead);
677
// A target whose containing loop is not LoopHead lies outside this loop.
678 if (Working[Resolved.Index].ContainingLoop != LoopHead) {
679 DEBUG(debugSuccessor(" exit ", Resolved));
680 Dist.addExit(Resolved, Weight);
681 return;
682 }
683
684 if (!LoopHead.isValid() && Resolved < Pred) {
685 // Irreducible backedge. Skip this edge in the distribution.
686 DEBUG(debugSuccessor("skipped ", Resolved));
687 return;
688 }
689
690 DEBUG(debugSuccessor(" local ", Resolved));
691 Dist.addLocal(Resolved, Weight);
692 }
693
694 void BlockFrequencyInfoImplBase::addLoopSuccessorsToDist(
695 const BlockNode &LoopHead, const BlockNode &LocalLoopHead,
696 Distribution &Dist) {
697 PackagedLoopData &LoopPackage = getLoopPackage(LocalLoopHead);
698 const PackagedLoopData::ExitMap &Exits = LoopPackage.Exits;
699
700 // Copy the exit map into Dist.
701 for (const auto &I : Exits)
702 addToDist(Dist, LoopHead, LocalLoopHead, I.first, I.second.getMass());
703
704 // We don't need this map any more. Clear it to prevent quadratic memory
705 // usage in deeply nested loops with irreducible control flow.
706 LoopPackage.Exits.clear();
707 }
708
709 /// \brief Get the maximum allowed loop scale.
710 ///
711 /// Gives the maximum number of estimated iterations allowed for a loop.
712 /// Downstream users have trouble with very large numbers (even within
713 /// 64-bits). Perhaps they can be changed to use PositiveFloat.
714 ///
715 /// TODO: change downstream users so that this can be increased or removed.
// NOTE(review): Float(1, 12) is presumably 1 * 2^12 = 4096 (digits, binary
// exponent) -- confirm against the PositiveFloat constructor.
716 static Float getMaxLoopScale() { return Float(1, 12); }
717
718 /// \brief Compute the loop scale for a loop.
719 void BlockFrequencyInfoImplBase::computeLoopScale(const BlockNode &LoopHead) {
720 // Compute loop scale.
721 DEBUG(dbgs() << "compute-loop-scale: " << getBlockName(LoopHead) << "\n");
722
723 // LoopScale == 1 / ExitMass
724 // ExitMass == HeadMass - BackedgeMass
725 PackagedLoopData &LoopPackage = getLoopPackage(LoopHead);
726 BlockMass ExitMass = BlockMass::getFull() - LoopPackage.BackedgeMass;
727
728 // Block scale stores the inverse of the scale.
729 LoopPackage.Scale = ExitMass.toFloat().inverse();
730
731 DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull()
732 << " - " << LoopPackage.BackedgeMass << ")\n"
733 << " - scale = " << LoopPackage.Scale << "\n");
734
735 if (LoopPackage.Scale > getMaxLoopScale()) {
736 LoopPackage.Scale = getMaxLoopScale();
737 DEBUG(dbgs() << " - reduced-to-max-scale: " << getMaxLoopScale() << "\n");
738 }
739 }
740
741 /// \brief Package up a loop.
742 void BlockFrequencyInfoImplBase::packageLoop(const BlockNode &LoopHead) {
743 DEBUG(dbgs() << "packaging-loop: " << getBlockName(LoopHead) << "\n");
744 Working[LoopHead.Index].IsAPackage = true;
745 for (const BlockNode &M : getLoopPackage(LoopHead).Members) {
746 DEBUG(dbgs() << " - node: " << getBlockName(M.Index) << "\n");
747 Working[M.Index].IsPackaged = true;
748 }
749 }
750
751 void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source,
752 const BlockNode &LoopHead,
753 Distribution &Dist) {
754 BlockMass Mass = getPackageMass(*this, Source);
755 DEBUG(dbgs() << " => mass: " << Mass
756 << " ( general | forward )\n");
757
758 // Distribute mass to successors as laid out in Dist.
759 DitheringDistributer D(Dist, Mass);
760
761 #ifndef NDEBUG
762 auto debugAssign = [&](const BlockNode &T, const BlockMass &M,
763 const char *Desc) {
764 dbgs() << " => assign " << M << " (" << D.RemMass << "|"
765 << D.RemForwardMass << ")";
766 if (Desc)
767 dbgs() << " [" << Desc << "]";
768 if (T.isValid())
769 dbgs() << " to " << getBlockName(T);
770 dbgs() << "\n";
771 };
772 (void)debugAssign;
773 #endif
774
775 PackagedLoopData *LoopPackage = 0;
776 if (LoopHead.isValid())
777 LoopPackage = &getLoopPackage(LoopHead);
778 for (const Weight &W : Dist.Weights) {
779 // Check for a local edge (forward and non-exit).
780 if (W.Type == Weight::Local) {
781 BlockMass Local = D.takeLocalMass(W.Amount);
782 getPackageMass(*this, W.TargetNode) += Local;
783 DEBUG(debugAssign(W.TargetNode, Local, nullptr));
784 continue;
785 }
786
787 // Backedges and exits only make sense if we're processing a loop.
788 assert(LoopPackage && "backedge or exit outside of loop");
789
790 // Check for a backedge.
791 if (W.Type == Weight::Backedge) {
792 BlockMass Back = D.takeBackedgeMass(W.Amount);
793 LoopPackage->BackedgeMass += Back;
794 DEBUG(debugAssign(BlockNode(), Back, "back"));
795 continue;
796 }
797
798 // This must be an exit.
799 assert(W.Type == Weight::Exit);
800 BlockMass Exit = D.takeExitMass(W.Amount);
801 LoopPackage->Exits.push_back(std::make_pair(W.TargetNode, Exit));
802 DEBUG(debugAssign(W.TargetNode, Exit, "exit"));
803 }
804 }
805
// Fill in the integer frequency of every block from its floating-point
// frequency. Min and Max are the smallest and largest floating-point
// frequencies; every integer result is at least 1.
806 static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI,
807 const Float &Min, const Float &Max) {
808 // Scale the Factor to a size that creates integers. Ideally, integers would
809 // be scaled so that Max == UINT64_MAX so that they can be best
810 // differentiated. However, the register allocator currently deals poorly
811 // with large numbers. Instead, push Min up a little from 1 to give some
812 // room to differentiate small, unequal numbers.
813 //
814 // TODO: fix issues downstream so that ScalingFactor can be Float(1,64)/Max.
// Scaling by 1 / Min maps the smallest frequency to 1; the extra shift by 3
// is applied only while lg(Max / Min) stays below 60.
815 Float ScalingFactor = Min.inverse();
816 if ((Max / Min).lg() < 60)
817 ScalingFactor <<= 3;
818
819 // Translate the floats to integers.
820 DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max
821 << ", factor = " << ScalingFactor << "\n");
822 for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) {
823 Float Scaled = BFI.Freqs[Index].Floating * ScalingFactor;
824 BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt());
825 DEBUG(dbgs() << " - " << BFI.getBlockName(Index) << ": float = "
826 << BFI.Freqs[Index].Floating << ", scaled = " << Scaled
827 << ", int = " << BFI.Freqs[Index].Integer << "\n");
828 }
829 }
830
831 static void scaleBlockData(BlockFrequencyInfoImplBase &BFI,
832 const BlockNode &Node,
833 const PackagedLoopData &Loop) {
834 Float F = Loop.Mass.toFloat() * Loop.Scale;
835
836 Float &Current = BFI.Freqs[Node.Index].Floating;
837 Float Updated = Current * F;
838
839 DEBUG(dbgs() << " - " << BFI.getBlockName(Node) << ": " << Current << " => "
840 << Updated << "\n");
841
842 Current = Updated;
843 }
844
845 /// \brief Unwrap a loop package.
846 ///
847 /// Visits all the members of a loop, adjusting their BlockData according to
848 /// the loop's pseudo-node.
849 static void unwrapLoopPackage(BlockFrequencyInfoImplBase &BFI,
850 const BlockNode &Head) {
851 assert(Head.isValid());
852
853 PackagedLoopData &LoopPackage = BFI.getLoopPackage(Head);
854 DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getBlockName(Head)
855 << ": mass = " << LoopPackage.Mass
856 << ", scale = " << LoopPackage.Scale << "\n");
857 scaleBlockData(BFI, Head, LoopPackage);
858
859 // Propagate the head scale through the loop. Since members are visited in
860 // RPO, the head scale will be updated by the loop scale first, and then the
861 // final head scale will be used for updated the rest of the members.
862 for (const BlockNode &M : LoopPackage.Members) {
863 const FrequencyData &HeadData = BFI.Freqs[Head.Index];
864 FrequencyData &Freqs = BFI.Freqs[M.Index];
865 Float NewFreq = Freqs.Floating * HeadData.Floating;
866 DEBUG(dbgs() << " - " << BFI.getBlockName(M) << ": " << Freqs.Floating
867 << " => " << NewFreq << "\n");
868 Freqs.Floating = NewFreq;
869 }
870 }
871
872 void BlockFrequencyInfoImplBase::finalizeMetrics() {
873 // Set initial frequencies from loop-local masses.
874 for (size_t Index = 0; Index < Working.size(); ++Index)
875 Freqs[Index].Floating = Working[Index].Mass.toFloat();
876
877 // Unwrap loop packages in reverse post-order, tracking min and max
878 // frequencies.
879 auto Min = Float::getLargest();
880 auto Max = Float::getZero();
881 for (size_t Index = 0; Index < Working.size(); ++Index) {
882 if (Working[Index].isLoopHeader())
883 unwrapLoopPackage(*this, BlockNode(Index));
884
885 // Update max scale.
886 Min = std::min(Min, Freqs[Index].Floating);
887 Max = std::max(Max, Freqs[Index].Floating);
888 }
889
890 // Convert to integers.
891 convertFloatingToInteger(*this, Min, Max);
892
893 // Clean up data structures.
894 cleanup(*this);
895
896 // Print out the final stats.
897 DEBUG(dump());
898 }
899
900 BlockFrequency
901 BlockFrequencyInfoImplBase::getBlockFreq(const BlockNode &Node) const {
902 if (!Node.isValid())
903 return 0;
904 return Freqs[Node.Index].Integer;
905 }
906 Float
907 BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const {
908 if (!Node.isValid())
909 return Float::getZero();
910 return Freqs[Node.Index].Floating;
911 }
912
913 std::string
914 BlockFrequencyInfoImplBase::getBlockName(const BlockNode &Node) const {
915 return std::string();
916 }
917
918 raw_ostream &
919 BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS,
920 const BlockNode &Node) const {
921 return OS << getFloatingBlockFreq(Node);
922 }
923
924 raw_ostream &
925 BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS,
926 const BlockFrequency &Freq) const {
927 Float Block(Freq.getFrequency(), 0);
928 Float Entry(getEntryFreq(), 0);
929
930 return OS << Block / Entry;
931 }
66 Analysis.cpp
77 BasicAliasAnalysis.cpp
88 BlockFrequencyInfo.cpp
9 BlockFrequencyInfoImpl.cpp
109 BranchProbabilityInfo.cpp
1110 CFG.cpp
1211 CFGPrinter.cpp
1010 //
1111 //===----------------------------------------------------------------------===//
1212
13 #define DEBUG_TYPE "block-freq"
1413 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
1514 #include "llvm/Analysis/BlockFrequencyInfoImpl.h"
1615 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
17 #include "llvm/CodeGen/MachineFunction.h"
18 #include "llvm/CodeGen/MachineLoopInfo.h"
1916 #include "llvm/CodeGen/Passes.h"
2017 #include "llvm/InitializePasses.h"
2118 #include "llvm/Support/CommandLine.h"
114111 INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, "machine-block-freq",
115112 "Machine Block Frequency Analysis", true, true)
116113 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
117 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
118114 INITIALIZE_PASS_END(MachineBlockFrequencyInfo, "machine-block-freq",
119115 "Machine Block Frequency Analysis", true, true)
120116
130126
131127 void MachineBlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const {
132128 AU.addRequired();
133 AU.addRequired();
134129 AU.setPreservesAll();
135130 MachineFunctionPass::getAnalysisUsage(AU);
136131 }
137132
138133 bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) {
139134 MachineBranchProbabilityInfo &MBPI =
140 getAnalysis();
141 MachineLoopInfo &MLI = getAnalysisInfo>();
135 getAnalysisInfo>();
142136 if (!MBFI)
143137 MBFI.reset(new ImplType);
144 MBFI->doFunction(&F, &MBPI, &MLI);
138 MBFI->doFunction(&F, &MBPI);
145139 #ifndef NDEBUG
146140 if (ViewMachineBlockFreqPropagationDAG != GVDT_None) {
147141 view();
171165 }
172166
173167 const MachineFunction *MachineBlockFrequencyInfo::getFunction() const {
174 return MBFI ? MBFI->getFunction() : nullptr;
168 return MBFI ? MBFI->Fn : nullptr;
175169 }
176170
177171 raw_ostream &
+0
-50
test/Analysis/BlockFrequencyInfo/bad_input.ll less more
None ; RUN: opt < %s -analyze -block-freq | FileCheck %s
1
2 declare void @g(i32 %x)
3
4 ; CHECK-LABEL: Printing analysis {{.*}} for function 'branch_weight_0':
5 ; CHECK-NEXT: block-frequency-info: branch_weight_0
6 define void @branch_weight_0(i32 %a) {
7 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
8 entry:
9 br label %for.body
10
11 ; Check that we get 1,4 instead of 0,3.
12 ; CHECK-NEXT: for.body: float = 4.0,
13 for.body:
14 %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
15 call void @g(i32 %i)
16 %inc = add i32 %i, 1
17 %cmp = icmp ugt i32 %inc, %a
18 br i1 %cmp, label %for.end, label %for.body, !prof !0
19
20 ; CHECK-NEXT: for.end: float = 1.0, int = [[ENTRY]]
21 for.end:
22 ret void
23 }
24
25 !0 = metadata !{metadata !"branch_weights", i32 0, i32 3}
26
27 ; CHECK-LABEL: Printing analysis {{.*}} for function 'infinite_loop'
28 ; CHECK-NEXT: block-frequency-info: infinite_loop
29 define void @infinite_loop(i1 %x) {
30 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
31 entry:
32 br i1 %x, label %for.body, label %for.end, !prof !1
33
34 ; Check that the loop scale maxes out at 4096, giving 2048 here.
35 ; CHECK-NEXT: for.body: float = 2048.0,
36 for.body:
37 %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
38 call void @g(i32 %i)
39 %inc = add i32 %i, 1
40 br label %for.body
41
42 ; Check that the exit weight is half of entry, since half is lost in the
43 ; infinite loop above.
44 ; CHECK-NEXT: for.end: float = 0.5,
45 for.end:
46 ret void
47 }
48
49 !1 = metadata !{metadata !"branch_weights", i32 1, i32 1}
0 ; RUN: opt < %s -analyze -block-freq | FileCheck %s
11
22 define i32 @test1(i32 %i, i32* %a) {
3 ; CHECK-LABEL: Printing analysis {{.*}} for function 'test1':
4 ; CHECK-NEXT: block-frequency-info: test1
5 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
3 ; CHECK: Printing analysis {{.*}} for function 'test1'
4 ; CHECK: entry = 1.0
65 entry:
76 br label %body
87
98 ; Loop backedges are weighted and thus their bodies have a greater frequency.
10 ; CHECK-NEXT: body: float = 32.0,
9 ; CHECK: body = 32.0
1110 body:
1211 %iv = phi i32 [ 0, %entry ], [ %next, %body ]
1312 %base = phi i32 [ 0, %entry ], [ %sum, %body ]
1817 %exitcond = icmp eq i32 %next, %i
1918 br i1 %exitcond, label %exit, label %body
2019
21 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
20 ; CHECK: exit = 1.0
2221 exit:
2322 ret i32 %sum
2423 }
2524
2625 define i32 @test2(i32 %i, i32 %a, i32 %b) {
27 ; CHECK-LABEL: Printing analysis {{.*}} for function 'test2':
28 ; CHECK-NEXT: block-frequency-info: test2
29 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
26 ; CHECK: Printing analysis {{.*}} for function 'test2'
27 ; CHECK: entry = 1.0
3028 entry:
3129 %cond = icmp ult i32 %i, 42
3230 br i1 %cond, label %then, label %else, !prof !0
3331
3432 ; The 'then' branch is predicted more likely via branch weight metadata.
35 ; CHECK-NEXT: then: float = 0.9411{{[0-9]*}},
33 ; CHECK: then = 0.94116
3634 then:
3735 br label %exit
3836
39 ; CHECK-NEXT: else: float = 0.05882{{[0-9]*}},
37 ; CHECK: else = 0.05877
4038 else:
4139 br label %exit
4240
43 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
41 ; FIXME: It may be a bug that we don't sum back to 1.0.
42 ; CHECK: exit = 0.99993
4443 exit:
4544 %result = phi i32 [ %a, %then ], [ %b, %else ]
4645 ret i32 %result
4948 !0 = metadata !{metadata !"branch_weights", i32 64, i32 4}
5049
5150 define i32 @test3(i32 %i, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
52 ; CHECK-LABEL: Printing analysis {{.*}} for function 'test3':
53 ; CHECK-NEXT: block-frequency-info: test3
54 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
51 ; CHECK: Printing analysis {{.*}} for function 'test3'
52 ; CHECK: entry = 1.0
5553 entry:
5654 switch i32 %i, label %case_a [ i32 1, label %case_b
5755 i32 2, label %case_c
5856 i32 3, label %case_d
5957 i32 4, label %case_e ], !prof !1
6058
61 ; CHECK-NEXT: case_a: float = 0.05,
59 ; CHECK: case_a = 0.04998
6260 case_a:
6361 br label %exit
6462
65 ; CHECK-NEXT: case_b: float = 0.05,
63 ; CHECK: case_b = 0.04998
6664 case_b:
6765 br label %exit
6866
6967 ; The 'case_c' branch is predicted more likely via branch weight metadata.
70 ; CHECK-NEXT: case_c: float = 0.8,
68 ; CHECK: case_c = 0.79998
7169 case_c:
7270 br label %exit
7371
74 ; CHECK-NEXT: case_d: float = 0.05,
72 ; CHECK: case_d = 0.04998
7573 case_d:
7674 br label %exit
7775
78 ; CHECK-NEXT: case_e: float = 0.05,
76 ; CHECK: case_e = 0.04998
7977 case_e:
8078 br label %exit
8179
82 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
80 ; FIXME: It may be a bug that we don't sum back to 1.0.
81 ; CHECK: exit = 0.99993
8382 exit:
8483 %result = phi i32 [ %a, %case_a ],
8584 [ %b, %case_b ],
9190
9291 !1 = metadata !{metadata !"branch_weights", i32 4, i32 4, i32 64, i32 4, i32 4}
9392
93 ; CHECK: Printing analysis {{.*}} for function 'nested_loops'
94 ; CHECK: entry = 1.0
95 ; This test doesn't seem to be assigning sensible frequencies to nested loops.
9496 define void @nested_loops(i32 %a) {
95 ; CHECK-LABEL: Printing analysis {{.*}} for function 'nested_loops':
96 ; CHECK-NEXT: block-frequency-info: nested_loops
97 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
9897 entry:
9998 br label %for.cond1.preheader
10099
101 ; CHECK-NEXT: for.cond1.preheader: float = 4001.0,
102100 for.cond1.preheader:
103101 %x.024 = phi i32 [ 0, %entry ], [ %inc12, %for.inc11 ]
104102 br label %for.cond4.preheader
105103
106 ; CHECK-NEXT: for.cond4.preheader: float = 16008001.0,
107104 for.cond4.preheader:
108105 %y.023 = phi i32 [ 0, %for.cond1.preheader ], [ %inc9, %for.inc8 ]
109106 %add = add i32 %y.023, %x.024
110107 br label %for.body6
111108
112 ; CHECK-NEXT: for.body6: float = 64048012001.0,
113109 for.body6:
114110 %z.022 = phi i32 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ]
115111 %add7 = add i32 %add, %z.022
116 tail call void @g(i32 %add7)
112 tail call void @g(i32 %add7) #2
117113 %inc = add i32 %z.022, 1
118114 %cmp5 = icmp ugt i32 %inc, %a
119115 br i1 %cmp5, label %for.inc8, label %for.body6, !prof !2
120116
121 ; CHECK-NEXT: for.inc8: float = 16008001.0,
122117 for.inc8:
123118 %inc9 = add i32 %y.023, 1
124119 %cmp2 = icmp ugt i32 %inc9, %a
125120 br i1 %cmp2, label %for.inc11, label %for.cond4.preheader, !prof !2
126121
127 ; CHECK-NEXT: for.inc11: float = 4001.0,
128122 for.inc11:
129123 %inc12 = add i32 %x.024, 1
130124 %cmp = icmp ugt i32 %inc12, %a
131125 br i1 %cmp, label %for.end13, label %for.cond1.preheader, !prof !2
132126
133 ; CHECK-NEXT: for.end13: float = 1.0, int = [[ENTRY]]
134127 for.end13:
135128 ret void
136129 }
137130
138 declare void @g(i32)
131 declare void @g(i32) #1
139132
140133 !2 = metadata !{metadata !"branch_weights", i32 1, i32 4000}
+0
-165
test/Analysis/BlockFrequencyInfo/double_exit.ll less more
None ; RUN: opt < %s -analyze -block-freq | FileCheck %s
1
2 ; CHECK-LABEL: Printing analysis {{.*}} for function 'double_exit':
3 ; CHECK-NEXT: block-frequency-info: double_exit
4 define i32 @double_exit(i32 %N) {
5 ; Mass = 1
6 ; Frequency = 1
7 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
8 entry:
9 br label %outer
10
11 ; Mass = 1
12 ; Backedge mass = 1/3, exit mass = 2/3
13 ; Loop scale = 3/2
14 ; Pseudo-edges = exit
15 ; Pseudo-mass = 1
16 ; Frequency = 1*3/2*1 = 3/2
17 ; CHECK-NEXT: outer: float = 1.5,
18 outer:
19 %I.0 = phi i32 [ 0, %entry ], [ %inc6, %outer.inc ]
20 %Return.0 = phi i32 [ 0, %entry ], [ %Return.1, %outer.inc ]
21 %cmp = icmp slt i32 %I.0, %N
22 br i1 %cmp, label %inner, label %exit, !prof !2 ; 2:1
23
24 ; Mass = 1
25 ; Backedge mass = 3/5, exit mass = 2/5
26 ; Loop scale = 5/2
27 ; Pseudo-edges = outer.inc @ 1/5, exit @ 1/5
28 ; Pseudo-mass = 2/3
29 ; Frequency = 3/2*1*5/2*2/3 = 5/2
30 ; CHECK-NEXT: inner: float = 2.5,
31 inner:
32 %Return.1 = phi i32 [ %Return.0, %outer ], [ %call4, %inner.inc ]
33 %J.0 = phi i32 [ %I.0, %outer ], [ %inc, %inner.inc ]
34 %cmp2 = icmp slt i32 %J.0, %N
35 br i1 %cmp2, label %inner.body, label %outer.inc, !prof !1 ; 4:1
36
37 ; Mass = 4/5
38 ; Frequency = 5/2*4/5 = 2
39 ; CHECK-NEXT: inner.body: float = 2.0,
40 inner.body:
41 %call = call i32 @c2(i32 %I.0, i32 %J.0)
42 %tobool = icmp ne i32 %call, 0
43 br i1 %tobool, label %exit, label %inner.inc, !prof !0 ; 3:1
44
45 ; Mass = 3/5
46 ; Frequency = 5/2*3/5 = 3/2
47 ; CHECK-NEXT: inner.inc: float = 1.5,
48 inner.inc:
49 %call4 = call i32 @logic2(i32 %Return.1, i32 %I.0, i32 %J.0)
50 %inc = add nsw i32 %J.0, 1
51 br label %inner
52
53 ; Mass = 1/3
54 ; Frequency = 3/2*1/3 = 1/2
55 ; CHECK-NEXT: outer.inc: float = 0.5,
56 outer.inc:
57 %inc6 = add nsw i32 %I.0, 1
58 br label %outer
59
60 ; Mass = 1
61 ; Frequency = 1
62 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
63 exit:
64 %Return.2 = phi i32 [ %Return.1, %inner.body ], [ %Return.0, %outer ]
65 ret i32 %Return.2
66 }
67
68 !0 = metadata !{metadata !"branch_weights", i32 1, i32 3}
69 !1 = metadata !{metadata !"branch_weights", i32 4, i32 1}
70 !2 = metadata !{metadata !"branch_weights", i32 2, i32 1}
71
72 declare i32 @c2(i32, i32)
73 declare i32 @logic2(i32, i32, i32)
74
75 ; CHECK-LABEL: Printing analysis {{.*}} for function 'double_exit_in_loop':
76 ; CHECK-NEXT: block-frequency-info: double_exit_in_loop
77 define i32 @double_exit_in_loop(i32 %N) {
78 ; Mass = 1
79 ; Frequency = 1
80 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
81 entry:
82 br label %outer
83
84 ; Mass = 1
85 ; Backedge mass = 1/2, exit mass = 1/2
86 ; Loop scale = 2
87 ; Pseudo-edges = exit
88 ; Pseudo-mass = 1
89 ; Frequency = 1*2*1 = 2
90 ; CHECK-NEXT: outer: float = 2.0,
91 outer:
92 %I.0 = phi i32 [ 0, %entry ], [ %inc12, %outer.inc ]
93 %Return.0 = phi i32 [ 0, %entry ], [ %Return.3, %outer.inc ]
94 %cmp = icmp slt i32 %I.0, %N
95 br i1 %cmp, label %middle, label %exit, !prof !3 ; 1:1
96
97 ; Mass = 1
98 ; Backedge mass = 1/3, exit mass = 2/3
99 ; Loop scale = 3/2
100 ; Pseudo-edges = outer.inc
101 ; Pseudo-mass = 1/2
102 ; Frequency = 2*1*3/2*1/2 = 3/2
103 ; CHECK-NEXT: middle: float = 1.5,
104 middle:
105 %J.0 = phi i32 [ %I.0, %outer ], [ %inc9, %middle.inc ]
106 %Return.1 = phi i32 [ %Return.0, %outer ], [ %Return.2, %middle.inc ]
107 %cmp2 = icmp slt i32 %J.0, %N
108 br i1 %cmp2, label %inner, label %outer.inc, !prof !2 ; 2:1
109
110 ; Mass = 1
111 ; Backedge mass = 3/5, exit mass = 2/5
112 ; Loop scale = 5/2
113 ; Pseudo-edges = middle.inc @ 1/5, outer.inc @ 1/5
114 ; Pseudo-mass = 2/3
115 ; Frequency = 3/2*1*5/2*2/3 = 5/2
116 ; CHECK-NEXT: inner: float = 2.5,
117 inner:
118 %Return.2 = phi i32 [ %Return.1, %middle ], [ %call7, %inner.inc ]
119 %K.0 = phi i32 [ %J.0, %middle ], [ %inc, %inner.inc ]
120 %cmp5 = icmp slt i32 %K.0, %N
121 br i1 %cmp5, label %inner.body, label %middle.inc, !prof !1 ; 4:1
122
123 ; Mass = 4/5
124 ; Frequency = 5/2*4/5 = 2
125 ; CHECK-NEXT: inner.body: float = 2.0,
126 inner.body:
127 %call = call i32 @c3(i32 %I.0, i32 %J.0, i32 %K.0)
128 %tobool = icmp ne i32 %call, 0
129 br i1 %tobool, label %outer.inc, label %inner.inc, !prof !0 ; 3:1
130
131 ; Mass = 3/5
132 ; Frequency = 5/2*3/5 = 3/2
133 ; CHECK-NEXT: inner.inc: float = 1.5,
134 inner.inc:
135 %call7 = call i32 @logic3(i32 %Return.2, i32 %I.0, i32 %J.0, i32 %K.0)
136 %inc = add nsw i32 %K.0, 1
137 br label %inner
138
139 ; Mass = 1/3
140 ; Frequency = 3/2*1/3 = 1/2
141 ; CHECK-NEXT: middle.inc: float = 0.5,
142 middle.inc:
143 %inc9 = add nsw i32 %J.0, 1
144 br label %middle
145
146 ; Mass = 1/2
147 ; Frequency = 2*1/2 = 1
148 ; CHECK-NEXT: outer.inc: float = 1.0,
149 outer.inc:
150 %Return.3 = phi i32 [ %Return.2, %inner.body ], [ %Return.1, %middle ]
151 %inc12 = add nsw i32 %I.0, 1
152 br label %outer
153
154 ; Mass = 1
155 ; Frequency = 1
156 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
157 exit:
158 ret i32 %Return.0
159 }
160
161 !3 = metadata !{metadata !"branch_weights", i32 1, i32 1}
162
163 declare i32 @c3(i32, i32, i32)
164 declare i32 @logic3(i32, i32, i32, i32)
+0
-197
test/Analysis/BlockFrequencyInfo/irreducible.ll less more
None ; RUN: opt < %s -analyze -block-freq | FileCheck %s
1
2 ; A loop with multiple exits should be handled correctly.
3 ;
4 ; CHECK-LABEL: Printing analysis {{.*}} for function 'multiexit':
5 ; CHECK-NEXT: block-frequency-info: multiexit
6 define void @multiexit(i32 %a) {
7 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
8 entry:
9 br label %loop.1
10
11 ; CHECK-NEXT: loop.1: float = 1.333{{3*}},
12 loop.1:
13 %i = phi i32 [ 0, %entry ], [ %inc.2, %loop.2 ]
14 call void @f(i32 %i)
15 %inc.1 = add i32 %i, 1
16 %cmp.1 = icmp ugt i32 %inc.1, %a
17 br i1 %cmp.1, label %exit.1, label %loop.2, !prof !0
18
19 ; CHECK-NEXT: loop.2: float = 0.666{{6*7}},
20 loop.2:
21 call void @g(i32 %inc.1)
22 %inc.2 = add i32 %inc.1, 1
23 %cmp.2 = icmp ugt i32 %inc.2, %a
24 br i1 %cmp.2, label %exit.2, label %loop.1, !prof !1
25
26 ; CHECK-NEXT: exit.1: float = 0.666{{6*7}},
27 exit.1:
28 call void @h(i32 %inc.1)
29 br label %return
30
31 ; CHECK-NEXT: exit.2: float = 0.333{{3*}},
32 exit.2:
33 call void @i(i32 %inc.2)
34 br label %return
35
36 ; CHECK-NEXT: return: float = 1.0, int = [[ENTRY]]
37 return:
38 ret void
39 }
40
41 declare void @f(i32 %x)
42 declare void @g(i32 %x)
43 declare void @h(i32 %x)
44 declare void @i(i32 %x)
45
46 !0 = metadata !{metadata !"branch_weights", i32 3, i32 3}
47 !1 = metadata !{metadata !"branch_weights", i32 5, i32 5}
48
49 ; The current BlockFrequencyInfo algorithm doesn't handle multiple entrances
50 ; into a loop very well. The frequencies assigned to blocks in the loop are
51 ; predictable (and not absurd), but also not correct and therefore not worth
52 ; testing.
53 ;
54 ; There are two testcases below.
55 ;
56 ; For each testcase, I use a CHECK-NEXT/NOT combo like an XFAIL with the
57 ; granularity of a single check. If/when this behaviour is fixed, we'll know
58 ; about it, and the test should be updated.
59 ;
60 ; Testcase #1
61 ; ===========
62 ;
63 ; In this case c1 and c2 should have frequencies of 15/7 and 13/7,
64 ; respectively. To calculate this, consider assigning 1.0 to entry, and
65 ; distributing frequency iteratively (to infinity). At the first iteration,
66 ; entry gives 3/4 to c1 and 1/4 to c2. At every step after, c1 and c2 give 3/4
67 ; of what they have to each other. Somehow, all of it comes out to exit.
68 ;
69 ; c1 = 3/4 + 1/4*3/4 + 3/4*3^2/4^2 + 1/4*3^3/4^3 + 3/4*3^4/4^4 + ...
70 ; c2 = 1/4 + 3/4*3/4 + 1/4*3^2/4^2 + 3/4*3^3/4^3 + 1/4*3^4/4^4 + ...
71 ;
72 ; Simplify by splitting up the odd and even terms of the series and taking out
73 ; factors so that the infinite series matches:
74 ;
75 ; c1 = 3/4 *(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
76 ; + 3/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
77 ; c2 = 1/4 *(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
78 ; + 9/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
79 ;
80 ; c1 = 15/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
81 ; c2 = 13/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
82 ;
83 ; Since this geometric series sums to 16/7:
84 ;
85 ; c1 = 15/7
86 ; c2 = 13/7
87 ;
88 ; If we treat c1 and c2 as members of the same loop, the exit frequency of the
89 ; loop as a whole is 1/4, so the loop scale should be 4. Summing c1 and c2
90 ; gives 28/7, or 4.0, which is nice confirmation of the math above.
91 ;
92 ; However, assuming c1 precedes c2 in reverse post-order, the current algorithm
93 ; returns 3/4 and 13/16, respectively. LoopInfo ignores edges between loops
94 ; (and doesn't see any loops here at all), and -block-freq ignores the
95 ; irreducible edge from c2 to c1.
96 ;
97 ; CHECK-LABEL: Printing analysis {{.*}} for function 'multientry':
98 ; CHECK-NEXT: block-frequency-info: multientry
99 define void @multientry(i32 %a) {
100 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
101 entry:
102 %choose = call i32 @choose(i32 %a)
103 %compare = icmp ugt i32 %choose, %a
104 br i1 %compare, label %c1, label %c2, !prof !2
105
106 ; This is like a single-line XFAIL (see above).
107 ; CHECK-NEXT: c1:
108 ; CHECK-NOT: float = 2.142857{{[0-9]*}},
109 c1:
110 %i1 = phi i32 [ %a, %entry ], [ %i2.inc, %c2 ]
111 %i1.inc = add i32 %i1, 1
112 %choose1 = call i32 @choose(i32 %i1)
113 %compare1 = icmp ugt i32 %choose1, %a
114 br i1 %compare1, label %c2, label %exit, !prof !2
115
116 ; This is like a single-line XFAIL (see above).
117 ; CHECK-NEXT: c2:
118 ; CHECK-NOT: float = 1.857142{{[0-9]*}},
119 c2:
120 %i2 = phi i32 [ %a, %entry ], [ %i1.inc, %c1 ]
121 %i2.inc = add i32 %i2, 1
122 %choose2 = call i32 @choose(i32 %i2)
123 %compare2 = icmp ugt i32 %choose2, %a
124 br i1 %compare2, label %c1, label %exit, !prof !2
125
126 ; We still shouldn't lose any frequency.
127 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
128 exit:
129 ret void
130 }
131
132 ; Testcase #2
133 ; ===========
134 ;
135 ; In this case c1 and c2 should be treated as equals in a single loop. The
136 ; exit frequency is 1/3, so the scaling factor for the loop should be 3.0. The
137 ; loop is entered 2/3 of the time, and c1 and c2 split the total loop frequency
138 ; evenly (1/2), so they should each have frequencies of 1.0 (3.0*2/3*1/2).
139 ; Another way of computing this result is by assigning 1.0 to entry and showing
140 ; that c1 and c2 should accumulate frequencies of:
141 ;
142 ; 1/3 + 2/9 + 4/27 + 8/81 + ...
143 ; 2^0/3^1 + 2^1/3^2 + 2^2/3^3 + 2^3/3^4 + ...
144 ;
145 ; At the first step, c1 and c2 each get 1/3 of the entry. At each subsequent
146 ; step, c1 and c2 each get 1/3 of what's left in c1 and c2 combined. This
147 ; infinite series sums to 1.
148 ;
149 ; However, assuming c1 precedes c2 in reverse post-order, the current algorithm
150 ; returns 1/2 and 3/4, respectively. LoopInfo ignores edges between loops (and
151 ; treats c1 and c2 as self-loops only), and -block-freq ignores the irreducible
152 ; edge from c2 to c1.
153 ;
154 ; Below I use a CHECK-NEXT/NOT combo like an XFAIL with the granularity of a
155 ; single check. If/when this behaviour is fixed, we'll know about it, and the
156 ; test should be updated.
157 ;
158 ; CHECK-LABEL: Printing analysis {{.*}} for function 'crossloops':
159 ; CHECK-NEXT: block-frequency-info: crossloops
160 define void @crossloops(i32 %a) {
161 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
162 entry:
163 %choose = call i32 @choose(i32 %a)
164 switch i32 %choose, label %exit [ i32 1, label %c1
165 i32 2, label %c2 ], !prof !3
166
167 ; This is like a single-line XFAIL (see above).
168 ; CHECK-NEXT: c1:
169 ; CHECK-NOT: float = 1.0,
170 c1:
171 %i1 = phi i32 [ %a, %entry ], [ %i1.inc, %c1 ], [ %i2.inc, %c2 ]
172 %i1.inc = add i32 %i1, 1
173 %choose1 = call i32 @choose(i32 %i1)
174 switch i32 %choose1, label %exit [ i32 1, label %c1
175 i32 2, label %c2 ], !prof !3
176
177 ; This is like a single-line XFAIL (see above).
178 ; CHECK-NEXT: c2:
179 ; CHECK-NOT: float = 1.0,
180 c2:
181 %i2 = phi i32 [ %a, %entry ], [ %i1.inc, %c1 ], [ %i2.inc, %c2 ]
182 %i2.inc = add i32 %i2, 1
183 %choose2 = call i32 @choose(i32 %i2)
184 switch i32 %choose2, label %exit [ i32 1, label %c1
185 i32 2, label %c2 ], !prof !3
186
187 ; We still shouldn't lose any frequency.
188 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
189 exit:
190 ret void
191 }
192
193 declare i32 @choose(i32)
194
195 !2 = metadata !{metadata !"branch_weights", i32 3, i32 1}
196 !3 = metadata !{metadata !"branch_weights", i32 2, i32 2, i32 2}
+0
-44
test/Analysis/BlockFrequencyInfo/loop_with_branch.ll less more
None ; RUN: opt < %s -analyze -block-freq | FileCheck %s
1
2 ; CHECK-LABEL: Printing analysis {{.*}} for function 'loop_with_branch':
3 ; CHECK-NEXT: block-frequency-info: loop_with_branch
4 define void @loop_with_branch(i32 %a) {
5 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
6 entry:
7 %skip_loop = call i1 @foo0(i32 %a)
8 br i1 %skip_loop, label %skip, label %header, !prof !0
9
10 ; CHECK-NEXT: skip: float = 0.25,
11 skip:
12 br label %exit
13
14 ; CHECK-NEXT: header: float = 4.5,
15 header:
16 %i = phi i32 [ 0, %entry ], [ %i.next, %back ]
17 %i.next = add i32 %i, 1
18 %choose = call i2 @foo1(i32 %i)
19 switch i2 %choose, label %exit [ i2 0, label %left
20 i2 1, label %right ], !prof !1
21
22 ; CHECK-NEXT: left: float = 1.5,
23 left:
24 br label %back
25
26 ; CHECK-NEXT: right: float = 2.25,
27 right:
28 br label %back
29
30 ; CHECK-NEXT: back: float = 3.75,
31 back:
32 br label %header
33
34 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
35 exit:
36 ret void
37 }
38
39 declare i1 @foo0(i32)
40 declare i2 @foo1(i32)
41
42 !0 = metadata !{metadata !"branch_weights", i32 1, i32 3}
43 !1 = metadata !{metadata !"branch_weights", i32 1, i32 2, i32 3}
+0
-59
test/Analysis/BlockFrequencyInfo/nested_loop_with_branches.ll less more
None ; RUN: opt < %s -analyze -block-freq | FileCheck %s
1
2 ; CHECK-LABEL: Printing analysis {{.*}} for function 'nested_loop_with_branches'
3 ; CHECK-NEXT: block-frequency-info: nested_loop_with_branches
4 define void @nested_loop_with_branches(i32 %a) {
5 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
6 entry:
7 %v0 = call i1 @foo0(i32 %a)
8 br i1 %v0, label %exit, label %outer, !prof !0
9
10 ; CHECK-NEXT: outer: float = 12.0,
11 outer:
12 %i = phi i32 [ 0, %entry ], [ %i.next, %inner.end ], [ %i.next, %no_inner ]
13 %i.next = add i32 %i, 1
14 %do_inner = call i1 @foo1(i32 %i)
15 br i1 %do_inner, label %no_inner, label %inner, !prof !0
16
17 ; CHECK-NEXT: inner: float = 36.0,
18 inner:
19 %j = phi i32 [ 0, %outer ], [ %j.next, %inner.end ]
20 %side = call i1 @foo3(i32 %j)
21 br i1 %side, label %left, label %right, !prof !0
22
23 ; CHECK-NEXT: left: float = 9.0,
24 left:
25 %v4 = call i1 @foo4(i32 %j)
26 br label %inner.end
27
28 ; CHECK-NEXT: right: float = 27.0,
29 right:
30 %v5 = call i1 @foo5(i32 %j)
31 br label %inner.end
32
33 ; CHECK-NEXT: inner.end: float = 36.0,
34 inner.end:
35 %stay_inner = phi i1 [ %v4, %left ], [ %v5, %right ]
36 %j.next = add i32 %j, 1
37 br i1 %stay_inner, label %inner, label %outer, !prof !1
38
39 ; CHECK-NEXT: no_inner: float = 3.0,
40 no_inner:
41 %continue = call i1 @foo6(i32 %i)
42 br i1 %continue, label %outer, label %exit, !prof !1
43
44 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
45 exit:
46 ret void
47 }
48
49 declare i1 @foo0(i32)
50 declare i1 @foo1(i32)
51 declare i1 @foo2(i32)
52 declare i1 @foo3(i32)
53 declare i1 @foo4(i32)
54 declare i1 @foo5(i32)
55 declare i1 @foo6(i32)
56
57 !0 = metadata !{metadata !"branch_weights", i32 1, i32 3}
58 !1 = metadata !{metadata !"branch_weights", i32 3, i32 1}
286286 ; CHECKFP: .LBB{{[0-9_]+}}
287287 ; CHECKFP-NEXT: ldc r2, 40
288288 ; CHECKFP-NEXT: add r2, r10, r2
289 ; CHECKFP-NEXT: add r2, r2, r0
289 ; CHECKFP-NEXT: add r0, r2, r0
290290 ; CHECKFP-NEXT: mov r3, r1
291 ; CHECKFP-NEXT: mov r2, r0
291292 ; CHECKFP-NEXT: ldw r9, r10[4]
292293 ; CHECKFP-NEXT: ldw r8, r10[5]
293294 ; CHECKFP-NEXT: ldw r7, r10[6]
335336 ; CHECK-NEXT: ldc r2, 36
336337 ; CHECK-NEXT: ldaw r3, sp[0]
337338 ; CHECK-NEXT: add r2, r3, r2
338 ; CHECK-NEXT: add r2, r2, r0
339 ; CHECK-NEXT: add r0, r2, r0
339340 ; CHECK-NEXT: mov r3, r1
341 ; CHECK-NEXT: mov r2, r0
340342 ; CHECK-NEXT: ldw r10, sp[2]
341343 ; CHECK-NEXT: ldw r9, sp[3]
342344 ; CHECK-NEXT: ldw r8, sp[4]