Revert "blockfreq: Rewrite BlockFrequencyInfoImpl" (#2) This reverts commit r206666, as planned. Still stumped on why the bots are failing. Sanitizer bots haven't turned anything up. If anyone can help me debug either of the failures (referenced in r206666) I'll owe them a beer. (In the meantime, I'll be auditing my patch for undefined behaviour.) git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@206677 91177308-0d34-0410-b5e6-96231b3b80d8 Duncan P. N. Exon Smith 6 years ago
12 changed file(s) with 376 addition(s) and 3163 deletion(s).
66 //
77 //===----------------------------------------------------------------------===//
88 //
9 // Shared implementation of BlockFrequency for IR and Machine Instructions.
9 // Shared implementation of BlockFrequencyInfo for IR and Machine Instructions.
1010 //
1111 //===----------------------------------------------------------------------===//
1212
1515
1616 #include "llvm/ADT/DenseMap.h"
1717 #include "llvm/ADT/PostOrderIterator.h"
18 #include "llvm/CodeGen/MachineBasicBlock.h"
19 #include "llvm/CodeGen/MachineFunction.h"
1820 #include "llvm/IR/BasicBlock.h"
1921 #include "llvm/Support/BlockFrequency.h"
2022 #include "llvm/Support/BranchProbability.h"
2325 #include
2426 #include
2527
26 //===----------------------------------------------------------------------===//
27 //
28 // PositiveFloat definition.
29 //
30 // TODO: Make this private to BlockFrequencyInfoImpl or delete.
31 //
32 //===----------------------------------------------------------------------===//
3328 namespace llvm {
3429
35 class PositiveFloatBase {
36 public:
37 static const int MaxExponent = 16383;
38 static const int MinExponent = -16382;
39 static const int DefaultPrecision = 10;
40
41 static void dump(uint64_t D, int16_t E, int Width);
42 static raw_ostream &print(raw_ostream &OS, uint64_t D, int16_t E, int Width,
43 unsigned Precision);
44 static std::string toString(uint64_t D, int16_t E, int Width,
45 unsigned Precision);
46 static int countLeadingZeros32(uint32_t N) { return countLeadingZeros(N); }
47 static int countLeadingZeros64(uint64_t N) { return countLeadingZeros(N); }
48 static uint64_t getHalf(uint64_t N) { return (N >> 1) + (N & 1); }
49
50 static std::pair<uint64_t, bool> splitSigned(int64_t N) {
51 if (N >= 0)
52 return std::make_pair(N, false);
53 if (N == INT64_MIN)
54 return std::make_pair(uint64_t(N) + 1, true);
55 return std::make_pair(-N, true);
56 }
57 static int64_t joinSigned(uint64_t U, bool IsNeg) {
58 if (U > INT64_MAX)
59 return IsNeg ? INT64_MIN : INT64_MAX;
60 return IsNeg ? -int64_t(U) : U;
61 }
62
63 static int32_t extractLg(const std::pair &Lg) {
64 return Lg.first;
65 }
66 static int32_t extractLgFloor(const std::pair &Lg) {
67 return Lg.first - (Lg.second > 0);
68 }
69 static int32_t extractLgCeiling(const std::pair &Lg) {
70 return Lg.first + (Lg.second < 0);
71 }
72 static uint64_t getDiff(int16_t L, int16_t R) {
73 assert(L <= R && "arguments in wrong order");
74 return (uint64_t)R - (uint64_t)L;
75 }
76
77 static std::pair divide64(uint64_t L, uint64_t R);
78 static std::pair multiply64(uint64_t L, uint64_t R);
79
80 static int compare(uint64_t L, uint64_t R, int Shift) {
81 assert(Shift >= 0);
82 assert(Shift < 64);
83
84 uint64_t L_adjusted = L >> Shift;
85 if (L_adjusted < R)
86 return -1;
87 if (L_adjusted > R)
88 return 1;
89
90 return L > L_adjusted << Shift ? 1 : 0;
91 }
92 };
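// Illustrative sketch (not in the original header): how splitSigned() and
// joinSigned() above are expected to round-trip a signed value, saturating at
// the int64_t limits.  The function name exampleSplitJoin is made up, and the
// sketch assumes <utility> and <cassert> are reachable from this header.
inline void exampleSplitJoin() {
  std::pair<uint64_t, bool> P = PositiveFloatBase::splitSigned(-42);
  assert(P.first == 42 && P.second && "negatives split into (magnitude, true)");
  int64_t N = PositiveFloatBase::joinSigned(P.first, P.second);
  assert(N == -42 && "joinSigned undoes splitSigned when nothing saturates");
  // Magnitudes above INT64_MAX saturate: joinSigned(UINT64_MAX, false) == INT64_MAX.
  (void)N;
}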
93
94 /// \brief Simple representation of a positive floating point.
95 ///
96 /// PositiveFloat is a positive floating point number. It uses simple
97 /// saturation arithmetic, and every operation is well-defined for every value.
98 ///
99 /// The number is split into a signed exponent and unsigned digits. The number
100 /// represented is \c getDigits()*2^getExponent(). In this way, the digits are
101 /// much like the mantissa in the x87 long double, but there is no canonical
102 /// form, so the same number can be represented by many bit representations
103 /// (it's always in "denormal" mode).
104 ///
105 /// PositiveFloat is templated on the underlying integer type for digits, which
106 /// is expected to be one of uint64_t, uint32_t, uint16_t or uint8_t.
107 ///
108 /// Unlike builtin floating point types, PositiveFloat is portable.
109 ///
110 /// Unlike APFloat, PositiveFloat does not model architecture floating point
111 /// behaviour (this should make it a little faster), and implements most
112 /// operators (this makes it usable).
113 ///
114 /// PositiveFloat is totally ordered. However, there is no canonical form, so
115 /// there are multiple representations of most scalars. E.g.:
116 ///
117 /// PositiveFloat(8u, 0) == PositiveFloat(4u, 1)
118 /// PositiveFloat(4u, 1) == PositiveFloat(2u, 2)
119 /// PositiveFloat(2u, 2) == PositiveFloat(1u, 3)
120 ///
121 /// PositiveFloat implements most arithmetic operations. Precision is kept
122 /// where possible. Uses simple saturation arithmetic, so that operations
123 /// saturate to 0.0 or getLargest() rather than under or overflowing. It has
124 /// some extra arithmetic for unit inversion. 0.0/0.0 is defined to be 0.0.
125 /// Any other division by 0.0 is defined to be getLargest().
126 ///
127 /// As a convenience for modifying the exponent, left and right shifting are
128 /// both implemented, and both interpret negative shifts as positive shifts in
129 /// the opposite direction.
130 ///
131 /// Future work might extract most of the implementation into a base class
132 /// (e.g., \c Float) that has an \c IsSigned template parameter. The initial
133 /// use case for this only needed positive semantics, but it wouldn't take much
134 /// work to extend.
135 ///
136 /// Exponents are limited to the range accepted by x87 long double. This makes
137 /// it trivial to add functionality to convert to APFloat (this is already
138 /// relied on for the implementation of printing).
139 template <class DigitsT> class PositiveFloat : PositiveFloatBase {
140 public:
141 static_assert(!std::numeric_limits::is_signed,
142 "only unsigned floats supported");
143
144 typedef DigitsT DigitsType;
145
146 private:
147 typedef std::numeric_limits DigitsLimits;
148
149 static const int Width = sizeof(DigitsType) * 8;
150 static_assert(Width <= 64, "invalid integer width for digits");
151
152 private:
153 DigitsType Digits;
154 int16_t Exponent;
155
156 public:
157 PositiveFloat() : Digits(0), Exponent(0) {}
158
159 PositiveFloat(DigitsType Digits, int16_t Exponent)
160 : Digits(Digits), Exponent(Exponent) {}
161
162 private:
163 PositiveFloat(const std::pair &X)
164 : Digits(X.first), Exponent(X.second) {}
165
166 public:
167 static PositiveFloat getZero() { return PositiveFloat(0, 0); }
168 static PositiveFloat getOne() { return PositiveFloat(1, 0); }
169 static PositiveFloat getLargest() {
170 return PositiveFloat(DigitsLimits::max(), MaxExponent);
171 }
172 static PositiveFloat getFloat(uint64_t N) { return adjustToWidth(N, 0); }
173 static PositiveFloat getInverseFloat(uint64_t N) {
174 return getFloat(N).invert();
175 }
176 static PositiveFloat getFraction(DigitsType N, DigitsType D) {
177 return getQuotient(N, D);
178 }
179
180 int16_t getExponent() const { return Exponent; }
181 DigitsType getDigits() const { return Digits; }
182
183 template IntT toInt() const;
184
185 bool isZero() const { return !Digits; }
186 bool isLargest() const { return *this == getLargest(); }
187 bool isOne() const {
188 if (Exponent > 0 || Exponent <= -Width)
189 return false;
190 return Digits == DigitsType(1) << -Exponent;
191 }
192
193 /// \brief The log base 2, rounded.
194 ///
195 /// Get the lg of the scalar. lg 0 is defined to be INT32_MIN.
196 int32_t lg() const { return extractLg(lgImpl()); }
197
198 /// \brief The log base 2, rounded towards INT32_MIN.
199 ///
200 /// Get the lg floor. lg 0 is defined to be INT32_MIN.
201 int32_t lgFloor() const { return extractLgFloor(lgImpl()); }
202
203 /// \brief The log base 2, rounded towards INT32_MAX.
204 ///
205 /// Get the lg ceiling. lg 0 is defined to be INT32_MIN.
206 int32_t lgCeiling() const { return extractLgCeiling(lgImpl()); }
207
208 bool operator==(const PositiveFloat &X) const { return compare(X) == 0; }
209 bool operator<(const PositiveFloat &X) const { return compare(X) < 0; }
210 bool operator!=(const PositiveFloat &X) const { return compare(X) != 0; }
211 bool operator>(const PositiveFloat &X) const { return compare(X) > 0; }
212 bool operator<=(const PositiveFloat &X) const { return compare(X) <= 0; }
213 bool operator>=(const PositiveFloat &X) const { return compare(X) >= 0; }
214
215 bool operator!() const { return isZero(); }
216
217 /// \brief Convert to a decimal representation in a string.
218 ///
219 /// Convert to a string. Uses scientific notation for very large/small
220 /// numbers. Scientific notation is used roughly for numbers outside of the
221 /// range 2^-64 through 2^64.
222 ///
223 /// \c Precision indicates the number of decimal digits of precision to use;
224 /// 0 requests the maximum available.
225 ///
226 /// As a special case to make debugging easier, if the number is small enough
227 /// to convert without scientific notation and has more than \c Precision
228 /// digits before the decimal place, it's printed accurately to the first
229 /// digit past zero. E.g., assuming 10 digits of precision:
230 ///
231 /// 98765432198.7654... => 98765432198.8
232 /// 8765432198.7654... => 8765432198.8
233 /// 765432198.7654... => 765432198.8
234 /// 65432198.7654... => 65432198.77
235 /// 5432198.7654... => 5432198.765
236 std::string toString(unsigned Precision = DefaultPrecision) {
237 return PositiveFloatBase::toString(Digits, Exponent, Width, Precision);
238 }
239
240 /// \brief Print a decimal representation.
241 ///
242 /// Print a string. See toString for documentation.
243 raw_ostream &print(raw_ostream &OS,
244 unsigned Precision = DefaultPrecision) const {
245 return PositiveFloatBase::print(OS, Digits, Exponent, Width, Precision);
246 }
247 void dump() const { return PositiveFloatBase::dump(Digits, Exponent, Width); }
248
249 PositiveFloat &operator+=(const PositiveFloat &X);
250 PositiveFloat &operator-=(const PositiveFloat &X);
251 PositiveFloat &operator*=(const PositiveFloat &X);
252 PositiveFloat &operator/=(const PositiveFloat &X);
253 PositiveFloat &operator<<=(int16_t Shift) { return shiftLeft(Shift); }
254 PositiveFloat &operator>>=(int16_t Shift) { return shiftRight(Shift); }
255
256 private:
257 PositiveFloat &shiftLeft(int32_t Shift);
258 PositiveFloat &shiftRight(int32_t Shift);
259 PositiveFloat normalizeExponents(PositiveFloat X);
260
261 public:
262 /// \brief Scale a large number accurately.
263 ///
264 /// Scale N (multiply it by this). Uses full precision multiplication, even
265 /// if Width is smaller than 64, so information is not lost.
266 uint64_t scale(uint64_t N) const;
267 uint64_t scaleByInverse(uint64_t N) const {
268 // TODO: implement directly, rather than relying on inverse. Inverse is
269 // expensive.
270 return inverse().scale(N);
271 }
272 int64_t scale(int64_t N) const {
273 std::pair Unsigned = splitSigned(N);
274 return joinSigned(scale(Unsigned.first), Unsigned.second);
275 }
276 int64_t scaleByInverse(int64_t N) const {
277 std::pair Unsigned = splitSigned(N);
278 return joinSigned(scaleByInverse(Unsigned.first), Unsigned.second);
279 }
280
281 int compare(const PositiveFloat &X) const;
282 int compareTo(uint64_t N) const {
283 PositiveFloat Float = getFloat(N);
284 int Compare = compare(Float);
285 if (Width == 64 || Compare != 0)
286 return Compare;
287
288 // Check for precision loss. We know *this == RoundTrip.
289 uint64_t RoundTrip = Float.template toInt();
290 return N == RoundTrip ? 0 : RoundTrip < N ? -1 : 1;
291 }
292 int compareTo(int64_t N) const { return N < 0 ? 1 : compareTo(uint64_t(N)); }
293
294 PositiveFloat &invert() { return *this = PositiveFloat::getFloat(1) / *this; }
295 PositiveFloat inverse() const { return PositiveFloat(*this).invert(); }
296
297 private:
298 static PositiveFloat getProduct(DigitsType L, DigitsType R);
299 static PositiveFloat getQuotient(DigitsType Dividend, DigitsType Divisor);
300
301 std::pair lgImpl() const;
302 static int countLeadingZerosWidth(DigitsType Digits) {
303 if (Width == 64)
304 return countLeadingZeros64(Digits);
305 if (Width == 32)
306 return countLeadingZeros32(Digits);
307 return countLeadingZeros32(Digits) + Width - 32;
308 }
309
310 static PositiveFloat adjustToWidth(uint64_t N, int S) {
311 assert(S >= MinExponent);
312 assert(S <= MaxExponent);
313 if (Width == 64 || N <= DigitsLimits::max())
314 return PositiveFloat(N, S);
315
316 // Shift right.
317 int Shift = 64 - Width - countLeadingZeros64(N);
318 DigitsType Shifted = N >> Shift;
319
320 // Round.
321 assert(S + Shift <= MaxExponent);
322 return getRounded(PositiveFloat(Shifted, S + Shift),
323 N & UINT64_C(1) << (Shift - 1));
324 }
325
326 static PositiveFloat getRounded(PositiveFloat P, bool Round) {
327 if (!Round)
328 return P;
329 if (P.Digits == DigitsLimits::max())
330 // Careful of overflow in the exponent.
331 return PositiveFloat(1, P.Exponent) <<= Width;
332 return PositiveFloat(P.Digits + 1, P.Exponent);
333 }
334 };
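// Illustrative sketch (not in the original header): basic PositiveFloat usage.
// It shows that equal values compare equal regardless of representation, and
// that arithmetic saturates instead of overflowing.  examplePositiveFloat is a
// made-up name; the sketch assumes <cassert> is reachable from this header.
inline void examplePositiveFloat() {
  typedef PositiveFloat<uint64_t> Float;
  Float Eight(8, 0), FourTimesTwo(4, 1);
  assert(Eight == FourTimesTwo && "no canonical form, but a total order");
  Float Saturated = Float::getLargest();
  Saturated += Eight;           // Saturates rather than overflowing.
  assert(Saturated == Float::getLargest());
  Float Quotient(8, 0);
  Quotient /= Float::getZero(); // x/0.0 is defined to be getLargest().
  assert(Quotient == Float::getLargest());
}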
335
336 template
337 PositiveFloat operator+(const PositiveFloat &L,
338 const PositiveFloat &R) {
339 return PositiveFloat(L) += R;
340 }
341 template
342 PositiveFloat operator-(const PositiveFloat &L,
343 const PositiveFloat &R) {
344 return PositiveFloat(L) -= R;
345 }
346 template
347 PositiveFloat operator*(const PositiveFloat &L,
348 const PositiveFloat &R) {
349 return PositiveFloat(L) *= R;
350 }
351 template
352 PositiveFloat operator/(const PositiveFloat &L,
353 const PositiveFloat &R) {
354 return PositiveFloat(L) /= R;
355 }
356 template
357 PositiveFloat operator<<(const PositiveFloat &F,
358 int16_t Shift) {
359 return PositiveFloat(F) <<= Shift;
360 }
361 template
362 PositiveFloat operator>>(const PositiveFloat &F,
363 int16_t Shift) {
364 return PositiveFloat(F) >>= Shift;
365 }
366
367 template
368 raw_ostream &operator<<(raw_ostream &OS, const PositiveFloat &X) {
369 return X.print(OS, 10);
370 }
371
372 template
373 bool operator<(const PositiveFloat &L, uint64_t R) {
374 return L.compareTo(R) < 0;
375 }
376 template
377 bool operator>(const PositiveFloat &L, uint64_t R) {
378 return L.compareTo(R) > 0;
379 }
380 template
381 bool operator==(const PositiveFloat &L, uint64_t R) {
382 return L.compareTo(R) == 0;
383 }
384 template
385 bool operator!=(const PositiveFloat &L, uint64_t R) {
386 return L.compareTo(R) != 0;
387 }
388 template
389 bool operator<=(const PositiveFloat &L, uint64_t R) {
390 return L.compareTo(R) <= 0;
391 }
392 template
393 bool operator>=(const PositiveFloat &L, uint64_t R) {
394 return L.compareTo(R) >= 0;
395 }
396
397 template
398 bool operator<(const PositiveFloat &L, int64_t R) {
399 return L.compareTo(R) < 0;
400 }
401 template
402 bool operator>(const PositiveFloat &L, int64_t R) {
403 return L.compareTo(R) > 0;
404 }
405 template
406 bool operator==(const PositiveFloat &L, int64_t R) {
407 return L.compareTo(R) == 0;
408 }
409 template
410 bool operator!=(const PositiveFloat &L, int64_t R) {
411 return L.compareTo(R) != 0;
412 }
413 template
414 bool operator<=(const PositiveFloat &L, int64_t R) {
415 return L.compareTo(R) <= 0;
416 }
417 template
418 bool operator>=(const PositiveFloat &L, int64_t R) {
419 return L.compareTo(R) >= 0;
420 }
421
422 template
423 bool operator<(const PositiveFloat &L, uint32_t R) {
424 return L.compareTo(uint64_t(R)) < 0;
425 }
426 template
427 bool operator>(const PositiveFloat &L, uint32_t R) {
428 return L.compareTo(uint64_t(R)) > 0;
429 }
430 template
431 bool operator==(const PositiveFloat &L, uint32_t R) {
432 return L.compareTo(uint64_t(R)) == 0;
433 }
434 template
435 bool operator!=(const PositiveFloat &L, uint32_t R) {
436 return L.compareTo(uint64_t(R)) != 0;
437 }
438 template
439 bool operator<=(const PositiveFloat &L, uint32_t R) {
440 return L.compareTo(uint64_t(R)) <= 0;
441 }
442 template
443 bool operator>=(const PositiveFloat &L, uint32_t R) {
444 return L.compareTo(uint64_t(R)) >= 0;
445 }
446
447 template
448 bool operator<(const PositiveFloat &L, int32_t R) {
449 return L.compareTo(int64_t(R)) < 0;
450 }
451 template
452 bool operator>(const PositiveFloat &L, int32_t R) {
453 return L.compareTo(int64_t(R)) > 0;
454 }
455 template
456 bool operator==(const PositiveFloat &L, int32_t R) {
457 return L.compareTo(int64_t(R)) == 0;
458 }
459 template
460 bool operator!=(const PositiveFloat &L, int32_t R) {
461 return L.compareTo(int64_t(R)) != 0;
462 }
463 template
464 bool operator<=(const PositiveFloat &L, int32_t R) {
465 return L.compareTo(int64_t(R)) <= 0;
466 }
467 template
468 bool operator>=(const PositiveFloat &L, int32_t R) {
469 return L.compareTo(int64_t(R)) >= 0;
470 }
471
472 template
473 bool operator<(uint64_t L, const PositiveFloat &R) {
474 return R > L;
475 }
476 template
477 bool operator>(uint64_t L, const PositiveFloat &R) {
478 return R < L;
479 }
480 template
481 bool operator==(uint64_t L, const PositiveFloat &R) {
482 return R == L;
483 }
484 template
485 bool operator<=(uint64_t L, const PositiveFloat &R) {
486 return R >= L;
487 }
488 template
489 bool operator>=(uint64_t L, const PositiveFloat &R) {
490 return R <= L;
491 }
492 template
493 bool operator!=(uint64_t L, const PositiveFloat &R) {
494 return R != L;
495 }
496 template
497 bool operator<(int64_t L, const PositiveFloat &R) {
498 return R > L;
499 }
500 template
501 bool operator>(int64_t L, const PositiveFloat &R) {
502 return R < L;
503 }
504 template
505 bool operator==(int64_t L, const PositiveFloat &R) {
506 return R == L;
507 }
508 template
509 bool operator<=(int64_t L, const PositiveFloat &R) {
510 return R >= L;
511 }
512 template
513 bool operator>=(int64_t L, const PositiveFloat &R) {
514 return R <= L;
515 }
516 template
517 bool operator!=(int64_t L, const PositiveFloat &R) {
518 return R != L;
519 }
520 template
521 bool operator<(uint32_t L, const PositiveFloat &R) {
522 return R > L;
523 }
524 template
525 bool operator>(uint32_t L, const PositiveFloat &R) {
526 return R < L;
527 }
528 template
529 bool operator==(uint32_t L, const PositiveFloat &R) {
530 return R == L;
531 }
532 template
533 bool operator<=(uint32_t L, const PositiveFloat &R) {
534 return R >= L;
535 }
536 template
537 bool operator>=(uint32_t L, const PositiveFloat &R) {
538 return R <= L;
539 }
540 template
541 bool operator!=(uint32_t L, const PositiveFloat &R) {
542 return R != L;
543 }
544 template
545 bool operator<(int32_t L, const PositiveFloat &R) {
546 return R > L;
547 }
548 template
549 bool operator>(int32_t L, const PositiveFloat &R) {
550 return R < L;
551 }
552 template
553 bool operator==(int32_t L, const PositiveFloat &R) {
554 return R == L;
555 }
556 template
557 bool operator<=(int32_t L, const PositiveFloat &R) {
558 return R >= L;
559 }
560 template
561 bool operator>=(int32_t L, const PositiveFloat &R) {
562 return R <= L;
563 }
564 template
565 bool operator!=(int32_t L, const PositiveFloat &R) {
566 return R != L;
567 }
568
569 template
570 uint64_t PositiveFloat::scale(uint64_t N) const {
571 if (Width == 64 || N <= DigitsLimits::max())
572 return (getFloat(N) * *this).template toInt();
573 std::pair Lg = lgImpl();
574 if (extractLgFloor(Lg) >= 64)
575 return UINT64_MAX;
576 if (extractLgCeiling(Lg) <= -64)
577 return 0;
578
579 uint64_t Result = 0;
580 for (int Bit = 0; Bit < 64; Bit += Width) {
581 PositiveFloat Digit = getFloat(N & DigitsLimits::max() << Bit);
582 Digit *= *this;
583
584 uint64_t Sum = Result + (Digit.toInt() >> Bit);
585 if (Sum < Result)
586 return UINT64_MAX;
587 Result = Sum;
588 }
589 return Result;
590 }
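// Illustrative sketch (not in the original header): scale() keeps full
// precision even when DigitsT is narrower than 64 bits, so a large 64-bit
// counter can be multiplied by a fractional PositiveFloat without losing the
// high bits.  exampleScaleByThreeQuarters is a made-up name.
inline uint64_t exampleScaleByThreeQuarters(uint64_t N) {
  PositiveFloat<uint32_t> ThreeQuarters =
      PositiveFloat<uint32_t>::getFraction(3, 4);
  return ThreeQuarters.scale(N); // E.g., N == 1000 scales to 750.
}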
591
592 template
593 PositiveFloat PositiveFloat::getProduct(DigitsType L,
594 DigitsType R) {
595 // Check for zero.
596 if (!L || !R)
597 return getZero();
598
599 // Check for numbers that we can compute with 64-bit math.
600 if (Width <= 32)
601 return adjustToWidth(uint64_t(L) * uint64_t(R), 0);
602
603 // Do the full thing.
604 return PositiveFloat(multiply64(L, R));
605 }
606 template
607 PositiveFloat PositiveFloat::getQuotient(DigitsType Dividend,
608 DigitsType Divisor) {
609 // Check for zero.
610 if (!Dividend)
611 return getZero();
612 if (!Divisor)
613 return getLargest();
614
615 if (Width == 64)
616 return PositiveFloat(divide64(Dividend, Divisor));
617
618 // We can compute this with 64-bit math.
619 int Shift = countLeadingZeros64(Dividend);
620 uint64_t Shifted = uint64_t(Dividend) << Shift;
621 uint64_t Quotient = Shifted / Divisor;
622
623 // If Quotient needs to be shifted, then adjustToWidth will round.
624 if (Quotient > DigitsLimits::max())
625 return adjustToWidth(Quotient, -Shift);
626
627 // Round based on the value of the next bit.
628 return getRounded(PositiveFloat(Quotient, -Shift),
629 Shifted % Divisor >= getHalf(Divisor));
630 }
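// Illustrative sketch (not in the original header): getFraction() rounds to
// the nearest value representable with the chosen digit width, so even an
// 8-bit PositiveFloat holds a usable approximation of 1/3.  exampleFraction is
// a made-up name; the sketch assumes <cassert> is reachable.
inline void exampleFraction() {
  PositiveFloat<uint8_t> OneThird = PositiveFloat<uint8_t>::getFraction(1, 3);
  assert(OneThird > PositiveFloat<uint8_t>::getZero());
  assert(OneThird < PositiveFloat<uint8_t>(1, 0) && "strictly less than 1.0");
}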
631
632 template
633 template
634 IntT PositiveFloat::toInt() const {
635 typedef std::numeric_limits Limits;
636 if (*this < 1)
637 return 0;
638 if (*this >= Limits::max())
639 return Limits::max();
640
641 IntT N = Digits;
642 if (Exponent > 0) {
643 assert(size_t(Exponent) < sizeof(IntT) * 8);
644 return N << Exponent;
645 }
646 if (Exponent < 0) {
647 assert(size_t(-Exponent) < sizeof(IntT) * 8);
648 return N >> -Exponent;
649 }
650 return N;
651 }
652
653 template
654 std::pair PositiveFloat::lgImpl() const {
655 if (isZero())
656 return std::make_pair(INT32_MIN, 0);
657
658 // Get the floor of the lg of Digits.
659 int32_t LocalFloor = Width - countLeadingZerosWidth(Digits) - 1;
660
661 // Get the floor of the lg of this.
662 int32_t Floor = Exponent + LocalFloor;
663 if (Digits == UINT64_C(1) << LocalFloor)
664 return std::make_pair(Floor, 0);
665
666 // Round based on the next digit.
667 bool Round = Digits & UINT64_C(1) << (LocalFloor - 1);
668 return std::make_pair(Floor + Round, Round ? 1 : -1);
669 }
670
671 template
672 PositiveFloat
673 PositiveFloat::normalizeExponents(PositiveFloat X) {
674 if (isZero() || X.isZero())
675 return X;
676
677 if (Exponent > X.Exponent) {
678 // Reverse the arguments.
679 *this = X.normalizeExponents(*this);
680 return X;
681 }
682
683 if (Exponent == X.Exponent)
684 return X;
685
686 int ExponentDiff = getDiff(Exponent, X.Exponent);
687 if (ExponentDiff >= 2 * Width) {
688 *this = getZero();
689 return X;
690 }
691
692 // Use up any leading zeros on X, and then shift this.
693 int ShiftX = std::min(countLeadingZerosWidth(X.Digits), ExponentDiff);
694 int ShiftThis = ExponentDiff - ShiftX;
695
696 if (ShiftThis >= Width) {
697 *this = getZero();
698 return X;
699 }
700
701 X.Digits <<= ShiftX;
702 X.Exponent -= ShiftX;
703 Digits >>= ShiftThis;
704 Exponent += ShiftThis;
705 return X;
706 }
707
708 template
709 PositiveFloat &PositiveFloat::
710 operator+=(const PositiveFloat &X) {
711 if (isLargest() || X.isZero())
712 return *this;
713 if (isZero() || X.isLargest())
714 return *this = X;
715
716 // Normalize exponents.
717 PositiveFloat Scaled = normalizeExponents(X);
718
719 // Check for zero again.
720 if (isZero())
721 return *this = Scaled;
722 if (Scaled.isZero())
723 return *this;
724
725 // Compute sum.
726 DigitsType Sum = Digits + Scaled.Digits;
727 bool DidOverflow = Sum < Digits || Sum < Scaled.Digits;
728 Digits = Sum;
729 if (!DidOverflow)
730 return *this;
731
732 if (Exponent == MaxExponent)
733 return *this = getLargest();
734
735 ++Exponent;
736 Digits = Digits >> 1 | UINT64_C(1) << (Width - 1);
737
738 return *this;
739 }
740 template
741 PositiveFloat &PositiveFloat::
742 operator-=(const PositiveFloat &X) {
743 if (X.isZero())
744 return *this;
745 if (*this <= X)
746 return *this = getZero();
747
748 // Normalize exponents.
749 PositiveFloat Scaled = normalizeExponents(X);
750 assert(Digits >= Scaled.Digits);
751
752 // Compute difference.
753 if (!Scaled.isZero()) {
754 Digits -= Scaled.Digits;
755 return *this;
756 }
757
758 // Check if X just barely lost its last bit. E.g., for 32-bit:
759 //
760 // 1*2^32 - 1*2^0 == 0xffffffff != 1*2^32
761 if (*this == PositiveFloat(1, X.lgFloor() + Width)) {
762 Digits = DigitsType(0) - 1;
763 --Exponent;
764 }
765 return *this;
766 }
767 template
768 PositiveFloat &PositiveFloat::
769 operator*=(const PositiveFloat &X) {
770 if (isZero())
771 return *this;
772 if (X.isZero())
773 return *this = X;
774
775 // Save the exponents.
776 int32_t Exponents = int32_t(Exponent) + int32_t(X.Exponent);
777
778 // Get the raw product.
779 *this = getProduct(Digits, X.Digits);
780
781 // Combine with exponents.
782 return *this <<= Exponents;
783 }
784 template
785 PositiveFloat &PositiveFloat::
786 operator/=(const PositiveFloat &X) {
787 if (isZero())
788 return *this;
789 if (X.isZero())
790 return *this = getLargest();
791
792 // Save the exponents.
793 int32_t Exponents = int32_t(Exponent) + -int32_t(X.Exponent);
794
795 // Get the raw quotient.
796 *this = getQuotient(Digits, X.Digits);
797
798 // Combine with exponents.
799 return *this <<= Exponents;
800 }
801 template
802 PositiveFloat &PositiveFloat::shiftLeft(int32_t Shift) {
803 if (Shift < 0)
804 return shiftRight(-Shift);
805 if (!Shift || isZero())
806 return *this;
807
808 // Shift as much as we can in the exponent.
809 int16_t ExponentShift = std::min(Shift, MaxExponent - Exponent);
810 Exponent += ExponentShift;
811 if (ExponentShift == Shift)
812 return *this;
813
814 // Check this late, since it's rare.
815 if (isLargest())
816 return *this;
817
818 // Shift as far as possible.
819 int32_t RawShift = std::min(Shift, countLeadingZerosWidth(Digits));
820 if (RawShift + ExponentShift < Shift)
821 // Saturate.
822 return *this = getLargest();
823
824 Digits <<= Shift;
825 return *this;
826 }
827
828 template
829 PositiveFloat &PositiveFloat::shiftRight(int32_t Shift) {
830 if (Shift < 0)
831 return shiftLeft(-Shift);
832 if (!Shift || isZero())
833 return *this;
834
835 // Shift as much as we can in the exponent.
836 int16_t ExponentShift = std::min(Shift, Exponent - MinExponent);
837 Exponent -= ExponentShift;
838 if (ExponentShift == Shift)
839 return *this;
840
841 // Shift as far as possible.
842 int32_t RawShift = Shift - ExponentShift;
843 if (RawShift >= Width)
844 // Saturate.
845 return *this = getZero();
846
847 // May result in zero.
848 Digits >>= Shift;
849 return *this;
850 }
851
852 template
853 int PositiveFloat::compare(const PositiveFloat &X) const {
854 // Check for zero.
855 if (isZero())
856 return X.isZero() ? 0 : -1;
857 if (X.isZero())
858 return 1;
859
860 // Check for the scale. Use lgFloor to be sure that the exponent difference
861 // is always lower than 64.
862 int32_t lgL = lgFloor(), lgR = X.lgFloor();
863 if (lgL != lgR)
864 return lgL < lgR ? -1 : 1;
865
866 // Compare digits.
867 if (Exponent < X.Exponent)
868 return PositiveFloatBase::compare(Digits, X.Digits, X.Exponent - Exponent);
869
870 return -PositiveFloatBase::compare(X.Digits, Digits, Exponent - X.Exponent);
871 }
872
873 template <class DigitsT> struct isPodLike<PositiveFloat<DigitsT>> {
874 static const bool value = true;
875 };
876 }
877
878 //===----------------------------------------------------------------------===//
879 //
880 // BlockMass definition.
881 //
882 // TODO: Make this private to BlockFrequencyInfoImpl or delete.
883 //
884 //===----------------------------------------------------------------------===//
885 namespace llvm {
886
887 /// \brief Mass of a block.
888 ///
889 /// This class implements a sort of fixed-point fraction always between 0.0 and
890 /// 1.0. getMass() == UINT64_MAX indicates a value of 1.0.
891 ///
892 /// Masses can be added and subtracted. Simple saturation arithmetic is used,
893 /// so arithmetic operations never overflow or underflow.
894 ///
895 /// Masses can be multiplied. Multiplication treats full mass as 1.0 and uses
896 /// an inexpensive floating-point algorithm that's off-by-one (almost, but not
897 /// quite, maximum precision).
898 ///
899 /// Masses can be scaled by \a BranchProbability at maximum precision.
900 class BlockMass {
901 uint64_t Mass;
902
903 public:
904 BlockMass() : Mass(0) {}
905 explicit BlockMass(uint64_t Mass) : Mass(Mass) {}
906
907 static BlockMass getEmpty() { return BlockMass(); }
908 static BlockMass getFull() { return BlockMass(UINT64_MAX); }
909
910 uint64_t getMass() const { return Mass; }
911
912 bool isFull() const { return Mass == UINT64_MAX; }
913 bool isEmpty() const { return !Mass; }
914
915 bool operator!() const { return isEmpty(); }
916
917 /// \brief Add another mass.
918 ///
919 /// Adds another mass, saturating at \a isFull() rather than overflowing.
920 BlockMass &operator+=(const BlockMass &X) {
921 uint64_t Sum = Mass + X.Mass;
922 Mass = Sum < Mass ? UINT64_MAX : Sum;
923 return *this;
924 }
925
926 /// \brief Subtract another mass.
927 ///
928 /// Subtracts another mass, saturating at \a isEmpty() rather than
929 /// underflowing.
930 BlockMass &operator-=(const BlockMass &X) {
931 uint64_t Diff = Mass - X.Mass;
932 Mass = Diff > Mass ? 0 : Diff;
933 return *this;
934 }
935
936 /// \brief Scale by another mass.
937 ///
938 /// The current implementation is a little imprecise, but it's relatively
939 /// fast, never overflows, and maintains the property that 1.0*1.0==1.0
940 /// (where isFull represents the number 1.0). It's an approximation of
941 /// 128-bit multiply that gets right-shifted by 64-bits.
942 ///
943 /// For a given digit size, multiplying two-digit numbers looks like:
944 ///
945 /// U1 . L1
946 /// * U2 . L2
947 /// ============
948 /// 0 . . L1*L2
949 /// + 0 . U1*L2 . 0 // (shift left once by a digit-size)
950 /// + 0 . U2*L1 . 0 // (shift left once by a digit-size)
951 /// + U1*U2 . 0 . 0 // (shift left twice by a digit-size)
952 ///
953 /// BlockMass has 64-bit numbers. Split each into two 32-bit digits, stored
954 /// 64-bit. Add 1 to the lower digits, to model isFull as 1.0; this won't
955 /// overflow, since we have 64-bit storage for each digit.
956 ///
957 /// To do this accurately, (a) multiply into two 64-bit digits, incrementing
958 /// the upper digit on overflows of the lower digit (carry), (b) subtract 1
959 /// from the lower digit, decrementing the upper digit on underflow (carry),
960 /// and (c) truncate the lower digit. For the 1.0*1.0 case, the upper digit
961 /// will be 0 at the end of step (a), and then will underflow back to isFull
962 /// (1.0) in step (b).
963 ///
964 /// Instead, the implementation does something a little faster with a small
965 /// loss of accuracy: ignore the lower 64-bit digit entirely. The loss of
966 /// accuracy is small, since the sum of the unmodelled carries is 0 or 1
967 /// (i.e., step (a) will overflow at most once, and step (b) will underflow
968 /// only if step (a) overflows).
969 ///
970 /// This is the formula we're calculating:
971 ///
972 /// U1.L1 * U2.L2 == U1 * U2 + (U1 * (L2+1))>>32 + (U2 * (L1+1))>>32
973 ///
974 /// As a demonstration of 1.0*1.0, consider two 4-bit numbers that are both
975 /// full (1111).
976 ///
977 /// U1.L1 * U2.L2 == U1 * U2 + (U1 * (L2+1))>>2 + (U2 * (L1+1))>>2
978 /// 11.11 * 11.11 == 11 * 11 + (11 * (11+1))/4 + (11 * (11+1))/4
979 /// == 1001 + (11 * 100)/4 + (11 * 100)/4
980 /// == 1001 + 1100/4 + 1100/4
981 /// == 1001 + 0011 + 0011
982 /// == 1111
983 BlockMass &operator*=(const BlockMass &X) {
984 uint64_t U1 = Mass >> 32, L1 = Mass & UINT32_MAX, U2 = X.Mass >> 32,
985 L2 = X.Mass & UINT32_MAX;
986 Mass = U1 * U2 + (U1 * (L2 + 1) >> 32) + ((L1 + 1) * U2 >> 32);
987 return *this;
988 }
989
990 /// \brief Multiply by a branch probability.
991 ///
992 /// Multiply by P. Guarantees full precision.
993 ///
994 /// This could be naively implemented by multiplying by the numerator and
995 /// dividing by the denominator, but in what order? Multiplying first can
996 /// overflow, while dividing first will lose precision (potentially, changing
997 /// a non-zero mass to zero).
998 ///
999 /// The implementation mixes the two methods. Since \a BranchProbability
1000 /// uses 32-bits and \a BlockMass 64-bits, shift the mass as far to the left
1001 /// as there is room, then divide by the denominator to get a quotient.
1002 /// Multiplying by the numerator and right shifting gives a first
1003 /// approximation.
1004 ///
1005 /// Calculate the error in this first approximation by calculating the
1006 /// opposite mass (multiply by the opposite numerator and shift) and
1007 /// subtracting both from the original mass.
1008 ///
1009 /// Add to the first approximation the correct fraction of this error value.
1010 /// This time, multiply first and then divide, since there is no danger of
1011 /// overflow.
1012 ///
1013 /// \pre P represents a fraction between 0.0 and 1.0.
1014 BlockMass &operator*=(const BranchProbability &P);
1015
1016 bool operator==(const BlockMass &X) const { return Mass == X.Mass; }
1017 bool operator<(const BlockMass &X) const { return Mass < X.Mass; }
1018 bool operator!=(const BlockMass &X) const { return !(*this == X); }
1019 bool operator>(const BlockMass &X) const { return X < *this; }
1020 bool operator<=(const BlockMass &X) const { return !(*this > X); }
1021 bool operator>=(const BlockMass &X) const { return !(*this < X); }
1022
1023 /// \brief Convert to floating point.
1024 ///
1025 /// Convert to a float. \a isFull() gives 1.0, while \a isEmpty() gives
1026 /// slightly above 0.0.
1027 PositiveFloat toFloat() const;
1028
1029 void dump() const;
1030 raw_ostream &print(raw_ostream &OS) const;
1031 };
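// Illustrative sketch (not in the original header): BlockMass saturation and
// the 1.0 * 1.0 == 1.0 property documented above.  exampleBlockMass is a
// made-up name; the sketch assumes <cassert> is reachable.
inline void exampleBlockMass() {
  BlockMass Full = BlockMass::getFull(); // Represents 1.0.
  BlockMass Half(UINT64_MAX / 2);
  Full += Half;                          // Saturates at isFull().
  assert(Full.isFull());
  BlockMass Product = BlockMass::getFull();
  Product *= BlockMass::getFull();       // Stays exactly at 1.0.
  assert(Product.isFull());
}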
1032
1033 inline BlockMass operator+(const BlockMass &L, const BlockMass &R) {
1034 return BlockMass(L) += R;
1035 }
1036 inline BlockMass operator-(const BlockMass &L, const BlockMass &R) {
1037 return BlockMass(L) -= R;
1038 }
1039 inline BlockMass operator*(const BlockMass &L, const BlockMass &R) {
1040 return BlockMass(L) *= R;
1041 }
1042 inline BlockMass operator*(const BlockMass &L, const BranchProbability &R) {
1043 return BlockMass(L) *= R;
1044 }
1045 inline BlockMass operator*(const BranchProbability &L, const BlockMass &R) {
1046 return BlockMass(R) *= L;
1047 }
1048
1049 inline raw_ostream &operator<<(raw_ostream &OS, const BlockMass &X) {
1050 return X.print(OS);
1051 }
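// Illustrative sketch only: BlockMass::operator*=(BranchProbability) is defined
// out of line using the shift/divide scheme documented above.  As a cross-check
// on what it computes, the same full-precision result can be written with a
// 128-bit intermediate where the compiler provides one.  scaleMassNaive is a
// made-up helper, not part of LLVM.
#ifdef __SIZEOF_INT128__
inline BlockMass scaleMassNaive(BlockMass M, const BranchProbability &P) {
  // P is a fraction in [0, 1], so the quotient always fits back in 64 bits.
  unsigned __int128 Wide = M.getMass();
  Wide = Wide * P.getNumerator() / P.getDenominator();
  return BlockMass(uint64_t(Wide));
}
#endif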
1052
1053 template <> struct isPodLike<BlockMass> {
1054 static const bool value = true;
1055 };
1056 }
1057
1058 //===----------------------------------------------------------------------===//
1059 //
1060 // BlockFrequencyInfoImpl definition.
1061 //
1062 //===----------------------------------------------------------------------===//
1063 namespace llvm {
1064
1065 class BasicBlock;
30
106631 class BranchProbabilityInfo;
1067 class Function;
1068 class Loop;
1069 class LoopInfo;
1070 class MachineBasicBlock;
32 class BlockFrequencyInfo;
107133 class MachineBranchProbabilityInfo;
1072 class MachineFunction;
1073 class MachineLoop;
1074 class MachineLoopInfo;
1075
1076 /// \brief Base class for BlockFrequencyInfoImpl
1077 ///
1078 /// BlockFrequencyInfoImplBase has supporting data structures and some
1079 /// algorithms for BlockFrequencyInfoImpl. Only algorithms that depend on
1080 /// the block type (or that call such algorithms) are skipped here.
1081 ///
1082 /// Nevertheless, the majority of the overall algorithm documentation lives with
1083 /// BlockFrequencyInfoImpl. See there for details.
1084 class BlockFrequencyInfoImplBase {
1085 public:
1086 typedef PositiveFloat Float;
1087
1088 /// \brief Representative of a block.
1089 ///
1090 /// This is a simple wrapper around an index into the reverse-post-order
1091 /// traversal of the blocks.
1092 ///
1093 /// Unlike a block pointer, its order has meaning (location in the
1094 /// topological sort) and its class is the same regardless of block type.
1095 struct BlockNode {
1096 typedef uint32_t IndexType;
1097 IndexType Index;
1098
1099 bool operator==(const BlockNode &X) const { return Index == X.Index; }
1100 bool operator!=(const BlockNode &X) const { return Index != X.Index; }
1101 bool operator<=(const BlockNode &X) const { return Index <= X.Index; }
1102 bool operator>=(const BlockNode &X) const { return Index >= X.Index; }
1103 bool operator<(const BlockNode &X) const { return Index < X.Index; }
1104 bool operator>(const BlockNode &X) const { return Index > X.Index; }
1105
1106 BlockNode() : Index(UINT32_MAX) {}
1107 BlockNode(IndexType Index) : Index(Index) {}
1108
1109 bool isValid() const { return Index <= getMaxIndex(); }
1110 static size_t getMaxIndex() { return UINT32_MAX - 1; }
1111 };
1112
1113 /// \brief Stats about a block itself.
1114 struct FrequencyData {
1115 Float Floating;
1116 uint64_t Integer;
1117 };
1118
1119 /// \brief Index of loop information.
1120 struct WorkingData {
1121 BlockNode ContainingLoop; ///< The block whose loop this block is inside.
1122 uint32_t LoopIndex; ///< Index into PackagedLoops.
1123 bool IsPackaged; ///< Has ContainingLoop been packaged up?
1124 bool IsAPackage; ///< Has this block's loop been packaged up?
1125 BlockMass Mass; ///< Mass distribution from the entry block.
1126
1127 WorkingData()
1128 : LoopIndex(UINT32_MAX), IsPackaged(false), IsAPackage(false) {}
1129
1130 bool hasLoopHeader() const { return ContainingLoop.isValid(); }
1131 bool isLoopHeader() const { return LoopIndex != UINT32_MAX; }
1132 };
1133
1134 /// \brief Unscaled probability weight.
1135 ///
1136 /// Probability weight for an edge in the graph (including the
1137 /// successor/target node).
1138 ///
1139 /// All edges in the original function are 32-bit. However, exit edges from
1140 /// loop packages are taken from 64-bit exit masses, so we need 64-bits of
1141 /// space in general.
1142 ///
1143 /// In addition to the raw weight amount, Weight stores the type of the edge
1144 /// in the current context (i.e., the context of the loop being processed).
1145 /// Is this a local edge within the loop, an exit from the loop, or a
1146 /// backedge to the loop header?
1147 struct Weight {
1148 enum DistType { Local, Exit, Backedge };
1149 DistType Type;
1150 BlockNode TargetNode;
1151 uint64_t Amount;
1152 Weight() : Type(Local), Amount(0) {}
1153 };
1154
1155 /// \brief Distribution of unscaled probability weight.
1156 ///
1157 /// Distribution of unscaled probability weight to a set of successors.
1158 ///
1159 /// This class collates the successor edge weights for later processing.
1160 ///
1161 /// \a DidOverflow indicates whether \a Total did overflow while adding to
1162 /// the distribution. It should never overflow twice. There's no flag for
1163 /// whether \a ForwardTotal overflows, since when \a Total exceeds 32-bits
1164 /// they both get re-computed during \a normalize().
1165 struct Distribution {
1166 typedef SmallVector WeightList;
1167 WeightList Weights; ///< Individual successor weights.
1168 uint64_t Total; ///< Sum of all weights.
1169 bool DidOverflow; ///< Whether \a Total did overflow.
1170 uint32_t ForwardTotal; ///< Total excluding backedges.
1171
1172 Distribution() : Total(0), DidOverflow(false), ForwardTotal(0) {}
1173 void addLocal(const BlockNode &Node, uint64_t Amount) {
1174 add(Node, Amount, Weight::Local);
1175 }
1176 void addExit(const BlockNode &Node, uint64_t Amount) {
1177 add(Node, Amount, Weight::Exit);
1178 }
1179 void addBackedge(const BlockNode &Node, uint64_t Amount) {
1180 add(Node, Amount, Weight::Backedge);
1181 }
1182
1183 /// \brief Normalize the distribution.
1184 ///
1185 /// Combines multiple edges to the same \a Weight::TargetNode and scales
1186 /// down so that \a Total fits into 32-bits.
1187 ///
1188 /// This is linear in the size of \a Weights. For the vast majority of
1189 /// cases, adjacent edge weights are combined by sorting WeightList and
1190 /// combining adjacent weights. However, for very large edge lists an
1191 /// auxiliary hash table is used.
1192 void normalize();
1193
1194 private:
1195 void add(const BlockNode &Node, uint64_t Amount, Weight::DistType Type);
1196 };
1197
1198 /// \brief Data for a packaged loop.
1199 ///
1200 /// Contains the data necessary to represent a loop as a node once
1201 /// it's packaged.
1202 ///
1203 /// PackagedLoopData inherits from BlockData to give the node the necessary
1204 /// stats. Further, it has a list of successors, list of members, and stores
1205 /// the backedge mass assigned to this loop.
1206 struct PackagedLoopData {
1207 typedef SmallVector<std::pair<BlockNode, BlockMass>, 4> ExitMap;
1208 typedef SmallVector MemberList;
1209 BlockNode Header; ///< Header.
1210 ExitMap Exits; ///< Successor edges (and weights).
1211 MemberList Members; ///< Members of the loop.
1212 BlockMass BackedgeMass; ///< Mass returned to loop header.
1213 BlockMass Mass;
1214 Float Scale;
1215
1216 PackagedLoopData(const BlockNode &Header) : Header(Header) {}
1217 };
1218
1219 /// \brief Data about each block. This is used downstream.
1220 std::vector Freqs;
1221
1222 /// \brief Loop data: see initializeLoops().
1223 std::vector Working;
1224
1225 /// \brief Indexed information about packaged loops.
1226 std::vector PackagedLoops;
1227
1228 /// \brief Create the initial loop packages.
1229 ///
1230 /// Initializes PackagedLoops using the data in Working about backedges
1231 /// and containing loops. Called by initializeLoops().
1232 ///
1233 /// \post WorkingData::LoopIndex has been initialized for every loop header
1234 /// and PackagedLoopData::Members has been initialized.
1235
1236 /// \brief Add all edges out of a packaged loop to the distribution.
1237 ///
1238 /// Adds all edges from LocalLoopHead to Dist. Calls addToDist() to add each
1239 /// successor edge.
1240 void addLoopSuccessorsToDist(const BlockNode &LoopHead,
1241 const BlockNode &LocalLoopHead,
1242 Distribution &Dist);
1243
1244 /// \brief Add an edge to the distribution.
1245 ///
1246 /// Adds an edge to Succ to Dist. If \c LoopHead.isValid(), then whether the
1247 /// edge is forward/exit/backedge is in the context of LoopHead. Otherwise,
1248 /// every edge should be a forward edge (since all the loops are packaged
1249 /// up).
1250 void addToDist(Distribution &Dist, const BlockNode &LoopHead,
1251 const BlockNode &Pred, const BlockNode &Succ, uint64_t Weight);
1252
1253 PackagedLoopData &getLoopPackage(const BlockNode &Head) {
1254 assert(Head.Index < Working.size());
1255 size_t Index = Working[Head.Index].LoopIndex;
1256 assert(Index < PackagedLoops.size());
1257 return PackagedLoops[Index];
1258 }
1259
1260 /// \brief Distribute mass according to a distribution.
1261 ///
1262 /// Distributes the mass in Source according to Dist. If LoopHead.isValid(),
1263 /// backedges and exits are stored in its entry in PackagedLoops.
1264 ///
1265 /// Mass is distributed in parallel from two copies of the source mass.
1266 ///
1267 /// The first mass (forward) represents the distribution of mass through the
1268 /// local DAG. This distribution should lose mass at loop exits and ignore
1269 /// backedges.
1270 ///
1271 /// The second mass (general) represents the behavior of the loop in the
1272 /// global context. In a given distribution from the head, how much mass
1273 /// exits, and to where? How much mass returns to the loop head?
1274 ///
1275 /// The forward mass should be split up between local successors and exits,
1276 /// but only actually distributed to the local successors. The general mass
1277 /// should be split up between all three types of successors, but distributed
1278 /// only to exits and backedges.
1279 void distributeMass(const BlockNode &Source, const BlockNode &LoopHead,
1280 Distribution &Dist);
1281
1282 /// \brief Compute the loop scale for a loop.
1283 void computeLoopScale(const BlockNode &LoopHead);
1284
1285 /// \brief Package up a loop.
1286 void packageLoop(const BlockNode &LoopHead);
1287
1288 /// \brief Finalize frequency metrics.
1289 ///
1290 /// Unwraps loop packages, calculates final frequencies, and cleans up
1291 /// no-longer-needed data structures.
1292 void finalizeMetrics();
1293
1294 /// \brief Clear all memory.
1295 void clear();
1296
1297 virtual std::string getBlockName(const BlockNode &Node) const;
1298
1299 virtual raw_ostream &print(raw_ostream &OS) const { return OS; }
1300 void dump() const { print(dbgs()); }
1301
1302 Float getFloatingBlockFreq(const BlockNode &Node) const;
1303
1304 BlockFrequency getBlockFreq(const BlockNode &Node) const;
1305
1306 raw_ostream &printBlockFreq(raw_ostream &OS, const BlockNode &Node) const;
1307 raw_ostream &printBlockFreq(raw_ostream &OS,
1308 const BlockFrequency &Freq) const;
1309
1310 uint64_t getEntryFreq() const {
1311 assert(!Freqs.empty());
1312 return Freqs[0].Integer;
1313 }
1314 /// \brief Virtual destructor.
1315 ///
1316 /// Need a virtual destructor to mask the compiler warning about
1317 /// getBlockName().
1318 virtual ~BlockFrequencyInfoImplBase() {}
1319 };
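// Illustrative sketch (not in the original header): how a weight distribution
// for a single node might be assembled and normalized using the public nested
// types above.  The real callers are addToDist() and addLoopSuccessorsToDist();
// exampleDistribution and the node indices are made up.
inline void exampleDistribution() {
  typedef BlockFrequencyInfoImplBase::BlockNode Node;
  BlockFrequencyInfoImplBase::Distribution Dist;
  Dist.addLocal(Node(1), 3);    // Forward edge inside the current loop.
  Dist.addExit(Node(7), 1);     // Edge leaving the loop.
  Dist.addBackedge(Node(0), 4); // Edge back to the loop header.
  Dist.normalize();             // Coalesce duplicates; scale Total to 32 bits.
}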
34 class MachineBlockFrequencyInfo;
132035
132136 namespace bfi_detail {
132237 template <class BlockT> struct TypeMap {};
template <> struct TypeMap<BasicBlock> {
132439 typedef BasicBlock BlockT;
132540 typedef Function FunctionT;
132641 typedef BranchProbabilityInfo BranchProbabilityInfoT;
1327 typedef Loop LoopT;
1328 typedef LoopInfo LoopInfoT;
132942 };
133043 template <> struct TypeMap<MachineBasicBlock> {
133144 typedef MachineBasicBlock BlockT;
133245 typedef MachineFunction FunctionT;
133346 typedef MachineBranchProbabilityInfo BranchProbabilityInfoT;
1334 typedef MachineLoop LoopT;
1335 typedef MachineLoopInfo LoopInfoT;
133647 };
1337
1338 /// \brief Get the name of a MachineBasicBlock.
1339 ///
1340 /// Get the name of a MachineBasicBlock. It's templated so that including from
1341 /// CodeGen is unnecessary (that would be a layering issue).
1342 ///
1343 /// This is used mainly for debug output. The name is similar to
1344 /// MachineBasicBlock::getFullName(), but skips the name of the function.
1345 template std::string getBlockName(const BlockT *BB) {
1346 assert(BB && "Unexpected nullptr");
1347 if (BB->getBasicBlock())
1348 return BB->getName().str();
1349 return (Twine("BB") + Twine(BB->getNumber())).str();
135048 }
1351 /// \brief Get the name of a BasicBlock.
1352 template <> inline std::string getBlockName(const BasicBlock *BB) {
1353 assert(BB && "Unexpected nullptr");
1354 return BB->getName().str();
1355 }
1356 }
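// Illustrative compile-time check (not in the original header): the TypeMap
// specializations above are what let one implementation serve both IR and
// Machine IR.  Assumes <type_traits> is reachable from this header.
static_assert(std::is_same<bfi_detail::TypeMap<BasicBlock>::FunctionT,
                           Function>::value,
              "IR TypeMap should pair BasicBlock with Function");
static_assert(std::is_same<bfi_detail::TypeMap<MachineBasicBlock>::LoopT,
                           MachineLoop>::value,
              "Machine TypeMap should pair MachineBasicBlock with MachineLoop");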
1357
1358 /// \brief Shared implementation for block frequency analysis.
1359 ///
1360 /// This is a shared implementation of BlockFrequencyInfo and
1361 /// MachineBlockFrequencyInfo, and calculates the relative frequencies of
1362 /// blocks.
1363 ///
1364 /// This algorithm leverages BlockMass and PositiveFloat to maintain precision,
1365 /// separates mass distribution from loop scaling, and dithers to eliminate
1366 /// probability mass loss.
1367 ///
1368 /// The implementation is split between BlockFrequencyInfoImpl, which knows the
1369 /// type of graph being modelled (BasicBlock vs. MachineBasicBlock), and
1370 /// BlockFrequencyInfoImplBase, which doesn't. The base class uses \a
1371 /// BlockNode, a wrapper around a uint32_t. BlockNode is numbered from 0 in
1372 /// reverse-post order. This gives two advantages: it's easy to compare the
1373 /// relative ordering of two nodes, and maps keyed on BlockT can be represented
1374 /// by vectors.
1375 ///
1376 /// This algorithm is O(V+E), unless there is irreducible control flow, in
1377 /// which case it's O(V*E) in the worst case.
1378 ///
1379 /// These are the main stages:
1380 ///
1381 /// 0. Reverse post-order traversal (\a initializeRPOT()).
1382 ///
1383 /// Run a single post-order traversal and save it (in reverse) in RPOT.
1384 /// All other stages make use of this ordering. Save a lookup from BlockT
1385 /// to BlockNode (the index into RPOT) in Nodes.
1386 ///
1387 /// 1. Loop indexing (\a initializeLoops()).
1388 ///
1389 /// Translate LoopInfo/MachineLoopInfo into a form suitable for the rest of
1390 /// the algorithm. In particular, store the immediate members of each loop
1391 /// in reverse post-order.
1392 ///
1393 /// 2. Calculate mass and scale in loops (\a computeMassInLoops()).
1394 ///
1395 /// For each loop (bottom-up), distribute mass through the DAG resulting
1396 /// from ignoring backedges and treating sub-loops as a single pseudo-node.
1397 /// Track the backedge mass distributed to the loop header, and use it to
1398 /// calculate the loop scale (number of loop iterations).
1399 ///
1400 /// Visiting loops bottom-up is a post-order traversal of loop headers.
1401 /// For each loop, immediate members that represent sub-loops will already
1402 /// have been visited and packaged into a pseudo-node.
1403 ///
1404 /// Distributing mass in a loop is a reverse-post-order traversal through
1405 /// the loop. Start by assigning full mass to the Loop header. For each
1406 /// node in the loop:
1407 ///
1408 /// - Fetch and categorize the weight distribution for its successors.
1409 /// If this is a packaged-subloop, the weight distribution is stored
1410 /// in \a PackagedLoopData::Exits. Otherwise, fetch it from
1411 /// BranchProbabilityInfo.
1412 ///
1413 /// - Each successor is categorized as \a Weight::Local, a normal
1414 /// forward edge within the current loop, \a Weight::Backedge, a
1415 /// backedge to the loop header, or \a Weight::Exit, any successor
1416 /// outside the loop. The weight, the successor, and its category
1417 /// are stored in \a Distribution. There can be multiple edges to
1418 /// each successor.
1419 ///
1420 /// - Normalize the distribution: scale weights down so that their sum
1421 /// is 32-bits, and coalesce multiple edges to the same node.
1422 ///
1423 /// - Distribute the mass accordingly, dithering to minimize mass loss,
1424 /// as described in \a distributeMass(). Mass is distributed in
1425 /// parallel in two ways: forward, and general. Local successors
1426 /// take their mass from the forward mass, while exit and backedge
1427 /// successors take their mass from the general mass. Additionally,
1428 /// exit edges use up (ignored) mass from the forward mass, and local
1429 /// edges use up (ignored) mass from the general distribution.
1430 ///
1431 /// Finally, calculate the loop scale from the accumulated backedge mass.
1432 ///
1433 /// 3. Distribute mass in the function (\a computeMassInFunction()).
1434 ///
1435 /// Finally, distribute mass through the DAG resulting from packaging all
1436 /// loops in the function. This uses the same algorithm as distributing
1437 /// mass in a loop, except that there are no exit or backedge edges.
1438 ///
1439 /// 4. Loop unpackaging and cleanup (\a finalizeMetrics()).
1440 ///
1441 /// Initialize the frequency to a floating point representation of its
1442 /// mass.
1443 ///
1444 /// Visit loops top-down (reverse post-order), scaling the loop header's
1445 /// frequency by its pseudo-node's mass and loop scale. Keep track of the
1446 /// minimum and maximum final frequencies.
1447 ///
1448 /// Using the min and max frequencies as a guide, translate floating point
1449 /// frequencies to an appropriate range in uint64_t.
1450 ///
1451 /// It has some known flaws.
1452 ///
1453 /// - Irreducible control flow isn't modelled correctly. In particular,
1454 /// LoopInfo and MachineLoopInfo ignore irreducible backedges. The main
1455 /// result is that irreducible SCCs will be under-scaled. No mass is lost,
1456 /// but the computed branch weights for the loop pseudo-node will be
1457 /// incorrect.
1458 ///
1459 /// Modelling irreducible control flow exactly involves setting up and
1460 /// solving a group of infinite geometric series. Such precision is
1461 /// unlikely to be worthwhile, since most of our algorithms give up on
1462 /// irreducible control flow anyway.
1463 ///
1464 /// Nevertheless, we might find that we need to get closer. If
1465 /// LoopInfo/MachineLoopInfo flags loops with irreducible control flow
1466 /// (and/or the function as a whole), we can find the SCCs, compute an
1467 /// approximate exit frequency for the SCC as a whole, and scale up
1468 /// accordingly.
1469 ///
1470 /// - Loop scale is limited to 4096 per loop (2^12) to avoid exhausting
1471 /// BlockFrequency's 64-bit integer precision.
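// Illustrative sketch (not in the original header, and deliberately simplified
// from stage 0 above): number the blocks of a function in reverse post-order so
// that BlockNode indices respect the topological order of the acyclic skeleton.
// initializeRPOT() is the real implementation; this shows the idea for the IR
// case only, assumes GraphTraits for Function (llvm/IR/CFG.h) is available, and
// exampleNumberBlocksInRPO is a made-up name.
inline void exampleNumberBlocksInRPO(const Function &F,
                                     DenseMap<const BasicBlock *, uint32_t> &Index) {
  ReversePostOrderTraversal<const Function *> RPOT(&F);
  uint32_t I = 0;
  for (const BasicBlock *BB : RPOT)
    Index[BB] = I++; // Entry gets 0; predecessors come before successors,
                     // except across backedges.
}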
1472 template class BlockFrequencyInfoImpl : BlockFrequencyInfoImplBase {
49
50 /// BlockFrequencyInfoImpl implements the block frequency algorithm for IR and
51 /// Machine Instructions. The algorithm starts with the value ENTRY_FREQ
52 /// for the entry block and then propagates frequencies using branch weights
53 /// from (Machine)BranchProbabilityInfo. LoopInfo is not required because the
54 /// algorithm can find "backedges" by itself.
55 template
56 class BlockFrequencyInfoImpl {
147357 typedef typename bfi_detail::TypeMap::BlockT BlockT;
147458 typedef typename bfi_detail::TypeMap::FunctionT FunctionT;
147559 typedef typename bfi_detail::TypeMap::BranchProbabilityInfoT
147660 BranchProbabilityInfoT;
1477 typedef typename bfi_detail::TypeMap::LoopT LoopT;
1478 typedef typename bfi_detail::TypeMap::LoopInfoT LoopInfoT;
1479
1480 typedef GraphTraits Successor;
1481 typedef GraphTraits> Predecessor;
1482
1483 const BranchProbabilityInfoT *BPI;
1484 const LoopInfoT *LI;
1485 const FunctionT *F;
1486
1487 // All blocks in reverse postorder.
1488 std::vector RPOT;
1489 DenseMap Nodes;
1490
1491 typedef typename std::vector::const_iterator rpot_iterator;
1492
1493 rpot_iterator rpot_begin() const { return RPOT.begin(); }
1494 rpot_iterator rpot_end() const { return RPOT.end(); }
1495
1496 size_t getIndex(const rpot_iterator &I) const { return I - rpot_begin(); }
1497
1498 BlockNode getNode(const rpot_iterator &I) const {
1499 return BlockNode(getIndex(I));
1500 }
1501 BlockNode getNode(const BlockT *BB) const { return Nodes.lookup(BB); }
1502
1503 const BlockT *getBlock(const BlockNode &Node) const {
1504 return RPOT[Node.Index];
1505 }
1506
1507 void initializeRPOT();
1508 void initializeLoops();
1509 void runOnFunction(const FunctionT *F);
1510
1511 void propagateMassToSuccessors(const BlockNode &LoopHead,
1512 const BlockNode &Node);
1513 void computeMassInLoops();
1514 void computeMassInLoop(const BlockNode &LoopHead);
1515 void computeMassInFunction();
1516
1517 std::string getBlockName(const BlockNode &Node) const override {
1518 return bfi_detail::getBlockName(getBlock(Node));
61
62 DenseMap Freqs;
63
64 BranchProbabilityInfoT *BPI;
65
66 FunctionT *Fn;
67
68 typedef GraphTraits< Inverse > GT;
69
70 static const uint64_t EntryFreq = 1 << 14;
71
72 std::string getBlockName(BasicBlock *BB) const {
73 return BB->getName().str();
74 }
75
76 std::string getBlockName(MachineBasicBlock *MBB) const {
77 std::string str;
78 raw_string_ostream ss(str);
79 ss << "BB#" << MBB->getNumber();
80
81 if (const BasicBlock *BB = MBB->getBasicBlock())
82 ss << " derived from LLVM BB " << BB->getName();
83
84 return ss.str();
85 }
86
87 void setBlockFreq(BlockT *BB, BlockFrequency Freq) {
88 Freqs[BB] = Freq;
89 DEBUG(dbgs() << "Frequency(" << getBlockName(BB) << ") = ";
90 printBlockFreq(dbgs(), Freq) << "\n");
91 }
92
93 /// getEdgeFreq - Return edge frequency based on SRC frequency and Src -> Dst
94 /// edge probability.
95 BlockFrequency getEdgeFreq(BlockT *Src, BlockT *Dst) const {
96 BranchProbability Prob = BPI->getEdgeProbability(Src, Dst);
97 return getBlockFreq(Src) * Prob;
98 }
99
100 /// incBlockFreq - Increase BB block frequency by FREQ.
101 ///
102 void incBlockFreq(BlockT *BB, BlockFrequency Freq) {
103 Freqs[BB] += Freq;
104 DEBUG(dbgs() << "Frequency(" << getBlockName(BB) << ") += ";
105 printBlockFreq(dbgs(), Freq) << " --> ";
106 printBlockFreq(dbgs(), Freqs[BB]) << "\n");
107 }
108
109 // All blocks in postorder.
110 std::vector POT;
111
112 // Map Block -> Position in reverse-postorder list.
113 DenseMap RPO;
114
115 // For each loop header, record the per-iteration probability of exiting the
116 // loop. This is the reciprocal of the expected number of loop iterations.
117 typedef DenseMap LoopExitProbMap;
118 LoopExitProbMap LoopExitProb;
119
120 // (reverse-)postorder traversal iterators.
121 typedef typename std::vector::iterator pot_iterator;
122 typedef typename std::vector::reverse_iterator rpot_iterator;
123
124 pot_iterator pot_begin() { return POT.begin(); }
125 pot_iterator pot_end() { return POT.end(); }
126
127 rpot_iterator rpot_begin() { return POT.rbegin(); }
128 rpot_iterator rpot_end() { return POT.rend(); }
129
130 rpot_iterator rpot_at(BlockT *BB) {
131 rpot_iterator I = rpot_begin();
132 unsigned idx = RPO.lookup(BB);
133 assert(idx);
134 std::advance(I, idx - 1);
135
136 assert(*I == BB);
137 return I;
138 }
139
140 /// isBackedge - Return if edge Src -> Dst is a reachable backedge.
141 ///
142 bool isBackedge(BlockT *Src, BlockT *Dst) const {
143 unsigned a = RPO.lookup(Src);
144 if (!a)
145 return false;
146 unsigned b = RPO.lookup(Dst);
147 assert(b && "Destination block should be reachable");
148 return a >= b;
149 }
150
151 /// getSingleBlockPred - return the single predecessor of BB, or NULL if
152 /// BB has zero or multiple predecessors.
153 BlockT *getSingleBlockPred(BlockT *BB) {
154 typename GT::ChildIteratorType
155 PI = GraphTraits< Inverse<BlockT *> >::child_begin(BB),
156 PE = GraphTraits< Inverse<BlockT *> >::child_end(BB);
157
158 if (PI == PE)
159 return nullptr;
160
161 BlockT *Pred = *PI;
162
163 ++PI;
164 if (PI != PE)
165 return nullptr;
166
167 return Pred;
168 }
169
170 void doBlock(BlockT *BB, BlockT *LoopHead,
171 SmallPtrSet<BlockT *, 8> &BlocksInLoop) {
172
173 DEBUG(dbgs() << "doBlock(" << getBlockName(BB) << ")\n");
174 setBlockFreq(BB, 0);
175
176 if (BB == LoopHead) {
177 setBlockFreq(BB, EntryFreq);
178 return;
179 }
180
181 if (BlockT *Pred = getSingleBlockPred(BB)) {
182 if (BlocksInLoop.count(Pred))
183 setBlockFreq(BB, getEdgeFreq(Pred, BB));
184 // TODO: else? irreducible, ignore it for now.
185 return;
186 }
187
188 bool isInLoop = false;
189 bool isLoopHead = false;
190
191 for (typename GT::ChildIteratorType
192 PI = GraphTraits< Inverse<BlockT *> >::child_begin(BB),
193 PE = GraphTraits< Inverse<BlockT *> >::child_end(BB);
194 PI != PE; ++PI) {
195 BlockT *Pred = *PI;
196
197 if (isBackedge(Pred, BB)) {
198 isLoopHead = true;
199 } else if (BlocksInLoop.count(Pred)) {
200 incBlockFreq(BB, getEdgeFreq(Pred, BB));
201 isInLoop = true;
202 }
203 // TODO: else? irreducible.
204 }
205
206 if (!isInLoop)
207 return;
208
209 if (!isLoopHead)
210 return;
211
212 // This block is a loop header, so boost its frequency by the expected
213 // number of loop iterations. The loop blocks will be revisited so they all
214 // get this boost.
215 typename LoopExitProbMap::const_iterator I = LoopExitProb.find(BB);
216 assert(I != LoopExitProb.end() && "Loop header missing from table");
217 Freqs[BB] /= I->second;
218 DEBUG(dbgs() << "Loop header scaled to ";
219 printBlockFreq(dbgs(), Freqs[BB]) << ".\n");
220 }
221
222 /// doLoop - Propagate block frequency down through the loop.
223 void doLoop(BlockT *Head, BlockT *Tail) {
224 DEBUG(dbgs() << "doLoop(" << getBlockName(Head) << ", "
225 << getBlockName(Tail) << ")\n");
226
227 SmallPtrSet<BlockT *, 8> BlocksInLoop;
228
229 for (rpot_iterator I = rpot_at(Head), E = rpot_at(Tail); ; ++I) {
230 BlockT *BB = *I;
231 doBlock(BB, Head, BlocksInLoop);
232
233 BlocksInLoop.insert(BB);
234 if (I == E)
235 break;
236 }
237
238 // Compute loop's cyclic probability using backedges probabilities.
239 BlockFrequency BackFreq;
240 for (typename GT::ChildIteratorType
241 PI = GraphTraits< Inverse<BlockT *> >::child_begin(Head),
242 PE = GraphTraits< Inverse<BlockT *> >::child_end(Head);
243 PI != PE; ++PI) {
244 BlockT *Pred = *PI;
245 assert(Pred);
246 if (isBackedge(Pred, Head))
247 BackFreq += getEdgeFreq(Pred, Head);
248 }
249
250 // The cyclic probability is freq(BackEdges) / freq(Head), where freq(Head)
251 // only counts edges entering the loop, not the loop backedges.
252 // The probability of leaving the loop on each iteration is:
253 //
254 // ExitProb = 1 - CyclicProb
255 //
256 // The Expected number of loop iterations is:
257 //
258 // Iterations = 1 / ExitProb
259 //
260 uint64_t D = std::max(getBlockFreq(Head).getFrequency(), UINT64_C(1));
261 uint64_t N = std::max(BackFreq.getFrequency(), UINT64_C(1));
262 if (N < D)
263 N = D - N;
264 else
265 // We'd expect N < D, but rounding and saturation means that can't be
266 // guaranteed.
267 N = 1;
268
269 // Now ExitProb = N / D, make sure it fits in an i32/i32 fraction.
270 assert(N <= D);
271 if (D > UINT32_MAX) {
272 unsigned Shift = 32 - countLeadingZeros(D);
273 D >>= Shift;
274 N >>= Shift;
275 if (N == 0)
276 N = 1;
277 }
278 BranchProbability LEP = BranchProbability(N, D);
279 LoopExitProb.insert(std::make_pair(Head, LEP));
280 DEBUG(dbgs() << "LoopExitProb[" << getBlockName(Head) << "] = " << LEP
281 << " from 1 - ";
282 printBlockFreq(dbgs(), BackFreq) << " / ";
283 printBlockFreq(dbgs(), getBlockFreq(Head)) << ".\n");
284 }
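
[Editor's note: illustrative sketch, not part of this patch.] The arithmetic at the end of doLoop() reduces to ExitProb = (freq(Head) - freq(BackEdges)) / freq(Head) and an expected trip count of 1 / ExitProb. The hypothetical helper below repeats that clamping with plain integers; for a header frequency of 16384 and a combined backedge frequency of 12288 it yields {4096, 16384}, i.e. an exit probability of 1/4 and roughly 4 iterations.

#include <algorithm>
#include <cstdint>
#include <utility>

// Numerator/denominator of the per-iteration exit probability, clamped the
// same way as doLoop(): both inputs are forced to at least 1, and N falls
// back to 1 when rounding or saturation makes BackFreq >= HeadFreq.
static std::pair<uint64_t, uint64_t> exitProbFraction(uint64_t HeadFreq,
                                                      uint64_t BackFreq) {
  uint64_t D = std::max(HeadFreq, UINT64_C(1));
  uint64_t N = std::max(BackFreq, UINT64_C(1));
  N = N < D ? D - N : 1;
  return std::make_pair(N, D);
}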
285
286 friend class BlockFrequencyInfo;
287 friend class MachineBlockFrequencyInfo;
288
289 BlockFrequencyInfoImpl() { }
290
291 void doFunction(FunctionT *fn, BranchProbabilityInfoT *bpi) {
292 Fn = fn;
293 BPI = bpi;
294
295 // Clear everything.
296 RPO.clear();
297 POT.clear();
298 LoopExitProb.clear();
299 Freqs.clear();
300
301 BlockT *EntryBlock = fn->begin();
302
303 std::copy(po_begin(EntryBlock), po_end(EntryBlock), std::back_inserter(POT));
304
305 unsigned RPOidx = 0;
306 for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) {
307 BlockT *BB = *I;
308 RPO[BB] = ++RPOidx;
309 DEBUG(dbgs() << "RPO[" << getBlockName(BB) << "] = " << RPO[BB] << "\n");
310 }
311
312 // Travel over all blocks in postorder.
313 for (pot_iterator I = pot_begin(), E = pot_end(); I != E; ++I) {
314 BlockT *BB = *I;
315 BlockT *LastTail = nullptr;
316 DEBUG(dbgs() << "POT: " << getBlockName(BB) << "\n");
317
318 for (typename GT::ChildIteratorType
319 PI = GraphTraits< Inverse<BlockT *> >::child_begin(BB),
320 PE = GraphTraits< Inverse<BlockT *> >::child_end(BB);
321 PI != PE; ++PI) {
322
323 BlockT *Pred = *PI;
324 if (isBackedge(Pred, BB) && (!LastTail || RPO[Pred] > RPO[LastTail]))
325 LastTail = Pred;
326 }
327
328 if (LastTail)
329 doLoop(BB, LastTail);
330 }
331
332 // At the end, treat the whole function as a loop, and traverse it once
333 // again.
334 doLoop(*(rpot_begin()), *(pot_begin()));
1519335 }
1520336
1521337 public:
1522 const FunctionT *getFunction() const { return F; }
1523
1524 void doFunction(const FunctionT *F, const BranchProbabilityInfoT *BPI,
1525 const LoopInfoT *LI);
1526 BlockFrequencyInfoImpl() : BPI(0), LI(0), F(0) {}
1527
1528 using BlockFrequencyInfoImplBase::getEntryFreq;
338
339 uint64_t getEntryFreq() { return EntryFreq; }
340
341 /// getBlockFreq - Return block frequency. Return 0 if we don't have it.
1529342 BlockFrequency getBlockFreq(const BlockT *BB) const {
1530 return BlockFrequencyInfoImplBase::getBlockFreq(getNode(BB));
1531 }
1532 Float getFloatingBlockFreq(const BlockT *BB) const {
1533 return BlockFrequencyInfoImplBase::getFloatingBlockFreq(getNode(BB));
1534 }
1535
1536 /// \brief Print the frequencies for the current function.
1537 ///
1538 /// Prints the frequencies for the blocks in the current function.
1539 ///
1540 /// Blocks are printed in the natural iteration order of the function, rather
1541 /// than reverse post-order. This provides two advantages: writing -analyze
1542 /// tests is easier (since blocks come out in source order), and even
1543 /// unreachable blocks are printed.
1544 raw_ostream &print(raw_ostream &OS) const override;
1545 using BlockFrequencyInfoImplBase::dump;
1546
1547 using BlockFrequencyInfoImplBase::printBlockFreq;
1548 raw_ostream &printBlockFreq(raw_ostream &OS, const BlockT *BB) const {
1549 return BlockFrequencyInfoImplBase::printBlockFreq(OS, getNode(BB));
1550 }
343 typename DenseMap<const BlockT *, BlockFrequency>::const_iterator
344 I = Freqs.find(BB);
345 if (I != Freqs.end())
346 return I->second;
347 return 0;
348 }
349
350 void print(raw_ostream &OS) const {
351 OS << "\n\n---- Block Freqs ----\n";
352 for (typename FunctionT::iterator I = Fn->begin(), E = Fn->end(); I != E;) {
353 BlockT *BB = I++;
354 OS << " " << getBlockName(BB) << " = ";
355 printBlockFreq(OS, getBlockFreq(BB)) << "\n";
356
357 for (typename GraphTraits<BlockT *>::ChildIteratorType
358 SI = GraphTraits<BlockT *>::child_begin(BB),
359 SE = GraphTraits<BlockT *>::child_end(BB); SI != SE; ++SI) {
360 BlockT *Succ = *SI;
361 OS << " " << getBlockName(BB) << " -> " << getBlockName(Succ)
362 << " = "; printBlockFreq(OS, getEdgeFreq(BB, Succ)) << "\n";
363 }
364 }
365 }
366
367 void dump() const {
368 print(dbgs());
369 }
370
371 // Utility method that looks up the block frequency associated with BB and
372 // prints it to OS.
373 raw_ostream &printBlockFreq(raw_ostream &OS,
374 const BlockT *BB) {
375 return printBlockFreq(OS, getBlockFreq(BB));
376 }
377
378 raw_ostream &printBlockFreq(raw_ostream &OS,
379 const BlockFrequency &Freq) const {
380 // Convert fixed-point number to decimal.
381 uint64_t Frequency = Freq.getFrequency();
382 OS << Frequency / EntryFreq << ".";
383 uint64_t Rem = Frequency % EntryFreq;
384 uint64_t Eps = 1;
385 do {
386 Rem *= 10;
387 Eps *= 10;
388 OS << Rem / EntryFreq;
389 Rem = Rem % EntryFreq;
390 } while (Rem >= Eps/2);
391 return OS;
392 }
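
[Editor's note: illustrative sketch, not part of this patch.] printBlockFreq() above prints a fixed-point value whose unit is EntryFreq (1 << 14). A standalone version of the same digit loop, assuming the same stopping rule:

#include <cstdint>
#include <string>

static std::string freqToDecimal(uint64_t Frequency, uint64_t EntryFreq) {
  std::string S = std::to_string(Frequency / EntryFreq) + ".";
  uint64_t Rem = Frequency % EntryFreq;
  uint64_t Eps = 1;
  do {
    Rem *= 10;
    Eps *= 10;
    S += char('0' + Rem / EntryFreq); // next decimal digit
    Rem %= EntryFreq;
  } while (Rem >= Eps / 2);
  return S; // freqToDecimal(24576, 1 << 14) == "1.5"
}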
393
1551394 };
1552395
1553 template <class BT>
1554 void BlockFrequencyInfoImpl<BT>::doFunction(const FunctionT *F,
1555 const BranchProbabilityInfoT *BPI,
1556 const LoopInfoT *LI) {
1557 // Save the parameters.
1558 this->BPI = BPI;
1559 this->LI = LI;
1560 this->F = F;
1561
1562 // Clean up left-over data structures.
1563 BlockFrequencyInfoImplBase::clear();
1564 RPOT.clear();
1565 Nodes.clear();
1566
1567 // Initialize.
1568 DEBUG(dbgs() << "\nblock-frequency: " << F->getName() << "\n================="
1569 << std::string(F->getName().size(), '=') << "\n");
1570 initializeRPOT();
1571 initializeLoops();
1572
1573 // Visit loops in post-order to find the local mass distribution, and then do
1574 // the full function.
1575 computeMassInLoops();
1576 computeMassInFunction();
1577 finalizeMetrics();
1578396 }
1579397
1580 template <class BT> void BlockFrequencyInfoImpl<BT>::initializeRPOT() {
1581 const BlockT *Entry = F->begin();
1582 RPOT.reserve(F->size());
1583 std::copy(po_begin(Entry), po_end(Entry), std::back_inserter(RPOT));
1584 std::reverse(RPOT.begin(), RPOT.end());
1585
1586 assert(RPOT.size() - 1 <= BlockNode::getMaxIndex() &&
1587 "More nodes in function than Block Frequency Info supports");
1588
1589 DEBUG(dbgs() << "reverse-post-order-traversal\n");
1590 for (rpot_iterator I = rpot_begin(), E = rpot_end(); I != E; ++I) {
1591 BlockNode Node = getNode(I);
1592 DEBUG(dbgs() << " - " << getIndex(I) << ": " << getBlockName(Node) << "\n");
1593 Nodes[*I] = Node;
1594 }
1595
1596 Working.resize(RPOT.size());
1597 Freqs.resize(RPOT.size());
1598 }
1599
1600 template <class BT> void BlockFrequencyInfoImpl<BT>::initializeLoops() {
1601 DEBUG(dbgs() << "loop-detection\n");
1602 if (LI->empty())
1603 return;
1604
1605 // Visit loops top down and assign them an index.
1606 std::deque<const LoopT *> Q;
1607 Q.insert(Q.end(), LI->begin(), LI->end());
1608 while (!Q.empty()) {
1609 const LoopT *Loop = Q.front();
1610 Q.pop_front();
1611 Q.insert(Q.end(), Loop->begin(), Loop->end());
1612
1613 // Save the order this loop was visited.
1614 BlockNode Header = getNode(Loop->getHeader());
1615 assert(Header.isValid());
1616
1617 Working[Header.Index].LoopIndex = PackagedLoops.size();
1618 PackagedLoops.emplace_back(Header);
1619 DEBUG(dbgs() << " - loop = " << getBlockName(Header) << "\n");
1620 }
1621
1622 // Visit nodes in reverse post-order and add them to their deepest containing
1623 // loop.
1624 for (size_t Index = 0; Index < RPOT.size(); ++Index) {
1625 const LoopT *Loop = LI->getLoopFor(RPOT[Index]);
1626 if (!Loop)
1627 continue;
1628
1629 // If this is a loop header, find its parent loop (if any).
1630 if (Working[Index].isLoopHeader())
1631 if (!(Loop = Loop->getParentLoop()))
1632 continue;
1633
1634 // Add this node to its containing loop's member list.
1635 BlockNode Header = getNode(Loop->getHeader());
1636 assert(Header.isValid());
1637 const auto &HeaderData = Working[Header.Index];
1638 assert(HeaderData.isLoopHeader());
1639
1640 Working[Index].ContainingLoop = Header;
1641 PackagedLoops[HeaderData.LoopIndex].Members.push_back(Index);
1642 DEBUG(dbgs() << " - loop = " << getBlockName(Header)
1643 << ": member = " << getBlockName(Index) << "\n");
1644 }
1645 }
1646
1647 template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInLoops() {
1648 // Visit loops with the deepest first, and the top-level loops last.
1649 for (auto L = PackagedLoops.rbegin(), LE = PackagedLoops.rend(); L != LE; ++L)
1650 computeMassInLoop(L->Header);
1651 }
1652
1653 template <class BT>
1654 void BlockFrequencyInfoImpl<BT>::computeMassInLoop(const BlockNode &LoopHead) {
1655 // Compute mass in loop.
1656 DEBUG(dbgs() << "compute-mass-in-loop: " << getBlockName(LoopHead) << "\n");
1657
1658 Working[LoopHead.Index].Mass = BlockMass::getFull();
1659 propagateMassToSuccessors(LoopHead, LoopHead);
1660
1661 for (const BlockNode &M : getLoopPackage(LoopHead).Members)
1662 propagateMassToSuccessors(LoopHead, M);
1663
1664 computeLoopScale(LoopHead);
1665 packageLoop(LoopHead);
1666 }
1667
1668 template <class BT> void BlockFrequencyInfoImpl<BT>::computeMassInFunction() {
1669 // Compute mass in function.
1670 DEBUG(dbgs() << "compute-mass-in-function\n");
1671 Working[0].Mass = BlockMass::getFull();
1672 for (rpot_iterator I = rpot_begin(), IE = rpot_end(); I != IE; ++I) {
1673 // Check for nodes that have been packaged.
1674 BlockNode Node = getNode(I);
1675 if (Working[Node.Index].hasLoopHeader())
1676 continue;
1677
1678 propagateMassToSuccessors(BlockNode(), Node);
1679 }
1680 }
1681
1682 template <class BT>
1683 void
1684 BlockFrequencyInfoImpl<BT>::propagateMassToSuccessors(const BlockNode &LoopHead,
1685 const BlockNode &Node) {
1686 DEBUG(dbgs() << " - node: " << getBlockName(Node) << "\n");
1687 // Calculate probability for successors.
1688 Distribution Dist;
1689 if (Node != LoopHead && Working[Node.Index].isLoopHeader())
1690 addLoopSuccessorsToDist(LoopHead, Node, Dist);
1691 else {
1692 const BlockT *BB = getBlock(Node);
1693 for (auto SI = Successor::child_begin(BB), SE = Successor::child_end(BB);
1694 SI != SE; ++SI)
1695 // Do not dereference SI, or getEdgeWeight() is linear in the number of
1696 // successors.
1697 addToDist(Dist, LoopHead, Node, getNode(*SI), BPI->getEdgeWeight(BB, SI));
1698 }
1699
1700 // Distribute mass to successors, saving exit and backedge data in the
1701 // loop header.
1702 distributeMass(Node, LoopHead, Dist);
1703 }
1704
1705 template <class BT>
1706 raw_ostream &BlockFrequencyInfoImpl<BT>::print(raw_ostream &OS) const {
1707 if (!F)
1708 return OS;
1709 OS << "block-frequency-info: " << F->getName() << "\n";
1710 for (const BlockT &BB : *F)
1711 OS << " - " << bfi_detail::getBlockName(&BB)
1712 << ": float = " << getFloatingBlockFreq(&BB)
1713 << ", int = " << getBlockFreq(&BB).getFrequency() << "\n";
1714
1715 // Add an extra newline for readability.
1716 OS << "\n";
1717 return OS;
1718 }
1719 }
1720
1721398 #endif
1010 //
1111 //===----------------------------------------------------------------------===//
1212
13 #define DEBUG_TYPE "block-freq"
1413 #include "llvm/Analysis/BlockFrequencyInfo.h"
1514 #include "llvm/Analysis/BlockFrequencyInfoImpl.h"
1615 #include "llvm/Analysis/BranchProbabilityInfo.h"
106105 INITIALIZE_PASS_BEGIN(BlockFrequencyInfo, "block-freq",
107106 "Block Frequency Analysis", true, true)
108107 INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfo)
109 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
110108 INITIALIZE_PASS_END(BlockFrequencyInfo, "block-freq",
111109 "Block Frequency Analysis", true, true)
112110
121119
122120 void BlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const {
123121 AU.addRequired<BranchProbabilityInfo>();
124 AU.addRequired<LoopInfo>();
125122 AU.setPreservesAll();
126123 }
127124
128125 bool BlockFrequencyInfo::runOnFunction(Function &F) {
129126 BranchProbabilityInfo &BPI = getAnalysis<BranchProbabilityInfo>();
130 LoopInfo &LI = getAnalysis<LoopInfo>();
131127 if (!BFI)
132128 BFI.reset(new ImplType);
133 BFI->doFunction(&F, &BPI, &LI);
129 BFI->doFunction(&F, &BPI);
134130 #ifndef NDEBUG
135131 if (ViewBlockFreqPropagationDAG != GVDT_None)
136132 view();
161157 }
162158
163159 const Function *BlockFrequencyInfo::getFunction() const {
164 return BFI ? BFI->getFunction() : nullptr;
160 return BFI ? BFI->Fn : nullptr;
165161 }
166162
167163 raw_ostream &BlockFrequencyInfo::
+0
-933
lib/Analysis/BlockFrequencyInfoImpl.cpp less more
None //===- BlockFrequencyImplInfo.cpp - Block Frequency Info Implementation ---===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // Loops should be simplified before this analysis.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #define DEBUG_TYPE "block-freq"
14 #include "llvm/Analysis/BlockFrequencyInfoImpl.h"
15 #include "llvm/ADT/APFloat.h"
16 #include "llvm/Support/raw_ostream.h"
17 #include
18
19 using namespace llvm;
20
21 //===----------------------------------------------------------------------===//
22 //
23 // PositiveFloat implementation.
24 //
25 //===----------------------------------------------------------------------===//
26 #ifndef _MSC_VER
27 const int PositiveFloatBase::MaxExponent;
28 const int PositiveFloatBase::MinExponent;
29 #endif
30
31 static void appendDigit(std::string &Str, unsigned D) {
32 assert(D < 10);
33 Str += '0' + D % 10;
34 }
35
36 static void appendNumber(std::string &Str, uint64_t N) {
37 while (N) {
38 appendDigit(Str, N % 10);
39 N /= 10;
40 }
41 }
42
43 static bool doesRoundUp(char Digit) {
44 switch (Digit) {
45 case '5':
46 case '6':
47 case '7':
48 case '8':
49 case '9':
50 return true;
51 default:
52 return false;
53 }
54 }
55
56 static std::string toStringAPFloat(uint64_t D, int E, unsigned Precision) {
57 assert(E >= PositiveFloatBase::MinExponent);
58 assert(E <= PositiveFloatBase::MaxExponent);
59
60 // Find a new E, but don't let it increase past MaxExponent.
61 int LeadingZeros = PositiveFloatBase::countLeadingZeros64(D);
62 int NewE = std::min(PositiveFloatBase::MaxExponent, E + 63 - LeadingZeros);
63 int Shift = 63 - (NewE - E);
64 assert(Shift <= LeadingZeros);
65 assert(Shift == LeadingZeros || NewE == PositiveFloatBase::MaxExponent);
66 D <<= Shift;
67 E = NewE;
68
69 // Check for a denormal.
70 unsigned AdjustedE = E + 16383;
71 if (!(D >> 63)) {
72 assert(E == PositiveFloatBase::MaxExponent);
73 AdjustedE = 0;
74 }
75
76 // Build the float and print it.
77 uint64_t RawBits[2] = {D, AdjustedE};
78 APFloat Float(APFloat::x87DoubleExtended, APInt(80, RawBits));
79 SmallVector<char, 24> Chars;
80 Float.toString(Chars, Precision, 0);
81 return std::string(Chars.begin(), Chars.end());
82 }
83
84 static std::string stripTrailingZeros(std::string Float) {
85 size_t NonZero = Float.find_last_not_of('0');
86 assert(NonZero != std::string::npos && "no . in floating point string");
87
88 if (Float[NonZero] == '.')
89 ++NonZero;
90
91 return Float.substr(0, NonZero + 1);
92 }
93
94 std::string PositiveFloatBase::toString(uint64_t D, int16_t E, int Width,
95 unsigned Precision) {
96 if (!D)
97 return "0.0";
98
99 // Canonicalize exponent and digits.
100 uint64_t Above0 = 0;
101 uint64_t Below0 = 0;
102 uint64_t Extra = 0;
103 int ExtraShift = 0;
104 if (E == 0) {
105 Above0 = D;
106 } else if (E > 0) {
107 if (int Shift = std::min(int16_t(countLeadingZeros64(D)), E)) {
108 D <<= Shift;
109 E -= Shift;
110
111 if (!E)
112 Above0 = D;
113 }
114 } else if (E > -64) {
115 Above0 = D >> -E;
116 Below0 = D << (64 + E);
117 } else if (E > -120) {
118 Below0 = D >> (-E - 64);
119 Extra = D << (128 + E);
120 ExtraShift = -64 - E;
121 }
122
123 // Fall back on APFloat for very small and very large numbers.
124 if (!Above0 && !Below0)
125 return toStringAPFloat(D, E, Precision);
126
127 // Append the digits before the decimal.
128 std::string Str;
129 size_t DigitsOut = 0;
130 if (Above0) {
131 appendNumber(Str, Above0);
132 DigitsOut = Str.size();
133 } else
134 appendDigit(Str, 0);
135 std::reverse(Str.begin(), Str.end());
136
137 // Return early if there's nothing after the decimal.
138 if (!Below0)
139 return Str + ".0";
140
141 // Append the decimal and beyond.
142 Str += '.';
143 uint64_t Error = UINT64_C(1) << (64 - Width);
144
145 // We need to shift Below0 to the right to make space for calculating
146 // digits. Save the precision we're losing in Extra.
147 Extra = (Below0 & 0xf) << 56 | (Extra >> 8);
148 Below0 >>= 4;
149 size_t SinceDot = 0;
150 size_t AfterDot = Str.size();
151 do {
152 if (ExtraShift) {
153 --ExtraShift;
154 Error *= 5;
155 } else
156 Error *= 10;
157
158 Below0 *= 10;
159 Extra *= 10;
160 Below0 += (Extra >> 60);
161 Extra = Extra & (UINT64_MAX >> 4);
162 appendDigit(Str, Below0 >> 60);
163 Below0 = Below0 & (UINT64_MAX >> 4);
164 if (DigitsOut || Str.back() != '0')
165 ++DigitsOut;
166 ++SinceDot;
167 } while (Error && (Below0 << 4 | Extra >> 60) >= Error / 2 &&
168 (!Precision || DigitsOut <= Precision || SinceDot < 2));
169
170 // Return early for maximum precision.
171 if (!Precision || DigitsOut <= Precision)
172 return stripTrailingZeros(Str);
173
174 // Find where to truncate.
175 size_t Truncate =
176 std::max(Str.size() - (DigitsOut - Precision), AfterDot + 1);
177
178 // Check if there's anything to truncate.
179 if (Truncate >= Str.size())
180 return stripTrailingZeros(Str);
181
182 bool Carry = doesRoundUp(Str[Truncate]);
183 if (!Carry)
184 return stripTrailingZeros(Str.substr(0, Truncate));
185
186 // Round with the first truncated digit.
187 for (std::string::reverse_iterator I(Str.begin() + Truncate), E = Str.rend();
188 I != E; ++I) {
189 if (*I == '.')
190 continue;
191 if (*I == '9') {
192 *I = '0';
193 continue;
194 }
195
196 ++*I;
197 Carry = false;
198 break;
199 }
200
201 // Add "1" in front if we still need to carry.
202 return stripTrailingZeros(std::string(Carry, '1') + Str.substr(0, Truncate));
203 }
204
205 raw_ostream &PositiveFloatBase::print(raw_ostream &OS, uint64_t D, int16_t E,
206 int Width, unsigned Precision) {
207 return OS << toString(D, E, Width, Precision);
208 }
209
210 void PositiveFloatBase::dump(uint64_t D, int16_t E, int Width) {
211 print(dbgs(), D, E, Width, 0) << "[" << Width << ":" << D << "*2^" << E
212 << "]";
213 }
214
215 static std::pair<uint64_t, int16_t>
216 getRoundedFloat(uint64_t N, bool ShouldRound, int64_t Shift) {
217 if (ShouldRound)
218 if (!++N)
219 // Rounding caused an overflow.
220 return std::make_pair(UINT64_C(1), Shift + 64);
221 return std::make_pair(N, Shift);
222 }
223
224 std::pair<uint64_t, int16_t> PositiveFloatBase::divide64(uint64_t Dividend,
225 uint64_t Divisor) {
226 // Input should be sanitized.
227 assert(Divisor);
228 assert(Dividend);
229
230 // Minimize size of divisor.
231 int16_t Shift = 0;
232 if (int Zeros = countTrailingZeros(Divisor)) {
233 Shift -= Zeros;
234 Divisor >>= Zeros;
235 }
236
237 // Check for powers of two.
238 if (Divisor == 1)
239 return std::make_pair(Dividend, Shift);
240
241 // Maximize size of dividend.
242 if (int Zeros = countLeadingZeros64(Dividend)) {
243 Shift -= Zeros;
244 Dividend <<= Zeros;
245 }
246
247 // Start with the result of a divide.
248 uint64_t Quotient = Dividend / Divisor;
249 Dividend %= Divisor;
250
251 // Continue building the quotient with long division.
252 //
253 // TODO: continue with larger digits.
254 while (!(Quotient >> 63) && Dividend) {
255 // Shift Dividend, and check for overflow.
256 bool IsOverflow = Dividend >> 63;
257 Dividend <<= 1;
258 --Shift;
259
260 // Divide.
261 bool DoesDivide = IsOverflow || Divisor <= Dividend;
262 Quotient = (Quotient << 1) | uint64_t(DoesDivide);
263 Dividend -= DoesDivide ? Divisor : 0;
264 }
265
266 // Round.
267 if (Dividend >= getHalf(Divisor))
268 if (!++Quotient)
269 // Rounding caused an overflow in Quotient.
270 return std::make_pair(UINT64_C(1), Shift + 64);
271
272 return getRoundedFloat(Quotient, Dividend >= getHalf(Divisor), Shift);
273 }
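
[Editor's note: illustrative sketch, not part of this patch.] divide64() hands back a (digits, exponent) pair that approximates Dividend / Divisor as Digits * 2^Exponent, with the digits shifted left as far as precision allows. Assuming that contract, the pair can be reassembled for inspection; dividing 1 by 3, for instance, should come back as digits close to 2^65 / 3 with an exponent near -65, which reassembles to roughly 0.333.

#include <cmath>
#include <cstdint>

// Reassemble a (digits, exponent) pair into a double, purely for checking.
static double scaledToDouble(uint64_t Digits, int16_t Exponent) {
  return std::ldexp(static_cast<double>(Digits), Exponent);
}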
274
275 static void addWithCarry(uint64_t &Upper, uint64_t &Lower, uint64_t N) {
276 uint64_t NewLower = Lower + (N << 32);
277 Upper += (N >> 32) + (NewLower < Lower);
278 Lower = NewLower;
279 }
280
281 std::pair<uint64_t, int16_t> PositiveFloatBase::multiply64(uint64_t L,
282 uint64_t R) {
283 // Separate into two 32-bit digits (U.L).
284 uint64_t UL = L >> 32, LL = L & UINT32_MAX, UR = R >> 32, LR = R & UINT32_MAX;
285
286 // Compute cross products.
287 uint64_t P1 = UL * UR, P2 = UL * LR, P3 = LL * UR, P4 = LL * LR;
288
289 // Sum into two 64-bit digits.
290 uint64_t Upper = P1, Lower = P4;
291 addWithCarry(Upper, Lower, P2);
292 addWithCarry(Upper, Lower, P3);
293
294 // Check for the lower 32 bits.
295 if (!Upper)
296 return std::make_pair(Lower, 0);
297
298 // Shift as little as possible to maximize precision.
299 unsigned LeadingZeros = countLeadingZeros64(Upper);
300 int16_t Shift = 64 - LeadingZeros;
301 if (LeadingZeros)
302 Upper = Upper << LeadingZeros | Lower >> Shift;
303 bool ShouldRound = Shift && (Lower & UINT64_C(1) << (Shift - 1));
304 return getRoundedFloat(Upper, ShouldRound, Shift);
305 }
306
307 //===----------------------------------------------------------------------===//
308 //
309 // BlockMass implementation.
310 //
311 //===----------------------------------------------------------------------===//
312 BlockMass &BlockMass::operator*=(const BranchProbability &P) {
313 uint32_t N = P.getNumerator(), D = P.getDenominator();
314 assert(D && "divide by 0");
315 assert(N <= D && "fraction greater than 1");
316
317 // Fast path for multiplying by 1.0.
318 if (!Mass || N == D)
319 return *this;
320
321 // Get as much precision as we can.
322 int Shift = countLeadingZeros(Mass);
323 uint64_t ShiftedQuotient = (Mass << Shift) / D;
324 uint64_t Product = ShiftedQuotient * N >> Shift;
325
326 // Now check for what's lost.
327 uint64_t Left = ShiftedQuotient * (D - N) >> Shift;
328 uint64_t Lost = Mass - Product - Left;
329
330 // TODO: prove this assertion.
331 assert(Lost <= UINT32_MAX);
332
333 // Take the product plus a portion of the spoils.
334 Mass = Product + Lost * N / D;
335 return *this;
336 }
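
[Editor's note: illustrative sketch, not part of this patch.] BlockMass is a 64-bit fixed-point fraction of the entry mass, with UINT64_MAX playing the role of 1.0, so multiplying by a BranchProbability of N/D should land close to Mass * N / D. A small sanity check of that representation (not of the exact rounding above):

#include <cassert>
#include <cstdint>

static void blockMassSketch() {
  const uint64_t Full = UINT64_MAX; // BlockMass::getFull(), i.e. ~1.0
  uint64_t Third = Full / 3;        // expected result of Full * 1/3
  // 2^64 - 1 is divisible by 3, so nothing is lost in this particular case.
  assert(Third * UINT64_C(3) == Full);
  (void)Third;
}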
337
338 PositiveFloat BlockMass::toFloat() const {
339 if (isFull())
340 return PositiveFloat(1, 0);
341 return PositiveFloat(getMass() + 1, -64);
342 }
343
344 void BlockMass::dump() const { print(dbgs()); }
345
346 static char getHexDigit(int N) {
347 assert(N < 16);
348 if (N < 10)
349 return '0' + N;
350 return 'a' + N - 10;
351 }
352 raw_ostream &BlockMass::print(raw_ostream &OS) const {
353 for (int Digits = 0; Digits < 16; ++Digits)
354 OS << getHexDigit(Mass >> (60 - Digits * 4) & 0xf);
355 return OS;
356 }
357
358 //===----------------------------------------------------------------------===//
359 //
360 // BlockFrequencyInfoImpl implementation.
361 //
362 //===----------------------------------------------------------------------===//
363 namespace {
364
365 typedef BlockFrequencyInfoImplBase::BlockNode BlockNode;
366 typedef BlockFrequencyInfoImplBase::Distribution Distribution;
367 typedef BlockFrequencyInfoImplBase::Distribution::WeightList WeightList;
368 typedef BlockFrequencyInfoImplBase::Float Float;
369 typedef BlockFrequencyInfoImplBase::PackagedLoopData PackagedLoopData;
370 typedef BlockFrequencyInfoImplBase::Weight Weight;
371 typedef BlockFrequencyInfoImplBase::FrequencyData FrequencyData;
372
373 /// \brief Dithering mass distributer.
374 ///
375 /// This class splits up a single mass into portions by weight, dithering to
376 /// spread out error. No mass is lost. The dithering precision depends on the
377 /// precision of the product of \a BlockMass and \a BranchProbability.
378 ///
379 /// The distribution algorithm follows.
380 ///
381 /// 1. Initialize by saving the sum of the weights in \a RemWeight and the
382 /// mass to distribute in \a RemMass.
383 ///
384 /// 2. For each portion:
385 ///
386 /// 1. Construct a branch probability, P, as the portion's weight divided
387 /// by the current value of \a RemWeight.
388 /// 2. Calculate the portion's mass as \a RemMass times P.
389 /// 3. Update \a RemWeight and \a RemMass at each portion by subtracting
390 /// the current portion's weight and mass.
391 ///
392 /// Mass is distributed in two ways: full distribution and forward
393 /// distribution. The latter ignores backedges, and uses the parallel fields
394 /// \a RemForwardWeight and \a RemForwardMass.
395 struct DitheringDistributer {
396 uint32_t RemWeight;
397 uint32_t RemForwardWeight;
398
399 BlockMass RemMass;
400 BlockMass RemForwardMass;
401
402 DitheringDistributer(Distribution &Dist, const BlockMass &Mass);
403
404 BlockMass takeLocalMass(uint32_t Weight) {
405 (void)takeMass(Weight);
406 return takeForwardMass(Weight);
407 }
408 BlockMass takeExitMass(uint32_t Weight) {
409 (void)takeForwardMass(Weight);
410 return takeMass(Weight);
411 }
412 BlockMass takeBackedgeMass(uint32_t Weight) { return takeMass(Weight); }
413
414 private:
415 BlockMass takeForwardMass(uint32_t Weight);
416 BlockMass takeMass(uint32_t Weight);
417 };
418 }
419
420 DitheringDistributer::DitheringDistributer(Distribution &Dist,
421 const BlockMass &Mass) {
422 Dist.normalize();
423 RemWeight = Dist.Total;
424 RemForwardWeight = Dist.ForwardTotal;
425 RemMass = Mass;
426 RemForwardMass = Dist.ForwardTotal ? Mass : BlockMass();
427 }
428
429 BlockMass DitheringDistributer::takeForwardMass(uint32_t Weight) {
430 // Compute the amount of mass to take.
431 assert(Weight && "invalid weight");
432 assert(Weight <= RemForwardWeight);
433 BlockMass Mass = RemForwardMass * BranchProbability(Weight, RemForwardWeight);
434
435 // Decrement totals (dither).
436 RemForwardWeight -= Weight;
437 RemForwardMass -= Mass;
438 return Mass;
439 }
440 BlockMass DitheringDistributer::takeMass(uint32_t Weight) {
441 assert(Weight && "invalid weight");
442 assert(Weight <= RemWeight);
443 BlockMass Mass = RemMass * BranchProbability(Weight, RemWeight);
444
445 // Decrement totals (dither).
446 RemWeight -= Weight;
447 RemMass -= Mass;
448 return Mass;
449 }
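
[Editor's note: illustrative sketch, not part of this patch.] The dithering scheme never loses mass: every portion is taken out of the remaining mass and remaining weight, so the last target absorbs all rounding error. The same idea with plain integer division (the real code rounds through BlockMass * BranchProbability instead):

#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint64_t> ditherSketch(uint64_t Mass,
                                          const std::vector<uint64_t> &Weights) {
  uint64_t RemWeight = 0;
  for (uint64_t W : Weights)
    RemWeight += W;
  uint64_t RemMass = Mass;
  std::vector<uint64_t> Portions;
  for (uint64_t W : Weights) {
    uint64_t Portion = RemMass * W / RemWeight; // take from what's left
    Portions.push_back(Portion);
    RemMass -= Portion;
    RemWeight -= W;
  }
  assert(RemMass == 0 && "no mass is lost");
  return Portions; // ditherSketch(10, {1, 1, 1}) == {3, 3, 4}
}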
450
451 void Distribution::add(const BlockNode &Node, uint64_t Amount,
452 Weight::DistType Type) {
453 assert(Amount && "invalid weight of 0");
454 uint64_t NewTotal = Total + Amount;
455
456 // Check for overflow. It should be impossible to overflow twice.
457 bool IsOverflow = NewTotal < Total;
458 assert(!(DidOverflow && IsOverflow) && "unexpected repeated overflow");
459 DidOverflow |= IsOverflow;
460
461 // Update the total.
462 Total = NewTotal;
463
464 // Save the weight.
465 Weight W;
466 W.TargetNode = Node;
467 W.Amount = Amount;
468 W.Type = Type;
469 Weights.push_back(W);
470
471 if (Type == Weight::Backedge)
472 return;
473
474 // Update forward total. Don't worry about overflow here, since then Total
475 // will exceed 32-bits and they'll both be recomputed in normalize().
476 ForwardTotal += Amount;
477 }
478
479 static void combineWeight(Weight &W, const Weight &OtherW) {
480 assert(OtherW.TargetNode.isValid());
481 if (!W.Amount) {
482 W = OtherW;
483 return;
484 }
485 assert(W.Type == OtherW.Type);
486 assert(W.TargetNode == OtherW.TargetNode);
487 assert(W.Amount < W.Amount + OtherW.Amount);
488 W.Amount += OtherW.Amount;
489 }
490 static void combineWeightsBySorting(WeightList &Weights) {
491 // Sort so edges to the same node are adjacent.
492 std::sort(Weights.begin(), Weights.end(),
493 [](const Weight &L,
494 const Weight &R) { return L.TargetNode < R.TargetNode; });
495
496 // Combine adjacent edges.
497 WeightList::iterator O = Weights.begin();
498 for (WeightList::const_iterator I = O, L = O, E = Weights.end(); I != E;
499 ++O, (I = L)) {
500 *O = *I;
501
502 // Find the adjacent weights to the same node.
503 for (++L; L != E && I->TargetNode == L->TargetNode; ++L)
504 combineWeight(*O, *L);
505 }
506
507 // Erase extra entries.
508 Weights.erase(O, Weights.end());
509 return;
510 }
511 static void combineWeightsByHashing(WeightList &Weights) {
512 // Collect weights into a DenseMap.
513 typedef DenseMap<uint32_t, Weight> HashTable;
514 HashTable Combined(NextPowerOf2(2 * Weights.size()));
515 for (const Weight &W : Weights)
516 combineWeight(Combined[W.TargetNode.Index], W);
517
518 // Check whether anything changed.
519 if (Weights.size() == Combined.size())
520 return;
521
522 // Fill in the new weights.
523 Weights.clear();
524 Weights.reserve(Combined.size());
525 for (const auto &I : Combined)
526 Weights.push_back(I.second);
527 }
528 static void combineWeights(WeightList &Weights) {
529 // Use a hash table for many successors to keep this linear.
530 if (Weights.size() > 128) {
531 combineWeightsByHashing(Weights);
532 return;
533 }
534
535 combineWeightsBySorting(Weights);
536 }
537 static uint64_t shiftRightAndRound(uint64_t N, int Shift) {
538 assert(Shift >= 0);
539 assert(Shift < 64);
540 if (!Shift)
541 return N;
542 return (N >> Shift) + (UINT64_C(1) & N >> (Shift - 1));
543 }
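
[Editor's note: illustrative examples, not part of this patch.] shiftRightAndRound() divides by a power of two and rounds on the highest bit shifted out, so halfway cases round up:

#include <cassert>

static void shiftRightAndRoundExamples() {
  assert(shiftRightAndRound(7, 2) == 2); // 7/4 = 1.75 -> 2
  assert(shiftRightAndRound(5, 2) == 1); // 5/4 = 1.25 -> 1
  assert(shiftRightAndRound(6, 2) == 2); // 6/4 = 1.50 -> 2 (tie rounds up)
}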
544 void Distribution::normalize() {
545 // Early exit for termination nodes.
546 if (Weights.empty())
547 return;
548
549 // Only bother if there are multiple successors.
550 if (Weights.size() > 1)
551 combineWeights(Weights);
552
553 // Early exit when combined into a single successor.
554 if (Weights.size() == 1) {
555 Total = 1;
556 ForwardTotal = Weights.front().Type != Weight::Backedge;
557 Weights.front().Amount = 1;
558 return;
559 }
560
561 // Determine how much to shift right so that the total fits into 32-bits.
562 //
563 // If we shift at all, shift by 1 extra. Otherwise, the lower limit of 1
564 // for each weight can cause a 32-bit overflow.
565 int Shift = 0;
566 if (DidOverflow)
567 Shift = 33;
568 else if (Total > UINT32_MAX)
569 Shift = 33 - countLeadingZeros(Total);
570
571 // Early exit if nothing needs to be scaled.
572 if (!Shift)
573 return;
574
575 // Recompute the total through accumulation (rather than shifting it) so that
576 // it's accurate after shifting. ForwardTotal is dirty here anyway.
577 Total = 0;
578 ForwardTotal = 0;
579
580 // Sum the weights to each node and shift right if necessary.
581 for (Weight &W : Weights) {
582 // Scale down below UINT32_MAX. Since Shift is larger than necessary, we
583 // can round here without concern about overflow.
584 assert(W.TargetNode.isValid());
585 W.Amount = std::max(UINT64_C(1), shiftRightAndRound(W.Amount, Shift));
586 assert(W.Amount <= UINT32_MAX);
587
588 // Update the total.
589 Total += W.Amount;
590 if (W.Type == Weight::Backedge)
591 continue;
592
593 // Update the forward total.
594 ForwardTotal += W.Amount;
595 }
596 assert(Total <= UINT32_MAX);
597 }
598
599 void BlockFrequencyInfoImplBase::clear() {
600 *this = BlockFrequencyInfoImplBase();
601 }
602
603 /// \brief Clear all memory not needed downstream.
604 ///
605 /// Releases all memory not used downstream. In particular, saves Freqs.
606 static void cleanup(BlockFrequencyInfoImplBase &BFI) {
607 std::vector SavedFreqs(std::move(BFI.Freqs));
608 BFI.clear();
609 BFI.Freqs = std::move(SavedFreqs);
610 }
611
612 /// \brief Get a possibly packaged node.
613 ///
614 /// Get the node currently representing Node, which could be a containing
615 /// loop.
616 ///
617 /// This function should only be called when distributing mass. As long as
618 /// there are no irreducilbe edges to Node, then it will have complexity O(1)
619 /// in this context.
620 ///
621 /// In general, the complexity is O(L), where L is the number of loop headers
622 /// Node has been packaged into. Since this method is called in the context
623 /// of distributing mass, L will be the number of loop headers an early exit
624 /// edge jumps out of.
625 static BlockNode getPackagedNode(const BlockFrequencyInfoImplBase &BFI,
626 const BlockNode &Node) {
627 assert(Node.isValid());
628 if (!BFI.Working[Node.Index].IsPackaged)
629 return Node;
630 if (!BFI.Working[Node.Index].ContainingLoop.isValid())
631 return Node;
632 return getPackagedNode(BFI, BFI.Working[Node.Index].ContainingLoop);
633 }
634
635 /// \brief Get the appropriate mass for a possible pseudo-node loop package.
636 ///
637 /// Get appropriate mass for Node. If Node is a loop-header (whose loop has
638 /// been packaged), returns the mass of its pseudo-node. If it's a node inside
639 /// a packaged loop, it returns the loop's pseudo-node.
640 static BlockMass &getPackageMass(BlockFrequencyInfoImplBase &BFI,
641 const BlockNode &Node) {
642 assert(Node.isValid());
643 assert(!BFI.Working[Node.Index].IsPackaged);
644 if (!BFI.Working[Node.Index].IsAPackage)
645 return BFI.Working[Node.Index].Mass;
646
647 return BFI.getLoopPackage(Node).Mass;
648 }
649
650 void BlockFrequencyInfoImplBase::addToDist(Distribution &Dist,
651 const BlockNode &LoopHead,
652 const BlockNode &Pred,
653 const BlockNode &Succ,
654 uint64_t Weight) {
655 if (!Weight)
656 Weight = 1;
657
658 #ifndef NDEBUG
659 auto debugSuccessor = [&](const char *Type, const BlockNode &Resolved) {
660 dbgs() << " =>"
661 << " [" << Type << "] weight = " << Weight;
662 if (Succ != LoopHead)
663 dbgs() << ", succ = " << getBlockName(Succ);
664 if (Resolved != Succ)
665 dbgs() << ", resolved = " << getBlockName(Resolved);
666 dbgs() << "\n";
667 };
668 (void)debugSuccessor;
669 #endif
670
671 if (Succ == LoopHead) {
672 DEBUG(debugSuccessor("backedge", Succ));
673 Dist.addBackedge(LoopHead, Weight);
674 return;
675 }
676 BlockNode Resolved = getPackagedNode(*this, Succ);
677 assert(Resolved != LoopHead);
678
679 if (Working[Resolved.Index].ContainingLoop != LoopHead) {
680 DEBUG(debugSuccessor(" exit ", Resolved));
681 Dist.addExit(Resolved, Weight);
682 return;
683 }
684
685 if (!LoopHead.isValid() && Resolved < Pred) {
686 // Irreducible backedge. Skip this edge in the distribution.
687 DEBUG(debugSuccessor("skipped ", Resolved));
688 return;
689 }
690
691 DEBUG(debugSuccessor(" local ", Resolved));
692 Dist.addLocal(Resolved, Weight);
693 }
694
695 void BlockFrequencyInfoImplBase::addLoopSuccessorsToDist(
696 const BlockNode &LoopHead, const BlockNode &LocalLoopHead,
697 Distribution &Dist) {
698 PackagedLoopData &LoopPackage = getLoopPackage(LocalLoopHead);
699 const PackagedLoopData::ExitMap &Exits = LoopPackage.Exits;
700
701 // Copy the exit map into Dist.
702 for (const auto &I : Exits)
703 addToDist(Dist, LoopHead, LocalLoopHead, I.first, I.second.getMass());
704
705 // We don't need this map any more. Clear it to prevent quadratic memory
706 // usage in deeply nested loops with irreducible control flow.
707 LoopPackage.Exits.clear();
708 }
709
710 /// \brief Get the maximum allowed loop scale.
711 ///
712 /// Gives the maximum number of estimated iterations allowed for a loop.
713 /// Downstream users have trouble with very large numbers (even within
714 /// 64-bits). Perhaps they can be changed to use PositiveFloat.
715 ///
716 /// TODO: change downstream users so that this can be increased or removed.
717 static Float getMaxLoopScale() { return Float(1, 12); }
718
719 /// \brief Compute the loop scale for a loop.
720 void BlockFrequencyInfoImplBase::computeLoopScale(const BlockNode &LoopHead) {
721 // Compute loop scale.
722 DEBUG(dbgs() << "compute-loop-scale: " << getBlockName(LoopHead) << "\n");
723
724 // LoopScale == 1 / ExitMass
725 // ExitMass == HeadMass - BackedgeMass
726 PackagedLoopData &LoopPackage = getLoopPackage(LoopHead);
727 BlockMass ExitMass = BlockMass::getFull() - LoopPackage.BackedgeMass;
728
729 // Block scale stores the inverse of the scale.
730 LoopPackage.Scale = ExitMass.toFloat().inverse();
731
732 DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull()
733 << " - " << LoopPackage.BackedgeMass << ")\n"
734 << " - scale = " << LoopPackage.Scale << "\n");
735
736 if (LoopPackage.Scale > getMaxLoopScale()) {
737 LoopPackage.Scale = getMaxLoopScale();
738 DEBUG(dbgs() << " - reduced-to-max-scale: " << getMaxLoopScale() << "\n");
739 }
740 }
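
[Editor's note: illustrative sketch, not part of this patch.] computeLoopScale() inverts the exit mass: if 3/4 of the header's mass flows back along backedges, ExitMass is 1/4 and the loop scale (the expected trip count) is 4, clamped at getMaxLoopScale() == Float(1, 12) == 4096. The same policy with doubles in place of BlockMass and PositiveFloat:

static double loopScaleSketch(double BackedgeMass /* in [0, 1) */) {
  double ExitMass = 1.0 - BackedgeMass;
  double Scale = 1.0 / ExitMass;          // loopScaleSketch(0.75) == 4.0
  return Scale > 4096.0 ? 4096.0 : Scale; // same clamp as getMaxLoopScale()
}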
741
742 /// \brief Package up a loop.
743 void BlockFrequencyInfoImplBase::packageLoop(const BlockNode &LoopHead) {
744 DEBUG(dbgs() << "packaging-loop: " << getBlockName(LoopHead) << "\n");
745 Working[LoopHead.Index].IsAPackage = true;
746 for (const BlockNode &M : getLoopPackage(LoopHead).Members) {
747 DEBUG(dbgs() << " - node: " << getBlockName(M.Index) << "\n");
748 Working[M.Index].IsPackaged = true;
749 }
750 }
751
752 void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source,
753 const BlockNode &LoopHead,
754 Distribution &Dist) {
755 BlockMass Mass = getPackageMass(*this, Source);
756 DEBUG(dbgs() << " => mass: " << Mass
757 << " ( general | forward )\n");
758
759 // Distribute mass to successors as laid out in Dist.
760 DitheringDistributer D(Dist, Mass);
761
762 #ifndef NDEBUG
763 auto debugAssign = [&](const BlockNode &T, const BlockMass &M,
764 const char *Desc) {
765 dbgs() << " => assign " << M << " (" << D.RemMass << "|"
766 << D.RemForwardMass << ")";
767 if (Desc)
768 dbgs() << " [" << Desc << "]";
769 if (T.isValid())
770 dbgs() << " to " << getBlockName(T);
771 dbgs() << "\n";
772 };
773 (void)debugAssign;
774 #endif
775
776 PackagedLoopData *LoopPackage = 0;
777 if (LoopHead.isValid())
778 LoopPackage = &getLoopPackage(LoopHead);
779 for (const Weight &W : Dist.Weights) {
780 // Check for a local edge (forward and non-exit).
781 if (W.Type == Weight::Local) {
782 BlockMass Local = D.takeLocalMass(W.Amount);
783 getPackageMass(*this, W.TargetNode) += Local;
784 DEBUG(debugAssign(W.TargetNode, Local, nullptr));
785 continue;
786 }
787
788 // Backedges and exits only make sense if we're processing a loop.
789 assert(LoopPackage && "backedge or exit outside of loop");
790
791 // Check for a backedge.
792 if (W.Type == Weight::Backedge) {
793 BlockMass Back = D.takeBackedgeMass(W.Amount);
794 LoopPackage->BackedgeMass += Back;
795 DEBUG(debugAssign(BlockNode(), Back, "back"));
796 continue;
797 }
798
799 // This must be an exit.
800 assert(W.Type == Weight::Exit);
801 BlockMass Exit = D.takeExitMass(W.Amount);
802 LoopPackage->Exits.push_back(std::make_pair(W.TargetNode, Exit));
803 DEBUG(debugAssign(W.TargetNode, Exit, "exit"));
804 }
805 }
806
807 static void convertFloatingToInteger(BlockFrequencyInfoImplBase &BFI,
808 const Float &Min, const Float &Max) {
809 // Scale the Factor to a size that creates integers. Ideally, integers would
810 // be scaled so that Max == UINT64_MAX so that they can be best
811 // differentiated. However, the register allocator currently deals poorly
812 // with large numbers. Instead, push Min up a little from 1 to give some
813 // room to differentiate small, unequal numbers.
814 //
815 // TODO: fix issues downstream so that ScalingFactor can be Float(1,64)/Max.
816 Float ScalingFactor = Min.inverse();
817 if ((Max / Min).lg() < 60)
818 ScalingFactor <<= 3;
819
820 // Translate the floats to integers.
821 DEBUG(dbgs() << "float-to-int: min = " << Min << ", max = " << Max
822 << ", factor = " << ScalingFactor << "\n");
823 for (size_t Index = 0; Index < BFI.Freqs.size(); ++Index) {
824 Float Scaled = BFI.Freqs[Index].Floating * ScalingFactor;
825 BFI.Freqs[Index].Integer = std::max(UINT64_C(1), Scaled.toInt());
826 DEBUG(dbgs() << " - " << BFI.getBlockName(Index) << ": float = "
827 << BFI.Freqs[Index].Floating << ", scaled = " << Scaled
828 << ", int = " << BFI.Freqs[Index].Integer << "\n");
829 }
830 }
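
[Editor's note: illustrative sketch, not part of this patch.] convertFloatingToInteger() scales every block's floating frequency by roughly 1/Min, shifted up by 3 bits when the dynamic range allows, so the coldest block maps to a small non-zero integer. The same policy with doubles:

#include <cmath>
#include <cstdint>

static uint64_t scaleToIntegerSketch(double Freq, double Min, double Max) {
  double Factor = 1.0 / Min;
  if (Max / Min < std::ldexp(1.0, 60)) // i.e. lg(Max / Min) < 60
    Factor *= 8.0;                     // mirrors ScalingFactor <<= 3
  double Scaled = Freq * Factor;
  return Scaled < 1.0 ? 1 : static_cast<uint64_t>(Scaled);
}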
831
832 static void scaleBlockData(BlockFrequencyInfoImplBase &BFI,
833 const BlockNode &Node,
834 const PackagedLoopData &Loop) {
835 Float F = Loop.Mass.toFloat() * Loop.Scale;
836
837 Float &Current = BFI.Freqs[Node.Index].Floating;
838 Float Updated = Current * F;
839
840 DEBUG(dbgs() << " - " << BFI.getBlockName(Node) << ": " << Current << " => "
841 << Updated << "\n");
842
843 Current = Updated;
844 }
845
846 /// \brief Unwrap a loop package.
847 ///
848 /// Visits all the members of a loop, adjusting their BlockData according to
849 /// the loop's pseudo-node.
850 static void unwrapLoopPackage(BlockFrequencyInfoImplBase &BFI,
851 const BlockNode &Head) {
852 assert(Head.isValid());
853
854 PackagedLoopData &LoopPackage = BFI.getLoopPackage(Head);
855 DEBUG(dbgs() << "unwrap-loop-package: " << BFI.getBlockName(Head)
856 << ": mass = " << LoopPackage.Mass
857 << ", scale = " << LoopPackage.Scale << "\n");
858 scaleBlockData(BFI, Head, LoopPackage);
859
860 // Propagate the head scale through the loop. Since members are visited in
861 // RPO, the head scale will be updated by the loop scale first, and then the
862 // final head scale will be used to update the rest of the members.
863 for (const BlockNode &M : LoopPackage.Members) {
864 const FrequencyData &HeadData = BFI.Freqs[Head.Index];
865 FrequencyData &Freqs = BFI.Freqs[M.Index];
866 Float NewFreq = Freqs.Floating * HeadData.Floating;
867 DEBUG(dbgs() << " - " << BFI.getBlockName(M) << ": " << Freqs.Floating
868 << " => " << NewFreq << "\n");
869 Freqs.Floating = NewFreq;
870 }
871 }
872
873 void BlockFrequencyInfoImplBase::finalizeMetrics() {
874 // Set initial frequencies from loop-local masses.
875 for (size_t Index = 0; Index < Working.size(); ++Index)
876 Freqs[Index].Floating = Working[Index].Mass.toFloat();
877
878 // Unwrap loop packages in reverse post-order, tracking min and max
879 // frequencies.
880 auto Min = Float::getLargest();
881 auto Max = Float::getZero();
882 for (size_t Index = 0; Index < Working.size(); ++Index) {
883 if (Working[Index].isLoopHeader())
884 unwrapLoopPackage(*this, BlockNode(Index));
885
886 // Track the min and max frequencies.
887 Min = std::min(Min, Freqs[Index].Floating);
888 Max = std::max(Max, Freqs[Index].Floating);
889 }
890
891 // Convert to integers.
892 convertFloatingToInteger(*this, Min, Max);
893
894 // Clean up data structures.
895 cleanup(*this);
896
897 // Print out the final stats.
898 DEBUG(dump());
899 }
900
901 BlockFrequency
902 BlockFrequencyInfoImplBase::getBlockFreq(const BlockNode &Node) const {
903 if (!Node.isValid())
904 return 0;
905 return Freqs[Node.Index].Integer;
906 }
907 Float
908 BlockFrequencyInfoImplBase::getFloatingBlockFreq(const BlockNode &Node) const {
909 if (!Node.isValid())
910 return Float::getZero();
911 return Freqs[Node.Index].Floating;
912 }
913
914 std::string
915 BlockFrequencyInfoImplBase::getBlockName(const BlockNode &Node) const {
916 return std::string();
917 }
918
919 raw_ostream &
920 BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS,
921 const BlockNode &Node) const {
922 return OS << getFloatingBlockFreq(Node);
923 }
924
925 raw_ostream &
926 BlockFrequencyInfoImplBase::printBlockFreq(raw_ostream &OS,
927 const BlockFrequency &Freq) const {
928 Float Block(Freq.getFrequency(), 0);
929 Float Entry(getEntryFreq(), 0);
930
931 return OS << Block / Entry;
932 }
66 Analysis.cpp
77 BasicAliasAnalysis.cpp
88 BlockFrequencyInfo.cpp
9 BlockFrequencyInfoImpl.cpp
109 BranchProbabilityInfo.cpp
1110 CFG.cpp
1211 CFGPrinter.cpp
1010 //
1111 //===----------------------------------------------------------------------===//
1212
13 #define DEBUG_TYPE "block-freq"
1413 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
1514 #include "llvm/Analysis/BlockFrequencyInfoImpl.h"
1615 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
17 #include "llvm/CodeGen/MachineFunction.h"
18 #include "llvm/CodeGen/MachineLoopInfo.h"
1916 #include "llvm/CodeGen/Passes.h"
2017 #include "llvm/InitializePasses.h"
2118 #include "llvm/Support/CommandLine.h"
114111 INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, "machine-block-freq",
115112 "Machine Block Frequency Analysis", true, true)
116113 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
117 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
118114 INITIALIZE_PASS_END(MachineBlockFrequencyInfo, "machine-block-freq",
119115 "Machine Block Frequency Analysis", true, true)
120116
130126
131127 void MachineBlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const {
132128 AU.addRequired<MachineBranchProbabilityInfo>();
133 AU.addRequired<MachineLoopInfo>();
134129 AU.setPreservesAll();
135130 MachineFunctionPass::getAnalysisUsage(AU);
136131 }
137132
138133 bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) {
139134 MachineBranchProbabilityInfo &MBPI =
140 getAnalysis<MachineBranchProbabilityInfo>();
141 MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
135 getAnalysis<MachineBranchProbabilityInfo>();
142136 if (!MBFI)
143137 MBFI.reset(new ImplType);
144 MBFI->doFunction(&F, &MBPI, &MLI);
138 MBFI->doFunction(&F, &MBPI);
145139 #ifndef NDEBUG
146140 if (ViewMachineBlockFreqPropagationDAG != GVDT_None) {
147141 view();
171165 }
172166
173167 const MachineFunction *MachineBlockFrequencyInfo::getFunction() const {
174 return MBFI ? MBFI->getFunction() : nullptr;
168 return MBFI ? MBFI->Fn : nullptr;
175169 }
176170
177171 raw_ostream &
+0
-50
test/Analysis/BlockFrequencyInfo/bad_input.ll less more
None ; RUN: opt < %s -analyze -block-freq | FileCheck %s
1
2 declare void @g(i32 %x)
3
4 ; CHECK-LABEL: Printing analysis {{.*}} for function 'branch_weight_0':
5 ; CHECK-NEXT: block-frequency-info: branch_weight_0
6 define void @branch_weight_0(i32 %a) {
7 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
8 entry:
9 br label %for.body
10
11 ; Check that we get 1,4 instead of 0,3.
12 ; CHECK-NEXT: for.body: float = 4.0,
13 for.body:
14 %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
15 call void @g(i32 %i)
16 %inc = add i32 %i, 1
17 %cmp = icmp ugt i32 %inc, %a
18 br i1 %cmp, label %for.end, label %for.body, !prof !0
19
20 ; CHECK-NEXT: for.end: float = 1.0, int = [[ENTRY]]
21 for.end:
22 ret void
23 }
24
25 !0 = metadata !{metadata !"branch_weights", i32 0, i32 3}
26
27 ; CHECK-LABEL: Printing analysis {{.*}} for function 'infinite_loop'
28 ; CHECK-NEXT: block-frequency-info: infinite_loop
29 define void @infinite_loop(i1 %x) {
30 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
31 entry:
32 br i1 %x, label %for.body, label %for.end, !prof !1
33
34 ; Check that the loop scale maxes out at 4096, giving 2048 here.
35 ; CHECK-NEXT: for.body: float = 2048.0,
36 for.body:
37 %i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
38 call void @g(i32 %i)
39 %inc = add i32 %i, 1
40 br label %for.body
41
42 ; Check that the exit weight is half of entry, since half is lost in the
43 ; infinite loop above.
44 ; CHECK-NEXT: for.end: float = 0.5,
45 for.end:
46 ret void
47 }
48
49 !1 = metadata !{metadata !"branch_weights", i32 1, i32 1}
0 ; RUN: opt < %s -analyze -block-freq | FileCheck %s
11
22 define i32 @test1(i32 %i, i32* %a) {
3 ; CHECK-LABEL: Printing analysis {{.*}} for function 'test1':
4 ; CHECK-NEXT: block-frequency-info: test1
5 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
3 ; CHECK: Printing analysis {{.*}} for function 'test1'
4 ; CHECK: entry = 1.0
65 entry:
76 br label %body
87
98 ; Loop backedges are weighted and thus their bodies have a greater frequency.
10 ; CHECK-NEXT: body: float = 32.0,
9 ; CHECK: body = 32.0
1110 body:
1211 %iv = phi i32 [ 0, %entry ], [ %next, %body ]
1312 %base = phi i32 [ 0, %entry ], [ %sum, %body ]
1817 %exitcond = icmp eq i32 %next, %i
1918 br i1 %exitcond, label %exit, label %body
2019
21 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
20 ; CHECK: exit = 1.0
2221 exit:
2322 ret i32 %sum
2423 }
2524
2625 define i32 @test2(i32 %i, i32 %a, i32 %b) {
27 ; CHECK-LABEL: Printing analysis {{.*}} for function 'test2':
28 ; CHECK-NEXT: block-frequency-info: test2
29 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
26 ; CHECK: Printing analysis {{.*}} for function 'test2'
27 ; CHECK: entry = 1.0
3028 entry:
3129 %cond = icmp ult i32 %i, 42
3230 br i1 %cond, label %then, label %else, !prof !0
3331
3432 ; The 'then' branch is predicted more likely via branch weight metadata.
35 ; CHECK-NEXT: then: float = 0.9411{{[0-9]*}},
33 ; CHECK: then = 0.94116
3634 then:
3735 br label %exit
3836
39 ; CHECK-NEXT: else: float = 0.05882{{[0-9]*}},
37 ; CHECK: else = 0.05877
4038 else:
4139 br label %exit
4240
43 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
41 ; FIXME: It may be a bug that we don't sum back to 1.0.
42 ; CHECK: exit = 0.99993
4443 exit:
4544 %result = phi i32 [ %a, %then ], [ %b, %else ]
4645 ret i32 %result
4948 !0 = metadata !{metadata !"branch_weights", i32 64, i32 4}
5049
5150 define i32 @test3(i32 %i, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
52 ; CHECK-LABEL: Printing analysis {{.*}} for function 'test3':
53 ; CHECK-NEXT: block-frequency-info: test3
54 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
51 ; CHECK: Printing analysis {{.*}} for function 'test3'
52 ; CHECK: entry = 1.0
5553 entry:
5654 switch i32 %i, label %case_a [ i32 1, label %case_b
5755 i32 2, label %case_c
5856 i32 3, label %case_d
5957 i32 4, label %case_e ], !prof !1
6058
61 ; CHECK-NEXT: case_a: float = 0.05,
59 ; CHECK: case_a = 0.04998
6260 case_a:
6361 br label %exit
6462
65 ; CHECK-NEXT: case_b: float = 0.05,
63 ; CHECK: case_b = 0.04998
6664 case_b:
6765 br label %exit
6866
6967 ; The 'case_c' branch is predicted more likely via branch weight metadata.
70 ; CHECK-NEXT: case_c: float = 0.8,
68 ; CHECK: case_c = 0.79998
7169 case_c:
7270 br label %exit
7371
74 ; CHECK-NEXT: case_d: float = 0.05,
72 ; CHECK: case_d = 0.04998
7573 case_d:
7674 br label %exit
7775
78 ; CHECK-NEXT: case_e: float = 0.05,
76 ; CHECK: case_e = 0.04998
7977 case_e:
8078 br label %exit
8179
82 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
80 ; FIXME: It may be a bug that we don't sum back to 1.0.
81 ; CHECK: exit = 0.99993
8382 exit:
8483 %result = phi i32 [ %a, %case_a ],
8584 [ %b, %case_b ],
9190
9291 !1 = metadata !{metadata !"branch_weights", i32 4, i32 4, i32 64, i32 4, i32 4}
9392
93 ; CHECK: Printing analysis {{.*}} for function 'nested_loops'
94 ; CHECK: entry = 1.0
95 ; This test doesn't seem to be assigning sensible frequencies to nested loops.
9496 define void @nested_loops(i32 %a) {
95 ; CHECK-LABEL: Printing analysis {{.*}} for function 'nested_loops':
96 ; CHECK-NEXT: block-frequency-info: nested_loops
97 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
9897 entry:
9998 br label %for.cond1.preheader
10099
101 ; CHECK-NEXT: for.cond1.preheader: float = 4001.0,
102100 for.cond1.preheader:
103101 %x.024 = phi i32 [ 0, %entry ], [ %inc12, %for.inc11 ]
104102 br label %for.cond4.preheader
105103
106 ; CHECK-NEXT: for.cond4.preheader: float = 16008001.0,
107104 for.cond4.preheader:
108105 %y.023 = phi i32 [ 0, %for.cond1.preheader ], [ %inc9, %for.inc8 ]
109106 %add = add i32 %y.023, %x.024
110107 br label %for.body6
111108
112 ; CHECK-NEXT: for.body6: float = 64048012001.0,
113109 for.body6:
114110 %z.022 = phi i32 [ 0, %for.cond4.preheader ], [ %inc, %for.body6 ]
115111 %add7 = add i32 %add, %z.022
116 tail call void @g(i32 %add7)
112 tail call void @g(i32 %add7) #2
117113 %inc = add i32 %z.022, 1
118114 %cmp5 = icmp ugt i32 %inc, %a
119115 br i1 %cmp5, label %for.inc8, label %for.body6, !prof !2
120116
121 ; CHECK-NEXT: for.inc8: float = 16008001.0,
122117 for.inc8:
123118 %inc9 = add i32 %y.023, 1
124119 %cmp2 = icmp ugt i32 %inc9, %a
125120 br i1 %cmp2, label %for.inc11, label %for.cond4.preheader, !prof !2
126121
127 ; CHECK-NEXT: for.inc11: float = 4001.0,
128122 for.inc11:
129123 %inc12 = add i32 %x.024, 1
130124 %cmp = icmp ugt i32 %inc12, %a
131125 br i1 %cmp, label %for.end13, label %for.cond1.preheader, !prof !2
132126
133 ; CHECK-NEXT: for.end13: float = 1.0, int = [[ENTRY]]
134127 for.end13:
135128 ret void
136129 }
137130
138 declare void @g(i32)
131 declare void @g(i32) #1
139132
140133 !2 = metadata !{metadata !"branch_weights", i32 1, i32 4000}
+0
-165
test/Analysis/BlockFrequencyInfo/double_exit.ll less more
None ; RUN: opt < %s -analyze -block-freq | FileCheck %s
1
2 ; CHECK-LABEL: Printing analysis {{.*}} for function 'double_exit':
3 ; CHECK-NEXT: block-frequency-info: double_exit
4 define i32 @double_exit(i32 %N) {
5 ; Mass = 1
6 ; Frequency = 1
7 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
8 entry:
9 br label %outer
10
11 ; Mass = 1
12 ; Backedge mass = 1/3, exit mass = 2/3
13 ; Loop scale = 3/2
14 ; Pseudo-edges = exit
15 ; Pseudo-mass = 1
16 ; Frequency = 1*3/2*1 = 3/2
17 ; CHECK-NEXT: outer: float = 1.5,
18 outer:
19 %I.0 = phi i32 [ 0, %entry ], [ %inc6, %outer.inc ]
20 %Return.0 = phi i32 [ 0, %entry ], [ %Return.1, %outer.inc ]
21 %cmp = icmp slt i32 %I.0, %N
22 br i1 %cmp, label %inner, label %exit, !prof !2 ; 2:1
23
24 ; Mass = 1
25 ; Backedge mass = 3/5, exit mass = 2/5
26 ; Loop scale = 5/2
27 ; Pseudo-edges = outer.inc @ 1/5, exit @ 1/5
28 ; Pseudo-mass = 2/3
29 ; Frequency = 3/2*1*5/2*2/3 = 5/2
30 ; CHECK-NEXT: inner: float = 2.5,
31 inner:
32 %Return.1 = phi i32 [ %Return.0, %outer ], [ %call4, %inner.inc ]
33 %J.0 = phi i32 [ %I.0, %outer ], [ %inc, %inner.inc ]
34 %cmp2 = icmp slt i32 %J.0, %N
35 br i1 %cmp2, label %inner.body, label %outer.inc, !prof !1 ; 4:1
36
37 ; Mass = 4/5
38 ; Frequency = 5/2*4/5 = 2
39 ; CHECK-NEXT: inner.body: float = 2.0,
40 inner.body:
41 %call = call i32 @c2(i32 %I.0, i32 %J.0)
42 %tobool = icmp ne i32 %call, 0
43 br i1 %tobool, label %exit, label %inner.inc, !prof !0 ; 3:1
44
45 ; Mass = 3/5
46 ; Frequency = 5/2*3/5 = 3/2
47 ; CHECK-NEXT: inner.inc: float = 1.5,
48 inner.inc:
49 %call4 = call i32 @logic2(i32 %Return.1, i32 %I.0, i32 %J.0)
50 %inc = add nsw i32 %J.0, 1
51 br label %inner
52
53 ; Mass = 1/3
54 ; Frequency = 3/2*1/3 = 1/2
55 ; CHECK-NEXT: outer.inc: float = 0.5,
56 outer.inc:
57 %inc6 = add nsw i32 %I.0, 1
58 br label %outer
59
60 ; Mass = 1
61 ; Frequency = 1
62 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
63 exit:
64 %Return.2 = phi i32 [ %Return.1, %inner.body ], [ %Return.0, %outer ]
65 ret i32 %Return.2
66 }
67
68 !0 = metadata !{metadata !"branch_weights", i32 1, i32 3}
69 !1 = metadata !{metadata !"branch_weights", i32 4, i32 1}
70 !2 = metadata !{metadata !"branch_weights", i32 2, i32 1}
71
72 declare i32 @c2(i32, i32)
73 declare i32 @logic2(i32, i32, i32)
74
75 ; CHECK-LABEL: Printing analysis {{.*}} for function 'double_exit_in_loop':
76 ; CHECK-NEXT: block-frequency-info: double_exit_in_loop
77 define i32 @double_exit_in_loop(i32 %N) {
78 ; Mass = 1
79 ; Frequency = 1
80 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
81 entry:
82 br label %outer
83
84 ; Mass = 1
85 ; Backedge mass = 1/2, exit mass = 1/2
86 ; Loop scale = 2
87 ; Pseudo-edges = exit
88 ; Pseudo-mass = 1
89 ; Frequency = 1*2*1 = 2
90 ; CHECK-NEXT: outer: float = 2.0,
91 outer:
92 %I.0 = phi i32 [ 0, %entry ], [ %inc12, %outer.inc ]
93 %Return.0 = phi i32 [ 0, %entry ], [ %Return.3, %outer.inc ]
94 %cmp = icmp slt i32 %I.0, %N
95 br i1 %cmp, label %middle, label %exit, !prof !3 ; 1:1
96
97 ; Mass = 1
98 ; Backedge mass = 1/3, exit mass = 2/3
99 ; Loop scale = 3/2
100 ; Pseudo-edges = outer.inc
101 ; Pseudo-mass = 1/2
102 ; Frequency = 2*1*3/2*1/2 = 3/2
103 ; CHECK-NEXT: middle: float = 1.5,
104 middle:
105 %J.0 = phi i32 [ %I.0, %outer ], [ %inc9, %middle.inc ]
106 %Return.1 = phi i32 [ %Return.0, %outer ], [ %Return.2, %middle.inc ]
107 %cmp2 = icmp slt i32 %J.0, %N
108 br i1 %cmp2, label %inner, label %outer.inc, !prof !2 ; 2:1
109
110 ; Mass = 1
111 ; Backedge mass = 3/5, exit mass = 2/5
112 ; Loop scale = 5/2
113 ; Pseudo-edges = middle.inc @ 1/5, outer.inc @ 1/5
114 ; Pseudo-mass = 2/3
115 ; Frequency = 3/2*1*5/2*2/3 = 5/2
116 ; CHECK-NEXT: inner: float = 2.5,
117 inner:
118 %Return.2 = phi i32 [ %Return.1, %middle ], [ %call7, %inner.inc ]
119 %K.0 = phi i32 [ %J.0, %middle ], [ %inc, %inner.inc ]
120 %cmp5 = icmp slt i32 %K.0, %N
121 br i1 %cmp5, label %inner.body, label %middle.inc, !prof !1 ; 4:1
122
123 ; Mass = 4/5
124 ; Frequency = 5/2*4/5 = 2
125 ; CHECK-NEXT: inner.body: float = 2.0,
126 inner.body:
127 %call = call i32 @c3(i32 %I.0, i32 %J.0, i32 %K.0)
128 %tobool = icmp ne i32 %call, 0
129 br i1 %tobool, label %outer.inc, label %inner.inc, !prof !0 ; 3:1
130
131 ; Mass = 3/5
132 ; Frequency = 5/2*3/5 = 3/2
133 ; CHECK-NEXT: inner.inc: float = 1.5,
134 inner.inc:
135 %call7 = call i32 @logic3(i32 %Return.2, i32 %I.0, i32 %J.0, i32 %K.0)
136 %inc = add nsw i32 %K.0, 1
137 br label %inner
138
139 ; Mass = 1/3
140 ; Frequency = 3/2*1/3 = 1/2
141 ; CHECK-NEXT: middle.inc: float = 0.5,
142 middle.inc:
143 %inc9 = add nsw i32 %J.0, 1
144 br label %middle
145
146 ; Mass = 1/2
147 ; Frequency = 2*1/2 = 1
148 ; CHECK-NEXT: outer.inc: float = 1.0,
149 outer.inc:
150 %Return.3 = phi i32 [ %Return.2, %inner.body ], [ %Return.1, %middle ]
151 %inc12 = add nsw i32 %I.0, 1
152 br label %outer
153
154 ; Mass = 1
155 ; Frequency = 1
156 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
157 exit:
158 ret i32 %Return.0
159 }
160
161 !3 = metadata !{metadata !"branch_weights", i32 1, i32 1}
162
163 declare i32 @c3(i32, i32, i32)
164 declare i32 @logic3(i32, i32, i32, i32)
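; The three-level composition in the @double_exit_in_loop comments follows the
; same pattern: each header's frequency is the enclosing header's frequency,
; times the mass entering the loop (the pseudo-mass), times the loop scale,
; where the scale is the reciprocal of the loop's exit mass. A small sketch
; (hypothetical, not part of the test) with exact fractions:

  from fractions import Fraction as F

  def scale(exit_mass):            # loop scale = 1 / exit mass
      return 1 / F(exit_mass)

  outer  = F(1)   * 1       * scale(F(1, 2))  # entry * pseudo-mass * scale = 2
  middle = outer  * F(1, 2) * scale(F(2, 3))  # = 3/2
  inner  = middle * F(2, 3) * scale(F(2, 5))  # = 5/2
  print(outer, middle, inner)                 # 2 3/2 5/2, matching the CHECK lines
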
+0
-197
test/Analysis/BlockFrequencyInfo/irreducible.ll
None ; RUN: opt < %s -analyze -block-freq | FileCheck %s
1
2 ; A loop with multiple exits should be handled correctly.
3 ;
4 ; CHECK-LABEL: Printing analysis {{.*}} for function 'multiexit':
5 ; CHECK-NEXT: block-frequency-info: multiexit
6 define void @multiexit(i32 %a) {
7 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
8 entry:
9 br label %loop.1
10
11 ; CHECK-NEXT: loop.1: float = 1.333{{3*}},
12 loop.1:
13 %i = phi i32 [ 0, %entry ], [ %inc.2, %loop.2 ]
14 call void @f(i32 %i)
15 %inc.1 = add i32 %i, 1
16 %cmp.1 = icmp ugt i32 %inc.1, %a
17 br i1 %cmp.1, label %exit.1, label %loop.2, !prof !0
18
19 ; CHECK-NEXT: loop.2: float = 0.666{{6*7}},
20 loop.2:
21 call void @g(i32 %inc.1)
22 %inc.2 = add i32 %inc.1, 1
23 %cmp.2 = icmp ugt i32 %inc.2, %a
24 br i1 %cmp.2, label %exit.2, label %loop.1, !prof !1
25
26 ; CHECK-NEXT: exit.1: float = 0.666{{6*7}},
27 exit.1:
28 call void @h(i32 %inc.1)
29 br label %return
30
31 ; CHECK-NEXT: exit.2: float = 0.333{{3*}},
32 exit.2:
33 call void @i(i32 %inc.2)
34 br label %return
35
36 ; CHECK-NEXT: return: float = 1.0, int = [[ENTRY]]
37 return:
38 ret void
39 }
40
41 declare void @f(i32 %x)
42 declare void @g(i32 %x)
43 declare void @h(i32 %x)
44 declare void @i(i32 %x)
45
46 !0 = metadata !{metadata !"branch_weights", i32 3, i32 3}
47 !1 = metadata !{metadata !"branch_weights", i32 5, i32 5}
48
49 ; The current BlockFrequencyInfo algorithm doesn't handle multiple entrances
50 ; into a loop very well. The frequencies assigned to blocks in the loop are
51 ; predictable (and not absurd), but also not correct and therefore not worth
52 ; testing.
53 ;
54 ; There are two testcases below.
55 ;
56 ; For each testcase, I use a CHECK-NEXT/NOT combo like an XFAIL with the
57 ; granularity of a single check. If/when this behaviour is fixed, we'll know
58 ; about it, and the test should be updated.
59 ;
60 ; Testcase #1
61 ; ===========
62 ;
63 ; In this case c1 and c2 should have frequencies of 15/7 and 13/7,
64 ; respectively. To calculate this, consider assigning 1.0 to entry, and
65 ; distributing frequency iteratively (to infinity). At the first iteration,
66 ; entry gives 3/4 to c1 and 1/4 to c2. At every step after, c1 and c2 give 3/4
67 ; of what they have to each other; the remaining 1/4 leaks to exit each step, so
67 ; eventually all of the frequency reaches exit.
68 ;
69 ; c1 = 3/4 + 1/4*3/4 + 3/4*3^2/4^2 + 1/4*3^3/4^3 + 3/4*3^4/4^4 + ...
70 ; c2 = 1/4 + 3/4*3/4 + 1/4*3^2/4^2 + 3/4*3^3/4^3 + 1/4*3^4/4^4 + ...
71 ;
72 ; Simplify by splitting each series into its odd and even terms and factoring
73 ; so that both reduce to the same geometric series:
74 ;
75 ; c1 = 3/4 *(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
76 ; + 3/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
77 ; c2 = 1/4 *(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
78 ; + 9/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
79 ;
80 ; c1 = 15/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
81 ; c2 = 13/16*(9^0/16^0 + 9^1/16^1 + 9^2/16^2 + ...)
82 ;
83 ; Since this geometric series sums to 16/7:
84 ;
85 ; c1 = 15/7
86 ; c2 = 13/7
87 ;
88 ; If we treat c1 and c2 as members of the same loop, the exit mass of the
89 ; loop as a whole is 1/4, so the loop scale should be 4. Summing c1 and c2
90 ; gives 28/7, or 4.0, which confirms the math above (see the sketch after this function).
91 ;
92 ; However, assuming c1 precedes c2 in reverse post-order, the current algorithm
93 ; returns 3/4 and 13/16, respectively. LoopInfo ignores edges between loops
94 ; (and doesn't see any loops here at all), and -block-freq ignores the
95 ; irreducible edge from c2 to c1.
96 ;
97 ; CHECK-LABEL: Printing analysis {{.*}} for function 'multientry':
98 ; CHECK-NEXT: block-frequency-info: multientry
99 define void @multientry(i32 %a) {
100 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
101 entry:
102 %choose = call i32 @choose(i32 %a)
103 %compare = icmp ugt i32 %choose, %a
104 br i1 %compare, label %c1, label %c2, !prof !2
105
106 ; This is like a single-line XFAIL (see above).
107 ; CHECK-NEXT: c1:
108 ; CHECK-NOT: float = 2.142857{{[0-9]*}},
109 c1:
110 %i1 = phi i32 [ %a, %entry ], [ %i2.inc, %c2 ]
111 %i1.inc = add i32 %i1, 1
112 %choose1 = call i32 @choose(i32 %i1)
113 %compare1 = icmp ugt i32 %choose1, %a
114 br i1 %compare1, label %c2, label %exit, !prof !2
115
116 ; This is like a single-line XFAIL (see above).
117 ; CHECK-NEXT: c2:
118 ; CHECK-NOT: float = 1.857142{{[0-9]*}},
119 c2:
120 %i2 = phi i32 [ %a, %entry ], [ %i1.inc, %c1 ]
121 %i2.inc = add i32 %i2, 1
122 %choose2 = call i32 @choose(i32 %i2)
123 %compare2 = icmp ugt i32 %choose2, %a
124 br i1 %compare2, label %c1, label %exit, !prof !2
125
126 ; We still shouldn't lose any frequency.
127 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
128 exit:
129 ret void
130 }
131
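; The 15/7 and 13/7 figures derived above are easy to confirm numerically. A
; hypothetical sketch (not part of the test) that simulates the iterative
; distribution of mass described in the comments:

  # entry sends 3/4 to c1 and 1/4 to c2 (!2 = 3:1); afterwards c1 and c2 pass
  # 3/4 of their mass to each other and leak 1/4 to exit.  Accumulate the total
  # mass that ever reaches each block.
  c1, c2 = 3 / 4, 1 / 4
  c1_total = c2_total = 0.0
  for _ in range(200):
      c1_total += c1
      c2_total += c2
      c1, c2 = 3 / 4 * c2, 3 / 4 * c1
  print(c1_total, c2_total)   # -> 2.142857... (15/7) and 1.857142... (13/7)
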
132 ; Testcase #2
133 ; ===========
134 ;
135 ; In this case c1 and c2 should be treated as equals in a single loop. The
136 ; exit frequency is 1/3, so the scaling factor for the loop should be 3.0. The
137 ; loop is entered 2/3 of the time, and c1 and c2 split the total loop frequency
138 ; evenly (1/2), so they should each have frequencies of 1.0 (3.0*2/3*1/2).
139 ; Another way of computing this result is by assigning 1.0 to entry and showing
140 ; that c1 and c2 should accumulate frequencies of:
141 ;
142 ; 1/3 + 2/9 + 4/27 + 8/81 + ...
143 ; 2^0/3^1 + 2^1/3^2 + 2^2/3^3 + 2^3/3^4 + ...
144 ;
145 ; At the first step, c1 and c2 each get 1/3 of the entry. At each subsequent
146 ; step, c1 and c2 each get 1/3 of what's left in c1 and c2 combined. This
147 ; infinite series sums to 1.
148 ;
149 ; However, assuming c1 precedes c2 in reverse post-order, the current algorithm
150 ; returns 1/2 and 3/4, respectively. LoopInfo ignores edges between loops (and
151 ; treats c1 and c2 as self-loops only), and -block-freq ignores the irreducible
152 ; edge from c2 to c1.
153 ;
154 ; Below I use a CHECK-NEXT/NOT combo like an XFAIL with the granularity of a
155 ; single check. If/when this behaviour is fixed, we'll know about it, and the
156 ; test should be updated.
157 ;
158 ; CHECK-LABEL: Printing analysis {{.*}} for function 'crossloops':
159 ; CHECK-NEXT: block-frequency-info: crossloops
160 define void @crossloops(i32 %a) {
161 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
162 entry:
163 %choose = call i32 @choose(i32 %a)
164 switch i32 %choose, label %exit [ i32 1, label %c1
165 i32 2, label %c2 ], !prof !3
166
167 ; This is like a single-line XFAIL (see above).
168 ; CHECK-NEXT: c1:
169 ; CHECK-NOT: float = 1.0,
170 c1:
171 %i1 = phi i32 [ %a, %entry ], [ %i1.inc, %c1 ], [ %i2.inc, %c2 ]
172 %i1.inc = add i32 %i1, 1
173 %choose1 = call i32 @choose(i32 %i1)
174 switch i32 %choose1, label %exit [ i32 1, label %c1
175 i32 2, label %c2 ], !prof !3
176
177 ; This is like a single-line XFAIL (see above).
178 ; CHECK-NEXT: c2:
179 ; CHECK-NOT: float = 1.0,
180 c2:
181 %i2 = phi i32 [ %a, %entry ], [ %i1.inc, %c1 ], [ %i2.inc, %c2 ]
182 %i2.inc = add i32 %i2, 1
183 %choose2 = call i32 @choose(i32 %i2)
184 switch i32 %choose2, label %exit [ i32 1, label %c1
185 i32 2, label %c2 ], !prof !3
186
187 ; We still shouldn't lose any frequency.
188 ; CHECK-NEXT: exit: float = 1.0, int = [[ENTRY]]
189 exit:
190 ret void
191 }
192
193 declare i32 @choose(i32)
194
195 !2 = metadata !{metadata !"branch_weights", i32 3, i32 1}
196 !3 = metadata !{metadata !"branch_weights", i32 2, i32 2, i32 2}
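; Likewise, the series quoted for testcase #2 can be checked with a one-line
; (hypothetical) snippet:

  print(sum(2 ** n / 3 ** (n + 1) for n in range(100)))  # -> 1.0, so c1 = c2 = 1.0
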
+0
-44
test/Analysis/BlockFrequencyInfo/loop_with_branch.ll
None ; RUN: opt < %s -analyze -block-freq | FileCheck %s
1
2 ; CHECK-LABEL: Printing analysis {{.*}} for function 'loop_with_branch':
3 ; CHECK-NEXT: block-frequency-info: loop_with_branch
4 define void @loop_with_branch(i32 %a) {
5 ; CHECK-NEXT: entry: float = 1.0, int = [[ENTRY:[0-9]+]]
6 entry:
7 %skip_loop = call i1 @foo0(i32 %a)
8 br i1 %skip_loop, label %skip, label %header, !prof !0
9
10 ; CHECK-NEXT: skip: float = 0.25,
11 skip:
12 br label %exit
13
14 ; CHECK-NEXT: header: float = 4.5,
15 header:
16