llvm.org GIT mirror llvm / e7a1e3e
[block-freq] Add BlockFrequency::scale that returns a remainder from the division and make the private scale in BlockFrequency more performant. This change is the first in a series of changes improving LLVM's Block Frequency propagation implementation to not lose probability mass in branchy code when propagating block frequency information from a basic block to its successors. This patch is a simple infrastructure improvement that does not actually modify the block frequency algorithm. The specific changes are: 1. Changes the division algorithm used when scaling block frequencies by branch probabilities to a short division algorithm. This gives us the remainder for free as well as provides a nice speed boost. When I benched the old routine and the new routine on a Sandy Bridge iMac with disabled turbo mode performing 8192 iterations on an array of length 32768, I saw ~600% increase in speed in mean/median performance. 2. Exposes a scale method that returns a remainder. This is important so we can ensure that when we scale a block frequency by some branch probability BP = N/D, the remainder from the division by D can be retrieved and propagated to other children to ensure no probability mass is lost (more to come on this). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@194950 91177308-0d34-0410-b5e6-96231b3b80d8 Michael Gottesman 5 years ago
3 changed file(s) with 200 addition(s) and 39 deletion(s). Raw diff Collapse all Expand all
2626 uint64_t Frequency;
2727 static const int64_t ENTRY_FREQ = 1 << 14;
2828
29 // Scale frequency by N/D, saturating on overflow.
30 void scale(uint32_t N, uint32_t D);
29 /// \brief Scale the given BlockFrequency by N/D. Return the remainder from
30 /// the division by D. Upon overflow, the routine will saturate and
31 /// additionally will return the remainder set to D.
32 uint32_t scale(uint32_t N, uint32_t D);
3133
3234 public:
3335 BlockFrequency(uint64_t Freq = 0) : Frequency(Freq) { }
5658 BlockFrequency &operator+=(const BlockFrequency &Freq);
5759 const BlockFrequency operator+(const BlockFrequency &Freq) const;
5860
61 /// \brief Scale the given BlockFrequency by N/D. Return the remainder from
62 /// the division by D. Upon overflow, the routine will saturate.
63 uint32_t scale(const BranchProbability &Prob);
64
5965 bool operator<(const BlockFrequency &RHS) const {
6066 return Frequency < RHS.Frequency;
6167 }
1818 using namespace llvm;
1919
2020 /// Multiply FREQ by N and store result in W array.
21 static void mult96bit(uint64_t freq, uint32_t N, uint64_t W[2]) {
21 static void mult96bit(uint64_t freq, uint32_t N, uint32_t W[3]) {
2222 uint64_t u0 = freq & UINT32_MAX;
2323 uint64_t u1 = freq >> 32;
2424
25 // Represent 96-bit value as w[2]:w[1]:w[0];
26 uint32_t w[3] = { 0, 0, 0 };
27
25 // Represent 96-bit value as W[2]:W[1]:W[0];
2826 uint64_t t = u0 * N;
2927 uint64_t k = t >> 32;
30 w[0] = t;
28 W[0] = t;
3129 t = u1 * N + k;
32 w[1] = t;
33 w[2] = t >> 32;
34
35 // W[1] - higher bits.
36 // W[0] - lower bits.
37 W[0] = w[0] + ((uint64_t) w[1] << 32);
38 W[1] = w[2];
30 W[1] = t;
31 W[2] = t >> 32;
3932 }
4033
34 /// Divide 96-bit value stored in W[2]:W[1]:W[0] by D. Since our word size is a
35 /// 32 bit unsigned integer, we can use a short division algorithm.
36 static uint64_t divrem96bit(uint32_t W[3], uint32_t D, uint32_t *Rout) {
37 // We assume that W[2] is non-zero: if W[2] is zero, the value fits in 64
38 // bits and the caller should just use hardware division.
39 assert(W[2] && "This routine assumes that W[2] is non-zero since if W[2] is "
40 "zero, the caller should just use 64/32 hardware.");
41 uint32_t Q[3] = { 0, 0, 0 };
4142
42 /// Divide 96-bit value stored in W array by D.
43 /// Return 64-bit quotient, saturated to UINT64_MAX on overflow.
44 static uint64_t div96bit(uint64_t W[2], uint32_t D) {
45 uint64_t y = W[0];
46 uint64_t x = W[1];
47 unsigned i;
48
49 assert(x != 0 && "This is really a 64-bit division");
50
51 // This long division algorithm automatically saturates on overflow.
52 for (i = 0; i < 64 && x; ++i) {
53 uint32_t t = -((x >> 31) & 1); // Splat bit 31 to bits 0-31.
54 x = (x << 1) | (y >> 63);
55 y = y << 1;
56 if ((x | t) >= D) {
57 x -= D;
58 ++y;
43 // The generalized short division algorithm sets i to m + n - 1, where n is
44 // the number of words in the divisor and m is the number of words by which
45 // the dividend exceeds the divisor (i.e. m + n == the length of the dividend
46 // in words). Due to our assumption that W[2] is non-zero, we know that the
47 // dividend is of length 3, implying, since n is 1, that m = 2. Thus we set i
48 // to m + n - 1 = 2 + 1 - 1 = 2.
49 uint32_t R = 0;
50 for (int i = 2; i >= 0; --i) {
51 uint64_t PartialD = uint64_t(R) << 32 | W[i];
52 if (PartialD == 0) {
53 Q[i] = 0;
54 R = 0;
55 } else if (PartialD < D) {
56 Q[i] = 0;
57 R = uint32_t(PartialD);
58 } else if (PartialD == D) {
59 Q[i] = 1;
60 R = 0;
61 } else {
62 Q[i] = uint32_t(PartialD / D);
63 R = uint32_t(PartialD - (Q[i] * D));
5964 }
6065 }
6166
62 return y << (64 - i);
67 // If Q[2] is non-zero, then we overflowed.
68 uint64_t Result;
69 if (Q[2]) {
70 Result = UINT64_MAX;
71 R = D;
72 } else {
73 // Form the final uint64_t result, avoiding endianness issues.
74 Result = uint64_t(Q[0]) | (uint64_t(Q[1]) << 32);
75 }
76
77 if (Rout)
78 *Rout = R;
79
80 return Result;
6381 }
6482
65
66 void BlockFrequency::scale(uint32_t N, uint32_t D) {
83 uint32_t BlockFrequency::scale(uint32_t N, uint32_t D) {
6784 assert(D != 0 && "Division by zero");
6885
6986 // Calculate Frequency * N.
7491 // If the product fits in 64 bits, just use built-in division.
7592 if (MulHi <= UINT32_MAX && MulRes >= MulLo) {
7693 Frequency = MulRes / D;
77 return;
94 return MulRes % D;
7895 }
7996
8097 // Product overflowed, use 96-bit operations.
81 // 96-bit value represented as W[1]:W[0].
82 uint64_t W[2];
98 // 96-bit value represented as W[2]:W[1]:W[0].
99 uint32_t W[3];
100 uint32_t R;
83101 mult96bit(Frequency, N, W);
84 Frequency = div96bit(W, D);
85 return;
102 Frequency = divrem96bit(W, D, &R);
103 return R;
86104 }
87105
88106 BlockFrequency &BlockFrequency::operator*=(const BranchProbability &Prob) {
126144 return Freq;
127145 }
128146
147 uint32_t BlockFrequency::scale(const BranchProbability &Prob) {
148 return scale(Prob.getNumerator(), Prob.getDenominator());
149 }
150
129151 void BlockFrequency::print(raw_ostream &OS) const {
130152 // Convert fixed-point number to decimal.
131153 OS << Frequency / getEntryFrequency() << ".";
1212 BranchProbability Prob(UINT32_MAX - 1, UINT32_MAX);
1313 Freq *= Prob;
1414 EXPECT_EQ(Freq.getFrequency(), 0u);
15
16 Freq = BlockFrequency(1);
17 uint32_t Remainder = Freq.scale(Prob);
18 EXPECT_EQ(Freq.getFrequency(), 0u);
19 EXPECT_EQ(Remainder, UINT32_MAX - 1);
1520 }
1621
1722 TEST(BlockFrequencyTest, OneToOne) {
1924 BranchProbability Prob(UINT32_MAX, UINT32_MAX);
2025 Freq *= Prob;
2126 EXPECT_EQ(Freq.getFrequency(), 1u);
27
28 Freq = BlockFrequency(1);
29 uint32_t Remainder = Freq.scale(Prob);
30 EXPECT_EQ(Freq.getFrequency(), 1u);
31 EXPECT_EQ(Remainder, 0u);
2232 }
2333
2434 TEST(BlockFrequencyTest, ThreeToOne) {
2636 BranchProbability Prob(3000000, 9000000);
2737 Freq *= Prob;
2838 EXPECT_EQ(Freq.getFrequency(), 1u);
39
40 Freq = BlockFrequency(3);
41 uint32_t Remainder = Freq.scale(Prob);
42 EXPECT_EQ(Freq.getFrequency(), 1u);
43 EXPECT_EQ(Remainder, 0u);
2944 }
3045
3146 TEST(BlockFrequencyTest, MaxToHalfMax) {
3348 BranchProbability Prob(UINT32_MAX / 2, UINT32_MAX);
3449 Freq *= Prob;
3550 EXPECT_EQ(Freq.getFrequency(), 9223372034707292159ULL);
51
52 Freq = BlockFrequency(UINT64_MAX);
53 uint32_t Remainder = Freq.scale(Prob);
54 EXPECT_EQ(Freq.getFrequency(), 9223372034707292159ULL);
55 EXPECT_EQ(Remainder, 0u);
3656 }
3757
3858 TEST(BlockFrequencyTest, BigToBig) {
4262 BranchProbability Prob(P, P);
4363 Freq *= Prob;
4464 EXPECT_EQ(Freq.getFrequency(), Big);
65
66 Freq = BlockFrequency(Big);
67 uint32_t Remainder = Freq.scale(Prob);
68 EXPECT_EQ(Freq.getFrequency(), Big);
69 EXPECT_EQ(Remainder, 0u);
4570 }
4671
4772 TEST(BlockFrequencyTest, MaxToMax) {
4974 BranchProbability Prob(UINT32_MAX, UINT32_MAX);
5075 Freq *= Prob;
5176 EXPECT_EQ(Freq.getFrequency(), UINT64_MAX);
77
78 // This additionally makes sure that when the exact result happens to equal
79 // our saturating value but no overflow actually occurred, we do not signal
80 // saturation (the remainder must be 0, not D).
81 Freq = BlockFrequency(UINT64_MAX);
82 uint32_t Remainder = Freq.scale(Prob);
83 EXPECT_EQ(Freq.getFrequency(), UINT64_MAX);
84 EXPECT_EQ(Remainder, 0u);
85 }
86
87 TEST(BlockFrequencyTest, ScaleResultRemainderTest) {
88 struct {
89 uint64_t Freq;
90 uint32_t Prob[2];
91 uint64_t ExpectedFreq;
92 uint32_t ExpectedRemainder;
93 } Tests[80] = {
94 // Data for scaling that results in <= 64 bit division.
95 { 0x1423e2a50, { 0x64819521, 0x7765dd13 }, 0x10f418889, 0x92b9d25 },
96 { 0x35ef14ce, { 0x28ade3c7, 0x304532ae }, 0x2d73c33a, 0x2c0fd0b6 },
97 { 0xd03dbfbe24, { 0x790079, 0xe419f3 }, 0x6e776fc1fd, 0x4a06dd },
98 { 0x21d67410b, { 0x302a9dc2, 0x3ddb4442 }, 0x1a5948fd6, 0x265d1c2a },
99 { 0x8664aead, { 0x3d523513, 0x403523b1 }, 0x805a04cf, 0x324c27b8 },
100 { 0x201db0cf4, { 0x35112a7b, 0x79fc0c74 }, 0xdf8b07f6, 0x490c1dc4 },
101 { 0x13f1e4430a, { 0x21c92bf, 0x21e63aae }, 0x13e0cba15, 0x1df47c30 },
102 { 0x16c83229, { 0x3793f66f, 0x53180dea }, 0xf3ce7b6, 0x1d0c1b6b },
103 { 0xc62415be8, { 0x9cc4a63, 0x4327ae9b }, 0x1ce8b71ca, 0x3f2c696a },
104 { 0x6fac5e434, { 0xe5f9170, 0x1115e10b }, 0x5df23dd4c, 0x4dafc7c },
105 { 0x1929375f2, { 0x3a851375, 0x76c08456 }, 0xc662b082, 0x343589ee },
106 { 0x243c89db6, { 0x354ebfc0, 0x450ef197 }, 0x1bf8c1661, 0x4948e49 },
107 { 0x310e9b31a, { 0x1b1b8acf, 0x2d3629f0 }, 0x1d69c93f9, 0x73e3b96 },
108 { 0xa1fae921d, { 0xa7a098c, 0x10469f44 }, 0x684413d6c, 0x86a882c },
109 { 0xc1582d957, { 0x498e061, 0x59856bc }, 0x9edc5f4e7, 0x29b0653 },
110 { 0x57cfee75, { 0x1d061dc3, 0x7c8bfc17 }, 0x1476a220, 0x2383d33f },
111 { 0x139220080, { 0x294a6c71, 0x2a2b07c9 }, 0x1329e1c76, 0x7aa5da },
112 { 0x1665d353c, { 0x7080db5, 0xde0d75c }, 0xb590d9fb, 0x7ba8c38 },
113 { 0xe8f14541, { 0x5188e8b2, 0x736527ef }, 0xa4971be5, 0x6b612167 },
114 { 0x2f4775f29, { 0x254ef0fe, 0x435fcf50 }, 0x1a2e449c1, 0x28bbf5e },
115 { 0x27b85d8d7, { 0x304c8220, 0x5de678f2 }, 0x146e3bef9, 0x4b27097e },
116 { 0x1d362e36b, { 0x36c85b12, 0x37a66f55 }, 0x1cc19b8e6, 0x688e828 },
117 { 0x155fd48c7, { 0xf5894d, 0x1256108 }, 0x11e383602, 0x111f0cb },
118 { 0xb5db2d15, { 0x39bb26c5, 0x5bdcda3e }, 0x72499259, 0x59c4939b },
119 { 0x153990298, { 0x48921c09, 0x706eb817 }, 0xdb3268e8, 0x66bb8a80 },
120 { 0x28a7c3ed7, { 0x1f776fd7, 0x349f7a70 }, 0x184f73ae1, 0x28910321 },
121 { 0x724dbeab, { 0x1bd149f5, 0x253a085e }, 0x5569c0b3, 0xff8e2ed },
122 { 0xd8f0c513, { 0x18c8cc4c, 0x1b72bad0 }, 0xc3e30643, 0xd85e134 },
123 { 0x17ce3dcb, { 0x1e4c6260, 0x233b359e }, 0x1478f4af, 0x49ea31e },
124 { 0x1ce036ce0, { 0x29e3c8af, 0x5318dd4a }, 0xe8e76196, 0x11d5b9c4 },
125 { 0x1473ae2a, { 0x29b897ba, 0x2be29378 }, 0x13718185, 0x6f93b2c },
126 { 0x1dd41aa68, { 0x3d0a4441, 0x5a0e8f12 }, 0x1437b6bbf, 0x54b09ffa },
127 { 0x1b49e4a53, { 0x3430c1fe, 0x5a204aed }, 0xfcd6852f, 0x15ad6ed7 },
128 { 0x217941b19, { 0x12ced2bd, 0x21b68310 }, 0x12aca65b1, 0x1b2a9565 },
129 { 0xac6a4dc8, { 0x3ed68da8, 0x6fdca34c }, 0x60da926d, 0x22ff53e4 },
130 { 0x1c503a4e7, { 0xfcbbd32, 0x11e48d17 }, 0x18fec7d38, 0xa8aa816 },
131 { 0x1c885855, { 0x213e919d, 0x25941897 }, 0x193de743, 0x4ea09c },
132 { 0x29b9c168e, { 0x2b644aea, 0x45725ee7 }, 0x1a122e5d5, 0xbee1099 },
133 { 0x806a33f2, { 0x30a80a23, 0x5063733a }, 0x4db9a264, 0x1eaed76e },
134 { 0x282afc96b, { 0x143ae554, 0x1a9863ff }, 0x1e8de5204, 0x158d9020 },
135 // Data for scaling that results in > 64 bit division.
136 { 0x23ca5f2f672ca41c, { 0xecbc641, 0x111373f7 }, 0x1f0301e5e8295ab5, 0xf627f79 },
137 { 0x5e4f2468142265e3, { 0x1ddf5837, 0x32189233 }, 0x383ca7ba9fdd2c8c, 0x1c8f33e1 },
138 { 0x277a1a6f6b266bf6, { 0x415d81a8, 0x61eb5e1e }, 0x1a5a3e1d41b30c0f, 0x29cde3ae },
139 { 0x1bdbb49a237035cb, { 0xea5bf17, 0x1d25ffb3 }, 0xdffc51c53d44b93, 0x5170574 },
140 { 0x2bce6d29b64fb8, { 0x3bfd5631, 0x7525c9bb }, 0x166ebedda7ac57, 0x3026dfab },
141 { 0x3a02116103df5013, { 0x2ee18a83, 0x3299aea8 }, 0x35be8922ab1e2a84, 0x298d9919 },
142 { 0x7b5762390799b18c, { 0x12f8e5b9, 0x2563bcd4 }, 0x3e960077aca01209, 0x93afeb8 },
143 { 0x69cfd72537021579, { 0x4c35f468, 0x6a40feee }, 0x4be4cb3848be98a3, 0x4ff96b9e },
144 { 0x49dfdf835120f1c1, { 0x8cb3759, 0x559eb891 }, 0x79663f7120edade, 0x51b1fb5b },
145 { 0x74b5be5c27676381, { 0x47e4c5e0, 0x7c7b19ff }, 0x4367d2dff36a1028, 0x7a7b5608 },
146 { 0x4f50f97075e7f431, { 0x9a50a17, 0x11cd1185 }, 0x2af952b34c032df4, 0xfddc6a3 },
147 { 0x2f8b0d712e393be4, { 0x1487e386, 0x15aa356e }, 0x2d0df36478a776aa, 0x14e2564c },
148 { 0x224c1c75999d3de, { 0x3b2df0ea, 0x4523b100 }, 0x1d5b481d145f08a, 0x15145eec },
149 { 0x2bcbcea22a399a76, { 0x28b58212, 0x48dd013e }, 0x187814d084c47cab, 0x3a38ebe2 },
150 { 0x1dbfca91257cb2d1, { 0x1a8c04d9, 0x5e92502c }, 0x859cf7d00f77545, 0x7431f4d },
151 { 0x7f20039b57cda935, { 0xeccf651, 0x323f476e }, 0x25720cd976461a77, 0x202817a3 },
152 { 0x40512c6a586aa087, { 0x113b0423, 0x398c9eab }, 0x1341c03de8696a7e, 0x1e27284b },
153 { 0x63d802693f050a11, { 0xf50cdd6, 0xfce2a44 }, 0x60c0177bb5e46846, 0xf7ad89e },
154 { 0x2d956b422838de77, { 0xb2d345b, 0x1321e557 }, 0x1aa0ed16b6aa5319, 0xfe1a5ce },
155 { 0x5a1cdf0c1657bc91, { 0x1d77bb0c, 0x1f991ff1 }, 0x54097ee94ff87560, 0x11c4a26c },
156 { 0x3801b26d7e00176b, { 0xeed25da, 0x1a819d8b }, 0x1f89e96a3a639526, 0xcd51e7c },
157 { 0x37655e74338e1e45, { 0x300e170a, 0x5a1595fe }, 0x1d8cfb55fddc0441, 0x3df05434 },
158 { 0x7b38703f2a84e6, { 0x66d9053, 0xc79b6b9 }, 0x3f7d4c91774094, 0x26d939e },
159 { 0x2245063c0acb3215, { 0x30ce2f5b, 0x610e7271 }, 0x113b916468389235, 0x1b588512 },
160 { 0x6bc195877b7b8a7e, { 0x392004aa, 0x4a24e60c }, 0x530594fb17db6ba5, 0x35c0a5f0 },
161 { 0x40a3fde23c7b43db, { 0x4e712195, 0x6553e56e }, 0x320a799bc76a466a, 0x5e23a5eb },
162 { 0x1d3dfc2866fbccba, { 0x5075b517, 0x5fc42245 }, 0x18917f0061595bc3, 0x3fcf4527 },
163 { 0x19aeb14045a61121, { 0x1bf6edec, 0x707e2f4b }, 0x6626672a070bcc7, 0x3607801f },
164 { 0x44ff90486c531e9f, { 0x66598a, 0x8a90dc }, 0x32f6f2b0525199b0, 0x5ab576 },
165 { 0x3f3e7121092c5bcb, { 0x1c754df7, 0x5951a1b9 }, 0x14267f50b7ef375d, 0x221220a8 },
166 { 0x60e2dafb7e50a67e, { 0x4d96c66e, 0x65bd878d }, 0x49e31715ac393f8b, 0x4e97b195 },
167 { 0x656286667e0e6e29, { 0x9d971a2, 0xacda23b }, 0x5c6ee315ead6cb4f, 0x516f5bd },
168 { 0x1114e0974255d507, { 0x1c693, 0x2d6ff }, 0xaae42e4b35f6e60, 0x8b65 },
169 { 0x508c8baf3a70ff5a, { 0x3b26b779, 0x6ad78745 }, 0x2c98387636c4b365, 0x11dc6a51 },
170 { 0x5b47bc666bf1f9cf, { 0x10a87ed6, 0x187d358a }, 0x3e1767155848368b, 0xfb871c },
171 { 0x50954e3744460395, { 0x7a42263, 0xcdaa048 }, 0x2fe739f0aee1fee1, 0xb8add57 },
172 { 0x20020b406550dd8f, { 0x3318539, 0x42eead0 }, 0x186f326325fa346b, 0x10d3ae7 },
173 { 0x5bcb0b872439ffd5, { 0x6f61fb2, 0x9af7344 }, 0x41fa1e3bec3c1b30, 0x4fee45a },
174 { 0x7a670f365db87a53, { 0x417e102, 0x3bb54c67 }, 0x8642a558304fd9e, 0x3b65f514 },
175 { 0x1ef0db1e7bab1cd0, { 0x2b60cf38, 0x4188f78f }, 0x147ae0d6226b2ee6, 0x336b6106 }
176 };
177
178 for (unsigned i = 0; i < 80; i++) {
179 BlockFrequency Freq(Tests[i].Freq);
180 uint32_t Remainder = Freq.scale(BranchProbability(Tests[i].Prob[0],
181 Tests[i].Prob[1]));
182 EXPECT_EQ(Tests[i].ExpectedFreq, Freq.getFrequency());
183 EXPECT_EQ(Tests[i].ExpectedRemainder, Remainder);
184 }
52185 }
53186
54187 TEST(BlockFrequency, Divide) {