llvm.org GIT mirror llvm / b42a626
PR2621: Improvements to the SCEV AddRec binomial expansion. This version uses a new algorithm for evaluating the binomial coefficients which is significantly more efficient for AddRecs of more than 2 terms (see the comments in the code for details on how the algorithm works). It also fixes some bugs: it removes the arbitrary length restriction for AddRecs, it fixes the silent generation of incorrect code for AddRecs which require a wide calculation width, and it fixes an issue where we were incorrectly truncating the iteration count too far when evaluating an AddRec expression narrower than the induction variable. There are still a few related issues I know of: I think there's still an issue with the SCEVExpander expansion of AddRec in terms of the width of the induction variable used. The hack to avoid generating too-wide integers shouldn't be necessary; instead, the callers should be considering the cost of the expansion before expanding it (in addition to not expanding too-wide integers, we might not want to expand expressions that are really expensive, especially when optimizing for size; calculating an length-17 32-bit AddRec currently generates about 250 instructions of straight-line code on X86). Also, for long 32-bit AddRecs on X86, CodeGen really sucks at scheduling the code. I'm planning on filing follow-up PRs for these issues. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@54332 91177308-0d34-0410-b5e6-96231b3b80d8 Eli Friedman 12 years ago
3 changed file(s) with 198 addition(s) and 90 deletion(s). Raw diff Collapse all Expand all
506506 }
507507
508508
509 /// BinomialCoefficient - Compute BC(It, K). The result is of the same type as
510 /// It. Assume, K > 0.
509 /// BinomialCoefficient - Compute BC(It, K). The result has width W.
510 // Assume, K > 0.
511511 static SCEVHandle BinomialCoefficient(SCEVHandle It, unsigned K,
512 ScalarEvolution &SE) {
512 ScalarEvolution &SE,
513 const IntegerType* ResultTy) {
514 // Handle the simplest case efficiently.
515 if (K == 1)
516 return SE.getTruncateOrZeroExtend(It, ResultTy);
517
513518 // We are using the following formula for BC(It, K):
514519 //
515520 // BC(It, K) = (It * (It - 1) * ... * (It - K + 1)) / K!
516521 //
517 // Suppose, W is the bitwidth of It (and of the return value as well). We
518 // must be prepared for overflow. Hence, we must assure that the result of
519 // our computation is equal to the accurate one modulo 2^W. Unfortunately,
520 // division isn't safe in modular arithmetic. This means we must perform the
521 // whole computation accurately and then truncate the result to W bits.
522 // Suppose, W is the bitwidth of the return value. We must be prepared for
523 // overflow. Hence, we must assure that the result of our computation is
524 // equal to the accurate one modulo 2^W. Unfortunately, division isn't
525 // safe in modular arithmetic.
522526 //
523 // The dividend of the formula is a multiplication of K integers of bitwidth
524 // W. K*W bits suffice to compute it accurately.
527 // However, this code doesn't use exactly that formula; the formula it uses
528 // is something like the following, where T is the number of factors of 2 in
529 // K! (i.e. trailing zeros in the binary representation of K!), and ^ is
530 // exponentiation:
525531 //
526 // FIXME: We assume the divisor can be accurately computed using 16-bit
527 // unsigned integer type. It is true up to K = 8 (AddRecs of length 9). In
528 // future we may use APInt to use the minimum number of bits necessary to
529 // compute it accurately.
532 // BC(It, K) = (It * (It - 1) * ... * (It - K + 1)) / 2^T / (K! / 2^T)
530533 //
531 // It is safe to use unsigned division here: the dividend is nonnegative and
532 // the divisor is positive.
533
534 // Handle the simplest case efficiently.
535 if (K == 1)
536 return It;
537
538 assert(K < 9 && "We cannot handle such long AddRecs yet.");
539
540 // FIXME: A temporary hack to remove in future. Arbitrary precision integers
541 // aren't supported by the code generator yet. For the dividend, the bitwidth
542 // we use is the smallest power of 2 greater or equal to K*W and less or equal
543 // to 64. Note that setting the upper bound for bitwidth may still lead to
544 // miscompilation in some cases.
545 unsigned DividendBits = 1U << Log2_32_Ceil(K * It->getBitWidth());
546 if (DividendBits > 64)
547 DividendBits = 64;
548 #if 0 // Waiting for the APInt support in the code generator...
549 unsigned DividendBits = K * It->getBitWidth();
550 #endif
551
552 const IntegerType *DividendTy = IntegerType::get(DividendBits);
553 const SCEVHandle ExIt = SE.getTruncateOrZeroExtend(It, DividendTy);
554
555 // The final number of bits we need to perform the division is the maximum of
556 // dividend and divisor bitwidths.
557 const IntegerType *DivisionTy =
558 IntegerType::get(std::max(DividendBits, 16U));
559
560 // Compute K! We know K >= 2 here.
561 unsigned F = 2;
562 for (unsigned i = 3; i <= K; ++i)
563 F *= i;
564 APInt Divisor(DivisionTy->getBitWidth(), F);
565
566 // Handle this case efficiently, it is common to have constant iteration
567 // counts while computing loop exit values.
568 if (SCEVConstant *SC = dyn_cast(ExIt)) {
569 const APInt& N = SC->getValue()->getValue();
570 APInt Dividend(N.getBitWidth(), 1);
571 for (; K; --K)
572 Dividend *= N-(K-1);
573 if (DividendTy != DivisionTy)
574 Dividend = Dividend.zext(DivisionTy->getBitWidth());
575
576 APInt Result = Dividend.udiv(Divisor);
577 if (Result.getBitWidth() != It->getBitWidth())
578 Result = Result.trunc(It->getBitWidth());
579
580 return SE.getConstant(Result);
581 }
582
583 SCEVHandle Dividend = ExIt;
584 for (unsigned i = 1; i != K; ++i)
585 Dividend =
586 SE.getMulExpr(Dividend,
587 SE.getMinusSCEV(ExIt, SE.getIntegerSCEV(i, DividendTy)));
588
589 return SE.getTruncateOrZeroExtend(
590 SE.getUDivExpr(
591 SE.getTruncateOrZeroExtend(Dividend, DivisionTy),
592 SE.getConstant(Divisor)
593 ), It->getType());
534 // This formula is trivially equivalent to the previous formula. However,
535 // this formula can be implemented much more efficiently. The trick is that
536 // K! / 2^T is odd, and exact division by an odd number *is* safe in modular
537 // arithmetic. To do exact division in modular arithmetic, all we have
538 // to do is multiply by the inverse. Therefore, this step can be done at
539 // width W.
540 //
541 // The next issue is how to safely do the division by 2^T. The way this
542 // is done is by doing the multiplication step at a width of at least W + T
543 // bits. This way, the bottom W+T bits of the product are accurate. Then,
544 // when we perform the division by 2^T (which is equivalent to a right shift
545 // by T), the bottom W bits are accurate. Extra bits are okay; they'll get
546 // truncated out after the division by 2^T.
547 //
548 // In comparison to just directly using the first formula, this technique
549 // is much more efficient; using the first formula requires W * K bits,
550 // but this formula less than W + K bits. Also, the first formula requires
551 // a division step, whereas this formula only requires multiplies and shifts.
552 //
553 // It doesn't matter whether the subtraction step is done in the calculation
554 // width or the input iteration count's width; if the subtraction overflows,
555 // the result must be zero anyway. We prefer here to do it in the width of
556 // the induction variable because it helps a lot for certain cases; CodeGen
557 // isn't smart enough to ignore the overflow, which leads to much less
558 // efficient code if the width of the subtraction is wider than the native
559 // register width.
560 //
561 // (It's possible to not widen at all by pulling out factors of 2 before
562 // the multiplication; for example, K=2 can be calculated as
563 // It/2*(It+(It*INT_MIN/INT_MIN)+-1). However, it requires
564 // extra arithmetic, so it's not an obvious win, and it gets
565 // much more complicated for K > 3.)
566
567 // Protection from insane SCEVs; this bound is conservative,
568 // but it probably doesn't matter.
569 if (K > 1000)
570 return new SCEVCouldNotCompute();
571
572 unsigned W = ResultTy->getBitWidth();
573
574 // Calculate K! / 2^T and T; we divide out the factors of two before
575 // multiplying for calculating K! / 2^T to avoid overflow.
576 // Other overflow doesn't matter because we only care about the bottom
577 // W bits of the result.
578 APInt OddFactorial(W, 1);
579 unsigned T = 1;
580 for (unsigned i = 3; i <= K; ++i) {
581 APInt Mult(W, i);
582 unsigned TwoFactors = Mult.countTrailingZeros();
583 T += TwoFactors;
584 Mult = Mult.lshr(TwoFactors);
585 OddFactorial *= Mult;
586 }
587
588 // We need at least W + T bits for the multiplication step
589 // FIXME: A temporary hack; we round up the bitwidths
590 // to the nearest power of 2 to be nice to the code generator.
591 unsigned CalculationBits = 1U << Log2_32_Ceil(W + T);
592 // FIXME: Temporary hack to avoid generating integers that are too wide.
593 // Although, it's not completely clear how to determine how much
594 // widening is safe; for example, on X86, we can't really widen
595 // beyond 64 because we need to be able to do multiplication
596 // that's CalculationBits wide, but on X86-64, we can safely widen up to
597 // 128 bits.
598 if (CalculationBits > 64)
599 return new SCEVCouldNotCompute();
600
601 // Calcuate 2^T, at width T+W.
602 APInt DivFactor = APInt(CalculationBits, 1).shl(T);
603
604 // Calculate the multiplicative inverse of K! / 2^T;
605 // this multiplication factor will perform the exact division by
606 // K! / 2^T.
607 APInt Mod = APInt::getSignedMinValue(W+1);
608 APInt MultiplyFactor = OddFactorial.zext(W+1);
609 MultiplyFactor = MultiplyFactor.multiplicativeInverse(Mod);
610 MultiplyFactor = MultiplyFactor.trunc(W);
611
612 // Calculate the product, at width T+W
613 const IntegerType *CalculationTy = IntegerType::get(CalculationBits);
614 SCEVHandle Dividend = SE.getTruncateOrZeroExtend(It, CalculationTy);
615 for (unsigned i = 1; i != K; ++i) {
616 SCEVHandle S = SE.getMinusSCEV(It, SE.getIntegerSCEV(i, It->getType()));
617 Dividend = SE.getMulExpr(Dividend,
618 SE.getTruncateOrZeroExtend(S, CalculationTy));
619 }
620
621 // Divide by 2^T
622 SCEVHandle DivResult = SE.getUDivExpr(Dividend, SE.getConstant(DivFactor));
623
624 // Truncate the result, and divide by K! / 2^T.
625
626 return SE.getMulExpr(SE.getConstant(MultiplyFactor),
627 SE.getTruncateOrZeroExtend(DivResult, ResultTy));
594628 }
595629
596630 /// evaluateAtIteration - Return the value of this chain of recurrences at
609643 // The computation is correct in the face of overflow provided that the
610644 // multiplication is performed _after_ the evaluation of the binomial
611645 // coefficient.
612 SCEVHandle Val = SE.getMulExpr(getOperand(i),
613 BinomialCoefficient(It, i, SE));
646 SCEVHandle Val =
647 SE.getMulExpr(getOperand(i),
648 BinomialCoefficient(It, i, SE,
649 cast(getType())));
614650 Result = SE.getAddExpr(Result, Val);
615651 }
616652 return Result;
24402476 // loop iterates. Compute this now.
24412477 SCEVHandle IterationCount = getIterationCount(AddRec->getLoop());
24422478 if (IterationCount == UnknownValue) return UnknownValue;
2443 IterationCount = SE.getTruncateOrZeroExtend(IterationCount,
2444 AddRec->getType());
2445
2446 // If the value is affine, simplify the expression evaluation to just
2447 // Start + Step*IterationCount.
2448 if (AddRec->isAffine())
2449 return SE.getAddExpr(AddRec->getStart(),
2450 SE.getMulExpr(IterationCount,
2451 AddRec->getOperand(1)));
2452
2453 // Otherwise, evaluate it the hard way.
2479
2480 // Then, evaluate the AddRec.
24542481 return AddRec->evaluateAtIteration(IterationCount, SE);
24552482 }
24562483 return UnknownValue;
0 ; RUN: llvm-as < %s | opt -analyze -scalar-evolution -disable-output \
1 ; RUN: -scalar-evolution-max-iterations=0 | grep -F "Exits: 20028"
2 ; PR2621
3
4 define i32 @a() nounwind {
5 entry:
6 br label %bb1
7
8 bb:
9 trunc i32 %i.0 to i16
10 add i16 %0, %x16.0
11 add i32 %i.0, 1
12 br label %bb1
13
14 bb1:
15 %i.0 = phi i32 [ 0, %entry ], [ %2, %bb ]
16 %x16.0 = phi i16 [ 0, %entry ], [ %1, %bb ]
17 icmp ult i32 %i.0, 888888
18 br i1 %3, label %bb, label %bb2
19
20 bb2:
21 zext i16 %x16.0 to i32
22 ret i32 %4
23 }
24
0 ; RUN: llvm-as < %s | opt -analyze -scalar-evolution -disable-output \
1 ; RUN: -scalar-evolution-max-iterations=0 | grep -F "Exits: -19168"
2 ; PR2621
3
4 define i32 @a() nounwind {
5 entry:
6 br label %bb1
7
8 bb: ; preds = %bb1
9 add i16 %x17.0, 1 ; :0 [#uses=2]
10 add i16 %0, %x16.0 ; :1 [#uses=2]
11 add i16 %1, %x15.0 ; :2 [#uses=2]
12 add i16 %2, %x14.0 ; :3 [#uses=2]
13 add i16 %3, %x13.0 ; :4 [#uses=2]
14 add i16 %4, %x12.0 ; :5 [#uses=2]
15 add i16 %5, %x11.0 ; :6 [#uses=2]
16 add i16 %6, %x10.0 ; :7 [#uses=2]
17 add i16 %7, %x9.0 ; :8 [#uses=2]
18 add i16 %8, %x8.0 ; :9 [#uses=2]
19 add i16 %9, %x7.0 ; :10 [#uses=2]
20 add i16 %10, %x6.0 ; :11 [#uses=2]
21 add i16 %11, %x5.0 ; :12 [#uses=2]
22 add i16 %12, %x4.0 ; :13 [#uses=2]
23 add i16 %13, %x3.0 ; :14 [#uses=2]
24 add i16 %14, %x2.0 ; :15 [#uses=2]
25 add i16 %15, %x1.0 ; :16 [#uses=1]
26 add i32 %i.0, 1 ; :17 [#uses=1]
27 br label %bb1
28
29 bb1: ; preds = %bb, %entry
30 %x2.0 = phi i16 [ 0, %entry ], [ %15, %bb ] ; [#uses=1]
31 %x3.0 = phi i16 [ 0, %entry ], [ %14, %bb ] ; [#uses=1]
32 %x4.0 = phi i16 [ 0, %entry ], [ %13, %bb ] ; [#uses=1]
33 %x5.0 = phi i16 [ 0, %entry ], [ %12, %bb ] ; [#uses=1]
34 %x6.0 = phi i16 [ 0, %entry ], [ %11, %bb ] ; [#uses=1]
35 %x7.0 = phi i16 [ 0, %entry ], [ %10, %bb ] ; [#uses=1]
36 %x8.0 = phi i16 [ 0, %entry ], [ %9, %bb ] ; [#uses=1]
37 %x9.0 = phi i16 [ 0, %entry ], [ %8, %bb ] ; [#uses=1]
38 %x10.0 = phi i16 [ 0, %entry ], [ %7, %bb ] ; [#uses=1]
39 %x11.0 = phi i16 [ 0, %entry ], [ %6, %bb ] ; [#uses=1]
40 %x12.0 = phi i16 [ 0, %entry ], [ %5, %bb ] ; [#uses=1]
41 %x13.0 = phi i16 [ 0, %entry ], [ %4, %bb ] ; [#uses=1]
42 %x14.0 = phi i16 [ 0, %entry ], [ %3, %bb ] ; [#uses=1]
43 %x15.0 = phi i16 [ 0, %entry ], [ %2, %bb ] ; [#uses=1]
44 %x16.0 = phi i16 [ 0, %entry ], [ %1, %bb ] ; [#uses=1]
45 %x17.0 = phi i16 [ 0, %entry ], [ %0, %bb ] ; [#uses=1]
46 %i.0 = phi i32 [ 0, %entry ], [ %17, %bb ] ; [#uses=2]
47 %x1.0 = phi i16 [ 0, %entry ], [ %16, %bb ] ; [#uses=2]
48 icmp ult i32 %i.0, 8888 ; :18 [#uses=1]
49 br i1 %18, label %bb, label %bb2
50
51 bb2: ; preds = %bb1
52 zext i16 %x1.0 to i32 ; :19 [#uses=1]
53 ret i32 %19
54 }
55