llvm.org GIT mirror: llvm / 0c01bc3
LSR: rewrite inner loops only.

Rewriting the entire loop nest now requires -enable-lsr-nested. See PR11035
for some performance data. A few unit tests specifically test nested LSR,
and are now under a flag.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@140762 91177308-0d34-0410-b5e6-96231b3b80d8

Andrew Trick, 8 years ago
7 changed files with 45 additions and 14 deletions.
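With this change, LSR rewrites only innermost loops by default; rewriting a whole loop nest is gated behind the new hidden flag. A quick way to compare the two modes, mirroring the RUN lines updated below (the input file name here is illustrative, not from the patch):

  llc -march=x86-64 nested-loops.ll -o inner-only.s
  llc -march=x86-64 -enable-lsr-nested nested-loops.ll -o whole-nest.s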
 using namespace llvm;
 
 namespace llvm {
+cl::opt<bool> EnableNested(
+  "enable-lsr-nested", cl::Hidden, cl::desc("Enable LSR on nested loops"));
+
 cl::opt<bool> EnableRetry(
   "enable-lsr-retry", cl::Hidden, cl::desc("Enable LSR retry"));
 }
@@ ... @@
     if (AR->getLoop() == L)
       AddRecCost += 1; /// TODO: This should be a function of the stride.
 
-    // If this is an addrec for a loop that's already been visited by LSR,
-    // don't second-guess its addrec phi nodes. LSR isn't currently smart
-    // enough to reason about more than one loop at a time. Consider these
-    // registers free and leave them alone.
-    else if (L->contains(AR->getLoop()) ||
+    // If this is an addrec for another loop, don't second-guess its addrec
+    // phi nodes. LSR isn't currently smart enough to reason about more than
+    // one loop at a time. LSR has either already run on inner loops or will
+    // not run on other loops at all, and it cannot be expected to change
+    // sibling loops. If the AddRec exists, consider its register free and
+    // leave it alone. Otherwise, do not consider this formula at all.
+    // FIXME: why do we need to generate such formulae?
+    else if (!EnableNested || L->contains(AR->getLoop()) ||
              (!AR->getLoop()->contains(L) &&
               DT.dominates(L->getHeader(), AR->getLoop()->getHeader()))) {
       for (BasicBlock::iterator I = AR->getLoop()->getHeader()->begin();
@@ ... @@
             SE.getEffectiveSCEVType(AR->getType())) &&
             SE.getSCEV(PN) == AR)
           return;
+      }
+      if (!EnableNested) {
+        Loose();
+        return;
       }
       // If this isn't one of the addrecs that the loop already has, it
       // would require a costly new phi and add. TODO: This isn't
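For readers, the else-if predicate above separates three cases by the relationship between the current loop L and the addrec's loop. A minimal standalone sketch of that classification, using the same LoopInfo and DominatorTree calls as the patch (the helper name classifyAddRecLoop and the enum are hypothetical, not part of the change):

  // A reader's sketch, not part of the patch: the three loop relationships
  // the predicate above distinguishes.
  #include "llvm/Analysis/Dominators.h"
  #include "llvm/Analysis/LoopInfo.h"
  #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  using namespace llvm;

  enum AddRecLoopKind { CurrentLoop, InnerOrDominatedSibling, OtherLoop };

  static AddRecLoopKind classifyAddRecLoop(const Loop *L,
                                           const SCEVAddRecExpr *AR,
                                           DominatorTree &DT) {
    const Loop *ARLoop = AR->getLoop();
    if (ARLoop == L)
      return CurrentLoop;            // costed as a stride of the current loop
    // An inner loop of L, or a sibling loop whose header is dominated by
    // L's header: treat its existing addrec phis as free instead of
    // second-guessing them.
    if (L->contains(ARLoop) ||
        (!ARLoop->contains(L) &&
         DT.dominates(L->getHeader(), ARLoop->getHeader())))
      return InnerOrDominatedSibling;
    return OtherLoop;                // e.g. an outer loop that contains L
  }

With -enable-lsr-nested off, the patch folds the first two cases together, and when the addrec does not match an existing phi it gives up via Loose() rather than building a formula for another loop's induction variable.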
@@ ... @@
   // If loop preparation eliminates all interesting IV users, bail.
   if (IU.empty()) return;
 
+  // Skip nested loops until we can model them better with formulae.
+  if (!EnableNested && !L->empty()) {
+    DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
+    return;
+  }
+
   // Start collecting data and preparing for the solver.
   CollectInterestingTypesAndFactors();
   CollectFixupsAndInitialFormulae();
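The early-exit just added keys on Loop::empty(). Assuming the LoopInfo API of the time, where Loop::empty() returns true when a loop has no sub-loops, !L->empty() selects exactly the outer loops that default LSR now skips:

  // Hypothetical helper, assuming Loop::empty() means "no sub-loops":
  // true iff L contains at least one nested loop, i.e. L is not innermost.
  static bool isOuterLoop(const llvm::Loop *L) { return !L->empty(); }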
-; RUN: llc -mtriple=thumbv7-apple-darwin10 -mcpu=cortex-a8 < %s | FileCheck %s
+; RUN: llc -mtriple=thumbv7-apple-darwin10 -mcpu=cortex-a8 -enable-lsr-nested < %s | FileCheck %s
 
 ; LSR should recognize that this is an unrolled loop which can use
 ; constant offset addressing, so that each of the following stores
@@ ... @@
 ; CHECK: vstr.32 s{{.*}}, [{{(r[0-9]+)|(lr)}}, #32]
 ; CHECK: vstr.32 s{{.*}}, [{{(r[0-9]+)|(lr)}}, #64]
 ; CHECK: vstr.32 s{{.*}}, [{{(r[0-9]+)|(lr)}}, #96]
+
+; We can also save a register in the outer loop, but that requires
+; performing LSR on the outer loop.
 
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32-n32"
 
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -march=x86 -enable-lsr-nested | FileCheck %s
+;
+; Nested LSR is required to optimize this case.
+; We do not expect to see this form of IR without -enable-iv-rewrite.
 
 define void @borf(i8* nocapture %in, i8* nocapture %out) nounwind {
 ; CHECK: borf:
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -enable-lsr-nested | FileCheck %s
+;
+; Nested LSR is required to optimize this case.
+; We do not expect to see this form of IR without -enable-iv-rewrite.
 
 define void @borf(i8* nocapture %in, i8* nocapture %out) nounwind {
 ; CHECK: borf:
-; RUN: llc < %s -march=x86-64 -o %t
+; RUN: llc < %s -march=x86-64 -enable-lsr-nested -o %t
 ; RUN: not grep inc %t
 ; RUN: grep dec %t | count 2
 ; RUN: grep addq %t | count 12
@@ ... @@
 ; to insert new induction variables. Previously it would create a
 ; flood of new induction variables.
 ; Also, the loop reversal should kick in once.
+;
+; In this example, performing LSR on the entire loop nest, as opposed to only
+; the inner loop, can further reduce induction variables and their related
+; instructions and registers.
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"
-; RUN: llc < %s -march=x86 | grep cmp | grep 240
-; RUN: llc < %s -march=x86 | grep inc | count 1
+; RUN: llc < %s -march=x86 -enable-lsr-nested | grep cmp | grep 240
+; RUN: llc < %s -march=x86 -enable-lsr-nested | grep inc | count 1
 
 define i32 @foo(i32 %A, i32 %B, i32 %C, i32 %D) nounwind {
 entry:
-; RUN: llc < %s -march=x86 -stats -regalloc=linearscan |& grep {Number of loads added} | grep 2
-; RUN: llc < %s -march=x86 -stats -regalloc=linearscan |& grep {Number of spill slots allocated} | grep 1
-; RUN: llc < %s -march=x86 -stats -regalloc=linearscan |& grep {Number of machine instrs printed} | grep 34
+; RUN: llc < %s -march=x86 -stats -regalloc=linearscan -enable-lsr-nested |& grep {Number of loads added} | grep 2
+; RUN: llc < %s -march=x86 -stats -regalloc=linearscan -enable-lsr-nested |& grep {Number of spill slots allocated} | grep 1
+; RUN: llc < %s -march=x86 -stats -regalloc=linearscan -enable-lsr-nested |& grep {Number of machine instrs printed} | grep 34
 ; PR3495
+;
+; Note: this should not spill at all with either good LSR or good regalloc.
 
 target triple = "i386-pc-linux-gnu"
 @x = external global [8 x i32], align 32 ; <[8 x i32]*> [#uses=1]