llvm.org GIT mirror llvm / 30fa583
[TailCallElim] Preserve DT and PDT Summary: Previously, in the NewPM pipeline, TailCallElim caused the DomTree to be recalculated whenever it modified any instruction in the Function. For example, ``` CallInst *CI = dyn_cast<CallInst>(&I); ... CI->setTailCall(); Modified = true; ... if (!Modified || ...) return PreservedAnalyses::all(); ``` After applying this patch, the DomTree is only recalculated when needed (at the cost of an extra insertEdge() call and an extra deleteEdge() call). When optimizing SQLite with the `-passes="default<O3>"` pipeline of the NewPM, the number of DomTree recalculations decreases by 6.2% and the number of nodes visited by DFS decreases by 2.9%. The time spent in DomTree decreases by approximately 1% to 2.5% after applying the patch. Statistics: ``` Before the patch: 23010 dom-tree-stats - Number of DomTree recalculations 489264 dom-tree-stats - Number of nodes visited by DFS -- DomTree After the patch: 21581 dom-tree-stats - Number of DomTree recalculations 475088 dom-tree-stats - Number of nodes visited by DFS -- DomTree ``` Reviewers: kuhar, dmgreen, brzycki, grosser, davide Reviewed By: kuhar, brzycki Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D49982 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@338954 91177308-0d34-0410-b5e6-96231b3b80d8 Chijun Sima 1 year, 1 month ago
16 changed file(s) with 71 addition(s) and 42 deletion(s). Raw diff Collapse all Expand all
228228 /// value defined by a PHI, propagate the right value into the return. It
229229 /// returns the new return instruction in the predecessor.
230230 ReturnInst *FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
231 BasicBlock *Pred);
231 BasicBlock *Pred,
232 DomTreeUpdater *DTU = nullptr);
232233
233234 /// Split the containing block at the specified instruction - everything before
234235 /// SplitBefore stays in the old basic block, and the rest of the instructions
6060 #include "llvm/Analysis/InstructionSimplify.h"
6161 #include "llvm/Analysis/Loads.h"
6262 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
63 #include "llvm/Analysis/PostDominators.h"
6364 #include "llvm/Analysis/TargetTransformInfo.h"
6465 #include "llvm/IR/CFG.h"
6566 #include "llvm/IR/CallSite.h"
6768 #include "llvm/IR/DataLayout.h"
6869 #include "llvm/IR/DerivedTypes.h"
6970 #include "llvm/IR/DiagnosticInfo.h"
71 #include "llvm/IR/DomTreeUpdater.h"
72 #include "llvm/IR/Dominators.h"
7073 #include "llvm/IR/Function.h"
7174 #include "llvm/IR/InstIterator.h"
7275 #include "llvm/IR/Instructions.h"
487490 return CI;
488491 }
489492
490 static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret,
491 BasicBlock *&OldEntry,
492 bool &TailCallsAreMarkedTail,
493 SmallVectorImpl &ArgumentPHIs,
494 AliasAnalysis *AA,
495 OptimizationRemarkEmitter *ORE) {
493 static bool eliminateRecursiveTailCall(
494 CallInst *CI, ReturnInst *Ret, BasicBlock *&OldEntry,
495 bool &TailCallsAreMarkedTail, SmallVectorImpl &ArgumentPHIs,
496 AliasAnalysis *AA, OptimizationRemarkEmitter *ORE, DomTreeUpdater &DTU) {
496497 // If we are introducing accumulator recursion to eliminate operations after
497498 // the call instruction that are both associative and commutative, the initial
498499 // value for the accumulator is placed in this variable. If this value is set
592593 PN->addIncoming(&*I, NewEntry);
593594 ArgumentPHIs.push_back(PN);
594595 }
596 // The entry block was changed from OldEntry to NewEntry.
597 // The forward DominatorTree needs to be recalculated when the EntryBB is
598 // changed. In this corner-case we recalculate the entire tree.
599 DTU.recalculate(*NewEntry->getParent());
595600 }
596601
597602 // If this function has self recursive calls in the tail position where some
667672
668673 BB->getInstList().erase(Ret); // Remove return.
669674 BB->getInstList().erase(CI); // Remove call.
675 DTU.insertEdge(BB, OldEntry);
670676 ++NumEliminated;
671677 return true;
672678 }
675681 BasicBlock *BB, ReturnInst *Ret, BasicBlock *&OldEntry,
676682 bool &TailCallsAreMarkedTail, SmallVectorImpl &ArgumentPHIs,
677683 bool CannotTailCallElimCallsMarkedTail, const TargetTransformInfo *TTI,
678 AliasAnalysis *AA, OptimizationRemarkEmitter *ORE) {
684 AliasAnalysis *AA, OptimizationRemarkEmitter *ORE, DomTreeUpdater &DTU) {
679685 bool Change = false;
680686
681687 // Make sure this block is a trivial return block.
701707 if (CallInst *CI = findTRECandidate(BI, CannotTailCallElimCallsMarkedTail, TTI)){
702708 LLVM_DEBUG(dbgs() << "FOLDING: " << *BB
703709 << "INTO UNCOND BRANCH PRED: " << *Pred);
704 ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred);
710 ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred, &DTU);
705711
706712 // Cleanup: if all predecessors of BB have been eliminated by
707713 // FoldReturnIntoUncondBranch, delete it. It is important to empty it,
708714 // because the ret instruction in there is still using a value which
709715 // eliminateRecursiveTailCall will attempt to remove.
710716 if (!BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
711 BB->eraseFromParent();
717 DTU.deleteBB(BB);
712718
713719 eliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail,
714 ArgumentPHIs, AA, ORE);
720 ArgumentPHIs, AA, ORE, DTU);
715721 ++NumRetDuped;
716722 Change = true;
717723 }
720726 return Change;
721727 }
722728
723 static bool processReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
724 bool &TailCallsAreMarkedTail,
725 SmallVectorImpl &ArgumentPHIs,
726 bool CannotTailCallElimCallsMarkedTail,
727 const TargetTransformInfo *TTI,
728 AliasAnalysis *AA,
729 OptimizationRemarkEmitter *ORE) {
729 static bool processReturningBlock(
730 ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail,
731 SmallVectorImpl &ArgumentPHIs,
732 bool CannotTailCallElimCallsMarkedTail, const TargetTransformInfo *TTI,
733 AliasAnalysis *AA, OptimizationRemarkEmitter *ORE, DomTreeUpdater &DTU) {
730734 CallInst *CI = findTRECandidate(Ret, CannotTailCallElimCallsMarkedTail, TTI);
731735 if (!CI)
732736 return false;
733737
734738 return eliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail,
735 ArgumentPHIs, AA, ORE);
739 ArgumentPHIs, AA, ORE, DTU);
736740 }
737741
738742 static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI,
739743 AliasAnalysis *AA,
740 OptimizationRemarkEmitter *ORE) {
744 OptimizationRemarkEmitter *ORE,
745 DomTreeUpdater &DTU) {
741746 if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
742747 return false;
743748
772777 if (ReturnInst *Ret = dyn_cast(BB->getTerminator())) {
773778 bool Change = processReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
774779 ArgumentPHIs, !CanTRETailMarkedCall,
775 TTI, AA, ORE);
780 TTI, AA, ORE, DTU);
776781 if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
777 Change = foldReturnAndProcessPred(BB, Ret, OldEntry,
778 TailCallsAreMarkedTail, ArgumentPHIs,
779 !CanTRETailMarkedCall, TTI, AA, ORE);
782 Change = foldReturnAndProcessPred(
783 BB, Ret, OldEntry, TailCallsAreMarkedTail, ArgumentPHIs,
784 !CanTRETailMarkedCall, TTI, AA, ORE, DTU);
780785 MadeChange |= Change;
781786 }
782787 }
809814 AU.addRequired();
810815 AU.addRequired();
811816 AU.addPreserved();
817 AU.addPreserved();
818 AU.addPreserved();
812819 }
813820
814821 bool runOnFunction(Function &F) override {
815822 if (skipFunction(F))
816823 return false;
817824
825 auto *DTWP = getAnalysisIfAvailable();
826 auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
827 auto *PDTWP = getAnalysisIfAvailable();
828 auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
829 // There is no noticeable performance difference here between Lazy and Eager
830 // UpdateStrategy based on some test results. It is feasible to switch the
831 // UpdateStrategy to Lazy if we find it profitable later.
832 DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
833
818834 return eliminateTailRecursion(
819835 F, &getAnalysis().getTTI(F),
820836 &getAnalysis().getAAResults(),
821 &getAnalysis().getORE());
837 &getAnalysis().getORE(), DTU);
822838 }
823839 };
824840 }
842858 TargetTransformInfo &TTI = AM.getResult(F);
843859 AliasAnalysis &AA = AM.getResult(F);
844860 auto &ORE = AM.getResult(F);
845
846 bool Changed = eliminateTailRecursion(F, &TTI, &AA, &ORE);
861 auto *DT = AM.getCachedResult(F);
862 auto *PDT = AM.getCachedResult(F);
863 // There is no noticeable performance difference here between Lazy and Eager
864 // UpdateStrategy based on some test results. It is feasible to switch the
865 // UpdateStrategy to Lazy if we find it profitable later.
866 DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
867 bool Changed = eliminateTailRecursion(F, &TTI, &AA, &ORE, DTU);
847868
848869 if (!Changed)
849870 return PreservedAnalyses::all();
850871 PreservedAnalyses PA;
851872 PA.preserve();
873 PA.preserve();
874 PA.preserve();
852875 return PA;
853876 }
645645 }
646646
647647 ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
648 BasicBlock *Pred) {
648 BasicBlock *Pred,
649 DomTreeUpdater *DTU) {
649650 Instruction *UncondBranch = Pred->getTerminator();
650651 // Clone the return and add it to the end of the predecessor.
651652 Instruction *NewRet = RI->clone();
679680 // longer branch to them.
680681 BB->removePredecessor(Pred);
681682 UncondBranch->eraseFromParent();
683
684 if (DTU)
685 DTU->deleteEdge(Pred, BB);
686
682687 return cast(NewRet);
683688 }
684689
None ; RUN: opt < %s -tailcallelim -S | FileCheck %s
0 ; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s
11 ; PR7328
22 ; PR7506
33 define i32 @foo(i32 %x) {
None ; RUN: opt -tailcallelim -S < %s 2>&1 | FileCheck %s
0 ; RUN: opt -tailcallelim -verify-dom-info -S < %s 2>&1 | FileCheck %s
11
22 ; CHECK: add nsw i32
33 ; CHECK-NEXT: br label
None ; RUN: opt < %s -tailcallelim -S | FileCheck %s
1 ; RUN: opt < %s -passes=tailcallelim -S | FileCheck %s
0 ; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s
1 ; RUN: opt < %s -passes=tailcallelim -verify-dom-info -S | FileCheck %s
22
33 define i32 @test1_factorial(i32 %x) {
44 entry:
0 ; REQUIRES: asserts
11 ; This function contains two tail calls, which should be eliminated
2 ; RUN: opt < %s -tailcallelim -stats -disable-output 2>&1 | grep "2 tailcallelim"
2 ; RUN: opt < %s -tailcallelim -verify-dom-info -stats -disable-output 2>&1 | grep "2 tailcallelim"
33
44 define i32 @Ack(i32 %M.1, i32 %N.1) {
55 entry:
None ; RUN: opt < %s -tailcallelim -S | FileCheck %s
0 ; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s
11
22 declare void @noarg()
33 declare void @use(i32*)
None ; RUN: opt < %s -tailcallelim -S | FileCheck %s
0 ; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s
11
22 define i32 @f_1(i32 %x) {
33 ; CHECK-LABEL: @f_1(
None ; RUN: opt < %s -tailcallelim -S | grep call | count 4
0 ; RUN: opt < %s -tailcallelim -verify-dom-info -S | grep call | count 4
11 ; PR4323
22
33 ; Several cases where tail call elimination should not move the load above the
0 ; REQUIRES: asserts
11 ; Duplicate the return into if.end to enable TCE.
2 ; RUN: opt -tailcallelim -stats -disable-output < %s 2>&1 | FileCheck %s
2 ; RUN: opt -tailcallelim -verify-dom-info -stats -disable-output < %s 2>&1 | FileCheck %s
33
44 ; CHECK: Number of return duplicated
55
None ; RUN: opt < %s -tailcallelim -S | FileCheck %s
0 ; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s
11
22 ; Don't turn this into an infinite loop, this is probably the implementation
33 ; of fabs and we expect the codegen to lower fabs.
None ; RUN: opt < %s -tailcallelim -S | FileCheck %s
0 ; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s
11
22 ; CHECK: tail call void @callee0()
33 ; CHECK: notail call void @callee1()
None ; RUN: opt %s -tailcallelim -pass-remarks=tailcallelim -o /dev/null 2>&1 | FileCheck %s
0 ; RUN: opt %s -tailcallelim -verify-dom-info -pass-remarks=tailcallelim -o /dev/null 2>&1 | FileCheck %s
11 ; RUN: opt %s -o /dev/null -passes='require,tailcallelim' -pass-remarks=tailcallelim 2>&1 | FileCheck %s
22
33 ; CHECK: /home/davide/pat.c:2:20: transforming tail recursion into loop
None ; RUN: opt < %s -tailcallelim -S | FileCheck %s
0 ; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s
11 ; PR4323
22
33 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
None ; RUN: opt < %s -tailcallelim -S | FileCheck %s
0 ; RUN: opt < %s -tailcallelim -verify-dom-info -S | FileCheck %s
11
22 ; Test that we don't tail call in a functions that calls returns_twice
33 ; functions.