llvm.org GIT mirror llvm / 45c9fb9
[HotColdSplit] Schedule splitting late to fix perf regression With or without PGO data applied, splitting early in the pipeline (either before the inliner or shortly after it) regresses performance across SPEC variants. The cause appears to be that splitting hides context for subsequent optimizations. Schedule splitting late again, in effect reversing r352080, which scheduled the splitting pass early for code size benefits (documented in https://reviews.llvm.org/D57082). Differential Revision: https://reviews.llvm.org/D58258 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@354158 91177308-0d34-0410-b5e6-96231b3b80d8 Vedant Kumar 6 months ago
7 changed file(s) with 43 addition(s) and 36 deletion(s). Raw diff Collapse all Expand all
273273 /// require some transformations for semantic reasons, they should explicitly
274274 /// build them.
275275 ModulePassManager buildModuleOptimizationPipeline(OptimizationLevel Level,
276 bool DebugLogging = false);
276 bool DebugLogging = false,
277 bool LTOPreLink = false);
277278
278279 /// Build a per-module default optimization pipeline.
279280 ///
287288 /// require some transformations for semantic reasons, they should explicitly
288289 /// build them.
289290 ModulePassManager buildPerModuleDefaultPipeline(OptimizationLevel Level,
290 bool DebugLogging = false);
291 bool DebugLogging = false,
292 bool LTOPreLink = false);
291293
292294 /// Build a pre-link, ThinLTO-targeting default optimization pipeline to
293295 /// a pass manager.
702702 if (EnableSyntheticCounts && !PGOOpt)
703703 MPM.addPass(SyntheticCountsPropagation());
704704
705 // Split out cold code. Splitting is done before inlining because 1) the most
706 // common kinds of cold regions can (a) be found before inlining and (b) do
707 // not grow after inlining, and 2) inhibiting inlining of cold code improves
708 // code size & compile time. Split after Mem2Reg to make code model estimates
709 // more accurate, but before InstCombine to allow it to clean things up.
710 if (EnableHotColdSplit && Phase != ThinLTOPhase::PostLink)
711 MPM.addPass(HotColdSplittingPass());
712
713705 // Require the GlobalsAA analysis for the module so we can query it within
714706 // the CGSCC pipeline.
715707 MPM.addPass(RequireAnalysisPass());
768760 return MPM;
769761 }
770762
771 ModulePassManager
772 PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
773 bool DebugLogging) {
763 ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
764 OptimizationLevel Level, bool DebugLogging, bool LTOPreLink) {
774765 ModulePassManager MPM(DebugLogging);
775766
776767 // Optimize globals now that the module is fully simplified.
879870 // alignment information, try to re-derive it here.
880871 OptimizePM.addPass(AlignmentFromAssumptionsPass());
881872
873 // Split out cold code. Splitting is done late to avoid hiding context from
874 // other optimizations and inadvertently regressing performance. The tradeoff
875 // is that this has a higher code size cost than splitting early.
876 if (EnableHotColdSplit && !LTOPreLink)
877 MPM.addPass(HotColdSplittingPass());
878
882879 // LoopSink pass sinks instructions hoisted by LICM, which serves as a
883880 // canonicalization pass that enables other optimizations. As a result,
884881 // LoopSink pass needs to be a very late IR pass to avoid undoing LICM
922919
923920 ModulePassManager
924921 PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
925 bool DebugLogging) {
922 bool DebugLogging, bool LTOPreLink) {
926923 assert(Level != O0 && "Must request optimizations for the default pipeline!");
927924
928925 ModulePassManager MPM(DebugLogging);
942939 DebugLogging));
943940
944941 // Now add the optimization pipeline.
945 MPM.addPass(buildModuleOptimizationPipeline(Level, DebugLogging));
942 MPM.addPass(buildModuleOptimizationPipeline(Level, DebugLogging, LTOPreLink));
946943
947944 return MPM;
948945 }
10261023 bool DebugLogging) {
10271024 assert(Level != O0 && "Must request optimizations for the default pipeline!");
10281025 // FIXME: We should use a customized pre-link pipeline!
1029 return buildPerModuleDefaultPipeline(Level, DebugLogging);
1026 return buildPerModuleDefaultPipeline(Level, DebugLogging,
1027 /*LTOPreLink=*/true);
10301028 }
10311029
10321030 ModulePassManager
12061204 // to be run at link time if CFI is enabled. This pass does nothing if
12071205 // CFI is disabled.
12081206 MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
1207
1208 // Enable splitting late in the FullLTO post-link pipeline. This is done in
1209 // the same stage in the old pass manager (\ref addLateLTOOptimizationPasses).
1210 if (EnableHotColdSplit)
1211 MPM.addPass(HotColdSplittingPass());
12091212
12101213 // Add late LTO optimization passes.
12111214 // Delete basic blocks, which optimization passes may have killed.
528528 if (DefaultOrPreLinkPipeline && !PrepareForThinLTOUsingPGOSampleProfile)
529529 addPGOInstrPasses(MPM);
530530
531 // Split out cold code before inlining. See comment in the new PM
532 // (\ref buildModuleSimplificationPipeline).
533 if (EnableHotColdSplit && DefaultOrPreLinkPipeline)
534 MPM.add(createHotColdSplittingPass());
535
536531 // We add a module alias analysis pass here. In part due to bugs in the
537532 // analysis infrastructure this "works" in that the analysis stays alive
538533 // for the entire SCC pass run below.
729724 MPM.add(createConstantMergePass()); // Merge dup global constants
730725 }
731726
727 // See comment in the new PM for justification of scheduling splitting at
728 // this stage (\ref buildModuleOptimizationPipeline).
729 if (EnableHotColdSplit && !(PrepareForLTO || PrepareForThinLTO))
730 MPM.add(createHotColdSplittingPass());
731
732732 if (MergeFunctions)
733733 MPM.add(createMergeFunctionsPass());
734734
917917
918918 void PassManagerBuilder::addLateLTOOptimizationPasses(
919919 legacy::PassManagerBase &PM) {
920 // See comment in the new PM for justification of scheduling splitting at
921 // this stage (\ref buildLTODefaultPipeline).
922 if (EnableHotColdSplit)
923 PM.add(createHotColdSplittingPass());
924
920925 // Delete basic blocks, which optimization passes may have killed.
921926 PM.add(createCFGSimplificationPass());
922927
0 ; RUN: opt -module-summary %s -o %t.bc
1 ; RUN: llvm-lto -hot-cold-split=true -thinlto-action=run %t.bc -debug-pass=Structure 2>&1 | FileCheck %s -check-prefix=OLDPM-THINLTO-POSTLINK-Os
1 ; RUN: llvm-lto -hot-cold-split=true -thinlto-action=run %t.bc -debug-pass=Structure 2>&1 | FileCheck %s -check-prefix=OLDPM-ANYLTO-POSTLINK-Os
2 ; RUN: llvm-lto -hot-cold-split=true %t.bc -debug-pass=Structure 2>&1 | FileCheck %s -check-prefix=OLDPM-ANYLTO-POSTLINK-Os
23
34 ; REQUIRES: asserts
45
56 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
67 target triple = "x86_64-unknown-linux-gnu"
78
8 ; OLDPM-THINLTO-POSTLINK-Os-LABEL: Pass Arguments
9 ; OLDPM-THINLTO-POSTLINK-Os-NOT: Hot Cold Splitting
9 ; OLDPM-ANYLTO-POSTLINK-Os: Hot Cold Splitting
1212 ; GEN: Running pass: PGOInstrumentationGen
1313 ; USE: Running pass: PGOInstrumentationUse
1414 ; USE: Running pass: PGOIndirectCallPromotion
15 ; SPLIT: Running pass: HotColdSplittingPass
1615 ; USE: Running pass: PGOMemOPSizeOpt
1716 ; SAMPLE_USE_O: Running pass: ModuleToFunctionPassAdaptor<{{.*}}AddDiscriminatorsPass{{.*}}>
1817 ; SAMPLE_USE_PRE_LINK: Running pass: ModuleToFunctionPassAdaptor<{{.*}}AddDiscriminatorsPass{{.*}}>
2625 ; SAMPLE_USE_POST_LINK-NOT: Running pass: GlobalOptPass
2726 ; SAMPLE_USE_POST_LINK: Running pass: PGOIndirectCallPromotion
2827 ; SAMPLE_GEN: Running pass: ModuleToFunctionPassAdaptor<{{.*}}AddDiscriminatorsPass{{.*}}>
28 ; SPLIT: Running pass: HotColdSplittingPass
2929
3030 define void @foo() {
3131 ret void
0 ; RUN: opt -mtriple=x86_64-- -Os -hot-cold-split=true -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=DEFAULT-Os
11 ; RUN: opt -mtriple=x86_64-- -Os -hot-cold-split=true -passes='lto-pre-link' -debug-pass-manager < %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=LTO-PRELINK-Os
22 ; RUN: opt -mtriple=x86_64-- -Os -hot-cold-split=true -passes='thinlto-pre-link' -debug-pass-manager < %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=THINLTO-PRELINK-Os
3 ; RUN: opt -mtriple=x86_64-- -Os -hot-cold-split=true -passes='lto' -debug-pass-manager < %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=LTO-POSTLINK-Os
34 ; RUN: opt -mtriple=x86_64-- -Os -hot-cold-split=true -passes='thinlto' -debug-pass-manager < %s -o /dev/null 2>&1 | FileCheck %s -check-prefix=THINLTO-POSTLINK-Os
45
56 ; REQUIRES: asserts
67
7 ; Splitting should occur after Mem2Reg and should be followed by InstCombine.
8 ; Splitting should occur late.
89
9 ; DEFAULT-Os: Promote Memory to Register
1010 ; DEFAULT-Os: Hot Cold Splitting
11 ; DEFAULT-Os: Combine redundant instructions
11 ; DEFAULT-Os: Simplify the CFG
1212
13 ; LTO-PRELINK-Os-LABEL: Starting llvm::Module pass manager run.
14 ; LTO-PRELINK-Os: Running pass: {{.*}}PromotePass
15 ; LTO-PRELINK-Os: Running pass: HotColdSplittingPass
13 ; LTO-PRELINK-Os-NOT: pass: HotColdSplittingPass
1614
17 ; THINLTO-PRELINK-Os-LABEL: Running analysis: PassInstrumentationAnalysis
18 ; THINLTO-PRELINK-Os: Running pass: {{.*}}PromotePass
19 ; THINLTO-PRELINK-Os: Running pass: HotColdSplittingPass
15 ; THINLTO-PRELINK-Os-NOT: Running pass: HotColdSplittingPass
2016
21 ; THINLTO-POSTLINK-Os-NOT: HotColdSplitting
17 ; LTO-POSTLINK-Os: HotColdSplitting
18 ; THINLTO-POSTLINK-Os: HotColdSplitting
4040 ; PGOUSE: Function Integration/Inlining
4141 ; PGOUSE: PGOInstrumentationUsePass
4242 ; PGOUSE: PGOIndirectCallPromotion
43 ; SPLIT: Hot Cold Splitting
4443 ; PGOUSE: CallGraph Construction
4544 ; CHECK-O2-NEXT: Globals Alias Analysis
4645 ; CHECK-O2-NEXT: Call Graph SCC Pass Manager
9998 ; the runtime unrolling though.
10099 ; CHECK-O2: Loop Pass Manager
101100 ; CHECK-O2-NEXT: Loop Invariant Code Motion
101 ; SPLIT: Hot Cold Splitting
102102 ; CHECK-O2: FunctionPass Manager
103103 ; CHECK-O2: Loop Pass Manager
104104 ; CHECK-O2-NEXT: Loop Sink