llvm.org GIT mirror llvm / 65ad27f
[SLP] Emit optimization remarks The approach I followed was to emit the remark after getTreeCost concludes that SLP is profitable. I initially tried emitting them after the vectorizeRootInstruction calls in vectorizeChainsInBlock but I vaguely remember missing a few cases for example in HorizontalReduction::tryToReduce. ORE is placed in BoUpSLP so that it's available from everywhere (notably HorizontalReduction::tryToReduce). We use the first instruction in the root bundle as the locator for the remark. In order to get a sense of how far the tree is spanning I've included the size of the tree in the remark. This is not perfect of course but it gives you at least a rough idea about the tree. Then you can follow up with -view-slp-tree to really see the actual tree. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@302811 91177308-0d34-0410-b5e6-96231b3b80d8 Adam Nemet 3 years ago
5 changed file(s) with 145 addition(s) and 13 deletion(s). Raw diff Collapse all Expand all
2323 #include "llvm/Analysis/AssumptionCache.h"
2424 #include "llvm/Analysis/DemandedBits.h"
2525 #include "llvm/Analysis/LoopInfo.h"
26 #include "llvm/Analysis/OptimizationDiagnosticInfo.h"
2627 #include "llvm/Analysis/ScalarEvolution.h"
2728 #include "llvm/Analysis/TargetTransformInfo.h"
2829 #include "llvm/IR/Function.h"
5859 // Glue for old PM.
5960 bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_,
6061 TargetLibraryInfo *TLI_, AliasAnalysis *AA_, LoopInfo *LI_,
61 DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_);
62 DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_,
63 OptimizationRemarkEmitter *ORE_);
6264
6365 private:
6466 /// \brief Collect store and getelementptr instructions and organize them
298298 BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
299299 TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
300300 DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
301 const DataLayout *DL)
301 const DataLayout *DL, OptimizationRemarkEmitter *ORE)
302302 : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func),
303303 SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB),
304 DL(DL), Builder(Se->getContext()) {
304 DL(DL), ORE(ORE), Builder(Se->getContext()) {
305305 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
306306 // Use the vector register size specified by the target unless overridden
307307 // by a command-line option.
360360 MinBWs.clear();
361361 }
362362
363 unsigned getTreeSize() const { return VectorizableTree.size(); }
364
363365 /// \brief Perform LICM and CSE on the newly generated gather sequences.
364366 void optimizeGatherSequence();
365367
397399 /// \returns True if the VectorizableTree is both tiny and not fully
398400 /// vectorizable. We do not vectorize such trees.
399401 bool isTreeTinyAndNotFullyVectorizable();
402
403 OptimizationRemarkEmitter *getORE() { return ORE; }
400404
401405 private:
402406 struct TreeEntry;
927931 AssumptionCache *AC;
928932 DemandedBits *DB;
929933 const DataLayout *DL;
934 OptimizationRemarkEmitter *ORE;
935
930936 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
931937 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
932938 /// Instruction builder to construct the vectorized tree.
37713777 auto *DT = &getAnalysis().getDomTree();
37723778 auto *AC = &getAnalysis().getAssumptionCache(F);
37733779 auto *DB = &getAnalysis().getDemandedBits();
3774
3775 return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
3780 auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
3781
3782 return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
37763783 }
37773784
37783785 void getAnalysisUsage(AnalysisUsage &AU) const override {
37843791 AU.addRequired();
37853792 AU.addRequired();
37863793 AU.addRequired();
3794 AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
37873795 AU.addPreserved();
37883796 AU.addPreserved();
37893797 AU.addPreserved();
38023810 auto *DT = &AM.getResult(F);
38033811 auto *AC = &AM.getResult(F);
38043812 auto *DB = &AM.getResult(F);
3805
3806 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB);
3813 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
3814
3815 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
38073816 if (!Changed)
38083817 return PreservedAnalyses::all();
38093818
38183827 TargetTransformInfo *TTI_,
38193828 TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
38203829 LoopInfo *LI_, DominatorTree *DT_,
3821 AssumptionCache *AC_, DemandedBits *DB_) {
3830 AssumptionCache *AC_, DemandedBits *DB_,
3831 OptimizationRemarkEmitter *ORE_) {
38223832 SE = SE_;
38233833 TTI = TTI_;
38243834 TLI = TLI_;
38463856
38473857 // Use the bottom up slp vectorizer to construct chains that start with
38483858 // store instructions.
3849 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL);
3859 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
38503860
38513861 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
38523862 // delete instructions.
39353945 DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
39363946 if (Cost < -SLPCostThreshold) {
39373947 DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
3948 using namespace ore;
3949 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
3950 cast(Chain[i]))
3951 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
3952 << " and with tree size "
3953 << NV("TreeSize", R.getTreeSize()));
3954
39383955 R.vectorizeTree();
39393956
39403957 // Move to the next bundle.
41484165
41494166 if (Cost < -SLPCostThreshold) {
41504167 DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
4168 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
4169 cast(Ops[0]))
4170 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
4171 << " and with tree size "
4172 << ore::NV("TreeSize", R.getTreeSize()));
4173
41514174 Value *VectorizedRoot = R.vectorizeTree();
41524175
41534176 // Reconstruct the build vector by extracting the vectorized root. This
44914514
44924515 DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost
44934516 << ". (HorRdx)\n");
4517 auto *I0 = cast(VL[0]);
4518 V.getORE()->emit(
4519 OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0)
4520 << "Vectorized horizontal reduction with cost "
4521 << ore::NV("Cost", Cost) << " and with tree size "
4522 << ore::NV("TreeSize", V.getTreeSize()));
44944523
44954524 // Vectorize a tree.
44964525 DebugLoc Loc = cast(ReducedVals[i])->getDebugLoc();
51455174 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
51465175 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
51475176 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
5177 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
51485178 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
51495179
51505180 namespace llvm {
None ; RUN: opt -S -slp-vectorizer -slp-threshold=-18 -dce -instcombine < %s | FileCheck %s
0 ; RUN: opt -S -slp-vectorizer -slp-threshold=-18 -dce -instcombine -pass-remarks-output=%t < %s | FileCheck %s
1 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
12
23 target datalayout = "e-m:e-i32:64-i128:128-n32:64-S128"
34 target triple = "aarch64--linux-gnu"
2223 ; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <4 x i32>
2324 ; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <4 x i32> [[A]]
2425 ; CHECK: sext i32 [[X]] to i64
25 ;
26
27 ; YAML: Pass: slp-vectorizer
28 ; YAML-NEXT: Name: VectorizedList
29 ; YAML-NEXT: Function: getelementptr_4x32
30 ; YAML-NEXT: Args:
31 ; YAML-NEXT: - String: 'SLP vectorized with cost '
32 ; YAML-NEXT: - Cost: '11'
33 ; YAML-NEXT: - String: ' and with tree size '
34 ; YAML-NEXT: - TreeSize: '5'
35
36 ; YAML: Pass: slp-vectorizer
37 ; YAML-NEXT: Name: VectorizedList
38 ; YAML-NEXT: Function: getelementptr_4x32
39 ; YAML-NEXT: Args:
40 ; YAML-NEXT: - String: 'SLP vectorized with cost '
41 ; YAML-NEXT: - Cost: '16'
42 ; YAML-NEXT: - String: ' and with tree size '
43 ; YAML-NEXT: - TreeSize: '3'
44
2645 define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
2746 entry:
2847 %cmp31 = icmp sgt i32 %n, 0
6887 ; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <2 x i32>
6988 ; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[A]]
7089 ; CHECK: sext i32 [[X]] to i64
71 ;
90
91 ; YAML: Pass: slp-vectorizer
92 ; YAML-NEXT: Name: VectorizedList
93 ; YAML-NEXT: Function: getelementptr_2x32
94 ; YAML-NEXT: Args:
95 ; YAML-NEXT: - String: 'SLP vectorized with cost '
96 ; YAML-NEXT: - Cost: '11'
97 ; YAML-NEXT: - String: ' and with tree size '
98 ; YAML-NEXT: - TreeSize: '5'
99
100 ; YAML: Pass: slp-vectorizer
101 ; YAML-NEXT: Name: VectorizedList
102 ; YAML-NEXT: Function: getelementptr_2x32
103 ; YAML-NEXT: Args:
104 ; YAML-NEXT: - String: 'SLP vectorized with cost '
105 ; YAML-NEXT: - Cost: '6'
106 ; YAML-NEXT: - String: ' and with tree size '
107 ; YAML-NEXT: - TreeSize: '3'
108
72109 define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
73110 entry:
74111 %cmp31 = icmp sgt i32 %n, 0
None ; RUN: opt -slp-vectorizer -slp-threshold=-6 -S < %s | FileCheck %s
0 ; RUN: opt -slp-vectorizer -slp-threshold=-6 -S -pass-remarks-output=%t < %s | FileCheck %s
1 ; RUN: cat %t | FileCheck -check-prefix=YAML %s
12
23 ; FIXME: The threshold is changed to keep this test case a bit smaller.
34 ; The AArch64 cost model should not give such high costs to select statements.
910 ; CHECK: load <4 x i32>
1011 ; CHECK: load <4 x i32>
1112 ; CHECK: select <4 x i1>
13
14 ; YAML: Pass: slp-vectorizer
15 ; YAML-NEXT: Name: VectorizedHorizontalReduction
16 ; YAML-NEXT: Function: test_select
17 ; YAML-NEXT: Args:
18 ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
19 ; YAML-NEXT: - Cost: '4'
20 ; YAML-NEXT: - String: ' and with tree size '
21 ; YAML-NEXT: - TreeSize: '8'
22
1223 define i32 @test_select(i32* noalias nocapture readonly %blk1, i32* noalias nocapture readonly %blk2, i32 %lx, i32 %h) {
1324 entry:
1425 %cmp.22 = icmp sgt i32 %h, 0
92103 ; CHECK: load <4 x i32>
93104 ; CHECK: load <4 x i32>
94105 ; CHECK: mul nsw <4 x i32>
106
107 ; YAML: Pass: slp-vectorizer
108 ; YAML-NEXT: Name: VectorizedHorizontalReduction
109 ; YAML-NEXT: Function: reduction_with_br
110 ; YAML-NEXT: Args:
111 ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
112 ; YAML-NEXT: - Cost: '1'
113 ; YAML-NEXT: - String: ' and with tree size '
114 ; YAML-NEXT: - TreeSize: '3'
115
95116 entry:
96117 %cmp.16 = icmp sgt i32 %h, 0
97118 br i1 %cmp.16, label %for.body.lr.ph, label %for.end
149170 ; CHECK: load <8 x i8>
150171 ; CHECK: load <8 x i8>
151172 ; CHECK: select <8 x i1>
173
174 ; YAML: Pass: slp-vectorizer
175 ; YAML-NEXT: Name: VectorizedHorizontalReduction
176 ; YAML-NEXT: Function: test_unrolled_select
177 ; YAML-NEXT: Args:
178 ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost '
179 ; YAML-NEXT: - Cost: '-33'
180 ; YAML-NEXT: - String: ' and with tree size '
181 ; YAML-NEXT: - TreeSize: '10'
182
152183 define i32 @test_unrolled_select(i8* noalias nocapture readonly %blk1, i8* noalias nocapture readonly %blk2, i32 %lx, i32 %h, i32 %lim) #0 {
153184 entry:
154185 %cmp.43 = icmp sgt i32 %h, 0
0 ; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic -pass-remarks=slp-vectorizer -o /dev/null < %s 2>&1 | FileCheck %s
1
2 define void @f(double* %r, double* %w) {
3 %r0 = getelementptr inbounds double, double* %r, i64 0
4 %r1 = getelementptr inbounds double, double* %r, i64 1
5 %f0 = load double, double* %r0
6 %f1 = load double, double* %r1
7 %add0 = fadd double %f0, %f0
8 %add1 = fadd double %f1, %f1
9 %w0 = getelementptr inbounds double, double* %w, i64 0
10 %w1 = getelementptr inbounds double, double* %w, i64 1
11 ; CHECK: remark: /tmp/s.c:5:10: Stores SLP vectorized with cost -4 and with tree size 3
12 store double %add0, double* %w0, !dbg !9
13 store double %add1, double* %w1
14 ret void
15 }
16
17
18 !llvm.dbg.cu = !{!0}
19 !llvm.module.flags = !{!3, !4, !5}
20 !llvm.ident = !{!6}
21
22 !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0 (trunk 281293) (llvm/trunk 281290)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
23 !1 = !DIFile(filename: "/tmp/s.c", directory: "/tmp")
24 !2 = !{}
25 !3 = !{i32 2, !"Dwarf Version", i32 4}
26 !4 = !{i32 2, !"Debug Info Version", i32 3}
27 !5 = !{i32 1, !"PIC Level", i32 2}
28 !6 = !{!"clang version 4.0.0 (trunk 281293) (llvm/trunk 281290)"}
29 !7 = distinct !DISubprogram(name: "baz", scope: !1, file: !1, line: 4, type: !8, isLocal: false, isDefinition: true, scopeLine: 4, isOptimized: true, unit: !0, variables: !2)
30 !8 = !DISubroutineType(types: !2)
31 !9 = !DILocation(line: 5, column: 10, scope: !7)