llvm.org GIT mirror llvm / e620248
CodeGen: BlockPlacement: Increase tail duplication size for O3.

At O3 we are more willing to increase size if we believe it will improve
performance. The current threshold for tail duplication of 2 instructions is
conservative, and can be relaxed at O3.

Benchmark results:

llvm test-suite:
  6% improvement in aha, due to duplication of the loop latch
  3% improvement in hexxagon
  2% slowdown in lpbench. Seems related, but couldn't completely diagnose.

Internal Google benchmark:
  4% improvement on internal Google protocol buffer serialization benchmarks.

Differential Revision: https://reviews.llvm.org/D32324

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@303084 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Kyle Butt
3 changed files with 131 additions and 12 deletions.
     cl::desc("Instruction cutoff for tail duplication during layout. "
              "Tail merging during layout is forced to have a threshold "
              "that won't conflict."), cl::init(2),
+    cl::Hidden);
+
+// Heuristic for aggressive tail duplication.
+static cl::opt<unsigned> TailDupPlacementAggressiveThreshold(
+    "tail-dup-placement-aggressive-threshold",
+    cl::desc("Instruction cutoff for aggressive tail duplication during "
+             "layout. Used at -O3. Tail merging during layout is forced to "
+             "have a threshold that won't conflict."), cl::init(3),
     cl::Hidden);
 
 // Heuristic for tail duplication.
@@ ... @@
   assert(BlockToChain.empty());
   assert(ComputedEdges.empty());
 
+  unsigned TailDupSize = TailDupPlacementThreshold;
+  // If only the aggressive threshold is explicitly set, use it.
+  if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
+      TailDupPlacementThreshold.getNumOccurrences() == 0)
+    TailDupSize = TailDupPlacementAggressiveThreshold;
+
+  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
+  // For aggressive optimization, we can adjust some thresholds to be less
+  // conservative.
+  if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
+    // At O3 we should be more willing to copy blocks for tail duplication.
+    // This increases size pressure, so we only do it at O3.
+    // Do this unless only the regular threshold is explicitly set.
+    if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
+        TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
+      TailDupSize = TailDupPlacementAggressiveThreshold;
+  }
+
   if (TailDupPlacement) {
     MPDT = &getAnalysis<MachinePostDominatorTree>();
-    unsigned TailDupSize = TailDupPlacementThreshold;
     if (MF.getFunction()->optForSize())
       TailDupSize = 1;
     TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize);
@@ ... @@
   buildCFGChains();
 
   // Changing the layout can create new tail merging opportunities.
-  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
   // TailMerge can create jump into if branches that make CFG irreducible for
   // HW that requires structured CFG.
   bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
                          PassConfig->getEnableTailMerge() &&
                          BranchFoldPlacement;
   // No tail merging opportunities if the block number is less than four.
   if (MF.size() > 3 && EnableTailMerge) {
-    unsigned TailMergeSize = TailDupPlacementThreshold + 1;
+    unsigned TailMergeSize = TailDupSize + 1;
     BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
                     *MBPI, TailMergeSize);
 
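Distilled, the selection above works like this: the regular threshold (default 2) is used, except that the aggressive threshold (default 3) wins when only the aggressive flag was set explicitly, or when compiling at -O3 unless the regular flag was the only one set explicitly. The sketch below restates that rule as a self-contained program; the helper name pickTailDupSize and its boolean parameters are hypothetical, and the real pass reads the cl::opt occurrence counts and the TargetPassConfig optimization level directly, so a user-supplied flag value (not these hard-coded defaults) is what actually gets used.

// A minimal sketch of the threshold-selection rule in the patch above.
// The function name and bool parameters are hypothetical illustration only.
#include <cstdio>

static unsigned pickTailDupSize(bool AtO3,                 // opt level >= CodeGenOpt::Aggressive
                                bool RegularIsExplicit,    // -tail-dup-placement-threshold given
                                bool AggressiveIsExplicit, // -tail-dup-placement-aggressive-threshold given
                                unsigned Regular = 2,      // cl::init(2) above
                                unsigned Aggressive = 3) { // cl::init(3) above
  unsigned TailDupSize = Regular;
  // An explicit aggressive threshold wins at any opt level, as long as the
  // regular threshold was left at its default.
  if (AggressiveIsExplicit && !RegularIsExplicit)
    TailDupSize = Aggressive;
  // At -O3, prefer the aggressive threshold unless the user explicitly set
  // only the regular one.
  if (AtO3 && (!RegularIsExplicit || AggressiveIsExplicit))
    TailDupSize = Aggressive;
  return TailDupSize;
}

int main() {
  std::printf("-O2 defaults: %u\n", pickTailDupSize(false, false, false)); // 2
  std::printf("-O3 defaults: %u\n", pickTailDupSize(true, false, false));  // 3
  std::printf("-O3, only the regular flag set: %u\n",
              pickTailDupSize(true, true, false));                         // regular value
  return 0;
}

As the surrounding diff shows, functions optimized for size still clamp the duplication size to 1, and the layout-time tail-merge threshold stays one instruction above whichever duplication threshold was picked, so merging cannot conflict with the duplication that was just done.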
-; RUN: llc -O2 < %s | FileCheck %s
+; RUN: llc -O2 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-O2 %s
+; RUN: llc -O3 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-O3 %s
 target datalayout = "e-m:e-i64:64-n32:64"
 target triple = "powerpc64le-grtev4-linux-gnu"
 
@@ ... @@
 ; test1
 ; test2
 ; test3
-; test4
 ; optional1
 ; optional2
 ; optional3
-; optional4
 ; exit
 ; even for 50/50 branches.
 ; Tail duplication puts test n+1 at the end of optional n
@@ ... @@
 test3:
   %tagbit3 = and i32 %tag, 4
   %tagbit3eq0 = icmp eq i32 %tagbit3, 0
+  br i1 %tagbit3eq0, label %exit, label %optional3, !prof !1
+optional3:
+  call void @c()
+  br label %exit
+exit:
+  ret void
+}
+
+; Intended layout:
+; The chain-of-triangles based duplicating produces the layout when 3
+; instructions are allowed for tail-duplication.
+; test1
+; test2
+; test3
+; optional1
+; optional2
+; optional3
+; exit
+;
+; Otherwise it produces the layout:
+; test1
+; optional1
+; test2
+; optional2
+; test3
+; optional3
+; exit
+
+;CHECK-LABEL: straight_test_3_instr_test:
+; test1 may have been merged with entry
+;CHECK: mr [[TAGREG:[0-9]+]], 3
+;CHECK: clrlwi {{[0-9]+}}, [[TAGREG]], 30
+;CHECK-NEXT: cmplwi {{[0-9]+}}, 2
+
+;CHECK-O3-NEXT: bne 0, .[[OPT1LABEL:[_0-9A-Za-z]+]]
+;CHECK-O3-NEXT: # %test2
+;CHECK-O3-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29
+;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 8
+;CHECK-O3-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]]
+;CHECK-O3-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3
+;CHECK-O3-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27
+;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 32
+;CHECK-O3-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]]
+;CHECK-O3-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit
+;CHECK-O3: blr
+;CHECK-O3-NEXT: .[[OPT1LABEL]]:
+;CHECK-O3: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29
+;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 8
+;CHECK-O3-NEXT: beq 0, .[[TEST3LABEL]]
+;CHECK-O3-NEXT: .[[OPT2LABEL]]:
+;CHECK-O3: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27
+;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 32
+;CHECK-O3-NEXT: beq 0, .[[EXITLABEL]]
+;CHECK-O3-NEXT: .[[OPT3LABEL]]:
+;CHECK-O3: b .[[EXITLABEL]]
+
+;CHECK-O2-NEXT: beq 0, .[[TEST2LABEL:[_0-9A-Za-z]+]]
+;CHECK-O2-NEXT: # %optional1
+;CHECK-O2: .[[TEST2LABEL]]: # %test2
+;CHECK-O2-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29
+;CHECK-O2-NEXT: cmplwi {{[0-9]+}}, 8
+;CHECK-O2-NEXT: beq 0, .[[TEST3LABEL:[_0-9A-Za-z]+]]
+;CHECK-O2-NEXT: # %optional2
+;CHECK-O2: .[[TEST3LABEL]]: # %test3
+;CHECK-O2-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27
+;CHECK-O2-NEXT: cmplwi {{[0-9]+}}, 32
+;CHECK-O2-NEXT: beq 0, .[[EXITLABEL:[_0-9A-Za-z]+]]
+;CHECK-O2-NEXT: # %optional3
+;CHECK-O2: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit
+;CHECK-O2: blr
+
+
+define void @straight_test_3_instr_test(i32 %tag) {
+entry:
+  br label %test1
+test1:
+  %tagbit1 = and i32 %tag, 3
+  %tagbit1eq0 = icmp eq i32 %tagbit1, 2
+  br i1 %tagbit1eq0, label %test2, label %optional1, !prof !2
+optional1:
+  call void @a()
+  br label %test2
+test2:
+  %tagbit2 = and i32 %tag, 12
+  %tagbit2eq0 = icmp eq i32 %tagbit2, 8
+  br i1 %tagbit2eq0, label %test3, label %optional2, !prof !2
+optional2:
+  call void @b()
+  br label %test3
+test3:
+  %tagbit3 = and i32 %tag, 48
+  %tagbit3eq0 = icmp eq i32 %tagbit3, 32
   br i1 %tagbit3eq0, label %exit, label %optional3, !prof !1
 optional3:
   call void @c()
@@ ... @@
 ; X32-NEXT: jne .LBB1_8
 ; X32-NEXT: .LBB1_7:
 ; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X32-NEXT: jmp .LBB1_9
+; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
+; X32-NEXT: je .LBB1_10
+; X32-NEXT: jmp .LBB1_11
 ; X32-NEXT: .LBB1_1:
 ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
@@ ... @@
 ; X32-NEXT: je .LBB1_7
 ; X32-NEXT: .LBB1_8: # %entry
 ; X32-NEXT: xorps %xmm3, %xmm3
-; X32-NEXT: .LBB1_9: # %entry
 ; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp)
 ; X32-NEXT: jne .LBB1_11
-; X32-NEXT: # BB#10:
+; X32-NEXT: .LBB1_10:
 ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X32-NEXT: .LBB1_11: # %entry
 ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
@@ ... @@
 ; X64-NEXT: jne .LBB1_8
 ; X64-NEXT: .LBB1_7:
 ; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; X64-NEXT: jmp .LBB1_9
+; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X64-NEXT: testl %esi, %esi
+; X64-NEXT: je .LBB1_10
+; X64-NEXT: jmp .LBB1_11
 ; X64-NEXT: .LBB1_1:
 ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X64-NEXT: testl %edx, %edx
@@ ... @@
 ; X64-NEXT: je .LBB1_7
 ; X64-NEXT: .LBB1_8: # %entry
 ; X64-NEXT: xorps %xmm3, %xmm3
-; X64-NEXT: .LBB1_9: # %entry
 ; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; X64-NEXT: testl %esi, %esi
 ; X64-NEXT: jne .LBB1_11
-; X64-NEXT: # BB#10:
+; X64-NEXT: .LBB1_10:
 ; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NEXT: .LBB1_11: # %entry
 ; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]