llvm.org GIT mirror llvm / 40f64cb
- Stop simplifycfg from duplicating "ret" instructions into unconditional branches. PR8575, rdar://5134905, rdar://8911460. - Allow codegen tail duplication to dup small return blocks after register allocation is done. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@124462 91177308-0d34-0410-b5e6-96231b3b80d8 Evan Cheng 8 years ago
11 changed file(s) with 78 addition(s) and 82 deletion(s). Raw diff Collapse all Expand all
464464 MaxDuplicateCount = TailDuplicateSize;
465465
466466 if (PreRegAlloc) {
467 // Pre-regalloc tail duplication hurts compile time and doesn't help
468 // much except for indirect branches.
469 if (TailBB->empty() || !TailBB->back().getDesc().isIndirectBranch())
467 if (TailBB->empty())
468 return false;
469 const TargetInstrDesc &TID = TailBB->back().getDesc();
470 // Pre-regalloc tail duplication hurts compile time and doesn't help
471 // much except for indirect branches and returns.
472 if (!TID.isIndirectBranch() && !TID.isReturn())
470473 return false;
471474 // If the target has hardware branch prediction that can handle indirect
472475 // branches, duplicating them can often make them predictable when there
501504 }
502505 // Heuristically, don't tail-duplicate calls if it would expand code size,
503506 // as it's less likely to be worth the extra cost.
504 if (InstrCount > 1 && HasCall)
507 if (InstrCount > 1 && (PreRegAlloc && HasCall))
505508 return false;
506509
507510 DEBUG(dbgs() << "\n*** Tail-duplicating BB#" << TailBB->getNumber() << '\n');
2727 #include "llvm/ADT/Statistic.h"
2828 #include "llvm/ADT/STLExtras.h"
2929 #include "llvm/Support/CFG.h"
30 #include "llvm/Support/CommandLine.h"
3031 #include "llvm/Support/ConstantRange.h"
3132 #include "llvm/Support/Debug.h"
3233 #include "llvm/Support/raw_ostream.h"
3435 #include <set>
3536 #include <map>
3637 using namespace llvm;
38
39 static cl::opt<bool>
40 DupRet("simplifycfg-dup-ret", cl::Hidden, cl::init(false),
41 cl::desc("Duplicate return instructions into unconditional branches"));
3742
3843 STATISTIC(NumSpeculations, "Number of speculative executed instructions");
3944
20262031 }
20272032
20282033 // If we found some, do the transformation!
2029 if (!UncondBranchPreds.empty()) {
2034 if (!UncondBranchPreds.empty() && DupRet) {
20302035 while (!UncondBranchPreds.empty()) {
20312036 BasicBlock *Pred = UncondBranchPreds.pop_back_val();
20322037 DEBUG(dbgs() << "FOLDING: " << *BB
+0
-50
test/CodeGen/X86/critical-edge-split.ll less more
None ; RUN: llc < %s -mtriple=i386-apple-darwin -o /dev/null -stats -info-output-file - | grep asm-printer | grep 29
1
2 %CC = type { %Register }
3 %II = type { %"struct.XX::II::$_74" }
4 %JITFunction = type %YYValue* (%CC*, %YYValue**)
5 %YYValue = type { i32 (...)** }
6 %Register = type { %"struct.XX::ByteCodeFeatures" }
7 %"struct.XX::ByteCodeFeatures" = type { i32 }
8 %"struct.XX::II::$_74" = type { i8* }
9 @llvm.used = appending global [1 x i8*] [ i8* bitcast (%JITFunction* @loop to i8*) ], section "llvm.metadata" ; <[1 x i8*]*> [#uses=0]
10
11 define %YYValue* @loop(%CC*, %YYValue**) nounwind {
12 ; <label>:2
13 %3 = getelementptr %CC* %0, i32 -9 ; <%CC*> [#uses=1]
14 %4 = bitcast %CC* %3 to %YYValue** ; <%YYValue**> [#uses=2]
15 %5 = load %YYValue** %4 ; <%YYValue*> [#uses=3]
16 %unique_1.i = ptrtoint %YYValue* %5 to i1 ; <i1> [#uses=1]
17 br i1 %unique_1.i, label %loop, label %11
18
19 loop: ; preds = %6, %2
20 %.1 = phi %YYValue* [ inttoptr (i32 1 to %YYValue*), %2 ], [ %intAddValue, %6 ] ; <%YYValue*> [#uses=3]
21 %immediateCmp = icmp slt %YYValue* %.1, %5 ; <i1> [#uses=1]
22 br i1 %immediateCmp, label %6, label %8
23
24 ; <label>:6 ; preds = %loop
25 %lhsInt = ptrtoint %YYValue* %.1 to i32 ; <i32> [#uses=1]
26 %7 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %lhsInt, i32 2) ; <{ i32, i1 }> [#uses=2]
27 %intAdd = extractvalue { i32, i1 } %7, 0 ; <i32> [#uses=1]
28 %intAddValue = inttoptr i32 %intAdd to %YYValue* ; <%YYValue*> [#uses=1]
29 %intAddOverflow = extractvalue { i32, i1 } %7, 1 ; <i1> [#uses=1]
30 br i1 %intAddOverflow, label %.loopexit, label %loop
31
32 ; <label>:8 ; preds = %loop
33 ret %YYValue* inttoptr (i32 10 to %YYValue*)
34
35 .loopexit: ; preds = %6
36 %9 = bitcast %CC* %0 to %YYValue** ; <%YYValue**> [#uses=1]
37 store %YYValue* %.1, %YYValue** %9
38 store %YYValue* %5, %YYValue** %4
39 %10 = call fastcc %YYValue* @foobar(%II* inttoptr (i32 3431104 to %II*), %CC* %0, %YYValue** %1) ; <%YYValue*> [#uses=1]
40 ret %YYValue* %10
41
42 ; <label>:11 ; preds = %2
43 %12 = call fastcc %YYValue* @foobar(%II* inttoptr (i32 3431080 to %II*), %CC* %0, %YYValue** %1) ; <%YYValue*> [#uses=1]
44 ret %YYValue* %12
45 }
46
47 declare fastcc %YYValue* @foobar(%II*, %CC*, %YYValue**) nounwind
48
49 declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind
6969
7070 ; Same as slightly_more_involved, but block_a is now a CFG diamond with
7171 ; fallthrough edges which should be preserved.
72 ; "callq block_a_merge_func" is tail duped.
7273
7374 ; CHECK: yet_more_involved:
7475 ; CHECK: jmp .LBB2_1
7778 ; CHECK-NEXT: callq bar99
7879 ; CHECK-NEXT: callq get
7980 ; CHECK-NEXT: cmpl $2999, %eax
80 ; CHECK-NEXT: jg .LBB2_6
81 ; CHECK-NEXT: jle .LBB2_5
82 ; CHECK-NEXT: callq block_a_false_func
83 ; CHECK-NEXT: callq block_a_merge_func
84 ; CHECK-NEXT: jmp .LBB2_1
85 ; CHECK-NEXT: .LBB2_5:
8186 ; CHECK-NEXT: callq block_a_true_func
82 ; CHECK-NEXT: jmp .LBB2_7
83 ; CHECK-NEXT: .LBB2_6:
84 ; CHECK-NEXT: callq block_a_false_func
85 ; CHECK-NEXT: .LBB2_7:
8687 ; CHECK-NEXT: callq block_a_merge_func
8788 ; CHECK-NEXT: .LBB2_1:
8889 ; CHECK-NEXT: callq body
None ; RUN: opt < %s -jump-threading -mem2reg -instcombine -simplifycfg -S | grep {ret i32 %v1}
1 ; There should be no uncond branches left.
2 ; RUN: opt < %s -jump-threading -mem2reg -instcombine -simplifycfg -S | not grep {br label}
0 ; RUN: opt < %s -jump-threading -mem2reg -instcombine -simplifycfg -S | FileCheck %s
31
42 declare i32 @f1()
53 declare i32 @f2()
64 declare void @f3()
75
86 define i32 @test(i1 %cond, i1 %cond2, i1 %cond3) {
7 ; CHECK: test
98 br i1 %cond, label %T1, label %F1
109
10 ; CHECK-NOT: T1:
1111 T1:
1212 %v1 = call i32 @f1()
1313 br label %Merge
1717 br label %Merge
1818
1919 Merge:
20 ; CHECK: Merge:
21 ; CHECK: %v1 = call i32 @f1()
22 ; CHECK-NEXT: %D = and i1 %cond2, %cond3
23 ; CHECK-NEXT: br i1 %D
2024 %A = phi i1 [true, %T1], [false, %F1]
2125 %B = phi i32 [%v1, %T1], [%v2, %F1]
2226 %C = and i1 %A, %cond2
None ; RUN: opt < %s -jump-threading -mem2reg -instcombine -simplifycfg -S | grep {ret i32 %v1}
1 ; There should be no uncond branches left.
2 ; RUN: opt < %s -jump-threading -mem2reg -instcombine -simplifycfg -S | not grep {br label}
0 ; RUN: opt < %s -jump-threading -mem2reg -instcombine -simplifycfg -S | FileCheck %s
31
42 declare i32 @f1()
53 declare i32 @f2()
64 declare void @f3()
75
86 define i32 @test(i1 %cond, i1 %cond2) {
7 ; CHECK: test
98 br i1 %cond, label %T1, label %F1
109
10 ; CHECK-NOT: T1
1111 T1:
1212 %v1 = call i32 @f1()
1313 br label %Merge
1717 br label %Merge
1818
1919 Merge:
20 ; CHECK: Merge:
21 ; CHECK: %v1 = call i32 @f1()
22 ; CHECK-NEXT: br i1 %cond2
2023 %A = phi i1 [true, %T1], [false, %F1]
2124 %B = phi i32 [%v1, %T1], [%v2, %F1]
2225 %C = and i1 %A, %cond2
None ; RUN: opt < %s -jump-threading -simplifycfg -S | grep {ret i32 1}
0 ; RUN: opt < %s -jump-threading -S | FileCheck %s
11 ; rdar://6402033
22
33 ; Test that we can thread through the block with the partially redundant load (%2).
55 target triple = "i386-apple-darwin7"
66
77 define i32 @foo(i32* %P) nounwind {
8 ; CHECK: foo
89 entry:
910 %0 = tail call i32 (...)* @f1() nounwind ; <i32> [#uses=1]
1011 %1 = icmp eq i32 %0, 0 ; <i1> [#uses=1]
1112 br i1 %1, label %bb1, label %bb
1213
1314 bb: ; preds = %entry
15 ; CHECK: bb1.thread:
16 ; CHECK: store
17 ; CHECK: br label %bb3
1418 store i32 42, i32* %P, align 4
1519 br label %bb1
1620
2529 ret i32 %res.0
2630
2731 bb3: ; preds = %bb1
32 ; CHECK: bb3:
33 ; CHECK: %res.01 = phi i32 [ 1, %bb1.thread ], [ 0, %bb1 ]
34 ; CHECK: ret i32 %res.01
2835 ret i32 %res.0
2936 }
3037
77 ; CHECK: i64 2, label
88 ; CHECK: i64 3, label
99 ; CHECK: i64 4, label
10 ; CHECK-NOT: br
1110 ; CHECK: }
1211
1312 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
2424 }
2525
2626
27 define void @test4() {
28 br label %return
29 return:
30 ret void
31 ; CHECK: @test4
32 ; CHECK-NEXT: ret void
33 }
34 @test4g = global i8* blockaddress(@test4, %return)
35
36
3727 ; PR5795
3828 define void @test5(i32 %A) {
3929 switch i32 %A, label %return [
146146 ; CHECK: i32 16, label %UnifiedReturnBlock
147147 ; CHECK: i32 17, label %UnifiedReturnBlock
148148 ; CHECK: i32 18, label %UnifiedReturnBlock
149 ; CHECK: i32 19, label %switch.edge
149 ; CHECK: i32 19, label %UnifiedReturnBlock
150150 ; CHECK: ]
151151 }
152152
440440 ; CHECK-NOT: switch
441441 ; CHECK: ret void
442442 }
443
444 ; PR8675
445 ; rdar://5134905
446 define zeroext i1 @test16(i32 %x) nounwind {
447 entry:
448 ; CHECK: @test16
449 ; CHECK: switch i32 %x, label %lor.rhs [
450 ; CHECK: i32 1, label %lor.end
451 ; CHECK: i32 2, label %lor.end
452 ; CHECK: i32 3, label %lor.end
453 ; CHECK: ]
454 %cmp.i = icmp eq i32 %x, 1
455 br i1 %cmp.i, label %lor.end, label %lor.lhs.false
456
457 lor.lhs.false:
458 %cmp.i2 = icmp eq i32 %x, 2
459 br i1 %cmp.i2, label %lor.end, label %lor.rhs
460
461 lor.rhs:
462 %cmp.i1 = icmp eq i32 %x, 3
463 br label %lor.end
464
465 lor.end:
466 %0 = phi i1 [ true, %lor.lhs.false ], [ true, %entry ], [ %cmp.i1, %lor.rhs ]
467 ret i1 %0
468 }
None ; RUN: opt < %s -simplifycfg -S | not grep br
1
0 ; RUN: opt < %s -simplifycfg -S | FileCheck %s
21
32 %llvm.dbg.anchor.type = type { i32, i32 }
43 %llvm.dbg.compile_unit.type = type { i32, { }*, i32, i8*, i8*, i8*, i1, i1, i8* }
1211
1312 declare void @llvm.dbg.stoppoint(i32, i32, { }*) nounwind
1413
15 define i1 @_ZN4llvm11SetCondInst7classofEPKNS_11InstructionE({ i32, i32 }* %I) {
14 define i1 @t({ i32, i32 }* %I) {
15 ; CHECK: t
16 ; CHECK: switch i32 %tmp.2.i, label %shortcirc_next.4 [
17 ; CHECK: i32 14, label %UnifiedReturnBlock
18 ; CHECK: i32 15, label %UnifiedReturnBlock
19 ; CHECK: i32 16, label %UnifiedReturnBlock
20 ; CHECK: i32 17, label %UnifiedReturnBlock
21 ; CHECK: i32 18, label %UnifiedReturnBlock
22 ; CHECK: i32 19, label %UnifiedReturnBlock
23 ; CHECK: ]
1624 entry:
1725 %tmp.1.i = getelementptr { i32, i32 }* %I, i64 0, i32 1 ; <i32*> [#uses=1]
1826 %tmp.2.i = load i32* %tmp.1.i ; <i32> [#uses=6]