llvm.org GIT mirror
Codegen: Make chains from trellis-shaped CFGs Lay out trellis-shaped CFGs optimally. A trellis of the shape below: A B |\ /| | \ / | | X | | / \ | |/ \| C D would be laid out A; B->C ; D by the current layout algorithm. Now we identify trellises and lay them out either A->C; B->D or A->D; B->C. This scales with an increasing number of predecessors. A trellis is a group of 2 or more predecessor blocks that all have the same successors. Because of this we can tail duplicate to extend existing trellises. As an example consider the following CFG: B D F H / \ / \ / \ / \ A---C---E---G---Ret Where A,C,E,G are all small (Currently 2 instructions). The CFG preserving layout is then A,B,C,D,E,F,G,H,Ret. The current code will copy C into B, E into D and G into F and yield the layout A,C,B(C),E,D(E),F(G),G,H,Ret define void @straight_test(i32 %tag) { entry: br label %test1 test1: ; A %tagbit1 = and i32 %tag, 1 %tagbit1eq0 = icmp eq i32 %tagbit1, 0 br i1 %tagbit1eq0, label %test2, label %optional1 optional1: ; B call void @a() br label %test2 test2: ; C %tagbit2 = and i32 %tag, 2 %tagbit2eq0 = icmp eq i32 %tagbit2, 0 br i1 %tagbit2eq0, label %test3, label %optional2 optional2: ; D call void @b() br label %test3 test3: ; E %tagbit3 = and i32 %tag, 4 %tagbit3eq0 = icmp eq i32 %tagbit3, 0 br i1 %tagbit3eq0, label %test4, label %optional3 optional3: ; F call void @c() br label %test4 test4: ; G %tagbit4 = and i32 %tag, 8 %tagbit4eq0 = icmp eq i32 %tagbit4, 0 br i1 %tagbit4eq0, label %exit, label %optional4 optional4: ; H call void @d() br label %exit exit: ret void } Here is the layout after D27742: straight_test: # @straight_test ; ... Prologue elided ; BB#0: # %entry ; A (merged with test1) ; ... More prologue elided mr 30, 3 andi. 3, 30, 1 bc 12, 1, .LBB0_2 ; BB#1: # %test2 ; C rlwinm. 3, 30, 0, 30, 30 beq 0, .LBB0_3 b .LBB0_4 .LBB0_2: # %optional1 ; B (copy of C) bl a nop rlwinm. 3, 30, 0, 30, 30 bne 0, .LBB0_4 .LBB0_3: # %test3 ; E rlwinm. 
3, 30, 0, 29, 29 beq 0, .LBB0_5 b .LBB0_6 .LBB0_4: # %optional2 ; D (copy of E) bl b nop rlwinm. 3, 30, 0, 29, 29 bne 0, .LBB0_6 .LBB0_5: # %test4 ; G rlwinm. 3, 30, 0, 28, 28 beq 0, .LBB0_8 b .LBB0_7 .LBB0_6: # %optional3 ; F (copy of G) bl c nop rlwinm. 3, 30, 0, 28, 28 beq 0, .LBB0_8 .LBB0_7: # %optional4 ; H bl d nop .LBB0_8: # %exit ; Ret ld 30, 96(1) # 8-byte Folded Reload addi 1, 1, 112 ld 0, 16(1) mtlr 0 blr The tail-duplication has produced some benefit, but it has also produced a trellis which is not laid out optimally. With this patch, we improve the layouts of such trellises, and decrease the cost calculation for tail-duplication accordingly. This patch produces the layout A,C,E,G,B,D,F,H,Ret. This layout does have back edges, which is a negative, but it has a bigger compensating positive, which is that it handles the case where there are long strings of skipped blocks much better than the original layout. Both layouts handle runs of executed blocks equally well. Branch prediction also improves if there is any correlation between subsequent optional blocks. Here is the resulting concrete layout: straight_test: # @straight_test ; BB#0: # %entry ; A (merged with test1) mr 30, 3 andi. 3, 30, 1 bc 12, 1, .LBB0_4 ; BB#1: # %test2 ; C rlwinm. 3, 30, 0, 30, 30 bne 0, .LBB0_5 .LBB0_2: # %test3 ; E rlwinm. 3, 30, 0, 29, 29 bne 0, .LBB0_6 .LBB0_3: # %test4 ; G rlwinm. 3, 30, 0, 28, 28 bne 0, .LBB0_7 b .LBB0_8 .LBB0_4: # %optional1 ; B (Copy of C) bl a nop rlwinm. 3, 30, 0, 30, 30 beq 0, .LBB0_2 .LBB0_5: # %optional2 ; D (Copy of E) bl b nop rlwinm. 3, 30, 0, 29, 29 beq 0, .LBB0_3 .LBB0_6: # %optional3 ; F (Copy of G) bl c nop rlwinm. 3, 30, 0, 28, 28 beq 0, .LBB0_8 .LBB0_7: # %optional4 ; H bl d nop .LBB0_8: # %exit Differential Revision: https://reviews.llvm.org/D28522 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@295223 91177308-0d34-0410-b5e6-96231b3b80d8 Kyle Butt 3 years ago
24 changed file(s) with 746 addition(s) and 148 deletion(s).
 5 5 6 6 ; CHECK-NEXT: ; BB#1: ; %b3 7 7 ; CHECK: ldr [[LOAD:w[0-9]+]] 8 ; CHECK: cbz [[LOAD]], [[SKIP_LONG_B:LBB[0-9]+_[0-9]+]] 9 ; CHECK-NEXT: b [[B8:LBB[0-9]+_[0-9]+]]⏎ 8 ; CHECK: cbnz [[LOAD]], [[B8:LBB[0-9]+_[0-9]+]]⏎ 9 ; CHECK-NEXT: b [[B7:LBB[0-9]+_[0-9]+]] 10 10 11 ; CHECK-NEXT: [[SKIP_LONG_B]]: 12 ; CHECK-NEXT: b [[B7:LBB[0-9]+_[0-9]+]]⏎ 11 ; CHECK-NEXT: [[B8]]: ; %b8⏎ 12 ; CHECK-NEXT: ret 13 13 14 14 ; CHECK-NEXT: [[B2]]: ; %b2 15 15 ; CHECK: mov w{{[0-9]+}}, #93 16 16 ; CHECK: bl _extfunc 17 17 ; CHECK: cbz w{{[0-9]+}}, [[B7]] 18 19 ; CHECK-NEXT: [[B8]]: ; %b8 20 ; CHECK-NEXT: ret⏎ 18 ; CHECK-NEXT: b [[B8]]⏎ 21 19 22 20 ; CHECK-NEXT: [[B7]]: ; %b7 23 21 ; CHECK: mov w{{[0-9]+}}, #13 24 22 ; CHECK: b _extfunc 23 25 24 define void @split_block_no_fallthrough(i64 %val) #0 { 26 25 bb: 27 26 %c0 = icmp sgt i64 %val, -5
 263 263 define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 { 264 264 ; CHECK-LABEL: do_nothing_if_resultant_opcodes_would_differ 265 265 ; CHECK: cmn 266 ; CHECK: b.gt⏎ 266 ; CHECK: b.le⏎ 267 267 ; CHECK: cmp 268 268 ; CHECK: b.gt 269 269 entry:
 10 10 ; 11 11 ; CHECK-LABEL: func 12 12 ; CHECK-NOT: and 13 ; CHECK: tbnz⏎ 13 ; CHECK: tbz⏎ 14 14 define void @func() { 15 15 %c0 = icmp sgt i64 0, 0 16 16 br i1 %c0, label %b1, label %b6
 7 7 ; GCNNOOPT: v_writelane_b32 8 8 ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]] 9 9 10 11 ; GCN: ; BB#1 12 10 ; GCNNOOPT: v_readlane_b32 13 11 ; GCNNOOPT: v_readlane_b32 14 12 ; GCN: buffer_store_dword 15 ; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0) 16 ; TODO: This waitcnt can be eliminated⏎ 13 ; GCNNOOPT: s_endpgm⏎ 17 14 18 15 ; GCN: {{^}}[[END]]: 19 16 ; GCN: s_endpgm
 490 490 491 491 ; GCN-LABEL: {{^}}long_branch_hang: 492 492 ; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6 493 ; GCN-NEXT: s_cbranch_scc0 [[LONG_BR_0:BB[0-9]+_[0-9]+]]⏎ 493 ; GCN-NEXT: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}}⏎ 494 ; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]] 494 495 ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: 495 496 496 497 ; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
 1 1 ; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s 2 2 3 3 ; GCN-LABEL: {{^}}test_loop: 4 ; GCN: [[LABEL:BB[0-9+]_[0-9]+]]:⏎ 4 ; GCN: [[LABEL:BB[0-9+]_[0-9]+]]: ; %for.body{{\$}}⏎ 5 5 ; GCN: ds_read_b32 6 6 ; GCN: ds_write_b32 7 7 ; GCN: s_branch [[LABEL]]
 28 28 ; GCN: v_cmp_ne_u32_e64 29 29 30 30 ; GCN: BB{{[0-9]+_[0-9]+}}: 31 31 32 define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) { 32 33 bb: 33 34 %tmp = call i32 @llvm.amdgcn.workitem.id.x()
 438 438 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 439 439 ; GCN-NOHSA: buffer_store_dword [[ONE]] 440 440 ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]] 441 ; GCN; {{^}}[[EXIT]]:⏎ 441 ; GCN: {{^}}[[EXIT]]:⏎ 442 442 ; GCN: s_endpgm 443 443 define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) { 444 444 bb3: ; preds = %bb2
 11 11 ; CHECK: bl _quux 12 12 ; CHECK-NOT: bl _quux 13 13 14 ; NOMERGE: bl _baz 15 ; NOMERGE: bl _baz⏎ 14 ; NOMERGE-DAG: bl _baz⏎ 15 ; NOMERGE-DAG: bl _baz 16 16 17 ; NOMERGE: bl _quux 18 ; NOMERGE: bl _quux⏎ 17 ; NOMERGE-DAG: bl _quux⏎ 18 ; NOMERGE-DAG: bl _quux 19 19 20 20 ; ModuleID = 'tail.c' 21 21 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
 65 65 ; CHECK-ARMV7-NEXT: [[HEAD:.LBB[0-9_]+]]: 66 66 ; CHECK-ARMV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0] 67 67 ; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], #0 68 ; CHECK-ARMV7-NEXT: moveq [[RES:r[0-9]+]], #1⏎ 68 ; CHECK-ARMV7-NEXT: moveq r0, #1⏎ 69 69 ; CHECK-ARMV7-NEXT: bxeq lr 70 70 ; CHECK-ARMV7-NEXT: [[TRY]]: 71 ; CHECK-ARMV7-NEXT: ldrexb [[LD:r[0-9]+]], [r0] 72 ; CHECK-ARMV7-NEXT: cmp [[LD]], [[DESIRED]]⏎ 71 ; CHECK-ARMV7-NEXT: ldrexb [[SUCCESS]], [r0]⏎ 72 ; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], r1 73 73 ; CHECK-ARMV7-NEXT: beq [[HEAD]] 74 74 ; CHECK-ARMV7-NEXT: clrex 75 ; CHECK-ARMV7-NEXT: mov [[RES]], #0⏎ 75 ; CHECK-ARMV7-NEXT: mov r0, #0⏎ 76 76 ; CHECK-ARMV7-NEXT: bx lr 77 77 78 78 ; CHECK-THUMBV7-LABEL: test_cmpxchg_res_i8:
 134 134 135 135 ; Important to check for beginning of basic block, because if it gets 136 136 ; if-converted the test is probably no longer checking what it should. 137 ; CHECK: {{LBB[0-9]+_2}}:⏎ 137 ; CHECK: %end⏎ 138 138 ; CHECK-NEXT: vpop {d7, d8} 139 139 ; CHECK-NEXT: pop {r4, pc} 140 140
 15 15 ;CHECK-NEXT: bc 12, 1, [[BODY1LABEL:[._0-9A-Za-z]+]] 16 16 ;CHECK-NEXT: # %test2 17 17 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 18 ;CHECK-NEXT: beq 0, [[EXITLABEL:[._0-9A-Za-z]+]] 19 ;CHECK-NEXT: b [[BODY2LABEL:[._0-9A-Za-z]+]]⏎ 18 ;CHECK-NEXT: bne 0, [[BODY2LABEL:[._0-9A-Za-z]+]]⏎ 19 ;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit 20 ;CHECK: blr 20 21 ;CHECK-NEXT: [[BODY1LABEL]] 21 22 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 22 23 ;CHECK-NEXT: beq 0, [[EXITLABEL]] 23 ;CHECK-NEXT: [[BODY2LABEL]] 24 ;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit 25 ;CHECK: blr⏎ 24 ;CHECK-NEXT: [[BODY2LABEL:[._0-9A-Za-z]+]]:⏎ 25 ;CHECK: b [[EXITLABEL]] 26 26 define void @tail_dup_break_cfg(i32 %tag) { 27 27 entry: 28 28 br label %test1 78 78 test2: 79 79 %tagbit2 = and i32 %tag, 2 80 80 %tagbit2eq0 = icmp ne i32 %tagbit2, 0 81 br i1 %tagbit2eq0, label %body2, label %exit, !prof !1 ; %body2 more likely⏎ 81 br i1 %tagbit2eq0, label %body2, label %exit, !prof !3 ; %body2 more likely⏎ 82 82 body2: 83 83 call void @b() 84 84 call void @b() 136 136 137 137 !1 = !{!"branch_weights", i32 5, i32 3} 138 138 !2 = !{!"branch_weights", i32 95, i32 5} 139 !3 = !{!"branch_weights", i32 7, i32 3}⏎ 139 !3 = !{!"branch_weights", i32 8, i32 3}⏎
 None ; RUN: llc -outline-optional-branches -O2 < %s | FileCheck %s⏎ 0 ; RUN: llc -O2 < %s | FileCheck %s⏎ 1 1 target datalayout = "e-m:e-i64:64-n32:64" 2 2 target triple = "powerpc64le-grtev4-linux-gnu" 3 3 4 4 ; Intended layout: 5 ; The outlining flag produces the layout⏎ 5 ; The chain-based outlining produces the layout⏎ 6 6 ; test1 7 7 ; test2 8 8 ; test3 9 9 ; test4 10 ; exit 11 10 ; optional1 12 11 ; optional2 13 12 ; optional3 14 13 ; optional4 14 ; exit 15 15 ; Tail duplication puts test n+1 at the end of optional n 16 16 ; so optional1 includes a copy of test2 at the end, and branches 17 17 ; to test3 (at the top) or falls through to optional 2. 18 ; The CHECK statements check for the whole string of tests and exit block,⏎ 18 ; The CHECK statements check for the whole string of tests⏎ 19 19 ; and then check that the correct test has been duplicated into the end of 20 20 ; the optional blocks and that the optional blocks are in the correct order. 21 ;CHECK-LABEL: f:⏎ 21 ;CHECK-LABEL: straight_test:⏎ 22 22 ; test1 may have been merged with entry 23 23 ;CHECK: mr [[TAGREG:[0-9]+]], 3 24 24 ;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 25 ;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]] 26 ;CHECK-NEXT: [[TEST2LABEL:[._0-9A-Za-z]+]]: # %test2⏎ 25 ;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[_0-9A-Za-z]+]]⏎ 26 ;CHECK-NEXT: # %test2 27 27 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 28 ;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]] 29 ;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3⏎ 28 ;CHECK-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]]⏎ 29 ;CHECK-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3 30 30 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 31 ;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]] 32 ;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4⏎ 31 ;CHECK-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]]⏎ 32 ;CHECK-NEXT: .[[TEST4LABEL:[_0-9A-Za-z]+]]: # %test4 33 33 ;CHECK-NEXT: rlwinm. 
{{[0-9]+}}, [[TAGREG]], 0, 28, 28 34 ;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]] 35 ;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit⏎ 34 ;CHECK-NEXT: bne 0, .[[OPT4LABEL:[_0-9A-Za-z]+]]⏎ 35 ;CHECK-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit 36 36 ;CHECK: blr 37 ;CHECK-NEXT: [[OPT1LABEL]]⏎ 37 ;CHECK-NEXT: .[[OPT1LABEL]]:⏎ 38 38 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 39 ;CHECK-NEXT: beq 0, [[TEST3LABEL]] 40 ;CHECK-NEXT: [[OPT2LABEL]]⏎ 39 ;CHECK-NEXT: beq 0, .[[TEST3LABEL]]⏎ 40 ;CHECK-NEXT: .[[OPT2LABEL]]: 41 41 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 42 ;CHECK-NEXT: beq 0, [[TEST4LABEL]] 43 ;CHECK-NEXT: [[OPT3LABEL]]⏎ 42 ;CHECK-NEXT: beq 0, .[[TEST4LABEL]]⏎ 43 ;CHECK-NEXT: .[[OPT3LABEL]]: 44 44 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 45 ;CHECK-NEXT: beq 0, [[EXITLABEL]] 46 ;CHECK-NEXT: [[OPT4LABEL]] 47 ;CHECK: b [[EXITLABEL]] 48 49 define void @f(i32 %tag) {⏎ 45 ;CHECK-NEXT: beq 0, .[[EXITLABEL]]⏎ 46 ;CHECK-NEXT: .[[OPT4LABEL]]: 47 ;CHECK: b .[[EXITLABEL]] 48 49 define void @straight_test(i32 %tag) { 50 50 entry: 51 51 br label %test1 52 52 test1: 53 53 %tagbit1 = and i32 %tag, 1 54 54 %tagbit1eq0 = icmp eq i32 %tagbit1, 0 55 br i1 %tagbit1eq0, label %test2, label %optional1⏎ 55 br i1 %tagbit1eq0, label %test2, label %optional1, !prof !1⏎ 56 56 optional1: 57 57 call void @a() 58 58 call void @a() 62 62 test2: 63 63 %tagbit2 = and i32 %tag, 2 64 64 %tagbit2eq0 = icmp eq i32 %tagbit2, 0 65 br i1 %tagbit2eq0, label %test3, label %optional2⏎ 65 br i1 %tagbit2eq0, label %test3, label %optional2, !prof !1⏎ 66 66 optional2: 67 67 call void @b() 68 68 call void @b() 72 72 test3: 73 73 %tagbit3 = and i32 %tag, 4 74 74 %tagbit3eq0 = icmp eq i32 %tagbit3, 0 75 br i1 %tagbit3eq0, label %test4, label %optional3⏎ 75 br i1 %tagbit3eq0, label %test4, label %optional3, !prof !1⏎ 76 76 optional3: 77 77 call void @c() 78 78 call void @c() 82 82 test4: 83 83 %tagbit4 = and i32 %tag, 8 84 84 %tagbit4eq0 = icmp eq i32 %tagbit4, 0 85 br i1 
%tagbit4eq0, label %exit, label %optional4⏎ 85 br i1 %tagbit4eq0, label %exit, label %optional4, !prof !1⏎ 86 86 optional4: 87 87 call void @d() 88 88 call void @d() 90 90 call void @d() 91 91 br label %exit 92 92 exit: 93 ret void 94 } 95 96 ; Intended layout: 97 ; The chain-based outlining produces the layout 98 ; entry 99 ; --- Begin loop --- 100 ; for.latch 101 ; for.check 102 ; test1 103 ; test2 104 ; test3 105 ; test4 106 ; optional1 107 ; optional2 108 ; optional3 109 ; optional4 110 ; --- End loop --- 111 ; exit 112 ; The CHECK statements check for the whole string of tests and exit block, 113 ; and then check that the correct test has been duplicated into the end of 114 ; the optional blocks and that the optional blocks are in the correct order. 115 ;CHECK-LABEL: loop_test: 116 ;CHECK: add [[TAGPTRREG:[0-9]+]], 3, 4 117 ;CHECK: .[[LATCHLABEL:[._0-9A-Za-z]+]]: # %for.latch 118 ;CHECK: addi 119 ;CHECK: .[[CHECKLABEL:[._0-9A-Za-z]+]]: # %for.check 120 ;CHECK: lwz [[TAGREG:[0-9]+]], 0([[TAGPTRREG]]) 121 ;CHECK: # %test1 122 ;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 123 ;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[._0-9A-Za-z]+]] 124 ;CHECK-NEXT: # %test2 125 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 126 ;CHECK-NEXT: bne 0, .[[OPT2LABEL:[._0-9A-Za-z]+]] 127 ;CHECK-NEXT: .[[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3 128 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 129 ;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]] 130 ;CHECK-NEXT: .[[TEST4LABEL:[._0-9A-Za-z]+]]: # %{{(test4|optional3)}} 131 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 132 ;CHECK-NEXT: beq 0, .[[LATCHLABEL]] 133 ;CHECK-NEXT: b .[[OPT4LABEL:[._0-9A-Za-z]+]] 134 ;CHECK: [[OPT1LABEL]] 135 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 136 ;CHECK-NEXT: beq 0, .[[TEST3LABEL]] 137 ;CHECK-NEXT: .[[OPT2LABEL]] 138 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 139 ;CHECK-NEXT: beq 0, .[[TEST4LABEL]] 140 ;CHECK-NEXT: .[[OPT3LABEL]] 141 ;CHECK: rlwinm. 
{{[0-9]+}}, [[TAGREG]], 0, 28, 28 142 ;CHECK-NEXT: beq 0, .[[LATCHLABEL]] 143 ;CHECK: [[OPT4LABEL]]: 144 ;CHECK: b .[[LATCHLABEL]] 145 define void @loop_test(i32* %tags, i32 %count) { 146 entry: 147 br label %for.check 148 for.check: 149 %count.loop = phi i32 [%count, %entry], [%count.sub, %for.latch] 150 %done.count = icmp ugt i32 %count.loop, 0 151 %tag_ptr = getelementptr inbounds i32, i32* %tags, i32 %count 152 %tag = load i32, i32* %tag_ptr 153 %done.tag = icmp eq i32 %tag, 0 154 %done = and i1 %done.count, %done.tag 155 br i1 %done, label %test1, label %exit, !prof !1 156 test1: 157 %tagbit1 = and i32 %tag, 1 158 %tagbit1eq0 = icmp eq i32 %tagbit1, 0 159 br i1 %tagbit1eq0, label %test2, label %optional1, !prof !1 160 optional1: 161 call void @a() 162 call void @a() 163 call void @a() 164 call void @a() 165 br label %test2 166 test2: 167 %tagbit2 = and i32 %tag, 2 168 %tagbit2eq0 = icmp eq i32 %tagbit2, 0 169 br i1 %tagbit2eq0, label %test3, label %optional2, !prof !1 170 optional2: 171 call void @b() 172 call void @b() 173 call void @b() 174 call void @b() 175 br label %test3 176 test3: 177 %tagbit3 = and i32 %tag, 4 178 %tagbit3eq0 = icmp eq i32 %tagbit3, 0 179 br i1 %tagbit3eq0, label %test4, label %optional3, !prof !1 180 optional3: 181 call void @c() 182 call void @c() 183 call void @c() 184 call void @c() 185 br label %test4 186 test4: 187 %tagbit4 = and i32 %tag, 8 188 %tagbit4eq0 = icmp eq i32 %tagbit4, 0 189 br i1 %tagbit4eq0, label %for.latch, label %optional4, !prof !1 190 optional4: 191 call void @d() 192 call void @d() 193 call void @d() 194 call void @d() 195 br label %for.latch 196 for.latch: 197 %count.sub = sub i32 %count.loop, 1 198 br label %for.check 199 exit: 200 ret void 201 } 202 203 ; The block then2 is not unavoidable, meaning it does not dominate the exit. 204 ; But since it can be tail-duplicated, it should be placed as a fallthrough from 205 ; test2 and copied. 
The purpose here is to make sure that the tail-duplication 206 ; code is independent of the outlining code, which works by choosing the 207 ; "unavoidable" blocks. 208 ; CHECK-LABEL: avoidable_test: 209 ; CHECK: # %entry 210 ; CHECK: andi. 211 ; CHECK: # %test2 212 ; Make sure then2 falls through from test2 213 ; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}} 214 ; CHECK: # %then2 215 ; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29 216 ; CHECK: # %else1 217 ; CHECK: bl a 218 ; CHECK: bl a 219 ; Make sure then2 was copied into else1 220 ; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29 221 ; CHECK: # %end1 222 ; CHECK: bl d 223 ; CHECK: # %else2 224 ; CHECK: bl c 225 ; CHECK: # %end2 226 define void @avoidable_test(i32 %tag) { 227 entry: 228 br label %test1 229 test1: 230 %tagbit1 = and i32 %tag, 1 231 %tagbit1eq0 = icmp eq i32 %tagbit1, 0 232 br i1 %tagbit1eq0, label %test2, label %else1, !prof !1 ; %test2 more likely 233 else1: 234 call void @a() 235 call void @a() 236 br label %then2 237 test2: 238 %tagbit2 = and i32 %tag, 2 239 %tagbit2eq0 = icmp eq i32 %tagbit2, 0 240 br i1 %tagbit2eq0, label %then2, label %else2, !prof !1 ; %then2 more likely 241 then2: 242 %tagbit3 = and i32 %tag, 4 243 %tagbit3eq0 = icmp eq i32 %tagbit3, 0 244 br i1 %tagbit3eq0, label %end2, label %end1, !prof !1 ; %end2 more likely 245 else2: 246 call void @c() 247 br label %end2 248 end2: 249 ret void 250 end1: 251 call void @d() 252 ret void 253 } 254 255 ; CHECK-LABEL: trellis_test 256 ; The number in the block labels is the expected block frequency given the 257 ; probabilities annotated. There is a conflict in the b;c->d;e trellis that 258 ; should be resolved as c->e;b->d. 259 ; The d;e->f;g trellis should be resolved as e->g;d->f. 260 ; The f;g->h;i trellis should be resolved as f->i;g->h. 
261 ; The h;i->j;ret trellis contains a triangle edge, and should be resolved as 262 ; h->j->ret 263 ; CHECK: # %entry 264 ; CHECK: # %c10 265 ; CHECK: # %e9 266 ; CHECK: # %g10 267 ; CHECK: # %h10 268 ; CHECK: # %j8 269 ; CHECK: # %ret 270 ; CHECK: # %b6 271 ; CHECK: # %d7 272 ; CHECK: # %f6 273 ; CHECK: # %i6 274 define void @trellis_test(i32 %tag) { 275 entry: 276 br label %a16 277 a16: 278 call void @a() 279 call void @a() 280 %tagbits.a = and i32 %tag, 3 281 %tagbits.a.eq0 = icmp eq i32 %tagbits.a, 0 282 br i1 %tagbits.a.eq0, label %c10, label %b6, !prof !1 ; 10 to 6 283 c10: 284 call void @c() 285 call void @c() 286 %tagbits.c = and i32 %tag, 12 287 %tagbits.c.eq0 = icmp eq i32 %tagbits.c, 0 288 ; Both of these edges should be hotter than the other incoming edge 289 ; for e9 or d7 290 br i1 %tagbits.c.eq0, label %e9, label %d7, !prof !3 ; 6 to 4 291 e9: 292 call void @e() 293 call void @e() 294 %tagbits.e = and i32 %tag, 48 295 %tagbits.e.eq0 = icmp eq i32 %tagbits.e, 0 296 br i1 %tagbits.e.eq0, label %g10, label %f6, !prof !4 ; 7 to 2 297 g10: 298 call void @g() 299 call void @g() 300 %tagbits.g = and i32 %tag, 192 301 %tagbits.g.eq0 = icmp eq i32 %tagbits.g, 0 302 br i1 %tagbits.g.eq0, label %i6, label %h10, !prof !5 ; 2 to 8 303 i6: 304 call void @i() 305 call void @i() 306 %tagbits.i = and i32 %tag, 768 307 %tagbits.i.eq0 = icmp eq i32 %tagbits.i, 0 308 br i1 %tagbits.i.eq0, label %ret, label %j8, !prof !2 ; balanced (3 to 3) 309 b6: 310 call void @b() 311 call void @b() 312 %tagbits.b = and i32 %tag, 12 313 %tagbits.b.eq1 = icmp eq i32 %tagbits.b, 8 314 br i1 %tagbits.b.eq1, label %e9, label %d7, !prof !2 ; balanced (3 to 3) 315 d7: 316 call void @d() 317 call void @d() 318 %tagbits.d = and i32 %tag, 48 319 %tagbits.d.eq1 = icmp eq i32 %tagbits.d, 32 320 br i1 %tagbits.d.eq1, label %g10, label %f6, !prof !6 ; 3 to 4 321 f6: 322 call void @f() 323 call void @f() 324 %tagbits.f = and i32 %tag, 192 325 %tagbits.f.eq1 = icmp eq i32 %tagbits.f, 128 326 br i1 
%tagbits.f.eq1, label %i6, label %h10, !prof !7 ; 4 to 2 327 h10: 328 call void @h() 329 call void @h() 330 %tagbits.h = and i32 %tag, 768 331 %tagbits.h.eq1 = icmp eq i32 %tagbits.h, 512 332 br i1 %tagbits.h.eq1, label %ret, label %j8, !prof !2 ; balanced (5 to 5) 333 j8: 334 call void @j() 335 call void @j() 336 br label %ret 337 ret: 338 ret void 339 } 340 341 ; Verify that we still consider tail-duplication opportunities if we find a 342 ; triangle trellis. Here D->F->G is the triangle, and D;E are both predecessors 343 ; of both F and G. The basic trellis algorithm picks the F->G edge, but after 344 ; checking, it's profitable to duplicate G into F. The weights here are not 345 ; really important. They are there to help make the test stable. 346 ; CHECK-LABEL: trellis_then_dup_test 347 ; CHECK: # %entry 348 ; CHECK: # %b 349 ; CHECK: # %d 350 ; CHECK: # %g 351 ; CHECK: # %ret1 352 ; CHECK: # %c 353 ; CHECK: # %e 354 ; CHECK: # %f 355 ; CHECK: # %ret2 356 ; CHECK: # %ret 357 define void @trellis_then_dup_test(i32 %tag) { 358 entry: 359 br label %a 360 a: 361 call void @a() 362 call void @a() 363 %tagbits.a = and i32 %tag, 3 364 %tagbits.a.eq0 = icmp eq i32 %tagbits.a, 0 365 br i1 %tagbits.a.eq0, label %b, label %c, !prof !1 ; 5 to 3 366 b: 367 call void @b() 368 call void @b() 369 %tagbits.b = and i32 %tag, 12 370 %tagbits.b.eq1 = icmp eq i32 %tagbits.b, 8 371 br i1 %tagbits.b.eq1, label %d, label %e, !prof !1 ; 5 to 3 372 d: 373 call void @d() 374 call void @d() 375 %tagbits.d = and i32 %tag, 48 376 %tagbits.d.eq1 = icmp eq i32 %tagbits.d, 32 377 br i1 %tagbits.d.eq1, label %g, label %f, !prof !1 ; 5 to 3 378 f: 379 call void @f() 380 call void @f() 381 br label %g 382 g: 383 %tagbits.g = and i32 %tag, 192 384 %tagbits.g.eq0 = icmp eq i32 %tagbits.g, 0 385 br i1 %tagbits.g.eq0, label %ret1, label %ret2, !prof !2 ; balanced 386 c: 387 call void @c() 388 call void @c() 389 %tagbits.c = and i32 %tag, 12 390 %tagbits.c.eq0 = icmp eq i32 %tagbits.c, 0 391 br i1 
%tagbits.c.eq0, label %d, label %e, !prof !1 ; 5 to 3 392 e: 393 call void @e() 394 call void @e() 395 %tagbits.e = and i32 %tag, 48 396 %tagbits.e.eq0 = icmp eq i32 %tagbits.e, 0 397 br i1 %tagbits.e.eq0, label %g, label %f, !prof !1 ; 5 to 3 398 ret1: 399 call void @a() 400 br label %ret 401 ret2: 402 call void @b() 403 br label %ret 404 ret: 93 405 ret void 94 406 } 95 407 97 409 declare void @b() 98 410 declare void @c() 99 411 declare void @d() 412 declare void @e() 413 declare void @f() 414 declare void @g() 415 declare void @h() 416 declare void @i() 417 declare void @j() 418 419 !1 = !{!"branch_weights", i32 5, i32 3} 420 !2 = !{!"branch_weights", i32 50, i32 50} 421 !3 = !{!"branch_weights", i32 6, i32 4} 422 !4 = !{!"branch_weights", i32 7, i32 2} 423 !5 = !{!"branch_weights", i32 2, i32 8} 424 !6 = !{!"branch_weights", i32 3, i32 4} 425 !7 = !{!"branch_weights", i32 4, i32 2}
 66 66 ; CHECK: nop 67 67 ; CHECK:.LBB1_1: ! %entry 68 68 ; CHECK: mov %g0, %i0 69 ; CHECK: ! %entry 70 ; CHECK: cmp %i0, 0 71 ; CHECK: be .LBB1_5 72 ; CHECK: nop 73 ; CHECK:.LBB1_4: 74 ; CHECK: mov 1, %i0 75 ; CHECK: ba .LBB1_6 76 ; CHECK:.LBB1_2: ! Block address taken 77 ; CHECK: mov 1, %i0 69 78 ; CHECK: cmp %i0, 0 70 79 ; CHECK: bne .LBB1_4 71 ; CHECK: ba .LBB1_5 72 ; CHECK:.LBB1_2: ! Block address taken 73 ; CHECK: mov 1, %i0 74 ; CHECK: be .LBB1_5 75 ; CHECK:.LBB1_4: 76 ; CHECK: ba .LBB1_6⏎ 80 ; CHECK: nop⏎ 77 81 } 78 82 declare i8* @llvm.frameaddress(i32) #2 79 83
 None ; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0| FileCheck %s⏎ 0 ; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 | FileCheck %s⏎ 1 1 2 2 ; Test memcpy, memmove, and memset intrinsics. 3 3
 313 313 define void @unnatural_cfg1() { 314 314 ; Test that we can handle a loop with an inner unnatural loop at the end of 315 315 ; a function. This is a gross CFG reduced out of the single source GCC. 316 ; CHECK: unnatural_cfg1⏎ 316 ; CHECK-LABEL: unnatural_cfg1⏎ 317 317 ; CHECK: %entry 318 318 ; CHECK: %loop.body1 319 319 ; CHECK: %loop.body2 351 351 ; Test that we can handle a loop with a nested natural loop *and* an unnatural 352 352 ; loop. This was reduced from a crash on block placement when run over 353 353 ; single-source GCC. 354 ; CHECK: unnatural_cfg2⏎ 354 ; CHECK-LABEL: unnatural_cfg2⏎ 355 355 ; CHECK: %entry 356 356 ; CHECK: %loop.body1 357 357 ; CHECK: %loop.body2 358 ; CHECK: %loop.body4 359 ; CHECK: %loop.inner2.begin 360 ; CHECK: %loop.inner2.begin 358 361 ; CHECK: %loop.body3 359 362 ; CHECK: %loop.inner1.begin 360 ; The end block is folded with %loop.body3... 361 ; CHECK-NOT: %loop.inner1.end 362 ; CHECK: %loop.body4 363 ; CHECK: %loop.inner2.begin 364 ; The loop.inner2.end block is folded 365 363 ; CHECK: %loop.header 366 364 ; CHECK: %bail 367 365 558 556 ; didn't correctly locate the fallthrough successor, assuming blindly that the 559 557 ; first one was the fallthrough successor. As a result, we would add an 560 558 ; erroneous jump to the landing pad thinking *that* was the default successor. 561 ; CHECK: test_eh_lpad_successor⏎ 559 ; CHECK-LABEL: test_eh_lpad_successor⏎ 562 560 ; CHECK: %entry 563 561 ; CHECK-NOT: jmp 564 562 ; CHECK: %loop 586 584 ; fallthrough simply won't occur. Make sure we don't crash trying to update 587 585 ; terminators for such constructs. 588 586 ; 589 ; CHECK: test_eh_throw⏎ 587 ; CHECK-LABEL: test_eh_throw⏎ 590 588 ; CHECK: %entry 591 589 ; CHECK: %cleanup 592 590 608 606 ; attempt to merge onto the wrong end of the inner loop just because we find it 609 607 ; first. This was reduced from a crasher in GCC's single source. 
610 608 ; 611 ; CHECK: test_unnatural_cfg_backwards_inner_loop⏎ 609 ; CHECK-LABEL: test_unnatural_cfg_backwards_inner_loop⏎ 612 610 ; CHECK: %entry 613 611 ; CHECK: %loop2b 614 612 ; CHECK: %loop1 648 646 ; fallthrough because that happens to always produce unanalyzable branches on 649 647 ; x86. 650 648 ; 651 ; CHECK: unanalyzable_branch_to_loop_header⏎ 649 ; CHECK-LABEL: unanalyzable_branch_to_loop_header⏎ 652 650 ; CHECK: %entry 653 651 ; CHECK: %loop 654 652 ; CHECK: %exit 672 670 ; This branch is now analyzable and hence the destination block becomes the 673 671 ; hotter one. The right order is entry->bar->exit->foo. 674 672 ; 675 ; CHECK: unanalyzable_branch_to_best_succ⏎ 673 ; CHECK-LABEL: unanalyzable_branch_to_best_succ⏎ 676 674 ; CHECK: %entry 677 675 ; CHECK: %bar 678 676 ; CHECK: %exit 698 696 ; Ensure that we can handle unanalyzable branches where the destination block 699 697 ; gets selected as the best free block in the CFG. 700 698 ; 701 ; CHECK: unanalyzable_branch_to_free_block⏎ 699 ; CHECK-LABEL: unanalyzable_branch_to_free_block⏎ 702 700 ; CHECK: %entry 703 701 ; CHECK: %a 704 702 ; CHECK: %b 728 726 ; Ensure that we don't crash as we're building up many unanalyzable branches, 729 727 ; blocks, and loops. 730 728 ; 731 ; CHECK: many_unanalyzable_branches⏎ 729 ; CHECK-LABEL: many_unanalyzable_branches⏎ 732 730 ; CHECK: %entry 733 731 ; CHECK: %exit 734 732 947 945 ; strange layouts that are siginificantly less efficient, often times maing 948 946 ; it discontiguous. 949 947 ; 950 ; CHECK: @benchmark_heapsort⏎ 948 ; CHECK-LABEL: @benchmark_heapsort⏎ 951 949 ; CHECK: %entry 952 950 ; First rotated loop top. 953 951 ; CHECK: .p2align
 94 94 ; CHECK-NEXT: idivl %ebx 95 95 ; CHECK-NEXT: movl %eax, %esi 96 96 ; CHECK-NEXT: testl \$-256, %edi 97 ; CHECK-NEXT: jne .LBB3_5 98 ; CHECK-NEXT: jmp .LBB3_4⏎ 97 ; CHECK-NEXT: je .LBB3_4⏎ 98 ; CHECK-NEXT: .LBB3_5: 99 ; CHECK-NEXT: xorl %edx, %edx 100 ; CHECK-NEXT: movl %ecx, %eax 101 ; CHECK-NEXT: divl %ebx 102 ; CHECK-NEXT: jmp .LBB3_6 99 103 ; CHECK-NEXT: .LBB3_1: 100 104 ; CHECK-NEXT: movzbl %cl, %eax 101 105 ; CHECK-NEXT: # kill: %EAX %EAX %AX 102 106 ; CHECK-NEXT: divb %bl 103 107 ; CHECK-NEXT: movzbl %al, %esi 104 108 ; CHECK-NEXT: testl \$-256, %edi 105 ; CHECK-NEXT: je .LBB3_4 106 ; CHECK-NEXT: .LBB3_5: 107 ; CHECK-NEXT: xorl %edx, %edx 108 ; CHECK-NEXT: movl %ecx, %eax 109 ; CHECK-NEXT: divl %ebx 110 ; CHECK-NEXT: jmp .LBB3_6⏎ 109 ; CHECK-NEXT: jne .LBB3_5⏎ 111 110 ; CHECK-NEXT: .LBB3_4: 112 111 ; CHECK-NEXT: movzbl %cl, %eax 113 112 ; CHECK-NEXT: # kill: %EAX %EAX %AX
 59 59 ; X32-NEXT: xorps %xmm1, %xmm1 60 60 ; X32-NEXT: cmpl \$0, {{[0-9]+}}(%esp) 61 61 ; X32-NEXT: jne .LBB1_5 62 ; X32-NEXT: jmp .LBB1_4⏎ 62 ; X32-NEXT: .LBB1_4:⏎ 63 ; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 64 ; X32-NEXT: cmpl \$0, {{[0-9]+}}(%esp) 65 ; X32-NEXT: jne .LBB1_8 66 ; X32-NEXT: .LBB1_7: 67 ; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero 68 ; X32-NEXT: jmp .LBB1_9 63 69 ; X32-NEXT: .LBB1_1: 64 70 ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 65 71 ; X32-NEXT: cmpl \$0, {{[0-9]+}}(%esp) 67 73 ; X32-NEXT: .LBB1_5: # %entry 68 74 ; X32-NEXT: xorps %xmm2, %xmm2 69 75 ; X32-NEXT: cmpl \$0, {{[0-9]+}}(%esp) 70 ; X32-NEXT: jne .LBB1_8 71 ; X32-NEXT: jmp .LBB1_7 72 ; X32-NEXT: .LBB1_4: 73 ; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 74 ; X32-NEXT: cmpl \$0, {{[0-9]+}}(%esp) 75 76 ; X32-NEXT: je .LBB1_7 76 77 ; X32-NEXT: .LBB1_8: # %entry 77 78 ; X32-NEXT: xorps %xmm3, %xmm3 78 ; X32-NEXT: jmp .LBB1_9 79 ; X32-NEXT: .LBB1_7: 80 ; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero 81 79 ; X32-NEXT: .LBB1_9: # %entry 82 80 ; X32-NEXT: cmpl \$0, {{[0-9]+}}(%esp) 83 81 ; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 98 96 ; X64-NEXT: xorps %xmm1, %xmm1 99 97 ; X64-NEXT: testl %edx, %edx 100 98 ; X64-NEXT: jne .LBB1_5 101 ; X64-NEXT: jmp .LBB1_4⏎ 99 ; X64-NEXT: .LBB1_4:⏎ 100 ; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 101 ; X64-NEXT: testl %r8d, %r8d 102 ; X64-NEXT: jne .LBB1_8 103 ; X64-NEXT: .LBB1_7: 104 ; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero 105 ; X64-NEXT: jmp .LBB1_9 102 106 ; X64-NEXT: .LBB1_1: 103 107 ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero 104 108 ; X64-NEXT: testl %edx, %edx 106 110 ; X64-NEXT: .LBB1_5: # %entry 107 111 ; X64-NEXT: xorps %xmm2, %xmm2 108 112 ; X64-NEXT: testl %r8d, %r8d 109 ; X64-NEXT: jne .LBB1_8 110 ; X64-NEXT: jmp .LBB1_7 111 ; X64-NEXT: .LBB1_4: 112 ; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero 113 ; X64-NEXT: 
testl %r8d, %r8d 114 113 ; X64-NEXT: je .LBB1_7 115 114 ; X64-NEXT: .LBB1_8: # %entry 116 115 ; X64-NEXT: xorps %xmm3, %xmm3 117 ; X64-NEXT: jmp .LBB1_9 118 ; X64-NEXT: .LBB1_7: 119 ; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero 120 116 ; X64-NEXT: .LBB1_9: # %entry 121 117 ; X64-NEXT: testl %esi, %esi 122 118 ; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 214 210 ret <4 x i32> %zext 215 211 } 216 212 217 ; Fragile test warning - we need to induce the generation of a vselect ⏎ 213 ; Fragile test warning - we need to induce the generation of a vselect⏎ 218 214 ; post-legalization to cause the crash seen in: 219 215 ; https://llvm.org/bugs/show_bug.cgi?id=31672 220 216 ; Is there a way to do that without an unsafe/fast sqrt intrinsic call?
 5 5 ; CHECK-LABEL: tail_dup_merge_loops 6 6 ; CHECK: # %entry 7 7 ; CHECK-NOT: # %{{[a-zA-Z_]+}} 8 ; CHECK: # %exit 9 ; CHECK-NOT: # %{{[a-zA-Z_]+}} 8 10 ; CHECK: # %inner_loop_exit 9 11 ; CHECK-NOT: # %{{[a-zA-Z_]+}} 10 12 ; CHECK: # %inner_loop_latch 11 13 ; CHECK-NOT: # %{{[a-zA-Z_]+}} 12 14 ; CHECK: # %inner_loop_test 13 ; CHECK-NOT: # %{{[a-zA-Z_]+}} 14 ; CHECK: # %exit 15 15 define void @tail_dup_merge_loops(i32 %a, i8* %b, i8* %c) local_unnamed_addr #0 { 16 16 entry: 17 17 %notlhs674.i = icmp eq i32 %a, 0
 None ; RUN: llc -O2 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s⏎ 0 ; RUN: llc -O3 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s⏎ 1 1 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" 2 2 target triple = "x86_64-unknown-linux-gnu" 3 3
 112 112 ; CHECK-NEXT: jbe .LBB2_3 113 113 ; CHECK-NEXT: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}} 114 114 ; CHECK-NEXT: ja .LBB2_4 115 ; CHECK-NEXT: jmp .LBB2_2⏎ 115 ; CHECK-NEXT: .LBB2_2:⏎ 116 ; CHECK-NEXT: movb \$1, %al 117 ; CHECK-NEXT: ret 116 118 ; CHECK-NEXT: .LBB2_3: 117 119 ; CHECK-NEXT: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}} 118 120 ; CHECK-NEXT: jbe .LBB2_2 119 121 ; CHECK-NEXT: .LBB2_4: 120 122 ; CHECK-NEXT: xorl %eax, %eax 121 ; CHECK-NEXT: ret 122 ; CHECK-NEXT: .LBB2_2: 123 ; CHECK-NEXT: movb \$1, %al 124 123 ; CHECK-NEXT: ret 125 124 126 125 define i1 @dont_merge_oddly(float* %result) nounwind {
 18 18 19 19 ; Check that only one mov will be generated in the kernel loop. 20 20 ; CHECK-LABEL: foo: 21 ; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body⏎ 21 ; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{\$}}⏎ 22 22 ; CHECK-NOT: mov 23 23 ; CHECK: movl {{.*}}, [[REG1:%[a-z0-9]+]] 24 24 ; CHECK-NOT: mov 55 55 56 56 ; Check that only two mov will be generated in the kernel loop. 57 57 ; CHECK-LABEL: goo: 58 ; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body⏎ 58 ; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{\$}}⏎ 59 59 ; CHECK-NOT: mov 60 60 ; CHECK: movl {{.*}}, [[REG2:%[a-z0-9]+]] 61 61 ; CHECK-NOT: mov
 114 114 ; Test that the blocks are analyzed in the correct order. 115 115 ; CHECK-LABEL: cfg: 116 116 entry: 117 br i1 %x, label %bb1, label %bb2⏎ 117 br i1 %x, label %bb1, label %bb3⏎ 118 118 119 119 bb1: 120 120 %p1 = alloca %struct.S 121 121 ; CHECK: pushl %eax 122 122 ; CHECK: subl \$1020, %esp 123 br label %bb3⏎ 123 br label %bb4⏎ 124 124 125 bb2: 126 %p5 = alloca %struct.T 127 ; CHECK: pushl %eax 128 ; CHECK: subl \$2996, %esp 129 call void @g(%struct.T* %p5) 130 ret void 131 132 bb3: 125 133 %p2 = alloca %struct.T 126 134 ; CHECK: pushl %eax 127 135 ; CHECK: subl \$2996, %esp 128 br label %bb3 129 130 bb3: 131 br i1 %y, label %bb4, label %bb5⏎ 136 br label %bb4⏎ 132 137 133 138 bb4: 139 br i1 %y, label %bb5, label %bb2 140 141 bb5: 134 142 %p4 = alloca %struct.S 135 143 ; CHECK: subl \$1024, %esp 136 144 call void @f(%struct.S* %p4) 137 145 ret void 138 146 139 bb5: 140 %p5 = alloca %struct.T 141 ; CHECK: pushl %eax 142 ; CHECK: subl \$2996, %esp 143 call void @g(%struct.T* %p5) 144 ret void 145 147 } 146 148 147 149