llvm.org GIT mirror: llvm @ bd7c3fb
[LSR] Don't try and create post-inc expressions on non-rotated loops

If a loop is not rotated (for example when optimizing for size), the latch is not the backedge. If we promote an expression to post-inc form, we not only increase register pressure and add a COPY for that IV expression, but we do so for all IVs!

Motivating testcase:

    void f(float *a, float *b, float *c, int n) {
      while (n-- > 0)
        *c++ = *a++ + *b++;
    }

It's imperative that the pointer increments be located in the latch block and not the header block; if not, we cannot use post-increment loads and stores, and we have to keep both the post-inc and pre-inc values around until the end of the latch, which bloats register usage.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@278658 91177308-0d34-0410-b5e6-96231b3b80d8

Author: James Molloy
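For readers new to the terminology: a rotated loop tests its exit condition at the bottom, so the latch block is also the exiting block; a non-rotated loop (the shape left behind when rotation is skipped, e.g. when optimizing for size) tests at the top. A minimal sketch of the two shapes, for illustration only:

    // Non-rotated (head-tested): the exit test lives in the loop
    // header; the latch at the bottom only branches back up.
    void headTested(float *a, float *b, float *c, int n) {
      while (n-- > 0)       // test in the header
        *c++ = *a++ + *b++;
    }

    // Rotated (bottom-tested): the exit test lives in the latch, so
    // the latch is also the exiting block and post-incremented values
    // are naturally live there.
    void bottomTested(float *a, float *b, float *c, int n) {
      if (n > 0) {
        do {
          *c++ = *a++ + *b++;
        } while (--n > 0);  // test in the latch
      }
    }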
7 changed files with 72 additions and 8 deletions.
 LSRInstance::OptimizeLoopTermCond() {
   SmallPtrSet<Instruction *, 4> PostIncs;
 
+  // We need a different set of heuristics for rotated and non-rotated loops.
+  // If a loop is rotated then the latch is also the backedge, so inserting
+  // post-inc expressions just before the latch is ideal. To reduce live ranges
+  // it also makes sense to rewrite terminating conditions to use post-inc
+  // expressions.
+  //
+  // If the loop is not rotated then the latch is not a backedge; the latch
+  // check is done in the loop head. Adding post-inc expressions before the
+  // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
+  // in the loop body. In this case we do *not* want to use post-inc expressions
+  // in the latch check, and we want to insert post-inc expressions before
+  // the backedge.
   BasicBlock *LatchBlock = L->getLoopLatch();
   SmallVector<BasicBlock*, 8> ExitingBlocks;
   L->getExitingBlocks(ExitingBlocks);
-
+  if (llvm::all_of(ExitingBlocks, [&LatchBlock](const BasicBlock *BB) {
+        return LatchBlock != BB;
+      })) {
+    // The backedge doesn't exit the loop; treat this as a head-tested loop.
+    IVIncInsertPos = LatchBlock->getTerminator();
+    return;
+  }
+
+  // Otherwise treat this as a rotated loop.
   for (BasicBlock *ExitingBlock : ExitingBlocks) {
 
     // Get the terminating condition for the loop if possible. If we
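As a self-contained model of the check added above (std::all_of standing in for llvm::all_of; the Block type and the isHeadTested name are hypothetical stand-ins, not LLVM API):

    #include <algorithm>
    #include <vector>

    // Hypothetical stand-in for llvm::BasicBlock.
    struct Block {};

    // Mirrors the new heuristic's shape: if no exiting block is the
    // latch, the backedge never exits the loop, so the loop is
    // head-tested and IV increments should be pinned to the latch
    // rather than rewritten into post-inc form.
    bool isHeadTested(const std::vector<const Block *> &ExitingBlocks,
                      const Block *Latch) {
      return std::all_of(ExitingBlocks.begin(), ExitingBlocks.end(),
                         [Latch](const Block *BB) { return BB != Latch; });
    }

    int main() {
      Block A, B, Latch;
      std::vector<const Block *> Exiting = {&A, &B};
      // Neither exiting block is the latch -> head-tested loop.
      return isHeadTested(Exiting, &Latch) ? 0 : 1;
    }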
 ; CHECK: s_and_b64 exec, exec, [[LIVE]]
 ; CHECK: image_store
 ; CHECK: s_wqm_b64 exec, exec
-; CHECK: v_mov_b32_e32 [[CTR:v[0-9]+]], -2
+; CHECK: v_mov_b32_e32 [[CTR:v[0-9]+]], 0
 ; CHECK: s_branch [[LOOPHDR:BB[0-9]+_[0-9]+]]
 
+; CHECK: v_add_i32_e32 [[CTR]], vcc, 2, [[CTR]]
+
 ; CHECK: [[LOOPHDR]]: ; %loop
-; CHECK: v_add_i32_e32 [[CTR]], vcc, 2, [[CTR]]
 ; CHECK: v_cmp_lt_i32_e32 vcc, 7, [[CTR]]
 ; CHECK: s_cbranch_vccz
 ; CHECK: ; %break
   br i1 %1, label %bb3, label %bb1
 
 bb1: ; preds = %bb
+; CHECK: bb1
+; CHECK: subs [[REG:r[0-9]+]], #1
   %tmp = tail call i32 @puts() nounwind
   %indvar.next = add i32 %indvar, 1
   br label %bb2
 
 bb2: ; preds = %bb1, %entry
 ; CHECK: bb2
-; CHECK: subs [[REG:r[0-9]+]], #1
 ; CHECK: cmp [[REG]], #0
 ; CHECK: ble
   %indvar = phi i32 [ %indvar.next, %bb1 ], [ 0, %entry ]
 ; RUN: llc -O3 -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
+; XFAIL: *
 ;
 ; Generate hardware loop when loop 'latch' block is different
 ; from the loop 'exiting' block.
 ;
 ; Generate loop1 instruction for double loop sequence.
 
-; CHECK: loop0(.LBB{{.}}_{{.}}, #100)
-; CHECK: endloop0
 ; CHECK: loop1(.LBB{{.}}_{{.}}, #100)
 ; CHECK: loop0(.LBB{{.}}_{{.}}, #100)
 ; CHECK: endloop0
 
 ; CHECK-LABEL: t:
 ; CHECK: movl (%r9,%rax,4), %e{{..}}
-; CHECK-NEXT: decq
+; CHECK-NEXT: testq
 ; CHECK-NEXT: jne
 
 ; ATOM-LABEL: t:
 ; ATOM: movl (%r9,%r{{.+}},4), %e{{..}}
-; ATOM-NEXT: decq
+; ATOM-NEXT: testq
 ; ATOM-NEXT: jne
 
 @Te0 = external global [256 x i32] ; <[256 x i32]*> [#uses=5]
+; RUN: opt < %s -loop-reduce -S | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv7m-arm-none-eabi"
+
+; Check that the IV updates (incdec.ptr{,1,2}) are kept in the latch block
+; and not moved to the header/exiting block. Inserting them in the header
+; doubles register pressure and adds moves.
+
+; CHECK-LABEL: @f
+; CHECK: while.cond:
+; CHECK: icmp sgt i32 %n.addr.0, 0
+; CHECK: while.body:
+; CHECK: incdec.ptr =
+; CHECK: incdec.ptr1 =
+; CHECK: incdec.ptr2 =
+; CHECK: dec =
+define void @f(float* nocapture readonly %a, float* nocapture readonly %b, float* nocapture %c, i32 %n) {
+entry:
+  br label %while.cond
+
+while.cond: ; preds = %while.body, %entry
+  %a.addr.0 = phi float* [ %a, %entry ], [ %incdec.ptr, %while.body ]
+  %b.addr.0 = phi float* [ %b, %entry ], [ %incdec.ptr1, %while.body ]
+  %c.addr.0 = phi float* [ %c, %entry ], [ %incdec.ptr2, %while.body ]
+  %n.addr.0 = phi i32 [ %n, %entry ], [ %dec, %while.body ]
+  %cmp = icmp sgt i32 %n.addr.0, 0
+  br i1 %cmp, label %while.body, label %while.end
+
+while.body: ; preds = %while.cond
+  %incdec.ptr = getelementptr inbounds float, float* %a.addr.0, i32 1
+  %tmp = load float, float* %a.addr.0, align 4
+  %incdec.ptr1 = getelementptr inbounds float, float* %b.addr.0, i32 1
+  %tmp1 = load float, float* %b.addr.0, align 4
+  %add = fadd float %tmp, %tmp1
+  %incdec.ptr2 = getelementptr inbounds float, float* %c.addr.0, i32 1
+  store float %add, float* %c.addr.0, align 4
+  %dec = add nsw i32 %n.addr.0, -1
+  br label %while.cond
+
+while.end: ; preds = %while.cond
+  ret void
+}
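To exercise the new test by hand, its RUN line can be invoked directly (assuming a built opt and FileCheck on PATH; the file name below is hypothetical, since the page omits it):

    opt < lsr-postinc-pos.ll -loop-reduce -S | FileCheck lsr-postinc-pos.ll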