llvm.org GIT mirror llvm / 4a05fa1
Changed basic cost of store operation on X86. Store operation takes 2 UOps on X86 processors. The exact cost calculation affects several optimization passes, including loop unrolling. This change compensates for the performance degradation caused by https://reviews.llvm.org/D34458 and shows improvements on some benchmarks. Differential Revision: https://reviews.llvm.org/D35888 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@311285 91177308-0d34-0410-b5e6-96231b3b80d8 Elena Demikhovsky 2 years ago
4 changed file(s) with 122 addition(s) and 1 deletion(s). Raw diff Collapse all Expand all
21122112 return X86TTIImpl::getIntImmCost(Imm, Ty);
21132113 }
21142114
2115 unsigned X86TTIImpl::getUserCost(const User *U,
2116 ArrayRef Operands) {
2117 if (isa(U)) {
2118 Value *Ptr = U->getOperand(1);
2119 // Store instruction with index and scale costs 2 Uops.
2120 // Check the preceding GEP to identify non-const indices.
2121 if (auto GEP = dyn_cast(Ptr)) {
2122 if (!all_of(GEP->indices(), [](Value *V) { return isa(V); }))
2123 return TTI::TCC_Basic * 2;
2124 }
2125 return TTI::TCC_Basic;
2126 }
2127 return BaseT::getUserCost(U, Operands);
2128 }
2129
21152130 // Return an average cost of Gather / Scatter instruction, maybe improved later
21162131 int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
21172132 unsigned Alignment, unsigned AddressSpace) {
101101
102102 int getIntImmCost(const APInt &Imm, Type *Ty);
103103
104 unsigned getUserCost(const User *U, ArrayRef Operands);
105
104106 int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
105107 int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
106108 Type *Ty);
0 ; REQUIRES: asserts
1 ; RUN: opt -mcpu=core-avx2 -loop-unroll --debug-only=loop-unroll -S -unroll-allow-partial < %s 2>&1 | FileCheck %s
2
3 target triple = "x86_64-unknown-linux-gnu"
4
5 ; CHECK: Loop Unroll: F[foo] Loop %loop.2.header
6 ; CHECK: Loop Size = 27
7 ; CHECK-NOT: UNROLLING loop %loop.2.header
8 ; CHECK: Loop Unroll: F[foo] Loop %loop.header
9 ; CHECK: Loop Size = 25
10 ; CHECK: UNROLLING loop %loop.header by 2
11
; Test input for the loop-unroll cost check: two loops that differ only in
; the number of stores per iteration (seven in the first, eight in the second).
; The CHECK lines at the top of this test expect the first loop
; (%loop.header, Loop Size = 25) to be unrolled by 2 and the second
; (%loop.2.header, Loop Size = 27) to be left alone.
12 define void @foo(i32 * %out) {
13 entry:
; Eight stack arrays of 1024 x i32 serve as store targets for both loops.
14 %0 = alloca [1024 x i32]
15 %x0 = alloca [1024 x i32]
16 %x01 = alloca [1024 x i32]
17 %x02 = alloca [1024 x i32]
18 %x03 = alloca [1024 x i32]
19 %x04 = alloca [1024 x i32]
20 %x05 = alloca [1024 x i32]
21 %x06 = alloca [1024 x i32]
22 br label %loop.header
23
; First loop: %counter starts at 0 and is advanced by 2 in %loop.inc.
24 loop.header:
25 %counter = phi i32 [0, %entry], [%inc, %loop.inc]
26 br label %loop.body
27
; Seven stores, each through a GEP indexed by the non-constant %counter.
28 loop.body:
29 %ptr = getelementptr [1024 x i32], [1024 x i32]* %0, i32 0, i32 %counter
30 store i32 %counter, i32* %ptr
31 %val = add i32 %counter, 5
32 %xptr = getelementptr [1024 x i32], [1024 x i32]* %x0, i32 0, i32 %counter
33 store i32 %val, i32* %xptr
34 %val1 = add i32 %counter, 6
35 %xptr1 = getelementptr [1024 x i32], [1024 x i32]* %x01, i32 0, i32 %counter
36 store i32 %val1, i32* %xptr1
37 %val2 = add i32 %counter, 7
38 %xptr2 = getelementptr [1024 x i32], [1024 x i32]* %x02, i32 0, i32 %counter
39 store i32 %val2, i32* %xptr2
40 %val3 = add i32 %counter, 8
41 %xptr3 = getelementptr [1024 x i32], [1024 x i32]* %x03, i32 0, i32 %counter
42 store i32 %val3, i32* %xptr3
43 %val4 = add i32 %counter, 9
44 %xptr4 = getelementptr [1024 x i32], [1024 x i32]* %x04, i32 0, i32 %counter
45 store i32 %val4, i32* %xptr4
46 %val5 = add i32 %counter, 10
47 %xptr5 = getelementptr [1024 x i32], [1024 x i32]* %x05, i32 0, i32 %counter
48 store i32 %val5, i32* %xptr5
49 br label %loop.inc
50
; Latch: leave the loop once the incremented counter is >= 1023 (signed).
51 loop.inc:
52 %inc = add i32 %counter, 2
53 %1 = icmp sge i32 %inc, 1023
54 br i1 %1, label %exit.0, label %loop.header
55
; Between the loops: load element 5 of %0 and store it to %out.
56 exit.0:
57 %2 = getelementptr [1024 x i32], [1024 x i32]* %0, i32 0, i32 5
58 %3 = load i32, i32* %2
59 store i32 %3, i32 * %out
60 br label %loop.2.header
61
62
; Second loop: same induction scheme as the first, on %counter.2.
63 loop.2.header:
64 %counter.2 = phi i32 [0, %exit.0], [%inc.2, %loop.2.inc]
65 br label %loop.2.body
66
; Eight stores: the seven from the first loop plus one more into %x06.
67 loop.2.body:
68 %ptr.2 = getelementptr [1024 x i32], [1024 x i32]* %0, i32 0, i32 %counter.2
69 store i32 %counter.2, i32* %ptr.2
70 %val.2 = add i32 %counter.2, 5
71 %xptr.2 = getelementptr [1024 x i32], [1024 x i32]* %x0, i32 0, i32 %counter.2
72 store i32 %val.2, i32* %xptr.2
73 %val1.2 = add i32 %counter.2, 6
74 %xptr1.2 = getelementptr [1024 x i32], [1024 x i32]* %x01, i32 0, i32 %counter.2
; NOTE(review): stores %val1 (defined in loop.body), leaving %val1.2 unused;
; confirm this is intended before changing it, the CHECK sizes depend on the body.
75 store i32 %val1, i32* %xptr1.2
76 %val2.2 = add i32 %counter.2, 7
77 %xptr2.2 = getelementptr [1024 x i32], [1024 x i32]* %x02, i32 0, i32 %counter.2
; NOTE(review): stores %val2 (defined in loop.body), leaving %val2.2 unused;
; confirm intended.
78 store i32 %val2, i32* %xptr2.2
79 %val3.2 = add i32 %counter.2, 8
80 %xptr3.2 = getelementptr [1024 x i32], [1024 x i32]* %x03, i32 0, i32 %counter.2
81 store i32 %val3.2, i32* %xptr3.2
82 %val4.2 = add i32 %counter.2, 9
83 %xptr4.2 = getelementptr [1024 x i32], [1024 x i32]* %x04, i32 0, i32 %counter.2
84 store i32 %val4.2, i32* %xptr4.2
85 %val5.2 = add i32 %counter.2, 10
86 %xptr5.2 = getelementptr [1024 x i32], [1024 x i32]* %x05, i32 0, i32 %counter.2
87 store i32 %val5.2, i32* %xptr5.2
; The extra eighth store reuses %val5.2 as its value.
88 %xptr6.2 = getelementptr [1024 x i32], [1024 x i32]* %x06, i32 0, i32 %counter.2
89 store i32 %val5.2, i32* %xptr6.2
90 br label %loop.2.inc
91
; Latch: same exit condition as the first loop, on %inc.2.
92 loop.2.inc:
93 %inc.2 = add i32 %counter.2, 2
94 %4 = icmp sge i32 %inc.2, 1023
95 br i1 %4, label %exit.2, label %loop.2.header
96
; Final block: loads element 6 of %0 into %x3, then stores to %out + 1.
97 exit.2:
98 %x2 = getelementptr [1024 x i32], [1024 x i32]* %0, i32 0, i32 6
99 %x3 = load i32, i32* %x2
100 %out2 = getelementptr i32, i32 * %out, i32 1
; NOTE(review): stores %3 (loaded in exit.0); %x3 loaded above is unused;
; confirm intended.
101 store i32 %3, i32 * %out2
102 ret void
103 }
171171 %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
172172 store i32 %add, i32* %arrayidx2, align 4
173173 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
174 %exitcond = icmp eq i64 %indvars.iv.next, 64
174 %exitcond = icmp eq i64 %indvars.iv.next, 48
175175 br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2
176176
177177 for.end: ; preds = %for.body