llvm.org GIT mirror: llvm @ 08eb02a
[REVERT][LV][X86] Update the cost of interleaved memory accesses of floats

Reverting my changes; they will be recommitted later, after the failure is fixed. The reverted patch updated the costs of interleaved loads of v8f32 with strides 3 and 8.

Differential Revision: https://reviews.llvm.org/D39403

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317433 91177308-0d34-0410-b5e6-96231b3b80d8

Mohammed Agabaria, 2 years ago
2 changed files with 1 addition and 145 deletions.
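For scale: at VF 8 a stride-3 interleave group spans 3 x 8 = 24 contiguous floats (three v8f32 members), and a stride-8 group spans 8 x 8 = 64 floats, which is exactly what the reverted { 3, MVT::v8f32, 17 } and { 8, MVT::v8f32, 40 } entries below priced.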
  { 3, MVT::v8i8,  9 },  // (load 24i8 and) deinterleave into 3 x 8i8
  { 3, MVT::v16i8, 11 }, // (load 48i8 and) deinterleave into 3 x 16i8
  { 3, MVT::v32i8, 13 }, // (load 96i8 and) deinterleave into 3 x 32i8
- { 3, MVT::v8f32, 17 }, // (load 24f32 and) deinterleave into 3 x 8f32

  { 4, MVT::v2i8,  12 }, // (load 8i8 and) deinterleave into 4 x 2i8
  { 4, MVT::v4i8,  4 },  // (load 16i8 and) deinterleave into 4 x 4i8
  { 4, MVT::v8i8,  20 }, // (load 32i8 and) deinterleave into 4 x 8i8
  { 4, MVT::v16i8, 39 }, // (load 64i8 and) deinterleave into 4 x 16i8
- { 4, MVT::v32i8, 80 }, // (load 128i8 and) deinterleave into 4 x 32i8
-
- { 8, MVT::v8f32, 40 }  // (load 64f32 and) deinterleave into 8 x 8f32
+ { 4, MVT::v32i8, 80 }  // (load 128i8 and) deinterleave into 4 x 32i8
};

static const CostTblEntry AVX2InterleavedStoreTbl[] = {
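For context, entries in these tables are matched by interleave factor and per-member vector type through LLVM's cost-table helper in llvm/CodeGen/CostTable.h. Below is a minimal sketch of that lookup, modeled on the pattern in X86TTIImpl::getInterleavedMemoryOpCostAVX2; the wrapper name lookupInterleavedLoadCost and the trimmed-down table are illustrative assumptions, not the actual LLVM source:

#include "llvm/CodeGen/CostTable.h"

using namespace llvm;

// Trimmed-down stand-in for the load table above (illustrative only).
static const CostTblEntry InterleavedLoadTbl[] = {
    {3, MVT::v8f32, 17}, // the reverted stride-3 v8f32 entry
    {8, MVT::v8f32, 40}, // the reverted stride-8 v8f32 entry
};

// Hypothetical wrapper: returns the modeled cost of deinterleaving
// `Factor` vectors of type `VT` out of one wide interleaved load, or -1
// when the table has no entry and the caller must fall back to the
// generic cost computation.
static int lookupInterleavedLoadCost(unsigned Factor, MVT VT) {
  if (const auto *Entry = CostTableLookup(InterleavedLoadTbl, Factor, VT))
    return Entry->Cost;
  return -1;
}

When the lookup misses, the X86 implementation falls back to the target-independent interleaving cost, so removing the v8f32 entries sends those queries back to the generic path.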
test/Analysis/CostModel/interleaved-load-float.ll: 0 additions, 141 deletions (file removed)
; REQUIRES: asserts
; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=skylake %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
target triple = "i386-unknown-linux-gnu"

@src = common local_unnamed_addr global [120 x float] zeroinitializer, align 4
@dst = common local_unnamed_addr global [120 x float] zeroinitializer, align 4

; Function Attrs: norecurse nounwind
define void @stride8(float %k, i32 %width_) {
entry:

; CHECK: Found an estimated cost of 48 for VF 8 For instruction: %0 = load float

  %cmp72 = icmp sgt i32 %width_, 0
  br i1 %cmp72, label %for.body.lr.ph, label %for.cond.cleanup

for.body.lr.ph:                                   ; preds = %entry
  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %for.body
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  ret void

for.body:                                         ; preds = %for.body.lr.ph, %for.body
  %i.073 = phi i32 [ 0, %for.body.lr.ph ], [ %add46, %for.body ]
  %arrayidx = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %i.073
  %0 = load float, float* %arrayidx, align 4
  %mul = fmul fast float %0, %k
  %arrayidx2 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %i.073
  %1 = load float, float* %arrayidx2, align 4
  %add3 = fadd fast float %1, %mul
  store float %add3, float* %arrayidx2, align 4
  %add4 = or i32 %i.073, 1
  %arrayidx5 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add4
  %2 = load float, float* %arrayidx5, align 4
  %mul6 = fmul fast float %2, %k
  %arrayidx8 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add4
  %3 = load float, float* %arrayidx8, align 4
  %add9 = fadd fast float %3, %mul6
  store float %add9, float* %arrayidx8, align 4
  %add10 = or i32 %i.073, 2
  %arrayidx11 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add10
  %4 = load float, float* %arrayidx11, align 4
  %mul12 = fmul fast float %4, %k
  %arrayidx14 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add10
  %5 = load float, float* %arrayidx14, align 4
  %add15 = fadd fast float %5, %mul12
  store float %add15, float* %arrayidx14, align 4
  %add16 = or i32 %i.073, 3
  %arrayidx17 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add16
  %6 = load float, float* %arrayidx17, align 4
  %mul18 = fmul fast float %6, %k
  %arrayidx20 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add16
  %7 = load float, float* %arrayidx20, align 4
  %add21 = fadd fast float %7, %mul18
  store float %add21, float* %arrayidx20, align 4
  %add22 = or i32 %i.073, 4
  %arrayidx23 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add22
  %8 = load float, float* %arrayidx23, align 4
  %mul24 = fmul fast float %8, %k
  %arrayidx26 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add22
  %9 = load float, float* %arrayidx26, align 4
  %add27 = fadd fast float %9, %mul24
  store float %add27, float* %arrayidx26, align 4
  %add28 = or i32 %i.073, 5
  %arrayidx29 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add28
  %10 = load float, float* %arrayidx29, align 4
  %mul30 = fmul fast float %10, %k
  %arrayidx32 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add28
  %11 = load float, float* %arrayidx32, align 4
  %add33 = fadd fast float %11, %mul30
  store float %add33, float* %arrayidx32, align 4
  %add34 = or i32 %i.073, 6
  %arrayidx35 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add34
  %12 = load float, float* %arrayidx35, align 4
  %mul36 = fmul fast float %12, %k
  %arrayidx38 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add34
  %13 = load float, float* %arrayidx38, align 4
  %add39 = fadd fast float %13, %mul36
  store float %add39, float* %arrayidx38, align 4
  %add40 = or i32 %i.073, 7
  %arrayidx41 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add40
  %14 = load float, float* %arrayidx41, align 4
  %mul42 = fmul fast float %14, %k
  %arrayidx44 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add40
  %15 = load float, float* %arrayidx44, align 4
  %add45 = fadd fast float %15, %mul42
  store float %add45, float* %arrayidx44, align 4
  %add46 = add nuw nsw i32 %i.073, 8
  %cmp = icmp slt i32 %add46, %width_
  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
}

; Function Attrs: norecurse nounwind
define void @stride3(float %k, i32 %width_) {
entry:

; CHECK: Found an estimated cost of 20 for VF 8 For instruction: %0 = load float

  %cmp27 = icmp sgt i32 %width_, 0
  br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup

for.body.lr.ph:                                   ; preds = %entry
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %for.body.lr.ph, %for.body
  %i.028 = phi i32 [ 0, %for.body.lr.ph ], [ %add16, %for.body ]
  %arrayidx = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %i.028
  %0 = load float, float* %arrayidx, align 4
  %mul = fmul fast float %0, %k
  %arrayidx2 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %i.028
  %1 = load float, float* %arrayidx2, align 4
  %add3 = fadd fast float %1, %mul
  store float %add3, float* %arrayidx2, align 4
  %add4 = add nuw nsw i32 %i.028, 1
  %arrayidx5 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add4
  %2 = load float, float* %arrayidx5, align 4
  %mul6 = fmul fast float %2, %k
  %arrayidx8 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add4
  %3 = load float, float* %arrayidx8, align 4
  %add9 = fadd fast float %3, %mul6
  store float %add9, float* %arrayidx8, align 4
  %add10 = add nuw nsw i32 %i.028, 2
  %arrayidx11 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add10
  %4 = load float, float* %arrayidx11, align 4
  %mul12 = fmul fast float %4, %k
  %arrayidx14 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add10
  %5 = load float, float* %arrayidx14, align 4
  %add15 = fadd fast float %5, %mul12
  store float %add15, float* %arrayidx14, align 4
  %add16 = add nuw nsw i32 %i.028, 3
  %cmp = icmp slt i32 %add16, %width_
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}