llvm.org GIT mirror llvm / 84066df
[LV][X86] Update the cost of interleaving memory accesses of floats

Recommit: this patch updates the costs of interleaved loads of v8f32 with strides 3 and 8. The lit test has been moved to the correct location so that it passes with make check-all.

Differential Revision: https://reviews.llvm.org/D39403

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@317471 91177308-0d34-0410-b5e6-96231b3b80d8
Mohammed Agabaria, 2 years ago
2 changed files with 145 additions and 1 deletion.
   { 3, MVT::v8i8, 9 },   //(load 24i8 and) deinterleave into 3 x 8i8
   { 3, MVT::v16i8, 11},  //(load 48i8 and) deinterleave into 3 x 16i8
   { 3, MVT::v32i8, 13},  //(load 96i8 and) deinterleave into 3 x 32i8
+  { 3, MVT::v8f32, 17 }, //(load 24f32 and)deinterleave into 3 x 8f32

   { 4, MVT::v2i8, 12 },  //(load 8i8 and) deinterleave into 4 x 2i8
   { 4, MVT::v4i8, 4 },   //(load 16i8 and) deinterleave into 4 x 4i8
   { 4, MVT::v8i8, 20 },  //(load 32i8 and) deinterleave into 4 x 8i8
   { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
-  { 4, MVT::v32i8, 80 }  //(load 128i8 and) deinterleave into 4 x 32i8
+  { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
+
+  { 8, MVT::v8f32, 40 }  //(load 64f32 and)deinterleave into 8 x 8f32
 };

 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
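How the new entries are used (a minimal stand-alone sketch, not the in-tree code): the X86 interleaved-access cost model looks up the interleave factor (the stride) and the per-member vector type in AVX2InterleavedLoadTbl and, on a hit, adds the table cost to the cost of the wide memory operations. The struct, function name, and exact formula below are assumptions made for illustration; the real lookup goes through LLVM's CostTableLookup and getMemoryOpCost, and each interleave member is assumed here to map to a single legal 256-bit load of cost 1.

// Stand-alone model of the lookup; names and the formula are assumptions,
// not the in-tree implementation.
#include <cstdio>

struct Entry { unsigned Factor; const char *MemberTy; int Cost; };

// The two entries added by this patch, keyed by interleave factor.
static const Entry AVX2InterleavedLoadFloatTbl[] = {
    {3, "v8f32", 17}, // load 24 floats, deinterleave into 3 x 8f32
    {8, "v8f32", 40}, // load 64 floats, deinterleave into 8 x 8f32
};

// Assumed formula: one legal 256-bit load per interleave member (cost 1 each)
// plus the shuffle/deinterleave cost from the table.
static int interleavedLoadCost(unsigned Factor) {
  for (const Entry &E : AVX2InterleavedLoadFloatTbl)
    if (E.Factor == Factor)
      return static_cast<int>(Factor) * 1 + E.Cost;
  return -1; // stride not covered by the new entries
}

int main() {
  std::printf("stride 3, VF 8: %d\n", interleavedLoadCost(3)); // prints 20
  std::printf("stride 8, VF 8: %d\n", interleavedLoadCost(8)); // prints 48
}

Under that assumption the values checked by the new lit test fall out directly: 3 + 17 = 20 for the stride-3 loop and 8 + 40 = 48 for the stride-8 loop. The second changed file, the new lit test, follows.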
; REQUIRES: asserts
; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=skylake %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
target triple = "i386-unknown-linux-gnu"

@src = common local_unnamed_addr global [120 x float] zeroinitializer, align 4
@dst = common local_unnamed_addr global [120 x float] zeroinitializer, align 4

; Function Attrs: norecurse nounwind
define void @stride8(float %k, i32 %width_) {
entry:

; CHECK: Found an estimated cost of 48 for VF 8 For instruction: %0 = load float

  %cmp72 = icmp sgt i32 %width_, 0
  br i1 %cmp72, label %for.body.lr.ph, label %for.cond.cleanup

for.body.lr.ph: ; preds = %entry
  br label %for.body

for.cond.cleanup.loopexit: ; preds = %for.body
  br label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
  ret void

for.body: ; preds = %for.body.lr.ph, %for.body
  %i.073 = phi i32 [ 0, %for.body.lr.ph ], [ %add46, %for.body ]
  %arrayidx = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %i.073
  %0 = load float, float* %arrayidx, align 4
  %mul = fmul fast float %0, %k
  %arrayidx2 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %i.073
  %1 = load float, float* %arrayidx2, align 4
  %add3 = fadd fast float %1, %mul
  store float %add3, float* %arrayidx2, align 4
  %add4 = or i32 %i.073, 1
  %arrayidx5 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add4
  %2 = load float, float* %arrayidx5, align 4
  %mul6 = fmul fast float %2, %k
  %arrayidx8 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add4
  %3 = load float, float* %arrayidx8, align 4
  %add9 = fadd fast float %3, %mul6
  store float %add9, float* %arrayidx8, align 4
  %add10 = or i32 %i.073, 2
  %arrayidx11 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add10
  %4 = load float, float* %arrayidx11, align 4
  %mul12 = fmul fast float %4, %k
  %arrayidx14 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add10
  %5 = load float, float* %arrayidx14, align 4
  %add15 = fadd fast float %5, %mul12
  store float %add15, float* %arrayidx14, align 4
  %add16 = or i32 %i.073, 3
  %arrayidx17 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add16
  %6 = load float, float* %arrayidx17, align 4
  %mul18 = fmul fast float %6, %k
  %arrayidx20 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add16
  %7 = load float, float* %arrayidx20, align 4
  %add21 = fadd fast float %7, %mul18
  store float %add21, float* %arrayidx20, align 4
  %add22 = or i32 %i.073, 4
  %arrayidx23 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add22
  %8 = load float, float* %arrayidx23, align 4
  %mul24 = fmul fast float %8, %k
  %arrayidx26 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add22
  %9 = load float, float* %arrayidx26, align 4
  %add27 = fadd fast float %9, %mul24
  store float %add27, float* %arrayidx26, align 4
  %add28 = or i32 %i.073, 5
  %arrayidx29 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add28
  %10 = load float, float* %arrayidx29, align 4
  %mul30 = fmul fast float %10, %k
  %arrayidx32 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add28
  %11 = load float, float* %arrayidx32, align 4
  %add33 = fadd fast float %11, %mul30
  store float %add33, float* %arrayidx32, align 4
  %add34 = or i32 %i.073, 6
  %arrayidx35 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add34
  %12 = load float, float* %arrayidx35, align 4
  %mul36 = fmul fast float %12, %k
  %arrayidx38 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add34
  %13 = load float, float* %arrayidx38, align 4
  %add39 = fadd fast float %13, %mul36
  store float %add39, float* %arrayidx38, align 4
  %add40 = or i32 %i.073, 7
  %arrayidx41 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add40
  %14 = load float, float* %arrayidx41, align 4
  %mul42 = fmul fast float %14, %k
  %arrayidx44 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add40
  %15 = load float, float* %arrayidx44, align 4
  %add45 = fadd fast float %15, %mul42
  store float %add45, float* %arrayidx44, align 4
  %add46 = add nuw nsw i32 %i.073, 8
  %cmp = icmp slt i32 %add46, %width_
  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
}

; Function Attrs: norecurse nounwind
define void @stride3(float %k, i32 %width_) {
entry:

; CHECK: Found an estimated cost of 20 for VF 8 For instruction: %0 = load float

  %cmp27 = icmp sgt i32 %width_, 0
  br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup

for.body.lr.ph: ; preds = %entry
  br label %for.body

for.cond.cleanup: ; preds = %for.body, %entry
  ret void

for.body: ; preds = %for.body.lr.ph, %for.body
  %i.028 = phi i32 [ 0, %for.body.lr.ph ], [ %add16, %for.body ]
  %arrayidx = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %i.028
  %0 = load float, float* %arrayidx, align 4
  %mul = fmul fast float %0, %k
  %arrayidx2 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %i.028
  %1 = load float, float* %arrayidx2, align 4
  %add3 = fadd fast float %1, %mul
  store float %add3, float* %arrayidx2, align 4
  %add4 = add nuw nsw i32 %i.028, 1
  %arrayidx5 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add4
  %2 = load float, float* %arrayidx5, align 4
  %mul6 = fmul fast float %2, %k
  %arrayidx8 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add4
  %3 = load float, float* %arrayidx8, align 4
  %add9 = fadd fast float %3, %mul6
  store float %add9, float* %arrayidx8, align 4
  %add10 = add nuw nsw i32 %i.028, 2
  %arrayidx11 = getelementptr inbounds [120 x float], [120 x float]* @src, i32 0, i32 %add10
  %4 = load float, float* %arrayidx11, align 4
  %mul12 = fmul fast float %4, %k
  %arrayidx14 = getelementptr inbounds [120 x float], [120 x float]* @dst, i32 0, i32 %add10
  %5 = load float, float* %arrayidx14, align 4
  %add15 = fadd fast float %5, %mul12
  store float %add15, float* %arrayidx14, align 4
  %add16 = add nuw nsw i32 %i.028, 3
  %cmp = icmp slt i32 %add16, %width_
  br i1 %cmp, label %for.body, label %for.cond.cleanup
}
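For reference, a sketch of the shape the table comments describe, e.g. "(load 24f32 and) deinterleave into 3 x 8f32": a stride-3 interleaved group at VF 8 is costed as one wide load of 24 floats followed by one strided shufflevector per member. The IR below is illustrative only and not part of the test; the function name and pointer argument are made up.

; Illustration only: stride-3 deinterleave of 24 floats into 3 x <8 x float>.
define void @deinterleave3_sketch(<24 x float>* %p) {
entry:
  %wide = load <24 x float>, <24 x float>* %p, align 4
  %m0 = shufflevector <24 x float> %wide, <24 x float> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
  %m1 = shufflevector <24 x float> %wide, <24 x float> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
  %m2 = shufflevector <24 x float> %wide, <24 x float> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
  ret void
}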