llvm.org GIT mirror llvm / 89ec3b0
[TTI][X86] update costs of interleaved load\store of i64\double This patch contains a more accurate cost for interleaved load\store of stride 2 for the types int64\double on AVX2. Reviewers: delena, RKSimon, craig.topper, dorit Reviewed By: dorit Differential Revision: https://reviews.llvm.org/D40008 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@318385 91177308-0d34-0410-b5e6-96231b3b80d8 Mohammed Agabaria 2 years ago
3 changed file(s) with 86 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
26462646 // The cost of the loads/stores is accounted for separately.
26472647 //
26482648 static const CostTblEntry AVX2InterleavedLoadTbl[] = {
2649 { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64
2650 { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64
2651
26492652 { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
26502653 { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
26512654 { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
26632666 };
26642667
26652668 static const CostTblEntry AVX2InterleavedStoreTbl[] = {
2669 { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store)
2670 { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store)
2671
26662672 { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
26672673 { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
26682674 { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
0 ; REQUIRES: asserts
1 ; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=skylake %s 2>&1 | FileCheck %s
2 target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
3 target triple = "i386-unknown-linux-gnu"
4
5 @doublesrc = common local_unnamed_addr global [120 x double] zeroinitializer, align 4
6 @doubledst = common local_unnamed_addr global [120 x double] zeroinitializer, align 4
7
8 ; Function Attrs: norecurse nounwind
9 define void @stride2double(double %k, i32 %width_) {
10 entry:
11 ; Copies @doublesrc to @doubledst two adjacent elements per iteration while the index stays below %width_; %k is not referenced in the body.
12 ; CHECK: Found an estimated cost of 8 for VF 4 For instruction: %0 = load double
13 ; CHECK: Found an estimated cost of 8 for VF 4 For instruction: store double
14
15 %cmp27 = icmp sgt i32 %width_, 0 ; loop is entered only when %width_ > 0
16 br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup
17
18 for.body.lr.ph: ; preds = %entry
19 br label %for.body
20
21 for.cond.cleanup: ; preds = %for.body, %entry
22 ret void
23
24 for.body: ; preds = %for.body.lr.ph, %for.body
25 %i.028 = phi i32 [ 0, %for.body.lr.ph ], [ %add16, %for.body ] ; index i: 0 on entry, then i + 2 from the previous iteration
26 %arrayidx = getelementptr inbounds [120 x double], [120 x double]* @doublesrc, i32 0, i32 %i.028
27 %0 = load double, double* %arrayidx, align 4 ; read doublesrc[i]
28 %arrayidx2 = getelementptr inbounds [120 x double], [120 x double]* @doubledst, i32 0, i32 %i.028
29 store double %0, double* %arrayidx2, align 4 ; write doubledst[i]
30 %add4 = add nuw nsw i32 %i.028, 1 ; i + 1: second element of the pair
31 %arrayidx5 = getelementptr inbounds [120 x double], [120 x double]* @doublesrc, i32 0, i32 %add4
32 %1 = load double, double* %arrayidx5, align 4 ; read doublesrc[i + 1]
33 %arrayidx8 = getelementptr inbounds [120 x double], [120 x double]* @doubledst, i32 0, i32 %add4
34 store double %1, double* %arrayidx8, align 4 ; write doubledst[i + 1]
35 %add16 = add nuw nsw i32 %i.028, 2 ; advance the index by 2
36 %cmp = icmp slt i32 %add16, %width_ ; continue while i + 2 < %width_
37 br i1 %cmp, label %for.body, label %for.cond.cleanup
38 }
39
0 ; REQUIRES: asserts
1 ; RUN: opt -S -loop-vectorize -debug-only=loop-vectorize -mcpu=core-avx2 %s 2>&1 | FileCheck %s
2 target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
3 target triple = "i386-unknown-linux-gnu"
4
5 @i64src = common local_unnamed_addr global [120 x i64] zeroinitializer, align 4
6 @i64dst = common local_unnamed_addr global [120 x i64] zeroinitializer, align 4
7
8 ; Function Attrs: norecurse nounwind
9 define void @stride2i64(i64 %k, i32 %width_) {
10 entry:
11 ; Copies @i64src to @i64dst two adjacent elements per iteration while the index stays below %width_; %k is not referenced in the body.
12 ; CHECK: Found an estimated cost of 8 for VF 4 For instruction: %0 = load i64
13 ; CHECK: Found an estimated cost of 8 for VF 4 For instruction: store i64
14
15 %cmp27 = icmp sgt i32 %width_, 0 ; loop is entered only when %width_ > 0
16 br i1 %cmp27, label %for.body.lr.ph, label %for.cond.cleanup
17
18 for.body.lr.ph: ; preds = %entry
19 br label %for.body
20
21 for.cond.cleanup: ; preds = %for.body, %entry
22 ret void
23
24 for.body: ; preds = %for.body.lr.ph, %for.body
25 %i.028 = phi i32 [ 0, %for.body.lr.ph ], [ %add16, %for.body ] ; index i: 0 on entry, then i + 2 from the previous iteration
26 %arrayidx = getelementptr inbounds [120 x i64], [120 x i64]* @i64src, i32 0, i32 %i.028
27 %0 = load i64, i64* %arrayidx, align 4 ; read i64src[i]
28 %arrayidx2 = getelementptr inbounds [120 x i64], [120 x i64]* @i64dst, i32 0, i32 %i.028
29 store i64 %0, i64* %arrayidx2, align 4 ; write i64dst[i]
30 %add4 = add nuw nsw i32 %i.028, 1 ; i + 1: second element of the pair
31 %arrayidx5 = getelementptr inbounds [120 x i64], [120 x i64]* @i64src, i32 0, i32 %add4
32 %1 = load i64, i64* %arrayidx5, align 4 ; read i64src[i + 1]
33 %arrayidx8 = getelementptr inbounds [120 x i64], [120 x i64]* @i64dst, i32 0, i32 %add4
34 store i64 %1, i64* %arrayidx8, align 4 ; write i64dst[i + 1]
35 %add16 = add nuw nsw i32 %i.028, 2 ; advance the index by 2
36 %cmp = icmp slt i32 %add16, %width_ ; continue while i + 2 < %width_
37 br i1 %cmp, label %for.body, label %for.cond.cleanup
38 }
39