llvm.org GIT mirror — llvm @ ffb8f09

[X86][CM] update add/sub costs of vectors of 64 in X86/SLM arch

This patch updates the cost of addq/subq (add/subtract of vectors of 64 bits) based on the performance numbers of the SLM arch.

Differential Revision: https://reviews.llvm.org/D33983

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@306974 91177308-0d34-0410-b5e6-96231b3b80d8

Mohammed Agabaria, 2 years ago
3 changed files with 78 additions and 11 deletions.
 { ISD::FDIV, MVT::v2f64, 69 }, // divpd
 { ISD::FADD, MVT::v2f64, 2 }, // addpd
 { ISD::FSUB, MVT::v2f64, 2 }, // subpd
-// v2i64/v4i64 mul is custom lowered as a series of long
-// multiplies(3), shifts(3) and adds(2).
-// slm muldq version throughput is 2
-{ ISD::MUL, MVT::v2i64, 11 },
+// v2i64/v4i64 mul is custom lowered as a series of long
+// multiplies(3), shifts(3) and adds(2).
+// slm muldq version throughput is 2 and addq throughput is 4;
+// thus: 3X2 (muldq throughput) + 3X1 (shift throughput) +
+// 2X4 (addq throughput) = 17
+{ ISD::MUL, MVT::v2i64, 17 },
+// slm addq/subq throughput is 4
+{ ISD::ADD, MVT::v2i64, 4 },
+{ ISD::SUB, MVT::v2i64, 4 },
 };
 
 if (ST->isSLM()) {
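For reference, this table is consulted when the target reports itself as SLM (ST->isSLM()); the real code in lib/Target/X86/X86TargetTransformInfo.cpp uses llvm::CostTblEntry and llvm::CostTableLookup from CostTable.h. Below is a minimal standalone sketch of that lookup pattern, with simplified stand-in enums rather than LLVM's real ISD/MVT types:

#include <cstdio>

// Simplified stand-ins for LLVM's ISD opcodes and MVT value types.
enum Opcode { MUL, ADD, SUB };
enum SimpleVT { v2i64 };

struct CostTblEntry {
  Opcode Op;
  SimpleVT Type;
  unsigned Cost;
};

// The updated SLM entries from the diff above.
static const CostTblEntry SLMCostTable[] = {
    {MUL, v2i64, 17},
    {ADD, v2i64, 4},
    {SUB, v2i64, 4},
};

// Linear scan, first match wins -- the same contract as CostTableLookup.
static const CostTblEntry *lookup(Opcode Op, SimpleVT VT) {
  for (const CostTblEntry &E : SLMCostTable)
    if (E.Op == Op && E.Type == VT)
      return &E;
  return nullptr; // no entry: fall back to the generic cost model
}

int main() {
  if (const CostTblEntry *E = lookup(ADD, v2i64))
    std::printf("SLM cost of add <2 x i64>: %u\n", E->Cost); // prints 4
  return 0;
}

An entry returning 17 for a v2i64 multiply (and 4 for add/sub) is what drives the new FileCheck values in the cost-model test below.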
 ...
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
+
+define <2 x i64> @slm-costs_64_vector_add(<2 x i64> %a, <2 x i64> %b) {
+entry:
+; SLM: cost of 4 {{.*}} add <2 x i64>
+  %res = add <2 x i64> %a, %b
+  ret <2 x i64> %res
+}
+
+define <2 x i64> @slm-costs_64_vector_sub(<2 x i64> %a, <2 x i64> %b) {
+entry:
+; SLM: cost of 4 {{.*}} sub <2 x i64>
+  %res = sub <2 x i64> %a, %b
+  ret <2 x i64> %res
+}
 
 ; 8bit mul
 define i8 @slm-costs_8_scalar_mul(i8 %a, i8 %b) {
 ...
 
 define <2 x i8> @slm-costs_8_v2_mul(<2 x i8> %a, <2 x i8> %b) {
 entry:
-; SLM: cost of 11 {{.*}} mul nsw <2 x i8>
+; SLM: cost of 17 {{.*}} mul nsw <2 x i8>
   %res = mul nsw <2 x i8> %a, %b
   ret <2 x i8> %res
 }
 ...
 
 define <2 x i16> @slm-costs_16_v2_mul(<2 x i16> %a, <2 x i16> %b) {
 entry:
-; SLM: cost of 11 {{.*}} mul nsw <2 x i16>
+; SLM: cost of 17 {{.*}} mul nsw <2 x i16>
   %res = mul nsw <2 x i16> %a, %b
   ret <2 x i16> %res
 }
 ...
 
 define <2 x i32> @slm-costs_32_v2_mul(<2 x i32> %a, <2 x i32> %b) {
 entry:
-; SLM: cost of 11 {{.*}} mul nsw <2 x i32>
+; SLM: cost of 17 {{.*}} mul nsw <2 x i32>
   %res = mul nsw <2 x i32> %a, %b
   ret <2 x i32> %res
 }
 ...
 
 define <2 x i64> @slm-costs_64_v2_mul(<2 x i64> %a, <2 x i64> %b) {
 entry:
-; SLM: cost of 11 {{.*}} mul nsw <2 x i64>
+; SLM: cost of 17 {{.*}} mul nsw <2 x i64>
   %res = mul nsw <2 x i64> %a, %b
   ret <2 x i64> %res
 }
 
 define <4 x i64> @slm-costs_64_v4_mul(<4 x i64> %a, <4 x i64> %b) {
 entry:
-; SLM: cost of 22 {{.*}} mul nsw <4 x i64>
+; SLM: cost of 34 {{.*}} mul nsw <4 x i64>
   %res = mul nsw <4 x i64> %a, %b
   ret <4 x i64> %res
 }
 
 define <8 x i64> @slm-costs_64_v8_mul(<8 x i64> %a, <8 x i64> %b) {
 entry:
-; SLM: cost of 44 {{.*}} mul nsw <8 x i64>
+; SLM: cost of 68 {{.*}} mul nsw <8 x i64>
   %res = mul nsw <8 x i64> %a, %b
   ret <8 x i64> %res
 }
 
 define <16 x i64> @slm-costs_64_v16_mul(<16 x i64> %a, <16 x i64> %b) {
 entry:
-; SLM: cost of 88 {{.*}} mul nsw <16 x i64>
+; SLM: cost of 136 {{.*}} mul nsw <16 x i64>
   %res = mul nsw <16 x i64> %a, %b
   ret <16 x i64> %res
 }
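The widened checks scale linearly: type legalization splits an <N x i64> multiply into N/2 legal <2 x i64> halves, so the expected cost is (N/2) * 17, which is exactly what the updated values encode. The <2 x i8>/<2 x i16>/<2 x i32> cases show the same 17, which suggests their elements are promoted to i64 before the table lookup. A quick sanity check of that arithmetic (slmWideMulCost is our name for illustration, not an LLVM function):

#include <cassert>

// Hypothetical helper: expected SLM cost of an <N x i64> mul after the
// type legalizer splits it into N/2 legal <2 x i64> halves, each paying
// the v2i64 table cost of 17.
unsigned slmWideMulCost(unsigned NumElts) {
  const unsigned V2I64MulCost = 17; // from the SLM cost table above
  return (NumElts / 2) * V2I64MulCost;
}

int main() {
  assert(slmWideMulCost(2) == 17);   // <2 x i64> check above
  assert(slmWideMulCost(4) == 34);   // <4 x i64> check above
  assert(slmWideMulCost(8) == 68);   // <8 x i64> check above
  assert(slmWideMulCost(16) == 136); // <16 x i64> check above
  return 0;
}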
+; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -mcpu=slm -debug 2>&1 | FileCheck -check-prefix=MSG %s
+; This test should not be vectorized on the X86/SLM arch.
+; Vectorizing the 64-bit multiply here is wrong, since it could be done
+; in a narrower bit width (note that the sources are 16 bit).
+; In addition, addq/subq (quad-word add/sub) have a high cost on the SLM arch.
+; This test regresses by about 70% if vectorized on the SLM arch.
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @no_vec(i32 %LastIndex, i16* nocapture readonly %InputData, i16 signext %lag, i16 signext %Scale) {
+entry:
+; MSG: LV: Selecting VF: 1.
+  %cmp17 = icmp sgt i32 %LastIndex, 0
+  br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv5 = sext i16 %Scale to i64
+  %sh_prom = and i64 %conv5, 4294967295
+  %0 = sext i16 %lag to i64
+  %wide.trip.count = zext i32 %LastIndex to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %conv8 = trunc i64 %add7 to i32
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %Accumulator.0.lcssa = phi i32 [ 0, %entry ], [ %conv8, %for.cond.cleanup.loopexit ]
+  ret i32 %Accumulator.0.lcssa
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %Accumulator.018 = phi i64 [ 0, %for.body.lr.ph ], [ %add7, %for.body ]
+  %arrayidx = getelementptr inbounds i16, i16* %InputData, i64 %indvars.iv
+  %1 = load i16, i16* %arrayidx, align 2
+  %conv = sext i16 %1 to i64
+  %2 = add nsw i64 %indvars.iv, %0
+  %arrayidx3 = getelementptr inbounds i16, i16* %InputData, i64 %2
+  %3 = load i16, i16* %arrayidx3, align 2
+  %conv4 = sext i16 %3 to i64
+  %mul = mul nsw i64 %conv4, %conv
+  %shr = ashr i64 %mul, %sh_prom
+  %add7 = add i64 %shr, %Accumulator.018
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
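For readers who prefer source over IR, here is a hypothetical C++ kernel, reconstructed from the IR above rather than taken from the commit, that a compiler would lower to roughly this shape. The loads are 16 bit, but the multiply, shift, and accumulate are all 64 bit, which is exactly the addq-heavy pattern the new SLM costs are meant to keep scalar:

#include <cstdint>

// Hypothetical C++ source for @no_vec (reconstructed from the IR; the
// commit does not include it): a 16-bit correlation-style kernel whose
// multiply, shift, and accumulate are all 64 bit.
int32_t no_vec(int32_t LastIndex, const int16_t *InputData,
               int16_t lag, int16_t Scale) {
  int64_t Accumulator = 0; // %Accumulator.018 / %add7
  for (int32_t i = 0; i < LastIndex; ++i)
    // two sign-extended 16-bit loads, mul nsw i64, ashr i64, add i64
    Accumulator +=
        (int64_t(InputData[i]) * InputData[i + lag]) >> Scale;
  return int32_t(Accumulator); // trunc i64 -> i32 (%conv8)
}

With the updated add/sub costs, the vectorizer's cost model makes the scalar plan cheapest here, matching the "LV: Selecting VF: 1." check.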