llvm.org GIT mirror llvm / 8c69b6f
[PowerPC] Update Vector Costs for P9. For the Power9 CPU, vector operations consume a pair of execution units rather than one execution unit like a scalar operation. Update the target transform cost functions to reflect the higher cost of vector operations when targeting Power9. Patch by RolandF. Differential revision: https://reviews.llvm.org/D55461 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@352261 91177308-0d34-0410-b5e6-96231b3b80d8 Nemanja Ivanovic 8 months ago
7 changed file(s) with 167 addition(s) and 13 deletion(s). Raw diff Collapse all Expand all
189189 "Enable POWER9 vector instructions",
190190 [FeatureISA3_0, FeatureP8Vector,
191191 FeatureP9Altivec]>;
192 // A separate feature for this even though it is equivalent to P9Vector
193 // because this is a feature of the implementation rather than the architecture
194 // and may go away with future CPUs.
195 def FeatureVectorsUseTwoUnits : SubtargetFeature<"vectors-use-two-units",
196 "VectorsUseTwoUnits",
197 "true",
198 "Vectors use two units">;
192199
193200 // Since new processors generally contain a superset of features of those that
194201 // came before them, the idea is to make implementations of new processors
221228 list Power8FeatureList =
222229 !listconcat(Power7FeatureList, Power8SpecificFeatures);
223230 list Power9SpecificFeatures =
224 [DirectivePwr9, FeatureP9Altivec, FeatureP9Vector, FeatureISA3_0];
231 [DirectivePwr9, FeatureP9Altivec, FeatureP9Vector, FeatureISA3_0,
232 FeatureVectorsUseTwoUnits];
225233 list Power9FeatureList =
226234 !listconcat(Power8FeatureList, Power9SpecificFeatures);
227235 }
106106 IsISA3_0 = false;
107107 UseLongCalls = false;
108108 SecurePlt = false;
109 VectorsUseTwoUnits = false;
109110
110111 HasPOPCNTD = POPCNTD_Unavailable;
111112 }
134134 bool IsISA3_0;
135135 bool UseLongCalls;
136136 bool SecurePlt;
137 bool VectorsUseTwoUnits;
137138
138139 POPCNTDKind HasPOPCNTD;
139140
258259 bool isPPC4xx() const { return IsPPC4xx; }
259260 bool isPPC6xx() const { return IsPPC6xx; }
260261 bool isSecurePlt() const {return SecurePlt; }
262 bool vectorsUseTwoUnits() const {return VectorsUseTwoUnits; }
261263 bool isE500() const { return IsE500; }
262264 bool isFeatureMFTB() const { return FeatureMFTB; }
263265 bool isDeprecatedDST() const { return DeprecatedDST; }
322322 return 2;
323323 }
324324
325 // Adjust the cost of vector instructions on targets which there is overlap
326 // between the vector and scalar units, thereby reducing the overall throughput
327 // of vector code wrt. scalar code.
328 int PPCTTIImpl::vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1,
329 Type *Ty2) {
330 if (!ST->vectorsUseTwoUnits() || !Ty1->isVectorTy())
331 return Cost;
332
333 std::pair LT1 = TLI->getTypeLegalizationCost(DL, Ty1);
334 // If type legalization involves splitting the vector, we don't want to
335 // double the cost at every step - only the last step.
336 if (LT1.first != 1 || !LT1.second.isVector())
337 return Cost;
338 int ISD = TLI->InstructionOpcodeToISD(Opcode);
339 if (TLI->isOperationExpand(ISD, LT1.second))
340 return Cost;
341
342 if (Ty2) {
343 std::pair LT2 = TLI->getTypeLegalizationCost(DL, Ty2);
344 if (LT2.first != 1 || !LT2.second.isVector())
345 return Cost;
346 }
347
348 return Cost * 2;
349 }
350
325351 int PPCTTIImpl::getArithmeticInstrCost(
326352 unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
327353 TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
329355 assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
330356
331357 // Fallback to the default implementation.
332 return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
333 Opd1PropInfo, Opd2PropInfo);
358 int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
359 Opd1PropInfo, Opd2PropInfo);
360 return vectorCostAdjustment(Cost, Opcode, Ty, nullptr);
334361 }
335362
336363 int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
343370 // instruction). We need one such shuffle instruction for each actual
344371 // register (this is not true for arbitrary shuffles, but is true for the
345372 // structured types of shuffles covered by TTI::ShuffleKind).
346 return LT.first;
373 return vectorCostAdjustment(LT.first, Instruction::ShuffleVector, Tp,
374 nullptr);
347375 }
348376
349377 int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
350378 const Instruction *I) {
351379 assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
352380
353 return BaseT::getCastInstrCost(Opcode, Dst, Src);
381 int Cost = BaseT::getCastInstrCost(Opcode, Dst, Src);
382 return vectorCostAdjustment(Cost, Opcode, Dst, Src);
354383 }
355384
356385 int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
357386 const Instruction *I) {
358 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
387 int Cost = BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
388 return vectorCostAdjustment(Cost, Opcode, ValTy, nullptr);
359389 }
360390
361391 int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
364394 int ISD = TLI->InstructionOpcodeToISD(Opcode);
365395 assert(ISD && "Invalid opcode");
366396
397 int Cost = BaseT::getVectorInstrCost(Opcode, Val, Index);
398 Cost = vectorCostAdjustment(Cost, Opcode, Val, nullptr);
399
367400 if (ST->hasVSX() && Val->getScalarType()->isDoubleTy()) {
368 // Double-precision scalars are already located in index #0.
369 if (Index == 0)
401 // Double-precision scalars are already located in index #0 (or #1 if LE).
402 if (ISD == ISD::EXTRACT_VECTOR_ELT && Index == ST->isLittleEndian() ? 1 : 0)
370403 return 0;
371404
372 return BaseT::getVectorInstrCost(Opcode, Val, Index);
405 return Cost;
406
373407 } else if (ST->hasQPX() && Val->getScalarType()->isFloatingPointTy()) {
374408 // Floating point scalars are already located in index #0.
375409 if (Index == 0)
376410 return 0;
377411
378 return BaseT::getVectorInstrCost(Opcode, Val, Index);
412 return Cost;
379413 }
380414
381415 // Estimated cost of a load-hit-store delay. This was obtained
392426 // these need to be estimated as very costly.
393427 if (ISD == ISD::EXTRACT_VECTOR_ELT ||
394428 ISD == ISD::INSERT_VECTOR_ELT)
395 return LHSPenalty + BaseT::getVectorInstrCost(Opcode, Val, Index);
396
397 return BaseT::getVectorInstrCost(Opcode, Val, Index);
429 return LHSPenalty + Cost;
430
431 return Cost;
398432 }
399433
400434 int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
405439 "Invalid Opcode");
406440
407441 int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
442 Cost = vectorCostAdjustment(Cost, Opcode, Src, nullptr);
408443
409444 bool IsAltivecType = ST->hasAltivec() &&
410445 (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
6969 unsigned getCacheLineSize();
7070 unsigned getPrefetchDistance();
7171 unsigned getMaxInterleaveFactor(unsigned VF);
72 int vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1, Type *Ty2);
7273 int getArithmeticInstrCost(
7374 unsigned Opcode, Type *Ty,
7475 TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
0 ; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx | FileCheck %s
1 ; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr9 -mattr=+vsx | FileCheck --check-prefix=CHECK-P9 %s
2 ; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -mattr=+vsx | FileCheck --check-prefix=CHECK-LE %s
3
4 define void @testi16(i16 %arg1, i16 %arg2, i16* %arg3) {
5
6 %s1 = add i16 %arg1, %arg2
7 %s2 = zext i16 %arg1 to i32
8 %s3 = load i16, i16* %arg3
9 store i16 %arg2, i16* %arg3
10 %c = icmp eq i16 %arg1, %arg2
11
12 ret void
13 ; CHECK: cost of 1 {{.*}} add
14 ; CHECK: cost of 1 {{.*}} zext
15 ; CHECK: cost of 1 {{.*}} load
16 ; CHECK: cost of 1 {{.*}} store
17 ; CHECK: cost of 1 {{.*}} icmp
18 ; CHECK-P9: cost of 1 {{.*}} add
19 ; CHECK-P9: cost of 1 {{.*}} zext
20 ; CHECK-P9: cost of 1 {{.*}} load
21 ; CHECK-P9: cost of 1 {{.*}} store
22 ; CHECK-P9: cost of 1 {{.*}} icmp
23 }
24
25 define void @test4xi16(<4 x i16> %arg1, <4 x i16> %arg2) {
26
27 %v1 = add <4 x i16> %arg1, %arg2
28 %v2 = zext <4 x i16> %arg1 to <4 x i32>
29 %v3 = shufflevector <4 x i16> %arg1, <4 x i16> undef, <4 x i32> zeroinitializer
30 %c = icmp eq <4 x i16> %arg1, %arg2
31
32 ret void
33 ; CHECK: cost of 1 {{.*}} add
34 ; CHECK: cost of 1 {{.*}} zext
35 ; CHECK: cost of 1 {{.*}} shufflevector
36 ; CHECK: cost of 1 {{.*}} icmp
37 ; CHECK-P9: cost of 2 {{.*}} add
38 ; CHECK-P9: cost of 2 {{.*}} zext
39 ; CHECK-P9: cost of 2 {{.*}} shufflevector
40 ; CHECK-P9: cost of 2 {{.*}} icmp
41 }
42
43 define void @test4xi32(<4 x i32> %arg1, <4 x i32> %arg2, <4 x i32>* %arg3) {
44
45 %v1 = load <4 x i32>, <4 x i32>* %arg3
46 store <4 x i32> %arg2, <4 x i32>* %arg3
47
48 ret void
49 ; CHECK: cost of 1 {{.*}} load
50 ; CHECK: cost of 1 {{.*}} store
51 ; CHECK-P9: cost of 2 {{.*}} load
52 ; CHECK-P9: cost of 2 {{.*}} store
53 }
54
55 define void @test2xdouble(<2 x double> %arg1) {
56 %v1 = extractelement <2 x double> %arg1, i32 0
57 %v2 = extractelement <2 x double> %arg1, i32 1
58
59 ret void
60 ; CHECK: cost of 0 {{.*}} extractelement
61 ; CHECK: cost of 1 {{.*}} extractelement
62 ; CHECK-P9: cost of 0 {{.*}} extractelement
63 ; CHECK-P9: cost of 2 {{.*}} extractelement
64 ; CHECK-LE-LABEL: test2xdouble
65 ; CHECK-LE: cost of 2 {{.*}} extractelement
66 ; CHECK-LE: cost of 0 {{.*}} extractelement
67 }
0 ; RUN: opt -S -mtriple=powerpc64-linux-gnu -mcpu=pwr9 -mattr=+vsx -slp-vectorizer < %s | FileCheck %s --check-prefix=CHECK-P9
1 ; RUN: opt -S -mtriple=powerpc64-linux-gnu -mcpu=pwr8 -mattr=+vsx -slp-vectorizer < %s | FileCheck %s --check-prefix=CHECK-P8
2
3 %struct._pp = type { i16, i16, i16, i16 }
4
5 ; Function Attrs: norecurse nounwind readonly
6 define [5 x double] @foo(double %k, i64 %n, %struct._pp* nocapture readonly %p) local_unnamed_addr #0 {
7 entry:
8 %cmp17 = icmp sgt i64 %n, 0
9 br i1 %cmp17, label %for.body, label %for.cond.cleanup
10
11 for.cond.cleanup: ; preds = %for.body, %entry
12 %retval.sroa.0.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
13 %retval.sroa.4.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add10, %for.body ]
14 %.fca.0.insert = insertvalue [5 x double] undef, double %retval.sroa.0.0.lcssa, 0
15 %.fca.1.insert = insertvalue [5 x double] %.fca.0.insert, double %retval.sroa.4.0.lcssa, 1
16 ret [5 x double] %.fca.1.insert
17
18 for.body: ; preds = %entry, %for.body
19 %i.020 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
20 %retval.sroa.4.019 = phi double [ %add10, %for.body ], [ 0.000000e+00, %entry ]
21 %retval.sroa.0.018 = phi double [ %add, %for.body ], [ 0.000000e+00, %entry ]
22 %r1 = getelementptr inbounds %struct._pp, %struct._pp* %p, i64 %i.020, i32 2
23 %0 = load i16, i16* %r1, align 2
24 %conv2 = uitofp i16 %0 to double
25 %mul = fmul double %conv2, %k
26 %add = fadd double %retval.sroa.0.018, %mul
27 %g5 = getelementptr inbounds %struct._pp, %struct._pp* %p, i64 %i.020, i32 1
28 %1 = load i16, i16* %g5, align 2
29 %conv7 = uitofp i16 %1 to double
30 %mul8 = fmul double %conv7, %k
31 %add10 = fadd double %retval.sroa.4.019, %mul8
32 %inc = add nuw nsw i64 %i.020, 1
33 %exitcond = icmp eq i64 %inc, %n
34 br i1 %exitcond, label %for.cond.cleanup, label %for.body
35 }
36
37 ; CHECK-P8: load <2 x i16>
38 ; CHECK-P9-NOT: load <2 x i16>