llvm.org GIT mirror llvm / 9c6b24c
[X86] Update TTI costs for arithmetic instructions on the X86\SLM architecture. Updated instructions: pmulld, pmullw, pmulhw, mulsd, mulps, mulpd, divss, divps, divsd, divpd, addpd and subpd. Adds a special optimization case that replaces pmulld with a pmullw\pmulhw\pshuf sequence when the real operand bit width is <= 16. Differential Revision: https://reviews.llvm.org/D28104 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@291657 91177308-0d34-0410-b5e6-96231b3b80d8 Mohammed Agabaria 2 years ago
23 changed file(s) with 616 addition(s) and 29 deletion(s). Raw diff Collapse all Expand all
517517 unsigned getMaxInterleaveFactor(unsigned VF) const;
518518
519519 /// \return The expected cost of arithmetic ops, such as mul, xor, fsub, etc.
520 /// \p Args is an optional argument which holds the instruction operand
521 /// values so the TTI can analyze those values, searching for special
522 /// cases or optimizations based on those values.
520523 int getArithmeticInstrCost(
521524 unsigned Opcode, Type *Ty, OperandValueKind Opd1Info = OK_AnyValue,
522525 OperandValueKind Opd2Info = OK_AnyValue,
523526 OperandValueProperties Opd1PropInfo = OP_None,
524 OperandValueProperties Opd2PropInfo = OP_None) const;
527 OperandValueProperties Opd2PropInfo = OP_None,
528 ArrayRef Args = ArrayRef()) const;
525529
526530 /// \return The cost of a shuffle instruction of kind Kind and of type Tp.
527531 /// The index and subtype parameters are used by the subvector insertion and
762766 getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
763767 OperandValueKind Opd2Info,
764768 OperandValueProperties Opd1PropInfo,
765 OperandValueProperties Opd2PropInfo) = 0;
769 OperandValueProperties Opd2PropInfo,
770 ArrayRef Args) = 0;
766771 virtual int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
767772 Type *SubTp) = 0;
768773 virtual int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) = 0;
983988 getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
984989 OperandValueKind Opd2Info,
985990 OperandValueProperties Opd1PropInfo,
986 OperandValueProperties Opd2PropInfo) override {
991 OperandValueProperties Opd2PropInfo,
992 ArrayRef Args) override {
987993 return Impl.getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
988 Opd1PropInfo, Opd2PropInfo);
994 Opd1PropInfo, Opd2PropInfo, Args);
989995 }
990996 int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
991997 Type *SubTp) override {
305305 TTI::OperandValueKind Opd1Info,
306306 TTI::OperandValueKind Opd2Info,
307307 TTI::OperandValueProperties Opd1PropInfo,
308 TTI::OperandValueProperties Opd2PropInfo) {
308 TTI::OperandValueProperties Opd2PropInfo,
309 ArrayRef Args) {
309310 return 1;
310311 }
311312
426427 return VF;
427428 }
428429 protected:
430 // Returns the minimum number of bits needed to hold \p Val without its sign bit, and sets \p isSigned to whether the value may be negative.
431 // For a constant vector the result is the largest of the per-element minimum sizes.
432 unsigned minRequiredElementSize(const Value* Val, bool &isSigned) {
433 if (isa<ConstantDataVector>(Val) || isa<ConstantVector>(Val)) {
434 const auto* VectorValue = cast<Constant>(Val);
435
436 // In case of a vector need to pick the max between the min
437 // required size for each element
438 auto *VT = cast<VectorType>(Val->getType());
439
440 // Assume unsigned elements until a negative element is seen.
441 isSigned = false;
442
443 // The max required size is the total vector width divided by num
444 // of elements in the vector
445 unsigned MaxRequiredSize = VT->getBitWidth() / VT->getNumElements();
446
447 unsigned MinRequiredSize = 0;
448 for(unsigned i = 0, e = VT->getNumElements(); i < e; ++i) {
449 if (auto* IntElement =
450 dyn_cast<ConstantInt>(VectorValue->getAggregateElement(i))) {
451 bool signedElement = IntElement->getValue().isNegative();
452 // Get the element min required size (the sign bit is not counted).
453 unsigned ElementMinRequiredSize =
454 IntElement->getValue().getMinSignedBits() - 1;
455 // In case one element is signed then all the vector is signed.
456 isSigned |= signedElement;
457 // Save the max required bit size between all the elements.
458 MinRequiredSize = std::max(MinRequiredSize, ElementMinRequiredSize);
459 }
460 else {
461 // Not an integer constant element: fall back to the full element width.
462 return MaxRequiredSize;
463 }
464 }
465 return MinRequiredSize;
466 }
467
468 if (const auto* CI = dyn_cast<ConstantInt>(Val)) { // scalar integer constant
469 isSigned = CI->getValue().isNegative();
470 return CI->getValue().getMinSignedBits() - 1;
471 }
472
473 if (const auto* Cast = dyn_cast<SExtInst>(Val)) { // sign extension: source width minus the sign bit
474 isSigned = true;
475 return Cast->getSrcTy()->getScalarSizeInBits() - 1;
476 }
477
478 if (const auto* Cast = dyn_cast<ZExtInst>(Val)) { // zero extension: full source width, never negative
479 isSigned = false;
480 return Cast->getSrcTy()->getScalarSizeInBits();
481 }
482
483 isSigned = false; // nothing known about the value: report the full scalar width as unsigned
484 return Val->getType()->getScalarSizeInBits();
485 }
486
429487 bool isStridedAccess(const SCEV *Ptr) {
430488 return Ptr && isa(Ptr);
431489 }
307307 TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
308308 TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
309309 TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
310 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None) {
310 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
311 ArrayRef Args = ArrayRef()) {
311312 // Check if any of the operands are vector operands.
312313 const TargetLoweringBase *TLI = getTLI();
313314 int ISD = TLI->InstructionOpcodeToISD(Opcode);
437437 getOperandInfo(I->getOperand(0));
438438 TargetTransformInfo::OperandValueKind Op2VK =
439439 getOperandInfo(I->getOperand(1));
440 SmallVector Operands(I->operand_values());
440441 return TTI->getArithmeticInstrCost(I->getOpcode(), I->getType(), Op1VK,
441 Op2VK);
442 Op2VK, TargetTransformInfo::OP_None,
443 TargetTransformInfo::OP_None,
444 Operands);
442445 }
443446 case Instruction::Select: {
444447 const SelectInst *SI = cast(I);
276276 int TargetTransformInfo::getArithmeticInstrCost(
277277 unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
278278 OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo,
279 OperandValueProperties Opd2PropInfo) const {
279 OperandValueProperties Opd2PropInfo,
280 ArrayRef Args) const {
280281 int Cost = TTIImpl->getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
281 Opd1PropInfo, Opd2PropInfo);
282 Opd1PropInfo, Opd2PropInfo, Args);
282283 assert(Cost >= 0 && "TTI should not produce negative costs!");
283284 return Cost;
284285 }
373373 int AArch64TTIImpl::getArithmeticInstrCost(
374374 unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
375375 TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
376 TTI::OperandValueProperties Opd2PropInfo) {
376 TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args) {
377377 // Legalize the type.
378378 std::pair LT = TLI->getTypeLegalizationCost(DL, Ty);
379379
101101 TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
102102 TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
103103 TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
104 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
104 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
105 ArrayRef Args = ArrayRef());
105106
106107 int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr);
107108
109109 int AMDGPUTTIImpl::getArithmeticInstrCost(
110110 unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
111111 TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
112 TTI::OperandValueProperties Opd2PropInfo) {
112 TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args ) {
113113
114114 EVT OrigTy = TLI->getValueType(DL, Ty);
115115 if (!OrigTy.isSimple()) {
8282 TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
8383 TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
8484 TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
85 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
85 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
86 ArrayRef Args = ArrayRef());
8687
8788 unsigned getCFInstrCost(unsigned Opcode);
8889
432432 int ARMTTIImpl::getArithmeticInstrCost(
433433 unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
434434 TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
435 TTI::OperandValueProperties Opd2PropInfo) {
435 TTI::OperandValueProperties Opd2PropInfo,
436 ArrayRef Args) {
436437
437438 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
438439 std::pair LT = TLI->getTypeLegalizationCost(DL, Ty);
113113 TTI::OperandValueKind Op1Info = TTI::OK_AnyValue,
114114 TTI::OperandValueKind Op2Info = TTI::OK_AnyValue,
115115 TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
116 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
116 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
117 ArrayRef Args = ArrayRef());
117118
118119 int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
119120 unsigned AddressSpace);
5353 TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
5454 TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
5555 TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
56 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None) {
56 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
57 ArrayRef Args = ArrayRef()) {
5758 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5859
5960 switch (ISD) {
114114 int NVPTXTTIImpl::getArithmeticInstrCost(
115115 unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
116116 TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
117 TTI::OperandValueProperties Opd2PropInfo) {
117 TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args) {
118118 // Legalize the type.
119119 std::pair LT = TLI->getTypeLegalizationCost(DL, Ty);
120120
5353 TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
5454 TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
5555 TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
56 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
56 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
57 ArrayRef Args = ArrayRef());
5758
5859 void getUnrollingPreferences(Loop *L, TTI::UnrollingPreferences &UP);
5960 };
280280 int PPCTTIImpl::getArithmeticInstrCost(
281281 unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
282282 TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
283 TTI::OperandValueProperties Opd2PropInfo) {
283 TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args) {
284284 assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
285285
286286 // Fallback to the default implementation.
7070 TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
7171 TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
7272 TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
73 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
73 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
74 ArrayRef Args = ArrayRef());
7475 int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
7576 int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
7677 int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
4545 unsigned WebAssemblyTTIImpl::getArithmeticInstrCost(
4646 unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
4747 TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
48 TTI::OperandValueProperties Opd2PropInfo) {
48 TTI::OperandValueProperties Opd2PropInfo, ArrayRef Args) {
4949
5050 unsigned Cost = BasicTTIImplBase::getArithmeticInstrCost(
5151 Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
6060 TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
6161 TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
6262 TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
63 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
63 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
64 ArrayRef Args = ArrayRef());
6465 unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
6566
6667 /// @}
113113 }
114114
115115 int X86TTIImpl::getArithmeticInstrCost(
116 unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
117 TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
118 TTI::OperandValueProperties Opd2PropInfo) {
116 unsigned Opcode, Type *Ty,
117 TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
118 TTI::OperandValueProperties Opd1PropInfo,
119 TTI::OperandValueProperties Opd2PropInfo,
120 ArrayRef Args) {
119121 // Legalize the type.
120122 std::pair LT = TLI->getTypeLegalizationCost(DL, Ty);
121123
122124 int ISD = TLI->InstructionOpcodeToISD(Opcode);
123125 assert(ISD && "Invalid opcode");
126
127 static const CostTblEntry SLMCostTable[] = {
128 { ISD::MUL, MVT::v4i32, 11 }, // pmulld
129 { ISD::MUL, MVT::v8i16, 2 }, // pmullw
130 { ISD::MUL, MVT::v16i8, 14 }, // extend/pmullw/trunc sequence.
131 { ISD::FMUL, MVT::f64, 2 }, // mulsd
132 { ISD::FMUL, MVT::v2f64, 4 }, // mulpd
133 { ISD::FMUL, MVT::v4f32, 2 }, // mulps
134 { ISD::FDIV, MVT::f32, 17 }, // divss
135 { ISD::FDIV, MVT::v4f32, 39 }, // divps
136 { ISD::FDIV, MVT::f64, 32 }, // divsd
137 { ISD::FDIV, MVT::v2f64, 69 }, // divpd
138 { ISD::FADD, MVT::v2f64, 2 }, // addpd
139 { ISD::FSUB, MVT::v2f64, 2 }, // subpd
140 // v2i64/v4i64 mul is custom lowered as a series of long
141 // multiplies(3), shifts(3) and adds(2).
142 // slm muldq version throughput is 2
143 { ISD::MUL, MVT::v2i64, 11 },
144 };
145
146 if (ST->isSLM()) {
147 if (Args.size() == 2 && ISD == ISD::MUL && LT.second == MVT::v4i32) {
148 // Check if the operands can be shrunk into a smaller datatype.
149 bool Op1Signed = false;
150 unsigned Op1MinSize = BaseT::minRequiredElementSize(Args[0], Op1Signed);
151 bool Op2Signed = false;
152 unsigned Op2MinSize = BaseT::minRequiredElementSize(Args[1], Op2Signed);
153
154 bool signedMode = Op1Signed | Op2Signed;
155 unsigned OpMinSize = std::max(Op1MinSize, Op2MinSize);
156
157 if (OpMinSize <= 7)
158 return LT.first * 3; // pmullw/sext
159 if (!signedMode && OpMinSize <= 8)
160 return LT.first * 3; // pmullw/zext
161 if (OpMinSize <= 15)
162 return LT.first * 5; // pmullw/pmulhw/pshuf
163 if (!signedMode && OpMinSize <= 16)
164 return LT.first * 5; // pmullw/pmulhw/pshuf
165 }
166 if (const auto *Entry = CostTableLookup(SLMCostTable, ISD,
167 LT.second)) {
168 return LT.first * Entry->Cost;
169 }
170 }
124171
125172 if (ISD == ISD::SDIV &&
126173 Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
5959 TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
6060 TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
6161 TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
62 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None);
62 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
63 ArrayRef Args = ArrayRef());
6364 int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
6465 int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
6566 int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
7979 #include "llvm/IR/Module.h"
8080 #include "llvm/IR/PatternMatch.h"
8181 #include "llvm/IR/Type.h"
82 #include "llvm/IR/User.h"
8283 #include "llvm/IR/Value.h"
8384 #include "llvm/IR/ValueHandle.h"
8485 #include "llvm/IR/Verifier.h"
69486949 } else if (Legal->isUniform(Op2)) {
69496950 Op2VK = TargetTransformInfo::OK_UniformValue;
69506951 }
6951
6952 return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK,
6953 Op1VP, Op2VP);
6952 SmallVector Operands(I->operand_values());
6953 return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK,
6954 Op2VK, Op1VP, Op2VP, Operands);
69546955 }
69556956 case Instruction::Select: {
69566957 SelectInst *SI = cast(I);
0 ; RUN: opt < %s -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefix=SLM
1
2 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
3 target triple = "x86_64-unknown-linux-gnu"
4
5 ; 8bit mul
6 define i8 @slm-costs_8_scalar_mul(i8 %a, i8 %b) {
7 entry:
8 ; SLM: cost of 1 {{.*}} mul nsw i8
9 %res = mul nsw i8 %a, %b
10 ret i8 %res
11 }
12
13 define <2 x i8> @slm-costs_8_v2_mul(<2 x i8> %a, <2 x i8> %b) {
14 entry:
15 ; SLM: cost of 11 {{.*}} mul nsw <2 x i8>
16 %res = mul nsw <2 x i8> %a, %b
17 ret <2 x i8> %res
18 }
19
20 define <4 x i8> @slm-costs_8_v4_mul(<4 x i8> %a, <4 x i8> %b) {
21 entry:
22 ; SLM: cost of 3 {{.*}} mul nsw <4 x i8>
23 %res = mul nsw <4 x i8> %a, %b
24 ret <4 x i8> %res
25 }
26
27 define <4 x i32> @slm-costs_8_v4_zext_mul(<4 x i8> %a) {
28 entry:
29 ; SLM: cost of 3 {{.*}} mul nsw <4 x i32>
30 %zext = zext <4 x i8> %a to <4 x i32>
31 %res = mul nsw <4 x i32> %zext,
32 ret <4 x i32> %res
33 }
34
35 define <4 x i32> @slm-costs_8_v4_zext_mul_fail(<4 x i8> %a) {
36 entry:
37 ; SLM: cost of 5 {{.*}} mul nsw <4 x i32>
38 %zext = zext <4 x i8> %a to <4 x i32>
39 %res = mul nsw <4 x i32> %zext,
40 ret <4 x i32> %res
41 }
42
43 define <4 x i32> @slm-costs_8_v4_zext_mul_fail_2(<4 x i8> %a) {
44 entry:
45 ; SLM: cost of 5 {{.*}} mul nsw <4 x i32>
46 %zext = zext <4 x i8> %a to <4 x i32>
47 %res = mul nsw <4 x i32> %zext,
48 ret <4 x i32> %res
49 }
50
51 define <4 x i32> @slm-costs_8_v4_sext_mul(<4 x i8> %a) {
52 entry:
53 ; SLM: cost of 3 {{.*}} mul nsw <4 x i32>
54 %sext = sext <4 x i8> %a to <4 x i32>
55 %res = mul nsw <4 x i32> %sext,
56 ret <4 x i32> %res
57 }
58
59 define <4 x i32> @slm-costs_8_v4_sext_mul_fail(<4 x i8> %a) {
60 entry:
61 ; SLM: cost of 5 {{.*}} mul nsw <4 x i32>
62 %sext = sext <4 x i8> %a to <4 x i32>
63 %res = mul nsw <4 x i32> %sext,
64 ret <4 x i32> %res
65 }
66
67 define <4 x i32> @slm-costs_8_v4_sext_mul_fail_2(<4 x i8> %a) {
68 entry:
69 ; SLM: cost of 5 {{.*}} mul nsw <4 x i32>
70 %sext = sext <4 x i8> %a to <4 x i32>
71 %res = mul nsw <4 x i32> %sext,
72 ret <4 x i32> %res
73 }
74
75 define <8 x i8> @slm-costs_8_v8_mul(<8 x i8> %a, <8 x i8> %b) {
76 entry:
77 ; SLM: cost of 2 {{.*}} mul nsw <8 x i8>
78 %res = mul nsw <8 x i8> %a, %b
79 ret <8 x i8> %res
80 }
81
82 define <16 x i8> @slm-costs_8_v16_mul(<16 x i8> %a, <16 x i8> %b) {
83 entry:
84 ; SLM: cost of 14 {{.*}} mul nsw <16 x i8>
85 %res = mul nsw <16 x i8> %a, %b
86 ret <16 x i8> %res
87 }
88
89 ; 16bit mul
90 define i16 @slm-costs_16_scalar_mul(i16 %a, i16 %b) {
91 entry:
92 ; SLM: cost of 1 {{.*}} mul nsw i16
93 %res = mul nsw i16 %a, %b
94 ret i16 %res
95 }
96
97 define <2 x i16> @slm-costs_16_v2_mul(<2 x i16> %a, <2 x i16> %b) {
98 entry:
99 ; SLM: cost of 11 {{.*}} mul nsw <2 x i16>
100 %res = mul nsw <2 x i16> %a, %b
101 ret <2 x i16> %res
102 }
103
104 define <4 x i16> @slm-costs_16_v4_mul(<4 x i16> %a, <4 x i16> %b) {
105 entry:
106 ; SLM: cost of 5 {{.*}} mul nsw <4 x i16>
107 %res = mul nsw <4 x i16> %a, %b
108 ret <4 x i16> %res
109 }
110
111 define <4 x i32> @slm-costs_16_v4_zext_mul(<4 x i16> %a) {
112 entry:
113 ; SLM: cost of 5 {{.*}} mul nsw <4 x i32>
114 %zext = zext <4 x i16> %a to <4 x i32>
115 %res = mul nsw <4 x i32> %zext,
116 ret <4 x i32> %res
117 }
118
119 define <4 x i32> @slm-costs_16_v4_zext_mul_fail(<4 x i16> %a) {
120 entry:
121 ; SLM: cost of 11 {{.*}} mul nsw <4 x i32>
122 %zext = zext <4 x i16> %a to <4 x i32>
123 %res = mul nsw <4 x i32> %zext,
124 ret <4 x i32> %res
125 }
126
127 define <4 x i32> @slm-costs_16_v4_zext_mul_fail_2(<4 x i16> %a) {
128 entry:
129 ; SLM: cost of 11 {{.*}} mul nsw <4 x i32>
130 %zext = zext <4 x i16> %a to <4 x i32>
131 %res = mul nsw <4 x i32> %zext,
132 ret <4 x i32> %res
133 }
134
135 define <4 x i32> @slm-costs_16_v4_sext_mul(<4 x i16> %a) {
136 entry:
137 ; SLM: cost of 5 {{.*}} mul nsw <4 x i32>
138 %sext = sext <4 x i16> %a to <4 x i32>
139 %res = mul nsw <4 x i32> %sext,
140 ret <4 x i32> %res
141 }
142
143 define <4 x i32> @slm-costs_16_v4_sext_mul_fail(<4 x i16> %a) {
144 entry:
145 ; SLM: cost of 11 {{.*}} mul nsw <4 x i32>
146 %sext = sext <4 x i16> %a to <4 x i32>
147 %res = mul nsw <4 x i32> %sext,
148 ret <4 x i32> %res
149 }
150
151 define <4 x i32> @slm-costs_16_v4_sext_mul_fail_2(<4 x i16> %a) {
152 entry:
153 ; SLM: cost of 11 {{.*}} mul nsw <4 x i32>
154 %sext = sext <4 x i16> %a to <4 x i32>
155 %res = mul nsw <4 x i32> %sext,
156 ret <4 x i32> %res
157 }
158
159 define <8 x i16> @slm-costs_16_v8_mul(<8 x i16> %a, <8 x i16> %b) {
160 entry:
161 ; SLM: cost of 2 {{.*}} mul nsw <8 x i16>
162 %res = mul nsw <8 x i16> %a, %b
163 ret <8 x i16> %res
164 }
165
166 define <16 x i16> @slm-costs_16_v16_mul(<16 x i16> %a, <16 x i16> %b) {
167 entry:
168 ; SLM: cost of 4 {{.*}} mul nsw <16 x i16>
169 %res = mul nsw <16 x i16> %a, %b
170 ret <16 x i16> %res
171 }
172
173 ; 32bit mul
174 define i32 @slm-costs_32_scalar_mul(i32 %a, i32 %b) {
175 entry:
176 ; SLM: cost of 1 {{.*}} mul nsw i32
177 %res = mul nsw i32 %a, %b
178 ret i32 %res
179 }
180
181 define <2 x i32> @slm-costs_32_v2_mul(<2 x i32> %a, <2 x i32> %b) {
182 entry:
183 ; SLM: cost of 11 {{.*}} mul nsw <2 x i32>
184 %res = mul nsw <2 x i32> %a, %b
185 ret <2 x i32> %res
186 }
187
188 define <4 x i32> @slm-costs_32_v4_mul(<4 x i32> %a, <4 x i32> %b) {
189 entry:
190 ; SLM: cost of 11 {{.*}} mul nsw <4 x i32>
191 %res = mul nsw <4 x i32> %a, %b
192 ret <4 x i32> %res
193 }
194
195 define <8 x i32> @slm-costs_32_v8_mul(<8 x i32> %a, <8 x i32> %b) {
196 entry:
197 ; SLM: cost of 22 {{.*}} mul nsw <8 x i32>
198 %res = mul nsw <8 x i32> %a, %b
199 ret <8 x i32> %res
200 }
201
202 define <16 x i32> @slm-costs_32_v16_mul(<16 x i32> %a, <16 x i32> %b) {
203 entry:
204 ; SLM: cost of 44 {{.*}} mul nsw <16 x i32>
205 %res = mul nsw <16 x i32> %a, %b
206 ret <16 x i32> %res
207 }
208
209 ; 64bit mul
210 define i64 @slm-costs_64_scalar_mul(i64 %a, i64 %b) {
211 entry:
212 ; SLM: cost of 1 {{.*}} mul nsw i64
213 %res = mul nsw i64 %a, %b
214 ret i64 %res
215 }
216
217 define <2 x i64> @slm-costs_64_v2_mul(<2 x i64> %a, <2 x i64> %b) {
218 entry:
219 ; SLM: cost of 11 {{.*}} mul nsw <2 x i64>
220 %res = mul nsw <2 x i64> %a, %b
221 ret <2 x i64> %res
222 }
223
224 define <4 x i64> @slm-costs_64_v4_mul(<4 x i64> %a, <4 x i64> %b) {
225 entry:
226 ; SLM: cost of 22 {{.*}} mul nsw <4 x i64>
227 %res = mul nsw <4 x i64> %a, %b
228 ret <4 x i64> %res
229 }
230
231 define <8 x i64> @slm-costs_64_v8_mul(<8 x i64> %a, <8 x i64> %b) {
232 entry:
233 ; SLM: cost of 44 {{.*}} mul nsw <8 x i64>
234 %res = mul nsw <8 x i64> %a, %b
235 ret <8 x i64> %res
236 }
237
238 define <16 x i64> @slm-costs_64_v16_mul(<16 x i64> %a, <16 x i64> %b) {
239 entry:
240 ; SLM: cost of 88 {{.*}} mul nsw <16 x i64>
241 %res = mul nsw <16 x i64> %a, %b
242 ret <16 x i64> %res
243 }
244
245 ; mulsd
246 define double @slm-costs_mulsd(double %a, double %b) {
247 entry:
248 ; SLM: cost of 2 {{.*}} fmul double
249 %res = fmul double %a, %b
250 ret double %res
251 }
252
253 ; mulpd
254 define <2 x double> @slm-costs_mulpd(<2 x double> %a, <2 x double> %b) {
255 entry:
256 ; SLM: cost of 4 {{.*}} fmul <2 x double>
257 %res = fmul <2 x double> %a, %b
258 ret <2 x double> %res
259 }
260
261 ; mulps
262 define <4 x float> @slm-costs_mulps(<4 x float> %a, <4 x float> %b) {
263 entry:
264 ; SLM: cost of 2 {{.*}} fmul <4 x float>
265 %res = fmul <4 x float> %a, %b
266 ret <4 x float> %res
267 }
268
269 ; divss
270 define float @slm-costs_divss(float %a, float %b) {
271 entry:
272 ; SLM: cost of 17 {{.*}} fdiv float
273 %res = fdiv float %a, %b
274 ret float %res
275 }
276
277 ; divps
278 define <4 x float> @slm-costs_divps(<4 x float> %a, <4 x float> %b) {
279 entry:
280 ; SLM: cost of 39 {{.*}} fdiv <4 x float>
281 %res = fdiv <4 x float> %a, %b
282 ret <4 x float> %res
283 }
284
285 ; divsd
286 define double @slm-costs_divsd(double %a, double %b) {
287 entry:
288 ; SLM: cost of 32 {{.*}} fdiv double
289 %res = fdiv double %a, %b
290 ret double %res
291 }
292
293 ; divpd
294 define <2 x double> @slm-costs_divpd(<2 x double> %a, <2 x double> %b) {
295 entry:
296 ; SLM: cost of 69 {{.*}} fdiv <2 x double>
297 %res = fdiv <2 x double> %a, %b
298 ret <2 x double> %res
299 }
300
301 ; addpd
302 define <2 x double> @slm-costs_addpd(<2 x double> %a, <2 x double> %b) {
303 entry:
304 ; SLM: cost of 2 {{.*}} fadd <2 x double>
305 %res = fadd <2 x double> %a, %b
306 ret <2 x double> %res
307 }
308
309 ; subpd
310 define <2 x double> @slm-costs_subpd(<2 x double> %a, <2 x double> %b) {
311 entry:
312 ; SLM: cost of 2 {{.*}} fsub <2 x double>
313 %res = fsub <2 x double> %a, %b
314 ret <2 x double> %res
315 }
316
0 ; RUN: opt < %s -S -debug -loop-vectorize -mcpu=slm 2>&1 | FileCheck %s --check-prefix=SLM
1
2 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
3 target triple = "x86_64-unknown-linux-gnu"
4
5 define i8 @mul_i8(i8* %dataA, i8* %dataB, i32 %N) {
6 entry:
7 %cmp12 = icmp eq i32 %N, 0
8 br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader
9
10 for.body.preheader: ; preds = %entry
11 %wide.trip.count = zext i32 %N to i64
12 br label %for.body
13
14 for.cond.cleanup.loopexit: ; preds = %for.body
15 %phitmp = trunc i32 %add4 to i8
16 br label %for.cond.cleanup
17
18 for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
19 %acc.0.lcssa = phi i8 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ]
20 ret i8 %acc.0.lcssa
21
22 for.body: ; preds = %for.body.preheader, %for.body
23 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
24 %acc.013 = phi i32 [ %add4, %for.body ], [ 0, %for.body.preheader ]
25 %arrayidx = getelementptr inbounds i8, i8* %dataA, i64 %indvars.iv
26 %0 = load i8, i8* %arrayidx, align 1
27 %conv = sext i8 %0 to i32
28 %arrayidx2 = getelementptr inbounds i8, i8* %dataB, i64 %indvars.iv
29 %1 = load i8, i8* %arrayidx2, align 1
30 %conv3 = sext i8 %1 to i32
31 ; sources of the mul is sext\sext from i8
32 ; use pmullw\sext seq.
33 ; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32
34 %mul = mul nsw i32 %conv3, %conv
35 ; sources of the mul is zext\sext from i8
36 ; use pmulhw\pmullw\pshuf
37 ; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
38 %conv4 = zext i8 %1 to i32
39 %mul2 = mul nsw i32 %conv4, %conv
40 %sum0 = add i32 %mul, %mul2
41 ; sources of the mul is zext\zext from i8
42 ; use pmullw\zext
43 ; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32
44 %conv5 = zext i8 %0 to i32
45 %mul3 = mul nsw i32 %conv5, %conv4
46 %sum1 = add i32 %sum0, %mul3
47 ; sources of the mul is sext\-120
48 ; use pmullw\sext
49 ; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32
50 %mul4 = mul nsw i32 -120, %conv3
51 %sum2 = add i32 %sum1, %mul4
52 ; sources of the mul is sext\250
53 ; use pmulhw\pmullw\pshuf
54 ; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
55 %mul5 = mul nsw i32 250, %conv3
56 %sum3 = add i32 %sum2, %mul5
57 ; sources of the mul is zext\-120
58 ; use pmulhw\pmullw\pshuf
59 ; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
60 %mul6 = mul nsw i32 -120, %conv4
61 %sum4 = add i32 %sum3, %mul6
62 ; sources of the mul is zext\250
63 ; use pmullw\zext
64 ; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32
65 %mul7 = mul nsw i32 250, %conv4
66 %sum5 = add i32 %sum4, %mul7
67 %add = add i32 %acc.013, 5
68 %add4 = add i32 %add, %sum5
69 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
70 %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
71 br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
72 }
73
74 define i16 @mul_i16(i16* %dataA, i16* %dataB, i32 %N) {
75 entry:
76 %cmp12 = icmp eq i32 %N, 0
77 br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader
78
79 for.body.preheader: ; preds = %entry
80 %wide.trip.count = zext i32 %N to i64
81 br label %for.body
82
83 for.cond.cleanup.loopexit: ; preds = %for.body
84 %phitmp = trunc i32 %add4 to i16
85 br label %for.cond.cleanup
86
87 for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
88 %acc.0.lcssa = phi i16 [ 0, %entry ], [ %phitmp, %for.cond.cleanup.loopexit ]
89 ret i16 %acc.0.lcssa
90
91 for.body: ; preds = %for.body.preheader, %for.body
92 %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
93 %acc.013 = phi i32 [ %add4, %for.body ], [ 0, %for.body.preheader ]
94 %arrayidx = getelementptr inbounds i16, i16* %dataA, i64 %indvars.iv
95 %0 = load i16, i16* %arrayidx, align 1
96 %conv = sext i16 %0 to i32
97 %arrayidx2 = getelementptr inbounds i16, i16* %dataB, i64 %indvars.iv
98 %1 = load i16, i16* %arrayidx2, align 1
99 %conv3 = sext i16 %1 to i32
100 ; sources of the mul are sext\sext from i16
101 ; use pmulhw\pmullw\pshuf seq.
102 ; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
103 %mul = mul nsw i32 %conv3, %conv
104 ; sources of the mul are zext\sext from i16
105 ; use pmulld
106 ; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32
107 %conv4 = zext i16 %1 to i32
108 %mul2 = mul nsw i32 %conv4, %conv
109 %sum0 = add i32 %mul, %mul2
110 ; sources of the mul are zext\zext from i16
111 ; use pmulhw\pmullw\zext
112 ; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
113 %conv5 = zext i16 %0 to i32
114 %mul3 = mul nsw i32 %conv5, %conv4
115 %sum1 = add i32 %sum0, %mul3
116 ; sources of the mul are sext\-32000
117 ; use pmulhw\pmullw\sext
118 ; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
119 %mul4 = mul nsw i32 -32000, %conv3
120 %sum2 = add i32 %sum1, %mul4
121 ; sources of the mul are sext\64000
122 ; use pmulld
123 ; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32
124 %mul5 = mul nsw i32 64000, %conv3
125 %sum3 = add i32 %sum2, %mul5
126 ; sources of the mul are zext\-32000
127 ; use pmulld
128 ; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32
129 %mul6 = mul nsw i32 -32000, %conv4
130 %sum4 = add i32 %sum3, %mul6
131 ; sources of the mul are zext\64000 (unsigned, needs exactly 16 bits)
132 ; use pmulhw\pmullw\zext
133 ; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32
134 %mul7 = mul nsw i32 64000, %conv4
135 %sum5 = add i32 %sum4, %mul7
136 %add = add i32 %acc.013, 5
137 %add4 = add i32 %add, %sum5
138 %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
139 %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
140 br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
141 }
142
143