1//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file provides a helper that implements much of the TTI interface in
11/// terms of the target-independent code generator and TargetLowering
12/// interfaces.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17#define LLVM_CODEGEN_BASICTTIIMPL_H
18
19#include "llvm/ADT/APInt.h"
20#include "llvm/ADT/ArrayRef.h"
21#include "llvm/ADT/BitVector.h"
34#include "llvm/IR/BasicBlock.h"
35#include "llvm/IR/Constant.h"
36#include "llvm/IR/Constants.h"
37#include "llvm/IR/DataLayout.h"
39#include "llvm/IR/InstrTypes.h"
40#include "llvm/IR/Instruction.h"
42#include "llvm/IR/Intrinsics.h"
43#include "llvm/IR/Operator.h"
44#include "llvm/IR/Type.h"
45#include "llvm/IR/Value.h"
53#include <algorithm>
54#include <cassert>
55#include <cstdint>
56#include <limits>
57#include <optional>
58#include <utility>
59
60namespace llvm {
61
62class Function;
63class GlobalValue;
64class LLVMContext;
65class ScalarEvolution;
66class SCEV;
67class TargetMachine;
68
69extern cl::opt<unsigned> PartialUnrollingThreshold;
70
71/// Base class which can be used to help build a TTI implementation.
72///
73/// This class provides as much implementation of the TTI interface as is
74/// possible using the target independent parts of the code generator.
75///
76/// In order to subclass it, your class must implement a getST() method to
77/// return the subtarget, and a getTLI() method to return the target lowering.
78/// We need these methods implemented in the derived class so that this class
79/// doesn't have to duplicate storage for them.
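///
/// For illustration, a minimal target implementation might look like the
/// sketch below; MyTTIImpl, MySubtarget, MyTargetLowering and MyTargetMachine
/// are placeholder names, not types provided by this header:
/// \code
///   class MyTTIImpl : public BasicTTIImplBase<MyTTIImpl> {
///     using BaseT = BasicTTIImplBase<MyTTIImpl>;
///     friend BaseT;
///
///     const MySubtarget *ST;
///     const MyTargetLowering *TLI;
///
///     const MySubtarget *getST() const { return ST; }
///     const MyTargetLowering *getTLI() const { return TLI; }
///
///   public:
///     explicit MyTTIImpl(const MyTargetMachine *TM, const Function &F);
///   };
/// \endcode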
80template <typename T>
81class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
82private:
83 using BaseT = TargetTransformInfoImplCRTPBase<T>;
84 using TTI = TargetTransformInfo;
85
86 /// Helper function to access this as a T.
87 T *thisT() { return static_cast<T *>(this); }
88
89 /// Estimate a cost of Broadcast as an extract and sequence of insert
90 /// operations.
91 InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy,
92 TTI::TargetCostKind CostKind) {
93 InstructionCost Cost = 0;
94 // Broadcast cost is equal to the cost of extracting the zeroth element
95 // plus the cost of inserting it into every element of the result vector.
96 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
97 CostKind, 0, nullptr, nullptr);
98
99 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
100 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
101 CostKind, i, nullptr, nullptr);
102 }
103 return Cost;
104 }
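// For example, assuming the target reports a unit cost for each vector
// extract and insert, broadcasting a <4 x float> is modeled as one extract
// of element 0 plus four inserts, i.e. a cost of 5.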
105
106 /// Estimate a cost of shuffle as a sequence of extract and insert
107 /// operations.
108 InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy,
109 TTI::TargetCostKind CostKind) {
110 InstructionCost Cost = 0;
111 // Shuffle cost is equal to the cost of extracting each element from its
112 // source vector, plus the cost of inserting them into the result vector.
113
114 // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from
115 // index 0 of first vector, index 1 of second vector,index 2 of first
116 // vector and finally index 3 of second vector and insert them at index
117 // <0,1,2,3> of result vector.
118 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
119 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
120 CostKind, i, nullptr, nullptr);
121 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
122 CostKind, i, nullptr, nullptr);
123 }
124 return Cost;
125 }
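// For example, again assuming unit extract/insert costs, shuffling a
// <4 x float> with the mask <0,5,2,7> above is modeled as 4 extracts plus
// 4 inserts, i.e. a cost of 8.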
126
127 /// Estimate a cost of subvector extraction as a sequence of extract and
128 /// insert operations.
129 InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
130 TTI::TargetCostKind CostKind,
131 int Index,
132 FixedVectorType *SubVTy) {
133 assert(VTy && SubVTy &&
134 "Can only extract subvectors from vectors");
135 int NumSubElts = SubVTy->getNumElements();
136 assert((!isa<FixedVectorType>(VTy) ||
137 (Index + NumSubElts) <=
138 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
139 "SK_ExtractSubvector index out of range");
140
141 InstructionCost Cost = 0;
142 // Subvector extraction cost is equal to the cost of extracting each
143 // element from the source vector type plus the cost of inserting it into
144 // the result subvector type.
145 for (int i = 0; i != NumSubElts; ++i) {
146 Cost +=
147 thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
148 CostKind, i + Index, nullptr, nullptr);
149 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
150 CostKind, i, nullptr, nullptr);
151 }
152 return Cost;
153 }
154
155 /// Estimate a cost of subvector insertion as a sequence of extract and
156 /// insert operations.
157 InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
158 TTI::TargetCostKind CostKind,
159 int Index,
160 FixedVectorType *SubVTy) {
161 assert(VTy && SubVTy &&
162 "Can only insert subvectors into vectors");
163 int NumSubElts = SubVTy->getNumElements();
164 assert((!isa<FixedVectorType>(VTy) ||
165 (Index + NumSubElts) <=
166 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
167 "SK_InsertSubvector index out of range");
168
169 InstructionCost Cost = 0;
170 // Subvector insertion cost is equal to the cost of extracting each
171 // element from the subvector type plus the cost of inserting it into the
172 // result vector type.
173 for (int i = 0; i != NumSubElts; ++i) {
174 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
175 CostKind, i, nullptr, nullptr);
176 Cost +=
177 thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
178 i + Index, nullptr, nullptr);
179 }
180 return Cost;
181 }
182
183 /// Local query method delegates up to T which *must* implement this!
184 const TargetSubtargetInfo *getST() const {
185 return static_cast<const T *>(this)->getST();
186 }
187
188 /// Local query method delegates up to T which *must* implement this!
189 const TargetLoweringBase *getTLI() const {
190 return static_cast<const T *>(this)->getTLI();
191 }
192
193 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
194 switch (M) {
195 case TTI::MIM_Unindexed:
196 return ISD::UNINDEXED;
197 case TTI::MIM_PreInc:
198 return ISD::PRE_INC;
199 case TTI::MIM_PreDec:
200 return ISD::PRE_DEC;
201 case TTI::MIM_PostInc:
202 return ISD::POST_INC;
203 case TTI::MIM_PostDec:
204 return ISD::POST_DEC;
205 }
206 llvm_unreachable("Unexpected MemIndexedMode");
207 }
208
209 InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
210 Align Alignment,
211 bool VariableMask,
212 bool IsGatherScatter,
213 TTI::TargetCostKind CostKind,
214 unsigned AddressSpace = 0) {
215 // We cannot scalarize scalable vectors, so return Invalid.
216 if (isa<ScalableVectorType>(DataTy))
217 return InstructionCost::getInvalid();
218
219 auto *VT = cast<FixedVectorType>(DataTy);
220 unsigned VF = VT->getNumElements();
221
222 // Assume the target does not have support for gather/scatter operations
223 // and provide a rough estimate.
224 //
225 // First, compute the cost of the individual memory operations.
226 InstructionCost AddrExtractCost =
227 IsGatherScatter
228 ? getScalarizationOverhead(
229 FixedVectorType::get(
230 PointerType::get(VT->getElementType(), 0), VF),
231 /*Insert=*/false, /*Extract=*/true, CostKind)
232 : 0;
233
234 // The cost of the scalar loads/stores.
235 InstructionCost MemoryOpCost =
236 VF * thisT()->getMemoryOpCost(Opcode, VT->getElementType(), Alignment,
237 AddressSpace, CostKind);
238
239 // Next, compute the cost of packing the result in a vector.
240 InstructionCost PackingCost =
241 getScalarizationOverhead(VT, Opcode != Instruction::Store,
242 Opcode == Instruction::Store, CostKind);
243
244 InstructionCost ConditionalCost = 0;
245 if (VariableMask) {
246 // Compute the cost of conditionally executing the memory operations with
247 // variable masks. This includes extracting the individual conditions,
248 // the branches, and the PHIs needed to combine the results.
249 // NOTE: Estimating the cost of conditionally executing the memory
250 // operations accurately is quite difficult and the current solution
251 // provides a very rough estimate only.
252 ConditionalCost =
253 getScalarizationOverhead(
254 FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), VF),
255 /*Insert=*/false, /*Extract=*/true, CostKind) +
256 VF * (thisT()->getCFInstrCost(Instruction::Br, CostKind) +
257 thisT()->getCFInstrCost(Instruction::PHI, CostKind));
258 }
259
260 return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost;
261 }
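// As a rough worked example, treating every scalar load/store, vector
// extract/insert, branch and PHI as unit cost: a masked gather of <4 x i32>
// with a variable mask is modeled as 4 address extracts + 4 scalar loads +
// 4 result inserts + (4 mask extracts + 4 branches + 4 PHIs) = 24. Targets
// with native gather/scatter support are expected to override this.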
262
263protected:
264 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
265 : BaseT(DL) {}
266 virtual ~BasicTTIImplBase() = default;
267
268 using TargetTransformInfoImplBase::DL;
269
270public:
271 /// \name Scalar TTI Implementations
272 /// @{
273 bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
274 unsigned AddressSpace, Align Alignment,
275 unsigned *Fast) const {
276 EVT E = EVT::getIntegerVT(Context, BitWidth);
277 return getTLI()->allowsMisalignedMemoryAccesses(
278 E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
279 }
280
281 bool hasBranchDivergence(const Function *F = nullptr) { return false; }
282
283 bool isSourceOfDivergence(const Value *V) { return false; }
284
285 bool isAlwaysUniform(const Value *V) { return false; }
286
287 bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
288 return false;
289 }
290
291 bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
292 return true;
293 }
294
295 unsigned getFlatAddressSpace() {
296 // Return an invalid address space.
297 return -1;
298 }
299
300 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
301 Intrinsic::ID IID) const {
302 return false;
303 }
304
305 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
306 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
307 }
308
309 unsigned getAssumedAddrSpace(const Value *V) const {
310 return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
311 }
312
313 bool isSingleThreaded() const {
314 return getTLI()->getTargetMachine().Options.ThreadModel ==
315 ThreadModel::Single;
316 }
317
318 std::pair<const Value *, unsigned>
319 getPredicatedAddrSpace(const Value *V) const {
320 return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
321 }
322
323 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
324 Value *NewV) const {
325 return nullptr;
326 }
327
328 bool isLegalAddImmediate(int64_t imm) {
329 return getTLI()->isLegalAddImmediate(imm);
330 }
331
332 bool isLegalAddScalableImmediate(int64_t Imm) {
333 return getTLI()->isLegalAddScalableImmediate(Imm);
334 }
335
336 bool isLegalICmpImmediate(int64_t imm) {
337 return getTLI()->isLegalICmpImmediate(imm);
338 }
339
340 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
341 bool HasBaseReg, int64_t Scale, unsigned AddrSpace,
342 Instruction *I = nullptr,
343 int64_t ScalableOffset = 0) {
344 TargetLoweringBase::AddrMode AM;
345 AM.BaseGV = BaseGV;
346 AM.BaseOffs = BaseOffset;
347 AM.HasBaseReg = HasBaseReg;
348 AM.Scale = Scale;
349 AM.ScalableOffset = ScalableOffset;
350 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
351 }
352
353 int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) {
354 return getTLI()->getPreferredLargeGEPBaseOffset(MinOffset, MaxOffset);
355 }
356
357 unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
358 Type *ScalarValTy) const {
359 auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
360 auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
361 EVT VT = getTLI()->getValueType(DL, SrcTy);
362 if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
363 getTLI()->isOperationCustom(ISD::STORE, VT))
364 return true;
365
366 EVT ValVT =
367 getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
368 EVT LegalizedVT =
369 getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
370 return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
371 };
372 while (VF > 2 && IsSupportedByTarget(VF))
373 VF /= 2;
374 return VF;
375 }
376
377 bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
378 const DataLayout &DL) const {
379 EVT VT = getTLI()->getValueType(DL, Ty);
380 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
381 }
382
383 bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
384 const DataLayout &DL) const {
385 EVT VT = getTLI()->getValueType(DL, Ty);
386 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
387 }
388
391 }
392
395 }
396
400 }
401
404 }
405
406 InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
407 StackOffset BaseOffset, bool HasBaseReg,
408 int64_t Scale, unsigned AddrSpace) {
409 TargetLoweringBase::AddrMode AM;
410 AM.BaseGV = BaseGV;
411 AM.BaseOffs = BaseOffset.getFixed();
412 AM.HasBaseReg = HasBaseReg;
413 AM.Scale = Scale;
414 AM.ScalableOffset = BaseOffset.getScalable();
415 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
416 return 0;
417 return -1;
418 }
419
420 bool isTruncateFree(Type *Ty1, Type *Ty2) {
421 return getTLI()->isTruncateFree(Ty1, Ty2);
422 }
423
424 bool isProfitableToHoist(Instruction *I) {
425 return getTLI()->isProfitableToHoist(I);
426 }
427
428 bool useAA() const { return getST()->useAA(); }
429
430 bool isTypeLegal(Type *Ty) {
431 EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
432 return getTLI()->isTypeLegal(VT);
433 }
434
435 unsigned getRegUsageForType(Type *Ty) {
436 EVT ETy = getTLI()->getValueType(DL, Ty);
437 return getTLI()->getNumRegisters(Ty->getContext(), ETy);
438 }
439
440 InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
441 ArrayRef<const Value *> Operands, Type *AccessType,
442 TTI::TargetCostKind CostKind) {
443 return BaseT::getGEPCost(PointeeType, Ptr, Operands, AccessType, CostKind);
444 }
445
446 unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
447 unsigned &JumpTableSize,
448 ProfileSummaryInfo *PSI,
449 BlockFrequencyInfo *BFI) {
450 /// Try to find the estimated number of clusters. Note that the number of
451 /// clusters identified in this function could be different from the actual
452 /// numbers found in lowering. This function ignores switches that are
453 /// lowered with a mix of jump table / bit test / BTree. It was initially
454 /// intended to be used when estimating the cost of a switch in the inline
455 /// cost heuristic, but it's a generic cost model to be used in other
456 /// places (e.g., in loop unrolling).
457 unsigned N = SI.getNumCases();
458 const TargetLoweringBase *TLI = getTLI();
459 const DataLayout &DL = this->getDataLayout();
460
461 JumpTableSize = 0;
462 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
463
464 // Early exit if both a jump table and bit test are not allowed.
465 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
466 return N;
467
468 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
469 APInt MinCaseVal = MaxCaseVal;
470 for (auto CI : SI.cases()) {
471 const APInt &CaseVal = CI.getCaseValue()->getValue();
472 if (CaseVal.sgt(MaxCaseVal))
473 MaxCaseVal = CaseVal;
474 if (CaseVal.slt(MinCaseVal))
475 MinCaseVal = CaseVal;
476 }
477
478 // Check if suitable for a bit test
479 if (N <= DL.getIndexSizeInBits(0u)) {
480 SmallPtrSet<const BasicBlock *, 4> Dests;
481 for (auto I : SI.cases())
482 Dests.insert(I.getCaseSuccessor());
483
484 if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
485 DL))
486 return 1;
487 }
488
489 // Check if suitable for a jump table.
490 if (IsJTAllowed) {
491 if (N < 2 || N < TLI->getMinimumJumpTableEntries())
492 return N;
493 uint64_t Range =
494 (MaxCaseVal - MinCaseVal)
495 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
496 // Check whether a range of clusters is dense enough for a jump table
497 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
498 JumpTableSize = Range;
499 return 1;
500 }
501 }
502 return N;
503 }
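// For example, a switch over cases 0..9 that all branch to a small number of
// distinct successors is typically recognised as suitable for a bit test and
// counted as a single cluster; with many distinct successors (and jump tables
// allowed) the same dense range forms one jump table of size 10. A sparse
// switch that fits neither pattern counts one cluster per case.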
504
505 bool shouldBuildLookupTables() {
506 const TargetLoweringBase *TLI = getTLI();
507 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
508 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
509 }
510
511 bool shouldBuildRelLookupTables() const {
512 const TargetMachine &TM = getTLI()->getTargetMachine();
513 // If non-PIC mode, do not generate a relative lookup table.
514 if (!TM.isPositionIndependent())
515 return false;
516
517 /// Relative lookup table entries consist of 32-bit offsets.
518 /// Do not generate relative lookup tables for large code models
519 /// in 64-bit architectures where 32-bit offsets might not be enough.
520 if (TM.getCodeModel() == CodeModel::Medium ||
521 TM.getCodeModel() == CodeModel::Large)
522 return false;
523
524 Triple TargetTriple = TM.getTargetTriple();
525 if (!TargetTriple.isArch64Bit())
526 return false;
527
528 // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
529 // there.
530 if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
531 return false;
532
533 return true;
534 }
535
536 bool haveFastSqrt(Type *Ty) {
537 const TargetLoweringBase *TLI = getTLI();
538 EVT VT = TLI->getValueType(DL, Ty);
539 return TLI->isTypeLegal(VT) &&
540 TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
541 }
542
543 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
544 return true;
545 }
546
547 InstructionCost getFPOpCost(Type *Ty) {
548 // Check whether FADD is available, as a proxy for floating-point in
549 // general.
550 const TargetLoweringBase *TLI = getTLI();
551 EVT VT = TLI->getValueType(DL, Ty);
552 if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
553 return TargetTransformInfo::TCC_Basic;
554 return TargetTransformInfo::TCC_Expensive;
555 }
556
557 bool preferToKeepConstantsAttached(const Instruction &Inst,
558 const Function &Fn) const {
559 switch (Inst.getOpcode()) {
560 default:
561 break;
562 case Instruction::SDiv:
563 case Instruction::SRem:
564 case Instruction::UDiv:
565 case Instruction::URem: {
566 if (!isa<ConstantInt>(Inst.getOperand(1)))
567 return false;
568 EVT VT = getTLI()->getValueType(DL, Inst.getType());
569 return !getTLI()->isIntDivCheap(VT, Fn.getAttributes());
570 }
571 };
572
573 return false;
574 }
575
576 unsigned getInliningThresholdMultiplier() const { return 1; }
577 unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
578 unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const {
579 return 0;
580 }
581
582 int getInlinerVectorBonusPercent() const { return 150; }
583
584 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
585 TTI::UnrollingPreferences &UP,
586 OptimizationRemarkEmitter *ORE) {
587 // This unrolling functionality is target independent, but to provide some
588 // motivation for its intended use, for x86:
589
590 // According to the Intel 64 and IA-32 Architectures Optimization Reference
591 // Manual, Intel Core models and later have a loop stream detector (and
592 // associated uop queue) that can benefit from partial unrolling.
593 // The relevant requirements are:
594 // - The loop must have no more than 4 (8 for Nehalem and later) branches
595 // taken, and none of them may be calls.
596 // - The loop can have no more than 18 (28 for Nehalem and later) uops.
597
598 // According to the Software Optimization Guide for AMD Family 15h
599 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
600 // and loop buffer which can benefit from partial unrolling.
601 // The relevant requirements are:
602 // - The loop must have fewer than 16 branches
603 // - The loop must have less than 40 uops in all executed loop branches
604
605 // The number of taken branches in a loop is hard to estimate here, and
606 // benchmarking has revealed that it is better not to be conservative when
607 // estimating the branch count. As a result, we'll ignore the branch limits
608 // until someone finds a case where it matters in practice.
609
610 unsigned MaxOps;
611 const TargetSubtargetInfo *ST = getST();
612 if (PartialUnrollingThreshold.getNumOccurrences() > 0)
613 MaxOps = PartialUnrollingThreshold;
614 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
615 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
616 else
617 return;
618
619 // Scan the loop: don't unroll loops with calls.
620 for (BasicBlock *BB : L->blocks()) {
621 for (Instruction &I : *BB) {
622 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
623 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
624 if (!thisT()->isLoweredToCall(F))
625 continue;
626 }
627
628 if (ORE) {
629 ORE->emit([&]() {
630 return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
631 L->getHeader())
632 << "advising against unrolling the loop because it "
633 "contains a "
634 << ore::NV("Call", &I);
635 });
636 }
637 return;
638 }
639 }
640 }
641
642 // Enable runtime and partial unrolling up to the specified size.
643 // Enable using trip count upper bound to unroll loops.
644 UP.Partial = UP.Runtime = UP.UpperBound = true;
645 UP.PartialThreshold = MaxOps;
646
647 // Avoid unrolling when optimizing for size.
648 UP.OptSizeThreshold = 0;
649 UP.PartialOptSizeThreshold = 0;
650
651 // Set number of instructions optimized when "back edge"
652 // becomes "fall through" to default value of 2.
653 UP.BEInsns = 2;
654 }
655
656 void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
657 TTI::PeelingPreferences &PP) {
658 PP.PeelCount = 0;
659 PP.AllowPeeling = true;
660 PP.AllowLoopNestsPeeling = false;
661 PP.PeelProfiledIterations = true;
662 }
663
665 AssumptionCache &AC,
666 TargetLibraryInfo *LibInfo,
667 HardwareLoopInfo &HWLoopInfo) {
668 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
669 }
670
671 bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
672 return BaseT::preferPredicateOverEpilogue(TFI);
673 }
674
675 TailFoldingStyle
676 getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) {
677 return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
678 }
679
680 std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
681 IntrinsicInst &II) {
682 return BaseT::instCombineIntrinsic(IC, II);
683 }
684
685 std::optional<Value *>
686 simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
687 APInt DemandedMask, KnownBits &Known,
688 bool &KnownBitsComputed) {
689 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
690 KnownBitsComputed);
691 }
692
693 std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
694 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
695 APInt &UndefElts2, APInt &UndefElts3,
696 std::function<void(Instruction *, unsigned, APInt, APInt &)>
697 SimplifyAndSetOp) {
698 return BaseT::simplifyDemandedVectorEltsIntrinsic(
699 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
700 SimplifyAndSetOp);
701 }
702
703 virtual std::optional<unsigned>
704 getCacheSize(TargetTransformInfo::CacheLevel Level) const {
705 return std::optional<unsigned>(
706 getST()->getCacheSize(static_cast<unsigned>(Level)));
707 }
708
709 virtual std::optional<unsigned>
710 getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
711 std::optional<unsigned> TargetResult =
712 getST()->getCacheAssociativity(static_cast<unsigned>(Level));
713
714 if (TargetResult)
715 return TargetResult;
716
717 return BaseT::getCacheAssociativity(Level);
718 }
719
720 virtual unsigned getCacheLineSize() const {
721 return getST()->getCacheLineSize();
722 }
723
724 virtual unsigned getPrefetchDistance() const {
725 return getST()->getPrefetchDistance();
726 }
727
728 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
729 unsigned NumStridedMemAccesses,
730 unsigned NumPrefetches,
731 bool HasCall) const {
732 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
733 NumPrefetches, HasCall);
734 }
735
736 virtual unsigned getMaxPrefetchIterationsAhead() const {
737 return getST()->getMaxPrefetchIterationsAhead();
738 }
739
740 virtual bool enableWritePrefetching() const {
741 return getST()->enableWritePrefetching();
742 }
743
744 virtual bool shouldPrefetchAddressSpace(unsigned AS) const {
745 return getST()->shouldPrefetchAddressSpace(AS);
746 }
747
748 /// @}
749
750 /// \name Vector TTI Implementations
751 /// @{
752
754 return TypeSize::getFixed(32);
755 }
756
757 std::optional<unsigned> getMaxVScale() const { return std::nullopt; }
758 std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; }
759 bool isVScaleKnownToBeAPowerOfTwo() const { return false; }
760
761 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
762 /// are set if the demanded result elements need to be inserted and/or
763 /// extracted from vectors.
764 InstructionCost getScalarizationOverhead(VectorType *InTy,
765 const APInt &DemandedElts,
766 bool Insert, bool Extract,
767 TTI::TargetCostKind CostKind) {
768 /// FIXME: a bitfield is not a reasonable abstraction for talking about
769 /// which elements are needed from a scalable vector
770 if (isa<ScalableVectorType>(InTy))
771 return InstructionCost::getInvalid();
772 auto *Ty = cast<FixedVectorType>(InTy);
773
774 assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
775 "Vector size mismatch");
776
777 InstructionCost Cost = 0;
778
779 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
780 if (!DemandedElts[i])
781 continue;
782 if (Insert)
783 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
784 CostKind, i, nullptr, nullptr);
785 if (Extract)
786 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
787 CostKind, i, nullptr, nullptr);
788 }
789
790 return Cost;
791 }
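// For example, for a <4 x i32> with DemandedElts = 0b0101, Insert = true and
// Extract = false, only elements 0 and 2 are counted, i.e. two insertelement
// costs (a cost of 2 when vector inserts have unit cost).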
792
793 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
794 InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
795 bool Extract,
796 TTI::TargetCostKind CostKind) {
797 if (isa<ScalableVectorType>(InTy))
798 return InstructionCost::getInvalid();
799 auto *Ty = cast<FixedVectorType>(InTy);
800
801 APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
802 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
803 CostKind);
804 }
805
806 /// Estimate the overhead of scalarizing an instruction's unique
807 /// non-constant operands. The (potentially vector) types to use for each
808 /// argument are passed via Tys.
809 InstructionCost
810 getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
811 ArrayRef<Type *> Tys,
812 TTI::TargetCostKind CostKind) {
813 assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
814
815 InstructionCost Cost = 0;
816 SmallPtrSet<const Value*, 4> UniqueOperands;
817 for (int I = 0, E = Args.size(); I != E; I++) {
818 // Disregard things like metadata arguments.
819 const Value *A = Args[I];
820 Type *Ty = Tys[I];
821 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
822 !Ty->isPtrOrPtrVectorTy())
823 continue;
824
825 if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
826 if (auto *VecTy = dyn_cast<VectorType>(Ty))
827 Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
828 /*Extract*/ true, CostKind);
829 }
830 }
831
832 return Cost;
833 }
834
835 /// Estimate the overhead of scalarizing the inputs and outputs of an
836 /// instruction, with return type RetTy and arguments Args of type Tys. If
837 /// Args are unknown (empty), then the cost associated with one argument is
838 /// added as a heuristic.
839 InstructionCost getScalarizationOverhead(VectorType *RetTy,
840 ArrayRef<const Value *> Args,
841 ArrayRef<Type *> Tys,
842 TTI::TargetCostKind CostKind) {
843 InstructionCost Cost = getScalarizationOverhead(
844 RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
845 if (!Args.empty())
846 Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind);
847 else
848 // When no information on arguments is provided, we add the cost
849 // associated with one argument as a heuristic.
850 Cost += getScalarizationOverhead(RetTy, /*Insert*/ false,
851 /*Extract*/ true, CostKind);
852
853 return Cost;
854 }
855
856 /// Estimate the cost of type-legalization and the legalized type.
857 std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const {
858 LLVMContext &C = Ty->getContext();
859 EVT MTy = getTLI()->getValueType(DL, Ty);
860
861 InstructionCost Cost = 1;
862 // We keep legalizing the type until we find a legal kind. We assume that
863 // the only operation that costs anything is the split. After splitting
864 // we need to handle two types.
865 while (true) {
866 TargetLoweringBase::LegalizeKind LK = getTLI()->getTypeConversion(C, MTy);
867
868 if (LK.first == TargetLoweringBase::TypeScalarizeScalableVector) {
869 // Ensure we return a sensible simple VT here, since many callers of
870 // this function require it.
871 MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64;
872 return std::make_pair(InstructionCost::getInvalid(), VT);
873 }
874
875 if (LK.first == TargetLoweringBase::TypeLegal)
876 return std::make_pair(Cost, MTy.getSimpleVT());
877
878 if (LK.first == TargetLoweringBase::TypeSplitVector ||
879 LK.first == TargetLoweringBase::TypeExpandInteger)
880 Cost *= 2;
881
882 // Do not loop with f128 type.
883 if (MTy == LK.second)
884 return std::make_pair(Cost, MTy.getSimpleVT());
885
886 // Keep legalizing the type.
887 MTy = LK.second;
888 }
889 }
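// For example, on a hypothetical target whose widest legal vector type is
// v4i32, legalizing <16 x i32> takes two successive splits
// (v16i32 -> v8i32 -> v4i32), doubling the cost each time, so this returns
// the pair {4, MVT::v4i32}.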
890
891 unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }
892
893 InstructionCost getArithmeticInstrCost(
894 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
895 TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},
896 TTI::OperandValueInfo Opd2Info = {TTI::OK_AnyValue, TTI::OP_None},
897 ArrayRef<const Value *> Args = std::nullopt,
898 const Instruction *CxtI = nullptr) {
899 // Check if any of the operands are vector operands.
900 const TargetLoweringBase *TLI = getTLI();
901 int ISD = TLI->InstructionOpcodeToISD(Opcode);
902 assert(ISD && "Invalid opcode");
903
904 // TODO: Handle more cost kinds.
905 if (CostKind != TTI::TCK_RecipThroughput)
906 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
907 Opd1Info, Opd2Info,
908 Args, CxtI);
909
910 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
911
912 bool IsFloat = Ty->isFPOrFPVectorTy();
913 // Assume that floating point arithmetic operations cost twice as much as
914 // integer operations.
915 InstructionCost OpCost = (IsFloat ? 2 : 1);
916
917 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
918 // The operation is legal. Assume it costs 1.
919 // TODO: Once we have extract/insert subvector cost we need to use them.
920 return LT.first * OpCost;
921 }
922
923 if (!TLI->isOperationExpand(ISD, LT.second)) {
924 // If the operation is custom lowered, then assume that the code is twice
925 // as expensive.
926 return LT.first * 2 * OpCost;
927 }
928
929 // An 'Expand' of URem and SRem is special because it may default
930 // to expanding the operation into a sequence of sub-operations
931 // i.e. X % Y -> X-(X/Y)*Y.
932 if (ISD == ISD::UREM || ISD == ISD::SREM) {
933 bool IsSigned = ISD == ISD::SREM;
934 if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
935 LT.second) ||
936 TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
937 LT.second)) {
938 unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
939 InstructionCost DivCost = thisT()->getArithmeticInstrCost(
940 DivOpc, Ty, CostKind, Opd1Info, Opd2Info);
941 InstructionCost MulCost =
942 thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
943 InstructionCost SubCost =
944 thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
945 return DivCost + MulCost + SubCost;
946 }
947 }
948
949 // We cannot scalarize scalable vectors, so return Invalid.
950 if (isa<ScalableVectorType>(Ty))
951 return InstructionCost::getInvalid();
952
953 // Else, assume that we need to scalarize this op.
954 // TODO: If one of the types get legalized by splitting, handle this
955 // similarly to what getCastInstrCost() does.
956 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
957 InstructionCost Cost = thisT()->getArithmeticInstrCost(
958 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
959 Args, CxtI);
960 // Return the cost of multiple scalar invocation plus the cost of
961 // inserting and extracting the values.
962 SmallVector<Type *> Tys(Args.size(), Ty);
963 return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
964 VTy->getNumElements() * Cost;
965 }
966
967 // We don't know anything about this scalar instruction.
968 return OpCost;
969 }
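// As a rough example of the UREM/SREM expansion above: when scalar division
// is legal and every legal operation is assigned unit cost, an srem that
// would otherwise be expanded is modeled as div + mul + sub, i.e. a cost of
// about 3, rather than being scalarized.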
970
972 ArrayRef<int> Mask,
973 VectorType *Ty, int &Index,
974 VectorType *&SubTy) const {
975 if (Mask.empty())
976 return Kind;
977 int NumSrcElts = Ty->getElementCount().getKnownMinValue();
978 switch (Kind) {
979 case TTI::SK_PermuteSingleSrc:
980 if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
981 return TTI::SK_Reverse;
982 if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts))
983 return TTI::SK_Broadcast;
984 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) &&
985 (Index + Mask.size()) <= (size_t)NumSrcElts) {
986 SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size());
987 return TTI::SK_ExtractSubvector;
988 }
989 break;
990 case TTI::SK_PermuteTwoSrc: {
991 int NumSubElts;
992 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
993 Mask, NumSrcElts, NumSubElts, Index)) {
994 if (Index + NumSubElts > NumSrcElts)
995 return Kind;
996 SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts);
997 return TTI::SK_InsertSubvector;
998 }
999 if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts))
1000 return TTI::SK_Select;
1001 if (ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts))
1002 return TTI::SK_Transpose;
1003 if (ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index))
1004 return TTI::SK_Splice;
1005 break;
1006 }
1007 case TTI::SK_Select:
1008 case TTI::SK_Reverse:
1009 case TTI::SK_Broadcast:
1010 case TTI::SK_Transpose:
1011 case TTI::SK_PermuteSingleSrc:
1012 case TTI::SK_PermuteTwoSrc:
1013 case TTI::SK_Splice:
1014 break;
1015 }
1016 return Kind;
1017 }
1018
1019 InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
1020 ArrayRef<int> Mask,
1021 TTI::TargetCostKind CostKind, int Index,
1022 VectorType *SubTp,
1023 ArrayRef<const Value *> Args = std::nullopt,
1024 const Instruction *CxtI = nullptr) {
1025 switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) {
1026 case TTI::SK_Broadcast:
1027 if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
1028 return getBroadcastShuffleOverhead(FVT, CostKind);
1029 return InstructionCost::getInvalid();
1030 case TTI::SK_Select:
1031 case TTI::SK_Splice:
1032 case TTI::SK_Reverse:
1033 case TTI::SK_Transpose:
1034 case TTI::SK_PermuteSingleSrc:
1035 case TTI::SK_PermuteTwoSrc:
1036 if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
1037 return getPermuteShuffleOverhead(FVT, CostKind);
1038 return InstructionCost::getInvalid();
1039 case TTI::SK_ExtractSubvector:
1040 return getExtractSubvectorOverhead(Tp, CostKind, Index,
1041 cast<FixedVectorType>(SubTp));
1042 case TTI::SK_InsertSubvector:
1043 return getInsertSubvectorOverhead(Tp, CostKind, Index,
1044 cast<FixedVectorType>(SubTp));
1045 }
1046 llvm_unreachable("Unknown TTI::ShuffleKind");
1047 }
1048
1049 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1050 TTI::CastContextHint CCH,
1051 TTI::TargetCostKind CostKind,
1052 const Instruction *I = nullptr) {
1053 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
1054 return 0;
1055
1056 const TargetLoweringBase *TLI = getTLI();
1057 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1058 assert(ISD && "Invalid opcode");
1059 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1060 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1061
1062 TypeSize SrcSize = SrcLT.second.getSizeInBits();
1063 TypeSize DstSize = DstLT.second.getSizeInBits();
1064 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
1065 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
1066
1067 switch (Opcode) {
1068 default:
1069 break;
1070 case Instruction::Trunc:
1071 // Check for NOOP conversions.
1072 if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
1073 return 0;
1074 [[fallthrough]];
1075 case Instruction::BitCast:
1076 // Bitcast between types that are legalized to the same type are free and
1077 // assume int to/from ptr of the same size is also free.
1078 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
1079 SrcSize == DstSize)
1080 return 0;
1081 break;
1082 case Instruction::FPExt:
1083 if (I && getTLI()->isExtFree(I))
1084 return 0;
1085 break;
1086 case Instruction::ZExt:
1087 if (TLI->isZExtFree(SrcLT.second, DstLT.second))
1088 return 0;
1089 [[fallthrough]];
1090 case Instruction::SExt:
1091 if (I && getTLI()->isExtFree(I))
1092 return 0;
1093
1094 // If this is a zext/sext of a load, return 0 if the corresponding
1095 // extending load exists on target and the result type is legal.
1096 if (CCH == TTI::CastContextHint::Normal) {
1097 EVT ExtVT = EVT::getEVT(Dst);
1098 EVT LoadVT = EVT::getEVT(Src);
1099 unsigned LType =
1100 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
1101 if (DstLT.first == SrcLT.first &&
1102 TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
1103 return 0;
1104 }
1105 break;
1106 case Instruction::AddrSpaceCast:
1107 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
1108 Dst->getPointerAddressSpace()))
1109 return 0;
1110 break;
1111 }
1112
1113 auto *SrcVTy = dyn_cast<VectorType>(Src);
1114 auto *DstVTy = dyn_cast<VectorType>(Dst);
1115
1116 // If the cast is marked as legal (or promote) then assume low cost.
1117 if (SrcLT.first == DstLT.first &&
1118 TLI->isOperationLegalOrPromote(ISD, DstLT.second))
1119 return SrcLT.first;
1120
1121 // Handle scalar conversions.
1122 if (!SrcVTy && !DstVTy) {
1123 // Just check the op cost. If the operation is legal then assume it costs
1124 // 1.
1125 if (!TLI->isOperationExpand(ISD, DstLT.second))
1126 return 1;
1127
1128 // Assume that illegal scalar instructions are expensive.
1129 return 4;
1130 }
1131
1132 // Check vector-to-vector casts.
1133 if (DstVTy && SrcVTy) {
1134 // If the cast is between same-sized registers, then the check is simple.
1135 if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
1136
1137 // Assume that Zext is done using AND.
1138 if (Opcode == Instruction::ZExt)
1139 return SrcLT.first;
1140
1141 // Assume that sext is done using SHL and SRA.
1142 if (Opcode == Instruction::SExt)
1143 return SrcLT.first * 2;
1144
1145 // Just check the op cost. If the operation is legal then assume it
1146 // costs
1147 // 1 and multiply by the type-legalization overhead.
1148 if (!TLI->isOperationExpand(ISD, DstLT.second))
1149 return SrcLT.first * 1;
1150 }
1151
1152 // If we are legalizing by splitting, query the concrete TTI for the cost
1153 // of casting the original vector twice. We also need to factor in the
1154 // cost of the split itself. Count that as 1, to be consistent with
1155 // getTypeLegalizationCost().
1156 bool SplitSrc =
1157 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
1159 bool SplitDst =
1160 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
1162 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
1163 DstVTy->getElementCount().isVector()) {
1164 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
1165 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1166 T *TTI = static_cast<T *>(this);
1167 // If both types need to be split then the split is free.
1168 InstructionCost SplitCost =
1169 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
1170 return SplitCost +
1171 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
1172 CostKind, I));
1173 }
1174
1175 // Scalarization cost is Invalid, can't assume any num elements.
1176 if (isa<ScalableVectorType>(DstVTy))
1177 return InstructionCost::getInvalid();
1178
1179 // In other cases where the source or destination are illegal, assume
1180 // the operation will get scalarized.
1181 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1182 InstructionCost Cost = thisT()->getCastInstrCost(
1183 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
1184
1185 // Return the cost of multiple scalar invocation plus the cost of
1186 // inserting and extracting the values.
1187 return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true,
1188 CostKind) +
1189 Num * Cost;
1190 }
1191
1192 // We already handled vector-to-vector and scalar-to-scalar conversions.
1193 // This is where we handle bitcasts between vectors and scalars. We need to
1194 // assume that the conversion is scalarized in one way or another.
1196 if (Opcode == Instruction::BitCast) {
1197 // Illegal bitcasts are done by storing and loading from a stack slot.
1198 return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false,
1199 /*Extract*/ true, CostKind)
1200 : 0) +
1201 (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true,
1202 /*Extract*/ false, CostKind)
1203 : 0);
1204 }
1205
1206 llvm_unreachable("Unhandled cast");
1207 }
1208
1210 VectorType *VecTy, unsigned Index) {
1212 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1213 CostKind, Index, nullptr, nullptr) +
1214 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1216 }
1217
1219 const Instruction *I = nullptr) {
1220 return BaseT::getCFInstrCost(Opcode, CostKind, I);
1221 }
1222
1223 InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1224 CmpInst::Predicate VecPred,
1226 const Instruction *I = nullptr) {
1227 const TargetLoweringBase *TLI = getTLI();
1228 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1229 assert(ISD && "Invalid opcode");
1230
1231 // TODO: Handle other cost kinds.
1232 if (CostKind != TTI::TCK_RecipThroughput)
1233 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1234 I);
1235
1236 // Selects on vectors are actually vector selects.
1237 if (ISD == ISD::SELECT) {
1238 assert(CondTy && "CondTy must exist");
1239 if (CondTy->isVectorTy())
1240 ISD = ISD::VSELECT;
1241 }
1242 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1243
1244 if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1245 !TLI->isOperationExpand(ISD, LT.second)) {
1246 // The operation is legal. Assume it costs 1. Multiply
1247 // by the type-legalization overhead.
1248 return LT.first * 1;
1249 }
1250
1251 // Otherwise, assume that the cast is scalarized.
1252 // TODO: If one of the types get legalized by splitting, handle this
1253 // similarly to what getCastInstrCost() does.
1254 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1255 if (isa<ScalableVectorType>(ValTy))
1256 return InstructionCost::getInvalid();
1257
1258 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1259 if (CondTy)
1260 CondTy = CondTy->getScalarType();
1261 InstructionCost Cost = thisT()->getCmpSelInstrCost(
1262 Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
1263
1264 // Return the cost of multiple scalar invocation plus the cost of
1265 // inserting and extracting the values.
1266 return getScalarizationOverhead(ValVTy, /*Insert*/ true,
1267 /*Extract*/ false, CostKind) +
1268 Num * Cost;
1269 }
1270
1271 // Unknown scalar opcode.
1272 return 1;
1273 }
1274
1277 unsigned Index, Value *Op0, Value *Op1) {
1278 return getRegUsageForType(Val->getScalarType());
1279 }
1280
1283 unsigned Index) {
1284 Value *Op0 = nullptr;
1285 Value *Op1 = nullptr;
1286 if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
1287 Op0 = IE->getOperand(0);
1288 Op1 = IE->getOperand(1);
1289 }
1290 return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0,
1291 Op1);
1292 }
1293
1294 InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
1295 int VF,
1296 const APInt &DemandedDstElts,
1298 assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
1299 "Unexpected size of DemandedDstElts.");
1300
1302
1303 auto *SrcVT = FixedVectorType::get(EltTy, VF);
1304 auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
1305
1306 // The Mask shuffling cost is extract all the elements of the Mask
1307 // and insert each of them Factor times into the wide vector:
1308 //
1309 // E.g. an interleaved group with factor 3:
1310 // %mask = icmp ult <8 x i32> %vec1, %vec2
1311 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1312 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1313 // The cost is estimated as extract all mask elements from the <8xi1> mask
1314 // vector and insert them factor times into the <24xi1> shuffled mask
1315 // vector.
1316 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
1317 Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
1318 /*Insert*/ false,
1319 /*Extract*/ true, CostKind);
1320 Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
1321 /*Insert*/ true,
1322 /*Extract*/ false, CostKind);
1323
1324 return Cost;
1325 }
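// For example, replicating an <8 x i1> mask with ReplicationFactor = 3 (all
// destination elements demanded, unit extract/insert costs) is modeled as 8
// extracts from the <8 x i1> source plus 24 inserts into the <24 x i1>
// result, i.e. a cost of 32.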
1326
1328 getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
1331 const Instruction *I = nullptr) {
1332 assert(!Src->isVoidTy() && "Invalid type");
1333 // Assume types, such as structs, are expensive.
1334 if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
1335 return 4;
1336 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1337
1338 // Assuming that all loads of legal types cost 1.
1339 InstructionCost Cost = LT.first;
1340 if (CostKind != TTI::TCK_RecipThroughput)
1341 return Cost;
1342
1343 const DataLayout &DL = this->getDataLayout();
1344 if (Src->isVectorTy() &&
1345 // In practice it's not currently possible to have a change in lane
1346 // length for extending loads or truncating stores so both types should
1347 // have the same scalable property.
1349 LT.second.getSizeInBits())) {
1350 // This is a vector load that legalizes to a larger type than the vector
1351 // itself. Unless the corresponding extending load or truncating store is
1352 // legal, then this will scalarize.
1354 EVT MemVT = getTLI()->getValueType(DL, Src);
1355 if (Opcode == Instruction::Store)
1356 LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1357 else
1358 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1359
1360 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1361 // This is a vector load/store for some illegal type that is scalarized.
1362 // We must account for the cost of building or decomposing the vector.
1364 cast<VectorType>(Src), Opcode != Instruction::Store,
1365 Opcode == Instruction::Store, CostKind);
1366 }
1367 }
1368
1369 return Cost;
1370 }
1371
1373 Align Alignment, unsigned AddressSpace,
1375 // TODO: Pass on AddressSpace when we have test coverage.
1376 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1377 CostKind);
1378 }
1379
1381 const Value *Ptr, bool VariableMask,
1382 Align Alignment,
1384 const Instruction *I = nullptr) {
1385 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1386 true, CostKind);
1387 }
1388
1390 const Value *Ptr, bool VariableMask,
1391 Align Alignment,
1393 const Instruction *I) {
1394 // For a target without strided memory operations (or for an illegal
1395 // operation type on one which does), assume we lower to a gather/scatter
1396 // operation. (Which may in turn be scalarized.)
1397 return thisT()->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1398 Alignment, CostKind, I);
1399 }
1400
1402 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1403 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1404 bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1405
1406 // We cannot scalarize scalable vectors, so return Invalid.
1407 if (isa<ScalableVectorType>(VecTy))
1408 return InstructionCost::getInvalid();
1409
1410 auto *VT = cast<FixedVectorType>(VecTy);
1411
1412 unsigned NumElts = VT->getNumElements();
1413 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1414
1415 unsigned NumSubElts = NumElts / Factor;
1416 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1417
1418 // Firstly, the cost of load/store operation.
1420 if (UseMaskForCond || UseMaskForGaps)
1421 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1423 else
1424 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1425 CostKind);
1426
1427 // Legalize the vector type, and get the legalized and unlegalized type
1428 // sizes.
1429 MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
1430 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1431 unsigned VecTyLTSize = VecTyLT.getStoreSize();
1432
1433 // Scale the cost of the memory operation by the fraction of legalized
1434 // instructions that will actually be used. We shouldn't account for the
1435 // cost of dead instructions since they will be removed.
1436 //
1437 // E.g., An interleaved load of factor 8:
1438 // %vec = load <16 x i64>, <16 x i64>* %ptr
1439 // %v0 = shufflevector %vec, undef, <0, 8>
1440 //
1441 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1442 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1443 // type). The other loads are unused.
1444 //
1445 // TODO: Note that legalization can turn masked loads/stores into unmasked
1446 // (legalized) loads/stores. This can be reflected in the cost.
1447 if (Cost.isValid() && VecTySize > VecTyLTSize) {
1448 // The number of loads of a legal type it will take to represent a load
1449 // of the unlegalized vector type.
1450 unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1451
1452 // The number of elements of the unlegalized type that correspond to a
1453 // single legal instruction.
1454 unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1455
1456 // Determine which legal instructions will be used.
1457 BitVector UsedInsts(NumLegalInsts, false);
1458 for (unsigned Index : Indices)
1459 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1460 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1461
1462 // Scale the cost of the load by the fraction of legal instructions that
1463 // will be used.
1464 Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
1465 }
1466
1467 // Then add the cost of the interleave operation.
1468 assert(Indices.size() <= Factor &&
1469 "Interleaved memory op has too many members");
1470
1471 const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
1472 const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
1473
1474 APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
1475 for (unsigned Index : Indices) {
1476 assert(Index < Factor && "Invalid index for interleaved memory op");
1477 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1478 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
1479 }
1480
1481 if (Opcode == Instruction::Load) {
1482 // The interleave cost is similar to extract sub vectors' elements
1483 // from the wide vector, and insert them into sub vectors.
1484 //
1485 // E.g. An interleaved load of factor 2 (with one member of index 0):
1486 // %vec = load <8 x i32>, <8 x i32>* %ptr
1487 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
1488 // The cost is estimated as extract elements at 0, 2, 4, 6 from the
1489 // <8 x i32> vector and insert them into a <4 x i32> vector.
1490 InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
1491 SubVT, DemandedAllSubElts,
1492 /*Insert*/ true, /*Extract*/ false, CostKind);
1493 Cost += Indices.size() * InsSubCost;
1494 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1495 /*Insert*/ false,
1496 /*Extract*/ true, CostKind);
1497 } else {
1498 // The interleave cost is extract elements from sub vectors, and
1499 // insert them into the wide vector.
1500 //
1501 // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
1502 // (using VF=4):
1503 // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
1504 // %gaps.mask = <true, true, false, true, true, false,
1505 // true, true, false, true, true, false>
1506 // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
1507 // i32 Align, <12 x i1> %gaps.mask
1508 // The cost is estimated as extract all elements (of actual members,
1509 // excluding gaps) from both <4 x i32> vectors and insert into the <12 x
1510 // i32> vector.
1511 InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
1512 SubVT, DemandedAllSubElts,
1513 /*Insert*/ false, /*Extract*/ true, CostKind);
1514 Cost += ExtSubCost * Indices.size();
1515 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1516 /*Insert*/ true,
1517 /*Extract*/ false, CostKind);
1518 }
1519
1520 if (!UseMaskForCond)
1521 return Cost;
1522
1523 Type *I8Type = Type::getInt8Ty(VT->getContext());
1524
1525 Cost += thisT()->getReplicationShuffleCost(
1526 I8Type, Factor, NumSubElts,
1527 UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
1528 CostKind);
1529
1530 // The Gaps mask is invariant and created outside the loop, therefore the
1531 // cost of creating it is not accounted for here. However if we have both
1532 // a MaskForGaps and some other mask that guards the execution of the
1533 // memory access, we need to account for the cost of And-ing the two masks
1534 // inside the loop.
1535 if (UseMaskForGaps) {
1536 auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1537 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1538 CostKind);
1539 }
1540
1541 return Cost;
1542 }
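// To make the scaling above concrete: in the factor-8 example from the
// comment, <16 x i64> legalizes to 8 v2i64 loads. With a single member
// (index 0) requested, only elements 0 and 8 are live, so just 2 of the 8
// legal loads are used and the memory cost is scaled by 2/8 before the
// insert/extract overhead for the requested member is added.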
1543
1544 /// Get intrinsic cost based on arguments.
1545 InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1546 TTI::TargetCostKind CostKind) {
1547 // Check for generically free intrinsics.
1549 return 0;
1550
1551 // Assume that target intrinsics are cheap.
1552 Intrinsic::ID IID = ICA.getID();
1555
1556 if (ICA.isTypeBasedOnly())
1558
1559 Type *RetTy = ICA.getReturnType();
1560
1561 ElementCount RetVF =
1562 (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1563 : ElementCount::getFixed(1));
1564 const IntrinsicInst *I = ICA.getInst();
1565 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1566 FastMathFlags FMF = ICA.getFlags();
1567 switch (IID) {
1568 default:
1569 break;
1570
1571 case Intrinsic::powi:
1572 if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
1573 bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
1574 if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
1575 ShouldOptForSize)) {
1576 // The cost is modeled on the expansion performed by ExpandPowI in
1577 // SelectionDAGBuilder.
1578 APInt Exponent = RHSC->getValue().abs();
1579 unsigned ActiveBits = Exponent.getActiveBits();
1580 unsigned PopCount = Exponent.popcount();
1581 InstructionCost Cost = (ActiveBits + PopCount - 2) *
1582 thisT()->getArithmeticInstrCost(
1583 Instruction::FMul, RetTy, CostKind);
1584 if (RHSC->isNegative())
1585 Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
1586 CostKind);
1587 return Cost;
1588 }
1589 }
1590 break;
1591 case Intrinsic::cttz:
1592 // FIXME: If necessary, this should go in target-specific overrides.
1593 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy))
1595 break;
1596
1597 case Intrinsic::ctlz:
1598 // FIXME: If necessary, this should go in target-specific overrides.
1599 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy))
1601 break;
1602
1603 case Intrinsic::memcpy:
1604 return thisT()->getMemcpyCost(ICA.getInst());
1605
1606 case Intrinsic::masked_scatter: {
1607 const Value *Mask = Args[3];
1608 bool VarMask = !isa<Constant>(Mask);
1609 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1610 return thisT()->getGatherScatterOpCost(Instruction::Store,
1611 ICA.getArgTypes()[0], Args[1],
1612 VarMask, Alignment, CostKind, I);
1613 }
1614 case Intrinsic::masked_gather: {
1615 const Value *Mask = Args[2];
1616 bool VarMask = !isa<Constant>(Mask);
1617 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1618 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1619 VarMask, Alignment, CostKind, I);
1620 }
1621 case Intrinsic::experimental_vp_strided_store: {
1622 const Value *Data = Args[0];
1623 const Value *Ptr = Args[1];
1624 const Value *Mask = Args[3];
1625 const Value *EVL = Args[4];
1626 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
1627 Align Alignment = I->getParamAlign(1).valueOrOne();
1628 return thisT()->getStridedMemoryOpCost(Instruction::Store,
1629 Data->getType(), Ptr, VarMask,
1630 Alignment, CostKind, I);
1631 }
1632 case Intrinsic::experimental_vp_strided_load: {
1633 const Value *Ptr = Args[0];
1634 const Value *Mask = Args[2];
1635 const Value *EVL = Args[3];
1636 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
1637 Align Alignment = I->getParamAlign(0).valueOrOne();
1638 return thisT()->getStridedMemoryOpCost(Instruction::Load, RetTy, Ptr,
1639 VarMask, Alignment, CostKind, I);
1640 }
1641 case Intrinsic::experimental_stepvector: {
1642 if (isa<ScalableVectorType>(RetTy))
1643 return InstructionCost::getInvalid();
1644 // The cost of materialising a constant integer vector.
1645 return TargetTransformInfo::TCC_Basic;
1646 }
1647 case Intrinsic::vector_extract: {
1648 // FIXME: Handle case where a scalable vector is extracted from a scalable
1649 // vector
1650 if (isa<ScalableVectorType>(RetTy))
1652 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1653 return thisT()->getShuffleCost(
1654 TTI::SK_ExtractSubvector, cast<VectorType>(Args[0]->getType()),
1655 std::nullopt, CostKind, Index, cast<VectorType>(RetTy));
1656 }
1657 case Intrinsic::vector_insert: {
1658 // FIXME: Handle case where a scalable vector is inserted into a scalable
1659 // vector
1660 if (isa<ScalableVectorType>(Args[1]->getType()))
1662 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1663 return thisT()->getShuffleCost(
1664 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()),
1665 std::nullopt, CostKind, Index, cast<VectorType>(Args[1]->getType()));
1666 }
1667 case Intrinsic::vector_reverse: {
1668 return thisT()->getShuffleCost(
1669 TTI::SK_Reverse, cast<VectorType>(Args[0]->getType()), std::nullopt,
1670 CostKind, 0, cast<VectorType>(RetTy));
1671 }
1672 case Intrinsic::vector_splice: {
1673 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1674 return thisT()->getShuffleCost(
1675 TTI::SK_Splice, cast<VectorType>(Args[0]->getType()), std::nullopt,
1676 CostKind, Index, cast<VectorType>(RetTy));
1677 }
1678 case Intrinsic::vector_reduce_add:
1679 case Intrinsic::vector_reduce_mul:
1680 case Intrinsic::vector_reduce_and:
1681 case Intrinsic::vector_reduce_or:
1682 case Intrinsic::vector_reduce_xor:
1683 case Intrinsic::vector_reduce_smax:
1684 case Intrinsic::vector_reduce_smin:
1685 case Intrinsic::vector_reduce_fmax:
1686 case Intrinsic::vector_reduce_fmin:
1687 case Intrinsic::vector_reduce_fmaximum:
1688 case Intrinsic::vector_reduce_fminimum:
1689 case Intrinsic::vector_reduce_umax:
1690 case Intrinsic::vector_reduce_umin: {
1691 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1693 }
1694 case Intrinsic::vector_reduce_fadd:
1695 case Intrinsic::vector_reduce_fmul: {
1697 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1699 }
1700 case Intrinsic::fshl:
1701 case Intrinsic::fshr: {
1702 const Value *X = Args[0];
1703 const Value *Y = Args[1];
1704 const Value *Z = Args[2];
1705 const TTI::OperandValueInfo OpInfoX = TTI::getOperandInfo(X);
1706 const TTI::OperandValueInfo OpInfoY = TTI::getOperandInfo(Y);
1707 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z);
1708 const TTI::OperandValueInfo OpInfoBW =
1710 isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1711 : TTI::OP_None};
1712
1713 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1714 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
1715 InstructionCost Cost = 0;
1716 Cost +=
1717 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1718 Cost +=
1719 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1720 Cost += thisT()->getArithmeticInstrCost(
1721 BinaryOperator::Shl, RetTy, CostKind, OpInfoX,
1722 {OpInfoZ.Kind, TTI::OP_None});
1723 Cost += thisT()->getArithmeticInstrCost(
1724 BinaryOperator::LShr, RetTy, CostKind, OpInfoY,
1725 {OpInfoZ.Kind, TTI::OP_None});
1726 // Non-constant shift amounts require a modulo.
1727 if (!OpInfoZ.isConstant())
1728 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1729 CostKind, OpInfoZ, OpInfoBW);
1730 // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1731 if (X != Y) {
1732 Type *CondTy = RetTy->getWithNewBitWidth(1);
1733 Cost +=
1734 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1736 Cost +=
1737 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1739 }
1740 return Cost;
1741 }
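// Rough tally of the expansion above with unit-cost arithmetic ops: a funnel
// shift with a constant shift amount costs or + sub + shl + lshr = 4; a
// non-constant amount adds the urem, and a true funnel shift (X != Y)
// additionally pays for the icmp and select that guard the shift-by-zero
// case.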
1742 case Intrinsic::get_active_lane_mask: {
1743 EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
1744 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1745
1746 // If we're not expanding the intrinsic then we assume this is cheap
1747 // to implement.
1748 if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) {
1749 return getTypeLegalizationCost(RetTy).first;
1750 }
1751
1752 // Create the expanded types that will be used to calculate the uadd_sat
1753 // operation.
1754 Type *ExpRetTy = VectorType::get(
1755 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1756 IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
1758 thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1759 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
1761 return Cost;
1762 }
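    // Cost sketch when the target does expand this intrinsic (assuming i64
    // base/trip-count arguments and an <N x i1> result): one uadd_sat on the
    // widened <N x i64> vector plus one compare against the trip count, i.e.
    // the two costs accumulated above.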
1763 case Intrinsic::experimental_cttz_elts: {
1764 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1765
1766 // If we're not expanding the intrinsic then we assume this is cheap
1767 // to implement.
1768 if (!getTLI()->shouldExpandCttzElements(ArgType))
1769 return getTypeLegalizationCost(RetTy).first;
1770
1771 // TODO: The costs below reflect the expansion code in
1772 // SelectionDAGBuilder, but we may want to sacrifice some accuracy in
1773 // favour of compile time.
1774
1775 // Find the smallest "sensible" element type to use for the expansion.
1776 bool ZeroIsPoison = !cast<ConstantInt>(Args[1])->isZero();
1777 ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
1778 if (isa<ScalableVectorType>(ICA.getArgTypes()[0]) && I && I->getCaller())
1779 VScaleRange = getVScaleRange(I->getCaller(), 64);
1780
1781 unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
1782 RetTy, ArgType.getVectorElementCount(), ZeroIsPoison, &VScaleRange);
1783 Type *NewEltTy = IntegerType::getIntNTy(RetTy->getContext(), EltWidth);
1784
1785 // Create the new vector type & get the vector length
1786 Type *NewVecTy = VectorType::get(
1787 NewEltTy, cast<VectorType>(Args[0]->getType())->getElementCount());
1788
1789 IntrinsicCostAttributes StepVecAttrs(Intrinsic::experimental_stepvector,
1790 NewVecTy, {}, FMF);
1792 thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind);
1793
1794 Cost +=
1795 thisT()->getArithmeticInstrCost(Instruction::Sub, NewVecTy, CostKind);
1796 Cost += thisT()->getCastInstrCost(Instruction::SExt, NewVecTy,
1797 Args[0]->getType(),
1799 Cost +=
1800 thisT()->getArithmeticInstrCost(Instruction::And, NewVecTy, CostKind);
1801
1802 IntrinsicCostAttributes ReducAttrs(Intrinsic::vector_reduce_umax,
1803 NewEltTy, NewVecTy, FMF, I, 1);
1804 Cost += thisT()->getTypeBasedIntrinsicInstrCost(ReducAttrs, CostKind);
1805 Cost +=
1806 thisT()->getArithmeticInstrCost(Instruction::Sub, NewEltTy, CostKind);
1807
1808 return Cost;
1809 }
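    // Illustrative tally for the expansion path above (a sketch): one
    // stepvector, a sub and an and on the narrowed vector type, a sext of the
    // i1 mask into that type, a vector_reduce_umax, and a final scalar sub;
    // six modelled operations rather than a single target instruction.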
1810 }
1811
1812 // VP Intrinsics should have the same cost as their non-vp counterpart.
1813 // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp
1814 // counterpart when the vector length argument is smaller than the maximum
1815 // vector length.
1816 // TODO: Support other kinds of VPIntrinsics
1817 if (VPIntrinsic::isVPIntrinsic(ICA.getID())) {
1818 std::optional<unsigned> FOp =
1820 if (FOp) {
1821 if (ICA.getID() == Intrinsic::vp_load) {
1822 Align Alignment;
1823 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1824 Alignment = VPI->getPointerAlignment().valueOrOne();
1825 unsigned AS = 0;
1826 if (ICA.getArgs().size() > 1)
1827 if (auto *PtrTy =
1828 dyn_cast<PointerType>(ICA.getArgs()[0]->getType()))
1829 AS = PtrTy->getAddressSpace();
1830 return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment,
1831 AS, CostKind);
1832 }
1833 if (ICA.getID() == Intrinsic::vp_store) {
1834 Align Alignment;
1835 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1836 Alignment = VPI->getPointerAlignment().valueOrOne();
1837 unsigned AS = 0;
1838 if (ICA.getArgs().size() >= 2)
1839 if (auto *PtrTy =
1840 dyn_cast<PointerType>(ICA.getArgs()[1]->getType()))
1841 AS = PtrTy->getAddressSpace();
1842 return thisT()->getMemoryOpCost(*FOp, Args[0]->getType(), Alignment,
1843 AS, CostKind);
1844 }
1846 return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(),
1847 CostKind);
1848 }
1849 }
1850
1851 std::optional<Intrinsic::ID> FID =
1853 if (FID) {
 1854 // The non-vp version will have the same Args/Tys except for the mask and
1855 assert(ICA.getArgs().size() >= 2 && ICA.getArgTypes().size() >= 2 &&
1856 "Expected VPIntrinsic to have Mask and Vector Length args and "
1857 "types");
1859
1860 // VPReduction intrinsics have a start value argument that their non-vp
1861 // counterparts do not have, except for the fadd and fmul non-vp
1862 // counterpart.
1864 *FID != Intrinsic::vector_reduce_fadd &&
1865 *FID != Intrinsic::vector_reduce_fmul)
1866 NewTys = NewTys.drop_front();
1867
1868 IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewTys,
1869 ICA.getFlags());
1870 return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
1871 }
1872 }
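    // Example of the mapping above (a sketch, not an exhaustive list):
    // vp.add on <4 x i32> is costed like a plain add <4 x i32>, vp.load and
    // vp.store like ordinary loads/stores of the same type, and
    // vp.reduce.fadd keeps its start-value operand because the non-VP
    // vector_reduce_fadd takes one as well.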
1873
 1874 // Assume that we need to scalarize this intrinsic.
1875 // Compute the scalarization overhead based on Args for a vector
1876 // intrinsic.
1877 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1878 if (RetVF.isVector() && !RetVF.isScalable()) {
1879 ScalarizationCost = 0;
1880 if (!RetTy->isVoidTy())
1881 ScalarizationCost += getScalarizationOverhead(
1882 cast<VectorType>(RetTy),
1883 /*Insert*/ true, /*Extract*/ false, CostKind);
1884 ScalarizationCost +=
1886 }
1887
1888 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1889 ScalarizationCost);
1890 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1891 }
1892
1893 /// Get intrinsic cost based on argument types.
1894 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1895 /// cost of scalarizing the arguments and the return value will be computed
1896 /// based on types.
1900 Intrinsic::ID IID = ICA.getID();
1901 Type *RetTy = ICA.getReturnType();
1902 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1903 FastMathFlags FMF = ICA.getFlags();
1904 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1905 bool SkipScalarizationCost = ICA.skipScalarizationCost();
1906
1907 VectorType *VecOpTy = nullptr;
1908 if (!Tys.empty()) {
1909 // The vector reduction operand is operand 0 except for fadd/fmul.
1910 // Their operand 0 is a scalar start value, so the vector op is operand 1.
1911 unsigned VecTyIndex = 0;
1912 if (IID == Intrinsic::vector_reduce_fadd ||
1913 IID == Intrinsic::vector_reduce_fmul)
1914 VecTyIndex = 1;
1915 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
1916 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1917 }
1918
1919 // Library call cost - other than size, make it expensive.
1920 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1921 unsigned ISD = 0;
1922 switch (IID) {
1923 default: {
1924 // Scalable vectors cannot be scalarized, so return Invalid.
1925 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1926 return isa<ScalableVectorType>(Ty);
1927 }))
1929
1930 // Assume that we need to scalarize this intrinsic.
1931 InstructionCost ScalarizationCost =
1932 SkipScalarizationCost ? ScalarizationCostPassed : 0;
1933 unsigned ScalarCalls = 1;
1934 Type *ScalarRetTy = RetTy;
1935 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1936 if (!SkipScalarizationCost)
1937 ScalarizationCost = getScalarizationOverhead(
1938 RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind);
1939 ScalarCalls = std::max(ScalarCalls,
1940 cast<FixedVectorType>(RetVTy)->getNumElements());
1941 ScalarRetTy = RetTy->getScalarType();
1942 }
1943 SmallVector<Type *, 4> ScalarTys;
1944 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1945 Type *Ty = Tys[i];
1946 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1947 if (!SkipScalarizationCost)
1948 ScalarizationCost += getScalarizationOverhead(
1949 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
1950 ScalarCalls = std::max(ScalarCalls,
1951 cast<FixedVectorType>(VTy)->getNumElements());
1952 Ty = Ty->getScalarType();
1953 }
1954 ScalarTys.push_back(Ty);
1955 }
1956 if (ScalarCalls == 1)
1957 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1958
1959 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1960 InstructionCost ScalarCost =
1961 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1962
1963 return ScalarCalls * ScalarCost + ScalarizationCost;
1964 }
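    // Scalarization sketch for the default case above (assuming a fixed
    // <4 x float> return type and one <4 x float> argument): ScalarCalls is
    // 4, so the estimate is 4 * (cost of the scalar intrinsic) plus the
    // overhead of inserting four results and extracting four operands.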
1965 // Look for intrinsics that can be lowered directly or turned into a scalar
1966 // intrinsic call.
1967 case Intrinsic::sqrt:
1968 ISD = ISD::FSQRT;
1969 break;
1970 case Intrinsic::sin:
1971 ISD = ISD::FSIN;
1972 break;
1973 case Intrinsic::cos:
1974 ISD = ISD::FCOS;
1975 break;
1976 case Intrinsic::exp:
1977 ISD = ISD::FEXP;
1978 break;
1979 case Intrinsic::exp2:
1980 ISD = ISD::FEXP2;
1981 break;
1982 case Intrinsic::exp10:
1983 ISD = ISD::FEXP10;
1984 break;
1985 case Intrinsic::log:
1986 ISD = ISD::FLOG;
1987 break;
1988 case Intrinsic::log10:
1989 ISD = ISD::FLOG10;
1990 break;
1991 case Intrinsic::log2:
1992 ISD = ISD::FLOG2;
1993 break;
1994 case Intrinsic::fabs:
1995 ISD = ISD::FABS;
1996 break;
1997 case Intrinsic::canonicalize:
1998 ISD = ISD::FCANONICALIZE;
1999 break;
2000 case Intrinsic::minnum:
2001 ISD = ISD::FMINNUM;
2002 break;
2003 case Intrinsic::maxnum:
2004 ISD = ISD::FMAXNUM;
2005 break;
2006 case Intrinsic::minimum:
2007 ISD = ISD::FMINIMUM;
2008 break;
2009 case Intrinsic::maximum:
2010 ISD = ISD::FMAXIMUM;
2011 break;
2012 case Intrinsic::copysign:
2013 ISD = ISD::FCOPYSIGN;
2014 break;
2015 case Intrinsic::floor:
2016 ISD = ISD::FFLOOR;
2017 break;
2018 case Intrinsic::ceil:
2019 ISD = ISD::FCEIL;
2020 break;
2021 case Intrinsic::trunc:
2022 ISD = ISD::FTRUNC;
2023 break;
2024 case Intrinsic::nearbyint:
2025 ISD = ISD::FNEARBYINT;
2026 break;
2027 case Intrinsic::rint:
2028 ISD = ISD::FRINT;
2029 break;
2030 case Intrinsic::lrint:
2031 ISD = ISD::LRINT;
2032 break;
2033 case Intrinsic::llrint:
2034 ISD = ISD::LLRINT;
2035 break;
2036 case Intrinsic::round:
2037 ISD = ISD::FROUND;
2038 break;
2039 case Intrinsic::roundeven:
2040 ISD = ISD::FROUNDEVEN;
2041 break;
2042 case Intrinsic::pow:
2043 ISD = ISD::FPOW;
2044 break;
2045 case Intrinsic::fma:
2046 ISD = ISD::FMA;
2047 break;
2048 case Intrinsic::fmuladd:
2049 ISD = ISD::FMA;
2050 break;
2051 case Intrinsic::experimental_constrained_fmuladd:
2052 ISD = ISD::STRICT_FMA;
2053 break;
2054 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
2055 case Intrinsic::lifetime_start:
2056 case Intrinsic::lifetime_end:
2057 case Intrinsic::sideeffect:
2058 case Intrinsic::pseudoprobe:
2059 case Intrinsic::arithmetic_fence:
2060 return 0;
2061 case Intrinsic::masked_store: {
2062 Type *Ty = Tys[0];
2063 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2064 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
2065 CostKind);
2066 }
2067 case Intrinsic::masked_load: {
2068 Type *Ty = RetTy;
2069 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2070 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
2071 CostKind);
2072 }
2073 case Intrinsic::vector_reduce_add:
2074 case Intrinsic::vector_reduce_mul:
2075 case Intrinsic::vector_reduce_and:
2076 case Intrinsic::vector_reduce_or:
2077 case Intrinsic::vector_reduce_xor:
2078 return thisT()->getArithmeticReductionCost(
2079 getArithmeticReductionInstruction(IID), VecOpTy, std::nullopt,
2080 CostKind);
2081 case Intrinsic::vector_reduce_fadd:
2082 case Intrinsic::vector_reduce_fmul:
2083 return thisT()->getArithmeticReductionCost(
2084 getArithmeticReductionInstruction(IID), VecOpTy, FMF, CostKind);
2085 case Intrinsic::vector_reduce_smax:
2086 case Intrinsic::vector_reduce_smin:
2087 case Intrinsic::vector_reduce_umax:
2088 case Intrinsic::vector_reduce_umin:
2089 case Intrinsic::vector_reduce_fmax:
2090 case Intrinsic::vector_reduce_fmin:
2091 case Intrinsic::vector_reduce_fmaximum:
2092 case Intrinsic::vector_reduce_fminimum:
2093 return thisT()->getMinMaxReductionCost(getMinMaxReductionIntrinsicOp(IID),
2094 VecOpTy, ICA.getFlags(), CostKind);
2095 case Intrinsic::abs: {
2096 // abs(X) = select(icmp(X,0),X,sub(0,X))
2097 Type *CondTy = RetTy->getWithNewBitWidth(1);
2100 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2101 Pred, CostKind);
2102 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2103 Pred, CostKind);
2104 // TODO: Should we add an OperandValueProperties::OP_Zero property?
2105 Cost += thisT()->getArithmeticInstrCost(
2106 BinaryOperator::Sub, RetTy, CostKind, {TTI::OK_UniformConstantValue, TTI::OP_None});
2107 return Cost;
2108 }
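    // A small worked example (not target-specific): a scalar i32 abs is
    // modelled as one compare against zero, one select and one subtract from
    // zero, per the select(icmp(X,0),X,sub(0,X)) pattern noted above.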
2109 case Intrinsic::smax:
2110 case Intrinsic::smin:
2111 case Intrinsic::umax:
2112 case Intrinsic::umin: {
2113 // minmax(X,Y) = select(icmp(X,Y),X,Y)
2114 Type *CondTy = RetTy->getWithNewBitWidth(1);
2115 bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
2116 CmpInst::Predicate Pred =
2117 IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
2119 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2120 Pred, CostKind);
2121 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2122 Pred, CostKind);
2123 return Cost;
2124 }
2125 case Intrinsic::sadd_sat:
2126 case Intrinsic::ssub_sat: {
2127 Type *CondTy = RetTy->getWithNewBitWidth(1);
2128
2129 Type *OpTy = StructType::create({RetTy, CondTy});
2130 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
2131 ? Intrinsic::sadd_with_overflow
2132 : Intrinsic::ssub_with_overflow;
2134
2135 // SatMax -> Overflow && SumDiff < 0
2136 // SatMin -> Overflow && SumDiff >= 0
2138 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2139 nullptr, ScalarizationCostPassed);
2140 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2141 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2142 Pred, CostKind);
2143 Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
2144 CondTy, Pred, CostKind);
2145 return Cost;
2146 }
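    // A rough sketch for i32 sadd.sat: the cost of sadd.with.overflow plus
    // one compare on the wrapped sum and two selects that pick between
    // SatMax, SatMin and the sum, i.e. the four costs accumulated above.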
2147 case Intrinsic::uadd_sat:
2148 case Intrinsic::usub_sat: {
2149 Type *CondTy = RetTy->getWithNewBitWidth(1);
2150
2151 Type *OpTy = StructType::create({RetTy, CondTy});
2152 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
2153 ? Intrinsic::uadd_with_overflow
2154 : Intrinsic::usub_with_overflow;
2155
2157 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2158 nullptr, ScalarizationCostPassed);
2159 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2160 Cost +=
2161 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2163 return Cost;
2164 }
2165 case Intrinsic::smul_fix:
2166 case Intrinsic::umul_fix: {
2167 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
2168 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
2169
2170 unsigned ExtOp =
2171 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
2173
2175 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
2176 Cost +=
2177 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2178 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
2179 CCH, CostKind);
2180 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
2181 CostKind,
2184 Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
2187 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
2188 return Cost;
2189 }
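    // A rough reading of the terms above for i32 smul.fix: two sign
    // extensions to i64, one i64 multiply, two truncations back to i32, a
    // logical shift right and a shift left by uniform constants, and a final
    // or to recombine the halves.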
2190 case Intrinsic::sadd_with_overflow:
2191 case Intrinsic::ssub_with_overflow: {
2192 Type *SumTy = RetTy->getContainedType(0);
2193 Type *OverflowTy = RetTy->getContainedType(1);
2194 unsigned Opcode = IID == Intrinsic::sadd_with_overflow
2195 ? BinaryOperator::Add
2196 : BinaryOperator::Sub;
2197
2198 // Add:
2199 // Overflow -> (Result < LHS) ^ (RHS < 0)
2200 // Sub:
2201 // Overflow -> (Result < LHS) ^ (RHS > 0)
2203 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2204 Cost += 2 * thisT()->getCmpSelInstrCost(
2205 Instruction::ICmp, SumTy, OverflowTy,
2207 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
2208 CostKind);
2209 return Cost;
2210 }
2211 case Intrinsic::uadd_with_overflow:
2212 case Intrinsic::usub_with_overflow: {
2213 Type *SumTy = RetTy->getContainedType(0);
2214 Type *OverflowTy = RetTy->getContainedType(1);
2215 unsigned Opcode = IID == Intrinsic::uadd_with_overflow
2216 ? BinaryOperator::Add
2217 : BinaryOperator::Sub;
2218 CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
2221
2223 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2224 Cost +=
2225 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
2226 Pred, CostKind);
2227 return Cost;
2228 }
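    // A quick example (actual numbers depend on the target): uadd.with.overflow
    // on i32 boils down to one add plus one unsigned compare of the result
    // against an operand (overflow iff the result is smaller than the LHS),
    // which is exactly the two costs added above.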
2229 case Intrinsic::smul_with_overflow:
2230 case Intrinsic::umul_with_overflow: {
2231 Type *MulTy = RetTy->getContainedType(0);
2232 Type *OverflowTy = RetTy->getContainedType(1);
2233 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
2234 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
2235 bool IsSigned = IID == Intrinsic::smul_with_overflow;
2236
2237 unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
2239
2241 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
2242 Cost +=
2243 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2244 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
2245 CCH, CostKind);
2246 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy,
2247 CostKind,
2250
2251 if (IsSigned)
2252 Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
2253 CostKind,
2256
2257 Cost += thisT()->getCmpSelInstrCost(
2258 BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
2259 return Cost;
2260 }
2261 case Intrinsic::fptosi_sat:
2262 case Intrinsic::fptoui_sat: {
2263 if (Tys.empty())
2264 break;
2265 Type *FromTy = Tys[0];
2266 bool IsSigned = IID == Intrinsic::fptosi_sat;
2267
2269 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy,
2270 {FromTy, FromTy});
2271 Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind);
2272 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy,
2273 {FromTy, FromTy});
2274 Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind);
2275 Cost += thisT()->getCastInstrCost(
2276 IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy,
2278 if (IsSigned) {
2279 Type *CondTy = RetTy->getWithNewBitWidth(1);
2280 Cost += thisT()->getCmpSelInstrCost(
2281 BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2282 Cost += thisT()->getCmpSelInstrCost(
2283 BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2284 }
2285 return Cost;
2286 }
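    // Cost sketch (assuming no native saturating conversion): clamp the input
    // with minnum and maxnum, convert with fptosi/fptoui, and for the signed
    // case add an fcmp uno plus a select so NaN inputs produce zero, matching
    // the terms accumulated above.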
2287 case Intrinsic::ctpop:
2288 ISD = ISD::CTPOP;
2289 // In case of legalization use TCC_Expensive. This is cheaper than a
2290 // library call but still not a cheap instruction.
2291 SingleCallCost = TargetTransformInfo::TCC_Expensive;
2292 break;
2293 case Intrinsic::ctlz:
2294 ISD = ISD::CTLZ;
2295 break;
2296 case Intrinsic::cttz:
2297 ISD = ISD::CTTZ;
2298 break;
2299 case Intrinsic::bswap:
2300 ISD = ISD::BSWAP;
2301 break;
2302 case Intrinsic::bitreverse:
2303 ISD = ISD::BITREVERSE;
2304 break;
2305 }
2306
2307 const TargetLoweringBase *TLI = getTLI();
2308 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
2309
2310 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
2311 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
2312 TLI->isFAbsFree(LT.second)) {
2313 return 0;
2314 }
2315
2316 // The operation is legal. Assume it costs 1.
 2317 // If the type is split into multiple registers, assume that there is some
2318 // overhead to this.
2319 // TODO: Once we have extract/insert subvector cost we need to use them.
2320 if (LT.first > 1)
2321 return (LT.first * 2);
2322 else
2323 return (LT.first * 1);
2324 } else if (!TLI->isOperationExpand(ISD, LT.second)) {
2325 // If the operation is custom lowered then assume
2326 // that the code is twice as expensive.
2327 return (LT.first * 2);
2328 }
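    // Illustrative arithmetic for the branches above: if RetTy legalizes to
    // two registers (LT.first == 2), a legal operation is charged 2 * 2 = 4,
    // as is a custom-lowered one; a single-register legal operation costs 1.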
2329
2330 // If we can't lower fmuladd into an FMA estimate the cost as a floating
2331 // point mul followed by an add.
2332 if (IID == Intrinsic::fmuladd)
2333 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
2334 CostKind) +
2335 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
2336 CostKind);
2337 if (IID == Intrinsic::experimental_constrained_fmuladd) {
2338 IntrinsicCostAttributes FMulAttrs(
2339 Intrinsic::experimental_constrained_fmul, RetTy, Tys);
2340 IntrinsicCostAttributes FAddAttrs(
2341 Intrinsic::experimental_constrained_fadd, RetTy, Tys);
2342 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
2343 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
2344 }
2345
2346 // Else, assume that we need to scalarize this intrinsic. For math builtins
2347 // this will emit a costly libcall, adding call overhead and spills. Make it
2348 // very expensive.
2349 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2350 // Scalable vectors cannot be scalarized, so return Invalid.
2351 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2352 return isa<ScalableVectorType>(Ty);
2353 }))
2355
2356 InstructionCost ScalarizationCost =
2357 SkipScalarizationCost
2358 ? ScalarizationCostPassed
2359 : getScalarizationOverhead(RetVTy, /*Insert*/ true,
2360 /*Extract*/ false, CostKind);
2361
2362 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
2363 SmallVector<Type *, 4> ScalarTys;
2364 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
2365 Type *Ty = Tys[i];
2366 if (Ty->isVectorTy())
2367 Ty = Ty->getScalarType();
2368 ScalarTys.push_back(Ty);
2369 }
2370 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
2371 InstructionCost ScalarCost =
2372 thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2373 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
2374 if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
2375 if (!ICA.skipScalarizationCost())
2376 ScalarizationCost += getScalarizationOverhead(
2377 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
2378 ScalarCalls = std::max(ScalarCalls,
2379 cast<FixedVectorType>(VTy)->getNumElements());
2380 }
2381 }
2382 return ScalarCalls * ScalarCost + ScalarizationCost;
2383 }
2384
 2385 // This is going to be turned into a library call, so make it expensive.
2386 return SingleCallCost;
2387 }
2388
 2389 /// Compute the cost of the given call instruction.
 2390 ///
 2391 /// Compute the cost of calling function F with return type RetTy and
 2392 /// argument types Tys. F might be nullptr; in that case the cost of an
 2393 /// arbitrary call with the specified signature will be returned.
 2394 /// This is used, for instance, when we estimate the cost of calling a
 2395 /// vector counterpart of the given function.
 2396 /// \param F Called function, might be nullptr.
 2397 /// \param RetTy Return value type.
 2398 /// \param Tys Argument types.
 2399 /// \returns The cost of the call instruction.
2401 ArrayRef<Type *> Tys,
2403 return 10;
2404 }
2405
2406 unsigned getNumberOfParts(Type *Tp) {
2407 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
2408 return LT.first.isValid() ? *LT.first.getValue() : 0;
2409 }
2410
2412 const SCEV *) {
2413 return 0;
2414 }
2415
2416 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
 2417 /// We're assuming that reduction operations are performed in the following way:
2418 ///
2419 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
2420 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
2421 /// \----------------v-------------/ \----------v------------/
2422 /// n/2 elements n/2 elements
 2423 /// %red1 = op <n x t> %val, <n x t> %val1
 2424 /// After this operation we have a vector %red1 where only the first n/2
 2425 /// elements are meaningful; the second n/2 elements are undefined and can be
 2426 /// dropped. All subsequent operations actually work with a vector of
 2427 /// length n/2, not n, though the real vector length is still n.
2428 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2429 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
2430 /// \----------------v-------------/ \----------v------------/
2431 /// n/4 elements 3*n/4 elements
 2432 /// %red2 = op <n x t> %red1, <n x t> %val2 - working with the vector of
 2433 /// length n/2; the resulting vector has length n/4, and so on.
2434 ///
2435 /// The cost model should take into account that the actual length of the
2436 /// vector is reduced on each iteration.
2439 // Targets must implement a default value for the scalable case, since
2440 // we don't know how many lanes the vector has.
2441 if (isa<ScalableVectorType>(Ty))
2443
2444 Type *ScalarTy = Ty->getElementType();
2445 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2446 if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2447 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2448 NumVecElts >= 2) {
2449 // Or reduction for i1 is represented as:
2450 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2451 // %res = cmp ne iReduxWidth %val, 0
2452 // And reduction for i1 is represented as:
2453 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
 2454 // %res = cmp eq iReduxWidth %val, -1 (all bits set)
2455 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2456 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2458 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2461 }
2462 unsigned NumReduxLevels = Log2_32(NumVecElts);
2463 InstructionCost ArithCost = 0;
2464 InstructionCost ShuffleCost = 0;
2465 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2466 unsigned LongVectorCount = 0;
2467 unsigned MVTLen =
2468 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2469 while (NumVecElts > MVTLen) {
2470 NumVecElts /= 2;
2471 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2472 ShuffleCost +=
2473 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2474 CostKind, NumVecElts, SubTy);
2475 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2476 Ty = SubTy;
2477 ++LongVectorCount;
2478 }
2479
2480 NumReduxLevels -= LongVectorCount;
2481
2482 // The minimal length of the vector is limited by the real length of vector
2483 // operations performed on the current platform. That's why several final
2484 // reduction operations are performed on the vectors with the same
2485 // architecture-dependent length.
2486
2487 // By default reductions need one shuffle per reduction level.
2488 ShuffleCost +=
2489 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2490 std::nullopt, CostKind, 0, Ty);
2491 ArithCost +=
2492 NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
2493 return ShuffleCost + ArithCost +
2494 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2495 CostKind, 0, nullptr, nullptr);
2496 }
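  // Worked example for the tree reduction above (a sketch, assuming an
  // <8 x float> fadd reduction whose legal type is <4 x float>): one
  // extract-subvector shuffle plus one <4 x float> fadd to halve the width,
  // then two remaining permute + fadd levels on <4 x float>, and finally a
  // single extractelement of lane 0.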
2497
2498 /// Try to calculate the cost of performing strict (in-order) reductions,
2499 /// which involves doing a sequence of floating point additions in lane
2500 /// order, starting with an initial value. For example, consider a scalar
2501 /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2502 ///
2503 /// Vector = <float %v0, float %v1, float %v2, float %v3>
2504 ///
2505 /// %add1 = %InitVal + %v0
2506 /// %add2 = %add1 + %v1
2507 /// %add3 = %add2 + %v2
2508 /// %add4 = %add3 + %v3
2509 ///
2510 /// As a simple estimate we can say the cost of such a reduction is 4 times
2511 /// the cost of a scalar FP addition. We can only estimate the costs for
2512 /// fixed-width vectors here because for scalable vectors we do not know the
2513 /// runtime number of operations.
2516 // Targets must implement a default value for the scalable case, since
2517 // we don't know how many lanes the vector has.
2518 if (isa<ScalableVectorType>(Ty))
2520
2521 auto *VTy = cast<FixedVectorType>(Ty);
2523 VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
2524 InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2525 Opcode, VTy->getElementType(), CostKind);
2526 ArithCost *= VTy->getNumElements();
2527
2528 return ExtractCost + ArithCost;
2529 }
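  // Roughly, the <4 x float> example in the comment above is charged as four
  // element extracts plus four scalar fadds; the scalable case is left to the
  // target, as noted above.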
2530
2532 std::optional<FastMathFlags> FMF,
2534 assert(Ty && "Unknown reduction vector type");
2536 return getOrderedReductionCost(Opcode, Ty, CostKind);
2537 return getTreeReductionCost(Opcode, Ty, CostKind);
2538 }
2539
 2540 /// Try to calculate op costs for min/max reduction operations.
 2541 /// \param Ty Vector type of the reduction operand.
2543 FastMathFlags FMF,
2545 // Targets must implement a default value for the scalable case, since
2546 // we don't know how many lanes the vector has.
2547 if (isa<ScalableVectorType>(Ty))
2549
2550 Type *ScalarTy = Ty->getElementType();
2551 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2552 unsigned NumReduxLevels = Log2_32(NumVecElts);
2553 InstructionCost MinMaxCost = 0;
2554 InstructionCost ShuffleCost = 0;
2555 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2556 unsigned LongVectorCount = 0;
2557 unsigned MVTLen =
2558 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2559 while (NumVecElts > MVTLen) {
2560 NumVecElts /= 2;
2561 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2562
2563 ShuffleCost +=
2564 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2565 CostKind, NumVecElts, SubTy);
2566
2567 IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF);
2568 MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind);
2569 Ty = SubTy;
2570 ++LongVectorCount;
2571 }
2572
2573 NumReduxLevels -= LongVectorCount;
2574
2575 // The minimal length of the vector is limited by the real length of vector
2576 // operations performed on the current platform. That's why several final
 2577 // reduction operations are performed on the vectors with the same
2578 // architecture-dependent length.
2579 ShuffleCost +=
2580 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2581 std::nullopt, CostKind, 0, Ty);
2582 IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF);
2583 MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind);
2584 // The last min/max should be in vector registers and we counted it above.
 2585 // So we just need a single extractelement.
2586 return ShuffleCost + MinMaxCost +
2587 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2588 CostKind, 0, nullptr, nullptr);
2589 }
2590
2591 InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,
2592 Type *ResTy, VectorType *Ty,
2593 FastMathFlags FMF,
2595 // Without any native support, this is equivalent to the cost of
2596 // vecreduce.opcode(ext(Ty A)).
2597 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2598 InstructionCost RedCost =
2599 thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind);
2600 InstructionCost ExtCost = thisT()->getCastInstrCost(
2601 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2603
2604 return RedCost + ExtCost;
2605 }
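  // e.g. (a sketch) an unsigned i8 -> i32 add reduction with no native
  // support is charged as the i32 vector reduction plus one zext of the whole
  // source vector, per RedCost + ExtCost above.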
2606
2608 VectorType *Ty,
2610 // Without any native support, this is equivalent to the cost of
2611 // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
2612 // vecreduce.add(mul(A, B)).
2613 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2614 InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2615 Instruction::Add, ExtTy, std::nullopt, CostKind);
2616 InstructionCost ExtCost = thisT()->getCastInstrCost(
2617 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2619
2620 InstructionCost MulCost =
2621 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2622
2623 return RedCost + MulCost + 2 * ExtCost;
2624 }
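  // e.g. (a sketch) an unsigned i8 -> i32 multiply-accumulate reduction is
  // modelled as two zexts of the sources, one i32 vector multiply and one
  // i32 add reduction, matching RedCost + MulCost + 2 * ExtCost above.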
2625
2627
2628 /// @}
2629};
2630
2631/// Concrete BasicTTIImpl that can be used if no further customization
2632/// is needed.
2633class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2635
2636 friend class BasicTTIImplBase<BasicTTIImpl>;
2637
2638 const TargetSubtargetInfo *ST;
2639 const TargetLoweringBase *TLI;
2640
2641 const TargetSubtargetInfo *getST() const { return ST; }
2642 const TargetLoweringBase *getTLI() const { return TLI; }
2643
2644public:
2645 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2646};
2647
2648} // end namespace llvm
2649
2650#endif // LLVM_CODEGEN_BASICTTIIMPL_H
This file implements a class to represent arbitrary precision integral constant values and operations...
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
LLVMContext & Context
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
const char LLVMTargetMachineRef TM
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This file describes how to lower LLVM code to machine code.
This file provides helpers for the implementation of a TargetTransformInfo-conforming class.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition: APInt.h:76
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:212
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1308
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition: APInt.h:1179
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1439
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition: APInt.h:1108
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:178
an instruction to allocate memory on the stack
Definition: Instructions.h:59
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:204
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:210
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
Base class which can be used to help build a TTI implementation.
Definition: BasicTTIImpl.h:81
bool isTypeLegal(Type *Ty)
Definition: BasicTTIImpl.h:430
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const
Definition: BasicTTIImpl.h:287
virtual unsigned getPrefetchDistance() const
Definition: BasicTTIImpl.h:724
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:584
bool preferToKeepConstantsAttached(const Instruction &Inst, const Function &Fn) const
Definition: BasicTTIImpl.h:557
unsigned getMaxInterleaveFactor(ElementCount VF)
Definition: BasicTTIImpl.h:891
unsigned getNumberOfParts(Type *Tp)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
Definition: BasicTTIImpl.h:753
std::optional< unsigned > getVScaleForTuning() const
Definition: BasicTTIImpl.h:758
InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind)
Try to calculate the cost of performing strict (in-order) reductions, which involves doing a sequence...
bool isTruncateFree(Type *Ty1, Type *Ty2)
Definition: BasicTTIImpl.h:420
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo)
Definition: BasicTTIImpl.h:664
InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind)
Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
Definition: BasicTTIImpl.h:671
virtual bool shouldPrefetchAddressSpace(unsigned AS) const
Definition: BasicTTIImpl.h:744
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
bool isLegalICmpImmediate(int64_t imm)
Definition: BasicTTIImpl.h:336
bool isProfitableToHoist(Instruction *I)
Definition: BasicTTIImpl.h:424
virtual unsigned getMaxPrefetchIterationsAhead() const
Definition: BasicTTIImpl.h:736
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, TTI::TargetCostKind CostKind, unsigned Index)
std::optional< unsigned > getMaxVScale() const
Definition: BasicTTIImpl.h:757
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
Definition: BasicTTIImpl.h:971
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
unsigned getRegUsageForType(Type *Ty)
Definition: BasicTTIImpl.h:435
bool shouldBuildRelLookupTables() const
Definition: BasicTTIImpl.h:511
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const
Definition: BasicTTIImpl.h:578
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JumpTableSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
Definition: BasicTTIImpl.h:446
bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const
Definition: BasicTTIImpl.h:377
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2)
Definition: BasicTTIImpl.h:389
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed)
Definition: BasicTTIImpl.h:686
bool shouldFoldTerminatingConditionAfterLSR() const
Definition: BasicTTIImpl.h:397
virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const
Definition: BasicTTIImpl.h:728
bool hasBranchDivergence(const Function *F=nullptr)
Definition: BasicTTIImpl.h:281
bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const
Definition: BasicTTIImpl.h:383
unsigned getAssumedAddrSpace(const Value *V) const
Definition: BasicTTIImpl.h:309
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instructions unique non-constant operands.
Definition: BasicTTIImpl.h:810
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:764
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset)
Definition: BasicTTIImpl.h:353
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:440
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty)
Definition: BasicTTIImpl.h:543
virtual std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const
Definition: BasicTTIImpl.h:704
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace)
Definition: BasicTTIImpl.h:406
bool isAlwaysUniform(const Value *V)
Definition: BasicTTIImpl.h:285
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true)
Definition: BasicTTIImpl.h:676
bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace, Align Alignment, unsigned *Fast) const
Definition: BasicTTIImpl.h:273
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
Definition: BasicTTIImpl.h:357
InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
Definition: BasicTTIImpl.h:794
virtual std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
Definition: BasicTTIImpl.h:710
virtual bool enableWritePrefetching() const
Definition: BasicTTIImpl.h:740
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
Definition: BasicTTIImpl.h:323
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:656
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
Definition: BasicTTIImpl.h:300
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:893
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getFPOpCost(Type *Ty)
Definition: BasicTTIImpl.h:547
InstructionCost getVectorSplitCost()
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:857
bool haveFastSqrt(Type *Ty)
Definition: BasicTTIImpl.h:536
std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const
Definition: BasicTTIImpl.h:319
unsigned getInliningThresholdMultiplier() const
Definition: BasicTTIImpl.h:576
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind)
virtual ~BasicTTIImplBase()=default
bool isLegalAddScalableImmediate(int64_t Imm)
Definition: BasicTTIImpl.h:332
InstructionCost getScalarizationOverhead(VectorType *RetTy, ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing the inputs and outputs of an instruction, with return type RetTy...
Definition: BasicTTIImpl.h:839
bool isVScaleKnownToBeAPowerOfTwo() const
Definition: BasicTTIImpl.h:759
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II)
Definition: BasicTTIImpl.h:680
bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const
Definition: BasicTTIImpl.h:291
bool isLegalAddImmediate(int64_t imm)
Definition: BasicTTIImpl.h:328
unsigned getFlatAddressSpace()
Definition: BasicTTIImpl.h:295
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
virtual unsigned getCacheLineSize() const
Definition: BasicTTIImpl.h:720
bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const
Definition: BasicTTIImpl.h:305
bool isSourceOfDivergence(const Value *V)
Definition: BasicTTIImpl.h:283
int getInlinerVectorBonusPercent() const
Definition: BasicTTIImpl.h:582
InstructionCost getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on argument types.
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp)
Definition: BasicTTIImpl.h:693
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:340
bool isSingleThreaded() const
Definition: BasicTTIImpl.h:313
BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
Definition: BasicTTIImpl.h:264
unsigned adjustInliningThreshold(const CallBase *CB)
Definition: BasicTTIImpl.h:577
bool isProfitableLSRChainElement(Instruction *I)
Definition: BasicTTIImpl.h:402
Concrete BasicTTIImpl that can be used if no further customization is needed.
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
BitVector & set()
Definition: BitVector.h:351
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1494
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1362
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:1016
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:1020
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:1018
@ ICMP_EQ
equal
Definition: InstrTypes.h:1014
@ ICMP_NE
not equal
Definition: InstrTypes.h:1015
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:1003
This class represents a range of values.
Definition: ConstantRange.h:47
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:484
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
Definition: DataLayout.h:420
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:323
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:308
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:319
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
unsigned getNumElements() const
Definition: DerivedTypes.h:582
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
bool isTargetIntrinsic() const
isTargetIntrinsic - Returns true if this function is an intrinsic and the intrinsic is specific to a ...
Definition: Function.cpp:889
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:340
The core instruction combiner logic.
Definition: InstCombiner.h:47
static InstructionCost getInvalid(CostType Val=0)
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:252
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
InstructionCost getScalarizationCost() const
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
virtual bool shouldPrefetchAddressSpace(unsigned AS) const
virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const
Return the minimum stride necessary to trigger software prefetching.
virtual bool enableWritePrefetching() const
virtual unsigned getMaxPrefetchIterationsAhead() const
Return the maximum prefetch distance in terms of loop iterations.
virtual unsigned getPrefetchDistance() const
Return the preferred prefetch distance in terms of instructions.
virtual std::optional< unsigned > getCacheAssociativity(unsigned Level) const
Return the cache associatvity for the given level of cache.
virtual std::optional< unsigned > getCacheLineSize(unsigned Level) const
Return the target cache line size in bytes at a given level.
Machine Value Type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Analysis providing profile information.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isSpliceMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is a splice mask, concatenating the two inputs together and then ext...
static bool isSelectMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from its source vectors without lane crossings.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isTransposeMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask is a transpose mask.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
size_type size() const
Definition: SmallPtrSet.h:94
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:43
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:42
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:513
Multiway switch.
Provides information about what library functions are available for the current target.
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
bool isIndexedStoreLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool isLegalICmpImmediate(int64_t) const
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
const TargetMachine & getTargetMachine() const
virtual bool isZExtFree(Type *FromTy, Type *ToTy) const
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
virtual bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases, uint64_t Range, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) const
Return true if lowering to a jump table is suitable for a set of case clusters which may contain NumC...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
virtual bool isCheapToSpeculateCttz(Type *Ty) const
Return true if it is cheap to speculate a call to intrinsic cttz.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
unsigned getBitWidthForCttzElements(Type *RetTy, ElementCount EC, bool ZeroIsPoison, const ConstantRange *VScaleRange) const
Return the minimum number of bits required to hold the maximum possible number of trailing zero vecto...
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
virtual bool isTruncateFree(Type *FromTy, Type *ToTy) const
Return true if it's free to truncate a value of type FromTy to type ToTy.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps, const APInt &Low, const APInt &High, const DataLayout &DL) const
Return true if lowering to a bit test is suitable for a set of case clusters which contains NumDests ...
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
virtual bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
LegalizeAction getTruncStoreAction(EVT ValVT, EVT MemVT) const
Return how this store with truncation should be treated: either it is legal, needs to be promoted to ...
LegalizeAction getLoadExtAction(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return how this load with extension should be treated: either it is legal, needs to be promoted to a ...
virtual bool isIntDivCheap(EVT VT, AttributeList Attr) const
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
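These TargetLoweringBase queries are usually chained; a minimal sketch of the pattern, assuming TLI, DL and Ty are already in scope:
  // Assumed in scope: const llvm::TargetLoweringBase *TLI;
  //                   const llvm::DataLayout &DL; llvm::Type *Ty;
  int ISD = TLI->InstructionOpcodeToISD(llvm::Instruction::Mul);
  llvm::EVT VT = TLI->getValueType(DL, Ty);
  bool CheapEnough = TLI->isOperationLegalOrCustom(ISD, VT);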
virtual bool isProfitableToHoist(Instruction *I) const
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
virtual bool isCheapToSpeculateCtlz(Type *Ty) const
Return true if it is cheap to speculate a call to intrinsic ctlz.
virtual int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const
Return the preferred common base offset.
LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const
Return pair that represents the legalization kind (first) that needs to happen to EVT (second) in ord...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
virtual bool isLegalAddScalableImmediate(int64_t) const
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
bool isBeneficialToExpandPowI(int64_t Exponent, bool OptForSize) const
Return true if it is beneficial to expand an @llvm.powi.* intrinsic.
virtual bool isFAbsFree(EVT VT) const
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
virtual bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AddrSpace, Instruction *I=nullptr) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:76
virtual std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const
If the specified predicate checks whether a generic pointer falls within a specified address space,...
virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast between SrcAS and DestAS is a noop.
virtual unsigned getAssumedAddrSpace(const Value *V) const
If the specified generic pointer could be assumed as a pointer to a specific address space,...
TargetOptions Options
ThreadModel::Model ThreadModel
ThreadModel - This flag specifies the type of threading model to assume for things like atomics.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual bool useAA() const
Enable use of alias analysis during code generation (during MI scheduling, DAGCombine,...
const DataLayout & getDataLayout() const
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isProfitableLSRChainElement(Instruction *I) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) const
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info, ArrayRef< const Value * > Args, const Instruction *CxtI=nullptr) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I) const
bool isLoweredToCall(const Function *F) const
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
CRTP base class for use as a mix-in that aids implementing a TargetTransformInfo-compatible class.
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Basic
The cost of a typical 'add' instruction.
MemIndexedMode
The type of load/store indexing.
@ MIM_PostInc
Post-incrementing.
@ MIM_PostDec
Post-decrementing.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
CacheLevel
The possible cache levels.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:373
bool isArch64Bit() const
Test whether the architecture is 64-bit.
Definition: Triple.cpp:1659
bool isOSDarwin() const
Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, XROS, or DriverKit).
Definition: Triple.h:558
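A minimal query sketch (the triple string is illustrative only):
  #include "llvm/TargetParser/Triple.h"
  llvm::Triple T("arm64-apple-macosx14.0");
  bool Is64Bit = T.isArch64Bit();            // true
  bool Darwin = T.isOSDarwin();              // true
  llvm::Triple::ArchType Arch = T.getArch(); // llvm::Triple::aarch64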
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:342
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:234
static IntegerType * getInt1Ty(LLVMContext &C)
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
static IntegerType * getInt8Ty(LLVMContext &C)
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:262
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:216
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
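A minimal sketch of these Type queries, assuming an llvm::Type *Ty (e.g. <4 x i16>) is already in scope:
  if (Ty->isVectorTy()) {
    llvm::Type *Elt = Ty->getScalarType();            // i16
    unsigned Bits = Ty->getScalarSizeInBits();        // 16
    llvm::Type *Widened = Ty->getWithNewBitWidth(32); // <4 x i32>, same element count
  }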
Value * getOperand(unsigned i) const
Definition: User.h:169
static bool isVPBinOp(Intrinsic::ID ID)
static std::optional< unsigned > getFunctionalOpcodeForVP(Intrinsic::ID ID)
static std::optional< Intrinsic::ID > getFunctionalIntrinsicIDForVP(Intrinsic::ID ID)
static bool isVPIntrinsic(Intrinsic::ID)
static bool isVPReduction(Intrinsic::ID ID)
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:507
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:676
Type * getElementType() const
Definition: DerivedTypes.h:436
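A minimal construction sketch covering the fixed and scalable variants plus the half-width helper:
  #include "llvm/IR/DerivedTypes.h"
  llvm::LLVMContext Ctx;
  llvm::Type *I32 = llvm::Type::getInt32Ty(Ctx);
  auto *V4I32 = llvm::VectorType::get(I32, llvm::ElementCount::getFixed(4));      // <4 x i32>
  auto *NxV4I32 = llvm::VectorType::get(I32, llvm::ElementCount::getScalable(4)); // <vscale x 4 x i32>
  auto *V2I32 = llvm::VectorType::getHalfElementsVectorType(V4I32);               // <2 x i32>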
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:215
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
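A minimal sketch contrasting fixed and scalable quantities, here via ElementCount:
  #include "llvm/Support/TypeSize.h"
  llvm::ElementCount EC4 = llvm::ElementCount::getFixed(4);      // 4 elements
  llvm::ElementCount EC2VS = llvm::ElementCount::getScalable(2); // 2 * vscale elements
  bool Scalable = EC2VS.isScalable();                  // true
  unsigned Min = EC2VS.getKnownMinValue();             // 2
  // isKnownLT answers only when the relation holds for every possible vscale.
  bool Lt = llvm::ElementCount::isKnownLT(EC2VS, EC4); // false: 2 * vscale may be >= 4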
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:2978
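A worked sketch (assuming the llvm::APIntOps namespace qualification): widening a 4-bit mask to 8 bits splats each bit across the two lanes it now covers, and narrowing merges them back.
  #include "llvm/ADT/APInt.h"
  llvm::APInt Mask(4, 0b1010);
  llvm::APInt Wide = llvm::APIntOps::ScaleBitMask(Mask, 8);   // 0b11001100
  llvm::APInt Narrow = llvm::APIntOps::ScaleBitMask(Wide, 4); // 0b1010 again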
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:715
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:484
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:391
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:256
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1059
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1063
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:501
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:728
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:972
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:737
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:991
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:494
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1479
DiagnosticInfoOptimizationBase::Argument NV
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:950
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:428
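The arithmetic as a stand-alone sketch (not necessarily the exact in-tree implementation):
  #include <cstdint>
  uint64_t divideCeilSketch(uint64_t Numerator, uint64_t Denominator) {
    // Round up instead of truncating: divideCeilSketch(10, 4) == 3.
    return (Numerator + Denominator - 1) / Denominator;
  }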
AddressSpace
Definition: NVPTXBaseInfo.h:21
unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID)
Returns the arithmetic instruction opcode used when expanding a reduction.
Definition: LoopUtils.cpp:921
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
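A stand-alone sketch of the floor-log2 computation for nonzero inputs (llvm::Log2_32 additionally maps 0 to an all-ones value; this sketch does not):
  #include <cstdint>
  unsigned log2Sketch(uint32_t Value) {
    unsigned Result = 0;
    while (Value >>= 1) // shift until only the top set bit has been consumed
      ++Result;
    return Result;      // log2Sketch(32) == 5, log2Sketch(1) == 0
  }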
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
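The classic single-set-bit test, as a stand-alone sketch:
  #include <cstdint>
  constexpr bool isPowerOf2Sketch(uint32_t Value) {
    // A power of two greater than zero has exactly one set bit: isPowerOf2Sketch(64) == true.
    return Value != 0 && (Value & (Value - 1)) == 0;
  }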
ConstantRange getVScaleRange(const Function *F, unsigned BitWidth)
Determine the possible constant range of vscale with the given bit width, based on the vscale_range f...
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
InstructionCost Cost
cl::opt< unsigned > PartialUnrollingThreshold
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Extended Value Type.
Definition: ValueTypes.h:34
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:340
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:628
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:64
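A minimal sketch of the EVT helpers, assuming an llvm::LLVMContext Ctx and an llvm::Type *Ty are already in scope:
  #include "llvm/CodeGen/ValueTypes.h"
  llvm::EVT VT = llvm::EVT::getEVT(Ty);             // e.g. MVT::i32 for an i32 IR type
  llvm::EVT I48 = llvm::EVT::getIntegerVT(Ctx, 48); // extended EVT: there is no MVT::i48
  bool Simple = I48.isSimple();                     // false
  llvm::MVT M = VT.getSimpleVT();                   // valid only when VT.isSimple()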
Attributes of a target dependent hardware loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
bool AllowPeeling
Allow peeling off loop iterations.
bool AllowLoopNestsPeeling
Allow peeling off loop iterations for loop nests.
bool PeelProfiledIterations
Allow peeling basing on profile.
unsigned PeelCount
A forced peeling factor (the number of bodies of the original loop that should be peeled off before t...
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).