SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct vectorizable tree using the use-def chains. If a profitable tree
12// was found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
92#include <algorithm>
93#include <cassert>
94#include <cstdint>
95#include <iterator>
96#include <memory>
97#include <optional>
98#include <set>
99#include <string>
100#include <tuple>
101#include <utility>
102
103using namespace llvm;
104using namespace llvm::PatternMatch;
105using namespace slpvectorizer;
106
107#define SV_NAME "slp-vectorizer"
108#define DEBUG_TYPE "SLP"
109
110STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
111
112static cl::opt<bool>
113 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
114 cl::desc("Run the SLP vectorization passes"));
115
116static cl::opt<int>
117 SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
118 cl::desc("Only vectorize if you gain more than this "
119 "number "));
120
122 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
123 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
124 "heuristics and makes vectorization decision via cost modeling."));
125
126static cl::opt<bool>
127ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
128 cl::desc("Attempt to vectorize horizontal reductions"));
129
131 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
132 cl::desc(
133 "Attempt to vectorize horizontal reductions feeding into a store"));
134
135// NOTE: If AllowHorRdxIdenityOptimization is true, the optimization will run
136// even if we match a reduction but do not vectorize in the end.
138 "slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden,
139 cl::desc("Allow optimization of original scalar identity operations on "
140 "matched horizontal reductions."));
141
142static cl::opt<int>
144 cl::desc("Attempt to vectorize for this register size in bits"));
145
148 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
149
150/// Limits the size of scheduling regions in a block.
151/// It avoids long compile times for _very_ large blocks where vector
152/// instructions are spread over a wide range.
153/// This limit is way higher than needed by real-world functions.
154static cl::opt<int>
155ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
156 cl::desc("Limit the size of the SLP scheduling region per block"));
157
159 "slp-min-reg-size", cl::init(128), cl::Hidden,
160 cl::desc("Attempt to vectorize for this register size in bits"));
161
163 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
164 cl::desc("Limit the recursion depth when building a vectorizable tree"));
165
167 "slp-min-tree-size", cl::init(3), cl::Hidden,
168 cl::desc("Only vectorize small trees if they are fully vectorizable"));
169
170// The maximum depth that the look-ahead score heuristic will explore.
171// The higher this value, the higher the compilation time overhead.
173 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
174 cl::desc("The maximum look-ahead depth for operand reordering scores"));
175
176// The maximum depth that the look-ahead score heuristic will explore
177// when it is probing among candidates for vectorization tree roots.
178// The higher this value, the higher the compilation time overhead, but unlike
179// the similar limit for operand ordering this is used less frequently, so the
180// impact of a higher value is less noticeable.
182 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
183 cl::desc("The maximum look-ahead depth for searching best rooting option"));
184
186 "slp-min-strided-loads", cl::init(2), cl::Hidden,
187 cl::desc("The minimum number of loads, which should be considered strided, "
188 "if the stride is > 1 or is runtime value"));
189
191 "slp-max-stride", cl::init(8), cl::Hidden,
192 cl::desc("The maximum stride, considered to be profitable."));
193
194static cl::opt<bool>
195 ViewSLPTree("view-slp-tree", cl::Hidden,
196 cl::desc("Display the SLP trees with Graphviz"));
197
199 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
200 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
201
202// Limit the number of alias checks. The limit is chosen so that
203// it has no negative effect on the LLVM benchmarks.
204static const unsigned AliasedCheckLimit = 10;
205
206// Limit on the number of uses for potentially transformed instructions/values,
207// used in checks to avoid compile-time explosion.
208static constexpr int UsesLimit = 8;
209
210// Another limit for the alias checks: The maximum distance between load/store
211// instructions where alias checks are done.
212// This limit is useful for very large basic blocks.
213static const unsigned MaxMemDepDistance = 160;
214
215/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
216/// regions to be handled.
217static const int MinScheduleRegionSize = 16;
218
219/// Maximum allowed number of operands in the PHI nodes.
220static const unsigned MaxPHINumOperands = 128;
221
222/// Predicate for the element types that the SLP vectorizer supports.
223///
224/// The most important things to filter here are types which are invalid in LLVM
225/// vectors. We also filter target specific types which have absolutely no
226/// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
227/// avoids spending time checking the cost model and realizing that they will
228/// be inevitably scalarized.
229static bool isValidElementType(Type *Ty) {
230 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
231 !Ty->isPPC_FP128Ty();
232}
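
// Illustrative sketch (not part of the original source): common scalar types
// pass this predicate, while the filtered target-specific FP types do not.
//   isValidElementType(Type::getInt32Ty(Ctx));    // true
//   isValidElementType(Type::getFloatTy(Ctx));    // true
//   isValidElementType(Type::getX86_FP80Ty(Ctx)); // false, filtered above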
233
234/// \returns True if the value is a constant (but not globals/constant
235/// expressions).
236static bool isConstant(Value *V) {
237 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
238}
239
240/// Checks if \p V is one of the vector-like instructions, i.e. an undef, an
241/// insertelement/extractelement with constant indices for a fixed vector type,
242/// or an extractvalue instruction.
244 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
245 !isa<ExtractValueInst, UndefValue>(V))
246 return false;
247 auto *I = dyn_cast<Instruction>(V);
248 if (!I || isa<ExtractValueInst>(I))
249 return true;
250 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
251 return false;
252 if (isa<ExtractElementInst>(I))
253 return isConstant(I->getOperand(1));
254 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
255 return isConstant(I->getOperand(2));
256}
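
// Illustrative IR (added for exposition): both values below are "vector-like"
// in the sense checked above, because the indices are constants and the vector
// type is fixed:
//   %e = extractelement <4 x i8> %v, i32 1
//   %i = insertelement <4 x i8> %v, i8 %s, i32 2
// An extractelement with a variable index (i32 %n) is not.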
257
258#if !defined(NDEBUG)
259/// Print a short descriptor of the instruction bundle suitable for debug output.
260static std::string shortBundleName(ArrayRef<Value *> VL) {
261 std::string Result;
262 raw_string_ostream OS(Result);
263 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
264 OS.flush();
265 return Result;
266}
267#endif
268
269/// \returns true if all of the instructions in \p VL are in the same block or
270/// false otherwise.
272 Instruction *I0 = dyn_cast<Instruction>(VL[0]);
273 if (!I0)
274 return false;
276 return true;
277
278 BasicBlock *BB = I0->getParent();
279 for (int I = 1, E = VL.size(); I < E; I++) {
280 auto *II = dyn_cast<Instruction>(VL[I]);
281 if (!II)
282 return false;
283
284 if (BB != II->getParent())
285 return false;
286 }
287 return true;
288}
289
290/// \returns True if all of the values in \p VL are constants (but not
291/// globals/constant expressions).
293 // Constant expressions and globals can't be vectorized like normal integer/FP
294 // constants.
295 return all_of(VL, isConstant);
296}
297
298/// \returns True if all of the values in \p VL are identical or some of them
299/// are UndefValue.
300static bool isSplat(ArrayRef<Value *> VL) {
301 Value *FirstNonUndef = nullptr;
302 for (Value *V : VL) {
303 if (isa<UndefValue>(V))
304 continue;
305 if (!FirstNonUndef) {
306 FirstNonUndef = V;
307 continue;
308 }
309 if (V != FirstNonUndef)
310 return false;
311 }
312 return FirstNonUndef != nullptr;
313}
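
// For example (added for exposition):
//   {%a, undef, %a} -> true  (all non-undef values are identical)
//   {%a, %b, %a}    -> false (two distinct non-undef values)
//   {undef, undef}  -> false (no non-undef value at all)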
314
315/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
317 if (auto *Cmp = dyn_cast<CmpInst>(I))
318 return Cmp->isCommutative();
319 if (auto *BO = dyn_cast<BinaryOperator>(I))
320 return BO->isCommutative() ||
321 (BO->getOpcode() == Instruction::Sub &&
322 !BO->hasNUsesOrMore(UsesLimit) &&
323 all_of(
324 BO->uses(),
325 [](const Use &U) {
326 // Commutative, if icmp eq/ne sub, 0
327 ICmpInst::Predicate Pred;
328 if (match(U.getUser(),
329 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
330 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
331 return true;
332 // Commutative, if abs(sub nsw, true) or abs(sub, false).
333 ConstantInt *Flag;
334 return match(U.getUser(),
335 m_Intrinsic<Intrinsic::abs>(
336 m_Specific(U.get()), m_ConstantInt(Flag))) &&
337 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
338 Flag->isOne());
339 })) ||
340 (BO->getOpcode() == Instruction::FSub &&
341 !BO->hasNUsesOrMore(UsesLimit) &&
342 all_of(BO->uses(), [](const Use &U) {
343 return match(U.getUser(),
344 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
345 }));
346 return I->isCommutative();
347}
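
// Illustrative IR (added for exposition): the 'sub' below is treated as
// commutative because its only user is an equality compare against zero,
// which is insensitive to the operand order:
//   %d = sub i32 %x, %y
//   %c = icmp eq i32 %d, 0
// The same applies to an 'fsub' whose only user is @llvm.fabs.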
348
349/// \returns inserting index of InsertElement or InsertValue instruction,
350/// using Offset as base offset for index.
351static std::optional<unsigned> getInsertIndex(const Value *InsertInst,
352 unsigned Offset = 0) {
353 int Index = Offset;
354 if (const auto *IE = dyn_cast<InsertElementInst>(InsertInst)) {
355 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
356 if (!VT)
357 return std::nullopt;
358 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
359 if (!CI)
360 return std::nullopt;
361 if (CI->getValue().uge(VT->getNumElements()))
362 return std::nullopt;
363 Index *= VT->getNumElements();
364 Index += CI->getZExtValue();
365 return Index;
366 }
367
368 const auto *IV = cast<InsertValueInst>(InsertInst);
369 Type *CurrentType = IV->getType();
370 for (unsigned I : IV->indices()) {
371 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
372 Index *= ST->getNumElements();
373 CurrentType = ST->getElementType(I);
374 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
375 Index *= AT->getNumElements();
376 CurrentType = AT->getElementType();
377 } else {
378 return std::nullopt;
379 }
380 Index += I;
381 }
382 return Index;
383}
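
// For example (added for exposition), for the aggregate insert
//   %r = insertvalue {[2 x i32], [2 x i32]} %agg, i32 %v, 1, 0
// the aggregate is flattened to 4 scalar slots and the returned index is 2
// (with Offset defaulting to 0).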
384
385namespace {
386/// Specifies the way the mask should be analyzed for undefs/poisonous elements
387/// in the shuffle mask.
388enum class UseMask {
389 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
390 ///< check for the mask elements for the first argument (mask
391 ///< indices are in range [0:VF)).
392 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
393 ///< for the mask elements for the second argument (mask indices
394 ///< are in range [VF:2*VF))
395 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
396 ///< future shuffle elements and mark them as used for the
397 ///< future. Non-undef elements are considered unused since
398 ///< they're already marked as used in the mask.
399};
400} // namespace
401
402/// Prepares a use bitset for the given mask either for the first argument or
403/// for the second.
405 UseMask MaskArg) {
406 SmallBitVector UseMask(VF, true);
407 for (auto [Idx, Value] : enumerate(Mask)) {
408 if (Value == PoisonMaskElem) {
409 if (MaskArg == UseMask::UndefsAsMask)
410 UseMask.reset(Idx);
411 continue;
412 }
413 if (MaskArg == UseMask::FirstArg && Value < VF)
414 UseMask.reset(Value);
415 else if (MaskArg == UseMask::SecondArg && Value >= VF)
416 UseMask.reset(Value - VF);
417 }
418 return UseMask;
419}
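
// For example (added for exposition), with VF = 4 and
// Mask = {0, 5, PoisonMaskElem, 3}:
//   UseMask::FirstArg     clears bits 0 and 3 (elements of the first vector
//                         referenced by the mask),
//   UseMask::SecondArg    clears bit 1 (mask index 5 refers to element 5 - VF
//                         of the second vector),
//   UseMask::UndefsAsMask clears bit 2 (the position of the poison element).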
420
421/// Checks if the given value is actually an undefined constant vector.
422/// Also, if the \p UseMask is not empty, tries to check if the non-masked
423/// elements actually mask the insertelement buildvector, if any.
424template <bool IsPoisonOnly = false>
426 const SmallBitVector &UseMask = {}) {
427 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
428 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
429 if (isa<T>(V))
430 return Res;
431 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
432 if (!VecTy)
433 return Res.reset();
434 auto *C = dyn_cast<Constant>(V);
435 if (!C) {
436 if (!UseMask.empty()) {
437 const Value *Base = V;
438 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
439 Base = II->getOperand(0);
440 if (isa<T>(II->getOperand(1)))
441 continue;
442 std::optional<unsigned> Idx = getInsertIndex(II);
443 if (!Idx) {
444 Res.reset();
445 return Res;
446 }
447 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
448 Res.reset(*Idx);
449 }
450 // TODO: Add analysis for shuffles here too.
451 if (V == Base) {
452 Res.reset();
453 } else {
454 SmallBitVector SubMask(UseMask.size(), false);
455 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
456 }
457 } else {
458 Res.reset();
459 }
460 return Res;
461 }
462 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
463 if (Constant *Elem = C->getAggregateElement(I))
464 if (!isa<T>(Elem) &&
465 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
466 Res.reset(I);
467 }
468 return Res;
469}
470
471/// Checks if the vector of instructions can be represented as a shuffle, like:
472/// %x0 = extractelement <4 x i8> %x, i32 0
473/// %x3 = extractelement <4 x i8> %x, i32 3
474/// %y1 = extractelement <4 x i8> %y, i32 1
475/// %y2 = extractelement <4 x i8> %y, i32 2
476/// %x0x0 = mul i8 %x0, %x0
477/// %x3x3 = mul i8 %x3, %x3
478/// %y1y1 = mul i8 %y1, %y1
479/// %y2y2 = mul i8 %y2, %y2
480/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
481/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
482/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
483/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
484/// ret <4 x i8> %ins4
485/// can be transformed into:
486/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
487/// i32 6>
488/// %2 = mul <4 x i8> %1, %1
489/// ret <4 x i8> %2
490/// Mask will return the Shuffle Mask equivalent to the extracted elements.
491/// TODO: Can we split off and reuse the shuffle mask detection from
492/// ShuffleVectorInst/getShuffleCost?
493static std::optional<TargetTransformInfo::ShuffleKind>
495 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
496 if (It == VL.end())
497 return std::nullopt;
498 auto *EI0 = cast<ExtractElementInst>(*It);
499 if (isa<ScalableVectorType>(EI0->getVectorOperandType()))
500 return std::nullopt;
501 unsigned Size =
502 cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
503 Value *Vec1 = nullptr;
504 Value *Vec2 = nullptr;
505 enum ShuffleMode { Unknown, Select, Permute };
506 ShuffleMode CommonShuffleMode = Unknown;
507 Mask.assign(VL.size(), PoisonMaskElem);
508 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
509 // Undef can be represented as an undef element in a vector.
510 if (isa<UndefValue>(VL[I]))
511 continue;
512 auto *EI = cast<ExtractElementInst>(VL[I]);
513 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
514 return std::nullopt;
515 auto *Vec = EI->getVectorOperand();
516 // We can extractelement from undef or poison vector.
517 if (isUndefVector(Vec).all())
518 continue;
519 // All vector operands must have the same number of vector elements.
520 if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
521 return std::nullopt;
522 if (isa<UndefValue>(EI->getIndexOperand()))
523 continue;
524 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
525 if (!Idx)
526 return std::nullopt;
527 // Undefined behavior if Idx is negative or >= Size.
528 if (Idx->getValue().uge(Size))
529 continue;
530 unsigned IntIdx = Idx->getValue().getZExtValue();
531 Mask[I] = IntIdx;
532 // For correct shuffling we have to have at most 2 different vector operands
533 // in all extractelement instructions.
534 if (!Vec1 || Vec1 == Vec) {
535 Vec1 = Vec;
536 } else if (!Vec2 || Vec2 == Vec) {
537 Vec2 = Vec;
538 Mask[I] += Size;
539 } else {
540 return std::nullopt;
541 }
542 if (CommonShuffleMode == Permute)
543 continue;
544 // If the extract index is not the same as the operation number, it is a
545 // permutation.
546 if (IntIdx != I) {
547 CommonShuffleMode = Permute;
548 continue;
549 }
550 CommonShuffleMode = Select;
551 }
552 // If we're not crossing lanes in different vectors, consider it as blending.
553 if (CommonShuffleMode == Select && Vec2)
554 return TargetTransformInfo::SK_Select;
555 // If Vec2 was never used, we have a permutation of a single vector, otherwise
556 // we have a permutation of 2 vectors.
557 return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
558 : TargetTransformInfo::SK_PermuteSingleSrc;
559}
560
561/// \returns True if Extract{Value,Element} instruction extracts element Idx.
562static std::optional<unsigned> getExtractIndex(Instruction *E) {
563 unsigned Opcode = E->getOpcode();
564 assert((Opcode == Instruction::ExtractElement ||
565 Opcode == Instruction::ExtractValue) &&
566 "Expected extractelement or extractvalue instruction.");
567 if (Opcode == Instruction::ExtractElement) {
568 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
569 if (!CI)
570 return std::nullopt;
571 return CI->getZExtValue();
572 }
573 auto *EI = cast<ExtractValueInst>(E);
574 if (EI->getNumIndices() != 1)
575 return std::nullopt;
576 return *EI->idx_begin();
577}
578
579namespace {
580
581/// Main data required for vectorization of instructions.
582struct InstructionsState {
583 /// The very first instruction in the list with the main opcode.
584 Value *OpValue = nullptr;
585
586 /// The main/alternate instruction.
587 Instruction *MainOp = nullptr;
588 Instruction *AltOp = nullptr;
589
590 /// The main/alternate opcodes for the list of instructions.
591 unsigned getOpcode() const {
592 return MainOp ? MainOp->getOpcode() : 0;
593 }
594
595 unsigned getAltOpcode() const {
596 return AltOp ? AltOp->getOpcode() : 0;
597 }
598
599 /// Some of the instructions in the list have alternate opcodes.
600 bool isAltShuffle() const { return AltOp != MainOp; }
601
602 bool isOpcodeOrAlt(Instruction *I) const {
603 unsigned CheckedOpcode = I->getOpcode();
604 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
605 }
606
607 InstructionsState() = delete;
608 InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
609 : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
610};
611
612} // end anonymous namespace
613
614/// Chooses the correct key for scheduling data. If \p Op has the same (or
615/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
616/// OpValue.
617static Value *isOneOf(const InstructionsState &S, Value *Op) {
618 auto *I = dyn_cast<Instruction>(Op);
619 if (I && S.isOpcodeOrAlt(I))
620 return Op;
621 return S.OpValue;
622}
623
624/// \returns true if \p Opcode is allowed as part of the main/alternate
625/// instruction for SLP vectorization.
626///
627/// Example of unsupported opcode is SDIV that can potentially cause UB if the
628/// "shuffled out" lane would result in division by zero.
629static bool isValidForAlternation(unsigned Opcode) {
630 if (Instruction::isIntDivRem(Opcode))
631 return false;
632
633 return true;
634}
635
636static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
637 const TargetLibraryInfo &TLI,
638 unsigned BaseIndex = 0);
639
640/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
641/// compatible instructions or constants, or just some other regular values.
642static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
643 Value *Op1, const TargetLibraryInfo &TLI) {
644 return (isConstant(BaseOp0) && isConstant(Op0)) ||
645 (isConstant(BaseOp1) && isConstant(Op1)) ||
646 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
647 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
648 BaseOp0 == Op0 || BaseOp1 == Op1 ||
649 getSameOpcode({BaseOp0, Op0}, TLI).getOpcode() ||
650 getSameOpcode({BaseOp1, Op1}, TLI).getOpcode();
651}
652
653/// \returns true if a compare instruction \p CI has similar "look" and
654/// same predicate as \p BaseCI, "as is" or with its operands and predicate
655/// swapped, false otherwise.
656static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
657 const TargetLibraryInfo &TLI) {
658 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
659 "Assessing comparisons of different types?");
660 CmpInst::Predicate BasePred = BaseCI->getPredicate();
661 CmpInst::Predicate Pred = CI->getPredicate();
662 CmpInst::Predicate SwappedPred = CmpInst::getSwappedPredicate(Pred);
663
664 Value *BaseOp0 = BaseCI->getOperand(0);
665 Value *BaseOp1 = BaseCI->getOperand(1);
666 Value *Op0 = CI->getOperand(0);
667 Value *Op1 = CI->getOperand(1);
668
669 return (BasePred == Pred &&
670 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
671 (BasePred == SwappedPred &&
672 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
673}
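
// For example (added for exposition), 'icmp slt %a, %b' and 'icmp sgt %b, %a'
// are recognized as the same comparison with both the predicate and the
// operands swapped, so this returns true for that pair.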
674
675/// \returns analysis of the Instructions in \p VL described in
676/// InstructionsState, i.e. the opcode with which we suppose the whole list
677/// could be vectorized, even if its structure is diverse.
678static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
679 const TargetLibraryInfo &TLI,
680 unsigned BaseIndex) {
681 // Make sure these are all Instructions.
682 if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
683 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
684
685 bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
686 bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
687 bool IsCmpOp = isa<CmpInst>(VL[BaseIndex]);
688 CmpInst::Predicate BasePred =
689 IsCmpOp ? cast<CmpInst>(VL[BaseIndex])->getPredicate()
691 unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
692 unsigned AltOpcode = Opcode;
693 unsigned AltIndex = BaseIndex;
694
695 bool SwappedPredsCompatible = [&]() {
696 if (!IsCmpOp)
697 return false;
698 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
699 UniquePreds.insert(BasePred);
700 UniqueNonSwappedPreds.insert(BasePred);
701 for (Value *V : VL) {
702 auto *I = dyn_cast<CmpInst>(V);
703 if (!I)
704 return false;
705 CmpInst::Predicate CurrentPred = I->getPredicate();
706 CmpInst::Predicate SwappedCurrentPred =
707 CmpInst::getSwappedPredicate(CurrentPred);
708 UniqueNonSwappedPreds.insert(CurrentPred);
709 if (!UniquePreds.contains(CurrentPred) &&
710 !UniquePreds.contains(SwappedCurrentPred))
711 UniquePreds.insert(CurrentPred);
712 }
713 // If the total number of predicates is > 2, but only 2 remain once swapped
714 // predicates are treated as compatible, consider swappable predicates as
715 // compatible opcodes, not alternate.
716 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
717 }();
718 // Check for one alternate opcode from another BinaryOperator.
719 // TODO - generalize to support all operators (types, calls etc.).
720 auto *IBase = cast<Instruction>(VL[BaseIndex]);
721 Intrinsic::ID BaseID = 0;
722 SmallVector<VFInfo> BaseMappings;
723 if (auto *CallBase = dyn_cast<CallInst>(IBase)) {
725 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
726 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
727 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
728 }
729 for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
730 auto *I = cast<Instruction>(VL[Cnt]);
731 unsigned InstOpcode = I->getOpcode();
732 if (IsBinOp && isa<BinaryOperator>(I)) {
733 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
734 continue;
735 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
736 isValidForAlternation(Opcode)) {
737 AltOpcode = InstOpcode;
738 AltIndex = Cnt;
739 continue;
740 }
741 } else if (IsCastOp && isa<CastInst>(I)) {
742 Value *Op0 = IBase->getOperand(0);
743 Type *Ty0 = Op0->getType();
744 Value *Op1 = I->getOperand(0);
745 Type *Ty1 = Op1->getType();
746 if (Ty0 == Ty1) {
747 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
748 continue;
749 if (Opcode == AltOpcode) {
751 isValidForAlternation(InstOpcode) &&
752 "Cast isn't safe for alternation, logic needs to be updated!");
753 AltOpcode = InstOpcode;
754 AltIndex = Cnt;
755 continue;
756 }
757 }
758 } else if (auto *Inst = dyn_cast<CmpInst>(VL[Cnt]); Inst && IsCmpOp) {
759 auto *BaseInst = cast<CmpInst>(VL[BaseIndex]);
760 Type *Ty0 = BaseInst->getOperand(0)->getType();
761 Type *Ty1 = Inst->getOperand(0)->getType();
762 if (Ty0 == Ty1) {
763 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
764 // Check for compatible operands. If the corresponding operands are not
765 // compatible - need to perform alternate vectorization.
766 CmpInst::Predicate CurrentPred = Inst->getPredicate();
767 CmpInst::Predicate SwappedCurrentPred =
768 CmpInst::getSwappedPredicate(CurrentPred);
769
770 if ((E == 2 || SwappedPredsCompatible) &&
771 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
772 continue;
773
774 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
775 continue;
776 auto *AltInst = cast<CmpInst>(VL[AltIndex]);
777 if (AltIndex != BaseIndex) {
778 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
779 continue;
780 } else if (BasePred != CurrentPred) {
781 assert(
782 isValidForAlternation(InstOpcode) &&
783 "CmpInst isn't safe for alternation, logic needs to be updated!");
784 AltIndex = Cnt;
785 continue;
786 }
787 CmpInst::Predicate AltPred = AltInst->getPredicate();
788 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
789 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
790 continue;
791 }
792 } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) {
793 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
794 if (Gep->getNumOperands() != 2 ||
795 Gep->getOperand(0)->getType() != IBase->getOperand(0)->getType())
796 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
797 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
799 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
800 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
801 auto *BaseLI = cast<LoadInst>(IBase);
802 if (!LI->isSimple() || !BaseLI->isSimple())
803 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
804 } else if (auto *Call = dyn_cast<CallInst>(I)) {
805 auto *CallBase = cast<CallInst>(IBase);
806 if (Call->getCalledFunction() != CallBase->getCalledFunction())
807 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
808 if (Call->hasOperandBundles() &&
809 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
810 Call->op_begin() + Call->getBundleOperandsEndIndex(),
811 CallBase->op_begin() +
813 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
815 if (ID != BaseID)
816 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
817 if (!ID) {
818 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
819 if (Mappings.size() != BaseMappings.size() ||
820 Mappings.front().ISA != BaseMappings.front().ISA ||
821 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
822 Mappings.front().VectorName != BaseMappings.front().VectorName ||
823 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
824 Mappings.front().Shape.Parameters !=
825 BaseMappings.front().Shape.Parameters)
826 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
827 }
828 }
829 continue;
830 }
831 return InstructionsState(VL[BaseIndex], nullptr, nullptr);
832 }
833
834 return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
835 cast<Instruction>(VL[AltIndex]));
836}
837
838/// \returns true if all of the values in \p VL have the same type or false
839/// otherwise.
841 Type *Ty = VL.front()->getType();
842 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
843}
844
845/// \returns True if an in-tree use also needs an extract. This refers to a
846/// possible scalar operand in a vectorized instruction.
847static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
848 TargetLibraryInfo *TLI) {
849 unsigned Opcode = UserInst->getOpcode();
850 switch (Opcode) {
851 case Instruction::Load: {
852 LoadInst *LI = cast<LoadInst>(UserInst);
853 return (LI->getPointerOperand() == Scalar);
854 }
855 case Instruction::Store: {
856 StoreInst *SI = cast<StoreInst>(UserInst);
857 return (SI->getPointerOperand() == Scalar);
858 }
859 case Instruction::Call: {
860 CallInst *CI = cast<CallInst>(UserInst);
862 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
863 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index()) &&
864 Arg.value().get() == Scalar;
865 });
866 }
867 default:
868 return false;
869 }
870}
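
// For example (added for exposition): if a vectorized scalar %p is still used
// as the pointer operand of a scalar 'load i32, ptr %p', the in-tree user
// needs the scalar value, so an extract must be emitted for it.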
871
872/// \returns the AA location that is being accessed by the instruction.
874 if (StoreInst *SI = dyn_cast<StoreInst>(I))
875 return MemoryLocation::get(SI);
876 if (LoadInst *LI = dyn_cast<LoadInst>(I))
877 return MemoryLocation::get(LI);
878 return MemoryLocation();
879}
880
881/// \returns True if the instruction is not a volatile or atomic load/store.
882static bool isSimple(Instruction *I) {
883 if (LoadInst *LI = dyn_cast<LoadInst>(I))
884 return LI->isSimple();
885 if (StoreInst *SI = dyn_cast<StoreInst>(I))
886 return SI->isSimple();
887 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
888 return !MI->isVolatile();
889 return true;
890}
891
892/// Shuffles \p Mask in accordance with the given \p SubMask.
893/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
894/// one but two input vectors.
895static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
896 bool ExtendingManyInputs = false) {
897 if (SubMask.empty())
898 return;
899 assert(
900 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
901 // Check if input scalars were extended to match the size of other node.
902 (SubMask.size() == Mask.size() &&
903 std::all_of(std::next(Mask.begin(), Mask.size() / 2), Mask.end(),
904 [](int Idx) { return Idx == PoisonMaskElem; }))) &&
905 "SubMask with many inputs support must be larger than the mask.");
906 if (Mask.empty()) {
907 Mask.append(SubMask.begin(), SubMask.end());
908 return;
909 }
910 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
911 int TermValue = std::min(Mask.size(), SubMask.size());
912 for (int I = 0, E = SubMask.size(); I < E; ++I) {
913 if (SubMask[I] == PoisonMaskElem ||
914 (!ExtendingManyInputs &&
915 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
916 continue;
917 NewMask[I] = Mask[SubMask[I]];
918 }
919 Mask.swap(NewMask);
920}
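
// For example (added for exposition), composing Mask = {3, 1, 2, 0} with
// SubMask = {1, 0, 3, 2} (and ExtendingManyInputs == false) yields
// Mask = {Mask[1], Mask[0], Mask[3], Mask[2]} = {1, 3, 0, 2}.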
921
922/// Order may have elements assigned the special value (size) which is out of
923/// bounds. Such indices only appear in places which correspond to undef values
924/// (see canReuseExtract for details) and are used to prevent undef values from
925/// affecting operand ordering.
926/// The first loop below simply finds all unused indices and then the next loop
927/// nest assigns these indices to the undef value positions.
928/// As an example below Order has two undef positions and they have assigned
929/// values 3 and 7 respectively:
930/// before: 6 9 5 4 9 2 1 0
931/// after: 6 3 5 4 7 2 1 0
933 const unsigned Sz = Order.size();
934 SmallBitVector UnusedIndices(Sz, /*t=*/true);
935 SmallBitVector MaskedIndices(Sz);
936 for (unsigned I = 0; I < Sz; ++I) {
937 if (Order[I] < Sz)
938 UnusedIndices.reset(Order[I]);
939 else
940 MaskedIndices.set(I);
941 }
942 if (MaskedIndices.none())
943 return;
944 assert(UnusedIndices.count() == MaskedIndices.count() &&
945 "Non-synced masked/available indices.");
946 int Idx = UnusedIndices.find_first();
947 int MIdx = MaskedIndices.find_first();
948 while (MIdx >= 0) {
949 assert(Idx >= 0 && "Indices must be synced.");
950 Order[MIdx] = Idx;
951 Idx = UnusedIndices.find_next(Idx);
952 MIdx = MaskedIndices.find_next(MIdx);
953 }
954}
955
956namespace llvm {
957
959 SmallVectorImpl<int> &Mask) {
960 Mask.clear();
961 const unsigned E = Indices.size();
962 Mask.resize(E, PoisonMaskElem);
963 for (unsigned I = 0; I < E; ++I)
964 Mask[Indices[I]] = I;
965}
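
// For example (added for exposition), Indices = {2, 0, 1} produces
// Mask = {1, 2, 0}, i.e. Mask[Indices[I]] == I for every position I.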
966
967/// Reorders the list of scalars in accordance with the given \p Mask.
969 ArrayRef<int> Mask) {
970 assert(!Mask.empty() && "Expected non-empty mask.");
971 SmallVector<Value *> Prev(Scalars.size(),
972 UndefValue::get(Scalars.front()->getType()));
973 Prev.swap(Scalars);
974 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
975 if (Mask[I] != PoisonMaskElem)
976 Scalars[Mask[I]] = Prev[I];
977}
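
// For example (added for exposition), Scalars = {a, b, c, d} with
// Mask = {2, 0, 3, 1} becomes {b, d, a, c}: each old element Prev[I] is
// moved to position Mask[I].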
978
979/// Checks if the provided value does not require scheduling. It does not
980/// require scheduling if it is not an instruction, or it is an instruction
981/// that does not read/write memory and each operand is either not an
982/// instruction, or is a phi node or an instruction from a different block.
984 auto *I = dyn_cast<Instruction>(V);
985 if (!I)
986 return true;
987 return !mayHaveNonDefUseDependency(*I) &&
988 all_of(I->operands(), [I](Value *V) {
989 auto *IO = dyn_cast<Instruction>(V);
990 if (!IO)
991 return true;
992 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
993 });
994}
995
996/// Checks if the provided value does not require scheduling. It does not
997/// require scheduling if it is not an instruction, or it is an instruction
998/// that does not read/write memory and all of its users are phi nodes or
999/// instructions from different blocks.
1000static bool isUsedOutsideBlock(Value *V) {
1001 auto *I = dyn_cast<Instruction>(V);
1002 if (!I)
1003 return true;
1004 // Limits the number of uses to save compile time.
1005 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1006 all_of(I->users(), [I](User *U) {
1007 auto *IU = dyn_cast<Instruction>(U);
1008 if (!IU)
1009 return true;
1010 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1011 });
1012}
1013
1014/// Checks if the specified value does not require scheduling. It does not
1015/// require scheduling if all operands and all users do not need to be scheduled
1016/// in the current basic block.
1019}
1020
1021/// Checks if the specified array of instructions does not require scheduling.
1022/// This is the case if every instruction either has operands that do not
1023/// require scheduling, or has users that do not require scheduling because
1024/// they are phis or live in other basic blocks.
1026 return !VL.empty() &&
1028}
1029
1030namespace slpvectorizer {
1031
1032/// Bottom Up SLP Vectorizer.
1033class BoUpSLP {
1034 struct TreeEntry;
1035 struct ScheduleData;
1038
1039public:
1040 /// Tracks the state we can represent the loads in the given sequence.
1041 enum class LoadsState {
1042 Gather,
1043 Vectorize,
1046 };
1047
1055
1057 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1060 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1061 AC(AC), DB(DB), DL(DL), ORE(ORE),
1062 Builder(Se->getContext(), TargetFolder(*DL)) {
1063 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1064 // Use the vector register size specified by the target unless overridden
1065 // by a command-line option.
1066 // TODO: It would be better to limit the vectorization factor based on
1067 // data type rather than just register size. For example, x86 AVX has
1068 // 256-bit registers, but it does not support integer operations
1069 // at that width (that requires AVX2).
1070 if (MaxVectorRegSizeOption.getNumOccurrences())
1071 MaxVecRegSize = MaxVectorRegSizeOption;
1072 else
1073 MaxVecRegSize =
1075 .getFixedValue();
1076
1077 if (MinVectorRegSizeOption.getNumOccurrences())
1078 MinVecRegSize = MinVectorRegSizeOption;
1079 else
1080 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1081 }
1082
1083 /// Vectorize the tree that starts with the elements in \p VL.
1084 /// Returns the vectorized root.
1086
1087 /// Vectorize the tree but with the list of externally used values \p
1088 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1089 /// generated extractvalue instructions.
1090 /// \param ReplacedExternals contains the list of replaced external values
1091 /// {scalar, replace} after emitting extractelement for external uses.
1092 Value *
1093 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1094 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
1095 Instruction *ReductionRoot = nullptr);
1096
1097 /// \returns the cost incurred by unwanted spills and fills, caused by
1098 /// holding live values over call sites.
1100
1101 /// \returns the vectorization cost of the subtree that starts at \p VL.
1102 /// A negative number means that this is profitable.
1103 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = std::nullopt);
1104
1105 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1106 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1107 void buildTree(ArrayRef<Value *> Roots,
1108 const SmallDenseSet<Value *> &UserIgnoreLst);
1109
1110 /// Construct a vectorizable tree that starts at \p Roots.
1111 void buildTree(ArrayRef<Value *> Roots);
1112
1113 /// Returns whether the root node has in-tree uses.
1115 return !VectorizableTree.empty() &&
1116 !VectorizableTree.front()->UserTreeIndices.empty();
1117 }
1118
1119 /// Return the scalars of the root node.
1121 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1122 return VectorizableTree.front()->Scalars;
1123 }
1124
1125 /// Builds external uses of the vectorized scalars, i.e. the list of
1126 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1127 /// ExternallyUsedValues contains additional list of external uses to handle
1128 /// vectorization of reductions.
1129 void
1130 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1131
1132 /// Transforms graph nodes to target specific representations, if profitable.
1133 void transformNodes();
1134
1135 /// Clear the internal data structures that are created by 'buildTree'.
1136 void deleteTree() {
1137 VectorizableTree.clear();
1138 ScalarToTreeEntry.clear();
1139 MultiNodeScalars.clear();
1140 MustGather.clear();
1141 NonScheduledFirst.clear();
1142 EntryToLastInstruction.clear();
1143 ExternalUses.clear();
1144 ExternalUsesAsGEPs.clear();
1145 for (auto &Iter : BlocksSchedules) {
1146 BlockScheduling *BS = Iter.second.get();
1147 BS->clear();
1148 }
1149 MinBWs.clear();
1150 ReductionBitWidth = 0;
1151 CastMaxMinBWSizes.reset();
1152 ExtraBitWidthNodes.clear();
1153 InstrElementSize.clear();
1154 UserIgnoreList = nullptr;
1155 PostponedGathers.clear();
1156 ValueToGatherNodes.clear();
1157 }
1158
1159 unsigned getTreeSize() const { return VectorizableTree.size(); }
1160
1161 /// Perform LICM and CSE on the newly generated gather sequences.
1163
1164 /// Checks if the specified gather tree entry \p TE can be represented as a
1165 /// shuffled vector entry + (possibly) permutation with other gathers. It
1166 /// implements the checks only for possibly ordered scalars (Loads,
1167 /// ExtractElement, ExtractValue), which can be part of the graph.
1168 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1169
1170 /// Sort loads into increasing pointers offsets to allow greater clustering.
1171 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1172
1173 /// Gets reordering data for the given tree entry. If the entry is vectorized
1174 /// - just return ReorderIndices, otherwise check if the scalars can be
1175 /// reordered and return the most optimal order.
1176 /// \return std::nullopt if ordering is not important, empty order, if
1177 /// identity order is important, or the actual order.
1178 /// \param TopToBottom If true, include the order of vectorized stores and
1179 /// insertelement nodes, otherwise skip them.
1180 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1181 bool TopToBottom);
1182
1183 /// Reorders the current graph to the most profitable order starting from the
1184 /// root node to the leaf nodes. The best order is chosen only from the nodes
1185 /// of the same size (vectorization factor). Smaller nodes are considered
1186 /// parts of subgraph with smaller VF and they are reordered independently. We
1187 /// can make it because we still need to extend smaller nodes to the wider VF
1188 /// and we can merge reordering shuffles with the widening shuffles.
1189 void reorderTopToBottom();
1190
1191 /// Reorders the current graph to the most profitable order starting from
1192 /// the leaves to the root. It allows rotating small subgraphs and reduces the
1193 /// number of reshuffles if the leaf nodes use the same order. In this case we
1194 /// can merge the orders and just shuffle the user node instead of shuffling its
1195 /// operands. Plus, even if the leaf nodes have different orders, it allows
1196 /// sinking the reordering in the graph closer to the root node and merging it
1197 /// later during analysis.
1198 void reorderBottomToTop(bool IgnoreReorder = false);
1199
1200 /// \return The vector element size in bits to use when vectorizing the
1201 /// expression tree ending at \p V. If V is a store, the size is the width of
1202 /// the stored value. Otherwise, the size is the width of the largest loaded
1203 /// value reaching V. This method is used by the vectorizer to calculate
1204 /// vectorization factors.
1205 unsigned getVectorElementSize(Value *V);
1206
1207 /// Compute the minimum type sizes required to represent the entries in a
1208 /// vectorizable tree.
1210
1211 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1212 unsigned getMaxVecRegSize() const {
1213 return MaxVecRegSize;
1214 }
1215
1216 // \returns minimum vector register size as set by cl::opt.
1217 unsigned getMinVecRegSize() const {
1218 return MinVecRegSize;
1219 }
1220
1221 unsigned getMinVF(unsigned Sz) const {
1222 return std::max(2U, getMinVecRegSize() / Sz);
1223 }
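
 // For example (added for exposition), with the default MinVecRegSize of 128
 // bits, getMinVF(32) returns 4, i.e. at least four 32-bit elements are
 // needed to fill the minimum-width vector register.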
1224
1225 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1226 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1227 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1228 return MaxVF ? MaxVF : UINT_MAX;
1229 }
1230
1231 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1232 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1233 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1234 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1235 ///
1236 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1237 unsigned canMapToVector(Type *T) const;
1238
1239 /// \returns True if the VectorizableTree is both tiny and not fully
1240 /// vectorizable. We do not vectorize such trees.
1241 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1242
1243 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1244 /// can be load combined in the backend. Load combining may not be allowed in
1245 /// the IR optimizer, so we do not want to alter the pattern. For example,
1246 /// partially transforming a scalar bswap() pattern into vector code is
1247 /// effectively impossible for the backend to undo.
1248 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1249 /// may not be necessary.
1250 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1251
1252 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1253 /// can be load combined in the backend. Load combining may not be allowed in
1254 /// the IR optimizer, so we do not want to alter the pattern. For example,
1255 /// partially transforming a scalar bswap() pattern into vector code is
1256 /// effectively impossible for the backend to undo.
1257 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1258 /// may not be necessary.
1259 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1260
1261 /// Checks if the given array of loads can be represented as a vectorized,
1262 /// scatter or just simple gather.
1263 /// \param VL list of loads.
1264 /// \param VL0 main load value.
1265 /// \param Order returned order of load instructions.
1266 /// \param PointerOps returned list of pointer operands.
1267 /// \param TryRecursiveCheck used to check if long masked gather can be
1268 /// represented as a series of loads/insert subvector, if profitable.
1271 SmallVectorImpl<Value *> &PointerOps,
1272 bool TryRecursiveCheck = true) const;
1273
1275
1276 /// This structure holds any data we need about the edges being traversed
1277 /// during buildTree_rec(). We keep track of:
1278 /// (i) the user TreeEntry index, and
1279 /// (ii) the index of the edge.
1280 struct EdgeInfo {
1281 EdgeInfo() = default;
1282 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1284 /// The user TreeEntry.
1285 TreeEntry *UserTE = nullptr;
1286 /// The operand index of the use.
1287 unsigned EdgeIdx = UINT_MAX;
1288#ifndef NDEBUG
1290 const BoUpSLP::EdgeInfo &EI) {
1291 EI.dump(OS);
1292 return OS;
1293 }
1294 /// Debug print.
1295 void dump(raw_ostream &OS) const {
1296 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1297 << " EdgeIdx:" << EdgeIdx << "}";
1298 }
1299 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1300#endif
1301 bool operator == (const EdgeInfo &Other) const {
1302 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1303 }
1304 };
1305
1306 /// A helper class used for scoring candidates for two consecutive lanes.
1308 const TargetLibraryInfo &TLI;
1309 const DataLayout &DL;
1310 ScalarEvolution &SE;
1311 const BoUpSLP &R;
1312 int NumLanes; // Total number of lanes (aka vectorization factor).
1313 int MaxLevel; // The maximum recursion depth for accumulating score.
1314
1315 public:
1317 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1318 int MaxLevel)
1319 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1320 MaxLevel(MaxLevel) {}
1321
1322 // The hard-coded scores listed here are not very important, though they
1323 // should be higher for better matches to improve the resulting cost. When
1324 // computing the scores of matching one sub-tree with another, we are
1325 // basically counting the number of values that are matching. So even if all
1326 // scores are set to 1, we would still get a decent matching result.
1327 // However, sometimes we have to break ties. For example we may have to
1328 // choose between matching loads vs matching opcodes. This is what these
1329 // scores are helping us with: they provide the order of preference. Also,
1330 // this is important if the scalar is externally used or used in another
1331 // tree entry node in the different lane.
1332
1333 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1334 static const int ScoreConsecutiveLoads = 4;
1335 /// The same load multiple times. This should have a better score than
1336 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1337 /// with `movddup (%reg), xmm0` which has a throughput of 0.5 versus 0.5 for
1338 /// a vector load and 1.0 for a broadcast.
1339 static const int ScoreSplatLoads = 3;
1340 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1341 static const int ScoreReversedLoads = 3;
1342 /// A load candidate for masked gather.
1343 static const int ScoreMaskedGatherCandidate = 1;
1344 /// ExtractElementInst from same vector and consecutive indexes.
1345 static const int ScoreConsecutiveExtracts = 4;
1346 /// ExtractElementInst from same vector and reversed indices.
1347 static const int ScoreReversedExtracts = 3;
1348 /// Constants.
1349 static const int ScoreConstants = 2;
1350 /// Instructions with the same opcode.
1351 static const int ScoreSameOpcode = 2;
1353 /// Instructions with alt opcodes (e.g., add + sub).
1353 static const int ScoreAltOpcodes = 1;
1354 /// Identical instructions (a.k.a. splat or broadcast).
1355 static const int ScoreSplat = 1;
1356 /// Matching with an undef is preferable to failing.
1357 static const int ScoreUndef = 1;
1358 /// Score for failing to find a decent match.
1359 static const int ScoreFail = 0;
1360 /// Score if all users are vectorized.
1361 static const int ScoreAllUserVectorized = 1;
1362
1363 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1364 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1365 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1366 /// MainAltOps.
1368 ArrayRef<Value *> MainAltOps) const {
1369 if (!isValidElementType(V1->getType()) ||
1370 !isValidElementType(V2->getType()))
1372
1373 if (V1 == V2) {
1374 if (isa<LoadInst>(V1)) {
1375 // Returns true if the users of V1 and V2 won't need to be extracted.
1376 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1377 // Bail out if we have too many uses to save compilation time.
1378 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1379 return false;
1380
1381 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1382 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1383 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1384 });
1385 };
1386 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1387 };
1388 // A broadcast of a load can be cheaper on some targets.
1389 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1390 ElementCount::getFixed(NumLanes)) &&
1391 ((int)V1->getNumUses() == NumLanes ||
1392 AllUsersAreInternal(V1, V2)))
1394 }
1396 }
1397
1398 auto CheckSameEntryOrFail = [&]() {
1399 if (const TreeEntry *TE1 = R.getTreeEntry(V1);
1400 TE1 && TE1 == R.getTreeEntry(V2))
1403 };
1404
1405 auto *LI1 = dyn_cast<LoadInst>(V1);
1406 auto *LI2 = dyn_cast<LoadInst>(V2);
1407 if (LI1 && LI2) {
1408 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1409 !LI2->isSimple())
1410 return CheckSameEntryOrFail();
1411
1412 std::optional<int> Dist = getPointersDiff(
1413 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1414 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1415 if (!Dist || *Dist == 0) {
1416 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1417 getUnderlyingObject(LI2->getPointerOperand()) &&
1418 R.TTI->isLegalMaskedGather(
1419 FixedVectorType::get(LI1->getType(), NumLanes),
1420 LI1->getAlign()))
1422 return CheckSameEntryOrFail();
1423 }
1424 // The distance is too large - still may be profitable to use masked
1425 // loads/gathers.
1426 if (std::abs(*Dist) > NumLanes / 2)
1428 // This still will detect consecutive loads, but we might have "holes"
1429 // in some cases. It is ok for non-power-2 vectorization and may produce
1430 // better results. It should not affect current vectorization.
1433 }
1434
1435 auto *C1 = dyn_cast<Constant>(V1);
1436 auto *C2 = dyn_cast<Constant>(V2);
1437 if (C1 && C2)
1439
1440 // Extracts from consecutive indexes of the same vector score better, as
1441 // the extracts could be optimized away.
1442 Value *EV1;
1443 ConstantInt *Ex1Idx;
1444 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1445 // Undefs are always profitable for extractelements.
1446 // Compiler can easily combine poison and extractelement <non-poison> or
1447 // undef and extractelement <poison>. But combining undef +
1448 // extractelement <non-poison-but-may-produce-poison> requires some
1449 // extra operations.
1450 if (isa<UndefValue>(V2))
1451 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1454 Value *EV2 = nullptr;
1455 ConstantInt *Ex2Idx = nullptr;
1456 if (match(V2,
1458 m_Undef())))) {
1459 // Undefs are always profitable for extractelements.
1460 if (!Ex2Idx)
1462 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1464 if (EV2 == EV1) {
1465 int Idx1 = Ex1Idx->getZExtValue();
1466 int Idx2 = Ex2Idx->getZExtValue();
1467 int Dist = Idx2 - Idx1;
1468 // The distance is too large - still may be profitable to use
1469 // shuffles.
1470 if (std::abs(Dist) == 0)
1472 if (std::abs(Dist) > NumLanes / 2)
1476 }
1478 }
1479 return CheckSameEntryOrFail();
1480 }
1481
1482 auto *I1 = dyn_cast<Instruction>(V1);
1483 auto *I2 = dyn_cast<Instruction>(V2);
1484 if (I1 && I2) {
1485 if (I1->getParent() != I2->getParent())
1486 return CheckSameEntryOrFail();
1487 SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
1488 Ops.push_back(I1);
1489 Ops.push_back(I2);
1490 InstructionsState S = getSameOpcode(Ops, TLI);
1491 // Note: Only consider instructions with <= 2 operands to avoid
1492 // complexity explosion.
1493 if (S.getOpcode() &&
1494 (S.MainOp->getNumOperands() <= 2 || !MainAltOps.empty() ||
1495 !S.isAltShuffle()) &&
1496 all_of(Ops, [&S](Value *V) {
1497 return cast<Instruction>(V)->getNumOperands() ==
1498 S.MainOp->getNumOperands();
1499 }))
1500 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1502 }
1503
1504 if (isa<UndefValue>(V2))
1506
1507 return CheckSameEntryOrFail();
1508 }
1509
1510 /// Go through the operands of \p LHS and \p RHS recursively until
1511 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1512 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1513 /// of \p U1 and \p U2), except at the beginning of the recursion where
1514 /// these are set to nullptr.
1515 ///
1516 /// For example:
1517 /// \verbatim
1518 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1519 /// \ / \ / \ / \ /
1520 /// + + + +
1521 /// G1 G2 G3 G4
1522 /// \endverbatim
1523 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1524 /// each level recursively, accumulating the score. It starts from matching
1525 /// the additions at level 0, then moves on to the loads (level 1). The
1526 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1527 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1528 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1529 /// Please note that the order of the operands does not matter, as we
1530 /// evaluate the score of all profitable combinations of operands. In
1531 /// other words the score of G1 and G4 is the same as G1 and G2. This
1532 /// heuristic is based on ideas described in:
1533 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1534 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1535 /// Luís F. W. Góes
1537 Instruction *U2, int CurrLevel,
1538 ArrayRef<Value *> MainAltOps) const {
1539
1540 // Get the shallow score of V1 and V2.
1541 int ShallowScoreAtThisLevel =
1542 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1543
1544 // If reached MaxLevel,
1545 // or if V1 and V2 are not instructions,
1546 // or if they are SPLAT,
1547 // or if they are not consecutive,
1548 // or if profitable to vectorize loads or extractelements, early return
1549 // the current cost.
1550 auto *I1 = dyn_cast<Instruction>(LHS);
1551 auto *I2 = dyn_cast<Instruction>(RHS);
1552 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1553 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1554 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1555 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1556 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1557 ShallowScoreAtThisLevel))
1558 return ShallowScoreAtThisLevel;
1559 assert(I1 && I2 && "Should have early exited.");
1560
1561 // Contains the I2 operand indexes that got matched with I1 operands.
1562 SmallSet<unsigned, 4> Op2Used;
1563
1564 // Recursion towards the operands of I1 and I2. We are trying all possible
1565 // operand pairs, and keeping track of the best score.
1566 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1567 OpIdx1 != NumOperands1; ++OpIdx1) {
1568 // Try to pair op1I with the best operand of I2.
1569 int MaxTmpScore = 0;
1570 unsigned MaxOpIdx2 = 0;
1571 bool FoundBest = false;
1572 // If I2 is commutative try all combinations.
1573 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1574 unsigned ToIdx = isCommutative(I2)
1575 ? I2->getNumOperands()
1576 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1577 assert(FromIdx <= ToIdx && "Bad index");
1578 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1579 // Skip operands already paired with OpIdx1.
1580 if (Op2Used.count(OpIdx2))
1581 continue;
1582 // Recursively calculate the cost at each level
1583 int TmpScore =
1584 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1585 I1, I2, CurrLevel + 1, std::nullopt);
1586 // Look for the best score.
1587 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1588 TmpScore > MaxTmpScore) {
1589 MaxTmpScore = TmpScore;
1590 MaxOpIdx2 = OpIdx2;
1591 FoundBest = true;
1592 }
1593 }
1594 if (FoundBest) {
1595 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1596 Op2Used.insert(MaxOpIdx2);
1597 ShallowScoreAtThisLevel += MaxTmpScore;
1598 }
1599 }
1600 return ShallowScoreAtThisLevel;
1601 }
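// Illustrative walk-through of the recursion above, based on the G1..G4
// figure in the comment: getScoreAtLevelRec(G1, G2) first scores the two
// additions at level 0, then recurses into their operands and pairs
// {A[0], A[1]} and {B[0], B[1]} as consecutive loads, so the accumulated
// score of (G1, G2) is higher than that of (G1, G3), whose best operand
// pairing still contains the failing pair {A[0], C[0]}.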
1602 };
1603 /// A helper data structure to hold the operands of a vector of instructions.
1604 /// This supports a fixed vector length for all operand vectors.
1605 class VLOperands {
1606 /// For each operand we need (i) the value, and (ii) the opcode that it
1607 /// would be attached to if the expression was in a left-linearized form.
1608 /// This is required to avoid illegal operand reordering.
1609 /// For example:
1610 /// \verbatim
1611 /// 0 Op1
1612 /// |/
1613 /// Op1 Op2 Linearized + Op2
1614 /// \ / ----------> |/
1615 /// - -
1616 ///
1617 /// Op1 - Op2 (0 + Op1) - Op2
1618 /// \endverbatim
1619 ///
1620 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1621 ///
1622 /// Another way to think of this is to track all the operations across the
1623 /// path from the operand all the way to the root of the tree and to
1624 /// calculate the operation that corresponds to this path. For example, the
1625 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1626 /// corresponding operation is a '-' (which matches the one in the
1627 /// linearized tree, as shown above).
1628 ///
1629 /// For lack of a better term, we refer to this operation as Accumulated
1630 /// Path Operation (APO).
1631 struct OperandData {
1632 OperandData() = default;
1633 OperandData(Value *V, bool APO, bool IsUsed)
1634 : V(V), APO(APO), IsUsed(IsUsed) {}
1635 /// The operand value.
1636 Value *V = nullptr;
1637 /// TreeEntries only allow a single opcode, or an alternate sequence of
1638 /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
1639 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
1640 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
1641 /// (e.g., Add/Mul)
1642 bool APO = false;
1643 /// Helper data for the reordering function.
1644 bool IsUsed = false;
1645 };
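// A hypothetical two-lane example of the APO flag: for VL = {a0 - b0,
// a1 - b1}, operand 0 ({a0, a1}) gets APO == false (it is attached to the
// implicit leading '+' of the left-linearized form), while operand 1
// ({b0, b1}) gets APO == true, because it is attached to the inverse '-'
// operation (see appendOperandsOfVL() below).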
1646
1647 /// During operand reordering, we are trying to select the operand at lane
1648 /// that matches best with the operand at the neighboring lane. Our
1649 /// selection is based on the type of value we are looking for. For example,
1650 /// if the neighboring lane has a load, we need to look for a load that is
1651 /// accessing a consecutive address. These strategies are summarized in the
1652 /// 'ReorderingMode' enumerator.
1653 enum class ReorderingMode {
1654 Load, ///< Matching loads to consecutive memory addresses
1655 Opcode, ///< Matching instructions based on opcode (same or alternate)
1656 Constant, ///< Matching constants
1657 Splat, ///< Matching the same instruction multiple times (broadcast)
1658 Failed, ///< We failed to create a vectorizable group
1659 };
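// Illustration of how a mode is picked (see reorder() below): if the
// operand in the first lane is a LoadInst, the strategy for that operand
// index becomes ReorderingMode::Load; a constant selects Constant; another
// instruction selects Splat or Opcode depending on shouldBroadcast(); and
// an Argument falls back to Splat.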
1660
1661 using OperandDataVec = SmallVector<OperandData, 2>;
1662
1663 /// A vector of operand vectors.
1664 SmallVector<OperandDataVec, 4> OpsVec;
1665
1666 const TargetLibraryInfo &TLI;
1667 const DataLayout &DL;
1668 ScalarEvolution &SE;
1669 const BoUpSLP &R;
1670 const Loop *L = nullptr;
1671
1672 /// \returns the operand data at \p OpIdx and \p Lane.
1673 OperandData &getData(unsigned OpIdx, unsigned Lane) {
1674 return OpsVec[OpIdx][Lane];
1675 }
1676
1677 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
1678 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
1679 return OpsVec[OpIdx][Lane];
1680 }
1681
1682 /// Clears the used flag for all entries.
1683 void clearUsed() {
1684 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
1685 OpIdx != NumOperands; ++OpIdx)
1686 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
1687 ++Lane)
1688 OpsVec[OpIdx][Lane].IsUsed = false;
1689 }
1690
1691 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
1692 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
1693 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
1694 }
1695
1696 /// \param Lane lane of the operands under analysis.
1697 /// \param OpIdx operand index in lane \p Lane for which we're looking for
1698 /// the best candidate.
1699 /// \param Idx operand index of the current candidate value.
1700 /// \returns The additional score due to possible broadcasting of the
1701 /// elements in the lane. It is more profitable to have power-of-2 unique
1702 /// elements in the lane, as it will be vectorized with higher probability
1703 /// after removing duplicates. Currently the SLP vectorizer supports only
1704 /// vectorization of a power-of-2 number of unique scalars.
1705 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1706 Value *IdxLaneV = getData(Idx, Lane).V;
1707 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V)
1708 return 0;
1709 SmallPtrSet<Value *, 4> Uniques;
1710 for (unsigned Ln = 0, E = getNumLanes(); Ln < E; ++Ln) {
1711 if (Ln == Lane)
1712 continue;
1713 Value *OpIdxLnV = getData(OpIdx, Ln).V;
1714 if (!isa<Instruction>(OpIdxLnV))
1715 return 0;
1716 Uniques.insert(OpIdxLnV);
1717 }
1718 int UniquesCount = Uniques.size();
1719 int UniquesCntWithIdxLaneV =
1720 Uniques.contains(IdxLaneV) ? UniquesCount : UniquesCount + 1;
1721 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1722 int UniquesCntWithOpIdxLaneV =
1723 Uniques.contains(OpIdxLaneV) ? UniquesCount : UniquesCount + 1;
1724 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
1725 return 0;
1726 return (PowerOf2Ceil(UniquesCntWithOpIdxLaneV) -
1727 UniquesCntWithOpIdxLaneV) -
1728 (PowerOf2Ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
1729 }
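// Arithmetic sketch of the formula above, with hypothetical counts: if the
// other lanes contribute 3 unique instructions, a candidate that is
// already among them gives UniquesCntWithIdxLaneV == 3 while the current
// operand gives, say, UniquesCntWithOpIdxLaneV == 4, so the result is
// (PowerOf2Ceil(4) - 4) - (PowerOf2Ceil(3) - 3) == 0 - 1 == -1; with the
// roles reversed the result is +1.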
1730
1731 /// \param Lane lane of the operands under analysis.
1732 /// \param OpIdx operand index in lane \p Lane for which we're looking for
1733 /// the best candidate.
1734 /// \param Idx operand index of the current candidate value.
1735 /// \returns The additional score for the scalar which users are all
1736 /// vectorized.
1737 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
1738 Value *IdxLaneV = getData(Idx, Lane).V;
1739 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
1740 // Do not care about number of uses for vector-like instructions
1741 // (extractelement/extractvalue with constant indices), they are extracts
1742 // themselves and already externally used. Vectorization of such
1743 // instructions does not add extra extractelement instruction, just may
1744 // remove it.
1745 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
1746 isVectorLikeInstWithConstOps(OpIdxLaneV))
1747 return LookAheadHeuristics::ScoreAllUserVectorized;
1748 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
1749 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
1750 return 0;
1751 return R.areAllUsersVectorized(IdxLaneI)
1752 ? LookAheadHeuristics::ScoreAllUserVectorized
1753 : 0;
1754 }
1755
1756 /// Score scaling factor for fully compatible instructions but with
1757 /// different number of external uses. Allows better selection of the
1758 /// instructions with less external uses.
1759 static const int ScoreScaleFactor = 10;
1760
1761 /// \Returns the look-ahead score, which tells us how much the sub-trees
1762 /// rooted at \p LHS and \p RHS match, the more they match the higher the
1763 /// score. This helps break ties in an informed way when we cannot decide on
1764 /// the order of the operands by just considering the immediate
1765 /// predecessors.
1766 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
1767 int Lane, unsigned OpIdx, unsigned Idx,
1768 bool &IsUsed) {
1769 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
1770 LookAheadMaxDepth);
1771 // Keep track of the instruction stack as we recurse into the operands
1772 // during the look-ahead score exploration.
1773 int Score =
1774 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
1775 /*CurrLevel=*/1, MainAltOps);
1776 if (Score) {
1777 int SplatScore = getSplatScore(Lane, OpIdx, Idx);
1778 if (Score <= -SplatScore) {
1779 // Set the minimum score for splat-like sequence to avoid setting
1780 // failed state.
1781 Score = 1;
1782 } else {
1783 Score += SplatScore;
1784 // Scale score to see the difference between different operands
1785 // and similar operands but all vectorized/not all vectorized
1786 // uses. It does not affect actual selection of the best
1787 // compatible operand in general, just allows to select the
1788 // operand with all vectorized uses.
1789 Score *= ScoreScaleFactor;
1790 Score += getExternalUseScore(Lane, OpIdx, Idx);
1791 IsUsed = true;
1792 }
1793 }
1794 return Score;
1795 }
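// Hypothetical numbers for the scaling above: a raw look-ahead score of 3
// with SplatScore == 1 becomes (3 + 1) * ScoreScaleFactor == 40, plus the
// external-use bonus from getExternalUseScore(); a score of -1 with
// SplatScore == 1 is instead clamped to the minimum splat-like score of 1.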
1796
1797 /// Best defined scores per lanes between the passes. Used to choose the
1798 /// best operand (with the highest score) between the passes.
1799 /// The key - {Operand Index, Lane}.
1800 /// The value - the best score between the passes for the lane and the
1801 /// operand.
1802 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
1803 BestScoresPerLanes;
1804
1805 // Search all operands in Ops[*][Lane] for the one that matches best
1806 // Ops[OpIdx][LastLane] and return its operand index.
1807 // If no good match can be found, return std::nullopt.
1808 std::optional<unsigned>
1809 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
1810 ArrayRef<ReorderingMode> ReorderingModes,
1811 ArrayRef<Value *> MainAltOps) {
1812 unsigned NumOperands = getNumOperands();
1813
1814 // The operand of the previous lane at OpIdx.
1815 Value *OpLastLane = getData(OpIdx, LastLane).V;
1816
1817 // Our strategy mode for OpIdx.
1818 ReorderingMode RMode = ReorderingModes[OpIdx];
1819 if (RMode == ReorderingMode::Failed)
1820 return std::nullopt;
1821
1822 // The linearized opcode of the operand at OpIdx, Lane.
1823 bool OpIdxAPO = getData(OpIdx, Lane).APO;
1824
1825 // The best operand index and its score.
1826 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
1827 // are using the score to differentiate between the two.
1828 struct BestOpData {
1829 std::optional<unsigned> Idx;
1830 unsigned Score = 0;
1831 } BestOp;
1832 BestOp.Score =
1833 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
1834 .first->second;
1835
1836 // Track if the operand must be marked as used. If the operand is set to
1837 // Score 1 explicitly (because of a non-power-of-2 number of unique scalars),
1838 // we may want to re-estimate the operands again on the following iterations.
1839 bool IsUsed = RMode == ReorderingMode::Splat ||
1840 RMode == ReorderingMode::Constant ||
1841 RMode == ReorderingMode::Load;
1842 // Iterate through all unused operands and look for the best.
1843 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
1844 // Get the operand at Idx and Lane.
1845 OperandData &OpData = getData(Idx, Lane);
1846 Value *Op = OpData.V;
1847 bool OpAPO = OpData.APO;
1848
1849 // Skip already selected operands.
1850 if (OpData.IsUsed)
1851 continue;
1852
1853 // Skip if we are trying to move the operand to a position with a
1854 // different opcode in the linearized tree form. This would break the
1855 // semantics.
1856 if (OpAPO != OpIdxAPO)
1857 continue;
1858
1859 // Look for an operand that matches the current mode.
1860 switch (RMode) {
1861 case ReorderingMode::Load:
1862 case ReorderingMode::Opcode: {
1863 bool LeftToRight = Lane > LastLane;
1864 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1865 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1866 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
1867 OpIdx, Idx, IsUsed);
1868 if (Score > static_cast<int>(BestOp.Score) ||
1869 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
1870 Idx == OpIdx)) {
1871 BestOp.Idx = Idx;
1872 BestOp.Score = Score;
1873 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
1874 }
1875 break;
1876 }
1877 case ReorderingMode::Constant:
1878 if (isa<Constant>(Op) ||
1879 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
1880 BestOp.Idx = Idx;
1881 if (isa<Constant>(Op)) {
1882 BestOp.Score = LookAheadHeuristics::ScoreConstants;
1883 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1884 LookAheadHeuristics::ScoreConstants;
1885 }
1886 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
1887 IsUsed = false;
1888 }
1889 break;
1890 case ReorderingMode::Splat:
1891 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
1892 IsUsed = Op == OpLastLane;
1893 if (Op == OpLastLane) {
1894 BestOp.Score = LookAheadHeuristics::ScoreSplat;
1895 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1896 LookAheadHeuristics::ScoreSplat;
1897 }
1898 BestOp.Idx = Idx;
1899 }
1900 break;
1901 case ReorderingMode::Failed:
1902 llvm_unreachable("Not expected Failed reordering mode.");
1903 }
1904 }
1905
1906 if (BestOp.Idx) {
1907 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
1908 return BestOp.Idx;
1909 }
1910 // If we could not find a good match return std::nullopt.
1911 return std::nullopt;
1912 }
1913
1914 /// Helper for reorderOperandVecs.
1915 /// \returns the lane that we should start reordering from. This is the one
1916 /// which has the least number of operands that can freely move about, or
1917 /// is less profitable because it already has the most optimal set of operands.
1918 unsigned getBestLaneToStartReordering() const {
1919 unsigned Min = UINT_MAX;
1920 unsigned SameOpNumber = 0;
1921 // std::pair<unsigned, unsigned> is used to implement a simple voting
1922 // algorithm and choose the lane with the least number of operands that
1923 // can freely move about or is less profitable because it already has the
1924 // most optimal set of operands. The first unsigned is a counter for
1925 // voting, the second unsigned is the counter of lanes with instructions
1926 // with same/alternate opcodes and same parent basic block.
1927 SmallDenseMap<unsigned, std::pair<unsigned, unsigned>> HashMap;
1928 // Try to be closer to the original results, if we have multiple lanes
1929 // with same cost. If 2 lanes have the same cost, use the one with the
1930 // lowest index.
1931 for (int I = getNumLanes(); I > 0; --I) {
1932 unsigned Lane = I - 1;
1933 OperandsOrderData NumFreeOpsHash =
1934 getMaxNumOperandsThatCanBeReordered(Lane);
1935 // Compare the number of operands that can move and choose the one with
1936 // the least number.
1937 if (NumFreeOpsHash.NumOfAPOs < Min) {
1938 Min = NumFreeOpsHash.NumOfAPOs;
1939 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1940 HashMap.clear();
1941 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1942 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1943 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
1944 // Select the most optimal lane in terms of number of operands that
1945 // should be moved around.
1946 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
1947 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1948 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
1949 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
1950 auto *It = HashMap.find(NumFreeOpsHash.Hash);
1951 if (It == HashMap.end())
1952 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
1953 else
1954 ++It->second.first;
1955 }
1956 }
1957 // Select the lane with the minimum counter.
1958 unsigned BestLane = 0;
1959 unsigned CntMin = UINT_MAX;
1960 for (const auto &Data : reverse(HashMap)) {
1961 if (Data.second.first < CntMin) {
1962 CntMin = Data.second.first;
1963 BestLane = Data.second.second;
1964 }
1965 }
1966 return BestLane;
1967 }
1968
1969 /// Data structure that helps to reorder operands.
1970 struct OperandsOrderData {
1971 /// The best number of operands with the same APOs, which can be
1972 /// reordered.
1973 unsigned NumOfAPOs = UINT_MAX;
1974 /// Number of operands with the same/alternate instruction opcode and
1975 /// parent.
1976 unsigned NumOpsWithSameOpcodeParent = 0;
1977 /// Hash for the actual operands ordering.
1978 /// Used to count operands, actually their position id and opcode
1979 /// value. It is used in the voting mechanism to find the lane with the
1980 /// least number of operands that can freely move about or is less profitable
1981 /// because it already has the most optimal set of operands. Can be
1982 /// replaced with SmallVector<unsigned> instead but hash code is faster
1983 /// and requires less memory.
1984 unsigned Hash = 0;
1985 };
1986 /// \returns the maximum number of operands that are allowed to be reordered
1987 /// for \p Lane and the number of compatible instructions(with the same
1988 /// parent/opcode). This is used as a heuristic for selecting the first lane
1989 /// to start operand reordering.
1990 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
1991 unsigned CntTrue = 0;
1992 unsigned NumOperands = getNumOperands();
1993 // Operands with the same APO can be reordered. We therefore need to count
1994 // how many of them we have for each APO, like this: Cnt[APO] = x.
1995 // Since we only have two APOs, namely true and false, we can avoid using
1996 // a map. Instead we can simply count the number of operands that
1997 // correspond to one of them (in this case the 'true' APO), and calculate
1998 // the other by subtracting it from the total number of operands.
1999 // Operands with the same instruction opcode and parent are more
2000 // profitable since we don't need to move them in many cases, with a high
2001 // probability such lane already can be vectorized effectively.
2002 bool AllUndefs = true;
2003 unsigned NumOpsWithSameOpcodeParent = 0;
2004 Instruction *OpcodeI = nullptr;
2005 BasicBlock *Parent = nullptr;
2006 unsigned Hash = 0;
2007 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2008 const OperandData &OpData = getData(OpIdx, Lane);
2009 if (OpData.APO)
2010 ++CntTrue;
2011 // Use Boyer-Moore majority voting for finding the majority opcode and
2012 // the number of times it occurs.
2013 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2014 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI).getOpcode() ||
2015 I->getParent() != Parent) {
2016 if (NumOpsWithSameOpcodeParent == 0) {
2017 NumOpsWithSameOpcodeParent = 1;
2018 OpcodeI = I;
2019 Parent = I->getParent();
2020 } else {
2021 --NumOpsWithSameOpcodeParent;
2022 }
2023 } else {
2024 ++NumOpsWithSameOpcodeParent;
2025 }
2026 }
2027 Hash = hash_combine(
2028 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2029 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2030 }
2031 if (AllUndefs)
2032 return {};
2033 OperandsOrderData Data;
2034 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2035 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2036 Data.Hash = Hash;
2037 return Data;
2038 }
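// Hypothetical example for a two-operand node: an 'add' lane has both APOs
// false, so CntTrue == 0 and NumOfAPOs == max(0, 2) == 2, while a 'sub'
// lane has one true and one false APO, giving NumOfAPOs == max(1, 1) == 1.
// The smaller value marks the lane whose operands are less free to move,
// which is why getBestLaneToStartReordering() prefers to start there.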
2039
2040 /// Go through the instructions in VL and append their operands.
2041 void appendOperandsOfVL(ArrayRef<Value *> VL) {
2042 assert(!VL.empty() && "Bad VL");
2043 assert((empty() || VL.size() == getNumLanes()) &&
2044 "Expected same number of lanes");
2045 assert(isa<Instruction>(VL[0]) && "Expected instruction");
2046 unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
2047 constexpr unsigned IntrinsicNumOperands = 2;
2048 if (isa<IntrinsicInst>(VL[0]))
2049 NumOperands = IntrinsicNumOperands;
2050 OpsVec.resize(NumOperands);
2051 unsigned NumLanes = VL.size();
2052 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2053 OpsVec[OpIdx].resize(NumLanes);
2054 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2055 assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
2056 // Our tree has just 3 nodes: the root and two operands.
2057 // It is therefore trivial to get the APO. We only need to check the
2058 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2059 // RHS operand. The LHS operand of both add and sub is never attached
2060 // to an inverse operation in the linearized form, therefore its APO
2061 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2062
2063 // Since operand reordering is performed on groups of commutative
2064 // operations or alternating sequences (e.g., +, -), we can safely
2065 // tell the inverse operations by checking commutativity.
2066 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2067 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2068 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2069 APO, false};
2070 }
2071 }
2072 }
2073
2074 /// \returns the number of operands.
2075 unsigned getNumOperands() const { return OpsVec.size(); }
2076
2077 /// \returns the number of lanes.
2078 unsigned getNumLanes() const { return OpsVec[0].size(); }
2079
2080 /// \returns the operand value at \p OpIdx and \p Lane.
2081 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2082 return getData(OpIdx, Lane).V;
2083 }
2084
2085 /// \returns true if the data structure is empty.
2086 bool empty() const { return OpsVec.empty(); }
2087
2088 /// Clears the data.
2089 void clear() { OpsVec.clear(); }
2090
2091 /// \Returns true if there are enough operands identical to \p Op to fill
2092 /// the whole vector (it is mixed with constants or loop invariant values).
2093 /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
2094 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2095 bool OpAPO = getData(OpIdx, Lane).APO;
2096 bool IsInvariant = L && L->isLoopInvariant(Op);
2097 unsigned Cnt = 0;
2098 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2099 if (Ln == Lane)
2100 continue;
2101 // This is set to true if we found a candidate for broadcast at Lane.
2102 bool FoundCandidate = false;
2103 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2104 OperandData &Data = getData(OpI, Ln);
2105 if (Data.APO != OpAPO || Data.IsUsed)
2106 continue;
2107 Value *OpILane = getValue(OpI, Lane);
2108 bool IsConstantOp = isa<Constant>(OpILane);
2109 // Consider the broadcast candidate if:
2110 // 1. Same value is found in one of the operands.
2111 if (Data.V == Op ||
2112 // 2. The operand in the given lane is not constant but there is a
2113 // constant operand in another lane (which can be moved to the
2114 // given lane). In this case we can represent it as a simple
2115 // permutation of constant and broadcast.
2116 (!IsConstantOp &&
2117 ((Lns > 2 && isa<Constant>(Data.V)) ||
2118 // 2.1. If we have only 2 lanes, need to check that value in the
2119 // next lane does not build same opcode sequence.
2120 (Lns == 2 &&
2121 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI)
2122 .getOpcode() &&
2123 isa<Constant>(Data.V)))) ||
2124 // 3. The operand in the current lane is loop invariant (can be
2125 // hoisted out) and another operand is also a loop invariant
2126 // (though not a constant). In this case the whole vector can be
2127 // hoisted out.
2128 // FIXME: need to teach the cost model about this case for better
2129 // estimation.
2130 (IsInvariant && !isa<Constant>(Data.V) &&
2131 !getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
2132 L->isLoopInvariant(Data.V))) {
2133 FoundCandidate = true;
2134 Data.IsUsed = Data.V == Op;
2135 if (Data.V == Op)
2136 ++Cnt;
2137 break;
2138 }
2139 }
2140 if (!FoundCandidate)
2141 return false;
2142 }
2143 return getNumLanes() == 2 || Cnt > 1;
2144 }
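// Hypothetical example: for the four lanes {x + 1, x + 2, x + 3, x + 4},
// operand 0 equals 'x' in every lane, so shouldBroadcast(x, /*OpIdx=*/0,
// /*Lane=*/0) finds a matching candidate in each of the other three lanes
// (Cnt == 3 > 1) and returns true; operand 0 is then handled with the
// Splat reordering mode.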
2145
2146 public:
2147 /// Initialize with all the operands of the instruction vector \p RootVL.
2148 VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
2149 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2150 L(R.LI->getLoopFor(
2151 (cast<Instruction>(RootVL.front())->getParent()))) {
2152 // Append all the operands of RootVL.
2153 appendOperandsOfVL(RootVL);
2154 }
2155
2156 /// \Returns a value vector with the operands across all lanes for the
2157 /// operand at \p OpIdx.
2158 ValueList getVL(unsigned OpIdx) const {
2159 ValueList OpVL(OpsVec[OpIdx].size());
2160 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2161 "Expected same num of lanes across all operands");
2162 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2163 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2164 return OpVL;
2165 }
2166
2167 // Performs operand reordering for 2 or more operands.
2168 // The original operands are in OrigOps[OpIdx][Lane].
2169 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2170 void reorder() {
2171 unsigned NumOperands = getNumOperands();
2172 unsigned NumLanes = getNumLanes();
2173 // Each operand has its own mode. We are using this mode to help us select
2174 // the instructions for each lane, so that they match best with the ones
2175 // we have selected so far.
2176 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2177
2178 // This is a greedy single-pass algorithm. We are going over each lane
2179 // once and deciding on the best order right away with no back-tracking.
2180 // However, in order to increase its effectiveness, we start with the lane
2181 // that has operands that can move the least. For example, given the
2182 // following lanes:
2183 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2184 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2185 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2186 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2187 // we will start at Lane 1, since the operands of the subtraction cannot
2188 // be reordered. Then we will visit the rest of the lanes in a circular
2189 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2190
2191 // Find the first lane that we will start our search from.
2192 unsigned FirstLane = getBestLaneToStartReordering();
2193
2194 // Initialize the modes.
2195 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2196 Value *OpLane0 = getValue(OpIdx, FirstLane);
2197 // Keep track if we have instructions with all the same opcode on one
2198 // side.
2199 if (isa<LoadInst>(OpLane0))
2200 ReorderingModes[OpIdx] = ReorderingMode::Load;
2201 else if (isa<Instruction>(OpLane0)) {
2202 // Check if OpLane0 should be broadcast.
2203 if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
2204 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2205 else
2206 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2207 }
2208 else if (isa<Constant>(OpLane0))
2209 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2210 else if (isa<Argument>(OpLane0))
2211 // Our best hope is a Splat. It may save some cost in some cases.
2212 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2213 else
2214 // NOTE: This should be unreachable.
2215 ReorderingModes[OpIdx] = ReorderingMode::Failed;
2216 }
2217
2218 // Check that we don't have same operands. No need to reorder if operands
2219 // are just perfect diamond or shuffled diamond match. Do not do it only
2220 // for possible broadcasts or non-power of 2 number of scalars (just for
2221 // now).
2222 auto &&SkipReordering = [this]() {
2223 SmallPtrSet<Value *, 4> UniqueValues;
2224 ArrayRef<OperandData> Op0 = OpsVec.front();
2225 for (const OperandData &Data : Op0)
2226 UniqueValues.insert(Data.V);
2227 for (ArrayRef<OperandData> Op : drop_begin(OpsVec, 1)) {
2228 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2229 return !UniqueValues.contains(Data.V);
2230 }))
2231 return false;
2232 }
2233 // TODO: Check if we can remove a check for non-power-2 number of
2234 // scalars after full support of non-power-2 vectorization.
2235 return UniqueValues.size() != 2 && isPowerOf2_32(UniqueValues.size());
2236 };
2237
2238 // If the initial strategy fails for any of the operand indexes, then we
2239 // perform reordering again in a second pass. This helps avoid assigning
2240 // high priority to the failed strategy, and should improve reordering for
2241 // the non-failed operand indexes.
2242 for (int Pass = 0; Pass != 2; ++Pass) {
2243 // Check if there is no need to reorder operands since they are perfect or
2244 // shuffled diamond match.
2245 // Need to do it to avoid extra external use cost counting for
2246 // shuffled matches, which may cause regressions.
2247 if (SkipReordering())
2248 break;
2249 // Skip the second pass if the first pass did not fail.
2250 bool StrategyFailed = false;
2251 // Mark all operand data as free to use.
2252 clearUsed();
2253 // We keep the original operand order for the FirstLane, so reorder the
2254 // rest of the lanes. We are visiting the nodes in a circular fashion,
2255 // using FirstLane as the center point and increasing the radius
2256 // distance.
2257 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2258 for (unsigned I = 0; I < NumOperands; ++I)
2259 MainAltOps[I].push_back(getData(I, FirstLane).V);
2260
2261 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2262 // Visit the lane on the right and then the lane on the left.
2263 for (int Direction : {+1, -1}) {
2264 int Lane = FirstLane + Direction * Distance;
2265 if (Lane < 0 || Lane >= (int)NumLanes)
2266 continue;
2267 int LastLane = Lane - Direction;
2268 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2269 "Out of bounds");
2270 // Look for a good match for each operand.
2271 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2272 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2273 std::optional<unsigned> BestIdx = getBestOperand(
2274 OpIdx, Lane, LastLane, ReorderingModes, MainAltOps[OpIdx]);
2275 // By not selecting a value, we allow the operands that follow to
2276 // select a better matching value. We will get a non-null value in
2277 // the next run of getBestOperand().
2278 if (BestIdx) {
2279 // Swap the current operand with the one returned by
2280 // getBestOperand().
2281 swap(OpIdx, *BestIdx, Lane);
2282 } else {
2283 // Enable the second pass.
2284 StrategyFailed = true;
2285 }
2286 // Try to get the alternate opcode and follow it during analysis.
2287 if (MainAltOps[OpIdx].size() != 2) {
2288 OperandData &AltOp = getData(OpIdx, Lane);
2289 InstructionsState OpS =
2290 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2291 if (OpS.getOpcode() && OpS.isAltShuffle())
2292 MainAltOps[OpIdx].push_back(AltOp.V);
2293 }
2294 }
2295 }
2296 }
2297 // Skip second pass if the strategy did not fail.
2298 if (!StrategyFailed)
2299 break;
2300 }
2301 }
2302
2303#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2304 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2305 switch (RMode) {
2306 case ReorderingMode::Load:
2307 return "Load";
2308 case ReorderingMode::Opcode:
2309 return "Opcode";
2310 case ReorderingMode::Constant:
2311 return "Constant";
2312 case ReorderingMode::Splat:
2313 return "Splat";
2314 case ReorderingMode::Failed:
2315 return "Failed";
2316 }
2317 llvm_unreachable("Unimplemented Reordering Type");
2318 }
2319
2320 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2321 raw_ostream &OS) {
2322 return OS << getModeStr(RMode);
2323 }
2324
2325 /// Debug print.
2326 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2327 printMode(RMode, dbgs());
2328 }
2329
2330 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2331 return printMode(RMode, OS);
2332 }
2333
2334 LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
2335 const unsigned Indent = 2;
2336 unsigned Cnt = 0;
2337 for (const OperandDataVec &OpDataVec : OpsVec) {
2338 OS << "Operand " << Cnt++ << "\n";
2339 for (const OperandData &OpData : OpDataVec) {
2340 OS.indent(Indent) << "{";
2341 if (Value *V = OpData.V)
2342 OS << *V;
2343 else
2344 OS << "null";
2345 OS << ", APO:" << OpData.APO << "}\n";
2346 }
2347 OS << "\n";
2348 }
2349 return OS;
2350 }
2351
2352 /// Debug print.
2353 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2354#endif
2355 };
2356
2357 /// Evaluate each pair in \p Candidates and return index into \p Candidates
2358 /// for a pair which have highest score deemed to have best chance to form
2359 /// root of profitable tree to vectorize. Return std::nullopt if no candidate
2360 /// scored above the LookAheadHeuristics::ScoreFail. \param Limit Lower limit
2361 /// of the cost, considered to be good enough score.
2362 std::optional<int>
2363 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2364 int Limit = LookAheadHeuristics::ScoreFail) const {
2365 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2366 RootLookAheadMaxDepth);
2367 int BestScore = Limit;
2368 std::optional<int> Index;
2369 for (int I : seq<int>(0, Candidates.size())) {
2370 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2371 Candidates[I].second,
2372 /*U1=*/nullptr, /*U2=*/nullptr,
2373 /*Level=*/1, std::nullopt);
2374 if (Score > BestScore) {
2375 BestScore = Score;
2376 Index = I;
2377 }
2378 }
2379 return Index;
2380 }
2381
2382 /// Checks if the instruction is marked for deletion.
2383 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2384
2385 /// Removes an instruction from its block and eventually deletes it.
2386 /// It's like Instruction::eraseFromParent() except that the actual deletion
2387 /// is delayed until BoUpSLP is destructed.
2388 void eraseInstruction(Instruction *I) {
2389 DeletedInstructions.insert(I);
2390 }
2391
2392 /// Checks if the instruction was already analyzed for being possible
2393 /// reduction root.
2394 bool isAnalyzedReductionRoot(Instruction *I) const {
2395 return AnalyzedReductionsRoots.count(I);
2396 }
2397 /// Register given instruction as already analyzed for being possible
2398 /// reduction root.
2399 void analyzedReductionRoot(Instruction *I) {
2400 AnalyzedReductionsRoots.insert(I);
2401 }
2402 /// Checks if the provided list of reduced values was checked already for
2403 /// vectorization.
2404 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2405 return AnalyzedReductionVals.contains(hash_value(VL));
2406 }
2407 /// Adds the list of reduced values to list of already checked values for the
2408 /// vectorization.
2409 void analyzedReductionVals(ArrayRef<Value *> VL) {
2410 AnalyzedReductionVals.insert(hash_value(VL));
2411 }
2412 /// Clear the list of the analyzed reduction root instructions.
2413 void clearReductionData() {
2414 AnalyzedReductionsRoots.clear();
2415 AnalyzedReductionVals.clear();
2416 AnalyzedMinBWVals.clear();
2417 }
2418 /// Checks if the given value is gathered in one of the nodes.
2419 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2420 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2421 }
2422 /// Checks if the given value is gathered in one of the nodes.
2423 bool isGathered(const Value *V) const {
2424 return MustGather.contains(V);
2425 }
2426 /// Checks if the specified value was not scheduled.
2427 bool isNotScheduled(const Value *V) const {
2428 return NonScheduledFirst.contains(V);
2429 }
2430
2431 /// Check if the value is vectorized in the tree.
2432 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2433
2434 ~BoUpSLP();
2435
2436private:
2437 /// Determine if a node \p E can be demoted to a smaller type with a
2438 /// truncation. We collect the entries that will be demoted in ToDemote.
2439 /// \param E Node for analysis
2440 /// \param ToDemote indices of the nodes to be demoted.
2441 bool collectValuesToDemote(const TreeEntry &E, bool IsProfitableToDemoteRoot,
2442 unsigned &BitWidth,
2443 SmallVectorImpl<unsigned> &ToDemote,
2445 unsigned &MaxDepthLevel,
2446 bool &IsProfitableToDemote,
2447 bool IsTruncRoot) const;
2448
2449 /// Check if the operands on the edges \p Edges of the \p UserTE allows
2450 /// reordering (i.e. the operands can be reordered because they have only one
2451 /// user and are reorderable).
2452 /// \param ReorderableGathers List of all gather nodes that require reordering
2453 /// (e.g., gather of extractelements or partially vectorizable loads).
2454 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2455 /// reordering, subset of \p NonVectorized.
2456 bool
2457 canReorderOperands(TreeEntry *UserTE,
2458 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2459 ArrayRef<TreeEntry *> ReorderableGathers,
2460 SmallVectorImpl<TreeEntry *> &GatherOps);
2461
2462 /// Checks if the given \p TE is a gather node with clustered reused scalars
2463 /// and reorders it per given \p Mask.
2464 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2465
2466 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2467 /// if any. If it is not vectorized (gather node), returns nullptr.
2468 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2469 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2470 TreeEntry *TE = nullptr;
2471 const auto *It = find_if(VL, [&](Value *V) {
2472 TE = getTreeEntry(V);
2473 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2474 return true;
2475 auto It = MultiNodeScalars.find(V);
2476 if (It != MultiNodeScalars.end()) {
2477 for (TreeEntry *E : It->second) {
2478 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2479 TE = E;
2480 return true;
2481 }
2482 }
2483 }
2484 return false;
2485 });
2486 if (It != VL.end()) {
2487 assert(TE->isSame(VL) && "Expected same scalars.");
2488 return TE;
2489 }
2490 return nullptr;
2491 }
2492
2493 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2494 /// if any. If it is not vectorized (gather node), returns nullptr.
2495 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
2496 unsigned OpIdx) const {
2497 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
2498 const_cast<TreeEntry *>(UserTE), OpIdx);
2499 }
2500
2501 /// Checks if all users of \p I are the part of the vectorization tree.
2502 bool areAllUsersVectorized(
2503 Instruction *I,
2504 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
2505
2506 /// Return information about the vector formed for the specified index
2507 /// of a vector of (the same) instruction.
2509
2510 /// \returns the graph entry for the \p Idx operand of the \p E entry.
2511 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
2512
2513 /// \returns Cast context for the given graph node.
2514 TargetTransformInfo::CastContextHint
2515 getCastContextHint(const TreeEntry &TE) const;
2516
2517 /// \returns the cost of the vectorizable entry.
2518 InstructionCost getEntryCost(const TreeEntry *E,
2519 ArrayRef<Value *> VectorizedVals,
2520 SmallPtrSetImpl<Value *> &CheckedExtracts);
2521
2522 /// This is the recursive part of buildTree.
2523 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
2524 const EdgeInfo &EI);
2525
2526 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
2527 /// be vectorized to use the original vector (or aggregate "bitcast" to a
2528 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
2529 /// returns false, setting \p CurrentOrder to either an empty vector or a
2530 /// non-identity permutation that allows to reuse extract instructions.
2531 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
2532 /// extract order.
2533 bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
2534 SmallVectorImpl<unsigned> &CurrentOrder,
2535 bool ResizeAllowed = false) const;
2536
2537 /// Vectorize a single entry in the tree.
2538 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2539 /// avoid issues with def-use order.
2540 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
2541
2542 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
2543 /// \p E.
2544 /// \param PostponedPHIs true, if need to postpone emission of phi nodes to
2545 /// avoid issues with def-use order.
2546 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
2547
2548 /// Create a new vector from a list of scalar values. Produces a sequence
2549 /// which exploits values reused across lanes, and arranges the inserts
2550 /// for ease of later optimization.
2551 template <typename BVTy, typename ResTy, typename... Args>
2552 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
2553
2554 /// Create a new vector from a list of scalar values. Produces a sequence
2555 /// which exploits values reused across lanes, and arranges the inserts
2556 /// for ease of later optimization.
2557 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy);
2558
2559 /// Returns the instruction in the bundle, which can be used as a base point
2560 /// for scheduling. Usually it is the last instruction in the bundle, except
2561 /// for the case when all operands are external (in this case, it is the first
2562 /// instruction in the list).
2563 Instruction &getLastInstructionInBundle(const TreeEntry *E);
2564
2565 /// Tries to find extractelement instructions with constant indices from fixed
2566 /// vector type and gather such instructions into a bunch, which highly likely
2567 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2568 /// was successful, the matched scalars are replaced by poison values in \p VL
2569 /// for future analysis.
2570 std::optional<TargetTransformInfo::ShuffleKind>
2571 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
2572 SmallVectorImpl<int> &Mask) const;
2573
2574 /// Tries to find extractelement instructions with constant indices from fixed
2575 /// vector type and gather such instructions into a bunch, which highly likely
2576 /// might be detected as a shuffle of 1 or 2 input vectors. If this attempt
2577 /// was successful, the matched scalars are replaced by poison values in \p VL
2578 /// for future analysis.
2579 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2580 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
2581 SmallVectorImpl<int> &Mask,
2582 unsigned NumParts) const;
2583
2584 /// Checks if the gathered \p VL can be represented as a single register
2585 /// shuffle(s) of previous tree entries.
2586 /// \param TE Tree entry checked for permutation.
2587 /// \param VL List of scalars (a subset of the TE scalars), checked for
2588 /// permutations. Must form single-register vector.
2589 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2590 /// commands to build the mask using the original vector value, without
2591 /// relying on the potential reordering.
2592 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
2593 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
2594 std::optional<TargetTransformInfo::ShuffleKind>
2595 isGatherShuffledSingleRegisterEntry(
2596 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
2597 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
2598 bool ForOrder);
2599
2600 /// Checks if the gathered \p VL can be represented as multi-register
2601 /// shuffle(s) of previous tree entries.
2602 /// \param TE Tree entry checked for permutation.
2603 /// \param VL List of scalars (a subset of the TE scalars), checked for
2604 /// permutations.
2605 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
2606 /// commands to build the mask using the original vector value, without
2607 /// relying on the potential reordering.
2608 /// \returns per-register series of ShuffleKind, if gathered values can be
2609 /// represented as shuffles of previous tree entries. \p Mask is filled with
2610 /// the shuffle mask (also on per-register base).
2611 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
2612 isGatherShuffledEntry(
2613 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
2614 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries,
2615 unsigned NumParts, bool ForOrder = false);
2616
2617 /// \returns the scalarization cost for this list of values. Assuming that
2618 /// this subtree gets vectorized, we may need to extract the values from the
2619 /// roots. This method calculates the cost of extracting the values.
2620 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
2621 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
2622 Type *ScalarTy) const;
2623
2624 /// Set the Builder insert point to one after the last instruction in
2625 /// the bundle
2626 void setInsertPointAfterBundle(const TreeEntry *E);
2627
2628 /// \returns a vector from a collection of scalars in \p VL. if \p Root is not
2629 /// specified, the starting vector value is poison.
2630 Value *gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy);
2631
2632 /// \returns whether the VectorizableTree is fully vectorizable and will
2633 /// be beneficial even the tree height is tiny.
2634 bool isFullyVectorizableTinyTree(bool ForReduction) const;
2635
2636 /// Reorder commutative or alt operands to get better probability of
2637 /// generating vectorized code.
2638 static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
2639 SmallVectorImpl<Value *> &Left,
2640 SmallVectorImpl<Value *> &Right,
2641 const BoUpSLP &R);
2642
2643 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
2644 /// users of \p TE and collects the stores. It returns the map from the store
2645 /// pointers to the collected stores.
2646 DenseMap<Value *, SmallVector<StoreInst *>>
2647 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
2648
2649 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
2650 /// stores in \p StoresVec can form a vector instruction. If so it returns
2651 /// true and populates \p ReorderIndices with the shuffle indices of the
2652 /// stores when compared to the sorted vector.
2653 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
2654 OrdersType &ReorderIndices) const;
2655
2656 /// Iterates through the users of \p TE, looking for scalar stores that can be
2657 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
2658 /// their order and builds an order index vector for each store bundle. It
2659 /// returns all these order vectors found.
2660 /// We run this after the tree has formed, otherwise we may come across user
2661 /// instructions that are not yet in the tree.
2662 SmallVector<OrdersType, 1>
2663 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
2664
2665 struct TreeEntry {
2666 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
2667 TreeEntry(VecTreeTy &Container) : Container(Container) {}
2668
2669 /// \returns Common mask for reorder indices and reused scalars.
2670 SmallVector<int> getCommonMask() const {
2671 SmallVector<int> Mask;
2672 inversePermutation(ReorderIndices, Mask);
2673 ::addMask(Mask, ReuseShuffleIndices);
2674 return Mask;
2675 }
2676
2677 /// \returns true if the scalars in VL are equal to this entry.
2678 bool isSame(ArrayRef<Value *> VL) const {
2679 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
2680 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
2681 return std::equal(VL.begin(), VL.end(), Scalars.begin());
2682 return VL.size() == Mask.size() &&
2683 std::equal(VL.begin(), VL.end(), Mask.begin(),
2684 [Scalars](Value *V, int Idx) {
2685 return (isa<UndefValue>(V) &&
2686 Idx == PoisonMaskElem) ||
2687 (Idx != PoisonMaskElem && V == Scalars[Idx]);
2688 });
2689 };
2690 if (!ReorderIndices.empty()) {
2691 // TODO: implement matching if the nodes are just reordered, still can
2692 // treat the vector as the same if the list of scalars matches VL
2693 // directly, without reordering.
2694 SmallVector<int> Mask;
2695 inversePermutation(ReorderIndices, Mask);
2696 if (VL.size() == Scalars.size())
2697 return IsSame(Scalars, Mask);
2698 if (VL.size() == ReuseShuffleIndices.size()) {
2699 ::addMask(Mask, ReuseShuffleIndices);
2700 return IsSame(Scalars, Mask);
2701 }
2702 return false;
2703 }
2704 return IsSame(Scalars, ReuseShuffleIndices);
2705 }
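// Hypothetical example of the reuse-mask comparison above: with
// Scalars == {a, b}, empty ReorderIndices and
// ReuseShuffleIndices == {0, 1, 0, 1}, the list VL == {a, b, a, b} is
// reported as the same entry, because every VL element matches
// Scalars[Mask[i]].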
2706
2707 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
2708 return State == TreeEntry::NeedToGather &&
2709 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
2710 UserTreeIndices.front().UserTE == UserEI.UserTE;
2711 }
2712
2713 /// \returns true if current entry has same operands as \p TE.
2714 bool hasEqualOperands(const TreeEntry &TE) const {
2715 if (TE.getNumOperands() != getNumOperands())
2716 return false;
2717 SmallBitVector Used(getNumOperands());
2718 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
2719 unsigned PrevCount = Used.count();
2720 for (unsigned K = 0; K < E; ++K) {
2721 if (Used.test(K))
2722 continue;
2723 if (getOperand(K) == TE.getOperand(I)) {
2724 Used.set(K);
2725 break;
2726 }
2727 }
2728 // Check if we actually found the matching operand.
2729 if (PrevCount == Used.count())
2730 return false;
2731 }
2732 return true;
2733 }
2734
2735 /// \return Final vectorization factor for the node. Defined by the total
2736 /// number of vectorized scalars, including those used several times in the
2737 /// entry and counted in the \a ReuseShuffleIndices, if any.
2738 unsigned getVectorFactor() const {
2739 if (!ReuseShuffleIndices.empty())
2740 return ReuseShuffleIndices.size();
2741 return Scalars.size();
2742 };
2743
2744 /// A vector of scalars.
2745 ValueList Scalars;
2746
2747 /// The Scalars are vectorized into this value. It is initialized to Null.
2748 WeakTrackingVH VectorizedValue = nullptr;
2749
2750 /// New vector phi instructions emitted for the vectorized phi nodes.
2751 PHINode *PHI = nullptr;
2752
2753 /// Do we need to gather this sequence or vectorize it
2754 /// (either with vector instruction or with scatter/gather
2755 /// intrinsics for store/load)?
2756 enum EntryState {
2757 Vectorize,
2758 ScatterVectorize,
2759 StridedVectorize,
2760 NeedToGather
2761 };
2762 EntryState State;
2763
2764 /// Does this sequence require some shuffling?
2765 SmallVector<int, 4> ReuseShuffleIndices;
2766
2767 /// Does this entry require reordering?
2768 SmallVector<unsigned, 4> ReorderIndices;
2769
2770 /// Points back to the VectorizableTree.
2771 ///
2772 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
2773 /// to be a pointer and needs to be able to initialize the child iterator.
2774 /// Thus we need a reference back to the container to translate the indices
2775 /// to entries.
2776 VecTreeTy &Container;
2777
2778 /// The TreeEntry index containing the user of this entry. We can actually
2779 /// have multiple users so the data structure is not truly a tree.
2780 SmallVector<EdgeInfo, 1> UserTreeIndices;
2781
2782 /// The index of this treeEntry in VectorizableTree.
2783 int Idx = -1;
2784
2785 private:
2786 /// The operands of each instruction in each lane Operands[op_index][lane].
2787 /// Note: This helps avoid the replication of the code that performs the
2788 /// reordering of operands during buildTree_rec() and vectorizeTree().
2789 SmallVector<ValueList, 2> Operands;
2790
2791 /// The main/alternate instruction.
2792 Instruction *MainOp = nullptr;
2793 Instruction *AltOp = nullptr;
2794
2795 public:
2796 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
2797 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
2798 if (Operands.size() < OpIdx + 1)
2799 Operands.resize(OpIdx + 1);
2800 assert(Operands[OpIdx].empty() && "Already resized?");
2801 assert(OpVL.size() <= Scalars.size() &&
2802 "Number of operands is greater than the number of scalars.");
2803 Operands[OpIdx].resize(OpVL.size());
2804 copy(OpVL, Operands[OpIdx].begin());
2805 }
2806
2807 /// Set the operands of this bundle in their original order.
2808 void setOperandsInOrder() {
2809 assert(Operands.empty() && "Already initialized?");
2810 auto *I0 = cast<Instruction>(Scalars[0]);
2811 Operands.resize(I0->getNumOperands());
2812 unsigned NumLanes = Scalars.size();
2813 for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
2814 OpIdx != NumOperands; ++OpIdx) {
2815 Operands[OpIdx].resize(NumLanes);
2816 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2817 auto *I = cast<Instruction>(Scalars[Lane]);
2818 assert(I->getNumOperands() == NumOperands &&
2819 "Expected same number of operands");
2820 Operands[OpIdx][Lane] = I->getOperand(OpIdx);
2821 }
2822 }
2823 }
2824
2825 /// Reorders operands of the node to the given mask \p Mask.
2826 void reorderOperands(ArrayRef<int> Mask) {
2827 for (ValueList &Operand : Operands)
2828 reorderScalars(Operand, Mask);
2829 }
2830
2831 /// \returns the \p OpIdx operand of this TreeEntry.
2832 ValueList &getOperand(unsigned OpIdx) {
2833 assert(OpIdx < Operands.size() && "Off bounds");
2834 return Operands[OpIdx];
2835 }
2836
2837 /// \returns the \p OpIdx operand of this TreeEntry.
2838 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
2839 assert(OpIdx < Operands.size() && "Off bounds");
2840 return Operands[OpIdx];
2841 }
2842
2843 /// \returns the number of operands.
2844 unsigned getNumOperands() const { return Operands.size(); }
2845
2846 /// \return the single \p OpIdx operand.
2847 Value *getSingleOperand(unsigned OpIdx) const {
2848 assert(OpIdx < Operands.size() && "Off bounds");
2849 assert(!Operands[OpIdx].empty() && "No operand available");
2850 return Operands[OpIdx][0];
2851 }
2852
2853 /// Some of the instructions in the list have alternate opcodes.
2854 bool isAltShuffle() const { return MainOp != AltOp; }
2855
2856 bool isOpcodeOrAlt(Instruction *I) const {
2857 unsigned CheckedOpcode = I->getOpcode();
2858 return (getOpcode() == CheckedOpcode ||
2859 getAltOpcode() == CheckedOpcode);
2860 }
2861
2862 /// Chooses the correct key for scheduling data. If \p Op has the same (or
2863 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
2864 /// \p OpValue.
2865 Value *isOneOf(Value *Op) const {
2866 auto *I = dyn_cast<Instruction>(Op);
2867 if (I && isOpcodeOrAlt(I))
2868 return Op;
2869 return MainOp;
2870 }
2871
2872 void setOperations(const InstructionsState &S) {
2873 MainOp = S.MainOp;
2874 AltOp = S.AltOp;
2875 }
2876
2877 Instruction *getMainOp() const {
2878 return MainOp;
2879 }
2880
2881 Instruction *getAltOp() const {
2882 return AltOp;
2883 }
2884
2885 /// The main/alternate opcodes for the list of instructions.
2886 unsigned getOpcode() const {
2887 return MainOp ? MainOp->getOpcode() : 0;
2888 }
2889
2890 unsigned getAltOpcode() const {
2891 return AltOp ? AltOp->getOpcode() : 0;
2892 }
2893
2894 /// When ReuseReorderShuffleIndices is empty it just returns position of \p
2895 /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
2896 int findLaneForValue(Value *V) const {
2897 unsigned FoundLane = std::distance(Scalars.begin(), find(Scalars, V));
2898 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2899 if (!ReorderIndices.empty())
2900 FoundLane = ReorderIndices[FoundLane];
2901 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
2902 if (!ReuseShuffleIndices.empty()) {
2903 FoundLane = std::distance(ReuseShuffleIndices.begin(),
2904 find(ReuseShuffleIndices, FoundLane));
2905 }
2906 return FoundLane;
2907 }
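// Hypothetical walk-through: with Scalars == {a, b, c, d},
// ReorderIndices == {2, 3, 0, 1} and empty ReuseShuffleIndices, looking up
// 'c' finds raw lane 2, which ReorderIndices remaps to lane 0; if
// ReuseShuffleIndices were non-empty, the final lane would instead be the
// position of that lane number inside ReuseShuffleIndices.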
2908
2909 /// Build a shuffle mask for graph entry which represents a merge of main
2910 /// and alternate operations.
2911 void
2912 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
2913 SmallVectorImpl<int> &Mask,
2914 SmallVectorImpl<Value *> *OpScalars = nullptr,
2915 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
2916
2917 /// Return true if this is a non-power-of-2 node.
2918 bool isNonPowOf2Vec() const {
2919 bool IsNonPowerOf2 = !isPowerOf2_32(Scalars.size());
2920 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
2921 "Reshuffling not supported with non-power-of-2 vectors yet.");
2922 return IsNonPowerOf2;
2923 }
2924
2925#ifndef NDEBUG
2926 /// Debug printer.
2927 LLVM_DUMP_METHOD void dump() const {
2928 dbgs() << Idx << ".\n";
2929 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
2930 dbgs() << "Operand " << OpI << ":\n";
2931 for (const Value *V : Operands[OpI])
2932 dbgs().indent(2) << *V << "\n";
2933 }
2934 dbgs() << "Scalars: \n";
2935 for (Value *V : Scalars)
2936 dbgs().indent(2) << *V << "\n";
2937 dbgs() << "State: ";
2938 switch (State) {
2939 case Vectorize:
2940 dbgs() << "Vectorize\n";
2941 break;
2942 case ScatterVectorize:
2943 dbgs() << "ScatterVectorize\n";
2944 break;
2945 case StridedVectorize:
2946 dbgs() << "StridedVectorize\n";
2947 break;
2948 case NeedToGather:
2949 dbgs() << "NeedToGather\n";
2950 break;
2951 }
2952 dbgs() << "MainOp: ";
2953 if (MainOp)
2954 dbgs() << *MainOp << "\n";
2955 else
2956 dbgs() << "NULL\n";
2957 dbgs() << "AltOp: ";
2958 if (AltOp)
2959 dbgs() << *AltOp << "\n";
2960 else
2961 dbgs() << "NULL\n";
2962 dbgs() << "VectorizedValue: ";
2963 if (VectorizedValue)
2964 dbgs() << *VectorizedValue << "\n";
2965 else
2966 dbgs() << "NULL\n";
2967 dbgs() << "ReuseShuffleIndices: ";
2968 if (ReuseShuffleIndices.empty())
2969 dbgs() << "Empty";
2970 else
2971 for (int ReuseIdx : ReuseShuffleIndices)
2972 dbgs() << ReuseIdx << ", ";
2973 dbgs() << "\n";
2974 dbgs() << "ReorderIndices: ";
2975 for (unsigned ReorderIdx : ReorderIndices)
2976 dbgs() << ReorderIdx << ", ";
2977 dbgs() << "\n";
2978 dbgs() << "UserTreeIndices: ";
2979 for (const auto &EInfo : UserTreeIndices)
2980 dbgs() << EInfo << ", ";
2981 dbgs() << "\n";
2982 }
2983#endif
2984 };
2985
2986#ifndef NDEBUG
2987 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
2988 InstructionCost VecCost, InstructionCost ScalarCost,
2989 StringRef Banner) const {
2990 dbgs() << "SLP: " << Banner << ":\n";
2991 E->dump();
2992 dbgs() << "SLP: Costs:\n";
2993 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
2994 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
2995 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
2996 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
2997 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
2998 }
2999#endif
3000
3001 /// Create a new VectorizableTree entry.
3002 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3003 std::optional<ScheduleData *> Bundle,
3004 const InstructionsState &S,
3005 const EdgeInfo &UserTreeIdx,
3006 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3007 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3008 TreeEntry::EntryState EntryState =
3009 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3010 return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3011 ReuseShuffleIndices, ReorderIndices);
3012 }
3013
3014 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3015 TreeEntry::EntryState EntryState,
3016 std::optional<ScheduleData *> Bundle,
3017 const InstructionsState &S,
3018 const EdgeInfo &UserTreeIdx,
3019 ArrayRef<int> ReuseShuffleIndices = std::nullopt,
3020 ArrayRef<unsigned> ReorderIndices = std::nullopt) {
3021 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3022 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3023 "Need to vectorize gather entry?");
3024 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3025 TreeEntry *Last = VectorizableTree.back().get();
3026 Last->Idx = VectorizableTree.size() - 1;
3027 Last->State = EntryState;
3028 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3029 ReuseShuffleIndices.end());
3030 if (ReorderIndices.empty()) {
3031 Last->Scalars.assign(VL.begin(), VL.end());
3032 Last->setOperations(S);
3033 } else {
3034 // Reorder scalars and build final mask.
3035 Last->Scalars.assign(VL.size(), nullptr);
3036 transform(ReorderIndices, Last->Scalars.begin(),
3037 [VL](unsigned Idx) -> Value * {
3038 if (Idx >= VL.size())
3039 return UndefValue::get(VL.front()->getType());
3040 return VL[Idx];
3041 });
3042 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3043 Last->setOperations(S);
3044 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3045 }
3046 if (Last->State != TreeEntry::NeedToGather) {
3047 for (Value *V : VL) {
3048 const TreeEntry *TE = getTreeEntry(V);
3049 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
3050 "Scalar already in tree!");
3051 if (TE) {
3052 if (TE != Last)
3053 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3054 continue;
3055 }
3056 ScalarToTreeEntry[V] = Last;
3057 }
3058 // Update the scheduler bundle to point to this TreeEntry.
3059 ScheduleData *BundleMember = *Bundle;
3060 assert((BundleMember || isa<PHINode>(S.MainOp) ||
3061 isVectorLikeInstWithConstOps(S.MainOp) ||
3062 doesNotNeedToSchedule(VL)) &&
3063 "Bundle and VL out of sync");
3064 if (BundleMember) {
3065 for (Value *V : VL) {
 3066 if (doesNotNeedToBeScheduled(V))
 3067 continue;
3068 if (!BundleMember)
3069 continue;
3070 BundleMember->TE = Last;
3071 BundleMember = BundleMember->NextInBundle;
3072 }
3073 }
3074 assert(!BundleMember && "Bundle and VL out of sync");
3075 } else {
3076 // Build a map for gathered scalars to the nodes where they are used.
3077 bool AllConstsOrCasts = true;
3078 for (Value *V : VL)
3079 if (!isConstant(V)) {
3080 auto *I = dyn_cast<CastInst>(V);
3081 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3082 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3083 }
3084 if (AllConstsOrCasts)
3085 CastMaxMinBWSizes =
3086 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3087 MustGather.insert(VL.begin(), VL.end());
3088 }
3089
3090 if (UserTreeIdx.UserTE) {
3091 Last->UserTreeIndices.push_back(UserTreeIdx);
3092 assert((!Last->isNonPowOf2Vec() || Last->ReorderIndices.empty()) &&
3093 "Reordering isn't implemented for non-power-of-2 nodes yet");
3094 }
3095 return Last;
3096 }
3097
3098 /// -- Vectorization State --
3099 /// Holds all of the tree entries.
3100 TreeEntry::VecTreeTy VectorizableTree;
3101
3102#ifndef NDEBUG
3103 /// Debug printer.
3104 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3105 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3106 VectorizableTree[Id]->dump();
3107 dbgs() << "\n";
3108 }
3109 }
3110#endif
3111
3112 TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
3113
3114 const TreeEntry *getTreeEntry(Value *V) const {
3115 return ScalarToTreeEntry.lookup(V);
3116 }
3117
 3118 /// Checks that the operand node of an alternate node does not generate a
 3119 /// buildvector sequence. If it does, it is probably not worth building an
 3120 /// alternate shuffle when the number of buildvector operands plus the
 3121 /// alternate instruction exceeds the number of buildvector instructions.
3122 /// \param S the instructions state of the analyzed values.
3123 /// \param VL list of the instructions with alternate opcodes.
3124 bool areAltOperandsProfitable(const InstructionsState &S,
3125 ArrayRef<Value *> VL) const;
3126
3127 /// Checks if the specified list of the instructions/values can be vectorized
3128 /// and fills required data before actual scheduling of the instructions.
3129 TreeEntry::EntryState getScalarsVectorizationState(
3130 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
3131 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const;
3132
3133 /// Maps a specific scalar to its tree entry.
3134 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3135
 3136 /// Maps scalars that are used in several vectorized nodes to the list of
 3137 /// those nodes.
 3138 SmallDenseMap<Value *, SmallVector<TreeEntry *>> MultiNodeScalars;
 3139
3140 /// Maps a value to the proposed vectorizable size.
3141 SmallDenseMap<Value *, unsigned> InstrElementSize;
3142
3143 /// A list of scalars that we found that we need to keep as scalars.
3144 ValueSet MustGather;
3145
3146 /// A set of first non-schedulable values.
3147 ValueSet NonScheduledFirst;
3148
 3149 /// A map between the vectorized entries and the last instructions in the
 3150 /// bundles. The bundles are built in use order, not in the def order of the
 3151 /// instructions, so we cannot rely directly on the last instruction in the
 3152 /// bundle being the last instruction in program order during the
 3153 /// vectorization process, since the basic blocks are modified; these
 3154 /// instructions need to be pre-gathered beforehand.
3155 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3156
 3157 /// List of gather nodes that depend on other gather/vector nodes and should
 3158 /// be emitted after the vector instruction emission process to correctly
 3159 /// handle the order of the vector instructions and shuffles.
3160 SetVector<const TreeEntry *> PostponedGathers;
3161
3162 using ValueToGatherNodesMap =
 3163 DenseMap<Value *, SmallPtrSet<const TreeEntry *, 4>>;
 3164 ValueToGatherNodesMap ValueToGatherNodes;
3165
3166 /// This POD struct describes one external user in the vectorized tree.
3167 struct ExternalUser {
3168 ExternalUser(Value *S, llvm::User *U, int L)
3169 : Scalar(S), User(U), Lane(L) {}
3170
3171 // Which scalar in our function.
3172 Value *Scalar;
3173
3174 // Which user that uses the scalar.
 3175 llvm::User *User;
 3176
3177 // Which lane does the scalar belong to.
3178 int Lane;
3179 };
3180 using UserList = SmallVector<ExternalUser, 16>;
3181
3182 /// Checks if two instructions may access the same memory.
3183 ///
3184 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3185 /// is invariant in the calling loop.
3186 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3187 Instruction *Inst2) {
3188 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3189 return true;
3190 // First check if the result is already in the cache.
3191 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3192 auto It = AliasCache.find(Key);
3193 if (It != AliasCache.end())
3194 return It->second;
3195 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3196 // Store the result in the cache.
3197 AliasCache.try_emplace(Key, Aliased);
3198 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3199 return Aliased;
3200 }
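 // A sketch of how the cache above behaves (Store1/Load2 are hypothetical
 // instructions, not taken from this file):
 //   bool A = isAliased(MemoryLocation::get(Store1), Store1, Load2);
 //   // The reversed query is answered from AliasCache without consulting
 //   // BatchAA again, because both key orders were populated above:
 //   bool B = isAliased(MemoryLocation::get(Load2), Load2, Store1);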
3201
3202 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3203
3204 /// Cache for alias results.
3205 /// TODO: consider moving this to the AliasAnalysis itself.
 3206 DenseMap<AliasCacheKey, bool> AliasCache;
 3207
3208 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3209 // globally through SLP because we don't perform any action which
3210 // invalidates capture results.
3211 BatchAAResults BatchAA;
3212
3213 /// Temporary store for deleted instructions. Instructions will be deleted
3214 /// eventually when the BoUpSLP is destructed. The deferral is required to
3215 /// ensure that there are no incorrect collisions in the AliasCache, which
3216 /// can happen if a new instruction is allocated at the same address as a
3217 /// previously deleted instruction.
3218 DenseSet<Instruction *> DeletedInstructions;
3219
 3220 /// Set of the instructions already analyzed as roots of reductions.
3221 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3222
3223 /// Set of hashes for the list of reduction values already being analyzed.
3224 DenseSet<size_t> AnalyzedReductionVals;
3225
 3226 /// Values already analyzed for minimal bitwidth and found to be
 3227 /// non-profitable.
3228 DenseSet<Value *> AnalyzedMinBWVals;
3229
3230 /// A list of values that need to extracted out of the tree.
3231 /// This list holds pairs of (Internal Scalar : External User). External User
3232 /// can be nullptr, it means that this Internal Scalar will be used later,
3233 /// after vectorization.
3234 UserList ExternalUses;
3235
 3236 /// A list of GEPs which can be replaced by scalar GEPs instead of
3237 /// extractelement instructions.
3238 SmallPtrSet<Value *, 4> ExternalUsesAsGEPs;
3239
3240 /// Values used only by @llvm.assume calls.
 3241 SmallPtrSet<const Value *, 32> EphValues;
 3242
3243 /// Holds all of the instructions that we gathered, shuffle instructions and
3244 /// extractelements.
3245 SetVector<Instruction *> GatherShuffleExtractSeq;
3246
3247 /// A list of blocks that we are going to CSE.
3248 DenseSet<BasicBlock *> CSEBlocks;
3249
3250 /// Contains all scheduling relevant data for an instruction.
3251 /// A ScheduleData either represents a single instruction or a member of an
3252 /// instruction bundle (= a group of instructions which is combined into a
3253 /// vector instruction).
3254 struct ScheduleData {
3255 // The initial value for the dependency counters. It means that the
3256 // dependencies are not calculated yet.
3257 enum { InvalidDeps = -1 };
3258
3259 ScheduleData() = default;
3260
3261 void init(int BlockSchedulingRegionID, Value *OpVal) {
3262 FirstInBundle = this;
3263 NextInBundle = nullptr;
3264 NextLoadStore = nullptr;
3265 IsScheduled = false;
3266 SchedulingRegionID = BlockSchedulingRegionID;
3267 clearDependencies();
3268 OpValue = OpVal;
3269 TE = nullptr;
3270 }
3271
3272 /// Verify basic self consistency properties
3273 void verify() {
3274 if (hasValidDependencies()) {
3275 assert(UnscheduledDeps <= Dependencies && "invariant");
3276 } else {
3277 assert(UnscheduledDeps == Dependencies && "invariant");
3278 }
3279
3280 if (IsScheduled) {
3281 assert(isSchedulingEntity() &&
3282 "unexpected scheduled state");
3283 for (const ScheduleData *BundleMember = this; BundleMember;
3284 BundleMember = BundleMember->NextInBundle) {
3285 assert(BundleMember->hasValidDependencies() &&
3286 BundleMember->UnscheduledDeps == 0 &&
3287 "unexpected scheduled state");
3288 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3289 "only bundle is marked scheduled");
3290 }
3291 }
3292
3293 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3294 "all bundle members must be in same basic block");
3295 }
3296
3297 /// Returns true if the dependency information has been calculated.
 3298 /// Note that dependency validity can vary between instructions within
3299 /// a single bundle.
3300 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3301
3302 /// Returns true for single instructions and for bundle representatives
3303 /// (= the head of a bundle).
3304 bool isSchedulingEntity() const { return FirstInBundle == this; }
3305
3306 /// Returns true if it represents an instruction bundle and not only a
3307 /// single instruction.
3308 bool isPartOfBundle() const {
3309 return NextInBundle != nullptr || FirstInBundle != this || TE;
3310 }
3311
3312 /// Returns true if it is ready for scheduling, i.e. it has no more
 3313 /// unscheduled dependent instructions/bundles.
3314 bool isReady() const {
3315 assert(isSchedulingEntity() &&
3316 "can't consider non-scheduling entity for ready list");
3317 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3318 }
3319
3320 /// Modifies the number of unscheduled dependencies for this instruction,
3321 /// and returns the number of remaining dependencies for the containing
3322 /// bundle.
3323 int incrementUnscheduledDeps(int Incr) {
3324 assert(hasValidDependencies() &&
3325 "increment of unscheduled deps would be meaningless");
3326 UnscheduledDeps += Incr;
3327 return FirstInBundle->unscheduledDepsInBundle();
3328 }
3329
3330 /// Sets the number of unscheduled dependencies to the number of
3331 /// dependencies.
3332 void resetUnscheduledDeps() {
3333 UnscheduledDeps = Dependencies;
3334 }
3335
3336 /// Clears all dependency information.
3337 void clearDependencies() {
3338 Dependencies = InvalidDeps;
3339 resetUnscheduledDeps();
3340 MemoryDependencies.clear();
3341 ControlDependencies.clear();
3342 }
3343
3344 int unscheduledDepsInBundle() const {
3345 assert(isSchedulingEntity() && "only meaningful on the bundle");
3346 int Sum = 0;
3347 for (const ScheduleData *BundleMember = this; BundleMember;
3348 BundleMember = BundleMember->NextInBundle) {
3349 if (BundleMember->UnscheduledDeps == InvalidDeps)
3350 return InvalidDeps;
3351 Sum += BundleMember->UnscheduledDeps;
3352 }
3353 return Sum;
3354 }
3355
3356 void dump(raw_ostream &os) const {
3357 if (!isSchedulingEntity()) {
3358 os << "/ " << *Inst;
3359 } else if (NextInBundle) {
3360 os << '[' << *Inst;
3361 ScheduleData *SD = NextInBundle;
3362 while (SD) {
3363 os << ';' << *SD->Inst;
3364 SD = SD->NextInBundle;
3365 }
3366 os << ']';
3367 } else {
3368 os << *Inst;
3369 }
3370 }
3371
3372 Instruction *Inst = nullptr;
3373
3374 /// Opcode of the current instruction in the schedule data.
3375 Value *OpValue = nullptr;
3376
3377 /// The TreeEntry that this instruction corresponds to.
3378 TreeEntry *TE = nullptr;
3379
3380 /// Points to the head in an instruction bundle (and always to this for
3381 /// single instructions).
3382 ScheduleData *FirstInBundle = nullptr;
3383
 3384 /// Singly linked list of all instructions in a bundle. Null if it is a
3385 /// single instruction.
3386 ScheduleData *NextInBundle = nullptr;
3387
 3388 /// Singly linked list of all memory instructions (e.g. load, store, call)
3389 /// in the block - until the end of the scheduling region.
3390 ScheduleData *NextLoadStore = nullptr;
3391
3392 /// The dependent memory instructions.
3393 /// This list is derived on demand in calculateDependencies().
3394 SmallVector<ScheduleData *, 4> MemoryDependencies;
3395
3396 /// List of instructions which this instruction could be control dependent
3397 /// on. Allowing such nodes to be scheduled below this one could introduce
3398 /// a runtime fault which didn't exist in the original program.
3399 /// ex: this is a load or udiv following a readonly call which inf loops
3400 SmallVector<ScheduleData *, 4> ControlDependencies;
3401
3402 /// This ScheduleData is in the current scheduling region if this matches
3403 /// the current SchedulingRegionID of BlockScheduling.
3404 int SchedulingRegionID = 0;
3405
3406 /// Used for getting a "good" final ordering of instructions.
3407 int SchedulingPriority = 0;
3408
 3409 /// The number of dependencies. Consists of the number of users of the
 3410 /// instruction plus the number of dependent memory instructions (if any).
3411 /// This value is calculated on demand.
3412 /// If InvalidDeps, the number of dependencies is not calculated yet.
3413 int Dependencies = InvalidDeps;
3414
3415 /// The number of dependencies minus the number of dependencies of scheduled
3416 /// instructions. As soon as this is zero, the instruction/bundle gets ready
3417 /// for scheduling.
3418 /// Note that this is negative as long as Dependencies is not calculated.
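 /// For example (illustrative numbers): with Dependencies == 3 and two of
 /// those dependencies already scheduled, UnscheduledDeps == 1; once the
 /// remaining dependency is scheduled it drops to 0 and the bundle becomes
 /// ready.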
3419 int UnscheduledDeps = InvalidDeps;
3420
3421 /// True if this instruction is scheduled (or considered as scheduled in the
3422 /// dry-run).
3423 bool IsScheduled = false;
3424 };
3425
3426#ifndef NDEBUG
 3427 friend inline raw_ostream &operator<<(raw_ostream &os,
 3428 const BoUpSLP::ScheduleData &SD) {
3429 SD.dump(os);
3430 return os;
3431 }
3432#endif
3433
3434 friend struct GraphTraits<BoUpSLP *>;
3435 friend struct DOTGraphTraits<BoUpSLP *>;
3436
3437 /// Contains all scheduling data for a basic block.
 3438 /// It does not schedule instructions which are not memory read/write
 3439 /// instructions and whose operands are all constants, arguments, phis, or
 3440 /// instructions from other blocks, or whose users are phis or belong to
 3441 /// other blocks. The resulting vector instructions can be placed at the
 3442 /// beginning of the basic block without scheduling (if the operands do not
 3443 /// need to be scheduled) or at the end of the block (if the users are
 3444 /// outside of the block). This saves some compile time and memory used by
 3445 /// the compiler.
 3446 /// ScheduleData is assigned to each instruction between the boundaries of
 3447 /// the tree entry, even to those which are not part of the graph; this is
 3448 /// required to correctly follow the dependencies between the instructions
 3449 /// and schedule them correctly. ScheduleData is not allocated for
 3450 /// instructions which do not require scheduling, like phis, nodes with only
 3451 /// extractelements/insertelements, or nodes whose instructions have
 3452 /// uses/operands outside of the block.
3453 struct BlockScheduling {
3454 BlockScheduling(BasicBlock *BB)
3455 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
3456
3457 void clear() {
3458 ReadyInsts.clear();
3459 ScheduleStart = nullptr;
3460 ScheduleEnd = nullptr;
3461 FirstLoadStoreInRegion = nullptr;
3462 LastLoadStoreInRegion = nullptr;
3463 RegionHasStackSave = false;
3464
3465 // Reduce the maximum schedule region size by the size of the
3466 // previous scheduling run.
3467 ScheduleRegionSizeLimit -= ScheduleRegionSize;
3468 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
3469 ScheduleRegionSizeLimit = MinScheduleRegionSize;
3470 ScheduleRegionSize = 0;
3471
3472 // Make a new scheduling region, i.e. all existing ScheduleData is not
3473 // in the new region yet.
3474 ++SchedulingRegionID;
3475 }
3476
3477 ScheduleData *getScheduleData(Instruction *I) {
3478 if (BB != I->getParent())
3479 // Avoid lookup if can't possibly be in map.
3480 return nullptr;
3481 ScheduleData *SD = ScheduleDataMap.lookup(I);
3482 if (SD && isInSchedulingRegion(SD))
3483 return SD;
3484 return nullptr;
3485 }
3486
3487 ScheduleData *getScheduleData(Value *V) {
3488 if (auto *I = dyn_cast<Instruction>(V))
3489 return getScheduleData(I);
3490 return nullptr;
3491 }
3492
3493 ScheduleData *getScheduleData(Value *V, Value *Key) {
3494 if (V == Key)
3495 return getScheduleData(V);
3496 auto I = ExtraScheduleDataMap.find(V);
3497 if (I != ExtraScheduleDataMap.end()) {
3498 ScheduleData *SD = I->second.lookup(Key);
3499 if (SD && isInSchedulingRegion(SD))
3500 return SD;
3501 }
3502 return nullptr;
3503 }
3504
3505 bool isInSchedulingRegion(ScheduleData *SD) const {
3506 return SD->SchedulingRegionID == SchedulingRegionID;
3507 }
3508
3509 /// Marks an instruction as scheduled and puts all dependent ready
3510 /// instructions into the ready-list.
3511 template <typename ReadyListType>
3512 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
3513 SD->IsScheduled = true;
3514 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
3515
3516 for (ScheduleData *BundleMember = SD; BundleMember;
3517 BundleMember = BundleMember->NextInBundle) {
3518 if (BundleMember->Inst != BundleMember->OpValue)
3519 continue;
3520
3521 // Handle the def-use chain dependencies.
3522
3523 // Decrement the unscheduled counter and insert to ready list if ready.
3524 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
3525 doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
3526 if (OpDef && OpDef->hasValidDependencies() &&
3527 OpDef->incrementUnscheduledDeps(-1) == 0) {
3528 // There are no more unscheduled dependencies after
3529 // decrementing, so we can put the dependent instruction
3530 // into the ready list.
3531 ScheduleData *DepBundle = OpDef->FirstInBundle;
3532 assert(!DepBundle->IsScheduled &&
3533 "already scheduled bundle gets ready");
3534 ReadyList.insert(DepBundle);
3535 LLVM_DEBUG(dbgs()
3536 << "SLP: gets ready (def): " << *DepBundle << "\n");
3537 }
3538 });
3539 };
3540
3541 // If BundleMember is a vector bundle, its operands may have been
3542 // reordered during buildTree(). We therefore need to get its operands
3543 // through the TreeEntry.
3544 if (TreeEntry *TE = BundleMember->TE) {
3545 // Need to search for the lane since the tree entry can be reordered.
3546 int Lane = std::distance(TE->Scalars.begin(),
3547 find(TE->Scalars, BundleMember->Inst));
3548 assert(Lane >= 0 && "Lane not set");
3549
3550 // Since vectorization tree is being built recursively this assertion
3551 // ensures that the tree entry has all operands set before reaching
3552 // this code. Couple of exceptions known at the moment are extracts
3553 // where their second (immediate) operand is not added. Since
3554 // immediates do not affect scheduler behavior this is considered
3555 // okay.
3556 auto *In = BundleMember->Inst;
3557 assert(
3558 In &&
3559 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
3560 In->getNumOperands() == TE->getNumOperands()) &&
3561 "Missed TreeEntry operands?");
3562 (void)In; // fake use to avoid build failure when assertions disabled
3563
3564 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
3565 OpIdx != NumOperands; ++OpIdx)
3566 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
3567 DecrUnsched(I);
3568 } else {
3569 // If BundleMember is a stand-alone instruction, no operand reordering
3570 // has taken place, so we directly access its operands.
3571 for (Use &U : BundleMember->Inst->operands())
3572 if (auto *I = dyn_cast<Instruction>(U.get()))
3573 DecrUnsched(I);
3574 }
3575 // Handle the memory dependencies.
3576 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
3577 if (MemoryDepSD->hasValidDependencies() &&
3578 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
3579 // There are no more unscheduled dependencies after decrementing,
3580 // so we can put the dependent instruction into the ready list.
3581 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
3582 assert(!DepBundle->IsScheduled &&
3583 "already scheduled bundle gets ready");
3584 ReadyList.insert(DepBundle);
3586 << "SLP: gets ready (mem): " << *DepBundle << "\n");
3587 }
3588 }
3589 // Handle the control dependencies.
3590 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
3591 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
3592 // There are no more unscheduled dependencies after decrementing,
3593 // so we can put the dependent instruction into the ready list.
3594 ScheduleData *DepBundle = DepSD->FirstInBundle;
3595 assert(!DepBundle->IsScheduled &&
3596 "already scheduled bundle gets ready");
3597 ReadyList.insert(DepBundle);
3599 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
3600 }
3601 }
3602 }
3603 }
3604
3605 /// Verify basic self consistency properties of the data structure.
3606 void verify() {
3607 if (!ScheduleStart)
3608 return;
3609
3610 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
3611 ScheduleStart->comesBefore(ScheduleEnd) &&
3612 "Not a valid scheduling region?");
3613
3614 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3615 auto *SD = getScheduleData(I);
3616 if (!SD)
3617 continue;
3618 assert(isInSchedulingRegion(SD) &&
3619 "primary schedule data not in window?");
3620 assert(isInSchedulingRegion(SD->FirstInBundle) &&
3621 "entire bundle in window!");
3622 (void)SD;
3623 doForAllOpcodes(I, [](ScheduleData *SD) { SD->verify(); });
3624 }
3625
3626 for (auto *SD : ReadyInsts) {
3627 assert(SD->isSchedulingEntity() && SD->isReady() &&
3628 "item in ready list not ready?");
3629 (void)SD;
3630 }
3631 }
3632
3633 void doForAllOpcodes(Value *V,
3634 function_ref<void(ScheduleData *SD)> Action) {
3635 if (ScheduleData *SD = getScheduleData(V))
3636 Action(SD);
3637 auto I = ExtraScheduleDataMap.find(V);
3638 if (I != ExtraScheduleDataMap.end())
3639 for (auto &P : I->second)
3640 if (isInSchedulingRegion(P.second))
3641 Action(P.second);
3642 }
3643
3644 /// Put all instructions into the ReadyList which are ready for scheduling.
3645 template <typename ReadyListType>
3646 void initialFillReadyList(ReadyListType &ReadyList) {
3647 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
3648 doForAllOpcodes(I, [&](ScheduleData *SD) {
3649 if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
3650 SD->isReady()) {
3651 ReadyList.insert(SD);
3652 LLVM_DEBUG(dbgs()
3653 << "SLP: initially in ready list: " << *SD << "\n");
3654 }
3655 });
3656 }
3657 }
3658
3659 /// Build a bundle from the ScheduleData nodes corresponding to the
3660 /// scalar instruction for each lane.
3661 ScheduleData *buildBundle(ArrayRef<Value *> VL);
3662
3663 /// Checks if a bundle of instructions can be scheduled, i.e. has no
3664 /// cyclic dependencies. This is only a dry-run, no instructions are
3665 /// actually moved at this stage.
3666 /// \returns the scheduling bundle. The returned Optional value is not
3667 /// std::nullopt if \p VL is allowed to be scheduled.
3668 std::optional<ScheduleData *>
3669 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
3670 const InstructionsState &S);
3671
3672 /// Un-bundles a group of instructions.
3673 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
3674
3675 /// Allocates schedule data chunk.
3676 ScheduleData *allocateScheduleDataChunks();
3677
3678 /// Extends the scheduling region so that V is inside the region.
3679 /// \returns true if the region size is within the limit.
3680 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
3681
3682 /// Initialize the ScheduleData structures for new instructions in the
3683 /// scheduling region.
3684 void initScheduleData(Instruction *FromI, Instruction *ToI,
3685 ScheduleData *PrevLoadStore,
3686 ScheduleData *NextLoadStore);
3687
3688 /// Updates the dependency information of a bundle and of all instructions/
3689 /// bundles which depend on the original bundle.
3690 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
3691 BoUpSLP *SLP);
3692
 3693 /// Sets all instructions in the scheduling region to un-scheduled.
3694 void resetSchedule();
3695
3696 BasicBlock *BB;
3697
3698 /// Simple memory allocation for ScheduleData.
 3699 std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
 3700
3701 /// The size of a ScheduleData array in ScheduleDataChunks.
3702 int ChunkSize;
3703
3704 /// The allocator position in the current chunk, which is the last entry
3705 /// of ScheduleDataChunks.
3706 int ChunkPos;
3707
3708 /// Attaches ScheduleData to Instruction.
3709 /// Note that the mapping survives during all vectorization iterations, i.e.
3710 /// ScheduleData structures are recycled.
 3711 DenseMap<Instruction *, ScheduleData *> ScheduleDataMap;
 3712
3713 /// Attaches ScheduleData to Instruction with the leading key.
 3714 DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
 3715 ExtraScheduleDataMap;
3716
3717 /// The ready-list for scheduling (only used for the dry-run).
3718 SetVector<ScheduleData *> ReadyInsts;
3719
3720 /// The first instruction of the scheduling region.
3721 Instruction *ScheduleStart = nullptr;
3722
3723 /// The first instruction _after_ the scheduling region.
3724 Instruction *ScheduleEnd = nullptr;
3725
3726 /// The first memory accessing instruction in the scheduling region
3727 /// (can be null).
3728 ScheduleData *FirstLoadStoreInRegion = nullptr;
3729
3730 /// The last memory accessing instruction in the scheduling region
3731 /// (can be null).
3732 ScheduleData *LastLoadStoreInRegion = nullptr;
3733
3734 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
3735 /// region? Used to optimize the dependence calculation for the
3736 /// common case where there isn't.
3737 bool RegionHasStackSave = false;
3738
3739 /// The current size of the scheduling region.
3740 int ScheduleRegionSize = 0;
3741
3742 /// The maximum size allowed for the scheduling region.
3743 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
3744
3745 /// The ID of the scheduling region. For a new vectorization iteration this
3746 /// is incremented which "removes" all ScheduleData from the region.
3747 /// Make sure that the initial SchedulingRegionID is greater than the
3748 /// initial SchedulingRegionID in ScheduleData (which is 0).
3749 int SchedulingRegionID = 1;
3750 };
3751
3752 /// Attaches the BlockScheduling structures to basic blocks.
 3753 MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
 3754
3755 /// Performs the "real" scheduling. Done before vectorization is actually
3756 /// performed in a basic block.
3757 void scheduleBlock(BlockScheduling *BS);
3758
3759 /// List of users to ignore during scheduling and that don't need extracting.
3760 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
3761
3762 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
3763 /// sorted SmallVectors of unsigned.
3764 struct OrdersTypeDenseMapInfo {
3765 static OrdersType getEmptyKey() {
3766 OrdersType V;
3767 V.push_back(~1U);
3768 return V;
3769 }
3770
3771 static OrdersType getTombstoneKey() {
3772 OrdersType V;
3773 V.push_back(~2U);
3774 return V;
3775 }
3776
3777 static unsigned getHashValue(const OrdersType &V) {
3778 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
3779 }
3780
3781 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
3782 return LHS == RHS;
3783 }
3784 };
3785
3786 // Analysis and block reference.
3787 Function *F;
3788 ScalarEvolution *SE;
 3789 TargetTransformInfo *TTI;
 3790 TargetLibraryInfo *TLI;
3791 LoopInfo *LI;
3792 DominatorTree *DT;
3793 AssumptionCache *AC;
3794 DemandedBits *DB;
3795 const DataLayout *DL;
 3796 OptimizationRemarkEmitter *ORE;
 3797
3798 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
3799 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
3800
3801 /// Instruction builder to construct the vectorized tree.
 3802 IRBuilder<TargetFolder> Builder;
 3803
3804 /// A map of scalar integer values to the smallest bit width with which they
3805 /// can legally be represented. The values map to (width, signed) pairs,
3806 /// where "width" indicates the minimum bit width and "signed" is True if the
3807 /// value must be signed-extended, rather than zero-extended, back to its
3808 /// original width.
 3809 DenseMap<const TreeEntry *, std::pair<uint64_t, bool>> MinBWs;
 3810
3811 /// Final size of the reduced vector, if the current graph represents the
3812 /// input for the reduction and it was possible to narrow the size of the
3813 /// reduction.
3814 unsigned ReductionBitWidth = 0;
3815
3816 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
3817 /// type sizes, used in the tree.
3818 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
3819
 3820 /// Indices of the vectorized nodes, which are supposed to be the roots of
 3821 /// the new bitwidth analysis attempt, like trunc, IToFP or ICmp.
3822 DenseSet<unsigned> ExtraBitWidthNodes;
3823};
3824
3825} // end namespace slpvectorizer
3826
3827template <> struct GraphTraits<BoUpSLP *> {
3828 using TreeEntry = BoUpSLP::TreeEntry;
3829
3830 /// NodeRef has to be a pointer per the GraphWriter.
 3831 using NodeRef = TreeEntry *;
 3832
 3833 using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
 3834
3835 /// Add the VectorizableTree to the index iterator to be able to return
3836 /// TreeEntry pointers.
3837 struct ChildIteratorType
3838 : public iterator_adaptor_base<
3839 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
 3840 ContainerTy &VectorizableTree;
 3841
 3842 ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
 3843 ContainerTy &VT)
3844 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
3845
3846 NodeRef operator*() { return I->UserTE; }
3847 };
3848
 3849 static NodeRef getEntryNode(BoUpSLP &R) {
 3850 return R.VectorizableTree[0].get();
3851 }
3852
3853 static ChildIteratorType child_begin(NodeRef N) {
3854 return {N->UserTreeIndices.begin(), N->Container};
3855 }
3856
3857 static ChildIteratorType child_end(NodeRef N) {
3858 return {N->UserTreeIndices.end(), N->Container};
3859 }
3860
3861 /// For the node iterator we just need to turn the TreeEntry iterator into a
3862 /// TreeEntry* iterator so that it dereferences to NodeRef.
3863 class nodes_iterator {
 3864 using ItTy = ContainerTy::iterator;
 3865 ItTy It;
3866
3867 public:
3868 nodes_iterator(const ItTy &It2) : It(It2) {}
3869 NodeRef operator*() { return It->get(); }
3870 nodes_iterator operator++() {
3871 ++It;
3872 return *this;
3873 }
3874 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
3875 };
3876
3877 static nodes_iterator nodes_begin(BoUpSLP *R) {
3878 return nodes_iterator(R->VectorizableTree.begin());
3879 }
3880
3881 static nodes_iterator nodes_end(BoUpSLP *R) {
3882 return nodes_iterator(R->VectorizableTree.end());
3883 }
3884
3885 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
3886};
3887
3888template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
3889 using TreeEntry = BoUpSLP::TreeEntry;
3890
3891 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
3892
3893 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
3894 std::string Str;
 3895 raw_string_ostream OS(Str);
 3896 OS << Entry->Idx << ".\n";
3897 if (isSplat(Entry->Scalars))
3898 OS << "<splat> ";
3899 for (auto *V : Entry->Scalars) {
3900 OS << *V;
3901 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
3902 return EU.Scalar == V;
3903 }))
3904 OS << " <extract>";
3905 OS << "\n";
3906 }
3907 return Str;
3908 }
3909
3910 static std::string getNodeAttributes(const TreeEntry *Entry,
3911 const BoUpSLP *) {
3912 if (Entry->State == TreeEntry::NeedToGather)
3913 return "color=red";
3914 if (Entry->State == TreeEntry::ScatterVectorize ||
3915 Entry->State == TreeEntry::StridedVectorize)
3916 return "color=blue";
3917 return "";
3918 }
3919};
3920
3921} // end namespace llvm
3922
 3923BoUpSLP::~BoUpSLP() {
 3924 SmallVector<WeakTrackingVH> DeadInsts;
 3925 for (auto *I : DeletedInstructions) {
3926 for (Use &U : I->operands()) {
3927 auto *Op = dyn_cast<Instruction>(U.get());
3928 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
 3929 wouldInstructionBeTriviallyDead(Op, TLI))
 3930 DeadInsts.emplace_back(Op);
3931 }
3932 I->dropAllReferences();
3933 }
3934 for (auto *I : DeletedInstructions) {
3935 assert(I->use_empty() &&
3936 "trying to erase instruction with users.");
3937 I->eraseFromParent();
3938 }
3939
3940 // Cleanup any dead scalar code feeding the vectorized instructions
 3941 RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, TLI);
 3942
3943#ifdef EXPENSIVE_CHECKS
3944 // If we could guarantee that this call is not extremely slow, we could
3945 // remove the ifdef limitation (see PR47712).
3946 assert(!verifyFunction(*F, &dbgs()));
3947#endif
3948}
3949
3950/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
3951/// contains the original mask for the scalars reused in the node. The
3952/// procedure transforms this mask in accordance with the given \p Mask.
3953static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
 3954 assert(!Mask.empty() && Reuses.size() == Mask.size() &&
3955 "Expected non-empty mask.");
3956 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
3957 Prev.swap(Reuses);
3958 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
3959 if (Mask[I] != PoisonMaskElem)
3960 Reuses[Mask[I]] = Prev[I];
3961}
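// A small worked example of the remapping above (illustrative values only):
//   Reuses = {2, 3, 0, 1}, Mask = {1, 0, 3, 2}
//   => Prev = {2, 3, 0, 1}; the assignment Reuses[Mask[I]] = Prev[I] gives
//      Reuses[1] = 2, Reuses[0] = 3, Reuses[3] = 0, Reuses[2] = 1,
//      i.e. Reuses becomes {3, 2, 1, 0}.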
3962
3963/// Reorders the given \p Order according to the given \p Mask. \p Order is
3964/// the original order of the scalars. The procedure transforms the provided
3965/// order in accordance with the given \p Mask. If the resulting \p Order is
3966/// just an identity order, \p Order is cleared.
3967static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
 3968 bool BottomOrder = false) {
3969 assert(!Mask.empty() && "Expected non-empty mask.");
3970 unsigned Sz = Mask.size();
3971 if (BottomOrder) {
3972 SmallVector<unsigned> PrevOrder;
3973 if (Order.empty()) {
3974 PrevOrder.resize(Sz);
3975 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
3976 } else {
3977 PrevOrder.swap(Order);
3978 }
3979 Order.assign(Sz, Sz);
3980 for (unsigned I = 0; I < Sz; ++I)
3981 if (Mask[I] != PoisonMaskElem)
3982 Order[I] = PrevOrder[Mask[I]];
3983 if (all_of(enumerate(Order), [&](const auto &Data) {
3984 return Data.value() == Sz || Data.index() == Data.value();
3985 })) {
3986 Order.clear();
3987 return;
3988 }
3989 fixupOrderingIndices(Order);
3990 return;
3991 }
3992 SmallVector<int> MaskOrder;
3993 if (Order.empty()) {
3994 MaskOrder.resize(Sz);
3995 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
3996 } else {
3997 inversePermutation(Order, MaskOrder);
3998 }
3999 reorderReuses(MaskOrder, Mask);
4000 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
4001 Order.clear();
4002 return;
4003 }
4004 Order.assign(Sz, Sz);
4005 for (unsigned I = 0; I < Sz; ++I)
4006 if (MaskOrder[I] != PoisonMaskElem)
4007 Order[MaskOrder[I]] = I;
4008 fixupOrderingIndices(Order);
4009}
4010
4011std::optional<BoUpSLP::OrdersType>
4012BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4013 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
4014 // Try to find subvector extract/insert patterns and reorder only such
4015 // patterns.
4016 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4017 Type *ScalarTy = GatheredScalars.front()->getType();
4018 int NumScalars = GatheredScalars.size();
4019 if (!isValidElementType(ScalarTy))
4020 return std::nullopt;
4021 auto *VecTy = FixedVectorType::get(ScalarTy, NumScalars);
4022 int NumParts = TTI->getNumberOfParts(VecTy);
4023 if (NumParts == 0 || NumParts >= NumScalars)
4024 NumParts = 1;
4025 SmallVector<int> ExtractMask;
4026 SmallVector<int> Mask;
 4027 SmallVector<SmallVector<const TreeEntry *>> Entries;
 4028 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles =
 4029 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
 4030 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles =
 4031 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4032 /*ForOrder=*/true);
4033 // No shuffled operands - ignore.
4034 if (GatherShuffles.empty() && ExtractShuffles.empty())
4035 return std::nullopt;
4036 OrdersType CurrentOrder(NumScalars, NumScalars);
4037 if (GatherShuffles.size() == 1 &&
4038 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4039 Entries.front().front()->isSame(TE.Scalars)) {
4040 // Perfect match in the graph, will reuse the previously vectorized
4041 // node. Cost is 0.
4042 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4043 return CurrentOrder;
4044 }
4045 auto IsSplatMask = [](ArrayRef<int> Mask) {
4046 int SingleElt = PoisonMaskElem;
4047 return all_of(Mask, [&](int I) {
4048 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4049 SingleElt = I;
4050 return I == PoisonMaskElem || I == SingleElt;
4051 });
4052 };
4053 // Exclusive broadcast mask - ignore.
4054 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4055 (Entries.size() != 1 ||
4056 Entries.front().front()->ReorderIndices.empty())) ||
4057 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4058 return std::nullopt;
4059 SmallBitVector ShuffledSubMasks(NumParts);
4060 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4061 ArrayRef<int> Mask, int PartSz, int NumParts,
4062 function_ref<unsigned(unsigned)> GetVF) {
4063 for (int I : seq<int>(0, NumParts)) {
4064 if (ShuffledSubMasks.test(I))
4065 continue;
4066 const int VF = GetVF(I);
4067 if (VF == 0)
4068 continue;
4069 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, PartSz);
4070 // Shuffle of at least 2 vectors - ignore.
4071 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4072 std::fill(Slice.begin(), Slice.end(), NumScalars);
4073 ShuffledSubMasks.set(I);
4074 continue;
4075 }
 4076 // Try to include as many elements from the mask as possible.
4077 int FirstMin = INT_MAX;
4078 int SecondVecFound = false;
4079 for (int K : seq<int>(0, PartSz)) {
4080 int Idx = Mask[I * PartSz + K];
4081 if (Idx == PoisonMaskElem) {
4082 Value *V = GatheredScalars[I * PartSz + K];
4083 if (isConstant(V) && !isa<PoisonValue>(V)) {
4084 SecondVecFound = true;
4085 break;
4086 }
4087 continue;
4088 }
4089 if (Idx < VF) {
4090 if (FirstMin > Idx)
4091 FirstMin = Idx;
4092 } else {
4093 SecondVecFound = true;
4094 break;
4095 }
4096 }
4097 FirstMin = (FirstMin / PartSz) * PartSz;
4098 // Shuffle of at least 2 vectors - ignore.
4099 if (SecondVecFound) {
4100 std::fill(Slice.begin(), Slice.end(), NumScalars);
4101 ShuffledSubMasks.set(I);
4102 continue;
4103 }
4104 for (int K : seq<int>(0, PartSz)) {
4105 int Idx = Mask[I * PartSz + K];
4106 if (Idx == PoisonMaskElem)
4107 continue;
4108 Idx -= FirstMin;
4109 if (Idx >= PartSz) {
4110 SecondVecFound = true;
4111 break;
4112 }
4113 if (CurrentOrder[I * PartSz + Idx] >
4114 static_cast<unsigned>(I * PartSz + K) &&
4115 CurrentOrder[I * PartSz + Idx] !=
4116 static_cast<unsigned>(I * PartSz + Idx))
4117 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4118 }
4119 // Shuffle of at least 2 vectors - ignore.
4120 if (SecondVecFound) {
4121 std::fill(Slice.begin(), Slice.end(), NumScalars);
4122 ShuffledSubMasks.set(I);
4123 continue;
4124 }
4125 }
4126 };
4127 int PartSz = NumScalars / NumParts;
4128 if (!ExtractShuffles.empty())
4129 TransformMaskToOrder(
4130 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4131 if (!ExtractShuffles[I])
4132 return 0U;
4133 unsigned VF = 0;
4134 for (unsigned Idx : seq<unsigned>(0, PartSz)) {
4135 int K = I * PartSz + Idx;
4136 if (ExtractMask[K] == PoisonMaskElem)
4137 continue;
4138 if (!TE.ReuseShuffleIndices.empty())
4139 K = TE.ReuseShuffleIndices[K];
4140 if (!TE.ReorderIndices.empty())
4141 K = std::distance(TE.ReorderIndices.begin(),
4142 find(TE.ReorderIndices, K));
4143 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4144 if (!EI)
4145 continue;
4146 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4147 ->getElementCount()
4148 .getKnownMinValue());
4149 }
4150 return VF;
4151 });
4152 // Check special corner case - single shuffle of the same entry.
4153 if (GatherShuffles.size() == 1 && NumParts != 1) {
4154 if (ShuffledSubMasks.any())
4155 return std::nullopt;
4156 PartSz = NumScalars;
4157 NumParts = 1;
4158 }
4159 if (!Entries.empty())
4160 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4161 if (!GatherShuffles[I])
4162 return 0U;
4163 return std::max(Entries[I].front()->getVectorFactor(),
4164 Entries[I].back()->getVectorFactor());
4165 });
4166 int NumUndefs =
4167 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4168 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4169 return std::nullopt;
4170 return std::move(CurrentOrder);
4171}
4172
4173static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4174 const TargetLibraryInfo &TLI,
4175 bool CompareOpcodes = true) {
4176 if (getUnderlyingObject(Ptr1) != getUnderlyingObject(Ptr2))
4177 return false;
4178 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4179 if (!GEP1)
4180 return false;
4181 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4182 if (!GEP2)
4183 return false;
4184 return GEP1->getNumOperands() == 2 && GEP2->getNumOperands() == 2 &&
4185 ((isConstant(GEP1->getOperand(1)) &&
4186 isConstant(GEP2->getOperand(1))) ||
4187 !CompareOpcodes ||
4188 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)
4189 .getOpcode());
4190}
4191
4192/// Calculates minimal alignment as a common alignment.
4193template <typename T>
4194static Align computeCommonAlignment(ArrayRef<Value *> VL) {
 4195 Align CommonAlignment = cast<T>(VL.front())->getAlign();
4196 for (Value *V : VL.drop_front())
4197 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4198 return CommonAlignment;
4199}
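// Typical usage sketch (the alignments are illustrative, not from this file):
//   // For a bundle of LoadInsts with alignments {16, 8, 4}, the helper
//   // returns Align(4), the weakest alignment that holds for every member.
//   Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);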
4200
4201/// Check if \p Order represents reverse order.
4202static bool isReverseOrder(ArrayRef<unsigned> Order) {
 4203 unsigned Sz = Order.size();
4204 return !Order.empty() && all_of(enumerate(Order), [&](const auto &Pair) {
4205 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4206 });
4207}
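// For example (illustrative orders): {3, 2, 1, 0} is a reverse order for
// Sz == 4, and so is {4, 2, 1, 0}, because entries equal to Sz are treated
// as "don't care" slots by the check above.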
4208
4209/// Checks if the provided list of pointers \p Pointers represents the strided
4210/// pointers for type ElemTy. If they are not, std::nullopt is returned.
4211/// Otherwise, if \p Inst is not specified, an engaged optional value is
4212/// returned to show that the pointers represent strided pointers. If \p Inst
4213/// is specified, the runtime stride is materialized before the given \p Inst.
4214/// \returns std::nullopt if the pointers are not pointers with the runtime
4215/// stride, nullptr or actual stride value, otherwise.
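/// For example (illustrative SCEVs): pointers %p, %p + 2 * %s, %p + 4 * %s
/// with i16 elements have a runtime stride of %s elements (2 * %s bytes
/// between neighbouring pointers); SortedIndices stays empty because the
/// pointers are already in consecutive order.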
4216static std::optional<Value *>
4217calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
 4218 const DataLayout &DL, ScalarEvolution &SE,
4219 SmallVectorImpl<unsigned> &SortedIndices,
4220 Instruction *Inst = nullptr) {
 4221 SmallVector<const SCEV *> SCEVs;
 4222 const SCEV *PtrSCEVLowest = nullptr;
4223 const SCEV *PtrSCEVHighest = nullptr;
4224 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4225 // addresses).
4226 for (Value *Ptr : PointerOps) {
4227 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4228 if (!PtrSCEV)
4229 return std::nullopt;
4230 SCEVs.push_back(PtrSCEV);
4231 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4232 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4233 continue;
4234 }
4235 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4236 if (isa<SCEVCouldNotCompute>(Diff))
4237 return std::nullopt;
4238 if (Diff->isNonConstantNegative()) {
4239 PtrSCEVLowest = PtrSCEV;
4240 continue;
4241 }
4242 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4243 if (isa<SCEVCouldNotCompute>(Diff1))
4244 return std::nullopt;
4245 if (Diff1->isNonConstantNegative()) {
4246 PtrSCEVHighest = PtrSCEV;
4247 continue;
4248 }
4249 }
4250 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4251 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4252 if (isa<SCEVCouldNotCompute>(Dist))
4253 return std::nullopt;
4254 int Size = DL.getTypeStoreSize(ElemTy);
4255 auto TryGetStride = [&](const SCEV *Dist,
4256 const SCEV *Multiplier) -> const SCEV * {
4257 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4258 if (M->getOperand(0) == Multiplier)
4259 return M->getOperand(1);
4260 if (M->getOperand(1) == Multiplier)
4261 return M->getOperand(0);
4262 return nullptr;
4263 }
4264 if (Multiplier == Dist)
4265 return SE.getConstant(Dist->getType(), 1);
4266 return SE.getUDivExactExpr(Dist, Multiplier);
4267 };
4268 // Stride_in_elements = Dist / element_size * (num_elems - 1).
4269 const SCEV *Stride = nullptr;
4270 if (Size != 1 || SCEVs.size() > 2) {
4271 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4272 Stride = TryGetStride(Dist, Sz);
4273 if (!Stride)
4274 return std::nullopt;
4275 }
4276 if (!Stride || isa<SCEVConstant>(Stride))
4277 return std::nullopt;
4278 // Iterate through all pointers and check if all distances are
4279 // unique multiple of Stride.
4280 using DistOrdPair = std::pair<int64_t, int>;
4281 auto Compare = llvm::less_first();
4282 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4283 int Cnt = 0;
4284 bool IsConsecutive = true;
4285 for (const SCEV *PtrSCEV : SCEVs) {
4286 unsigned Dist = 0;
4287 if (PtrSCEV != PtrSCEVLowest) {
4288 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4289 const SCEV *Coeff = TryGetStride(Diff, Stride);
4290 if (!Coeff)
4291 return std::nullopt;
4292 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4293 if (!SC || isa<SCEVCouldNotCompute>(SC))
4294 return std::nullopt;
4295 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4296 SE.getMulExpr(Stride, SC)))
4297 ->isZero())
4298 return std::nullopt;
4299 Dist = SC->getAPInt().getZExtValue();
4300 }
4301 // If the strides are not the same or repeated, we can't vectorize.
4302 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4303 return std::nullopt;
4304 auto Res = Offsets.emplace(Dist, Cnt);
4305 if (!Res.second)
4306 return std::nullopt;
4307 // Consecutive order if the inserted element is the last one.
4308 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4309 ++Cnt;
4310 }
4311 if (Offsets.size() != SCEVs.size())
4312 return std::nullopt;
4313 SortedIndices.clear();
4314 if (!IsConsecutive) {
4315 // Fill SortedIndices array only if it is non-consecutive.
4316 SortedIndices.resize(PointerOps.size());
4317 Cnt = 0;
4318 for (const std::pair<int64_t, int> &Pair : Offsets) {
4319 SortedIndices[Cnt] = Pair.second;
4320 ++Cnt;
4321 }
4322 }
4323 if (!Inst)
4324 return nullptr;
4325 SCEVExpander Expander(SE, DL, "strided-load-vec");
4326 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4327}
4328
4329static std::pair<InstructionCost, InstructionCost>
4330getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
 4331 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4332 Type *ScalarTy, VectorType *VecTy);
4333
4334BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads(
 4335 ArrayRef<Value *> VL, const Value *VL0, SmallVectorImpl<unsigned> &Order,
4336 SmallVectorImpl<Value *> &PointerOps, bool TryRecursiveCheck) const {
4337 // Check that a vectorized load would load the same memory as a scalar
4338 // load. For example, we don't want to vectorize loads that are smaller
4339 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
4340 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
4341 // from such a struct, we read/write packed bits disagreeing with the
4342 // unvectorized version.
4343 Type *ScalarTy = VL0->getType();
4344
4345 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
4346 return LoadsState::Gather;
4347
4348 // Make sure all loads in the bundle are simple - we can't vectorize
4349 // atomic or volatile loads.
4350 PointerOps.clear();
4351 const unsigned Sz = VL.size();
4352 PointerOps.resize(Sz);
4353 auto *POIter = PointerOps.begin();
4354 for (Value *V : VL) {
4355 auto *L = cast<LoadInst>(V);
4356 if (!L->isSimple())
4357 return LoadsState::Gather;
4358 *POIter = L->getPointerOperand();
4359 ++POIter;
4360 }
4361
4362 Order.clear();
4363 auto *VecTy = FixedVectorType::get(ScalarTy, Sz);
4364 // Check the order of pointer operands or that all pointers are the same.
4365 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
4366 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
4367 if (!Order.empty() && !isPowerOf2_32(VL.size())) {
4368 assert(VectorizeNonPowerOf2 && "non-power-of-2 number of loads only "
4369 "supported with VectorizeNonPowerOf2");
4370 return LoadsState::Gather;
4371 }
4372
4373 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4374 if (!IsSorted && Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy) &&
4375 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
4376 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
 4377 return LoadsState::StridedVectorize;
 4378 if (IsSorted || all_of(PointerOps, [&](Value *P) {
4379 return arePointersCompatible(P, PointerOps.front(), *TLI);
4380 })) {
4381 if (IsSorted) {
4382 Value *Ptr0;
4383 Value *PtrN;
4384 if (Order.empty()) {
4385 Ptr0 = PointerOps.front();
4386 PtrN = PointerOps.back();
4387 } else {
4388 Ptr0 = PointerOps[Order.front()];
4389 PtrN = PointerOps[Order.back()];
4390 }
4391 std::optional<int> Diff =
4392 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
4393 // Check that the sorted loads are consecutive.
4394 if (static_cast<unsigned>(*Diff) == Sz - 1)
4395 return LoadsState::Vectorize;
4396 // Simple check if not a strided access - clear order.
4397 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
4398 // Try to generate strided load node if:
4399 // 1. Target with strided load support is detected.
4400 // 2. The number of loads is greater than MinProfitableStridedLoads,
4401 // or the potential stride <= MaxProfitableLoadStride and the
4402 // potential stride is power-of-2 (to avoid perf regressions for the very
4403 // small number of loads) and max distance > number of loads, or potential
4404 // stride is -1.
4405 // 3. The loads are ordered, or number of unordered loads <=
4406 // MaxProfitableUnorderedLoads, or loads are in reversed order.
4407 // (this check is to avoid extra costs for very expensive shuffles).
4408 if (IsPossibleStrided && (((Sz > MinProfitableStridedLoads ||
4409 (static_cast<unsigned>(std::abs(*Diff)) <=
 4410 MaxProfitableLoadStride * Sz &&
 4411 isPowerOf2_32(std::abs(*Diff)))) &&
4412 static_cast<unsigned>(std::abs(*Diff)) > Sz) ||
4413 *Diff == -(static_cast<int>(Sz) - 1))) {
4414 int Stride = *Diff / static_cast<int>(Sz - 1);
4415 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
4416 Align Alignment =
4417 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
4418 ->getAlign();
4419 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
4420 // Iterate through all pointers and check if all distances are
4421 // unique multiple of Dist.
4422 SmallSet<int, 4> Dists;
4423 for (Value *Ptr : PointerOps) {
4424 int Dist = 0;
4425 if (Ptr == PtrN)
4426 Dist = *Diff;
4427 else if (Ptr != Ptr0)
4428 Dist =
4429 *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
4430 // If the strides are not the same or repeated, we can't
4431 // vectorize.
4432 if (((Dist / Stride) * Stride) != Dist ||
4433 !Dists.insert(Dist).second)
4434 break;
4435 }
4436 if (Dists.size() == Sz)
 4437 return LoadsState::StridedVectorize;
 4438 }
4439 }
4440 }
4441 }
4442 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) {
4443 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
4444 unsigned MinVF = getMinVF(Sz);
4445 unsigned MaxVF = std::max<unsigned>(bit_floor(VL.size() / 2), MinVF);
4446 MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF);
4447 for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) {
4448 unsigned VectorizedCnt = 0;
 4449 SmallVector<LoadsState> States;
 4450 for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End;
4451 Cnt += VF, ++VectorizedCnt) {
4452 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
 4453 SmallVector<unsigned> Order;
 4454 SmallVector<Value *> PointerOps;
4455 LoadsState LS =
4456 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps,
4457 /*TryRecursiveCheck=*/false);
4458 // Check that the sorted loads are consecutive.
4459 if (LS == LoadsState::Gather)
4460 break;
4461 // If need the reorder - consider as high-cost masked gather for now.
4462 if ((LS == LoadsState::Vectorize ||
 4463 LS == LoadsState::StridedVectorize) &&
 4464 !Order.empty() && !isReverseOrder(Order))
 4465 LS = LoadsState::ScatterVectorize;
 4466 States.push_back(LS);
4467 }
 4468 // Can be vectorized later as a series of loads/insertelements.
4469 if (VectorizedCnt == VL.size() / VF) {
 4470 // Compare masked gather cost and loads + insertsubvector costs.
 4471 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
 4472 auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4473 TTI, PointerOps, PointerOps.front(), Instruction::GetElementPtr,
4474 CostKind, ScalarTy, VecTy);
4475 InstructionCost MaskedGatherCost =
 4476 TTI.getGatherScatterOpCost(
 4477 Instruction::Load, VecTy,
4478 cast<LoadInst>(VL0)->getPointerOperand(),
4479 /*VariableMask=*/false, CommonAlignment, CostKind) +
4480 VectorGEPCost - ScalarGEPCost;
4481 InstructionCost VecLdCost = 0;
4482 auto *SubVecTy = FixedVectorType::get(ScalarTy, VF);
4483 for (auto [I, LS] : enumerate(States)) {
4484 auto *LI0 = cast<LoadInst>(VL[I * VF]);
4485 switch (LS) {
4486 case LoadsState::Vectorize: {
4487 auto [ScalarGEPCost, VectorGEPCost] =
4488 getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4489 LI0->getPointerOperand(), Instruction::Load,
4490 CostKind, ScalarTy, SubVecTy);
4491 VecLdCost += TTI.getMemoryOpCost(
4492 Instruction::Load, SubVecTy, LI0->getAlign(),
4493 LI0->getPointerAddressSpace(), CostKind,
 4494 TTI::OperandValueInfo()) +
 4495 VectorGEPCost - ScalarGEPCost;
4496 break;
4497 }
 4498 case LoadsState::StridedVectorize: {
 4499 auto [ScalarGEPCost, VectorGEPCost] =
4500 getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4501 LI0->getPointerOperand(), Instruction::Load,
4502 CostKind, ScalarTy, SubVecTy);
4503 VecLdCost +=
 4504 TTI.getStridedMemoryOpCost(
 4505 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4506 /*VariableMask=*/false, CommonAlignment, CostKind) +
4507 VectorGEPCost - ScalarGEPCost;
4508 break;
4509 }
 4510 case LoadsState::ScatterVectorize: {
 4511 auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts(
4512 TTI, ArrayRef(PointerOps).slice(I * VF, VF),
4513 LI0->getPointerOperand(), Instruction::GetElementPtr,
4514 CostKind, ScalarTy, SubVecTy);
4515 VecLdCost +=
 4516 TTI.getGatherScatterOpCost(
 4517 Instruction::Load, SubVecTy, LI0->getPointerOperand(),
4518 /*VariableMask=*/false, CommonAlignment, CostKind) +
4519 VectorGEPCost - ScalarGEPCost;
4520 break;
4521 }
4522 case LoadsState::Gather:
4524 "Expected only consecutive, strided or masked gather loads.");
4525 }
4526 SmallVector<int> ShuffleMask(VL.size());
4527 for (int Idx : seq<int>(0, VL.size()))
4528 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
4529 VecLdCost +=
4530 TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy, ShuffleMask,
4531 CostKind, I * VF, SubVecTy);
4532 }
4533 // If the masked gather cost is higher - it is better to vectorize, so
4534 // consider it as a gather node. It will be better estimated
4535 // later.
4536 if (MaskedGatherCost >= VecLdCost)
4537 return true;
4538 }
4539 }
4540 return false;
4541 };
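// For example, with 8 scalar loads and VF = 4 the lambda above compares the
// cost of a single masked gather of <8 x Ty> against the cost of, e.g., two
// consecutive vector loads of <4 x Ty> plus two SK_InsertSubvector shuffles
// (including the corresponding GEP cost deltas). Only if the masked gather is
// at least as expensive is the bundle treated as a gather node here.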
4542 // TODO: need to improve analysis of the pointers, if not all of them are
4543 // GEPs or have > 2 operands, we end up with a gather node, which just
4544 // increases the cost.
4545 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
4546 bool ProfitableGatherPointers =
4547 L && Sz > 2 &&
4548 static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
4549 return L->isLoopInvariant(V);
4550 })) <= Sz / 2;
4551 if (ProfitableGatherPointers || all_of(PointerOps, [IsSorted](Value *P) {
4552 auto *GEP = dyn_cast<GetElementPtrInst>(P);
4553 return (IsSorted && !GEP && doesNotNeedToBeScheduled(P)) ||
4554 (GEP && GEP->getNumOperands() == 2 &&
4555 isa<Constant, Instruction>(GEP->getOperand(1)));
4556 })) {
4557 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
4558 if (TTI->isLegalMaskedGather(VecTy, CommonAlignment) &&
4559 !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) {
4560 // Check if potential masked gather can be represented as series
4561 // of loads + insertsubvectors.
4562 if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) {
4563 // If the masked gather cost is higher - it is better to vectorize, so
4564 // consider it as a gather node. It will be better estimated
4565 // later.
4566 return LoadsState::Gather;
4567 }
4569 }
4570 }
4571 }
4572
4573 return LoadsState::Gather;
4574}
4575
4577 const DataLayout &DL, ScalarEvolution &SE,
4578 SmallVectorImpl<unsigned> &SortedIndices) {
4580 VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
4581 "Expected list of pointer operands.");
4582 // Map from bases to a vector of (Ptr, Offset, OrigIdx). We insert each Ptr
4583 // into the vector of its base, sort each vector by Offset and return the
4584 // sorted indices with values from the same base next to one another.
4586 Bases[VL[0]].push_back(std::make_tuple(VL[0], 0U, 0U));
4587
4588 unsigned Cnt = 1;
4589 for (Value *Ptr : VL.drop_front()) {
4590 bool Found = any_of(Bases, [&](auto &Base) {
4591 std::optional<int> Diff =
4592 getPointersDiff(ElemTy, Base.first, ElemTy, Ptr, DL, SE,
4593 /*StrictCheck=*/true);
4594 if (!Diff)
4595 return false;
4596
4597 Base.second.emplace_back(Ptr, *Diff, Cnt++);
4598 return true;
4599 });
4600
4601 if (!Found) {
4602 // If we haven't found enough to usefully cluster, return early.
4603 if (Bases.size() > VL.size() / 2 - 1)
4604 return false;
4605
4606 // Not found already - add a new Base
4607 Bases[Ptr].emplace_back(Ptr, 0, Cnt++);
4608 }
4609 }
4610
4611 // For each of the bases, sort the pointers by Offset and check if any of
4612 // the bases become consecutively allocated.
4613 bool AnyConsecutive = false;
4614 for (auto &Base : Bases) {
4615 auto &Vec = Base.second;
4616 if (Vec.size() > 1) {
4617 llvm::stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
4618 const std::tuple<Value *, int, unsigned> &Y) {
4619 return std::get<1>(X) < std::get<1>(Y);
4620 });
4621 int InitialOffset = std::get<1>(Vec[0]);
4622 AnyConsecutive |= all_of(enumerate(Vec), [InitialOffset](const auto &P) {
4623 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
4624 });
4625 }
4626 }
4627
4628 // Fill the SortedIndices array only if it looks worthwhile to sort the ptrs.
4629 SortedIndices.clear();
4630 if (!AnyConsecutive)
4631 return false;
4632
4633 for (auto &Base : Bases) {
4634 for (auto &T : Base.second)
4635 SortedIndices.push_back(std::get<2>(T));
4636 }
4637
4638 assert(SortedIndices.size() == VL.size() &&
4639 "Expected SortedIndices to be the size of VL");
4640 return true;
4641}
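// Example (illustrative): for pointers VL = {A+1, A, B, A+2, B+1} over i32
// elements, two bases are formed: {A+1: offsets 0, -1, +1} and {B: offsets
// 0, +1}. Sorting each base by offset yields the consecutive runs A, A+1, A+2
// and B, B+1, so the function succeeds, and one possible SortedIndices (the
// base iteration order is unspecified) is {1, 0, 3, 2, 4}.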
4642
4643std::optional<BoUpSLP::OrdersType>
4644BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
4645 assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
4646 Type *ScalarTy = TE.Scalars[0]->getType();
4647
4649 Ptrs.reserve(TE.Scalars.size());
4650 for (Value *V : TE.Scalars) {
4651 auto *L = dyn_cast<LoadInst>(V);
4652 if (!L || !L->isSimple())
4653 return std::nullopt;
4654 Ptrs.push_back(L->getPointerOperand());
4655 }
4656
4657 BoUpSLP::OrdersType Order;
4658 if (clusterSortPtrAccesses(Ptrs, ScalarTy, *DL, *SE, Order))
4659 return std::move(Order);
4660 return std::nullopt;
4661}
4662
4663/// Check if two insertelement instructions are from the same buildvector.
4666 function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
4667 // Instructions must be from the same basic blocks.
4668 if (VU->getParent() != V->getParent())
4669 return false;
4670 // Checks if 2 insertelements are from the same buildvector.
4671 if (VU->getType() != V->getType())
4672 return false;
4673 // Multiple used inserts are separate nodes.
4674 if (!VU->hasOneUse() && !V->hasOneUse())
4675 return false;
4676 auto *IE1 = VU;
4677 auto *IE2 = V;
4678 std::optional<unsigned> Idx1 = getInsertIndex(IE1);
4679 std::optional<unsigned> Idx2 = getInsertIndex(IE2);
4680 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
4681 return false;
4682 // Go through the vector operand of insertelement instructions trying to find
4683 // either VU as the original vector for IE2 or V as the original vector for
4684 // IE1.
4685 SmallBitVector ReusedIdx(
4686 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
4687 bool IsReusedIdx = false;
4688 do {
4689 if (IE2 == VU && !IE1)
4690 return VU->hasOneUse();
4691 if (IE1 == V && !IE2)
4692 return V->hasOneUse();
4693 if (IE1 && IE1 != V) {
4694 unsigned Idx1 = getInsertIndex(IE1).value_or(*Idx2);
4695 IsReusedIdx |= ReusedIdx.test(Idx1);
4696 ReusedIdx.set(Idx1);
4697 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
4698 IE1 = nullptr;
4699 else
4700 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
4701 }
4702 if (IE2 && IE2 != VU) {
4703 unsigned Idx2 = getInsertIndex(IE2).value_or(*Idx1);
4704 IsReusedIdx |= ReusedIdx.test(Idx2);
4705 ReusedIdx.set(Idx2);
4706 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
4707 IE2 = nullptr;
4708 else
4709 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
4710 }
4711 } while (!IsReusedIdx && (IE1 || IE2));
4712 return false;
4713}
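// Example (illustrative): given the chain
//   %v0 = insertelement <4 x float> poison, float %a, i64 0
//   %v1 = insertelement <4 x float> %v0,    float %b, i64 1
// calling this helper with VU = %v1, V = %v0 and GetBaseOperand returning the
// vector operand walks from %v1 down to %v0 and returns true as long as %v0
// is only used by %v1, i.e. both inserts belong to one buildvector sequence.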
4714
4715std::optional<BoUpSLP::OrdersType>
4716BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
4717 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
4718 if (TE.isNonPowOf2Vec())
4719 return std::nullopt;
4720
4721 // No need to reorder if we need to shuffle reuses; the node still needs to
4722 // be shuffled.
4723 if (!TE.ReuseShuffleIndices.empty()) {
4724 if (isSplat(TE.Scalars))
4725 return std::nullopt;
4726 // Check if reuse shuffle indices can be improved by reordering.
4727 // For this, check that the reuse mask is "clustered", i.e. each scalar value
4728 // is used once in each submask of size <number_of_scalars>.
4729 // Example: 4 scalar values.
4730 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
4731 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
4732 // element 3 is used twice in the second submask.
4733 unsigned Sz = TE.Scalars.size();
4734 if (TE.State == TreeEntry::NeedToGather) {
4735 if (std::optional<OrdersType> CurrentOrder =
4737 SmallVector<int> Mask;
4738 fixupOrderingIndices(*CurrentOrder);
4739 inversePermutation(*CurrentOrder, Mask);
4740 ::addMask(Mask, TE.ReuseShuffleIndices);
4741 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
4742 unsigned Sz = TE.Scalars.size();
4743 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
4744 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
4745 if (Idx != PoisonMaskElem)
4746 Res[Idx + K * Sz] = I + K * Sz;
4747 }
4748 return std::move(Res);
4749 }
4750 }
4751 if (Sz == 2 && TE.getVectorFactor() == 4 &&
4753 TE.Scalars.front()->getType(), 2 * TE.getVectorFactor())) == 1)
4754 return std::nullopt;
4755 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
4756 Sz)) {
4757 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4758 if (TE.ReorderIndices.empty())
4759 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4760 else
4761 inversePermutation(TE.ReorderIndices, ReorderMask);
4762 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
4763 unsigned VF = ReorderMask.size();
4764 OrdersType ResOrder(VF, VF);
4765 unsigned NumParts = VF / Sz;
4766 SmallBitVector UsedVals(NumParts);
4767 for (unsigned I = 0; I < VF; I += Sz) {
4768 int Val = PoisonMaskElem;
4769 unsigned UndefCnt = 0;
4770 if (any_of(ArrayRef(ReorderMask).slice(I, Sz),
4771 [&](int Idx) {
4772 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
4773 Val = Idx;
4774 if (Idx == PoisonMaskElem)
4775 ++UndefCnt;
4776 return Idx != PoisonMaskElem && Idx != Val;
4777 }) ||
4778 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
4779 UndefCnt > Sz / 2)
4780 return std::nullopt;
4781 UsedVals.set(Val);
4782 for (unsigned K = 0; K < NumParts; ++K)
4783 ResOrder[Val + Sz * K] = I + K;
4784 }
4785 return std::move(ResOrder);
4786 }
4787 unsigned VF = TE.getVectorFactor();
4788 // Try to build the correct order for extractelement instructions.
4789 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
4790 TE.ReuseShuffleIndices.end());
4791 if (TE.getOpcode() == Instruction::ExtractElement && !TE.isAltShuffle() &&
4792 all_of(TE.Scalars, [Sz](Value *V) {
4793 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
4794 return Idx && *Idx < Sz;
4795 })) {
4796 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
4797 if (TE.ReorderIndices.empty())
4798 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
4799 else
4800 inversePermutation(TE.ReorderIndices, ReorderMask);
4801 for (unsigned I = 0; I < VF; ++I) {
4802 int &Idx = ReusedMask[I];
4803 if (Idx == PoisonMaskElem)
4804 continue;
4805 Value *V = TE.Scalars[ReorderMask[Idx]];
4806 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
4807 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
4808 }
4809 }
4810 // Build the order of VF size; the reuses shuffles need to be reordered, as
4811 // they are always of VF size.
4812 OrdersType ResOrder(VF);
4813 std::iota(ResOrder.begin(), ResOrder.end(), 0);
4814 auto *It = ResOrder.begin();
4815 for (unsigned K = 0; K < VF; K += Sz) {
4816 OrdersType CurrentOrder(TE.ReorderIndices);
4817 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
4818 if (SubMask.front() == PoisonMaskElem)
4819 std::iota(SubMask.begin(), SubMask.end(), 0);
4820 reorderOrder(CurrentOrder, SubMask);
4821 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
4822 std::advance(It, Sz);
4823 }
4824 if (TE.State == TreeEntry::NeedToGather &&
4825 all_of(enumerate(ResOrder),
4826 [](const auto &Data) { return Data.index() == Data.value(); }))
4827 return std::nullopt; // No need to reorder.
4828 return std::move(ResOrder);
4829 }
4830 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
4831 any_of(TE.UserTreeIndices,
4832 [](const EdgeInfo &EI) {
4833 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
4834 }) &&
4835 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
4836 return std::nullopt;
4837 if ((TE.State == TreeEntry::Vectorize ||
4838 TE.State == TreeEntry::StridedVectorize) &&
4839 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
4840 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp()))) &&
4841 !TE.isAltShuffle())
4842 return TE.ReorderIndices;
4843 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
4844 auto PHICompare = [&](unsigned I1, unsigned I2) {
4845 Value *V1 = TE.Scalars[I1];
4846 Value *V2 = TE.Scalars[I2];
4847 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
4848 return false;
4849 if (V1->getNumUses() < V2->getNumUses())
4850 return true;
4851 if (V1->getNumUses() > V2->getNumUses())
4852 return false;
4853 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
4854 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
4855 if (auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1))
4856 if (auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2)) {
4858 IE1, IE2,
4859 [](InsertElementInst *II) { return II->getOperand(0); }))
4860 return I1 < I2;
4861 return getInsertIndex(IE1) < getInsertIndex(IE2);
4862 }
4863 if (auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1))
4864 if (auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2)) {
4865 if (EE1->getOperand(0) != EE2->getOperand(0))
4866 return I1 < I2;
4867 return getInsertIndex(EE1) < getInsertIndex(EE2);
4868 }
4869 return I1 < I2;
4870 };
4871 auto IsIdentityOrder = [](const OrdersType &Order) {
4872 for (unsigned Idx : seq<unsigned>(0, Order.size()))
4873 if (Idx != Order[Idx])
4874 return false;
4875 return true;
4876 };
4877 if (!TE.ReorderIndices.empty())
4878 return TE.ReorderIndices;
4880 SmallVector<unsigned> Phis(TE.Scalars.size());
4881 std::iota(Phis.begin(), Phis.end(), 0);
4882 OrdersType ResOrder(TE.Scalars.size());
4883 for (unsigned Id = 0, Sz = TE.Scalars.size(); Id < Sz; ++Id)
4884 PhiToId[Id] = Id;
4885 stable_sort(Phis, PHICompare);
4886 for (unsigned Id = 0, Sz = Phis.size(); Id < Sz; ++Id)
4887 ResOrder[Id] = PhiToId[Phis[Id]];
4888 if (IsIdentityOrder(ResOrder))
4889 return std::nullopt; // No need to reorder.
4890 return std::move(ResOrder);
4891 }
4892 if (TE.State == TreeEntry::NeedToGather && !TE.isAltShuffle() &&
4893 allSameType(TE.Scalars)) {
4894 // TODO: add analysis of other gather nodes with extractelement
4895 // instructions and other values/instructions, not only undefs.
4896 if ((TE.getOpcode() == Instruction::ExtractElement ||
4897 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
4898 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
4899 all_of(TE.Scalars, [](Value *V) {
4900 auto *EE = dyn_cast<ExtractElementInst>(V);
4901 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
4902 })) {
4903 // Check that gather of extractelements can be represented as
4904 // just a shuffle of a single vector.
4905 OrdersType CurrentOrder;
4906 bool Reuse = canReuseExtract(TE.Scalars, TE.getMainOp(), CurrentOrder,
4907 /*ResizeAllowed=*/true);
4908 if (Reuse || !CurrentOrder.empty())
4909 return std::move(CurrentOrder);
4910 }
4911 // If the gather node is <undef, v, .., poison> and
4912 // insertelement poison, v, 0 [+ permute]
4913 // is cheaper than
4914 // insertelement poison, v, n - try to reorder.
4915 // If rotating the whole graph, exclude the permute cost, the whole graph
4916 // might be transformed.
4917 int Sz = TE.Scalars.size();
4918 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
4919 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
4920 const auto *It =
4921 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
4922 if (It == TE.Scalars.begin())
4923 return OrdersType();
4924 auto *Ty = FixedVectorType::get(TE.Scalars.front()->getType(), Sz);
4925 if (It != TE.Scalars.end()) {
4926 OrdersType Order(Sz, Sz);
4927 unsigned Idx = std::distance(TE.Scalars.begin(), It);
4928 Order[Idx] = 0;
4929 fixupOrderingIndices(Order);
4930 SmallVector<int> Mask;
4931 inversePermutation(Order, Mask);
4932 InstructionCost PermuteCost =
4933 TopToBottom
4934 ? 0
4936 InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
4937 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
4938 PoisonValue::get(Ty), *It);
4939 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
4940 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
4941 PoisonValue::get(Ty), *It);
4942 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
4943 OrdersType Order(Sz, Sz);
4944 Order[Idx] = 0;
4945 return std::move(Order);
4946 }
4947 }
4948 }
4949 if (isSplat(TE.Scalars))
4950 return std::nullopt;
4951 if (TE.Scalars.size() >= 4)
4952 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
4953 return Order;
4954 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
4955 return CurrentOrder;
4956 }
4957 return std::nullopt;
4958}
4959
4960/// Checks if the given mask is a "clustered" mask with the same clusters of
4961/// size \p Sz, which are not identity submasks.
4963 unsigned Sz) {
4964 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
4965 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
4966 return false;
4967 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
4968 ArrayRef<int> Cluster = Mask.slice(I, Sz);
4969 if (Cluster != FirstCluster)
4970 return false;
4971 }
4972 return true;
4973}
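// Example (illustrative), for Sz = 4:
//   {1, 0, 3, 2, 1, 0, 3, 2} -> true  (same non-identity cluster repeated)
//   {0, 1, 2, 3, 0, 1, 2, 3} -> false (the first cluster is an identity mask)
//   {1, 0, 3, 2, 1, 0, 2, 3} -> false (the second cluster differs)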
4974
4975void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
4976 // Reorder reuses mask.
4977 reorderReuses(TE.ReuseShuffleIndices, Mask);
4978 const unsigned Sz = TE.Scalars.size();
4979 // For vectorized nodes and non-clustered reuses no need to do anything else.
4980 if (TE.State != TreeEntry::NeedToGather ||
4982 Sz) ||
4983 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
4984 return;
4985 SmallVector<int> NewMask;
4986 inversePermutation(TE.ReorderIndices, NewMask);
4987 addMask(NewMask, TE.ReuseShuffleIndices);
4988 // Clear reorder since it is going to be applied to the new mask.
4989 TE.ReorderIndices.clear();
4990 // Try to improve gathered nodes with clustered reuses, if possible.
4991 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
4992 SmallVector<unsigned> NewOrder(Slice.begin(), Slice.end());
4993 inversePermutation(NewOrder, NewMask);
4994 reorderScalars(TE.Scalars, NewMask);
4995 // Fill the reuses mask with the identity submasks.
4996 for (auto *It = TE.ReuseShuffleIndices.begin(),
4997 *End = TE.ReuseShuffleIndices.end();
4998 It != End; std::advance(It, Sz))
4999 std::iota(It, std::next(It, Sz), 0);
5000}
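// The intent, roughly, is that a gather node with a clustered non-identity
// reuse mask such as {1, 0, 3, 2, 1, 0, 3, 2} (Sz = 4) can permute its scalars
// once and then use plain identity submasks {0, 1, 2, 3, 0, 1, 2, 3}, avoiding
// an extra reordering shuffle.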
5001
5003 ArrayRef<unsigned> SecondaryOrder) {
5004 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
5005 "Expected same size of orders");
5006 unsigned Sz = Order.size();
5007 SmallBitVector UsedIndices(Sz);
5008 for (unsigned Idx : seq<unsigned>(0, Sz)) {
5009 if (Order[Idx] != Sz)
5010 UsedIndices.set(Order[Idx]);
5011 }
5012 if (SecondaryOrder.empty()) {
5013 for (unsigned Idx : seq<unsigned>(0, Sz))
5014 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
5015 Order[Idx] = Idx;
5016 } else {
5017 for (unsigned Idx : seq<unsigned>(0, Sz))
5018 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
5019 !UsedIndices.test(SecondaryOrder[Idx]))
5020 Order[Idx] = SecondaryOrder[Idx];
5021 }
5022}
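// Example (illustrative), with Sz = 4 and Sz used as the "unset" marker:
// Order = {2, 4, 0, 4} and an empty SecondaryOrder fills the unset slots with
// their own (still unused) indices, giving {2, 1, 0, 3}. With a non-empty
// SecondaryOrder, unset slots are instead filled from SecondaryOrder, but only
// when that index has not been used already.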
5023
5025 // Maps VF to the graph nodes.
5027 // ExtractElement gather nodes which can be vectorized and need to handle
5028 // their ordering.
5030
5031 // Phi nodes can have preferred ordering based on their result users
5033
5034 // AltShuffles can also have a preferred ordering that leads to fewer
5035 // instructions, e.g., the addsub instruction in x86.
5036 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
5037
5038 // Maps a TreeEntry to the reorder indices of external users.
5040 ExternalUserReorderMap;
5041 // Find all reorderable nodes with the given VF.
5042 // Currently these are vectorized stores, loads, extracts + some gathering
5043 // of extracts.
5044 for_each(VectorizableTree, [&, &TTIRef = *TTI](
5045 const std::unique_ptr<TreeEntry> &TE) {
5046 // Look for external users that will probably be vectorized.
5047 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
5048 findExternalStoreUsersReorderIndices(TE.get());
5049 if (!ExternalUserReorderIndices.empty()) {
5050 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5051 ExternalUserReorderMap.try_emplace(TE.get(),
5052 std::move(ExternalUserReorderIndices));
5053 }
5054
5055 // Patterns like [fadd,fsub] can be combined into a single instruction in
5056 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
5057 // to take into account their order when looking for the most used order.
5058 if (TE->isAltShuffle()) {
5059 VectorType *VecTy =
5060 FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
5061 unsigned Opcode0 = TE->getOpcode();
5062 unsigned Opcode1 = TE->getAltOpcode();
5063 // The opcode mask selects between the two opcodes.
5064 SmallBitVector OpcodeMask(TE->Scalars.size(), false);
5065 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
5066 if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
5067 OpcodeMask.set(Lane);
5068 // If this pattern is supported by the target then we consider the order.
5069 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5070 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5071 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
5072 }
5073 // TODO: Check the reverse order too.
5074 }
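// For example, for TE->Scalars = {fadd, fsub, fadd, fsub} with Opcode0 = fadd
// and Opcode1 = fsub, OpcodeMask becomes {0, 1, 0, 1}; targets with an
// addsub-style instruction can report such a pattern as legal via
// isLegalAltInstr.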
5075
5076 if (std::optional<OrdersType> CurrentOrder =
5077 getReorderingData(*TE, /*TopToBottom=*/true)) {
5078 // Do not include ordering for nodes used in the alt opcode vectorization;
5079 // it is better to reorder them during the bottom-to-top stage. If we follow
5080 // the order here, it causes reordering of the whole graph, though actually
5081 // it is profitable just to reorder the subgraph that starts from the
5082 // alternate opcode vectorization node. Such nodes already end up with a
5083 // shuffle instruction and it is enough to change this shuffle rather than
5084 // rotate the scalars for the whole graph.
5085 unsigned Cnt = 0;
5086 const TreeEntry *UserTE = TE.get();
5087 while (UserTE && Cnt < RecursionMaxDepth) {
5088 if (UserTE->UserTreeIndices.size() != 1)
5089 break;
5090 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
5091 return EI.UserTE->State == TreeEntry::Vectorize &&
5092 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5093 }))
5094 return;
5095 UserTE = UserTE->UserTreeIndices.back().UserTE;
5096 ++Cnt;
5097 }
5098 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5099 if (!(TE->State == TreeEntry::Vectorize ||
5100 TE->State == TreeEntry::StridedVectorize) ||
5101 !TE->ReuseShuffleIndices.empty())
5102 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
5103 if (TE->State == TreeEntry::Vectorize &&
5104 TE->getOpcode() == Instruction::PHI)
5105 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
5106 }
5107 });
5108
5109 // Reorder the graph nodes according to their vectorization factor.
5110 for (unsigned VF = VectorizableTree.front()->getVectorFactor(); VF > 1;
5111 VF /= 2) {
5112 auto It = VFToOrderedEntries.find(VF);
5113 if (It == VFToOrderedEntries.end())
5114 continue;
5115 // Try to find the most profitable order. We are just looking for the most
5116 // used order and reorder the scalar elements in the nodes according to this
5117 // most used order.
5118 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
5119 // All operands are reordered and used only in this node - propagate the
5120 // most used order to the user node.
5123 OrdersUses;
5125 for (const TreeEntry *OpTE : OrderedEntries) {
5126 // No need to reorder these nodes; we still need to extend and use a shuffle,
5127 // just need to merge the reordering shuffle and the reuse shuffle.
5128 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5129 continue;
5130 // Count number of orders uses.
5131 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
5132 &PhisToOrders]() -> const OrdersType & {
5133 if (OpTE->State == TreeEntry::NeedToGather ||
5134 !OpTE->ReuseShuffleIndices.empty()) {
5135 auto It = GathersToOrders.find(OpTE);
5136 if (It != GathersToOrders.end())
5137 return It->second;
5138 }
5139 if (OpTE->isAltShuffle()) {
5140 auto It = AltShufflesToOrders.find(OpTE);
5141 if (It != AltShufflesToOrders.end())
5142 return It->second;
5143 }
5144 if (OpTE->State == TreeEntry::Vectorize &&
5145 OpTE->getOpcode() == Instruction::PHI) {
5146 auto It = PhisToOrders.find(OpTE);
5147 if (It != PhisToOrders.end())
5148 return It->second;
5149 }
5150 return OpTE->ReorderIndices;
5151 }();
5152 // First consider the order of the external scalar users.
5153 auto It = ExternalUserReorderMap.find(OpTE);
5154 if (It != ExternalUserReorderMap.end()) {
5155 const auto &ExternalUserReorderIndices = It->second;
5156 // If the OpTE vector factor != number of scalars - use the natural order;
5157 // it is an attempt to reorder a node with reused scalars but with
5158 // external uses.
5159 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
5160 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
5161 ExternalUserReorderIndices.size();
5162 } else {
5163 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
5164 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
5165 }
5166 // No other useful reorder data in this entry.
5167 if (Order.empty())
5168 continue;
5169 }
5170 // Stores actually store the mask, not the order; we need to invert it.
5171 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5172 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5173 SmallVector<int> Mask;
5174 inversePermutation(Order, Mask);
5175 unsigned E = Order.size();
5176 OrdersType CurrentOrder(E, E);
5177 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5178 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5179 });
5180 fixupOrderingIndices(CurrentOrder);
5181 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
5182 } else {
5183 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
5184 }
5185 }
5186 if (OrdersUses.empty())
5187 continue;
5188 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5189 const unsigned Sz = Order.size();
5190 for (unsigned Idx : seq<unsigned>(0, Sz))
5191 if (Idx != Order[Idx] && Order[Idx] != Sz)
5192 return false;
5193 return true;
5194 };
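// For example, if three reorderable entries of this VF vote for the orders
// {1, 0, 3, 2}, {1, 0, 3, 2} and the identity, OrdersUses ends up with the
// counts {{1, 0, 3, 2}: 2, identity: 1} and {1, 0, 3, 2} is selected as
// BestOrder below.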
5195 // Choose the most used order.
5196 unsigned IdentityCnt = 0;
5197 unsigned FilledIdentityCnt = 0;
5198 OrdersType IdentityOrder(VF, VF);
5199 for (auto &Pair : OrdersUses) {
5200 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5201 if (!Pair.first.empty())
5202 FilledIdentityCnt += Pair.second;
5203 IdentityCnt += Pair.second;
5204 combineOrders(IdentityOrder, Pair.first);
5205 }
5206 }
5207 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5208 unsigned Cnt = IdentityCnt;
5209 for (auto &Pair : OrdersUses) {
5210 // Prefer the identity order. But if a filled identity (non-empty order) is
5211 // found with the same number of uses as the new candidate order, we can
5212 // choose this candidate order.
5213 if (Cnt < Pair.second ||
5214 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
5215 Cnt == Pair.second && !BestOrder.empty() &&
5216 IsIdentityOrder(BestOrder))) {
5217 combineOrders(Pair.first, BestOrder);
5218 BestOrder = Pair.first;
5219 Cnt = Pair.second;
5220 } else {
5221 combineOrders(BestOrder, Pair.first);
5222 }
5223 }
5224 // Set order of the user node.
5225 if (IsIdentityOrder(BestOrder))
5226 continue;
5227 fixupOrderingIndices(BestOrder);
5228 SmallVector<int> Mask;
5229 inversePermutation(BestOrder, Mask);
5230 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5231 unsigned E = BestOrder.size();
5232 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5233 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5234 });
5235 // Do an actual reordering, if profitable.
5236 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5237 // Just do the reordering for the nodes with the given VF.
5238 if (TE->Scalars.size() != VF) {
5239 if (TE->ReuseShuffleIndices.size() == VF) {
5240 // Need to reorder the reuses masks of the operands with smaller VF to
5241 // be able to find the match between the graph nodes and scalar
5242 // operands of the given node during vectorization/cost estimation.
5243 assert(all_of(TE->UserTreeIndices,
5244 [VF, &TE](const EdgeInfo &EI) {
5245 return EI.UserTE->Scalars.size() == VF ||
5246 EI.UserTE->Scalars.size() ==
5247 TE->Scalars.size();
5248 }) &&
5249 "All users must be of VF size.");
5250 // Update ordering of the operands with the smaller VF than the given
5251 // one.
5252 reorderNodeWithReuses(*TE, Mask);
5253 }
5254 continue;
5255 }
5256 if ((TE->State == TreeEntry::Vectorize ||
5257 TE->State == TreeEntry::StridedVectorize) &&
5259 InsertElementInst>(TE->getMainOp()) &&
5260 !TE->isAltShuffle()) {
5261 // Build correct orders for extract{element,value}, loads and
5262 // stores.
5263 reorderOrder(TE->ReorderIndices, Mask);
5264 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
5265 TE->reorderOperands(Mask);
5266 } else {
5267 // Reorder the node and its operands.
5268 TE->reorderOperands(Mask);
5269 assert(TE->ReorderIndices.empty() &&
5270 "Expected empty reorder sequence.");
5271 reorderScalars(TE->Scalars, Mask);
5272 }
5273 if (!TE->ReuseShuffleIndices.empty()) {
5274 // Apply reversed order to keep the original ordering of the reused
5275 // elements to avoid extra reorder indices shuffling.
5276 OrdersType CurrentOrder;
5277 reorderOrder(CurrentOrder, MaskOrder);
5278 SmallVector<int> NewReuses;
5279 inversePermutation(CurrentOrder, NewReuses);
5280 addMask(NewReuses, TE->ReuseShuffleIndices);
5281 TE->ReuseShuffleIndices.swap(NewReuses);
5282 }
5283 }
5284 }
5285}
5286
5287bool BoUpSLP::canReorderOperands(
5288 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
5289 ArrayRef<TreeEntry *> ReorderableGathers,
5290 SmallVectorImpl<TreeEntry *> &GatherOps) {
5291 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5292 if (UserTE->isNonPowOf2Vec())
5293 return false;
5294
5295 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
5296 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
5297 return OpData.first == I &&
5298 (OpData.second->State == TreeEntry::Vectorize ||
5299 OpData.second->State == TreeEntry::StridedVectorize);
5300 }))
5301 continue;
5302 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
5303 // Do not reorder if operand node is used by many user nodes.
5304 if (any_of(TE->UserTreeIndices,
5305 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
5306 return false;
5307 // Add the node to the list of the ordered nodes with the identity
5308 // order.
5309 Edges.emplace_back(I, TE);
5310 // Add ScatterVectorize nodes to the list of operands, where just
5311 // reordering of the scalars is required. Similar to the gathers, so
5312 // simply add to the list of gathered ops.
5313 // If there are reused scalars, process this node as a regular vectorize
5314 // node, just reorder reuses mask.
5315 if (TE->State != TreeEntry::Vectorize &&
5316 TE->State != TreeEntry::StridedVectorize &&
5317 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
5318 GatherOps.push_back(TE);
5319 continue;
5320 }
5321 TreeEntry *Gather = nullptr;
5322 if (count_if(ReorderableGathers,
5323 [&Gather, UserTE, I](TreeEntry *TE) {
5324 assert(TE->State != TreeEntry::Vectorize &&
5325 TE->State != TreeEntry::StridedVectorize &&
5326 "Only non-vectorized nodes are expected.");
5327 if (any_of(TE->UserTreeIndices,
5328 [UserTE, I](const EdgeInfo &EI) {
5329 return EI.UserTE == UserTE && EI.EdgeIdx == I;
5330 })) {
5331 assert(TE->isSame(UserTE->getOperand(I)) &&
5332 "Operand entry does not match operands.");
5333 Gather = TE;
5334 return true;
5335 }
5336 return false;
5337 }) > 1 &&
5338 !allConstant(UserTE->getOperand(I)))
5339 return false;
5340 if (Gather)
5341 GatherOps.push_back(Gather);
5342 }
5343 return true;
5344}
5345
5346void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
5347 SetVector<TreeEntry *> OrderedEntries;
5348 DenseSet<const TreeEntry *> GathersToOrders;
5349 // Find all reorderable leaf nodes with the given VF.
5350 // Currently these are vectorized loads, extracts without alternate operands
5351 // + some gathering of extracts.
5352 SmallVector<TreeEntry *> NonVectorized;
5353 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
5354 if (TE->State != TreeEntry::Vectorize &&
5355 TE->State != TreeEntry::StridedVectorize)
5356 NonVectorized.push_back(TE.get());
5357 if (std::optional<OrdersType> CurrentOrder =
5358 getReorderingData(*TE, /*TopToBottom=*/false)) {
5359 OrderedEntries.insert(TE.get());
5360 if (!(TE->State == TreeEntry::Vectorize ||
5361 TE->State == TreeEntry::StridedVectorize) ||
5362 !TE->ReuseShuffleIndices.empty())
5363 GathersToOrders.insert(TE.get());
5364 }
5365 }
5366
5367 // 1. Propagate order to the graph nodes, which use only reordered nodes.
5368 // I.e., if the node has operands that are reordered, try to keep at least
5369 // one operand in the natural order and reorder the others + reorder the
5370 // user node itself.
5372 while (!OrderedEntries.empty()) {
5373 // 1. Filter out only reordered nodes.
5374 // 2. If the entry has multiple uses - skip it and jump to the next node.
5376 SmallVector<TreeEntry *> Filtered;
5377 for (TreeEntry *TE : OrderedEntries) {
5378 if (!(TE->State == TreeEntry::Vectorize ||
5379 TE->State == TreeEntry::StridedVectorize ||
5380 (TE->State == TreeEntry::NeedToGather &&
5381 GathersToOrders.contains(TE))) ||
5382 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5383 !all_of(drop_begin(TE->UserTreeIndices),
5384 [TE](const EdgeInfo &EI) {
5385 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
5386 }) ||
5387 !Visited.insert(TE).second) {
5388 Filtered.push_back(TE);
5389 continue;
5390 }
5391 // Build a map between user nodes and their operand orders to speed up the
5392 // search. The graph currently does not provide this dependency directly.
5393 for (EdgeInfo &EI : TE->UserTreeIndices) {
5394 TreeEntry *UserTE = EI.UserTE;
5395 auto It = Users.find(UserTE);
5396 if (It == Users.end())
5397 It = Users.insert({UserTE, {}}).first;
5398 It->second.emplace_back(EI.EdgeIdx, TE);
5399 }
5400 }
5401 // Erase filtered entries.
5402 for (TreeEntry *TE : Filtered)
5403 OrderedEntries.remove(TE);
5405 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
5406 UsersVec(Users.begin(), Users.end());
5407 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
5408 return Data1.first->Idx > Data2.first->Idx;
5409 });
5410 for (auto &Data : UsersVec) {
5411 // Check that operands are used only in the User node.
5412 SmallVector<TreeEntry *> GatherOps;
5413 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
5414 GatherOps)) {
5415 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5416 OrderedEntries.remove(Op.second);
5417 continue;
5418 }
5419 // All operands are reordered and used only in this node - propagate the
5420 // most used order to the user node.
5423 OrdersUses;
5424 // Do the analysis for each tree entry only once, otherwise the order of
5425 // the same node may be considered several times, though it might not be
5426 // profitable.
5429 for (const auto &Op : Data.second) {
5430 TreeEntry *OpTE = Op.second;
5431 if (!VisitedOps.insert(OpTE).second)
5432 continue;
5433 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
5434 continue;
5435 const auto Order = [&]() -> const OrdersType {
5436 if (OpTE->State == TreeEntry::NeedToGather ||
5437 !OpTE->ReuseShuffleIndices.empty())
5438 return getReorderingData(*OpTE, /*TopToBottom=*/false)
5439 .value_or(OrdersType(1));
5440 return OpTE->ReorderIndices;
5441 }();
5442 // The order is partially ordered, skip it in favor of fully non-ordered
5443 // orders.
5444 if (Order.size() == 1)
5445 continue;
5446 unsigned NumOps = count_if(
5447 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
5448 return P.second == OpTE;
5449 });
5450 // Stores actually store the mask, not the order; we need to invert it.
5451 if (OpTE->State == TreeEntry::Vectorize && !OpTE->isAltShuffle() &&
5452 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
5453 SmallVector<int> Mask;
5454 inversePermutation(Order, Mask);
5455 unsigned E = Order.size();
5456 OrdersType CurrentOrder(E, E);
5457 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
5458 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
5459 });
5460 fixupOrderingIndices(CurrentOrder);
5461 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
5462 NumOps;
5463 } else {
5464 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
5465 }
5466 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
5467 const auto AllowsReordering = [&](const TreeEntry *TE) {
5468 // FIXME: Reordering isn't implemented for non-power-of-2 nodes yet.
5469 if (TE->isNonPowOf2Vec())
5470 return false;
5471 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
5472 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
5473 (IgnoreReorder && TE->Idx == 0))
5474 return true;
5475 if (TE->State == TreeEntry::NeedToGather) {
5476 if (GathersToOrders.contains(TE))
5477 return !getReorderingData(*TE, /*TopToBottom=*/false)
5478 .value_or(OrdersType(1))
5479 .empty();
5480 return true;
5481 }
5482 return false;
5483 };
5484 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
5485 TreeEntry *UserTE = EI.UserTE;
5486 if (!VisitedUsers.insert(UserTE).second)
5487 continue;
5488 // May reorder user node if it requires reordering, has reused
5489 // scalars, is an alternate op vectorize node or its op nodes require
5490 // reordering.
5491 if (AllowsReordering(UserTE))
5492 continue;
5493 // Check if users allow reordering.
5494 // Currently look up just 1 level of operands to avoid an increase in
5495 // compile time.
5496 // It is profitable to reorder if definitely more operands allow
5497 // reordering than those with the natural order.
5499 if (static_cast<unsigned>(count_if(
5500 Ops, [UserTE, &AllowsReordering](
5501 const std::pair<unsigned, TreeEntry *> &Op) {
5502 return AllowsReordering(Op.second) &&
5503 all_of(Op.second->UserTreeIndices,
5504 [UserTE](const EdgeInfo &EI) {
5505 return EI.UserTE == UserTE;
5506 });
5507 })) <= Ops.size() / 2)
5508 ++Res.first->second;
5509 }
5510 }
5511 if (OrdersUses.empty()) {
5512 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5513 OrderedEntries.remove(Op.second);
5514 continue;
5515 }
5516 auto IsIdentityOrder = [](ArrayRef<unsigned> Order) {
5517 const unsigned Sz = Order.size();
5518 for (unsigned Idx : seq<unsigned>(0, Sz))
5519 if (Idx != Order[Idx] && Order[Idx] != Sz)
5520 return false;
5521 return true;
5522 };
5523 // Choose the most used order.
5524 unsigned IdentityCnt = 0;
5525 unsigned VF = Data.second.front().second->getVectorFactor();
5526 OrdersType IdentityOrder(VF, VF);
5527 for (auto &Pair : OrdersUses) {
5528 if (Pair.first.empty() || IsIdentityOrder(Pair.first)) {
5529 IdentityCnt += Pair.second;
5530 combineOrders(IdentityOrder, Pair.first);
5531 }
5532 }
5533 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
5534 unsigned Cnt = IdentityCnt;
5535 for (auto &Pair : OrdersUses) {
5536 // Prefer the identity order. But if a filled identity (non-empty order)
5537 // is found with the same number of uses as the new candidate order, we
5538 // can choose this candidate order.
5539 if (Cnt < Pair.second) {
5540 combineOrders(Pair.first, BestOrder);
5541 BestOrder = Pair.first;
5542 Cnt = Pair.second;
5543 } else {
5544 combineOrders(BestOrder, Pair.first);
5545 }
5546 }
5547 // Set order of the user node.
5548 if (IsIdentityOrder(BestOrder)) {
5549 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
5550 OrderedEntries.remove(Op.second);
5551 continue;
5552 }
5553 fixupOrderingIndices(BestOrder);
5554 // Erase operands from OrderedEntries list and adjust their orders.
5555 VisitedOps.clear();
5556 SmallVector<int> Mask;
5557 inversePermutation(BestOrder, Mask);
5558 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
5559 unsigned E = BestOrder.size();
5560 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
5561 return I < E ? static_cast<int>(I) : PoisonMaskElem;
5562 });
5563 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
5564 TreeEntry *TE = Op.second;
5565 OrderedEntries.remove(TE);
5566 if (!VisitedOps.insert(TE).second)
5567 continue;
5568 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
5569 reorderNodeWithReuses(*TE, Mask);
5570 continue;
5571 }
5572 // Gathers are processed separately.
5573 if (TE->State != TreeEntry::Vectorize &&
5574 TE->State != TreeEntry::StridedVectorize &&
5575 (TE->State != TreeEntry::ScatterVectorize ||
5576 TE->ReorderIndices.empty()))
5577 continue;
5578 assert((BestOrder.size() == TE->ReorderIndices.size() ||
5579 TE->ReorderIndices.empty()) &&
5580 "Non-matching sizes of user/operand entries.");
5581 reorderOrder(TE->ReorderIndices, Mask);
5582 if (IgnoreReorder && TE == VectorizableTree.front().get())
5583 IgnoreReorder = false;
5584 }
5585 // For gathers we just need to reorder their scalars.
5586 for (TreeEntry *Gather : GatherOps) {
5587 assert(Gather->ReorderIndices.empty() &&
5588 "Unexpected reordering of gathers.");
5589 if (!Gather->ReuseShuffleIndices.empty()) {
5590 // Just reorder reuses indices.
5591 reorderReuses(Gather->ReuseShuffleIndices, Mask);
5592 continue;
5593 }
5594 reorderScalars(Gather->Scalars, Mask);
5595 OrderedEntries.remove(Gather);
5596 }
5597 // Reorder operands of the user node and set the ordering for the user
5598 // node itself.
5599 if (Data.first->State != TreeEntry::Vectorize ||
5600 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
5601 Data.first->getMainOp()) ||
5602 Data.first->isAltShuffle())
5603 Data.first->reorderOperands(Mask);
5604 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
5605 Data.first->isAltShuffle() ||
5606 Data.first->State == TreeEntry::StridedVectorize) {
5607 reorderScalars(Data.first->Scalars, Mask);
5608 reorderOrder(Data.first->ReorderIndices, MaskOrder,
5609 /*BottomOrder=*/true);
5610 if (Data.first->ReuseShuffleIndices.empty() &&
5611 !Data.first->ReorderIndices.empty() &&
5612 !Data.first->isAltShuffle()) {
5613 // Insert user node to the list to try to sink reordering deeper in
5614 // the graph.
5615 OrderedEntries.insert(Data.first);
5616 }
5617 } else {
5618 reorderOrder(Data.first->ReorderIndices, Mask);
5619 }
5620 }
5621 }
5622 // If the reordering is unnecessary, just remove the reorder.
5623 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
5624 VectorizableTree.front()->ReuseShuffleIndices.empty())
5625 VectorizableTree.front()->ReorderIndices.clear();
5626}
5627
5629 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
5630 DenseMap<Value *, unsigned> ScalarToExtUses;
5631 // Collect the values that we need to extract from the tree.
5632 for (auto &TEPtr : VectorizableTree) {
5633 TreeEntry *Entry = TEPtr.get();
5634
5635 // No need to handle users of gathered values.
5636 if (Entry->State == TreeEntry::NeedToGather)
5637 continue;
5638
5639 // For each lane:
5640 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
5641 Value *Scalar = Entry->Scalars[Lane];
5642 if (!isa<Instruction>(Scalar))
5643 continue;
5644 // Have all uses been replaced already? If so, no need to do it again.
5645 auto It = ScalarToExtUses.find(Scalar);
5646 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
5647 continue;
5648
5649 // Check if the scalar is externally used as an extra arg.
5650 const auto *ExtI = ExternallyUsedValues.find(Scalar);
5651 if (ExtI != ExternallyUsedValues.end()) {
5652 int FoundLane = Entry->findLaneForValue(Scalar);
5653 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
5654 << FoundLane << " from " << *Scalar << ".\n");
5655 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
5656 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
5657 continue;
5658 }
5659 for (User *U : Scalar->users()) {
5660 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
5661
5662 Instruction *UserInst = dyn_cast<Instruction>(U);
5663 if (!UserInst || isDeleted(UserInst))
5664 continue;
5665
5666 // Ignore users in the user ignore list.
5667 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
5668 continue;
5669
5670 // Skip in-tree scalars that become vectors
5671 if (TreeEntry *UseEntry = getTreeEntry(U)) {
5672 // Some in-tree scalars will remain as scalars in vectorized
5673 // instructions. If that is the case, the one in FoundLane will
5674 // be used.
5675 if (UseEntry->State == TreeEntry::ScatterVectorize ||
5677 Scalar, cast<Instruction>(UseEntry->Scalars.front()), TLI)) {
5678 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
5679 << ".\n");
5680 assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
5681 continue;
5682 }
5683 U = nullptr;
5684 if (It != ScalarToExtUses.end()) {
5685 ExternalUses[It->second].User = nullptr;
5686 break;
5687 }
5688 }
5689
5690 int FoundLane = Entry->findLaneForValue(Scalar);
5691 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
5692 << " from lane " << FoundLane << " from " << *Scalar
5693 << ".\n");
5694 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
5695 ExternalUses.emplace_back(Scalar, U, FoundLane);
5696 if (!U)
5697 break;
5698 }
5699 }
5700 }
5701}
5702
5704BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
5706 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
5707 Value *V = TE->Scalars[Lane];
5708 // To save compilation time we don't visit if we have too many users.
5709 if (V->hasNUsesOrMore(UsesLimit))
5710 break;
5711
5712 // Collect stores per pointer object.
5713 for (User *U : V->users()) {
5714 auto *SI = dyn_cast<StoreInst>(U);
5715 if (SI == nullptr || !SI->isSimple() ||
5716 !isValidElementType(SI->getValueOperand()->getType()))
5717 continue;
5718 // Skip the entry if it is already in the tree.
5719 if (getTreeEntry(U))
5720 continue;
5721
5722 Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
5723 auto &StoresVec = PtrToStoresMap[Ptr];
5724 // For now just keep one store per pointer object per lane.
5725 // TODO: Extend this to support multiple stores per pointer per lane
5726 if (StoresVec.size() > Lane)
5727 continue;
5728 // Skip if in different BBs.
5729 if (!StoresVec.empty() &&
5730 SI->getParent() != StoresVec.back()->getParent())
5731 continue;
5732 // Make sure that the stores are of the same type.
5733 if (!StoresVec.empty() &&
5734 SI->getValueOperand()->getType() !=
5735 StoresVec.back()->getValueOperand()->getType())
5736 continue;
5737 StoresVec.push_back(SI);
5738 }
5739 }
5740 return PtrToStoresMap;
5741}
5742
5743bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
5744 OrdersType &ReorderIndices) const {
5745 // We check whether the stores in StoresVec can form a vector by sorting them
5746 // and checking whether they are consecutive.
5747
5748 // To avoid calling getPointersDiff() while sorting we create a vector of
5749 // pairs {store, offset from first} and sort this instead.
5750 SmallVector<std::pair<StoreInst *, int>> StoreOffsetVec(StoresVec.size());
5751 StoreInst *S0 = StoresVec[0];
5752 StoreOffsetVec[0] = {S0, 0};
5753 Type *S0Ty = S0->getValueOperand()->getType();
5754 Value *S0Ptr = S0->getPointerOperand();
5755 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
5756 StoreInst *SI = StoresVec[Idx];
5757 std::optional<int> Diff =
5758 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
5759 SI->getPointerOperand(), *DL, *SE,
5760 /*StrictCheck=*/true);
5761 // We failed to compare the pointers so just abandon this StoresVec.
5762 if (!Diff)
5763 return false;
5764 StoreOffsetVec[Idx] = {StoresVec[Idx], *Diff};
5765 }
5766
5767 // Sort the vector based on the pointers. We create a copy because we may
5768 // need the original later for calculating the reorder (shuffle) indices.
5769 stable_sort(StoreOffsetVec, [](const std::pair<StoreInst *, int> &Pair1,
5770 const std::pair<StoreInst *, int> &Pair2) {
5771 int Offset1 = Pair1.second;
5772 int Offset2 = Pair2.second;
5773 return Offset1 < Offset2;
5774 });
5775
5776 // Check if the stores are consecutive by checking if their difference is 1.
5777 for (unsigned Idx : seq<unsigned>(1, StoreOffsetVec.size()))
5778 if (StoreOffsetVec[Idx].second != StoreOffsetVec[Idx - 1].second + 1)
5779 return false;
5780
5781 // Calculate the shuffle indices according to their offset against the sorted
5782 // StoreOffsetVec.
5783 ReorderIndices.reserve(StoresVec.size());
5784 for (StoreInst *SI : StoresVec) {
5785 unsigned Idx = find_if(StoreOffsetVec,
5786 [SI](const std::pair<StoreInst *, int> &Pair) {
5787 return Pair.first == SI;
5788 }) -
5789 StoreOffsetVec.begin();
5790 ReorderIndices.push_back(Idx);
5791 }
5792 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
5793 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
5794 // same convention here.
5795 auto IsIdentityOrder = [](const OrdersType &Order) {
5796 for (unsigned Idx : seq<unsigned>(0, Order.size()))
5797 if (Idx != Order[Idx])
5798 return false;
5799 return true;
5800 };
5801 if (IsIdentityOrder(ReorderIndices))
5802 ReorderIndices.clear();
5803
5804 return true;
5805}
5806
5807#ifndef NDEBUG
5809 for (unsigned Idx : Order)
5810 dbgs() << Idx << ", ";
5811 dbgs() << "\n";
5812}
5813#endif
5814
5816BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
5817 unsigned NumLanes = TE->Scalars.size();
5818
5820 collectUserStores(TE);
5821
5822 // Holds the reorder indices for each candidate store vector that is a user of
5823 // the current TreeEntry.
5824 SmallVector<OrdersType, 1> ExternalReorderIndices;
5825
5826 // Now inspect the stores collected per pointer and look for vectorization
5827 // candidates. For each candidate calculate the reorder index vector and push
5828 // it into `ExternalReorderIndices`
5829 for (const auto &Pair : PtrToStoresMap) {
5830 auto &StoresVec = Pair.second;
5831 // If we have fewer than NumLanes stores, then we can't form a vector.
5832 if (StoresVec.size() != NumLanes)
5833 continue;
5834
5835 // If the stores are not consecutive then abandon this StoresVec.
5836 OrdersType ReorderIndices;
5837 if (!canFormVector(StoresVec, ReorderIndices))
5838 continue;
5839
5840 // We now know that the scalars in StoresVec can form a vector instruction,
5841 // so set the reorder indices.
5842 ExternalReorderIndices.push_back(ReorderIndices);
5843 }
5844 return ExternalReorderIndices;
5845}
5846
5848 const SmallDenseSet<Value *> &UserIgnoreLst) {
5849 deleteTree();
5850 UserIgnoreList = &UserIgnoreLst;
5851 if (!allSameType(Roots))
5852 return;
5853 buildTree_rec(Roots, 0, EdgeInfo());
5854}
5855
5857 deleteTree();
5858 if (!allSameType(Roots))
5859 return;
5860 buildTree_rec(Roots, 0, EdgeInfo());
5861}
5862
5863/// \return true if the specified list of values has only one instruction that
5864/// requires scheduling, false otherwise.
5865#ifndef NDEBUG
5867 Value *NeedsScheduling = nullptr;
5868 for (Value *V : VL) {
5870 continue;
5871 if (!NeedsScheduling) {
5872 NeedsScheduling = V;
5873 continue;
5874 }
5875 return false;
5876 }
5877 return NeedsScheduling;
5878}
5879#endif
5880
5881 /// Generates a key/subkey pair for the given value to provide effective
5882 /// sorting of the values and better detection of vectorizable value sequences.
5883 /// The keys/subkeys can be used for better sorting of the values themselves
5884 /// (keys) and within value subgroups (subkeys).
5885static std::pair<size_t, size_t> generateKeySubkey(
5886 Value *V, const TargetLibraryInfo *TLI,
5887 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
5888 bool AllowAlternate) {
5889 hash_code Key = hash_value(V->getValueID() + 2);
5890 hash_code SubKey = hash_value(0);
5891 // Sort the loads by the distance between the pointers.
5892 if (auto *LI = dyn_cast<LoadInst>(V)) {
5893 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
5894 if (LI->isSimple())
5895 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
5896 else
5897 Key = SubKey = hash_value(LI);
5898 } else if (isVectorLikeInstWithConstOps(V)) {
5899 // Sort extracts by the vector operands.
5900 if (isa<ExtractElementInst, UndefValue>(V))
5901 Key = hash_value(Value::UndefValueVal + 1);
5902 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
5903 if (!isUndefVector(EI->getVectorOperand()).all() &&
5904 !isa<UndefValue>(EI->getIndexOperand()))
5905 SubKey = hash_value(EI->getVectorOperand());
5906 }
5907 } else if (auto *I = dyn_cast<Instruction>(V)) {
5908 // Sort other instructions just by the opcodes except for CMPInst.
5909 // For CMP also sort by the predicate kind.
5910 if ((isa<BinaryOperator, CastInst>(I)) &&
5911 isValidForAlternation(I->getOpcode())) {
5912 if (AllowAlternate)
5913 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
5914 else
5915 Key = hash_combine(hash_value(I->getOpcode()), Key);
5916 SubKey = hash_combine(
5917 hash_value(I->getOpcode()), hash_value(I->getType()),
5918 hash_value(isa<BinaryOperator>(I)
5919 ? I->getType()
5920 : cast<CastInst>(I)->getOperand(0)->getType()));
5921 // For casts, look through the only operand to improve compile time.
5922 if (isa<CastInst>(I)) {
5923 std::pair<size_t, size_t> OpVals =
5924 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
5925 /*AllowAlternate=*/true);
5926 Key = hash_combine(OpVals.first, Key);
5927 SubKey = hash_combine(OpVals.first, SubKey);
5928 }
5929 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
5930 CmpInst::Predicate Pred = CI->getPredicate();
5931 if (CI->isCommutative())
5932 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
5934 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
5935 hash_value(SwapPred),
5936 hash_value(CI->getOperand(0)->getType()));
5937 } else if (auto *Call = dyn_cast<CallInst>(I)) {
5940 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
5941 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
5942 SubKey = hash_combine(hash_value(I->getOpcode()),
5943 hash_value(Call->getCalledFunction()));
5944 } else {
5945 Key = hash_combine(hash_value(Call), Key);
5946 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
5947 }
5948 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
5949 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
5950 hash_value(Op.Tag), SubKey);
5951 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
5952 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
5953 SubKey = hash_value(Gep->getPointerOperand());
5954 else
5955 SubKey = hash_value(Gep);
5956 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
5957 !isa<ConstantInt>(I->getOperand(1))) {
5958 // Do not try to vectorize instructions with potentially high cost.
5959 SubKey = hash_value(I);
5960 } else {
5961 SubKey = hash_value(I->getOpcode());
5962 }
5963 Key = hash_combine(hash_value(I->getParent()), Key);
5964 }
5965 return std::make_pair(Key, SubKey);
5966}
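// For example, two simple loads of the same type hash to the same Key (load
// opcode + type) and their SubKey is delegated to LoadsSubkeyGenerator, so
// loads from nearby addresses can be grouped; a udiv whose divisor is not a
// ConstantInt gets a SubKey unique to that instruction, keeping such
// potentially expensive operations out of the common groups.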
5967
5968/// Checks if the specified instruction \p I is an alternate operation for
5969/// the given \p MainOp and \p AltOp instructions.
5970static bool isAlternateInstruction(const Instruction *I,
5971 const Instruction *MainOp,
5972 const Instruction *AltOp,
5973 const TargetLibraryInfo &TLI);
5974
5975bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
5976 ArrayRef<Value *> VL) const {
5977 unsigned Opcode0 = S.getOpcode();
5978 unsigned Opcode1 = S.getAltOpcode();
5979 // The opcode mask selects between the two opcodes.
5980 SmallBitVector OpcodeMask(VL.size(), false);
5981 for (unsigned Lane : seq<unsigned>(0, VL.size()))
5982 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
5983 OpcodeMask.set(Lane);
5984 // If this pattern is supported by the target then consider it profitable.
5985 if (TTI->isLegalAltInstr(FixedVectorType::get(S.MainOp->getType(), VL.size()),
5986 Opcode0, Opcode1, OpcodeMask))
5987 return true;
5989 for (unsigned I : seq<unsigned>(0, S.MainOp->getNumOperands())) {
5990 Operands.emplace_back();
5991 // Prepare the operand vector.
5992 for (Value *V : VL)
5993 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
5994 }
5995 if (Operands.size() == 2) {
5996 // Try to find the best operand candidates.
5997 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
5999 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
6000 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
6001 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
6002 std::optional<int> Res = findBestRootPair(Candidates);
6003 switch (Res.value_or(0)) {
6004 case 0:
6005 break;
6006 case 1:
6007 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
6008 break;
6009 case 2:
6010 std::swap(Operands[0][I], Operands[1][I]);
6011 break;
6012 default:
6013 llvm_unreachable("Unexpected index.");
6014 }
6015 }
6016 }
6017 DenseSet<unsigned> UniqueOpcodes;
6018 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
6019 unsigned NonInstCnt = 0;
6020 // Estimate number of instructions, required for the vectorized node and for
6021 // the buildvector node.
6022 unsigned UndefCnt = 0;
6023 // Count the number of extra shuffles, required for vector nodes.
6024 unsigned ExtraShuffleInsts = 0;
6025 // Check that operands do not contain same values and create either perfect
6026 // diamond match or shuffled match.
6027 if (Operands.size() == 2) {
6028 // Do not count same operands twice.
6029 if (Operands.front() == Operands.back()) {
6030 Operands.erase(Operands.begin());
6031 } else if (!allConstant(Operands.front()) &&
6032 all_of(Operands.front(), [&](Value *V) {
6033 return is_contained(Operands.back(), V);
6034 })) {
6035 Operands.erase(Operands.begin());
6036 ++ExtraShuffleInsts;
6037 }
6038 }
6039 const Loop *L = LI->getLoopFor(S.MainOp->getParent());
6040 // Vectorize node, if:
6041 // 1. at least single operand is constant or splat.
6042 // 2. Operands have many loop invariants (the instructions are not loop
6043 // invariants).
6044 // 3. At least single unique operands is supposed to vectorized.
6045 return none_of(Operands,
6046 [&](ArrayRef<Value *> Op) {
6047 if (allConstant(Op) ||
6048 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
6049 getSameOpcode(Op, *TLI).MainOp))
6050 return false;
6052 for (Value *V : Op) {
6053 if (isa<Constant, ExtractElementInst>(V) ||
6054 getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
6055 if (isa<UndefValue>(V))
6056 ++UndefCnt;
6057 continue;
6058 }
6059 auto Res = Uniques.try_emplace(V, 0);
6060 // Found first duplicate - need to add shuffle.
6061 if (!Res.second && Res.first->second == 1)
6062 ++ExtraShuffleInsts;
6063 ++Res.first->getSecond();
6064 if (auto *I = dyn_cast<Instruction>(V))
6065 UniqueOpcodes.insert(I->getOpcode());
6066 else if (Res.second)
6067 ++NonInstCnt;
6068 }
6069 return none_of(Uniques, [&](const auto &P) {
6070 return P.first->hasNUsesOrMore(P.second + 1) &&
6071 none_of(P.first->users(), [&](User *U) {
6072 return getTreeEntry(U) || Uniques.contains(U);
6073 });
6074 });
6075 }) ||
6076 // Do not vectorize node, if estimated number of vector instructions is
6077 // more than estimated number of buildvector instructions. Number of
6078 // vector operands is number of vector instructions + number of vector
6079 // instructions for operands (buildvectors). Number of buildvector
6080 // instructions is just number_of_operands * number_of_scalars.
6081 (UndefCnt < (VL.size() - 1) * S.MainOp->getNumOperands() &&
6082 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
6083 NumAltInsts) < S.MainOp->getNumOperands() * VL.size());
6084}
6085
6086BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
6087 InstructionsState &S, ArrayRef<Value *> VL, bool IsScatterVectorizeUserTE,
6088 OrdersType &CurrentOrder, SmallVectorImpl<Value *> &PointerOps) const {
6089 assert(S.MainOp && "Expected instructions with same/alternate opcodes only.");
6090
6091 unsigned ShuffleOrOp =
6092 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
6093 auto *VL0 = cast<Instruction>(S.OpValue);
6094 switch (ShuffleOrOp) {
6095 case Instruction::PHI: {
6096 // Too many operands - gather, most probably won't be vectorized.
6097 if (VL0->getNumOperands() > MaxPHINumOperands)
6098 return TreeEntry::NeedToGather;
6099 // Check for terminator values (e.g. invoke).
6100 for (Value *V : VL)
6101 for (Value *Incoming : cast<PHINode>(V)->incoming_values()) {
6102 Instruction *Term = dyn_cast<Instruction>(Incoming);
6103 if (Term && Term->isTerminator()) {
6105 << "SLP: Need to swizzle PHINodes (terminator use).\n");
6106 return TreeEntry::NeedToGather;
6107 }
6108 }
6109
6110 return TreeEntry::Vectorize;
6111 }
6112 case Instruction::ExtractValue:
6113 case Instruction::ExtractElement: {
6114 bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
6115 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops.
6116 if (!isPowerOf2_32(VL.size()))
6117 return TreeEntry::NeedToGather;
6118 if (Reuse || !CurrentOrder.empty())
6119 return TreeEntry::Vectorize;
6120 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
6121 return TreeEntry::NeedToGather;
6122 }
6123 case Instruction::InsertElement: {
6124 // Check that we have a buildvector and not a shuffle of 2 or more
6125 // different vectors.
6126 ValueSet SourceVectors;
6127 for (Value *V : VL) {
6128 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
6129 assert(getInsertIndex(V) != std::nullopt &&
6130 "Non-constant or undef index?");
6131 }
6132
6133 if (count_if(VL, [&SourceVectors](Value *V) {
6134 return !SourceVectors.contains(V);
6135 }) >= 2) {
6136 // Found 2nd source vector - cancel.
6137 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
6138 "different source vectors.\n");
6139 return TreeEntry::NeedToGather;
6140 }
6141
6142 return TreeEntry::Vectorize;
6143 }
6144 case Instruction::Load: {
6145 // Check that a vectorized load would load the same memory as a scalar
6146 // load. For example, we don't want to vectorize loads that are smaller
6147 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6148 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6149 // from such a struct, we read/write packed bits disagreeing with the
6150 // unvectorized version.
6151 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
6153 return TreeEntry::Vectorize;
6155 return TreeEntry::ScatterVectorize;
6157 return TreeEntry::StridedVectorize;
6158 case LoadsState::Gather:
6159#ifndef NDEBUG
6160 Type *ScalarTy = VL0->getType();
6161 if (DL->getTypeSizeInBits(ScalarTy) !=
6162 DL->getTypeAllocSizeInBits(ScalarTy))
6163 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
6164 else if (any_of(VL,
6165 [](Value *V) { return !cast<LoadInst>(V)->isSimple(); }))
6166 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
6167 else
6168 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
6169#endif // NDEBUG
6170 return TreeEntry::NeedToGather;
6171 }
6172 llvm_unreachable("Unexpected state of loads");
6173 }
6174 case Instruction::ZExt:
6175 case Instruction::SExt:
6176 case Instruction::FPToUI:
6177 case Instruction::FPToSI:
6178 case Instruction::FPExt:
6179 case Instruction::PtrToInt:
6180 case Instruction::IntToPtr:
6181 case Instruction::SIToFP:
6182 case Instruction::UIToFP:
6183 case Instruction::Trunc:
6184 case Instruction::FPTrunc:
6185 case Instruction::BitCast: {
6186 Type *SrcTy = VL0->getOperand(0)->getType();
6187 for (Value *V : VL) {
6188 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
6189 if (Ty != SrcTy || !isValidElementType(Ty)) {
6190 LLVM_DEBUG(
6191 dbgs() << "SLP: Gathering casts with different src types.\n");
6192 return TreeEntry::NeedToGather;
6193 }
6194 }
6195 return TreeEntry::Vectorize;
6196 }
6197 case Instruction::ICmp:
6198 case Instruction::FCmp: {
6199 // Check that all of the compares have the same predicate.
6200 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
6202 Type *ComparedTy = VL0->getOperand(0)->getType();
6203 for (Value *V : VL) {
6204 CmpInst *Cmp = cast<CmpInst>(V);
6205 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
6206 Cmp->getOperand(0)->getType() != ComparedTy) {
6207 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
6208 return TreeEntry::NeedToGather;
6209 }
6210 }
6211 return TreeEntry::Vectorize;
6212 }
6213 case Instruction::Select:
6214 case Instruction::FNeg:
6215 case Instruction::Add:
6216 case Instruction::FAdd:
6217 case Instruction::Sub:
6218 case Instruction::FSub:
6219 case Instruction::Mul:
6220 case Instruction::FMul:
6221 case Instruction::UDiv:
6222 case Instruction::SDiv:
6223 case Instruction::FDiv:
6224 case Instruction::URem:
6225 case Instruction::SRem:
6226 case Instruction::FRem:
6227 case Instruction::Shl:
6228 case Instruction::LShr:
6229 case Instruction::AShr:
6230 case Instruction::And:
6231 case Instruction::Or:
6232 case Instruction::Xor:
6233 return TreeEntry::Vectorize;
6234 case Instruction::GetElementPtr: {
6235 // We don't combine GEPs with complicated (nested) indexing.
6236 for (Value *V : VL) {
6237 auto *I = dyn_cast<GetElementPtrInst>(V);
6238 if (!I)
6239 continue;
6240 if (I->getNumOperands() != 2) {
6241 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
6242 return TreeEntry::NeedToGather;
6243 }
6244 }
6245
6246 // We can't combine several GEPs into one vector if they operate on
6247 // different types.
6248 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
6249 for (Value *V : VL) {
6250 auto *GEP = dyn_cast<GEPOperator>(V);
6251 if (!GEP)
6252 continue;
6253 Type *CurTy = GEP->getSourceElementType();
6254 if (Ty0 != CurTy) {
6255 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
6256 return TreeEntry::NeedToGather;
6257 }
6258 }
6259
6260 // We don't combine GEPs with non-constant indexes.
6261 Type *Ty1 = VL0->getOperand(1)->getType();
6262 for (Value *V : VL) {
6263 auto *I = dyn_cast<GetElementPtrInst>(V);
6264 if (!I)
6265 continue;
6266 auto *Op = I->getOperand(1);
6267 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6268 (Op->getType() != Ty1 &&
6269 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
6270 Op->getType()->getScalarSizeInBits() >
6271 DL->getIndexSizeInBits(
6272 V->getType()->getPointerAddressSpace())))) {
6273 LLVM_DEBUG(
6274 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
6275 return TreeEntry::NeedToGather;
6276 }
6277 }
6278
6279 return TreeEntry::Vectorize;
6280 }
6281 case Instruction::Store: {
6282 // Check if the stores are consecutive or if we need to swizzle them.
6283 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
6284 // Avoid types that are padded when being allocated as scalars, while
6285 // being packed together in a vector (such as i1).
6286 if (DL->getTypeSizeInBits(ScalarTy) !=
6287 DL->getTypeAllocSizeInBits(ScalarTy)) {
6288 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
6289 return TreeEntry::NeedToGather;
6290 }
6291 // Make sure all stores in the bundle are simple - we can't vectorize
6292 // atomic or volatile stores.
6293 for (Value *V : VL) {
6294 auto *SI = cast<StoreInst>(V);
6295 if (!SI->isSimple()) {
6296 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
6297 return TreeEntry::NeedToGather;
6298 }
6299 PointerOps.push_back(SI->getPointerOperand());
6300 }
6301
6302 // Check the order of pointer operands.
6303 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
6304 Value *Ptr0;
6305 Value *PtrN;
6306 if (CurrentOrder.empty()) {
6307 Ptr0 = PointerOps.front();
6308 PtrN = PointerOps.back();
6309 } else {
6310 Ptr0 = PointerOps[CurrentOrder.front()];
6311 PtrN = PointerOps[CurrentOrder.back()];
6312 }
6313 std::optional<int> Dist =
6314 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
6315 // Check that the sorted pointer operands are consecutive.
6316 if (static_cast<unsigned>(*Dist) == VL.size() - 1)
6317 return TreeEntry::Vectorize;
6318 }
6319
6320 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
6321 return TreeEntry::NeedToGather;
6322 }
6323 case Instruction::Call: {
6324 // Check if the calls are all to the same vectorizable intrinsic or
6325 // library function.
6326 CallInst *CI = cast<CallInst>(VL0);
6328
6329 VFShape Shape = VFShape::get(
6330 CI->getFunctionType(),
6331 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
6332 false /*HasGlobalPred*/);
6333 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
6334
6335 if (!VecFunc && !isTriviallyVectorizable(ID)) {
6336 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
6337 return TreeEntry::NeedToGather;
6338 }
6339 Function *F = CI->getCalledFunction();
6340 unsigned NumArgs = CI->arg_size();
6341 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
6342 for (unsigned J = 0; J != NumArgs; ++J)
6344 ScalarArgs[J] = CI->getArgOperand(J);
6345 for (Value *V : VL) {
6346 CallInst *CI2 = dyn_cast<CallInst>(V);
6347 if (!CI2 || CI2->getCalledFunction() != F ||
6348 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
6349 (VecFunc &&
6350 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
6352 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
6353 << "\n");
6354 return TreeEntry::NeedToGather;
6355 }
6356 // Some intrinsics have scalar arguments and should be same in order for
6357 // them to be vectorized.
6358 for (unsigned J = 0; J != NumArgs; ++J) {
6360 Value *A1J = CI2->getArgOperand(J);
6361 if (ScalarArgs[J] != A1J) {
6363 << "SLP: mismatched arguments in call:" << *CI
6364 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
6365 return TreeEntry::NeedToGather;
6366 }
6367 }
6368 }
6369 // Verify that the bundle operands are identical between the two calls.
6370 if (CI->hasOperandBundles() &&
6371 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
6372 CI->op_begin() + CI->getBundleOperandsEndIndex(),
6373 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
6374 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
6375 << "!=" << *V << '\n');
6376 return TreeEntry::NeedToGather;
6377 }
6378 }
6379
6380 return TreeEntry::Vectorize;
6381 }
6382 case Instruction::ShuffleVector: {
6383 // If this is not an alternate sequence of opcode like add-sub
6384 // then do not vectorize this instruction.
6385 if (!S.isAltShuffle()) {
6386 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
6387 return TreeEntry::NeedToGather;
6388 }
6389 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
6390 LLVM_DEBUG(
6391 dbgs()
6392 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
6393 "the whole alt sequence is not profitable.\n");
6394 return TreeEntry::NeedToGather;
6395 }
6396
6397 return TreeEntry::Vectorize;
6398 }
6399 default:
6400 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
6401 return TreeEntry::NeedToGather;
6402 }
6403}
6404
6405namespace {
6406/// Allows to correctly handle operands of the phi nodes based on the \p Main
6407/// PHINode order of incoming basic blocks/values.
6408class PHIHandler {
6409 DominatorTree &DT;
6410 PHINode *Main = nullptr;
6413
6414public:
6415 PHIHandler() = delete;
6416 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
6417 : DT(DT), Main(Main), Phis(Phis),
6418 Operands(Main->getNumIncomingValues(),
6419 SmallVector<Value *>(Phis.size(), nullptr)) {}
6420 void buildOperands() {
6421 constexpr unsigned FastLimit = 4;
6422 if (Main->getNumIncomingValues() <= FastLimit) {
6423 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
6424 BasicBlock *InBB = Main->getIncomingBlock(I);
6425 if (!DT.isReachableFromEntry(InBB)) {
6426 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
6427 continue;
6428 }
6429 // Prepare the operand vector.
6430 for (auto [Idx, V] : enumerate(Phis)) {
6431 auto *P = cast<PHINode>(V);
6432 if (P->getIncomingBlock(I) == InBB)
6433 Operands[I][Idx] = P->getIncomingValue(I);
6434 else
6435 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
6436 }
6437 }
6438 return;
6439 }
6441 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
6442 BasicBlock *InBB = Main->getIncomingBlock(I);
6443 if (!DT.isReachableFromEntry(InBB)) {
6444 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
6445 continue;
6446 }
6447 Blocks.try_emplace(InBB).first->second.push_back(I);
6448 }
6449 for (auto [Idx, V] : enumerate(Phis)) {
6450 auto *P = cast<PHINode>(V);
6451 for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
6452 BasicBlock *InBB = P->getIncomingBlock(I);
6453 if (InBB == Main->getIncomingBlock(I)) {
6454 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
6455 continue;
6456 Operands[I][Idx] = P->getIncomingValue(I);
6457 continue;
6458 }
6459 auto It = Blocks.find(InBB);
6460 if (It == Blocks.end())
6461 continue;
6462 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
6463 }
6464 }
6465 for (const auto &P : Blocks) {
6466 if (P.getSecond().size() <= 1)
6467 continue;
6468 unsigned BasicI = P.getSecond().front();
6469 for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
6471 [&](const auto &Data) {
6472 return !Data.value() ||
6473 Data.value() == Operands[BasicI][Data.index()];
6474 }) &&
6475 "Expected empty operands list.");
6476 Operands[I] = Operands[BasicI];
6477 }
6478 }
6479 }
6480 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
6481};
6482} // namespace
6483
6484void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
6485 const EdgeInfo &UserTreeIdx) {
6486 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
6487
6488 SmallVector<int> ReuseShuffleIndicies;
6489 SmallVector<Value *> UniqueValues;
6490 SmallVector<Value *> NonUniqueValueVL;
6491 auto TryToFindDuplicates = [&](const InstructionsState &S,
6492 bool DoNotFail = false) {
6493 // Check that every instruction appears once in this bundle.
6494 DenseMap<Value *, unsigned> UniquePositions(VL.size());
6495 for (Value *V : VL) {
6496 if (isConstant(V)) {
6497 ReuseShuffleIndicies.emplace_back(
6498 isa<UndefValue>(V) ? PoisonMaskElem : UniqueValues.size());
6499 UniqueValues.emplace_back(V);
6500 continue;
6501 }
6502 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
6503 ReuseShuffleIndicies.emplace_back(Res.first->second);
6504 if (Res.second)
6505 UniqueValues.emplace_back(V);
6506 }
6507 size_t NumUniqueScalarValues = UniqueValues.size();
6508 if (NumUniqueScalarValues == VL.size()) {
6509 ReuseShuffleIndicies.clear();
6510 } else {
6511 // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops.
6512 if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->isNonPowOf2Vec()) {
6513 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
6514 "for nodes with padding.\n");
6515 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6516 return false;
6517 }
6518 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
6519 if (NumUniqueScalarValues <= 1 ||
6520 (UniquePositions.size() == 1 && all_of(UniqueValues,
6521 [](Value *V) {
6522 return isa<UndefValue>(V) ||
6523 !isConstant(V);
6524 })) ||
6525 !llvm::has_single_bit<uint32_t>(NumUniqueScalarValues)) {
6526 if (DoNotFail && UniquePositions.size() > 1 &&
6527 NumUniqueScalarValues > 1 && S.MainOp->isSafeToRemove() &&
6528 all_of(UniqueValues, [=](Value *V) {
6529 return isa<ExtractElementInst>(V) ||
6530 areAllUsersVectorized(cast<Instruction>(V),
6531 UserIgnoreList);
6532 })) {
6533 unsigned PWSz = PowerOf2Ceil(UniqueValues.size());
6534 if (PWSz == VL.size()) {
6535 ReuseShuffleIndicies.clear();
6536 } else {
6537 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
6538 NonUniqueValueVL.append(PWSz - UniqueValues.size(),
6539 UniqueValues.back());
6540 VL = NonUniqueValueVL;
6541 }
6542 return true;
6543 }
6544 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
6545 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6546 return false;
6547 }
6548 VL = UniqueValues;
6549 }
6550 return true;
6551 };
6552
6553 InstructionsState S = getSameOpcode(VL, *TLI);
6554
6555 // Don't vectorize ephemeral values.
6556 if (!EphValues.empty()) {
6557 for (Value *V : VL) {
6558 if (EphValues.count(V)) {
6559 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6560 << ") is ephemeral.\n");
6561 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6562 return;
6563 }
6564 }
6565 }
6566
6567 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
6568 // a load), in which case peek through to include it in the tree, without
6569 // ballooning over-budget.
6570 if (Depth >= RecursionMaxDepth &&
6571 !(S.MainOp && isa<Instruction>(S.MainOp) && S.MainOp == S.AltOp &&
6572 VL.size() >= 4 &&
6573 (match(S.MainOp, m_Load(m_Value())) || all_of(VL, [&S](const Value *I) {
6574 return match(I,
6576 cast<Instruction>(I)->getOpcode() ==
6577 cast<Instruction>(S.MainOp)->getOpcode();
6578 })))) {
6579 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
6580 if (TryToFindDuplicates(S))
6581 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6582 ReuseShuffleIndicies);
6583 return;
6584 }
6585
6586 // Don't handle scalable vectors
6587 if (S.getOpcode() == Instruction::ExtractElement &&
6588 isa<ScalableVectorType>(
6589 cast<ExtractElementInst>(S.OpValue)->getVectorOperandType())) {
6590 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
6591 if (TryToFindDuplicates(S))
6592 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6593 ReuseShuffleIndicies);
6594 return;
6595 }
6596
6597 // Don't handle vectors.
6598 if (S.OpValue->getType()->isVectorTy() &&
6599 !isa<InsertElementInst>(S.OpValue)) {
6600 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
6601 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6602 return;
6603 }
6604
6605 if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
6606 if (SI->getValueOperand()->getType()->isVectorTy()) {
6607 LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
6608 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6609 return;
6610 }
6611
6612 // If all of the operands are identical or constant we have a simple solution.
6613 // If we deal with insert/extract instructions, they all must have constant
6614 // indices, otherwise we should gather them, not try to vectorize.
6615 // If alternate op node with 2 elements with gathered operands - do not
6616 // vectorize.
6617 auto &&NotProfitableForVectorization = [&S, this,
6619 if (!S.getOpcode() || !S.isAltShuffle() || VL.size() > 2)
6620 return false;
6621 if (VectorizableTree.size() < MinTreeSize)
6622 return false;
6623 if (Depth >= RecursionMaxDepth - 1)
6624 return true;
6625 // Check if all operands are extracts, part of vector node or can build a
6626 // regular vectorize node.
6627 SmallVector<unsigned, 2> InstsCount(VL.size(), 0);
6628 for (Value *V : VL) {
6629 auto *I = cast<Instruction>(V);
6630 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
6631 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
6632 }));
6633 }
6634 bool IsCommutative = isCommutative(S.MainOp) || isCommutative(S.AltOp);
6635 if ((IsCommutative &&
6636 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
6637 (!IsCommutative &&
6638 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
6639 return true;
6640 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
6642 auto *I1 = cast<Instruction>(VL.front());
6643 auto *I2 = cast<Instruction>(VL.back());
6644 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6645 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6646 I2->getOperand(Op));
6647 if (static_cast<unsigned>(count_if(
6648 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6650 })) >= S.MainOp->getNumOperands() / 2)
6651 return false;
6652 if (S.MainOp->getNumOperands() > 2)
6653 return true;
6654 if (IsCommutative) {
6655 // Check permuted operands.
6656 Candidates.clear();
6657 for (int Op = 0, E = S.MainOp->getNumOperands(); Op < E; ++Op)
6658 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
6659 I2->getOperand((Op + 1) % E));
6660 if (any_of(
6661 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
6663 }))
6664 return false;
6665 }
6666 return true;
6667 };
6668 SmallVector<unsigned> SortedIndices;
6669 BasicBlock *BB = nullptr;
6670 bool IsScatterVectorizeUserTE =
6671 UserTreeIdx.UserTE &&
6672 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
6673 bool AreAllSameInsts =
6674 (S.getOpcode() && allSameBlock(VL)) ||
6675 (S.OpValue->getType()->isPointerTy() && IsScatterVectorizeUserTE &&
6676 VL.size() > 2 &&
6677 all_of(VL,
6678 [&BB](Value *V) {
6679 auto *I = dyn_cast<GetElementPtrInst>(V);
6680 if (!I)
6681 return doesNotNeedToBeScheduled(V);
6682 if (!BB)
6683 BB = I->getParent();
6684 return BB == I->getParent() && I->getNumOperands() == 2;
6685 }) &&
6686 BB &&
6687 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
6688 SortedIndices));
6689 if (!AreAllSameInsts || allConstant(VL) || isSplat(VL) ||
6690 (isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
6691 S.OpValue) &&
6693 NotProfitableForVectorization(VL)) {
6694 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
6695 if (TryToFindDuplicates(S))
6696 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6697 ReuseShuffleIndicies);
6698 return;
6699 }
6700
6701 // We now know that this is a vector of instructions of the same type from
6702 // the same block.
6703
6704 // Check if this is a duplicate of another entry.
6705 if (TreeEntry *E = getTreeEntry(S.OpValue)) {
6706 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
6707 if (!E->isSame(VL)) {
6708 auto It = MultiNodeScalars.find(S.OpValue);
6709 if (It != MultiNodeScalars.end()) {
6710 auto *TEIt = find_if(It->getSecond(),
6711 [&](TreeEntry *ME) { return ME->isSame(VL); });
6712 if (TEIt != It->getSecond().end())
6713 E = *TEIt;
6714 else
6715 E = nullptr;
6716 } else {
6717 E = nullptr;
6718 }
6719 }
6720 if (!E) {
6721 if (!doesNotNeedToBeScheduled(S.OpValue)) {
6722 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
6723 if (TryToFindDuplicates(S))
6724 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6725 ReuseShuffleIndicies);
6726 return;
6727 }
6728 } else {
6729 // Record the reuse of the tree node. FIXME, currently this is only used
6730 // to properly draw the graph rather than for the actual vectorization.
6731 E->UserTreeIndices.push_back(UserTreeIdx);
6732 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
6733 << ".\n");
6734 return;
6735 }
6736 }
6737
6738 // Check that none of the instructions in the bundle are already in the tree.
6739 for (Value *V : VL) {
6740 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
6742 continue;
6743 if (getTreeEntry(V)) {
6744 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
6745 << ") is already in tree.\n");
6746 if (TryToFindDuplicates(S))
6747 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6748 ReuseShuffleIndicies);
6749 return;
6750 }
6751 }
6752
6753 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
6754 if (UserIgnoreList && !UserIgnoreList->empty()) {
6755 for (Value *V : VL) {
6756 if (UserIgnoreList && UserIgnoreList->contains(V)) {
6757 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
6758 if (TryToFindDuplicates(S))
6759 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6760 ReuseShuffleIndicies);
6761 return;
6762 }
6763 }
6764 }
6765
6766 // Special processing for sorted pointers for ScatterVectorize node with
6767 // constant indeces only.
6768 if (AreAllSameInsts && UserTreeIdx.UserTE &&
6769 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize &&
6770 !(S.getOpcode() && allSameBlock(VL))) {
6771 assert(S.OpValue->getType()->isPointerTy() &&
6772 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
6773 "Expected pointers only.");
6774 // Reset S to make it GetElementPtr kind of node.
6775 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
6776 assert(It != VL.end() && "Expected at least one GEP.");
6777 S = getSameOpcode(*It, *TLI);
6778 }
6779
6780 // Check that all of the users of the scalars that we want to vectorize are
6781 // schedulable.
6782 auto *VL0 = cast<Instruction>(S.OpValue);
6783 BB = VL0->getParent();
6784
6785 if (!DT->isReachableFromEntry(BB)) {
6786 // Don't go into unreachable blocks. They may contain instructions with
6787 // dependency cycles which confuse the final scheduling.
6788 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
6789 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6790 return;
6791 }
6792
6793 // Don't go into catchswitch blocks, which can happen with PHIs.
6794 // Such blocks can only have PHIs and the catchswitch. There is no
6795 // place to insert a shuffle if we need to, so just avoid that issue.
6796 if (isa<CatchSwitchInst>(BB->getTerminator())) {
6797 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
6798 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
6799 return;
6800 }
6801
6802 // Check that every instruction appears once in this bundle.
6803 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
6804 return;
6805
6806 // Perform specific checks for each particular instruction kind.
6807 OrdersType CurrentOrder;
6808 SmallVector<Value *> PointerOps;
6809 TreeEntry::EntryState State = getScalarsVectorizationState(
6810 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
6811 if (State == TreeEntry::NeedToGather) {
6812 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6813 ReuseShuffleIndicies);
6814 return;
6815 }
6816
6817 auto &BSRef = BlocksSchedules[BB];
6818 if (!BSRef)
6819 BSRef = std::make_unique<BlockScheduling>(BB);
6820
6821 BlockScheduling &BS = *BSRef;
6822
6823 std::optional<ScheduleData *> Bundle =
6824 BS.tryScheduleBundle(UniqueValues, this, S);
6825#ifdef EXPENSIVE_CHECKS
6826 // Make sure we didn't break any internal invariants
6827 BS.verify();
6828#endif
6829 if (!Bundle) {
6830 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
6831 assert((!BS.getScheduleData(VL0) ||
6832 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
6833 "tryScheduleBundle should cancelScheduling on failure");
6834 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
6835 ReuseShuffleIndicies);
6836 NonScheduledFirst.insert(VL.front());
6837 return;
6838 }
6839 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
6840
6841 unsigned ShuffleOrOp = S.isAltShuffle() ?
6842 (unsigned) Instruction::ShuffleVector : S.getOpcode();
6843 switch (ShuffleOrOp) {
6844 case Instruction::PHI: {
6845 auto *PH = cast<PHINode>(VL0);
6846
6847 TreeEntry *TE =
6848 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
6849 LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
6850
6851 // Keeps the reordered operands to avoid code duplication.
6852 PHIHandler Handler(*DT, PH, VL);
6853 Handler.buildOperands();
6854 for (unsigned I : seq<unsigned>(0, PH->getNumOperands()))
6855 TE->setOperand(I, Handler.getOperands(I));
6856 for (unsigned I : seq<unsigned>(0, PH->getNumOperands()))
6857 buildTree_rec(Handler.getOperands(I), Depth + 1, {TE, I});
6858 return;
6859 }
6860 case Instruction::ExtractValue:
6861 case Instruction::ExtractElement: {
6862 if (CurrentOrder.empty()) {
6863 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
6864 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6865 ReuseShuffleIndicies);
6866 // This is a special case, as it does not gather, but at the same time
6867 // we are not extending buildTree_rec() towards the operands.
6868 ValueList Op0;
6869 Op0.assign(VL.size(), VL0->getOperand(0));
6870 VectorizableTree.back()->setOperand(0, Op0);
6871 return;
6872 }
6873 LLVM_DEBUG({
6874 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
6875 "with order";
6876 for (unsigned Idx : CurrentOrder)
6877 dbgs() << " " << Idx;
6878 dbgs() << "\n";
6879 });
6880 fixupOrderingIndices(CurrentOrder);
6881 // Insert new order with initial value 0, if it does not exist,
6882 // otherwise return the iterator to the existing one.
6883 newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6884 ReuseShuffleIndicies, CurrentOrder);
6885 // This is a special case, as it does not gather, but at the same time
6886 // we are not extending buildTree_rec() towards the operands.
6887 ValueList Op0;
6888 Op0.assign(VL.size(), VL0->getOperand(0));
6889 VectorizableTree.back()->setOperand(0, Op0);
6890 return;
6891 }
6892 case Instruction::InsertElement: {
6893 assert(ReuseShuffleIndicies.empty() && "All inserts should be unique");
6894
6895 auto OrdCompare = [](const std::pair<int, int> &P1,
6896 const std::pair<int, int> &P2) {
6897 return P1.first > P2.first;
6898 };
6900 decltype(OrdCompare)>
6901 Indices(OrdCompare);
6902 for (int I = 0, E = VL.size(); I < E; ++I) {
6903 unsigned Idx = *getInsertIndex(VL[I]);
6904 Indices.emplace(Idx, I);
6905 }
6906 OrdersType CurrentOrder(VL.size(), VL.size());
6907 bool IsIdentity = true;
6908 for (int I = 0, E = VL.size(); I < E; ++I) {
6909 CurrentOrder[Indices.top().second] = I;
6910 IsIdentity &= Indices.top().second == I;
6911 Indices.pop();
6912 }
6913 if (IsIdentity)
6914 CurrentOrder.clear();
6915 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6916 std::nullopt, CurrentOrder);
6917 LLVM_DEBUG(dbgs() << "SLP: added inserts bundle.\n");
6918
6919 constexpr int NumOps = 2;
6920 ValueList VectorOperands[NumOps];
6921 for (int I = 0; I < NumOps; ++I) {
6922 for (Value *V : VL)
6923 VectorOperands[I].push_back(cast<Instruction>(V)->getOperand(I));
6924
6925 TE->setOperand(I, VectorOperands[I]);
6926 }
6927 buildTree_rec(VectorOperands[NumOps - 1], Depth + 1, {TE, NumOps - 1});
6928 return;
6929 }
6930 case Instruction::Load: {
6931 // Check that a vectorized load would load the same memory as a scalar
6932 // load. For example, we don't want to vectorize loads that are smaller
6933 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
6934 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
6935 // from such a struct, we read/write packed bits disagreeing with the
6936 // unvectorized version.
6937 TreeEntry *TE = nullptr;
6938 fixupOrderingIndices(CurrentOrder);
6939 switch (State) {
6940 case TreeEntry::Vectorize:
6941 if (CurrentOrder.empty()) {
6942 // Original loads are consecutive and does not require reordering.
6943 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6944 ReuseShuffleIndicies);
6945 LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
6946 } else {
6947 // Need to reorder.
6948 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
6949 ReuseShuffleIndicies, CurrentOrder);
6950 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
6951 }
6952 TE->setOperandsInOrder();
6953 break;
6954 case TreeEntry::StridedVectorize:
6955 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
6956 if (CurrentOrder.empty()) {
6957 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6958 UserTreeIdx, ReuseShuffleIndicies);
6959 } else {
6960 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
6961 UserTreeIdx, ReuseShuffleIndicies, CurrentOrder);
6962 }
6963 TE->setOperandsInOrder();
6964 LLVM_DEBUG(dbgs() << "SLP: added a vector of strided loads.\n");
6965 break;
6966 case TreeEntry::ScatterVectorize:
6967 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
6968 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
6969 UserTreeIdx, ReuseShuffleIndicies);
6970 TE->setOperandsInOrder();
6971 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
6972 LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
6973 break;
6974 case TreeEntry::NeedToGather:
6975 llvm_unreachable("Unexpected loads state.");
6976 }
6977 return;
6978 }
6979 case Instruction::ZExt:
6980 case Instruction::SExt:
6981 case Instruction::FPToUI:
6982 case Instruction::FPToSI:
6983 case Instruction::FPExt:
6984 case Instruction::PtrToInt:
6985 case Instruction::IntToPtr:
6986 case Instruction::SIToFP:
6987 case Instruction::UIToFP:
6988 case Instruction::Trunc:
6989 case Instruction::FPTrunc:
6990 case Instruction::BitCast: {
6991 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
6992 std::make_pair(std::numeric_limits<unsigned>::min(),
6993 std::numeric_limits<unsigned>::max()));
6994 if (ShuffleOrOp == Instruction::ZExt ||
6995 ShuffleOrOp == Instruction::SExt) {
6996 CastMaxMinBWSizes = std::make_pair(
6997 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
6998 PrevMaxBW),
6999 std::min<unsigned>(
7000 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
7001 PrevMinBW));
7002 } else if (ShuffleOrOp == Instruction::Trunc) {
7003 CastMaxMinBWSizes = std::make_pair(
7004 std::max<unsigned>(
7005 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
7006 PrevMaxBW),
7007 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
7008 PrevMinBW));
7009 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
7010 } else if (ShuffleOrOp == Instruction::SIToFP ||
7011 ShuffleOrOp == Instruction::UIToFP) {
7012 unsigned NumSignBits =
7013 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
7014 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
7015 APInt Mask = DB->getDemandedBits(OpI);
7016 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
7017 }
7018 if (NumSignBits * 2 >=
7019 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
7020 ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
7021 }
7022 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7023 ReuseShuffleIndicies);
7024 LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
7025
7026 TE->setOperandsInOrder();
7027 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
7029 // Prepare the operand vector.
7030 for (Value *V : VL)
7031 Operands.push_back(cast<Instruction>(V)->getOperand(I));
7032
7033 buildTree_rec(Operands, Depth + 1, {TE, I});
7034 }
7035 return;
7036 }
7037 case Instruction::ICmp:
7038 case Instruction::FCmp: {
7039 // Check that all of the compares have the same predicate.
7040 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
7041 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7042 ReuseShuffleIndicies);
7043 LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
7044
7046 if (cast<CmpInst>(VL0)->isCommutative()) {
7047 // Commutative predicate - collect + sort operands of the instructions
7048 // so that each side is more likely to have the same opcode.
7050 "Commutative Predicate mismatch");
7051 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7052 } else {
7053 // Collect operands - commute if it uses the swapped predicate.
7054 for (Value *V : VL) {
7055 auto *Cmp = cast<CmpInst>(V);
7056 Value *LHS = Cmp->getOperand(0);
7057 Value *RHS = Cmp->getOperand(1);
7058 if (Cmp->getPredicate() != P0)
7059 std::swap(LHS, RHS);
7060 Left.push_back(LHS);
7061 Right.push_back(RHS);
7062 }
7063 }
7064 TE->setOperand(0, Left);
7065 TE->setOperand(1, Right);
7066 buildTree_rec(Left, Depth + 1, {TE, 0});
7067 buildTree_rec(Right, Depth + 1, {TE, 1});
7068 if (ShuffleOrOp == Instruction::ICmp) {
7069 unsigned NumSignBits0 =
7070 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
7071 if (NumSignBits0 * 2 >=
7072 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
7073 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
7074 unsigned NumSignBits1 =
7075 ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
7076 if (NumSignBits1 * 2 >=
7077 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
7078 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
7079 }
7080 return;
7081 }
7082 case Instruction::Select:
7083 case Instruction::FNeg:
7084 case Instruction::Add:
7085 case Instruction::FAdd:
7086 case Instruction::Sub:
7087 case Instruction::FSub:
7088 case Instruction::Mul:
7089 case Instruction::FMul:
7090 case Instruction::UDiv:
7091 case Instruction::SDiv:
7092 case Instruction::FDiv:
7093 case Instruction::URem:
7094 case Instruction::SRem:
7095 case Instruction::FRem:
7096 case Instruction::Shl:
7097 case Instruction::LShr:
7098 case Instruction::AShr:
7099 case Instruction::And:
7100 case Instruction::Or:
7101 case Instruction::Xor: {
7102 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7103 ReuseShuffleIndicies);
7104 LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
7105
7106 // Sort operands of the instructions so that each side is more likely to
7107 // have the same opcode.
7108 if (isa<BinaryOperator>(VL0) && isCommutative(VL0)) {
7110 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7111 TE->setOperand(0, Left);
7112 TE->setOperand(1, Right);
7113 buildTree_rec(Left, Depth + 1, {TE, 0});
7114 buildTree_rec(Right, Depth + 1, {TE, 1});
7115 return;
7116 }
7117
7118 TE->setOperandsInOrder();
7119 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
7121 // Prepare the operand vector.
7122 for (Value *V : VL)
7123 Operands.push_back(cast<Instruction>(V)->getOperand(I));
7124
7125 buildTree_rec(Operands, Depth + 1, {TE, I});
7126 }
7127 return;
7128 }
7129 case Instruction::GetElementPtr: {
7130 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7131 ReuseShuffleIndicies);
7132 LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
7134 // Prepare the operand vector for pointer operands.
7135 for (Value *V : VL) {
7136 auto *GEP = dyn_cast<GetElementPtrInst>(V);
7137 if (!GEP) {
7138 Operands.front().push_back(V);
7139 continue;
7140 }
7141 Operands.front().push_back(GEP->getPointerOperand());
7142 }
7143 TE->setOperand(0, Operands.front());
7144 // Need to cast all indices to the same type before vectorization to
7145 // avoid crash.
7146 // Required to be able to find correct matches between different gather
7147 // nodes and reuse the vectorized values rather than trying to gather them
7148 // again.
7149 int IndexIdx = 1;
7150 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
7151 Type *Ty = all_of(VL,
7152 [VL0Ty, IndexIdx](Value *V) {
7153 auto *GEP = dyn_cast<GetElementPtrInst>(V);
7154 if (!GEP)
7155 return true;
7156 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
7157 })
7158 ? VL0Ty
7159 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
7160 ->getPointerOperandType()
7161 ->getScalarType());
7162 // Prepare the operand vector.
7163 for (Value *V : VL) {
7164 auto *I = dyn_cast<GetElementPtrInst>(V);
7165 if (!I) {
7166 Operands.back().push_back(
7167 ConstantInt::get(Ty, 0, /*isSigned=*/false));
7168 continue;
7169 }
7170 auto *Op = I->getOperand(IndexIdx);
7171 auto *CI = dyn_cast<ConstantInt>(Op);
7172 if (!CI)
7173 Operands.back().push_back(Op);
7174 else
7175 Operands.back().push_back(ConstantFoldIntegerCast(
7176 CI, Ty, CI->getValue().isSignBitSet(), *DL));
7177 }
7178 TE->setOperand(IndexIdx, Operands.back());
7179
7180 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
7181 buildTree_rec(Operands[I], Depth + 1, {TE, I});
7182 return;
7183 }
7184 case Instruction::Store: {
7185 // Check if the stores are consecutive or if we need to swizzle them.
7186 ValueList Operands(VL.size());
7187 auto *OIter = Operands.begin();
7188 for (Value *V : VL) {
7189 auto *SI = cast<StoreInst>(V);
7190 *OIter = SI->getValueOperand();
7191 ++OIter;
7192 }
7193 // Check that the sorted pointer operands are consecutive.
7194 if (CurrentOrder.empty()) {
7195 // Original stores are consecutive and does not require reordering.
7196 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7197 ReuseShuffleIndicies);
7198 TE->setOperandsInOrder();
7199 buildTree_rec(Operands, Depth + 1, {TE, 0});
7200 LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
7201 } else {
7202 fixupOrderingIndices(CurrentOrder);
7203 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7204 ReuseShuffleIndicies, CurrentOrder);
7205 TE->setOperandsInOrder();
7206 buildTree_rec(Operands, Depth + 1, {TE, 0});
7207 LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
7208 }
7209 return;
7210 }
7211 case Instruction::Call: {
7212 // Check if the calls are all to the same vectorizable intrinsic or
7213 // library function.
7214 CallInst *CI = cast<CallInst>(VL0);
7216
7217 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7218 ReuseShuffleIndicies);
7219 // Sort operands of the instructions so that each side is more likely to
7220 // have the same opcode.
7221 if (isCommutative(VL0)) {
7223 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7224 TE->setOperand(0, Left);
7225 TE->setOperand(1, Right);
7227 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7228 Operands.emplace_back();
7230 continue;
7231 for (Value *V : VL) {
7232 auto *CI2 = cast<CallInst>(V);
7233 Operands.back().push_back(CI2->getArgOperand(I));
7234 }
7235 TE->setOperand(I, Operands.back());
7236 }
7237 buildTree_rec(Left, Depth + 1, {TE, 0});
7238 buildTree_rec(Right, Depth + 1, {TE, 1});
7239 for (unsigned I : seq<unsigned>(2, CI->arg_size())) {
7240 if (Operands[I - 2].empty())
7241 continue;
7242 buildTree_rec(Operands[I - 2], Depth + 1, {TE, I});
7243 }
7244 return;
7245 }
7246 TE->setOperandsInOrder();
7247 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
7248 // For scalar operands no need to create an entry since no need to
7249 // vectorize it.
7251 continue;
7253 // Prepare the operand vector.
7254 for (Value *V : VL) {
7255 auto *CI2 = cast<CallInst>(V);
7256 Operands.push_back(CI2->getArgOperand(I));
7257 }
7258 buildTree_rec(Operands, Depth + 1, {TE, I});
7259 }
7260 return;
7261 }
7262 case Instruction::ShuffleVector: {
7263 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
7264 ReuseShuffleIndicies);
7265 LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
7266
7267 // Reorder operands if reordering would enable vectorization.
7268 auto *CI = dyn_cast<CmpInst>(VL0);
7269 if (isa<BinaryOperator>(VL0) || CI) {
7271 if (!CI || all_of(VL, [](Value *V) {
7272 return cast<CmpInst>(V)->isCommutative();
7273 })) {
7274 reorderInputsAccordingToOpcode(VL, Left, Right, *this);
7275 } else {
7276 auto *MainCI = cast<CmpInst>(S.MainOp);
7277 auto *AltCI = cast<CmpInst>(S.AltOp);
7278 CmpInst::Predicate MainP = MainCI->getPredicate();
7279 CmpInst::Predicate AltP = AltCI->getPredicate();
7280 assert(MainP != AltP &&
7281 "Expected different main/alternate predicates.");
7282 // Collect operands - commute if it uses the swapped predicate or
7283 // alternate operation.
7284 for (Value *V : VL) {
7285 auto *Cmp = cast<CmpInst>(V);
7286 Value *LHS = Cmp->getOperand(0);
7287 Value *RHS = Cmp->getOperand(1);
7288
7289 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
7290 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7291 std::swap(LHS, RHS);
7292 } else {
7293 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
7294 std::swap(LHS, RHS);
7295 }
7296 Left.push_back(LHS);
7297 Right.push_back(RHS);
7298 }
7299 }
7300 TE->setOperand(0, Left);
7301 TE->setOperand(1, Right);
7302 buildTree_rec(Left, Depth + 1, {TE, 0});
7303 buildTree_rec(Right, Depth + 1, {TE, 1});
7304 return;
7305 }
7306
7307 TE->setOperandsInOrder();
7308 for (unsigned I : seq<unsigned>(0, VL0->getNumOperands())) {
7310 // Prepare the operand vector.
7311 for (Value *V : VL)
7312 Operands.push_back(cast<Instruction>(V)->getOperand(I));
7313
7314 buildTree_rec(Operands, Depth + 1, {TE, I});
7315 }
7316 return;
7317 }
7318 default:
7319 break;
7320 }
7321 llvm_unreachable("Unexpected vectorization of the instructions.");
7322}
7323
7325 unsigned N = 1;
7326 Type *EltTy = T;
7327
7328 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
7329 if (auto *ST = dyn_cast<StructType>(EltTy)) {
7330 // Check that struct is homogeneous.
7331 for (const auto *Ty : ST->elements())
7332 if (Ty != *ST->element_begin())
7333 return 0;
7334 N *= ST->getNumElements();
7335 EltTy = *ST->element_begin();
7336 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
7337 N *= AT->getNumElements();
7338 EltTy = AT->getElementType();
7339 } else {
7340 auto *VT = cast<FixedVectorType>(EltTy);
7341 N *= VT->getNumElements();
7342 EltTy = VT->getElementType();
7343 }
7344 }
7345
7346 if (!isValidElementType(EltTy))
7347 return 0;
7349 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
7350 VTSize != DL->getTypeStoreSizeInBits(T))
7351 return 0;
7352 return N;
7353}
7354
7355bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
7356 SmallVectorImpl<unsigned> &CurrentOrder,
7357 bool ResizeAllowed) const {
7358 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
7359 assert(It != VL.end() && "Expected at least one extract instruction.");
7360 auto *E0 = cast<Instruction>(*It);
7361 assert(
7362 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
7363 "Invalid opcode");
7364 // Check if all of the extracts come from the same vector and from the
7365 // correct offset.
7366 Value *Vec = E0->getOperand(0);
7367
7368 CurrentOrder.clear();
7369
7370 // We have to extract from a vector/aggregate with the same number of elements.
7371 unsigned NElts;
7372 if (E0->getOpcode() == Instruction::ExtractValue) {
7373 NElts = canMapToVector(Vec->getType());
7374 if (!NElts)
7375 return false;
7376 // Check if load can be rewritten as load of vector.
7377 LoadInst *LI = dyn_cast<LoadInst>(Vec);
7378 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
7379 return false;
7380 } else {
7381 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
7382 }
7383
7384 unsigned E = VL.size();
7385 if (!ResizeAllowed && NElts != E)
7386 return false;
7387 SmallVector<int> Indices(E, PoisonMaskElem);
7388 unsigned MinIdx = NElts, MaxIdx = 0;
7389 for (auto [I, V] : enumerate(VL)) {
7390 auto *Inst = dyn_cast<Instruction>(V);
7391 if (!Inst)
7392 continue;
7393 if (Inst->getOperand(0) != Vec)
7394 return false;
7395 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
7396 if (isa<UndefValue>(EE->getIndexOperand()))
7397 continue;
7398 std::optional<unsigned> Idx = getExtractIndex(Inst);
7399 if (!Idx)
7400 return false;
7401 const unsigned ExtIdx = *Idx;
7402 if (ExtIdx >= NElts)
7403 continue;
7404 Indices[I] = ExtIdx;
7405 if (MinIdx > ExtIdx)
7406 MinIdx = ExtIdx;
7407 if (MaxIdx < ExtIdx)
7408 MaxIdx = ExtIdx;
7409 }
7410 if (MaxIdx - MinIdx + 1 > E)
7411 return false;
7412 if (MaxIdx + 1 <= E)
7413 MinIdx = 0;
7414
7415 // Check that all of the indices extract from the correct offset.
7416 bool ShouldKeepOrder = true;
7417 // Assign to all items the initial value E + 1 so we can check if the extract
7418 // instruction index was used already.
7419 // Also, later we can check that all the indices are used and we have a
7420 // consecutive access in the extract instructions, by checking that no
7421 // element of CurrentOrder still has value E + 1.
7422 CurrentOrder.assign(E, E);
7423 for (unsigned I = 0; I < E; ++I) {
7424 if (Indices[I] == PoisonMaskElem)
7425 continue;
7426 const unsigned ExtIdx = Indices[I] - MinIdx;
7427 if (CurrentOrder[ExtIdx] != E) {
7428 CurrentOrder.clear();
7429 return false;
7430 }
7431 ShouldKeepOrder &= ExtIdx == I;
7432 CurrentOrder[ExtIdx] = I;
7433 }
7434 if (ShouldKeepOrder)
7435 CurrentOrder.clear();
7436
7437 return ShouldKeepOrder;
7438}
7439
7440bool BoUpSLP::areAllUsersVectorized(
7441 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
7442 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
7443 all_of(I->users(), [this](User *U) {
7444 return ScalarToTreeEntry.contains(U) ||
7445 isVectorLikeInstWithConstOps(U) ||
7446 (isa<ExtractElementInst>(U) && MustGather.contains(U));
7447 });
7448}
7449
7450static std::pair<InstructionCost, InstructionCost>
7453 ArrayRef<Type *> ArgTys) {
7455
7456 // Calculate the cost of the scalar and vector calls.
7457 FastMathFlags FMF;
7458 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
7459 FMF = FPCI->getFastMathFlags();
7461 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
7462 dyn_cast<IntrinsicInst>(CI));
7463 auto IntrinsicCost =
7465
7466 auto Shape = VFShape::get(CI->getFunctionType(),
7468 false /*HasGlobalPred*/);
7469 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7470 auto LibCost = IntrinsicCost;
7471 if (!CI->isNoBuiltin() && VecFunc) {
7472 // Calculate the cost of the vector library call.
7473 // If the corresponding vector call is cheaper, return its cost.
7474 LibCost =
7475 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
7476 }
7477 return {IntrinsicCost, LibCost};
7478}
7479
7480void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
7481 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
7482 SmallVectorImpl<Value *> *OpScalars,
7483 SmallVectorImpl<Value *> *AltScalars) const {
7484 unsigned Sz = Scalars.size();
7485 Mask.assign(Sz, PoisonMaskElem);
7486 SmallVector<int> OrderMask;
7487 if (!ReorderIndices.empty())
7488 inversePermutation(ReorderIndices, OrderMask);
7489 for (unsigned I = 0; I < Sz; ++I) {
7490 unsigned Idx = I;
7491 if (!ReorderIndices.empty())
7492 Idx = OrderMask[I];
7493 auto *OpInst = cast<Instruction>(Scalars[Idx]);
7494 if (IsAltOp(OpInst)) {
7495 Mask[I] = Sz + Idx;
7496 if (AltScalars)
7497 AltScalars->push_back(OpInst);
7498 } else {
7499 Mask[I] = Idx;
7500 if (OpScalars)
7501 OpScalars->push_back(OpInst);
7502 }
7503 }
7504 if (!ReuseShuffleIndices.empty()) {
7505 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
7506 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
7507 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
7508 });
7509 Mask.swap(NewMask);
7510 }
7511}
7512
7514 const Instruction *MainOp,
7515 const Instruction *AltOp,
7516 const TargetLibraryInfo &TLI) {
7517 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
7518 auto *AltCI = cast<CmpInst>(AltOp);
7519 CmpInst::Predicate MainP = MainCI->getPredicate();
7520 CmpInst::Predicate AltP = AltCI->getPredicate();
7521 assert(MainP != AltP && "Expected different main/alternate predicates.");
7522 auto *CI = cast<CmpInst>(I);
7523 if (isCmpSameOrSwapped(MainCI, CI, TLI))
7524 return false;
7525 if (isCmpSameOrSwapped(AltCI, CI, TLI))
7526 return true;
7527 CmpInst::Predicate P = CI->getPredicate();
7529
7530 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
7531 "CmpInst expected to match either main or alternate predicate or "
7532 "their swap.");
7533 (void)AltP;
7534 return MainP != P && MainP != SwappedP;
7535 }
7536 return I->getOpcode() == AltOp->getOpcode();
7537}
7538
7539TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
7540 assert(!Ops.empty());
7541 const auto *Op0 = Ops.front();
7542
7543 const bool IsConstant = all_of(Ops, [](Value *V) {
7544 // TODO: We should allow undef elements here
7545 return isConstant(V) && !isa<UndefValue>(V);
7546 });
7547 const bool IsUniform = all_of(Ops, [=](Value *V) {
7548 // TODO: We should allow undef elements here
7549 return V == Op0;
7550 });
7551 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
7552 // TODO: We should allow undef elements here
7553 if (auto *CI = dyn_cast<ConstantInt>(V))
7554 return CI->getValue().isPowerOf2();
7555 return false;
7556 });
7557 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
7558 // TODO: We should allow undef elements here
7559 if (auto *CI = dyn_cast<ConstantInt>(V))
7560 return CI->getValue().isNegatedPowerOf2();
7561 return false;
7562 });
7563
7565 if (IsConstant && IsUniform)
7567 else if (IsConstant)
7569 else if (IsUniform)
7571
7573 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
7574 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
7575
7576 return {VK, VP};
7577}
7578
7579namespace {
7580/// The base class for shuffle instruction emission and shuffle cost estimation.
7581class BaseShuffleAnalysis {
7582protected:
7583 /// Checks if the mask is an identity mask.
7584 /// \param IsStrict if is true the function returns false if mask size does
7585 /// not match vector size.
7586 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
7587 bool IsStrict) {
7588 int Limit = Mask.size();
7589 int VF = VecTy->getNumElements();
7590 int Index = -1;
7591 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
7592 return true;
7593 if (!IsStrict) {
7594 // Consider extract subvector starting from index 0.
7596 Index == 0)
7597 return true;
7598 // All VF-size submasks are identity (e.g.
7599 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
7600 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
7601 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
7602 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
7604 }))
7605 return true;
7606 }
7607 return false;
7608 }
7609
7610 /// Tries to combine 2 different masks into single one.
7611 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
7612 /// change the size of the vector, \p LocalVF is the original size of the
7613 /// shuffled vector.
7614 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
7615 ArrayRef<int> ExtMask) {
7616 unsigned VF = Mask.size();
7617 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
7618 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
7619 if (ExtMask[I] == PoisonMaskElem)
7620 continue;
7621 int MaskedIdx = Mask[ExtMask[I] % VF];
7622 NewMask[I] =
7623 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
7624 }
7625 Mask.swap(NewMask);
7626 }
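  // Worked example, not from the original source, of the composition performed
  // above: if the inner shuffle's mask is <3, 2, 1, 0> over a 4-wide source
  // (LocalVF == 4) and the outer request is ExtMask == <1, 0, poison, 2>, then
  // element 0 of the result asks for outer index 1, which the inner shuffle
  // takes from source element 2, and so on:
  // \code
  // SmallVector<int> Mask = {3, 2, 1, 0};
  // combineMasks(/*LocalVF=*/4, Mask, /*ExtMask=*/{1, 0, PoisonMaskElem, 2});
  // // Mask is now {2, 3, PoisonMaskElem, 1} and indexes the inner source.
  // \endcode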
7627
7628 /// Looks through shuffles trying to reduce final number of shuffles in the
7629 /// code. The function looks through the previously emitted shuffle
7630 /// instructions and properly marks indices in the mask as undef.
7631 /// For example, given the code
7632 /// \code
7633 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
7634 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
7635 /// \endcode
7636 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
7637 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7638 /// <0, 1, 2, 3> for the shuffle.
7639 /// If the 2 operands are of different sizes, the smaller one will be resized and
7640 /// the mask recalculated properly.
7641 /// For example, given the code
7642 /// \code
7643 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
7644 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
7645 /// \endcode
7646 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
7647 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
7648 /// <0, 1, 2, 3> for the shuffle.
7649 /// So, it tries to transform permutations to simple vector merge, if
7650 /// possible.
7651 /// \param V The input vector which must be shuffled using the given \p Mask.
7652 /// If the better candidate is found, \p V is set to this best candidate
7653 /// vector.
7654 /// \param Mask The input mask for the shuffle. If the best candidate is found
7655 /// during looking-through-shuffles attempt, it is updated accordingly.
7656 /// \param SinglePermute true if the shuffle operation is originally a
7657 /// single-value-permutation. In this case the look-through-shuffles procedure
7658 /// may look for resizing shuffles as the best candidates.
7659 /// \return true if the shuffle results in the non-resizing identity shuffle
7660 /// (and thus can be ignored), false - otherwise.
7661 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
7662 bool SinglePermute) {
7663 Value *Op = V;
7664 ShuffleVectorInst *IdentityOp = nullptr;
7665 SmallVector<int> IdentityMask;
7666 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
7667 // Exit if not a fixed vector type or changing size shuffle.
7668 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
7669 if (!SVTy)
7670 break;
7671 // Remember the identity or broadcast mask, if it is not a resizing
7672 // shuffle. If no better candidates are found, this Op and Mask will be
7673 // used in the final shuffle.
7674 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
7675 if (!IdentityOp || !SinglePermute ||
7676 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
7677 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
7678 IdentityMask.size()))) {
7679 IdentityOp = SV;
7680 // Store the current mask in IdentityMask so that we do not lose this
7681 // info if IdentityOp is selected as the best candidate for the
7682 // permutation.
7683 IdentityMask.assign(Mask);
7684 }
7685 }
7686 // Remember the broadcast mask. If no better candidates are found, this Op
7687 // and Mask will be used in the final shuffle.
7688 // Zero splat can be used as identity too, since it might be used with
7689 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
7690 // E.g. if we need to shuffle the vector with the mask <3, 1, 2, 0>, which
7691 // is expensive, and the analysis finds out that the source vector is just
7692 // a broadcast, the original mask can be transformed to the identity mask
7693 // <0, 1, 2, 3>.
7694 // \code
7695 // %0 = shuffle %v, poison, zeroinitalizer
7696 // %res = shuffle %0, poison, <3, 1, 2, 0>
7697 // \endcode
7698 // may be transformed to
7699 // \code
7700 // %0 = shuffle %v, poison, zeroinitalizer
7701 // %res = shuffle %0, poison, <0, 1, 2, 3>
7702 // \endcode
7703 if (SV->isZeroEltSplat()) {
7704 IdentityOp = SV;
7705 IdentityMask.assign(Mask);
7706 }
7707 int LocalVF = Mask.size();
7708 if (auto *SVOpTy =
7709 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
7710 LocalVF = SVOpTy->getNumElements();
7711 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
7712 for (auto [Idx, I] : enumerate(Mask)) {
7713 if (I == PoisonMaskElem ||
7714 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
7715 continue;
7716 ExtMask[Idx] = SV->getMaskValue(I);
7717 }
7718 bool IsOp1Undef =
7719 isUndefVector(SV->getOperand(0),
7720 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
7721 .all();
7722 bool IsOp2Undef =
7723 isUndefVector(SV->getOperand(1),
7724 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
7725 .all();
7726 if (!IsOp1Undef && !IsOp2Undef) {
7727 // Update mask and mark undef elems.
7728 for (int &I : Mask) {
7729 if (I == PoisonMaskElem)
7730 continue;
7731 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
7732 PoisonMaskElem)
7733 I = PoisonMaskElem;
7734 }
7735 break;
7736 }
7737 SmallVector<int> ShuffleMask(SV->getShuffleMask().begin(),
7738 SV->getShuffleMask().end());
7739 combineMasks(LocalVF, ShuffleMask, Mask);
7740 Mask.swap(ShuffleMask);
7741 if (IsOp2Undef)
7742 Op = SV->getOperand(0);
7743 else
7744 Op = SV->getOperand(1);
7745 }
7746 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
7747 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
7748 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
7749 if (IdentityOp) {
7750 V = IdentityOp;
7751 assert(Mask.size() == IdentityMask.size() &&
7752 "Expected masks of same sizes.");
7753 // Clear known poison elements.
7754 for (auto [I, Idx] : enumerate(Mask))
7755 if (Idx == PoisonMaskElem)
7756 IdentityMask[I] = PoisonMaskElem;
7757 Mask.swap(IdentityMask);
7758 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
7759 return SinglePermute &&
7760 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
7761 /*IsStrict=*/true) ||
7762 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
7763 Shuffle->isZeroEltSplat() &&
7764 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
7765 }
7766 V = Op;
7767 return false;
7768 }
7769 V = Op;
7770 return true;
7771 }
7772
7773 /// Smart shuffle instruction emission, walks through shuffles trees and
7774 /// tries to find the best matching vector for the actual shuffle
7775 /// instruction.
7776 template <typename T, typename ShuffleBuilderTy>
7777 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
7778 ShuffleBuilderTy &Builder) {
7779 assert(V1 && "Expected at least one vector value.");
7780 if (V2)
7781 Builder.resizeToMatch(V1, V2);
7782 int VF = Mask.size();
7783 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
7784 VF = FTy->getNumElements();
7785 if (V2 &&
7786 !isUndefVector(V2, buildUseMask(VF, Mask, UseMask::SecondArg)).all()) {
7787 // Peek through shuffles.
7788 Value *Op1 = V1;
7789 Value *Op2 = V2;
7790 int VF =
7791 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
7792 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
7793 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
7794 for (int I = 0, E = Mask.size(); I < E; ++I) {
7795 if (Mask[I] < VF)
7796 CombinedMask1[I] = Mask[I];
7797 else
7798 CombinedMask2[I] = Mask[I] - VF;
7799 }
7800 Value *PrevOp1;
7801 Value *PrevOp2;
7802 do {
7803 PrevOp1 = Op1;
7804 PrevOp2 = Op2;
7805 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
7806 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
7807 // Check if we have 2 resizing shuffles - need to peek through operands
7808 // again.
7809 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
7810 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
7811 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
7812 for (auto [Idx, I] : enumerate(CombinedMask1)) {
7813 if (I == PoisonMaskElem)
7814 continue;
7815 ExtMask1[Idx] = SV1->getMaskValue(I);
7816 }
7817 SmallBitVector UseMask1 = buildUseMask(
7818 cast<FixedVectorType>(SV1->getOperand(1)->getType())
7819 ->getNumElements(),
7820 ExtMask1, UseMask::SecondArg);
7821 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
7822 for (auto [Idx, I] : enumerate(CombinedMask2)) {
7823 if (I == PoisonMaskElem)
7824 continue;
7825 ExtMask2[Idx] = SV2->getMaskValue(I);
7826 }
7827 SmallBitVector UseMask2 = buildUseMask(
7828 cast<FixedVectorType>(SV2->getOperand(1)->getType())
7829 ->getNumElements(),
7830 ExtMask2, UseMask::SecondArg);
7831 if (SV1->getOperand(0)->getType() ==
7832 SV2->getOperand(0)->getType() &&
7833 SV1->getOperand(0)->getType() != SV1->getType() &&
7834 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
7835 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
7836 Op1 = SV1->getOperand(0);
7837 Op2 = SV2->getOperand(0);
7838 SmallVector<int> ShuffleMask1(SV1->getShuffleMask().begin(),
7839 SV1->getShuffleMask().end());
7840 int LocalVF = ShuffleMask1.size();
7841 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
7842 LocalVF = FTy->getNumElements();
7843 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
7844 CombinedMask1.swap(ShuffleMask1);
7845 SmallVector<int> ShuffleMask2(SV2->getShuffleMask().begin(),
7846 SV2->getShuffleMask().end());
7847 LocalVF = ShuffleMask2.size();
7848 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
7849 LocalVF = FTy->getNumElements();
7850 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
7851 CombinedMask2.swap(ShuffleMask2);
7852 }
7853 }
7854 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
7855 Builder.resizeToMatch(Op1, Op2);
7856 VF = std::max(cast<VectorType>(Op1->getType())
7857 ->getElementCount()
7858 .getKnownMinValue(),
7859 cast<VectorType>(Op2->getType())
7860 ->getElementCount()
7861 .getKnownMinValue());
7862 for (int I = 0, E = Mask.size(); I < E; ++I) {
7863 if (CombinedMask2[I] != PoisonMaskElem) {
7864 assert(CombinedMask1[I] == PoisonMaskElem &&
7865 "Expected undefined mask element");
7866 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
7867 }
7868 }
7869 if (Op1 == Op2 &&
7870 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
7871 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
7872 isa<ShuffleVectorInst>(Op1) &&
7873 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
7874 ArrayRef(CombinedMask1))))
7875 return Builder.createIdentity(Op1);
7876 return Builder.createShuffleVector(
7877 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
7878 CombinedMask1);
7879 }
7880 if (isa<PoisonValue>(V1))
7881 return Builder.createPoison(
7882 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
7883 SmallVector<int> NewMask(Mask.begin(), Mask.end());
7884 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
7885 assert(V1 && "Expected non-null value after looking through shuffles.");
7886
7887 if (!IsIdentity)
7888 return Builder.createShuffleVector(V1, NewMask);
7889 return Builder.createIdentity(V1);
7890 }
7891};
7892} // namespace
7893
7894/// Returns the cost of the shuffle instructions with the given \p Kind, vector
7895/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for the
7896/// insert subvector pattern.
7897static InstructionCost
7898getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
7899 VectorType *Tp, ArrayRef<int> Mask = std::nullopt,
7900 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
7901 int Index = 0, VectorType *SubTp = nullptr,
7902 ArrayRef<const Value *> Args = std::nullopt) {
7903 if (Kind != TTI::SK_PermuteTwoSrc)
7904 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
7905 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
7906 int NumSubElts;
7907 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
7908 Mask, NumSrcElts, NumSubElts, Index)) {
7909 if (Index + NumSubElts > NumSrcElts &&
7910 Index + NumSrcElts <= static_cast<int>(Mask.size()))
7911 return TTI.getShuffleCost(
7912 TTI::SK_InsertSubvector,
7913 FixedVectorType::get(Tp->getElementType(), Mask.size()), Mask,
7914 TTI::TCK_RecipThroughput, Index, Tp);
7915 }
7916 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
7917}
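// Illustrative sketch, not part of the original file: the mask shape the
// wrapper above is looking for. A two-source mask that keeps a prefix of the
// first source in place and appends one contiguous run of the second source,
// e.g. <0, 1, 2, 3, 4, 5, 6, 7> with a 4-wide first source, is really "insert
// the second vector as a subvector at index 4" and can be costed as an
// insert-subvector shuffle instead of a generic two-source permutation. The
// detector below is a simplified stand-in, not ShuffleVectorInst's own
// isInsertSubvectorMask.
namespace slp_insert_subvector_example {
// Returns the insert position if \p Mask (indices into the concatenation of
// two NumSrcElts-wide sources, negative for poison) is such a tail insert,
// or -1 otherwise.
inline int matchTailInsert(ArrayRef<int> Mask, int NumSrcElts) {
  int InsertAt = -1;
  for (int I = 0, E = Mask.size(); I < E; ++I) {
    if (Mask[I] < 0)
      continue;
    if (Mask[I] < NumSrcElts) {
      // First-source elements must stay in place and precede the insert.
      if (Mask[I] != I || InsertAt != -1)
        return -1;
    } else {
      if (InsertAt == -1)
        InsertAt = I;
      // Second-source elements must form a single contiguous run.
      if (Mask[I] - NumSrcElts != I - InsertAt)
        return -1;
    }
  }
  return InsertAt; // matchTailInsert({0,1,2,3,4,5,6,7}, 4) == 4.
}
} // namespace slp_insert_subvector_example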
7918
7919/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
7920static std::pair<InstructionCost, InstructionCost>
7922 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
7923 Type *ScalarTy, VectorType *VecTy) {
7924 InstructionCost ScalarCost = 0;
7925 InstructionCost VecCost = 0;
7926 // Here we differentiate two cases: (1) when Ptrs represent a regular
7927 // vectorization tree node (as they are pointer arguments of scattered
7928 // loads) or (2) when Ptrs are the arguments of loads or stores being
7929 // vectorized as a plain wide unit-stride load/store since all the
7930 // loads/stores are known to be from/to adjacent locations.
7931 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
7932 // Case 2: estimate costs for pointer related costs when vectorizing to
7933 // a wide load/store.
7934 // Scalar cost is estimated as a set of pointers with known relationship
7935 // between them.
7936 // For vector code we will use BasePtr as argument for the wide load/store
7937 // but we also need to account all the instructions which are going to
7938 // stay in vectorized code due to uses outside of these scalar
7939 // loads/stores.
7940 ScalarCost = TTI.getPointersChainCost(
7941 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
7942 CostKind);
7943
7944 SmallVector<const Value *> PtrsRetainedInVecCode;
7945 for (Value *V : Ptrs) {
7946 if (V == BasePtr) {
7947 PtrsRetainedInVecCode.push_back(V);
7948 continue;
7949 }
7950 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
7951 // For simplicity assume Ptr to stay in vectorized code if it's not a
7952 // GEP instruction. We don't care since its cost is considered free.
7953 // TODO: We should check for any uses outside of the vectorizable tree
7954 // rather than just a single use.
7955 if (!Ptr || !Ptr->hasOneUse())
7956 PtrsRetainedInVecCode.push_back(V);
7957 }
7958
7959 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
7960 // If all pointers stay in vectorized code then we don't have
7961 // any savings on that.
7962 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
7963 }
7964 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
7965 TTI::PointersChainInfo::getKnownStride(),
7966 VecTy, CostKind);
7967 } else {
7968 // Case 1: Ptrs are the arguments of loads that we are going to transform
7969 // into masked gather load intrinsic.
7970 // All the scalar GEPs will be removed as a result of vectorization.
7971 // For any external uses of some lanes extract element instructions will
7972 // be generated (which cost is estimated separately).
7973 TTI::PointersChainInfo PtrsInfo =
7974 all_of(Ptrs,
7975 [](const Value *V) {
7976 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
7977 return Ptr && !Ptr->hasAllConstantIndices();
7978 })
7979 ? TTI::PointersChainInfo::getUnknownStride()
7980 : TTI::PointersChainInfo::getKnownStride();
7981
7982 ScalarCost =
7983 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
7984 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
7985 if (!BaseGEP) {
7986 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
7987 if (It != Ptrs.end())
7988 BaseGEP = cast<GEPOperator>(*It);
7989 }
7990 if (BaseGEP) {
7991 SmallVector<const Value *> Indices(BaseGEP->indices());
7992 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
7993 BaseGEP->getPointerOperand(), Indices, VecTy,
7994 CostKind);
7995 }
7996 }
7997
7998 return std::make_pair(ScalarCost, VecCost);
7999}
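// Illustrative sketch, not part of the original file: the savings that case
// (2) above is modelling. Out of a group of address computations feeding a
// wide load/store, only the base pointer (plus any address with other users)
// has to survive vectorization; every single-use GEP is folded away, so the
// vector-side pointer cost is charged only for the retained pointers.
namespace slp_gep_cost_example {
struct AddrInfo {
  bool IsBase = false;         // The pointer used by the wide memory op.
  bool IsSingleUseGEP = false; // GEP whose only user is the scalar load/store.
};
inline unsigned countRetainedPointers(ArrayRef<AddrInfo> Group) {
  unsigned Retained = 0;
  for (const AddrInfo &AI : Group)
    if (AI.IsBase || !AI.IsSingleUseGEP)
      ++Retained; // Stays in the vectorized code; everything else is free.
  return Retained;
}
} // namespace slp_gep_cost_example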
8000
8001void BoUpSLP::transformNodes() {
8002 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8003 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
8004 TreeEntry &E = *TE.get();
8005 switch (E.getOpcode()) {
8006 case Instruction::Load: {
8007 Type *ScalarTy = E.getMainOp()->getType();
8008 auto *VecTy = FixedVectorType::get(ScalarTy, E.Scalars.size());
8009 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
8010 // Check if profitable to represent consecutive load + reverse as strided
8011 // load with stride -1.
8012 if (isReverseOrder(E.ReorderIndices) &&
8013 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
8014 SmallVector<int> Mask;
8015 inversePermutation(E.ReorderIndices, Mask);
8016 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
8017 InstructionCost OriginalVecCost =
8018 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
8019 BaseLI->getPointerAddressSpace(), CostKind,
8020 TTI::OperandValueInfo()) +
8021 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
8022 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
8023 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
8024 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
8025 if (StridedCost < OriginalVecCost)
8026 // Strided load is more profitable than consecutive load + reverse -
8027 // transform the node to strided load.
8028 E.State = TreeEntry::StridedVectorize;
8029 }
8030 break;
8031 }
8032 case Instruction::Store: {
8033 Type *ScalarTy =
8034 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
8035 auto *VecTy = FixedVectorType::get(ScalarTy, E.Scalars.size());
8036 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
8037 // Check if profitable to represent consecutive store + reverse as strided
8038 // store with stride -1.
8039 if (isReverseOrder(E.ReorderIndices) &&
8040 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
8041 SmallVector<int> Mask;
8042 inversePermutation(E.ReorderIndices, Mask);
8043 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
8044 InstructionCost OriginalVecCost =
8045 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
8046 BaseSI->getPointerAddressSpace(), CostKind,
8047 TTI::OperandValueInfo()) +
8048 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
8049 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
8050 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
8051 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
8052 if (StridedCost < OriginalVecCost)
8053 // Strided store is more profitable than consecutive store + reverse -
8054 // transform the node to strided store.
8055 E.State = TreeEntry::StridedVectorize;
8056 }
8057 break;
8058 }
8059 default:
8060 break;
8061 }
8062 }
8063}
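// Illustrative sketch, not part of the original file: the reorder shape that
// transformNodes() converts into a strided access. A node whose reorder
// indices are exactly <N-1, ..., 1, 0> is a consecutive load/store followed by
// a reverse shuffle, which can instead be a single strided access with stride
// -1 when the target reports that form as legal and cheaper.
namespace slp_strided_transform_example {
inline bool isStrictlyReversed(ArrayRef<unsigned> Order) {
  for (unsigned I = 0, E = Order.size(); I < E; ++I)
    if (Order[I] != E - I - 1)
      return false; // E.g. <3, 2, 1, 0> qualifies; <2, 3, 0, 1> does not.
  return true;
}
} // namespace slp_strided_transform_example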
8064
8065/// Merges shuffle masks and emits final shuffle instruction, if required. It
8066/// supports shuffling of 2 input vectors. It implements lazy shuffle emission,
8067/// where the actual shuffle instruction is generated only if it is actually
8068/// required. Otherwise, the shuffle instruction emission is delayed till the
8069/// end of the process, to reduce the number of emitted instructions and further
8070/// analysis/transformations.
8071class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
8072 bool IsFinalized = false;
8073 SmallVector<int> CommonMask;
8074 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
8075 Type *ScalarTy = nullptr;
8076 const TargetTransformInfo &TTI;
8077 InstructionCost Cost = 0;
8078 SmallDenseSet<Value *> VectorizedVals;
8079 BoUpSLP &R;
8080 SmallPtrSetImpl<Value *> &CheckedExtracts;
8081 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
8082 /// While set, still trying to estimate the cost for the same nodes and we
8083 /// can delay actual cost estimation (virtual shuffle instruction emission).
8084 /// May help better estimate the cost if same nodes must be permuted + allows
8085 /// to move most of the long shuffles cost estimation to TTI.
8086 bool SameNodesEstimated = true;
8087
8088 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
8089 if (Ty->getScalarType()->isPointerTy()) {
8090 Constant *Res = ConstantExpr::getIntToPtr(
8091 Constant::getAllOnesValue(
8092 IntegerType::get(Ty->getContext(),
8093 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
8094 Ty->getScalarType());
8095 if (auto *VTy = dyn_cast<VectorType>(Ty))
8096 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
8097 return Res;
8098 }
8099 return Constant::getAllOnesValue(Ty);
8100 }
8101
8102 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
8103 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
8104 return TTI::TCC_Free;
8105 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
8106 InstructionCost GatherCost = 0;
8107 SmallVector<Value *> Gathers(VL.begin(), VL.end());
8108 // Improve gather cost for gather of loads, if we can group some of the
8109 // loads into vector loads.
8110 InstructionsState S = getSameOpcode(VL, *R.TLI);
8111 const unsigned Sz = R.DL->getTypeSizeInBits(ScalarTy);
8112 unsigned MinVF = R.getMinVF(2 * Sz);
8113 if (VL.size() > 2 &&
8114 ((S.getOpcode() == Instruction::Load && !S.isAltShuffle()) ||
8115 (InVectors.empty() &&
8116 any_of(seq<unsigned>(0, VL.size() / MinVF),
8117 [&](unsigned Idx) {
8118 ArrayRef<Value *> SubVL = VL.slice(Idx * MinVF, MinVF);
8119 InstructionsState S = getSameOpcode(SubVL, *R.TLI);
8120 return S.getOpcode() == Instruction::Load &&
8121 !S.isAltShuffle();
8122 }))) &&
8123 !all_of(Gathers, [&](Value *V) { return R.getTreeEntry(V); }) &&
8124 !isSplat(Gathers)) {
8125 InstructionCost BaseCost = R.getGatherCost(Gathers, !Root, ScalarTy);
8126 SetVector<Value *> VectorizedLoads;
8127 SmallVector<std::pair<unsigned, LoadsState>> VectorizedStarts;
8128 SmallVector<unsigned> ScatterVectorized;
8129 unsigned StartIdx = 0;
8130 unsigned VF = VL.size() / 2;
8131 for (; VF >= MinVF; VF /= 2) {
8132 for (unsigned Cnt = StartIdx, End = VL.size(); Cnt + VF <= End;
8133 Cnt += VF) {
8134 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
8135 if (S.getOpcode() != Instruction::Load || S.isAltShuffle()) {
8136 InstructionsState SliceS = getSameOpcode(Slice, *R.TLI);
8137 if (SliceS.getOpcode() != Instruction::Load ||
8138 SliceS.isAltShuffle())
8139 continue;
8140 }
8141 if (!VectorizedLoads.count(Slice.front()) &&
8142 !VectorizedLoads.count(Slice.back()) && allSameBlock(Slice)) {
8143 SmallVector<Value *> PointerOps;
8144 OrdersType CurrentOrder;
8145 LoadsState LS = R.canVectorizeLoads(Slice, Slice.front(),
8146 CurrentOrder, PointerOps);
8147 switch (LS) {
8148 case LoadsState::Vectorize:
8149 case LoadsState::ScatterVectorize:
8150 case LoadsState::StridedVectorize:
8151 // Mark the vectorized loads so that we don't vectorize them
8152 // again.
8153 // TODO: better handling of loads with reorders.
8154 if (((LS == LoadsState::Vectorize ||
8155 LS == LoadsState::StridedVectorize) &&
8156 CurrentOrder.empty()) ||
8157 (LS == LoadsState::StridedVectorize &&
8158 isReverseOrder(CurrentOrder)))
8159 VectorizedStarts.emplace_back(Cnt, LS);
8160 else
8161 ScatterVectorized.push_back(Cnt);
8162 VectorizedLoads.insert(Slice.begin(), Slice.end());
8163 // If we vectorized initial block, no need to try to vectorize
8164 // it again.
8165 if (Cnt == StartIdx)
8166 StartIdx += VF;
8167 break;
8168 case LoadsState::Gather:
8169 break;
8170 }
8171 }
8172 }
8173 // Check if the whole array was vectorized already - exit.
8174 if (StartIdx >= VL.size())
8175 break;
8176 // Found vectorizable parts - exit.
8177 if (!VectorizedLoads.empty())
8178 break;
8179 }
8180 if (!VectorizedLoads.empty()) {
8181 unsigned NumParts = TTI.getNumberOfParts(VecTy);
8182 bool NeedInsertSubvectorAnalysis =
8183 !NumParts || (VL.size() / VF) > NumParts;
8184 // Get the cost for gathered loads.
8185 for (unsigned I = 0, End = VL.size(); I < End; I += VF) {
8186 if (VectorizedLoads.contains(VL[I]))
8187 continue;
8188 GatherCost +=
8189 getBuildVectorCost(VL.slice(I, std::min(End - I, VF)), Root);
8190 }
8191 // Exclude potentially vectorized loads from list of gathered
8192 // scalars.
8193 Gathers.assign(Gathers.size(), PoisonValue::get(VL.front()->getType()));
8194 // The cost for vectorized loads.
8195 InstructionCost ScalarsCost = 0;
8196 for (Value *V : VectorizedLoads) {
8197 auto *LI = cast<LoadInst>(V);
8198 ScalarsCost +=
8199 TTI.getMemoryOpCost(Instruction::Load, LI->getType(),
8200 LI->getAlign(), LI->getPointerAddressSpace(),
8201 CostKind, TTI::OperandValueInfo(), LI);
8202 }
8203 auto *LoadTy = FixedVectorType::get(VL.front()->getType(), VF);
8204 for (const std::pair<unsigned, LoadsState> &P : VectorizedStarts) {
8205 auto *LI = cast<LoadInst>(VL[P.first]);
8206 Align Alignment = LI->getAlign();
8207 GatherCost +=
8208 P.second == LoadsState::Vectorize
8209 ? TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment,
8210 LI->getPointerAddressSpace(), CostKind,
8211 TTI::OperandValueInfo(), LI)
8212 : TTI.getStridedMemoryOpCost(
8213 Instruction::Load, LoadTy, LI->getPointerOperand(),
8214 /*VariableMask=*/false, Alignment, CostKind, LI);
8215 // Estimate GEP cost.
8216 SmallVector<Value *> PointerOps(VF);
8217 for (auto [I, V] : enumerate(VL.slice(P.first, VF)))
8218 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8219 auto [ScalarGEPCost, VectorGEPCost] =
8220 getGEPCosts(TTI, PointerOps, LI->getPointerOperand(),
8221 Instruction::Load, CostKind, LI->getType(), LoadTy);
8222 GatherCost += VectorGEPCost - ScalarGEPCost;
8223 }
8224 for (unsigned P : ScatterVectorized) {
8225 auto *LI0 = cast<LoadInst>(VL[P]);
8226 ArrayRef<Value *> Slice = VL.slice(P, VF);
8227 Align CommonAlignment = computeCommonAlignment<LoadInst>(Slice);
8228 GatherCost += TTI.getGatherScatterOpCost(
8229 Instruction::Load, LoadTy, LI0->getPointerOperand(),
8230 /*VariableMask=*/false, CommonAlignment, CostKind, LI0);
8231 // Estimate GEP cost.
8232 SmallVector<Value *> PointerOps(VF);
8233 for (auto [I, V] : enumerate(Slice))
8234 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
8235 OrdersType Order;
8236 if (sortPtrAccesses(PointerOps, LI0->getType(), *R.DL, *R.SE,
8237 Order)) {
8238 // TODO: improve checks if GEPs can be vectorized.
8239 Value *Ptr0 = PointerOps.front();
8240 Type *ScalarTy = Ptr0->getType();
8241 auto *VecTy = FixedVectorType::get(ScalarTy, VF);
8242 auto [ScalarGEPCost, VectorGEPCost] =
8243 getGEPCosts(TTI, PointerOps, Ptr0, Instruction::GetElementPtr,
8244 CostKind, ScalarTy, VecTy);
8245 GatherCost += VectorGEPCost - ScalarGEPCost;
8246 if (!Order.empty()) {
8247 SmallVector<int> Mask;
8248 inversePermutation(Order, Mask);
8249 GatherCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
8250 VecTy, Mask, CostKind);
8251 }
8252 } else {
8253 GatherCost += R.getGatherCost(PointerOps, /*ForPoisonSrc=*/true,
8254 PointerOps.front()->getType());
8255 }
8256 }
8257 if (NeedInsertSubvectorAnalysis) {
8258 // Add the cost for the subvectors insert.
8259 SmallVector<int> ShuffleMask(VL.size());
8260 for (unsigned I = VF, E = VL.size(); I < E; I += VF) {
8261 for (unsigned Idx : seq<unsigned>(0, E))
8262 ShuffleMask[Idx] = Idx / VF == I ? E + Idx % VF : Idx;
8263 GatherCost += TTI.getShuffleCost(TTI::SK_InsertSubvector, VecTy,
8264 ShuffleMask, CostKind, I, LoadTy);
8265 }
8266 }
8267 GatherCost -= ScalarsCost;
8268 }
8269 GatherCost = std::min(BaseCost, GatherCost);
8270 } else if (!Root && isSplat(VL)) {
8271 // Found the broadcasting of the single scalar, calculate the cost as
8272 // the broadcast.
8273 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
8274 assert(It != VL.end() && "Expected at least one non-undef value.");
8275 // Add broadcast for non-identity shuffle only.
8276 bool NeedShuffle =
8277 count(VL, *It) > 1 &&
8278 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
8279 if (!NeedShuffle)
8280 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
8281 CostKind, std::distance(VL.begin(), It),
8282 PoisonValue::get(VecTy), *It);
8283
8284 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
8285 transform(VL, ShuffleMask.begin(), [](Value *V) {
8286 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
8287 });
8288 InstructionCost InsertCost =
8289 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
8290 PoisonValue::get(VecTy), *It);
8291 return InsertCost + TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast,
8292 VecTy, ShuffleMask, CostKind,
8293 /*Index=*/0, /*SubTp=*/nullptr,
8294 /*Args=*/*It);
8295 }
8296 return GatherCost +
8297 (all_of(Gathers, IsaPred<UndefValue>)
8298 ? TTI::TCC_Free
8299 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
8300 ScalarTy));
8301 };
8302
8303 /// Compute the cost of creating a vector containing the extracted values from
8304 /// \p VL.
8305 InstructionCost
8306 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
8307 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8308 unsigned NumParts) {
8309 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
8310 unsigned NumElts =
8311 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
8312 auto *EE = dyn_cast<ExtractElementInst>(V);
8313 if (!EE)
8314 return Sz;
8315 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
8316 if (!VecTy)
8317 return Sz;
8318 return std::max(Sz, VecTy->getNumElements());
8319 });
8320 unsigned NumSrcRegs =
8321 TTI.getNumberOfParts(FixedVectorType::get(ScalarTy, NumElts));
8322 if (NumSrcRegs == 0)
8323 NumSrcRegs = 1;
8324 // FIXME: this must be moved to TTI for better estimation.
8325 unsigned EltsPerVector = PowerOf2Ceil(std::max(
8326 divideCeil(VL.size(), NumParts), divideCeil(NumElts, NumSrcRegs)));
8327 auto CheckPerRegistersShuffle =
8328 [&](MutableArrayRef<int> Mask) -> std::optional<TTI::ShuffleKind> {
8329 DenseSet<int> RegIndices;
8330 // Check if we are only permuting one or two source vectors.
8331 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
8332 int FirstRegId = -1;
8333 for (int &I : Mask) {
8334 if (I == PoisonMaskElem)
8335 continue;
8336 int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector;
8337 if (FirstRegId < 0)
8338 FirstRegId = RegId;
8339 RegIndices.insert(RegId);
8340 if (RegIndices.size() > 2)
8341 return std::nullopt;
8342 if (RegIndices.size() == 2)
8343 ShuffleKind = TTI::SK_PermuteTwoSrc;
8344 I = (I % NumElts) % EltsPerVector +
8345 (RegId == FirstRegId ? 0 : EltsPerVector);
8346 }
8347 return ShuffleKind;
8348 };
8349 InstructionCost Cost = 0;
8350
8351 // Process extracts in blocks of EltsPerVector to check if the source vector
8352 // operand can be re-used directly. If not, add the cost of creating a
8353 // shuffle to extract the values into a vector register.
8354 for (unsigned Part = 0; Part < NumParts; ++Part) {
8355 if (!ShuffleKinds[Part])
8356 continue;
8357 ArrayRef<int> MaskSlice =
8358 Mask.slice(Part * EltsPerVector,
8359 (Part == NumParts - 1 && Mask.size() % EltsPerVector != 0)
8360 ? Mask.size() % EltsPerVector
8361 : EltsPerVector);
8362 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
8363 copy(MaskSlice, SubMask.begin());
8364 std::optional<TTI::ShuffleKind> RegShuffleKind =
8365 CheckPerRegistersShuffle(SubMask);
8366 if (!RegShuffleKind) {
8367 Cost += ::getShuffleCost(TTI, *ShuffleKinds[Part],
8368 FixedVectorType::get(ScalarTy, NumElts),
8369 MaskSlice);
8370 continue;
8371 }
8372 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
8373 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
8374 Cost += ::getShuffleCost(TTI, *RegShuffleKind,
8375 FixedVectorType::get(ScalarTy, EltsPerVector),
8376 SubMask);
8377 }
8378 }
8379 return Cost;
8380 }
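  // Illustrative note, not from the original source: CheckPerRegistersShuffle
  // above re-bases each extract index from a flat position into (register,
  // element-within-register) form, for instance:
  // \code
  // // NumElts == 8 source elements split over NumParts == 2 registers of
  // // EltsPerVector == 4 elements each:
  // int I = 5;
  // int RegId = (I / 8) * 2 + (I % 8) / 4; // == 1, the second register.
  // int NewIdx = (I % 8) % 4;              // == 1, lane 1 inside it.
  // \endcode
  // Only masks that touch at most two such registers keep a cheap per-register
  // shuffle kind; anything wider falls back to the full-width shuffle cost.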
8381 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
8382 /// shuffle emission.
8383 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
8384 ArrayRef<int> Mask) {
8385 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8386 if (Mask[Idx] != PoisonMaskElem)
8387 CommonMask[Idx] = Idx;
8388 }
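  // Worked example, not from the original source: after the (virtual) shuffle
  // for CommonMask has been emitted, each defined lane already holds its final
  // element, so the mask collapses to the identity at those lanes:
  // \code
  // SmallVector<int> M = {2, PoisonMaskElem, 0, 1};
  // transformMaskAfterShuffle(M, M);
  // // M is now {0, PoisonMaskElem, 2, 3}.
  // \endcode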
8389 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
8390 /// mask \p Mask, register number \p Part, that includes \p SliceSize
8391 /// elements.
8392 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
8393 ArrayRef<int> Mask, unsigned Part,
8394 unsigned SliceSize) {
8395 if (SameNodesEstimated) {
8396 // Delay the cost estimation if the same nodes are reshuffling.
8397 // If we already requested the cost of reshuffling of E1 and E2 before, no
8398 // need to estimate another cost with the sub-Mask, instead include this
8399 // sub-Mask into the CommonMask to estimate it later and avoid double cost
8400 // estimation.
8401 if ((InVectors.size() == 2 &&
8402 InVectors.front().get<const TreeEntry *>() == &E1 &&
8403 InVectors.back().get<const TreeEntry *>() == E2) ||
8404 (!E2 && InVectors.front().get<const TreeEntry *>() == &E1)) {
8405 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize),
8406 [](int Idx) { return Idx == PoisonMaskElem; }) &&
8407 "Expected all poisoned elements.");
8408 ArrayRef<int> SubMask =
8409 ArrayRef(Mask).slice(Part * SliceSize, SliceSize);
8410 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
8411 return;
8412 }
8413 // Found non-matching nodes - need to estimate the cost for the matched
8414 // and transform mask.
8415 Cost += createShuffle(InVectors.front(),
8416 InVectors.size() == 1 ? nullptr : InVectors.back(),
8417 CommonMask);
8418 transformMaskAfterShuffle(CommonMask, CommonMask);
8419 }
8420 SameNodesEstimated = false;
8421 if (!E2 && InVectors.size() == 1) {
8422 unsigned VF = E1.getVectorFactor();
8423 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
8424 VF = std::max(VF,
8425 cast<FixedVectorType>(V1->getType())->getNumElements());
8426 } else {
8427 const auto *E = InVectors.front().get<const TreeEntry *>();
8428 VF = std::max(VF, E->getVectorFactor());
8429 }
8430 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8431 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8432 CommonMask[Idx] = Mask[Idx] + VF;
8433 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
8434 transformMaskAfterShuffle(CommonMask, CommonMask);
8435 } else {
8436 Cost += createShuffle(&E1, E2, Mask);
8437 transformMaskAfterShuffle(CommonMask, Mask);
8438 }
8439 }
8440
8441 class ShuffleCostBuilder {
8442 const TargetTransformInfo &TTI;
8443
8444 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
8445 int Index = -1;
8446 return Mask.empty() ||
8447 (VF == Mask.size() &&
8448 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
8449 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
8450 Index == 0);
8451 }
8452
8453 public:
8454 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
8455 ~ShuffleCostBuilder() = default;
8456 InstructionCost createShuffleVector(Value *V1, Value *,
8457 ArrayRef<int> Mask) const {
8458 // Empty mask or identity mask are free.
8459 unsigned VF =
8460 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8461 if (isEmptyOrIdentity(Mask, VF))
8462 return TTI::TCC_Free;
8463 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
8464 cast<VectorType>(V1->getType()), Mask);
8465 }
8466 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
8467 // Empty mask or identity mask are free.
8468 unsigned VF =
8469 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
8470 if (isEmptyOrIdentity(Mask, VF))
8471 return TTI::TCC_Free;
8472 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
8473 cast<VectorType>(V1->getType()), Mask);
8474 }
8475 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
8476 InstructionCost createPoison(Type *Ty, unsigned VF) const {
8477 return TTI::TCC_Free;
8478 }
8479 void resizeToMatch(Value *&, Value *&) const {}
8480 };
8481
8482 /// Smart shuffle instruction emission, walks through shuffles trees and
8483 /// tries to find the best matching vector for the actual shuffle
8484 /// instruction.
8485 InstructionCost
8486 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
8487 const PointerUnion<Value *, const TreeEntry *> &P2,
8488 ArrayRef<int> Mask) {
8489 ShuffleCostBuilder Builder(TTI);
8490 SmallVector<int> CommonMask(Mask.begin(), Mask.end());
8491 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
8492 unsigned CommonVF = Mask.size();
8493 InstructionCost ExtraCost = 0;
8494 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
8495 unsigned VF) -> InstructionCost {
8496 if (E.State == TreeEntry::NeedToGather && allConstant(E.Scalars))
8497 return TTI::TCC_Free;
8498 Type *EScalarTy = E.Scalars.front()->getType();
8499 bool IsSigned = true;
8500 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
8501 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
8502 IsSigned = It->second.second;
8503 }
8504 if (EScalarTy != ScalarTy) {
8505 unsigned CastOpcode = Instruction::Trunc;
8506 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8507 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8508 if (DstSz > SrcSz)
8509 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8510 return TTI.getCastInstrCost(CastOpcode,
8511 FixedVectorType::get(ScalarTy, VF),
8512 FixedVectorType::get(EScalarTy, VF),
8513 TTI::CastContextHint::None, CostKind);
8514 }
8515 return TTI::TCC_Free;
8516 };
8517 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
8518 if (isa<Constant>(V))
8519 return TTI::TCC_Free;
8520 auto *VecTy = cast<VectorType>(V->getType());
8521 Type *EScalarTy = VecTy->getElementType();
8522 if (EScalarTy != ScalarTy) {
8523 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
8524 unsigned CastOpcode = Instruction::Trunc;
8525 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
8526 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
8527 if (DstSz > SrcSz)
8528 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
8529 return TTI.getCastInstrCost(
8530 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
8531 VecTy, TTI::CastContextHint::None, CostKind);
8532 }
8533 return TTI::TCC_Free;
8534 };
8535 if (!V1 && !V2 && !P2.isNull()) {
8536 // Shuffle 2 entry nodes.
8537 const TreeEntry *E = P1.get<const TreeEntry *>();
8538 unsigned VF = E->getVectorFactor();
8539 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8540 CommonVF = std::max(VF, E2->getVectorFactor());
8541 assert(all_of(Mask,
8542 [=](int Idx) {
8543 return Idx < 2 * static_cast<int>(CommonVF);
8544 }) &&
8545 "All elements in mask must be less than 2 * CommonVF.");
8546 if (E->Scalars.size() == E2->Scalars.size()) {
8547 SmallVector<int> EMask = E->getCommonMask();
8548 SmallVector<int> E2Mask = E2->getCommonMask();
8549 if (!EMask.empty() || !E2Mask.empty()) {
8550 for (int &Idx : CommonMask) {
8551 if (Idx == PoisonMaskElem)
8552 continue;
8553 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
8554 Idx = EMask[Idx];
8555 else if (Idx >= static_cast<int>(CommonVF))
8556 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
8557 E->Scalars.size();
8558 }
8559 }
8560 CommonVF = E->Scalars.size();
8561 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
8562 GetNodeMinBWAffectedCost(*E2, CommonVF);
8563 } else {
8564 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
8565 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
8566 }
8567 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8568 V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
8569 } else if (!V1 && P2.isNull()) {
8570 // Shuffle single entry node.
8571 const TreeEntry *E = P1.get<const TreeEntry *>();
8572 unsigned VF = E->getVectorFactor();
8573 CommonVF = VF;
8574 assert(
8575 all_of(Mask,
8576 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8577 "All elements in mask must be less than CommonVF.");
8578 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
8579 SmallVector<int> EMask = E->getCommonMask();
8580 assert(!EMask.empty() && "Expected non-empty common mask.");
8581 for (int &Idx : CommonMask) {
8582 if (Idx != PoisonMaskElem)
8583 Idx = EMask[Idx];
8584 }
8585 CommonVF = E->Scalars.size();
8586 }
8587 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
8588 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8589 // Not identity/broadcast? Try to see if the original vector is better.
8590 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
8591 CommonVF == CommonMask.size() &&
8592 any_of(enumerate(CommonMask),
8593 [](const auto &&P) {
8594 return P.value() != PoisonMaskElem &&
8595 static_cast<unsigned>(P.value()) != P.index();
8596 }) &&
8597 any_of(CommonMask,
8598 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
8599 SmallVector<int> ReorderMask;
8600 inversePermutation(E->ReorderIndices, ReorderMask);
8601 ::addMask(CommonMask, ReorderMask);
8602 }
8603 } else if (V1 && P2.isNull()) {
8604 // Shuffle single vector.
8605 ExtraCost += GetValueMinBWAffectedCost(V1);
8606 CommonVF = cast<FixedVectorType>(V1->getType())->getNumElements();
8607 assert(
8608 all_of(Mask,
8609 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
8610 "All elements in mask must be less than CommonVF.");
8611 } else if (V1 && !V2) {
8612 // Shuffle vector and tree node.
8613 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8614 const TreeEntry *E2 = P2.get<const TreeEntry *>();
8615 CommonVF = std::max(VF, E2->getVectorFactor());
8616 assert(all_of(Mask,
8617 [=](int Idx) {
8618 return Idx < 2 * static_cast<int>(CommonVF);
8619 }) &&
8620 "All elements in mask must be less than 2 * CommonVF.");
8621 if (E2->Scalars.size() == VF && VF != CommonVF) {
8622 SmallVector<int> E2Mask = E2->getCommonMask();
8623 assert(!E2Mask.empty() && "Expected non-empty common mask.");
8624 for (int &Idx : CommonMask) {
8625 if (Idx == PoisonMaskElem)
8626 continue;
8627 if (Idx >= static_cast<int>(CommonVF))
8628 Idx = E2Mask[Idx - CommonVF] + VF;
8629 }
8630 CommonVF = VF;
8631 }
8632 ExtraCost += GetValueMinBWAffectedCost(V1);
8633 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8634 ExtraCost += GetNodeMinBWAffectedCost(
8635 *E2, std::min(CommonVF, E2->getVectorFactor()));
8636 V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
8637 } else if (!V1 && V2) {
8638 // Shuffle vector and tree node.
8639 unsigned VF = cast<FixedVectorType>(V2->getType())->getNumElements();
8640 const TreeEntry *E1 = P1.get<const TreeEntry *>();
8641 CommonVF = std::max(VF, E1->getVectorFactor());
8642 assert(all_of(Mask,
8643 [=](int Idx) {
8644 return Idx < 2 * static_cast<int>(CommonVF);
8645 }) &&
8646 "All elements in mask must be less than 2 * CommonVF.");
8647 if (E1->Scalars.size() == VF && VF != CommonVF) {
8648 SmallVector<int> E1Mask = E1->getCommonMask();
8649 assert(!E1Mask.empty() && "Expected non-empty common mask.");
8650 for (int &Idx : CommonMask) {
8651 if (Idx == PoisonMaskElem)
8652 continue;
8653 if (Idx >= static_cast<int>(CommonVF))
8654 Idx = E1Mask[Idx - CommonVF] + VF;
8655 else
8656 Idx = E1Mask[Idx];
8657 }
8658 CommonVF = VF;
8659 }
8660 ExtraCost += GetNodeMinBWAffectedCost(
8661 *E1, std::min(CommonVF, E1->getVectorFactor()));
8662 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8663 ExtraCost += GetValueMinBWAffectedCost(V2);
8664 V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
8665 } else {
8666 assert(V1 && V2 && "Expected both vectors.");
8667 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8668 CommonVF =
8669 std::max(VF, cast<FixedVectorType>(V2->getType())->getNumElements());
8670 assert(all_of(Mask,
8671 [=](int Idx) {
8672 return Idx < 2 * static_cast<int>(CommonVF);
8673 }) &&
8674 "All elements in mask must be less than 2 * CommonVF.");
8675 ExtraCost +=
8676 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
8677 if (V1->getType() != V2->getType()) {
8678 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8679 V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
8680 } else {
8681 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
8682 V1 = Constant::getNullValue(FixedVectorType::get(ScalarTy, CommonVF));
8683 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
8684 V2 = getAllOnesValue(*R.DL, FixedVectorType::get(ScalarTy, CommonVF));
8685 }
8686 }
8687 InVectors.front() = Constant::getNullValue(
8688 FixedVectorType::get(ScalarTy, CommonMask.size()));
8689 if (InVectors.size() == 2)
8690 InVectors.pop_back();
8691 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
8692 V1, V2, CommonMask, Builder);
8693 }
8694
8695public:
8696 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
8697 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
8698 SmallPtrSetImpl<Value *> &CheckedExtracts)
8699 : ScalarTy(ScalarTy), TTI(TTI),
8700 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
8701 CheckedExtracts(CheckedExtracts) {}
8702 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
8703 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
8704 unsigned NumParts, bool &UseVecBaseAsInput) {
8705 UseVecBaseAsInput = false;
8706 if (Mask.empty())
8707 return nullptr;
8708 Value *VecBase = nullptr;
8709 ArrayRef<Value *> VL = E->Scalars;
8710 // If the resulting type is scalarized, do not adjust the cost.
8711 if (NumParts == VL.size())
8712 return nullptr;
8713 // Check if it can be considered reused if same extractelements were
8714 // vectorized already.
8715 bool PrevNodeFound = any_of(
8716 ArrayRef(R.VectorizableTree).take_front(E->Idx),
8717 [&](const std::unique_ptr<TreeEntry> &TE) {
8718 return ((!TE->isAltShuffle() &&
8719 TE->getOpcode() == Instruction::ExtractElement) ||
8720 TE->State == TreeEntry::NeedToGather) &&
8721 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
8722 return VL.size() > Data.index() &&
8723 (Mask[Data.index()] == PoisonMaskElem ||
8724 isa<UndefValue>(VL[Data.index()]) ||
8725 Data.value() == VL[Data.index()]);
8726 });
8727 });
8728 SmallPtrSet<Value *, 4> UniqueBases;
8729 unsigned SliceSize = VL.size() / NumParts;
8730 for (unsigned Part = 0; Part < NumParts; ++Part) {
8731 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
8732 for (auto [I, V] : enumerate(VL.slice(Part * SliceSize, SliceSize))) {
8733 // Ignore non-extractelement scalars.
8734 if (isa<UndefValue>(V) ||
8735 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
8736 continue;
8737 // If all users of instruction are going to be vectorized and this
8738 // instruction itself is not going to be vectorized, consider this
8739 // instruction as dead and remove its cost from the final cost of the
8740 // vectorized tree.
8741 // Also, avoid adjusting the cost for extractelements with multiple uses
8742 // in different graph entries.
8743 auto *EE = cast<ExtractElementInst>(V);
8744 VecBase = EE->getVectorOperand();
8745 UniqueBases.insert(VecBase);
8746 const TreeEntry *VE = R.getTreeEntry(V);
8747 if (!CheckedExtracts.insert(V).second ||
8748 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
8749 any_of(EE->users(),
8750 [&](User *U) {
8751 return isa<GetElementPtrInst>(U) &&
8752 !R.areAllUsersVectorized(cast<Instruction>(U),
8753 &VectorizedVals);
8754 }) ||
8755 (VE && VE != E))
8756 continue;
8757 std::optional<unsigned> EEIdx = getExtractIndex(EE);
8758 if (!EEIdx)
8759 continue;
8760 unsigned Idx = *EEIdx;
8761 // Take credit for instruction that will become dead.
8762 if (EE->hasOneUse() || !PrevNodeFound) {
8763 Instruction *Ext = EE->user_back();
8764 if (isa<SExtInst, ZExtInst>(Ext) &&
8765 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
8766 // Use getExtractWithExtendCost() to calculate the cost of
8767 // extractelement/ext pair.
8768 Cost -=
8769 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
8770 EE->getVectorOperandType(), Idx);
8771 // Add back the cost of s|zext which is subtracted separately.
8772 Cost += TTI.getCastInstrCost(
8773 Ext->getOpcode(), Ext->getType(), EE->getType(),
8774 TTI::getCastContextHint(Ext), CostKind, Ext);
8775 continue;
8776 }
8777 }
8778 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
8779 CostKind, Idx);
8780 }
8781 }
8782 // Check that gather of extractelements can be represented as just a
8783 // shuffle of a single/two vectors the scalars are extracted from.
8784 // Found the bunch of extractelement instructions that must be gathered
8785 // into a vector and can be represented as a permutation elements in a
8786 // single input vector or of 2 input vectors.
8787 // Done for reused if same extractelements were vectorized already.
8788 if (!PrevNodeFound)
8789 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
8790 InVectors.assign(1, E);
8791 CommonMask.assign(Mask.begin(), Mask.end());
8792 transformMaskAfterShuffle(CommonMask, CommonMask);
8793 SameNodesEstimated = false;
8794 if (NumParts != 1 && UniqueBases.size() != 1) {
8795 UseVecBaseAsInput = true;
8796 VecBase = Constant::getNullValue(
8797 FixedVectorType::get(ScalarTy, CommonMask.size()));
8798 }
8799 return VecBase;
8800 }
8801 /// Checks if the specified entry \p E needs to be delayed because of its
8802 /// dependency nodes.
8803 std::optional<InstructionCost>
8804 needToDelay(const TreeEntry *,
8806 // No need to delay the cost estimation during analysis.
8807 return std::nullopt;
8808 }
8809 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
8810 if (&E1 == &E2) {
8811 assert(all_of(Mask,
8812 [&](int Idx) {
8813 return Idx < static_cast<int>(E1.getVectorFactor());
8814 }) &&
8815 "Expected single vector shuffle mask.");
8816 add(E1, Mask);
8817 return;
8818 }
8819 if (InVectors.empty()) {
8820 CommonMask.assign(Mask.begin(), Mask.end());
8821 InVectors.assign({&E1, &E2});
8822 return;
8823 }
8824 assert(!CommonMask.empty() && "Expected non-empty common mask.");
8825 auto *MaskVecTy = FixedVectorType::get(ScalarTy, Mask.size());
8826 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
8827 if (NumParts == 0 || NumParts >= Mask.size())
8828 NumParts = 1;
8829 unsigned SliceSize = Mask.size() / NumParts;
8830 const auto *It =
8831 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
8832 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8833 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
8834 }
8835 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
8836 if (InVectors.empty()) {
8837 CommonMask.assign(Mask.begin(), Mask.end());
8838 InVectors.assign(1, &E1);
8839 return;
8840 }
8841 assert(!CommonMask.empty() && "Expected non-empty common mask.");
8842 auto *MaskVecTy = FixedVectorType::get(ScalarTy, Mask.size());
8843 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
8844 if (NumParts == 0 || NumParts >= Mask.size())
8845 NumParts = 1;
8846 unsigned SliceSize = Mask.size() / NumParts;
8847 const auto *It =
8848 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
8849 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
8850 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
8851 if (!SameNodesEstimated && InVectors.size() == 1)
8852 InVectors.emplace_back(&E1);
8853 }
8854 /// Adds 2 input vectors and the mask for their shuffling.
8855 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
8856 // May come only for shuffling of 2 vectors with extractelements, already
8857 // handled in adjustExtracts.
8858 assert(InVectors.size() == 1 &&
8859 all_of(enumerate(CommonMask),
8860 [&](auto P) {
8861 if (P.value() == PoisonMaskElem)
8862 return Mask[P.index()] == PoisonMaskElem;
8863 auto *EI =
8864 cast<ExtractElementInst>(InVectors.front()
8865 .get<const TreeEntry *>()
8866 ->Scalars[P.index()]);
8867 return EI->getVectorOperand() == V1 ||
8868 EI->getVectorOperand() == V2;
8869 }) &&
8870 "Expected extractelement vectors.");
8871 }
8872 /// Adds another one input vector and the mask for the shuffling.
8873 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
8874 if (InVectors.empty()) {
8875 assert(CommonMask.empty() && !ForExtracts &&
8876 "Expected empty input mask/vectors.");
8877 CommonMask.assign(Mask.begin(), Mask.end());
8878 InVectors.assign(1, V1);
8879 return;
8880 }
8881 if (ForExtracts) {
8882 // No need to add vectors here, already handled them in adjustExtracts.
8883 assert(InVectors.size() == 1 &&
8884 InVectors.front().is<const TreeEntry *>() && !CommonMask.empty() &&
8885 all_of(enumerate(CommonMask),
8886 [&](auto P) {
8887 Value *Scalar = InVectors.front()
8888 .get<const TreeEntry *>()
8889 ->Scalars[P.index()];
8890 if (P.value() == PoisonMaskElem)
8891 return P.value() == Mask[P.index()] ||
8892 isa<UndefValue>(Scalar);
8893 if (isa<Constant>(V1))
8894 return true;
8895 auto *EI = cast<ExtractElementInst>(Scalar);
8896 return EI->getVectorOperand() == V1;
8897 }) &&
8898 "Expected only tree entry for extractelement vectors.");
8899 return;
8900 }
8901 assert(!InVectors.empty() && !CommonMask.empty() &&
8902 "Expected only tree entries from extracts/reused buildvectors.");
8903 unsigned VF = cast<FixedVectorType>(V1->getType())->getNumElements();
8904 if (InVectors.size() == 2) {
8905 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
8906 transformMaskAfterShuffle(CommonMask, CommonMask);
8907 VF = std::max<unsigned>(VF, CommonMask.size());
8908 } else if (const auto *InTE =
8909 InVectors.front().dyn_cast<const TreeEntry *>()) {
8910 VF = std::max(VF, InTE->getVectorFactor());
8911 } else {
8912 VF = std::max(
8913 VF, cast<FixedVectorType>(InVectors.front().get<Value *>()->getType())
8914 ->getNumElements());
8915 }
8916 InVectors.push_back(V1);
8917 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8918 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
8919 CommonMask[Idx] = Mask[Idx] + VF;
8920 }
8921 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
8922 Value *Root = nullptr) {
8923 Cost += getBuildVectorCost(VL, Root);
8924 if (!Root) {
8925 // FIXME: Need to find a way to avoid use of getNullValue here.
8926 SmallVector<Constant *> Vals;
8927 unsigned VF = VL.size();
8928 if (MaskVF != 0)
8929 VF = std::min(VF, MaskVF);
8930 for (Value *V : VL.take_front(VF)) {
8931 if (isa<UndefValue>(V)) {
8932 Vals.push_back(cast<Constant>(V));
8933 continue;
8934 }
8935 Vals.push_back(Constant::getNullValue(V->getType()));
8936 }
8937 return ConstantVector::get(Vals);
8938 }
8939 return ConstantVector::getSplat(
8940 ElementCount::getFixed(
8941 cast<FixedVectorType>(Root->getType())->getNumElements()),
8942 getAllOnesValue(*R.DL, ScalarTy));
8943 }
8945 /// Finalize emission of the shuffles.
8946 InstructionCost
8947 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
8948 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
8949 IsFinalized = true;
8950 if (Action) {
8951 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
8952 if (InVectors.size() == 2)
8953 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
8954 else
8955 Cost += createShuffle(Vec, nullptr, CommonMask);
8956 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
8957 if (CommonMask[Idx] != PoisonMaskElem)
8958 CommonMask[Idx] = Idx;
8959 assert(VF > 0 &&
8960 "Expected vector length for the final value before action.");
8961 Value *V = Vec.get<Value *>();
8962 Action(V, CommonMask);
8963 InVectors.front() = V;
8964 }
8965 ::addMask(CommonMask, ExtMask, /*ExtendingManyInputs=*/true);
8966 if (CommonMask.empty()) {
8967 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
8968 return Cost;
8969 }
8970 return Cost +
8971 createShuffle(InVectors.front(),
8972 InVectors.size() == 2 ? InVectors.back() : nullptr,
8973 CommonMask);
8974 }
8975
8976 ~ShuffleCostEstimator() {
8977 assert((IsFinalized || CommonMask.empty()) &&
8978 "Shuffle construction must be finalized.");
8979 }
8980};
8981
8982const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
8983 unsigned Idx) const {
8984 Value *Op = E->getOperand(Idx).front();
8985 if (const TreeEntry *TE = getTreeEntry(Op)) {
8986 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
8987 return EI.EdgeIdx == Idx && EI.UserTE == E;
8988 }) != TE->UserTreeIndices.end())
8989 return TE;
8990 auto MIt = MultiNodeScalars.find(Op);
8991 if (MIt != MultiNodeScalars.end()) {
8992 for (const TreeEntry *TE : MIt->second) {
8993 if (find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
8994 return EI.EdgeIdx == Idx && EI.UserTE == E;
8995 }) != TE->UserTreeIndices.end())
8996 return TE;
8997 }
8998 }
8999 }
9000 const auto *It =
9001 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9002 return TE->State == TreeEntry::NeedToGather &&
9003 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
9004 return EI.EdgeIdx == Idx && EI.UserTE == E;
9005 }) != TE->UserTreeIndices.end();
9006 });
9007 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
9008 return It->get();
9009}
9010
9011TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
9012 if (TE.State == TreeEntry::ScatterVectorize ||
9013 TE.State == TreeEntry::StridedVectorize)
9014 return TTI::CastContextHint::GatherScatter;
9015 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
9016 !TE.isAltShuffle()) {
9017 if (TE.ReorderIndices.empty())
9018 return TTI::CastContextHint::Normal;
9019 SmallVector<int> Mask;
9020 inversePermutation(TE.ReorderIndices, Mask);
9021 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
9022 return TTI::CastContextHint::Reversed;
9023 }
9024 return TTI::CastContextHint::None;
9025}
9026
9027/// Builds the argument types vector for the given call instruction with the
9028/// given \p ID for the specified vector factor.
9029static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
9030 const Intrinsic::ID ID,
9031 const unsigned VF,
9032 unsigned MinBW) {
9033 SmallVector<Type *> ArgTys;
9034 for (auto [Idx, Arg] : enumerate(CI->args())) {
9035 if (ID != Intrinsic::not_intrinsic) {
9036 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx)) {
9037 ArgTys.push_back(Arg->getType());
9038 continue;
9039 }
9040 if (MinBW > 0) {
9041 ArgTys.push_back(FixedVectorType::get(
9042 IntegerType::get(CI->getContext(), MinBW), VF));
9043 continue;
9044 }
9045 }
9046 ArgTys.push_back(FixedVectorType::get(Arg->getType(), VF));
9047 }
9048 return ArgTys;
9049}
9050
9051InstructionCost
9052BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
9053 SmallPtrSetImpl<Value *> &CheckedExtracts) {
9054 ArrayRef<Value *> VL = E->Scalars;
9055
9056 Type *ScalarTy = VL[0]->getType();
9057 if (E->State != TreeEntry::NeedToGather) {
9058 if (auto *SI = dyn_cast<StoreInst>(VL[0]))
9059 ScalarTy = SI->getValueOperand()->getType();
9060 else if (auto *CI = dyn_cast<CmpInst>(VL[0]))
9061 ScalarTy = CI->getOperand(0)->getType();
9062 else if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
9063 ScalarTy = IE->getOperand(1)->getType();
9064 }
9065  if (!isValidElementType(ScalarTy))
9066    return InstructionCost::getInvalid();
9067  auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
9068  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9069
9070 // If we have computed a smaller type for the expression, update VecTy so
9071 // that the costs will be accurate.
9072 auto It = MinBWs.find(E);
9073 Type *OrigScalarTy = ScalarTy;
9074 if (It != MinBWs.end()) {
9075 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
9076 VecTy = FixedVectorType::get(ScalarTy, VL.size());
9077 }
9078 unsigned EntryVF = E->getVectorFactor();
9079 auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF);
9080
9081 bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
9082 if (E->State == TreeEntry::NeedToGather) {
9083 if (allConstant(VL))
9084 return 0;
9085    if (isa<InsertElementInst>(VL[0]))
9086      return InstructionCost::getInvalid();
9087 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
9088 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
9089 }
9090  InstructionCost CommonCost = 0;
9091  SmallVector<int> Mask;
9092 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
9093 if (!E->ReorderIndices.empty() &&
9094 (E->State != TreeEntry::StridedVectorize || !IsReverseOrder)) {
9095 SmallVector<int> NewMask;
9096 if (E->getOpcode() == Instruction::Store) {
9097 // For stores the order is actually a mask.
9098 NewMask.resize(E->ReorderIndices.size());
9099 copy(E->ReorderIndices, NewMask.begin());
9100 } else {
9101 inversePermutation(E->ReorderIndices, NewMask);
9102 }
9103 ::addMask(Mask, NewMask);
9104 }
9105 if (NeedToShuffleReuses)
9106 ::addMask(Mask, E->ReuseShuffleIndices);
9107 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
9108 CommonCost =
9109 TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
9110 assert((E->State == TreeEntry::Vectorize ||
9111 E->State == TreeEntry::ScatterVectorize ||
9112 E->State == TreeEntry::StridedVectorize) &&
9113 "Unhandled state");
9114 assert(E->getOpcode() &&
9115 ((allSameType(VL) && allSameBlock(VL)) ||
9116 (E->getOpcode() == Instruction::GetElementPtr &&
9117 E->getMainOp()->getType()->isPointerTy())) &&
9118 "Invalid VL");
9119 Instruction *VL0 = E->getMainOp();
9120 unsigned ShuffleOrOp =
9121 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
9122 SetVector<Value *> UniqueValues(VL.begin(), VL.end());
9123 const unsigned Sz = UniqueValues.size();
9124 SmallBitVector UsedScalars(Sz, false);
9125 for (unsigned I = 0; I < Sz; ++I) {
9126 if (getTreeEntry(UniqueValues[I]) == E)
9127 continue;
9128 UsedScalars.set(I);
9129 }
9130 auto GetCastContextHint = [&](Value *V) {
9131 if (const TreeEntry *OpTE = getTreeEntry(V))
9132 return getCastContextHint(*OpTE);
9133 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
9134 if (SrcState.getOpcode() == Instruction::Load && !SrcState.isAltShuffle())
9135      return TTI::CastContextHint::GatherScatter;
9136    return TTI::CastContextHint::None;
9137  };
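  // GetCostDiff returns VectorCost(CommonCost) minus the summed per-lane
  // scalar costs, so a negative result means vectorizing this node pays off.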
9138 auto GetCostDiff =
9139      [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
9140          function_ref<InstructionCost(InstructionCost)> VectorCost) {
9141        // Calculate the cost of this instruction.
9142 InstructionCost ScalarCost = 0;
9143 if (isa<CastInst, CmpInst, SelectInst, CallInst>(VL0)) {
9144          // For some instructions there is no need to calculate the cost of
9145          // each particular instance; we can use the cost of a single
9146          // instruction times the total number of scalar instructions.
9147 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
9148 } else {
9149 for (unsigned I = 0; I < Sz; ++I) {
9150 if (UsedScalars.test(I))
9151 continue;
9152 ScalarCost += ScalarEltCost(I);
9153 }
9154 }
9155
9156 InstructionCost VecCost = VectorCost(CommonCost);
9157 // Check if the current node must be resized, if the parent node is not
9158 // resized.
9159 if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) {
9160 const EdgeInfo &EI = E->UserTreeIndices.front();
9161 if ((EI.UserTE->getOpcode() != Instruction::Select ||
9162 EI.EdgeIdx != 0) &&
9163 It != MinBWs.end()) {
9164 auto UserBWIt = MinBWs.find(EI.UserTE);
9165 Type *UserScalarTy =
9166 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
9167 if (UserBWIt != MinBWs.end())
9168 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
9169 UserBWIt->second.first);
9170 if (ScalarTy != UserScalarTy) {
9171 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9172 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
9173 unsigned VecOpcode;
9174 auto *UserVecTy =
9175 FixedVectorType::get(UserScalarTy, E->getVectorFactor());
9176 if (BWSz > SrcBWSz)
9177 VecOpcode = Instruction::Trunc;
9178 else
9179 VecOpcode =
9180 It->second.second ? Instruction::SExt : Instruction::ZExt;
9181 TTI::CastContextHint CCH = GetCastContextHint(VL0);
9182 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
9183 CostKind);
9184 }
9185 }
9186 }
9187 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
9188 ScalarCost, "Calculated costs for Tree"));
9189 return VecCost - ScalarCost;
9190 };
9191 // Calculate cost difference from vectorizing set of GEPs.
9192 // Negative value means vectorizing is profitable.
9193 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
9194 assert((E->State == TreeEntry::Vectorize ||
9195 E->State == TreeEntry::StridedVectorize) &&
9196 "Entry state expected to be Vectorize or StridedVectorize here.");
9197 InstructionCost ScalarCost = 0;
9198 InstructionCost VecCost = 0;
9199 std::tie(ScalarCost, VecCost) = getGEPCosts(
9200 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
9201 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
9202 "Calculated GEPs cost for Tree"));
9203
9204 return VecCost - ScalarCost;
9205 };
9206
9207 switch (ShuffleOrOp) {
9208 case Instruction::PHI: {
9209 // Count reused scalars.
9210      InstructionCost ScalarCost = 0;
9211      SmallPtrSet<const TreeEntry *, 4> CountedOps;
9212 for (Value *V : UniqueValues) {
9213 auto *PHI = dyn_cast<PHINode>(V);
9214 if (!PHI)
9215 continue;
9216
9217 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
9218 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
9219 Value *Op = PHI->getIncomingValue(I);
9220 Operands[I] = Op;
9221 }
9222 if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
9223 if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
9224 if (!OpTE->ReuseShuffleIndices.empty())
9225 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
9226 OpTE->Scalars.size());
9227 }
9228
9229 return CommonCost - ScalarCost;
9230 }
9231 case Instruction::ExtractValue:
9232 case Instruction::ExtractElement: {
9233 auto GetScalarCost = [&](unsigned Idx) {
9234 auto *I = cast<Instruction>(UniqueValues[Idx]);
9235 VectorType *SrcVecTy;
9236 if (ShuffleOrOp == Instruction::ExtractElement) {
9237 auto *EE = cast<ExtractElementInst>(I);
9238 SrcVecTy = EE->getVectorOperandType();
9239 } else {
9240 auto *EV = cast<ExtractValueInst>(I);
9241 Type *AggregateTy = EV->getAggregateOperand()->getType();
9242 unsigned NumElts;
9243 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
9244 NumElts = ATy->getNumElements();
9245 else
9246 NumElts = AggregateTy->getStructNumElements();
9247 SrcVecTy = FixedVectorType::get(OrigScalarTy, NumElts);
9248 }
9249 if (I->hasOneUse()) {
9250 Instruction *Ext = I->user_back();
9251 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
9252 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
9253          // Use getExtractWithExtendCost() to calculate the cost of
9254          // extractelement/ext pair.
9255          InstructionCost Cost = TTI->getExtractWithExtendCost(
9256              Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
9257          // Subtract the cost of s|zext which is subtracted separately.
9258          Cost -= TTI->getCastInstrCost(
9259              Ext->getOpcode(), Ext->getType(), I->getType(),
9260              TTI::getCastContextHint(Ext), CostKind, Ext);
9261          return Cost;
9262        }
9263      }
9264      return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
9265                                     CostKind, *getExtractIndex(I));
9266    };
9267 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
9268 return GetCostDiff(GetScalarCost, GetVectorCost);
9269 }
9270 case Instruction::InsertElement: {
9271 assert(E->ReuseShuffleIndices.empty() &&
9272 "Unique insertelements only are expected.");
9273 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
9274 unsigned const NumElts = SrcVecTy->getNumElements();
9275 unsigned const NumScalars = VL.size();
9276
9277 unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
9278
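      // Compute the range of insert positions [OffsetBeg, OffsetEnd] covered by
      // this buildvector and record each scalar's destination slot in InsertMask.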
9279 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
9280 unsigned OffsetBeg = *getInsertIndex(VL.front());
9281 unsigned OffsetEnd = OffsetBeg;
9282 InsertMask[OffsetBeg] = 0;
9283 for (auto [I, V] : enumerate(VL.drop_front())) {
9284 unsigned Idx = *getInsertIndex(V);
9285 if (OffsetBeg > Idx)
9286 OffsetBeg = Idx;
9287 else if (OffsetEnd < Idx)
9288 OffsetEnd = Idx;
9289 InsertMask[Idx] = I + 1;
9290 }
9291 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
9292 if (NumOfParts > 0)
9293 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
9294 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
9295 VecScalarsSz;
9296 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
9297 unsigned InsertVecSz = std::min<unsigned>(
9298 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
9299 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
9300 bool IsWholeSubvector =
9301 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
9302 // Check if we can safely insert a subvector. If it is not possible, just
9303 // generate a whole-sized vector and shuffle the source vector and the new
9304 // subvector.
9305 if (OffsetBeg + InsertVecSz > VecSz) {
9306 // Align OffsetBeg to generate correct mask.
9307 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
9308 InsertVecSz = VecSz;
9309 }
9310
9311 APInt DemandedElts = APInt::getZero(NumElts);
9312      // TODO: Add support for Instruction::InsertValue.
9313      SmallVector<int> Mask;
9314 if (!E->ReorderIndices.empty()) {
9315 inversePermutation(E->ReorderIndices, Mask);
9316 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
9317 } else {
9318 Mask.assign(VecSz, PoisonMaskElem);
9319 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
9320 }
9321 bool IsIdentity = true;
9322 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
9323 Mask.swap(PrevMask);
9324 for (unsigned I = 0; I < NumScalars; ++I) {
9325 unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]);
9326 DemandedElts.setBit(InsertIdx);
9327 IsIdentity &= InsertIdx - OffsetBeg == I;
9328 Mask[InsertIdx - OffsetBeg] = I;
9329 }
9330 assert(Offset < NumElts && "Failed to find vector index offset");
9331
9332      InstructionCost Cost = 0;
9333      Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
9334 /*Insert*/ true, /*Extract*/ false,
9335 CostKind);
9336
9337 // First cost - resize to actual vector size if not identity shuffle or
9338 // need to shift the vector.
9339 // Do not calculate the cost if the actual size is the register size and
9340 // we can merge this shuffle with the following SK_Select.
9341 auto *InsertVecTy = FixedVectorType::get(ScalarTy, InsertVecSz);
9342 if (!IsIdentity)
9343        Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
9344                                 InsertVecTy, Mask);
9345 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
9346 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
9347 }));
9348 // Second cost - permutation with subvector, if some elements are from the
9349 // initial vector or inserting a subvector.
9350 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
9351 // subvector of ActualVecTy.
9352 SmallBitVector InMask =
9353 isUndefVector(FirstInsert->getOperand(0),
9354 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
9355 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
9356 if (InsertVecSz != VecSz) {
9357 auto *ActualVecTy = FixedVectorType::get(ScalarTy, VecSz);
9358          Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy,
9359                                      std::nullopt, CostKind, OffsetBeg - Offset,
9360 InsertVecTy);
9361 } else {
9362 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
9363 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
9364 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
9365 I <= End; ++I)
9366 if (Mask[I] != PoisonMaskElem)
9367 Mask[I] = I + VecSz;
9368 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
9369 Mask[I] =
9370 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
9371 Cost +=
9372 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
9373 }
9374 }
9375 return Cost;
9376 }
9377 case Instruction::ZExt:
9378 case Instruction::SExt:
9379 case Instruction::FPToUI:
9380 case Instruction::FPToSI:
9381 case Instruction::FPExt:
9382 case Instruction::PtrToInt:
9383 case Instruction::IntToPtr:
9384 case Instruction::SIToFP:
9385 case Instruction::UIToFP:
9386 case Instruction::Trunc:
9387 case Instruction::FPTrunc:
9388 case Instruction::BitCast: {
9389 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9390 Type *SrcScalarTy = VL0->getOperand(0)->getType();
9391 auto *SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
9392 unsigned Opcode = ShuffleOrOp;
9393 unsigned VecOpcode = Opcode;
9394 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
9395 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
9396 // Check if the values are candidates to demote.
9397 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
9398 if (SrcIt != MinBWs.end()) {
9399 SrcBWSz = SrcIt->second.first;
9400 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
9401 SrcVecTy = FixedVectorType::get(SrcScalarTy, VL.size());
9402 }
9403 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9404 if (BWSz == SrcBWSz) {
9405 VecOpcode = Instruction::BitCast;
9406 } else if (BWSz < SrcBWSz) {
9407 VecOpcode = Instruction::Trunc;
9408 } else if (It != MinBWs.end()) {
9409 assert(BWSz > SrcBWSz && "Invalid cast!");
9410 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
9411 } else if (SrcIt != MinBWs.end()) {
9412 assert(BWSz > SrcBWSz && "Invalid cast!");
9413 VecOpcode =
9414 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
9415 }
9416 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
9417 !SrcIt->second.second) {
9418 VecOpcode = Instruction::UIToFP;
9419 }
9420 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
9421 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9422 return TTI->getCastInstrCost(Opcode, VL0->getType(),
9423                                   VL0->getOperand(0)->getType(),
9424                                   TTI::getCastContextHint(VI), CostKind, VI);
9425    };
9426 auto GetVectorCost = [=](InstructionCost CommonCost) {
9427 // Do not count cost here if minimum bitwidth is in effect and it is just
9428 // a bitcast (here it is just a noop).
9429 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
9430 return CommonCost;
9431 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
9432 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
9433 return CommonCost +
9434 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
9435 VecOpcode == Opcode ? VI : nullptr);
9436 };
9437 return GetCostDiff(GetScalarCost, GetVectorCost);
9438 }
9439 case Instruction::FCmp:
9440 case Instruction::ICmp:
9441 case Instruction::Select: {
9442 CmpInst::Predicate VecPred, SwappedVecPred;
9443 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
9444 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
9445 match(VL0, MatchCmp))
9446 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
9447    else
9448      SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
9449                                     ? CmpInst::BAD_FCMP_PREDICATE
9450                                     : CmpInst::BAD_ICMP_PREDICATE;
9451    auto GetScalarCost = [&](unsigned Idx) {
9452      auto *VI = cast<Instruction>(UniqueValues[Idx]);
9453      CmpInst::Predicate CurrentPred = ScalarTy->isFloatingPointTy()
9454                                           ? CmpInst::BAD_FCMP_PREDICATE
9455                                           : CmpInst::BAD_ICMP_PREDICATE;
9456      auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
9457      if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
9458           !match(VI, MatchCmp)) ||
9459          (CurrentPred != VecPred && CurrentPred != SwappedVecPred))
9460        VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
9461                                       ? CmpInst::BAD_FCMP_PREDICATE
9462                                       : CmpInst::BAD_ICMP_PREDICATE;
9463
9464 return TTI->getCmpSelInstrCost(E->getOpcode(), OrigScalarTy,
9465 Builder.getInt1Ty(), CurrentPred, CostKind,
9466 VI);
9467 };
9468 auto GetVectorCost = [&](InstructionCost CommonCost) {
9469 auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
9470
9471      InstructionCost VecCost = TTI->getCmpSelInstrCost(
9472          E->getOpcode(), VecTy, MaskTy, VecPred, CostKind, VL0);
9473 // Check if it is possible and profitable to use min/max for selects
9474 // in VL.
9475 //
9476 auto IntrinsicAndUse = canConvertToMinOrMaxIntrinsic(VL);
9477 if (IntrinsicAndUse.first != Intrinsic::not_intrinsic) {
9478 IntrinsicCostAttributes CostAttrs(IntrinsicAndUse.first, VecTy,
9479 {VecTy, VecTy});
9480 InstructionCost IntrinsicCost =
9481 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9482 // If the selects are the only uses of the compares, they will be
9483 // dead and we can adjust the cost by removing their cost.
9484 if (IntrinsicAndUse.second)
9485 IntrinsicCost -= TTI->getCmpSelInstrCost(Instruction::ICmp, VecTy,
9486 MaskTy, VecPred, CostKind);
9487 VecCost = std::min(VecCost, IntrinsicCost);
9488 }
9489 return VecCost + CommonCost;
9490 };
9491 return GetCostDiff(GetScalarCost, GetVectorCost);
9492 }
9493 case Instruction::FNeg:
9494 case Instruction::Add:
9495 case Instruction::FAdd:
9496 case Instruction::Sub:
9497 case Instruction::FSub:
9498 case Instruction::Mul:
9499 case Instruction::FMul:
9500 case Instruction::UDiv:
9501 case Instruction::SDiv:
9502 case Instruction::FDiv:
9503 case Instruction::URem:
9504 case Instruction::SRem:
9505 case Instruction::FRem:
9506 case Instruction::Shl:
9507 case Instruction::LShr:
9508 case Instruction::AShr:
9509 case Instruction::And:
9510 case Instruction::Or:
9511 case Instruction::Xor: {
9512 auto GetScalarCost = [&](unsigned Idx) {
9513 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9514 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
9515 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
9516 TTI::OperandValueInfo Op2Info =
9517 TTI::getOperandInfo(VI->getOperand(OpIdx));
9518 SmallVector<const Value *> Operands(VI->operand_values());
9519 return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
9520 Op1Info, Op2Info, Operands, VI);
9521 };
9522 auto GetVectorCost = [=](InstructionCost CommonCost) {
9523 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
9524 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
9525 ArrayRef<Value *> Ops = E->getOperand(I);
9526 if (all_of(Ops, [&](Value *Op) {
9527 auto *CI = dyn_cast<ConstantInt>(Op);
9528 return CI && CI->getValue().countr_one() >= It->second.first;
9529 }))
9530 return CommonCost;
9531 }
9532 }
9533 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
9534 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
9535 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
9536 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
9537 Op2Info, std::nullopt, nullptr, TLI) +
9538 CommonCost;
9539 };
9540 return GetCostDiff(GetScalarCost, GetVectorCost);
9541 }
9542 case Instruction::GetElementPtr: {
9543 return CommonCost + GetGEPCostDiff(VL, VL0);
9544 }
9545 case Instruction::Load: {
9546 auto GetScalarCost = [&](unsigned Idx) {
9547 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
9548 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
9549 VI->getAlign(), VI->getPointerAddressSpace(),
9550                                  CostKind, TTI::OperandValueInfo(), VI);
9551    };
9552 auto *LI0 = cast<LoadInst>(VL0);
9553 auto GetVectorCost = [&](InstructionCost CommonCost) {
9554 InstructionCost VecLdCost;
9555 if (E->State == TreeEntry::Vectorize) {
9556 VecLdCost = TTI->getMemoryOpCost(
9557 Instruction::Load, VecTy, LI0->getAlign(),
9558 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
9559 } else if (E->State == TreeEntry::StridedVectorize) {
9560 Align CommonAlignment =
9561 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9562 VecLdCost = TTI->getStridedMemoryOpCost(
9563 Instruction::Load, VecTy, LI0->getPointerOperand(),
9564 /*VariableMask=*/false, CommonAlignment, CostKind);
9565 } else {
9566 assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState");
9567 Align CommonAlignment =
9568 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
9569 VecLdCost = TTI->getGatherScatterOpCost(
9570 Instruction::Load, VecTy, LI0->getPointerOperand(),
9571 /*VariableMask=*/false, CommonAlignment, CostKind);
9572 }
9573 return VecLdCost + CommonCost;
9574 };
9575
9576 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
9577    // If this node generates a masked gather load, then it is not a terminal
9578    // node; hence the address operand cost is estimated separately.
9579 if (E->State == TreeEntry::ScatterVectorize)
9580 return Cost;
9581
9582 // Estimate cost of GEPs since this tree node is a terminator.
9583 SmallVector<Value *> PointerOps(VL.size());
9584 for (auto [I, V] : enumerate(VL))
9585 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
9586 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
9587 }
9588 case Instruction::Store: {
9589 bool IsReorder = !E->ReorderIndices.empty();
9590 auto GetScalarCost = [=](unsigned Idx) {
9591 auto *VI = cast<StoreInst>(VL[Idx]);
9592 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
9593 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
9594 VI->getAlign(), VI->getPointerAddressSpace(),
9595 CostKind, OpInfo, VI);
9596 };
9597 auto *BaseSI =
9598 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
9599 auto GetVectorCost = [=](InstructionCost CommonCost) {
9600 // We know that we can merge the stores. Calculate the cost.
9601 InstructionCost VecStCost;
9602 if (E->State == TreeEntry::StridedVectorize) {
9603 Align CommonAlignment =
9604 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
9605 VecStCost = TTI->getStridedMemoryOpCost(
9606 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
9607 /*VariableMask=*/false, CommonAlignment, CostKind);
9608 } else {
9609 assert(E->State == TreeEntry::Vectorize &&
9610 "Expected either strided or consecutive stores.");
9611 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
9612 VecStCost = TTI->getMemoryOpCost(
9613 Instruction::Store, VecTy, BaseSI->getAlign(),
9614 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
9615 }
9616 return VecStCost + CommonCost;
9617 };
9618 SmallVector<Value *> PointerOps(VL.size());
9619 for (auto [I, V] : enumerate(VL)) {
9620 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
9621 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
9622 }
9623
9624 return GetCostDiff(GetScalarCost, GetVectorCost) +
9625 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
9626 }
9627 case Instruction::Call: {
9628 auto GetScalarCost = [&](unsigned Idx) {
9629      auto *CI = cast<CallInst>(UniqueValues[Idx]);
9630      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9631      if (ID != Intrinsic::not_intrinsic) {
9632        IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
9633        return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
9634      }
9635      return TTI->getCallInstrCost(CI->getCalledFunction(),
9636                                   CI->getFunctionType()->getReturnType(),
9637                                   CI->getFunctionType()->params(), CostKind);
9638 };
9639 auto GetVectorCost = [=](InstructionCost CommonCost) {
9640 auto *CI = cast<CallInst>(VL0);
9641      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9642      SmallVector<Type *> ArgTys =
9643          buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
9644                                 It != MinBWs.end() ? It->second.first : 0);
9645 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
9646 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
9647 };
9648 return GetCostDiff(GetScalarCost, GetVectorCost);
9649 }
9650 case Instruction::ShuffleVector: {
9651 assert(E->isAltShuffle() &&
9652 ((Instruction::isBinaryOp(E->getOpcode()) &&
9653 Instruction::isBinaryOp(E->getAltOpcode())) ||
9654 (Instruction::isCast(E->getOpcode()) &&
9655 Instruction::isCast(E->getAltOpcode())) ||
9656 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
9657 "Invalid Shuffle Vector Operand");
9658 // Try to find the previous shuffle node with the same operands and same
9659 // main/alternate ops.
9660 auto TryFindNodeWithEqualOperands = [=]() {
9661 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
9662 if (TE.get() == E)
9663 break;
9664 if (TE->isAltShuffle() &&
9665 ((TE->getOpcode() == E->getOpcode() &&
9666 TE->getAltOpcode() == E->getAltOpcode()) ||
9667 (TE->getOpcode() == E->getAltOpcode() &&
9668 TE->getAltOpcode() == E->getOpcode())) &&
9669 TE->hasEqualOperands(*E))
9670 return true;
9671 }
9672 return false;
9673 };
9674 auto GetScalarCost = [&](unsigned Idx) {
9675 auto *VI = cast<Instruction>(UniqueValues[Idx]);
9676 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
9677 (void)E;
9678 return TTI->getInstructionCost(VI, CostKind);
9679 };
9680 // Need to clear CommonCost since the final shuffle cost is included into
9681 // vector cost.
9682 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
9683 // VecCost is equal to sum of the cost of creating 2 vectors
9684 // and the cost of creating shuffle.
9685 InstructionCost VecCost = 0;
9686 if (TryFindNodeWithEqualOperands()) {
9687 LLVM_DEBUG({
9688 dbgs() << "SLP: diamond match for alternate node found.\n";
9689 E->dump();
9690 });
9691 // No need to add new vector costs here since we're going to reuse
9692 // same main/alternate vector ops, just do different shuffling.
9693 } else if (Instruction::isBinaryOp(E->getOpcode())) {
9694 VecCost =
9695 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
9696 VecCost +=
9697 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
9698 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
9699 auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
9700 VecCost = TTIRef.getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
9701 CI0->getPredicate(), CostKind, VL0);
9702 VecCost += TTIRef.getCmpSelInstrCost(
9703 E->getOpcode(), VecTy, MaskTy,
9704 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
9705 E->getAltOp());
9706 } else {
9707 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
9708 auto *SrcTy = FixedVectorType::get(SrcSclTy, VL.size());
9709 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
9710 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
9711 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
9712 unsigned SrcBWSz =
9713 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
9714 if (SrcIt != MinBWs.end()) {
9715 SrcBWSz = SrcIt->second.first;
9716 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
9717 SrcTy = FixedVectorType::get(SrcSclTy, VL.size());
9718 }
9719 if (BWSz <= SrcBWSz) {
9720 if (BWSz < SrcBWSz)
9721 VecCost =
9722                    TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
9723                                            TTI::CastContextHint::None, CostKind);
9724            LLVM_DEBUG({
9725 dbgs()
9726 << "SLP: alternate extension, which should be truncated.\n";
9727 E->dump();
9728 });
9729 return VecCost;
9730 }
9731 }
9732        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
9733                                          TTI::CastContextHint::None, CostKind);
9734        VecCost +=
9735            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
9736                                    TTI::CastContextHint::None, CostKind);
9737      }
9738      SmallVector<int> Mask;
9739      E->buildAltOpShuffleMask(
9740 [E](Instruction *I) {
9741 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
9742 return I->getOpcode() == E->getAltOpcode();
9743 },
9744 Mask);
9745      VecCost += ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc,
9746                                  FinalVecTy, Mask);
9747 // Patterns like [fadd,fsub] can be combined into a single instruction
9748 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
9749 // need to take into account their order when looking for the most used
9750 // order.
9751 unsigned Opcode0 = E->getOpcode();
9752 unsigned Opcode1 = E->getAltOpcode();
9753 // The opcode mask selects between the two opcodes.
9754 SmallBitVector OpcodeMask(E->Scalars.size(), false);
9755 for (unsigned Lane : seq<unsigned>(0, E->Scalars.size()))
9756 if (cast<Instruction>(E->Scalars[Lane])->getOpcode() == Opcode1)
9757 OpcodeMask.set(Lane);
9758 // If this pattern is supported by the target then we consider the
9759 // order.
9760 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
9761 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
9762 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
9763 return AltVecCost < VecCost ? AltVecCost : VecCost;
9764 }
9765 // TODO: Check the reverse order too.
9766 return VecCost;
9767 };
9768 return GetCostDiff(GetScalarCost, GetVectorCost);
9769 }
9770 default:
9771 llvm_unreachable("Unknown instruction");
9772 }
9773}
9774
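// A "tiny" tree (one or two entries) counts as fully vectorizable only when
// its gather operands are cheap: constants, splats, small bundles, gathered
// loads, or extractelements that already form a fixed-vector shuffle.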
9775bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
9776 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
9777 << VectorizableTree.size() << " is fully vectorizable .\n");
9778
9779 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
9780    SmallVector<int> Mask;
9781    return TE->State == TreeEntry::NeedToGather &&
9782 !any_of(TE->Scalars,
9783 [this](Value *V) { return EphValues.contains(V); }) &&
9784 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
9785 TE->Scalars.size() < Limit ||
9786 ((TE->getOpcode() == Instruction::ExtractElement ||
9787 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
9788 isFixedVectorShuffle(TE->Scalars, Mask)) ||
9789 (TE->State == TreeEntry::NeedToGather &&
9790 TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()));
9791 };
9792
9793 // We only handle trees of heights 1 and 2.
9794 if (VectorizableTree.size() == 1 &&
9795 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
9796 (ForReduction &&
9797 AreVectorizableGathers(VectorizableTree[0].get(),
9798 VectorizableTree[0]->Scalars.size()) &&
9799 VectorizableTree[0]->getVectorFactor() > 2)))
9800 return true;
9801
9802 if (VectorizableTree.size() != 2)
9803 return false;
9804
9805 // Handle splat and all-constants stores. Also try to vectorize tiny trees
9806  // with the second gather nodes if they have fewer scalar operands than the
9807  // initial tree element (it may be profitable to shuffle the second gather)
9808  // or they are extractelements, which form a shuffle.
9809  SmallVector<int> Mask;
9810  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
9811 AreVectorizableGathers(VectorizableTree[1].get(),
9812 VectorizableTree[0]->Scalars.size()))
9813 return true;
9814
9815 // Gathering cost would be too much for tiny trees.
9816 if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
9817 (VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9818 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
9819 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
9820 return false;
9821
9822 return true;
9823}
9824
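// Returns true if \p Root heads an or/shl-by-multiple-of-8 chain over a
// zero-extended load whose combined width is a legal integer type, i.e. a
// pattern the backend is expected to fold into a single wide load.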
9825static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
9826                                       TargetTransformInfo *TTI,
9827                                       bool MustMatchOrInst) {
9828 // Look past the root to find a source value. Arbitrarily follow the
9829 // path through operand 0 of any 'or'. Also, peek through optional
9830 // shift-left-by-multiple-of-8-bits.
9831 Value *ZextLoad = Root;
9832 const APInt *ShAmtC;
9833 bool FoundOr = false;
9834 while (!isa<ConstantExpr>(ZextLoad) &&
9835 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
9836 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
9837 ShAmtC->urem(8) == 0))) {
9838 auto *BinOp = cast<BinaryOperator>(ZextLoad);
9839 ZextLoad = BinOp->getOperand(0);
9840 if (BinOp->getOpcode() == Instruction::Or)
9841 FoundOr = true;
9842 }
9843 // Check if the input is an extended load of the required or/shift expression.
9844 Value *Load;
9845 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
9846 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
9847 return false;
9848
9849 // Require that the total load bit width is a legal integer type.
9850 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
9851 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
9852 Type *SrcTy = Load->getType();
9853 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
9854 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
9855 return false;
9856
9857 // Everything matched - assume that we can fold the whole sequence using
9858 // load combining.
9859 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
9860 << *(cast<Instruction>(Root)) << "\n");
9861
9862 return true;
9863}
9864
9865bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
9866  if (RdxKind != RecurKind::Or)
9867 return false;
9868
9869 unsigned NumElts = VectorizableTree[0]->Scalars.size();
9870 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
9871 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
9872 /* MatchOr */ false);
9873}
9874
9875bool BoUpSLP::isLoadCombineCandidate(ArrayRef<Value *> Stores) const {
9876  // Peek through a final sequence of stores and check if all operations are
9877 // likely to be load-combined.
9878 unsigned NumElts = Stores.size();
9879 for (Value *Scalar : Stores) {
9880 Value *X;
9881 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
9882 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
9883 return false;
9884 }
9885 return true;
9886}
9887
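// Decides whether vectorization of the current tree should be abandoned
// because the tree is too small to pay for itself: gather-fed insert trees,
// PHI-plus-gather graphs, and sub-MinTreeSize trees that cannot be proven
// fully vectorizable.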
9888bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
9889 // No need to vectorize inserts of gathered values.
9890 if (VectorizableTree.size() == 2 &&
9891 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
9892 VectorizableTree[1]->State == TreeEntry::NeedToGather &&
9893 (VectorizableTree[1]->getVectorFactor() <= 2 ||
9894 !(isSplat(VectorizableTree[1]->Scalars) ||
9895 allConstant(VectorizableTree[1]->Scalars))))
9896 return true;
9897
9898  // If the graph includes only PHI nodes and gathers, it is definitely not
9899 // profitable for the vectorization, we can skip it, if the cost threshold is
9900 // default. The cost of vectorized PHI nodes is almost always 0 + the cost of
9901 // gathers/buildvectors.
9902 constexpr int Limit = 4;
9903 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
9904 !VectorizableTree.empty() &&
9905 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9906 return (TE->State == TreeEntry::NeedToGather &&
9907 TE->getOpcode() != Instruction::ExtractElement &&
9908 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
9909 TE->getOpcode() == Instruction::PHI;
9910 }))
9911 return true;
9912
9913 // We can vectorize the tree if its size is greater than or equal to the
9914 // minimum size specified by the MinTreeSize command line option.
9915 if (VectorizableTree.size() >= MinTreeSize)
9916 return false;
9917
9918 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
9919 // can vectorize it if we can prove it fully vectorizable.
9920 if (isFullyVectorizableTinyTree(ForReduction))
9921 return false;
9922
9923 // Check if any of the gather node forms an insertelement buildvector
9924 // somewhere.
9925 bool IsAllowedSingleBVNode =
9926 VectorizableTree.size() > 1 ||
9927 (VectorizableTree.size() == 1 && VectorizableTree.front()->getOpcode() &&
9928 !VectorizableTree.front()->isAltShuffle() &&
9929 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
9930 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
9931 allSameBlock(VectorizableTree.front()->Scalars));
9932 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
9933 return TE->State == TreeEntry::NeedToGather &&
9934 all_of(TE->Scalars, [&](Value *V) {
9935 return isa<ExtractElementInst, UndefValue>(V) ||
9936 (IsAllowedSingleBVNode &&
9937 !V->hasNUsesOrMore(UsesLimit) &&
9938 any_of(V->users(), IsaPred<InsertElementInst>));
9939 });
9940 }))
9941 return false;
9942
9943 assert(VectorizableTree.empty()
9944 ? ExternalUses.empty()
9945 : true && "We shouldn't have any external users");
9946
9947 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
9948 // vectorizable.
9949 return true;
9950}
9951
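// Approximates the spill/fill cost of keeping vectorized values live across
// calls: tree scalars are visited in dominance order and every call between
// two of them is charged getCostOfKeepingLiveOverCall for the values live at
// that point.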
9952InstructionCost BoUpSLP::getSpillCost() const {
9953  // Walk from the bottom of the tree to the top, tracking which values are
9954 // live. When we see a call instruction that is not part of our tree,
9955 // query TTI to see if there is a cost to keeping values live over it
9956 // (for example, if spills and fills are required).
9957 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
9958  InstructionCost Cost = 0;
9959
9960  SmallPtrSet<Instruction *, 4> LiveValues;
9961  Instruction *PrevInst = nullptr;
9962
9963 // The entries in VectorizableTree are not necessarily ordered by their
9964 // position in basic blocks. Collect them and order them by dominance so later
9965 // instructions are guaranteed to be visited first. For instructions in
9966 // different basic blocks, we only scan to the beginning of the block, so
9967 // their order does not matter, as long as all instructions in a basic block
9968 // are grouped together. Using dominance ensures a deterministic order.
9969 SmallVector<Instruction *, 16> OrderedScalars;
9970 for (const auto &TEPtr : VectorizableTree) {
9971 if (TEPtr->State != TreeEntry::Vectorize)
9972 continue;
9973 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
9974 if (!Inst)
9975 continue;
9976 OrderedScalars.push_back(Inst);
9977 }
9978 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
9979 auto *NodeA = DT->getNode(A->getParent());
9980 auto *NodeB = DT->getNode(B->getParent());
9981 assert(NodeA && "Should only process reachable instructions");
9982 assert(NodeB && "Should only process reachable instructions");
9983 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
9984 "Different nodes should have different DFS numbers");
9985 if (NodeA != NodeB)
9986 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
9987 return B->comesBefore(A);
9988 });
9989
9990 for (Instruction *Inst : OrderedScalars) {
9991 if (!PrevInst) {
9992 PrevInst = Inst;
9993 continue;
9994 }
9995
9996 // Update LiveValues.
9997 LiveValues.erase(PrevInst);
9998 for (auto &J : PrevInst->operands()) {
9999 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
10000 LiveValues.insert(cast<Instruction>(&*J));
10001 }
10002
10003 LLVM_DEBUG({
10004 dbgs() << "SLP: #LV: " << LiveValues.size();
10005 for (auto *X : LiveValues)
10006 dbgs() << " " << X->getName();
10007 dbgs() << ", Looking at ";
10008 Inst->dump();
10009 });
10010
10011 // Now find the sequence of instructions between PrevInst and Inst.
10012 unsigned NumCalls = 0;
10013 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
10014 PrevInstIt =
10015 PrevInst->getIterator().getReverse();
10016 while (InstIt != PrevInstIt) {
10017 if (PrevInstIt == PrevInst->getParent()->rend()) {
10018 PrevInstIt = Inst->getParent()->rbegin();
10019 continue;
10020 }
10021
10022 auto NoCallIntrinsic = [this](Instruction *I) {
10023 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
10024 if (II->isAssumeLikeIntrinsic())
10025 return true;
10026 FastMathFlags FMF;
10027          SmallVector<Type *, 4> Tys;
10028          for (auto &ArgOp : II->args())
10029 Tys.push_back(ArgOp->getType());
10030 if (auto *FPMO = dyn_cast<FPMathOperator>(II))
10031 FMF = FPMO->getFastMathFlags();
10032 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
10033 FMF);
10034 InstructionCost IntrCost =
10035              TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);
10036          InstructionCost CallCost = TTI->getCallInstrCost(
10037              nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
10038 if (IntrCost < CallCost)
10039 return true;
10040 }
10041 return false;
10042 };
10043
10044 // Debug information does not impact spill cost.
10045 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
10046 &*PrevInstIt != PrevInst)
10047 NumCalls++;
10048
10049 ++PrevInstIt;
10050 }
10051
10052 if (NumCalls) {
10053      SmallVector<Type *, 4> V;
10054      for (auto *II : LiveValues) {
10055 auto *ScalarTy = II->getType();
10056 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
10057 ScalarTy = VectorTy->getElementType();
10058 V.push_back(FixedVectorType::get(ScalarTy, BundleWidth));
10059 }
10060 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
10061 }
10062
10063 PrevInst = Inst;
10064 }
10065
10066 return Cost;
10067}
10068
10069/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
10070/// the buildvector sequence.
10071static bool isFirstInsertElement(const InsertElementInst *IE1,
10072                                 const InsertElementInst *IE2) {
10073 if (IE1 == IE2)
10074 return false;
10075 const auto *I1 = IE1;
10076 const auto *I2 = IE2;
10077 const InsertElementInst *PrevI1;
10078 const InsertElementInst *PrevI2;
10079 unsigned Idx1 = *getInsertIndex(IE1);
10080 unsigned Idx2 = *getInsertIndex(IE2);
10081 do {
10082 if (I2 == IE1)
10083 return true;
10084 if (I1 == IE2)
10085 return false;
10086 PrevI1 = I1;
10087 PrevI2 = I2;
10088 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
10089 getInsertIndex(I1).value_or(Idx2) != Idx2)
10090 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
10091 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
10092 getInsertIndex(I2).value_or(Idx1) != Idx1)
10093 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
10094 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
10095 llvm_unreachable("Two different buildvectors not expected.");
10096}
10097
10098namespace {
10099/// Returns incoming Value *, if the requested type is Value * too, or a default
10100/// value, otherwise.
10101struct ValueSelect {
10102 template <typename U>
10103 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
10104 return V;
10105 }
10106 template <typename U>
10107 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
10108 return U();
10109 }
10110};
10111} // namespace
10112
10113/// Does the analysis of the provided shuffle masks and performs the requested
10114/// actions on the vectors with the given shuffle masks. It tries to do it in
10115/// several steps.
10116/// 1. If the Base vector is not an undef vector, resize the very first mask to
10117/// the common VF and perform the action for 2 input vectors (including the
10118/// non-undef Base). Other shuffle masks are combined with the result of the
10119/// first stage and processed as a shuffle of 2 vectors.
10120/// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
10121/// the action only for 1 vector with the given mask, if it is not the identity
10122/// mask.
10123/// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
10124/// vectors, combining the masks properly between the steps.
10125template <typename T>
10126static T *performExtractsShuffleAction(
10127    MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
10128    function_ref<unsigned(T *)> GetVF,
10129    function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
10130    function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
10131 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
10132 SmallVector<int> Mask(ShuffleMask.begin()->second);
10133 auto VMIt = std::next(ShuffleMask.begin());
10134 T *Prev = nullptr;
10135 SmallBitVector UseMask =
10136 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
10137 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
10138 if (!IsBaseUndef.all()) {
10139 // Base is not undef, need to combine it with the next subvectors.
10140 std::pair<T *, bool> Res =
10141 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
10142 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
10143 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
10144 if (Mask[Idx] == PoisonMaskElem)
10145 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
10146 else
10147 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
10148 }
10149 auto *V = ValueSelect::get<T *>(Base);
10150 (void)V;
10151 assert((!V || GetVF(V) == Mask.size()) &&
10152 "Expected base vector of VF number of elements.");
10153 Prev = Action(Mask, {nullptr, Res.first});
10154 } else if (ShuffleMask.size() == 1) {
10155 // Base is undef and only 1 vector is shuffled - perform the action only for
10156 // single vector, if the mask is not the identity mask.
10157 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
10158 /*ForSingleMask=*/true);
10159 if (Res.second)
10160 // Identity mask is found.
10161 Prev = Res.first;
10162 else
10163 Prev = Action(Mask, {ShuffleMask.begin()->first});
10164 } else {
10165 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
10166 // shuffles step by step, combining shuffle between the steps.
10167 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
10168 unsigned Vec2VF = GetVF(VMIt->first);
10169 if (Vec1VF == Vec2VF) {
10170 // No need to resize the input vectors since they are of the same size, we
10171 // can shuffle them directly.
10172 ArrayRef<int> SecMask = VMIt->second;
10173 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10174 if (SecMask[I] != PoisonMaskElem) {
10175 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10176 Mask[I] = SecMask[I] + Vec1VF;
10177 }
10178 }
10179 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
10180 } else {
10181 // Vectors of different sizes - resize and reshuffle.
10182 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
10183 /*ForSingleMask=*/false);
10184 std::pair<T *, bool> Res2 =
10185 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
10186 ArrayRef<int> SecMask = VMIt->second;
10187 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10188 if (Mask[I] != PoisonMaskElem) {
10189 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10190 if (Res1.second)
10191 Mask[I] = I;
10192 } else if (SecMask[I] != PoisonMaskElem) {
10193 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
10194 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
10195 }
10196 }
10197 Prev = Action(Mask, {Res1.first, Res2.first});
10198 }
10199 VMIt = std::next(VMIt);
10200 }
10201 bool IsBaseNotUndef = !IsBaseUndef.all();
10202 (void)IsBaseNotUndef;
10203 // Perform requested actions for the remaining masks/vectors.
10204 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
10205 // Shuffle other input vectors, if any.
10206 std::pair<T *, bool> Res =
10207 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
10208 ArrayRef<int> SecMask = VMIt->second;
10209 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
10210 if (SecMask[I] != PoisonMaskElem) {
10211 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
10212 "Multiple uses of scalars.");
10213 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
10214 } else if (Mask[I] != PoisonMaskElem) {
10215 Mask[I] = I;
10216 }
10217 }
10218 Prev = Action(Mask, {Prev, Res.first});
10219 }
10220 return Prev;
10221}
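// Total tree cost: the sum of the per-entry costs, plus the extract cost for
// externally used scalars and the estimated spill cost, minus the savings
// from insertelement sequences that the vectorized tree makes redundant.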
10222
10225 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
10226 << VectorizableTree.size() << ".\n");
10227
10228 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
10229
10230 SmallPtrSet<Value *, 4> CheckedExtracts;
10231 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
10232 TreeEntry &TE = *VectorizableTree[I];
10233 if (TE.State == TreeEntry::NeedToGather) {
10234 if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
10235 E && E->getVectorFactor() == TE.getVectorFactor() &&
10236 E->isSame(TE.Scalars)) {
10237 // Some gather nodes might be absolutely the same as some vectorizable
10238 // nodes after reordering, need to handle it.
10239 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
10240 << shortBundleName(TE.Scalars) << ".\n"
10241 << "SLP: Current total cost = " << Cost << "\n");
10242 continue;
10243 }
10244 }
10245
10246 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
10247 Cost += C;
10248 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
10249 << shortBundleName(TE.Scalars) << ".\n"
10250 << "SLP: Current total cost = " << Cost << "\n");
10251 }
10252
10253 SmallPtrSet<Value *, 16> ExtractCostCalculated;
10254  InstructionCost ExtractCost = 0;
10255  SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
10256  SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
10257  SmallVector<APInt> DemandedElts;
10258  SmallDenseSet<Value *, 4> UsedInserts;
10259  DenseSet<std::pair<const TreeEntry *, Type *>> VectorCasts;
10260  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
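  // Account for extracting each externally used scalar out of its vector.
  // Insertelement users are not charged an extract; they are folded into the
  // final shuffle masks handled after this loop.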
10261 for (ExternalUser &EU : ExternalUses) {
10262 // We only add extract cost once for the same scalar.
10263 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
10264 !ExtractCostCalculated.insert(EU.Scalar).second)
10265 continue;
10266
10267 // Uses by ephemeral values are free (because the ephemeral value will be
10268 // removed prior to code generation, and so the extraction will be
10269 // removed as well).
10270 if (EphValues.count(EU.User))
10271 continue;
10272
10273 // No extract cost for vector "scalar"
10274 if (isa<FixedVectorType>(EU.Scalar->getType()))
10275 continue;
10276
10277 // If found user is an insertelement, do not calculate extract cost but try
10278 // to detect it as a final shuffled/identity match.
10279 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User)) {
10280 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
10281 if (!UsedInserts.insert(VU).second)
10282 continue;
10283 std::optional<unsigned> InsertIdx = getInsertIndex(VU);
10284 if (InsertIdx) {
10285 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
10286 auto *It = find_if(
10287 FirstUsers,
10288 [this, VU](const std::pair<Value *, const TreeEntry *> &Pair) {
10289                return areTwoInsertFromSameBuildVector(
10290                    VU, cast<InsertElementInst>(Pair.first),
10291 [this](InsertElementInst *II) -> Value * {
10292 Value *Op0 = II->getOperand(0);
10293 if (getTreeEntry(II) && !getTreeEntry(Op0))
10294 return nullptr;
10295 return Op0;
10296 });
10297 });
10298 int VecId = -1;
10299 if (It == FirstUsers.end()) {
10300 (void)ShuffleMasks.emplace_back();
10301 SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
10302 if (Mask.empty())
10303 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10304 // Find the insertvector, vectorized in tree, if any.
10305 Value *Base = VU;
10306 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
10307 if (IEBase != EU.User &&
10308 (!IEBase->hasOneUse() ||
10309 getInsertIndex(IEBase).value_or(*InsertIdx) == *InsertIdx))
10310 break;
10311 // Build the mask for the vectorized insertelement instructions.
10312 if (const TreeEntry *E = getTreeEntry(IEBase)) {
10313 VU = IEBase;
10314 do {
10315 IEBase = cast<InsertElementInst>(Base);
10316 int Idx = *getInsertIndex(IEBase);
10317 assert(Mask[Idx] == PoisonMaskElem &&
10318 "InsertElementInstruction used already.");
10319 Mask[Idx] = Idx;
10320 Base = IEBase->getOperand(0);
10321 } while (E == getTreeEntry(Base));
10322 break;
10323 }
10324 Base = cast<InsertElementInst>(Base)->getOperand(0);
10325 }
10326 FirstUsers.emplace_back(VU, ScalarTE);
10327 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
10328 VecId = FirstUsers.size() - 1;
10329 auto It = MinBWs.find(ScalarTE);
10330 if (It != MinBWs.end() &&
10331 VectorCasts
10332 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
10333 .second) {
10334 unsigned BWSz = It->second.first;
10335 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
10336 unsigned VecOpcode;
10337 if (DstBWSz < BWSz)
10338 VecOpcode = Instruction::Trunc;
10339 else
10340 VecOpcode =
10341 It->second.second ? Instruction::SExt : Instruction::ZExt;
10342            InstructionCost C =
10343                TTI->getCastInstrCost(
10344                    VecOpcode, FTy,
10345                    FixedVectorType::get(
10346                        IntegerType::get(FTy->getContext(), BWSz),
10347                        FTy->getNumElements()),
10348                    TTI::CastContextHint::None, TTI::TCK_RecipThroughput);
10349            LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10350 << " for extending externally used vector with "
10351 "non-equal minimum bitwidth.\n");
10352 Cost += C;
10353 }
10354 } else {
10355 if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
10356 It->first = VU;
10357 VecId = std::distance(FirstUsers.begin(), It);
10358 }
10359 int InIdx = *InsertIdx;
10360 SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
10361 if (Mask.empty())
10362 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
10363 Mask[InIdx] = EU.Lane;
10364 DemandedElts[VecId].setBit(InIdx);
10365 continue;
10366 }
10367 }
10368 }
10369 // Leave the GEPs as is, they are free in most cases and better to keep them
10370 // as GEPs.
10371    TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10372    if (auto *GEP = dyn_cast<GetElementPtrInst>(EU.Scalar)) {
10373 if (!ValueToExtUses) {
10374 ValueToExtUses.emplace();
10375 for_each(enumerate(ExternalUses), [&](const auto &P) {
10376 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
10377 });
10378 }
10379 // Can use original GEP, if no operands vectorized or they are marked as
10380 // externally used already.
10381 bool CanBeUsedAsGEP = all_of(GEP->operands(), [&](Value *V) {
10382 if (!getTreeEntry(V))
10383 return true;
10384 auto It = ValueToExtUses->find(V);
10385 if (It != ValueToExtUses->end()) {
10386 // Replace all uses to avoid compiler crash.
10387 ExternalUses[It->second].User = nullptr;
10388 return true;
10389 }
10390 return false;
10391 });
10392 if (CanBeUsedAsGEP) {
10393 ExtractCost += TTI->getInstructionCost(GEP, CostKind);
10394 ExternalUsesAsGEPs.insert(EU.Scalar);
10395 continue;
10396 }
10397 }
10398
10399 // If we plan to rewrite the tree in a smaller type, we will need to sign
10400 // extend the extracted value back to the original type. Here, we account
10401 // for the extract and the added cost of the sign extend if needed.
10402 auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
10403 auto It = MinBWs.find(getTreeEntry(EU.Scalar));
10404 if (It != MinBWs.end()) {
10405 auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
10406 unsigned Extend =
10407 It->second.second ? Instruction::SExt : Instruction::ZExt;
10408 VecTy = FixedVectorType::get(MinTy, BundleWidth);
10409 ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
10410 VecTy, EU.Lane);
10411 } else {
10412 ExtractCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
10413 CostKind, EU.Lane);
10414 }
10415 }
10416 // Add reduced value cost, if resized.
10417 if (!VectorizedVals.empty()) {
10418 const TreeEntry &Root = *VectorizableTree.front().get();
10419 auto BWIt = MinBWs.find(&Root);
10420 if (BWIt != MinBWs.end()) {
10421 Type *DstTy = Root.Scalars.front()->getType();
10422 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy);
10423 unsigned SrcSz =
10424 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
10425 if (OriginalSz != SrcSz) {
10426 unsigned Opcode = Instruction::Trunc;
10427 if (OriginalSz > SrcSz)
10428 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
10429 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
10430 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
10431                                      TTI::CastContextHint::None,
10432                                      TTI::TCK_RecipThroughput);
10433      }
10434 }
10435 }
10436
10437 InstructionCost SpillCost = getSpillCost();
10438 Cost += SpillCost + ExtractCost;
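  // ResizeToVF widens an external insertelement mask to the tree entry's
  // vector factor, charging a single-source permute when the entry must be
  // reshuffled to match the destination vector.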
10439 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
10440 bool) {
10441 InstructionCost C = 0;
10442 unsigned VF = Mask.size();
10443 unsigned VecVF = TE->getVectorFactor();
10444 if (VF != VecVF &&
10445        (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
10446         !ShuffleVectorInst::isIdentityMask(Mask, VF))) {
10447      SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
10448      std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
10449                OrigMask.begin());
10450      C = TTI->getShuffleCost(
10451          TTI::SK_PermuteSingleSrc,
10452          FixedVectorType::get(TE->getMainOp()->getType(), VecVF), OrigMask);
10453 LLVM_DEBUG(
10454 dbgs() << "SLP: Adding cost " << C
10455 << " for final shuffle of insertelement external users.\n";
10456 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
10457 Cost += C;
10458 return std::make_pair(TE, true);
10459 }
10460 return std::make_pair(TE, false);
10461 };
10462 // Calculate the cost of the reshuffled vectors, if any.
10463 for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
10464 Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
10465 auto Vector = ShuffleMasks[I].takeVector();
10466 unsigned VF = 0;
10467 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
10468                                    ArrayRef<const TreeEntry *> TEs) {
10469      assert((TEs.size() == 1 || TEs.size() == 2) &&
10470 "Expected exactly 1 or 2 tree entries.");
10471 if (TEs.size() == 1) {
10472 if (VF == 0)
10473 VF = TEs.front()->getVectorFactor();
10474 auto *FTy =
10475 FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
10476 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
10477 !all_of(enumerate(Mask), [=](const auto &Data) {
10478 return Data.value() == PoisonMaskElem ||
10479 (Data.index() < VF &&
10480 static_cast<int>(Data.index()) == Data.value());
10481 })) {
10482          InstructionCost C =
10483              ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FTy, Mask);
10484          LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10485 << " for final shuffle of insertelement "
10486 "external users.\n";
10487 TEs.front()->dump();
10488 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10489 Cost += C;
10490 }
10491 } else {
10492 if (VF == 0) {
10493 if (TEs.front() &&
10494 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
10495 VF = TEs.front()->getVectorFactor();
10496 else
10497 VF = Mask.size();
10498 }
10499 auto *FTy =
10500 FixedVectorType::get(TEs.back()->Scalars.front()->getType(), VF);
10501        InstructionCost C =
10502            ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, FTy, Mask);
10503        LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
10504 << " for final shuffle of vector node and external "
10505 "insertelement users.\n";
10506 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
10507 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10508 Cost += C;
10509 }
10510 VF = Mask.size();
10511 return TEs.back();
10512 };
10513 (void)performExtractsShuffleAction<const TreeEntry>(
10514 MutableArrayRef(Vector.data(), Vector.size()), Base,
10515 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
10516 EstimateShufflesCost);
10517    InstructionCost InsertCost = TTI->getScalarizationOverhead(
10518        cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
10519 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
10520 Cost -= InsertCost;
10521 }
10522
10523 // Add the cost for reduced value resize (if required).
10524 if (ReductionBitWidth != 0) {
10525 assert(UserIgnoreList && "Expected reduction tree.");
10526 const TreeEntry &E = *VectorizableTree.front().get();
10527 auto It = MinBWs.find(&E);
10528 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
10529 unsigned SrcSize = It->second.first;
10530 unsigned DstSize = ReductionBitWidth;
10531 unsigned Opcode = Instruction::Trunc;
10532 if (SrcSize < DstSize)
10533 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
10534 auto *SrcVecTy =
10535 FixedVectorType::get(Builder.getIntNTy(SrcSize), E.getVectorFactor());
10536 auto *DstVecTy =
10537 FixedVectorType::get(Builder.getIntNTy(DstSize), E.getVectorFactor());
10538 TTI::CastContextHint CCH = getCastContextHint(E);
10539 InstructionCost CastCost;
10540 switch (E.getOpcode()) {
10541 case Instruction::SExt:
10542 case Instruction::ZExt:
10543 case Instruction::Trunc: {
10544 const TreeEntry *OpTE = getOperandEntry(&E, 0);
10545 CCH = getCastContextHint(*OpTE);
10546 break;
10547 }
10548 default:
10549 break;
10550 }
10551 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
10552 TTI::TCK_RecipThroughput);
10553 Cost += CastCost;
10554 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
10555 << " for final resize for reduction from " << SrcVecTy
10556 << " to " << DstVecTy << "\n";
10557 dbgs() << "SLP: Current total cost = " << Cost << "\n");
10558 }
10559 }
10560
10561#ifndef NDEBUG
10562 SmallString<256> Str;
10563 {
10564 raw_svector_ostream OS(Str);
10565 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
10566 << "SLP: Extract Cost = " << ExtractCost << ".\n"
10567 << "SLP: Total Cost = " << Cost << ".\n";
10568 }
10569 LLVM_DEBUG(dbgs() << Str);
10570 if (ViewSLPTree)
10571 ViewGraph(this, "SLP" + F->getName(), false, Str);
10572#endif
10573
10574 return Cost;
10575}
10576
10577/// Tries to find extractelement instructions with constant indices from a fixed
10578/// vector type and gathers such instructions into a bunch, which is highly
10579/// likely to be detected as a shuffle of 1 or 2 input vectors. If the attempt is
10580/// successful, the matched scalars are replaced by poison values in \p VL for
10581/// future analysis.
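/// For illustration only (hypothetical IR, not tied to a particular test), a
/// gather of
/// \code
///   %e0 = extractelement <4 x i32> %v, i32 1
///   %e1 = extractelement <4 x i32> %v, i32 0
/// \endcode
/// can be modelled as a single-source shuffle of %v with mask <1, 0>, so both
/// scalars are replaced by poison in \p VL and only the mask is kept.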
10582std::optional<TTI::ShuffleKind>
10583BoUpSLP::tryToGatherSingleRegisterExtractElements(
10584 MutableArrayRef<Value *> VL, SmallVectorImpl<int> &Mask) const {
10585 // Scan list of gathered scalars for extractelements that can be represented
10586 // as shuffles.
10587 MapVector<Value *, SmallVector<int>> VectorOpToIdx;
10588 SmallVector<int> UndefVectorExtracts;
10589 for (int I = 0, E = VL.size(); I < E; ++I) {
10590 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10591 if (!EI) {
10592 if (isa<UndefValue>(VL[I]))
10593 UndefVectorExtracts.push_back(I);
10594 continue;
10595 }
10596 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
10597 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
10598 continue;
10599 std::optional<unsigned> Idx = getExtractIndex(EI);
10600 // Undefined index.
10601 if (!Idx) {
10602 UndefVectorExtracts.push_back(I);
10603 continue;
10604 }
10605 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
10606 ExtractMask.reset(*Idx);
10607 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
10608 UndefVectorExtracts.push_back(I);
10609 continue;
10610 }
10611 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
10612 }
10613 // Sort the vector operands by the maximum number of uses in extractelements.
10614 MapVector<unsigned, SmallVector<Value *>> VFToVector;
10615 for (const auto &Data : VectorOpToIdx)
10616 VFToVector[cast<FixedVectorType>(Data.first->getType())->getNumElements()]
10617 .push_back(Data.first);
10618 for (auto &Data : VFToVector) {
10619 stable_sort(Data.second, [&VectorOpToIdx](Value *V1, Value *V2) {
10620 return VectorOpToIdx.find(V1)->second.size() >
10621 VectorOpToIdx.find(V2)->second.size();
10622 });
10623 }
10624 // Find the best pair of the vectors with the same number of elements or a
10625 // single vector.
10626 const int UndefSz = UndefVectorExtracts.size();
10627 unsigned SingleMax = 0;
10628 Value *SingleVec = nullptr;
10629 unsigned PairMax = 0;
10630 std::pair<Value *, Value *> PairVec(nullptr, nullptr);
10631 for (auto &Data : VFToVector) {
10632 Value *V1 = Data.second.front();
10633 if (SingleMax < VectorOpToIdx[V1].size() + UndefSz) {
10634 SingleMax = VectorOpToIdx[V1].size() + UndefSz;
10635 SingleVec = V1;
10636 }
10637 Value *V2 = nullptr;
10638 if (Data.second.size() > 1)
10639 V2 = *std::next(Data.second.begin());
10640 if (V2 && PairMax < VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() +
10641 UndefSz) {
10642 PairMax = VectorOpToIdx[V1].size() + VectorOpToIdx[V2].size() + UndefSz;
10643 PairVec = std::make_pair(V1, V2);
10644 }
10645 }
10646 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
10647 return std::nullopt;
10648 // Check whether it is better to perform a shuffle of 2 vectors or just of a
10649 // single vector.
10650 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
10651 SmallVector<Value *> GatheredExtracts(
10652 VL.size(), PoisonValue::get(VL.front()->getType()));
10653 if (SingleMax >= PairMax && SingleMax) {
10654 for (int Idx : VectorOpToIdx[SingleVec])
10655 std::swap(GatheredExtracts[Idx], VL[Idx]);
10656 } else {
10657 for (Value *V : {PairVec.first, PairVec.second})
10658 for (int Idx : VectorOpToIdx[V])
10659 std::swap(GatheredExtracts[Idx], VL[Idx]);
10660 }
10661 // Add extracts from undefs too.
10662 for (int Idx : UndefVectorExtracts)
10663 std::swap(GatheredExtracts[Idx], VL[Idx]);
10664 // Check that gather of extractelements can be represented as just a
10665 // shuffle of a single/two vectors the scalars are extracted from.
10666 std::optional<TTI::ShuffleKind> Res =
10667 isFixedVectorShuffle(GatheredExtracts, Mask);
10668 if (!Res) {
10669 // TODO: try to check other subsets if possible.
10670 // Restore the original VL if attempt was not successful.
10671 copy(SavedVL, VL.begin());
10672 return std::nullopt;
10673 }
10674 // Restore unused scalars from mask, if some of the extractelements were not
10675 // selected for shuffle.
10676 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
10677 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
10678 isa<UndefValue>(GatheredExtracts[I])) {
10679 std::swap(VL[I], GatheredExtracts[I]);
10680 continue;
10681 }
10682 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
10683 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
10684 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
10685 is_contained(UndefVectorExtracts, I))
10686 continue;
10687 }
10688 return Res;
10689}
10690
10691/// Tries to find extractelement instructions with constant indices from a fixed
10692/// vector type and gathers such instructions into a bunch, which is highly
10693/// likely to be detected as a shuffle of 1 or 2 input vectors. If the attempt is
10694/// successful, the matched scalars are replaced by poison values in \p VL for
10695/// future analysis.
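/// In contrast to the single-register variant above, this entry point splits
/// \p VL into \p NumParts equal slices (roughly one per vector register) and
/// runs the per-register analysis on each slice independently, collecting one
/// optional shuffle kind per part.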
10696SmallVector<std::optional<TTI::ShuffleKind>>
10697BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
10698 SmallVectorImpl<int> &Mask,
10699 unsigned NumParts) const {
10700 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
10701 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
10702 Mask.assign(VL.size(), PoisonMaskElem);
10703 unsigned SliceSize = VL.size() / NumParts;
10704 for (unsigned Part = 0; Part < NumParts; ++Part) {
10705 // Scan list of gathered scalars for extractelements that can be represented
10706 // as shuffles.
10707 MutableArrayRef<Value *> SubVL =
10708 MutableArrayRef(VL).slice(Part * SliceSize, SliceSize);
10709 SmallVector<int> SubMask;
10710 std::optional<TTI::ShuffleKind> Res =
10711 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
10712 ShufflesRes[Part] = Res;
10713 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
10714 }
10715 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
10716 return Res.has_value();
10717 }))
10718 ShufflesRes.clear();
10719 return ShufflesRes;
10720}
10721
10722std::optional<TargetTransformInfo::ShuffleKind>
10723BoUpSLP::isGatherShuffledSingleRegisterEntry(
10724 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
10725 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
10726 Entries.clear();
10727 // TODO: currently checking only for Scalars in the tree entry, need to count
10728 // reused elements too for better cost estimation.
10729 const EdgeInfo &TEUseEI = TE->UserTreeIndices.front();
10730 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
10731 const BasicBlock *TEInsertBlock = nullptr;
10732 // Main node of PHI entries keeps the correct order of operands/incoming
10733 // blocks.
10734 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
10735 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
10736 TEInsertPt = TEInsertBlock->getTerminator();
10737 } else {
10738 TEInsertBlock = TEInsertPt->getParent();
10739 }
10740 if (!DT->isReachableFromEntry(TEInsertBlock))
10741 return std::nullopt;
10742 auto *NodeUI = DT->getNode(TEInsertBlock);
10743 assert(NodeUI && "Should only process reachable instructions");
10744 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
10745 auto CheckOrdering = [&](const Instruction *InsertPt) {
10746 // Argument InsertPt is an instruction where vector code for some other
10747 // tree entry (one that shares one or more scalars with TE) is going to be
10748 // generated. This lambda returns true if insertion point of vector code
10749 // for the TE dominates that point (otherwise dependency is the other way
10750 // around). The other node is not limited to be of a gather kind. Gather
10751 // nodes are not scheduled and their vector code is inserted before their
10752 // first user. If user is PHI, that is supposed to be at the end of a
10753 // predecessor block. Otherwise it is the last instruction among scalars of
10754 // the user node. So, instead of checking dependency between instructions
10755 // themselves, we check dependency between their insertion points for vector
10756 // code (since each scalar instruction ends up as a lane of a vector
10757 // instruction).
10758 const BasicBlock *InsertBlock = InsertPt->getParent();
10759 auto *NodeEUI = DT->getNode(InsertBlock);
10760 if (!NodeEUI)
10761 return false;
10762 assert((NodeUI == NodeEUI) ==
10763 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
10764 "Different nodes should have different DFS numbers");
10765 // Check the order of the gather nodes users.
10766 if (TEInsertPt->getParent() != InsertBlock &&
10767 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
10768 return false;
10769 if (TEInsertPt->getParent() == InsertBlock &&
10770 TEInsertPt->comesBefore(InsertPt))
10771 return false;
10772 return true;
10773 };
10774 // Find all tree entries used by the gathered values. If no common entries
10775 // found - not a shuffle.
10776 // Here we build a set of tree nodes for each gathered value and try to
10777 // find the intersection between these sets. If we have at least one common
10778 // tree node for each gathered value, we have just a permutation of a
10779 // single vector. If we have 2 different sets, we are in a situation where
10780 // we have a permutation of 2 input vectors.
10781 SmallVector<SmallPtrSet<const TreeEntry *, 4>> UsedTEs;
10782 DenseMap<Value *, int> UsedValuesEntry;
10783 for (Value *V : VL) {
10784 if (isConstant(V))
10785 continue;
10786 // Build a list of tree entries where V is used.
10787 SmallPtrSet<const TreeEntry *, 4> VToTEs;
10788 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
10789 if (TEPtr == TE)
10790 continue;
10791 assert(any_of(TEPtr->Scalars,
10792 [&](Value *V) { return GatheredScalars.contains(V); }) &&
10793 "Must contain at least single gathered value.");
10794 assert(TEPtr->UserTreeIndices.size() == 1 &&
10795 "Expected only single user of a gather node.");
10796 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
10797
10798 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
10799 const Instruction *InsertPt =
10800 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
10801 : &getLastInstructionInBundle(UseEI.UserTE);
10802 if (TEInsertPt == InsertPt) {
10803 // If 2 gathers are operands of the same entry (regardless of whether
10804 // user is PHI or else), compare operands indices, use the earlier one
10805 // as the base.
10806 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
10807 continue;
10808 // If the user instruction is used for some reason in different
10809 // vectorized nodes - make it depend on index.
10810 if (TEUseEI.UserTE != UseEI.UserTE &&
10811 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
10812 continue;
10813 }
10814
10815 // Check if the user node of the TE comes after user node of TEPtr,
10816 // otherwise TEPtr depends on TE.
10817 if ((TEInsertBlock != InsertPt->getParent() ||
10818 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
10819 !CheckOrdering(InsertPt))
10820 continue;
10821 VToTEs.insert(TEPtr);
10822 }
10823 if (const TreeEntry *VTE = getTreeEntry(V)) {
10824 if (ForOrder) {
10825 if (VTE->State != TreeEntry::Vectorize) {
10826 auto It = MultiNodeScalars.find(V);
10827 if (It == MultiNodeScalars.end())
10828 continue;
10829 VTE = *It->getSecond().begin();
10830 // Iterate through all vectorized nodes.
10831 auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
10832 return MTE->State == TreeEntry::Vectorize;
10833 });
10834 if (MIt == It->getSecond().end())
10835 continue;
10836 VTE = *MIt;
10837 }
10838 }
10839 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
10840 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
10841 continue;
10842 VToTEs.insert(VTE);
10843 }
10844 if (VToTEs.empty())
10845 continue;
10846 if (UsedTEs.empty()) {
10847 // The first iteration, just insert the list of nodes to vector.
10848 UsedTEs.push_back(VToTEs);
10849 UsedValuesEntry.try_emplace(V, 0);
10850 } else {
10851 // Need to check if there are any previously used tree nodes which use V.
10852 // If there are no such nodes, consider that we have one more input
10853 // vector.
10854 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
10855 unsigned Idx = 0;
10856 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
10857 // Do we have a non-empty intersection of previously listed tree entries
10858 // and tree entries using current V?
10859 set_intersect(VToTEs, Set);
10860 if (!VToTEs.empty()) {
10861 // Yes, write the new subset and continue analysis for the next
10862 // scalar.
10863 Set.swap(VToTEs);
10864 break;
10865 }
10866 VToTEs = SavedVToTEs;
10867 ++Idx;
10868 }
10869 // No non-empty intersection found - need to add a second set of possible
10870 // source vectors.
10871 if (Idx == UsedTEs.size()) {
10872 // If the number of input vectors is greater than 2 - not a permutation,
10873 // fallback to the regular gather.
10874 // TODO: support multiple reshuffled nodes.
10875 if (UsedTEs.size() == 2)
10876 continue;
10877 UsedTEs.push_back(SavedVToTEs);
10878 Idx = UsedTEs.size() - 1;
10879 }
10880 UsedValuesEntry.try_emplace(V, Idx);
10881 }
10882 }
10883
10884 if (UsedTEs.empty()) {
10885 Entries.clear();
10886 return std::nullopt;
10887 }
10888
10889 unsigned VF = 0;
10890 if (UsedTEs.size() == 1) {
10891 // Keep the order to avoid non-determinism.
10892 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
10893 UsedTEs.front().end());
10894 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
10895 return TE1->Idx < TE2->Idx;
10896 });
10897 // Try to find the perfect match in another gather node at first.
10898 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
10899 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
10900 });
10901 if (It != FirstEntries.end() &&
10902 ((*It)->getVectorFactor() == VL.size() ||
10903 ((*It)->getVectorFactor() == TE->Scalars.size() &&
10904 TE->ReuseShuffleIndices.size() == VL.size() &&
10905 (*It)->isSame(TE->Scalars)))) {
10906 Entries.push_back(*It);
10907 if ((*It)->getVectorFactor() == VL.size()) {
10908 std::iota(std::next(Mask.begin(), Part * VL.size()),
10909 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
10910 } else {
10911 SmallVector<int> CommonMask = TE->getCommonMask();
10912 copy(CommonMask, Mask.begin());
10913 }
10914 // Clear undef scalars.
10915 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
10916 if (isa<PoisonValue>(VL[I]))
10917 Mask[I + Part * VL.size()] = PoisonMaskElem;
10918 return TargetTransformInfo::SK_PermuteSingleSrc;
10919 }
10920 // No perfect match, just shuffle, so choose the first tree node from the
10921 // tree.
10922 Entries.push_back(FirstEntries.front());
10923 } else {
10924 // Try to find nodes with the same vector factor.
10925 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
10926 // Keep the order of tree nodes to avoid non-determinism.
10927 DenseMap<unsigned, const TreeEntry *> VFToTE;
10928 for (const TreeEntry *TE : UsedTEs.front()) {
10929 unsigned VF = TE->getVectorFactor();
10930 auto It = VFToTE.find(VF);
10931 if (It != VFToTE.end()) {
10932 if (It->second->Idx > TE->Idx)
10933 It->getSecond() = TE;
10934 continue;
10935 }
10936 VFToTE.try_emplace(VF, TE);
10937 }
10938 // Same, keep the order to avoid non-determinism.
10939 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
10940 UsedTEs.back().end());
10941 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
10942 return TE1->Idx < TE2->Idx;
10943 });
10944 for (const TreeEntry *TE : SecondEntries) {
10945 auto It = VFToTE.find(TE->getVectorFactor());
10946 if (It != VFToTE.end()) {
10947 VF = It->first;
10948 Entries.push_back(It->second);
10949 Entries.push_back(TE);
10950 break;
10951 }
10952 }
10953 // No 2 source vectors with the same vector factor - just choose 2 with max
10954 // index.
10955 if (Entries.empty()) {
10956 Entries.push_back(*llvm::max_element(
10957 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
10958 return TE1->Idx < TE2->Idx;
10959 }));
10960 Entries.push_back(SecondEntries.front());
10961 VF = std::max(Entries.front()->getVectorFactor(),
10962 Entries.back()->getVectorFactor());
10963 }
10964 }
10965
10966 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
10967 // Checks if the 2 PHIs are compatible in terms of high possibility to be
10968 // vectorized.
10969 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
10970 auto *PHI = cast<PHINode>(V);
10971 auto *PHI1 = cast<PHINode>(V1);
10972 // Check that all incoming values are compatible/from same parent (if they
10973 // are instructions).
10974 // The incoming values are compatible if they all are constants, or
10975 // instruction with the same/alternate opcodes from the same basic block.
10976 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
10977 Value *In = PHI->getIncomingValue(I);
10978 Value *In1 = PHI1->getIncomingValue(I);
10979 if (isConstant(In) && isConstant(In1))
10980 continue;
10981 if (!getSameOpcode({In, In1}, *TLI).getOpcode())
10982 return false;
10983 if (cast<Instruction>(In)->getParent() !=
10984 cast<Instruction>(In1)->getParent())
10985 return false;
10986 }
10987 return true;
10988 };
10989 // Check if the value can be ignored during analysis for shuffled gathers.
10990 // We suppose it is better to ignore instructions which do not form splats,
10991 // are not vectorized/not extractelements (these instructions will be handled
10992 // by extractelements processing) or may form a vector node in the future.
10993 auto MightBeIgnored = [=](Value *V) {
10994 auto *I = dyn_cast<Instruction>(V);
10995 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
10996 !isVectorLikeInstWithConstOps(I) &&
10997 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
10998 };
10999 // Check that the neighbor instruction may form a full vector node with the
11000 // current instruction V. It is possible, if they have same/alternate opcode
11001 // and same parent basic block.
11002 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
11003 Value *V1 = VL[Idx];
11004 bool UsedInSameVTE = false;
11005 auto It = UsedValuesEntry.find(V1);
11006 if (It != UsedValuesEntry.end())
11007 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
11008 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
11009 getSameOpcode({V, V1}, *TLI).getOpcode() &&
11010 cast<Instruction>(V)->getParent() ==
11011 cast<Instruction>(V1)->getParent() &&
11012 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
11013 };
11014 // Build a shuffle mask for better cost estimation and vector emission.
11015 SmallBitVector UsedIdxs(Entries.size());
11016 SmallVector<std::pair<unsigned, int>> EntryLanes;
11017 for (int I = 0, E = VL.size(); I < E; ++I) {
11018 Value *V = VL[I];
11019 auto It = UsedValuesEntry.find(V);
11020 if (It == UsedValuesEntry.end())
11021 continue;
11022 // Do not try to shuffle scalars, if they are constants, or instructions
11023 // that can be vectorized as a result of the following vector build
11024 // vectorization.
11025 if (isConstant(V) || (MightBeIgnored(V) &&
11026 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
11027 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
11028 continue;
11029 unsigned Idx = It->second;
11030 EntryLanes.emplace_back(Idx, I);
11031 UsedIdxs.set(Idx);
11032 }
11033 // Iterate through all shuffled scalars and select entries, which can be used
11034 // for final shuffle.
11035 SmallVector<const TreeEntry *> TempEntries;
11036 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
11037 if (!UsedIdxs.test(I))
11038 continue;
11039 // Fix the entry number for the given scalar. If it is the first entry, set
11040 // Pair.first to 0, otherwise to 1 (currently select at max 2 nodes).
11041 // These indices are used when calculating final shuffle mask as the vector
11042 // offset.
11043 for (std::pair<unsigned, int> &Pair : EntryLanes)
11044 if (Pair.first == I)
11045 Pair.first = TempEntries.size();
11046 TempEntries.push_back(Entries[I]);
11047 }
11048 Entries.swap(TempEntries);
11049 if (EntryLanes.size() == Entries.size() &&
11050 !VL.equals(ArrayRef(TE->Scalars)
11051 .slice(Part * VL.size(),
11052 std::min<int>(VL.size(), TE->Scalars.size())))) {
11053 // We may have here 1 or 2 entries only. If the number of scalars is equal
11054 // to the number of entries, no need to do the analysis, it is not very
11055 // profitable. Since VL is not the same as TE->Scalars, it means we already
11056 // have some shuffles before. Cut off not profitable case.
11057 Entries.clear();
11058 return std::nullopt;
11059 }
11060 // Build the final mask, check for the identity shuffle, if possible.
11061 bool IsIdentity = Entries.size() == 1;
11062 // Pair.first is the offset to the vector, while Pair.second is the index of
11063 // scalar in the list.
11064 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
11065 unsigned Idx = Part * VL.size() + Pair.second;
11066 Mask[Idx] =
11067 Pair.first * VF +
11068 (ForOrder ? std::distance(
11069 Entries[Pair.first]->Scalars.begin(),
11070 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
11071 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
11072 IsIdentity &= Mask[Idx] == Pair.second;
11073 }
11074 switch (Entries.size()) {
11075 case 1:
11076 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
11077 return TargetTransformInfo::SK_PermuteSingleSrc;
11078 break;
11079 case 2:
11080 if (EntryLanes.size() > 2 || VL.size() <= 2)
11081 return TargetTransformInfo::SK_PermuteTwoSrc;
11082 break;
11083 default:
11084 break;
11085 }
11086 Entries.clear();
11087 // Clear the corresponding mask elements.
11088 std::fill(std::next(Mask.begin(), Part * VL.size()),
11089 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
11090 return std::nullopt;
11091}
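// In summary (informal; the precise conditions are in the code above): the
// routine reports SK_PermuteSingleSrc when the non-ignored scalars of this
// slice come from a single existing tree entry, SK_PermuteTwoSrc when they
// come from exactly two entries, and std::nullopt when no profitable single-
// or two-source permutation was found.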
11092
11093SmallVector<std::optional<TTI::ShuffleKind>>
11094BoUpSLP::isGatherShuffledEntry(
11095 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
11096 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
11097 bool ForOrder) {
11098 assert(NumParts > 0 && NumParts < VL.size() &&
11099 "Expected positive number of registers.");
11100 Entries.clear();
11101 // No need to check for the topmost gather node.
11102 if (TE == VectorizableTree.front().get())
11103 return {};
11104 // FIXME: Gathering for non-power-of-2 nodes not implemented yet.
11105 if (TE->isNonPowOf2Vec())
11106 return {};
11107 Mask.assign(VL.size(), PoisonMaskElem);
11108 assert(TE->UserTreeIndices.size() == 1 &&
11109 "Expected only single user of the gather node.");
11110 assert(VL.size() % NumParts == 0 &&
11111 "Number of scalars must be divisible by NumParts.");
11112 unsigned SliceSize = VL.size() / NumParts;
11113 SmallVector<std::optional<TTI::ShuffleKind>> Res;
11114 for (unsigned Part = 0; Part < NumParts; ++Part) {
11115 ArrayRef<Value *> SubVL = VL.slice(Part * SliceSize, SliceSize);
11116 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
11117 std::optional<TTI::ShuffleKind> SubRes =
11118 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
11119 ForOrder);
11120 if (!SubRes)
11121 SubEntries.clear();
11122 Res.push_back(SubRes);
11123 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
11124 SubEntries.front()->getVectorFactor() == VL.size() &&
11125 (SubEntries.front()->isSame(TE->Scalars) ||
11126 SubEntries.front()->isSame(VL))) {
11127 SmallVector<const TreeEntry *> LocalSubEntries;
11128 LocalSubEntries.swap(SubEntries);
11129 Entries.clear();
11130 Res.clear();
11131 std::iota(Mask.begin(), Mask.end(), 0);
11132 // Clear undef scalars.
11133 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
11134 if (isa<PoisonValue>(VL[I]))
11135 Mask[I] = PoisonMaskElem;
11136 Entries.emplace_back(1, LocalSubEntries.front());
11137 Res.push_back(TTI::SK_PermuteSingleSrc);
11138 return Res;
11139 }
11140 }
11141 if (all_of(Res,
11142 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
11143 Entries.clear();
11144 return {};
11145 }
11146 return Res;
11147}
11148
11149InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
11150 Type *ScalarTy) const {
11151 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
11152 bool DuplicateNonConst = false;
11153 // Find the cost of inserting/extracting values from the vector.
11154 // Check if the same elements are inserted several times and count them as
11155 // shuffle candidates.
11156 APInt ShuffledElements = APInt::getZero(VL.size());
11157 DenseMap<Value *, unsigned> UniqueElements;
11158 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
11159 InstructionCost Cost;
11160 auto EstimateInsertCost = [&](unsigned I, Value *V) {
11161 if (V->getType() != ScalarTy) {
11162 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
11163 TTI::CastContextHint::None, CostKind);
11164 V = nullptr;
11165 }
11166 if (!ForPoisonSrc)
11167 Cost +=
11168 TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
11169 I, Constant::getNullValue(VecTy), V);
11170 };
11171 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
11172 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
11173 Value *V = VL[I];
11174 // No need to shuffle duplicates for constants.
11175 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
11176 ShuffledElements.setBit(I);
11177 ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
11178 continue;
11179 }
11180
11181 auto Res = UniqueElements.try_emplace(V, I);
11182 if (Res.second) {
11183 EstimateInsertCost(I, V);
11184 ShuffleMask[I] = I;
11185 continue;
11186 }
11187
11188 DuplicateNonConst = true;
11189 ShuffledElements.setBit(I);
11190 ShuffleMask[I] = Res.first->second;
11191 }
11192 if (ForPoisonSrc)
11193 Cost =
11194 TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
11195 /*Extract*/ false, CostKind);
11196 if (DuplicateNonConst)
11197 Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
11198 VecTy, ShuffleMask);
11199 return Cost;
11200}
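// Illustrative example (hypothetical scalars): for VL = {%a, %b, %a, 7} with a
// poison source, the constant lane and the repeated %a are recorded in
// ShuffledElements, the scalarization overhead is charged only for the unique
// non-constant lanes, and one extra single-source permutation cost is added
// because of the duplicate.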
11201
11202// Perform operand reordering on the instructions in VL and return the reordered
11203// operands in Left and Right.
11204void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
11205 SmallVectorImpl<Value *> &Left,
11206 SmallVectorImpl<Value *> &Right,
11207 const BoUpSLP &R) {
11208 if (VL.empty())
11209 return;
11210 VLOperands Ops(VL, R);
11211 // Reorder the operands in place.
11212 Ops.reorder();
11213 Left = Ops.getVL(0);
11214 Right = Ops.getVL(1);
11215}
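// Usage sketch (hypothetical bundle): for VL = {add %a0, %b0; add %b1, %a1}
// the VLOperands reordering may swap the operands of the second add so that
// Left = {%a0, %a1} and Right = {%b0, %b1}, keeping look-alike operands in the
// same vectorized operand position.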
11216
11217Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
11218 auto &Res = EntryToLastInstruction.FindAndConstruct(E);
11219 if (Res.second)
11220 return *Res.second;
11221 // Get the basic block this bundle is in. All instructions in the bundle
11222 // should be in this block (except for extractelement-like instructions with
11223 // constant indices).
11224 auto *Front = E->getMainOp();
11225 auto *BB = Front->getParent();
11226 assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
11227 if (E->getOpcode() == Instruction::GetElementPtr &&
11228 !isa<GetElementPtrInst>(V))
11229 return true;
11230 auto *I = cast<Instruction>(V);
11231 return !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
11232 isVectorLikeInstWithConstOps(I);
11233 }));
11234
11235 auto FindLastInst = [&]() {
11236 Instruction *LastInst = Front;
11237 for (Value *V : E->Scalars) {
11238 auto *I = dyn_cast<Instruction>(V);
11239 if (!I)
11240 continue;
11241 if (LastInst->getParent() == I->getParent()) {
11242 if (LastInst->comesBefore(I))
11243 LastInst = I;
11244 continue;
11245 }
11246 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11247 !isa<GetElementPtrInst>(I)) ||
11248 (isVectorLikeInstWithConstOps(LastInst) &&
11249 isVectorLikeInstWithConstOps(I))) &&
11250 "Expected vector-like or non-GEP in GEP node insts only.");
11251 if (!DT->isReachableFromEntry(LastInst->getParent())) {
11252 LastInst = I;
11253 continue;
11254 }
11255 if (!DT->isReachableFromEntry(I->getParent()))
11256 continue;
11257 auto *NodeA = DT->getNode(LastInst->getParent());
11258 auto *NodeB = DT->getNode(I->getParent());
11259 assert(NodeA && "Should only process reachable instructions");
11260 assert(NodeB && "Should only process reachable instructions");
11261 assert((NodeA == NodeB) ==
11262 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11263 "Different nodes should have different DFS numbers");
11264 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
11265 LastInst = I;
11266 }
11267 BB = LastInst->getParent();
11268 return LastInst;
11269 };
11270
11271 auto FindFirstInst = [&]() {
11272 Instruction *FirstInst = Front;
11273 for (Value *V : E->Scalars) {
11274 auto *I = dyn_cast<Instruction>(V);
11275 if (!I)
11276 continue;
11277 if (FirstInst->getParent() == I->getParent()) {
11278 if (I->comesBefore(FirstInst))
11279 FirstInst = I;
11280 continue;
11281 }
11282 assert(((E->getOpcode() == Instruction::GetElementPtr &&
11283 !isa<GetElementPtrInst>(I)) ||
11284 (isVectorLikeInstWithConstOps(FirstInst) &&
11285 isVectorLikeInstWithConstOps(I))) &&
11286 "Expected vector-like or non-GEP in GEP node insts only.");
11287 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
11288 FirstInst = I;
11289 continue;
11290 }
11291 if (!DT->isReachableFromEntry(I->getParent()))
11292 continue;
11293 auto *NodeA = DT->getNode(FirstInst->getParent());
11294 auto *NodeB = DT->getNode(I->getParent());
11295 assert(NodeA && "Should only process reachable instructions");
11296 assert(NodeB && "Should only process reachable instructions");
11297 assert((NodeA == NodeB) ==
11298 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
11299 "Different nodes should have different DFS numbers");
11300 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
11301 FirstInst = I;
11302 }
11303 return FirstInst;
11304 };
11305
11306 // Set the insert point to the beginning of the basic block if the entry
11307 // should not be scheduled.
11308 if (doesNotNeedToSchedule(E->Scalars) ||
11309 (E->State != TreeEntry::NeedToGather &&
11310 all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
11311 if ((E->getOpcode() == Instruction::GetElementPtr &&
11312 any_of(E->Scalars,
11313 [](Value *V) {
11314 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
11315 })) ||
11316 all_of(E->Scalars,
11317 [](Value *V) {
11318 return !isVectorLikeInstWithConstOps(V) &&
11319 isUsedOutsideBlock(V);
11320 }) ||
11321 (E->State == TreeEntry::NeedToGather && E->Idx == 0 &&
11322 all_of(E->Scalars, [](Value *V) {
11323 return isa<ExtractElementInst, UndefValue>(V) ||
11324 areAllOperandsNonInsts(V);
11325 })))
11326 Res.second = FindLastInst();
11327 else
11328 Res.second = FindFirstInst();
11329 return *Res.second;
11330 }
11331
11332 // Find the last instruction. The common case should be that BB has been
11333 // scheduled, and the last instruction is VL.back(). So we start with
11334 // VL.back() and iterate over schedule data until we reach the end of the
11335 // bundle. The end of the bundle is marked by null ScheduleData.
11336 if (BlocksSchedules.count(BB)) {
11337 Value *V = E->isOneOf(E->Scalars.back());
11338 if (doesNotNeedToBeScheduled(V))
11339 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
11340 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
11341 if (Bundle && Bundle->isPartOfBundle())
11342 for (; Bundle; Bundle = Bundle->NextInBundle)
11343 if (Bundle->OpValue == Bundle->Inst)
11344 Res.second = Bundle->Inst;
11345 }
11346
11347 // LastInst can still be null at this point if there's either not an entry
11348 // for BB in BlocksSchedules or there's no ScheduleData available for
11349 // VL.back(). This can be the case if buildTree_rec aborts for various
11350 // reasons (e.g., the maximum recursion depth is reached, the maximum region
11351 // size is reached, etc.). ScheduleData is initialized in the scheduling
11352 // "dry-run".
11353 //
11354 // If this happens, we can still find the last instruction by brute force. We
11355 // iterate forwards from Front (inclusive) until we either see all
11356 // instructions in the bundle or reach the end of the block. If Front is the
11357 // last instruction in program order, LastInst will be set to Front, and we
11358 // will visit all the remaining instructions in the block.
11359 //
11360 // One of the reasons we exit early from buildTree_rec is to place an upper
11361 // bound on compile-time. Thus, taking an additional compile-time hit here is
11362 // not ideal. However, this should be exceedingly rare since it requires that
11363 // we both exit early from buildTree_rec and that the bundle be out-of-order
11364 // (causing us to iterate all the way to the end of the block).
11365 if (!Res.second)
11366 Res.second = FindLastInst();
11367 assert(Res.second && "Failed to find last instruction in bundle");
11368 return *Res.second;
11369}
11370
11371void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
11372 auto *Front = E->getMainOp();
11373 Instruction *LastInst = &getLastInstructionInBundle(E);
11374 assert(LastInst && "Failed to find last instruction in bundle");
11375 BasicBlock::iterator LastInstIt = LastInst->getIterator();
11376 // If the instruction is PHI, set the insert point after all the PHIs.
11377 bool IsPHI = isa<PHINode>(LastInst);
11378 if (IsPHI)
11379 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
11380 if (IsPHI || (E->State != TreeEntry::NeedToGather &&
11381 doesNotNeedToSchedule(E->Scalars))) {
11382 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
11383 } else {
11384 // Set the insertion point after the last instruction in the bundle. Set the
11385 // debug location to Front.
11386 Builder.SetInsertPoint(
11387 LastInst->getParent(),
11388 LastInst->getNextNonDebugInstruction()->getIterator());
11389 }
11390 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
11391}
11392
11393Value *BoUpSLP::gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy) {
11394 // List of instructions/lanes from current block and/or the blocks which are
11395 // part of the current loop. These instructions will be inserted at the end to
11396 // make it possible to optimize loops and hoist invariant instructions out of
11397 // the loop's body with better chances for success.
11398 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
11399 SmallSet<int, 4> PostponedIndices;
11400 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
11401 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
11402 SmallPtrSet<BasicBlock *, 4> Visited;
11403 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
11404 InsertBB = InsertBB->getSinglePredecessor();
11405 return InsertBB && InsertBB == InstBB;
11406 };
11407 for (int I = 0, E = VL.size(); I < E; ++I) {
11408 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
11409 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
11410 getTreeEntry(Inst) ||
11411 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
11412 PostponedIndices.insert(I).second)
11413 PostponedInsts.emplace_back(Inst, I);
11414 }
11415
11416 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
11417 Type *Ty) {
11418 Value *Scalar = V;
11419 if (Scalar->getType() != Ty) {
11420 assert(Scalar->getType()->isIntegerTy() && Ty->isIntegerTy() &&
11421 "Expected integer types only.");
11422 Value *V = Scalar;
11423 if (auto *CI = dyn_cast<CastInst>(Scalar);
11424 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
11425 Value *Op = CI->getOperand(0);
11426 if (auto *IOp = dyn_cast<Instruction>(Op);
11427 !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
11428 V = Op;
11429 }
11430 Scalar = Builder.CreateIntCast(
11431 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
11432 }
11433
11434 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
11435 auto *InsElt = dyn_cast<InsertElementInst>(Vec);
11436 if (!InsElt)
11437 return Vec;
11438 GatherShuffleExtractSeq.insert(InsElt);
11439 CSEBlocks.insert(InsElt->getParent());
11440 // Add to our 'need-to-extract' list.
11441 if (isa<Instruction>(V)) {
11442 if (TreeEntry *Entry = getTreeEntry(V)) {
11443 // Find which lane we need to extract.
11444 User *UserOp = nullptr;
11445 if (Scalar != V) {
11446 if (auto *SI = dyn_cast<Instruction>(Scalar))
11447 UserOp = SI;
11448 } else {
11449 UserOp = InsElt;
11450 }
11451 if (UserOp) {
11452 unsigned FoundLane = Entry->findLaneForValue(V);
11453 ExternalUses.emplace_back(V, UserOp, FoundLane);
11454 }
11455 }
11456 }
11457 return Vec;
11458 };
11459 auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
11460 Value *Vec = Root ? Root : PoisonValue::get(VecTy);
11461 SmallVector<int> NonConsts;
11462 // Insert constant values at first.
11463 for (int I = 0, E = VL.size(); I < E; ++I) {
11464 if (PostponedIndices.contains(I))
11465 continue;
11466 if (!isConstant(VL[I])) {
11467 NonConsts.push_back(I);
11468 continue;
11469 }
11470 if (Root) {
11471 if (!isa<UndefValue>(VL[I])) {
11472 NonConsts.push_back(I);
11473 continue;
11474 }
11475 if (isa<PoisonValue>(VL[I]))
11476 continue;
11477 if (auto *SV = dyn_cast<ShuffleVectorInst>(Root)) {
11478 if (SV->getMaskValue(I) == PoisonMaskElem)
11479 continue;
11480 }
11481 }
11482 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11483 }
11484 // Insert non-constant values.
11485 for (int I : NonConsts)
11486 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
11487 // Append instructions which are (or may be) part of the loop at the end to
11488 // make it possible to hoist non-loop-based instructions.
11489 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
11490 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
11491
11492 return Vec;
11493}
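// Illustrative behaviour (hypothetical scalars): gathering {7, %x, %loop.v}
// emits the insertelement for the constant 7 first, then the one for %x, and
// postpones %loop.v (an instruction defined inside the current loop) to the
// end of the chain, keeping the loop-invariant prefix hoistable.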
11494
11495/// Merges shuffle masks and emits the final shuffle instruction, if required. It
11496/// supports shuffling of 2 input vectors. It implements lazy shuffle emission:
11497/// the actual shuffle instruction is generated only if it is really required.
11498/// Otherwise, the shuffle instruction emission is delayed till the end of the
11499/// process, to reduce the number of emitted instructions and to enable further
11500/// analysis/transformations.
11501/// The class will also look through the previously emitted shuffle instructions
11502/// and properly mark indices in the mask as undef.
11503/// For example, given the code
11504/// \code
11505/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
11506/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
11507/// \endcode
11508/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
11509/// look through %s1 and %s2 and emit
11510/// \code
11511/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11512/// \endcode
11513/// instead.
11514/// If 2 operands are of different size, the smallest one will be resized and
11515/// the mask recalculated properly.
11516/// For example, given the code
11517/// \code
11518/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
11519/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
11520/// \endcode
11521/// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
11522/// look through %s1 and %s2 and emit
11523/// \code
11524/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
11525/// \endcode
11526/// instead.
11527class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
11528 bool IsFinalized = false;
11529 /// Combined mask for all applied operands and masks. It is built during
11530 /// analysis and actual emission of shuffle vector instructions.
11531 SmallVector<int> CommonMask;
11532 /// List of operands for the shuffle vector instruction. It holds at most 2
11533 /// operands; if the 3rd one is going to be added, the first 2 are combined
11534 /// into a shuffle with the \p CommonMask mask, the first operand is set to
11535 /// the resulting shuffle and the second operand is set to the newly added
11536 /// operand. The \p CommonMask is transformed in the proper way after that.
11537 SmallVector<Value *, 2> InVectors;
11538 Type *ScalarTy = nullptr;
11539 IRBuilderBase &Builder;
11540 BoUpSLP &R;
11541
11542 class ShuffleIRBuilder {
11543 IRBuilderBase &Builder;
11544 /// Holds all of the instructions that we gathered.
11545 SetVector<Instruction *> &GatherShuffleExtractSeq;
11546 /// A list of blocks that we are going to CSE.
11547 DenseSet<BasicBlock *> &CSEBlocks;
11548 /// Data layout.
11549 const DataLayout &DL;
11550
11551 public:
11552 ShuffleIRBuilder(IRBuilderBase &Builder,
11553 SetVector<Instruction *> &GatherShuffleExtractSeq,
11554 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
11555 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
11556 CSEBlocks(CSEBlocks), DL(DL) {}
11557 ~ShuffleIRBuilder() = default;
11558 /// Creates shufflevector for the 2 operands with the given mask.
11559 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
11560 if (V1->getType() != V2->getType()) {
11561 assert(V2->getType()->isIntOrIntVectorTy() &&
11562 V1->getType()->isIntOrIntVectorTy() &&
11563 "Expected integer vector types only.");
11564 if (V1->getType() != V2->getType()) {
11565 if (cast<VectorType>(V2->getType())
11566 ->getElementType()
11567 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
11568 ->getElementType()
11569 ->getIntegerBitWidth())
11570 V2 = Builder.CreateIntCast(
11571 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
11572 else
11573 V1 = Builder.CreateIntCast(
11574 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
11575 }
11576 }
11577 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
11578 if (auto *I = dyn_cast<Instruction>(Vec)) {
11579 GatherShuffleExtractSeq.insert(I);
11580 CSEBlocks.insert(I->getParent());
11581 }
11582 return Vec;
11583 }
11584 /// Creates permutation of the single vector operand with the given mask, if
11585 /// it is not identity mask.
11586 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
11587 if (Mask.empty())
11588 return V1;
11589 unsigned VF = Mask.size();
11590 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
11591 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
11592 return V1;
11593 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
11594 if (auto *I = dyn_cast<Instruction>(Vec)) {
11595 GatherShuffleExtractSeq.insert(I);
11596 CSEBlocks.insert(I->getParent());
11597 }
11598 return Vec;
11599 }
11600 Value *createIdentity(Value *V) { return V; }
11601 Value *createPoison(Type *Ty, unsigned VF) {
11602 return PoisonValue::get(FixedVectorType::get(Ty, VF));
11603 }
11604 /// Resizes 2 input vectors to match in size, if they are not equal
11605 /// yet. The smaller vector is resized to the size of the larger vector.
11606 void resizeToMatch(Value *&V1, Value *&V2) {
11607 if (V1->getType() == V2->getType())
11608 return;
11609 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
11610 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
11611 int VF = std::max(V1VF, V2VF);
11612 int MinVF = std::min(V1VF, V2VF);
11613 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
11614 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
11615 0);
11616 Value *&Op = MinVF == V1VF ? V1 : V2;
11617 Op = Builder.CreateShuffleVector(Op, IdentityMask);
11618 if (auto *I = dyn_cast<Instruction>(Op)) {
11619 GatherShuffleExtractSeq.insert(I);
11620 CSEBlocks.insert(I->getParent());
11621 }
11622 if (MinVF == V1VF)
11623 V1 = Op;
11624 else
11625 V2 = Op;
11626 }
11627 };
11628
11629 /// Smart shuffle instruction emission, walks through the shuffle trees and
11630 /// tries to find the best matching vector for the actual shuffle
11631 /// instruction.
11632 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
11633 assert(V1 && "Expected at least one vector value.");
11634 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
11635 R.CSEBlocks, *R.DL);
11636 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
11637 ShuffleBuilder);
11638 }
11639
11640 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
11641 /// shuffle emission.
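/// For example (illustrative only), if \p CommonMask is <1, 0, poison, 3> and
/// the shuffle realizing it has just been emitted, calling this function with
/// \p Mask == \p CommonMask rewrites it to <0, 1, poison, 3>, i.e. an
/// identity mask over the freshly created vector.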
11642 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
11643 ArrayRef<int> Mask) {
11644 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11645 if (Mask[Idx] != PoisonMaskElem)
11646 CommonMask[Idx] = Idx;
11647 }
11648
11649 /// Cast value \p V to the vector type with the same number of elements, but
11650 /// the base type \p ScalarTy.
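/// E.g. (illustrative), when \p ScalarTy is i32 and \p V has type <4 x i8>,
/// this emits an integer cast of \p V to <4 x i32>, sign- or zero-extending
/// depending on the known sign of \p V.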
11651 Value *castToScalarTyElem(Value *V,
11652 std::optional<bool> IsSigned = std::nullopt) {
11653 auto *VecTy = cast<VectorType>(V->getType());
11654 if (VecTy->getElementType() == ScalarTy)
11655 return V;
11656 return Builder.CreateIntCast(
11657 V, VectorType::get(ScalarTy, VecTy->getElementCount()),
11658 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
11659 }
11660
11661public:
11662 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
11663 : ScalarTy(ScalarTy), Builder(Builder), R(R) {}
11664
11665 /// Adjusts extractelements after reusing them.
11666 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
11667 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
11668 unsigned NumParts, bool &UseVecBaseAsInput) {
11669 UseVecBaseAsInput = false;
11670 SmallPtrSet<Value *, 4> UniqueBases;
11671 Value *VecBase = nullptr;
11672 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
11673 int Idx = Mask[I];
11674 if (Idx == PoisonMaskElem)
11675 continue;
11676 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
11677 VecBase = EI->getVectorOperand();
11678 if (const TreeEntry *TE = R.getTreeEntry(VecBase))
11679 VecBase = TE->VectorizedValue;
11680 assert(VecBase && "Expected vectorized value.");
11681 UniqueBases.insert(VecBase);
11682 // If the only use is vectorized, we can delete the extractelement
11683 // itself.
11684 if (!EI->hasOneUse() || (NumParts != 1 && count(E->Scalars, EI) > 1) ||
11685 any_of(EI->users(), [&](User *U) {
11686 const TreeEntry *UTE = R.getTreeEntry(U);
11687 return !UTE || R.MultiNodeScalars.contains(U) ||
11688 (isa<GetElementPtrInst>(U) &&
11689 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
11690 count_if(R.VectorizableTree,
11691 [&](const std::unique_ptr<TreeEntry> &TE) {
11692 return any_of(TE->UserTreeIndices,
11693 [&](const EdgeInfo &Edge) {
11694 return Edge.UserTE == UTE;
11695 }) &&
11696 is_contained(TE->Scalars, EI);
11697 }) != 1;
11698 }))
11699 continue;
11700 R.eraseInstruction(EI);
11701 }
11702 if (NumParts == 1 || UniqueBases.size() == 1) {
11703 VecBase = castToScalarTyElem(VecBase);
11704 return VecBase;
11705 }
11706 UseVecBaseAsInput = true;
11707 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
11708 for (auto [I, Idx] : enumerate(Mask))
11709 if (Idx != PoisonMaskElem)
11710 Idx = I;
11711 };
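// E.g. (illustrative), TransformToIdentity turns <poison, 5, 2, poison> into
// <poison, 1, 2, poison>: every defined element is remapped to its own
// position within the sub-vector that has just been materialized.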
11712 // Perform multi-register vector shuffle, joining them into a single virtual
11713 // long vector.
11714 // Need to shuffle each part independently and then insert all these parts
11715 // into a long virtual vector register, forming the original vector.
11716 Value *Vec = nullptr;
11717 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
11718 unsigned SliceSize = E->Scalars.size() / NumParts;
11719 for (unsigned Part = 0; Part < NumParts; ++Part) {
11720 ArrayRef<Value *> VL =
11721 ArrayRef(E->Scalars).slice(Part * SliceSize, SliceSize);
11722 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, SliceSize);
11723 constexpr int MaxBases = 2;
11724 SmallVector<Value *, MaxBases> Bases(MaxBases);
11725#ifndef NDEBUG
11726 int PrevSize = 0;
11727#endif // NDEBUG
11728 for (const auto [I, V]: enumerate(VL)) {
11729 if (SubMask[I] == PoisonMaskElem)
11730 continue;
11731 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
11732 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
11733 VecOp = TE->VectorizedValue;
11734 assert(VecOp && "Expected vectorized value.");
11735 const int Size =
11736 cast<FixedVectorType>(VecOp->getType())->getNumElements();
11737#ifndef NDEBUG
11738 assert((PrevSize == Size || PrevSize == 0) &&
11739 "Expected vectors of the same size.");
11740 PrevSize = Size;
11741#endif // NDEBUG
11742 VecOp = castToScalarTyElem(VecOp);
11743 Bases[SubMask[I] < Size ? 0 : 1] = VecOp;
11744 }
11745 if (!Bases.front())
11746 continue;
11747 Value *SubVec;
11748 if (Bases.back()) {
11749 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
11750 TransformToIdentity(SubMask);
11751 } else {
11752 SubVec = Bases.front();
11753 }
11754 if (!Vec) {
11755 Vec = SubVec;
11756 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
11757 [&](unsigned P) {
11758 ArrayRef<int> SubMask =
11759 Mask.slice(P * SliceSize, SliceSize);
11760 return all_of(SubMask, [](int Idx) {
11761 return Idx == PoisonMaskElem;
11762 });
11763 })) &&
11764 "Expected first part or all previous parts masked.");
11765 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11766 } else {
11767 unsigned VF = cast<FixedVectorType>(Vec->getType())->getNumElements();
11768 if (Vec->getType() != SubVec->getType()) {
11769 unsigned SubVecVF =
11770 cast<FixedVectorType>(SubVec->getType())->getNumElements();
11771 VF = std::max(VF, SubVecVF);
11772 }
11773 // Adjust SubMask.
11774 for (int &Idx : SubMask)
11775 if (Idx != PoisonMaskElem)
11776 Idx += VF;
11777 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
11778 Vec = createShuffle(Vec, SubVec, VecMask);
11779 TransformToIdentity(VecMask);
11780 }
11781 }
11782 copy(VecMask, Mask.begin());
11783 return Vec;
11784 }
11785 /// Checks if the specified entry \p E needs to be delayed because of its
11786 /// dependency nodes.
11787 std::optional<Value *>
11788 needToDelay(const TreeEntry *E,
11789 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
11790 // No need to delay emission if all deps are ready.
11791 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
11792 return all_of(
11793 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
11794 }))
11795 return std::nullopt;
11796 // Postpone gather emission, will be emitted after the end of the
11797 // process to keep correct order.
11798 auto *ResVecTy = FixedVectorType::get(ScalarTy, E->getVectorFactor());
11799 return Builder.CreateAlignedLoad(
11800 ResVecTy,
11801 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
11802 MaybeAlign());
11803 }
11804 /// Adds 2 input vectors (in form of tree entries) and the mask for their
11805 /// shuffling.
11806 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
11807 Value *V1 = E1.VectorizedValue;
11808 if (V1->getType()->isIntOrIntVectorTy())
11809 V1 = castToScalarTyElem(V1, all_of(E1.Scalars, [&](Value *V) {
11810 return !isKnownNonNegative(
11811 V, SimplifyQuery(*R.DL));
11812 }));
11813 Value *V2 = E2.VectorizedValue;
11814 if (V2->getType()->isIntOrIntVectorTy())
11815 V2 = castToScalarTyElem(V2, all_of(E2.Scalars, [&](Value *V) {
11816 return !isKnownNonNegative(
11817 V, SimplifyQuery(*R.DL));
11818 }));
11819 add(V1, V2, Mask);
11820 }
11821 /// Adds single input vector (in form of tree entry) and the mask for its
11822 /// shuffling.
11823 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
11824 Value *V1 = E1.VectorizedValue;
11825 if (V1->getType()->isIntOrIntVectorTy())
11826 V1 = castToScalarTyElem(V1, all_of(E1.Scalars, [&](Value *V) {
11827 return !isKnownNonNegative(
11828 V, SimplifyQuery(*R.DL));
11829 }));
11830 add(V1, Mask);
11831 }
11832 /// Adds 2 input vectors and the mask for their shuffling.
11833 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
11834 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
11835 V1 = castToScalarTyElem(V1);
11836 V2 = castToScalarTyElem(V2);
11837 if (InVectors.empty()) {
11838 InVectors.push_back(V1);
11839 InVectors.push_back(V2);
11840 CommonMask.assign(Mask.begin(), Mask.end());
11841 return;
11842 }
11843 Value *Vec = InVectors.front();
11844 if (InVectors.size() == 2) {
11845 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
11846 transformMaskAfterShuffle(CommonMask, CommonMask);
11847 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
11848 Mask.size()) {
11849 Vec = createShuffle(Vec, nullptr, CommonMask);
11850 transformMaskAfterShuffle(CommonMask, CommonMask);
11851 }
11852 V1 = createShuffle(V1, V2, Mask);
11853 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11854 if (Mask[Idx] != PoisonMaskElem)
11855 CommonMask[Idx] = Idx + Sz;
11856 InVectors.front() = Vec;
11857 if (InVectors.size() == 2)
11858 InVectors.back() = V1;
11859 else
11860 InVectors.push_back(V1);
11861 }
11862 /// Adds one more input vector and the mask for its shuffling.
11863 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
11864 V1 = castToScalarTyElem(V1);
11865 if (InVectors.empty()) {
11866 if (!isa<FixedVectorType>(V1->getType())) {
11867 V1 = createShuffle(V1, nullptr, CommonMask);
11868 CommonMask.assign(Mask.size(), PoisonMaskElem);
11869 transformMaskAfterShuffle(CommonMask, Mask);
11870 }
11871 InVectors.push_back(V1);
11872 CommonMask.assign(Mask.begin(), Mask.end());
11873 return;
11874 }
11875 const auto *It = find(InVectors, V1);
11876 if (It == InVectors.end()) {
11877 if (InVectors.size() == 2 ||
11878 InVectors.front()->getType() != V1->getType() ||
11879 !isa<FixedVectorType>(V1->getType())) {
11880 Value *V = InVectors.front();
11881 if (InVectors.size() == 2) {
11882 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
11883 transformMaskAfterShuffle(CommonMask, CommonMask);
11884 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
11885 CommonMask.size()) {
11886 V = createShuffle(InVectors.front(), nullptr, CommonMask);
11887 transformMaskAfterShuffle(CommonMask, CommonMask);
11888 }
11889 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11890 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
11891 CommonMask[Idx] =
11892 V->getType() != V1->getType()
11893 ? Idx + Sz
11894 : Mask[Idx] + cast<FixedVectorType>(V1->getType())
11895 ->getNumElements();
11896 if (V->getType() != V1->getType())
11897 V1 = createShuffle(V1, nullptr, Mask);
11898 InVectors.front() = V;
11899 if (InVectors.size() == 2)
11900 InVectors.back() = V1;
11901 else
11902 InVectors.push_back(V1);
11903 return;
11904 }
11905 // Check if second vector is required if the used elements are already
11906 // used from the first one.
11907 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11908 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
11909 InVectors.push_back(V1);
11910 break;
11911 }
11912 }
11913 int VF = CommonMask.size();
11914 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
11915 VF = FTy->getNumElements();
11916 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11917 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
11918 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
11919 }
11920 /// Adds one more input vector and the mask for its shuffling.
11921 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
11922 SmallVector<int> NewMask;
11923 inversePermutation(Order, NewMask);
11924 add(V1, NewMask);
11925 }
11926 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
11927 Value *Root = nullptr) {
11928 return R.gather(VL, Root, ScalarTy);
11929 }
11930 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
11931 /// Finalize emission of the shuffles.
11932 /// \param Action the action (if any) to be performed before finally applying
11933 /// the \p ExtMask mask.
11934 Value *
11935 finalize(ArrayRef<int> ExtMask, unsigned VF = 0,
11936 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
11937 IsFinalized = true;
11938 if (Action) {
11939 Value *Vec = InVectors.front();
11940 if (InVectors.size() == 2) {
11941 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
11942 InVectors.pop_back();
11943 } else {
11944 Vec = createShuffle(Vec, nullptr, CommonMask);
11945 }
11946 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
11947 if (CommonMask[Idx] != PoisonMaskElem)
11948 CommonMask[Idx] = Idx;
11949 assert(VF > 0 &&
11950 "Expected vector length for the final value before action.");
11951 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
11952 if (VecVF < VF) {
11953 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
11954 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
11955 Vec = createShuffle(Vec, nullptr, ResizeMask);
11956 }
11957 Action(Vec, CommonMask);
11958 InVectors.front() = Vec;
11959 }
11960 if (!ExtMask.empty()) {
11961 if (CommonMask.empty()) {
11962 CommonMask.assign(ExtMask.begin(), ExtMask.end());
11963 } else {
11964 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
11965 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
11966 if (ExtMask[I] == PoisonMaskElem)
11967 continue;
11968 NewMask[I] = CommonMask[ExtMask[I]];
11969 }
11970 CommonMask.swap(NewMask);
11971 }
11972 }
11973 if (CommonMask.empty()) {
11974 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
11975 return InVectors.front();
11976 }
11977 if (InVectors.size() == 2)
11978 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
11979 return createShuffle(InVectors.front(), nullptr, CommonMask);
11980 }
11981
11982 ~ShuffleInstructionBuilder() {
11983 assert((IsFinalized || CommonMask.empty()) &&
11984 "Shuffle construction must be finalized.");
11985 }
11986};
11987
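// Vectorizes operand number NodeIdx of tree entry E. If the operand list
// matches an already vectorized tree entry (directly or via MultiNodeScalars),
// that entry is reused, reshuffling when the vectorization factors differ;
// otherwise the corresponding gather/buildvector node is emitted.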
11988Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
11989 bool PostponedPHIs) {
11990 ValueList &VL = E->getOperand(NodeIdx);
11991 const unsigned VF = VL.size();
11992 InstructionsState S = getSameOpcode(VL, *TLI);
11993 // Special processing for GEPs bundle, which may include non-gep values.
11994 if (!S.getOpcode() && VL.front()->getType()->isPointerTy()) {
11995 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
11996 if (It != VL.end())
11997 S = getSameOpcode(*It, *TLI);
11998 }
11999 if (S.getOpcode()) {
12000 auto CheckSameVE = [&](const TreeEntry *VE) {
12001 return VE->isSame(VL) &&
12002 (any_of(VE->UserTreeIndices,
12003 [E, NodeIdx](const EdgeInfo &EI) {
12004 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12005 }) ||
12006 any_of(VectorizableTree,
12007 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
12008 return TE->isOperandGatherNode({E, NodeIdx}) &&
12009 VE->isSame(TE->Scalars);
12010 }));
12011 };
12012 TreeEntry *VE = getTreeEntry(S.OpValue);
12013 bool IsSameVE = VE && CheckSameVE(VE);
12014 if (!IsSameVE) {
12015 auto It = MultiNodeScalars.find(S.OpValue);
12016 if (It != MultiNodeScalars.end()) {
12017 auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
12018 return TE != VE && CheckSameVE(TE);
12019 });
12020 if (I != It->getSecond().end()) {
12021 VE = *I;
12022 IsSameVE = true;
12023 }
12024 }
12025 }
12026 if (IsSameVE) {
12027 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
12028 ShuffleInstructionBuilder ShuffleBuilder(
12029 cast<VectorType>(V->getType())->getElementType(), Builder, *this);
12030 ShuffleBuilder.add(V, Mask);
12031 return ShuffleBuilder.finalize(std::nullopt);
12032 };
12033 Value *V = vectorizeTree(VE, PostponedPHIs);
12034 if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
12035 if (!VE->ReuseShuffleIndices.empty()) {
12036 // Reshuffle to get only unique values.
12037 // If some of the scalars are duplicated in the vectorization
12038 // tree entry, we do not vectorize them but instead generate a
12039 // mask for the reuses. But if there are several users of the
12040 // same entry, they may have different vectorization factors.
12041 // This is especially important for PHI nodes. In this case, we
12042 // need to adapt the resulting instruction for the user
12043 // vectorization factor and have to reshuffle it again to take
12044 // only unique elements of the vector. Without this code the
12045 // function incorrectly returns a reduced vector instruction with
12046 // the same elements, not with the unique ones.
12047
12048 // block:
12049 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
12050 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
12051 // ... (use %2)
12052 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
12053 // br %block
12054 SmallVector<int> Mask(VF, PoisonMaskElem);
12055 for (auto [I, V] : enumerate(VL)) {
12056 if (isa<PoisonValue>(V))
12057 continue;
12058 Mask[I] = VE->findLaneForValue(V);
12059 }
12060 V = FinalShuffle(V, Mask);
12061 } else {
12062 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
12063 "Expected vectorization factor less "
12064 "than original vector size.");
12065 SmallVector<int> UniformMask(VF, 0);
12066 std::iota(UniformMask.begin(), UniformMask.end(), 0);
12067 V = FinalShuffle(V, UniformMask);
12068 }
12069 }
12070 // Need to update the operand gather node if the operand is not actually a
12071 // vectorized node but a buildvector/gather node that matches one of the
12072 // vectorized nodes.
12073 if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
12074 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
12075 }) == VE->UserTreeIndices.end()) {
12076 auto *It = find_if(
12077 VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12078 return TE->State == TreeEntry::NeedToGather &&
12079 TE->UserTreeIndices.front().UserTE == E &&
12080 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
12081 });
12082 assert(It != VectorizableTree.end() && "Expected gather node operand.");
12083 (*It)->VectorizedValue = V;
12084 }
12085 return V;
12086 }
12087 }
12088
12089 // Find the corresponding gather entry and vectorize it.
12090 // This allows being more accurate with tree/graph transformations and checks
12091 // the correctness of the transformations in many cases.
12092 auto *I = find_if(VectorizableTree,
12093 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
12094 return TE->isOperandGatherNode({E, NodeIdx});
12095 });
12096 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
12097 assert(I->get()->UserTreeIndices.size() == 1 &&
12098 "Expected only single user for the gather node.");
12099 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
12100 return vectorizeTree(I->get(), PostponedPHIs);
12101}
12102
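// Common driver for emitting a gather (buildvector) node. It tries, in order,
// to reuse the vectors feeding gathered extractelements, to reuse already
// vectorized tree entries via shuffles, and finally packs the remaining
// scalars with insertelement sequences, freezing the result if poison could
// otherwise leak through.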
12103template <typename BVTy, typename ResTy, typename... Args>
12104ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
12105 Args &...Params) {
12106 assert(E->State == TreeEntry::NeedToGather && "Expected gather node.");
12107 unsigned VF = E->getVectorFactor();
12108
12109 bool NeedFreeze = false;
12110 SmallVector<int> ReuseShuffleIndicies(E->ReuseShuffleIndices.begin(),
12111 E->ReuseShuffleIndices.end());
12112 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
12113 // Build a mask out of the reorder indices and reorder scalars per this
12114 // mask.
12115 SmallVector<int> ReorderMask;
12116 inversePermutation(E->ReorderIndices, ReorderMask);
12117 if (!ReorderMask.empty())
12118 reorderScalars(GatheredScalars, ReorderMask);
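// FindReusedSplat: for a splat entry that still contains non-poison undefs,
// checks that the two-operand user also has another vectorized operand entry
// and, if so, rewrites the corresponding mask slice (identity or broadcast of
// the first used lane) so the existing splat source can be reused instead of
// emitting a new gather.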
12119 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
12120 unsigned I, unsigned SliceSize) {
12121 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
12122 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12123 }))
12124 return false;
12125 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
12126 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
12127 if (UserTE->getNumOperands() != 2)
12128 return false;
12129 auto *It =
12130 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
12131 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
12132 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
12133 }) != TE->UserTreeIndices.end();
12134 });
12135 if (It == VectorizableTree.end())
12136 return false;
12137 int Idx;
12138 if ((Mask.size() < InputVF &&
12139 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
12140 Idx == 0) ||
12141 (Mask.size() == InputVF &&
12142 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
12143 std::iota(std::next(Mask.begin(), I * SliceSize),
12144 std::next(Mask.begin(), (I + 1) * SliceSize), 0);
12145 } else {
12146 unsigned IVal =
12147 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
12148 std::fill(std::next(Mask.begin(), I * SliceSize),
12149 std::next(Mask.begin(), (I + 1) * SliceSize), IVal);
12150 }
12151 return true;
12152 };
12153 BVTy ShuffleBuilder(ScalarTy, Params...);
12154 ResTy Res = ResTy();
12155 SmallVector<int> Mask;
12156 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
12157 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
12158 Value *ExtractVecBase = nullptr;
12159 bool UseVecBaseAsInput = false;
12160 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
12161 SmallVector<SmallVector<const TreeEntry *>> Entries;
12162 Type *OrigScalarTy = GatheredScalars.front()->getType();
12163 auto *VecTy = FixedVectorType::get(ScalarTy, GatheredScalars.size());
12164 unsigned NumParts = TTI->getNumberOfParts(VecTy);
12165 if (NumParts == 0 || NumParts >= GatheredScalars.size())
12166 NumParts = 1;
12167 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
12168 // Check for gathered extracts.
12169 bool Resized = false;
12170 ExtractShuffles =
12171 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
12172 if (!ExtractShuffles.empty()) {
12173 SmallVector<const TreeEntry *> ExtractEntries;
12174 for (auto [Idx, I] : enumerate(ExtractMask)) {
12175 if (I == PoisonMaskElem)
12176 continue;
12177 if (const auto *TE = getTreeEntry(
12178 cast<ExtractElementInst>(E->Scalars[Idx])->getVectorOperand()))
12179 ExtractEntries.push_back(TE);
12180 }
12181 if (std::optional<ResTy> Delayed =
12182 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
12183 // Delay emission of gathers which are not ready yet.
12184 PostponedGathers.insert(E);
12185 // Postpone gather emission, will be emitted after the end of the
12186 // process to keep correct order.
12187 return *Delayed;
12188 }
12189 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
12190 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
12191 ExtractVecBase = VecBase;
12192 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
12193 if (VF == VecBaseTy->getNumElements() &&
12194 GatheredScalars.size() != VF) {
12195 Resized = true;
12196 GatheredScalars.append(VF - GatheredScalars.size(),
12197 PoisonValue::get(OrigScalarTy));
12198 }
12199 }
12200 }
12201 // Gather extracts after we check for fully matched gathers only.
12202 if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load ||
12203 E->isAltShuffle() ||
12204 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
12205 isSplat(E->Scalars) ||
12206 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
12207 GatherShuffles =
12208 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
12209 }
12210 if (!GatherShuffles.empty()) {
12211 if (std::optional<ResTy> Delayed =
12212 ShuffleBuilder.needToDelay(E, Entries)) {
12213 // Delay emission of gathers which are not ready yet.
12214 PostponedGathers.insert(E);
12215 // Postpone gather emission, will be emitted after the end of the
12216 // process to keep correct order.
12217 return *Delayed;
12218 }
12219 if (GatherShuffles.size() == 1 &&
12220 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
12221 Entries.front().front()->isSame(E->Scalars)) {
12222 // Perfect match in the graph, will reuse the previously vectorized
12223 // node. Cost is 0.
12224 LLVM_DEBUG(
12225 dbgs()
12226 << "SLP: perfect diamond match for gather bundle "
12227 << shortBundleName(E->Scalars) << ".\n");
12228 // Restore the mask for previous partially matched values.
12229 Mask.resize(E->Scalars.size());
12230 const TreeEntry *FrontTE = Entries.front().front();
12231 if (FrontTE->ReorderIndices.empty() &&
12232 ((FrontTE->ReuseShuffleIndices.empty() &&
12233 E->Scalars.size() == FrontTE->Scalars.size()) ||
12234 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
12235 std::iota(Mask.begin(), Mask.end(), 0);
12236 } else {
12237 for (auto [I, V] : enumerate(E->Scalars)) {
12238 if (isa<PoisonValue>(V)) {
12239 Mask[I] = PoisonMaskElem;
12240 continue;
12241 }
12242 Mask[I] = FrontTE->findLaneForValue(V);
12243 }
12244 }
12245 ShuffleBuilder.add(*FrontTE, Mask);
12246 Res = ShuffleBuilder.finalize(E->getCommonMask());
12247 return Res;
12248 }
12249 if (!Resized) {
12250 if (GatheredScalars.size() != VF &&
12251 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
12252 return any_of(TEs, [&](const TreeEntry *TE) {
12253 return TE->getVectorFactor() == VF;
12254 });
12255 }))
12256 GatheredScalars.append(VF - GatheredScalars.size(),
12257 PoisonValue::get(OrigScalarTy));
12258 }
12259 // Remove shuffled elements from list of gathers.
12260 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
12261 if (Mask[I] != PoisonMaskElem)
12262 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
12263 }
12264 }
12265 }
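// TryPackScalars: packs the remaining gathered scalars into a buildvector,
// deduplicating repeated values and turning splats into broadcasts. Undef
// lanes are remapped to a scalar known not to be poison when possible;
// otherwise they are replaced by poison and the result is frozen afterwards.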
12266 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
12267 SmallVectorImpl<int> &ReuseMask,
12268 bool IsRootPoison) {
12269 // For splats we can emit broadcasts instead of gathers, so try to find
12270 // such sequences.
12271 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
12272 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
12273 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
12274 SmallVector<int> UndefPos;
12275 DenseMap<Value *, unsigned> UniquePositions;
12276 // Gather unique non-const values and all constant values.
12277 // For repeated values, just shuffle them.
12278 int NumNonConsts = 0;
12279 int SinglePos = 0;
12280 for (auto [I, V] : enumerate(Scalars)) {
12281 if (isa<UndefValue>(V)) {
12282 if (!isa<PoisonValue>(V)) {
12283 ReuseMask[I] = I;
12284 UndefPos.push_back(I);
12285 }
12286 continue;
12287 }
12288 if (isConstant(V)) {
12289 ReuseMask[I] = I;
12290 continue;
12291 }
12292 ++NumNonConsts;
12293 SinglePos = I;
12294 Value *OrigV = V;
12295 Scalars[I] = PoisonValue::get(OrigScalarTy);
12296 if (IsSplat) {
12297 Scalars.front() = OrigV;
12298 ReuseMask[I] = 0;
12299 } else {
12300 const auto Res = UniquePositions.try_emplace(OrigV, I);
12301 Scalars[Res.first->second] = OrigV;
12302 ReuseMask[I] = Res.first->second;
12303 }
12304 }
12305 if (NumNonConsts == 1) {
12306 // Restore single insert element.
12307 if (IsSplat) {
12308 ReuseMask.assign(VF, PoisonMaskElem);
12309 std::swap(Scalars.front(), Scalars[SinglePos]);
12310 if (!UndefPos.empty() && UndefPos.front() == 0)
12311 Scalars.front() = UndefValue::get(OrigScalarTy);
12312 }
12313 ReuseMask[SinglePos] = SinglePos;
12314 } else if (!UndefPos.empty() && IsSplat) {
12315 // For undef values, try to replace them with the simple broadcast.
12316 // We can do it if the broadcasted value is guaranteed to be
12317 // non-poisonous, or by freezing the incoming scalar value first.
12318 auto *It = find_if(Scalars, [this, E](Value *V) {
12319 return !isa<UndefValue>(V) &&
12320 (getTreeEntry(V) || isGuaranteedNotToBePoison(V) ||
12321 (E->UserTreeIndices.size() == 1 &&
12322 any_of(V->uses(), [E](const Use &U) {
12323 // Check if the value already used in the same operation in
12324 // one of the nodes already.
12325 return E->UserTreeIndices.front().EdgeIdx !=
12326 U.getOperandNo() &&
12327 is_contained(
12328 E->UserTreeIndices.front().UserTE->Scalars,
12329 U.getUser());
12330 })));
12331 });
12332 if (It != Scalars.end()) {
12333 // Replace undefs by the non-poisoned scalars and emit broadcast.
12334 int Pos = std::distance(Scalars.begin(), It);
12335 for (int I : UndefPos) {
12336 // Set the undef position to the non-poisoned scalar.
12337 ReuseMask[I] = Pos;
12338 // Replace the undef by poison; in the mask it is already replaced by the
12339 // non-poisoned scalar.
12340 if (I != Pos)
12341 Scalars[I] = PoisonValue::get(OrigScalarTy);
12342 }
12343 } else {
12344 // Replace undefs by the poisons, emit broadcast and then emit
12345 // freeze.
12346 for (int I : UndefPos) {
12347 ReuseMask[I] = PoisonMaskElem;
12348 if (isa<UndefValue>(Scalars[I]))
12349 Scalars[I] = PoisonValue::get(OrigScalarTy);
12350 }
12351 NeedFreeze = true;
12352 }
12353 }
12354 };
12355 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
12356 bool IsNonPoisoned = true;
12357 bool IsUsedInExpr = true;
12358 Value *Vec1 = nullptr;
12359 if (!ExtractShuffles.empty()) {
12360 // Gather of extractelements can be represented as just a shuffle of
12361 // one or two vectors from which the scalars are extracted.
12362 // Find input vectors.
12363 Value *Vec2 = nullptr;
12364 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12365 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
12366 ExtractMask[I] = PoisonMaskElem;
12367 }
12368 if (UseVecBaseAsInput) {
12369 Vec1 = ExtractVecBase;
12370 } else {
12371 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
12372 if (ExtractMask[I] == PoisonMaskElem)
12373 continue;
12374 if (isa<UndefValue>(E->Scalars[I]))
12375 continue;
12376 auto *EI = cast<ExtractElementInst>(E->Scalars[I]);
12377 Value *VecOp = EI->getVectorOperand();
12378 if (const auto *TE = getTreeEntry(VecOp))
12379 if (TE->VectorizedValue)
12380 VecOp = TE->VectorizedValue;
12381 if (!Vec1) {
12382 Vec1 = VecOp;
12383 } else if (Vec1 != VecOp) {
12384 assert((!Vec2 || Vec2 == VecOp) &&
12385 "Expected only 1 or 2 vectors shuffle.");
12386 Vec2 = VecOp;
12387 }
12388 }
12389 }
12390 if (Vec2) {
12391 IsUsedInExpr = false;
12392 IsNonPoisoned &=
12393 isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2);
12394 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
12395 } else if (Vec1) {
12396 IsUsedInExpr &= FindReusedSplat(
12397 ExtractMask,
12398 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
12399 ExtractMask.size());
12400 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
12401 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1);
12402 } else {
12403 IsUsedInExpr = false;
12404 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
12405 /*ForExtracts=*/true);
12406 }
12407 }
12408 if (!GatherShuffles.empty()) {
12409 unsigned SliceSize = E->Scalars.size() / NumParts;
12410 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
12411 for (const auto [I, TEs] : enumerate(Entries)) {
12412 if (TEs.empty()) {
12413 assert(!GatherShuffles[I] &&
12414 "No shuffles with empty entries list expected.");
12415 continue;
12416 }
12417 assert((TEs.size() == 1 || TEs.size() == 2) &&
12418 "Expected shuffle of 1 or 2 entries.");
12419 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
12420 VecMask.assign(VecMask.size(), PoisonMaskElem);
12421 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
12422 if (TEs.size() == 1) {
12423 IsUsedInExpr &= FindReusedSplat(
12424 VecMask, TEs.front()->getVectorFactor(), I, SliceSize);
12425 ShuffleBuilder.add(*TEs.front(), VecMask);
12426 if (TEs.front()->VectorizedValue)
12427 IsNonPoisoned &=
12428 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
12429 } else {
12430 IsUsedInExpr = false;
12431 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
12432 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
12433 IsNonPoisoned &=
12434 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
12435 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
12436 }
12437 }
12438 }
12439 // Try to figure out the best way to combine values: build a shuffle and
12440 // insert elements, or just build several shuffles.
12441 // Insert non-constant scalars.
12442 SmallVector<Value *> NonConstants(GatheredScalars);
12443 int EMSz = ExtractMask.size();
12444 int MSz = Mask.size();
12445 // Try to build a constant vector and shuffle with it only if we currently
12446 // have a single permutation and more than one scalar constant.
12447 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
12448 bool IsIdentityShuffle =
12449 ((UseVecBaseAsInput ||
12450 all_of(ExtractShuffles,
12451 [](const std::optional<TTI::ShuffleKind> &SK) {
12452 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12453 TTI::SK_PermuteSingleSrc;
12454 })) &&
12455 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
12456 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
12457 (!GatherShuffles.empty() &&
12458 all_of(GatherShuffles,
12459 [](const std::optional<TTI::ShuffleKind> &SK) {
12460 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
12461 TTI::SK_PermuteSingleSrc;
12462 }) &&
12463 none_of(Mask, [&](int I) { return I >= MSz; }) &&
12464 ShuffleVectorInst::isIdentityMask(Mask, MSz));
12465 bool EnoughConstsForShuffle =
12466 IsSingleShuffle &&
12467 (none_of(GatheredScalars,
12468 [](Value *V) {
12469 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
12470 }) ||
12471 any_of(GatheredScalars,
12472 [](Value *V) {
12473 return isa<Constant>(V) && !isa<UndefValue>(V);
12474 })) &&
12475 (!IsIdentityShuffle ||
12476 (GatheredScalars.size() == 2 &&
12477 any_of(GatheredScalars,
12478 [](Value *V) { return !isa<UndefValue>(V); })) ||
12479 count_if(GatheredScalars, [](Value *V) {
12480 return isa<Constant>(V) && !isa<PoisonValue>(V);
12481 }) > 1);
12482 // The NonConstants array contains just the non-constant values; GatheredScalars
12483 // contains only the constants used to build the final vector, which is then shuffled.
12484 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
12485 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
12486 NonConstants[I] = PoisonValue::get(OrigScalarTy);
12487 else
12488 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
12489 }
12490 // Generate constants for final shuffle and build a mask for them.
12491 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
12492 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
12493 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
12494 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
12495 ShuffleBuilder.add(BV, BVMask);
12496 }
12497 if (all_of(NonConstants, [=](Value *V) {
12498 return isa<PoisonValue>(V) ||
12499 (IsSingleShuffle && ((IsIdentityShuffle &&
12500 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
12501 }))
12502 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12503 else
12504 Res = ShuffleBuilder.finalize(
12505 E->ReuseShuffleIndices, E->Scalars.size(),
12506 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
12507 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
12508 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
12509 });
12510 } else if (!allConstant(GatheredScalars)) {
12511 // Gather unique scalars and all constants.
12512 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
12513 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
12514 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
12515 ShuffleBuilder.add(BV, ReuseMask);
12516 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12517 } else {
12518 // Gather all constants.
12519 SmallVector<int> Mask(E->Scalars.size(), PoisonMaskElem);
12520 for (auto [I, V] : enumerate(E->Scalars)) {
12521 if (!isa<PoisonValue>(V))
12522 Mask[I] = I;
12523 }
12524 Value *BV = ShuffleBuilder.gather(E->Scalars);
12525 ShuffleBuilder.add(BV, Mask);
12526 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12527 }
12528
12529 if (NeedFreeze)
12530 Res = ShuffleBuilder.createFreeze(Res);
12531 return Res;
12532}
12533
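// Emits IR for a gather node by instantiating processBuildVector with the
// IR-emitting ShuffleInstructionBuilder.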
12534Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy) {
12535 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
12536 Builder, *this);
12537}
12538
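// Emits the vector code for a single tree entry. Gather nodes are handled by
// createBuildVector; vectorizable nodes are dispatched on their (alternate)
// opcode, and FinalShuffle applies the reorder/reuse masks to each result.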
12539Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
12540 IRBuilderBase::InsertPointGuard Guard(Builder);
12541
12542 if (E->VectorizedValue &&
12543 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
12544 E->isAltShuffle())) {
12545 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
12546 return E->VectorizedValue;
12547 }
12548
12549 Value *V = E->Scalars.front();
12550 Type *ScalarTy = V->getType();
12551 if (auto *Store = dyn_cast<StoreInst>(V))
12552 ScalarTy = Store->getValueOperand()->getType();
12553 else if (auto *IE = dyn_cast<InsertElementInst>(V))
12554 ScalarTy = IE->getOperand(1)->getType();
12555 auto It = MinBWs.find(E);
12556 if (It != MinBWs.end())
12557 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
12558 auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
12559 if (E->State == TreeEntry::NeedToGather) {
12560 // Set insert point for non-reduction initial nodes.
12561 if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
12562 setInsertPointAfterBundle(E);
12563 Value *Vec = createBuildVector(E, ScalarTy);
12564 E->VectorizedValue = Vec;
12565 return Vec;
12566 }
12567
12568 bool IsReverseOrder = isReverseOrder(E->ReorderIndices);
12569 auto FinalShuffle = [&](Value *V, const TreeEntry *E, VectorType *VecTy) {
12570 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
12571 if (E->getOpcode() == Instruction::Store &&
12572 E->State == TreeEntry::Vectorize) {
12573 ArrayRef<int> Mask =
12574 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
12575 E->ReorderIndices.size());
12576 ShuffleBuilder.add(V, Mask);
12577 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
12578 ShuffleBuilder.addOrdered(V, std::nullopt);
12579 } else {
12580 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
12581 }
12582 return ShuffleBuilder.finalize(E->ReuseShuffleIndices);
12583 };
12584
12585 assert((E->State == TreeEntry::Vectorize ||
12586 E->State == TreeEntry::ScatterVectorize ||
12587 E->State == TreeEntry::StridedVectorize) &&
12588 "Unhandled state");
12589 unsigned ShuffleOrOp =
12590 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
12591 Instruction *VL0 = E->getMainOp();
12592 auto GetOperandSignedness = [&](unsigned Idx) {
12593 const TreeEntry *OpE = getOperandEntry(E, Idx);
12594 bool IsSigned = false;
12595 auto It = MinBWs.find(OpE);
12596 if (It != MinBWs.end())
12597 IsSigned = It->second.second;
12598 else
12599 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
12600 return !isKnownNonNegative(R, SimplifyQuery(*DL));
12601 });
12602 return IsSigned;
12603 };
12604 switch (ShuffleOrOp) {
12605 case Instruction::PHI: {
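// PHIs are emitted in two phases: an empty vector PHI is created first so
// that cyclic uses can refer to it, and the incoming values are filled in on
// the second, non-postponed pass.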
12606 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
12607 E != VectorizableTree.front().get() ||
12608 !E->UserTreeIndices.empty()) &&
12609 "PHI reordering is free.");
12610 if (PostponedPHIs && E->VectorizedValue)
12611 return E->VectorizedValue;
12612 auto *PH = cast<PHINode>(VL0);
12613 Builder.SetInsertPoint(PH->getParent(),
12614 PH->getParent()->getFirstNonPHIIt());
12615 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12616 if (PostponedPHIs || !E->VectorizedValue) {
12617 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
12618 E->PHI = NewPhi;
12619 Value *V = NewPhi;
12620
12621 // Adjust insertion point once all PHIs have been generated.
12622 Builder.SetInsertPoint(PH->getParent(),
12623 PH->getParent()->getFirstInsertionPt());
12624 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12625
12626 V = FinalShuffle(V, E, VecTy);
12627
12628 E->VectorizedValue = V;
12629 if (PostponedPHIs)
12630 return V;
12631 }
12632 PHINode *NewPhi = cast<PHINode>(E->PHI);
12633 // If phi node is fully emitted - exit.
12634 if (NewPhi->getNumIncomingValues() != 0)
12635 return NewPhi;
12636
12637 // PHINodes may have multiple entries from the same block. We want to
12638 // visit every block once.
12639 SmallPtrSet<BasicBlock *, 4> VisitedBBs;
12640
12641 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
12643 BasicBlock *IBB = PH->getIncomingBlock(I);
12644
12645 // Stop emission if all incoming values are generated.
12646 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
12647 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12648 return NewPhi;
12649 }
12650
12651 if (!VisitedBBs.insert(IBB).second) {
12652 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
12653 continue;
12654 }
12655
12656 Builder.SetInsertPoint(IBB->getTerminator());
12657 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
12658 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
12659 if (VecTy != Vec->getType()) {
12660 assert((It != MinBWs.end() ||
12661 getOperandEntry(E, I)->State == TreeEntry::NeedToGather ||
12662 MinBWs.contains(getOperandEntry(E, I))) &&
12663 "Expected item in MinBWs.");
12664 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
12665 }
12666 NewPhi->addIncoming(Vec, IBB);
12667 }
12668
12669 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
12670 "Invalid number of incoming values");
12671 return NewPhi;
12672 }
12673
12674 case Instruction::ExtractElement: {
12675 Value *V = E->getSingleOperand(0);
12676 if (const TreeEntry *TE = getTreeEntry(V))
12677 V = TE->VectorizedValue;
12678 setInsertPointAfterBundle(E);
12679 V = FinalShuffle(V, E, VecTy);
12680 E->VectorizedValue = V;
12681 return V;
12682 }
12683 case Instruction::ExtractValue: {
12684 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
12685 Builder.SetInsertPoint(LI);
12686 Value *Ptr = LI->getPointerOperand();
12687 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
12688 Value *NewV = propagateMetadata(V, E->Scalars);
12689 NewV = FinalShuffle(NewV, E, VecTy);
12690 E->VectorizedValue = NewV;
12691 return NewV;
12692 }
12693 case Instruction::InsertElement: {
12694 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
12695 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
12696 Value *V = vectorizeOperand(E, 1, PostponedPHIs);
12697 ArrayRef<Value *> Op = E->getOperand(1);
12698 Type *ScalarTy = Op.front()->getType();
12699 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
12700 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
12701 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
12702 assert(Res.first > 0 && "Expected item in MinBWs.");
12703 V = Builder.CreateIntCast(
12704 V,
12705 FixedVectorType::get(
12706 ScalarTy,
12707 cast<FixedVectorType>(V->getType())->getNumElements()),
12708 Res.second);
12709 }
12710
12711 // Create InsertVector shuffle if necessary
12712 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
12713 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
12714 }));
12715 const unsigned NumElts =
12716 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
12717 const unsigned NumScalars = E->Scalars.size();
12718
12719 unsigned Offset = *getInsertIndex(VL0);
12720 assert(Offset < NumElts && "Failed to find vector index offset");
12721
12722 // Create shuffle to resize vector
12723 SmallVector<int> Mask;
12724 if (!E->ReorderIndices.empty()) {
12725 inversePermutation(E->ReorderIndices, Mask);
12726 Mask.append(NumElts - NumScalars, PoisonMaskElem);
12727 } else {
12728 Mask.assign(NumElts, PoisonMaskElem);
12729 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
12730 }
12731 // Create InsertVector shuffle if necessary
12732 bool IsIdentity = true;
12733 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
12734 Mask.swap(PrevMask);
12735 for (unsigned I = 0; I < NumScalars; ++I) {
12736 Value *Scalar = E->Scalars[PrevMask[I]];
12737 unsigned InsertIdx = *getInsertIndex(Scalar);
12738 IsIdentity &= InsertIdx - Offset == I;
12739 Mask[InsertIdx - Offset] = I;
12740 }
12741 if (!IsIdentity || NumElts != NumScalars) {
12742 Value *V2 = nullptr;
12743 bool IsVNonPoisonous = isGuaranteedNotToBePoison(V) && !isConstant(V);
12744 SmallVector<int> InsertMask(Mask);
12745 if (NumElts != NumScalars && Offset == 0) {
12746 // Follow all insert element instructions from the current buildvector
12747 // sequence.
12748 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
12749 do {
12750 std::optional<unsigned> InsertIdx = getInsertIndex(Ins);
12751 if (!InsertIdx)
12752 break;
12753 if (InsertMask[*InsertIdx] == PoisonMaskElem)
12754 InsertMask[*InsertIdx] = *InsertIdx;
12755 if (!Ins->hasOneUse())
12756 break;
12757 Ins = dyn_cast_or_null<InsertElementInst>(
12758 Ins->getUniqueUndroppableUser());
12759 } while (Ins);
12760 SmallBitVector UseMask =
12761 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12762 SmallBitVector IsFirstPoison =
12763 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12764 SmallBitVector IsFirstUndef =
12765 isUndefVector(FirstInsert->getOperand(0), UseMask);
12766 if (!IsFirstPoison.all()) {
12767 unsigned Idx = 0;
12768 for (unsigned I = 0; I < NumElts; I++) {
12769 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
12770 IsFirstUndef.test(I)) {
12771 if (IsVNonPoisonous) {
12772 InsertMask[I] = I < NumScalars ? I : 0;
12773 continue;
12774 }
12775 if (!V2)
12776 V2 = UndefValue::get(V->getType());
12777 if (Idx >= NumScalars)
12778 Idx = NumScalars - 1;
12779 InsertMask[I] = NumScalars + Idx;
12780 ++Idx;
12781 } else if (InsertMask[I] != PoisonMaskElem &&
12782 Mask[I] == PoisonMaskElem) {
12783 InsertMask[I] = PoisonMaskElem;
12784 }
12785 }
12786 } else {
12787 InsertMask = Mask;
12788 }
12789 }
12790 if (!V2)
12791 V2 = PoisonValue::get(V->getType());
12792 V = Builder.CreateShuffleVector(V, V2, InsertMask);
12793 if (auto *I = dyn_cast<Instruction>(V)) {
12794 GatherShuffleExtractSeq.insert(I);
12795 CSEBlocks.insert(I->getParent());
12796 }
12797 }
12798
12799 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
12800 for (unsigned I = 0; I < NumElts; I++) {
12801 if (Mask[I] != PoisonMaskElem)
12802 InsertMask[Offset + I] = I;
12803 }
12804 SmallBitVector UseMask =
12805 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
12806 SmallBitVector IsFirstUndef =
12807 isUndefVector(FirstInsert->getOperand(0), UseMask);
12808 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
12809 NumElts != NumScalars) {
12810 if (IsFirstUndef.all()) {
12811 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
12812 SmallBitVector IsFirstPoison =
12813 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12814 if (!IsFirstPoison.all()) {
12815 for (unsigned I = 0; I < NumElts; I++) {
12816 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
12817 InsertMask[I] = I + NumElts;
12818 }
12819 }
12820 V = Builder.CreateShuffleVector(
12821 V,
12822 IsFirstPoison.all() ? PoisonValue::get(V->getType())
12823 : FirstInsert->getOperand(0),
12824 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
12825 if (auto *I = dyn_cast<Instruction>(V)) {
12826 GatherShuffleExtractSeq.insert(I);
12827 CSEBlocks.insert(I->getParent());
12828 }
12829 }
12830 } else {
12831 SmallBitVector IsFirstPoison =
12832 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
12833 for (unsigned I = 0; I < NumElts; I++) {
12834 if (InsertMask[I] == PoisonMaskElem)
12835 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
12836 else
12837 InsertMask[I] += NumElts;
12838 }
12839 V = Builder.CreateShuffleVector(
12840 FirstInsert->getOperand(0), V, InsertMask,
12841 cast<Instruction>(E->Scalars.back())->getName());
12842 if (auto *I = dyn_cast<Instruction>(V)) {
12843 GatherShuffleExtractSeq.insert(I);
12844 CSEBlocks.insert(I->getParent());
12845 }
12846 }
12847 }
12848
12849 ++NumVectorInstructions;
12850 E->VectorizedValue = V;
12851 return V;
12852 }
12853 case Instruction::ZExt:
12854 case Instruction::SExt:
12855 case Instruction::FPToUI:
12856 case Instruction::FPToSI:
12857 case Instruction::FPExt:
12858 case Instruction::PtrToInt:
12859 case Instruction::IntToPtr:
12860 case Instruction::SIToFP:
12861 case Instruction::UIToFP:
12862 case Instruction::Trunc:
12863 case Instruction::FPTrunc:
12864 case Instruction::BitCast: {
12865 setInsertPointAfterBundle(E);
12866
12867 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
12868 if (E->VectorizedValue) {
12869 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12870 return E->VectorizedValue;
12871 }
12872
12873 auto *CI = cast<CastInst>(VL0);
12874 Instruction::CastOps VecOpcode = CI->getOpcode();
12875 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
12876 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
12877 if (!ScalarTy->isFloatingPointTy() && !SrcScalarTy->isFloatingPointTy() &&
12878 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
12879 SrcScalarTy != CI->getOperand(0)->getType())) {
12880 // Check if the values are candidates to demote.
12881 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
12882 if (SrcIt != MinBWs.end())
12883 SrcBWSz = SrcIt->second.first;
12884 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
12885 if (BWSz == SrcBWSz) {
12886 VecOpcode = Instruction::BitCast;
12887 } else if (BWSz < SrcBWSz) {
12888 VecOpcode = Instruction::Trunc;
12889 } else if (It != MinBWs.end()) {
12890 assert(BWSz > SrcBWSz && "Invalid cast!");
12891 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12892 } else if (SrcIt != MinBWs.end()) {
12893 assert(BWSz > SrcBWSz && "Invalid cast!");
12894 VecOpcode =
12895 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
12896 }
12897 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
12898 !SrcIt->second.second) {
12899 VecOpcode = Instruction::UIToFP;
12900 }
12901 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
12902 ? InVec
12903 : Builder.CreateCast(VecOpcode, InVec, VecTy);
12904 V = FinalShuffle(V, E, VecTy);
12905
12906 E->VectorizedValue = V;
12907 ++NumVectorInstructions;
12908 return V;
12909 }
12910 case Instruction::FCmp:
12911 case Instruction::ICmp: {
12912 setInsertPointAfterBundle(E);
12913
12914 Value *L = vectorizeOperand(E, 0, PostponedPHIs);
12915 if (E->VectorizedValue) {
12916 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12917 return E->VectorizedValue;
12918 }
12919 Value *R = vectorizeOperand(E, 1, PostponedPHIs);
12920 if (E->VectorizedValue) {
12921 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12922 return E->VectorizedValue;
12923 }
12924 if (L->getType() != R->getType()) {
12925 assert((getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
12926 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12927 MinBWs.contains(getOperandEntry(E, 0)) ||
12928 MinBWs.contains(getOperandEntry(E, 1))) &&
12929 "Expected item in MinBWs.");
12930 if (cast<VectorType>(L->getType())
12931 ->getElementType()
12932 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
12933 ->getElementType()
12934 ->getIntegerBitWidth()) {
12935 Type *CastTy = R->getType();
12936 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
12937 } else {
12938 Type *CastTy = L->getType();
12939 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
12940 }
12941 }
12942
12943 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
12944 Value *V = Builder.CreateCmp(P0, L, R);
12945 propagateIRFlags(V, E->Scalars, VL0);
12946 // Do not cast for cmps.
12947 VecTy = cast<FixedVectorType>(V->getType());
12948 V = FinalShuffle(V, E, VecTy);
12949
12950 E->VectorizedValue = V;
12951 ++NumVectorInstructions;
12952 return V;
12953 }
12954 case Instruction::Select: {
12955 setInsertPointAfterBundle(E);
12956
12957 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
12958 if (E->VectorizedValue) {
12959 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12960 return E->VectorizedValue;
12961 }
12962 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
12963 if (E->VectorizedValue) {
12964 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12965 return E->VectorizedValue;
12966 }
12967 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
12968 if (E->VectorizedValue) {
12969 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12970 return E->VectorizedValue;
12971 }
12972 if (True->getType() != VecTy || False->getType() != VecTy) {
12973 assert((It != MinBWs.end() ||
12974 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
12975 getOperandEntry(E, 2)->State == TreeEntry::NeedToGather ||
12976 MinBWs.contains(getOperandEntry(E, 1)) ||
12977 MinBWs.contains(getOperandEntry(E, 2))) &&
12978 "Expected item in MinBWs.");
12979 if (True->getType() != VecTy)
12980 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
12981 if (False->getType() != VecTy)
12982 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
12983 }
12984
12985 Value *V = Builder.CreateSelect(Cond, True, False);
12986 V = FinalShuffle(V, E, VecTy);
12987
12988 E->VectorizedValue = V;
12989 ++NumVectorInstructions;
12990 return V;
12991 }
12992 case Instruction::FNeg: {
12993 setInsertPointAfterBundle(E);
12994
12995 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
12996
12997 if (E->VectorizedValue) {
12998 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
12999 return E->VectorizedValue;
13000 }
13001
13002 Value *V = Builder.CreateUnOp(
13003 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
13004 propagateIRFlags(V, E->Scalars, VL0);
13005 if (auto *I = dyn_cast<Instruction>(V))
13006 V = propagateMetadata(I, E->Scalars);
13007
13008 V = FinalShuffle(V, E, VecTy);
13009
13010 E->VectorizedValue = V;
13011 ++NumVectorInstructions;
13012
13013 return V;
13014 }
13015 case Instruction::Add:
13016 case Instruction::FAdd:
13017 case Instruction::Sub:
13018 case Instruction::FSub:
13019 case Instruction::Mul:
13020 case Instruction::FMul:
13021 case Instruction::UDiv:
13022 case Instruction::SDiv:
13023 case Instruction::FDiv:
13024 case Instruction::URem:
13025 case Instruction::SRem:
13026 case Instruction::FRem:
13027 case Instruction::Shl:
13028 case Instruction::LShr:
13029 case Instruction::AShr:
13030 case Instruction::And:
13031 case Instruction::Or:
13032 case Instruction::Xor: {
13033 setInsertPointAfterBundle(E);
13034
13035 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
13036 if (E->VectorizedValue) {
13037 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13038 return E->VectorizedValue;
13039 }
13040 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
13041 if (E->VectorizedValue) {
13042 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13043 return E->VectorizedValue;
13044 }
13045 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
13046 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
13047 ArrayRef<Value *> Ops = E->getOperand(I);
13048 if (all_of(Ops, [&](Value *Op) {
13049 auto *CI = dyn_cast<ConstantInt>(Op);
13050 return CI && CI->getValue().countr_one() >= It->second.first;
13051 })) {
13052 V = FinalShuffle(I == 0 ? RHS : LHS, E, VecTy);
13053 E->VectorizedValue = V;
13054 ++NumVectorInstructions;
13055 return V;
13056 }
13057 }
13058 }
13059 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
13060 assert((It != MinBWs.end() ||
13061 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
13062 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
13063 MinBWs.contains(getOperandEntry(E, 0)) ||
13064 MinBWs.contains(getOperandEntry(E, 1))) &&
13065 "Expected item in MinBWs.");
13066 if (LHS->getType() != VecTy)
13067 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
13068 if (RHS->getType() != VecTy)
13069 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
13070 }
13071
13072 Value *V = Builder.CreateBinOp(
13073 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
13074 RHS);
13075 propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
13076 if (auto *I = dyn_cast<Instruction>(V)) {
13077 V = propagateMetadata(I, E->Scalars);
13078 // Drop nuw flags for abs(sub(commutative), true).
13079 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
13080 any_of(E->Scalars, [](Value *V) {
13081 return isCommutative(cast<Instruction>(V));
13082 }))
13083 I->setHasNoUnsignedWrap(/*b=*/false);
13084 }
13085
13086 V = FinalShuffle(V, E, VecTy);
13087
13088 E->VectorizedValue = V;
13089 ++NumVectorInstructions;
13090
13091 return V;
13092 }
13093 case Instruction::Load: {
13094 // Loads are inserted at the head of the tree because we don't want to
13095 // sink them all the way down past store instructions.
13096 setInsertPointAfterBundle(E);
13097
13098 LoadInst *LI = cast<LoadInst>(VL0);
13099 Instruction *NewLI;
13100 Value *PO = LI->getPointerOperand();
13101 if (E->State == TreeEntry::Vectorize) {
13102 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
13103 } else if (E->State == TreeEntry::StridedVectorize) {
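// Strided load: the stride is either a constant derived from the pointer
// difference between the first and last scalar load or a runtime value from
// calculateRtStride, scaled by the element size and negated for reversed
// orders; the whole bundle becomes one llvm.experimental.vp.strided.load.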
13104 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
13105 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
13106 PO = IsReverseOrder ? PtrN : Ptr0;
13107 std::optional<int> Diff = getPointersDiff(
13108 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
13109 Type *StrideTy = DL->getIndexType(PO->getType());
13110 Value *StrideVal;
13111 if (Diff) {
13112 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
13113 StrideVal =
13114 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
13115 DL->getTypeAllocSize(ScalarTy));
13116 } else {
13117 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
13118 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
13119 return cast<LoadInst>(V)->getPointerOperand();
13120 });
13121 OrdersType Order;
13122 std::optional<Value *> Stride =
13123 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
13124 &*Builder.GetInsertPoint());
13125 Value *NewStride =
13126 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
13127 StrideVal = Builder.CreateMul(
13128 NewStride,
13129 ConstantInt::get(
13130 StrideTy,
13131 (IsReverseOrder ? -1 : 1) *
13132 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
13133 }
13134 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13135 auto *Inst = Builder.CreateIntrinsic(
13136 Intrinsic::experimental_vp_strided_load,
13137 {VecTy, PO->getType(), StrideTy},
13138 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
13139 Builder.getInt32(E->Scalars.size())});
13140 Inst->addParamAttr(
13141 /*ArgNo=*/0,
13142 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
13143 NewLI = Inst;
13144 } else {
13145 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
13146 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
13147 if (E->VectorizedValue) {
13148 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13149 return E->VectorizedValue;
13150 }
13151 // Use the minimum alignment of the gathered loads.
13152 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
13153 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
13154 }
13155 Value *V = propagateMetadata(NewLI, E->Scalars);
13156
13157 V = FinalShuffle(V, E, VecTy);
13158 E->VectorizedValue = V;
13159 ++NumVectorInstructions;
13160 return V;
13161 }
13162 case Instruction::Store: {
13163 auto *SI = cast<StoreInst>(VL0);
13164
13165 setInsertPointAfterBundle(E);
13166
13167 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
13168 if (VecValue->getType() != VecTy)
13169 VecValue =
13170 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
13171 VecValue = FinalShuffle(VecValue, E, VecTy);
13172
13173 Value *Ptr = SI->getPointerOperand();
13174 Instruction *ST;
13175 if (E->State == TreeEntry::Vectorize) {
13176 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
13177 } else {
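// Strided store: emit llvm.experimental.vp.strided.store with a negative
// element-sized stride, starting from the pointer of the first store in the
// reordered sequence.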
13178 assert(E->State == TreeEntry::StridedVectorize &&
13179 "Expected either strided or consecutive stores.");
13180 if (!E->ReorderIndices.empty()) {
13181 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
13182 Ptr = SI->getPointerOperand();
13183 }
13184 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
13185 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
13186 auto *Inst = Builder.CreateIntrinsic(
13187 Intrinsic::experimental_vp_strided_store,
13188 {VecTy, Ptr->getType(), StrideTy},
13189 {VecValue, Ptr,
13190 ConstantInt::get(
13191 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
13192 Builder.getAllOnesMask(VecTy->getElementCount()),
13193 Builder.getInt32(E->Scalars.size())});
13194 Inst->addParamAttr(
13195 /*ArgNo=*/1,
13196 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
13197 ST = Inst;
13198 }
13199
13200 Value *V = propagateMetadata(ST, E->Scalars);
13201
13202 E->VectorizedValue = V;
13203 ++NumVectorInstructions;
13204 return V;
13205 }
13206 case Instruction::GetElementPtr: {
13207 auto *GEP0 = cast<GetElementPtrInst>(VL0);
13208 setInsertPointAfterBundle(E);
13209
13210 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
13211 if (E->VectorizedValue) {
13212 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13213 return E->VectorizedValue;
13214 }
13215
13216 SmallVector<Value *> OpVecs;
13217 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
13218 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
13219 if (E->VectorizedValue) {
13220 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13221 return E->VectorizedValue;
13222 }
13223 OpVecs.push_back(OpVec);
13224 }
13225
13226 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
13227 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
13228 SmallVector<Value *> GEPs;
13229 for (Value *V : E->Scalars) {
13230 if (isa<GetElementPtrInst>(V))
13231 GEPs.push_back(V);
13232 }
13233 V = propagateMetadata(I, GEPs);
13234 }
13235
13236 V = FinalShuffle(V, E, VecTy);
13237
13238 E->VectorizedValue = V;
13239 ++NumVectorInstructions;
13240
13241 return V;
13242 }
13243 case Instruction::Call: {
13244 CallInst *CI = cast<CallInst>(VL0);
13245 setInsertPointAfterBundle(E);
13246
13247 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
13248
13249 SmallVector<Type *> ArgTys =
13250 buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
13251 It != MinBWs.end() ? It->second.first : 0);
13252 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
13253 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
13254 VecCallCosts.first <= VecCallCosts.second;
13255
13256 Value *ScalarArg = nullptr;
13257 SmallVector<Value *> OpVecs;
13258 SmallVector<Type *, 2> TysForDecl;
13259 // Add return type if intrinsic is overloaded on it.
13260 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
13261 TysForDecl.push_back(VecTy);
13262 auto *CEI = cast<CallInst>(VL0);
13263 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
13264 ValueList OpVL;
13265 // Some intrinsics have scalar arguments. This argument should not be
13266 // vectorized.
13267 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
13268 ScalarArg = CEI->getArgOperand(I);
13269 // If it was decided to reduce the bitwidth of the abs intrinsic, its second
13270 // argument must be set to false (do not return poison if the value is signed min).
13271 if (ID == Intrinsic::abs && It != MinBWs.end() &&
13272 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
13273 ScalarArg = Builder.getFalse();
13274 OpVecs.push_back(ScalarArg);
13275 if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
13276 TysForDecl.push_back(ScalarArg->getType());
13277 continue;
13278 }
13279
13280 Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
13281 if (E->VectorizedValue) {
13282 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13283 return E->VectorizedValue;
13284 }
13285 ScalarArg = CEI->getArgOperand(I);
13286 if (cast<VectorType>(OpVec->getType())->getElementType() !=
13287 ScalarArg->getType() &&
13288 It == MinBWs.end()) {
13289 auto *CastTy = FixedVectorType::get(ScalarArg->getType(),
13290 VecTy->getNumElements());
13291 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
13292 } else if (It != MinBWs.end()) {
13293 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
13294 }
13295 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
13296 OpVecs.push_back(OpVec);
13297 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
13298 TysForDecl.push_back(OpVec->getType());
13299 }
13300
13301 Function *CF;
13302 if (!UseIntrinsic) {
13303 VFShape Shape =
13304 VFShape::get(CI->getFunctionType(),
13305 ElementCount::getFixed(
13306 static_cast<unsigned>(VecTy->getNumElements())),
13307 false /*HasGlobalPred*/);
13308 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
13309 } else {
13310 CF = Intrinsic::getDeclaration(F->getParent(), ID, TysForDecl);
13311 }
13312
13313 SmallVector<OperandBundleDef, 1> OpBundles;
13314 CI->getOperandBundlesAsDefs(OpBundles);
13315 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
13316
13317 propagateIRFlags(V, E->Scalars, VL0);
13318 V = FinalShuffle(V, E, VecTy);
13319
13320 E->VectorizedValue = V;
13321 ++NumVectorInstructions;
13322 return V;
13323 }
13324 case Instruction::ShuffleVector: {
13325 assert(E->isAltShuffle() &&
13326 ((Instruction::isBinaryOp(E->getOpcode()) &&
13327 Instruction::isBinaryOp(E->getAltOpcode())) ||
13328 (Instruction::isCast(E->getOpcode()) &&
13329 Instruction::isCast(E->getAltOpcode())) ||
13330 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
13331 "Invalid Shuffle Vector Operand");
13332
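// Alternate-opcode node: both operations are materialized over the full
// vector and the requested lanes of each are blended with the shuffle mask
// produced by buildAltOpShuffleMask.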
13333 Value *LHS = nullptr, *RHS = nullptr;
13334 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
13335 setInsertPointAfterBundle(E);
13336 LHS = vectorizeOperand(E, 0, PostponedPHIs);
13337 if (E->VectorizedValue) {
13338 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13339 return E->VectorizedValue;
13340 }
13341 RHS = vectorizeOperand(E, 1, PostponedPHIs);
13342 } else {
13343 setInsertPointAfterBundle(E);
13344 LHS = vectorizeOperand(E, 0, PostponedPHIs);
13345 }
13346 if (E->VectorizedValue) {
13347 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
13348 return E->VectorizedValue;
13349 }
13350 if (LHS && RHS &&
13351 ((Instruction::isBinaryOp(E->getOpcode()) &&
13352 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
13353 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
13354 assert((It != MinBWs.end() ||
13355 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
13356 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
13357 MinBWs.contains(getOperandEntry(E, 0)) ||
13358 MinBWs.contains(getOperandEntry(E, 1))) &&
13359 "Expected item in MinBWs.");
13360 Type *CastTy = VecTy;
13361 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
13362 if (cast<VectorType>(LHS->getType())
13363 ->getElementType()
13364 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
13365 ->getElementType()
13366 ->getIntegerBitWidth())
13367 CastTy = RHS->getType();
13368 else
13369 CastTy = LHS->getType();
13370 }
13371 if (LHS->getType() != CastTy)
13372 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
13373 if (RHS->getType() != CastTy)
13374 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
13375 }
13376
13377 Value *V0, *V1;
13378 if (Instruction::isBinaryOp(E->getOpcode())) {
13379 V0 = Builder.CreateBinOp(
13380 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
13381 V1 = Builder.CreateBinOp(
13382 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
13383 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
13384 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
13385 auto *AltCI = cast<CmpInst>(E->getAltOp());
13386 CmpInst::Predicate AltPred = AltCI->getPredicate();
13387 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
13388 } else {
13389 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
13390 unsigned SrcBWSz = DL->getTypeSizeInBits(
13391 cast<VectorType>(LHS->getType())->getElementType());
13392 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
13393 if (BWSz <= SrcBWSz) {
13394 if (BWSz < SrcBWSz)
13395 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
13396 assert(LHS->getType() == VecTy && "Expected same type as operand.");
13397 if (auto *I = dyn_cast<Instruction>(LHS))
13398 LHS = propagateMetadata(I, E->Scalars);
13399 E->VectorizedValue = LHS;
13400 ++NumVectorInstructions;
13401 return LHS;
13402 }
13403 }
13404 V0 = Builder.CreateCast(
13405 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
13406 V1 = Builder.CreateCast(
13407 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
13408 }
13409 // Add V0 and V1 to later analysis to try to find and remove matching
13410 // instruction, if any.
13411 for (Value *V : {V0, V1}) {
13412 if (auto *I = dyn_cast<Instruction>(V)) {
13413 GatherShuffleExtractSeq.insert(I);
13414 CSEBlocks.insert(I->getParent());
13415 }
13416 }
13417
13418 // Create shuffle to take alternate operations from the vector.
13419 // Also, gather up main and alt scalar ops to propagate IR flags to
13420 // each vector operation.
13421 ValueList OpScalars, AltScalars;
13422 SmallVector<int> Mask;
13423 E->buildAltOpShuffleMask(
13424 [E, this](Instruction *I) {
13425 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
13426 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
13427 *TLI);
13428 },
13429 Mask, &OpScalars, &AltScalars);
13430
13431 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
13432 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
13433 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
13434 // Drop nuw flags for abs(sub(commutative), true).
13435 if (auto *I = dyn_cast<Instruction>(Vec);
13436 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
13437 any_of(E->Scalars, [](Value *V) {
13438 auto *IV = cast<Instruction>(V);
13439 return IV->getOpcode() == Instruction::Sub &&
13440 isCommutative(cast<Instruction>(IV));
13441 }))
13442 I->setHasNoUnsignedWrap(/*b=*/false);
13443 };
13444 DropNuwFlag(V0, E->getOpcode());
13445 DropNuwFlag(V1, E->getAltOpcode());
13446
13447 Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
13448 if (auto *I = dyn_cast<Instruction>(V)) {
13449 V = propagateMetadata(I, E->Scalars);
13450 GatherShuffleExtractSeq.insert(I);
13451 CSEBlocks.insert(I->getParent());
13452 }
13453
13454 E->VectorizedValue = V;
13455 ++NumVectorInstructions;
13456
13457 return V;
13458 }
13459 default:
13460 llvm_unreachable("unknown inst");
13461 }
13462 return nullptr;
13463}
13464
13465 Value *BoUpSLP::vectorizeTree() {
13466 ExtraValueToDebugLocsMap ExternallyUsedValues;
13467 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
13468 return vectorizeTree(ExternallyUsedValues, ReplacedExternals);
13469}
13470
13471namespace {
13472/// Data type for handling buildvector sequences with the reused scalars from
13473/// other tree entries.
13474struct ShuffledInsertData {
13475 /// List of insertelements to be replaced by shuffles.
13476 SmallVector<InsertElementInst *> InsertElements;
13477 /// The parent vectors and shuffle mask for the given list of inserts.
13479};
13480} // namespace
13481
13482 Value *BoUpSLP::vectorizeTree(
13483 const ExtraValueToDebugLocsMap &ExternallyUsedValues,
13484 SmallVectorImpl<std::pair<Value *, Value *>> &ReplacedExternals,
13485 Instruction *ReductionRoot) {
13486 // All blocks must be scheduled before any instructions are inserted.
13487 for (auto &BSIter : BlocksSchedules) {
13488 scheduleBlock(BSIter.second.get());
13489 }
13490 // Clear the Entry-to-LastInstruction table; it can be invalidated by
13491 // scheduling, so it needs to be rebuilt.
13492 EntryToLastInstruction.clear();
13493
13494 if (ReductionRoot)
13495 Builder.SetInsertPoint(ReductionRoot->getParent(),
13496 ReductionRoot->getIterator());
13497 else
13498 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13499
13500 // Postpone emission of PHIs operands to avoid cyclic dependencies issues.
13501 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
13502 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
13503 if (TE->State == TreeEntry::Vectorize &&
13504 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
13505 TE->VectorizedValue)
13506 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
13507 // Run through the list of postponed gathers and emit them, replacing the temp
13508 // emitted allocas with actual vector instructions.
13509 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
13510 DenseMap<Value *, SmallVector<TreeEntry *>> PostponedValues;
13511 for (const TreeEntry *E : PostponedNodes) {
13512 auto *TE = const_cast<TreeEntry *>(E);
13513 if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
13514 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
13515 TE->UserTreeIndices.front().EdgeIdx)) &&
13516 VecTE->isSame(TE->Scalars))
13517 // Found gather node which is absolutely the same as one of the
13518 // vectorized nodes. It may happen after reordering.
13519 continue;
13520 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
13521 TE->VectorizedValue = nullptr;
13522 auto *UserI =
13523 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
13524 // If the user is a PHI node, its vector code has to be inserted right
13525 // before the block terminator. Since the node was delayed, there were some
13526 // unresolved dependencies at the moment the stub instruction was emitted.
13527 // If any of these dependencies turns out to be an operand of another PHI
13528 // coming from this same block, the position of the stub instruction becomes
13529 // invalid, because the source vector that is supposed to feed this gather
13530 // node was inserted at the end of the block [after the stub instruction].
13531 // So we need to adjust the insertion point to the end of the block again.
13532 if (isa<PHINode>(UserI)) {
13533 // Insert before all users.
13534 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
13535 for (User *U : PrevVec->users()) {
13536 if (U == UserI)
13537 continue;
13538 auto *UI = dyn_cast<Instruction>(U);
13539 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
13540 continue;
13541 if (UI->comesBefore(InsertPt))
13542 InsertPt = UI;
13543 }
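// InsertPt is now the earliest non-PHI in-block user of the stub (other than
// the PHI user itself), or the block terminator if there is none, so the
// re-emitted vector will be defined before all of its in-block uses.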
13544 Builder.SetInsertPoint(InsertPt);
13545 } else {
13546 Builder.SetInsertPoint(PrevVec);
13547 }
13548 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
13549 Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
13550 if (Vec->getType() != PrevVec->getType()) {
13551 assert(Vec->getType()->isIntOrIntVectorTy() &&
13552 PrevVec->getType()->isIntOrIntVectorTy() &&
13553 "Expected integer vector types only.");
13554 std::optional<bool> IsSigned;
13555 for (Value *V : TE->Scalars) {
13556 if (const TreeEntry *BaseTE = getTreeEntry(V)) {
13557 auto It = MinBWs.find(BaseTE);
13558 if (It != MinBWs.end()) {
13559 IsSigned = IsSigned.value_or(false) || It->second.second;
13560 if (*IsSigned)
13561 break;
13562 }
13563 for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
13564 auto It = MinBWs.find(MNTE);
13565 if (It != MinBWs.end()) {
13566 IsSigned = IsSigned.value_or(false) || It->second.second;
13567 if (*IsSigned)
13568 break;
13569 }
13570 }
13571 if (IsSigned.value_or(false))
13572 break;
13573 // Scan through gather nodes.
13574 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
13575 auto It = MinBWs.find(BVE);
13576 if (It != MinBWs.end()) {
13577 IsSigned = IsSigned.value_or(false) || It->second.second;
13578 if (*IsSigned)
13579 break;
13580 }
13581 }
13582 if (IsSigned.value_or(false))
13583 break;
13584 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
13585 IsSigned =
13586 IsSigned.value_or(false) ||
13587 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
13588 continue;
13589 }
13590 if (IsSigned.value_or(false))
13591 break;
13592 }
13593 }
13594 if (IsSigned.value_or(false)) {
13595 // Final attempt - check user node.
13596 auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
13597 if (It != MinBWs.end())
13598 IsSigned = It->second.second;
13599 }
13600 assert(IsSigned &&
13601 "Expected user node or perfect diamond match in MinBWs.");
13602 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
13603 }
13604 PrevVec->replaceAllUsesWith(Vec);
13605 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
13606 // Replace the stub vector node, if it was used before for one of the
13607 // buildvector nodes already.
13608 auto It = PostponedValues.find(PrevVec);
13609 if (It != PostponedValues.end()) {
13610 for (TreeEntry *VTE : It->getSecond())
13611 VTE->VectorizedValue = Vec;
13612 }
13613 eraseInstruction(PrevVec);
13614 }
13615
13616 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
13617 << " values.\n");
13618
13619 SmallVector<ShuffledInsertData> ShuffledInserts;
13620 // Maps vector instruction to original insertelement instruction
13621 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
13622 // Maps extract Scalar to the corresponding extractelement instruction in the
13623 // basic block. Only one extractelement per block should be emitted.
13624 DenseMap<Value *,
13625 SmallDenseMap<BasicBlock *, std::pair<Instruction *, Instruction *>>>
13626 ScalarToEEs;
13627 SmallDenseSet<Value *, 4> UsedInserts;
13628 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
13629 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
13630 // Extract all of the elements with the external uses.
13631 for (const auto &ExternalUse : ExternalUses) {
13632 Value *Scalar = ExternalUse.Scalar;
13633 llvm::User *User = ExternalUse.User;
13634
13635 // Skip users that we already RAUW. This happens when one instruction
13636 // has multiple uses of the same value.
13637 if (User && !is_contained(Scalar->users(), User))
13638 continue;
13639 TreeEntry *E = getTreeEntry(Scalar);
13640 assert(E && "Invalid scalar");
13641 assert(E->State != TreeEntry::NeedToGather &&
13642 "Extracting from a gather list");
13643 // Non-instruction pointers are not deleted, just skip them.
13644 if (E->getOpcode() == Instruction::GetElementPtr &&
13645 !isa<GetElementPtrInst>(Scalar))
13646 continue;
13647
13648 Value *Vec = E->VectorizedValue;
13649 assert(Vec && "Can't find vectorizable value");
13650
13651 Value *Lane = Builder.getInt32(ExternalUse.Lane);
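// ExtractAndExtendIfNeeded extracts Scalar's lane from the vectorized value,
// reusing an extract already emitted in the current block where possible, and
// casts the extract back to the original scalar type if the tree entry was
// narrowed via MinBWs; for in-tree insertelements of vector type it simply
// returns the vectorized value.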
13652 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
13653 if (Scalar->getType() != Vec->getType()) {
13654 Value *Ex = nullptr;
13655 Value *ExV = nullptr;
13656 auto *GEP = dyn_cast<GetElementPtrInst>(Scalar);
13657 bool ReplaceGEP = GEP && ExternalUsesAsGEPs.contains(GEP);
13658 auto It = ScalarToEEs.find(Scalar);
13659 if (It != ScalarToEEs.end()) {
13660 // No need to emit many extracts, just move the only one in the
13661 // current block.
13662 auto EEIt = It->second.find(Builder.GetInsertBlock());
13663 if (EEIt != It->second.end()) {
13664 Instruction *I = EEIt->second.first;
13665 if (Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
13666 Builder.GetInsertPoint()->comesBefore(I)) {
13667 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
13668 Builder.GetInsertPoint());
13669 if (auto *CI = EEIt->second.second)
13670 CI->moveAfter(I);
13671 }
13672 Ex = I;
13673 ExV = EEIt->second.second ? EEIt->second.second : Ex;
13674 }
13675 }
13676 if (!Ex) {
13677 // "Reuse" the existing extract to improve final codegen.
13678 if (auto *ES = dyn_cast<ExtractElementInst>(Scalar)) {
13679 Value *V = ES->getVectorOperand();
13680 if (const TreeEntry *ETE = getTreeEntry(V))
13681 V = ETE->VectorizedValue;
13682 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
13683 } else if (ReplaceGEP) {
13684 // Leave the GEPs as is, they are free in most cases and better to
13685 // keep them as GEPs.
13686 auto *CloneGEP = GEP->clone();
13687 if (isa<Instruction>(Vec))
13688 CloneGEP->insertBefore(*Builder.GetInsertBlock(),
13689 Builder.GetInsertPoint());
13690 else
13691 CloneGEP->insertBefore(GEP);
13692 if (GEP->hasName())
13693 CloneGEP->takeName(GEP);
13694 Ex = CloneGEP;
13695 } else {
13696 Ex = Builder.CreateExtractElement(Vec, Lane);
13697 }
13698 // If necessary, sign-extend or zero-extend ScalarRoot
13699 // to the larger type.
13700 ExV = Ex;
13701 if (Scalar->getType() != Ex->getType())
13702 ExV = Builder.CreateIntCast(Ex, Scalar->getType(),
13703 MinBWs.find(E)->second.second);
13704 if (auto *I = dyn_cast<Instruction>(Ex))
13705 ScalarToEEs[Scalar].try_emplace(
13706 Builder.GetInsertBlock(),
13707 std::make_pair(I, cast<Instruction>(ExV)));
13708 }
13709 // The 'then' branch of the previous 'if' may produce constants, since
13710 // operand 0 might be a constant.
13711 if (auto *ExI = dyn_cast<Instruction>(Ex)) {
13712 GatherShuffleExtractSeq.insert(ExI);
13713 CSEBlocks.insert(ExI->getParent());
13714 }
13715 return ExV;
13716 }
13717 assert(isa<FixedVectorType>(Scalar->getType()) &&
13718 isa<InsertElementInst>(Scalar) &&
13719 "In-tree scalar of vector type is not insertelement?");
13720 auto *IE = cast<InsertElementInst>(Scalar);
13721 VectorToInsertElement.try_emplace(Vec, IE);
13722 return Vec;
13723 };
13724 // If User == nullptr, the Scalar remains as scalar in vectorized
13725 // instructions or is used as extra arg. Generate ExtractElement instruction
13726 // and update the record for this scalar in ExternallyUsedValues.
13727 if (!User) {
13728 if (!ScalarsWithNullptrUser.insert(Scalar).second)
13729 continue;
13730 assert((ExternallyUsedValues.count(Scalar) ||
13731 any_of(Scalar->users(),
13732 [&](llvm::User *U) {
13733 if (ExternalUsesAsGEPs.contains(U))
13734 return true;
13735 TreeEntry *UseEntry = getTreeEntry(U);
13736 return UseEntry &&
13737 (UseEntry->State == TreeEntry::Vectorize ||
13738 UseEntry->State ==
13739 TreeEntry::StridedVectorize) &&
13740 (E->State == TreeEntry::Vectorize ||
13741 E->State == TreeEntry::StridedVectorize) &&
13742 doesInTreeUserNeedToExtract(
13743 Scalar,
13744 cast<Instruction>(UseEntry->Scalars.front()),
13745 TLI);
13746 })) &&
13747 "Scalar with nullptr User must be registered in "
13748 "ExternallyUsedValues map or remain as scalar in vectorized "
13749 "instructions");
13750 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
13751 if (auto *PHI = dyn_cast<PHINode>(VecI))
13752 Builder.SetInsertPoint(PHI->getParent(),
13753 PHI->getParent()->getFirstNonPHIIt());
13754 else
13755 Builder.SetInsertPoint(VecI->getParent(),
13756 std::next(VecI->getIterator()));
13757 } else {
13758 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13759 }
13760 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13761 // Required to update internally referenced instructions.
13762 Scalar->replaceAllUsesWith(NewInst);
13763 ReplacedExternals.emplace_back(Scalar, NewInst);
13764 continue;
13765 }
13766
13767 if (auto *VU = dyn_cast<InsertElementInst>(User)) {
13768 // Skip if the scalar is another vector op or Vec is not an instruction.
13769 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
13770 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
13771 if (!UsedInserts.insert(VU).second)
13772 continue;
13773 // Need to use original vector, if the root is truncated.
13774 auto BWIt = MinBWs.find(E);
13775 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
13776 auto *ScalarTy = FTy->getElementType();
13777 auto Key = std::make_pair(Vec, ScalarTy);
13778 auto VecIt = VectorCasts.find(Key);
13779 if (VecIt == VectorCasts.end()) {
13780 IRBuilderBase::InsertPointGuard Guard(Builder);
13781 if (auto *IVec = dyn_cast<PHINode>(Vec))
13782 Builder.SetInsertPoint(
13783 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
13784 else if (auto *IVec = dyn_cast<Instruction>(Vec))
13785 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
13786 Vec = Builder.CreateIntCast(
13787 Vec,
13788 FixedVectorType::get(
13789 ScalarTy,
13790 cast<FixedVectorType>(Vec->getType())->getNumElements()),
13791 BWIt->second.second);
13792 VectorCasts.try_emplace(Key, Vec);
13793 } else {
13794 Vec = VecIt->second;
13795 }
13796 }
13797
13798 std::optional<unsigned> InsertIdx = getInsertIndex(VU);
13799 if (InsertIdx) {
13800 auto *It =
13801 find_if(ShuffledInserts, [VU](const ShuffledInsertData &Data) {
13802 // Checks if 2 insertelements are from the same buildvector.
13803 InsertElementInst *VecInsert = Data.InsertElements.front();
13804 return areTwoInsertFromSameBuildVector(
13805 VU, VecInsert,
13806 [](InsertElementInst *II) { return II->getOperand(0); });
13807 });
13808 unsigned Idx = *InsertIdx;
13809 if (It == ShuffledInserts.end()) {
13810 (void)ShuffledInserts.emplace_back();
13811 It = std::next(ShuffledInserts.begin(),
13812 ShuffledInserts.size() - 1);
13813 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
13814 if (Mask.empty())
13815 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
13816 // Find the insertvector, vectorized in tree, if any.
13817 Value *Base = VU;
13818 while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
13819 if (IEBase != User &&
13820 (!IEBase->hasOneUse() ||
13821 getInsertIndex(IEBase).value_or(Idx) == Idx))
13822 break;
13823 // Build the mask for the vectorized insertelement instructions.
13824 if (const TreeEntry *E = getTreeEntry(IEBase)) {
13825 do {
13826 IEBase = cast<InsertElementInst>(Base);
13827 int IEIdx = *getInsertIndex(IEBase);
13828 assert(Mask[IEIdx] == PoisonMaskElem &&
13829 "InsertElementInstruction used already.");
13830 Mask[IEIdx] = IEIdx;
13831 Base = IEBase->getOperand(0);
13832 } while (E == getTreeEntry(Base));
13833 break;
13834 }
13835 Base = cast<InsertElementInst>(Base)->getOperand(0);
13836 // After the vectorization the def-use chain has changed, need
13837 // to look through original insertelement instructions, if they
13838 // get replaced by vector instructions.
13839 auto It = VectorToInsertElement.find(Base);
13840 if (It != VectorToInsertElement.end())
13841 Base = It->second;
13842 }
13843 }
13844 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
13845 if (Mask.empty())
13846 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
13847 Mask[Idx] = ExternalUse.Lane;
13848 It->InsertElements.push_back(cast<InsertElementInst>(User));
13849 continue;
13850 }
13851 }
13852 }
13853 }
13854
13855 // Generate extracts for out-of-tree users.
13856 // Find the insertion point for the extractelement lane.
13857 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
13858 if (PHINode *PH = dyn_cast<PHINode>(User)) {
13859 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
13860 if (PH->getIncomingValue(I) == Scalar) {
13861 Instruction *IncomingTerminator =
13862 PH->getIncomingBlock(I)->getTerminator();
13863 if (isa<CatchSwitchInst>(IncomingTerminator)) {
13864 Builder.SetInsertPoint(VecI->getParent(),
13865 std::next(VecI->getIterator()));
13866 } else {
13867 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
13868 }
13869 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13870 PH->setOperand(I, NewInst);
13871 }
13872 }
13873 } else {
13874 Builder.SetInsertPoint(cast<Instruction>(User));
13875 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13876 User->replaceUsesOfWith(Scalar, NewInst);
13877 }
13878 } else {
13879 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
13880 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
13881 User->replaceUsesOfWith(Scalar, NewInst);
13882 }
13883
13884 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
13885 }
13886
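// CreateShuffle splits a combined two-source mask (indices >= VF select from
// V2) into per-operand masks and emits the shuffle through
// ShuffleInstructionBuilder.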
13887 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
13888 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
13889 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
13890 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
13891 for (int I = 0, E = Mask.size(); I < E; ++I) {
13892 if (Mask[I] < VF)
13893 CombinedMask1[I] = Mask[I];
13894 else
13895 CombinedMask2[I] = Mask[I] - VF;
13896 }
13897 ShuffleInstructionBuilder ShuffleBuilder(
13898 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
13899 ShuffleBuilder.add(V1, CombinedMask1);
13900 if (V2)
13901 ShuffleBuilder.add(V2, CombinedMask2);
13902 return ShuffleBuilder.finalize(std::nullopt);
13903 };
13904
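// ResizeToVF adjusts Vec when its width differs from the mask size (the
// target VF): if the mask references lanes >= the target VF, the mask is
// applied directly and the returned flag reports that; otherwise an
// identity-style resizing shuffle is emitted unless ForSingleMask is set.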
13905 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
13906 bool ForSingleMask) {
13907 unsigned VF = Mask.size();
13908 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
13909 if (VF != VecVF) {
13910 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
13911 Vec = CreateShuffle(Vec, nullptr, Mask);
13912 return std::make_pair(Vec, true);
13913 }
13914 if (!ForSingleMask) {
13915 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
13916 for (unsigned I = 0; I < VF; ++I) {
13917 if (Mask[I] != PoisonMaskElem)
13918 ResizeMask[Mask[I]] = Mask[I];
13919 }
13920 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
13921 }
13922 }
13923
13924 return std::make_pair(Vec, false);
13925 };
13926 // Perform shuffling of the vectorize tree entries for better handling of
13927 // external extracts.
13928 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
13929 // Find the first and the last instruction in the list of insertelements.
13930 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
13931 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
13932 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
13933 Builder.SetInsertPoint(LastInsert);
13934 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
13935 Value *NewInst = performExtractsShuffleAction<Value>(
13936 MutableArrayRef(Vector.data(), Vector.size()),
13937 FirstInsert->getOperand(0),
13938 [](Value *Vec) {
13939 return cast<VectorType>(Vec->getType())
13940 ->getElementCount()
13941 .getKnownMinValue();
13942 },
13943 ResizeToVF,
13944 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
13945 ArrayRef<Value *> Vals) {
13946 assert((Vals.size() == 1 || Vals.size() == 2) &&
13947 "Expected exactly 1 or 2 input values.");
13948 if (Vals.size() == 1) {
13949 // Do not create shuffle if the mask is a simple identity
13950 // non-resizing mask.
13951 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
13952 ->getNumElements() ||
13953 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
13954 return CreateShuffle(Vals.front(), nullptr, Mask);
13955 return Vals.front();
13956 }
13957 return CreateShuffle(Vals.front() ? Vals.front()
13958 : FirstInsert->getOperand(0),
13959 Vals.back(), Mask);
13960 });
13961 auto It = ShuffledInserts[I].InsertElements.rbegin();
13962 // Rebuild buildvector chain.
13963 InsertElementInst *II = nullptr;
13964 if (It != ShuffledInserts[I].InsertElements.rend())
13965 II = *It;
13966 SmallVector<Instruction *> Inserts;
13967 while (It != ShuffledInserts[I].InsertElements.rend()) {
13968 assert(II && "Must be an insertelement instruction.");
13969 if (*It == II)
13970 ++It;
13971 else
13972 Inserts.push_back(cast<Instruction>(II));
13973 II = dyn_cast<InsertElementInst>(II->getOperand(0));
13974 }
13975 for (Instruction *II : reverse(Inserts)) {
13976 II->replaceUsesOfWith(II->getOperand(0), NewInst);
13977 if (auto *NewI = dyn_cast<Instruction>(NewInst))
13978 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
13979 II->moveAfter(NewI);
13980 NewInst = II;
13981 }
13982 LastInsert->replaceAllUsesWith(NewInst);
13983 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
13984 IE->replaceUsesOfWith(IE->getOperand(0),
13985 PoisonValue::get(IE->getOperand(0)->getType()));
13986 IE->replaceUsesOfWith(IE->getOperand(1),
13987 PoisonValue::get(IE->getOperand(1)->getType()));
13988 eraseInstruction(IE);
13989 }
13990 CSEBlocks.insert(LastInsert->getParent());
13991 }
13992
13993 SmallVector<Instruction *> RemovedInsts;
13994 // For each vectorized value:
13995 for (auto &TEPtr : VectorizableTree) {
13996 TreeEntry *Entry = TEPtr.get();
13997
13998 // No need to handle users of gathered values.
13999 if (Entry->State == TreeEntry::NeedToGather)
14000 continue;
14001
14002 assert(Entry->VectorizedValue && "Can't find vectorizable value");
14003
14004 // For each lane:
14005 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
14006 Value *Scalar = Entry->Scalars[Lane];
14007
14008 if (Entry->getOpcode() == Instruction::GetElementPtr &&
14009 !isa<GetElementPtrInst>(Scalar))
14010 continue;
14011#ifndef NDEBUG
14012 Type *Ty = Scalar->getType();
14013 if (!Ty->isVoidTy()) {
14014 for (User *U : Scalar->users()) {
14015 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
14016
14017 // It is legal to delete users in the ignorelist.
14018 assert((getTreeEntry(U) ||
14019 (UserIgnoreList && UserIgnoreList->contains(U)) ||
14020 (isa_and_nonnull<Instruction>(U) &&
14021 isDeleted(cast<Instruction>(U)))) &&
14022 "Deleting out-of-tree value");
14023 }
14024 }
14025#endif
14026 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
14027 eraseInstruction(cast<Instruction>(Scalar));
14028 // Retain to-be-deleted instructions for some debug-info
14029 // bookkeeping. NOTE: eraseInstruction only marks the instruction for
14030 // deletion - instructions are not deleted until later.
14031 RemovedInsts.push_back(cast<Instruction>(Scalar));
14032 }
14033 }
14034
14035 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
14036 // new vector instruction.
14037 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
14038 V->mergeDIAssignID(RemovedInsts);
14039
14040 Builder.ClearInsertionPoint();
14041 InstrElementSize.clear();
14042
14043 const TreeEntry &RootTE = *VectorizableTree.front().get();
14044 Value *Vec = RootTE.VectorizedValue;
14045 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
14046 It != MinBWs.end() &&
14047 ReductionBitWidth != It->second.first) {
14048 IRBuilder<>::InsertPointGuard Guard(Builder);
14049 Builder.SetInsertPoint(ReductionRoot->getParent(),
14050 ReductionRoot->getIterator());
14051 Vec = Builder.CreateIntCast(
14052 Vec,
14053 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
14054 cast<VectorType>(Vec->getType())->getElementCount()),
14055 It->second.second);
14056 }
14057 return Vec;
14058}
14059
14060 void BoUpSLP::optimizeGatherSequence() {
14061 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
14062 << " gather sequences instructions.\n");
14063 // LICM InsertElementInst sequences.
14064 for (Instruction *I : GatherShuffleExtractSeq) {
14065 if (isDeleted(I))
14066 continue;
14067
14068 // Check if this block is inside a loop.
14069 Loop *L = LI->getLoopFor(I->getParent());
14070 if (!L)
14071 continue;
14072
14073 // Check if it has a preheader.
14074 BasicBlock *PreHeader = L->getLoopPreheader();
14075 if (!PreHeader)
14076 continue;
14077
14078 // If the vector or the element that we insert into it are
14079 // instructions that are defined in this basic block then we can't
14080 // hoist this instruction.
14081 if (any_of(I->operands(), [L](Value *V) {
14082 auto *OpI = dyn_cast<Instruction>(V);
14083 return OpI && L->contains(OpI);
14084 }))
14085 continue;
14086
14087 // We can hoist this instruction. Move it to the pre-header.
14088 I->moveBefore(PreHeader->getTerminator());
14089 CSEBlocks.insert(PreHeader);
14090 }
14091
14092 // Make a list of all reachable blocks in our CSE queue.
14093 SmallVector<const DomTreeNode *, 8> CSEWorkList;
14094 CSEWorkList.reserve(CSEBlocks.size());
14095 for (BasicBlock *BB : CSEBlocks)
14096 if (DomTreeNode *N = DT->getNode(BB)) {
14097 assert(DT->isReachableFromEntry(N));
14098 CSEWorkList.push_back(N);
14099 }
14100
14101 // Sort blocks by domination. This ensures we visit a block after all blocks
14102 // dominating it are visited.
14103 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
14104 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
14105 "Different nodes should have different DFS numbers");
14106 return A->getDFSNumIn() < B->getDFSNumIn();
14107 });
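// (In a dominator tree a dominator's DFS-in number is smaller than that of
// every node it dominates, so this ordering visits dominators first.)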
14108
14109 // Less defined shuffles can be replaced by the more defined copies.
14110 // Of two shuffles with the same vector operands, one is less defined if each
14111 // of its mask indices either matches the corresponding index of the other
14112 // shuffle or is undef. E.g. shuffle %0, poison, <0, 0, 0, undef> is less
14113 // defined than shuffle %0, poison, <0, 0, 0, 0>.
14114 auto &&IsIdenticalOrLessDefined = [this](Instruction *I1, Instruction *I2,
14115 SmallVectorImpl<int> &NewMask) {
14116 if (I1->getType() != I2->getType())
14117 return false;
14118 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
14119 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
14120 if (!SI1 || !SI2)
14121 return I1->isIdenticalTo(I2);
14122 if (SI1->isIdenticalTo(SI2))
14123 return true;
14124 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
14125 if (SI1->getOperand(I) != SI2->getOperand(I))
14126 return false;
14127 // Check if the second instruction is more defined than the first one.
14128 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
14129 ArrayRef<int> SM1 = SI1->getShuffleMask();
14130 // Count trailing undefs in the mask to check the final number of used
14131 // registers.
14132 unsigned LastUndefsCnt = 0;
14133 for (int I = 0, E = NewMask.size(); I < E; ++I) {
14134 if (SM1[I] == PoisonMaskElem)
14135 ++LastUndefsCnt;
14136 else
14137 LastUndefsCnt = 0;
14138 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
14139 NewMask[I] != SM1[I])
14140 return false;
14141 if (NewMask[I] == PoisonMaskElem)
14142 NewMask[I] = SM1[I];
14143 }
14144 // Check if the last undefs actually change the final number of used vector
14145 // registers.
14146 return SM1.size() - LastUndefsCnt > 1 &&
14147 TTI->getNumberOfParts(SI1->getType()) ==
14148 TTI->getNumberOfParts(
14149 FixedVectorType::get(SI1->getType()->getElementType(),
14150 SM1.size() - LastUndefsCnt));
14151 };
14152 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
14153 // instructions. TODO: We can further optimize this scan if we split the
14154 // instructions into different buckets based on the insert lane.
14155 SmallVector<Instruction *, 16> Visited;
14156 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
14157 assert(*I &&
14158 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
14159 "Worklist not sorted properly!");
14160 BasicBlock *BB = (*I)->getBlock();
14161 // For all instructions in blocks containing gather sequences:
14162 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
14163 if (isDeleted(&In))
14164 continue;
14165 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
14166 !GatherShuffleExtractSeq.contains(&In))
14167 continue;
14168
14169 // Check if we can replace this instruction with any of the
14170 // visited instructions.
14171 bool Replaced = false;
14172 for (Instruction *&V : Visited) {
14173 SmallVector<int> NewMask;
14174 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
14175 DT->dominates(V->getParent(), In.getParent())) {
14176 In.replaceAllUsesWith(V);
14177 eraseInstruction(&In);
14178 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
14179 if (!NewMask.empty())
14180 SI->setShuffleMask(NewMask);
14181 Replaced = true;
14182 break;
14183 }
14184 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
14185 GatherShuffleExtractSeq.contains(V) &&
14186 IsIdenticalOrLessDefined(V, &In, NewMask) &&
14187 DT->dominates(In.getParent(), V->getParent())) {
14188 In.moveAfter(V);
14189 V->replaceAllUsesWith(&In);
14190 eraseInstruction(V);
14191 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
14192 if (!NewMask.empty())
14193 SI->setShuffleMask(NewMask);
14194 V = &In;
14195 Replaced = true;
14196 break;
14197 }
14198 }
14199 if (!Replaced) {
14200 assert(!is_contained(Visited, &In));
14201 Visited.push_back(&In);
14202 }
14203 }
14204 }
14205 CSEBlocks.clear();
14206 GatherShuffleExtractSeq.clear();
14207}
14208
14209BoUpSLP::ScheduleData *
14210BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
14211 ScheduleData *Bundle = nullptr;
14212 ScheduleData *PrevInBundle = nullptr;
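// The members of a bundle are chained through FirstInBundle / NextInBundle,
// forming an intrusive singly linked list whose head (the first member)
// represents the whole bundle as a single scheduling entity.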
14213 for (Value *V : VL) {
14214 if (doesNotNeedToBeScheduled(V))
14215 continue;
14216 ScheduleData *BundleMember = getScheduleData(V);
14217 assert(BundleMember &&
14218 "no ScheduleData for bundle member "
14219 "(maybe not in same basic block)");
14220 assert(BundleMember->isSchedulingEntity() &&
14221 "bundle member already part of other bundle");
14222 if (PrevInBundle) {
14223 PrevInBundle->NextInBundle = BundleMember;
14224 } else {
14225 Bundle = BundleMember;
14226 }
14227
14228 // Group the instructions into a bundle.
14229 BundleMember->FirstInBundle = Bundle;
14230 PrevInBundle = BundleMember;
14231 }
14232 assert(Bundle && "Failed to find schedule bundle");
14233 return Bundle;
14234}
14235
14236 // Groups the instructions into a bundle (which is then a single scheduling
14237 // entity) and schedules instructions until the bundle gets ready.
14238std::optional<BoUpSLP::ScheduleData *>
14239BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
14240 const InstructionsState &S) {
14241 // No need to schedule PHIs, insertelement, extractelement and extractvalue
14242 // instructions.
14243 if (isa<PHINode>(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) ||
14244 doesNotNeedToSchedule(VL))
14245 return nullptr;
14246
14247 // Initialize the instruction bundle.
14248 Instruction *OldScheduleEnd = ScheduleEnd;
14249 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
14250
14251 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
14252 ScheduleData *Bundle) {
14253 // The scheduling region got new instructions at the lower end (or it is a
14254 // new region for the first bundle). This makes it necessary to
14255 // recalculate all dependencies.
14256 // It is seldom that this needs to be done a second time after adding the
14257 // initial bundle to the region.
14258 if (ScheduleEnd != OldScheduleEnd) {
14259 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
14260 doForAllOpcodes(I, [](ScheduleData *SD) { SD->clearDependencies(); });
14261 ReSchedule = true;
14262 }
14263 if (Bundle) {
14264 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
14265 << " in block " << BB->getName() << "\n");
14266 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
14267 }
14268
14269 if (ReSchedule) {
14270 resetSchedule();
14271 initialFillReadyList(ReadyInsts);
14272 }
14273
14274 // Now try to schedule the new bundle or (if no bundle) just calculate
14275 // dependencies. As soon as the bundle is "ready" it means that there are no
14276 // cyclic dependencies and we can schedule it. Note that it's important that
14277 // we don't "schedule" the bundle yet (see cancelScheduling).
14278 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
14279 !ReadyInsts.empty()) {
14280 ScheduleData *Picked = ReadyInsts.pop_back_val();
14281 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
14282 "must be ready to schedule");
14283 schedule(Picked, ReadyInsts);
14284 }
14285 };
14286
14287 // Make sure that the scheduling region contains all
14288 // instructions of the bundle.
14289 for (Value *V : VL) {
14290 if (doesNotNeedToBeScheduled(V))
14291 continue;
14292 if (!extendSchedulingRegion(V, S)) {
14293 // The scheduling region got new instructions at the lower end (or it is
14294 // a new region for the first bundle), which makes it necessary to
14295 // recalculate all dependencies.
14296 // Otherwise the compiler may crash trying to calculate dependencies
14297 // incorrectly and emit instructions in the wrong order at the actual
14298 // scheduling.
14299 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
14300 return std::nullopt;
14301 }
14302 }
14303
14304 bool ReSchedule = false;
14305 for (Value *V : VL) {
14306 if (doesNotNeedToBeScheduled(V))
14307 continue;
14308 ScheduleData *BundleMember = getScheduleData(V);
14309 assert(BundleMember &&
14310 "no ScheduleData for bundle member (maybe not in same basic block)");
14311
14312 // Make sure we don't leave the pieces of the bundle in the ready list when
14313 // the whole bundle might not be ready.
14314 ReadyInsts.remove(BundleMember);
14315
14316 if (!BundleMember->IsScheduled)
14317 continue;
14318 // A bundle member was scheduled as a single instruction before and now
14319 // needs to be scheduled as part of the bundle. We just get rid of the
14320 // existing schedule.
14321 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
14322 << " was already scheduled\n");
14323 ReSchedule = true;
14324 }
14325
14326 auto *Bundle = buildBundle(VL);
14327 TryScheduleBundleImpl(ReSchedule, Bundle);
14328 if (!Bundle->isReady()) {
14329 cancelScheduling(VL, S.OpValue);
14330 return std::nullopt;
14331 }
14332 return Bundle;
14333}
14334
14335void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
14336 Value *OpValue) {
14337 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
14338 doesNotNeedToBeScheduled(OpValue))
14339 return;
14340
14341 if (doesNotNeedToBeScheduled(OpValue))
14342 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
14343 ScheduleData *Bundle = getScheduleData(OpValue);
14344 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
14345 assert(!Bundle->IsScheduled &&
14346 "Can't cancel bundle which is already scheduled");
14347 assert(Bundle->isSchedulingEntity() &&
14348 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
14349 "tried to unbundle something which is not a bundle");
14350
14351 // Remove the bundle from the ready list.
14352 if (Bundle->isReady())
14353 ReadyInsts.remove(Bundle);
14354
14355 // Un-bundle: make single instructions out of the bundle.
14356 ScheduleData *BundleMember = Bundle;
14357 while (BundleMember) {
14358 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
14359 BundleMember->FirstInBundle = BundleMember;
14360 ScheduleData *Next = BundleMember->NextInBundle;
14361 BundleMember->NextInBundle = nullptr;
14362 BundleMember->TE = nullptr;
14363 if (BundleMember->unscheduledDepsInBundle() == 0) {
14364 ReadyInsts.insert(BundleMember);
14365 }
14366 BundleMember = Next;
14367 }
14368}
14369
14370BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
14371 // Allocate a new ScheduleData for the instruction.
14372 if (ChunkPos >= ChunkSize) {
14373 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
14374 ChunkPos = 0;
14375 }
14376 return &(ScheduleDataChunks.back()[ChunkPos++]);
14377}
14378
14379bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
14380 const InstructionsState &S) {
14381 if (getScheduleData(V, isOneOf(S, V)))
14382 return true;
14383 Instruction *I = dyn_cast<Instruction>(V);
14384 assert(I && "bundle member must be an instruction");
14385 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
14387 "phi nodes/insertelements/extractelements/extractvalues don't need to "
14388 "be scheduled");
14389 auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool {
14390 ScheduleData *ISD = getScheduleData(I);
14391 if (!ISD)
14392 return false;
14393 assert(isInSchedulingRegion(ISD) &&
14394 "ScheduleData not in scheduling region");
14395 ScheduleData *SD = allocateScheduleDataChunks();
14396 SD->Inst = I;
14397 SD->init(SchedulingRegionID, S.OpValue);
14398 ExtraScheduleDataMap[I][S.OpValue] = SD;
14399 return true;
14400 };
14401 if (CheckScheduleForI(I))
14402 return true;
14403 if (!ScheduleStart) {
14404 // It's the first instruction in the new region.
14405 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
14406 ScheduleStart = I;
14407 ScheduleEnd = I->getNextNode();
14408 if (isOneOf(S, I) != I)
14409 CheckScheduleForI(I);
14410 assert(ScheduleEnd && "tried to vectorize a terminator?");
14411 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
14412 return true;
14413 }
14414 // Search up and down at the same time, because we don't know if the new
14415 // instruction is above or below the existing scheduling region.
14416 // Ignore debug info (and other "AssumeLike" intrinsics) so it's not counted
14417 // against the budget. Otherwise debug info could affect codegen.
14418 BasicBlock::reverse_iterator UpIter =
14419 ++ScheduleStart->getIterator().getReverse();
14420 BasicBlock::reverse_iterator UpperEnd = BB->rend();
14421 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
14422 BasicBlock::iterator LowerEnd = BB->end();
14423 auto IsAssumeLikeIntr = [](const Instruction &I) {
14424 if (auto *II = dyn_cast<IntrinsicInst>(&I))
14425 return II->isAssumeLikeIntrinsic();
14426 return false;
14427 };
14428 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14429 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14430 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
14431 &*DownIter != I) {
14432 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
14433 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
14434 return false;
14435 }
14436
14437 ++UpIter;
14438 ++DownIter;
14439
14440 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
14441 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
14442 }
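// The loop has now located I relative to the current region: either above it
// (UpIter reached I, or the downward scan hit the block end) or below it; the
// region start or end is extended accordingly.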
14443 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
14444 assert(I->getParent() == ScheduleStart->getParent() &&
14445 "Instruction is in wrong basic block.");
14446 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
14447 ScheduleStart = I;
14448 if (isOneOf(S, I) != I)
14449 CheckScheduleForI(I);
14450 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
14451 << "\n");
14452 return true;
14453 }
14454 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
14455 "Expected to reach top of the basic block or instruction down the "
14456 "lower end.");
14457 assert(I->getParent() == ScheduleEnd->getParent() &&
14458 "Instruction is in wrong basic block.");
14459 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
14460 nullptr);
14461 ScheduleEnd = I->getNextNode();
14462 if (isOneOf(S, I) != I)
14463 CheckScheduleForI(I);
14464 assert(ScheduleEnd && "tried to vectorize a terminator?");
14465 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
14466 return true;
14467}
14468
14469void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
14470 Instruction *ToI,
14471 ScheduleData *PrevLoadStore,
14472 ScheduleData *NextLoadStore) {
14473 ScheduleData *CurrentLoadStore = PrevLoadStore;
14474 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
14475 // No need to allocate data for non-schedulable instructions.
14476 if (doesNotNeedToBeScheduled(I))
14477 continue;
14478 ScheduleData *SD = ScheduleDataMap.lookup(I);
14479 if (!SD) {
14480 SD = allocateScheduleDataChunks();
14481 ScheduleDataMap[I] = SD;
14482 SD->Inst = I;
14483 }
14484 assert(!isInSchedulingRegion(SD) &&
14485 "new ScheduleData already in scheduling region");
14486 SD->init(SchedulingRegionID, I);
14487
14488 if (I->mayReadOrWriteMemory() &&
14489 (!isa<IntrinsicInst>(I) ||
14490 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
14491 cast<IntrinsicInst>(I)->getIntrinsicID() !=
14492 Intrinsic::pseudoprobe))) {
14493 // Update the linked list of memory accessing instructions.
14494 if (CurrentLoadStore) {
14495 CurrentLoadStore->NextLoadStore = SD;
14496 } else {
14497 FirstLoadStoreInRegion = SD;
14498 }
14499 CurrentLoadStore = SD;
14500 }
14501
14502 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14503 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14504 RegionHasStackSave = true;
14505 }
14506 if (NextLoadStore) {
14507 if (CurrentLoadStore)
14508 CurrentLoadStore->NextLoadStore = NextLoadStore;
14509 } else {
14510 LastLoadStoreInRegion = CurrentLoadStore;
14511 }
14512}
14513
14514void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
14515 bool InsertInReadyList,
14516 BoUpSLP *SLP) {
14517 assert(SD->isSchedulingEntity());
14518
14519 SmallVector<ScheduleData *, 10> WorkList;
14520 WorkList.push_back(SD);
14521
14522 while (!WorkList.empty()) {
14523 ScheduleData *SD = WorkList.pop_back_val();
14524 for (ScheduleData *BundleMember = SD; BundleMember;
14525 BundleMember = BundleMember->NextInBundle) {
14526 assert(isInSchedulingRegion(BundleMember));
14527 if (BundleMember->hasValidDependencies())
14528 continue;
14529
14530 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
14531 << "\n");
14532 BundleMember->Dependencies = 0;
14533 BundleMember->resetUnscheduledDeps();
14534
14535 // Handle def-use chain dependencies.
14536 if (BundleMember->OpValue != BundleMember->Inst) {
14537 if (ScheduleData *UseSD = getScheduleData(BundleMember->Inst)) {
14538 BundleMember->Dependencies++;
14539 ScheduleData *DestBundle = UseSD->FirstInBundle;
14540 if (!DestBundle->IsScheduled)
14541 BundleMember->incrementUnscheduledDeps(1);
14542 if (!DestBundle->hasValidDependencies())
14543 WorkList.push_back(DestBundle);
14544 }
14545 } else {
14546 for (User *U : BundleMember->Inst->users()) {
14547 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
14548 BundleMember->Dependencies++;
14549 ScheduleData *DestBundle = UseSD->FirstInBundle;
14550 if (!DestBundle->IsScheduled)
14551 BundleMember->incrementUnscheduledDeps(1);
14552 if (!DestBundle->hasValidDependencies())
14553 WorkList.push_back(DestBundle);
14554 }
14555 }
14556 }
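// Def-use dependencies are recorded against the bundles of the in-region
// users: this bundle's unscheduled-dependency count only drops once the
// user's bundle has been scheduled.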
14557
14558 auto MakeControlDependent = [&](Instruction *I) {
14559 auto *DepDest = getScheduleData(I);
14560 assert(DepDest && "must be in schedule window");
14561 DepDest->ControlDependencies.push_back(BundleMember);
14562 BundleMember->Dependencies++;
14563 ScheduleData *DestBundle = DepDest->FirstInBundle;
14564 if (!DestBundle->IsScheduled)
14565 BundleMember->incrementUnscheduledDeps(1);
14566 if (!DestBundle->hasValidDependencies())
14567 WorkList.push_back(DestBundle);
14568 };
14569
14570 // Any instruction which isn't safe to speculate at the beginning of the
14571 // block is control dependent on any early exit or non-willreturn call
14572 // which precedes it.
14573 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
14574 for (Instruction *I = BundleMember->Inst->getNextNode();
14575 I != ScheduleEnd; I = I->getNextNode()) {
14576 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
14577 continue;
14578
14579 // Add the dependency
14580 MakeControlDependent(I);
14581
14582 if (!isGuaranteedToTransferExecutionToSuccessor(I))
14583 // Everything past here must be control dependent on I.
14584 break;
14585 }
14586 }
14587
14588 if (RegionHasStackSave) {
14589 // If we have an inalloca alloca instruction, it needs to be scheduled
14590 // after any preceding stacksave. We also need to prevent any alloca
14591 // from reordering above a preceding stackrestore.
14592 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
14593 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
14594 for (Instruction *I = BundleMember->Inst->getNextNode();
14595 I != ScheduleEnd; I = I->getNextNode()) {
14596 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
14597 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14598 // Any allocas past here must be control dependent on I, and I
14599 // must be memory dependent on BundleMember->Inst.
14600 break;
14601
14602 if (!isa<AllocaInst>(I))
14603 continue;
14604
14605 // Add the dependency
14606 MakeControlDependent(I);
14607 }
14608 }
14609
14610 // In addition to the cases handled just above, we need to prevent
14611 // allocas and loads/stores from moving below a stacksave or a
14612 // stackrestore. Keeping allocas from moving below a stackrestore is
14613 // currently believed to be merely conservative; moving loads/stores
14614 // below a stackrestore can lead to incorrect code.
14615 if (isa<AllocaInst>(BundleMember->Inst) ||
14616 BundleMember->Inst->mayReadOrWriteMemory()) {
14617 for (Instruction *I = BundleMember->Inst->getNextNode();
14618 I != ScheduleEnd; I = I->getNextNode()) {
14619 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
14620 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
14621 continue;
14622
14623 // Add the dependency
14624 MakeControlDependent(I);
14625 break;
14626 }
14627 }
14628 }
14629
14630 // Handle the memory dependencies (if any).
14631 ScheduleData *DepDest = BundleMember->NextLoadStore;
14632 if (!DepDest)
14633 continue;
14634 Instruction *SrcInst = BundleMember->Inst;
14635 assert(SrcInst->mayReadOrWriteMemory() &&
14636 "NextLoadStore list for non memory effecting bundle?");
14637 MemoryLocation SrcLoc = getLocation(SrcInst);
14638 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
14639 unsigned NumAliased = 0;
14640 unsigned DistToSrc = 1;
14641
14642 for (; DepDest; DepDest = DepDest->NextLoadStore) {
14643 assert(isInSchedulingRegion(DepDest));
14644
14645 // We have two limits to reduce the complexity:
14646 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
14647 // SLP->isAliased (which is the expensive part in this loop).
14648 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
14649 // the whole loop (even if the loop is fast, it's quadratic).
14650 // It's important for the loop break condition (see below) to
14651 // check this limit even between two read-only instructions.
14652 if (DistToSrc >= MaxMemDepDistance ||
14653 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
14654 (NumAliased >= AliasedCheckLimit ||
14655 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
14656
14657 // We increment the counter only if the locations are aliased
14658 // (instead of counting all alias checks). This gives a better
14659 // balance between reduced runtime and accurate dependencies.
14660 NumAliased++;
14661
14662 DepDest->MemoryDependencies.push_back(BundleMember);
14663 BundleMember->Dependencies++;
14664 ScheduleData *DestBundle = DepDest->FirstInBundle;
14665 if (!DestBundle->IsScheduled) {
14666 BundleMember->incrementUnscheduledDeps(1);
14667 }
14668 if (!DestBundle->hasValidDependencies()) {
14669 WorkList.push_back(DestBundle);
14670 }
14671 }
14672
14673 // Example, explaining the loop break condition: Let's assume our
14674 // starting instruction is i0 and MaxMemDepDistance = 3.
14675 //
14676 // +--------v--v--v
14677 // i0,i1,i2,i3,i4,i5,i6,i7,i8
14678 // +--------^--^--^
14679 //
14680 // MaxMemDepDistance let us stop alias-checking at i3 and we add
14681 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
14682 // Previously we already added dependencies from i3 to i6,i7,i8
14683 // (because of MaxMemDepDistance). As we added a dependency from
14684 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
14685 // and we can abort this loop at i6.
14686 if (DistToSrc >= 2 * MaxMemDepDistance)
14687 break;
14688 DistToSrc++;
14689 }
14690 }
14691 if (InsertInReadyList && SD->isReady()) {
14692 ReadyInsts.insert(SD);
14693 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
14694 << "\n");
14695 }
14696 }
14697}
14698
14699void BoUpSLP::BlockScheduling::resetSchedule() {
14700 assert(ScheduleStart &&
14701 "tried to reset schedule on block which has not been scheduled");
14702 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
14703 doForAllOpcodes(I, [&](ScheduleData *SD) {
14704 assert(isInSchedulingRegion(SD) &&
14705 "ScheduleData not in scheduling region");
14706 SD->IsScheduled = false;
14707 SD->resetUnscheduledDeps();
14708 });
14709 }
14710 ReadyInsts.clear();
14711}
14712
14713void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
14714 if (!BS->ScheduleStart)
14715 return;
14716
14717 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
14718
14719 // A key point - if we got here, pre-scheduling was able to find a valid
14720 // scheduling of the sub-graph of the scheduling window which consists
14721 // of all vector bundles and their transitive users. As such, we do not
14722 // need to reschedule anything *outside of* that subgraph.
14723
14724 BS->resetSchedule();
14725
14726 // For the real scheduling we use a more sophisticated ready-list: it is
14727 // sorted by the original instruction location. This lets the final schedule
14728 // be as close as possible to the original instruction order.
14729 // WARNING: If changing this order causes a correctness issue, that means
14730 // there is some missing dependence edge in the schedule data graph.
14731 struct ScheduleDataCompare {
14732 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
14733 return SD2->SchedulingPriority < SD1->SchedulingPriority;
14734 }
14735 };
14736 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
14737
14738 // Ensure that all dependency data is updated (for nodes in the sub-graph)
14739 // and fill the ready-list with initial instructions.
14740 int Idx = 0;
14741 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
14742 I = I->getNextNode()) {
14743 BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
14744 TreeEntry *SDTE = getTreeEntry(SD->Inst);
14745 (void)SDTE;
14746 assert((isVectorLikeInstWithConstOps(SD->Inst) ||
14747 SD->isPartOfBundle() ==
14748 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
14749 "scheduler and vectorizer bundle mismatch");
14750 SD->FirstInBundle->SchedulingPriority = Idx++;
14751
14752 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
14753 BS->calculateDependencies(SD, false, this);
14754 });
14755 }
14756 BS->initialFillReadyList(ReadyInsts);
14757
14758 Instruction *LastScheduledInst = BS->ScheduleEnd;
14759
14760 // Do the "real" scheduling.
14761 while (!ReadyInsts.empty()) {
14762 ScheduleData *Picked = *ReadyInsts.begin();
14763 ReadyInsts.erase(ReadyInsts.begin());
14764
14765 // Move the scheduled instruction(s) to their dedicated places, if not
14766 // there yet.
14767 for (ScheduleData *BundleMember = Picked; BundleMember;
14768 BundleMember = BundleMember->NextInBundle) {
14769 Instruction *PickedInst = BundleMember->Inst;
14770 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
14771 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
14772 LastScheduledInst = PickedInst;
14773 }
14774
14775 BS->schedule(Picked, ReadyInsts);
14776 }
14777
14778 // Check that we didn't break any of our invariants.
14779#ifdef EXPENSIVE_CHECKS
14780 BS->verify();
14781#endif
14782
14783#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
14784 // Check that all schedulable entities got scheduled
14785 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
14786 BS->doForAllOpcodes(I, [&](ScheduleData *SD) {
14787 if (SD->isSchedulingEntity() && SD->hasValidDependencies()) {
14788 assert(SD->IsScheduled && "must be scheduled at this point");
14789 }
14790 });
14791 }
14792#endif
14793
14794 // Avoid duplicate scheduling of the block.
14795 BS->ScheduleStart = nullptr;
14796}
14797
14798 unsigned BoUpSLP::getVectorElementSize(Value *V) {
14799 // If V is a store, just return the width of the stored value (or value
14800 // truncated just before storing) without traversing the expression tree.
14801 // This is the common case.
14802 if (auto *Store = dyn_cast<StoreInst>(V))
14803 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
14804
14805 if (auto *IEI = dyn_cast<InsertElementInst>(V))
14806 return getVectorElementSize(IEI->getOperand(1));
14807
14808 auto E = InstrElementSize.find(V);
14809 if (E != InstrElementSize.end())
14810 return E->second;
14811
14812 // If V is not a store, we can traverse the expression tree to find loads
14813 // that feed it. The type of the loaded value may indicate a more suitable
14814 // width than V's type. We want to base the vector element size on the width
14815 // of memory operations where possible.
14816 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
14817 SmallPtrSet<Instruction *, 16> Visited;
14818 if (auto *I = dyn_cast<Instruction>(V)) {
14819 Worklist.emplace_back(I, I->getParent(), 0);
14820 Visited.insert(I);
14821 }
14822
14823 // Traverse the expression tree in bottom-up order looking for loads. If we
14824 // encounter an instruction we don't yet handle, we give up.
14825 auto Width = 0u;
14826 Value *FirstNonBool = nullptr;
14827 while (!Worklist.empty()) {
14828 auto [I, Parent, Level] = Worklist.pop_back_val();
14829
14830 // We should only be looking at scalar instructions here. If the current
14831 // instruction has a vector type, skip.
14832 auto *Ty = I->getType();
14833 if (isa<VectorType>(Ty))
14834 continue;
14835 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
14836 FirstNonBool = I;
14837 if (Level > RecursionMaxDepth)
14838 continue;
14839
14840 // If the current instruction is a load, extractelement or extractvalue,
14841 // update Width to reflect the width of the accessed value.
14842 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
14843 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
14844
14845 // Otherwise, we need to visit the operands of the instruction. We only
14846 // handle the interesting cases from buildTree here. If an operand is an
14847 // instruction we haven't yet visited and from the same basic block as the
14848 // user or the use is a PHI node, we add it to the worklist.
14849 else if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
14850 BinaryOperator, UnaryOperator>(I)) {
14851 for (Use &U : I->operands()) {
14852 if (auto *J = dyn_cast<Instruction>(U.get()))
14853 if (Visited.insert(J).second &&
14854 (isa<PHINode>(I) || J->getParent() == Parent)) {
14855 Worklist.emplace_back(J, J->getParent(), Level + 1);
14856 continue;
14857 }
14858 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
14859 FirstNonBool = U.get();
14860 }
14861 } else {
14862 break;
14863 }
14864 }
14865
14866 // If we didn't encounter a memory access in the expression tree, or if we
14867 // gave up for some reason, just return the width of V. Otherwise, return the
14868 // maximum width we found.
14869 if (!Width) {
14870 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
14871 V = FirstNonBool;
14872 Width = DL->getTypeSizeInBits(V->getType());
14873 }
14874
14875 for (Instruction *I : Visited)
14876 InstrElementSize[I] = Width;
14877
14878 return Width;
14879}
14880
14881bool BoUpSLP::collectValuesToDemote(
14882 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
14883 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
14884 unsigned &MaxDepthLevel, bool &IsProfitableToDemote,
14885 bool IsTruncRoot) const {
14886 // We can always demote constants.
14887 if (all_of(E.Scalars, IsaPred<Constant>))
14888 return true;
14889
14890 unsigned OrigBitWidth = DL->getTypeSizeInBits(E.Scalars.front()->getType());
14891 if (OrigBitWidth == BitWidth) {
14892 MaxDepthLevel = 1;
14893 return true;
14894 }
14895
14896 // If the value is not a vectorized instruction in the expression and not used
14897 // by the insertelement instruction and not used in multiple vector nodes, it
14898 // cannot be demoted.
14899 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
14900 if (MultiNodeScalars.contains(V))
14901 return false;
14902 if (OrigBitWidth > BitWidth) {
14903 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14904 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
14905 return true;
14906 }
14907 auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
14908 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
14909 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*DL));
14910 if (IsSigned)
14911 ++BitWidth1;
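// BitWidth1 is the number of bits needed to preserve V's value given its
// known sign bits; one extra bit is kept when the value may be negative.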
14912 if (auto *I = dyn_cast<Instruction>(V)) {
14913 APInt Mask = DB->getDemandedBits(I);
14914 unsigned BitWidth2 =
14915 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
14916 while (!IsSigned && BitWidth2 < OrigBitWidth) {
14917 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
14918 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
14919 break;
14920 BitWidth2 *= 2;
14921 }
14922 BitWidth1 = std::min(BitWidth1, BitWidth2);
14923 }
14924 BitWidth = std::max(BitWidth, BitWidth1);
14925 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
14926 };
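// For example: with OrigBitWidth == 32 and a scalar whose top 24 bits are
// known to be zero, BitWidth1 becomes 8, so the value is treated as
// truncatable as long as the resulting BitWidth stays at or below 16
// (OrigBitWidth >= BitWidth * 2).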
14927 using namespace std::placeholders;
14928 auto FinalAnalysis = [&]() {
14929 if (!IsProfitableToDemote)
14930 return false;
14931 bool Res = all_of(
14932 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
14933 // Demote gathers.
14934 if (Res && E.State == TreeEntry::NeedToGather) {
14935 // Check possible extractelement instructions bases and final vector
14936 // length.
14937 SmallPtrSet<Value *, 4> UniqueBases;
14938 for (Value *V : E.Scalars) {
14939 auto *EE = dyn_cast<ExtractElementInst>(V);
14940 if (!EE)
14941 continue;
14942 UniqueBases.insert(EE->getVectorOperand());
14943 }
14944 const unsigned VF = E.Scalars.size();
14945 Type *OrigScalarTy = E.Scalars.front()->getType();
14946 if (UniqueBases.size() <= 2 ||
14947 TTI->getNumberOfParts(FixedVectorType::get(OrigScalarTy, VF)) ==
14948 TTI->getNumberOfParts(FixedVectorType::get(
14949 IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
14950 ToDemote.push_back(E.Idx);
14951 }
14952 return Res;
14953 };
14954 if (E.State == TreeEntry::NeedToGather || !Visited.insert(&E).second ||
14955 any_of(E.Scalars, [&](Value *V) {
14956 return all_of(V->users(), [&](User *U) {
14957 return isa<InsertElementInst>(U) && !getTreeEntry(U);
14958 });
14959 }))
14960 return FinalAnalysis();
14961
14962 if (any_of(E.Scalars, [&](Value *V) {
14963 return !all_of(V->users(), [=](User *U) {
14964 return getTreeEntry(U) ||
14965 (UserIgnoreList && UserIgnoreList->contains(U)) ||
14966 (!isa<CmpInst>(U) && U->getType()->isSized() &&
14967 !U->getType()->isScalableTy() &&
14968 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
14969 }) && !IsPotentiallyTruncated(V, BitWidth);
14970 }))
14971 return false;
14972
14973 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
14974 bool &NeedToExit) {
14975 NeedToExit = false;
14976 unsigned InitLevel = MaxDepthLevel;
14977 for (const TreeEntry *Op : Operands) {
14978 unsigned Level = InitLevel;
14979 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
14980 ToDemote, Visited, Level, IsProfitableToDemote,
14981 IsTruncRoot)) {
14982 if (!IsProfitableToDemote)
14983 return false;
14984 NeedToExit = true;
14985 if (!FinalAnalysis())
14986 return false;
14987 continue;
14988 }
14989 MaxDepthLevel = std::max(MaxDepthLevel, Level);
14990 }
14991 return true;
14992 };
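// ProcessOperands recurses into each operand entry and keeps the deepest
// level reached in MaxDepthLevel. If an operand cannot be demoted but demotion
// is still generally profitable, FinalAnalysis() records what has been
// collected so far and NeedToExit stops further processing of this entry.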
14993 auto AttemptCheckBitwidth =
14994 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
14995 // Try all bitwidth < OrigBitWidth.
14996 NeedToExit = false;
14997 unsigned BestFailBitwidth = 0;
14998 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
14999 if (Checker(BitWidth, OrigBitWidth))
15000 return true;
15001 if (BestFailBitwidth == 0 && FinalAnalysis())
15002 BestFailBitwidth = BitWidth;
15003 }
15004 if (BitWidth >= OrigBitWidth) {
15005 if (BestFailBitwidth == 0) {
15006 BitWidth = OrigBitWidth;
15007 return false;
15008 }
15009 MaxDepthLevel = 1;
15010 BitWidth = BestFailBitwidth;
15011 NeedToExit = true;
15012 return true;
15013 }
15014 return false;
15015 };
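// AttemptCheckBitwidth widens the candidate BitWidth in power-of-two steps
// (e.g. 8 -> 16 -> 32) until Checker accepts it; if no width is accepted it
// either restores OrigBitWidth and fails, or falls back to the best width for
// which the already-collected scalars still pass FinalAnalysis().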
15016 auto TryProcessInstruction =
15017 [&](unsigned &BitWidth,
15018 ArrayRef<const TreeEntry *> Operands = std::nullopt,
15019 function_ref<bool(unsigned, unsigned)> Checker = {}) {
15020 if (Operands.empty()) {
15021 if (!IsTruncRoot)
15022 MaxDepthLevel = 1;
15023 (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
15024 std::ref(BitWidth)));
15025 } else {
15026 // Several vectorized uses? Check if we can truncate it, otherwise -
15027 // exit.
15028 if (E.UserTreeIndices.size() > 1 &&
15029 !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
15030 std::ref(BitWidth))))
15031 return false;
15032 bool NeedToExit = false;
15033 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
15034 return false;
15035 if (NeedToExit)
15036 return true;
15037 if (!ProcessOperands(Operands, NeedToExit))
15038 return false;
15039 if (NeedToExit)
15040 return true;
15041 }
15042
15043 ++MaxDepthLevel;
15044 // Record the entry that we can demote.
15045 ToDemote.push_back(E.Idx);
15046 return IsProfitableToDemote;
15047 };
15048 switch (E.getOpcode()) {
15049
15050 // We can always demote truncations and extensions. Since truncations can
15051 // seed additional demotion, we save the truncated value.
15052 case Instruction::Trunc:
15053 if (IsProfitableToDemoteRoot)
15054 IsProfitableToDemote = true;
15055 return TryProcessInstruction(BitWidth);
15056 case Instruction::ZExt:
15057 case Instruction::SExt:
15058 IsProfitableToDemote = true;
15059 return TryProcessInstruction(BitWidth);
15060
15061 // We can demote certain binary operations if we can demote both of their
15062 // operands.
15063 case Instruction::Add:
15064 case Instruction::Sub:
15065 case Instruction::Mul:
15066 case Instruction::And:
15067 case Instruction::Or:
15068 case Instruction::Xor: {
15069 return TryProcessInstruction(
15070 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
15071 }
15072 case Instruction::Shl: {
15073 // If we are truncating the result of this SHL, and if it's a shift of an
15074 // inrange amount, we can always perform a SHL in a smaller type.
15075 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
15076 return all_of(E.Scalars, [&](Value *V) {
15077 auto *I = cast<Instruction>(V);
15078 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15079 return AmtKnownBits.getMaxValue().ult(BitWidth);
15080 });
15081 };
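// For example: the check above accepts "shl i32 %x, 3" for demotion to i8,
// because the shift amount (3) is known to be less than the narrower width (8).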
15082 return TryProcessInstruction(
15083 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
15084 }
15085 case Instruction::LShr: {
15086 // If this is a truncate of a logical shr, we can truncate it to a smaller
15087 // lshr iff we know that the bits we would otherwise be shifting in are
15088 // already zeros.
15089 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15090 return all_of(E.Scalars, [&](Value *V) {
15091 auto *I = cast<Instruction>(V);
15092 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15093 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15094 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
15095 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
15096 SimplifyQuery(*DL));
15097 });
15098 };
15099 return TryProcessInstruction(
15100 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
15101 LShrChecker);
15102 }
15103 case Instruction::AShr: {
15104 // If this is a truncate of an arithmetic shr, we can truncate it to a
15105 // smaller ashr iff we know that all the bits from the sign bit of the
15106 // original type and the sign bit of the truncate type are similar.
15107 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15108 return all_of(E.Scalars, [&](Value *V) {
15109 auto *I = cast<Instruction>(V);
15110 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
15111 unsigned ShiftedBits = OrigBitWidth - BitWidth;
15112 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
15113 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
15114 nullptr, DT);
15115 });
15116 };
15117 return TryProcessInstruction(
15118 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
15119 AShrChecker);
15120 }
15121 case Instruction::UDiv:
15122 case Instruction::URem: {
15123 // UDiv and URem can be truncated if all the truncated bits are zero.
15124 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15125 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
15126 return all_of(E.Scalars, [&](Value *V) {
15127 auto *I = cast<Instruction>(V);
15128 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15129 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
15130 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
15131 });
15132 };
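// For example: "udiv i32 %a, %b" where the upper 24 bits of both operands are
// known to be zero produces the same low bits when performed as an i8 udiv, so
// the check above accepts demotion to 8 bits.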
15133 return TryProcessInstruction(
15134 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
15135 }
15136
15137 // We can demote selects if we can demote their true and false values.
15138 case Instruction::Select: {
15139 return TryProcessInstruction(
15140 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
15141 }
15142
15143 // We can demote phis if we can demote all their incoming operands. Note that
15144 // we don't need to worry about cycles since we ensure single use above.
15145 case Instruction::PHI: {
15146 const unsigned NumOps = E.getNumOperands();
15147 SmallVector<const TreeEntry *> Ops(NumOps);
15148 transform(seq<unsigned>(0, NumOps), Ops.begin(),
15149 std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
15150
15151 return TryProcessInstruction(BitWidth, Ops);
15152 }
15153
15154 case Instruction::Call: {
15155 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
15156 if (!IC)
15157 break;
15158 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
15159 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
15160 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
15161 break;
15162 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
15163 function_ref<bool(unsigned, unsigned)> CallChecker;
15164 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
15165 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
15166 return all_of(E.Scalars, [&](Value *V) {
15167 auto *I = cast<Instruction>(V);
15168 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
15169 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
15170 return MaskedValueIsZero(I->getOperand(0), Mask,
15171 SimplifyQuery(*DL)) &&
15172 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
15173 }
15174 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
15175 "Expected min/max intrinsics only.");
15176 unsigned SignBits = OrigBitWidth - BitWidth;
15177 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
15178 unsigned Op0SignBits = ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
15179 nullptr, DT);
15180 unsigned Op1SignBits = ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
15181 nullptr, DT);
15182 return SignBits <= Op0SignBits &&
15183 ((SignBits != Op0SignBits &&
15184 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
15185 MaskedValueIsZero(I->getOperand(0), Mask,
15186 SimplifyQuery(*DL))) &&
15187 SignBits <= Op1SignBits &&
15188 ((SignBits != Op1SignBits &&
15189 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
15190 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
15191 });
15192 };
15193 if (ID != Intrinsic::abs) {
15194 Operands.push_back(getOperandEntry(&E, 1));
15195 CallChecker = CompChecker;
15196 }
15197 InstructionCost BestCost =
15198 std::numeric_limits<InstructionCost::CostType>::max();
15199 unsigned BestBitWidth = BitWidth;
15200 unsigned VF = E.Scalars.size();
15201 // Choose the best bitwidth based on cost estimations.
15202 auto Checker = [&](unsigned BitWidth, unsigned) {
15203 unsigned MinBW = PowerOf2Ceil(BitWidth);
15204 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(IC, ID, VF, MinBW);
15205 auto VecCallCosts = getVectorCallCosts(
15206 IC,
15207 FixedVectorType::get(IntegerType::get(IC->getContext(), MinBW), VF),
15208 TTI, TLI, ArgTys);
15209 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
15210 if (Cost < BestCost) {
15211 BestCost = Cost;
15212 BestBitWidth = BitWidth;
15213 }
15214 return false;
15215 };
15216 [[maybe_unused]] bool NeedToExit;
15217 (void)AttemptCheckBitwidth(Checker, NeedToExit);
15218 BitWidth = BestBitWidth;
15219 return TryProcessInstruction(BitWidth, Operands, CallChecker);
15220 }
15221
15222 // Otherwise, conservatively give up.
15223 default:
15224 break;
15225 }
15226 MaxDepthLevel = 1;
15227 return FinalAnalysis();
15228}
15229
15230static RecurKind getRdxKind(Value *V);
15231
15232void BoUpSLP::computeMinimumValueSizes() {
15233 // We only attempt to truncate integer expressions.
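// For example: a tree rooted at a store i16 of "trunc i32 -> i16" whose
// operand is "add (zext i8 %a), (zext i8 %b)" can have the add demoted, so the
// vectorized code operates on i16 lanes instead of i32; the chosen narrower
// widths are recorded in MinBWs below.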
15234 bool IsStoreOrInsertElt =
15235 VectorizableTree.front()->getOpcode() == Instruction::Store ||
15236 VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
15237 if ((IsStoreOrInsertElt || UserIgnoreList) &&
15238 ExtraBitWidthNodes.size() <= 1 &&
15239 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
15240 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
15241 return;
15242
15243 unsigned NodeIdx = 0;
15244 if (IsStoreOrInsertElt &&
15245 VectorizableTree.front()->State != TreeEntry::NeedToGather)
15246 NodeIdx = 1;
15247
15248 // Ensure the roots of the vectorizable tree don't form a cycle.
15249 if (VectorizableTree[NodeIdx]->State == TreeEntry::NeedToGather ||
15250 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
15251 (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15252 [NodeIdx](const EdgeInfo &EI) {
15253 return EI.UserTE->Idx >
15254 static_cast<int>(NodeIdx);
15255 })))
15256 return;
15257
15258 // The first value node for store/insertelement is sext/zext/trunc? Skip it,
15259 // resize to the final type.
15260 bool IsTruncRoot = false;
15261 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
15262 SmallVector<unsigned> RootDemotes;
15263 if (NodeIdx != 0 &&
15264 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15265 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15266 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
15267 IsTruncRoot = true;
15268 RootDemotes.push_back(NodeIdx);
15269 IsProfitableToDemoteRoot = true;
15270 ++NodeIdx;
15271 }
15272
15273 // The reduction was already analyzed and found not profitable - exit.
15274 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
15275 return;
15276
15277 SmallVector<unsigned> ToDemote;
15278 auto ComputeMaxBitWidth = [&](const TreeEntry &E, bool IsTopRoot,
15279 bool IsProfitableToDemoteRoot, unsigned Opcode,
15280 unsigned Limit, bool IsTruncRoot,
15281 bool IsSignedCmp) {
15282 ToDemote.clear();
15283 unsigned VF = E.getVectorFactor();
15284 auto *TreeRootIT = dyn_cast<IntegerType>(E.Scalars.front()->getType());
15285 if (!TreeRootIT || !Opcode)
15286 return 0u;
15287
15288 if (any_of(E.Scalars,
15289 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
15290 return 0u;
15291
15292 unsigned NumParts =
15293 TTI->getNumberOfParts(FixedVectorType::get(TreeRootIT, VF));
15294
15295 // The maximum bit width required to represent all the values that can be
15296 // demoted without loss of precision. It would be safe to truncate the roots
15297 // of the expression to this width.
15298 unsigned MaxBitWidth = 1u;
15299
15300 // True if the roots can be zero-extended back to their original type,
15301 // rather than sign-extended. We know that if the leading bits are not
15302 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
15303 // True.
15304 // Determine if the sign bit of all the roots is known to be zero. If not,
15305 // IsKnownPositive is set to False.
15306 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
15307 KnownBits Known = computeKnownBits(R, *DL);
15308 return Known.isNonNegative();
15309 });
15310
15311 // We first check if all the bits of the roots are demanded. If they're not,
15312 // we can truncate the roots to this narrower type.
15313 for (Value *Root : E.Scalars) {
15314 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
15315 TypeSize NumTypeBits = DL->getTypeSizeInBits(Root->getType());
15316 unsigned BitWidth1 = NumTypeBits - NumSignBits;
15317 // If we can't prove that the sign bit is zero, we must add one to the
15318 // maximum bit width to account for the unknown sign bit. This preserves
15319 // the existing sign bit so we can safely sign-extend the root back to the
15320 // original type. Otherwise, if we know the sign bit is zero, we will
15321 // zero-extend the root instead.
15322 //
15323 // FIXME: This is somewhat suboptimal, as there will be cases where adding
15324 // one to the maximum bit width will yield a larger-than-necessary
15325 // type. In general, we need to add an extra bit only if we can't
15326 // prove that the upper bit of the original type is equal to the
15327 // upper bit of the proposed smaller type. If these two bits are
15328 // the same (either zero or one) we know that sign-extending from
15329 // the smaller type will result in the same value. Here, since we
15330 // can't yet prove this, we are just making the proposed smaller
15331 // type larger to ensure correctness.
15332 if (!IsKnownPositive)
15333 ++BitWidth1;
15334
15335 APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
15336 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15337 MaxBitWidth =
15338 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
15339 }
15340
15341 if (MaxBitWidth < 8 && MaxBitWidth > 1)
15342 MaxBitWidth = 8;
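// Widths of 2..7 bits are rounded up to a full byte here; sub-byte vector
// element types are generally not supported or profitable on targets.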
15343
15344 // If the original type is large, but the reduced type does not improve
15345 // register use - ignore it.
15346 if (NumParts > 1 &&
15347 NumParts ==
15348 TTI->getNumberOfParts(FixedVectorType::get(
15349 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
15350 return 0u;
15351
15352 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
15353 Opcode == Instruction::SExt ||
15354 Opcode == Instruction::ZExt || NumParts > 1;
15355 // Conservatively determine if we can actually truncate the roots of the
15356 // expression. Collect the values that can be demoted in ToDemote and
15357 // additional roots that require investigating in Roots.
15358 DenseSet<const TreeEntry *> Visited;
15359 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
15360 bool NeedToDemote = IsProfitableToDemote;
15361
15362 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
15363 ToDemote, Visited, MaxDepthLevel, NeedToDemote,
15364 IsTruncRoot) ||
15365 (MaxDepthLevel <= Limit &&
15366 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
15367 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
15368 DL->getTypeSizeInBits(TreeRootIT) /
15369 DL->getTypeSizeInBits(cast<Instruction>(E.Scalars.front())
15370 ->getOperand(0)
15371 ->getType()) >
15372 2)))))
15373 return 0u;
15374 // Round MaxBitWidth up to the next power-of-two.
15375 MaxBitWidth = bit_ceil(MaxBitWidth);
15376
15377 return MaxBitWidth;
15378 };
15379
15380 // If we can truncate the root, we must collect additional values that might
15381 // be demoted as a result. That is, those seeded by truncations we will
15382 // modify.
15383 // Add reduction ops sizes, if any.
15384 if (UserIgnoreList &&
15385 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
15386 for (Value *V : *UserIgnoreList) {
15387 auto NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
15388 auto NumTypeBits = DL->getTypeSizeInBits(V->getType());
15389 unsigned BitWidth1 = NumTypeBits - NumSignBits;
15390 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
15391 ++BitWidth1;
15392 unsigned BitWidth2 = BitWidth1;
15393 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
15394 auto Mask = DB->getDemandedBits(cast<Instruction>(V));
15395 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
15396 }
15397 ReductionBitWidth =
15398 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
15399 }
15400 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
15401 ReductionBitWidth = 8;
15402
15403 ReductionBitWidth = bit_ceil(ReductionBitWidth);
15404 }
15405 bool IsTopRoot = NodeIdx == 0;
15406 while (NodeIdx < VectorizableTree.size() &&
15407 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
15408 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
15409 RootDemotes.push_back(NodeIdx);
15410 ++NodeIdx;
15411 IsTruncRoot = true;
15412 }
15413 bool IsSignedCmp = false;
15414 while (NodeIdx < VectorizableTree.size()) {
15415 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
15416 unsigned Limit = 2;
15417 unsigned Opcode = VectorizableTree[NodeIdx]->getOpcode();
15418 if (IsTopRoot &&
15419 ReductionBitWidth ==
15420 DL->getTypeSizeInBits(
15421 VectorizableTree.front()->Scalars.front()->getType()))
15422 Limit = 3;
15423 unsigned MaxBitWidth = ComputeMaxBitWidth(
15424 *VectorizableTree[NodeIdx].get(), IsTopRoot, IsProfitableToDemoteRoot,
15425 Opcode, Limit, IsTruncRoot, IsSignedCmp);
15426 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
15427 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
15428 ReductionBitWidth = bit_ceil(MaxBitWidth);
15429 else if (MaxBitWidth == 0)
15430 ReductionBitWidth = 0;
15431 }
15432
15433 for (unsigned Idx : RootDemotes) {
15434 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
15435 uint32_t OrigBitWidth = DL->getTypeSizeInBits(V->getType());
15436 if (OrigBitWidth > MaxBitWidth) {
15437 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
15438 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
15439 }
15440 return false;
15441 }))
15442 ToDemote.push_back(Idx);
15443 }
15444 RootDemotes.clear();
15445 IsTopRoot = false;
15446 IsProfitableToDemoteRoot = true;
15447
15448 if (ExtraBitWidthNodes.empty()) {
15449 NodeIdx = VectorizableTree.size();
15450 } else {
15451 unsigned NewIdx = 0;
15452 do {
15453 NewIdx = *ExtraBitWidthNodes.begin();
15454 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
15455 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
15456 NodeIdx = NewIdx;
15457 IsTruncRoot =
15458 NodeIdx < VectorizableTree.size() &&
15459 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15460 [](const EdgeInfo &EI) {
15461 return EI.EdgeIdx == 0 &&
15462 EI.UserTE->getOpcode() == Instruction::Trunc &&
15463 !EI.UserTE->isAltShuffle();
15464 });
15465 IsSignedCmp =
15466 NodeIdx < VectorizableTree.size() &&
15467 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
15468 [&](const EdgeInfo &EI) {
15469 return EI.UserTE->getOpcode() == Instruction::ICmp &&
15470 any_of(EI.UserTE->Scalars, [&](Value *V) {
15471 auto *IC = dyn_cast<ICmpInst>(V);
15472 return IC &&
15473 (IC->isSigned() ||
15474 !isKnownNonNegative(IC->getOperand(0),
15475 SimplifyQuery(*DL)) ||
15476 !isKnownNonNegative(IC->getOperand(1),
15477 SimplifyQuery(*DL)));
15478 });
15479 });
15480 }
15481
15482 // If the maximum bit width we compute is less than the width of the roots'
15483 // type, we can proceed with the narrowing. Otherwise, do nothing.
15484 if (MaxBitWidth == 0 ||
15485 MaxBitWidth >=
15486 cast<IntegerType>(TreeRoot.front()->getType())->getBitWidth()) {
15487 if (UserIgnoreList)
15488 AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
15489 continue;
15490 }
15491
15492 // Finally, map the values we can demote to the maximum bit width we
15493 // computed.
15494 for (unsigned Idx : ToDemote) {
15495 TreeEntry *TE = VectorizableTree[Idx].get();
15496 if (MinBWs.contains(TE))
15497 continue;
15498 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
15499 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15500 });
15501 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
15502 }
15503 }
15504}
15505
15506PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
15507 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
15508 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
15509 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
15510 auto *AA = &AM.getResult<AAManager>(F);
15511 auto *LI = &AM.getResult<LoopAnalysis>(F);
15512 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
15513 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
15514 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
15515 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
15516
15517 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
15518 if (!Changed)
15519 return PreservedAnalyses::all();
15520
15521 PreservedAnalyses PA;
15522 PA.preserveSet<CFGAnalyses>();
15523 return PA;
15524}
15525
15526bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
15527 TargetTransformInfo *TTI_,
15528 TargetLibraryInfo *TLI_, AAResults *AA_,
15529 LoopInfo *LI_, DominatorTree *DT_,
15530 AssumptionCache *AC_, DemandedBits *DB_,
15531 OptimizationRemarkEmitter *ORE_) {
15532 if (!RunSLPVectorization)
15533 return false;
15534 SE = SE_;
15535 TTI = TTI_;
15536 TLI = TLI_;
15537 AA = AA_;
15538 LI = LI_;
15539 DT = DT_;
15540 AC = AC_;
15541 DB = DB_;
15542 DL = &F.getParent()->getDataLayout();
15543
15544 Stores.clear();
15545 GEPs.clear();
15546 bool Changed = false;
15547
15548 // If the target claims to have no vector registers don't attempt
15549 // vectorization.
15550 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
15551 LLVM_DEBUG(
15552 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
15553 return false;
15554 }
15555
15556 // Don't vectorize when the attribute NoImplicitFloat is used.
15557 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
15558 return false;
15559
15560 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
15561
15562 // Use the bottom up slp vectorizer to construct chains that start with
15563 // store instructions.
15564 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
15565
15566 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
15567 // delete instructions.
15568
15569 // Update DFS numbers now so that we can use them for ordering.
15570 DT->updateDFSNumbers();
15571
15572 // Scan the blocks in the function in post order.
15573 for (auto *BB : post_order(&F.getEntryBlock())) {
15574 // Start new block - clear the list of reduction roots.
15575 R.clearReductionData();
15576 collectSeedInstructions(BB);
15577
15578 // Vectorize trees that end at stores.
15579 if (!Stores.empty()) {
15580 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
15581 << " underlying objects.\n");
15582 Changed |= vectorizeStoreChains(R);
15583 }
15584
15585 // Vectorize trees that end at reductions.
15586 Changed |= vectorizeChainsInBlock(BB, R);
15587
15588 // Vectorize the index computations of getelementptr instructions. This
15589 // is primarily intended to catch gather-like idioms ending at
15590 // non-consecutive loads.
15591 if (!GEPs.empty()) {
15592 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
15593 << " underlying objects.\n");
15594 Changed |= vectorizeGEPIndices(BB, R);
15595 }
15596 }
15597
15598 if (Changed) {
15599 R.optimizeGatherSequence();
15600 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
15601 }
15602 return Changed;
15603}
15604
15605std::optional<bool>
15606SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
15607 unsigned Idx, unsigned MinVF,
15608 unsigned &Size) {
15609 Size = 0;
15610 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
15611 << "\n");
15612 const unsigned Sz = R.getVectorElementSize(Chain[0]);
15613 unsigned VF = Chain.size();
15614
15615 if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) {
15616 // Check if vectorizing with a non-power-of-2 VF should be considered. At
15617 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
15618 // all vector lanes are used.
15619 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
15620 return false;
15621 }
15622
15623 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
15624 << "\n");
15625
15626 SetVector<Value *> ValOps;
15627 for (Value *V : Chain)
15628 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
15629 // Exit if the operands do not share a same/alternate opcode or are a non-power-of-2 set of unique values.
15630 InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
15631 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
15632 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
15633 bool IsPowerOf2 =
15634 isPowerOf2_32(ValOps.size()) ||
15635 (VectorizeNonPowerOf2 && isPowerOf2_32(ValOps.size() + 1));
15636 if ((!IsPowerOf2 && S.getOpcode() && S.getOpcode() != Instruction::Load &&
15637 (!S.MainOp->isSafeToRemove() ||
15638 any_of(ValOps.getArrayRef(),
15639 [&](Value *V) {
15640 return !isa<ExtractElementInst>(V) &&
15641 (V->getNumUses() > Chain.size() ||
15642 any_of(V->users(), [&](User *U) {
15643 return !Stores.contains(U);
15644 }));
15645 }))) ||
15646 (ValOps.size() > Chain.size() / 2 && !S.getOpcode())) {
15647 Size = (!IsPowerOf2 && S.getOpcode()) ? 1 : 2;
15648 return false;
15649 }
15650 }
15651 if (R.isLoadCombineCandidate(Chain))
15652 return true;
15653 R.buildTree(Chain);
15654 // Check if the tree is tiny and the store itself or its value is not vectorized.
15655 if (R.isTreeTinyAndNotFullyVectorizable()) {
15656 if (R.isGathered(Chain.front()) ||
15657 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
15658 return std::nullopt;
15659 Size = R.getTreeSize();
15660 return false;
15661 }
15662 R.reorderTopToBottom();
15663 R.reorderBottomToTop();
15664 R.buildExternalUses();
15665
15666 R.computeMinimumValueSizes();
15667 R.transformNodes();
15668
15669 Size = R.getTreeSize();
15670 if (S.getOpcode() == Instruction::Load)
15671 Size = 2; // cut off masked gather small trees
15672 InstructionCost Cost = R.getTreeCost();
15673
15674 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
15675 if (Cost < -SLPCostThreshold) {
15676 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
15677
15678 using namespace ore;
15679
15680 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
15681 cast<StoreInst>(Chain[0]))
15682 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
15683 << " and with tree size "
15684 << NV("TreeSize", R.getTreeSize()));
15685
15686 R.vectorizeTree();
15687 return true;
15688 }
15689
15690 return false;
15691}
15692
15693/// Checks if the quadratic mean deviation is less than 90% of the mean size.
15694static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
15695 bool First) {
15696 unsigned Num = 0;
15697 uint64_t Sum = std::accumulate(
15698 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
15699 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
15700 unsigned Size = First ? Val.first : Val.second;
15701 if (Size == 1)
15702 return V;
15703 ++Num;
15704 return V + Size;
15705 });
15706 if (Num == 0)
15707 return true;
15708 uint64_t Mean = Sum / Num;
15709 if (Mean == 0)
15710 return true;
15711 uint64_t Dev = std::accumulate(
15712 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
15713 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
15714 unsigned P = First ? Val.first : Val.second;
15715 if (P == 1)
15716 return V;
15717 return V + (P - Mean) * (P - Mean);
15718 }) /
15719 Num;
15720 return Dev * 81 / (Mean * Mean) == 0;
15721}
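// For example: per-slot tree sizes {3, 3, 3, 4} give Mean == 3 and an integer
// Dev of 0, so the function returns true and the slice is considered uniform
// enough to be retried with the current VF.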
15722
15723bool SLPVectorizerPass::vectorizeStores(
15724 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
15725 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
15726 &Visited) {
15727 // We may run into multiple chains that merge into a single chain. We mark the
15728 // stores that we vectorized so that we don't visit the same store twice.
15729 BoUpSLP::ValueSet VectorizedStores;
15730 bool Changed = false;
15731
15732 struct StoreDistCompare {
15733 bool operator()(const std::pair<unsigned, int> &Op1,
15734 const std::pair<unsigned, int> &Op2) const {
15735 return Op1.second < Op2.second;
15736 }
15737 };
15738 // A set of pairs (index of store in Stores array ref, Distance of the store
15739 // address relative to base store address in units).
15740 using StoreIndexToDistSet =
15741 std::set<std::pair<unsigned, int>, StoreDistCompare>;
15742 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
15743 int PrevDist = -1;
15744 BoUpSLP::ValueList Operands;
15745 // Collect the chain into a list.
15746 for (auto [Idx, Data] : enumerate(Set)) {
15747 if (Operands.empty() || Data.second - PrevDist == 1) {
15748 Operands.push_back(Stores[Data.first]);
15749 PrevDist = Data.second;
15750 if (Idx != Set.size() - 1)
15751 continue;
15752 }
15753 auto E = make_scope_exit([&, &DataVar = Data]() {
15754 Operands.clear();
15755 Operands.push_back(Stores[DataVar.first]);
15756 PrevDist = DataVar.second;
15757 });
15758
15759 if (Operands.size() <= 1 ||
15760 !Visited
15761 .insert({Operands.front(),
15762 cast<StoreInst>(Operands.front())->getValueOperand(),
15763 Operands.back(),
15764 cast<StoreInst>(Operands.back())->getValueOperand(),
15765 Operands.size()})
15766 .second)
15767 continue;
15768
15769 unsigned MaxVecRegSize = R.getMaxVecRegSize();
15770 unsigned EltSize = R.getVectorElementSize(Operands[0]);
15771 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
15772
15773 unsigned MaxVF =
15774 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
15775 unsigned MaxRegVF = MaxVF;
15776 auto *Store = cast<StoreInst>(Operands[0]);
15777 Type *StoreTy = Store->getValueOperand()->getType();
15778 Type *ValueTy = StoreTy;
15779 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
15780 ValueTy = Trunc->getSrcTy();
15781 if (ValueTy == StoreTy &&
15782 R.getVectorElementSize(Store->getValueOperand()) <= EltSize)
15783 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
15784 unsigned MinVF = std::max<unsigned>(
15785 2, PowerOf2Ceil(TTI->getStoreMinimumVF(
15786 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
15787 ValueTy)));
15788
15789 if (MaxVF < MinVF) {
15790 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
15791 << ") < "
15792 << "MinVF (" << MinVF << ")\n");
15793 continue;
15794 }
15795
15796 unsigned NonPowerOf2VF = 0;
15797 if (VectorizeNonPowerOf2) {
15798 // First try vectorizing with a non-power-of-2 VF. At the moment, only
15799 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
15800 // lanes are used.
15801 unsigned CandVF = Operands.size();
15802 if (isPowerOf2_32(CandVF + 1) && CandVF <= MaxRegVF)
15803 NonPowerOf2VF = CandVF;
15804 }
15805
15806 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
15807 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
15808 unsigned Size = MinVF;
15809 for_each(reverse(CandidateVFs), [&](unsigned &VF) {
15810 VF = Size > MaxVF ? NonPowerOf2VF : Size;
15811 Size *= 2;
15812 });
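// After this loop CandidateVFs holds, front to back: the optional
// non-power-of-2 VF (if any), then MaxVF, MaxVF/2, ..., down to MinVF, so the
// largest vectorization factors are tried first.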
15813 unsigned End = Operands.size();
15814 unsigned Repeat = 0;
15815 constexpr unsigned MaxAttempts = 4;
15816 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
15817 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
15818 P.first = P.second = 1;
15819 });
15821 auto IsNotVectorized = [](bool First,
15822 const std::pair<unsigned, unsigned> &P) {
15823 return First ? P.first > 0 : P.second > 0;
15824 };
15825 auto IsVectorized = [](bool First,
15826 const std::pair<unsigned, unsigned> &P) {
15827 return First ? P.first == 0 : P.second == 0;
15828 };
15829 auto VFIsProfitable = [](bool First, unsigned Size,
15830 const std::pair<unsigned, unsigned> &P) {
15831 return First ? Size >= P.first : Size >= P.second;
15832 };
15833 auto FirstSizeSame = [](unsigned Size,
15834 const std::pair<unsigned, unsigned> &P) {
15835 return Size == P.first;
15836 };
15837 while (true) {
15838 ++Repeat;
15839 bool RepeatChanged = false;
15840 bool AnyProfitableGraph;
15841 for (unsigned Size : CandidateVFs) {
15842 AnyProfitableGraph = false;
15843 unsigned StartIdx = std::distance(
15844 RangeSizes.begin(),
15845 find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
15846 std::placeholders::_1)));
15847 while (StartIdx < End) {
15848 unsigned EndIdx =
15849 std::distance(RangeSizes.begin(),
15850 find_if(RangeSizes.drop_front(StartIdx),
15851 std::bind(IsVectorized, Size >= MaxRegVF,
15852 std::placeholders::_1)));
15853 unsigned Sz = EndIdx >= End ? End : EndIdx;
15854 for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
15855 if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
15856 Size >= MaxRegVF)) {
15857 ++Cnt;
15858 continue;
15859 }
15860 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
15861 assert(all_of(Slice,
15862 [&](Value *V) {
15863 return cast<StoreInst>(V)
15864 ->getValueOperand()
15865 ->getType() ==
15866 cast<StoreInst>(Slice.front())
15867 ->getValueOperand()
15868 ->getType();
15869 }) &&
15870 "Expected all operands of same type.");
15871 if (!NonSchedulable.empty()) {
15872 auto [NonSchedSizeMax, NonSchedSizeMin] =
15873 NonSchedulable.lookup(Slice.front());
15874 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
15875 Cnt += NonSchedSizeMax;
15876 continue;
15877 }
15878 }
15879 unsigned TreeSize;
15880 std::optional<bool> Res =
15881 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
15882 if (!Res) {
15883 NonSchedulable
15884 .try_emplace(Slice.front(), std::make_pair(Size, Size))
15885 .first->getSecond()
15886 .second = Size;
15887 } else if (*Res) {
15888 // Mark the vectorized stores so that we don't vectorize them
15889 // again.
15890 VectorizedStores.insert(Slice.begin(), Slice.end());
15893 AnyProfitableGraph = RepeatChanged = Changed = true;
15894 // If we vectorized initial block, no need to try to vectorize
15895 // it again.
15896 for_each(RangeSizes.slice(Cnt, Size),
15897 [](std::pair<unsigned, unsigned> &P) {
15898 P.first = P.second = 0;
15899 });
15900 if (Cnt < StartIdx + MinVF) {
15901 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
15902 [](std::pair<unsigned, unsigned> &P) {
15903 P.first = P.second = 0;
15904 });
15905 StartIdx = Cnt + Size;
15906 }
15907 if (Cnt > Sz - Size - MinVF) {
15908 for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
15909 [](std::pair<unsigned, unsigned> &P) {
15910 P.first = P.second = 0;
15911 });
15912 if (Sz == End)
15913 End = Cnt;
15914 Sz = Cnt;
15915 }
15916 Cnt += Size;
15917 continue;
15918 }
15919 if (Size > 2 && Res &&
15920 !all_of(RangeSizes.slice(Cnt, Size),
15921 std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
15922 std::placeholders::_1))) {
15923 Cnt += Size;
15924 continue;
15925 }
15926 // Check for the very big VFs that we're not rebuilding same
15927 // trees, just with larger number of elements.
15928 if (Size > MaxRegVF && TreeSize > 1 &&
15929 all_of(RangeSizes.slice(Cnt, Size),
15930 std::bind(FirstSizeSame, TreeSize,
15931 std::placeholders::_1))) {
15932 Cnt += Size;
15933 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
15934 ++Cnt;
15935 continue;
15936 }
15937 if (TreeSize > 1)
15938 for_each(RangeSizes.slice(Cnt, Size),
15939 [&](std::pair<unsigned, unsigned> &P) {
15940 if (Size >= MaxRegVF)
15941 P.second = std::max(P.second, TreeSize);
15942 else
15943 P.first = std::max(P.first, TreeSize);
15944 });
15945 ++Cnt;
15946 AnyProfitableGraph = true;
15947 }
15948 if (StartIdx >= End)
15949 break;
15950 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
15951 AnyProfitableGraph = true;
15952 StartIdx = std::distance(
15953 RangeSizes.begin(),
15954 find_if(RangeSizes.drop_front(Sz),
15955 std::bind(IsNotVectorized, Size >= MaxRegVF,
15956 std::placeholders::_1)));
15957 }
15958 if (!AnyProfitableGraph && Size >= MaxRegVF)
15959 break;
15960 }
15961 // All values vectorized - exit.
15962 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
15963 return P.first == 0 && P.second == 0;
15964 }))
15965 break;
15966 // Check if tried all attempts or no need for the last attempts at all.
15967 if (Repeat >= MaxAttempts ||
15968 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
15969 break;
15970 constexpr unsigned StoresLimit = 64;
15971 const unsigned MaxTotalNum = bit_floor(std::min<unsigned>(
15972 Operands.size(),
15973 static_cast<unsigned>(
15974 End -
15975 std::distance(
15976 RangeSizes.begin(),
15977 find_if(RangeSizes, std::bind(IsNotVectorized, true,
15978 std::placeholders::_1))) +
15979 1)));
15980 unsigned VF = PowerOf2Ceil(CandidateVFs.front()) * 2;
15981 if (VF > MaxTotalNum || VF >= StoresLimit)
15982 break;
15983 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
15984 if (P.first != 0)
15985 P.first = std::max(P.second, P.first);
15986 });
15987 // Last attempt to vectorize max number of elements, if all previous
15988 // attempts were unsuccessful because of the cost issues.
15989 CandidateVFs.clear();
15990 CandidateVFs.push_back(VF);
15991 }
15992 }
15993 };
15994
15995 // Stores pair (first: index of the store into Stores array ref, address of
15996 // which taken as base, second: sorted set of pairs {index, dist}, which are
15997 // indices of stores in the set and their store location distances relative to
15998 // the base address).
15999
16000 // Need to store the index of the very first store separately, since the set
16001 // may be reordered after the insertion and the first store may be moved. This
16002 // container allows to reduce number of calls of getPointersDiff() function.
16003 SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
16004 // Inserts the specified store SI with the given index Idx to the set of the
16005 // stores. If the store with the same distance is found already - stop
16006 // insertion, try to vectorize already found stores. If some stores from this
16007 // sequence were not vectorized - try to vectorize them with the new store
16008 // later. But this logic is applied only to the stores, that come before the
16009 // previous store with the same distance.
16010 // Example:
16011 // 1. store x, %p
16012 // 2. store y, %p+1
16013 // 3. store z, %p+2
16014 // 4. store a, %p
16015 // 5. store b, %p+3
16016 // - Scan this from the last to first store. The very first bunch of stores is
16017 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
16018 // vector).
16019 // - The next store in the list - #1 - has the same distance from store #5 as
16020 // the store #4.
16021 // - Try to vectorize sequence of stores 4,2,3,5.
16022 // - If all these stores are vectorized - just drop them.
16023 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
16024 // - Start new stores sequence.
16025 // The new bunch of stores is {1, {1, 0}}.
16026 // - Add the stores from previous sequence, that were not vectorized.
16027 // Here we consider the stores in reversed order, rather than the order in which
16028 // they are used in the IR (Stores are reversed already, see vectorizeStoreChains()).
16029 // Store #3 can be added -> comes after store #4 with the same distance as
16030 // store #1.
16031 // Store #5 cannot be added - comes before store #4.
16032 // This logic helps to improve compile time: we assume that stores coming after a
16033 // previous store with the same distance most likely have memory dependencies,
16034 // so there is no need to waste compile time trying to vectorize them.
16035 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
16036 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
16037 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
16038 std::optional<int> Diff = getPointersDiff(
16039 Stores[Set.first]->getValueOperand()->getType(),
16040 Stores[Set.first]->getPointerOperand(),
16041 SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
16042 /*StrictCheck=*/true);
16043 if (!Diff)
16044 continue;
16045 auto It = Set.second.find(std::make_pair(Idx, *Diff));
16046 if (It == Set.second.end()) {
16047 Set.second.emplace(Idx, *Diff);
16048 return;
16049 }
16050 // Try to vectorize the first found set to avoid duplicate analysis.
16051 TryToVectorize(Set.second);
16052 StoreIndexToDistSet PrevSet;
16053 PrevSet.swap(Set.second);
16054 Set.first = Idx;
16055 Set.second.emplace(Idx, 0);
16056 // Insert stores that followed previous match to try to vectorize them
16057 // with this store.
16058 unsigned StartIdx = It->first + 1;
16059 SmallBitVector UsedStores(Idx - StartIdx);
16060 // Distances to previously found dup store (or this store, since they
16061 // store to the same addresses).
16062 SmallVector<int> Dists(Idx - StartIdx, 0);
16063 for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
16064 // Do not try to vectorize sequences, we already tried.
16065 if (Pair.first <= It->first ||
16066 VectorizedStores.contains(Stores[Pair.first]))
16067 break;
16068 unsigned BI = Pair.first - StartIdx;
16069 UsedStores.set(BI);
16070 Dists[BI] = Pair.second - It->second;
16071 }
16072 for (unsigned I = StartIdx; I < Idx; ++I) {
16073 unsigned BI = I - StartIdx;
16074 if (UsedStores.test(BI))
16075 Set.second.emplace(I, Dists[BI]);
16076 }
16077 return;
16078 }
16079 auto &Res = SortedStores.emplace_back();
16080 Res.first = Idx;
16081 Res.second.emplace(Idx, 0);
16082 };
16083 StoreInst *PrevStore = Stores.front();
16084 for (auto [I, SI] : enumerate(Stores)) {
16085 // Check that we do not try to vectorize stores of different types.
16086 if (PrevStore->getValueOperand()->getType() !=
16087 SI->getValueOperand()->getType()) {
16088 for (auto &Set : SortedStores)
16089 TryToVectorize(Set.second);
16090 SortedStores.clear();
16091 PrevStore = SI;
16092 }
16093 FillStoresSet(I, SI);
16094 }
16095
16096 // Final vectorization attempt.
16097 for (auto &Set : SortedStores)
16098 TryToVectorize(Set.second);
16099
16100 return Changed;
16101}
16102
16103void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
16104 // Initialize the collections. We will make a single pass over the block.
16105 Stores.clear();
16106 GEPs.clear();
16107
16108 // Visit the store and getelementptr instructions in BB and organize them in
16109 // Stores and GEPs according to the underlying objects of their pointer
16110 // operands.
16111 for (Instruction &I : *BB) {
16112 // Ignore store instructions that are volatile or have a pointer operand
16113 // that doesn't point to a scalar type.
16114 if (auto *SI = dyn_cast<StoreInst>(&I)) {
16115 if (!SI->isSimple())
16116 continue;
16117 if (!isValidElementType(SI->getValueOperand()->getType()))
16118 continue;
16119 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
16120 }
16121
16122 // Ignore getelementptr instructions that have more than one index, a
16123 // constant index, or a pointer operand that doesn't point to a scalar
16124 // type.
16125 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
16126 if (GEP->getNumIndices() != 1)
16127 continue;
16128 Value *Idx = GEP->idx_begin()->get();
16129 if (isa<Constant>(Idx))
16130 continue;
16131 if (!isValidElementType(Idx->getType()))
16132 continue;
16133 if (GEP->getType()->isVectorTy())
16134 continue;
16135 GEPs[GEP->getPointerOperand()].push_back(GEP);
16136 }
16137 }
16138}
16139
16140bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
16141 bool MaxVFOnly) {
16142 if (VL.size() < 2)
16143 return false;
16144
16145 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
16146 << VL.size() << ".\n");
16147
16148 // Check that all of the parts are instructions of the same type,
16149 // we permit an alternate opcode via InstructionsState.
16150 InstructionsState S = getSameOpcode(VL, *TLI);
16151 if (!S.getOpcode())
16152 return false;
16153
16154 Instruction *I0 = cast<Instruction>(S.OpValue);
16155 // Make sure invalid types (including vector type) are rejected before
16156 // determining vectorization factor for scalar instructions.
16157 for (Value *V : VL) {
16158 Type *Ty = V->getType();
16159 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
16160 // NOTE: the following will give the user an internal llvm type name, which
16161 // may not be useful.
16162 R.getORE()->emit([&]() {
16163 std::string TypeStr;
16164 llvm::raw_string_ostream rso(TypeStr);
16165 Ty->print(rso);
16166 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
16167 << "Cannot SLP vectorize list: type "
16168 << rso.str() + " is unsupported by vectorizer";
16169 });
16170 return false;
16171 }
16172 }
16173
16174 unsigned Sz = R.getVectorElementSize(I0);
16175 unsigned MinVF = R.getMinVF(Sz);
16176 unsigned MaxVF = std::max<unsigned>(llvm::bit_floor(VL.size()), MinVF);
16177 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
16178 if (MaxVF < 2) {
16179 R.getORE()->emit([&]() {
16180 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
16181 << "Cannot SLP vectorize list: vectorization factor "
16182 << "less than 2 is not supported";
16183 });
16184 return false;
16185 }
16186
16187 bool Changed = false;
16188 bool CandidateFound = false;
16189 InstructionCost MinCost = SLPCostThreshold.getValue();
16190 Type *ScalarTy = VL[0]->getType();
16191 if (auto *IE = dyn_cast<InsertElementInst>(VL[0]))
16192 ScalarTy = IE->getOperand(1)->getType();
16193
16194 unsigned NextInst = 0, MaxInst = VL.size();
16195 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
16196 // No actual vectorization should happen if the number of parts is the same as
16197 // provided vectorization factor (i.e. the scalar type is used for vector
16198 // code during codegen).
16199 auto *VecTy = FixedVectorType::get(ScalarTy, VF);
16200 if (TTI->getNumberOfParts(VecTy) == VF)
16201 continue;
16202 for (unsigned I = NextInst; I < MaxInst; ++I) {
16203 unsigned ActualVF = std::min(MaxInst - I, VF);
16204
16205 if (!isPowerOf2_32(ActualVF))
16206 continue;
16207
16208 if (MaxVFOnly && ActualVF < MaxVF)
16209 break;
16210 if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
16211 break;
16212
16213 ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
16214 // Check that a previous iteration of this loop did not delete the Value.
16215 if (llvm::any_of(Ops, [&R](Value *V) {
16216 auto *I = dyn_cast<Instruction>(V);
16217 return I && R.isDeleted(I);
16218 }))
16219 continue;
16220
16221 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
16222 << "\n");
16223
16224 R.buildTree(Ops);
16225 if (R.isTreeTinyAndNotFullyVectorizable())
16226 continue;
16227 R.reorderTopToBottom();
16228 R.reorderBottomToTop(
16229 /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
16230 !R.doesRootHaveInTreeUses());
16231 R.buildExternalUses();
16232
16233 R.computeMinimumValueSizes();
16234 R.transformNodes();
16235 InstructionCost Cost = R.getTreeCost();
16236 CandidateFound = true;
16237 MinCost = std::min(MinCost, Cost);
16238
16239 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
16240 << " for VF=" << ActualVF << "\n");
16241 if (Cost < -SLPCostThreshold) {
16242 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
16243 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
16244 cast<Instruction>(Ops[0]))
16245 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
16246 << " and with tree size "
16247 << ore::NV("TreeSize", R.getTreeSize()));
16248
16249 R.vectorizeTree();
16250 // Move to the next bundle.
16251 I += VF - 1;
16252 NextInst = I + 1;
16253 Changed = true;
16254 }
16255 }
16256 }
16257
16258 if (!Changed && CandidateFound) {
16259 R.getORE()->emit([&]() {
16260 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
16261 << "List vectorization was possible but not beneficial with cost "
16262 << ore::NV("Cost", MinCost) << " >= "
16263 << ore::NV("Treshold", -SLPCostThreshold);
16264 });
16265 } else if (!Changed) {
16266 R.getORE()->emit([&]() {
16267 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
16268 << "Cannot SLP vectorize list: vectorization was impossible"
16269 << " with available vectorization factors";
16270 });
16271 }
16272 return Changed;
16273}
16274
16275bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
16276 if (!I)
16277 return false;
16278
16279 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
16280 return false;
16281
16282 Value *P = I->getParent();
16283
16284 // Vectorize in current basic block only.
16285 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
16286 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
16287 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
16288 return false;
16289
16290 // First collect all possible candidates
16291 SmallVector<std::pair<Value *, Value *>, 4> Candidates;
16292 Candidates.emplace_back(Op0, Op1);
16293
16294 auto *A = dyn_cast<BinaryOperator>(Op0);
16295 auto *B = dyn_cast<BinaryOperator>(Op1);
16296 // Try to skip B.
16297 if (A && B && B->hasOneUse()) {
16298 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
16299 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
16300 if (B0 && B0->getParent() == P)
16301 Candidates.emplace_back(A, B0);
16302 if (B1 && B1->getParent() == P)
16303 Candidates.emplace_back(A, B1);
16304 }
16305 // Try to skip A.
16306 if (B && A && A->hasOneUse()) {
16307 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
16308 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
16309 if (A0 && A0->getParent() == P)
16310 Candidates.emplace_back(A0, B);
16311 if (A1 && A1->getParent() == P)
16312 Candidates.emplace_back(A1, B);
16313 }
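// For example: for %r = add (%a = mul %x, %y), (%b = add %p, %q), the pair
// {%a, %b} mixes opcodes, but skipping %b and pairing %a with %p or %q (when
// they are binary operators in the same block) may still seed a profitable
// tree; findBestRootPair() picks the most promising candidate below.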
16314
16315 if (Candidates.size() == 1)
16316 return tryToVectorizeList({Op0, Op1}, R);
16317
16318 // We have multiple options. Try to pick the single best.
16319 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
16320 if (!BestCandidate)
16321 return false;
16322 return tryToVectorizeList(
16323 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
16324}
16325
16326namespace {
16327
16328/// Model horizontal reductions.
16329///
16330/// A horizontal reduction is a tree of reduction instructions that has values
16331/// that can be put into a vector as its leaves. For example:
16332///
16333/// mul mul mul mul
16334/// \ / \ /
16335/// + +
16336/// \ /
16337/// +
16338/// This tree has "mul" as its leaf values and "+" as its reduction
16339/// instructions. A reduction can feed into a store or a binary operation
16340/// feeding a phi.
16341/// ...
16342/// \ /
16343/// +
16344/// |
16345/// phi +=
16346///
16347/// Or:
16348/// ...
16349/// \ /
16350/// +
16351/// |
16352/// *p =
16353///
16354class HorizontalReduction {
16355 using ReductionOpsType = SmallVector<Value *, 16>;
16356 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
16357 ReductionOpsListType ReductionOps;
16358 /// List of possibly reduced values.
16359 SmallVector<SmallVector<Value *>> ReducedVals;
16360 /// Maps reduced value to the corresponding reduction operation.
16361 DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
16362 // Use map vector to make stable output.
16364 WeakTrackingVH ReductionRoot;
16365 /// The type of reduction operation.
16366 RecurKind RdxKind;
16367 /// Checks if the optimization of original scalar identity operations on
16368 /// matched horizontal reductions is enabled and allowed.
16369 bool IsSupportedHorRdxIdentityOp = false;
16370
16371 static bool isCmpSelMinMax(Instruction *I) {
16372 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
16373 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
16374 }
16375
16376 // And/or are potentially poison-safe logical patterns like:
16377 // select x, y, false
16378 // select x, true, y
16379 static bool isBoolLogicOp(Instruction *I) {
16380 return isa<SelectInst>(I) &&
16381 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
16382 }
16383
16384 /// Checks if instruction is associative and can be vectorized.
16385 static bool isVectorizable(RecurKind Kind, Instruction *I) {
16386 if (Kind == RecurKind::None)
16387 return false;
16388
16389 // Integer ops that map to select instructions or intrinsics are fine.
16390 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind) ||
16391 isBoolLogicOp(I))
16392 return true;
16393
16394 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
16395 // FP min/max are associative except for NaN and -0.0. We do not
16396 // have to rule out -0.0 here because the intrinsic semantics do not
16397 // specify a fixed result for it.
16398 return I->getFastMathFlags().noNaNs();
16399 }
16400
16401 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
16402 return true;
16403
16404 return I->isAssociative();
16405 }
16406
16407 static Value *getRdxOperand(Instruction *I, unsigned Index) {
16408 // Poison-safe 'or' takes the form: select X, true, Y
16409 // To make that work with the normal operand processing, we skip the
16410 // true value operand.
16411 // TODO: Change the code and data structures to handle this without a hack.
16412 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
16413 return I->getOperand(2);
16414 return I->getOperand(Index);
16415 }
16416
16417 /// Creates reduction operation with the current opcode.
16418 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
16419 Value *RHS, const Twine &Name, bool UseSelect) {
16420 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
16421 switch (Kind) {
16422 case RecurKind::Or:
16423 if (UseSelect &&
16424 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
16425 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
16426 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16427 Name);
16428 case RecurKind::And:
16429 if (UseSelect &&
16430 LHS->getType() == CmpInst::makeCmpResultType(LHS->getType()))
16431 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
16432 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16433 Name);
16434 case RecurKind::Add:
16435 case RecurKind::Mul:
16436 case RecurKind::Xor:
16437 case RecurKind::FAdd:
16438 case RecurKind::FMul:
16439 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
16440 Name);
16441 case RecurKind::FMax:
16442 return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
16443 case RecurKind::FMin:
16444 return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
16445 case RecurKind::FMaximum:
16446 return Builder.CreateBinaryIntrinsic(Intrinsic::maximum, LHS, RHS);
16447 case RecurKind::FMinimum:
16448 return Builder.CreateBinaryIntrinsic(Intrinsic::minimum, LHS, RHS);
16449 case RecurKind::SMax:
16450 if (UseSelect) {
16451 Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
16452 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16453 }
16454 return Builder.CreateBinaryIntrinsic(Intrinsic::smax, LHS, RHS);
16455 case RecurKind::SMin:
16456 if (UseSelect) {
16457 Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
16458 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16459 }
16460 return Builder.CreateBinaryIntrinsic(Intrinsic::smin, LHS, RHS);
16461 case RecurKind::UMax:
16462 if (UseSelect) {
16463 Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
16464 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16465 }
16466 return Builder.CreateBinaryIntrinsic(Intrinsic::umax, LHS, RHS);
16467 case RecurKind::UMin:
16468 if (UseSelect) {
16469 Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
16470 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
16471 }
16472 return Builder.CreateBinaryIntrinsic(Intrinsic::umin, LHS, RHS);
16473 default:
16474 llvm_unreachable("Unknown reduction operation.");
16475 }
16476 }
16477
16478 /// Creates reduction operation with the current opcode with the IR flags
16479 /// from \p ReductionOps, dropping nuw/nsw flags.
16480 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
16481 Value *RHS, const Twine &Name,
16482 const ReductionOpsListType &ReductionOps) {
16483 bool UseSelect = ReductionOps.size() == 2 ||
16484 // Logical or/and.
16485 (ReductionOps.size() == 1 &&
16486 any_of(ReductionOps.front(), IsaPred<SelectInst>));
16487 assert((!UseSelect || ReductionOps.size() != 2 ||
16488 isa<SelectInst>(ReductionOps[1][0])) &&
16489 "Expected cmp + select pairs for reduction");
16490 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
16491 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
16492 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
16493 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
16494 /*IncludeWrapFlags=*/false);
16495 propagateIRFlags(Op, ReductionOps[1], nullptr,
16496 /*IncludeWrapFlags=*/false);
16497 return Op;
16498 }
16499 }
16500 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
16501 return Op;
16502 }
16503
16504public:
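/// Matches the reduction kind for value \p V. For example, 'add i32 %x, %y'
/// maps to RecurKind::Add, while the cmp+select idiom
/// 'select (icmp sgt i32 %x, %y), i32 %x, i32 %y' maps to RecurKind::SMax.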
16505 static RecurKind getRdxKind(Value *V) {
16506 auto *I = dyn_cast<Instruction>(V);
16507 if (!I)
16508 return RecurKind::None;
16509 if (match(I, m_Add(m_Value(), m_Value())))
16510 return RecurKind::Add;
16511 if (match(I, m_Mul(m_Value(), m_Value())))
16512 return RecurKind::Mul;
16513 if (match(I, m_And(m_Value(), m_Value())) ||
16514 match(I, m_LogicalAnd(m_Value(), m_Value())))
16515 return RecurKind::And;
16516 if (match(I, m_Or(m_Value(), m_Value())) ||
16517 match(I, m_LogicalOr(m_Value(), m_Value())))
16518 return RecurKind::Or;
16519 if (match(I, m_Xor(m_Value(), m_Value())))
16520 return RecurKind::Xor;
16521 if (match(I, m_FAdd(m_Value(), m_Value())))
16522 return RecurKind::FAdd;
16523 if (match(I, m_FMul(m_Value(), m_Value())))
16524 return RecurKind::FMul;
16525
16526 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
16527 return RecurKind::FMax;
16528 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
16529 return RecurKind::FMin;
16530
16531 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
16532 return RecurKind::FMaximum;
16533 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
16534 return RecurKind::FMinimum;
16535 // This matches either cmp+select or intrinsics. SLP is expected to handle
16536 // either form.
16537 // TODO: If we are canonicalizing to intrinsics, we can remove several
16538 // special-case paths that deal with selects.
16539 if (match(I, m_SMax(m_Value(), m_Value())))
16540 return RecurKind::SMax;
16541 if (match(I, m_SMin(m_Value(), m_Value())))
16542 return RecurKind::SMin;
16543 if (match(I, m_UMax(m_Value(), m_Value())))
16544 return RecurKind::UMax;
16545 if (match(I, m_UMin(m_Value(), m_Value())))
16546 return RecurKind::UMin;
16547
16548 if (auto *Select = dyn_cast<SelectInst>(I)) {
16549 // Try harder: look for min/max pattern based on instructions producing
16550 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
16551 // During the intermediate stages of SLP, it's very common to have
16552 // pattern like this (since optimizeGatherSequence is run only once
16553 // at the end):
16554 // %1 = extractelement <2 x i32> %a, i32 0
16555 // %2 = extractelement <2 x i32> %a, i32 1
16556 // %cond = icmp sgt i32 %1, %2
16557 // %3 = extractelement <2 x i32> %a, i32 0
16558 // %4 = extractelement <2 x i32> %a, i32 1
16559 // %select = select i1 %cond, i32 %3, i32 %4
16560 CmpInst::Predicate Pred;
16561 Instruction *L1;
16562 Instruction *L2;
16563
16564 Value *LHS = Select->getTrueValue();
16565 Value *RHS = Select->getFalseValue();
16566 Value *Cond = Select->getCondition();
16567
16568 // TODO: Support inverse predicates.
16569 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
16570 if (!isa<ExtractElementInst>(RHS) ||
16571 !L2->isIdenticalTo(cast<Instruction>(RHS)))
16572 return RecurKind::None;
16573 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
16574 if (!isa<ExtractElementInst>(LHS) ||
16575 !L1->isIdenticalTo(cast<Instruction>(LHS)))
16576 return RecurKind::None;
16577 } else {
16578 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
16579 return RecurKind::None;
16580 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
16581 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
16582 !L2->isIdenticalTo(cast<Instruction>(RHS)))
16583 return RecurKind::None;
16584 }
16585
16586 switch (Pred) {
16587 default:
16588 return RecurKind::None;
16589 case CmpInst::ICMP_SGT:
16590 case CmpInst::ICMP_SGE:
16591 return RecurKind::SMax;
16592 case CmpInst::ICMP_SLT:
16593 case CmpInst::ICMP_SLE:
16594 return RecurKind::SMin;
16595 case CmpInst::ICMP_UGT:
16596 case CmpInst::ICMP_UGE:
16597 return RecurKind::UMax;
16598 case CmpInst::ICMP_ULT:
16599 case CmpInst::ICMP_ULE:
16600 return RecurKind::UMin;
16601 }
16602 }
16603 return RecurKind::None;
16604 }
16605
16606 /// Get the index of the first operand.
16607 static unsigned getFirstOperandIndex(Instruction *I) {
16608 return isCmpSelMinMax(I) ? 1 : 0;
16609 }
16610
16611private:
16612 /// Total number of operands in the reduction operation.
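/// For a cmp+select min/max such as 'select i1 %c, i32 %a, i32 %b' the
/// reduction operands are the select's true/false values, so together with
/// getFirstOperandIndex the operand walk covers indices 1 and 2 (skipping
/// the condition); for a plain binary operator it covers indices 0 and 1.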
16613 static unsigned getNumberOfOperands(Instruction *I) {
16614 return isCmpSelMinMax(I) ? 3 : 2;
16615 }
16616
16617 /// Checks if the instruction is in basic block \p BB.
16618 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
16619 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
16620 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
16621 auto *Sel = cast<SelectInst>(I);
16622 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
16623 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
16624 }
16625 return I->getParent() == BB;
16626 }
16627
16628 /// Expected number of uses for reduction operations/reduced values.
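/// For a cmp+select min/max chain, an inner select feeds both the next
/// compare and the next select (two uses) while its own compare condition
/// has exactly one use; a plain arithmetic reduction op is expected to have
/// a single use.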
16629 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
16630 if (IsCmpSelMinMax) {
16631 // SelectInst must be used twice while the condition op must have single
16632 // use only.
16633 if (auto *Sel = dyn_cast<SelectInst>(I))
16634 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
16635 return I->hasNUses(2);
16636 }
16637
16638 // Arithmetic reduction operation must be used once only.
16639 return I->hasOneUse();
16640 }
16641
16642 /// Initializes the list of reduction operations.
16643 void initReductionOps(Instruction *I) {
16644 if (isCmpSelMinMax(I))
16645 ReductionOps.assign(2, ReductionOpsType());
16646 else
16647 ReductionOps.assign(1, ReductionOpsType());
16648 }
16649
16650 /// Add all reduction operations for the reduction instruction \p I.
16651 void addReductionOps(Instruction *I) {
16652 if (isCmpSelMinMax(I)) {
16653 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
16654 ReductionOps[1].emplace_back(I);
16655 } else {
16656 ReductionOps[0].emplace_back(I);
16657 }
16658 }
16659
16660 static bool isGoodForReduction(ArrayRef<Value *> Data) {
16661 int Sz = Data.size();
16662 auto *I = dyn_cast<Instruction>(Data.front());
16663 return Sz > 1 || isConstant(Data.front()) ||
16664 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
16665 }
16666
16667public:
16668 HorizontalReduction() = default;
16669
16670 /// Try to find a reduction tree.
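/// For example, for the chain '((a + b) + (c + d)) + e' rooted at the outer
/// add, the walk below collects the inner adds as reduction operations and
/// a, b, c, d, e as the reduced values.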
16671 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
16672 ScalarEvolution &SE, const DataLayout &DL,
16673 const TargetLibraryInfo &TLI) {
16674 RdxKind = HorizontalReduction::getRdxKind(Root);
16675 if (!isVectorizable(RdxKind, Root))
16676 return false;
16677
16678 // Analyze "regular" integer/FP types for reductions - no target-specific
16679 // types or pointers.
16680 Type *Ty = Root->getType();
16681 if (!isValidElementType(Ty) || Ty->isPointerTy())
16682 return false;
16683
16684 // Though the ultimate reduction may have multiple uses, its condition must
16685 // have only a single use.
16686 if (auto *Sel = dyn_cast<SelectInst>(Root))
16687 if (!Sel->getCondition()->hasOneUse())
16688 return false;
16689
16690 ReductionRoot = Root;
16691
16692 // Iterate through all the operands of the possible reduction tree and
16693 // gather all the reduced values, sorting them by their value id.
16694 BasicBlock *BB = Root->getParent();
16695 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
16696 SmallVector<Instruction *> Worklist(1, Root);
16697 // Checks if the operands of the \p TreeN instruction are also reduction
16698 // operations or should be treated as reduced values or an extra argument,
16699 // which is not part of the reduction.
16700 auto CheckOperands = [&](Instruction *TreeN,
16701 SmallVectorImpl<Value *> &ExtraArgs,
16702 SmallVectorImpl<Value *> &PossibleReducedVals,
16703 SmallVectorImpl<Instruction *> &ReductionOps) {
16704 for (int I = getFirstOperandIndex(TreeN),
16705 End = getNumberOfOperands(TreeN);
16706 I < End; ++I) {
16707 Value *EdgeVal = getRdxOperand(TreeN, I);
16708 ReducedValsToOps[EdgeVal].push_back(TreeN);
16709 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
16710 // Edge has wrong parent - mark as an extra argument.
16711 if (EdgeInst && !isVectorLikeInstWithConstOps(EdgeInst) &&
16712 !hasSameParent(EdgeInst, BB)) {
16713 ExtraArgs.push_back(EdgeVal);
16714 continue;
16715 }
16716 // If the edge is not an instruction, or it is different from the main
16717 // reduction opcode or has too many uses - possible reduced value.
16718 // Also, do not try to reduce const values, if the operation is not
16719 // foldable.
16720 if (!EdgeInst || getRdxKind(EdgeInst) != RdxKind ||
16721 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
16722 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
16723 !isVectorizable(RdxKind, EdgeInst) ||
16724 (R.isAnalyzedReductionRoot(EdgeInst) &&
16725 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
16726 PossibleReducedVals.push_back(EdgeVal);
16727 continue;
16728 }
16729 ReductionOps.push_back(EdgeInst);
16730 }
16731 };
16732 // Try to regroup reduced values so that it gets more profitable to try to
16733 // reduce them. Values are grouped by their value ids, instructions - by
16734 // instruction op id and/or alternate op id, plus do extra analysis for
16735 // loads (grouping them by the distance between pointers) and cmp
16736 // instructions (grouping them by the predicate).
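// For example, loads whose addresses differ by a constant offset from the
// same base object receive the same subkey in GenerateLoadsSubkey below, so
// they land in one group of reduced values and are more likely to form a
// single vectorizable bundle.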
16737 MapVector<size_t, MapVector<size_t, MapVector<Value *, unsigned>>>
16738 PossibleReducedVals;
16739 initReductionOps(Root);
16740 DenseMap<Value *, SmallVector<LoadInst *>> LoadsMap;
16741 SmallSet<size_t, 2> LoadKeyUsed;
16742 SmallPtrSet<Value *, 4> DoNotReverseVals;
16743
16744 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
16745 Value *Ptr = getUnderlyingObject(LI->getPointerOperand());
16746 if (LoadKeyUsed.contains(Key)) {
16747 auto LIt = LoadsMap.find(Ptr);
16748 if (LIt != LoadsMap.end()) {
16749 for (LoadInst *RLI : LIt->second) {
16750 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
16751 LI->getType(), LI->getPointerOperand(), DL, SE,
16752 /*StrictCheck=*/true))
16753 return hash_value(RLI->getPointerOperand());
16754 }
16755 for (LoadInst *RLI : LIt->second) {
16756 if (arePointersCompatible(RLI->getPointerOperand(),
16757 LI->getPointerOperand(), TLI)) {
16758 hash_code SubKey = hash_value(RLI->getPointerOperand());
16759 DoNotReverseVals.insert(RLI);
16760 return SubKey;
16761 }
16762 }
16763 if (LIt->second.size() > 2) {
16764 hash_code SubKey =
16765 hash_value(LIt->second.back()->getPointerOperand());
16766 DoNotReverseVals.insert(LIt->second.back());
16767 return SubKey;
16768 }
16769 }
16770 }
16771 LoadKeyUsed.insert(Key);
16772 LoadsMap.try_emplace(Ptr).first->second.push_back(LI);
16773 return hash_value(LI->getPointerOperand());
16774 };
16775
16776 while (!Worklist.empty()) {
16777 Instruction *TreeN = Worklist.pop_back_val();
16778 SmallVector<Value *> Args;
16779 SmallVector<Value *> PossibleRedVals;
16780 SmallVector<Instruction *> PossibleReductionOps;
16781 CheckOperands(TreeN, Args, PossibleRedVals, PossibleReductionOps);
16782 // If too many extra args - mark the instruction itself as a reduction
16783 // value, not a reduction operation.
16784 if (Args.size() < 2) {
16785 addReductionOps(TreeN);
16786 // Add extra args.
16787 if (!Args.empty()) {
16788 assert(Args.size() == 1 && "Expected only single argument.");
16789 ExtraArgs[TreeN] = Args.front();
16790 }
16791 // Add reduction values. The values are sorted for better vectorization
16792 // results.
16793 for (Value *V : PossibleRedVals) {
16794 size_t Key, Idx;
16795 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
16796 /*AllowAlternate=*/false);
16797 ++PossibleReducedVals[Key][Idx]
16798 .insert(std::make_pair(V, 0))
16799 .first->second;
16800 }
16801 Worklist.append(PossibleReductionOps.rbegin(),
16802 PossibleReductionOps.rend());
16803 } else {
16804 size_t Key, Idx;
16805 std::tie(Key, Idx) = generateKeySubkey(TreeN, &TLI, GenerateLoadsSubkey,
16806 /*AllowAlternate=*/false);
16807 ++PossibleReducedVals[Key][Idx]
16808 .insert(std::make_pair(TreeN, 0))
16809 .first->second;
16810 }
16811 }
16812 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
16813 // Sort values by the total number of value kinds to start the reduction
16814 // from the longest possible reduced value sequences.
16815 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
16816 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
16817 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
16818 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
16819 It != E; ++It) {
16820 PossibleRedValsVect.emplace_back();
16821 auto RedValsVect = It->second.takeVector();
16822 stable_sort(RedValsVect, llvm::less_second());
16823 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
16824 PossibleRedValsVect.back().append(Data.second, Data.first);
16825 }
16826 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
16827 return P1.size() > P2.size();
16828 });
16829 int NewIdx = -1;
16830 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
16831 if (isGoodForReduction(Data) ||
16832 (isa<LoadInst>(Data.front()) && NewIdx >= 0 &&
16833 isa<LoadInst>(ReducedVals[NewIdx].front()) &&
16834 getUnderlyingObject(
16835 cast<LoadInst>(Data.front())->getPointerOperand()) ==
16836 getUnderlyingObject(cast<LoadInst>(ReducedVals[NewIdx].front())
16837 ->getPointerOperand()))) {
16838 if (NewIdx < 0) {
16839 NewIdx = ReducedVals.size();
16840 ReducedVals.emplace_back();
16841 }
16842 if (DoNotReverseVals.contains(Data.front()))
16843 ReducedVals[NewIdx].append(Data.begin(), Data.end());
16844 else
16845 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
16846 } else {
16847 ReducedVals.emplace_back().append(Data.rbegin(), Data.rend());
16848 }
16849 }
16850 }
16851 // Sort the reduced values by number of same/alternate opcode and/or pointer
16852 // operand.
16853 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
16854 return P1.size() > P2.size();
16855 });
16856 return true;
16857 }
16858
16859 /// Attempt to vectorize the tree found by matchAssociativeReduction.
16860 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
16861 const TargetLibraryInfo &TLI) {
16862 constexpr int ReductionLimit = 4;
16863 constexpr unsigned RegMaxNumber = 4;
16864 constexpr unsigned RedValsMaxNumber = 128;
16865 // If there are a sufficient number of reduction values, reduce
16866 // to a nearby power-of-2. We can safely generate oversized
16867 // vectors and rely on the backend to split them to legal sizes.
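// E.g. with 7 usable reduced values the first attempt below uses a reduction
// width of bit_floor(7) = 4; the scalars left over are folded in afterwards
// with scalar reduction ops.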
16868 unsigned NumReducedVals =
16869 std::accumulate(ReducedVals.begin(), ReducedVals.end(), 0,
16870 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
16871 if (!isGoodForReduction(Vals))
16872 return Num;
16873 return Num + Vals.size();
16874 });
16875 if (NumReducedVals < ReductionLimit &&
16876 (!AllowHorRdxIdenityOptimization ||
16877 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
16878 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
16879 }))) {
16880 for (ReductionOpsType &RdxOps : ReductionOps)
16881 for (Value *RdxOp : RdxOps)
16882 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
16883 return nullptr;
16884 }
16885
16886 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
16887 TargetFolder(DL));
16888 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
16889
16890 // Track the reduced values in case they are replaced by extractelement
16891 // because of the vectorization.
16892 DenseMap<Value *, Value *> TrackedVals(
16893 ReducedVals.size() * ReducedVals.front().size() + ExtraArgs.size());
16894 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
16895 SmallVector<std::pair<Value *, Value *>> ReplacedExternals;
16896 ExternallyUsedValues.reserve(ExtraArgs.size() + 1);
16897 // The same extra argument may be used several times, so log each attempt
16898 // to use it.
16899 for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
16900 assert(Pair.first && "DebugLoc must be set.");
16901 ExternallyUsedValues[Pair.second].push_back(Pair.first);
16902 TrackedVals.try_emplace(Pair.second, Pair.second);
16903 }
16904
16905 // The compare instruction of a min/max is the insertion point for new
16906 // instructions and may be replaced with a new compare instruction.
16907 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
16908 assert(isa<SelectInst>(RdxRootInst) &&
16909 "Expected min/max reduction to have select root instruction");
16910 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
16911 assert(isa<Instruction>(ScalarCond) &&
16912 "Expected min/max reduction to have compare condition");
16913 return cast<Instruction>(ScalarCond);
16914 };
16915
16916 // Return new VectorizedTree, based on previous value.
16917 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
16918 if (VectorizedTree) {
16919 // Update the final value in the reduction.
16920 Builder.SetCurrentDebugLocation(
16921 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
16922 if ((isa<PoisonValue>(VectorizedTree) && !isa<PoisonValue>(Res)) ||
16923 (isGuaranteedNotToBePoison(Res) &&
16924 !isGuaranteedNotToBePoison(VectorizedTree))) {
16925 auto It = ReducedValsToOps.find(Res);
16926 if (It != ReducedValsToOps.end() &&
16927 any_of(It->getSecond(),
16928 [](Instruction *I) { return isBoolLogicOp(I); }))
16929 std::swap(VectorizedTree, Res);
16930 }
16931
16932 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
16933 ReductionOps);
16934 }
16935 // Initialize the final value in the reduction.
16936 return Res;
16937 };
16938 bool AnyBoolLogicOp =
16939 any_of(ReductionOps.back(), [](Value *V) {
16940 return isBoolLogicOp(cast<Instruction>(V));
16941 });
16942 // The reduction root is used as the insertion point for new instructions,
16943 // so set it as externally used to prevent it from being deleted.
16944 ExternallyUsedValues[ReductionRoot];
16945 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
16946 ReductionOps.front().size());
16947 for (ReductionOpsType &RdxOps : ReductionOps)
16948 for (Value *RdxOp : RdxOps) {
16949 if (!RdxOp)
16950 continue;
16951 IgnoreList.insert(RdxOp);
16952 }
16953 // Intersect the fast-math-flags from all reduction operations.
16954 FastMathFlags RdxFMF;
16955 RdxFMF.set();
16956 for (Value *U : IgnoreList)
16957 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
16958 RdxFMF &= FPMO->getFastMathFlags();
16959 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
16960
16961 // Need to track reduced vals, they may be changed during vectorization of
16962 // subvectors.
16963 for (ArrayRef<Value *> Candidates : ReducedVals)
16964 for (Value *V : Candidates)
16965 TrackedVals.try_emplace(V, V);
16966
16967 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
16968 // List of the values that were reduced in other trees as part of gather
16969 // nodes and thus requiring extract if fully vectorized in other trees.
16970 SmallPtrSet<Value *, 4> RequiredExtract;
16971 Value *VectorizedTree = nullptr;
16972 bool CheckForReusedReductionOps = false;
16973 // Try to vectorize elements based on their type.
16974 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
16975 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
16976 InstructionsState S = getSameOpcode(OrigReducedVals, TLI);
16977 SmallVector<Value *> Candidates;
16978 Candidates.reserve(2 * OrigReducedVals.size());
16979 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
16980 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
16981 Value *RdxVal = TrackedVals.find(OrigReducedVals[Cnt])->second;
16982 // Check if the reduction value was not overridden by the extractelement
16983 // instruction because of the vectorization and exclude it, if it is not
16984 // compatible with other values.
16985 // Also check if the instruction was folded to constant/other value.
16986 auto *Inst = dyn_cast<Instruction>(RdxVal);
16987 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
16988 (!S.getOpcode() || !S.isOpcodeOrAlt(Inst))) ||
16989 (S.getOpcode() && !Inst))
16990 continue;
16991 Candidates.push_back(RdxVal);
16992 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
16993 }
16994 bool ShuffledExtracts = false;
16995 // Try to handle shuffled extractelements.
16996 if (S.getOpcode() == Instruction::ExtractElement && !S.isAltShuffle() &&
16997 I + 1 < E) {
16998 InstructionsState NextS = getSameOpcode(ReducedVals[I + 1], TLI);
16999 if (NextS.getOpcode() == Instruction::ExtractElement &&
17000 !NextS.isAltShuffle()) {
17001 SmallVector<Value *> CommonCandidates(Candidates);
17002 for (Value *RV : ReducedVals[I + 1]) {
17003 Value *RdxVal = TrackedVals.find(RV)->second;
17004 // Check if the reduction value was not overridden by the
17005 // extractelement instruction because of the vectorization and
17006 // exclude it, if it is not compatible with other values.
17007 if (auto *Inst = dyn_cast<Instruction>(RdxVal))
17008 if (!NextS.getOpcode() || !NextS.isOpcodeOrAlt(Inst))
17009 continue;
17010 CommonCandidates.push_back(RdxVal);
17011 TrackedToOrig.try_emplace(RdxVal, RV);
17012 }
17013 SmallVector<int> Mask;
17014 if (isFixedVectorShuffle(CommonCandidates, Mask)) {
17015 ++I;
17016 Candidates.swap(CommonCandidates);
17017 ShuffledExtracts = true;
17018 }
17019 }
17020 }
17021
17022 // Emit code for constant values.
17023 if (AllowHorRdxIdenityOptimization && Candidates.size() > 1 &&
17024 allConstant(Candidates)) {
17025 Value *Res = Candidates.front();
17026 ++VectorizedVals.try_emplace(Candidates.front(), 0).first->getSecond();
17027 for (Value *VC : ArrayRef(Candidates).drop_front()) {
17028 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
17029 ++VectorizedVals.try_emplace(VC, 0).first->getSecond();
17030 if (auto *ResI = dyn_cast<Instruction>(Res))
17031 V.analyzedReductionRoot(ResI);
17032 }
17033 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
17034 continue;
17035 }
17036
17037 unsigned NumReducedVals = Candidates.size();
17038 if (NumReducedVals < ReductionLimit &&
17039 (NumReducedVals < 2 || !AllowHorRdxIdenityOptimization ||
17040 !isSplat(Candidates)))
17041 continue;
17042
17043 // Check if we support repeated scalar values processing (optimization of
17044 // original scalar identity operations on matched horizontal reductions).
17045 IsSupportedHorRdxIdentityOp =
17046 AllowHorRdxIdenityOptimization && RdxKind != RecurKind::Mul &&
17047 RdxKind != RecurKind::FMul && RdxKind != RecurKind::FMulAdd;
17048 // Gather same values.
17049 MapVector<Value *, unsigned> SameValuesCounter;
17050 if (IsSupportedHorRdxIdentityOp)
17051 for (Value *V : Candidates)
17052 ++SameValuesCounter.insert(std::make_pair(V, 0)).first->second;
17053 // Used to check if the reduced values are used the same number of times. In this
17054 // case the compiler may produce better code. E.g. if reduced values are
17055 // aabbccdd (8 x values), then the first node of the tree will have a node
17056 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
17057 // Plus, the final reduction will be performed on <8 x aabbccdd>.
17058 // Instead, the compiler may immediately build a <4 x abcd> tree and
17059 // multiply the (4 x abcd) reduction result by 2.
17060 // Currently it only handles add/fadd/xor. and/or/min/max do not require
17061 // this analysis, other operations may require an extra estimation of
17062 // the profitability.
17063 bool SameScaleFactor = false;
17064 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
17065 SameValuesCounter.size() != Candidates.size();
17066 if (OptReusedScalars) {
17067 SameScaleFactor =
17068 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
17069 RdxKind == RecurKind::Xor) &&
17070 all_of(drop_begin(SameValuesCounter),
17071 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
17072 return P.second == SameValuesCounter.front().second;
17073 });
17074 Candidates.resize(SameValuesCounter.size());
17075 transform(SameValuesCounter, Candidates.begin(),
17076 [](const auto &P) { return P.first; });
17077 NumReducedVals = Candidates.size();
17078 // Have a reduction of the same element.
17079 if (NumReducedVals == 1) {
17080 Value *OrigV = TrackedToOrig.find(Candidates.front())->second;
17081 unsigned Cnt = SameValuesCounter.lookup(OrigV);
17082 Value *RedVal =
17083 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
17084 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17085 VectorizedVals.try_emplace(OrigV, Cnt);
17086 continue;
17087 }
17088 }
17089
17090 unsigned MaxVecRegSize = V.getMaxVecRegSize();
17091 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
17092 unsigned MaxElts =
17093 RegMaxNumber * llvm::bit_floor(MaxVecRegSize / EltSize);
17094
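// Illustrative sizing: with a 128-bit register and 32-bit elements,
// MaxElts = 4 * bit_floor(128 / 32) = 16; for 10 candidate values the first
// attempt then uses ReduxWidth = min(bit_floor(10), clamp(16, 128, 512)) = 8.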
17095 unsigned ReduxWidth = std::min<unsigned>(
17096 llvm::bit_floor(NumReducedVals),
17097 std::clamp<unsigned>(MaxElts, RedValsMaxNumber,
17098 RegMaxNumber * RedValsMaxNumber));
17099 unsigned Start = 0;
17100 unsigned Pos = Start;
17101 // Restarts vectorization attempt with lower vector factor.
17102 unsigned PrevReduxWidth = ReduxWidth;
17103 bool CheckForReusedReductionOpsLocal = false;
17104 auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
17105 &CheckForReusedReductionOpsLocal,
17106 &PrevReduxWidth, &V,
17107 &IgnoreList](bool IgnoreVL = false) {
17108 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
17109 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
17110 // Check if any of the reduction ops are gathered. If so, worth
17111 // trying again with less number of reduction ops.
17112 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
17113 }
17114 ++Pos;
17115 if (Pos < NumReducedVals - ReduxWidth + 1)
17116 return IsAnyRedOpGathered;
17117 Pos = Start;
17118 ReduxWidth /= 2;
17119 return IsAnyRedOpGathered;
17120 };
17121 bool AnyVectorized = false;
17122 while (Pos < NumReducedVals - ReduxWidth + 1 &&
17123 ReduxWidth >= ReductionLimit) {
17124 // Dependency in tree of the reduction ops - drop this attempt, try
17125 // later.
17126 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
17127 Start == 0) {
17128 CheckForReusedReductionOps = true;
17129 break;
17130 }
17131 PrevReduxWidth = ReduxWidth;
17132 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
17133 // Being analyzed already - skip.
17134 if (V.areAnalyzedReductionVals(VL)) {
17135 (void)AdjustReducedVals(/*IgnoreVL=*/true);
17136 continue;
17137 }
17138 // Early exit if any of the reduction values were deleted during
17139 // previous vectorization attempts.
17140 if (any_of(VL, [&V](Value *RedVal) {
17141 auto *RedValI = dyn_cast<Instruction>(RedVal);
17142 if (!RedValI)
17143 return false;
17144 return V.isDeleted(RedValI);
17145 }))
17146 break;
17147 V.buildTree(VL, IgnoreList);
17148 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
17149 if (!AdjustReducedVals())
17150 V.analyzedReductionVals(VL);
17151 continue;
17152 }
17153 if (V.isLoadCombineReductionCandidate(RdxKind)) {
17154 if (!AdjustReducedVals())
17155 V.analyzedReductionVals(VL);
17156 continue;
17157 }
17158 V.reorderTopToBottom();
17159 // No need to reorder the root node at all.
17160 V.reorderBottomToTop(/*IgnoreReorder=*/true);
17161 // Keep extracted other reduction values, if they are used in the
17162 // vectorization trees.
17163 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
17164 ExternallyUsedValues);
17165 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
17166 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
17167 continue;
17168 for (Value *V : ReducedVals[Cnt])
17169 if (isa<Instruction>(V))
17170 LocalExternallyUsedValues[TrackedVals[V]];
17171 }
17172 if (!IsSupportedHorRdxIdentityOp) {
17173 // Number of uses of the candidates in the vector of values.
17174 assert(SameValuesCounter.empty() &&
17175 "Reused values counter map is not empty");
17176 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17177 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17178 continue;
17179 Value *V = Candidates[Cnt];
17180 Value *OrigV = TrackedToOrig.find(V)->second;
17181 ++SameValuesCounter[OrigV];
17182 }
17183 }
17184 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
17185 // Gather externally used values.
17186 SmallPtrSet<Value *, 4> Visited;
17187 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
17188 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
17189 continue;
17190 Value *RdxVal = Candidates[Cnt];
17191 if (!Visited.insert(RdxVal).second)
17192 continue;
17193 // Check if the scalar was vectorized as part of the vectorization
17194 // tree but not the top node.
17195 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
17196 LocalExternallyUsedValues[RdxVal];
17197 continue;
17198 }
17199 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17200 unsigned NumOps =
17201 VectorizedVals.lookup(RdxVal) + SameValuesCounter[OrigV];
17202 if (NumOps != ReducedValsToOps.find(OrigV)->second.size())
17203 LocalExternallyUsedValues[RdxVal];
17204 }
17205 // Do not need the list of reused scalars in regular mode anymore.
17206 if (!IsSupportedHorRdxIdentityOp)
17207 SameValuesCounter.clear();
17208 for (Value *RdxVal : VL)
17209 if (RequiredExtract.contains(RdxVal))
17210 LocalExternallyUsedValues[RdxVal];
17211 // Update LocalExternallyUsedValues for the scalar, replaced by
17212 // extractelement instructions.
17213 DenseMap<Value *, Value *> ReplacementToExternal;
17214 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals)
17215 ReplacementToExternal.try_emplace(Pair.second, Pair.first);
17216 for (const std::pair<Value *, Value *> &Pair : ReplacedExternals) {
17217 Value *Ext = Pair.first;
17218 auto RIt = ReplacementToExternal.find(Ext);
17219 while (RIt != ReplacementToExternal.end()) {
17220 Ext = RIt->second;
17221 RIt = ReplacementToExternal.find(Ext);
17222 }
17223 auto *It = ExternallyUsedValues.find(Ext);
17224 if (It == ExternallyUsedValues.end())
17225 continue;
17226 LocalExternallyUsedValues[Pair.second].append(It->second);
17227 }
17228 V.buildExternalUses(LocalExternallyUsedValues);
17229
17230 V.computeMinimumValueSizes();
17231 V.transformNodes();
17232
17233 // Estimate cost.
17234 InstructionCost TreeCost = V.getTreeCost(VL);
17235 InstructionCost ReductionCost =
17236 getReductionCost(TTI, VL, IsCmpSelMinMax, ReduxWidth, RdxFMF);
17237 InstructionCost Cost = TreeCost + ReductionCost;
17238 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
17239 << " for reduction\n");
17240 if (!Cost.isValid())
17241 break;
17242 if (Cost >= -SLPCostThreshold) {
17243 V.getORE()->emit([&]() {
17245 SV_NAME, "HorSLPNotBeneficial",
17246 ReducedValsToOps.find(VL[0])->second.front())
17247 << "Vectorizing horizontal reduction is possible "
17248 << "but not beneficial with cost " << ore::NV("Cost", Cost)
17249 << " and threshold "
17250 << ore::NV("Threshold", -SLPCostThreshold);
17251 });
17252 if (!AdjustReducedVals())
17253 V.analyzedReductionVals(VL);
17254 continue;
17255 }
17256
17257 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
17258 << Cost << ". (HorRdx)\n");
17259 V.getORE()->emit([&]() {
17260 return OptimizationRemark(
17261 SV_NAME, "VectorizedHorizontalReduction",
17262 ReducedValsToOps.find(VL[0])->second.front())
17263 << "Vectorized horizontal reduction with cost "
17264 << ore::NV("Cost", Cost) << " and with tree size "
17265 << ore::NV("TreeSize", V.getTreeSize());
17266 });
17267
17268 Builder.setFastMathFlags(RdxFMF);
17269
17270 // Emit a reduction. If the root is a select (min/max idiom), the insert
17271 // point is the compare condition of that select.
17272 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
17273 Instruction *InsertPt = RdxRootInst;
17274 if (IsCmpSelMinMax)
17275 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
17276
17277 // Vectorize a tree.
17278 Value *VectorizedRoot = V.vectorizeTree(LocalExternallyUsedValues,
17279 ReplacedExternals, InsertPt);
17280
17281 Builder.SetInsertPoint(InsertPt);
17282
17283 // To prevent poison from leaking across what used to be sequential,
17284 // safe, scalar boolean logic operations, the reduction operand must be
17285 // frozen.
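// E.g. the scalar form 'select i1 %a, i1 true, i1 %b' never propagates
// poison from %b when %a is true, but the plain vector 'or' reduction it is
// turned into would, so the vectorized operand is frozen first.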
17286 if ((isBoolLogicOp(RdxRootInst) ||
17287 (AnyBoolLogicOp && VL.size() != TrackedVals.size())) &&
17288 !isGuaranteedNotToBePoison(VectorizedRoot))
17289 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
17290
17291 // Emit code to correctly handle reused reduced values, if required.
17292 if (OptReusedScalars && !SameScaleFactor) {
17293 VectorizedRoot =
17294 emitReusedOps(VectorizedRoot, Builder, V.getRootNodeScalars(),
17295 SameValuesCounter, TrackedToOrig);
17296 }
17297
17298 Value *ReducedSubTree =
17299 emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
17300 if (ReducedSubTree->getType() != VL.front()->getType()) {
17301 ReducedSubTree = Builder.CreateIntCast(
17302 ReducedSubTree, VL.front()->getType(), any_of(VL, [&](Value *R) {
17303 KnownBits Known = computeKnownBits(
17304 R, cast<Instruction>(ReductionOps.front().front())
17305 ->getModule()
17306 ->getDataLayout());
17307 return !Known.isNonNegative();
17308 }));
17309 }
17310
17311 // Improved analysis for add/fadd/xor reductions with same scale factor
17312 // for all operands of reductions. We can emit scalar ops for them
17313 // instead.
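// E.g. reducing a, a, b, b, c, c, d, d with integer add builds a 4-wide
// reduction of a, b, c, d and then multiplies the result by 2
// (see emitScaleForReusedOps).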
17314 if (OptReusedScalars && SameScaleFactor)
17315 ReducedSubTree = emitScaleForReusedOps(
17316 ReducedSubTree, Builder, SameValuesCounter.front().second);
17317
17318 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
17319 // Count vectorized reduced values to exclude them from final reduction.
17320 for (Value *RdxVal : VL) {
17321 Value *OrigV = TrackedToOrig.find(RdxVal)->second;
17322 if (IsSupportedHorRdxIdentityOp) {
17323 VectorizedVals.try_emplace(OrigV, SameValuesCounter[RdxVal]);
17324 continue;
17325 }
17326 ++VectorizedVals.try_emplace(OrigV, 0).first->getSecond();
17327 if (!V.isVectorized(RdxVal))
17328 RequiredExtract.insert(RdxVal);
17329 }
17330 Pos += ReduxWidth;
17331 Start = Pos;
17332 ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
17333 AnyVectorized = true;
17334 }
17335 if (OptReusedScalars && !AnyVectorized) {
17336 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
17337 Value *RedVal = emitScaleForReusedOps(P.first, Builder, P.second);
17338 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
17339 Value *OrigV = TrackedToOrig.find(P.first)->second;
17340 VectorizedVals.try_emplace(OrigV, P.second);
17341 }
17342 continue;
17343 }
17344 }
17345 if (VectorizedTree) {
17346 // Reorder operands of bool logical op in the natural order to avoid
17347 // possible problem with poison propagation. If not possible to reorder
17348 // (both operands are originally RHS), emit an extra freeze instruction
17349 // for the LHS operand.
17350 // I.e., if we have original code like this:
17351 // RedOp1 = select i1 ?, i1 LHS, i1 false
17352 // RedOp2 = select i1 RHS, i1 ?, i1 false
17353
17354 // Then, we swap LHS/RHS to create a new op that matches the poison
17355 // semantics of the original code.
17356
17357 // If we have original code like this and both values could be poison:
17358 // RedOp1 = select i1 ?, i1 LHS, i1 false
17359 // RedOp2 = select i1 ?, i1 RHS, i1 false
17360
17361 // Then, we must freeze LHS in the new op.
17362 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
17363 Instruction *RedOp1,
17364 Instruction *RedOp2,
17365 bool InitStep) {
17366 if (!AnyBoolLogicOp)
17367 return;
17368 if (isBoolLogicOp(RedOp1) &&
17369 ((!InitStep && LHS == VectorizedTree) ||
17370 getRdxOperand(RedOp1, 0) == LHS || isGuaranteedNotToBePoison(LHS)))
17371 return;
17372 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
17373 getRdxOperand(RedOp2, 0) == RHS ||
17374 isGuaranteedNotToBePoison(RHS))) {
17375 std::swap(LHS, RHS);
17376 return;
17377 }
17378 if (LHS != VectorizedTree)
17379 LHS = Builder.CreateFreeze(LHS);
17380 };
17381 // Finish the reduction.
17382 // Need to add extra arguments and not vectorized possible reduction
17383 // values.
17384 // Try to avoid dependencies between the scalar remainders after
17385 // reductions.
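// E.g. leftover scalars r0, r1, r2, r3 are combined pairwise as (r0 op r1)
// and (r2 op r3) in one step and then reduced once more, rather than as one
// long dependency chain r0 op r1 op r2 op r3.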
17386 auto FinalGen =
17387 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
17388 bool InitStep) {
17389 unsigned Sz = InstVals.size();
17390 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
17391 Sz % 2);
17392 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
17393 Instruction *RedOp = InstVals[I + 1].first;
17394 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
17395 Value *RdxVal1 = InstVals[I].second;
17396 Value *StableRdxVal1 = RdxVal1;
17397 auto It1 = TrackedVals.find(RdxVal1);
17398 if (It1 != TrackedVals.end())
17399 StableRdxVal1 = It1->second;
17400 Value *RdxVal2 = InstVals[I + 1].second;
17401 Value *StableRdxVal2 = RdxVal2;
17402 auto It2 = TrackedVals.find(RdxVal2);
17403 if (It2 != TrackedVals.end())
17404 StableRdxVal2 = It2->second;
17405 // To prevent poison from leaking across what used to be
17406 // sequential, safe, scalar boolean logic operations, the
17407 // reduction operand must be frozen.
17408 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
17409 RedOp, InitStep);
17410 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
17411 StableRdxVal2, "op.rdx", ReductionOps);
17412 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
17413 }
17414 if (Sz % 2 == 1)
17415 ExtraReds[Sz / 2] = InstVals.back();
17416 return ExtraReds;
17417 };
17418 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
17419 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
17420 VectorizedTree);
17421 SmallPtrSet<Value *, 8> Visited;
17422 for (ArrayRef<Value *> Candidates : ReducedVals) {
17423 for (Value *RdxVal : Candidates) {
17424 if (!Visited.insert(RdxVal).second)
17425 continue;
17426 unsigned NumOps = VectorizedVals.lookup(RdxVal);
17427 for (Instruction *RedOp :
17428 ArrayRef(ReducedValsToOps.find(RdxVal)->second)
17429 .drop_back(NumOps))
17430 ExtraReductions.emplace_back(RedOp, RdxVal);
17431 }
17432 }
17433 for (auto &Pair : ExternallyUsedValues) {
17434 // Add each externally used value to the final reduction.
17435 for (auto *I : Pair.second)
17436 ExtraReductions.emplace_back(I, Pair.first);
17437 }
17438 // Iterate through all not-vectorized reduction values/extra arguments.
17439 bool InitStep = true;
17440 while (ExtraReductions.size() > 1) {
17441 VectorizedTree = ExtraReductions.front().second;
17442 SmallVector<std::pair<Instruction *, Value *>> NewReds =
17443 FinalGen(ExtraReductions, InitStep);
17444 ExtraReductions.swap(NewReds);
17445 InitStep = false;
17446 }
17447 VectorizedTree = ExtraReductions.front().second;
17448
17449 ReductionRoot->replaceAllUsesWith(VectorizedTree);
17450
17451 // The original scalar reduction is expected to have no remaining
17452 // uses outside the reduction tree itself. Assert that we got this
17453 // correct, replace internal uses with undef, and mark for eventual
17454 // deletion.
17455#ifndef NDEBUG
17456 SmallSet<Value *, 4> IgnoreSet;
17457 for (ArrayRef<Value *> RdxOps : ReductionOps)
17458 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
17459#endif
17460 for (ArrayRef<Value *> RdxOps : ReductionOps) {
17461 for (Value *Ignore : RdxOps) {
17462 if (!Ignore)
17463 continue;
17464#ifndef NDEBUG
17465 for (auto *U : Ignore->users()) {
17466 assert(IgnoreSet.count(U) &&
17467 "All users must be either in the reduction ops list.");
17468 }
17469#endif
17470 if (!Ignore->use_empty()) {
17471 Value *Undef = UndefValue::get(Ignore->getType());
17472 Ignore->replaceAllUsesWith(Undef);
17473 }
17474 V.eraseInstruction(cast<Instruction>(Ignore));
17475 }
17476 }
17477 } else if (!CheckForReusedReductionOps) {
17478 for (ReductionOpsType &RdxOps : ReductionOps)
17479 for (Value *RdxOp : RdxOps)
17480 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
17481 }
17482 return VectorizedTree;
17483 }
17484
17485private:
17486 /// Calculate the cost of a reduction.
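/// Returns the vector cost minus the scalar cost, so a negative value means
/// the vectorized reduction is expected to be cheaper than the scalar chain.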
17487 InstructionCost getReductionCost(TargetTransformInfo *TTI,
17488 ArrayRef<Value *> ReducedVals,
17489 bool IsCmpSelMinMax, unsigned ReduxWidth,
17490 FastMathFlags FMF) {
17491 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
17492 Type *ScalarTy = ReducedVals.front()->getType();
17493 FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
17494 InstructionCost VectorCost = 0, ScalarCost;
17495 // If all of the reduced values are constant, the vector cost is 0, since
17496 // the reduction value can be calculated at the compile time.
17497 bool AllConsts = allConstant(ReducedVals);
17498 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
17499 InstructionCost Cost = 0;
17500 // Scalar cost is repeated for N-1 elements.
17501 int Cnt = ReducedVals.size();
17502 for (Value *RdxVal : ReducedVals) {
17503 if (Cnt == 1)
17504 break;
17505 --Cnt;
17506 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
17507 Cost += GenCostFn();
17508 continue;
17509 }
17510 InstructionCost ScalarCost = 0;
17511 for (User *U : RdxVal->users()) {
17512 auto *RdxOp = cast<Instruction>(U);
17513 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
17514 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
17515 continue;
17516 }
17517 ScalarCost = InstructionCost::getInvalid();
17518 break;
17519 }
17520 if (ScalarCost.isValid())
17521 Cost += ScalarCost;
17522 else
17523 Cost += GenCostFn();
17524 }
17525 return Cost;
17526 };
17527 switch (RdxKind) {
17528 case RecurKind::Add:
17529 case RecurKind::Mul:
17530 case RecurKind::Or:
17531 case RecurKind::And:
17532 case RecurKind::Xor:
17533 case RecurKind::FAdd:
17534 case RecurKind::FMul: {
17535 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
17536 if (!AllConsts)
17537 VectorCost =
17538 TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF, CostKind);
17539 ScalarCost = EvaluateScalarCost([&]() {
17540 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
17541 });
17542 break;
17543 }
17544 case RecurKind::FMax:
17545 case RecurKind::FMin:
17546 case RecurKind::FMaximum:
17547 case RecurKind::FMinimum:
17548 case RecurKind::SMax:
17549 case RecurKind::SMin:
17550 case RecurKind::UMax:
17551 case RecurKind::UMin: {
17552 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
17553 if (!AllConsts)
17554 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
17555 ScalarCost = EvaluateScalarCost([&]() {
17556 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
17557 return TTI->getIntrinsicInstrCost(ICA, CostKind);
17558 });
17559 break;
17560 }
17561 default:
17562 llvm_unreachable("Expected arithmetic or min/max reduction operation");
17563 }
17564
17565 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
17566 << " for reduction of " << shortBundleName(ReducedVals)
17567 << " (It is a splitting reduction)\n");
17568 return VectorCost - ScalarCost;
17569 }
17570
17571 /// Emit a horizontal reduction of the vectorized value.
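/// E.g. for an integer add reduction of a <4 x i32> value this typically
/// becomes a call to the llvm.vector.reduce.add.v4i32 intrinsic via
/// createSimpleTargetReduction.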
17572 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
17573 unsigned ReduxWidth, const TargetTransformInfo *TTI) {
17574 assert(VectorizedValue && "Need to have a vectorized tree node");
17575 assert(isPowerOf2_32(ReduxWidth) &&
17576 "We only handle power-of-two reductions for now");
17577 assert(RdxKind != RecurKind::FMulAdd &&
17578 "A call to the llvm.fmuladd intrinsic is not handled yet");
17579
17580 ++NumVectorInstructions;
17581 return createSimpleTargetReduction(Builder, VectorizedValue, RdxKind);
17582 }
17583
17584 /// Emits optimized code for unique scalar value reused \p Cnt times.
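/// E.g. n copies of %a reduced with integer add become 'mul %a, n', and with
/// xor the result is %a for odd n and zero for even n.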
17585 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
17586 unsigned Cnt) {
17587 assert(IsSupportedHorRdxIdentityOp &&
17588 "The optimization of matched scalar identity horizontal reductions "
17589 "must be supported.");
17590 switch (RdxKind) {
17591 case RecurKind::Add: {
17592 // res = mul vv, n
17593 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
17594 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
17595 << VectorizedValue << ". (HorRdx)\n");
17596 return Builder.CreateMul(VectorizedValue, Scale);
17597 }
17598 case RecurKind::Xor: {
17599 // res = n % 2 ? 0 : vv
17600 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
17601 << ". (HorRdx)\n");
17602 if (Cnt % 2 == 0)
17603 return Constant::getNullValue(VectorizedValue->getType());
17604 return VectorizedValue;
17605 }
17606 case RecurKind::FAdd: {
17607 // res = fmul v, n
17608 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
17609 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
17610 << VectorizedValue << ". (HorRdx)\n");
17611 return Builder.CreateFMul(VectorizedValue, Scale);
17612 }
17613 case RecurKind::And:
17614 case RecurKind::Or:
17615 case RecurKind::SMax:
17616 case RecurKind::SMin:
17617 case RecurKind::UMax:
17618 case RecurKind::UMin:
17619 case RecurKind::FMax:
17620 case RecurKind::FMin:
17621 case RecurKind::FMaximum:
17622 case RecurKind::FMinimum:
17623 // res = vv
17624 return VectorizedValue;
17625 case RecurKind::Mul:
17626 case RecurKind::FMul:
17627 case RecurKind::FMulAdd:
17628 case RecurKind::IAnyOf:
17629 case RecurKind::FAnyOf:
17630 case RecurKind::None:
17631 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
17632 }
17633 return nullptr;
17634 }
17635
17636 /// Emits actual operation for the scalar identity values, found during
17637 /// horizontal reduction analysis.
17638 Value *emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
17639 ArrayRef<Value *> VL,
17640 const MapVector<Value *, unsigned> &SameValuesCounter,
17641 const DenseMap<Value *, Value *> &TrackedToOrig) {
17642 assert(IsSupportedHorRdxIdentityOp &&
17643 "The optimization of matched scalar identity horizontal reductions "
17644 "must be supported.");
17645 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
17646 if (VTy->getElementType() != VL.front()->getType()) {
17647 VectorizedValue = Builder.CreateIntCast(
17648 VectorizedValue,
17649 FixedVectorType::get(VL.front()->getType(), VTy->getNumElements()),
17650 any_of(VL, [&](Value *R) {
17651 KnownBits Known = computeKnownBits(
17652 R, cast<Instruction>(ReductionOps.front().front())
17653 ->getModule()
17654 ->getDataLayout());
17655 return !Known.isNonNegative();
17656 }));
17657 }
17658 switch (RdxKind) {
17659 case RecurKind::Add: {
17660 // root = mul prev_root, <1, 1, n, 1>
17661 SmallVector<Constant *> Vals;
17662 for (Value *V : VL) {
17663 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17664 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
17665 }
17666 auto *Scale = ConstantVector::get(Vals);
17667 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
17668 << VectorizedValue << ". (HorRdx)\n");
17669 return Builder.CreateMul(VectorizedValue, Scale);
17670 }
17671 case RecurKind::And:
17672 case RecurKind::Or:
17673 // No need for multiple or/and(s).
17674 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
17675 << ". (HorRdx)\n");
17676 return VectorizedValue;
17677 case RecurKind::SMax:
17678 case RecurKind::SMin:
17679 case RecurKind::UMax:
17680 case RecurKind::UMin:
17681 case RecurKind::FMax:
17682 case RecurKind::FMin:
17683 case RecurKind::FMaximum:
17684 case RecurKind::FMinimum:
17685 // No need for multiple min/max(s) of the same value.
17686 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
17687 << ". (HorRdx)\n");
17688 return VectorizedValue;
17689 case RecurKind::Xor: {
17690 // Replace values with even number of repeats with 0, since
17691 // x xor x = 0.
17692 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
17693 // 7>, if the 4th and 6th elements have an even number of repeats.
17694 SmallVector<int> Mask(
17695 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
17696 PoisonMaskElem);
17697 std::iota(Mask.begin(), Mask.end(), 0);
17698 bool NeedShuffle = false;
17699 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
17700 Value *V = VL[I];
17701 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17702 if (Cnt % 2 == 0) {
17703 Mask[I] = VF;
17704 NeedShuffle = true;
17705 }
17706 }
17707 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
17708 : Mask) dbgs()
17709 << I << " ";
17710 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
17711 if (NeedShuffle)
17712 VectorizedValue = Builder.CreateShuffleVector(
17713 VectorizedValue,
17714 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
17715 return VectorizedValue;
17716 }
17717 case RecurKind::FAdd: {
17718 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
17719 SmallVector<Constant *> Vals;
17720 for (Value *V : VL) {
17721 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.find(V)->second);
17722 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
17723 }
17724 auto *Scale = ConstantVector::get(Vals);
17725 return Builder.CreateFMul(VectorizedValue, Scale);
17726 }
17727 case RecurKind::Mul:
17728 case RecurKind::FMul:
17729 case RecurKind::FMulAdd:
17730 case RecurKind::IAnyOf:
17731 case RecurKind::FAnyOf:
17732 case RecurKind::None:
17733 llvm_unreachable("Unexpected reduction kind for reused scalars.");
17734 }
17735 return nullptr;
17736 }
17737};
17738} // end anonymous namespace
17739
17740/// Gets recurrence kind from the specified value.
17741 static RecurKind getRdxKind(Value *V) {
17742 return HorizontalReduction::getRdxKind(V);
17743}
17744static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
17745 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
17746 return cast<FixedVectorType>(IE->getType())->getNumElements();
17747
17748 unsigned AggregateSize = 1;
17749 auto *IV = cast<InsertValueInst>(InsertInst);
17750 Type *CurrentType = IV->getType();
17751 do {
17752 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
17753 for (auto *Elt : ST->elements())
17754 if (Elt != ST->getElementType(0)) // check homogeneity
17755 return std::nullopt;
17756 AggregateSize *= ST->getNumElements();
17757 CurrentType = ST->getElementType(0);
17758 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
17759 AggregateSize *= AT->getNumElements();
17760 CurrentType = AT->getElementType();
17761 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
17762 AggregateSize *= VT->getNumElements();
17763 return AggregateSize;
17764 } else if (CurrentType->isSingleValueType()) {
17765 return AggregateSize;
17766 } else {
17767 return std::nullopt;
17768 }
17769 } while (true);
17770}
17771
17772static void findBuildAggregate_rec(Instruction *LastInsertInst,
17773 TargetTransformInfo *TTI,
17774 SmallVectorImpl<Value *> &BuildVectorOpds,
17775 SmallVectorImpl<Value *> &InsertElts,
17776 unsigned OperandOffset) {
17777 do {
17778 Value *InsertedOperand = LastInsertInst->getOperand(1);
17779 std::optional<unsigned> OperandIndex =
17780 getInsertIndex(LastInsertInst, OperandOffset);
17781 if (!OperandIndex)
17782 return;
17783 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
17784 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
17785 BuildVectorOpds, InsertElts, *OperandIndex);
17786
17787 } else {
17788 BuildVectorOpds[*OperandIndex] = InsertedOperand;
17789 InsertElts[*OperandIndex] = LastInsertInst;
17790 }
17791 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
17792 } while (LastInsertInst != nullptr &&
17793 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
17794 LastInsertInst->hasOneUse());
17795}
17796
17797/// Recognize construction of vectors like
17798/// %ra = insertelement <4 x float> poison, float %s0, i32 0
17799/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
17800/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
17801/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
17802/// starting from the last insertelement or insertvalue instruction.
17803///
17804/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
17805/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
17806/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
17807///
17808/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
17809///
17810/// \return true if it matches.
17811static bool findBuildAggregate(Instruction *LastInsertInst,
17812 TargetTransformInfo *TTI,
17813 SmallVectorImpl<Value *> &BuildVectorOpds,
17814 SmallVectorImpl<Value *> &InsertElts) {
17815
17816 assert((isa<InsertElementInst>(LastInsertInst) ||
17817 isa<InsertValueInst>(LastInsertInst)) &&
17818 "Expected insertelement or insertvalue instruction!");
17819
17820 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
17821 "Expected empty result vectors!");
17822
17823 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
17824 if (!AggregateSize)
17825 return false;
17826 BuildVectorOpds.resize(*AggregateSize);
17827 InsertElts.resize(*AggregateSize);
17828
17829 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0);
17830 llvm::erase(BuildVectorOpds, nullptr);
17831 llvm::erase(InsertElts, nullptr);
17832 if (BuildVectorOpds.size() >= 2)
17833 return true;
17834
17835 return false;
17836}
17837
17838/// Try and get a reduction instruction from a phi node.
17839///
17840/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
17841/// if they come from either \p ParentBB or a containing loop latch.
17842///
17843/// \returns A candidate reduction value if possible, or \code nullptr \endcode
17844/// if not possible.
17845 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
17846 BasicBlock *ParentBB, LoopInfo *LI) {
17847 // There are situations where the reduction value is not dominated by the
17848 // reduction phi. Vectorizing such cases has been reported to cause
17849 // miscompiles. See PR25787.
17850 auto DominatedReduxValue = [&](Value *R) {
17851 return isa<Instruction>(R) &&
17852 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
17853 };
17854
17855 Instruction *Rdx = nullptr;
17856
17857 // Return the incoming value if it comes from the same BB as the phi node.
17858 if (P->getIncomingBlock(0) == ParentBB) {
17859 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
17860 } else if (P->getIncomingBlock(1) == ParentBB) {
17861 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
17862 }
17863
17864 if (Rdx && DominatedReduxValue(Rdx))
17865 return Rdx;
17866
17867 // Otherwise, check whether we have a loop latch to look at.
17868 Loop *BBL = LI->getLoopFor(ParentBB);
17869 if (!BBL)
17870 return nullptr;
17871 BasicBlock *BBLatch = BBL->getLoopLatch();
17872 if (!BBLatch)
17873 return nullptr;
17874
17875 // There is a loop latch, return the incoming value if it comes from
17876 // that. This reduction pattern occasionally turns up.
17877 if (P->getIncomingBlock(0) == BBLatch) {
17878 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
17879 } else if (P->getIncomingBlock(1) == BBLatch) {
17880 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
17881 }
17882
17883 if (Rdx && DominatedReduxValue(Rdx))
17884 return Rdx;
17885
17886 return nullptr;
17887}
17888
17889static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
17890 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
17891 return true;
17892 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
17893 return true;
17894 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
17895 return true;
17896 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
17897 return true;
17898 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
17899 return true;
17900 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
17901 return true;
17902 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
17903 return true;
17904 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
17905 return true;
17906 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
17907 return true;
17908 return false;
17909}
17910
17911/// We could have an initial reduction that is not an add.
17912/// r *= v1 + v2 + v3 + v4
17913/// In such a case start looking for a tree rooted in the first '+'.
17914/// \Returns the new root if found, which may be nullptr if not an instruction.
17915static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
17916                                                 Instruction *Root) {
17917 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
17918 isa<IntrinsicInst>(Root)) &&
17919 "Expected binop, select, or intrinsic for reduction matching");
17920 Value *LHS =
17921 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
17922 Value *RHS =
17923 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
17924 if (LHS == Phi)
17925 return dyn_cast<Instruction>(RHS);
17926 if (RHS == Phi)
17927 return dyn_cast<Instruction>(LHS);
17928 return nullptr;
17929}
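// Illustrative sketch (hypothetical names, not verbatim from this file): for
// the r *= v1 + v2 + v3 + v4 example above, the IR might look like
//   %sum1   = add i32 %v1, %v2
//   %sum2   = add i32 %sum1, %v3
//   %sum3   = add i32 %sum2, %v4
//   %r.next = mul i32 %r.phi, %sum3    ; Root, one operand is the phi
// With Phi == %r.phi and Root == %r.next, the returned secondary root is
// %sum3, and reduction matching restarts from that add chain.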
17930
17931/// \Returns the first operand of \p I that does not match \p Phi. If the
17932/// operand is not an instruction, returns nullptr.
17933static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
17934  Value *Op0 = nullptr;
17935 Value *Op1 = nullptr;
17936 if (!matchRdxBop(I, Op0, Op1))
17937 return nullptr;
17938 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
17939}
17940
17941/// \Returns true if \p I is a candidate instruction for reduction vectorization.
17942static bool isReductionCandidate(Instruction *I) {
17943  bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
17944 Value *B0 = nullptr, *B1 = nullptr;
17945 bool IsBinop = matchRdxBop(I, B0, B1);
17946 return IsBinop || IsSelect;
17947}
17948
17949bool SLPVectorizerPass::vectorizeHorReduction(
17950    PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI,
17951    SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
17952 if (!ShouldVectorizeHor)
17953 return false;
17954 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
17955
17956 if (Root->getParent() != BB || isa<PHINode>(Root))
17957 return false;
17958
17959 // If we can find a secondary reduction root, use that instead.
17960 auto SelectRoot = [&]() {
17961 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
17962 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
17963 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
17964 return NewRoot;
17965 return Root;
17966 };
17967
17968  // Start analysis from the Root instruction. If a horizontal reduction is
17969  // found, try to vectorize it. If it is not a horizontal reduction or
17970  // vectorization is not possible or not effective, and the currently analyzed
17971  // instruction is a binary operation, try to vectorize the operands, using
17972  // pre-order DFS traversal order. If the operands were not vectorized, repeat
17973  // the same procedure considering each operand as a possible root of the
17974  // horizontal reduction.
17975  // Interrupt the process if the Root instruction itself was vectorized or all
17976  // sub-trees not deeper than RecursionMaxDepth were analyzed/vectorized.
17977  // If a horizontal reduction was not matched or vectorized, we collect
17978  // instructions for possible later vectorization attempts.
17979 std::queue<std::pair<Instruction *, unsigned>> Stack;
17980 Stack.emplace(SelectRoot(), 0);
17981 SmallPtrSet<Value *, 8> VisitedInstrs;
17982 bool Res = false;
17983 auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * {
17984 if (R.isAnalyzedReductionRoot(Inst))
17985 return nullptr;
17986 if (!isReductionCandidate(Inst))
17987 return nullptr;
17988 HorizontalReduction HorRdx;
17989 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
17990 return nullptr;
17991 return HorRdx.tryToReduce(R, *DL, TTI, *TLI);
17992 };
17993 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
17994 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
17995 FutureSeed = getNonPhiOperand(Root, P);
17996 if (!FutureSeed)
17997 return false;
17998 }
17999 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
18000 // analysis is done separately.
18001 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
18002 PostponedInsts.push_back(FutureSeed);
18003 return true;
18004 };
18005
18006 while (!Stack.empty()) {
18007 Instruction *Inst;
18008 unsigned Level;
18009 std::tie(Inst, Level) = Stack.front();
18010 Stack.pop();
18011    // Do not try to analyze an instruction that has already been vectorized.
18012 // This may happen when we vectorize instruction operands on a previous
18013 // iteration while stack was populated before that happened.
18014 if (R.isDeleted(Inst))
18015 continue;
18016 if (Value *VectorizedV = TryToReduce(Inst)) {
18017 Res = true;
18018 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
18019 // Try to find another reduction.
18020 Stack.emplace(I, Level);
18021 continue;
18022 }
18023 } else {
18024 // We could not vectorize `Inst` so try to use it as a future seed.
18025 if (!TryAppendToPostponedInsts(Inst)) {
18026 assert(Stack.empty() && "Expected empty stack");
18027 break;
18028 }
18029 }
18030
18031 // Try to vectorize operands.
18032    // Continue analysis only for instructions from the same basic block, to
18033    // save compile time.
18034 if (++Level < RecursionMaxDepth)
18035 for (auto *Op : Inst->operand_values())
18036 if (VisitedInstrs.insert(Op).second)
18037 if (auto *I = dyn_cast<Instruction>(Op))
18038 // Do not try to vectorize CmpInst operands, this is done
18039 // separately.
18040 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
18041 !R.isDeleted(I) && I->getParent() == BB)
18042 Stack.emplace(I, Level);
18043 }
18044 return Res;
18045}
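// Illustrative sketch (hypothetical example, not verbatim from this file) of
// how the worklist above behaves. Assume Root is the final 'add' of
//   s = a[0]*b[0] + a[1]*b[1] + a[2]*b[2] + a[3]*b[3];
// TryToReduce first attempts to match and vectorize the whole add chain as a
// horizontal reduction. If that fails, Root is recorded as a postponed seed
// (unless it is a cmp or an insert) and its operands, e.g. the multiplies, are
// pushed onto the worklist and later tried as reduction roots themselves, up
// to RecursionMaxDepth levels deep and only within the same basic block.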
18046
18047bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
18048                                                 BasicBlock *BB, BoUpSLP &R,
18049                                                 TargetTransformInfo *TTI) {
18050  SmallVector<WeakTrackingVH> PostponedInsts;
18051 bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts);
18052 Res |= tryToVectorize(PostponedInsts, R);
18053 return Res;
18054}
18055
18056bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
18057 BoUpSLP &R) {
18058 bool Res = false;
18059 for (Value *V : Insts)
18060 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
18061 Res |= tryToVectorize(Inst, R);
18062 return Res;
18063}
18064
18065bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
18066 BasicBlock *BB, BoUpSLP &R) {
18067 if (!R.canMapToVector(IVI->getType()))
18068 return false;
18069
18070 SmallVector<Value *, 16> BuildVectorOpds;
18071 SmallVector<Value *, 16> BuildVectorInsts;
18072 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
18073 return false;
18074
18075 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
18076 // Aggregate value is unlikely to be processed in vector register.
18077 return tryToVectorizeList(BuildVectorOpds, R);
18078}
18079
18080bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
18081 BasicBlock *BB, BoUpSLP &R) {
18082 SmallVector<Value *, 16> BuildVectorInsts;
18083  SmallVector<Value *, 16> BuildVectorOpds;
18084  SmallVector<int> Mask;
18085  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
18086 (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
18087 isFixedVectorShuffle(BuildVectorOpds, Mask)))
18088 return false;
18089
18090 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
18091 return tryToVectorizeList(BuildVectorInsts, R);
18092}
18093
18094template <typename T>
18095static bool tryToVectorizeSequence(
18096    SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
18097 function_ref<bool(T *, T *)> AreCompatible,
18098 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
18099 bool MaxVFOnly, BoUpSLP &R) {
18100 bool Changed = false;
18101 // Sort by type, parent, operands.
18102 stable_sort(Incoming, Comparator);
18103
18104  // Try to vectorize elements based on their type.
18105 SmallVector<T *> Candidates;
18106 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) {
18107 // Look for the next elements with the same type, parent and operand
18108 // kinds.
18109 auto *SameTypeIt = IncIt;
18110 while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
18111 ++SameTypeIt;
18112
18113 // Try to vectorize them.
18114 unsigned NumElts = (SameTypeIt - IncIt);
18115 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
18116 << NumElts << ")\n");
18117    // The vectorization is a 3-state attempt:
18118    // 1. Try to vectorize instructions with the same/alternate opcodes with the
18119    //    size of the maximal register at first.
18120    // 2. Try to vectorize remaining instructions with the same type, if
18121    //    possible. This may give better results than vectorizing only
18122    //    instructions with the same/alternate opcodes.
18123    // 3. Make a final attempt to vectorize all instructions with the
18124    //    same/alternate ops only; this may result in some extra final
18125    //    vectorization.
18126 if (NumElts > 1 &&
18127 TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) {
18128      // Success, start over because instructions might have been changed.
18129 Changed = true;
18130 } else {
18131 /// \Returns the minimum number of elements that we will attempt to
18132 /// vectorize.
18133 auto GetMinNumElements = [&R](Value *V) {
18134 unsigned EltSize = R.getVectorElementSize(V);
18135 return std::max(2U, R.getMaxVecRegSize() / EltSize);
18136 };
18137 if (NumElts < GetMinNumElements(*IncIt) &&
18138 (Candidates.empty() ||
18139 Candidates.front()->getType() == (*IncIt)->getType())) {
18140 Candidates.append(IncIt, std::next(IncIt, NumElts));
18141 }
18142 }
18143 // Final attempt to vectorize instructions with the same types.
18144 if (Candidates.size() > 1 &&
18145 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
18146 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
18147        // Success, start over because instructions might have been changed.
18148 Changed = true;
18149 } else if (MaxVFOnly) {
18150 // Try to vectorize using small vectors.
18151 for (auto *It = Candidates.begin(), *End = Candidates.end();
18152 It != End;) {
18153 auto *SameTypeIt = It;
18154 while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
18155 ++SameTypeIt;
18156 unsigned NumElts = (SameTypeIt - It);
18157 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts),
18158 /*MaxVFOnly=*/false))
18159 Changed = true;
18160 It = SameTypeIt;
18161 }
18162 }
18163 Candidates.clear();
18164 }
18165
18166 // Start over at the next instruction of a different type (or the end).
18167 IncIt = SameTypeIt;
18168 }
18169 return Changed;
18170}
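// Illustrative sketch (hypothetical example, not verbatim from this file):
// suppose Incoming holds six compatible i32 phis followed by two float phis.
// After sorting, the i32 group is offered to TryToVectorizeHelper with the
// incoming MaxVFOnly setting first; elements from groups that are too small
// (or whose first attempt failed) are accumulated in Candidates and re-tried
// once the element type changes, first as one list and then, failing that,
// compatible group by compatible group with MaxVFOnly=false so smaller vector
// factors may be used.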
18171
18172/// Compare two cmp instructions. If IsCompatibility is true, the function
18173/// returns true if the 2 cmps have the same/swapped predicates and compatible
18174/// corresponding operands. If IsCompatibility is false, the function implements
18175/// a strict weak ordering between two cmp instructions, returning true if the
18176/// first instruction is "less" than the second, i.e. its predicate is less than
18177/// the predicate of the second or the operand IDs are less than the operand IDs
18178/// of the second cmp instruction.
18179template <bool IsCompatibility>
18180static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
18181 const DominatorTree &DT) {
18182 assert(isValidElementType(V->getType()) &&
18183 isValidElementType(V2->getType()) &&
18184 "Expected valid element types only.");
18185 if (V == V2)
18186 return IsCompatibility;
18187 auto *CI1 = cast<CmpInst>(V);
18188 auto *CI2 = cast<CmpInst>(V2);
18189 if (CI1->getOperand(0)->getType()->getTypeID() <
18190 CI2->getOperand(0)->getType()->getTypeID())
18191 return !IsCompatibility;
18192 if (CI1->getOperand(0)->getType()->getTypeID() >
18193 CI2->getOperand(0)->getType()->getTypeID())
18194 return false;
18195 CmpInst::Predicate Pred1 = CI1->getPredicate();
18196  CmpInst::Predicate Pred2 = CI2->getPredicate();
18197  CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
18198  CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
18199  CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
18200 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
18201 if (BasePred1 < BasePred2)
18202 return !IsCompatibility;
18203 if (BasePred1 > BasePred2)
18204 return false;
18205 // Compare operands.
18206 bool CI1Preds = Pred1 == BasePred1;
18207 bool CI2Preds = Pred2 == BasePred1;
18208 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
18209 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
18210 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
18211 if (Op1 == Op2)
18212 continue;
18213 if (Op1->getValueID() < Op2->getValueID())
18214 return !IsCompatibility;
18215 if (Op1->getValueID() > Op2->getValueID())
18216 return false;
18217 if (auto *I1 = dyn_cast<Instruction>(Op1))
18218 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
18219 if (IsCompatibility) {
18220 if (I1->getParent() != I2->getParent())
18221 return false;
18222 } else {
18223 // Try to compare nodes with same parent.
18224 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
18225 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
18226 if (!NodeI1)
18227 return NodeI2 != nullptr;
18228 if (!NodeI2)
18229 return false;
18230 assert((NodeI1 == NodeI2) ==
18231 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18232 "Different nodes should have different DFS numbers");
18233 if (NodeI1 != NodeI2)
18234 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18235 }
18236 InstructionsState S = getSameOpcode({I1, I2}, TLI);
18237 if (S.getOpcode() && (IsCompatibility || !S.isAltShuffle()))
18238 continue;
18239 if (IsCompatibility)
18240 return false;
18241 if (I1->getOpcode() != I2->getOpcode())
18242 return I1->getOpcode() < I2->getOpcode();
18243 }
18244 }
18245 return IsCompatibility;
18246}
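// Illustrative sketch (hypothetical values, not verbatim from this file): for
//   %c1 = icmp sgt i32 %a, %b
//   %c2 = icmp slt i32 %b, %a
// both predicates normalize to the same base predicate (sgt) and %c2's
// operands are walked in swapped order, so compareCmp<true> reports the two
// compares as compatible, while compareCmp<false> treats them as equivalent in
// the strict weak order (neither is "less" than the other).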
18247
18248template <typename ItT>
18249bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
18250 BasicBlock *BB, BoUpSLP &R) {
18251 bool Changed = false;
18252 // Try to find reductions first.
18253 for (CmpInst *I : CmpInsts) {
18254 if (R.isDeleted(I))
18255 continue;
18256 for (Value *Op : I->operands())
18257 if (auto *RootOp = dyn_cast<Instruction>(Op))
18258 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI);
18259 }
18260 // Try to vectorize operands as vector bundles.
18261 for (CmpInst *I : CmpInsts) {
18262 if (R.isDeleted(I))
18263 continue;
18264 Changed |= tryToVectorize(I, R);
18265 }
18266 // Try to vectorize list of compares.
18267 // Sort by type, compare predicate, etc.
18268 auto CompareSorter = [&](Value *V, Value *V2) {
18269 if (V == V2)
18270 return false;
18271 return compareCmp<false>(V, V2, *TLI, *DT);
18272 };
18273
18274 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
18275 if (V1 == V2)
18276 return true;
18277 return compareCmp<true>(V1, V2, *TLI, *DT);
18278 };
18279
18280  SmallVector<Value *> Vals;
18281  for (Instruction *V : CmpInsts)
18282 if (!R.isDeleted(V) && isValidElementType(V->getType()))
18283 Vals.push_back(V);
18284 if (Vals.size() <= 1)
18285 return Changed;
18286 Changed |= tryToVectorizeSequence<Value>(
18287 Vals, CompareSorter, AreCompatibleCompares,
18288 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
18289 // Exclude possible reductions from other blocks.
18290 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
18291 return any_of(V->users(), [V](User *U) {
18292 auto *Select = dyn_cast<SelectInst>(U);
18293 return Select &&
18294 Select->getParent() != cast<Instruction>(V)->getParent();
18295 });
18296 });
18297 if (ArePossiblyReducedInOtherBlock)
18298 return false;
18299 return tryToVectorizeList(Candidates, R, MaxVFOnly);
18300 },
18301 /*MaxVFOnly=*/true, R);
18302 return Changed;
18303}
18304
18305bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
18306 BasicBlock *BB, BoUpSLP &R) {
18307 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
18308 "This function only accepts Insert instructions");
18309 bool OpsChanged = false;
18310 SmallVector<WeakTrackingVH> PostponedInsts;
18311 // pass1 - try to vectorize reductions only
18312 for (auto *I : reverse(Instructions)) {
18313 if (R.isDeleted(I))
18314 continue;
18315 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts);
18316 }
18317 // pass2 - try to match and vectorize a buildvector sequence.
18318 for (auto *I : reverse(Instructions)) {
18319 if (R.isDeleted(I) || isa<CmpInst>(I))
18320 continue;
18321 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
18322 OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
18323 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
18324 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
18325 }
18326 }
18327 // Now try to vectorize postponed instructions.
18328 OpsChanged |= tryToVectorize(PostponedInsts, R);
18329
18330 Instructions.clear();
18331 return OpsChanged;
18332}
18333
18334bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
18335  bool Changed = false;
18336  SmallVector<Value *, 4> Incoming;
18337  SmallPtrSet<Value *, 16> VisitedInstrs;
18338  // Maps phi nodes to the non-phi nodes found in the use tree for each phi
18339  // node. Allows us to better identify the chains that can be vectorized
18340  // most effectively.
18341  DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
18342  auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
18343    assert(isValidElementType(V1->getType()) &&
18344           isValidElementType(V2->getType()) &&
18345 "Expected vectorizable types only.");
18346    // It is fine to compare type IDs here, since we expect only vectorizable
18347    // types, like ints, floats and pointers; we don't care about other types.
18348 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
18349 return true;
18350 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
18351 return false;
18352 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
18353 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
18354 if (Opcodes1.size() < Opcodes2.size())
18355 return true;
18356 if (Opcodes1.size() > Opcodes2.size())
18357 return false;
18358 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
18359 {
18360 // Instructions come first.
18361 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
18362 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
18363 if (I1 && I2) {
18364 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
18365 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
18366 if (!NodeI1)
18367 return NodeI2 != nullptr;
18368 if (!NodeI2)
18369 return false;
18370 assert((NodeI1 == NodeI2) ==
18371 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18372 "Different nodes should have different DFS numbers");
18373 if (NodeI1 != NodeI2)
18374 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18375 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18376 if (S.getOpcode() && !S.isAltShuffle())
18377 continue;
18378 return I1->getOpcode() < I2->getOpcode();
18379 }
18380 if (I1)
18381 return true;
18382 if (I2)
18383 return false;
18384 }
18385 {
18386 // Non-undef constants come next.
18387 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
18388 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
18389 if (C1 && C2)
18390 continue;
18391 if (C1)
18392 return true;
18393 if (C2)
18394 return false;
18395 }
18396 bool U1 = isa<UndefValue>(Opcodes1[I]);
18397 bool U2 = isa<UndefValue>(Opcodes2[I]);
18398 {
18399 // Non-constant non-instructions come next.
18400 if (!U1 && !U2) {
18401 auto ValID1 = Opcodes1[I]->getValueID();
18402 auto ValID2 = Opcodes2[I]->getValueID();
18403 if (ValID1 == ValID2)
18404 continue;
18405 if (ValID1 < ValID2)
18406 return true;
18407 if (ValID1 > ValID2)
18408 return false;
18409 }
18410 if (!U1)
18411 return true;
18412 if (!U2)
18413 return false;
18414 }
18415 // Undefs come last.
18416 assert(U1 && U2 && "The only thing left should be undef & undef.");
18417 continue;
18418 }
18419 return false;
18420 };
18421 auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) {
18422 if (V1 == V2)
18423 return true;
18424 if (V1->getType() != V2->getType())
18425 return false;
18426 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
18427 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
18428 if (Opcodes1.size() != Opcodes2.size())
18429 return false;
18430 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
18431 // Undefs are compatible with any other value.
18432 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
18433 continue;
18434 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
18435 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
18436 if (I1->getParent() != I2->getParent())
18437 return false;
18438 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18439 if (S.getOpcode())
18440 continue;
18441 return false;
18442 }
18443 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
18444 continue;
18445 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
18446 return false;
18447 }
18448 return true;
18449 };
18450
18451 bool HaveVectorizedPhiNodes = false;
18452 do {
18453 // Collect the incoming values from the PHIs.
18454 Incoming.clear();
18455 for (Instruction &I : *BB) {
18456 auto *P = dyn_cast<PHINode>(&I);
18457 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
18458 break;
18459
18460 // No need to analyze deleted, vectorized and non-vectorizable
18461 // instructions.
18462 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
18463 isValidElementType(P->getType()))
18464 Incoming.push_back(P);
18465 }
18466
18467 if (Incoming.size() <= 1)
18468 break;
18469
18470 // Find the corresponding non-phi nodes for better matching when trying to
18471 // build the tree.
18472 for (Value *V : Incoming) {
18473 SmallVectorImpl<Value *> &Opcodes =
18474 PHIToOpcodes.try_emplace(V).first->getSecond();
18475 if (!Opcodes.empty())
18476 continue;
18477      SmallVector<Value *, 4> Nodes(1, V);
18478      SmallPtrSet<PHINode *, 4> Visited;
18479      while (!Nodes.empty()) {
18480 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
18481 if (!Visited.insert(PHI).second)
18482 continue;
18483 for (Value *V : PHI->incoming_values()) {
18484 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
18485 Nodes.push_back(PHI1);
18486 continue;
18487 }
18488 Opcodes.emplace_back(V);
18489 }
18490 }
18491 }
18492
18493 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
18494 Incoming, PHICompare, AreCompatiblePHIs,
18495 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
18496 return tryToVectorizeList(Candidates, R, MaxVFOnly);
18497 },
18498 /*MaxVFOnly=*/true, R);
18499 Changed |= HaveVectorizedPhiNodes;
18500 VisitedInstrs.insert(Incoming.begin(), Incoming.end());
18501 } while (HaveVectorizedPhiNodes);
18502
18503 VisitedInstrs.clear();
18504
18505 InstSetVector PostProcessInserts;
18506 SmallSetVector<CmpInst *, 8> PostProcessCmps;
18507  // Vectorizes Inserts in `PostProcessInserts` and if `VectorizeCmps` is true
18508 // also vectorizes `PostProcessCmps`.
18509 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
18510 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
18511 if (VectorizeCmps) {
18512 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
18513 PostProcessCmps.clear();
18514 }
18515 PostProcessInserts.clear();
18516 return Changed;
18517 };
18518 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
18519 auto IsInPostProcessInstrs = [&](Instruction *I) {
18520 if (auto *Cmp = dyn_cast<CmpInst>(I))
18521 return PostProcessCmps.contains(Cmp);
18522 return isa<InsertElementInst, InsertValueInst>(I) &&
18523 PostProcessInserts.contains(I);
18524 };
18525  // Returns true if `I` is an instruction without users, such as a terminator,
18526  // a store, or a function call whose return value is ignored. Only void-typed
18527  // instructions and CallInst/InvokeInst are considered.
18528 auto HasNoUsers = [](Instruction *I) {
18529 return I->use_empty() &&
18530 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
18531 };
18532 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
18533    // Skip instructions with scalable types. The number of elements is unknown
18534    // at compile time for scalable vector types.
18535 if (isa<ScalableVectorType>(It->getType()))
18536 continue;
18537
18538    // Skip instructions marked for deletion.
18539 if (R.isDeleted(&*It))
18540 continue;
18541    // We may go through BB multiple times, so skip the ones we have already checked.
18542 if (!VisitedInstrs.insert(&*It).second) {
18543 if (HasNoUsers(&*It) &&
18544 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
18545        // We would like to start over since some instructions are deleted
18546        // and the iterator may become invalid.
18547 Changed = true;
18548 It = BB->begin();
18549 E = BB->end();
18550 }
18551 continue;
18552 }
18553
18554 if (isa<DbgInfoIntrinsic>(It))
18555 continue;
18556
18557 // Try to vectorize reductions that use PHINodes.
18558 if (PHINode *P = dyn_cast<PHINode>(It)) {
18559 // Check that the PHI is a reduction PHI.
18560 if (P->getNumIncomingValues() == 2) {
18561 // Try to match and vectorize a horizontal reduction.
18562 Instruction *Root = getReductionInstr(DT, P, BB, LI);
18563 if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) {
18564 Changed = true;
18565 It = BB->begin();
18566 E = BB->end();
18567 continue;
18568 }
18569 }
18570 // Try to vectorize the incoming values of the PHI, to catch reductions
18571 // that feed into PHIs.
18572 for (unsigned I = 0, E = P->getNumIncomingValues(); I != E; I++) {
18573 // Skip if the incoming block is the current BB for now. Also, bypass
18574 // unreachable IR for efficiency and to avoid crashing.
18575 // TODO: Collect the skipped incoming values and try to vectorize them
18576 // after processing BB.
18577 if (BB == P->getIncomingBlock(I) ||
18578 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
18579 continue;
18580
18581 // Postponed instructions should not be vectorized here, delay their
18582 // vectorization.
18583 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
18584 PI && !IsInPostProcessInstrs(PI))
18585 Changed |= vectorizeRootInstruction(nullptr, PI,
18586 P->getIncomingBlock(I), R, TTI);
18587 }
18588 continue;
18589 }
18590
18591 if (HasNoUsers(&*It)) {
18592 bool OpsChanged = false;
18593 auto *SI = dyn_cast<StoreInst>(It);
18594 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
18595 if (SI) {
18596 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
18597          // Try to vectorize the chain in the store, if this is the only
18598          // store to the address in the block.
18599          // TODO: This is just a temporary solution to save compile time. Need
18600          // to investigate if we can safely turn on slp-vectorize-hor-store
18601          // instead to allow lookup for reduction chains in all non-vectorized
18602          // stores (need to check side effects and compile time).
18603 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
18604 SI->getValueOperand()->hasOneUse();
18605 }
18606 if (TryToVectorizeRoot) {
18607 for (auto *V : It->operand_values()) {
18608 // Postponed instructions should not be vectorized here, delay their
18609 // vectorization.
18610 if (auto *VI = dyn_cast<Instruction>(V);
18611 VI && !IsInPostProcessInstrs(VI))
18612 // Try to match and vectorize a horizontal reduction.
18613 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
18614 }
18615 }
18616      // Start vectorization of the post-process list of instructions from the
18617      // top-tree instructions, to try to vectorize as many instructions as
18618      // possible.
18619 OpsChanged |=
18620 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
18621 if (OpsChanged) {
18622        // We would like to start over since some instructions are deleted
18623        // and the iterator may become invalid.
18624 Changed = true;
18625 It = BB->begin();
18626 E = BB->end();
18627 continue;
18628 }
18629 }
18630
18631 if (isa<InsertElementInst, InsertValueInst>(It))
18632 PostProcessInserts.insert(&*It);
18633 else if (isa<CmpInst>(It))
18634 PostProcessCmps.insert(cast<CmpInst>(&*It));
18635 }
18636
18637 return Changed;
18638}
18639
18640bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
18641 auto Changed = false;
18642 for (auto &Entry : GEPs) {
18643 // If the getelementptr list has fewer than two elements, there's nothing
18644 // to do.
18645 if (Entry.second.size() < 2)
18646 continue;
18647
18648 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
18649 << Entry.second.size() << ".\n");
18650
18651 // Process the GEP list in chunks suitable for the target's supported
18652 // vector size. If a vector register can't hold 1 element, we are done. We
18653 // are trying to vectorize the index computations, so the maximum number of
18654 // elements is based on the size of the index expression, rather than the
18655 // size of the GEP itself (the target's pointer size).
18656 unsigned MaxVecRegSize = R.getMaxVecRegSize();
18657 unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
18658 if (MaxVecRegSize < EltSize)
18659 continue;
18660
18661 unsigned MaxElts = MaxVecRegSize / EltSize;
18662 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
18663 auto Len = std::min<unsigned>(BE - BI, MaxElts);
18664 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
18665
18666      // Initialize a set of candidate getelementptrs. Note that we use a
18667 // SetVector here to preserve program order. If the index computations
18668 // are vectorizable and begin with loads, we want to minimize the chance
18669 // of having to reorder them later.
18670 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
18671
18672 // Some of the candidates may have already been vectorized after we
18673      // initially collected them, or their index was optimized to a constant value.
18674 // If so, they are marked as deleted, so remove them from the set of
18675 // candidates.
18676 Candidates.remove_if([&R](Value *I) {
18677 return R.isDeleted(cast<Instruction>(I)) ||
18678 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
18679 });
18680
18681 // Remove from the set of candidates all pairs of getelementptrs with
18682 // constant differences. Such getelementptrs are likely not good
18683 // candidates for vectorization in a bottom-up phase since one can be
18684 // computed from the other. We also ensure all candidate getelementptr
18685 // indices are unique.
18686 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
18687 auto *GEPI = GEPList[I];
18688 if (!Candidates.count(GEPI))
18689 continue;
18690 auto *SCEVI = SE->getSCEV(GEPList[I]);
18691 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
18692 auto *GEPJ = GEPList[J];
18693 auto *SCEVJ = SE->getSCEV(GEPList[J]);
18694 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
18695 Candidates.remove(GEPI);
18696 Candidates.remove(GEPJ);
18697 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
18698 Candidates.remove(GEPJ);
18699 }
18700 }
18701 }
18702
18703 // We break out of the above computation as soon as we know there are
18704 // fewer than two candidates remaining.
18705 if (Candidates.size() < 2)
18706 continue;
18707
18708 // Add the single, non-constant index of each candidate to the bundle. We
18709 // ensured the indices met these constraints when we originally collected
18710 // the getelementptrs.
18711 SmallVector<Value *, 16> Bundle(Candidates.size());
18712 auto BundleIndex = 0u;
18713 for (auto *V : Candidates) {
18714 auto *GEP = cast<GetElementPtrInst>(V);
18715 auto *GEPIdx = GEP->idx_begin()->get();
18716 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
18717 Bundle[BundleIndex++] = GEPIdx;
18718 }
18719
18720 // Try and vectorize the indices. We are currently only interested in
18721 // gather-like cases of the form:
18722 //
18723 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
18724 //
18725 // where the loads of "a", the loads of "b", and the subtractions can be
18726 // performed in parallel. It's likely that detecting this pattern in a
18727 // bottom-up phase will be simpler and less costly than building a
18728 // full-blown top-down phase beginning at the consecutive loads.
18729 Changed |= tryToVectorizeList(Bundle, R);
18730 }
18731 }
18732 return Changed;
18733}
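// Illustrative sketch (hypothetical example, not verbatim from this file): for
// a gather such as
//   s = g[a[0] - b[0]] + g[a[1] - b[1]] + g[a[2] - b[2]] + g[a[3] - b[3]];
// each g[...] access is a getelementptr with a single non-constant index (the
// corresponding subtraction). Those four index values form the Bundle handed
// to tryToVectorizeList, so the loads of a, the loads of b and the
// subtractions can become vector operations even though the gathered loads of
// g themselves remain scalar here.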
18734
18735bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
18736 bool Changed = false;
18737  // Sort by type, base pointer and value operand. Value operands must be
18738 // compatible (have the same opcode, same parent), otherwise it is
18739 // definitely not profitable to try to vectorize them.
18740 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
18741 if (V->getValueOperand()->getType()->getTypeID() <
18742 V2->getValueOperand()->getType()->getTypeID())
18743 return true;
18744 if (V->getValueOperand()->getType()->getTypeID() >
18745 V2->getValueOperand()->getType()->getTypeID())
18746 return false;
18747 if (V->getPointerOperandType()->getTypeID() <
18748 V2->getPointerOperandType()->getTypeID())
18749 return true;
18750 if (V->getPointerOperandType()->getTypeID() >
18751 V2->getPointerOperandType()->getTypeID())
18752 return false;
18753 // UndefValues are compatible with all other values.
18754 if (isa<UndefValue>(V->getValueOperand()) ||
18755 isa<UndefValue>(V2->getValueOperand()))
18756 return false;
18757 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
18758 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
18759        DomTreeNodeBase<BasicBlock> *NodeI1 =
18760            DT->getNode(I1->getParent());
18761        DomTreeNodeBase<BasicBlock> *NodeI2 =
18762            DT->getNode(I2->getParent());
18763 assert(NodeI1 && "Should only process reachable instructions");
18764 assert(NodeI2 && "Should only process reachable instructions");
18765 assert((NodeI1 == NodeI2) ==
18766 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
18767 "Different nodes should have different DFS numbers");
18768 if (NodeI1 != NodeI2)
18769 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
18770 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18771 if (S.getOpcode())
18772 return false;
18773 return I1->getOpcode() < I2->getOpcode();
18774 }
18775 if (isa<Constant>(V->getValueOperand()) &&
18776 isa<Constant>(V2->getValueOperand()))
18777 return false;
18778 return V->getValueOperand()->getValueID() <
18779 V2->getValueOperand()->getValueID();
18780 };
18781
18782 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
18783 if (V1 == V2)
18784 return true;
18785 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
18786 return false;
18787 if (V1->getPointerOperandType() != V2->getPointerOperandType())
18788 return false;
18789 // Undefs are compatible with any other value.
18790 if (isa<UndefValue>(V1->getValueOperand()) ||
18791 isa<UndefValue>(V2->getValueOperand()))
18792 return true;
18793 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
18794 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
18795 if (I1->getParent() != I2->getParent())
18796 return false;
18797 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
18798 return S.getOpcode() > 0;
18799 }
18800 if (isa<Constant>(V1->getValueOperand()) &&
18801 isa<Constant>(V2->getValueOperand()))
18802 return true;
18803 return V1->getValueOperand()->getValueID() ==
18804 V2->getValueOperand()->getValueID();
18805 };
18806
18807  // Attempt to sort and vectorize each of the store-groups.
18808  DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>> Attempted;
18809  for (auto &Pair : Stores) {
18810 if (Pair.second.size() < 2)
18811 continue;
18812
18813 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
18814 << Pair.second.size() << ".\n");
18815
18816 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
18817 continue;
18818
18819    // Reverse the stores to do bottom-to-top analysis. This is important if
18820    // values are stored to the same address several times; in that case we
18821    // need to follow the store order (reversed to respect memory dependencies).
18822 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
18823 Pair.second.rend());
18824 Changed |= tryToVectorizeSequence<StoreInst>(
18825 ReversedStores, StoreSorter, AreCompatibleStores,
18826 [&](ArrayRef<StoreInst *> Candidates, bool) {
18827 return vectorizeStores(Candidates, R, Attempted);
18828 },
18829 /*MaxVFOnly=*/false, R);
18830 }
18831 return Changed;
18832}
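// Illustrative sketch (hypothetical example, not verbatim from this file):
// AreCompatibleStores above would accept
//   store i32 %add0, ptr %arr.0    ; %add0 = add i32 %x0, %y0
//   store i32 %add1, ptr %arr.1    ; %add1 = add i32 %x1, %y1
// (two stores into the same underlying array %arr) as members of one candidate
// group, because the stored values have the same type and the same opcode in
// the same block, while a store of a float value, or of a value defined in
// another block, would start a new group.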
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
amdgpu AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:537
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition: IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Definition: LICM.cpp:1499
Loop::LoopBounds::Direction Direction
Definition: LoopInfo.cpp:230
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
Module.h This file contains the declarations for the Module class.
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(VerifyEach)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static cl::opt< bool > AllowHorRdxIdenityOptimization("slp-optimize-identity-hor-reduction-ops", cl::init(true), cl::Hidden, cl::desc("Allow optimization of original scalar identity operations on " "matched horizontal reductions."))
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static Value * isOneOf(const InstructionsState &S, Value *Op)
Chooses the correct key for scheduling data.
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
#define SV_NAME
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static std::string shortBundleName(ArrayRef< Value * > VL)
Print a short descriptor of the instruction bundle suitable for debug output.
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static std::optional< unsigned > getInsertIndex(const Value *InsertInst, unsigned Offset=0)
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static bool isValidForAlternation(unsigned Opcode)
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt)
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI, unsigned BaseIndex=0)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:167
This pass exposes codegen information to IR-level passes.
This defines the Use class.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition: VPlanSLP.cpp:154
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
InstructionCost finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:76
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1308
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1636
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:178
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:264
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:321
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:492
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:473
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition: ArrayRef.h:187
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:174
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:228
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:204
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:168
iterator end() const
Definition: ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:210
iterator begin() const
Definition: ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:195
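A short sketch of the ArrayRef slicing helpers listed above; the data is illustrative only:

#include "llvm/ADT/ArrayRef.h"
using namespace llvm;

void arrayRefSketch() {
  int Data[] = {1, 2, 3, 4, 5};
  ArrayRef<int> VL(Data);

  ArrayRef<int> Head = VL.take_front(2); // {1, 2}
  ArrayRef<int> Tail = VL.drop_front(2); // {3, 4, 5}
  ArrayRef<int> Mid  = VL.slice(1, 3);   // {2, 3, 4}

  bool SameFront = Head.equals(VL.take_front(2));          // element-wise equality
  bool Sane      = !VL.empty() && VL.front() == 1 && VL.back() == 5;
  (void)Tail; (void)Mid; (void)SameFront; (void)Sane;
}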
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:194
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
iterator end()
Definition: BasicBlock.h:443
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:430
InstListType::const_iterator getFirstNonPHIIt() const
Iterator returning form of getFirstNonPHI.
Definition: BasicBlock.cpp:367
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:167
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
reverse_iterator rend()
Definition: BasicBlock.h:448
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:165
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:221
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:70
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1494
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:2338
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:2233
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1742
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition: InstrTypes.h:2475
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:2332
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1687
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1600
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1678
unsigned arg_size() const
Definition: InstrTypes.h:1685
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1871
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:2329
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:601
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:983
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:1362
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:1022
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:1023
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:1017
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:1016
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:1020
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:1018
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:1021
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:1019
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition: InstrTypes.h:1167
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:1129
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:1105
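These predicate helpers also have static forms. A small sketch of swapping a predicate (for commuted operands) versus inverting it (for a negated condition); the starting predicate is arbitrary:

#include "llvm/IR/InstrTypes.h"
using namespace llvm;

void predicateSketch() {
  CmpInst::Predicate P = CmpInst::ICMP_SLT;

  // Use when the two compare operands are exchanged: slt -> sgt.
  CmpInst::Predicate Swapped = CmpInst::getSwappedPredicate(P);

  // Use when the condition itself is negated: slt -> sge.
  CmpInst::Predicate Inverted = CmpInst::getInversePredicate(P);

  (void)Swapped; (void)Inverted;
}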
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2126
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:154
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1449
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1398
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:417
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:484
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
Definition: DataLayout.cpp:905
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:672
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:101
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:202
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:155
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&... Args)
Definition: DenseMap.h:235
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:151
iterator end()
Definition: DenseMap.h:84
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:145
value_type & FindAndConstruct(const KeyT &Key)
Definition: DenseMap.h:348
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:220
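A hedged sketch of the DenseMap accessors listed above; the map's purpose and key here are made up for illustration:

#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/Value.h"
using namespace llvm;

void denseMapSketch(Value *Scalar) {
  DenseMap<Value *, unsigned> LaneOf;

  // try_emplace only inserts if the key is not present yet.
  auto [It, Inserted] = LaneOf.try_emplace(Scalar, 0u);

  // lookup returns a default-constructed value (0 here) for missing keys.
  unsigned Lane = LaneOf.lookup(Scalar);

  bool Known = LaneOf.contains(Scalar) && LaneOf.count(Scalar) == 1;
  (void)It; (void)Inserted; (void)Lane; (void)Known;
}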
Implements a dense probed hash-table based set.
Definition: DenseSet.h:271
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:321
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:308
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
void set()
Definition: FMF.h:62
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:539
unsigned getNumElements() const
Definition: DerivedTypes.h:582
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:130
Type * getReturnType() const
Definition: DerivedTypes.h:124
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:973
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:94
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2257
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:921
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:511
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2460
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:539
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2265
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1807
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:466
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:932
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1091
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:175
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2535
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:311
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:220
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition: IRBuilder.h:848
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1753
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:486
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2366
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2397
Value * CreateICmpUGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2249
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2494
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:471
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1666
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:169
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2273
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2161
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2196
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1826
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2412
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1866
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1587
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1361
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:630
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
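A condensed sketch of the IRBuilderBase calls listed above, emitting a compare, a select, and a two-source shuffle at the end of a block; the value names are made up for illustration:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

Value *irBuilderSketch(BasicBlock *BB, Value *A, Value *B,
                       Value *VecA, Value *VecB) {
  // Constructing the builder from a block appends new instructions at its end.
  IRBuilder<> Builder(BB);

  // icmp slt + select, mirroring a scalar min pattern.
  Value *Cond = Builder.CreateICmpSLT(A, B, "cmp");
  Value *Min  = Builder.CreateSelect(Cond, A, B, "min");

  // A two-source shuffle with an explicit constant mask.
  Value *Shuf = Builder.CreateShuffleVector(VecA, VecB, {0, 4, 1, 5}, "shuf");

  (void)Min;
  return Shuf;
}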
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
Definition: Instruction.h:260
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:742
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:454
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isBinaryOp() const
Definition: Instruction.h:257
const BasicBlock * getParent() const
Definition: Instruction.h:152
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:252
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
Definition: Instruction.h:258
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
An instruction for reading from memory.
Definition: Instructions.h:184
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:286
Value * getPointerOperand()
Definition: Instructions.h:280
bool isSimple() const
Definition: Instructions.h:272
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:236
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type count(const KeyT &Key) const
Definition: MapVector.h:165
iterator end()
Definition: MapVector.h:71
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition: MapVector.h:55
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition: MapVector.h:118
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
void reserve(size_type NumEntries)
Grow the MapVector so that it can contain at least NumEntries items before resizing again.
Definition: MapVector.h:64
size_type size() const
Definition: MapVector.h:60
std::pair< KeyT, ValueT > & front()
Definition: MapVector.h:83
void clear()
Definition: MapVector.h:88
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:307
T & front() const
front - Get the first element.
Definition: ArrayRef.h:363
iterator end() const
Definition: ArrayRef.h:357
iterator begin() const
Definition: ArrayRef.h:356
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:376
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
Definition: ArrayRef.h:449
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition: PointerUnion.h:142
T get() const
Returns the value of the specified pointer type.
Definition: PointerUnion.h:155
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:162
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1827
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:109
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:115
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:144
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
Definition: SetVector.h:264
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents a "clustered" mask of size VF, i.e.
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
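The static mask classifiers above take the mask as plain integers plus the number of source elements. A quick sketch with hand-written masks; the values are illustrative:

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

void maskSketch() {
  SmallVector<int> Reverse = {3, 2, 1, 0};
  SmallVector<int> Splat   = {0, 0, 0, 0};

  bool IsReverse  = ShuffleVectorInst::isReverseMask(Reverse, /*NumSrcElts=*/4);     // true
  bool IsSplat    = ShuffleVectorInst::isZeroEltSplatMask(Splat, /*NumSrcElts=*/4);  // true
  bool IsIdentity = ShuffleVectorInst::isIdentityMask(Reverse, /*NumSrcElts=*/4);    // false

  int Index = 0;
  bool IsExtract =
      ShuffleVectorInst::isExtractSubvectorMask(Reverse, /*NumSrcElts=*/4, Index);

  (void)IsReverse; (void)IsSplat; (void)IsIdentity; (void)IsExtract;
}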
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
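A small sketch of the SmallBitVector queries listed above; the single-bit set(Idx) overload used below also exists alongside the set-all set() shown in the listing, and the lane semantics are invented for illustration:

#include "llvm/ADT/SmallBitVector.h"
using namespace llvm;

void bitVectorSketch() {
  SmallBitVector UsedLanes(8); // 8 bits, initially all clear

  UsedLanes.set(1);
  UsedLanes.set(5);

  bool Lane1  = UsedLanes.test(1);       // true
  int First   = UsedLanes.find_first();  // 1
  int Next    = UsedLanes.find_next(1);  // 5
  bool AnySet = UsedLanes.any();         // true
  bool AllSet = UsedLanes.all();         // false
  unsigned N  = UsedLanes.count();       // 2

  UsedLanes.reset();                     // clear all bits again
  (void)Lane1; (void)First; (void)Next; (void)AnySet; (void)AllSet; (void)N;
}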
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:290
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
bool erase(PtrType Ptr)
erase - If the set contains the specified pointer, remove it and return true, otherwise return false.
Definition: SmallPtrSet.h:356
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
iterator end() const
Definition: SmallPtrSet.h:385
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
iterator begin() const
Definition: SmallPtrSet.h:380
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:366
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:166
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:236
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
size_type size() const
Definition: SmallSet.h:161
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:717
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void reserve(size_type N)
Definition: SmallVector.h:676
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void swap(SmallVectorImpl &RHS)
Definition: SmallVector.h:981
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:317
Type * getPointerOperandType() const
Definition: Instructions.h:420
Value * getValueOperand()
Definition: Instructions.h:414
Value * getPointerOperand()
Definition: Instructions.h:417
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
TargetFolder - Create constants with target dependent folding.
Definition: TargetFolder.h:34
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask=std::nullopt, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr) const
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
OperandValueKind
Additional information about an operand's possible values.
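A hedged sketch of how these TTI cost queries are typically combined into a vector-versus-scalar comparison; the opcode and types are arbitrary, and the resulting numbers depend entirely on the target:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

InstructionCost costSketch(const TargetTransformInfo &TTIRef, Type *ScalarTy,
                           unsigned NumElts) {
  constexpr auto CostKind = TargetTransformInfo::TCK_RecipThroughput;
  auto *VecTy = FixedVectorType::get(ScalarTy, NumElts);

  // Cost of one vector add versus NumElts scalar adds.
  InstructionCost VecCost =
      TTIRef.getArithmeticInstrCost(Instruction::Add, VecTy, CostKind);
  InstructionCost ScalarCost =
      TTIRef.getArithmeticInstrCost(Instruction::Add, ScalarTy, CostKind) * NumElts;

  // Plus the cost of reversing the vector, as one example of a shuffle query.
  InstructionCost ShufCost =
      TTIRef.getShuffleCost(TargetTransformInfo::SK_Reverse, VecTy);

  return VecCost + ShufCost - ScalarCost; // negative means the vector form wins
}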
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:160
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:234
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:287
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:166
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:137
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1808
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:242
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
User(Type *ty, unsigned vty, Use *, unsigned NumOps)
Definition: User.h:73
op_iterator op_begin()
Definition: User.h:234
Value * getOperand(unsigned i) const
Definition: User.h:169
iterator_range< value_op_iterator > operand_values()
Definition: User.h:266
The Vector Function Database.
Definition: VectorUtils.h:29
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:70
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:532
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition: Value.cpp:153
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1074
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:676
Type * getElementType() const
Definition: DerivedTypes.h:436
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:206
size_type size() const
Definition: DenseSet.h:81
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:185
bool erase(const ValueT &V)
Definition: DenseSet.h:101
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:97
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:199
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:74
self_iterator getIterator()
Definition: ilist_node.h:109
CRTP base class for adapting an iterator to a different type.
Definition: iterator.h:237
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:660
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:690
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads by increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals=std::nullopt)
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values has already been checked for vectorization.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to the list of values already checked for vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
void analyzedReductionRoot(Instruction *I)
Register the given instruction as already analyzed as a possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
MapVector< Value *, SmallVector< Instruction *, 2 > > ExtraValueToDebugLocsMap
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter, or just a simple gather.
SmallPtrSet< Value *, 16 > ValueSet
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed as a possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if any of the given values is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return index into Candidates for a pair which have highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
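BoUpSLP is local to this file, but the methods listed above outline the driver sequence the pass follows. Below is a condensed, hedged sketch of that flow; it assumes R is an already-constructed BoUpSLP, Roots is a candidate bundle of scalars, and CostThreshold plays the role of the pass's cost threshold (the exact ordering in the pass has additional steps and checks):

// Sketch only: BoUpSLP is internal to SLPVectorizer.cpp, so this mirrors how
// the pass itself drives it rather than a public API.
static bool tryVectorizeBundle(BoUpSLP &R, ArrayRef<Value *> Roots,
                               int CostThreshold) {
  R.buildTree(Roots);
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;

  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.transformNodes();
  R.buildExternalUses();

  R.computeMinimumValueSizes();
  InstructionCost Cost = R.getTreeCost();
  if (!Cost.isValid() || Cost >= -CostThreshold)
    return false;

  R.vectorizeTree();          // emit the vector code for the tree
  R.optimizeGatherSequence(); // LICM/CSE over the emitted gathers
  return true;
}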
Function * getVectorizedFunction(const VFShape &Shape) const
Definition: VectorUtils.h:103
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Key
PAL metadata keys.
@ HorizontalReduction
Definition: ARMBaseInfo.h:425
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:121
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1471
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:816
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:875
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:239
@ Undef
Value of the register doesn't matter.
ManagedStatic< cl::opt< FnT >, OptCreatorT > Action
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:227
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition: DWP.cpp:456
void stable_sort(R &&Range)
Definition: STLExtras.h:1995
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1742
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1715
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:128
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:950
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:540
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &DL, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:428
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
Value * createSimpleTargetReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a target reduction of the given vector.
Definition: LoopUtils.cpp:1154
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2406
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:40
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7060
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
bool getAlign(const Function &F, unsigned index, unsigned &align)
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
iterator_range< po_iterator< T > > post_order(const T &G)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:372
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2059
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1928
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:324
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:275
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1647
auto find_if_not(R &&Range, UnaryPredicate P)
Definition: STLExtras.h:1754
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:116
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:419
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition: LoopUtils.cpp:1223
constexpr int PoisonMaskElem
@ Other
Any other memory.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
TargetTransformInfo TTI
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:34
@ Or
Bitwise or logical OR of integers.
@ None
Not a recurrence.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1914
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
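A usage sketch (function name hypothetical, assuming V has a fixed-width integer type): size the KnownBits to the value's bit width, run the analysis, then query individual bits.

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/Value.h"
  #include "llvm/Support/KnownBits.h"

  bool lowBitKnownZero(const llvm::Value *V, const llvm::DataLayout &DL) {
    // Known must be sized to the bit width of V's type.
    llvm::KnownBits Known(DL.getTypeSizeInBits(V->getType()).getFixedValue());
    llvm::computeKnownBits(V, Known, DL);
    return Known.Zero[0]; // true if the least significant bit is proven zero
  }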
DWARFExpression::Operation Op
auto max_element(R &&Range)
Definition: STLExtras.h:1986
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition: GraphWriter.h:427
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1824
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if the instruction does not have any effects besides calculating the result and does not ...
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:191
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1921
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:565
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1749
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1879
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx)
Identifies if the vector form of the intrinsic has a scalar operand.
InstructionCost Cost
uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew=0)
Returns the largest uint64_t less than or equal to Value that is Skew mod Align.
Definition: MathExtras.h:439
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:613
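A minimal sketch of combining heterogeneous values into one hash_code (names are hypothetical):

  #include "llvm/ADT/Hashing.h"
  #include "llvm/ADT/StringRef.h"

  llvm::hash_code hashPair(llvm::StringRef Name, unsigned Index) {
    // Order matters: hash_combine(A, B) and hash_combine(B, A) generally differ.
    return llvm::hash_combine(Name, Index);
  }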
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:46
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:491
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Used to keep track of an operand bundle.
Definition: InstrTypes.h:2496
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:70
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition: KnownBits.h:104
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:220
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1450
Function object to check whether the second component of a container supported by std::get (like std:...
Definition: STLExtras.h:1459
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.
bool operator==(const EdgeInfo &Other) const