SLPVectorizer.cpp
1//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
10// stores that can be put together into vector-stores. Next, it attempts to
11// construct a vectorizable tree using the use-def chains. If a profitable tree
12// is found, the SLP vectorizer performs vectorization on the tree.
13//
14// The pass is inspired by the work described in the paper:
15// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
16//
17//===----------------------------------------------------------------------===//
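//
// Illustrative sketch (not part of the original header; names are made up):
// given consecutive scalar stores such as
//   store float %a0, ptr %p
//   %p1 = getelementptr inbounds float, ptr %p, i64 1
//   store float %a1, ptr %p1
// the pass builds a tree from the stored values' use-def chains and, if the
// cost model deems it profitable, emits a single <2 x float> vector store.
//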
18
20#include "llvm/ADT/DenseMap.h"
21#include "llvm/ADT/DenseSet.h"
23#include "llvm/ADT/STLExtras.h"
24#include "llvm/ADT/ScopeExit.h"
26#include "llvm/ADT/SetVector.h"
29#include "llvm/ADT/SmallSet.h"
31#include "llvm/ADT/Statistic.h"
32#include "llvm/ADT/iterator.h"
51#include "llvm/IR/Attributes.h"
52#include "llvm/IR/BasicBlock.h"
53#include "llvm/IR/Constant.h"
54#include "llvm/IR/Constants.h"
55#include "llvm/IR/DataLayout.h"
57#include "llvm/IR/Dominators.h"
58#include "llvm/IR/Function.h"
59#include "llvm/IR/IRBuilder.h"
60#include "llvm/IR/InstrTypes.h"
61#include "llvm/IR/Instruction.h"
64#include "llvm/IR/Intrinsics.h"
65#include "llvm/IR/Module.h"
66#include "llvm/IR/Operator.h"
68#include "llvm/IR/Type.h"
69#include "llvm/IR/Use.h"
70#include "llvm/IR/User.h"
71#include "llvm/IR/Value.h"
72#include "llvm/IR/ValueHandle.h"
73#ifdef EXPENSIVE_CHECKS
74#include "llvm/IR/Verifier.h"
75#endif
76#include "llvm/Pass.h"
81#include "llvm/Support/Debug.h"
93#include <algorithm>
94#include <cassert>
95#include <cstdint>
96#include <iterator>
97#include <memory>
98#include <optional>
99#include <set>
100#include <string>
101#include <tuple>
102#include <utility>
103
104using namespace llvm;
105using namespace llvm::PatternMatch;
106using namespace slpvectorizer;
107using namespace std::placeholders;
108
109#define SV_NAME "slp-vectorizer"
110#define DEBUG_TYPE "SLP"
111
112STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
113
114DEBUG_COUNTER(VectorizedGraphs, "slp-vectorized",
115 "Controls which SLP graphs should be vectorized.");
116
117static cl::opt<bool>
118 RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
119 cl::desc("Run the SLP vectorization passes"));
120
121static cl::opt<bool>
122 SLPReVec("slp-revec", cl::init(false), cl::Hidden,
123 cl::desc("Enable vectorization for wider vector utilization"));
124
125static cl::opt<int>
127 cl::desc("Only vectorize if you gain more than this "
128 "number "));
129
131 "slp-skip-early-profitability-check", cl::init(false), cl::Hidden,
132 cl::desc("When true, SLP vectorizer bypasses profitability checks based on "
133 "heuristics and makes vectorization decision via cost modeling."));
134
135static cl::opt<bool>
136ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
137 cl::desc("Attempt to vectorize horizontal reductions"));
138
140 "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
141 cl::desc(
142 "Attempt to vectorize horizontal reductions feeding into a store"));
143
144static cl::opt<int>
146 cl::desc("Attempt to vectorize for this register size in bits"));
147
150 cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
151
152/// Limits the size of scheduling regions in a block.
153/// It avoids long compile times for _very_ large blocks where vector
154/// instructions are spread over a wide range.
155/// This limit is way higher than needed by real-world functions.
156static cl::opt<int>
157ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
158 cl::desc("Limit the size of the SLP scheduling region per block"));
159
161 "slp-min-reg-size", cl::init(128), cl::Hidden,
162 cl::desc("Attempt to vectorize for this register size in bits"));
163
165 "slp-recursion-max-depth", cl::init(12), cl::Hidden,
166 cl::desc("Limit the recursion depth when building a vectorizable tree"));
167
169 "slp-min-tree-size", cl::init(3), cl::Hidden,
170 cl::desc("Only vectorize small trees if they are fully vectorizable"));
171
172// The maximum depth that the look-ahead score heuristic will explore.
173// The higher this value, the higher the compilation time overhead.
175 "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
176 cl::desc("The maximum look-ahead depth for operand reordering scores"));
177
178// The maximum depth that the look-ahead score heuristic will explore
179// when probing among candidates for vectorization tree roots.
180// The higher this value, the higher the compilation time overhead, but unlike
181// the similar limit for operand ordering this is used less frequently, hence
182// the impact of a higher value is less noticeable.
184 "slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden,
185 cl::desc("The maximum look-ahead depth for searching best rooting option"));
186
188 "slp-min-strided-loads", cl::init(2), cl::Hidden,
189 cl::desc("The minimum number of loads, which should be considered strided, "
190 "if the stride is > 1 or is runtime value"));
191
193 "slp-max-stride", cl::init(8), cl::Hidden,
194 cl::desc("The maximum stride, considered to be profitable."));
195
196static cl::opt<bool>
197 ViewSLPTree("view-slp-tree", cl::Hidden,
198 cl::desc("Display the SLP trees with Graphviz"));
199
201 "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden,
202 cl::desc("Try to vectorize with non-power-of-2 number of elements."));
203
204// Limit the number of alias checks. The limit is chosen so that
205// it has no negative effect on the llvm benchmarks.
206static const unsigned AliasedCheckLimit = 10;
207
208// Limit on the number of uses for potentially transformed instructions/values,
209// used in checks to avoid compile-time explosion.
210static constexpr int UsesLimit = 64;
211
212// Another limit for the alias checks: The maximum distance between load/store
213// instructions where alias checks are done.
214// This limit is useful for very large basic blocks.
215static const unsigned MaxMemDepDistance = 160;
216
217/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
218/// regions to be handled.
219static const int MinScheduleRegionSize = 16;
220
221/// Maximum allowed number of operands in the PHI nodes.
222static const unsigned MaxPHINumOperands = 128;
223
224/// Predicate for the element types that the SLP vectorizer supports.
225///
226/// The most important things to filter here are types which are invalid in LLVM
227/// vectors. We also filter target specific types which have absolutely no
228/// meaningful vectorization path such as x86_fp80 and ppc_fp128. This just
229/// avoids spending time checking the cost model and realizing that they will
230/// be inevitably scalarized.
231static bool isValidElementType(Type *Ty) {
232 // TODO: Support ScalableVectorType.
233 if (SLPReVec && isa<FixedVectorType>(Ty))
234 Ty = Ty->getScalarType();
235 return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
236 !Ty->isPPC_FP128Ty();
237}
238
239/// Returns the type of the given value/instruction \p V. If it is a store,
240/// returns the type of its value operand, for Cmp - the type of the compare
241/// operands and for insertelement - the type of the inserted operand.
242/// Otherwise, just the type of the value is returned.
244 if (auto *SI = dyn_cast<StoreInst>(V))
245 return SI->getValueOperand()->getType();
246 if (auto *CI = dyn_cast<CmpInst>(V))
247 return CI->getOperand(0)->getType();
248 if (auto *IE = dyn_cast<InsertElementInst>(V))
249 return IE->getOperand(1)->getType();
250 return V->getType();
251}
252
253/// \returns the number of elements for Ty.
254static unsigned getNumElements(Type *Ty) {
255 assert(!isa<ScalableVectorType>(Ty) &&
256 "ScalableVectorType is not supported.");
257 if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
258 return VecTy->getNumElements();
259 return 1;
260}
261
262/// \returns the vector type of ScalarTy based on vectorization factor.
263static FixedVectorType *getWidenedType(Type *ScalarTy, unsigned VF) {
264 return FixedVectorType::get(ScalarTy->getScalarType(),
265 VF * getNumElements(ScalarTy));
266}
267
268/// Returns the number of elements of the given type \p Ty, not less than \p Sz,
269/// which forms a type that \p TTI splits into whole vector types during
270/// legalization.
272 Type *Ty, unsigned Sz) {
273 if (!isValidElementType(Ty))
274 return bit_ceil(Sz);
275 // Find the number of elements, which forms full vectors.
276 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
277 if (NumParts == 0 || NumParts >= Sz)
278 return bit_ceil(Sz);
279 return bit_ceil(divideCeil(Sz, NumParts)) * NumParts;
280}
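// Worked example (illustrative, assuming a target where TTI reports 2 parts
// for <6 x i32>): with Ty = i32 and Sz = 6, NumParts = 2, so the rounding
// above yields bit_ceil(divideCeil(6, 2)) * 2 = bit_ceil(3) * 2 = 4 * 2 = 8,
// i.e. 6 elements are rounded up to 8 to fill whole registers.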
281
282/// Returns the number of elements of the given type \p Ty, not greater than \p
283/// Sz, which forms a type that \p TTI splits into whole vector types during
284/// legalization.
285static unsigned
287 unsigned Sz) {
288 if (!isValidElementType(Ty))
289 return bit_floor(Sz);
290 // Find the number of elements, which forms full vectors.
291 unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
292 if (NumParts == 0 || NumParts >= Sz)
293 return bit_floor(Sz);
294 unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
295 if (RegVF > Sz)
296 return bit_floor(Sz);
297 return (Sz / RegVF) * RegVF;
298}
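// Worked example (illustrative, same assumption of 2 parts for <6 x i32>):
// with Sz = 6, RegVF = bit_ceil(divideCeil(6, 2)) = 4 <= 6, so the floor
// variant above returns (6 / 4) * 4 = 4, keeping only the elements that fill
// whole registers.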
299
300static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements,
301 SmallVectorImpl<int> &Mask) {
302 // The ShuffleBuilder implementation uses shufflevector to splat an "element".
303 // But the element has a different meaning for SLP (scalar) and REVEC
304 // (vector). We need to expand Mask into masks which shufflevector can use
305 // directly.
306 SmallVector<int> NewMask(Mask.size() * VecTyNumElements);
307 for (unsigned I : seq<unsigned>(Mask.size()))
308 for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
309 I * VecTyNumElements, VecTyNumElements)))
310 MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
311 : Mask[I] * VecTyNumElements + J;
312 Mask.swap(NewMask);
313}
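// Worked example (illustrative): with VecTyNumElements = 2 and a scalar mask
// {1, 0}, each scalar index is expanded to a run of vector-element indices,
// producing {2, 3, 0, 1}; PoisonMaskElem entries stay PoisonMaskElem.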
314
315/// \returns the number of groups of shufflevectors.
316/// A group has the following features:
317/// 1. All of the values in a group are shufflevectors.
318/// 2. The mask of each shufflevector is an extract-subvector mask.
319/// 3. The masks of the shufflevectors in a group together use all elements of the source.
320/// e.g., it is 1 group (%0)
321/// %1 = shufflevector <16 x i8> %0, <16 x i8> poison,
322/// <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
323/// %2 = shufflevector <16 x i8> %0, <16 x i8> poison,
324/// <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
325/// it is 2 groups (%3 and %4)
326/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
327/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
328/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
329/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
330/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
331/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
332/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
333/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
334/// it is 0 groups
335/// %12 = shufflevector <8 x i16> %10, <8 x i16> poison,
336/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
337/// %13 = shufflevector <8 x i16> %11, <8 x i16> poison,
338/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
340 if (VL.empty())
341 return 0;
342 if (!all_of(VL, IsaPred<ShuffleVectorInst>))
343 return 0;
344 auto *SV = cast<ShuffleVectorInst>(VL.front());
345 unsigned SVNumElements =
346 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
347 unsigned ShuffleMaskSize = SV->getShuffleMask().size();
348 if (SVNumElements % ShuffleMaskSize != 0)
349 return 0;
350 unsigned GroupSize = SVNumElements / ShuffleMaskSize;
351 if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
352 return 0;
353 unsigned NumGroup = 0;
354 for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
355 auto *SV = cast<ShuffleVectorInst>(VL[I]);
356 Value *Src = SV->getOperand(0);
357 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
358 SmallBitVector ExpectedIndex(GroupSize);
359 if (!all_of(Group, [&](Value *V) {
360 auto *SV = cast<ShuffleVectorInst>(V);
361 // From the same source.
362 if (SV->getOperand(0) != Src)
363 return false;
364 int Index;
365 if (!SV->isExtractSubvectorMask(Index))
366 return false;
367 ExpectedIndex.set(Index / ShuffleMaskSize);
368 return true;
369 }))
370 return 0;
371 if (!ExpectedIndex.all())
372 return 0;
373 ++NumGroup;
374 }
375 assert(NumGroup == (VL.size() / GroupSize) && "Unexpected number of groups");
376 return NumGroup;
377}
378
379/// \returns a shufflevector mask which is used to vectorize shufflevectors
380/// e.g.,
381/// %5 = shufflevector <8 x i16> %3, <8 x i16> poison,
382/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
383/// %6 = shufflevector <8 x i16> %3, <8 x i16> poison,
384/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
385/// %7 = shufflevector <8 x i16> %4, <8 x i16> poison,
386/// <4 x i32> <i32 0, i32 1, i32 2, i32 3>
387/// %8 = shufflevector <8 x i16> %4, <8 x i16> poison,
388/// <4 x i32> <i32 4, i32 5, i32 6, i32 7>
389/// the result is
390/// <0, 1, 2, 3, 12, 13, 14, 15, 16, 17, 18, 19, 28, 29, 30, 31>
392 assert(getShufflevectorNumGroups(VL) && "Not supported shufflevector usage.");
393 auto *SV = cast<ShuffleVectorInst>(VL.front());
394 unsigned SVNumElements =
395 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
396 SmallVector<int> Mask;
397 unsigned AccumulateLength = 0;
398 for (Value *V : VL) {
399 auto *SV = cast<ShuffleVectorInst>(V);
400 for (int M : SV->getShuffleMask())
401 Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
402 : AccumulateLength + M);
403 AccumulateLength += SVNumElements;
404 }
405 return Mask;
406}
407
408/// \returns True if the value is a constant (but not globals/constant
409/// expressions).
410static bool isConstant(Value *V) {
411 return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
412}
413
414/// Checks if \p V is one of vector-like instructions, i.e. undef,
415/// insertelement/extractelement with constant indices for fixed vector type or
416/// extractvalue instruction.
418 if (!isa<InsertElementInst, ExtractElementInst>(V) &&
419 !isa<ExtractValueInst, UndefValue>(V))
420 return false;
421 auto *I = dyn_cast<Instruction>(V);
422 if (!I || isa<ExtractValueInst>(I))
423 return true;
424 if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
425 return false;
426 if (isa<ExtractElementInst>(I))
427 return isConstant(I->getOperand(1));
428 assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
429 return isConstant(I->getOperand(2));
430}
431
432/// Returns power-of-2 number of elements in a single register (part), given the
433/// total number of elements \p Size and number of registers (parts) \p
434/// NumParts.
435static unsigned getPartNumElems(unsigned Size, unsigned NumParts) {
436 return std::min<unsigned>(Size, bit_ceil(divideCeil(Size, NumParts)));
437}
438
439/// Returns correct remaining number of elements, considering total amount \p
440/// Size, (power-of-2 number) of elements in a single register \p PartNumElems
441/// and current register (part) \p Part.
442static unsigned getNumElems(unsigned Size, unsigned PartNumElems,
443 unsigned Part) {
444 return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
445}
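// Worked example (illustrative): for Size = 6 and NumParts = 2,
// getPartNumElems returns min(6, bit_ceil(divideCeil(6, 2))) = 4, and
// getNumElems yields 4 elements for Part = 0 and the remaining 2 for Part = 1.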
446
447#if !defined(NDEBUG)
448/// Print a short descriptor of the instruction bundle suitable for debug output.
449static std::string shortBundleName(ArrayRef<Value *> VL, int Idx = -1) {
450 std::string Result;
451 raw_string_ostream OS(Result);
452 if (Idx >= 0)
453 OS << "Idx: " << Idx << ", ";
454 OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
455 return Result;
456}
457#endif
458
459/// \returns true if all of the instructions in \p VL are in the same block or
460/// false otherwise.
462 auto *It = find_if(VL, IsaPred<Instruction>);
463 if (It == VL.end())
464 return false;
465 Instruction *I0 = cast<Instruction>(*It);
467 return true;
468
469 BasicBlock *BB = I0->getParent();
470 for (Value *V : iterator_range(It, VL.end())) {
471 if (isa<PoisonValue>(V))
472 continue;
473 auto *II = dyn_cast<Instruction>(V);
474 if (!II)
475 return false;
476
477 if (BB != II->getParent())
478 return false;
479 }
480 return true;
481}
482
483/// \returns True if all of the values in \p VL are constants (but not
484/// globals/constant expressions).
486 // Constant expressions and globals can't be vectorized like normal integer/FP
487 // constants.
488 return all_of(VL, isConstant);
489}
490
491/// \returns True if all of the values in \p VL are identical or some of them
492/// are UndefValue.
493static bool isSplat(ArrayRef<Value *> VL) {
494 Value *FirstNonUndef = nullptr;
495 for (Value *V : VL) {
496 if (isa<UndefValue>(V))
497 continue;
498 if (!FirstNonUndef) {
499 FirstNonUndef = V;
500 continue;
501 }
502 if (V != FirstNonUndef)
503 return false;
504 }
505 return FirstNonUndef != nullptr;
506}
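// Illustrative examples: {%x, undef, %x} -> true (all non-undef values match),
// {%x, %y} -> false (two distinct values), {undef, undef} -> false (no
// non-undef value was found).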
507
508/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
510 if (auto *Cmp = dyn_cast<CmpInst>(I))
511 return Cmp->isCommutative();
512 if (auto *BO = dyn_cast<BinaryOperator>(I))
513 return BO->isCommutative() ||
514 (BO->getOpcode() == Instruction::Sub &&
515 !BO->hasNUsesOrMore(UsesLimit) &&
516 all_of(
517 BO->uses(),
518 [](const Use &U) {
519 // Commutative, if icmp eq/ne sub, 0
520 CmpPredicate Pred;
521 if (match(U.getUser(),
522 m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
523 (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
524 return true;
525 // Commutative, if abs(sub nsw, true) or abs(sub, false).
526 ConstantInt *Flag;
527 return match(U.getUser(),
528 m_Intrinsic<Intrinsic::abs>(
529 m_Specific(U.get()), m_ConstantInt(Flag))) &&
530 (!cast<Instruction>(U.get())->hasNoSignedWrap() ||
531 Flag->isOne());
532 })) ||
533 (BO->getOpcode() == Instruction::FSub &&
534 !BO->hasNUsesOrMore(UsesLimit) &&
535 all_of(BO->uses(), [](const Use &U) {
536 return match(U.getUser(),
537 m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));
538 }));
539 return I->isCommutative();
540}
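// Illustrative IR sketch for the Sub special case handled above: in
//   %d = sub i32 %a, %b
//   %c = icmp eq i32 %d, 0
// swapping %a and %b does not change %c (a - b == 0 <=> b - a == 0), so such
// a sub may be treated as commutative for operand reordering purposes.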
541
542template <typename T>
543static std::optional<unsigned> getInsertExtractIndex(const Value *Inst,
544 unsigned Offset) {
545 static_assert(std::is_same_v<T, InsertElementInst> ||
546 std::is_same_v<T, ExtractElementInst>,
547 "unsupported T");
548 int Index = Offset;
549 if (const auto *IE = dyn_cast<T>(Inst)) {
550 const auto *VT = dyn_cast<FixedVectorType>(IE->getType());
551 if (!VT)
552 return std::nullopt;
553 const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
554 if (!CI)
555 return std::nullopt;
556 if (CI->getValue().uge(VT->getNumElements()))
557 return std::nullopt;
558 Index *= VT->getNumElements();
559 Index += CI->getZExtValue();
560 return Index;
561 }
562 return std::nullopt;
563}
564
565/// \returns inserting or extracting index of InsertElement, ExtractElement or
566/// InsertValue instruction, using Offset as base offset for index.
567/// \returns std::nullopt if the index is not an immediate.
568static std::optional<unsigned> getElementIndex(const Value *Inst,
569 unsigned Offset = 0) {
570 if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
571 return Index;
572 if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
573 return Index;
574
575 int Index = Offset;
576
577 const auto *IV = dyn_cast<InsertValueInst>(Inst);
578 if (!IV)
579 return std::nullopt;
580
581 Type *CurrentType = IV->getType();
582 for (unsigned I : IV->indices()) {
583 if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
584 Index *= ST->getNumElements();
585 CurrentType = ST->getElementType(I);
586 } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
587 Index *= AT->getNumElements();
588 CurrentType = AT->getElementType();
589 } else {
590 return std::nullopt;
591 }
592 Index += I;
593 }
594 return Index;
595}
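// Worked example (illustrative): for
//   insertvalue [2 x [3 x i32]] %agg, i32 %v, 1, 2
// the loop above flattens the indices row-major: ((0 * 2 + 1) * 3) + 2 = 5.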
596
597namespace {
598/// Specifies the way the mask should be analyzed for undefs/poisonous elements
599/// in the shuffle mask.
600enum class UseMask {
601 FirstArg, ///< The mask is expected to be for permutation of 1-2 vectors,
602 ///< check for the mask elements for the first argument (mask
603 ///< indices are in range [0:VF)).
604 SecondArg, ///< The mask is expected to be for permutation of 2 vectors, check
605 ///< for the mask elements for the second argument (mask indices
606 ///< are in range [VF:2*VF))
607 UndefsAsMask ///< Consider undef mask elements (-1) as placeholders for
608 ///< future shuffle elements and mark them as ones as being used
609 ///< in future. Non-undef elements are considered as unused since
610 ///< they're already marked as used in the mask.
611};
612} // namespace
613
614/// Prepares a use bitset for the given mask either for the first argument or
615/// for the second.
617 UseMask MaskArg) {
618 SmallBitVector UseMask(VF, true);
619 for (auto [Idx, Value] : enumerate(Mask)) {
620 if (Value == PoisonMaskElem) {
621 if (MaskArg == UseMask::UndefsAsMask)
622 UseMask.reset(Idx);
623 continue;
624 }
625 if (MaskArg == UseMask::FirstArg && Value < VF)
626 UseMask.reset(Value);
627 else if (MaskArg == UseMask::SecondArg && Value >= VF)
628 UseMask.reset(Value - VF);
629 }
630 return UseMask;
631}
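// Worked example (illustrative): with VF = 4, a mask {0, 5, PoisonMaskElem, 1}
// and MaskArg == UseMask::FirstArg, the helper above clears bits 0 and 1 of
// the result, ignores index 5 (it refers to the second argument), skips the
// PoisonMaskElem entry, and leaves bits 2 and 3 set.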
632
633/// Checks if the given value is actually an undefined constant vector.
634/// Also, if the \p UseMask is not empty, tries to check if the non-masked
635/// elements actually mask the insertelement buildvector, if any.
636template <bool IsPoisonOnly = false>
638 const SmallBitVector &UseMask = {}) {
639 SmallBitVector Res(UseMask.empty() ? 1 : UseMask.size(), true);
640 using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;
641 if (isa<T>(V))
642 return Res;
643 auto *VecTy = dyn_cast<FixedVectorType>(V->getType());
644 if (!VecTy)
645 return Res.reset();
646 auto *C = dyn_cast<Constant>(V);
647 if (!C) {
648 if (!UseMask.empty()) {
649 const Value *Base = V;
650 while (auto *II = dyn_cast<InsertElementInst>(Base)) {
651 Base = II->getOperand(0);
652 if (isa<T>(II->getOperand(1)))
653 continue;
654 std::optional<unsigned> Idx = getElementIndex(II);
655 if (!Idx) {
656 Res.reset();
657 return Res;
658 }
659 if (*Idx < UseMask.size() && !UseMask.test(*Idx))
660 Res.reset(*Idx);
661 }
662 // TODO: Add analysis for shuffles here too.
663 if (V == Base) {
664 Res.reset();
665 } else {
666 SmallBitVector SubMask(UseMask.size(), false);
667 Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);
668 }
669 } else {
670 Res.reset();
671 }
672 return Res;
673 }
674 for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
675 if (Constant *Elem = C->getAggregateElement(I))
676 if (!isa<T>(Elem) &&
677 (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
678 Res.reset(I);
679 }
680 return Res;
681}
682
683/// Checks if the vector of instructions can be represented as a shuffle, like:
684/// %x0 = extractelement <4 x i8> %x, i32 0
685/// %x3 = extractelement <4 x i8> %x, i32 3
686/// %y1 = extractelement <4 x i8> %y, i32 1
687/// %y2 = extractelement <4 x i8> %y, i32 2
688/// %x0x0 = mul i8 %x0, %x0
689/// %x3x3 = mul i8 %x3, %x3
690/// %y1y1 = mul i8 %y1, %y1
691/// %y2y2 = mul i8 %y2, %y2
692/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
693/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
694/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
695/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
696/// ret <4 x i8> %ins4
697/// can be transformed into:
698/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
699/// i32 6>
700/// %2 = mul <4 x i8> %1, %1
701/// ret <4 x i8> %2
702/// Mask will return the Shuffle Mask equivalent to the extracted elements.
703/// TODO: Can we split off and reuse the shuffle mask detection from
704/// ShuffleVectorInst/getShuffleCost?
705static std::optional<TargetTransformInfo::ShuffleKind>
707 AssumptionCache *AC) {
708 const auto *It = find_if(VL, IsaPred<ExtractElementInst>);
709 if (It == VL.end())
710 return std::nullopt;
711 unsigned Size =
712 std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
713 auto *EI = dyn_cast<ExtractElementInst>(V);
714 if (!EI)
715 return S;
716 auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
717 if (!VTy)
718 return S;
719 return std::max(S, VTy->getNumElements());
720 });
721
722 Value *Vec1 = nullptr;
723 Value *Vec2 = nullptr;
724 bool HasNonUndefVec = any_of(VL, [&](Value *V) {
725 auto *EE = dyn_cast<ExtractElementInst>(V);
726 if (!EE)
727 return false;
728 Value *Vec = EE->getVectorOperand();
729 if (isa<UndefValue>(Vec))
730 return false;
731 return isGuaranteedNotToBePoison(Vec, AC);
732 });
733 enum ShuffleMode { Unknown, Select, Permute };
734 ShuffleMode CommonShuffleMode = Unknown;
735 Mask.assign(VL.size(), PoisonMaskElem);
736 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
737 // Undef can be represented as an undef element in a vector.
738 if (isa<UndefValue>(VL[I]))
739 continue;
740 auto *EI = cast<ExtractElementInst>(VL[I]);
741 if (isa<ScalableVectorType>(EI->getVectorOperandType()))
742 return std::nullopt;
743 auto *Vec = EI->getVectorOperand();
744 // We can extractelement from undef or poison vector.
745 if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
746 continue;
747 // All vector operands must have the same number of vector elements.
748 if (isa<UndefValue>(Vec)) {
749 Mask[I] = I;
750 } else {
751 if (isa<UndefValue>(EI->getIndexOperand()))
752 continue;
753 auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
754 if (!Idx)
755 return std::nullopt;
756 // Undefined behavior if Idx is negative or >= Size.
757 if (Idx->getValue().uge(Size))
758 continue;
759 unsigned IntIdx = Idx->getValue().getZExtValue();
760 Mask[I] = IntIdx;
761 }
762 if (isUndefVector(Vec).all() && HasNonUndefVec)
763 continue;
764 // For correct shuffling we have to have at most 2 different vector operands
765 // in all extractelement instructions.
766 if (!Vec1 || Vec1 == Vec) {
767 Vec1 = Vec;
768 } else if (!Vec2 || Vec2 == Vec) {
769 Vec2 = Vec;
770 Mask[I] += Size;
771 } else {
772 return std::nullopt;
773 }
774 if (CommonShuffleMode == Permute)
775 continue;
776 // If the extract index is not the same as the operation number, it is a
777 // permutation.
778 if (Mask[I] % Size != I) {
779 CommonShuffleMode = Permute;
780 continue;
781 }
782 CommonShuffleMode = Select;
783 }
784 // If we're not crossing lanes in different vectors, consider it as blending.
785 if (CommonShuffleMode == Select && Vec2)
787 // If Vec2 was never used, we have a permutation of a single vector, otherwise
788 // we have permutation of 2 vectors.
791}
792
793/// \returns True if Extract{Value,Element} instruction extracts element Idx.
794static std::optional<unsigned> getExtractIndex(Instruction *E) {
795 unsigned Opcode = E->getOpcode();
796 assert((Opcode == Instruction::ExtractElement ||
797 Opcode == Instruction::ExtractValue) &&
798 "Expected extractelement or extractvalue instruction.");
799 if (Opcode == Instruction::ExtractElement) {
800 auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
801 if (!CI)
802 return std::nullopt;
803 return CI->getZExtValue();
804 }
805 auto *EI = cast<ExtractValueInst>(E);
806 if (EI->getNumIndices() != 1)
807 return std::nullopt;
808 return *EI->idx_begin();
809}
810
811namespace {
812
813/// Main data required for vectorization of instructions.
814class InstructionsState {
815 /// The main/alternate instruction. MainOp is also VL0.
816 Instruction *MainOp = nullptr;
817 Instruction *AltOp = nullptr;
818
819public:
820 Instruction *getMainOp() const {
821 assert(valid() && "InstructionsState is invalid.");
822 return MainOp;
823 }
824
825 Instruction *getAltOp() const {
826 assert(valid() && "InstructionsState is invalid.");
827 return AltOp;
828 }
829
830 /// The main/alternate opcodes for the list of instructions.
831 unsigned getOpcode() const { return getMainOp()->getOpcode(); }
832
833 unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }
834
835 /// Some of the instructions in the list have alternate opcodes.
836 bool isAltShuffle() const { return getMainOp() != getAltOp(); }
837
838 bool isOpcodeOrAlt(Instruction *I) const {
839 unsigned CheckedOpcode = I->getOpcode();
840 return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
841 }
842
843 /// Checks if the current state is valid, i.e. has non-null MainOp and AltOp.
844 bool valid() const { return MainOp && AltOp; }
845
846 explicit operator bool() const { return valid(); }
847
848 InstructionsState() = delete;
849 InstructionsState(Instruction *MainOp, Instruction *AltOp)
850 : MainOp(MainOp), AltOp(AltOp) {}
851 static InstructionsState invalid() { return {nullptr, nullptr}; }
852};
853
854} // end anonymous namespace
855
856/// \returns true if \p Opcode is allowed as part of the main/alternate
857/// instruction for SLP vectorization.
858///
859/// Example of unsupported opcode is SDIV that can potentially cause UB if the
860/// "shuffled out" lane would result in division by zero.
861static bool isValidForAlternation(unsigned Opcode) {
862 if (Instruction::isIntDivRem(Opcode))
863 return false;
864
865 return true;
866}
867
868static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
869 const TargetLibraryInfo &TLI);
870
871/// Checks if the provided operands of 2 cmp instructions are compatible, i.e.
872/// compatible instructions or constants, or just some other regular values.
873static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0,
874 Value *Op1, const TargetLibraryInfo &TLI) {
875 return (isConstant(BaseOp0) && isConstant(Op0)) ||
876 (isConstant(BaseOp1) && isConstant(Op1)) ||
877 (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
878 !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
879 BaseOp0 == Op0 || BaseOp1 == Op1 ||
880 getSameOpcode({BaseOp0, Op0}, TLI) ||
881 getSameOpcode({BaseOp1, Op1}, TLI);
882}
883
884/// \returns true if a compare instruction \p CI has similar "look" and
885/// same predicate as \p BaseCI, "as is" or with its operands and predicate
886/// swapped, false otherwise.
887static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI,
888 const TargetLibraryInfo &TLI) {
889 assert(BaseCI->getOperand(0)->getType() == CI->getOperand(0)->getType() &&
890 "Assessing comparisons of different types?");
891 CmpInst::Predicate BasePred = BaseCI->getPredicate();
892 CmpInst::Predicate Pred = CI->getPredicate();
894
895 Value *BaseOp0 = BaseCI->getOperand(0);
896 Value *BaseOp1 = BaseCI->getOperand(1);
897 Value *Op0 = CI->getOperand(0);
898 Value *Op1 = CI->getOperand(1);
899
900 return (BasePred == Pred &&
901 areCompatibleCmpOps(BaseOp0, BaseOp1, Op0, Op1, TLI)) ||
902 (BasePred == SwappedPred &&
903 areCompatibleCmpOps(BaseOp0, BaseOp1, Op1, Op0, TLI));
904}
905
906/// \returns analysis of the Instructions in \p VL described in
907/// InstructionsState, i.e. the opcode with which we suppose the whole list
908/// could be vectorized even if its structure is diverse.
909static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
910 const TargetLibraryInfo &TLI) {
911 // Make sure these are all Instructions.
912 if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
913 return InstructionsState::invalid();
914
915 auto *It = find_if(VL, IsaPred<Instruction>);
916 if (It == VL.end())
917 return InstructionsState::invalid();
918
919 Instruction *MainOp = cast<Instruction>(*It);
920 unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
921 if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
922 (VL.size() == 2 && InstCnt < 2))
923 return InstructionsState::invalid();
924
925 bool IsCastOp = isa<CastInst>(MainOp);
926 bool IsBinOp = isa<BinaryOperator>(MainOp);
927 bool IsCmpOp = isa<CmpInst>(MainOp);
928 CmpInst::Predicate BasePred = IsCmpOp ? cast<CmpInst>(MainOp)->getPredicate()
930 Instruction *AltOp = MainOp;
931 unsigned Opcode = MainOp->getOpcode();
932 unsigned AltOpcode = Opcode;
933
934 bool SwappedPredsCompatible = IsCmpOp && [&]() {
935 SetVector<unsigned> UniquePreds, UniqueNonSwappedPreds;
936 UniquePreds.insert(BasePred);
937 UniqueNonSwappedPreds.insert(BasePred);
938 for (Value *V : VL) {
939 auto *I = dyn_cast<CmpInst>(V);
940 if (!I)
941 return false;
942 CmpInst::Predicate CurrentPred = I->getPredicate();
943 CmpInst::Predicate SwappedCurrentPred =
944 CmpInst::getSwappedPredicate(CurrentPred);
945 UniqueNonSwappedPreds.insert(CurrentPred);
946 if (!UniquePreds.contains(CurrentPred) &&
947 !UniquePreds.contains(SwappedCurrentPred))
948 UniquePreds.insert(CurrentPred);
949 }
950 // The total number of predicates is > 2, but if only 2 remain when swapped
951 // predicates are treated as equal, consider swappable predicates as
952 // compatible opcodes, not alternate.
953 return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;
954 }();
955 // Check for one alternate opcode from another BinaryOperator.
956 // TODO - generalize to support all operators (types, calls etc.).
957 Intrinsic::ID BaseID = 0;
958 SmallVector<VFInfo> BaseMappings;
959 if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
961 BaseMappings = VFDatabase(*CallBase).getMappings(*CallBase);
962 if (!isTriviallyVectorizable(BaseID) && BaseMappings.empty())
963 return InstructionsState::invalid();
964 }
965 bool AnyPoison = InstCnt != VL.size();
966 // Skip MainOp.
967 for (Value *V : iterator_range(It + 1, VL.end())) {
968 auto *I = dyn_cast<Instruction>(V);
969 if (!I)
970 continue;
971
972 // Cannot combine poison and divisions.
973 // TODO: do some smart analysis of the CallInsts to exclude divide-like
974 // intrinsics/functions only.
975 if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
976 return InstructionsState::invalid();
977 unsigned InstOpcode = I->getOpcode();
978 if (IsBinOp && isa<BinaryOperator>(I)) {
979 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
980 continue;
981 if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
982 isValidForAlternation(Opcode)) {
983 AltOpcode = InstOpcode;
984 AltOp = I;
985 continue;
986 }
987 } else if (IsCastOp && isa<CastInst>(I)) {
988 Value *Op0 = MainOp->getOperand(0);
989 Type *Ty0 = Op0->getType();
990 Value *Op1 = I->getOperand(0);
991 Type *Ty1 = Op1->getType();
992 if (Ty0 == Ty1) {
993 if (InstOpcode == Opcode || InstOpcode == AltOpcode)
994 continue;
995 if (Opcode == AltOpcode) {
997 isValidForAlternation(InstOpcode) &&
998 "Cast isn't safe for alternation, logic needs to be updated!");
999 AltOpcode = InstOpcode;
1000 AltOp = I;
1001 continue;
1002 }
1003 }
1004 } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
1005 auto *BaseInst = cast<CmpInst>(MainOp);
1006 Type *Ty0 = BaseInst->getOperand(0)->getType();
1007 Type *Ty1 = Inst->getOperand(0)->getType();
1008 if (Ty0 == Ty1) {
1009 assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
1010 assert(InstOpcode == AltOpcode &&
1011 "Alternate instructions are only supported by BinaryOperator "
1012 "and CastInst.");
1013 // Check for compatible operands. If the corresponding operands are not
1014 // compatible - need to perform alternate vectorization.
1015 CmpInst::Predicate CurrentPred = Inst->getPredicate();
1016 CmpInst::Predicate SwappedCurrentPred =
1017 CmpInst::getSwappedPredicate(CurrentPred);
1018
1019 if ((VL.size() == 2 || SwappedPredsCompatible) &&
1020 (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
1021 continue;
1022
1023 if (isCmpSameOrSwapped(BaseInst, Inst, TLI))
1024 continue;
1025 auto *AltInst = cast<CmpInst>(AltOp);
1026 if (MainOp != AltOp) {
1027 if (isCmpSameOrSwapped(AltInst, Inst, TLI))
1028 continue;
1029 } else if (BasePred != CurrentPred) {
1030 assert(
1031 isValidForAlternation(InstOpcode) &&
1032 "CmpInst isn't safe for alternation, logic needs to be updated!");
1033 AltOp = I;
1034 continue;
1035 }
1036 CmpInst::Predicate AltPred = AltInst->getPredicate();
1037 if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
1038 AltPred == CurrentPred || AltPred == SwappedCurrentPred)
1039 continue;
1040 }
1041 } else if (InstOpcode == Opcode) {
1042 assert(InstOpcode == AltOpcode &&
1043 "Alternate instructions are only supported by BinaryOperator and "
1044 "CastInst.");
1045 if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
1046 if (Gep->getNumOperands() != 2 ||
1047 Gep->getOperand(0)->getType() != MainOp->getOperand(0)->getType())
1048 return InstructionsState::invalid();
1049 } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
1051 return InstructionsState::invalid();
1052 } else if (auto *LI = dyn_cast<LoadInst>(I)) {
1053 auto *BaseLI = cast<LoadInst>(MainOp);
1054 if (!LI->isSimple() || !BaseLI->isSimple())
1055 return InstructionsState::invalid();
1056 } else if (auto *Call = dyn_cast<CallInst>(I)) {
1057 auto *CallBase = cast<CallInst>(MainOp);
1058 if (Call->getCalledFunction() != CallBase->getCalledFunction())
1059 return InstructionsState::invalid();
1060 if (Call->hasOperandBundles() &&
1062 !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
1063 Call->op_begin() + Call->getBundleOperandsEndIndex(),
1064 CallBase->op_begin() +
1066 return InstructionsState::invalid();
1068 if (ID != BaseID)
1069 return InstructionsState::invalid();
1070 if (!ID) {
1071 SmallVector<VFInfo> Mappings = VFDatabase(*Call).getMappings(*Call);
1072 if (Mappings.size() != BaseMappings.size() ||
1073 Mappings.front().ISA != BaseMappings.front().ISA ||
1074 Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
1075 Mappings.front().VectorName != BaseMappings.front().VectorName ||
1076 Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
1077 Mappings.front().Shape.Parameters !=
1078 BaseMappings.front().Shape.Parameters)
1079 return InstructionsState::invalid();
1080 }
1081 }
1082 continue;
1083 }
1084 return InstructionsState::invalid();
1085 }
1086
1087 return InstructionsState(MainOp, AltOp);
1088}
1089
1090/// \returns true if all of the values in \p VL have the same type or false
1091/// otherwise.
1093 Type *Ty = VL.front()->getType();
1094 return all_of(VL.drop_front(), [&](Value *V) { return V->getType() == Ty; });
1095}
1096
1097/// \returns True if in-tree use also needs extract. This refers to
1098/// a possible scalar operand in a vectorized instruction.
1099static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
1100 TargetLibraryInfo *TLI,
1101 const TargetTransformInfo *TTI) {
1102 if (!UserInst)
1103 return false;
1104 unsigned Opcode = UserInst->getOpcode();
1105 switch (Opcode) {
1106 case Instruction::Load: {
1107 LoadInst *LI = cast<LoadInst>(UserInst);
1108 return (LI->getPointerOperand() == Scalar);
1109 }
1110 case Instruction::Store: {
1111 StoreInst *SI = cast<StoreInst>(UserInst);
1112 return (SI->getPointerOperand() == Scalar);
1113 }
1114 case Instruction::Call: {
1115 CallInst *CI = cast<CallInst>(UserInst);
1117 return any_of(enumerate(CI->args()), [&](auto &&Arg) {
1118 return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
1119 Arg.value().get() == Scalar;
1120 });
1121 }
1122 default:
1123 return false;
1124 }
1125}
1126
1127/// \returns the AA location that is being accessed by the instruction.
1129 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1130 return MemoryLocation::get(SI);
1131 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1132 return MemoryLocation::get(LI);
1133 return MemoryLocation();
1134}
1135
1136/// \returns True if the instruction is not a volatile or atomic load/store.
1137static bool isSimple(Instruction *I) {
1138 if (LoadInst *LI = dyn_cast<LoadInst>(I))
1139 return LI->isSimple();
1140 if (StoreInst *SI = dyn_cast<StoreInst>(I))
1141 return SI->isSimple();
1142 if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
1143 return !MI->isVolatile();
1144 return true;
1145}
1146
1147/// Shuffles \p Mask in accordance with the given \p SubMask.
1148/// \param ExtendingManyInputs Supports reshuffling of the mask with not only
1149/// one but two input vectors.
1150static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
1151 bool ExtendingManyInputs = false) {
1152 if (SubMask.empty())
1153 return;
1154 assert(
1155 (!ExtendingManyInputs || SubMask.size() > Mask.size() ||
1156 // Check if input scalars were extended to match the size of other node.
1157 (SubMask.size() == Mask.size() && Mask.back() == PoisonMaskElem)) &&
1158 "SubMask with many inputs support must be larger than the mask.");
1159 if (Mask.empty()) {
1160 Mask.append(SubMask.begin(), SubMask.end());
1161 return;
1162 }
1163 SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
1164 int TermValue = std::min(Mask.size(), SubMask.size());
1165 for (int I = 0, E = SubMask.size(); I < E; ++I) {
1166 if (SubMask[I] == PoisonMaskElem ||
1167 (!ExtendingManyInputs &&
1168 (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
1169 continue;
1170 NewMask[I] = Mask[SubMask[I]];
1171 }
1172 Mask.swap(NewMask);
1173}
1174
1175/// Order may have elements assigned a special value (size) which is out of
1176/// bounds. Such indices only appear in places which correspond to undef values
1177/// (see canReuseExtract for details) and are used to prevent undef values from
1178/// affecting the ordering of operands.
1179/// The first loop below simply finds all unused indices and then the next loop
1180/// nest assigns these indices for undef values positions.
1181/// As an example below Order has two undef positions and they have assigned
1182/// values 3 and 7 respectively:
1183/// before: 6 9 5 4 9 2 1 0
1184/// after: 6 3 5 4 7 2 1 0
1186 const unsigned Sz = Order.size();
1187 SmallBitVector UnusedIndices(Sz, /*t=*/true);
1188 SmallBitVector MaskedIndices(Sz);
1189 for (unsigned I = 0; I < Sz; ++I) {
1190 if (Order[I] < Sz)
1191 UnusedIndices.reset(Order[I]);
1192 else
1193 MaskedIndices.set(I);
1194 }
1195 if (MaskedIndices.none())
1196 return;
1197 assert(UnusedIndices.count() == MaskedIndices.count() &&
1198 "Non-synced masked/available indices.");
1199 int Idx = UnusedIndices.find_first();
1200 int MIdx = MaskedIndices.find_first();
1201 while (MIdx >= 0) {
1202 assert(Idx >= 0 && "Indices must be synced.");
1203 Order[MIdx] = Idx;
1204 Idx = UnusedIndices.find_next(Idx);
1205 MIdx = MaskedIndices.find_next(MIdx);
1206 }
1207}
1208
1209/// \returns a bitset for selecting opcodes. false for Opcode0 and true for
1210/// Opcode1.
1212 unsigned Opcode1) {
1213 Type *ScalarTy = VL[0]->getType();
1214 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
1215 SmallBitVector OpcodeMask(VL.size() * ScalarTyNumElements, false);
1216 for (unsigned Lane : seq<unsigned>(VL.size())) {
1217 if (isa<PoisonValue>(VL[Lane]))
1218 continue;
1219 if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
1220 OpcodeMask.set(Lane * ScalarTyNumElements,
1221 Lane * ScalarTyNumElements + ScalarTyNumElements);
1222 }
1223 return OpcodeMask;
1224}
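// Worked example (illustrative): for VL = {add, sub, add, sub} of scalar type
// i32 with Opcode0 = Add and Opcode1 = Sub, the helper above returns the
// bitset {0, 1, 0, 1}: a set bit selects Opcode1 for that lane.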
1225
1226namespace llvm {
1227
1229 SmallVectorImpl<int> &Mask) {
1230 Mask.clear();
1231 const unsigned E = Indices.size();
1232 Mask.resize(E, PoisonMaskElem);
1233 for (unsigned I = 0; I < E; ++I)
1234 Mask[Indices[I]] = I;
1235}
1236
1237/// Reorders the list of scalars in accordance with the given \p Mask.
1239 ArrayRef<int> Mask) {
1240 assert(!Mask.empty() && "Expected non-empty mask.");
1241 SmallVector<Value *> Prev(Scalars.size(),
1242 PoisonValue::get(Scalars.front()->getType()));
1243 Prev.swap(Scalars);
1244 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
1245 if (Mask[I] != PoisonMaskElem)
1246 Scalars[Mask[I]] = Prev[I];
1247}
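// Worked example (illustrative): for Scalars = {a, b, c, d} and
// Mask = {2, 0, 3, 1}, element I of the old vector moves to position Mask[I],
// so the reordering above produces {b, d, a, c}.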
1248
1249/// Checks if the provided value does not require scheduling. It does not
1250/// require scheduling if this is not an instruction or it is an instruction
1251/// that does not read/write memory and all operands are either not instructions
1252/// or phi nodes or instructions from different blocks.
1254 auto *I = dyn_cast<Instruction>(V);
1255 if (!I)
1256 return true;
1257 return !mayHaveNonDefUseDependency(*I) &&
1258 all_of(I->operands(), [I](Value *V) {
1259 auto *IO = dyn_cast<Instruction>(V);
1260 if (!IO)
1261 return true;
1262 return isa<PHINode>(IO) || IO->getParent() != I->getParent();
1263 });
1264}
1265
1266/// Checks if the provided value does not require scheduling. It does not
1267/// require scheduling if this is not an instruction or it is an instruction
1268/// that does not read/write memory and all users are phi nodes or instructions
1269/// from different blocks.
1270static bool isUsedOutsideBlock(Value *V) {
1271 auto *I = dyn_cast<Instruction>(V);
1272 if (!I)
1273 return true;
1274 // Limits the number of uses to save compile time.
1275 return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&
1276 all_of(I->users(), [I](User *U) {
1277 auto *IU = dyn_cast<Instruction>(U);
1278 if (!IU)
1279 return true;
1280 return IU->getParent() != I->getParent() || isa<PHINode>(IU);
1281 });
1282}
1283
1284/// Checks if the specified value does not require scheduling. It does not
1285/// require scheduling if all operands and all users do not need to be scheduled
1286/// in the current basic block.
1289}
1290
1291/// Checks if the specified array of instructions does not require scheduling.
1292/// This is the case if either all instructions have operands that do not
1293/// require scheduling, or their users do not require scheduling since they are
1294/// phis or in other basic blocks.
1296 return !VL.empty() &&
1298}
1299
1300/// Returns true if the widened type of \p Ty elements with size \p Sz represents
1301/// a full vector type, i.e. adding an extra element results in extra parts upon
1302/// type legalization.
1304 unsigned Sz) {
1305 if (Sz <= 1)
1306 return false;
1307 if (!isValidElementType(Ty) && !isa<FixedVectorType>(Ty))
1308 return false;
1309 if (has_single_bit(Sz))
1310 return true;
1311 const unsigned NumParts = TTI.getNumberOfParts(getWidenedType(Ty, Sz));
1312 return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
1313 Sz % NumParts == 0;
1314}
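// Worked example (illustrative, assuming TTI reports 3 parts for <12 x i32>):
// Sz = 12 is not a power of two, but with NumParts = 3 we get 12 / 3 = 4
// (a power of two) and 12 % 3 == 0, so 12 i32 elements are treated as three
// whole registers and the check above returns true.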
1315
1316namespace slpvectorizer {
1317
1318/// Bottom Up SLP Vectorizer.
1319class BoUpSLP {
1320 struct TreeEntry;
1321 struct ScheduleData;
1324
1325public:
1326 /// Tracks the state we can represent the loads in the given sequence.
1327 enum class LoadsState {
1328 Gather,
1329 Vectorize,
1332 };
1333
1340
1342 TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
1345 : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
1346 AC(AC), DB(DB), DL(DL), ORE(ORE),
1347 Builder(Se->getContext(), TargetFolder(*DL)) {
1348 CodeMetrics::collectEphemeralValues(F, AC, EphValues);
1349 // Use the vector register size specified by the target unless overridden
1350 // by a command-line option.
1351 // TODO: It would be better to limit the vectorization factor based on
1352 // data type rather than just register size. For example, x86 AVX has
1353 // 256-bit registers, but it does not support integer operations
1354 // at that width (that requires AVX2).
1355 if (MaxVectorRegSizeOption.getNumOccurrences())
1356 MaxVecRegSize = MaxVectorRegSizeOption;
1357 else
1358 MaxVecRegSize =
1360 .getFixedValue();
1361
1362 if (MinVectorRegSizeOption.getNumOccurrences())
1363 MinVecRegSize = MinVectorRegSizeOption;
1364 else
1365 MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
1366 }
1367
1368 /// Vectorize the tree that starts with the elements in \p VL.
1369 /// Returns the vectorized root.
1371
1372 /// Vectorize the tree but with the list of externally used values \p
1373 /// ExternallyUsedValues. Values in this MapVector can be replaced by the
1374 /// generated extractvalue instructions.
1375 Value *
1376 vectorizeTree(const ExtraValueToDebugLocsMap &ExternallyUsedValues,
1377 Instruction *ReductionRoot = nullptr);
1378
1379 /// \returns the cost incurred by unwanted spills and fills, caused by
1380 /// holding live values over call sites.
1382
1383 /// \returns the vectorization cost of the subtree that starts at \p VL.
1384 /// A negative number means that this is profitable.
1385 InstructionCost getTreeCost(ArrayRef<Value *> VectorizedVals = {});
1386
1387 /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
1388 /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
1389 void buildTree(ArrayRef<Value *> Roots,
1390 const SmallDenseSet<Value *> &UserIgnoreLst);
1391
1392 /// Construct a vectorizable tree that starts at \p Roots.
1393 void buildTree(ArrayRef<Value *> Roots);
1394
1395 /// Returns whether the root node has in-tree uses.
1397 return !VectorizableTree.empty() &&
1398 !VectorizableTree.front()->UserTreeIndices.empty();
1399 }
1400
1401 /// Return the scalars of the root node.
1403 assert(!VectorizableTree.empty() && "No graph to get the first node from");
1404 return VectorizableTree.front()->Scalars;
1405 }
1406
1407 /// Returns the type/is-signed info for the root node in the graph without
1408 /// casting.
1409 std::optional<std::pair<Type *, bool>> getRootNodeTypeWithNoCast() const {
1410 const TreeEntry &Root = *VectorizableTree.front().get();
1411 if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
1412 !Root.Scalars.front()->getType()->isIntegerTy())
1413 return std::nullopt;
1414 auto It = MinBWs.find(&Root);
1415 if (It != MinBWs.end())
1416 return std::make_pair(IntegerType::get(Root.Scalars.front()->getContext(),
1417 It->second.first),
1418 It->second.second);
1419 if (Root.getOpcode() == Instruction::ZExt ||
1420 Root.getOpcode() == Instruction::SExt)
1421 return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
1422 Root.getOpcode() == Instruction::SExt);
1423 return std::nullopt;
1424 }
1425
1426 /// Checks if the root graph node can be emitted with narrower bitwidth at
1427 /// codegen and returns it signedness, if so.
1429 return MinBWs.at(VectorizableTree.front().get()).second;
1430 }
1431
1432 /// Returns reduction type after minbitdth analysis.
1434 if (ReductionBitWidth == 0 ||
1435 !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
1436 ReductionBitWidth >=
1437 DL->getTypeSizeInBits(
1438 VectorizableTree.front()->Scalars.front()->getType()))
1439 return getWidenedType(
1440 VectorizableTree.front()->Scalars.front()->getType(),
1441 VectorizableTree.front()->getVectorFactor());
1442 return getWidenedType(
1444 VectorizableTree.front()->Scalars.front()->getContext(),
1445 ReductionBitWidth),
1446 VectorizableTree.front()->getVectorFactor());
1447 }
1448
1449 /// Builds external uses of the vectorized scalars, i.e. the list of
1450 /// vectorized scalars to be extracted, their lanes and their scalar users. \p
1451 /// ExternallyUsedValues contains additional list of external uses to handle
1452 /// vectorization of reductions.
1453 void
1454 buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues = {});
1455
1456 /// Transforms graph nodes to target specific representations, if profitable.
1457 void transformNodes();
1458
1459 /// Clear the internal data structures that are created by 'buildTree'.
1460 void deleteTree() {
1461 VectorizableTree.clear();
1462 ScalarToTreeEntry.clear();
1463 MultiNodeScalars.clear();
1464 MustGather.clear();
1465 NonScheduledFirst.clear();
1466 EntryToLastInstruction.clear();
1467 LoadEntriesToVectorize.clear();
1468 IsGraphTransformMode = false;
1469 GatheredLoadsEntriesFirst.reset();
1470 ExternalUses.clear();
1471 ExternalUsesAsOriginalScalar.clear();
1472 for (auto &Iter : BlocksSchedules) {
1473 BlockScheduling *BS = Iter.second.get();
1474 BS->clear();
1475 }
1476 MinBWs.clear();
1477 ReductionBitWidth = 0;
1478 BaseGraphSize = 1;
1479 CastMaxMinBWSizes.reset();
1480 ExtraBitWidthNodes.clear();
1481 InstrElementSize.clear();
1482 UserIgnoreList = nullptr;
1483 PostponedGathers.clear();
1484 ValueToGatherNodes.clear();
1485 }
1486
1487 unsigned getTreeSize() const { return VectorizableTree.size(); }
1488
1489 /// Returns the base graph size, before any transformations.
1490 unsigned getCanonicalGraphSize() const { return BaseGraphSize; }
1491
1492 /// Perform LICM and CSE on the newly generated gather sequences.
1494
1495 /// Does this non-empty order represent an identity order? Identity
1496 /// should be represented as an empty order, so this is used to
1497 /// decide if we can canonicalize a computed order. Undef elements
1498 /// (represented as size) are ignored.
1500 assert(!Order.empty() && "expected non-empty order");
1501 const unsigned Sz = Order.size();
1502 return all_of(enumerate(Order), [&](const auto &P) {
1503 return P.value() == P.index() || P.value() == Sz;
1504 });
1505 }
1506
1507 /// Checks if the specified gather tree entry \p TE can be represented as a
1508 /// shuffled vector entry + (possibly) permutation with other gathers. It
1509 /// implements the checks only for possibly ordered scalars (Loads,
1510 /// ExtractElement, ExtractValue), which can be part of the graph.
1511 std::optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
1512
1513 /// Sort loads into increasing pointer offsets to allow greater clustering.
1514 std::optional<OrdersType> findPartiallyOrderedLoads(const TreeEntry &TE);
1515
1516 /// Gets reordering data for the given tree entry. If the entry is vectorized
1517 /// - just return ReorderIndices, otherwise check if the scalars can be
1518 /// reordered and return the most optimal order.
1519 /// \return std::nullopt if ordering is not important, empty order, if
1520 /// identity order is important, or the actual order.
1521 /// \param TopToBottom If true, include the order of vectorized stores and
1522 /// insertelement nodes, otherwise skip them.
1523 std::optional<OrdersType> getReorderingData(const TreeEntry &TE,
1524 bool TopToBottom);
1525
1526 /// Reorders the current graph to the most profitable order starting from the
1527 /// root node to the leaf nodes. The best order is chosen only from the nodes
1528 /// of the same size (vectorization factor). Smaller nodes are considered
1529 /// parts of a subgraph with a smaller VF and they are reordered independently. We
1530 /// can make it because we still need to extend smaller nodes to the wider VF
1531 /// and we can merge reordering shuffles with the widening shuffles.
1532 void reorderTopToBottom();
1533
1534 /// Reorders the current graph to the most profitable order starting from the
1535 /// leaves to the root. It allows rotating small subgraphs and reduces the
1536 /// number of reshuffles if the leaf nodes use the same order. In this case we
1537 /// can merge the orders and just shuffle the user node instead of shuffling its
1538 /// operands. Plus, even if the leaf nodes have different orders, it allows
1539 /// sinking reordering in the graph closer to the root node and merging it later
1540 /// during analysis.
1541 void reorderBottomToTop(bool IgnoreReorder = false);
1542
1543 /// \return The vector element size in bits to use when vectorizing the
1544 /// expression tree ending at \p V. If V is a store, the size is the width of
1545 /// the stored value. Otherwise, the size is the width of the largest loaded
1546 /// value reaching V. This method is used by the vectorizer to calculate
1547 /// vectorization factors.
1548 unsigned getVectorElementSize(Value *V);
1549
1550 /// Compute the minimum type sizes required to represent the entries in a
1551 /// vectorizable tree.
1553
1554 // \returns maximum vector register size as set by TTI or overridden by cl::opt.
1555 unsigned getMaxVecRegSize() const {
1556 return MaxVecRegSize;
1557 }
1558
1559 // \returns minimum vector register size as set by cl::opt.
1560 unsigned getMinVecRegSize() const {
1561 return MinVecRegSize;
1562 }
1563
1564 unsigned getMinVF(unsigned Sz) const {
1565 return std::max(2U, getMinVecRegSize() / Sz);
1566 }
1567
1568 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
1569 unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
1570 MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
1571 return MaxVF ? MaxVF : UINT_MAX;
1572 }
1573
1574 /// Check if homogeneous aggregate is isomorphic to some VectorType.
1575 /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
1576 /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
1577 /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
1578 ///
1579 /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
1580 unsigned canMapToVector(Type *T) const;
1581
1582 /// \returns True if the VectorizableTree is both tiny and not fully
1583 /// vectorizable. We do not vectorize such trees.
1584 bool isTreeTinyAndNotFullyVectorizable(bool ForReduction = false) const;
1585
1586 /// Checks if the graph and all its subgraphs cannot be better vectorized.
1587 /// This may happen if all gather nodes are loads that cannot be
1588 /// "clusterized". In this case even subgraphs cannot be vectorized more
1589 /// effectively than the base graph.
1590 bool isTreeNotExtendable() const;
1591
1592 /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
1593 /// can be load combined in the backend. Load combining may not be allowed in
1594 /// the IR optimizer, so we do not want to alter the pattern. For example,
1595 /// partially transforming a scalar bswap() pattern into vector code is
1596 /// effectively impossible for the backend to undo.
1597 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1598 /// may not be necessary.
1599 bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
1600
1601 /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
1602 /// can be load combined in the backend. Load combining may not be allowed in
1603 /// the IR optimizer, so we do not want to alter the pattern. For example,
1604 /// partially transforming a scalar bswap() pattern into vector code is
1605 /// effectively impossible for the backend to undo.
1606 /// TODO: If load combining is allowed in the IR optimizer, this analysis
1607 /// may not be necessary.
1608 bool isLoadCombineCandidate(ArrayRef<Value *> Stores) const;
1609
1610 /// Checks if the given array of loads can be represented as a vectorized
1611 /// load, a scatter, or just a simple gather.
1612 /// \param VL list of loads.
1613 /// \param VL0 main load value.
1614 /// \param Order returned order of the load instructions.
1615 /// \param PointerOps returned list of pointer operands.
1616 /// \param BestVF returns the best vector factor, if the recursive check found
1617 /// better vectorization sequences than a masked gather.
1618 /// \param TryRecursiveCheck used to check if a long masked gather can be
1619 /// represented as a series of loads/insert-subvector operations, if profitable.
1620 LoadsState canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
1621 SmallVectorImpl<unsigned> &Order,
1622 SmallVectorImpl<Value *> &PointerOps,
1623 unsigned *BestVF = nullptr,
1624 bool TryRecursiveCheck = true) const;
1625
1626 /// Registers a non-vectorizable sequence of loads.
1627 template <typename T> void registerNonVectorizableLoads(ArrayRef<T *> VL) {
1628 ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL));
1629 }
1630
1631 /// Checks if the given sequence of loads is already known to be non-vectorizable.
1632 template <typename T>
1633 bool areKnownNonVectorizableLoads(ArrayRef<T *> VL) const {
1634 return ListOfKnonwnNonVectorizableLoads.contains(hash_value(VL));
1635 }
1636
1638
1639 /// This structure holds any data we need about the edges being traversed
1640 /// during buildTree_rec(). We keep track of:
1641 /// (i) the user TreeEntry index, and
1642 /// (ii) the index of the edge.
1643 struct EdgeInfo {
1644 EdgeInfo() = default;
1645 EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
1646 : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
1647 /// The user TreeEntry.
1648 TreeEntry *UserTE = nullptr;
1649 /// The operand index of the use.
1650 unsigned EdgeIdx = UINT_MAX;
1651#ifndef NDEBUG
1652 friend inline raw_ostream &operator<<(raw_ostream &OS,
1653 const BoUpSLP::EdgeInfo &EI) {
1654 EI.dump(OS);
1655 return OS;
1656 }
1657 /// Debug print.
1658 void dump(raw_ostream &OS) const {
1659 OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
1660 << " EdgeIdx:" << EdgeIdx << "}";
1661 }
1662 LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
1663#endif
1664 bool operator == (const EdgeInfo &Other) const {
1665 return UserTE == Other.UserTE && EdgeIdx == Other.EdgeIdx;
1666 }
1667 };
1668
1669 /// A helper class used for scoring candidates for two consecutive lanes.
1670 class LookAheadHeuristics {
1671 const TargetLibraryInfo &TLI;
1672 const DataLayout &DL;
1673 ScalarEvolution &SE;
1674 const BoUpSLP &R;
1675 int NumLanes; // Total number of lanes (aka vectorization factor).
1676 int MaxLevel; // The maximum recursion depth for accumulating score.
1677
1678 public:
1679 LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL,
1680 ScalarEvolution &SE, const BoUpSLP &R, int NumLanes,
1681 int MaxLevel)
1682 : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
1683 MaxLevel(MaxLevel) {}
1684
1685 // The hard-coded scores listed here are not very important, though they
1686 // should be higher for better matches to improve the resulting cost. When
1687 // computing the scores of matching one sub-tree with another, we are
1688 // basically counting the number of values that are matching. So even if all
1689 // scores are set to 1, we would still get a decent matching result.
1690 // However, sometimes we have to break ties. For example we may have to
1691 // choose between matching loads vs matching opcodes. This is what these
1692 // scores are helping us with: they provide the order of preference. Also,
1693 // this is important if the scalar is externally used or used in another
1694 // tree entry node in the different lane.
1695
1696 /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
1697 static const int ScoreConsecutiveLoads = 4;
1698 /// The same load multiple times. This should have a better score than
1699 /// `ScoreSplat` because on x86, for a 2-lane vector, we can represent it
1700 /// with `movddup (%reg), xmm0`, which has a throughput of 0.5, versus 0.5
1701 /// for a vector load plus 1.0 for a broadcast.
1702 static const int ScoreSplatLoads = 3;
1703 /// Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
1704 static const int ScoreReversedLoads = 3;
1705 /// A load candidate for masked gather.
1706 static const int ScoreMaskedGatherCandidate = 1;
1707 /// ExtractElementInst from same vector and consecutive indexes.
1708 static const int ScoreConsecutiveExtracts = 4;
1709 /// ExtractElementInst from same vector and reversed indices.
1710 static const int ScoreReversedExtracts = 3;
1711 /// Constants.
1712 static const int ScoreConstants = 2;
1713 /// Instructions with the same opcode.
1714 static const int ScoreSameOpcode = 2;
1715 /// Instructions with alt opcodes (e.g., add + sub).
1716 static const int ScoreAltOpcodes = 1;
1717 /// Identical instructions (a.k.a. splat or broadcast).
1718 static const int ScoreSplat = 1;
1719 /// Matching with an undef is preferable to failing.
1720 static const int ScoreUndef = 1;
1721 /// Score for failing to find a decent match.
1722 static const int ScoreFail = 0;
1723 /// Score if all users are vectorized.
1724 static const int ScoreAllUserVectorized = 1;
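// NOTE (illustrative annotation, not part of the original source): a small
// example of how these constants order the preferences. If lane 0 holds
// `load A[0]`, then a candidate `load A[1]` for lane 1 scores
// ScoreConsecutiveLoads (4), a non-load instruction with a matching opcode
// scores ScoreSameOpcode (2), and an unrelated value scores ScoreFail (0),
// so the consecutive load wins the tie-break exactly as described above.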
1725
1726 /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
1727 /// \p U1 and \p U2 are the users of \p V1 and \p V2.
1728 /// Also, checks if \p V1 and \p V2 are compatible with instructions in \p
1729 /// MainAltOps.
1730 int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2,
1731 ArrayRef<Value *> MainAltOps) const {
1732 if (!isValidElementType(V1->getType()) ||
1733 !isValidElementType(V2->getType()))
1734 return LookAheadHeuristics::ScoreFail;
1735
1736 if (V1 == V2) {
1737 if (isa<LoadInst>(V1)) {
1738 // Returns true if the users of V1 and V2 won't need to be extracted.
1739 auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
1740 // Bail out if we have too many uses to save compilation time.
1741 if (V1->hasNUsesOrMore(UsesLimit) || V2->hasNUsesOrMore(UsesLimit))
1742 return false;
1743
1744 auto AllUsersVectorized = [U1, U2, this](Value *V) {
1745 return llvm::all_of(V->users(), [U1, U2, this](Value *U) {
1746 return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
1747 });
1748 };
1749 return AllUsersVectorized(V1) && AllUsersVectorized(V2);
1750 };
1751 // A broadcast of a load can be cheaper on some targets.
1752 if (R.TTI->isLegalBroadcastLoad(V1->getType(),
1753 ElementCount::getFixed(NumLanes)) &&
1754 ((int)V1->getNumUses() == NumLanes ||
1755 AllUsersAreInternal(V1, V2)))
1756 return LookAheadHeuristics::ScoreSplatLoads;
1757 }
1758 return LookAheadHeuristics::ScoreSplat;
1759 }
1760
1761 auto CheckSameEntryOrFail = [&]() {
1762 if (const TreeEntry *TE1 = R.getTreeEntry(V1);
1763 TE1 && TE1 == R.getTreeEntry(V2))
1764 return LookAheadHeuristics::ScoreSplatLoads;
1765 return LookAheadHeuristics::ScoreFail;
1766 };
1767
1768 auto *LI1 = dyn_cast<LoadInst>(V1);
1769 auto *LI2 = dyn_cast<LoadInst>(V2);
1770 if (LI1 && LI2) {
1771 if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
1772 !LI2->isSimple())
1773 return CheckSameEntryOrFail();
1774
1775 std::optional<int> Dist = getPointersDiff(
1776 LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
1777 LI2->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
1778 if (!Dist || *Dist == 0) {
1779 if (getUnderlyingObject(LI1->getPointerOperand()) ==
1780 getUnderlyingObject(LI2->getPointerOperand()) &&
1781 R.TTI->isLegalMaskedGather(
1782 getWidenedType(LI1->getType(), NumLanes), LI1->getAlign()))
1783 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1784 return CheckSameEntryOrFail();
1785 }
1786 // The distance is too large - still may be profitable to use masked
1787 // loads/gathers.
1788 if (std::abs(*Dist) > NumLanes / 2)
1789 return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1790 // This still will detect consecutive loads, but we might have "holes"
1791 // in some cases. It is ok for non-power-2 vectorization and may produce
1792 // better results. It should not affect current vectorization.
1793 return (*Dist > 0) ? LookAheadHeuristics::ScoreConsecutiveLoads
1794 : LookAheadHeuristics::ScoreReversedLoads;
1795 }
1796
1797 auto *C1 = dyn_cast<Constant>(V1);
1798 auto *C2 = dyn_cast<Constant>(V2);
1799 if (C1 && C2)
1800 return LookAheadHeuristics::ScoreConstants;
1801
1802 // Extracts from consecutive indexes of the same vector get a better score
1803 // as the extracts could be optimized away.
1804 Value *EV1;
1805 ConstantInt *Ex1Idx;
1806 if (match(V1, m_ExtractElt(m_Value(EV1), m_ConstantInt(Ex1Idx)))) {
1807 // Undefs are always profitable for extractelements.
1808 // Compiler can easily combine poison and extractelement <non-poison> or
1809 // undef and extractelement <poison>. But combining undef +
1810 // extractelement <non-poison-but-may-produce-poison> requires some
1811 // extra operations.
1812 if (isa<UndefValue>(V2))
1813 return (isa<PoisonValue>(V2) || isUndefVector(EV1).all())
1816 Value *EV2 = nullptr;
1817 ConstantInt *Ex2Idx = nullptr;
1818 if (match(V2,
1820 m_Undef())))) {
1821 // Undefs are always profitable for extractelements.
1822 if (!Ex2Idx)
1824 if (isUndefVector(EV2).all() && EV2->getType() == EV1->getType())
1826 if (EV2 == EV1) {
1827 int Idx1 = Ex1Idx->getZExtValue();
1828 int Idx2 = Ex2Idx->getZExtValue();
1829 int Dist = Idx2 - Idx1;
1830 // The distance is too large - still may be profitable to use
1831 // shuffles.
1832 if (std::abs(Dist) == 0)
1834 if (std::abs(Dist) > NumLanes / 2)
1838 }
1840 }
1841 return CheckSameEntryOrFail();
1842 }
1843
1844 auto *I1 = dyn_cast<Instruction>(V1);
1845 auto *I2 = dyn_cast<Instruction>(V2);
1846 if (I1 && I2) {
1847 if (I1->getParent() != I2->getParent())
1848 return CheckSameEntryOrFail();
1849 SmallVector<Value *, 4> Ops(MainAltOps);
1850 Ops.push_back(I1);
1851 Ops.push_back(I2);
1852 InstructionsState S = getSameOpcode(Ops, TLI);
1853 // Note: Only consider instructions with <= 2 operands to avoid
1854 // complexity explosion.
1855 if (S &&
1856 (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
1857 !S.isAltShuffle()) &&
1858 all_of(Ops, [&S](Value *V) {
1859 return isa<PoisonValue>(V) ||
1860 cast<Instruction>(V)->getNumOperands() ==
1861 S.getMainOp()->getNumOperands();
1862 }))
1863 return S.isAltShuffle() ? LookAheadHeuristics::ScoreAltOpcodes
1864 : LookAheadHeuristics::ScoreSameOpcode;
1865 }
1866
1867 if (I1 && isa<PoisonValue>(V2))
1868 return LookAheadHeuristics::ScoreSameOpcode;
1869
1870 if (isa<UndefValue>(V2))
1871 return LookAheadHeuristics::ScoreUndef;
1872
1873 return CheckSameEntryOrFail();
1874 }
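// NOTE (illustrative annotation, not part of the original source): for IR
// such as
//   %a = load i32, ptr %p
//   %b = load i32, ptr %q   ; where %q is a GEP one element past %p
// getShallowScore(%a, %b, ...) sees a pointer distance of 1 and returns
// ScoreConsecutiveLoads, while a pair of simple loads more than NumLanes/2
// elements apart is scored as ScoreMaskedGatherCandidate instead.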
1875
1876 /// Go through the operands of \p LHS and \p RHS recursively until
1877 /// MaxLevel, and return the cumulative score. \p U1 and \p U2 are
1878 /// the users of \p LHS and \p RHS (that is \p LHS and \p RHS are operands
1879 /// of \p U1 and \p U2), except at the beginning of the recursion where
1880 /// these are set to nullptr.
1881 ///
1882 /// For example:
1883 /// \verbatim
1884 /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
1885 /// \ / \ / \ / \ /
1886 /// + + + +
1887 /// G1 G2 G3 G4
1888 /// \endverbatim
1889 /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
1890 /// each level recursively, accumulating the score. It starts from matching
1891 /// the additions at level 0, then moves on to the loads (level 1). The
1892 /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
1893 /// {B[0],B[1]} match with LookAheadHeuristics::ScoreConsecutiveLoads, while
1894 /// {A[0],C[0]} has a score of LookAheadHeuristics::ScoreFail.
1895 /// Please note that the order of the operands does not matter, as we
1896 /// evaluate the score of all profitable combinations of operands. In
1897 /// other words the score of G1 and G4 is the same as G1 and G2. This
1898 /// heuristic is based on ideas described in:
1899 /// Look-ahead SLP: Auto-vectorization in the presence of commutative
1900 /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
1901 /// Luís F. W. Góes
1902 int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1,
1903 Instruction *U2, int CurrLevel,
1904 ArrayRef<Value *> MainAltOps) const {
1905
1906 // Get the shallow score of V1 and V2.
1907 int ShallowScoreAtThisLevel =
1908 getShallowScore(LHS, RHS, U1, U2, MainAltOps);
1909
1910 // If reached MaxLevel,
1911 // or if V1 and V2 are not instructions,
1912 // or if they are SPLAT,
1913 // or if they are not consecutive,
1914 // or if profitable to vectorize loads or extractelements, early return
1915 // the current cost.
1916 auto *I1 = dyn_cast<Instruction>(LHS);
1917 auto *I2 = dyn_cast<Instruction>(RHS);
1918 if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
1919 ShallowScoreAtThisLevel == LookAheadHeuristics::ScoreFail ||
1920 (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
1921 (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
1922 (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
1923 ShallowScoreAtThisLevel))
1924 return ShallowScoreAtThisLevel;
1925 assert(I1 && I2 && "Should have early exited.");
1926
1927 // Contains the I2 operand indexes that got matched with I1 operands.
1928 SmallSet<unsigned, 4> Op2Used;
1929
1930 // Recursion towards the operands of I1 and I2. We are trying all possible
1931 // operand pairs, and keeping track of the best score.
1932 for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
1933 OpIdx1 != NumOperands1; ++OpIdx1) {
1934 // Try to pair operand OpIdx1 of I1 with the best operand of I2.
1935 int MaxTmpScore = 0;
1936 unsigned MaxOpIdx2 = 0;
1937 bool FoundBest = false;
1938 // If I2 is commutative try all combinations.
1939 unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
1940 unsigned ToIdx = isCommutative(I2)
1941 ? I2->getNumOperands()
1942 : std::min(I2->getNumOperands(), OpIdx1 + 1);
1943 assert(FromIdx <= ToIdx && "Bad index");
1944 for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
1945 // Skip operands already paired with OpIdx1.
1946 if (Op2Used.count(OpIdx2))
1947 continue;
1948 // Recursively calculate the cost at each level
1949 int TmpScore =
1950 getScoreAtLevelRec(I1->getOperand(OpIdx1), I2->getOperand(OpIdx2),
1951 I1, I2, CurrLevel + 1, {});
1952 // Look for the best score.
1953 if (TmpScore > LookAheadHeuristics::ScoreFail &&
1954 TmpScore > MaxTmpScore) {
1955 MaxTmpScore = TmpScore;
1956 MaxOpIdx2 = OpIdx2;
1957 FoundBest = true;
1958 }
1959 }
1960 if (FoundBest) {
1961 // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
1962 Op2Used.insert(MaxOpIdx2);
1963 ShallowScoreAtThisLevel += MaxTmpScore;
1964 }
1965 }
1966 return ShallowScoreAtThisLevel;
1967 }
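// NOTE (illustrative annotation, not part of the original source): continuing
// the G1/G2 example from the comment above, getScoreAtLevelRec(G1, G2) first
// scores the two additions (ScoreSameOpcode) and then recurses into their
// operands, adding ScoreConsecutiveLoads once for {A[0], A[1]} and once for
// {B[0], B[1]}; the pair (G1, G3) stops accumulating as soon as its operand
// pairs fail to match, so (G1, G2) ends up with the higher total score.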
1968 };
1969 /// A helper data structure to hold the operands of a vector of instructions.
1970 /// This supports a fixed vector length for all operand vectors.
1971 class VLOperands {
1972 /// For each operand we need (i) the value, and (ii) the opcode that it
1973 /// would be attached to if the expression was in a left-linearized form.
1974 /// This is required to avoid illegal operand reordering.
1975 /// For example:
1976 /// \verbatim
1977 /// 0 Op1
1978 /// |/
1979 /// Op1 Op2 Linearized + Op2
1980 /// \ / ----------> |/
1981 /// - -
1982 ///
1983 /// Op1 - Op2 (0 + Op1) - Op2
1984 /// \endverbatim
1985 ///
1986 /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
1987 ///
1988 /// Another way to think of this is to track all the operations across the
1989 /// path from the operand all the way to the root of the tree and to
1990 /// calculate the operation that corresponds to this path. For example, the
1991 /// path from Op2 to the root crosses the RHS of the '-', therefore the
1992 /// corresponding operation is a '-' (which matches the one in the
1993 /// linearized tree, as shown above).
1994 ///
1995 /// For lack of a better term, we refer to this operation as Accumulated
1996 /// Path Operation (APO).
1997 struct OperandData {
1998 OperandData() = default;
1999 OperandData(Value *V, bool APO, bool IsUsed)
2000 : V(V), APO(APO), IsUsed(IsUsed) {}
2001 /// The operand value.
2002 Value *V = nullptr;
2003 /// TreeEntries only allow a single opcode, or an alternate sequence of
2004 /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
2005 /// APO. It is set to 'true' if 'V' is attached to an inverse operation
2006 /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
2007 /// (e.g., Add/Mul)
2008 bool APO = false;
2009 /// Helper data for the reordering function.
2010 bool IsUsed = false;
2011 };
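// NOTE (illustrative annotation, not part of the original source): for a lane
// computing `A - B`, operand A is recorded as {APO=false} and operand B as
// {APO=true}. Because getBestOperand() below skips candidates whose APO
// differs, A and B can never be swapped with each other (that would silently
// turn A - B into B - A), whereas both operands of a commutative `A + B`
// lane carry APO=false and remain freely interchangeable.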
2012
2013 /// During operand reordering, we are trying to select the operand in each
2014 /// lane that matches best with the operand in the neighboring lane. Our
2015 /// selection is based on the type of value we are looking for. For example,
2016 /// if the neighboring lane has a load, we need to look for a load that is
2017 /// accessing a consecutive address. These strategies are summarized in the
2018 /// 'ReorderingMode' enumerator.
2019 enum class ReorderingMode {
2020 Load, ///< Matching loads to consecutive memory addresses
2021 Opcode, ///< Matching instructions based on opcode (same or alternate)
2022 Constant, ///< Matching constants
2023 Splat, ///< Matching the same instruction multiple times (broadcast)
2024 Failed, ///< We failed to create a vectorizable group
2025 };
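// NOTE (illustrative annotation, not part of the original source): for lanes
// {A[0] + 5, A[1] + 7} where the A's are loads and the right-hand operands
// are constants, reorder() below assigns ReorderingMode::Load to operand
// index 0 (look for a consecutive load in the next lane) and
// ReorderingMode::Constant to operand index 1; getBestOperand() then searches
// every other lane under that per-operand strategy.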
2026
2027 using OperandDataVec = SmallVector<OperandData, 2>;
2028
2029 /// A vector of operand vectors.
2030 SmallVector<OperandDataVec, 4> OpsVec;
2031 /// When VL[0] is IntrinsicInst, ArgSize is CallBase::arg_size. When VL[0]
2032 /// is not IntrinsicInst, ArgSize is User::getNumOperands.
2033 unsigned ArgSize = 0;
2034
2035 const TargetLibraryInfo &TLI;
2036 const DataLayout &DL;
2037 ScalarEvolution &SE;
2038 const BoUpSLP &R;
2039 const Loop *L = nullptr;
2040
2041 /// \returns the operand data at \p OpIdx and \p Lane.
2042 OperandData &getData(unsigned OpIdx, unsigned Lane) {
2043 return OpsVec[OpIdx][Lane];
2044 }
2045
2046 /// \returns the operand data at \p OpIdx and \p Lane. Const version.
2047 const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
2048 return OpsVec[OpIdx][Lane];
2049 }
2050
2051 /// Clears the used flag for all entries.
2052 void clearUsed() {
2053 for (unsigned OpIdx = 0, NumOperands = getNumOperands();
2054 OpIdx != NumOperands; ++OpIdx)
2055 for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
2056 ++Lane)
2057 OpsVec[OpIdx][Lane].IsUsed = false;
2058 }
2059
2060 /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
2061 void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
2062 std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
2063 }
2064
2065 /// \param Lane lane of the operands under analysis.
2066 /// \param OpIdx operand index in lane \p Lane for which we're looking for
2067 /// the best candidate.
2068 /// \param Idx operand index of the current candidate value.
2069 /// \returns The additional score due to possible broadcasting of the
2070 /// elements in the lane. It is more profitable to have a power-of-2 number
2071 /// of unique elements in the lane, since it will be vectorized with higher
2072 /// probability after removing duplicates. Currently the SLP vectorizer
2073 /// supports only vectorization of a power-of-2 number of unique scalars.
2074 int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
2075 const SmallBitVector &UsedLanes) const {
2076 Value *IdxLaneV = getData(Idx, Lane).V;
2077 if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
2078 isa<ExtractElementInst>(IdxLaneV))
2079 return 0;
2080 SmallDenseMap<Value *, unsigned, 4> Uniques;
2081 for (unsigned Ln : seq<unsigned>(getNumLanes())) {
2082 if (Ln == Lane)
2083 continue;
2084 Value *OpIdxLnV = getData(OpIdx, Ln).V;
2085 if (!isa<Instruction>(OpIdxLnV))
2086 return 0;
2087 Uniques.try_emplace(OpIdxLnV, Ln);
2088 }
2089 unsigned UniquesCount = Uniques.size();
2090 auto IdxIt = Uniques.find(IdxLaneV);
2091 unsigned UniquesCntWithIdxLaneV =
2092 IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2093 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2094 auto OpIdxIt = Uniques.find(OpIdxLaneV);
2095 unsigned UniquesCntWithOpIdxLaneV =
2096 OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
2097 if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)
2098 return 0;
2099 return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
2100 UniquesCntWithOpIdxLaneV,
2101 UniquesCntWithOpIdxLaneV -
2102 bit_floor(UniquesCntWithOpIdxLaneV)) -
2103 ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
2104 ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
2105 : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
2106 }
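// NOTE (illustrative annotation, not part of the original source): a worked
// example of the formula above with 4 lanes. Suppose the OpIdx column already
// holds three distinct instructions in the other lanes (UniquesCount == 3),
// the value currently at (OpIdx, Lane) is new (UniquesCntWithOpIdxLaneV == 4,
// a power of two), and the candidate IdxLaneV duplicates one of the existing
// three (UniquesCntWithIdxLaneV == 3). The min() term is then 0 and the
// subtracted term is 1 (in either branch), so the candidate gets a splat
// score of -1: swapping it in would shrink the column from 4 unique scalars
// to 3, which is less friendly to power-of-2 vectorization.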
2107
2108 /// \param Lane lane of the operands under analysis.
2109 /// \param OpIdx operand index in lane \p Lane for which we're looking for
2110 /// the best candidate.
2111 /// \param Idx operand index of the current candidate value.
2112 /// \returns The additional score for the scalar which users are all
2113 /// vectorized.
2114 int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
2115 Value *IdxLaneV = getData(Idx, Lane).V;
2116 Value *OpIdxLaneV = getData(OpIdx, Lane).V;
2117 // Do not care about number of uses for vector-like instructions
2118 // (extractelement/extractvalue with constant indices), they are extracts
2119 // themselves and already externally used. Vectorization of such
2120 // instructions does not add extra extractelement instruction, just may
2121 // remove it.
2122 if (isVectorLikeInstWithConstOps(IdxLaneV) &&
2123 isVectorLikeInstWithConstOps(OpIdxLaneV))
2124 return LookAheadHeuristics::ScoreAllUserVectorized;
2125 auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
2126 if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))
2127 return 0;
2128 return R.areAllUsersVectorized(IdxLaneI)
2129 ? LookAheadHeuristics::ScoreAllUserVectorized
2130 : 0;
2131 }
2132
2133 /// Score scaling factor for fully compatible instructions but with
2134 /// different numbers of external uses. Allows better selection of the
2135 /// instructions with fewer external uses.
2136 static const int ScoreScaleFactor = 10;
2137
2138 /// \returns the look-ahead score, which tells us how much the sub-trees
2139 /// rooted at \p LHS and \p RHS match, the more they match the higher the
2140 /// score. This helps break ties in an informed way when we cannot decide on
2141 /// the order of the operands by just considering the immediate
2142 /// predecessors.
2143 int getLookAheadScore(Value *LHS, Value *RHS, ArrayRef<Value *> MainAltOps,
2144 int Lane, unsigned OpIdx, unsigned Idx,
2145 bool &IsUsed, const SmallBitVector &UsedLanes) {
2146 LookAheadHeuristics LookAhead(TLI, DL, SE, R, getNumLanes(),
2147 /*MaxLevel=*/LookAheadMaxDepth);
2148 // Keep track of the instruction stack as we recurse into the operands
2149 // during the look-ahead score exploration.
2150 int Score =
2151 LookAhead.getScoreAtLevelRec(LHS, RHS, /*U1=*/nullptr, /*U2=*/nullptr,
2152 /*CurrLevel=*/1, MainAltOps);
2153 if (Score) {
2154 int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
2155 if (Score <= -SplatScore) {
2156 // Failed score.
2157 Score = 0;
2158 } else {
2159 Score += SplatScore;
2160 // Scale score to see the difference between different operands
2161 // and similar operands but all vectorized/not all vectorized
2162 // uses. It does not affect actual selection of the best
2163 // compatible operand in general, just allows to select the
2164 // operand with all vectorized uses.
2165 Score *= ScoreScaleFactor;
2166 Score += getExternalUseScore(Lane, OpIdx, Idx);
2167 IsUsed = true;
2168 }
2169 }
2170 return Score;
2171 }
2172
2173 /// Best defined scores per lanes between the passes. Used to choose the
2174 /// best operand (with the highest score) between the passes.
2175 /// The key - {Operand Index, Lane}.
2176 /// The value - the best score between the passes for the lane and the
2177 /// operand.
2178 SmallDenseMap<std::pair<unsigned, unsigned>, unsigned, 8>
2179 BestScoresPerLanes;
2180
2181 // Search all operands in Ops[*][Lane] for the one that matches best
2182 // Ops[OpIdx][LastLane] and return its operand index.
2183 // If no good match can be found, return std::nullopt.
2184 std::optional<unsigned>
2185 getBestOperand(unsigned OpIdx, int Lane, int LastLane,
2186 ArrayRef<ReorderingMode> ReorderingModes,
2187 ArrayRef<Value *> MainAltOps,
2188 const SmallBitVector &UsedLanes) {
2189 unsigned NumOperands = getNumOperands();
2190
2191 // The operand of the previous lane at OpIdx.
2192 Value *OpLastLane = getData(OpIdx, LastLane).V;
2193
2194 // Our strategy mode for OpIdx.
2195 ReorderingMode RMode = ReorderingModes[OpIdx];
2196 if (RMode == ReorderingMode::Failed)
2197 return std::nullopt;
2198
2199 // The linearized opcode of the operand at OpIdx, Lane.
2200 bool OpIdxAPO = getData(OpIdx, Lane).APO;
2201
2202 // The best operand index and its score.
2203 // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
2204 // are using the score to differentiate between the two.
2205 struct BestOpData {
2206 std::optional<unsigned> Idx;
2207 unsigned Score = 0;
2208 } BestOp;
2209 BestOp.Score =
2210 BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)
2211 .first->second;
2212
2213 // Track if the operand must be marked as used. If the operand is set to
2214 // Score 1 explicitly (because of a non-power-of-2 number of unique scalars),
2215 // we may want to reestimate the operands again on the following iterations.
2216 bool IsUsed = RMode == ReorderingMode::Splat ||
2217 RMode == ReorderingMode::Constant ||
2218 RMode == ReorderingMode::Load;
2219 // Iterate through all unused operands and look for the best.
2220 for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
2221 // Get the operand at Idx and Lane.
2222 OperandData &OpData = getData(Idx, Lane);
2223 Value *Op = OpData.V;
2224 bool OpAPO = OpData.APO;
2225
2226 // Skip already selected operands.
2227 if (OpData.IsUsed)
2228 continue;
2229
2230 // Skip if we are trying to move the operand to a position with a
2231 // different opcode in the linearized tree form. This would break the
2232 // semantics.
2233 if (OpAPO != OpIdxAPO)
2234 continue;
2235
2236 // Look for an operand that matches the current mode.
2237 switch (RMode) {
2238 case ReorderingMode::Load:
2239 case ReorderingMode::Opcode: {
2240 bool LeftToRight = Lane > LastLane;
2241 Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
2242 Value *OpRight = (LeftToRight) ? Op : OpLastLane;
2243 int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
2244 OpIdx, Idx, IsUsed, UsedLanes);
2245 if (Score > static_cast<int>(BestOp.Score) ||
2246 (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
2247 Idx == OpIdx)) {
2248 BestOp.Idx = Idx;
2249 BestOp.Score = Score;
2250 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
2251 }
2252 break;
2253 }
2254 case ReorderingMode::Constant:
2255 if (isa<Constant>(Op) ||
2256 (!BestOp.Score && L && L->isLoopInvariant(Op))) {
2257 BestOp.Idx = Idx;
2258 if (isa<Constant>(Op)) {
2259 BestOp.Score = LookAheadHeuristics::ScoreConstants;
2260 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2261 LookAheadHeuristics::ScoreConstants;
2262 }
2263 if (isa<UndefValue>(Op) || !isa<Constant>(Op))
2264 IsUsed = false;
2265 }
2266 break;
2267 case ReorderingMode::Splat:
2268 if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
2269 IsUsed = Op == OpLastLane;
2270 if (Op == OpLastLane) {
2271 BestOp.Score = LookAheadHeuristics::ScoreSplat;
2272 BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
2273 LookAheadHeuristics::ScoreSplat;
2274 }
2275 BestOp.Idx = Idx;
2276 }
2277 break;
2278 case ReorderingMode::Failed:
2279 llvm_unreachable("Not expected Failed reordering mode.");
2280 }
2281 }
2282
2283 if (BestOp.Idx) {
2284 getData(*BestOp.Idx, Lane).IsUsed = IsUsed;
2285 return BestOp.Idx;
2286 }
2287 // If we could not find a good match return std::nullopt.
2288 return std::nullopt;
2289 }
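// NOTE (illustrative annotation, not part of the original source): given two
// lanes `A[0] + B[0]` and `B[1] + A[1]` with lane 0 already fixed,
// getBestOperand() for operand index 0 at lane 1 compares the candidates
// B[1] and A[1] against OpLastLane == A[0]; the look-ahead score of
// {A[0], A[1]} (consecutive loads) beats {A[0], B[1]}, so the index of A[1]
// is returned and the caller swaps it into position 0, aligning the A's and
// the B's into separate columns.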
2290
2291 /// Helper for reorderOperandVecs.
2292 /// \returns the lane that we should start reordering from. This is the one
2293 /// which has the smallest number of operands that can freely move about, or
2294 /// is least profitable to reorder because it already has the most optimal set of operands.
2295 unsigned getBestLaneToStartReordering() const {
2296 unsigned Min = UINT_MAX;
2297 unsigned SameOpNumber = 0;
2298 // std::pair<unsigned, unsigned> is used to implement a simple voting
2299 // algorithm and choose the lane with the smallest number of operands that
2300 // can freely move about, or that is least profitable because it already has
2301 // the most optimal set of operands. The first unsigned is a counter for
2302 // voting, the second unsigned is the counter of lanes with instructions
2303 // with same/alternate opcodes and same parent basic block.
2304 SmallDenseMap<unsigned, std::pair<unsigned, unsigned>> HashMap;
2305 // Try to be closer to the original results, if we have multiple lanes
2306 // with same cost. If 2 lanes have the same cost, use the one with the
2307 // highest index.
2308 for (int I = getNumLanes(); I > 0; --I) {
2309 unsigned Lane = I - 1;
2310 OperandsOrderData NumFreeOpsHash =
2311 getMaxNumOperandsThatCanBeReordered(Lane);
2312 // Compare the number of operands that can move and choose the one with
2313 // the least number.
2314 if (NumFreeOpsHash.NumOfAPOs < Min) {
2315 Min = NumFreeOpsHash.NumOfAPOs;
2316 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2317 HashMap.clear();
2318 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2319 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2320 NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {
2321 // Select the most optimal lane in terms of number of operands that
2322 // should be moved around.
2323 SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
2324 HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
2325 } else if (NumFreeOpsHash.NumOfAPOs == Min &&
2326 NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
2327 auto [It, Inserted] =
2328 HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
2329 if (!Inserted)
2330 ++It->second.first;
2331 }
2332 }
2333 // Select the lane with the minimum counter.
2334 unsigned BestLane = 0;
2335 unsigned CntMin = UINT_MAX;
2336 for (const auto &Data : reverse(HashMap)) {
2337 if (Data.second.first < CntMin) {
2338 CntMin = Data.second.first;
2339 BestLane = Data.second.second;
2340 }
2341 }
2342 return BestLane;
2343 }
2344
2345 /// Data structure that helps to reorder operands.
2346 struct OperandsOrderData {
2347 /// The best number of operands with the same APOs, which can be
2348 /// reordered.
2349 unsigned NumOfAPOs = UINT_MAX;
2350 /// Number of operands with the same/alternate instruction opcode and
2351 /// parent.
2352 unsigned NumOpsWithSameOpcodeParent = 0;
2353 /// Hash for the actual operands ordering.
2354 /// Used to count operands, actually their position id and opcode
2355 /// value. It is used in the voting mechanism to find the lane with the
2356 /// smallest number of operands that can freely move about, or that is less
2357 /// profitable because it already has the most optimal set of operands. Can be
2358 /// replaced with SmallVector<unsigned> instead but hash code is faster
2359 /// and requires less memory.
2360 unsigned Hash = 0;
2361 };
2362 /// \returns the maximum number of operands that are allowed to be reordered
2363 /// for \p Lane and the number of compatible instructions (with the same
2364 /// parent/opcode). This is used as a heuristic for selecting the first lane
2365 /// to start operand reordering.
2366 OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
2367 unsigned CntTrue = 0;
2368 unsigned NumOperands = getNumOperands();
2369 // Operands with the same APO can be reordered. We therefore need to count
2370 // how many of them we have for each APO, like this: Cnt[APO] = x.
2371 // Since we only have two APOs, namely true and false, we can avoid using
2372 // a map. Instead we can simply count the number of operands that
2373 // correspond to one of them (in this case the 'true' APO), and calculate
2374 // the other by subtracting it from the total number of operands.
2375 // Operands with the same instruction opcode and parent are more
2376 // profitable since we don't need to move them in many cases, with a high
2377 // probability such lane already can be vectorized effectively.
2378 bool AllUndefs = true;
2379 unsigned NumOpsWithSameOpcodeParent = 0;
2380 Instruction *OpcodeI = nullptr;
2381 BasicBlock *Parent = nullptr;
2382 unsigned Hash = 0;
2383 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2384 const OperandData &OpData = getData(OpIdx, Lane);
2385 if (OpData.APO)
2386 ++CntTrue;
2387 // Use Boyer-Moore majority voting for finding the majority opcode and
2388 // the number of times it occurs.
2389 if (auto *I = dyn_cast<Instruction>(OpData.V)) {
2390 if (!OpcodeI || !getSameOpcode({OpcodeI, I}, TLI) ||
2391 I->getParent() != Parent) {
2392 if (NumOpsWithSameOpcodeParent == 0) {
2393 NumOpsWithSameOpcodeParent = 1;
2394 OpcodeI = I;
2395 Parent = I->getParent();
2396 } else {
2397 --NumOpsWithSameOpcodeParent;
2398 }
2399 } else {
2400 ++NumOpsWithSameOpcodeParent;
2401 }
2402 }
2403 Hash = hash_combine(
2404 Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
2405 AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
2406 }
2407 if (AllUndefs)
2408 return {};
2409 OperandsOrderData Data;
2410 Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
2411 Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
2412 Data.Hash = Hash;
2413 return Data;
2414 }
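// NOTE (illustrative annotation, not part of the original source): the
// Boyer-Moore vote above runs over one lane's operand list. For operands
// {add, add, load, add} in the same block the counter evolves 1, 2, 1, 2, so
// NumOpsWithSameOpcodeParent == 2 with `add` as the surviving majority
// candidate; this count, together with NumOfAPOs and Hash, feeds the lane
// selection in getBestLaneToStartReordering().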
2415
2416 /// Go through the instructions in VL and append their operands.
2417 void appendOperandsOfVL(ArrayRef<Value *> VL, const InstructionsState &S) {
2418 assert(!VL.empty() && "Bad VL");
2419 assert((empty() || VL.size() == getNumLanes()) &&
2420 "Expected same number of lanes");
2421 assert(S.valid() && "InstructionsState is invalid.");
2422 // IntrinsicInst::isCommutative returns true if swapping the first "two"
2423 // arguments to the intrinsic produces the same result.
2424 constexpr unsigned IntrinsicNumOperands = 2;
2425 Instruction *MainOp = S.getMainOp();
2426 unsigned NumOperands = MainOp->getNumOperands();
2427 ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
2428 OpsVec.resize(NumOperands);
2429 unsigned NumLanes = VL.size();
2430 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2431 OpsVec[OpIdx].resize(NumLanes);
2432 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
2433 assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
2434 "Expected instruction or poison value");
2435 // Our tree has just 3 nodes: the root and two operands.
2436 // It is therefore trivial to get the APO. We only need to check the
2437 // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
2438 // RHS operand. The LHS operand of both add and sub is never attached
2439 // to an inverse operation in the linearized form, therefore its APO
2440 // is false. The RHS is true only if VL[Lane] is an inverse operation.
2441
2442 // Since operand reordering is performed on groups of commutative
2443 // operations or alternating sequences (e.g., +, -), we can safely
2444 // tell the inverse operations by checking commutativity.
2445 if (isa<PoisonValue>(VL[Lane])) {
2446 if (auto *EI = dyn_cast<ExtractElementInst>(MainOp)) {
2447 if (OpIdx == 0) {
2448 OpsVec[OpIdx][Lane] = {EI->getVectorOperand(), true, false};
2449 continue;
2450 }
2451 } else if (auto *EV = dyn_cast<ExtractValueInst>(MainOp)) {
2452 if (OpIdx == 0) {
2453 OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(), true, false};
2454 continue;
2455 }
2456 }
2457 OpsVec[OpIdx][Lane] = {
2458 PoisonValue::get(MainOp->getOperand(OpIdx)->getType()), true,
2459 false};
2460 continue;
2461 }
2462 bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
2463 bool APO = (OpIdx == 0) ? false : IsInverseOperation;
2464 OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
2465 APO, false};
2466 }
2467 }
2468 }
2469
2470 /// \returns the number of operands.
2471 unsigned getNumOperands() const { return ArgSize; }
2472
2473 /// \returns the number of lanes.
2474 unsigned getNumLanes() const { return OpsVec[0].size(); }
2475
2476 /// \returns the operand value at \p OpIdx and \p Lane.
2477 Value *getValue(unsigned OpIdx, unsigned Lane) const {
2478 return getData(OpIdx, Lane).V;
2479 }
2480
2481 /// \returns true if the data structure is empty.
2482 bool empty() const { return OpsVec.empty(); }
2483
2484 /// Clears the data.
2485 void clear() { OpsVec.clear(); }
2486
2487 /// \returns true if there are enough operands identical to \p Op to fill
2488 /// the whole vector (possibly mixed with constants or loop-invariant values).
2489 /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
2490 bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
2491 assert(Op == getValue(OpIdx, Lane) &&
2492 "Op is expected to be getValue(OpIdx, Lane).");
2493 // Small number of loads - try load matching.
2494 if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)
2495 return false;
2496 bool OpAPO = getData(OpIdx, Lane).APO;
2497 bool IsInvariant = L && L->isLoopInvariant(Op);
2498 unsigned Cnt = 0;
2499 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2500 if (Ln == Lane)
2501 continue;
2502 // This is set to true if we found a candidate for broadcast at Lane.
2503 bool FoundCandidate = false;
2504 for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
2505 OperandData &Data = getData(OpI, Ln);
2506 if (Data.APO != OpAPO || Data.IsUsed)
2507 continue;
2508 Value *OpILane = getValue(OpI, Lane);
2509 bool IsConstantOp = isa<Constant>(OpILane);
2510 // Consider the broadcast candidate if:
2511 // 1. Same value is found in one of the operands.
2512 if (Data.V == Op ||
2513 // 2. The operand in the given lane is not constant but there is a
2514 // constant operand in another lane (which can be moved to the
2515 // given lane). In this case we can represent it as a simple
2516 // permutation of constant and broadcast.
2517 (!IsConstantOp &&
2518 ((Lns > 2 && isa<Constant>(Data.V)) ||
2519 // 2.1. If we have only 2 lanes, need to check that value in the
2520 // next lane does not build same opcode sequence.
2521 (Lns == 2 &&
2522 !getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI) &&
2523 isa<Constant>(Data.V)))) ||
2524 // 3. The operand in the current lane is loop invariant (can be
2525 // hoisted out) and another operand is also a loop invariant
2526 // (though not a constant). In this case the whole vector can be
2527 // hoisted out.
2528 // FIXME: need to teach the cost model about this case for better
2529 // estimation.
2530 (IsInvariant && !isa<Constant>(Data.V) &&
2531 !getSameOpcode({Op, Data.V}, TLI) &&
2532 L->isLoopInvariant(Data.V))) {
2533 FoundCandidate = true;
2534 Data.IsUsed = Data.V == Op;
2535 if (Data.V == Op)
2536 ++Cnt;
2537 break;
2538 }
2539 }
2540 if (!FoundCandidate)
2541 return false;
2542 }
2543 return getNumLanes() == 2 || Cnt > 1;
2544 }
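// NOTE (illustrative annotation, not part of the original source):
// shouldBroadcast() returns true for an operand column such as {X, 7, X, X}
// across four lanes: every other lane contributes either X itself or a
// constant that can be moved into place, so the column can be emitted as a
// broadcast of X plus a cheap constant permutation, and reorder() below then
// switches that operand index to ReorderingMode::Splat.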
2545
2546 /// Checks if there is at least one operand in a lane other than \p Lane
2547 /// that is compatible with the operand \p Op.
2548 bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
2549 assert(Op == getValue(OpIdx, Lane) &&
2550 "Op is expected to be getValue(OpIdx, Lane).");
2551 bool OpAPO = getData(OpIdx, Lane).APO;
2552 for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
2553 if (Ln == Lane)
2554 continue;
2555 if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
2556 const OperandData &Data = getData(OpI, Ln);
2557 if (Data.APO != OpAPO || Data.IsUsed)
2558 return true;
2559 Value *OpILn = getValue(OpI, Ln);
2560 return (L && L->isLoopInvariant(OpILn)) ||
2561 (getSameOpcode({Op, OpILn}, TLI) &&
2562 allSameBlock({Op, OpILn}));
2563 }))
2564 return true;
2565 }
2566 return false;
2567 }
2568
2569 public:
2570 /// Initialize with all the operands of the instruction vector \p RootVL.
2571 VLOperands(ArrayRef<Value *> RootVL, const InstructionsState &S,
2572 const BoUpSLP &R)
2573 : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2574 L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
2575 // Append all the operands of RootVL.
2576 appendOperandsOfVL(RootVL, S);
2577 }
2578
2579 /// \returns a value vector with the operands across all lanes for the
2580 /// operand at \p OpIdx.
2581 ValueList getVL(unsigned OpIdx) const {
2582 ValueList OpVL(OpsVec[OpIdx].size());
2583 assert(OpsVec[OpIdx].size() == getNumLanes() &&
2584 "Expected same num of lanes across all operands");
2585 for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
2586 OpVL[Lane] = OpsVec[OpIdx][Lane].V;
2587 return OpVL;
2588 }
2589
2590 // Performs operand reordering for 2 or more operands.
2591 // The original operands are in OrigOps[OpIdx][Lane].
2592 // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
2593 void reorder() {
2594 unsigned NumOperands = getNumOperands();
2595 unsigned NumLanes = getNumLanes();
2596 // Each operand has its own mode. We are using this mode to help us select
2597 // the instructions for each lane, so that they match best with the ones
2598 // we have selected so far.
2599 SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
2600
2601 // This is a greedy single-pass algorithm. We are going over each lane
2602 // once and deciding on the best order right away with no back-tracking.
2603 // However, in order to increase its effectiveness, we start with the lane
2604 // that has operands that can move the least. For example, given the
2605 // following lanes:
2606 // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
2607 // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
2608 // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
2609 // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
2610 // we will start at Lane 1, since the operands of the subtraction cannot
2611 // be reordered. Then we will visit the rest of the lanes in a circular
2612 // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
2613
2614 // Find the first lane that we will start our search from.
2615 unsigned FirstLane = getBestLaneToStartReordering();
2616
2617 // Initialize the modes.
2618 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2619 Value *OpLane0 = getValue(OpIdx, FirstLane);
2620 // Keep track if we have instructions with all the same opcode on one
2621 // side.
2622 if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
2623 // Check if OpLane0 should be broadcast.
2624 if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
2625 !canBeVectorized(OpILane0, OpIdx, FirstLane))
2626 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2627 else if (isa<LoadInst>(OpILane0))
2628 ReorderingModes[OpIdx] = ReorderingMode::Load;
2629 else
2630 ReorderingModes[OpIdx] = ReorderingMode::Opcode;
2631 } else if (isa<Constant>(OpLane0)) {
2632 ReorderingModes[OpIdx] = ReorderingMode::Constant;
2633 } else if (isa<Argument>(OpLane0)) {
2634 // Our best hope is a Splat. It may save some cost in some cases.
2635 ReorderingModes[OpIdx] = ReorderingMode::Splat;
2636 } else {
2637 llvm_unreachable("Unexpected value kind.");
2638 }
2639 }
2640
2641 // Check that we don't have the same operands. There is no need to reorder
2642 // if the operands are just a perfect diamond or shuffled diamond match.
2643 // Skip this only for possible broadcasts or a non-power-of-2 number of
2644 // scalars (just for now).
2645 auto &&SkipReordering = [this]() {
2646 SmallPtrSet<Value *, 4> UniqueValues;
2647 ArrayRef<OperandData> Op0 = OpsVec.front();
2648 for (const OperandData &Data : Op0)
2649 UniqueValues.insert(Data.V);
2650 for (ArrayRef<OperandData> Op :
2651 ArrayRef(OpsVec).slice(1, getNumOperands() - 1)) {
2652 if (any_of(Op, [&UniqueValues](const OperandData &Data) {
2653 return !UniqueValues.contains(Data.V);
2654 }))
2655 return false;
2656 }
2657 // TODO: Check if we can remove a check for non-power-2 number of
2658 // scalars after full support of non-power-2 vectorization.
2659 return UniqueValues.size() != 2 &&
2660 hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(),
2661 UniqueValues.size());
2662 };
2663
2664 // If the initial strategy fails for any of the operand indexes, then we
2665 // perform reordering again in a second pass. This helps avoid assigning
2666 // high priority to the failed strategy, and should improve reordering for
2667 // the non-failed operand indexes.
2668 for (int Pass = 0; Pass != 2; ++Pass) {
2669 // Check if there is no need to reorder the operands, since they are a
2670 // perfect or shuffled diamond match.
2671 // Need to do it to avoid extra external use cost counting for
2672 // shuffled matches, which may cause regressions.
2673 if (SkipReordering())
2674 break;
2675 // Skip the second pass if the first pass did not fail.
2676 bool StrategyFailed = false;
2677 // Mark all operand data as free to use.
2678 clearUsed();
2679 // We keep the original operand order for the FirstLane, so reorder the
2680 // rest of the lanes. We are visiting the nodes in a circular fashion,
2681 // using FirstLane as the center point and increasing the radius
2682 // distance.
2683 SmallVector<SmallVector<Value *, 2>> MainAltOps(NumOperands);
2684 for (unsigned I = 0; I < NumOperands; ++I)
2685 MainAltOps[I].push_back(getData(I, FirstLane).V);
2686
2687 SmallBitVector UsedLanes(NumLanes);
2688 UsedLanes.set(FirstLane);
2689 for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
2690 // Visit the lane on the right and then the lane on the left.
2691 for (int Direction : {+1, -1}) {
2692 int Lane = FirstLane + Direction * Distance;
2693 if (Lane < 0 || Lane >= (int)NumLanes)
2694 continue;
2695 UsedLanes.set(Lane);
2696 int LastLane = Lane - Direction;
2697 assert(LastLane >= 0 && LastLane < (int)NumLanes &&
2698 "Out of bounds");
2699 // Look for a good match for each operand.
2700 for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
2701 // Search for the operand that matches SortedOps[OpIdx][Lane-1].
2702 std::optional<unsigned> BestIdx =
2703 getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
2704 MainAltOps[OpIdx], UsedLanes);
2705 // By not selecting a value, we allow the operands that follow to
2706 // select a better matching value. We will get a non-null value in
2707 // the next run of getBestOperand().
2708 if (BestIdx) {
2709 // Swap the current operand with the one returned by
2710 // getBestOperand().
2711 swap(OpIdx, *BestIdx, Lane);
2712 } else {
2713 // Enable the second pass.
2714 StrategyFailed = true;
2715 }
2716 // Try to get the alternate opcode and follow it during analysis.
2717 if (MainAltOps[OpIdx].size() != 2) {
2718 OperandData &AltOp = getData(OpIdx, Lane);
2719 InstructionsState OpS =
2720 getSameOpcode({MainAltOps[OpIdx].front(), AltOp.V}, TLI);
2721 if (OpS && OpS.isAltShuffle())
2722 MainAltOps[OpIdx].push_back(AltOp.V);
2723 }
2724 }
2725 }
2726 }
2727 // Skip second pass if the strategy did not fail.
2728 if (!StrategyFailed)
2729 break;
2730 }
2731 }
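// NOTE (illustrative annotation, not part of the original source): for the
// four lanes in the comment above, reorder() keeps lane 1 (C[1] - B[1])
// untouched and then, walking outwards lane by lane, swaps the operands of
// the commutative '+' lanes so that one operand column becomes
// {C[0], C[1], C[2], C[3]} and the other {B[0], B[1], B[2], B[3]}, turning
// both columns into consecutive-load groups despite the mixed +/- opcodes.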
2732
2733#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2734 LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
2735 switch (RMode) {
2736 case ReorderingMode::Load:
2737 return "Load";
2738 case ReorderingMode::Opcode:
2739 return "Opcode";
2740 case ReorderingMode::Constant:
2741 return "Constant";
2742 case ReorderingMode::Splat:
2743 return "Splat";
2744 case ReorderingMode::Failed:
2745 return "Failed";
2746 }
2747 llvm_unreachable("Unimplemented Reordering Type");
2748 }
2749
2750 LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
2751 raw_ostream &OS) {
2752 return OS << getModeStr(RMode);
2753 }
2754
2755 /// Debug print.
2756 LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
2757 printMode(RMode, dbgs());
2758 }
2759
2760 friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
2761 return printMode(RMode, OS);
2762 }
2763
2764 raw_ostream &print(raw_ostream &OS) const {
2765 const unsigned Indent = 2;
2766 unsigned Cnt = 0;
2767 for (const OperandDataVec &OpDataVec : OpsVec) {
2768 OS << "Operand " << Cnt++ << "\n";
2769 for (const OperandData &OpData : OpDataVec) {
2770 OS.indent(Indent) << "{";
2771 if (Value *V = OpData.V)
2772 OS << *V;
2773 else
2774 OS << "null";
2775 OS << ", APO:" << OpData.APO << "}\n";
2776 }
2777 OS << "\n";
2778 }
2779 return OS;
2780 }
2781
2782 /// Debug print.
2783 LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
2784#endif
2785 };
2786
2787 /// Evaluate each pair in \p Candidates and return the index into \p Candidates
2788 /// of the pair with the highest score, deemed to have the best chance to form
2789 /// the root of a profitable tree to vectorize. Return std::nullopt if no
2790 /// candidate scored above LookAheadHeuristics::ScoreFail.
2791 /// \param Limit Lower limit of the cost, considered to be a good enough score.
2792 std::optional<int>
2793 findBestRootPair(ArrayRef<std::pair<Value *, Value *>> Candidates,
2794 int Limit = LookAheadHeuristics::ScoreFail) const {
2795 LookAheadHeuristics LookAhead(*TLI, *DL, *SE, *this, /*NumLanes=*/2,
2796 /*MaxLevel=*/RootLookAheadMaxDepth);
2797 int BestScore = Limit;
2798 std::optional<int> Index;
2799 for (int I : seq<int>(0, Candidates.size())) {
2800 int Score = LookAhead.getScoreAtLevelRec(Candidates[I].first,
2801 Candidates[I].second,
2802 /*U1=*/nullptr, /*U2=*/nullptr,
2803 /*CurrLevel=*/1, {});
2804 if (Score > BestScore) {
2805 BestScore = Score;
2806 Index = I;
2807 }
2808 }
2809 return Index;
2810 }
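// NOTE (illustrative annotation, not part of the original source): a typical
// use of findBestRootPair() is picking the pair of scalars to seed a tree
// with. For candidates {(load A[0], load A[1]), (load A[0], %c)} where %c is
// an unrelated value, the first pair scores ScoreConsecutiveLoads and keeps
// accumulating through the recursion, while the second bottoms out at
// ScoreFail, so index 0 is returned.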
2811
2812 /// Checks if the instruction is marked for deletion.
2813 bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
2814
2815 /// Removes an instruction from its block and eventually deletes it.
2816 /// It's like Instruction::eraseFromParent() except that the actual deletion
2817 /// is delayed until BoUpSLP is destructed.
2818 void eraseInstruction(Instruction *I) {
2819 DeletedInstructions.insert(I);
2820 }
2821
2822 /// Remove instructions from the parent function and clear the operands of \p
2823 /// DeadVals instructions, marking for deletion trivially dead operands.
2824 template <typename T>
2825 void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
2826 SmallVector<WeakTrackingVH> DeadInsts;
2827 for (T *V : DeadVals) {
2828 auto *I = cast<Instruction>(V);
2829 DeletedInstructions.insert(I);
2830 }
2831 DenseSet<Value *> Processed;
2832 for (T *V : DeadVals) {
2833 if (!V || !Processed.insert(V).second)
2834 continue;
2835 auto *I = cast<Instruction>(V);
2836 salvageDebugInfo(*I);
2837 SmallVector<const TreeEntry *> Entries;
2838 if (const TreeEntry *Entry = getTreeEntry(I)) {
2839 Entries.push_back(Entry);
2840 auto It = MultiNodeScalars.find(I);
2841 if (It != MultiNodeScalars.end())
2842 Entries.append(It->second.begin(), It->second.end());
2843 }
2844 for (Use &U : I->operands()) {
2845 if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2846 OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2847 isInstructionTriviallyDead(OpI, TLI) &&
2848 (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
2849 return Entry->VectorizedValue == OpI;
2850 })))
2851 DeadInsts.push_back(OpI);
2852 }
2853 I->dropAllReferences();
2854 }
2855 for (T *V : DeadVals) {
2856 auto *I = cast<Instruction>(V);
2857 if (!I->getParent())
2858 continue;
2859 assert((I->use_empty() || all_of(I->uses(),
2860 [&](Use &U) {
2861 return isDeleted(
2862 cast<Instruction>(U.getUser()));
2863 })) &&
2864 "trying to erase instruction with users.");
2865 I->removeFromParent();
2866 SE->forgetValue(I);
2867 }
2868 // Process the dead instruction list until empty.
2869 while (!DeadInsts.empty()) {
2870 Value *V = DeadInsts.pop_back_val();
2871 Instruction *VI = cast_or_null<Instruction>(V);
2872 if (!VI || !VI->getParent())
2873 continue;
2875 "Live instruction found in dead worklist!");
2876 assert(VI->use_empty() && "Instructions with uses are not dead.");
2877
2878 // Don't lose the debug info while deleting the instructions.
2879 salvageDebugInfo(*VI);
2880
2881 // Null out all of the instruction's operands to see if any operand
2882 // becomes dead as we go.
2883 for (Use &OpU : VI->operands()) {
2884 Value *OpV = OpU.get();
2885 if (!OpV)
2886 continue;
2887 OpU.set(nullptr);
2888
2889 if (!OpV->use_empty())
2890 continue;
2891
2892 // If the operand is an instruction that became dead as we nulled out
2893 // the operand, and if it is 'trivially' dead, delete it in a future
2894 // loop iteration.
2895 if (auto *OpI = dyn_cast<Instruction>(OpV))
2896 if (!DeletedInstructions.contains(OpI) &&
2897 isInstructionTriviallyDead(OpI, TLI))
2898 DeadInsts.push_back(OpI);
2899 }
2900
2901 VI->removeFromParent();
2902 DeletedInstructions.insert(VI);
2903 SE->forgetValue(VI);
2904 }
2905 }
2906
2907 /// Checks if the instruction was already analyzed for being a possible
2908 /// reduction root.
2909 bool isAnalyzedReductionRoot(Instruction *I) const {
2910 return AnalyzedReductionsRoots.count(I);
2911 }
2912 /// Register the given instruction as already analyzed for being a possible
2913 /// reduction root.
2914 void analyzedReductionRoot(Instruction *I) {
2915 AnalyzedReductionsRoots.insert(I);
2916 }
2917 /// Checks if the provided list of reduced values was already checked for
2918 /// vectorization.
2919 bool areAnalyzedReductionVals(ArrayRef<Value *> VL) const {
2920 return AnalyzedReductionVals.contains(hash_value(VL));
2921 }
2922 /// Adds the list of reduced values to list of already checked values for the
2923 /// vectorization.
2924 void analyzedReductionVals(ArrayRef<Value *> VL) {
2925 AnalyzedReductionVals.insert(hash_value(VL));
2926 }
2927 /// Clear the list of the analyzed reduction root instructions.
2928 void clearReductionData() {
2929 AnalyzedReductionsRoots.clear();
2930 AnalyzedReductionVals.clear();
2931 AnalyzedMinBWVals.clear();
2932 }
2933 /// Checks if the given value is gathered in one of the nodes.
2934 bool isAnyGathered(const SmallDenseSet<Value *> &Vals) const {
2935 return any_of(MustGather, [&](Value *V) { return Vals.contains(V); });
2936 }
2937 /// Checks if the given value is gathered in one of the nodes.
2938 bool isGathered(const Value *V) const {
2939 return MustGather.contains(V);
2940 }
2941 /// Checks if the specified value was not scheduled.
2942 bool isNotScheduled(const Value *V) const {
2943 return NonScheduledFirst.contains(V);
2944 }
2945
2946 /// Check if the value is vectorized in the tree.
2947 bool isVectorized(Value *V) const { return getTreeEntry(V); }
2948
2949 ~BoUpSLP();
2950
2951private:
2952 /// Determine if a node \p E can be demoted to a smaller type with a
2953 /// truncation. We collect the entries that will be demoted in ToDemote.
2954 /// \param E Node for analysis
2955 /// \param ToDemote indices of the nodes to be demoted.
2956 bool collectValuesToDemote(
2957 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
2959 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
2960 bool &IsProfitableToDemote, bool IsTruncRoot) const;
2961
2962 /// Check if the operands on the edges \p Edges of the \p UserTE allow
2963 /// reordering (i.e. the operands can be reordered because they have only one
2964 /// user and are reorderable).
2965 /// \param ReorderableGathers List of all gather nodes that require reordering
2966 /// (e.g., gathers of extractelements or partially vectorizable loads).
2967 /// \param GatherOps List of gather operand nodes for \p UserTE that require
2968 /// reordering, subset of \p NonVectorized.
2969 bool
2970 canReorderOperands(TreeEntry *UserTE,
2971 SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
2972 ArrayRef<TreeEntry *> ReorderableGathers,
2973 SmallVectorImpl<TreeEntry *> &GatherOps);
2974
2975 /// Checks if the given \p TE is a gather node with clustered reused scalars
2976 /// and reorders it per given \p Mask.
2977 void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;
2978
2979 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
2980 /// if any. If it is not vectorized (gather node), returns nullptr.
2981 TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
2982 ArrayRef<Value *> VL = UserTE->getOperand(OpIdx);
2983 TreeEntry *TE = nullptr;
2984 const auto *It = find_if(VL, [&](Value *V) {
2985 TE = getTreeEntry(V);
2986 if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))
2987 return true;
2988 auto It = MultiNodeScalars.find(V);
2989 if (It != MultiNodeScalars.end()) {
2990 for (TreeEntry *E : It->second) {
2991 if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {
2992 TE = E;
2993 return true;
2994 }
2995 }
2996 }
2997 return false;
2998 });
2999 if (It != VL.end()) {
3000 assert(TE->isSame(VL) && "Expected same scalars.");
3001 return TE;
3002 }
3003 return nullptr;
3004 }
3005
3006 /// Returns vectorized operand \p OpIdx of the node \p UserTE from the graph,
3007 /// if any. If it is not vectorized (gather node), returns nullptr.
3008 const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
3009 unsigned OpIdx) const {
3010 return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
3011 const_cast<TreeEntry *>(UserTE), OpIdx);
3012 }
3013
3014 /// Checks if all users of \p I are the part of the vectorization tree.
3015 bool areAllUsersVectorized(
3016 Instruction *I,
3017 const SmallDenseSet<Value *> *VectorizedVals = nullptr) const;
3018
3019 /// Return information about the vector formed for the specified index
3020 /// of a vector of (the same) instruction.
3022
3023 /// \returns the graph entry for the \p Idx operand of the \p E entry.
3024 const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;
3025
3026 /// Gets the root instruction for the given node. If the node is a strided
3027 /// load/store node with the reverse order, the root instruction is the last
3028 /// one.
3029 Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;
3030
3031 /// \returns Cast context for the given graph node.
3033 getCastContextHint(const TreeEntry &TE) const;
3034
3035 /// \returns the cost of the vectorizable entry.
3036 InstructionCost getEntryCost(const TreeEntry *E,
3037 ArrayRef<Value *> VectorizedVals,
3038 SmallPtrSetImpl<Value *> &CheckedExtracts);
3039
3040 /// This is the recursive part of buildTree.
3041 void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
3042 const EdgeInfo &EI, unsigned InterleaveFactor = 0);
3043
3044 /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
3045 /// be vectorized to use the original vector (or aggregate "bitcast" to a
3046 /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
3047 /// returns false, setting \p CurrentOrder to either an empty vector or a
3048 /// non-identity permutation that allows reusing extract instructions.
3049 /// \param ResizeAllowed indicates whether it is allowed to handle subvector
3050 /// extract order.
3051 bool canReuseExtract(ArrayRef<Value *> VL,
3052 SmallVectorImpl<unsigned> &CurrentOrder,
3053 bool ResizeAllowed = false) const;
3054
3055 /// Vectorize a single entry in the tree.
3056 /// \param PostponedPHIs true if emission of phi nodes needs to be postponed
3057 /// to avoid issues with def-use order.
3058 Value *vectorizeTree(TreeEntry *E, bool PostponedPHIs);
3059
3060 /// Returns the vectorized operand node that matches the order of the scalars
3061 /// in operand number \p NodeIdx of the entry \p E.
3062 TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
3063 const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
3064 unsigned NodeIdx) const {
3065 return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
3066 }
3067
3068 /// Vectorize a single entry in the tree, the \p Idx-th operand of the entry
3069 /// \p E.
3070 /// \param PostponedPHIs true if emission of phi nodes needs to be postponed
3071 /// to avoid issues with def-use order.
3072 Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);
3073
3074 /// Create a new vector from a list of scalar values. Produces a sequence
3075 /// which exploits values reused across lanes, and arranges the inserts
3076 /// for ease of later optimization.
3077 template <typename BVTy, typename ResTy, typename... Args>
3078 ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);
3079
3080 /// Create a new vector from a list of scalar values. Produces a sequence
3081 /// which exploits values reused across lanes, and arranges the inserts
3082 /// for ease of later optimization.
3083 Value *createBuildVector(const TreeEntry *E, Type *ScalarTy,
3084 bool PostponedPHIs);
3085
3086 /// Returns the instruction in the bundle, which can be used as a base point
3087 /// for scheduling. Usually it is the last instruction in the bundle, except
3088 /// for the case when all operands are external (in this case, it is the first
3089 /// instruction in the list).
3090 Instruction &getLastInstructionInBundle(const TreeEntry *E);
3091
3092 /// Tries to find extractelement instructions with constant indices from a
3093 /// fixed vector type and gathers such instructions into a group, which is
3094 /// highly likely to be matched as a shuffle of 1 or 2 input vectors. If this
3095 /// attempt was successful, the matched scalars are replaced by poison values
3096 /// in \p VL for future analysis.
3097 std::optional<TargetTransformInfo::ShuffleKind>
3098 tryToGatherSingleRegisterExtractElements(MutableArrayRef<Value *> VL,
3099 SmallVectorImpl<int> &Mask) const;
3100
3101 /// Tries to find extractelement instructions with constant indices from a
3102 /// fixed vector type and gathers such instructions into a group, which is
3103 /// highly likely to be matched as a shuffle of 1 or 2 input vectors. If this
3104 /// attempt was successful, the matched scalars are replaced by poison values
3105 /// in \p VL for future analysis.
3107 tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
3109 unsigned NumParts) const;
3110
3111 /// Checks if the gathered \p VL can be represented as a single register
3112 /// shuffle(s) of previous tree entries.
3113 /// \param TE Tree entry checked for permutation.
3114 /// \param VL List of scalars (a subset of the TE scalars), checked for
3115 /// permutations. Must form a single-register vector.
3116 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3117 /// forces the mask to be built using the original vector values, without
3118 /// relying on the potential reordering.
3119 /// \returns ShuffleKind, if gathered values can be represented as shuffles of
3120 /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask.
3121 std::optional<TargetTransformInfo::ShuffleKind>
3122 isGatherShuffledSingleRegisterEntry(
3123 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
3124 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part,
3125 bool ForOrder);
3126
3127 /// Checks if the gathered \p VL can be represented as multi-register
3128 /// shuffle(s) of previous tree entries.
3129 /// \param TE Tree entry checked for permutation.
3130 /// \param VL List of scalars (a subset of the TE scalars), checked for
3131 /// permutations.
3132 /// \param ForOrder Tries to fetch the best candidates for ordering info. Also
3133 /// forces the mask to be built using the original vector values, without
3134 /// relying on the potential reordering.
3135 /// \returns per-register series of ShuffleKind, if gathered values can be
3136 /// represented as shuffles of previous tree entries. \p Mask is filled with
3137 /// the shuffle mask (also on per-register base).
3139 isGatherShuffledEntry(
3140 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
3142 unsigned NumParts, bool ForOrder = false);
3143
3144 /// \returns the cost of gathering (inserting) the values in \p VL into a
3145 /// vector.
3146 /// \param ForPoisonSrc true if initial vector is poison, false otherwise.
3147 InstructionCost getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
3148 Type *ScalarTy) const;
3149
3150 /// Set the Builder insert point to one after the last instruction in
3151 /// the bundle
3152 void setInsertPointAfterBundle(const TreeEntry *E);
3153
3154 /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
3155 /// specified, the starting vector value is poison.
3156 Value *
3157 gather(ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
3158 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle);
3159
3160 /// \returns whether the VectorizableTree is fully vectorizable and will
3161 /// be beneficial even if the tree height is tiny.
3162 bool isFullyVectorizableTinyTree(bool ForReduction) const;
3163
3164 /// Run through the list of all gathered loads in the graph and try to find
3165 /// vector loads/masked gathers instead of regular gathers. Later these loads
3166 /// are reshuffled to build the final gathered nodes.
3167 void tryToVectorizeGatheredLoads(
3168 const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
3169 SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
3170 8> &GatheredLoads);
3171
3172 /// Helper for `findExternalStoreUsersReorderIndices()`. It iterates over the
3173 /// users of \p TE and collects the stores. It returns the map from the store
3174 /// pointers to the collected stores.
3176 collectUserStores(const BoUpSLP::TreeEntry *TE) const;
3177
3178 /// Helper for `findExternalStoreUsersReorderIndices()`. It checks if the
3179 /// stores in \p StoresVec can form a vector instruction. If so it returns
3180 /// true and populates \p ReorderIndices with the shuffle indices of the
3181 /// stores when compared to the sorted vector.
3182 bool canFormVector(ArrayRef<StoreInst *> StoresVec,
3183 OrdersType &ReorderIndices) const;
3184
3185 /// Iterates through the users of \p TE, looking for scalar stores that can be
3186 /// potentially vectorized in a future SLP-tree. If found, it keeps track of
3187 /// their order and builds an order index vector for each store bundle. It
3188 /// returns all these order vectors found.
3189 /// We run this after the tree has formed, otherwise we may come across user
3190 /// instructions that are not yet in the tree.
3192 findExternalStoreUsersReorderIndices(TreeEntry *TE) const;
3193
3194 /// Tries to reorder the gathering node for better vectorization
3195 /// opportunities.
3196 void reorderGatherNode(TreeEntry &TE);
3197
3198 struct TreeEntry {
3199 using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
3200 TreeEntry(VecTreeTy &Container) : Container(Container) {}
3201
3202 /// \returns Common mask for reorder indices and reused scalars.
3203 SmallVector<int> getCommonMask() const {
3204 SmallVector<int> Mask;
3205 inversePermutation(ReorderIndices, Mask);
3206 ::addMask(Mask, ReuseShuffleIndices);
3207 return Mask;
3208 }
3209
3210 /// \returns true if the scalars in VL are equal to this entry.
3211 bool isSame(ArrayRef<Value *> VL) const {
3212 auto &&IsSame = [VL](ArrayRef<Value *> Scalars, ArrayRef<int> Mask) {
3213 if (Mask.size() != VL.size() && VL.size() == Scalars.size())
3214 return std::equal(VL.begin(), VL.end(), Scalars.begin());
3215 return VL.size() == Mask.size() &&
3216 std::equal(VL.begin(), VL.end(), Mask.begin(),
3217 [Scalars](Value *V, int Idx) {
3218 return (isa<UndefValue>(V) &&
3219 Idx == PoisonMaskElem) ||
3220 (Idx != PoisonMaskElem && V == Scalars[Idx]);
3221 });
3222 };
3223 if (!ReorderIndices.empty()) {
3224 // TODO: implement matching if the nodes are just reordered, still can
3225 // treat the vector as the same if the list of scalars matches VL
3226 // directly, without reordering.
3227 SmallVector<int> Mask;
3228 inversePermutation(ReorderIndices, Mask);
3229 if (VL.size() == Scalars.size())
3230 return IsSame(Scalars, Mask);
3231 if (VL.size() == ReuseShuffleIndices.size()) {
3232 ::addMask(Mask, ReuseShuffleIndices);
3233 return IsSame(Scalars, Mask);
3234 }
3235 return false;
3236 }
3237 return IsSame(Scalars, ReuseShuffleIndices);
3238 }
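  // Illustrative sketch (not part of the original source): a hypothetical
  // trace of isSame(), assuming inversePermutation() fills Mask so that
  // Mask[ReorderIndices[I]] == I. With Scalars = {A, B, C, D} and
  // ReorderIndices = {2, 0, 1, 3}, the mask becomes {1, 2, 0, 3}, so
  // isSame({B, C, A, D}) returns true because VL[I] == Scalars[Mask[I]] for
  // every lane, whereas isSame({A, B, C, D}) would then return false.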
3239
3240 bool isOperandGatherNode(const EdgeInfo &UserEI) const {
3241 return isGather() && !UserTreeIndices.empty() &&
3242 UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
3243 UserTreeIndices.front().UserTE == UserEI.UserTE;
3244 }
3245
3246 /// \returns true if current entry has same operands as \p TE.
3247 bool hasEqualOperands(const TreeEntry &TE) const {
3248 if (TE.getNumOperands() != getNumOperands())
3249 return false;
3250 SmallBitVector Used(getNumOperands());
3251 for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
3252 unsigned PrevCount = Used.count();
3253 for (unsigned K = 0; K < E; ++K) {
3254 if (Used.test(K))
3255 continue;
3256 if (getOperand(K) == TE.getOperand(I)) {
3257 Used.set(K);
3258 break;
3259 }
3260 }
3261 // Check if we actually found the matching operand.
3262 if (PrevCount == Used.count())
3263 return false;
3264 }
3265 return true;
3266 }
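  // Illustrative example (hypothetical entries, not from the original
  // source): if this entry's two operand lists are X and Y and \p TE's are
  // Y and X, the greedy search above pairs TE's operand 0 with our operand 1
  // and TE's operand 1 with our operand 0, so hasEqualOperands() returns
  // true. If TE instead had the operand lists Y and Y, the second Y would
  // find no unused match and the function would return false.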
3267
3268 /// \return Final vectorization factor for the node. Defined by the total
3269 /// number of vectorized scalars, including those used several times in the
3270 /// entry and counted in \a ReuseShuffleIndices, if any.
3271 unsigned getVectorFactor() const {
3272 if (!ReuseShuffleIndices.empty())
3273 return ReuseShuffleIndices.size();
3274 return Scalars.size();
3275 };
3276
3277 /// Checks if the current node is a gather node.
3278 bool isGather() const { return State == NeedToGather; }
3279
3280 /// A vector of scalars.
3281 ValueList Scalars;
3282
3283 /// The Scalars are vectorized into this value. It is initialized to Null.
3284 WeakTrackingVH VectorizedValue = nullptr;
3285
3286 /// New vector phi instructions emitted for the vectorized phi nodes.
3287 PHINode *PHI = nullptr;
3288
3289 /// Do we need to gather this sequence or vectorize it
3290 /// (either with vector instruction or with scatter/gather
3291 /// intrinsics for store/load)?
3292 enum EntryState {
3293 Vectorize, ///< The node is regularly vectorized.
3294 ScatterVectorize, ///< Masked scatter/gather node.
3295 StridedVectorize, ///< Strided loads (and stores)
3296 NeedToGather, ///< Gather/buildvector node.
3297 CombinedVectorize, ///< Vectorized node, combined with its user into more
3298 ///< complex node like select/cmp to minmax, mul/add to
3299 ///< fma, etc. Must be used for the following nodes in
3300 ///< the pattern, not the very first one.
3301 };
3302 EntryState State;
3303
3304 /// List of combined opcodes supported by the vectorizer.
3305 enum CombinedOpcode {
3306 NotCombinedOp = -1,
3307 MinMax = Instruction::OtherOpsEnd + 1,
3308 };
3309 CombinedOpcode CombinedOp = NotCombinedOp;
3310
3311 /// Does this sequence require some shuffling?
3312 SmallVector<int, 4> ReuseShuffleIndices;
3313
3314 /// Does this entry require reordering?
3315 SmallVector<unsigned, 4> ReorderIndices;
3316
3317 /// Points back to the VectorizableTree.
3318 ///
3319 /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has
3320 /// to be a pointer and needs to be able to initialize the child iterator.
3321 /// Thus we need a reference back to the container to translate the indices
3322 /// to entries.
3323 VecTreeTy &Container;
3324
3325 /// The TreeEntry index containing the user of this entry. We can actually
3326 /// have multiple users so the data structure is not truly a tree.
3327 SmallVector<EdgeInfo, 1> UserTreeIndices;
3328
3330 /// The index of this TreeEntry in VectorizableTree.
3330 unsigned Idx = 0;
3331
3332 /// For gather/buildvector/alt opcode (TODO) nodes, which are combined from
3333 /// other nodes as a series of insertvector instructions.
3334 SmallVector<std::pair<unsigned, unsigned>, 2> CombinedEntriesWithIndices;
3335
3336 private:
3337 /// The operands of each instruction in each lane Operands[op_index][lane].
3338 /// Note: This helps avoid the replication of the code that performs the
3339 /// reordering of operands during buildTree_rec() and vectorizeTree().
3341
3342 /// MainOp and AltOp are recorded inside. S should be obtained from
3343 /// newTreeEntry.
3344 InstructionsState S = InstructionsState::invalid();
3345
3346 /// Interleaving factor for interleaved loads Vectorize nodes.
3347 unsigned InterleaveFactor = 0;
3348
3349 public:
3350 /// Returns interleave factor for interleave nodes.
3351 unsigned getInterleaveFactor() const { return InterleaveFactor; }
3352 /// Sets interleaving factor for the interleaving nodes.
3353 void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }
3354
3355 /// Set this bundle's \p OpIdx'th operand to \p OpVL.
3356 void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
3357 if (Operands.size() < OpIdx + 1)
3358 Operands.resize(OpIdx + 1);
3359 assert(Operands[OpIdx].empty() && "Already resized?");
3360 assert(OpVL.size() <= Scalars.size() &&
3361 "Number of operands is greater than the number of scalars.");
3362 Operands[OpIdx].resize(OpVL.size());
3363 copy(OpVL, Operands[OpIdx].begin());
3364 }
3365
3366 /// Set this bundle's operand from Scalars.
3367 void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
3368 VLOperands Ops(Scalars, S, R);
3369 if (RequireReorder)
3370 Ops.reorder();
3371 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands()))
3372 setOperand(I, Ops.getVL(I));
3373 }
3374
3375 /// Reorders operands of the node to the given mask \p Mask.
3376 void reorderOperands(ArrayRef<int> Mask) {
3377 for (ValueList &Operand : Operands)
3378 reorderScalars(Operand, Mask);
3379 }
3380
3381 /// \returns the \p OpIdx operand of this TreeEntry.
3382 ValueList &getOperand(unsigned OpIdx) {
3383 assert(OpIdx < Operands.size() && "Off bounds");
3384 return Operands[OpIdx];
3385 }
3386
3387 /// \returns the \p OpIdx operand of this TreeEntry.
3388 ArrayRef<Value *> getOperand(unsigned OpIdx) const {
3389 assert(OpIdx < Operands.size() && "Off bounds");
3390 return Operands[OpIdx];
3391 }
3392
3393 /// \returns the number of operands.
3394 unsigned getNumOperands() const { return Operands.size(); }
3395
3396 /// \return the single \p OpIdx operand.
3397 Value *getSingleOperand(unsigned OpIdx) const {
3398 assert(OpIdx < Operands.size() && "Off bounds");
3399 assert(!Operands[OpIdx].empty() && "No operand available");
3400 return Operands[OpIdx][0];
3401 }
3402
3403 /// Some of the instructions in the list have alternate opcodes.
3404 bool isAltShuffle() const { return S.isAltShuffle(); }
3405
3406 bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); }
3407
3408 /// Chooses the correct key for scheduling data. If \p Op has the same (or
3409 /// alternate) opcode as the main operation of this entry, the key is \p Op.
3410 /// Otherwise the key is the main operation.
3411 Value *isOneOf(Value *Op) const {
3412 auto *I = dyn_cast<Instruction>(Op);
3413 if (I && isOpcodeOrAlt(I))
3414 return Op;
3415 return S.getMainOp();
3416 }
3417
3418 void setOperations(const InstructionsState &S) {
3419 assert(S && "InstructionsState is invalid.");
3420 this->S = S;
3421 }
3422
3423 Instruction *getMainOp() const { return S.getMainOp(); }
3424
3425 Instruction *getAltOp() const { return S.getAltOp(); }
3426
3427 /// The main/alternate opcodes for the list of instructions.
3428 unsigned getOpcode() const { return S.getOpcode(); }
3429
3430 unsigned getAltOpcode() const { return S.getAltOpcode(); }
3431
3432 bool hasState() const { return S.valid(); }
3433
3434 /// When ReuseShuffleIndices is empty, just returns the position of \p V within
3435 /// the vector of Scalars. Otherwise, tries to remap it to its reuse index.
3436 int findLaneForValue(Value *V) const {
3437 unsigned FoundLane = getVectorFactor();
3438 for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
3439 std::advance(It, 1)) {
3440 if (*It != V)
3441 continue;
3442 FoundLane = std::distance(Scalars.begin(), It);
3443 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3444 if (!ReorderIndices.empty())
3445 FoundLane = ReorderIndices[FoundLane];
3446 assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
3447 if (ReuseShuffleIndices.empty())
3448 break;
3449 if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
3450 RIt != ReuseShuffleIndices.end()) {
3451 FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
3452 break;
3453 }
3454 }
3455 assert(FoundLane < getVectorFactor() && "Unable to find given value.");
3456 return FoundLane;
3457 }
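  // Illustrative example (hypothetical values, not from the original
  // source): with Scalars = {A, B, C, D}, ReorderIndices = {3, 2, 1, 0} and
  // ReuseShuffleIndices = {0, 1, 2, 3, 0, 1, 2, 3}, looking up V == C finds
  // raw position 2, remaps it through ReorderIndices to lane 1, and the first
  // reuse slot holding 1 is at index 1, so findLaneForValue(C) returns 1.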
3458
3459 /// Build a shuffle mask for graph entry which represents a merge of main
3460 /// and alternate operations.
3461 void
3462 buildAltOpShuffleMask(const function_ref<bool(Instruction *)> IsAltOp,
3464 SmallVectorImpl<Value *> *OpScalars = nullptr,
3465 SmallVectorImpl<Value *> *AltScalars = nullptr) const;
3466
3467 /// Return true if this is a non-power-of-2 node.
3468 bool isNonPowOf2Vec() const {
3469 bool IsNonPowerOf2 = !has_single_bit(Scalars.size());
3470 return IsNonPowerOf2;
3471 }
3472
3473 /// Return true if the number of elements in this node neither forms whole
3474 /// vector registers nor is a power of 2.
3475 bool
3476 hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const {
3477 bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2(
3478 TTI, getValueType(Scalars.front()), Scalars.size());
3479 assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
3480 "Reshuffling not supported with non-power-of-2 vectors yet.");
3481 return IsNonPowerOf2;
3482 }
3483
3484 Value *getOrdered(unsigned Idx) const {
3485 assert(isGather() && "Must be used only for buildvectors/gathers.");
3486 if (ReorderIndices.empty())
3487 return Scalars[Idx];
3488 SmallVector<int> Mask;
3489 inversePermutation(ReorderIndices, Mask);
3490 return Scalars[Mask[Idx]];
3491 }
3492
3493#ifndef NDEBUG
3494 /// Debug printer.
3495 LLVM_DUMP_METHOD void dump() const {
3496 dbgs() << Idx << ".\n";
3497 for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
3498 dbgs() << "Operand " << OpI << ":\n";
3499 for (const Value *V : Operands[OpI])
3500 dbgs().indent(2) << *V << "\n";
3501 }
3502 dbgs() << "Scalars: \n";
3503 for (Value *V : Scalars)
3504 dbgs().indent(2) << *V << "\n";
3505 dbgs() << "State: ";
3506 switch (State) {
3507 case Vectorize:
3508 if (InterleaveFactor > 0) {
3509 dbgs() << "Vectorize with interleave factor " << InterleaveFactor
3510 << "\n";
3511 } else {
3512 dbgs() << "Vectorize\n";
3513 }
3514 break;
3515 case ScatterVectorize:
3516 dbgs() << "ScatterVectorize\n";
3517 break;
3518 case StridedVectorize:
3519 dbgs() << "StridedVectorize\n";
3520 break;
3521 case NeedToGather:
3522 dbgs() << "NeedToGather\n";
3523 break;
3524 case CombinedVectorize:
3525 dbgs() << "CombinedVectorize\n";
3526 break;
3527 }
3528 if (S) {
3529 dbgs() << "MainOp: " << *S.getMainOp() << "\n";
3530 dbgs() << "AltOp: " << *S.getAltOp() << "\n";
3531 } else {
3532 dbgs() << "MainOp: NULL\n";
3533 dbgs() << "AltOp: NULL\n";
3534 }
3535 dbgs() << "VectorizedValue: ";
3536 if (VectorizedValue)
3537 dbgs() << *VectorizedValue << "\n";
3538 else
3539 dbgs() << "NULL\n";
3540 dbgs() << "ReuseShuffleIndices: ";
3541 if (ReuseShuffleIndices.empty())
3542 dbgs() << "Empty";
3543 else
3544 for (int ReuseIdx : ReuseShuffleIndices)
3545 dbgs() << ReuseIdx << ", ";
3546 dbgs() << "\n";
3547 dbgs() << "ReorderIndices: ";
3548 for (unsigned ReorderIdx : ReorderIndices)
3549 dbgs() << ReorderIdx << ", ";
3550 dbgs() << "\n";
3551 dbgs() << "UserTreeIndices: ";
3552 for (const auto &EInfo : UserTreeIndices)
3553 dbgs() << EInfo << ", ";
3554 dbgs() << "\n";
3555 if (!CombinedEntriesWithIndices.empty()) {
3556 dbgs() << "Combined entries: ";
3557 interleaveComma(CombinedEntriesWithIndices, dbgs(), [&](const auto &P) {
3558 dbgs() << "Entry index " << P.first << " with offset " << P.second;
3559 });
3560 dbgs() << "\n";
3561 }
3562 }
3563#endif
3564 };
3565
3566#ifndef NDEBUG
3567 void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,
3568 InstructionCost VecCost, InstructionCost ScalarCost,
3569 StringRef Banner) const {
3570 dbgs() << "SLP: " << Banner << ":\n";
3571 E->dump();
3572 dbgs() << "SLP: Costs:\n";
3573 dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
3574 dbgs() << "SLP: VectorCost = " << VecCost << "\n";
3575 dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
3576 dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
3577 << ReuseShuffleCost + VecCost - ScalarCost << "\n";
3578 }
3579#endif
3580
3581 /// Create a new VectorizableTree entry.
3582 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3583 std::optional<ScheduleData *> Bundle,
3584 const InstructionsState &S,
3585 const EdgeInfo &UserTreeIdx,
3586 ArrayRef<int> ReuseShuffleIndices = {},
3587 ArrayRef<unsigned> ReorderIndices = {},
3588 unsigned InterleaveFactor = 0) {
3589 TreeEntry::EntryState EntryState =
3590 Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
3591 TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
3592 ReuseShuffleIndices, ReorderIndices);
3593 if (E && InterleaveFactor > 0)
3594 E->setInterleave(InterleaveFactor);
3595 return E;
3596 }
3597
3598 TreeEntry *newTreeEntry(ArrayRef<Value *> VL,
3599 TreeEntry::EntryState EntryState,
3600 std::optional<ScheduleData *> Bundle,
3601 const InstructionsState &S,
3602 const EdgeInfo &UserTreeIdx,
3603 ArrayRef<int> ReuseShuffleIndices = {},
3604 ArrayRef<unsigned> ReorderIndices = {}) {
3605 assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
3606 (Bundle && EntryState != TreeEntry::NeedToGather)) &&
3607 "Need to vectorize gather entry?");
3608 // Gathered loads still gathered? Do not create entry, use the original one.
3609 if (GatheredLoadsEntriesFirst.has_value() &&
3610 EntryState == TreeEntry::NeedToGather && S &&
3611 S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
3612 !UserTreeIdx.UserTE)
3613 return nullptr;
3614 VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
3615 TreeEntry *Last = VectorizableTree.back().get();
3616 Last->Idx = VectorizableTree.size() - 1;
3617 Last->State = EntryState;
3618 // FIXME: Remove once support for ReuseShuffleIndices has been implemented
3619 // for non-power-of-two vectors.
3620 assert(
3622 ReuseShuffleIndices.empty()) &&
3623 "Reshuffling scalars not yet supported for nodes with padding");
3624 Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
3625 ReuseShuffleIndices.end());
3626 if (ReorderIndices.empty()) {
3627 Last->Scalars.assign(VL.begin(), VL.end());
3628 if (S)
3629 Last->setOperations(S);
3630 } else {
3631 // Reorder scalars and build final mask.
3632 Last->Scalars.assign(VL.size(), nullptr);
3633 transform(ReorderIndices, Last->Scalars.begin(),
3634 [VL](unsigned Idx) -> Value * {
3635 if (Idx >= VL.size())
3636 return UndefValue::get(VL.front()->getType());
3637 return VL[Idx];
3638 });
3639 InstructionsState S = getSameOpcode(Last->Scalars, *TLI);
3640 if (S)
3641 Last->setOperations(S);
3642 Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
3643 }
3644 if (!Last->isGather()) {
3645 for (Value *V : VL) {
3646 const TreeEntry *TE = getTreeEntry(V);
3647 assert((!TE || TE == Last || doesNotNeedToBeScheduled(V)) &&
3648 "Scalar already in tree!");
3649 if (TE) {
3650 if (TE != Last)
3651 MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);
3652 continue;
3653 }
3654 ScalarToTreeEntry[V] = Last;
3655 }
3656 // Update the scheduler bundle to point to this TreeEntry.
3657 ScheduleData *BundleMember = *Bundle;
3658 assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
3659 isVectorLikeInstWithConstOps(S.getMainOp()) ||
3660 doesNotNeedToSchedule(VL)) &&
3661 "Bundle and VL out of sync");
3662 if (BundleMember) {
3663 for (Value *V : VL) {
3664 if (doesNotNeedToBeScheduled(V))
3665 continue;
3666 if (!BundleMember)
3667 continue;
3668 BundleMember->TE = Last;
3669 BundleMember = BundleMember->NextInBundle;
3670 }
3671 }
3672 assert(!BundleMember && "Bundle and VL out of sync");
3673 } else {
3674 // Build a map for gathered scalars to the nodes where they are used.
3675 bool AllConstsOrCasts = true;
3676 for (Value *V : VL)
3677 if (!isConstant(V)) {
3678 auto *I = dyn_cast<CastInst>(V);
3679 AllConstsOrCasts &= I && I->getType()->isIntegerTy();
3680 if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
3681 !UserTreeIdx.UserTE->isGather())
3682 ValueToGatherNodes.try_emplace(V).first->getSecond().insert(Last);
3683 }
3684 if (AllConstsOrCasts)
3685 CastMaxMinBWSizes =
3686 std::make_pair(std::numeric_limits<unsigned>::max(), 1);
3687 MustGather.insert(VL.begin(), VL.end());
3688 }
3689
3690 if (UserTreeIdx.UserTE)
3691 Last->UserTreeIndices.push_back(UserTreeIdx);
3692 return Last;
3693 }
3694
3695 /// -- Vectorization State --
3696 /// Holds all of the tree entries.
3697 TreeEntry::VecTreeTy VectorizableTree;
3698
3699#ifndef NDEBUG
3700 /// Debug printer.
3701 LLVM_DUMP_METHOD void dumpVectorizableTree() const {
3702 for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
3703 VectorizableTree[Id]->dump();
3704 dbgs() << "\n";
3705 }
3706 }
3707#endif
3708
3709 TreeEntry *getTreeEntry(Value *V) {
3710 assert(V && "V cannot be nullptr.");
3711 return ScalarToTreeEntry.lookup(V);
3712 }
3713
3714 const TreeEntry *getTreeEntry(Value *V) const {
3715 assert(V && "V cannot be nullptr.");
3716 return ScalarToTreeEntry.lookup(V);
3717 }
3718
3719 /// Check that the operand node of the alternate node does not generate a
3720 /// buildvector sequence. If it does, it is probably not worth building an
3721 /// alternate shuffle, if the number of buildvector operands plus the
3722 /// alternate instruction exceeds the number of buildvector instructions.
3723 /// \param S the instructions state of the analyzed values.
3724 /// \param VL list of the instructions with alternate opcodes.
3725 bool areAltOperandsProfitable(const InstructionsState &S,
3726 ArrayRef<Value *> VL) const;
3727
3728 /// Checks if the specified list of the instructions/values can be vectorized
3729 /// and fills required data before actual scheduling of the instructions.
3730 TreeEntry::EntryState
3731 getScalarsVectorizationState(const InstructionsState &S, ArrayRef<Value *> VL,
3732 bool IsScatterVectorizeUserTE,
3733 OrdersType &CurrentOrder,
3734 SmallVectorImpl<Value *> &PointerOps);
3735
3736 /// Maps a specific scalar to its tree entry.
3737 SmallDenseMap<Value *, TreeEntry *> ScalarToTreeEntry;
3738
3739 /// List of scalars used in several vectorize nodes, together with the list
3740 /// of those nodes.
3742
3743 /// Maps a value to the proposed vectorizable size.
3744 SmallDenseMap<Value *, unsigned> InstrElementSize;
3745
3746 /// A list of scalars that we found that we need to keep as scalars.
3747 ValueSet MustGather;
3748
3749 /// A set of first non-schedulable values.
3750 ValueSet NonScheduledFirst;
3751
3752 /// A map between the vectorized entries and the last instructions in the
3753 /// bundles. The bundles are built in use order, not in the def order of the
3754 /// instructions. So, we cannot rely directly on the last instruction in the
3755 /// bundle being the last instruction in program order during the
3756 /// vectorization process, since the basic blocks are affected; the last
3757 /// instructions need to be pre-gathered beforehand.
3758 DenseMap<const TreeEntry *, Instruction *> EntryToLastInstruction;
3759
3760 /// List of gather nodes, depending on other gather/vector nodes, which should
3761 /// be emitted after the vector instruction emission process to correctly
3762 /// handle order of the vector instructions and shuffles.
3763 SetVector<const TreeEntry *> PostponedGathers;
3764
3765 using ValueToGatherNodesMap =
3767 ValueToGatherNodesMap ValueToGatherNodes;
3768
3769 /// A list of the load entries (node indices), which can be vectorized using
3770 /// a strided or masked gather approach, but are first attempted to be
3771 /// represented as contiguous loads.
3772 SetVector<unsigned> LoadEntriesToVectorize;
3773
3774 /// True if the graph node transforming mode is on.
3775 bool IsGraphTransformMode = false;
3776
3777 /// The index of the first gathered load entry in the VectorizableTree.
3778 std::optional<unsigned> GatheredLoadsEntriesFirst;
3779
3780 /// This POD struct describes one external user in the vectorized tree.
3781 struct ExternalUser {
3782 ExternalUser(Value *S, llvm::User *U, int L)
3783 : Scalar(S), User(U), Lane(L) {}
3784
3785 // Which scalar in our function.
3786 Value *Scalar;
3787
3788 // Which user that uses the scalar.
3789 llvm::User *User;
3790
3791 // Which lane does the scalar belong to.
3792 int Lane;
3793 };
3794 using UserList = SmallVector<ExternalUser, 16>;
3795
3796 /// Checks if two instructions may access the same memory.
3797 ///
3798 /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
3799 /// is invariant in the calling loop.
3800 bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
3801 Instruction *Inst2) {
3802 if (!Loc1.Ptr || !isSimple(Inst1) || !isSimple(Inst2))
3803 return true;
3804 // First check if the result is already in the cache.
3805 AliasCacheKey Key = std::make_pair(Inst1, Inst2);
3806 auto It = AliasCache.find(Key);
3807 if (It != AliasCache.end())
3808 return It->second;
3809 bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1));
3810 // Store the result in the cache.
3811 AliasCache.try_emplace(Key, Aliased);
3812 AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);
3813 return Aliased;
3814 }
3815
3816 using AliasCacheKey = std::pair<Instruction *, Instruction *>;
3817
3818 /// Cache for alias results.
3819 /// TODO: consider moving this to the AliasAnalysis itself.
3821
3822 // Cache for pointerMayBeCaptured calls inside AA. This is preserved
3823 // globally through SLP because we don't perform any action which
3824 // invalidates capture results.
3825 BatchAAResults BatchAA;
3826
3827 /// Temporary store for deleted instructions. Instructions will be deleted
3828 /// eventually when the BoUpSLP is destructed. The deferral is required to
3829 /// ensure that there are no incorrect collisions in the AliasCache, which
3830 /// can happen if a new instruction is allocated at the same address as a
3831 /// previously deleted instruction.
3832 DenseSet<Instruction *> DeletedInstructions;
3833
3834 /// Set of instructions already analyzed for reductions.
3835 SmallPtrSet<Instruction *, 16> AnalyzedReductionsRoots;
3836
3837 /// Set of hashes for the list of reduction values already being analyzed.
3838 DenseSet<size_t> AnalyzedReductionVals;
3839
3840 /// Values already analyzed for minimal bitwidth and found to be
3841 /// non-profitable.
3842 DenseSet<Value *> AnalyzedMinBWVals;
3843
3844 /// A list of values that need to be extracted out of the tree.
3845 /// This list holds pairs of (Internal Scalar : External User). External User
3846 /// can be nullptr, it means that this Internal Scalar will be used later,
3847 /// after vectorization.
3848 UserList ExternalUses;
3849
3850 /// A list of GEPs which can be replaced by scalar GEPs instead of
3851 /// extractelement instructions.
3852 SmallPtrSet<Value *, 4> ExternalUsesAsOriginalScalar;
3853
3854 /// Values used only by @llvm.assume calls.
3856
3857 /// Holds all of the instructions that we gathered, shuffle instructions and
3858 /// extractelements.
3859 SetVector<Instruction *> GatherShuffleExtractSeq;
3860
3861 /// A list of blocks that we are going to CSE.
3862 DenseSet<BasicBlock *> CSEBlocks;
3863
3864 /// List of hashes of vectors of loads which are known to be non-vectorizable.
3865 DenseSet<size_t> ListOfKnonwnNonVectorizableLoads;
3866
3867 /// Contains all scheduling relevant data for an instruction.
3868 /// A ScheduleData either represents a single instruction or a member of an
3869 /// instruction bundle (= a group of instructions which is combined into a
3870 /// vector instruction).
3871 struct ScheduleData {
3872 // The initial value for the dependency counters. It means that the
3873 // dependencies are not calculated yet.
3874 enum { InvalidDeps = -1 };
3875
3876 ScheduleData() = default;
3877
3878 void init(int BlockSchedulingRegionID, Instruction *I) {
3879 FirstInBundle = this;
3880 NextInBundle = nullptr;
3881 NextLoadStore = nullptr;
3882 IsScheduled = false;
3883 SchedulingRegionID = BlockSchedulingRegionID;
3884 clearDependencies();
3885 Inst = I;
3886 TE = nullptr;
3887 }
3888
3889 /// Verify basic self consistency properties
3890 void verify() {
3891 if (hasValidDependencies()) {
3892 assert(UnscheduledDeps <= Dependencies && "invariant");
3893 } else {
3894 assert(UnscheduledDeps == Dependencies && "invariant");
3895 }
3896
3897 if (IsScheduled) {
3898 assert(isSchedulingEntity() &&
3899 "unexpected scheduled state");
3900 for (const ScheduleData *BundleMember = this; BundleMember;
3901 BundleMember = BundleMember->NextInBundle) {
3902 assert(BundleMember->hasValidDependencies() &&
3903 BundleMember->UnscheduledDeps == 0 &&
3904 "unexpected scheduled state");
3905 assert((BundleMember == this || !BundleMember->IsScheduled) &&
3906 "only bundle is marked scheduled");
3907 }
3908 }
3909
3910 assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
3911 "all bundle members must be in same basic block");
3912 }
3913
3914 /// Returns true if the dependency information has been calculated.
3915 /// Note that dependency validity can vary between instructions within
3916 /// a single bundle.
3917 bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
3918
3919 /// Returns true for single instructions and for bundle representatives
3920 /// (= the head of a bundle).
3921 bool isSchedulingEntity() const { return FirstInBundle == this; }
3922
3923 /// Returns true if it represents an instruction bundle and not only a
3924 /// single instruction.
3925 bool isPartOfBundle() const {
3926 return NextInBundle != nullptr || FirstInBundle != this || TE;
3927 }
3928
3929 /// Returns true if it is ready for scheduling, i.e. it has no more
3930 /// unscheduled depending instructions/bundles.
3931 bool isReady() const {
3932 assert(isSchedulingEntity() &&
3933 "can't consider non-scheduling entity for ready list");
3934 return unscheduledDepsInBundle() == 0 && !IsScheduled;
3935 }
3936
3937 /// Modifies the number of unscheduled dependencies for this instruction,
3938 /// and returns the number of remaining dependencies for the containing
3939 /// bundle.
3940 int incrementUnscheduledDeps(int Incr) {
3941 assert(hasValidDependencies() &&
3942 "increment of unscheduled deps would be meaningless");
3943 UnscheduledDeps += Incr;
3944 return FirstInBundle->unscheduledDepsInBundle();
3945 }
3946
3947 /// Sets the number of unscheduled dependencies to the number of
3948 /// dependencies.
3949 void resetUnscheduledDeps() {
3950 UnscheduledDeps = Dependencies;
3951 }
3952
3953 /// Clears all dependency information.
3954 void clearDependencies() {
3955 Dependencies = InvalidDeps;
3956 resetUnscheduledDeps();
3957 MemoryDependencies.clear();
3958 ControlDependencies.clear();
3959 }
3960
3961 int unscheduledDepsInBundle() const {
3962 assert(isSchedulingEntity() && "only meaningful on the bundle");
3963 int Sum = 0;
3964 for (const ScheduleData *BundleMember = this; BundleMember;
3965 BundleMember = BundleMember->NextInBundle) {
3966 if (BundleMember->UnscheduledDeps == InvalidDeps)
3967 return InvalidDeps;
3968 Sum += BundleMember->UnscheduledDeps;
3969 }
3970 return Sum;
3971 }
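    // Illustrative example (hypothetical counters, not from the original
    // source): in a two-member bundle whose members carry
    // Dependencies/UnscheduledDeps of 2/2 and 1/1, unscheduledDepsInBundle()
    // returns 3. Each call to incrementUnscheduledDeps(-1) on a member (made
    // by schedule() as the surrounding instructions are scheduled) lowers the
    // sum; once it reaches 0 and the bundle itself is not yet scheduled,
    // isReady() returns true.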
3972
3973 void dump(raw_ostream &os) const {
3974 if (!isSchedulingEntity()) {
3975 os << "/ " << *Inst;
3976 } else if (NextInBundle) {
3977 os << '[' << *Inst;
3978 ScheduleData *SD = NextInBundle;
3979 while (SD) {
3980 os << ';' << *SD->Inst;
3981 SD = SD->NextInBundle;
3982 }
3983 os << ']';
3984 } else {
3985 os << *Inst;
3986 }
3987 }
3988
3989 Instruction *Inst = nullptr;
3990
3991 /// The TreeEntry that this instruction corresponds to.
3992 TreeEntry *TE = nullptr;
3993
3994 /// Points to the head in an instruction bundle (and always to this for
3995 /// single instructions).
3996 ScheduleData *FirstInBundle = nullptr;
3997
3998 /// Single linked list of all instructions in a bundle. Null if it is a
3999 /// single instruction.
4000 ScheduleData *NextInBundle = nullptr;
4001
4002 /// Single linked list of all memory instructions (e.g. load, store, call)
4003 /// in the block - until the end of the scheduling region.
4004 ScheduleData *NextLoadStore = nullptr;
4005
4006 /// The dependent memory instructions.
4007 /// This list is derived on demand in calculateDependencies().
4008 SmallVector<ScheduleData *, 4> MemoryDependencies;
4009
4010 /// List of instructions which this instruction could be control dependent
4011 /// on. Allowing such nodes to be scheduled below this one could introduce
4012 /// a runtime fault which didn't exist in the original program.
4013 /// e.g., this is a load or udiv following a readonly call which infinitely loops.
4014 SmallVector<ScheduleData *, 4> ControlDependencies;
4015
4016 /// This ScheduleData is in the current scheduling region if this matches
4017 /// the current SchedulingRegionID of BlockScheduling.
4018 int SchedulingRegionID = 0;
4019
4020 /// Used for getting a "good" final ordering of instructions.
4021 int SchedulingPriority = 0;
4022
4023 /// The number of dependencies. Consists of the number of users of the
4024 /// instruction plus the number of dependent memory instructions (if any).
4025 /// This value is calculated on demand.
4026 /// If InvalidDeps, the number of dependencies is not calculated yet.
4027 int Dependencies = InvalidDeps;
4028
4029 /// The number of dependencies minus the number of dependencies of scheduled
4030 /// instructions. As soon as this is zero, the instruction/bundle gets ready
4031 /// for scheduling.
4032 /// Note that this is negative as long as Dependencies is not calculated.
4033 int UnscheduledDeps = InvalidDeps;
4034
4035 /// True if this instruction is scheduled (or considered as scheduled in the
4036 /// dry-run).
4037 bool IsScheduled = false;
4038 };
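  // Illustrative layout (hypothetical bundle of three stores S0, S1, S2, not
  // from the original source):
  //   S0.FirstInBundle == S1.FirstInBundle == S2.FirstInBundle == &S0
  //   S0.NextInBundle == &S1, S1.NextInBundle == &S2, S2.NextInBundle == nullptr
  // Only S0 is a scheduling entity; isPartOfBundle() is true for all three,
  // and unscheduledDepsInBundle() walks the NextInBundle chain starting at S0.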
4039
4040#ifndef NDEBUG
4041 friend inline raw_ostream &operator<<(raw_ostream &os,
4042 const BoUpSLP::ScheduleData &SD) {
4043 SD.dump(os);
4044 return os;
4045 }
4046#endif
4047
4048 friend struct GraphTraits<BoUpSLP *>;
4049 friend struct DOTGraphTraits<BoUpSLP *>;
4050
4051 /// Contains all scheduling data for a basic block.
4052 /// It does not schedule instructions which are not memory read/write
4053 /// instructions and whose operands are either constants, arguments, phis, or
4054 /// instructions from other blocks, or whose users are phis or in other
4055 /// blocks. The resulting vector instructions can be placed at the beginning
4056 /// of the basic block without scheduling (if the operands do not need to be
4057 /// scheduled) or at the end of the block (if the users are outside of the
4058 /// block). This saves some compile time and memory used by the
4059 /// compiler.
4060 /// ScheduleData is assigned to each instruction between the boundaries of
4061 /// the tree entry, even for those which are not part of the graph. It is
4062 /// required to correctly follow the dependencies between the instructions
4063 /// and to schedule them correctly. ScheduleData is not allocated for
4064 /// instructions which do not require scheduling, such as phis, nodes with
4065 /// only extractelements/insertelements, or nodes whose instructions have
4066 /// uses/operands outside of the block.
4067 struct BlockScheduling {
4068 BlockScheduling(BasicBlock *BB)
4069 : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
4070
4071 void clear() {
4072 ReadyInsts.clear();
4073 ScheduleStart = nullptr;
4074 ScheduleEnd = nullptr;
4075 FirstLoadStoreInRegion = nullptr;
4076 LastLoadStoreInRegion = nullptr;
4077 RegionHasStackSave = false;
4078
4079 // Reduce the maximum schedule region size by the size of the
4080 // previous scheduling run.
4081 ScheduleRegionSizeLimit -= ScheduleRegionSize;
4082 if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
4083 ScheduleRegionSizeLimit = MinScheduleRegionSize;
4084 ScheduleRegionSize = 0;
4085
4086 // Make a new scheduling region, i.e. all existing ScheduleData is not
4087 // in the new region yet.
4088 ++SchedulingRegionID;
4089 }
4090
4091 ScheduleData *getScheduleData(Instruction *I) {
4092 if (BB != I->getParent())
4093 // Avoid lookup if can't possibly be in map.
4094 return nullptr;
4095 ScheduleData *SD = ScheduleDataMap.lookup(I);
4096 if (SD && isInSchedulingRegion(SD))
4097 return SD;
4098 return nullptr;
4099 }
4100
4101 ScheduleData *getScheduleData(Value *V) {
4102 if (auto *I = dyn_cast<Instruction>(V))
4103 return getScheduleData(I);
4104 return nullptr;
4105 }
4106
4107 bool isInSchedulingRegion(ScheduleData *SD) const {
4108 return SD->SchedulingRegionID == SchedulingRegionID;
4109 }
4110
4111 /// Marks an instruction as scheduled and puts all dependent ready
4112 /// instructions into the ready-list.
4113 template <typename ReadyListType>
4114 void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
4115 SD->IsScheduled = true;
4116 LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
4117
4118 for (ScheduleData *BundleMember = SD; BundleMember;
4119 BundleMember = BundleMember->NextInBundle) {
4120
4121 // Handle the def-use chain dependencies.
4122
4123 // Decrement the unscheduled counter and insert to ready list if ready.
4124 auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
4125 ScheduleData *OpDef = getScheduleData(I);
4126 if (OpDef && OpDef->hasValidDependencies() &&
4127 OpDef->incrementUnscheduledDeps(-1) == 0) {
4128 // There are no more unscheduled dependencies after
4129 // decrementing, so we can put the dependent instruction
4130 // into the ready list.
4131 ScheduleData *DepBundle = OpDef->FirstInBundle;
4132 assert(!DepBundle->IsScheduled &&
4133 "already scheduled bundle gets ready");
4134 ReadyList.insert(DepBundle);
4136 << "SLP: gets ready (def): " << *DepBundle << "\n");
4137 }
4138 };
4139
4140 // If BundleMember is a vector bundle, its operands may have been
4141 // reordered during buildTree(). We therefore need to get its operands
4142 // through the TreeEntry.
4143 if (TreeEntry *TE = BundleMember->TE) {
4144 // Need to search for the lane since the tree entry can be reordered.
4145 int Lane = std::distance(TE->Scalars.begin(),
4146 find(TE->Scalars, BundleMember->Inst));
4147 assert(Lane >= 0 && "Lane not set");
4148
4149 // Since vectorization tree is being built recursively this assertion
4150 // ensures that the tree entry has all operands set before reaching
4151 // this code. Couple of exceptions known at the moment are extracts
4152 // where their second (immediate) operand is not added. Since
4153 // immediates do not affect scheduler behavior this is considered
4154 // okay.
4155 auto *In = BundleMember->Inst;
4156 assert(
4157 In &&
4158 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
4159 In->getNumOperands() == TE->getNumOperands()) &&
4160 "Missed TreeEntry operands?");
4161 (void)In; // fake use to avoid build failure when assertions disabled
4162
4163 for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
4164 OpIdx != NumOperands; ++OpIdx)
4165 if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
4166 DecrUnsched(I);
4167 } else {
4168 // If BundleMember is a stand-alone instruction, no operand reordering
4169 // has taken place, so we directly access its operands.
4170 for (Use &U : BundleMember->Inst->operands())
4171 if (auto *I = dyn_cast<Instruction>(U.get()))
4172 DecrUnsched(I);
4173 }
4174 // Handle the memory dependencies.
4175 for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
4176 if (MemoryDepSD->hasValidDependencies() &&
4177 MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
4178 // There are no more unscheduled dependencies after decrementing,
4179 // so we can put the dependent instruction into the ready list.
4180 ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
4181 assert(!DepBundle->IsScheduled &&
4182 "already scheduled bundle gets ready");
4183 ReadyList.insert(DepBundle);
4185 << "SLP: gets ready (mem): " << *DepBundle << "\n");
4186 }
4187 }
4188 // Handle the control dependencies.
4189 for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
4190 if (DepSD->incrementUnscheduledDeps(-1) == 0) {
4191 // There are no more unscheduled dependencies after decrementing,
4192 // so we can put the dependent instruction into the ready list.
4193 ScheduleData *DepBundle = DepSD->FirstInBundle;
4194 assert(!DepBundle->IsScheduled &&
4195 "already scheduled bundle gets ready");
4196 ReadyList.insert(DepBundle);
4198 << "SLP: gets ready (ctl): " << *DepBundle << "\n");
4199 }
4200 }
4201 }
4202 }
4203
4204 /// Verify basic self consistency properties of the data structure.
4205 void verify() {
4206 if (!ScheduleStart)
4207 return;
4208
4209 assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
4210 ScheduleStart->comesBefore(ScheduleEnd) &&
4211 "Not a valid scheduling region?");
4212
4213 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4214 auto *SD = getScheduleData(I);
4215 if (!SD)
4216 continue;
4217 assert(isInSchedulingRegion(SD) &&
4218 "primary schedule data not in window?");
4219 assert(isInSchedulingRegion(SD->FirstInBundle) &&
4220 "entire bundle in window!");
4221 SD->verify();
4222 }
4223
4224 for (auto *SD : ReadyInsts) {
4225 assert(SD->isSchedulingEntity() && SD->isReady() &&
4226 "item in ready list not ready?");
4227 (void)SD;
4228 }
4229 }
4230
4231 /// Put all instructions into the ReadyList which are ready for scheduling.
4232 template <typename ReadyListType>
4233 void initialFillReadyList(ReadyListType &ReadyList) {
4234 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
4235 ScheduleData *SD = getScheduleData(I);
4236 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
4237 SD->isReady()) {
4238 ReadyList.insert(SD);
4240 << "SLP: initially in ready list: " << *SD << "\n");
4241 }
4242 }
4243 }
4244
4245 /// Build a bundle from the ScheduleData nodes corresponding to the
4246 /// scalar instruction for each lane.
4247 ScheduleData *buildBundle(ArrayRef<Value *> VL);
4248
4249 /// Checks if a bundle of instructions can be scheduled, i.e. has no
4250 /// cyclic dependencies. This is only a dry-run, no instructions are
4251 /// actually moved at this stage.
4252 /// \returns the scheduling bundle. The returned Optional value is not
4253 /// std::nullopt if \p VL is allowed to be scheduled.
4254 std::optional<ScheduleData *>
4255 tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
4256 const InstructionsState &S);
4257
4258 /// Un-bundles a group of instructions.
4259 void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
4260
4261 /// Allocates schedule data chunk.
4262 ScheduleData *allocateScheduleDataChunks();
4263
4264 /// Extends the scheduling region so that V is inside the region.
4265 /// \returns true if the region size is within the limit.
4266 bool extendSchedulingRegion(Value *V, const InstructionsState &S);
4267
4268 /// Initialize the ScheduleData structures for new instructions in the
4269 /// scheduling region.
4270 void initScheduleData(Instruction *FromI, Instruction *ToI,
4271 ScheduleData *PrevLoadStore,
4272 ScheduleData *NextLoadStore);
4273
4274 /// Updates the dependency information of a bundle and of all instructions/
4275 /// bundles which depend on the original bundle.
4276 void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
4277 BoUpSLP *SLP);
4278
4279 /// Sets all instructions in the scheduling region to un-scheduled.
4280 void resetSchedule();
4281
4282 BasicBlock *BB;
4283
4284 /// Simple memory allocation for ScheduleData.
4286
4287 /// The size of a ScheduleData array in ScheduleDataChunks.
4288 int ChunkSize;
4289
4290 /// The allocator position in the current chunk, which is the last entry
4291 /// of ScheduleDataChunks.
4292 int ChunkPos;
4293
4294 /// Attaches ScheduleData to Instruction.
4295 /// Note that the mapping survives during all vectorization iterations, i.e.
4296 /// ScheduleData structures are recycled.
4298
4299 /// The ready-list for scheduling (only used for the dry-run).
4300 SetVector<ScheduleData *> ReadyInsts;
4301
4302 /// The first instruction of the scheduling region.
4303 Instruction *ScheduleStart = nullptr;
4304
4305 /// The first instruction _after_ the scheduling region.
4306 Instruction *ScheduleEnd = nullptr;
4307
4308 /// The first memory accessing instruction in the scheduling region
4309 /// (can be null).
4310 ScheduleData *FirstLoadStoreInRegion = nullptr;
4311
4312 /// The last memory accessing instruction in the scheduling region
4313 /// (can be null).
4314 ScheduleData *LastLoadStoreInRegion = nullptr;
4315
4316 /// Is there an llvm.stacksave or llvm.stackrestore in the scheduling
4317 /// region? Used to optimize the dependence calculation for the
4318 /// common case where there isn't.
4319 bool RegionHasStackSave = false;
4320
4321 /// The current size of the scheduling region.
4322 int ScheduleRegionSize = 0;
4323
4324 /// The maximum size allowed for the scheduling region.
4325 int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
4326
4327 /// The ID of the scheduling region. For a new vectorization iteration this
4328 /// is incremented which "removes" all ScheduleData from the region.
4329 /// Make sure that the initial SchedulingRegionID is greater than the
4330 /// initial SchedulingRegionID in ScheduleData (which is 0).
4331 int SchedulingRegionID = 1;
4332 };
4333
4334 /// Attaches the BlockScheduling structures to basic blocks.
4336
4337 /// Performs the "real" scheduling. Done before vectorization is actually
4338 /// performed in a basic block.
4339 void scheduleBlock(BlockScheduling *BS);
4340
4341 /// List of users to ignore during scheduling and that don't need extracting.
4342 const SmallDenseSet<Value *> *UserIgnoreList = nullptr;
4343
4344 /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
4345 /// sorted SmallVectors of unsigned.
4346 struct OrdersTypeDenseMapInfo {
4347 static OrdersType getEmptyKey() {
4348 OrdersType V;
4349 V.push_back(~1U);
4350 return V;
4351 }
4352
4353 static OrdersType getTombstoneKey() {
4354 OrdersType V;
4355 V.push_back(~2U);
4356 return V;
4357 }
4358
4359 static unsigned getHashValue(const OrdersType &V) {
4360 return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
4361 }
4362
4363 static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
4364 return LHS == RHS;
4365 }
4366 };
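  // Minimal usage sketch (illustrative only, not from the original source):
  // the sentinel keys {~1U} and {~2U} let sorted index vectors act as
  // hash-map keys, e.g.:
  //   DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> OrdersUses;
  //   OrdersType Identity = {0, 1, 2, 3};
  //   ++OrdersUses.try_emplace(Identity, 0).first->second; // count one vote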
4367
4368 // Analysis and block reference.
4369 Function *F;
4370 ScalarEvolution *SE;
4372 TargetLibraryInfo *TLI;
4373 LoopInfo *LI;
4374 DominatorTree *DT;
4375 AssumptionCache *AC;
4376 DemandedBits *DB;
4377 const DataLayout *DL;
4379
4380 unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
4381 unsigned MinVecRegSize; // Set by cl::opt (default: 128).
4382
4383 /// Instruction builder to construct the vectorized tree.
4385
4386 /// A map of scalar integer values to the smallest bit width with which they
4387 /// can legally be represented. The values map to (width, signed) pairs,
4388 /// where "width" indicates the minimum bit width and "signed" is True if the
4389 /// value must be signed-extended, rather than zero-extended, back to its
4390 /// original width.
4392
4393 /// Final size of the reduced vector, if the current graph represents the
4394 /// input for the reduction and it was possible to narrow the size of the
4395 /// reduction.
4396 unsigned ReductionBitWidth = 0;
4397
4398 /// Canonical graph size before the transformations.
4399 unsigned BaseGraphSize = 1;
4400
4401 /// If the tree contains any zext/sext/trunc nodes, contains max-min pair of
4402 /// type sizes, used in the tree.
4403 std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;
4404
4405 /// Indices of the vectorized nodes, which are supposed to be the roots of the
4406 /// new bitwidth analysis attempt, like trunc, IToFP or ICmp.
4407 DenseSet<unsigned> ExtraBitWidthNodes;
4408};
4409
4410} // end namespace slpvectorizer
4411
4412template <> struct GraphTraits<BoUpSLP *> {
4413 using TreeEntry = BoUpSLP::TreeEntry;
4414
4415 /// NodeRef has to be a pointer per the GraphWriter.
4417
4419
4420 /// Add the VectorizableTree to the index iterator to be able to return
4421 /// TreeEntry pointers.
4422 struct ChildIteratorType
4423 : public iterator_adaptor_base<
4424 ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
4426
4428 ContainerTy &VT)
4429 : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
4430
4431 NodeRef operator*() { return I->UserTE; }
4432 };
4433
4435 return R.VectorizableTree[0].get();
4436 }
4437
4438 static ChildIteratorType child_begin(NodeRef N) {
4439 return {N->UserTreeIndices.begin(), N->Container};
4440 }
4441
4442 static ChildIteratorType child_end(NodeRef N) {
4443 return {N->UserTreeIndices.end(), N->Container};
4444 }
4445
4446 /// For the node iterator we just need to turn the TreeEntry iterator into a
4447 /// TreeEntry* iterator so that it dereferences to NodeRef.
4448 class nodes_iterator {
4450 ItTy It;
4451
4452 public:
4453 nodes_iterator(const ItTy &It2) : It(It2) {}
4454 NodeRef operator*() { return It->get(); }
4455 nodes_iterator operator++() {
4456 ++It;
4457 return *this;
4458 }
4459 bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
4460 };
4461
4462 static nodes_iterator nodes_begin(BoUpSLP *R) {
4463 return nodes_iterator(R->VectorizableTree.begin());
4464 }
4465
4466 static nodes_iterator nodes_end(BoUpSLP *R) {
4467 return nodes_iterator(R->VectorizableTree.end());
4468 }
4469
4470 static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
4471};
4472
4473template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
4474 using TreeEntry = BoUpSLP::TreeEntry;
4475
4476 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
4477
4478 std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
4479 std::string Str;
4481 OS << Entry->Idx << ".\n";
4482 if (isSplat(Entry->Scalars))
4483 OS << "<splat> ";
4484 for (auto *V : Entry->Scalars) {
4485 OS << *V;
4486 if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
4487 return EU.Scalar == V;
4488 }))
4489 OS << " <extract>";
4490 OS << "\n";
4491 }
4492 return Str;
4493 }
4494
4495 static std::string getNodeAttributes(const TreeEntry *Entry,
4496 const BoUpSLP *) {
4497 if (Entry->isGather())
4498 return "color=red";
4499 if (Entry->State == TreeEntry::ScatterVectorize ||
4500 Entry->State == TreeEntry::StridedVectorize)
4501 return "color=blue";
4502 return "";
4503 }
4504};
4505
4506} // end namespace llvm
4507
4508 BoUpSLP::~BoUpSLP() {
4509   SmallVector<WeakTrackingVH> DeadInsts;
4510   for (auto *I : DeletedInstructions) {
4511 if (!I->getParent()) {
4512 // Temporarily insert instructions back to erase them from the parent and
4513 // memory later.
4514 if (isa<PHINode>(I))
4515 // Phi nodes must be the very first instructions in the block.
4516 I->insertBefore(F->getEntryBlock(),
4517 F->getEntryBlock().getFirstNonPHIIt());
4518 else
4519 I->insertBefore(F->getEntryBlock().getTerminator());
4520 continue;
4521 }
4522 for (Use &U : I->operands()) {
4523 auto *Op = dyn_cast<Instruction>(U.get());
4524 if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
4525           wouldInstructionBeTriviallyDead(Op, TLI))
4526         DeadInsts.emplace_back(Op);
4527 }
4528 I->dropAllReferences();
4529 }
4530 for (auto *I : DeletedInstructions) {
4531 assert(I->use_empty() &&
4532 "trying to erase instruction with users.");
4533 I->eraseFromParent();
4534 }
4535
4536 // Cleanup any dead scalar code feeding the vectorized instructions
4537   RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI);
4538
4539#ifdef EXPENSIVE_CHECKS
4540 // If we could guarantee that this call is not extremely slow, we could
4541 // remove the ifdef limitation (see PR47712).
4542 assert(!verifyFunction(*F, &dbgs()));
4543#endif
4544}
4545
4546/// Reorders the given \p Reuses mask according to the given \p Mask. \p Reuses
4547/// contains the original mask for the scalars reused in the node. The procedure
4548/// transforms this mask in accordance with the given \p Mask.
4549 static void reorderReuses(SmallVectorImpl<int> &Reuses, ArrayRef<int> Mask) {
4550   assert(!Mask.empty() && Reuses.size() == Mask.size() &&
4551 "Expected non-empty mask.");
4552 SmallVector<int> Prev(Reuses.begin(), Reuses.end());
4553 Prev.swap(Reuses);
4554 for (unsigned I = 0, E = Prev.size(); I < E; ++I)
4555 if (Mask[I] != PoisonMaskElem)
4556 Reuses[Mask[I]] = Prev[I];
4557}
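// Worked example (illustrative, not from the source): with Reuses = {3, 2, 1, 0}
// and Mask = {1, 0, 3, 2}, element I of the previous mask moves to position
// Mask[I], giving Reuses = {2, 3, 0, 1}; entries of Reuses that no Mask element
// points to keep their previous value.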
4558
4559/// Reorders the given \p Order according to the given \p Mask. \p Order is
4560/// the original order of the scalars. Procedure transforms the provided order
4561/// in accordance with the given \p Mask. If the resulting \p Order is just an
4562/// identity order, \p Order is cleared.
4563 static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask,
4564                          bool BottomOrder = false) {
4565 assert(!Mask.empty() && "Expected non-empty mask.");
4566 unsigned Sz = Mask.size();
4567 if (BottomOrder) {
4568 SmallVector<unsigned> PrevOrder;
4569 if (Order.empty()) {
4570 PrevOrder.resize(Sz);
4571 std::iota(PrevOrder.begin(), PrevOrder.end(), 0);
4572 } else {
4573 PrevOrder.swap(Order);
4574 }
4575 Order.assign(Sz, Sz);
4576 for (unsigned I = 0; I < Sz; ++I)
4577 if (Mask[I] != PoisonMaskElem)
4578 Order[I] = PrevOrder[Mask[I]];
4579 if (all_of(enumerate(Order), [&](const auto &Data) {
4580 return Data.value() == Sz || Data.index() == Data.value();
4581 })) {
4582 Order.clear();
4583 return;
4584 }
4585 fixupOrderingIndices(Order);
4586 return;
4587 }
4588 SmallVector<int> MaskOrder;
4589 if (Order.empty()) {
4590 MaskOrder.resize(Sz);
4591 std::iota(MaskOrder.begin(), MaskOrder.end(), 0);
4592 } else {
4593 inversePermutation(Order, MaskOrder);
4594 }
4595 reorderReuses(MaskOrder, Mask);
4596 if (ShuffleVectorInst::isIdentityMask(MaskOrder, Sz)) {
4597 Order.clear();
4598 return;
4599 }
4600 Order.assign(Sz, Sz);
4601 for (unsigned I = 0; I < Sz; ++I)
4602 if (MaskOrder[I] != PoisonMaskElem)
4603 Order[MaskOrder[I]] = I;
4604 fixupOrderingIndices(Order);
4605}
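// Worked example (illustrative): with an initially empty \p Order (identity) and
// Mask = {2, 0, 1, 3}, the non-BottomOrder path yields Order = {2, 0, 1, 3},
// i.e. the mask is adopted as the new order; if the combined mask had been an
// identity, \p Order would have been cleared instead.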
4606
4607std::optional<BoUpSLP::OrdersType>
4608BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
4609 assert(TE.isGather() && "Expected gather node only.");
4610 // Try to find subvector extract/insert patterns and reorder only such
4611 // patterns.
4612 SmallVector<Value *> GatheredScalars(TE.Scalars.begin(), TE.Scalars.end());
4613 Type *ScalarTy = GatheredScalars.front()->getType();
4614 int NumScalars = GatheredScalars.size();
4615 if (!isValidElementType(ScalarTy))
4616 return std::nullopt;
4617 auto *VecTy = getWidenedType(ScalarTy, NumScalars);
4618 int NumParts = TTI->getNumberOfParts(VecTy);
4619 if (NumParts == 0 || NumParts >= NumScalars ||
4620 VecTy->getNumElements() % NumParts != 0 ||
4621 !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
4622 VecTy->getNumElements() / NumParts))
4623 NumParts = 1;
4624 SmallVector<int> ExtractMask;
4625 SmallVector<int> Mask;
4628 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4630 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4631 /*ForOrder=*/true);
4632 // No shuffled operands - ignore.
4633 if (GatherShuffles.empty() && ExtractShuffles.empty())
4634 return std::nullopt;
4635 OrdersType CurrentOrder(NumScalars, NumScalars);
4636 if (GatherShuffles.size() == 1 &&
4637 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
4638 Entries.front().front()->isSame(TE.Scalars)) {
4639 // Perfect match in the graph, will reuse the previously vectorized
4640 // node. Cost is 0.
4641 std::iota(CurrentOrder.begin(), CurrentOrder.end(), 0);
4642 return CurrentOrder;
4643 }
4644 auto IsSplatMask = [](ArrayRef<int> Mask) {
4645 int SingleElt = PoisonMaskElem;
4646 return all_of(Mask, [&](int I) {
4647 if (SingleElt == PoisonMaskElem && I != PoisonMaskElem)
4648 SingleElt = I;
4649 return I == PoisonMaskElem || I == SingleElt;
4650 });
4651 };
4652 // Exclusive broadcast mask - ignore.
4653 if ((ExtractShuffles.empty() && IsSplatMask(Mask) &&
4654 (Entries.size() != 1 ||
4655 Entries.front().front()->ReorderIndices.empty())) ||
4656 (GatherShuffles.empty() && IsSplatMask(ExtractMask)))
4657 return std::nullopt;
4658 SmallBitVector ShuffledSubMasks(NumParts);
4659 auto TransformMaskToOrder = [&](MutableArrayRef<unsigned> CurrentOrder,
4660 ArrayRef<int> Mask, int PartSz, int NumParts,
4661 function_ref<unsigned(unsigned)> GetVF) {
4662 for (int I : seq<int>(0, NumParts)) {
4663 if (ShuffledSubMasks.test(I))
4664 continue;
4665 const int VF = GetVF(I);
4666 if (VF == 0)
4667 continue;
4668 unsigned Limit = getNumElems(CurrentOrder.size(), PartSz, I);
4669 MutableArrayRef<unsigned> Slice = CurrentOrder.slice(I * PartSz, Limit);
4670 // Shuffle of at least 2 vectors - ignore.
4671 if (any_of(Slice, [&](int I) { return I != NumScalars; })) {
4672 std::fill(Slice.begin(), Slice.end(), NumScalars);
4673 ShuffledSubMasks.set(I);
4674 continue;
4675 }
4676 // Try to include as many elements from the mask as possible.
4677 int FirstMin = INT_MAX;
4678 bool SecondVecFound = false;
4679 for (int K : seq<int>(Limit)) {
4680 int Idx = Mask[I * PartSz + K];
4681 if (Idx == PoisonMaskElem) {
4682 Value *V = GatheredScalars[I * PartSz + K];
4683 if (isConstant(V) && !isa<PoisonValue>(V)) {
4684 SecondVecFound = true;
4685 break;
4686 }
4687 continue;
4688 }
4689 if (Idx < VF) {
4690 if (FirstMin > Idx)
4691 FirstMin = Idx;
4692 } else {
4693 SecondVecFound = true;
4694 break;
4695 }
4696 }
4697 FirstMin = (FirstMin / PartSz) * PartSz;
4698 // Shuffle of at least 2 vectors - ignore.
4699 if (SecondVecFound) {
4700 std::fill(Slice.begin(), Slice.end(), NumScalars);
4701 ShuffledSubMasks.set(I);
4702 continue;
4703 }
4704 for (int K : seq<int>(Limit)) {
4705 int Idx = Mask[I * PartSz + K];
4706 if (Idx == PoisonMaskElem)
4707 continue;
4708 Idx -= FirstMin;
4709 if (Idx >= PartSz) {
4710 SecondVecFound = true;
4711 break;
4712 }
4713 if (CurrentOrder[I * PartSz + Idx] >
4714 static_cast<unsigned>(I * PartSz + K) &&
4715 CurrentOrder[I * PartSz + Idx] !=
4716 static_cast<unsigned>(I * PartSz + Idx))
4717 CurrentOrder[I * PartSz + Idx] = I * PartSz + K;
4718 }
4719 // Shuffle of at least 2 vectors - ignore.
4720 if (SecondVecFound) {
4721 std::fill(Slice.begin(), Slice.end(), NumScalars);
4722 ShuffledSubMasks.set(I);
4723 continue;
4724 }
4725 }
4726 };
4727 int PartSz = getPartNumElems(NumScalars, NumParts);
4728 if (!ExtractShuffles.empty())
4729 TransformMaskToOrder(
4730 CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) {
4731 if (!ExtractShuffles[I])
4732 return 0U;
4733 unsigned VF = 0;
4734 unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I);
4735 for (unsigned Idx : seq<unsigned>(Sz)) {
4736 int K = I * PartSz + Idx;
4737 if (ExtractMask[K] == PoisonMaskElem)
4738 continue;
4739 if (!TE.ReuseShuffleIndices.empty())
4740 K = TE.ReuseShuffleIndices[K];
4741 if (K == PoisonMaskElem)
4742 continue;
4743 if (!TE.ReorderIndices.empty())
4744 K = std::distance(TE.ReorderIndices.begin(),
4745 find(TE.ReorderIndices, K));
4746 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4747 if (!EI)
4748 continue;
4749 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4750 ->getElementCount()
4751 .getKnownMinValue());
4752 }
4753 return VF;
4754 });
4755 // Check special corner case - single shuffle of the same entry.
4756 if (GatherShuffles.size() == 1 && NumParts != 1) {
4757 if (ShuffledSubMasks.any())
4758 return std::nullopt;
4759 PartSz = NumScalars;
4760 NumParts = 1;
4761 }
4762 if (!Entries.empty())
4763 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) {
4764 if (!GatherShuffles[I])
4765 return 0U;
4766 return std::max(Entries[I].front()->getVectorFactor(),
4767 Entries[I].back()->getVectorFactor());
4768 });
4769 int NumUndefs =
4770 count_if(CurrentOrder, [&](int Idx) { return Idx == NumScalars; });
4771 if (ShuffledSubMasks.all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4772 return std::nullopt;
4773 return std::move(CurrentOrder);
4774}
4775
4776static bool arePointersCompatible(Value *Ptr1, Value *Ptr2,
4777 const TargetLibraryInfo &TLI,
4778 bool CompareOpcodes = true) {
4779   if (getUnderlyingObject(Ptr1, RecursionMaxDepth) !=
4780       getUnderlyingObject(Ptr2, RecursionMaxDepth))
4781     return false;
4782 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4783 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4784 return (!GEP1 || GEP1->getNumOperands() == 2) &&
4785 (!GEP2 || GEP2->getNumOperands() == 2) &&
4786 (((!GEP1 || isConstant(GEP1->getOperand(1))) &&
4787 (!GEP2 || isConstant(GEP2->getOperand(1)))) ||
4788 !CompareOpcodes ||
4789 (GEP1 && GEP2 &&
4790 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
4791}
4792
4793/// Calculates minimal alignment as a common alignment.
4794template <typename T>
4795 static Align computeCommonAlignment(ArrayRef<Value *> VL) {
4796   Align CommonAlignment = cast<T>(VL.front())->getAlign();
4797 for (Value *V : VL.drop_front())
4798 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->getAlign());
4799 return CommonAlignment;
4800}
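// For example (illustrative), a bundle of loads aligned to 16, 8 and 4 bytes
// has a common alignment of 4: the largest alignment that is guaranteed for
// every access in the bundle.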
4801
4802/// Check if \p Order represents reverse order.
4803 static bool isReverseOrder(ArrayRef<unsigned> Order) {
4804   assert(!Order.empty() &&
4805 "Order is empty. Please check it before using isReverseOrder.");
4806 unsigned Sz = Order.size();
4807 return all_of(enumerate(Order), [&](const auto &Pair) {
4808 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4809 });
4810}
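// For example (illustrative), with Sz == 4 both {3, 2, 1, 0} and {3, 4, 1, 0}
// are treated as reversed orders: entries equal to Sz count as unset and match
// any position.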
4811
4812/// Checks if the provided list of pointers \p PointerOps represents strided
4813/// pointers for type \p ElemTy. If they are not, std::nullopt is returned.
4814/// Otherwise, if \p Inst is not specified, just an initialized optional value
4815/// is returned to show that the pointers represent strided pointers. If \p Inst
4816/// is specified, the runtime stride is materialized before the given \p Inst.
4817/// \returns std::nullopt if the pointers do not share a runtime stride;
4818/// otherwise nullptr or the actual stride value.
4819static std::optional<Value *>
4820 calculateRtStride(ArrayRef<Value *> PointerOps, Type *ElemTy,
4821                   const DataLayout &DL, ScalarEvolution &SE,
4822 SmallVectorImpl<unsigned> &SortedIndices,
4823 Instruction *Inst = nullptr) {
4825 const SCEV *PtrSCEVLowest = nullptr;
4826 const SCEV *PtrSCEVHighest = nullptr;
4827 // Find lower/upper pointers from the PointerOps (i.e. with lowest and highest
4828 // addresses).
4829 for (Value *Ptr : PointerOps) {
4830 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
4831 if (!PtrSCEV)
4832 return std::nullopt;
4833 SCEVs.push_back(PtrSCEV);
4834 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4835 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4836 continue;
4837 }
4838 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4839 if (isa<SCEVCouldNotCompute>(Diff))
4840 return std::nullopt;
4841 if (Diff->isNonConstantNegative()) {
4842 PtrSCEVLowest = PtrSCEV;
4843 continue;
4844 }
4845 const SCEV *Diff1 = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEV);
4846 if (isa<SCEVCouldNotCompute>(Diff1))
4847 return std::nullopt;
4848 if (Diff1->isNonConstantNegative()) {
4849 PtrSCEVHighest = PtrSCEV;
4850 continue;
4851 }
4852 }
4853 // Dist = PtrSCEVHighest - PtrSCEVLowest;
4854 const SCEV *Dist = SE.getMinusSCEV(PtrSCEVHighest, PtrSCEVLowest);
4855 if (isa<SCEVCouldNotCompute>(Dist))
4856 return std::nullopt;
4857 int Size = DL.getTypeStoreSize(ElemTy);
4858 auto TryGetStride = [&](const SCEV *Dist,
4859 const SCEV *Multiplier) -> const SCEV * {
4860 if (const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4861 if (M->getOperand(0) == Multiplier)
4862 return M->getOperand(1);
4863 if (M->getOperand(1) == Multiplier)
4864 return M->getOperand(0);
4865 return nullptr;
4866 }
4867 if (Multiplier == Dist)
4868 return SE.getConstant(Dist->getType(), 1);
4869 return SE.getUDivExactExpr(Dist, Multiplier);
4870 };
4871 // Stride_in_elements = Dist / (element_size * (num_elems - 1)).
4872 const SCEV *Stride = nullptr;
4873 if (Size != 1 || SCEVs.size() > 2) {
4874 const SCEV *Sz = SE.getConstant(Dist->getType(), Size * (SCEVs.size() - 1));
4875 Stride = TryGetStride(Dist, Sz);
4876 if (!Stride)
4877 return std::nullopt;
4878 }
4879 if (!Stride || isa<SCEVConstant>(Stride))
4880 return std::nullopt;
4881 // Iterate through all pointers and check if all distances are
4882 // unique multiples of Stride.
4883 using DistOrdPair = std::pair<int64_t, int>;
4884 auto Compare = llvm::less_first();
4885 std::set<DistOrdPair, decltype(Compare)> Offsets(Compare);
4886 int Cnt = 0;
4887 bool IsConsecutive = true;
4888 for (const SCEV *PtrSCEV : SCEVs) {
4889 unsigned Dist = 0;
4890 if (PtrSCEV != PtrSCEVLowest) {
4891 const SCEV *Diff = SE.getMinusSCEV(PtrSCEV, PtrSCEVLowest);
4892 const SCEV *Coeff = TryGetStride(Diff, Stride);
4893 if (!Coeff)
4894 return std::nullopt;
4895 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4896 if (!SC || isa<SCEVCouldNotCompute>(SC))
4897 return std::nullopt;
4898 if (!SE.getMinusSCEV(PtrSCEV, SE.getAddExpr(PtrSCEVLowest,
4899 SE.getMulExpr(Stride, SC)))
4900 ->isZero())
4901 return std::nullopt;
4902 Dist = SC->getAPInt().getZExtValue();
4903 }
4904 // If the strides are not the same or repeated, we can't vectorize.
4905 if ((Dist / Size) * Size != Dist || (Dist / Size) >= SCEVs.size())
4906 return std::nullopt;
4907 auto Res = Offsets.emplace(Dist, Cnt);
4908 if (!Res.second)
4909 return std::nullopt;
4910 // Consecutive order if the inserted element is the last one.
4911 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4912 ++Cnt;
4913 }
4914 if (Offsets.size() != SCEVs.size())
4915 return std::nullopt;
4916 SortedIndices.clear();
4917 if (!IsConsecutive) {
4918 // Fill SortedIndices array only if it is non-consecutive.
4919 SortedIndices.resize(PointerOps.size());
4920 Cnt = 0;
4921 for (const std::pair<int64_t, int> &Pair : Offsets) {
4922 SortedIndices[Cnt] = Pair.second;
4923 ++Cnt;
4924 }
4925 }
4926 if (!Inst)
4927 return nullptr;
4928 SCEVExpander Expander(SE, DL, "strided-load-vec");
4929 return Expander.expandCodeFor(Stride, Stride->getType(), Inst);
4930}
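// Illustrative sketch (assumed IR, not from the source): for the pointers
//   %p0 = %base
//   %p1 = getelementptr i32, ptr %base, i64 %n
//   %p2 = getelementptr i32, ptr %p1,   i64 %n
// every distance is a multiple of a single runtime stride of %n elements
// (4 * %n bytes), so the analysis succeeds: the stride is returned (expanded
// before \p Inst when \p Inst is given) and SortedIndices is filled only if the
// pointers were not already listed in offset order.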
4931
4932static std::pair<InstructionCost, InstructionCost>
4933 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
4934             Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
4935 Type *ScalarTy, VectorType *VecTy);
4936
4937/// Returns the cost of the shuffle instructions with the given \p Kind, vector
4938/// type \p Tp and optional \p Mask. Adds SLP-specific cost estimation for insert
4939/// subvector pattern.
4940static InstructionCost
4941 getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind,
4942                VectorType *Tp, ArrayRef<int> Mask = {},
4943                TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
4944                int Index = 0, VectorType *SubTp = nullptr,
4945                ArrayRef<Value *> Args = {}) {
4946   if (Kind != TTI::SK_PermuteTwoSrc)
4947 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4948 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4949 int NumSubElts;
4951 Mask, NumSrcElts, NumSubElts, Index)) {
4952 if (Index + NumSubElts > NumSrcElts &&
4953 Index + NumSrcElts <= static_cast<int>(Mask.size()))
4954 return TTI.getShuffleCost(
4956 getWidenedType(Tp->getElementType(), Mask.size()), Mask,
4958 }
4959 return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args);
4960}
4961
4962/// Correctly creates insert_subvector, checking that the index is a multiple of
4963/// the subvector's length. Otherwise, generates a shuffle using \p Generator or
4964/// the default shuffle.
4965 static Value *createInsertVector(
4966     IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index,
4967 function_ref<Value *(Value *, Value *, ArrayRef<int>)> Generator = {}) {
4968 const unsigned SubVecVF = getNumElements(V->getType());
4969 if (Index % SubVecVF == 0) {
4970 Vec = Builder.CreateInsertVector(Vec->getType(), Vec, V,
4971 Builder.getInt64(Index));
4972 } else {
4973 // Create a shuffle; insert_subvector requires the index to be a multiple of
4974 // the subvector length.
4975 const unsigned VecVF = getNumElements(Vec->getType());
4976     SmallVector<int> Mask(VecVF, PoisonMaskElem);
4977     std::iota(Mask.begin(), Mask.end(), 0);
4978 for (unsigned I : seq<unsigned>(SubVecVF))
4979 Mask[I + Index] = I + VecVF;
4980 if (Generator) {
4981 Vec = Generator(Vec, V, Mask);
4982 } else {
4983 // 1. Resize V to the size of Vec.
4984 SmallVector<int> ResizeMask(VecVF, PoisonMaskElem);
4985 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
4986 V = Builder.CreateShuffleVector(V, ResizeMask);
4987 Vec = Builder.CreateShuffleVector(Vec, V, Mask);
4988 }
4989 }
4990 return Vec;
4991}
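// Worked example (illustrative): inserting a 4-element vector V into an
// 8-element vector Vec at Index == 2 (not a multiple of 4) takes the shuffle
// path: Mask becomes {0, 1, 8, 9, 10, 11, 6, 7}, V is first widened to 8 lanes
// with ResizeMask = {0, 1, 2, 3, poison, poison, poison, poison}, and the final
// two-source shuffle places V's lanes into positions 2..5 of the result.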
4992
4993/// Correctly creates extract_subvector, checking that the index is a multiple of
4994/// the subvector's length. Otherwise, generates a shuffle using \p Generator or
4995/// the default shuffle.
4996 static Value *createExtractVector(IRBuilderBase &Builder, Value *Vec,
4997                                   unsigned SubVecVF, unsigned Index) {
4998 if (Index % SubVecVF == 0) {
4999 VectorType *SubVecTy =
5000 getWidenedType(Vec->getType()->getScalarType(), SubVecVF);
5001 return Builder.CreateExtractVector(SubVecTy, Vec, Builder.getInt64(Index));
5002 }
5003 // Create a shuffle; extract_subvector requires the index to be a multiple of
5004 // the subvector length.
5005 SmallVector<int> Mask(SubVecVF, PoisonMaskElem);
5006 std::iota(Mask.begin(), Mask.end(), Index);
5007 return Builder.CreateShuffleVector(Vec, Mask);
5008}
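// Worked example (illustrative): extracting 4 elements starting at Index == 2
// from an 8-element vector uses a single-source shuffle with Mask = {2, 3, 4, 5};
// with Index == 4 it would emit llvm.vector.extract directly.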
5009
5010 BoUpSLP::LoadsState
5011 BoUpSLP::canVectorizeLoads(ArrayRef<Value *> VL, const Value *VL0,
5012                            OrdersType &Order,
5013                            SmallVectorImpl<Value *> &PointerOps,
5014 unsigned *BestVF, bool TryRecursiveCheck) const {
5015 // Check that a vectorized load would load the same memory as a scalar
5016 // load. For example, we don't want to vectorize loads that are smaller
5017 // than 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM
5018 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
5019 // from such a struct, we read/write packed bits disagreeing with the
5020 // unvectorized version.
5021 if (BestVF)
5022 *BestVF = 0;
5024 return LoadsState::Gather;
5025 Type *ScalarTy = VL0->getType();
5026
5027 if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy))
5028 return LoadsState::Gather;
5029
5030 // Make sure all loads in the bundle are simple - we can't vectorize
5031 // atomic or volatile loads.
5032 PointerOps.clear();
5033 const unsigned Sz = VL.size();
5034 PointerOps.resize(Sz);
5035 auto *POIter = PointerOps.begin();
5036 for (Value *V : VL) {
5037 auto *L = dyn_cast<LoadInst>(V);
5038 if (!L || !L->isSimple())
5039 return LoadsState::Gather;
5040 *POIter = L->getPointerOperand();
5041 ++POIter;
5042 }
5043
5044 Order.clear();
5045 // Check the order of pointer operands or that all pointers are the same.
5046 bool IsSorted = sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, Order);
5047
5048 auto *VecTy = getWidenedType(ScalarTy, Sz);
5049 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
5050 if (!IsSorted) {
5051 if (Sz > MinProfitableStridedLoads && TTI->isTypeLegal(VecTy)) {
5052 if (TTI->isLegalStridedLoadStore(VecTy, CommonAlignment) &&
5053 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order))
5054         return LoadsState::StridedVectorize;
5055     }
5056
5057 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
5058 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
5059 return LoadsState::Gather;
5060
5061 if (!all_of(PointerOps, [&](Value *P) {
5062 return arePointersCompatible(P, PointerOps.front(), *TLI);
5063 }))
5064 return LoadsState::Gather;
5065
5066 } else {
5067 Value *Ptr0;
5068 Value *PtrN;
5069 if (Order.empty()) {
5070 Ptr0 = PointerOps.front();
5071 PtrN = PointerOps.back();
5072 } else {
5073 Ptr0 = PointerOps[Order.front()];
5074 PtrN = PointerOps[Order.back()];
5075 }
5076 std::optional<int> Diff =
5077 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
5078 // Check that the sorted loads are consecutive.
5079 if (static_cast<unsigned>(*Diff) == Sz - 1)
5080 return LoadsState::Vectorize;
5081 if (!TTI->isLegalMaskedGather(VecTy, CommonAlignment) ||
5082 TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment))
5083 return LoadsState::Gather;
5084 // Simple check if not a strided access - clear order.
5085 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
5086 // Try to generate strided load node if:
5087 // 1. Target with strided load support is detected.
5088 // 2. The number of loads is greater than MinProfitableStridedLoads,
5089 // or the potential stride <= MaxProfitableLoadStride and the
5090 // potential stride is power-of-2 (to avoid perf regressions for the very
5091 // small number of loads) and max distance > number of loads, or potential
5092 // stride is -1.
5093 // 3. The loads are ordered, or number of unordered loads <=
5094 // MaxProfitableUnorderedLoads, or loads are in reversed order.
5095 // (this check is to avoid extra costs for very expensive shuffles).
5096 // 4. Any pointer operand is an instruction with the users outside of the
5097 // current graph (for masked gathers extra extractelement instructions
5098 // might be required).
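    // Illustrative example: 4 sorted loads whose farthest pointers are 9
    // elements apart give Stride = 9 / (4 - 1) = 3; they can become a strided
    // node only if every load's distance from the first is a distinct multiple
    // of 3 (offsets 0, 3, 6, 9), the target reports isLegalStridedLoadStore
    // for the widened type, and the profitability limits above are met.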
5099 auto IsAnyPointerUsedOutGraph =
5100 IsPossibleStrided && any_of(PointerOps, [&](Value *V) {
5101 return isa<Instruction>(V) && any_of(V->users(), [&](User *U) {
5102 return !getTreeEntry(U) && !MustGather.contains(U);
5103 });
5104 });
5105 const unsigned AbsoluteDiff = std::abs(*Diff);
5106 if (IsPossibleStrided &&
5107 (IsAnyPointerUsedOutGraph ||
5108 (AbsoluteDiff > Sz &&
5109          (Sz > MinProfitableStridedLoads ||
5110           (AbsoluteDiff <= MaxProfitableLoadStride * Sz &&
5111 AbsoluteDiff % Sz == 0 && has_single_bit(AbsoluteDiff / Sz)))) ||
5112 *Diff == -(static_cast<int>(Sz) - 1))) {
5113 int Stride = *Diff / static_cast<int>(Sz - 1);
5114 if (*Diff == Stride * static_cast<int>(Sz - 1)) {
5115 Align Alignment =
5116 cast<LoadInst>(Order.empty() ? VL.front() : VL[Order.front()])
5117 ->getAlign();
5118 if (TTI->isLegalStridedLoadStore(VecTy, Alignment)) {
5119 // Iterate through all pointers and check if all distances are
5120 // unique multiples of Stride.
5121 SmallSet<int, 4> Dists;
5122 for (Value *Ptr : PointerOps) {
5123 int Dist = 0;
5124 if (Ptr == PtrN)
5125 Dist = *Diff;
5126 else if (Ptr != Ptr0)
5127 Dist = *getPointersDiff(ScalarTy, Ptr0, ScalarTy, Ptr, *DL, *SE);
5128 // If the strides are not the same or repeated, we can't
5129 // vectorize.
5130 if (((Dist / Stride) * Stride) != Dist ||
5131 !Dists.insert(Dist).second)
5132 break;
5133 }
5134 if (Dists.size() == Sz)
5135             return LoadsState::StridedVectorize;
5136         }
5137 }
5138 }
5139 }
5140 // Compare the cost of loads + shuffles rather than strided/masked gather
5141 // loads. Returns true if the vectorized + shuffles representation is better
5142 // than just gather.
5143 auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment,
5144 unsigned *BestVF,
5145 bool ProfitableGatherPointers) {
5146 if (BestVF)
5147 *BestVF = 0;
5148 // Compare masked gather cost and loads + insert subvector costs.
5149     constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
5150     auto [ScalarGEPCost, VectorGEPCost] =
5151 getGEPCosts(TTI, PointerOps, PointerOps.front(),
5152 Instruction::GetElementPtr, CostKind, ScalarTy, VecTy);
5153 // Estimate the cost of masked gather GEP. If not a splat, roughly
5154 // estimate as a buildvector, otherwise estimate as splat.
5155 APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
5156 VectorType *PtrVecTy =
5157 getWidenedType(PointerOps.front()->getType()->getScalarType(),
5158 VecTy->getNumElements());
5159 if (static_cast<unsigned>(count_if(
5160 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.size() - 1 ||
5161 any_of(PointerOps, [&](Value *V) {
5162 return getUnderlyingObject(V) !=
5163 getUnderlyingObject(PointerOps.front());
5164 }))
5165 VectorGEPCost += TTI.getScalarizationOverhead(
5166 PtrVecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
5167 else
5168 VectorGEPCost +=
5169           TTI.getScalarizationOverhead(
5170               PtrVecTy, APInt::getOneBitSet(VecTy->getNumElements(), 0),
5171 /*Insert=*/true, /*Extract=*/false, CostKind) +
5172           ::getShuffleCost(TTI, TTI::SK_Broadcast, PtrVecTy, {}, CostKind);
5173     // The cost of scalar loads.
5174 InstructionCost ScalarLoadsCost =
5175 std::accumulate(VL.begin(), VL.end(), InstructionCost(),
5176 [&](InstructionCost C, Value *V) {
5177 return C + TTI.getInstructionCost(
5178 cast<Instruction>(V), CostKind);
5179 }) +
5180 ScalarGEPCost;
5181 // The cost of masked gather.
5182 InstructionCost MaskedGatherCost =
5183         TTI.getGatherScatterOpCost(
5184             Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
5185 /*VariableMask=*/false, CommonAlignment, CostKind) +
5186 (ProfitableGatherPointers ? 0 : VectorGEPCost);
5187 InstructionCost GatherCost =
5188 TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
5189 /*Extract=*/false, CostKind) +
5190 ScalarLoadsCost;
5191 // The list of loads is small, or we already performed a partial check -
5192 // directly compare the masked gather cost and the gather cost.
5193 constexpr unsigned ListLimit = 4;
5194 if (!TryRecursiveCheck || VL.size() < ListLimit)
5195 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5196
5197 // FIXME: The following code has not been updated for non-power-of-2
5198 // vectors (and not whole registers). The splitting logic here does not
5199 // cover the original vector if the vector factor is not a power of two.
5200 if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size()))
5201 return false;
5202
5203 unsigned Sz = DL->getTypeSizeInBits(ScalarTy);
5204 unsigned MinVF = getMinVF(2 * Sz);
5205 DemandedElts.clearAllBits();
5206 // Iterate through possible vectorization factors and check if vectorized +
5207 // shuffles is better than just gather.
5208 for (unsigned VF =
5209 getFloorFullVectorNumberOfElements(TTI, ScalarTy, VL.size() - 1);
5210 VF >= MinVF;
5211 VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) {
5212       SmallVector<LoadsState> States;
5213       for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) {
5214 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
5215         OrdersType Order;
5216         SmallVector<Value *> PointerOps;
5217 LoadsState LS =
5218 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF,
5219 /*TryRecursiveCheck=*/false);
5220 // Check that the sorted loads are consecutive.
5221 if (LS == LoadsState::Gather) {
5222 if (BestVF) {
5223 DemandedElts.setAllBits();
5224 break;
5225 }
5226 DemandedElts.setBits(Cnt, Cnt + VF);
5227 continue;
5228 }
5229 // If the reorder is needed - consider it as a high-cost masked gather for now.
5230 if ((LS == LoadsState::Vectorize ||
5231              LS == LoadsState::StridedVectorize) &&
5232             !Order.empty() && !isReverseOrder(Order))
5233           LS = LoadsState::ScatterVectorize;
5234         States.push_back(LS);
5235 }
5236 if (DemandedElts.isAllOnes())
5237 // All loads gathered - try smaller VF.
5238 continue;
5239 // Can be vectorized later as a series of loads/insertelements.
5240 InstructionCost VecLdCost = 0;
5241 if (!DemandedElts.isZero()) {
5242 VecLdCost =
5243 TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
5244 /*Extract=*/false, CostKind) +
5245 ScalarGEPCost;
5246 for (unsigned Idx : seq<unsigned>(VL.size()))
5247 if (DemandedElts[Idx])
5248 VecLdCost +=
5249 TTI.getInstructionCost(cast<Instruction>(VL[Idx]), CostKind);
5250 }
5251 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
5252 auto *SubVecTy = getWidenedType(ScalarTy, VF);
5253 for (auto [I, LS] : enumerate(States)) {
5254 auto *LI0 = cast<LoadInst>(VL[I * VF]);
5255 InstructionCost VectorGEPCost =
5256 (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers)
5257 ? 0
5258 : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF),
5259 LI0->getPointerOperand(),
5260 Instruction::GetElementPtr, CostKind, ScalarTy,
5261 SubVecTy)
5262 .second;
5263 if (LS == LoadsState::ScatterVectorize) {
5264 if (static_cast<unsigned>(
5265 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
5266 PointerOps.size() - 1 ||
5267 any_of(PointerOps, [&](Value *V) {
5268 return getUnderlyingObject(V) !=
5269 getUnderlyingObject(PointerOps.front());
5270 }))
5271 VectorGEPCost += TTI.getScalarizationOverhead(
5272 SubVecTy, APInt::getAllOnes(VF),
5273 /*Insert=*/true, /*Extract=*/false, CostKind);
5274 else
5275 VectorGEPCost +=
5276                 TTI.getScalarizationOverhead(
5277                     SubVecTy, APInt::getOneBitSet(ScalarTyNumElements * VF, 0),
5278 /*Insert=*/true, /*Extract=*/false, CostKind) +
5279 ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {},
5280 CostKind);
5281 }
5282 switch (LS) {
5283       case LoadsState::Vectorize:
5284         VecLdCost +=
5285 TTI.getMemoryOpCost(Instruction::Load, SubVecTy, LI0->getAlign(),
5286 LI0->getPointerAddressSpace(), CostKind,
5288 VectorGEPCost;
5289 break;
5290       case LoadsState::StridedVectorize:
5291         VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy,
5292 LI0->getPointerOperand(),
5293 /*VariableMask=*/false,
5294 CommonAlignment, CostKind) +
5295 VectorGEPCost;
5296 break;
5297       case LoadsState::ScatterVectorize:
5298         VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy,
5299 LI0->getPointerOperand(),
5300 /*VariableMask=*/false,
5301 CommonAlignment, CostKind) +
5302 VectorGEPCost;
5303 break;
5304 case LoadsState::Gather:
5305 // Gathers are already calculated - ignore.
5306 continue;
5307 }
5308 SmallVector<int> ShuffleMask(VL.size());
5309 for (int Idx : seq<int>(0, VL.size()))
5310 ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx;
5311 if (I > 0)
5312 VecLdCost +=
5313 ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask,
5314 CostKind, I * VF, SubVecTy);
5315 }
5316 // If masked gather cost is higher - better to vectorize, so
5317 // consider it as a gather node. It will be better estimated
5318 // later.
5319 if (MaskedGatherCost >= VecLdCost &&
5320 VecLdCost - GatherCost < -SLPCostThreshold) {
5321 if (BestVF)
5322 *BestVF = VF;
5323 return true;
5324 }
5325 }
5326 return MaskedGatherCost - GatherCost >= -SLPCostThreshold;
5327 };
5328 // TODO: need to improve analysis of the pointers, if not all of them are
5329 // GEPs or have > 2 operands, we end up with a gather node, which just
5330 // increases the cost.
5331 Loop *L = LI->getLoopFor(cast<LoadInst>(VL0)->getParent());
5332 bool ProfitableGatherPointers =
5333 L && Sz > 2 && static_cast<unsigned>(count_if(PointerOps, [L](Value *V) {
5334 return L->isLoopInvariant(V);
5335 })) <= Sz / 2;
5336 if (ProfitableGatherPointers || all_of(PointerOps, [](Value *P) {
5337 auto *GEP = dyn_cast<GetElementPtrInst>(P);
5338 return (!GEP && doesNotNeedToBeScheduled(P)) ||
5339 (GEP && GEP->getNumOperands() == 2 &&
5340 isa<Constant, Instruction>(GEP->getOperand(1)));
5341 })) {
5342 // Check if potential masked gather can be represented as series
5343 // of loads + insertsubvectors.
5344 // If masked gather cost is higher - better to vectorize, so
5345 // consider it as a gather node. It will be better estimated
5346 // later.
5347 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
5348 ProfitableGatherPointers))
5349       return LoadsState::ScatterVectorize;
5350   }
5351
5352 return LoadsState::Gather;
5353}
5354
5355 static bool clusterSortPtrAccesses(ArrayRef<Value *> VL,
5356                                    ArrayRef<BasicBlock *> BBs, Type *ElemTy,
5357 const DataLayout &DL, ScalarEvolution &SE,
5358 SmallVectorImpl<unsigned> &SortedIndices) {
5359 assert(
5360 all_of(VL, [](const Value *V) { return V->getType()->isPointerTy(); }) &&
5361 "Expected list of pointer operands.");
5362 // Map from bases to a vector of (Ptr, Offset, OrigIdx), which we insert each
5363 // Ptr into, sort and return the sorted indices with values next to one
5364 // another.
5367 Bases;
5368 Bases
5369 .try_emplace(std::make_pair(
5370           BBs.front(), getUnderlyingObject(VL.front(), RecursionMaxDepth)))
5371       .first->second.emplace_back().emplace_back(VL.front(), 0U, 0U);
5372
5373 SortedIndices.clear();
5374 for (auto [Cnt, Ptr] : enumerate(VL.drop_front())) {
5375 auto Key = std::make_pair(BBs[Cnt + 1],
5376                               getUnderlyingObject(Ptr, RecursionMaxDepth));
5377     bool Found = any_of(Bases.try_emplace(Key).first->second,
5378 [&, &Cnt = Cnt, &Ptr = Ptr](auto &Base) {
5379 std::optional<int> Diff = getPointersDiff(
5380 ElemTy, std::get<0>(Base.front()), ElemTy,
5381 Ptr, DL, SE,
5382 /*StrictCheck=*/true);
5383 if (!Diff)
5384 return false;
5385
5386 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5387 return true;
5388 });
5389
5390 if (!Found) {
5391 // If we haven't found enough to usefully cluster, return early.
5392 if (Bases.size() > VL.size() / 2 - 1)
5393 return false;
5394
5395 // Not found already - add a new Base
5396 Bases.find(Key)->second.emplace_back().emplace_back(Ptr, 0, Cnt + 1);
5397 }
5398 }
5399
5400 if (Bases.size() == VL.size())
5401 return false;
5402
5403 if (Bases.size() == 1 && (Bases.front().second.size() == 1 ||
5404 Bases.front().second.size() == VL.size()))
5405 return false;
5406
5407 // For each of the bases, sort the pointers by Offset and check if any of
5408 // the bases become consecutive.
5409 auto ComparePointers = [](Value *Ptr1, Value *Ptr2) {
5410 SmallPtrSet<Value *, 13> FirstPointers;
5411 SmallPtrSet<Value *, 13> SecondPointers;
5412 Value *P1 = Ptr1;
5413 Value *P2 = Ptr2;
5414 unsigned Depth = 0;
5415 while (!FirstPointers.contains(P2) && !SecondPointers.contains(P1)) {
5416 if (P1 == P2 || Depth > RecursionMaxDepth)
5417 return false;
5418 FirstPointers.insert(P1);
5419 SecondPointers.insert(P2);
5420 P1 = getUnderlyingObject(P1, /*MaxLookup=*/1);
5421 P2 = getUnderlyingObject(P2, /*MaxLookup=*/1);
5422 ++Depth;
5423 }
5424 assert((FirstPointers.contains(P2) || SecondPointers.contains(P1)) &&
5425 "Unable to find matching root.");
5426 return FirstPointers.contains(P2) && !SecondPointers.contains(P1);
5427 };
5428 for (auto &Base : Bases) {
5429 for (auto &Vec : Base.second) {
5430 if (Vec.size() > 1) {
5431 stable_sort(Vec, [](const std::tuple<Value *, int, unsigned> &X,
5432 const std::tuple<Value *, int, unsigned> &Y) {
5433 return std::get<1>(X) < std::get<1>(Y);
5434 });
5435 int InitialOffset = std::get<1>(Vec[0]);
5436 bool AnyConsecutive =
5437 all_of(enumerate(Vec), [InitialOffset](const auto &P) {
5438 return std::get<1>(P.value()) == int(P.index()) + InitialOffset;
5439 });
5440 // Fill SortedIndices array only if it looks worthwhile to sort the
5441 // ptrs.
5442 if (!AnyConsecutive)
5443 return false;
5444 }
5445 }
5446 stable_sort(Base.second, [&](const auto &V1, const auto &V2) {
5447 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5448 });
5449 }
5450
5451 for (auto &T : Bases)
5452 for (const auto &Vec : T.second)
5453 for (const auto &P : Vec)
5454 SortedIndices.push_back(std::get<2>(P));
5455
5456 assert(SortedIndices.size() == VL.size() &&
5457 "Expected SortedIndices to be the size of VL");
5458 return true;
5459}
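// Worked example (illustrative, assumed pointers): for eight loads in one
// block, VL = {p+1, p, q+1, q, p+3, p+2, q+3, q+2} with p and q based on
// different underlying objects, the pointers cluster into two bases whose
// sorted offsets {-1, 0, 1, 2} are consecutive, so the function returns true
// with SortedIndices = {1, 0, 5, 4, 3, 2, 7, 6} (the p-based pointers in
// offset order, then the q-based ones).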
5460
5461std::optional<BoUpSLP::OrdersType>
5462BoUpSLP::findPartiallyOrderedLoads(const BoUpSLP::TreeEntry &TE) {
5463 assert(TE.isGather() && "Expected gather node only.");
5464 Type *ScalarTy = TE.Scalars[0]->getType();
5465
5466   SmallVector<Value *> Ptrs;
5467   Ptrs.reserve(TE.Scalars.size());
5468   SmallVector<BasicBlock *> BBs;
5469   BBs.reserve(TE.Scalars.size());
5470 for (Value *V : TE.Scalars) {
5471 auto *L = dyn_cast<LoadInst>(V);
5472 if (!L || !L->isSimple())
5473 return std::nullopt;
5474 Ptrs.push_back(L->getPointerOperand());
5475 BBs.push_back(L->getParent());
5476 }
5477
5478 BoUpSLP::OrdersType Order;
5479 if (!LoadEntriesToVectorize.contains(TE.Idx) &&
5480 clusterSortPtrAccesses(Ptrs, BBs, ScalarTy, *DL, *SE, Order))
5481 return std::move(Order);
5482 return std::nullopt;
5483}
5484
5485/// Check if two insertelement instructions are from the same buildvector.
5486 static bool areTwoInsertFromSameBuildVector(
5487     InsertElementInst *VU, InsertElementInst *V,
5488     function_ref<Value *(InsertElementInst *)> GetBaseOperand) {
5489 // Instructions must be from the same basic blocks.
5490 if (VU->getParent() != V->getParent())
5491 return false;
5492 // Checks if 2 insertelements are from the same buildvector.
5493 if (VU->getType() != V->getType())
5494 return false;
5495 // Multiple used inserts are separate nodes.
5496 if (!VU->hasOneUse() && !V->hasOneUse())
5497 return false;
5498 auto *IE1 = VU;
5499 auto *IE2 = V;
5500 std::optional<unsigned> Idx1 = getElementIndex(IE1);
5501 std::optional<unsigned> Idx2 = getElementIndex(IE2);
5502 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
5503 return false;
5504 // Go through the vector operand of insertelement instructions trying to find
5505 // either VU as the original vector for IE2 or V as the original vector for
5506 // IE1.
5507 SmallBitVector ReusedIdx(
5508 cast<VectorType>(VU->getType())->getElementCount().getKnownMinValue());
5509 bool IsReusedIdx = false;
5510 do {
5511 if (IE2 == VU && !IE1)
5512 return VU->hasOneUse();
5513 if (IE1 == V && !IE2)
5514 return V->hasOneUse();
5515 if (IE1 && IE1 != V) {
5516 unsigned Idx1 = getElementIndex(IE1).value_or(*Idx2);
5517 IsReusedIdx |= ReusedIdx.test(Idx1);
5518 ReusedIdx.set(Idx1);
5519 if ((IE1 != VU && !IE1->hasOneUse()) || IsReusedIdx)
5520 IE1 = nullptr;
5521 else
5522 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
5523 }
5524 if (IE2 && IE2 != VU) {
5525 unsigned Idx2 = getElementIndex(IE2).value_or(*Idx1);
5526 IsReusedIdx |= ReusedIdx.test(Idx2);
5527 ReusedIdx.set(Idx2);
5528 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
5529 IE2 = nullptr;
5530 else
5531 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
5532 }
5533 } while (!IsReusedIdx && (IE1 || IE2));
5534 return false;
5535}
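// Illustrative example (assumed IR): for the buildvector chain
//   %v0 = insertelement <2 x float> poison, float %a, i32 0
//   %v1 = insertelement <2 x float> %v0,    float %b, i32 1
// in one block, calling this with VU = %v1, V = %v0 and a GetBaseOperand that
// returns the insertelement's vector operand walks down the chain, sees no
// repeated index, reaches %v0 and returns true.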
5536
5537std::optional<BoUpSLP::OrdersType>
5538BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) {
5539 // No need to reorder if we need to shuffle reuses; we still need to shuffle
5540 // the node.
5541 if (!TE.ReuseShuffleIndices.empty()) {
5542 // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors.
5543 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) &&
5544 "Reshuffling scalars not yet supported for nodes with padding");
5545
5546 if (isSplat(TE.Scalars))
5547 return std::nullopt;
5548 // Check if reuse shuffle indices can be improved by reordering.
5549 // For this, check that the reuse mask is "clustered", i.e. each scalar value
5550 // is used once in each submask of size <number_of_scalars>.
5551 // Example: 4 scalar values.
5552 // ReuseShuffleIndices mask: 0, 1, 2, 3, 3, 2, 0, 1 - clustered.
5553 // 0, 1, 2, 3, 3, 3, 1, 0 - not clustered, because
5554 // element 3 is used twice in the second submask.
5555 unsigned Sz = TE.Scalars.size();
5556 if (TE.isGather()) {
5557 if (std::optional<OrdersType> CurrentOrder =
5558             findReusedOrderedScalars(TE)) {
5559       SmallVector<int> Mask;
5560 fixupOrderingIndices(*CurrentOrder);
5561 inversePermutation(*CurrentOrder, Mask);
5562 ::addMask(Mask, TE.ReuseShuffleIndices);
5563 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
5564 unsigned Sz = TE.Scalars.size();
5565 for (int K = 0, E = TE.getVectorFactor() / Sz; K < E; ++K) {
5566 for (auto [I, Idx] : enumerate(ArrayRef(Mask).slice(K * Sz, Sz)))
5567 if (Idx != PoisonMaskElem)
5568 Res[Idx + K * Sz] = I + K * Sz;
5569 }
5570 return std::move(Res);
5571 }
5572 }
5573 if (Sz == 2 && TE.getVectorFactor() == 4 &&
5574 TTI->getNumberOfParts(getWidenedType(TE.Scalars.front()->getType(),
5575 2 * TE.getVectorFactor())) == 1)
5576 return std::nullopt;
5577 if (!ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5578 Sz)) {
5579 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5580 if (TE.ReorderIndices.empty())
5581 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5582 else
5583 inversePermutation(TE.ReorderIndices, ReorderMask);
5584 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
5585 unsigned VF = ReorderMask.size();
5586 OrdersType ResOrder(VF, VF);
5587 unsigned NumParts = divideCeil(VF, Sz);
5588 SmallBitVector UsedVals(NumParts);
5589 for (unsigned I = 0; I < VF; I += Sz) {
5590 int Val = PoisonMaskElem;
5591 unsigned UndefCnt = 0;
5592 unsigned Limit = std::min(Sz, VF - I);
5593 if (any_of(ArrayRef(ReorderMask).slice(I, Limit),
5594 [&](int Idx) {
5595 if (Val == PoisonMaskElem && Idx != PoisonMaskElem)
5596 Val = Idx;
5597 if (Idx == PoisonMaskElem)
5598 ++UndefCnt;
5599 return Idx != PoisonMaskElem && Idx != Val;
5600 }) ||
5601 Val >= static_cast<int>(NumParts) || UsedVals.test(Val) ||
5602 UndefCnt > Sz / 2)
5603 return std::nullopt;
5604 UsedVals.set(Val);
5605 for (unsigned K = 0; K < NumParts; ++K) {
5606 unsigned Idx = Val + Sz * K;
5607 if (Idx < VF)
5608 ResOrder[Idx] = I + K;
5609 }
5610 }
5611 return std::move(ResOrder);
5612 }
5613 unsigned VF = TE.getVectorFactor();
5614 // Try to build the correct order for extractelement instructions.
5615 SmallVector<int> ReusedMask(TE.ReuseShuffleIndices.begin(),
5616 TE.ReuseShuffleIndices.end());
5617 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
5618 all_of(TE.Scalars, [Sz](Value *V) {
5619 if (isa<PoisonValue>(V))
5620 return true;
5621 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5622 return Idx && *Idx < Sz;
5623 })) {
5624 assert(!TE.isAltShuffle() && "Alternate instructions are only supported "
5625 "by BinaryOperator and CastInst.");
5626 SmallVector<int> ReorderMask(Sz, PoisonMaskElem);
5627 if (TE.ReorderIndices.empty())
5628 std::iota(ReorderMask.begin(), ReorderMask.end(), 0);
5629 else
5630 inversePermutation(TE.ReorderIndices, ReorderMask);
5631 for (unsigned I = 0; I < VF; ++I) {
5632 int &Idx = ReusedMask[I];
5633 if (Idx == PoisonMaskElem)
5634 continue;
5635 Value *V = TE.Scalars[ReorderMask[Idx]];
5636 std::optional<unsigned> EI = getExtractIndex(cast<Instruction>(V));
5637 Idx = std::distance(ReorderMask.begin(), find(ReorderMask, *EI));
5638 }
5639 }
5640 // Build the order of VF size; we need to reorder the reuses shuffles, which
5641 // are always of VF size.
5642 OrdersType ResOrder(VF);
5643 std::iota(ResOrder.begin(), ResOrder.end(), 0);
5644 auto *It = ResOrder.begin();
5645 for (unsigned K = 0; K < VF; K += Sz) {
5646 OrdersType CurrentOrder(TE.ReorderIndices);
5647 SmallVector<int> SubMask{ArrayRef(ReusedMask).slice(K, Sz)};
5648 if (SubMask.front() == PoisonMaskElem)
5649 std::iota(SubMask.begin(), SubMask.end(), 0);
5650 reorderOrder(CurrentOrder, SubMask);
5651 transform(CurrentOrder, It, [K](unsigned Pos) { return Pos + K; });
5652 std::advance(It, Sz);
5653 }
5654 if (TE.isGather() && all_of(enumerate(ResOrder), [](const auto &Data) {
5655 return Data.index() == Data.value();
5656 }))
5657 return std::nullopt; // No need to reorder.
5658 return std::move(ResOrder);
5659 }
5660 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5661 any_of(TE.UserTreeIndices,
5662 [](const EdgeInfo &EI) {
5663 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5664 }) &&
5665 (TE.ReorderIndices.empty() || isReverseOrder(TE.ReorderIndices)))
5666 return std::nullopt;
5667 if ((TE.State == TreeEntry::Vectorize ||
5668 TE.State == TreeEntry::StridedVectorize) &&
5669 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
5670 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
5671 assert(!TE.isAltShuffle() && "Alternate instructions are only supported by "
5672 "BinaryOperator and CastInst.");
5673 return TE.ReorderIndices;
5674 }
5675 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5676 if (!TE.ReorderIndices.empty())
5677 return TE.ReorderIndices;
5678
5679 SmallVector<Instruction *> UserBVHead(TE.Scalars.size());
5680 for (auto [I, V] : zip(UserBVHead, TE.Scalars)) {
5681 if (!V->hasNUsesOrMore(1))
5682 continue;
5683 auto *II = dyn_cast<InsertElementInst>(*V->user_begin());
5684 if (!II)
5685 continue;
5686 Instruction *BVHead = nullptr;
5687 BasicBlock *BB = II->getParent();
5688 while (II && II->hasOneUse() && II->getParent() == BB) {
5689 BVHead = II;
5690 II = dyn_cast<InsertElementInst>(II->getOperand(0));
5691 }
5692 I = BVHead;
5693 }
5694
5695 auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) {
5696 assert(BB1 != BB2 && "Expected different basic blocks.");
5697 auto *NodeA = DT->getNode(BB1);
5698 auto *NodeB = DT->getNode(BB2);
5699 assert(NodeA && "Should only process reachable instructions");
5700 assert(NodeB && "Should only process reachable instructions");
5701 assert((NodeA == NodeB) ==
5702 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
5703 "Different nodes should have different DFS numbers");
5704 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
5705 };
5706 auto PHICompare = [&](unsigned I1, unsigned I2) {
5707 Value *V1 = TE.Scalars[I1];
5708 Value *V2 = TE.Scalars[I2];
5709 if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0))
5710 return false;
5711 if (isa<PoisonValue>(V1))
5712 return true;
5713 if (isa<PoisonValue>(V2))
5714 return false;
5715 if (V1->getNumUses() < V2->getNumUses())
5716 return true;
5717 if (V1->getNumUses() > V2->getNumUses())
5718 return false;
5719 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->user_begin());
5720 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5721 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
5722 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
5723 FirstUserOfPhi2->getParent());
5724 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
5725 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
5726 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
5727 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
5728 if (IE1 && !IE2)
5729 return true;
5730 if (!IE1 && IE2)
5731 return false;
5732 if (IE1 && IE2) {
5733 if (UserBVHead[I1] && !UserBVHead[I2])
5734 return true;
5735 if (!UserBVHead[I1])
5736 return false;
5737 if (UserBVHead[I1] == UserBVHead[I2])
5738 return getElementIndex(IE1) < getElementIndex(IE2);
5739 if (UserBVHead[I1]->getParent() != UserBVHead[I2]->getParent())
5740 return CompareByBasicBlocks(UserBVHead[I1]->getParent(),
5741 UserBVHead[I2]->getParent());
5742 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
5743 }
5744 if (EE1 && !EE2)
5745 return true;
5746 if (!EE1 && EE2)
5747 return false;
5748 if (EE1 && EE2) {
5749 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
5750 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
5751 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
5752 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
5753 if (!Inst2 && !P2)
5754 return Inst1 || P1;
5755 if (EE1->getOperand(0) == EE2->getOperand(0))
5756 return getElementIndex(EE1) < getElementIndex(EE2);
5757 if (!Inst1 && Inst2)
5758 return false;
5759 if (Inst1 && Inst2) {
5760 if (Inst1->getParent() != Inst2->getParent())
5761 return CompareByBasicBlocks(Inst1->getParent(), Inst2->getParent());
5762 return Inst1->comesBefore(Inst2);
5763 }
5764 if (!P1 && P2)
5765 return false;
5766 assert(P1 && P2 &&
5767 "Expected either instructions or arguments vector operands.");
5768 return P1->getArgNo() < P2->getArgNo();
5769 }
5770 return false;
5771 };
5772 OrdersType Phis(TE.Scalars.size());
5773 std::iota(Phis.begin(), Phis.end(), 0);
5774 stable_sort(Phis, PHICompare);
5775 if (isIdentityOrder(Phis))
5776 return std::nullopt; // No need to reorder.
5777 return std::move(Phis);
5778 }
5779 if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) &&
5780 allSameType(TE.Scalars)) {
5781 // TODO: add analysis of other gather nodes with extractelement
5782 // instructions and other values/instructions, not only undefs.
5783 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
5784 (all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
5785 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
5786 all_of(TE.Scalars, [](Value *V) {
5787 auto *EE = dyn_cast<ExtractElementInst>(V);
5788 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5789 })) {
5790 // Check that gather of extractelements can be represented as
5791 // just a shuffle of a single vector.
5792 OrdersType CurrentOrder;
5793 bool Reuse =
5794 canReuseExtract(TE.Scalars, CurrentOrder, /*ResizeAllowed=*/true);
5795 if (Reuse || !CurrentOrder.empty())
5796 return std::move(CurrentOrder);
5797 }
5798 // If the gather node is <undef, v, .., poison> and
5799 // insertelement poison, v, 0 [+ permute]
5800 // is cheaper than
5801 // insertelement poison, v, n - try to reorder.
5802 // If rotating the whole graph, exclude the permute cost, the whole graph
5803 // might be transformed.
5804 int Sz = TE.Scalars.size();
5805 if (isSplat(TE.Scalars) && !allConstant(TE.Scalars) &&
5806 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
5807 const auto *It =
5808 find_if(TE.Scalars, [](Value *V) { return !isConstant(V); });
5809 if (It == TE.Scalars.begin())
5810 return OrdersType();
5811 auto *Ty = getWidenedType(TE.Scalars.front()->getType(), Sz);
5812 if (It != TE.Scalars.end()) {
5813 OrdersType Order(Sz, Sz);
5814 unsigned Idx = std::distance(TE.Scalars.begin(), It);
5815 Order[Idx] = 0;
5816 fixupOrderingIndices(Order);
5817 SmallVector<int> Mask;
5818 inversePermutation(Order, Mask);
5819 InstructionCost PermuteCost =
5820 TopToBottom
5821 ? 0
5822                 : ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, Ty, Mask);
5823         InstructionCost InsertFirstCost = TTI->getVectorInstrCost(
5824 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, 0,
5825 PoisonValue::get(Ty), *It);
5826 InstructionCost InsertIdxCost = TTI->getVectorInstrCost(
5827 Instruction::InsertElement, Ty, TTI::TCK_RecipThroughput, Idx,
5828 PoisonValue::get(Ty), *It);
5829 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5830 OrdersType Order(Sz, Sz);
5831 Order[Idx] = 0;
5832 return std::move(Order);
5833 }
5834 }
5835 }
5836 if (isSplat(TE.Scalars))
5837 return std::nullopt;
5838 if (TE.Scalars.size() >= 3)
5839 if (std::optional<OrdersType> Order = findPartiallyOrderedLoads(TE))
5840 return Order;
5841 // Check if we can include the order of vectorized loads. For masked gathers,
5842 // do extra analysis later, so include such nodes in a special list.
5843 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
5844 SmallVector<Value *> PointerOps;
5845 OrdersType CurrentOrder;
5846 LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(),
5847 CurrentOrder, PointerOps);
5848     if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize)
5849       return std::move(CurrentOrder);
5850 }
5851 // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars
5852 // has been audited for correctness with non-power-of-two vectors.
5853 if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
5854 if (std::optional<OrdersType> CurrentOrder = findReusedOrderedScalars(TE))
5855 return CurrentOrder;
5856 }
5857 return std::nullopt;
5858}
5859
5860/// Checks if the given mask is a "clustered" mask with the same clusters of
5861/// size \p Sz, which are not identity submasks.
5862 static bool isRepeatedNonIdentityClusteredMask(ArrayRef<int> Mask,
5863                                                unsigned Sz) {
5864 ArrayRef<int> FirstCluster = Mask.slice(0, Sz);
5865 if (ShuffleVectorInst::isIdentityMask(FirstCluster, Sz))
5866 return false;
5867 for (unsigned I = Sz, E = Mask.size(); I < E; I += Sz) {
5868 ArrayRef<int> Cluster = Mask.slice(I, Sz);
5869 if (Cluster != FirstCluster)
5870 return false;
5871 }
5872 return true;
5873}
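// For example (illustrative), with Sz == 4 the mask {1, 0, 3, 2, 1, 0, 3, 2}
// is a repeated non-identity cluster (true), {0, 1, 2, 3, 0, 1, 2, 3} is
// rejected because its first cluster is an identity submask, and
// {1, 0, 3, 2, 1, 0, 2, 3} is rejected because the clusters differ.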
5874
5875void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const {
5876 // Reorder reuses mask.
5877 reorderReuses(TE.ReuseShuffleIndices, Mask);
5878 const unsigned Sz = TE.Scalars.size();
5879 // For vectorized and non-clustered reuses, no need to do anything else.
5880 if (!TE.isGather() ||
5881       !ShuffleVectorInst::isOneUseSingleSourceMask(TE.ReuseShuffleIndices,
5882                                                    Sz) ||
5883 !isRepeatedNonIdentityClusteredMask(TE.ReuseShuffleIndices, Sz))
5884 return;
5885 SmallVector<int> NewMask;
5886 inversePermutation(TE.ReorderIndices, NewMask);
5887 addMask(NewMask, TE.ReuseShuffleIndices);
5888 // Clear reorder since it is going to be applied to the new mask.
5889 TE.ReorderIndices.clear();
5890 // Try to improve gathered nodes with clustered reuses, if possible.
5891 ArrayRef<int> Slice = ArrayRef(NewMask).slice(0, Sz);
5892 SmallVector<unsigned> NewOrder(Slice);
5893 inversePermutation(NewOrder, NewMask);
5894 reorderScalars(TE.Scalars, NewMask);
5895 // Fill the reuses mask with the identity submasks.
5896 for (auto *It = TE.ReuseShuffleIndices.begin(),
5897 *End = TE.ReuseShuffleIndices.end();
5898 It != End; std::advance(It, Sz))
5899 std::iota(It, std::next(It, Sz), 0);
5900}
5901
5902 static void combineOrders(MutableArrayRef<unsigned> Order,
5903                           ArrayRef<unsigned> SecondaryOrder) {
5904 assert((SecondaryOrder.empty() || Order.size() == SecondaryOrder.size()) &&
5905 "Expected same size of orders");
5906 unsigned Sz = Order.size();
5907 SmallBitVector UsedIndices(Sz);
5908 for (unsigned Idx : seq<unsigned>(0, Sz)) {
5909 if (Order[Idx] != Sz)
5910 UsedIndices.set(Order[Idx]);
5911 }
5912 if (SecondaryOrder.empty()) {
5913 for (unsigned Idx : seq<unsigned>(0, Sz))
5914 if (Order[Idx] == Sz && !UsedIndices.test(Idx))
5915 Order[Idx] = Idx;
5916 } else {
5917 for (unsigned Idx : seq<unsigned>(0, Sz))
5918 if (SecondaryOrder[Idx] != Sz && Order[Idx] == Sz &&
5919 !UsedIndices.test(SecondaryOrder[Idx]))
5920 Order[Idx] = SecondaryOrder[Idx];
5921 }
5922}
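// Worked example (illustrative): with Sz == 4, Order = {2, 4, 0, 4} (4 marks an
// unset slot) and an empty SecondaryOrder, each unset slot becomes its own index
// when that index is still unused, giving {2, 1, 0, 3}; with
// SecondaryOrder = {0, 3, 2, 1} the unset slots are filled from it instead,
// giving {2, 3, 0, 1}.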
5923
5924 void BoUpSLP::reorderTopToBottom() {
5925   // Maps VF to the graph nodes.
5926   DenseMap<unsigned, SetVector<TreeEntry *>> VFToOrderedEntries;
5927   // ExtractElement gather nodes which can be vectorized and need to handle
5928 // their ordering.
5929   DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
5930
5931 // Phi nodes can have preferred ordering based on their result users
5932   DenseMap<const TreeEntry *, OrdersType> PhisToOrders;
5933
5934 // AltShuffles can also have a preferred ordering that leads to fewer
5935 // instructions, e.g., the addsub instruction in x86.
5936 DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
5937
5938 // Maps a TreeEntry to the reorder indices of external users.
5939   DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
5940       ExternalUserReorderMap;
5941 // Find all reorderable nodes with the given VF.
5942 // Currently these are vectorized stores, loads, extracts + some gathering of
5943 // extracts.
5944 for_each(VectorizableTree, [&, &TTIRef = *TTI](
5945 const std::unique_ptr<TreeEntry> &TE) {
5946 // Look for external users that will probably be vectorized.
5947 SmallVector<OrdersType, 1> ExternalUserReorderIndices =
5948 findExternalStoreUsersReorderIndices(TE.get());
5949 if (!ExternalUserReorderIndices.empty()) {
5950 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5951 ExternalUserReorderMap.try_emplace(TE.get(),
5952 std::move(ExternalUserReorderIndices));
5953 }
5954
5955 // Patterns like [fadd,fsub] can be combined into a single instruction in
5956 // x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
5957 // to take into account their order when looking for the most used order.
5958 if (TE->hasState() && TE->isAltShuffle()) {
5959 VectorType *VecTy =
5960 getWidenedType(TE->Scalars[0]->getType(), TE->Scalars.size());
5961 unsigned Opcode0 = TE->getOpcode();
5962 unsigned Opcode1 = TE->getAltOpcode();
5963 SmallBitVector OpcodeMask(getAltInstrMask(TE->Scalars, Opcode0, Opcode1));
5964 // If this pattern is supported by the target then we consider the order.
5965 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5966 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5967 AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
5968 }
5969 // TODO: Check the reverse order too.
5970 }
5971
5972 if (std::optional<OrdersType> CurrentOrder =
5973 getReorderingData(*TE, /*TopToBottom=*/true)) {
5974 // Do not include ordering for nodes used in the alt opcode vectorization,
5975 // better to reorder them during the bottom-to-top stage. If we follow the
5976 // order here, it causes reordering of the whole graph, though actually it is
5977 // profitable just to reorder the subgraph that starts from the alternate
5978 // opcode vectorization node. Such nodes already end-up with the shuffle
5979 // instruction and it is just enough to change this shuffle rather than
5980 // rotate the scalars for the whole graph.
5981 unsigned Cnt = 0;
5982 const TreeEntry *UserTE = TE.get();
5983 while (UserTE && Cnt < RecursionMaxDepth) {
5984 if (UserTE->UserTreeIndices.size() != 1)
5985 break;
5986 if (all_of(UserTE->UserTreeIndices, [](const EdgeInfo &EI) {
5987 return EI.UserTE->State == TreeEntry::Vectorize &&
5988 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5989 }))
5990 return;
5991 UserTE = UserTE->UserTreeIndices.back().UserTE;
5992 ++Cnt;
5993 }
5994 VFToOrderedEntries[TE->getVectorFactor()].insert(TE.get());
5995 if (!(TE->State == TreeEntry::Vectorize ||
5996 TE->State == TreeEntry::StridedVectorize) ||
5997 !TE->ReuseShuffleIndices.empty())
5998 GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
5999 if (TE->State == TreeEntry::Vectorize &&
6000 TE->getOpcode() == Instruction::PHI)
6001 PhisToOrders.try_emplace(TE.get(), *CurrentOrder);
6002 }
6003 });
6004
6005 // Reorder the graph nodes according to their vectorization factor.
6006 for (unsigned VF = VectorizableTree.front()->getVectorFactor();
6007 !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) {
6008 auto It = VFToOrderedEntries.find(VF);
6009 if (It == VFToOrderedEntries.end())
6010 continue;
6011 // Try to find the most profitable order. We are just looking for the most
6012 // used order and reorder the scalar elements in the nodes according to this
6013 // most used order.
6014 ArrayRef<TreeEntry *> OrderedEntries = It->second.getArrayRef();
6015 // Delete VF entry upon exit.
6016 auto Cleanup = make_scope_exit([&]() { VFToOrderedEntries.erase(It); });
6017
6018 // All operands are reordered and used only in this node - propagate the
6019 // most used order to the user node.
6022 OrdersUses;
6024 for (const TreeEntry *OpTE : OrderedEntries) {
6025 // No need to reorder these nodes; we still need to extend and use a
6026 // shuffle, just merge the reordering shuffle and the reuse shuffle.
6027 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
6028 continue;
6029 // Count the number of uses of each order.
6030 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
6031 &PhisToOrders]() -> const OrdersType & {
6032 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
6033 auto It = GathersToOrders.find(OpTE);
6034 if (It != GathersToOrders.end())
6035 return It->second;
6036 }
6037 if (OpTE->hasState() && OpTE->isAltShuffle()) {
6038 auto It = AltShufflesToOrders.find(OpTE);
6039 if (It != AltShufflesToOrders.end())
6040 return It->second;
6041 }
6042 if (OpTE->State == TreeEntry::Vectorize &&
6043 OpTE->getOpcode() == Instruction::PHI) {
6044 auto It = PhisToOrders.find(OpTE);
6045 if (It != PhisToOrders.end())
6046 return It->second;
6047 }
6048 return OpTE->ReorderIndices;
6049 }();
6050 // First consider the order of the external scalar users.
6051 auto It = ExternalUserReorderMap.find(OpTE);
6052 if (It != ExternalUserReorderMap.end()) {
6053 const auto &ExternalUserReorderIndices = It->second;
6054 // If the OpTE vector factor != number of scalars - use natural order,
6055 // it is an attempt to reorder node with reused scalars but with
6056 // external uses.
6057 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
6058 OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
6059 ExternalUserReorderIndices.size();
6060 } else {
6061 for (const OrdersType &ExtOrder : ExternalUserReorderIndices)
6062 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
6063 }
6064 // No other useful reorder data in this entry.
6065 if (Order.empty())
6066 continue;
6067 }
6068 // Stores actually store the mask, not the order, need to invert.
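// For example, inverting the stored mask {1, 2, 0} yields the order {2, 0, 1}
// (inversePermutation sets Mask[Order[I]] = I); PoisonMaskElem entries become
// E and are then normalized by fixupOrderingIndices.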
6069 if (OpTE->State == TreeEntry::Vectorize &&
6070 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6071 assert(!OpTE->isAltShuffle() &&
6072 "Alternate instructions are only supported by BinaryOperator "
6073 "and CastInst.");
6074 SmallVector<int> Mask;
6075 inversePermutation(Order, Mask);
6076 unsigned E = Order.size();
6077 OrdersType CurrentOrder(E, E);
6078 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
6079 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6080 });
6081 fixupOrderingIndices(CurrentOrder);
6082 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
6083 } else {
6084 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
6085 }
6086 }
6087 if (OrdersUses.empty())
6088 continue;
6089 // Choose the most used order.
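// For example, with OrdersUses = {identity/empty: 3 uses, {1,0,3,2}: 2 uses}
// the identity order wins and no reordering is performed. A filled identity
// such as {0,1,2,3} counts toward both IdentityCnt and FilledIdentityCnt.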
6090 unsigned IdentityCnt = 0;
6091 unsigned FilledIdentityCnt = 0;
6092 OrdersType IdentityOrder(VF, VF);
6093 for (auto &Pair : OrdersUses) {
6094 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
6095 if (!Pair.first.empty())
6096 FilledIdentityCnt += Pair.second;
6097 IdentityCnt += Pair.second;
6098 combineOrders(IdentityOrder, Pair.first);
6099 }
6100 }
6101 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
6102 unsigned Cnt = IdentityCnt;
6103 for (auto &Pair : OrdersUses) {
6104 // Prefer the identity order. But if a filled identity (non-empty order)
6105 // was found with the same number of uses as the new candidate order, we
6106 // can choose this candidate order.
6107 if (Cnt < Pair.second ||
6108 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
6109 Cnt == Pair.second && !BestOrder.empty() &&
6110 isIdentityOrder(BestOrder))) {
6111 combineOrders(Pair.first, BestOrder);
6112 BestOrder = Pair.first;
6113 Cnt = Pair.second;
6114 } else {
6115 combineOrders(BestOrder, Pair.first);
6116 }
6117 }
6118 // Set order of the user node.
6119 if (isIdentityOrder(BestOrder))
6120 continue;
6121 fixupOrderingIndices(BestOrder);
6122 SmallVector<int> Mask;
6123 inversePermutation(BestOrder, Mask);
6124 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
6125 unsigned E = BestOrder.size();
6126 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
6127 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6128 });
6129 // Do an actual reordering, if profitable.
6130 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6131 // Just do the reordering for the nodes with the given VF.
6132 if (TE->Scalars.size() != VF) {
6133 if (TE->ReuseShuffleIndices.size() == VF) {
6134 // Need to reorder the reuses masks of the operands with smaller VF to
6135 // be able to find the match between the graph nodes and scalar
6136 // operands of the given node during vectorization/cost estimation.
6137 assert(all_of(TE->UserTreeIndices,
6138 [VF, &TE](const EdgeInfo &EI) {
6139 return EI.UserTE->Scalars.size() == VF ||
6140 EI.UserTE->Scalars.size() ==
6141 TE->Scalars.size();
6142 }) &&
6143 "All users must be of VF size.");
6144 if (SLPReVec) {
6145 assert(SLPReVec && "Only supported by REVEC.");
6146 // ShuffleVectorInst does not do reorderOperands (and it should not
6147 // because ShuffleVectorInst supports only a limited set of
6148 // patterns). Only do reorderNodeWithReuses if all of the users are
6149 // not ShuffleVectorInst.
6150 if (all_of(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
6151 return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
6152 }))
6153 continue;
6154 assert(none_of(TE->UserTreeIndices,
6155 [&](const EdgeInfo &EI) {
6156 return isa<ShuffleVectorInst>(
6157 EI.UserTE->getMainOp());
6158 }) &&
6159 "Does not know how to reorder.");
6160 }
6161 // Update ordering of the operands with the smaller VF than the given
6162 // one.
6163 reorderNodeWithReuses(*TE, Mask);
6164 }
6165 continue;
6166 }
6167 if ((TE->State == TreeEntry::Vectorize ||
6168 TE->State == TreeEntry::StridedVectorize) &&
6170 InsertElementInst>(TE->getMainOp()) ||
6171 (SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
6172 assert(!TE->isAltShuffle() &&
6173 "Alternate instructions are only supported by BinaryOperator "
6174 "and CastInst.");
6175 // Build correct orders for extract{element,value}, loads and
6176 // stores.
6177 reorderOrder(TE->ReorderIndices, Mask);
6178 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
6179 TE->reorderOperands(Mask);
6180 } else {
6181 // Reorder the node and its operands.
6182 TE->reorderOperands(Mask);
6183 assert(TE->ReorderIndices.empty() &&
6184 "Expected empty reorder sequence.");
6185 reorderScalars(TE->Scalars, Mask);
6186 }
6187 if (!TE->ReuseShuffleIndices.empty()) {
6188 // Apply reversed order to keep the original ordering of the reused
6189 // elements to avoid extra reorder indices shuffling.
6190 OrdersType CurrentOrder;
6191 reorderOrder(CurrentOrder, MaskOrder);
6192 SmallVector<int> NewReuses;
6193 inversePermutation(CurrentOrder, NewReuses);
6194 addMask(NewReuses, TE->ReuseShuffleIndices);
6195 TE->ReuseShuffleIndices.swap(NewReuses);
6196 }
6197 }
6198 }
6199}
6200
6201bool BoUpSLP::canReorderOperands(
6202 TreeEntry *UserTE, SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
6203 ArrayRef<TreeEntry *> ReorderableGathers,
6204 SmallVectorImpl<TreeEntry *> &GatherOps) {
6205 for (unsigned I = 0, E = UserTE->getNumOperands(); I < E; ++I) {
6206 if (any_of(Edges, [I](const std::pair<unsigned, TreeEntry *> &OpData) {
6207 return OpData.first == I &&
6208 (OpData.second->State == TreeEntry::Vectorize ||
6209 OpData.second->State == TreeEntry::StridedVectorize);
6210 }))
6211 continue;
6212 if (TreeEntry *TE = getVectorizedOperand(UserTE, I)) {
6213 // Do not reorder if operand node is used by many user nodes.
6214 if (any_of(TE->UserTreeIndices,
6215 [UserTE](const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
6216 return false;
6217 // Add the node to the list of the ordered nodes with the identity
6218 // order.
6219 Edges.emplace_back(I, TE);
6220 // Add ScatterVectorize nodes to the list of operands, where just
6221 // reordering of the scalars is required. Similar to the gathers, so
6222 // simply add to the list of gathered ops.
6223 // If there are reused scalars, process this node as a regular vectorize
6224 // node, just reorder reuses mask.
6225 if (TE->State != TreeEntry::Vectorize &&
6226 TE->State != TreeEntry::StridedVectorize &&
6227 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
6228 GatherOps.push_back(TE);
6229 continue;
6230 }
6231 TreeEntry *Gather = nullptr;
6232 if (count_if(ReorderableGathers,
6233 [&Gather, UserTE, I](TreeEntry *TE) {
6234 assert(TE->State != TreeEntry::Vectorize &&
6235 TE->State != TreeEntry::StridedVectorize &&
6236 "Only non-vectorized nodes are expected.");
6237 if (any_of(TE->UserTreeIndices,
6238 [UserTE, I](const EdgeInfo &EI) {
6239 return EI.UserTE == UserTE && EI.EdgeIdx == I;
6240 })) {
6241 assert(TE->isSame(UserTE->getOperand(I)) &&
6242 "Operand entry does not match operands.");
6243 Gather = TE;
6244 return true;
6245 }
6246 return false;
6247 }) > 1 &&
6248 !allConstant(UserTE->getOperand(I)))
6249 return false;
6250 if (Gather)
6251 GatherOps.push_back(Gather);
6252 }
6253 return true;
6254}
6255
6256void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
6257 SetVector<TreeEntry *> OrderedEntries;
6258 DenseSet<const TreeEntry *> GathersToOrders;
6259 // Find all reorderable leaf nodes with the given VF.
6260 // Currently these are vectorized loads, extracts without alternate operands,
6261 // plus some gathering of extracts.
6262 SmallVector<TreeEntry *> NonVectorized;
6263 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6264 if (TE->State != TreeEntry::Vectorize &&
6265 TE->State != TreeEntry::StridedVectorize)
6266 NonVectorized.push_back(TE.get());
6267 if (std::optional<OrdersType> CurrentOrder =
6268 getReorderingData(*TE, /*TopToBottom=*/false)) {
6269 OrderedEntries.insert(TE.get());
6270 if (!(TE->State == TreeEntry::Vectorize ||
6271 TE->State == TreeEntry::StridedVectorize) ||
6272 !TE->ReuseShuffleIndices.empty())
6273 GathersToOrders.insert(TE.get());
6274 }
6275 }
6276
6277 // 1. Propagate order to the graph nodes that use only reordered nodes.
6278 // I.e., if a node has operands that are reordered, try to keep at least
6279 // one operand order in the natural order, reorder the others, and reorder
6280 // the user node itself.
6282 while (!OrderedEntries.empty()) {
6283 // 1. Filter out only reordered nodes.
6284 // 2. If the entry has multiple uses - skip it and jump to the next node.
6286 SmallVector<TreeEntry *> Filtered;
6287 for (TreeEntry *TE : OrderedEntries) {
6288 if (!(TE->State == TreeEntry::Vectorize ||
6289 TE->State == TreeEntry::StridedVectorize ||
6290 (TE->isGather() && GathersToOrders.contains(TE))) ||
6291 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6292 !all_of(drop_begin(TE->UserTreeIndices),
6293 [TE](const EdgeInfo &EI) {
6294 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
6295 }) ||
6296 !Visited.insert(TE).second) {
6297 Filtered.push_back(TE);
6298 continue;
6299 }
6300 // Build a map between user nodes and their operands' order to speed up
6301 // the search. The graph currently does not provide this dependency directly.
6302 for (EdgeInfo &EI : TE->UserTreeIndices)
6303 Users[EI.UserTE].emplace_back(EI.EdgeIdx, TE);
6304 }
6305 // Erase filtered entries.
6306 for (TreeEntry *TE : Filtered)
6307 OrderedEntries.remove(TE);
6309 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
6310 UsersVec(Users.begin(), Users.end());
6311 sort(UsersVec, [](const auto &Data1, const auto &Data2) {
6312 return Data1.first->Idx > Data2.first->Idx;
6313 });
6314 for (auto &Data : UsersVec) {
6315 // Check that operands are used only in the User node.
6316 SmallVector<TreeEntry *> GatherOps;
6317 if (!canReorderOperands(Data.first, Data.second, NonVectorized,
6318 GatherOps)) {
6319 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6320 OrderedEntries.remove(Op.second);
6321 continue;
6322 }
6323 // All operands are reordered and used only in this node - propagate the
6324 // most used order to the user node.
6327 OrdersUses;
6328 // Do the analysis for each tree entry only once, otherwise the order of
6329 // the same node may be considered several times, though it might not be
6330 // profitable.
6333 for (const auto &Op : Data.second) {
6334 TreeEntry *OpTE = Op.second;
6335 if (!VisitedOps.insert(OpTE).second)
6336 continue;
6337 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.count(OpTE))
6338 continue;
6339 const auto Order = [&]() -> const OrdersType {
6340 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
6341 return getReorderingData(*OpTE, /*TopToBottom=*/false)
6342 .value_or(OrdersType(1));
6343 return OpTE->ReorderIndices;
6344 }();
6345 // The order is partially ordered, skip it in favor of fully non-ordered
6346 // orders.
6347 if (Order.size() == 1)
6348 continue;
6349 unsigned NumOps = count_if(
6350 Data.second, [OpTE](const std::pair<unsigned, TreeEntry *> &P) {
6351 return P.second == OpTE;
6352 });
6353 // Stores actually store the mask, not the order, need to invert.
6354 if (OpTE->State == TreeEntry::Vectorize &&
6355 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6356 assert(!OpTE->isAltShuffle() &&
6357 "Alternate instructions are only supported by BinaryOperator "
6358 "and CastInst.");
6359 SmallVector<int> Mask;
6360 inversePermutation(Order, Mask);
6361 unsigned E = Order.size();
6362 OrdersType CurrentOrder(E, E);
6363 transform(Mask, CurrentOrder.begin(), [E](int Idx) {
6364 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6365 });
6366 fixupOrderingIndices(CurrentOrder);
6367 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
6368 NumOps;
6369 } else {
6370 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
6371 }
6372 auto Res = OrdersUses.insert(std::make_pair(OrdersType(), 0));
6373 const auto AllowsReordering = [&](const TreeEntry *TE) {
6374 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6375 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
6376 (IgnoreReorder && TE->Idx == 0))
6377 return true;
6378 if (TE->isGather()) {
6379 if (GathersToOrders.contains(TE))
6380 return !getReorderingData(*TE, /*TopToBottom=*/false)
6381 .value_or(OrdersType(1))
6382 .empty();
6383 return true;
6384 }
6385 return false;
6386 };
6387 for (const EdgeInfo &EI : OpTE->UserTreeIndices) {
6388 TreeEntry *UserTE = EI.UserTE;
6389 if (!VisitedUsers.insert(UserTE).second)
6390 continue;
6391 // May reorder user node if it requires reordering, has reused
6392 // scalars, is an alternate op vectorize node or its op nodes require
6393 // reordering.
6394 if (AllowsReordering(UserTE))
6395 continue;
6396 // Check if users allow reordering.
6397 // Currently look up just 1 level of operands to avoid an increase in
6398 // compile time.
6399 // It is profitable to reorder if definitely more operands allow
6400 // reordering than keep the natural order.
6402 if (static_cast<unsigned>(count_if(
6403 Ops, [UserTE, &AllowsReordering](
6404 const std::pair<unsigned, TreeEntry *> &Op) {
6405 return AllowsReordering(Op.second) &&
6406 all_of(Op.second->UserTreeIndices,
6407 [UserTE](const EdgeInfo &EI) {
6408 return EI.UserTE == UserTE;
6409 });
6410 })) <= Ops.size() / 2)
6411 ++Res.first->second;
6412 }
6413 }
6414 if (OrdersUses.empty()) {
6415 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6416 OrderedEntries.remove(Op.second);
6417 continue;
6418 }
6419 // Choose the most used order.
6420 unsigned IdentityCnt = 0;
6421 unsigned VF = Data.second.front().second->getVectorFactor();
6422 OrdersType IdentityOrder(VF, VF);
6423 for (auto &Pair : OrdersUses) {
6424 if (Pair.first.empty() || isIdentityOrder(Pair.first)) {
6425 IdentityCnt += Pair.second;
6426 combineOrders(IdentityOrder, Pair.first);
6427 }
6428 }
6429 MutableArrayRef<unsigned> BestOrder = IdentityOrder;
6430 unsigned Cnt = IdentityCnt;
6431 for (auto &Pair : OrdersUses) {
6432 // Prefer the identity order. But if a filled identity (non-empty order)
6433 // was found with the same number of uses as the new candidate order, we
6434 // can choose this candidate order.
6435 if (Cnt < Pair.second) {
6436 combineOrders(Pair.first, BestOrder);
6437 BestOrder = Pair.first;
6438 Cnt = Pair.second;
6439 } else {
6440 combineOrders(BestOrder, Pair.first);
6441 }
6442 }
6443 // Set order of the user node.
6444 if (isIdentityOrder(BestOrder)) {
6445 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second)
6446 OrderedEntries.remove(Op.second);
6447 continue;
6448 }
6449 fixupOrderingIndices(BestOrder);
6450 // Erase operands from OrderedEntries list and adjust their orders.
6451 VisitedOps.clear();
6452 SmallVector<int> Mask;
6453 inversePermutation(BestOrder, Mask);
6454 SmallVector<int> MaskOrder(BestOrder.size(), PoisonMaskElem);
6455 unsigned E = BestOrder.size();
6456 transform(BestOrder, MaskOrder.begin(), [E](unsigned I) {
6457 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6458 });
6459 for (const std::pair<unsigned, TreeEntry *> &Op : Data.second) {
6460 TreeEntry *TE = Op.second;
6461 OrderedEntries.remove(TE);
6462 if (!VisitedOps.insert(TE).second)
6463 continue;
6464 if (TE->ReuseShuffleIndices.size() == BestOrder.size()) {
6465 reorderNodeWithReuses(*TE, Mask);
6466 continue;
6467 }
6468 // Gathers are processed separately.
6469 if (TE->State != TreeEntry::Vectorize &&
6470 TE->State != TreeEntry::StridedVectorize &&
6471 (TE->State != TreeEntry::ScatterVectorize ||
6472 TE->ReorderIndices.empty()))
6473 continue;
6474 assert((BestOrder.size() == TE->ReorderIndices.size() ||
6475 TE->ReorderIndices.empty()) &&
6476 "Non-matching sizes of user/operand entries.");
6477 reorderOrder(TE->ReorderIndices, Mask);
6478 if (IgnoreReorder && TE == VectorizableTree.front().get())
6479 IgnoreReorder = false;
6480 }
6481 // For gathers just need to reorder its scalars.
6482 for (TreeEntry *Gather : GatherOps) {
6483 assert(Gather->ReorderIndices.empty() &&
6484 "Unexpected reordering of gathers.");
6485 if (!Gather->ReuseShuffleIndices.empty()) {
6486 // Just reorder reuses indices.
6487 reorderReuses(Gather->ReuseShuffleIndices, Mask);
6488 continue;
6489 }
6490 reorderScalars(Gather->Scalars, Mask);
6491 OrderedEntries.remove(Gather);
6492 }
6493 // Reorder operands of the user node and set the ordering for the user
6494 // node itself.
6495 if (Data.first->State != TreeEntry::Vectorize ||
6496 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
6497 Data.first->getMainOp()) ||
6498 Data.first->isAltShuffle())
6499 Data.first->reorderOperands(Mask);
6500 if (!isa<InsertElementInst, StoreInst>(Data.first->getMainOp()) ||
6501 Data.first->isAltShuffle() ||
6502 Data.first->State == TreeEntry::StridedVectorize) {
6503 reorderScalars(Data.first->Scalars, Mask);
6504 reorderOrder(Data.first->ReorderIndices, MaskOrder,
6505 /*BottomOrder=*/true);
6506 if (Data.first->ReuseShuffleIndices.empty() &&
6507 !Data.first->ReorderIndices.empty() &&
6508 !Data.first->isAltShuffle()) {
6509 // Insert user node to the list to try to sink reordering deeper in
6510 // the graph.
6511 OrderedEntries.insert(Data.first);
6512 }
6513 } else {
6514 reorderOrder(Data.first->ReorderIndices, Mask);
6515 }
6516 }
6517 }
6518 // If the reordering is unnecessary, just remove the reorder.
6519 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
6520 VectorizableTree.front()->ReuseShuffleIndices.empty())
6521 VectorizableTree.front()->ReorderIndices.clear();
6522}
6523
6524Instruction *BoUpSLP::getRootEntryInstruction(const TreeEntry &Entry) const {
6525 if ((Entry.getOpcode() == Instruction::Store ||
6526 Entry.getOpcode() == Instruction::Load) &&
6527 Entry.State == TreeEntry::StridedVectorize &&
6528 !Entry.ReorderIndices.empty() && isReverseOrder(Entry.ReorderIndices))
6529 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
6530 return dyn_cast<Instruction>(Entry.Scalars.front());
6531}
6532
6533 void BoUpSLP::buildExternalUses(
6534 const ExtraValueToDebugLocsMap &ExternallyUsedValues) {
6535 DenseMap<Value *, unsigned> ScalarToExtUses;
6536 // Collect the values that we need to extract from the tree.
6537 for (auto &TEPtr : VectorizableTree) {
6538 TreeEntry *Entry = TEPtr.get();
6539
6540 // No need to handle users of gathered values.
6541 if (Entry->isGather())
6542 continue;
6543
6544 // For each lane:
6545 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
6546 Value *Scalar = Entry->Scalars[Lane];
6547 if (!isa<Instruction>(Scalar))
6548 continue;
6549 // All uses must be replaced already? No need to do it again.
6550 auto It = ScalarToExtUses.find(Scalar);
6551 if (It != ScalarToExtUses.end() && !ExternalUses[It->second].User)
6552 continue;
6553
6554 // Check if the scalar is externally used as an extra arg.
6555 const auto ExtI = ExternallyUsedValues.find(Scalar);
6556 if (ExtI != ExternallyUsedValues.end()) {
6557 int FoundLane = Entry->findLaneForValue(Scalar);
6558 LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
6559 << FoundLane << " from " << *Scalar << ".\n");
6560 ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
6561 ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
6562 continue;
6563 }
6564 for (User *U : Scalar->users()) {
6565 LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
6566
6567 Instruction *UserInst = dyn_cast<Instruction>(U);
6568 if (!UserInst || isDeleted(UserInst))
6569 continue;
6570
6571 // Ignore users in the user ignore list.
6572 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
6573 continue;
6574
6575 // Skip in-tree scalars that become vectors
6576 if (TreeEntry *UseEntry = getTreeEntry(U)) {
6577 // Some in-tree scalars will remain as scalar in vectorized
6578 // instructions. If that is the case, the one in FoundLane will
6579 // be used.
6580 if (UseEntry->State == TreeEntry::ScatterVectorize ||
6581 !doesInTreeUserNeedToExtract(
6582 Scalar, getRootEntryInstruction(*UseEntry), TLI, TTI)) {
6583 LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
6584 << ".\n");
6585 assert(!UseEntry->isGather() && "Bad state");
6586 continue;
6587 }
6588 U = nullptr;
6589 if (It != ScalarToExtUses.end()) {
6590 ExternalUses[It->second].User = nullptr;
6591 break;
6592 }
6593 }
6594
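// If the scalar has too many uses, record a single entry with a null user
// (meaning "extract for any external user") and stop scanning its users;
// the ScalarToExtUses check above treats such an entry as covering all uses.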
6595 if (U && Scalar->hasNUsesOrMore(UsesLimit))
6596 U = nullptr;
6597 int FoundLane = Entry->findLaneForValue(Scalar);
6598 LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
6599 << " from lane " << FoundLane << " from " << *Scalar
6600 << ".\n");
6601 It = ScalarToExtUses.try_emplace(Scalar, ExternalUses.size()).first;
6602 ExternalUses.emplace_back(Scalar, U, FoundLane);
6603 if (!U)
6604 break;
6605 }
6606 }
6607 }
6608}
6609
6610 SmallVector<SmallVector<StoreInst *>>
6611 BoUpSLP::collectUserStores(const BoUpSLP::TreeEntry *TE) const {
6614 PtrToStoresMap;
6615 for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
6616 Value *V = TE->Scalars[Lane];
6617 // Don't iterate over the users of constant data.
6618 if (!isa<Instruction>(V))
6619 continue;
6620 // To save compilation time we don't visit if we have too many users.
6621 if (V->hasNUsesOrMore(UsesLimit))
6622 break;
6623
6624 // Collect stores per pointer object.
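// Stores are grouped by (basic block, stored value type, underlying pointer
// object); within each group we keep at most one store per lane, and a store
// is dropped if its pointer distance to the group's first store cannot be
// computed.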
6625 for (User *U : V->users()) {
6626 auto *SI = dyn_cast<StoreInst>(U);
6627 // Test whether we can handle the store. V might be a global, which could
6628 // be used in a different function.
6629 if (SI == nullptr || !SI->isSimple() || SI->getFunction() != F ||
6630 !isValidElementType(SI->getValueOperand()->getType()))
6631 continue;
6632 // Skip the store if it is already part of the vectorizable tree.
6633 if (getTreeEntry(U))
6634 continue;
6635
6636 Value *Ptr =
6637 getUnderlyingObject(SI->getPointerOperand(), RecursionMaxDepth);
6638 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
6639 SI->getValueOperand()->getType(), Ptr}];
6640 // For now just keep one store per pointer object per lane.
6641 // TODO: Extend this to support multiple stores per pointer per lane
6642 if (StoresVec.size() > Lane)
6643 continue;
6644 if (!StoresVec.empty()) {
6645 std::optional<int> Diff = getPointersDiff(
6646 SI->getValueOperand()->getType(), SI->getPointerOperand(),
6647 SI->getValueOperand()->getType(),
6648 StoresVec.front()->getPointerOperand(), *DL, *SE,
6649 /*StrictCheck=*/true);
6650 // We failed to compare the pointers so just abandon this store.
6651 if (!Diff)
6652 continue;
6653 }
6654 StoresVec.push_back(SI);
6655 }
6656 }
6657 SmallVector<SmallVector<StoreInst *>> Res(PtrToStoresMap.size());
6658 unsigned I = 0;
6659 for (auto &P : PtrToStoresMap) {
6660 Res[I].swap(P.second);
6661 ++I;
6662 }
6663 return Res;
6664}
6665
6666bool BoUpSLP::canFormVector(ArrayRef<StoreInst *> StoresVec,
6667 OrdersType &ReorderIndices) const {
6668 // We check whether the stores in StoresVec can form a vector by sorting
6669 // them and checking whether they are consecutive.
6670
6671 // To avoid calling getPointersDiff() while sorting we create a vector of
6672 // pairs {store, offset from first} and sort this instead.
6673 SmallVector<std::pair<int, unsigned>> StoreOffsetVec;
6674 StoreInst *S0 = StoresVec[0];
6675 StoreOffsetVec.emplace_back(0, 0);
6676 Type *S0Ty = S0->getValueOperand()->getType();
6677 Value *S0Ptr = S0->getPointerOperand();
6678 for (unsigned Idx : seq<unsigned>(1, StoresVec.size())) {
6679 StoreInst *SI = StoresVec[Idx];
6680 std::optional<int> Diff =
6681 getPointersDiff(S0Ty, S0Ptr, SI->getValueOperand()->getType(),
6682 SI->getPointerOperand(), *DL, *SE,
6683 /*StrictCheck=*/true);
6684 StoreOffsetVec.emplace_back(*Diff, Idx);
6685 }
6686
6687 // Check if the stores are consecutive by checking if their difference is 1.
6688 if (StoreOffsetVec.size() != StoresVec.size())
6689 return false;
6690 sort(StoreOffsetVec,
6691 [](const std::pair<int, unsigned> &L,
6692 const std::pair<int, unsigned> &R) { return L.first < R.first; });
6693 unsigned Idx = 0;
6694 int PrevDist = 0;
6695 for (const auto &P : StoreOffsetVec) {
6696 if (Idx > 0 && P.first != PrevDist + 1)
6697 return false;
6698 PrevDist = P.first;
6699 ++Idx;
6700 }
6701
6702 // Calculate the shuffle indices according to their offset against the sorted
6703 // StoreOffsetVec.
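// For example, stores at offsets {0, 2, 1} (in StoresVec order) sort to
// {(0,0), (1,2), (2,1)}, so ReorderIndices becomes {0, 2, 1}: StoresVec[2]
// has the second-lowest address and therefore takes position 1 in the vector.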
6704 ReorderIndices.assign(StoresVec.size(), 0);
6705 bool IsIdentity = true;
6706 for (auto [I, P] : enumerate(StoreOffsetVec)) {
6707 ReorderIndices[P.second] = I;
6708 IsIdentity &= P.second == I;
6709 }
6710 // Identity order (e.g., {0,1,2,3}) is modeled as an empty OrdersType in
6711 // reorderTopToBottom() and reorderBottomToTop(), so we are following the
6712 // same convention here.
6713 if (IsIdentity)
6714 ReorderIndices.clear();
6715
6716 return true;
6717}
6718
6719#ifndef NDEBUG
6721 for (unsigned Idx : Order)
6722 dbgs() << Idx << ", ";
6723 dbgs() << "\n";
6724}
6725#endif
6726
6728BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE) const {
6729 unsigned NumLanes = TE->Scalars.size();
6730
6731 SmallVector<SmallVector<StoreInst *>> Stores = collectUserStores(TE);
6732
6733 // Holds the reorder indices for each candidate store vector that is a user of
6734 // the current TreeEntry.
6735 SmallVector<OrdersType, 1> ExternalReorderIndices;
6736
6737 // Now inspect the stores collected per pointer and look for vectorization
6738 // candidates. For each candidate calculate the reorder index vector and push
6739 // it into `ExternalReorderIndices`
6740 for (ArrayRef<StoreInst *> StoresVec : Stores) {
6741 // If we have fewer than NumLanes stores, then we can't form a vector.
6742 if (StoresVec.size() != NumLanes)
6743 continue;
6744
6745 // If the stores are not consecutive then abandon this StoresVec.
6746 OrdersType ReorderIndices;
6747 if (!canFormVector(StoresVec, ReorderIndices))
6748 continue;
6749
6750 // We now know that the scalars in StoresVec can form a vector instruction,
6751 // so set the reorder indices.
6752 ExternalReorderIndices.push_back(ReorderIndices);
6753 }
6754 return ExternalReorderIndices;
6755}
6756
6757 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
6758 const SmallDenseSet<Value *> &UserIgnoreLst) {
6759 deleteTree();
6760 UserIgnoreList = &UserIgnoreLst;
6761 if (!allSameType(Roots))
6762 return;
6763 buildTree_rec(Roots, 0, EdgeInfo());
6764}
6765
6766 void BoUpSLP::buildTree(ArrayRef<Value *> Roots) {
6767 deleteTree();
6768 if (!allSameType(Roots))
6769 return;
6770 buildTree_rec(Roots, 0, EdgeInfo());
6771}
6772
6773 /// Tries to find a subvector of loads and builds a new vector of only loads
6774 /// if it can be profitable.
6775 static void gatherPossiblyVectorizableLoads(
6776 const BoUpSLP &R, ArrayRef<Value *> VL, const DataLayout &DL,
6777 ScalarEvolution &SE, const TargetTransformInfo &TTI,
6778 SmallVectorImpl<SmallVector<std::pair<LoadInst *, int>>> &GatheredLoads,
6779 bool AddNew = true) {
6780 if (VL.empty())
6781 return;
6782 Type *ScalarTy = getValueType(VL.front());
6783 if (!isValidElementType(ScalarTy))
6784 return;
6786 SmallVector<DenseMap<int, LoadInst *>> ClusteredDistToLoad;
6787 for (Value *V : VL) {
6788 auto *LI = dyn_cast<LoadInst>(V);
6789 if (!LI)
6790 continue;
6791 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
6792 continue;
6793 bool IsFound = false;
6794 for (auto [Map, Data] : zip(ClusteredDistToLoad, ClusteredLoads)) {
6795 assert(LI->getParent() == Data.front().first->getParent() &&
6796 LI->getType() == Data.front().first->getType() &&
6797 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth) ==
6798 getUnderlyingObject(Data.front().first->getPointerOperand(),
6800 "Expected loads with the same type, same parent and same "
6801 "underlying pointer.");
6802 std::optional<int> Dist = getPointersDiff(
6803 LI->getType(), LI->getPointerOperand(), Data.front().first->getType(),
6804 Data.front().first->getPointerOperand(), DL, SE,
6805 /*StrictCheck=*/true);
6806 if (!Dist)
6807 continue;
6808 auto It = Map.find(*Dist);
6809 if (It != Map.end() && It->second != LI)
6810 continue;
6811 if (It == Map.end()) {
6812 Data.emplace_back(LI, *Dist);
6813 Map.try_emplace(*Dist, LI);
6814 }
6815 IsFound = true;
6816 break;
6817 }
6818 if (!IsFound) {
6819 ClusteredLoads.emplace_back().emplace_back(LI, 0);
6820 ClusteredDistToLoad.emplace_back().try_emplace(0, LI);
6821 }
6822 }
6823 auto FindMatchingLoads =
6826 &GatheredLoads,
6827 SetVector<unsigned> &ToAdd, SetVector<unsigned> &Repeated,
6828 int &Offset, unsigned &Start) {
6829 if (Loads.empty())
6830 return GatheredLoads.end();
6832 LoadInst *LI = Loads.front().first;
6833 for (auto [Idx, Data] : enumerate(GatheredLoads)) {
6834 if (Idx < Start)
6835 continue;
6836 ToAdd.clear();
6837 if (LI->getParent() != Data.front().first->getParent() ||
6838 LI->getType() != Data.front().first->getType())
6839 continue;
6840 std::optional<int> Dist =
6842 Data.front().first->getType(),
6843 Data.front().first->getPointerOperand(), DL, SE,
6844 /*StrictCheck=*/true);
6845 if (!Dist)
6846 continue;
6847 SmallSet<int, 4> DataDists;
6849 for (std::pair<LoadInst *, int> P : Data) {
6850 DataDists.insert(P.second);
6851 DataLoads.insert(P.first);
6852 }
6853 // Found matching gathered loads - check if all loads are unique or
6854 // can be effectively vectorized.
6855 unsigned NumUniques = 0;
6856 for (auto [Cnt, Pair] : enumerate(Loads)) {
6857 bool Used = DataLoads.contains(Pair.first);
6858 if (!Used && !DataDists.contains(*Dist + Pair.second)) {
6859 ++NumUniques;
6860 ToAdd.insert(Cnt);
6861 } else if (Used) {
6862 Repeated.insert(Cnt);
6863 }
6864 }
6865 if (NumUniques > 0 &&
6866 (Loads.size() == NumUniques ||
6867 (Loads.size() - NumUniques >= 2 &&
6868 Loads.size() - NumUniques >= Loads.size() / 2 &&
6869 (has_single_bit(Data.size() + NumUniques) ||
6870 bit_ceil(Data.size()) <
6871 bit_ceil(Data.size() + NumUniques))))) {
6872 Offset = *Dist;
6873 Start = Idx + 1;
6874 return std::next(GatheredLoads.begin(), Idx);
6875 }
6876 }
6877 ToAdd.clear();
6878 return GatheredLoads.end();
6879 };
6880 for (ArrayRef<std::pair<LoadInst *, int>> Data : ClusteredLoads) {
6881 unsigned Start = 0;
6882 SetVector<unsigned> ToAdd, LocalToAdd, Repeated;
6883 int Offset = 0;
6884 auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated,
6885 Offset, Start);
6886 while (It != GatheredLoads.end()) {
6887 assert(!LocalToAdd.empty() && "Expected some elements to add.");
6888 for (unsigned Idx : LocalToAdd)
6889 It->emplace_back(Data[Idx].first, Data[Idx].second + Offset);
6890 ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end());
6891 It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset,
6892 Start);
6893 }
6894 if (any_of(seq<unsigned>(Data.size()), [&](unsigned Idx) {
6895 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
6896 })) {
6897 auto AddNewLoads =
6899 for (unsigned Idx : seq<unsigned>(Data.size())) {
6900 if (ToAdd.contains(Idx) || Repeated.contains(Idx))
6901 continue;
6902 Loads.push_back(Data[Idx]);
6903 }
6904 };
6905 if (!AddNew) {
6906 LoadInst *LI = Data.front().first;
6907 It = find_if(
6908 GatheredLoads, [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
6909 return PD.front().first->getParent() == LI->getParent() &&
6910 PD.front().first->getType() == LI->getType();
6911 });
6912 while (It != GatheredLoads.end()) {
6913 AddNewLoads(*It);
6914 It = std::find_if(
6915 std::next(It), GatheredLoads.end(),
6916 [&](ArrayRef<std::pair<LoadInst *, int>> PD) {
6917 return PD.front().first->getParent() == LI->getParent() &&
6918 PD.front().first->getType() == LI->getType();
6919 });
6920 }
6921 }
6922 GatheredLoads.emplace_back().append(Data.begin(), Data.end());
6923 AddNewLoads(GatheredLoads.emplace_back());
6924 }
6925 }
6926}
6927
6928void BoUpSLP::tryToVectorizeGatheredLoads(
6929 const SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
6930 SmallVector<SmallVector<std::pair<LoadInst *, int>>>,
6931 8> &GatheredLoads) {
6932 GatheredLoadsEntriesFirst = VectorizableTree.size();
6933
6934 SmallVector<SmallPtrSet<const Value *, 4>> LoadSetsToVectorize(
6935 LoadEntriesToVectorize.size());
6936 for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize))
6937 Set.insert(VectorizableTree[Idx]->Scalars.begin(),
6938 VectorizableTree[Idx]->Scalars.end());
6939
6940 // Sort loads by distance.
6941 auto LoadSorter = [](const std::pair<LoadInst *, int> &L1,
6942 const std::pair<LoadInst *, int> &L2) {
6943 return L1.second > L2.second;
6944 };
6945
6946 auto IsMaskedGatherSupported = [&, TTI = TTI](ArrayRef<LoadInst *> Loads) {
6947 ArrayRef<Value *> Values(reinterpret_cast<Value *const *>(Loads.begin()),
6948 Loads.size());
6949 Align Alignment = computeCommonAlignment<LoadInst>(Values);
6950 auto *Ty = getWidenedType(Loads.front()->getType(), Loads.size());
6951 return TTI->isLegalMaskedGather(Ty, Alignment) &&
6952 !TTI->forceScalarizeMaskedGather(Ty, Alignment);
6953 };
6954
6955 auto GetVectorizedRanges = [this](ArrayRef<LoadInst *> Loads,
6956 BoUpSLP::ValueSet &VectorizedLoads,
6957 SmallVectorImpl<LoadInst *> &NonVectorized,
6958 bool Final, unsigned MaxVF) {
6960 unsigned StartIdx = 0;
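// Candidate VFs are collected from the widest full-register element count at
// or below MaxVF down to 2; when non-power-of-2 vectorization is enabled,
// MaxVF itself (if it equals 2^k - 1) and each NumElts - 1 (for NumElts > 2)
// are also tried.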
6961 SmallVector<int> CandidateVFs;
6962 if (VectorizeNonPowerOf2 && has_single_bit(MaxVF + 1))
6963 CandidateVFs.push_back(MaxVF);
6964 for (int NumElts = getFloorFullVectorNumberOfElements(
6965 *TTI, Loads.front()->getType(), MaxVF);
6966 NumElts > 1; NumElts = getFloorFullVectorNumberOfElements(
6967 *TTI, Loads.front()->getType(), NumElts - 1)) {
6968 CandidateVFs.push_back(NumElts);
6969 if (VectorizeNonPowerOf2 && NumElts > 2)
6970 CandidateVFs.push_back(NumElts - 1);
6971 }
6972
6973 if (Final && CandidateVFs.empty())
6974 return Results;
6975
6976 unsigned BestVF = Final ? CandidateVFs.back() : 0;
6977 for (unsigned NumElts : CandidateVFs) {
6978 if (Final && NumElts > BestVF)
6979 continue;
6980 SmallVector<unsigned> MaskedGatherVectorized;
6981 for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt < E;
6982 ++Cnt) {
6983 ArrayRef<LoadInst *> Slice =
6984 ArrayRef(Loads).slice(Cnt, std::min(NumElts, E - Cnt));
6985 if (VectorizedLoads.count(Slice.front()) ||
6986 VectorizedLoads.count(Slice.back()) ||
6988 continue;
6989 // Check if it is profitable to try vectorizing gathered loads. It is
6990 // profitable if we have at least 3 consecutive loads or if we have
6991 // fewer but all users are vectorized or deleted.
6992 bool AllowToVectorize = false;
6993 // Check if it is profitable to vectorize 2-elements loads.
6994 if (NumElts == 2) {
6995 bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad(
6996 Slice.front()->getType(), ElementCount::getFixed(NumElts));
6997 auto CheckIfAllowed = [=](ArrayRef<LoadInst *> Slice) {
6998 for (LoadInst *LI : Slice) {
6999 // If single use/user - allow to vectorize.
7000 if (LI->hasOneUse())
7001 continue;
7002 // 1. Check if number of uses equals number of users.
7003 // 2. All users are deleted.
7004 // 3. The load broadcasts are not allowed or the load is not
7005 // broadcasted.
7006 if (static_cast<unsigned int>(std::distance(
7007 LI->user_begin(), LI->user_end())) != LI->getNumUses())
7008 return false;
7009 if (!IsLegalBroadcastLoad)
7010 continue;
7011 if (LI->hasNUsesOrMore(UsesLimit))
7012 return false;
7013 for (User *U : LI->users()) {
7014 if (auto *UI = dyn_cast<Instruction>(U); UI && isDeleted(UI))
7015 continue;
7016 if (const TreeEntry *UTE = getTreeEntry(U)) {
7017 for (int I : seq<int>(UTE->getNumOperands())) {
7018 if (all_of(UTE->getOperand(I),
7019 [LI](Value *V) { return V == LI; }))
7020 // Found legal broadcast - do not vectorize.
7021 return false;
7022 }
7023 }
7024 }
7025 }
7026 return true;
7027 };
7028 AllowToVectorize = CheckIfAllowed(Slice);
7029 } else {
7030 AllowToVectorize =
7031 (NumElts >= 3 ||
7032 any_of(ValueToGatherNodes.at(Slice.front()),
7033 [=](const TreeEntry *TE) {
7034 return TE->Scalars.size() == 2 &&
7035 ((TE->Scalars.front() == Slice.front() &&
7036 TE->Scalars.back() == Slice.back()) ||
7037 (TE->Scalars.front() == Slice.back() &&
7038 TE->Scalars.back() == Slice.front()));
7039 })) &&
7040 hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
7041 Slice.size());
7042 }
7043 if (AllowToVectorize) {
7044 SmallVector<Value *> PointerOps;
7045 OrdersType CurrentOrder;
7046 // Try to build vector load.
7047 ArrayRef<Value *> Values(
7048 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
7049 LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder,
7050 PointerOps, &BestVF);
7051 if (LS != LoadsState::Gather ||
7052 (BestVF > 1 && static_cast<unsigned>(NumElts) == 2 * BestVF)) {
7053 if (LS == LoadsState::ScatterVectorize) {
7054 if (MaskedGatherVectorized.empty() ||
7055 Cnt >= MaskedGatherVectorized.back() + NumElts)
7056 MaskedGatherVectorized.push_back(Cnt);
7057 continue;
7058 }
7059 if (LS != LoadsState::Gather) {
7060 Results.emplace_back(Values, LS);
7061 VectorizedLoads.insert(Slice.begin(), Slice.end());
7062 // If we vectorized initial block, no need to try to vectorize it
7063 // again.
7064 if (Cnt == StartIdx)
7065 StartIdx += NumElts;
7066 }
7067 // Check if the whole array was vectorized already - exit.
7068 if (StartIdx >= Loads.size())
7069 break;
7070 // Erase last masked gather candidate, if another candidate within
7071 // the range is found to be better.
7072 if (!MaskedGatherVectorized.empty() &&
7073 Cnt < MaskedGatherVectorized.back() + NumElts)
7074 MaskedGatherVectorized.pop_back();
7075 Cnt += NumElts - 1;
7076 continue;
7077 }
7078 }
7079 if (!AllowToVectorize || BestVF == 0)
7081 }
7082 // Mark masked gathers candidates as vectorized, if any.
7083 for (unsigned Cnt : MaskedGatherVectorized) {
7084 ArrayRef<LoadInst *> Slice = ArrayRef(Loads).slice(
7085 Cnt, std::min<unsigned>(NumElts, Loads.size() - Cnt));
7086 ArrayRef<Value *> Values(
7087 reinterpret_cast<Value *const *>(Slice.begin()), Slice.size());
7088 Results.emplace_back(Values, LoadsState::ScatterVectorize);
7089 VectorizedLoads.insert(Slice.begin(), Slice.end());
7090 // If we vectorized initial block, no need to try to vectorize it again.
7091 if (Cnt == StartIdx)
7092 StartIdx += NumElts;
7093 }
7094 }
7095 for (LoadInst *LI : Loads) {
7096 if (!VectorizedLoads.contains(LI))
7097 NonVectorized.push_back(LI);
7098 }
7099 return Results;
7100 };
7101 auto ProcessGatheredLoads =
7102 [&, &TTI = *TTI](
7104 bool Final = false) {
7105 SmallVector<LoadInst *> NonVectorized;
7106 for (ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
7107 if (LoadsDists.size() <= 1) {
7108 NonVectorized.push_back(LoadsDists.back().first);
7109 continue;
7110 }
7111 SmallVector<std::pair<LoadInst *, int>> LocalLoadsDists(LoadsDists);
7112 SmallVector<LoadInst *> OriginalLoads(LocalLoadsDists.size());
7113 transform(LoadsDists, OriginalLoads.begin(),
7114 [](const std::pair<LoadInst *, int> &L) -> LoadInst * {
7115 return L.first;
7116 });
7117 stable_sort(LocalLoadsDists, LoadSorter);
7119 unsigned MaxConsecutiveDistance = 0;
7120 unsigned CurrentConsecutiveDist = 1;
7121 int LastDist = LocalLoadsDists.front().second;
7122 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
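// The loads were sorted by decreasing distance above, so a run whose
// distances decrease by exactly 1 at every step forms a consecutive block;
// MaxConsecutiveDistance tracks the length of the longest such run.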
7123 for (const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
7124 if (getTreeEntry(L.first))
7125 continue;
7126 assert(LastDist >= L.second &&
7127 "Expected first distance always not less than second");
7128 if (static_cast<unsigned>(LastDist - L.second) ==
7129 CurrentConsecutiveDist) {
7130 ++CurrentConsecutiveDist;
7131 MaxConsecutiveDistance =
7132 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
7133 Loads.push_back(L.first);
7134 continue;
7135 }
7136 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
7137 !Loads.empty())
7138 Loads.pop_back();
7139 CurrentConsecutiveDist = 1;
7140 LastDist = L.second;
7141 Loads.push_back(L.first);
7142 }
7143 if (Loads.size() <= 1)
7144 continue;
7145 if (AllowMaskedGather)
7146 MaxConsecutiveDistance = Loads.size();
7147 else if (MaxConsecutiveDistance < 2)
7148 continue;
7149 BoUpSLP::ValueSet VectorizedLoads;
7150 SmallVector<LoadInst *> SortedNonVectorized;
7152 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
7153 Final, MaxConsecutiveDistance);
7154 if (!Results.empty() && !SortedNonVectorized.empty() &&
7155 OriginalLoads.size() == Loads.size() &&
7156 MaxConsecutiveDistance == Loads.size() &&
7158 [](const std::pair<ArrayRef<Value *>, LoadsState> &P) {
7159 return P.second == LoadsState::ScatterVectorize;
7160 })) {
7161 VectorizedLoads.clear();
7162 SmallVector<LoadInst *> UnsortedNonVectorized;
7164 UnsortedResults =
7165 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
7166 UnsortedNonVectorized, Final,
7167 OriginalLoads.size());
7168 if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) {
7169 SortedNonVectorized.swap(UnsortedNonVectorized);
7170 Results.swap(UnsortedResults);
7171 }
7172 }
7173 for (auto [Slice, _] : Results) {
7174 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads ("
7175 << Slice.size() << ")\n");
7176 if (any_of(Slice, [&](Value *V) { return getTreeEntry(V); })) {
7177 for (Value *L : Slice)
7178 if (!getTreeEntry(L))
7179 SortedNonVectorized.push_back(cast<LoadInst>(L));
7180 continue;
7181 }
7182
7183 // Select maximum VF as a maximum of user gathered nodes and
7184 // distance between scalar loads in these nodes.
7185 unsigned MaxVF = Slice.size();
7186 unsigned UserMaxVF = 0;
7187 unsigned InterleaveFactor = 0;
7188 if (MaxVF == 2) {
7189 UserMaxVF = MaxVF;
7190 } else {
7191 // Distance detected between segments of the interleaved loads.
7192 std::optional<unsigned> InterleavedLoadsDistance = 0;
7193 unsigned Order = 0;
7194 std::optional<unsigned> CommonVF = 0;
7196 SmallPtrSet<const TreeEntry *, 8> DeinterleavedNodes;
7197 for (auto [Idx, V] : enumerate(Slice)) {
7198 for (const TreeEntry *E : ValueToGatherNodes.at(V)) {
7199 UserMaxVF = std::max<unsigned>(UserMaxVF, E->Scalars.size());
7200 unsigned Pos =
7201 EntryToPosition.try_emplace(E, Idx).first->second;
7202 UserMaxVF = std::max<unsigned>(UserMaxVF, Idx - Pos + 1);
7203 if (CommonVF) {
7204 if (*CommonVF == 0) {
7205 CommonVF = E->Scalars.size();
7206 continue;
7207 }
7208 if (*CommonVF != E->Scalars.size())
7209 CommonVF.reset();
7210 }
7211 // Check if the load is part of the interleaved load.
7212 if (Pos != Idx && InterleavedLoadsDistance) {
7213 if (!DeinterleavedNodes.contains(E) &&
7214 any_of(E->Scalars, [&, Slice = Slice](Value *V) {
7215 if (isa<Constant>(V))
7216 return false;
7217 if (getTreeEntry(V))
7218 return true;
7219 const auto &Nodes = ValueToGatherNodes.at(V);
7220 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
7221 !is_contained(Slice, V);
7222 })) {
7223 InterleavedLoadsDistance.reset();
7224 continue;
7225 }
7226 DeinterleavedNodes.insert(E);
7227 if (*InterleavedLoadsDistance == 0) {
7228 InterleavedLoadsDistance = Idx - Pos;
7229 continue;
7230 }
7231 if ((Idx - Pos) % *InterleavedLoadsDistance != 0 ||
7232 (Idx - Pos) / *InterleavedLoadsDistance < Order)
7233 InterleavedLoadsDistance.reset();
7234 Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1);
7235 }
7236 }
7237 }
7238 DeinterleavedNodes.clear();
7239 // Check if the large load represents interleaved load operation.
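// For example, if the detected segment distance is 2 and the common gather
// node size is 4 scalars, then InterleaveFactor = bit_ceil(2) = 2 and the
// slice is vectorized with UserMaxVF = 2 * 4 = 8, provided the target accepts
// the interleaved access type and the loads themselves can be vectorized.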
7240 if (InterleavedLoadsDistance.value_or(0) > 1 &&
7241 CommonVF.value_or(0) != 0) {
7242 InterleaveFactor = bit_ceil(*InterleavedLoadsDistance);
7243 unsigned VF = *CommonVF;
7244 OrdersType Order;
7245 SmallVector<Value *> PointerOps;
7246 // Segmented load detected - vectorize at maximum vector factor.
7247 if (InterleaveFactor <= Slice.size() &&
7249 getWidenedType(Slice.front()->getType(), VF),
7250 InterleaveFactor,
7251 cast<LoadInst>(Slice.front())->getAlign(),
7252 cast<LoadInst>(Slice.front())
7254 canVectorizeLoads(Slice, Slice.front(), Order,
7255 PointerOps) == LoadsState::Vectorize) {
7256 UserMaxVF = InterleaveFactor * VF;
7257 } else {
7258 InterleaveFactor = 0;
7259 }
7260 }
7261 // Cannot represent the loads as consecutive vectorizable nodes -
7262 // just exit.
7263 unsigned ConsecutiveNodesSize = 0;
7264 if (!LoadEntriesToVectorize.empty() && InterleaveFactor == 0 &&
7265 any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7266 [&, Slice = Slice](const auto &P) {
7267 const auto *It = find_if(Slice, [&](Value *V) {
7268 return std::get<1>(P).contains(V);
7269 });
7270 if (It == Slice.end())
7271 return false;
7273 VectorizableTree[std::get<0>(P)]->Scalars;
7274 ConsecutiveNodesSize += VL.size();
7275 unsigned Start = std::distance(Slice.begin(), It);
7276 unsigned Sz = Slice.size() - Start;
7277 return Sz < VL.size() ||
7278 Slice.slice(std::distance(Slice.begin(), It),
7279 VL.size()) != VL;
7280 }))
7281 continue;
7282 // Try to build long masked gather loads.
7283 UserMaxVF = bit_ceil(UserMaxVF);
7284 if (InterleaveFactor == 0 &&
7285 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
7286 [&, Slice = Slice](unsigned Idx) {
7287 OrdersType Order;
7288 SmallVector<Value *> PointerOps;
7289 return canVectorizeLoads(
7290 Slice.slice(Idx * UserMaxVF, UserMaxVF),
7291 Slice[Idx * UserMaxVF], Order,
7292 PointerOps) ==
7293 LoadsState::ScatterVectorize;
7294 }))
7295 UserMaxVF = MaxVF;
7296 if (Slice.size() != ConsecutiveNodesSize)
7297 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
7298 }
7299 for (unsigned VF = MaxVF; VF >= 2; VF /= 2) {
7300 bool IsVectorized = true;
7301 for (unsigned I = 0, E = Slice.size(); I < E; I += VF) {
7302 ArrayRef<Value *> SubSlice =
7303 Slice.slice(I, std::min(VF, E - I));
7304 if (getTreeEntry(SubSlice.front()))
7305 continue;
7306 // Check if the subslice is a to-be-vectorized entry that is not
7307 // equal to this entry.
7308 if (any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7309 [&](const auto &P) {
7310 return !SubSlice.equals(
7311 VectorizableTree[std::get<0>(P)]
7312 ->Scalars) &&
7313 set_is_subset(SubSlice, std::get<1>(P));
7314 }))
7315 continue;
7316 unsigned Sz = VectorizableTree.size();
7317 buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
7318 if (Sz == VectorizableTree.size()) {
7319 IsVectorized = false;
7320 // Try non-interleaved vectorization with smaller vector
7321 // factor.
7322 if (InterleaveFactor > 0) {
7323 VF = 2 * (MaxVF / InterleaveFactor);
7324 InterleaveFactor = 0;
7325 }
7326 continue;
7327 }
7328 }
7329 if (IsVectorized)
7330 break;
7331 }
7332 }
7333 NonVectorized.append(SortedNonVectorized);
7334 }
7335 return NonVectorized;
7336 };
7337 for (const auto &GLs : GatheredLoads) {
7338 const auto &Ref = GLs.second;
7339 SmallVector<LoadInst *> NonVectorized = ProcessGatheredLoads(Ref);
7340 if (!Ref.empty() && !NonVectorized.empty() &&
7341 std::accumulate(
7342 Ref.begin(), Ref.end(), 0u,
7343 [](unsigned S,
7344 ArrayRef<std::pair<LoadInst *, int>> LoadsDists) -> unsigned {
7345 return S + LoadsDists.size();
7346 }) != NonVectorized.size() &&
7347 IsMaskedGatherSupported(NonVectorized)) {
7349 for (LoadInst *LI : NonVectorized) {
7350 // Reinsert non-vectorized loads into another list of loads with the
7351 // same base pointers.
7352 gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI,
7353 FinalGatheredLoads,
7354 /*AddNew=*/false);
7355 }
7356 // Final attempt to vectorize non-vectorized loads.
7357 (void)ProcessGatheredLoads(FinalGatheredLoads, /*Final=*/true);
7358 }
7359 }
7360 // Try to vectorize postponed load entries, previously marked as gathered.
7361 for (unsigned Idx : LoadEntriesToVectorize) {
7362 const TreeEntry &E = *VectorizableTree[Idx];
7363 SmallVector<Value *> GatheredScalars(E.Scalars.begin(), E.Scalars.end());
7364 // Avoid reordering, if possible.
7365 if (!E.ReorderIndices.empty()) {
7366 // Build a mask out of the reorder indices and reorder scalars per this
7367 // mask.
7368 SmallVector<int> ReorderMask;
7369 inversePermutation(E.ReorderIndices, ReorderMask);
7370 reorderScalars(GatheredScalars, ReorderMask);
7371 }
7372 buildTree_rec(GatheredScalars, 0, EdgeInfo());
7373 }
7374 // If no new entries were created, assume that no gathered-load entries need
7375 // to be handled.
7376 if (static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
7377 VectorizableTree.size())
7378 GatheredLoadsEntriesFirst.reset();
7379}
7380
7381/// \return true if the specified list of values has only one instruction that
7382/// requires scheduling, false otherwise.
7383#ifndef NDEBUG
7385 Value *NeedsScheduling = nullptr;
7386 for (Value *V : VL) {
7388 continue;
7389 if (!NeedsScheduling) {
7390 NeedsScheduling = V;
7391 continue;
7392 }
7393 return false;
7394 }
7395 return NeedsScheduling;
7396}
7397#endif
7398
7399 /// Generates a key/subkey pair for the given value to provide effective
7400 /// sorting of the values and better detection of vectorizable value
7401 /// sequences. The keys/subkeys can be used for better sorting of the values
7402 /// themselves (keys) and within value subgroups (subkeys).
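/// For example, simple loads hash their type together with the Load opcode
/// into the key and take their subkey from LoadsSubkeyGenerator (which can
/// cluster loads by pointer distance), while casts fold the key of their
/// single operand into both key and subkey, improving grouping of similar
/// cast chains.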
7403static std::pair<size_t, size_t> generateKeySubkey(
7404 Value *V, const TargetLibraryInfo *TLI,
7405 function_ref<hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator,
7406 bool AllowAlternate) {
7407 hash_code Key = hash_value(V->getValueID() + 2);
7408 hash_code SubKey = hash_value(0);
7409 // Sort the loads by the distance between the pointers.
7410 if (auto *LI = dyn_cast<LoadInst>(V)) {
7411 Key = hash_combine(LI->getType(), hash_value(Instruction::Load), Key);
7412 if (LI->isSimple())
7413 SubKey = hash_value(LoadsSubkeyGenerator(Key, LI));
7414 else
7415 Key = SubKey = hash_value(LI);
7416 } else if (isVectorLikeInstWithConstOps(V)) {
7417 // Sort extracts by the vector operands.
7418 if (isa<ExtractElementInst, UndefValue>(V))
7419 Key = hash_value(Value::UndefValueVal + 1);
7420 if (auto *EI = dyn_cast<ExtractElementInst>(V)) {
7421 if (!isUndefVector(EI->getVectorOperand()).all() &&
7422 !isa<UndefValue>(EI->getIndexOperand()))
7423 SubKey = hash_value(EI->getVectorOperand());
7424 }
7425 } else if (auto *I = dyn_cast<Instruction>(V)) {
7426 // Sort other instructions just by the opcodes except for CMPInst.
7427 // For CMP also sort by the predicate kind.
7428 if ((isa<BinaryOperator, CastInst>(I)) &&
7429 isValidForAlternation(I->getOpcode())) {
7430 if (AllowAlternate)
7431 Key = hash_value(isa<BinaryOperator>(I) ? 1 : 0);
7432 else
7433 Key = hash_combine(hash_value(I->getOpcode()), Key);
7434 SubKey = hash_combine(
7435 hash_value(I->getOpcode()), hash_value(I->getType()),
7436 hash_value(isa<BinaryOperator>(I)
7437 ? I->getType()
7438 : cast<CastInst>(I)->getOperand(0)->getType()));
7439 // For casts, look through the only operand to improve compile time.
7440 if (isa<CastInst>(I)) {
7441 std::pair<size_t, size_t> OpVals =
7442 generateKeySubkey(I->getOperand(0), TLI, LoadsSubkeyGenerator,
7443 /*AllowAlternate=*/true);
7444 Key = hash_combine(OpVals.first, Key);
7445 SubKey = hash_combine(OpVals.first, SubKey);
7446 }
7447 } else if (auto *CI = dyn_cast<CmpInst>(I)) {
7448 CmpInst::Predicate Pred = CI->getPredicate();
7449 if (CI->isCommutative())
7450 Pred = std::min(Pred, CmpInst::getInversePredicate(Pred));
7452 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Pred),
7453 hash_value(SwapPred),
7454 hash_value(CI->getOperand(0)->getType()));
7455 } else if (auto *Call = dyn_cast<CallInst>(I)) {
7458 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(ID));
7459 } else if (!VFDatabase(*Call).getMappings(*Call).empty()) {
7460 SubKey = hash_combine(hash_value(I->getOpcode()),
7461 hash_value(Call->getCalledFunction()));
7462 } else {
7463 Key = hash_combine(hash_value(Call), Key);
7464 SubKey = hash_combine(hash_value(I->getOpcode()), hash_value(Call));
7465 }
7466 for (const CallBase::BundleOpInfo &Op : Call->bundle_op_infos())
7467 SubKey = hash_combine(hash_value(Op.Begin), hash_value(Op.End),
7468 hash_value(Op.Tag), SubKey);
7469 } else if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
7470 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
7471 SubKey = hash_value(Gep->getPointerOperand());
7472 else
7473 SubKey = hash_value(Gep);
7474 } else if (BinaryOperator::isIntDivRem(I->getOpcode()) &&
7475 !isa<ConstantInt>(I->getOperand(1))) {
7476 // Do not try to vectorize instructions with potentially high cost.
7477 SubKey = hash_value(I);
7478 } else {
7479 SubKey = hash_value(I->getOpcode());
7480 }
7481 Key = hash_combine(hash_value(I->getParent()), Key);
7482 }
7483 return std::make_pair(Key, SubKey);
7484}
7485
7486/// Checks if the specified instruction \p I is an alternate operation for
7487/// the given \p MainOp and \p AltOp instructions.
7488static bool isAlternateInstruction(const Instruction *I,
7489 const Instruction *MainOp,
7490 const Instruction *AltOp,
7491 const TargetLibraryInfo &TLI);
7492
7493bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S,
7494 ArrayRef<Value *> VL) const {
7495 unsigned Opcode0 = S.getOpcode();
7496 unsigned Opcode1 = S.getAltOpcode();
7497 SmallBitVector OpcodeMask(getAltInstrMask(VL, Opcode0, Opcode1));
7498 // If this pattern is supported by the target then consider it profitable.
7499 if (TTI->isLegalAltInstr(getWidenedType(S.getMainOp()->getType(), VL.size()),
7500 Opcode0, Opcode1, OpcodeMask))
7501 return true;
7503 for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
7504 Operands.emplace_back();
7505 // Prepare the operand vector.
7506 for (Value *V : VL) {
7507 if (isa<PoisonValue>(V)) {
7508 Operands.back().push_back(
7509 PoisonValue::get(S.getMainOp()->getOperand(I)->getType()));
7510 continue;
7511 }
7512 Operands.back().push_back(cast<Instruction>(V)->getOperand(I));
7513 }
7514 }
7515 if (Operands.size() == 2) {
7516 // Try to find the best operand candidates.
7517 for (unsigned I : seq<unsigned>(0, VL.size() - 1)) {
7519 Candidates[0] = std::make_pair(Operands[0][I], Operands[0][I + 1]);
7520 Candidates[1] = std::make_pair(Operands[0][I], Operands[1][I + 1]);
7521 Candidates[2] = std::make_pair(Operands[1][I], Operands[0][I + 1]);
7522 std::optional<int> Res = findBestRootPair(Candidates);
7523 switch (Res.value_or(0)) {
7524 case 0:
7525 break;
7526 case 1:
7527 std::swap(Operands[0][I + 1], Operands[1][I + 1]);
7528 break;
7529 case 2:
7530 std::swap(Operands[0][I], Operands[1][I]);
7531 break;
7532 default:
7533 llvm_unreachable("Unexpected index.");
7534 }
7535 }
7536 }
7537 DenseSet<unsigned> UniqueOpcodes;
7538 constexpr unsigned NumAltInsts = 3; // main + alt + shuffle.
7539 unsigned NonInstCnt = 0;
7540 // Estimate number of instructions, required for the vectorized node and for
7541 // the buildvector node.
7542 unsigned UndefCnt = 0;
7543 // Count the number of extra shuffles, required for vector nodes.
7544 unsigned ExtraShuffleInsts = 0;
7545 // Check that operands do not contain the same values and create either a
7546 // perfect diamond match or a shuffled match.
7547 if (Operands.size() == 2) {
7548 // Do not count same operands twice.
7549 if (Operands.front() == Operands.back()) {
7550 Operands.erase(Operands.begin());
7551 } else if (!allConstant(Operands.front()) &&
7552 all_of(Operands.front(), [&](Value *V) {
7553 return is_contained(Operands.back(), V);
7554 })) {
7555 Operands.erase(Operands.begin());
7556 ++ExtraShuffleInsts;
7557 }
7558 }
7559 const Loop *L = LI->getLoopFor(S.getMainOp()->getParent());
7560 // Vectorize the node if:
7561 // 1. At least a single operand is constant or splat.
7562 // 2. The operands have many loop invariants (while the instructions
7563 // themselves are not loop invariant).
7564 // 3. At least a single unique operand is supposed to be vectorized.
7565 return none_of(Operands,
7566 [&](ArrayRef<Value *> Op) {
7567 if (allConstant(Op) ||
7568 (!isSplat(Op) && allSameBlock(Op) && allSameType(Op) &&
7569 getSameOpcode(Op, *TLI)))
7570 return false;
7571 DenseMap<Value *, unsigned> Uniques;
7572 for (Value *V : Op) {
7573 if (isa<Constant, ExtractElementInst>(V) ||
7574 getTreeEntry(V) || (L && L->isLoopInvariant(V))) {
7575 if (isa<UndefValue>(V))
7576 ++UndefCnt;
7577 continue;
7578 }
7579 auto Res = Uniques.try_emplace(V, 0);
7580 // Found first duplicate - need to add shuffle.
7581 if (!Res.second && Res.first->second == 1)
7582 ++ExtraShuffleInsts;
7583 ++Res.first->getSecond();
7584 if (auto *I = dyn_cast<Instruction>(V))
7585 UniqueOpcodes.insert(I->getOpcode());
7586 else if (Res.second)
7587 ++NonInstCnt;
7588 }
7589 return none_of(Uniques, [&](const auto &P) {
7590 return P.first->hasNUsesOrMore(P.second + 1) &&
7591 none_of(P.first->users(), [&](User *U) {
7592 return getTreeEntry(U) || Uniques.contains(U);
7593 });
7594 });
7595 }) ||
7596 // Do not vectorize the node if the estimated number of vector instructions
7597 // is greater than the estimated number of buildvector instructions. The
7598 // number of vector operands is the number of vector instructions plus the
7599 // number of vector instructions for the operands (buildvectors). The number
7600 // of buildvector instructions is just number_of_operands * number_of_scalars.
7601 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
7602 (UniqueOpcodes.size() + NonInstCnt + ExtraShuffleInsts +
7603 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
7604}
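// Illustrative note (annotation, not part of the original source): for an
// alternate-opcode bundle such as
//   %r0 = add i32 %x0, %y0
//   %r1 = sub i32 %x1, %y1
// the vectorized form needs roughly the NumAltInsts counted above (a vector
// add, a vector sub and a shufflevector blending the two results), so the
// node is only accepted when that is expected to beat rebuilding the operands
// with insertelement sequences.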
7605
7606BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7607 const InstructionsState &S, ArrayRef<Value *> VL,
7608 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
7609 SmallVectorImpl<Value *> &PointerOps) {
7610 assert(S.getMainOp() &&
7611 "Expected instructions with same/alternate opcodes only.");
7612
7613 unsigned ShuffleOrOp =
7614 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
7615 Instruction *VL0 = S.getMainOp();
7616 switch (ShuffleOrOp) {
7617 case Instruction::PHI: {
7618 // Too many operands - gather, most probably won't be vectorized.
7619 if (VL0->getNumOperands() > MaxPHINumOperands)
7620 return TreeEntry::NeedToGather;
7621 // Check for terminator values (e.g. invoke).
7622 for (Value *V : VL) {
7623 auto *PHI = dyn_cast<PHINode>(V);
7624 if (!PHI)
7625 continue;
7626 for (Value *Incoming : PHI->incoming_values()) {
7627 Instruction *Term = dyn_cast<Instruction>(Incoming);
7628 if (Term && Term->isTerminator()) {
7629 LLVM_DEBUG(dbgs()
7630 << "SLP: Need to swizzle PHINodes (terminator use).\n");
7631 return TreeEntry::NeedToGather;
7632 }
7633 }
7634 }
7635
7636 return TreeEntry::Vectorize;
7637 }
7638 case Instruction::ExtractValue:
7639 case Instruction::ExtractElement: {
7640 bool Reuse = canReuseExtract(VL, CurrentOrder);
7641 // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and
7642 // non-full registers).
7643 if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size()))
7644 return TreeEntry::NeedToGather;
7645 if (Reuse || !CurrentOrder.empty())
7646 return TreeEntry::Vectorize;
7647 LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
7648 return TreeEntry::NeedToGather;
7649 }
7650 case Instruction::InsertElement: {
7651 // Check that we have a buildvector and not a shuffle of 2 or more
7652 // different vectors.
7653 ValueSet SourceVectors;
7654 for (Value *V : VL) {
7655 SourceVectors.insert(cast<Instruction>(V)->getOperand(0));
7656 assert(getElementIndex(V) != std::nullopt &&
7657 "Non-constant or undef index?");
7658 }
7659
7660 if (count_if(VL, [&SourceVectors](Value *V) {
7661 return !SourceVectors.contains(V);
7662 }) >= 2) {
7663 // Found 2nd source vector - cancel.
7664 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
7665 "different source vectors.\n");
7666 return TreeEntry::NeedToGather;
7667 }
7668
7669 if (any_of(VL, [&SourceVectors](Value *V) {
7670 // The last InsertElement can have multiple uses.
7671 return SourceVectors.contains(V) && !V->hasOneUse();
7672 })) {
7673 assert(SLPReVec && "Only supported by REVEC.");
7674 LLVM_DEBUG(dbgs() << "SLP: Gather of insertelement vectors with "
7675 "multiple uses.\n");
7676 return TreeEntry::NeedToGather;
7677 }
7678
7679 return TreeEntry::Vectorize;
7680 }
7681 case Instruction::Load: {
7682 // Check that a vectorized load would load the same memory as a scalar
7683 // load. For example, we don't want to vectorize loads that are smaller
7684 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM
7685 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
7686 // from such a struct, we read/write packed bits disagreeing with the
7687 // unvectorized version.
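// Illustrative example (annotation, not from the original source): four
// scalar loads of i2 fields packed into a single i8 would, as a <4 x i2>
// vector load, read different bits than the scalar loads do, which is why
// such non-packed-compatible bundles end up in the Gather path below.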
7688 switch (canVectorizeLoads(VL, VL0, CurrentOrder, PointerOps)) {
7689 case LoadsState::Vectorize:
7690 return TreeEntry::Vectorize;
7691 case LoadsState::ScatterVectorize:
7692 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
7693 // Delay slow vectorized nodes for better vectorization attempts.
7694 LoadEntriesToVectorize.insert(VectorizableTree.size());
7695 return TreeEntry::NeedToGather;
7696 }
7697 return TreeEntry::ScatterVectorize;
7698 case LoadsState::StridedVectorize:
7699 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
7700 // Delay slow vectorized nodes for better vectorization attempts.
7701 LoadEntriesToVectorize.insert(VectorizableTree.size());
7702 return TreeEntry::NeedToGather;
7703 }
7704 return TreeEntry::StridedVectorize;
7705 case LoadsState::Gather:
7706#ifndef NDEBUG
7707 Type *ScalarTy = VL0->getType();
7708 if (DL->getTypeSizeInBits(ScalarTy) !=
7709 DL->getTypeAllocSizeInBits(ScalarTy))
7710 LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
7711 else if (any_of(VL, [](Value *V) {
7712 auto *LI = dyn_cast<LoadInst>(V);
7713 return !LI || !LI->isSimple();
7714 }))
7715 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
7716 else
7717 LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
7718#endif // NDEBUG
7719 registerNonVectorizableLoads(VL);
7720 return TreeEntry::NeedToGather;
7721 }
7722 llvm_unreachable("Unexpected state of loads");
7723 }
7724 case Instruction::ZExt:
7725 case Instruction::SExt:
7726 case Instruction::FPToUI:
7727 case Instruction::FPToSI:
7728 case Instruction::FPExt:
7729 case Instruction::PtrToInt:
7730 case Instruction::IntToPtr:
7731 case Instruction::SIToFP:
7732 case Instruction::UIToFP:
7733 case Instruction::Trunc:
7734 case Instruction::FPTrunc:
7735 case Instruction::BitCast: {
7736 Type *SrcTy = VL0->getOperand(0)->getType();
7737 for (Value *V : VL) {
7738 if (isa<PoisonValue>(V))
7739 continue;
7740 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
7741 if (Ty != SrcTy || !isValidElementType(Ty)) {
7742 LLVM_DEBUG(
7743 dbgs() << "SLP: Gathering casts with different src types.\n");
7744 return TreeEntry::NeedToGather;
7745 }
7746 }
7747 return TreeEntry::Vectorize;
7748 }
7749 case Instruction::ICmp:
7750 case Instruction::FCmp: {
7751 // Check that all of the compares have the same predicate.
7752 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
7753 CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
7754 Type *ComparedTy = VL0->getOperand(0)->getType();
7755 for (Value *V : VL) {
7756 if (isa<PoisonValue>(V))
7757 continue;
7758 auto *Cmp = cast<CmpInst>(V);
7759 if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
7760 Cmp->getOperand(0)->getType() != ComparedTy) {
7761 LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
7762 return TreeEntry::NeedToGather;
7763 }
7764 }
7765 return TreeEntry::Vectorize;
7766 }
7767 case Instruction::Select:
7768 case Instruction::FNeg:
7769 case Instruction::Add:
7770 case Instruction::FAdd:
7771 case Instruction::Sub:
7772 case Instruction::FSub:
7773 case Instruction::Mul:
7774 case Instruction::FMul:
7775 case Instruction::UDiv:
7776 case Instruction::SDiv:
7777 case Instruction::FDiv:
7778 case Instruction::URem:
7779 case Instruction::SRem:
7780 case Instruction::FRem:
7781 case Instruction::Shl:
7782 case Instruction::LShr:
7783 case Instruction::AShr:
7784 case Instruction::And:
7785 case Instruction::Or:
7786 case Instruction::Xor:
7787 case Instruction::Freeze:
7788 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7789 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
7790 auto *I = dyn_cast<Instruction>(V);
7791 return I && I->isBinaryOp() && !I->isFast();
7792 }))
7793 return TreeEntry::NeedToGather;
7794 return TreeEntry::Vectorize;
7795 case Instruction::GetElementPtr: {
7796 // We don't combine GEPs with complicated (nested) indexing.
7797 for (Value *V : VL) {
7798 auto *I = dyn_cast<GetElementPtrInst>(V);
7799 if (!I)
7800 continue;
7801 if (I->getNumOperands() != 2) {
7802 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
7803 return TreeEntry::NeedToGather;
7804 }
7805 }
7806
7807 // We can't combine several GEPs into one vector if they operate on
7808 // different types.
7809 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
7810 for (Value *V : VL) {
7811 auto *GEP = dyn_cast<GEPOperator>(V);
7812 if (!GEP)
7813 continue;
7814 Type *CurTy = GEP->getSourceElementType();
7815 if (Ty0 != CurTy) {
7816 LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
7817 return TreeEntry::NeedToGather;
7818 }
7819 }
7820
7821 // We don't combine GEPs with non-constant indexes.
7822 Type *Ty1 = VL0->getOperand(1)->getType();
7823 for (Value *V : VL) {
7824 auto *I = dyn_cast<GetElementPtrInst>(V);
7825 if (!I)
7826 continue;
7827 auto *Op = I->getOperand(1);
7828 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7829 (Op->getType() != Ty1 &&
7830 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(Op)) ||
7831 Op->getType()->getScalarSizeInBits() >
7832 DL->getIndexSizeInBits(
7833 V->getType()->getPointerAddressSpace())))) {
7834 LLVM_DEBUG(
7835 dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
7836 return TreeEntry::NeedToGather;
7837 }
7838 }
7839
7840 return TreeEntry::Vectorize;
7841 }
7842 case Instruction::Store: {
7843 // Check if the stores are consecutive or if we need to swizzle them.
7844 llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
7845 // Avoid types that are padded when being allocated as scalars, while
7846 // being packed together in a vector (such as i1).
7847 if (DL->getTypeSizeInBits(ScalarTy) !=
7848 DL->getTypeAllocSizeInBits(ScalarTy)) {
7849 LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
7850 return TreeEntry::NeedToGather;
7851 }
7852 // Make sure all stores in the bundle are simple - we can't vectorize
7853 // atomic or volatile stores.
7854 for (Value *V : VL) {
7855 auto *SI = cast<StoreInst>(V);
7856 if (!SI->isSimple()) {
7857 LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
7858 return TreeEntry::NeedToGather;
7859 }
7860 PointerOps.push_back(SI->getPointerOperand());
7861 }
7862
7863 // Check the order of pointer operands.
7864 if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
7865 Value *Ptr0;
7866 Value *PtrN;
7867 if (CurrentOrder.empty()) {
7868 Ptr0 = PointerOps.front();
7869 PtrN = PointerOps.back();
7870 } else {
7871 Ptr0 = PointerOps[CurrentOrder.front()];
7872 PtrN = PointerOps[CurrentOrder.back()];
7873 }
7874 std::optional<int> Dist =
7875 getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
7876 // Check that the sorted pointer operands are consecutive.
7877 if (static_cast<unsigned>(*Dist) == VL.size() - 1)
7878 return TreeEntry::Vectorize;
7879 }
7880
7881 LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
7882 return TreeEntry::NeedToGather;
7883 }
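// Illustrative example (annotation): for a bundle of i32 stores to %p, %p+1,
// %p+2 and %p+3, sortPtrAccesses() records the sorted order and
// getPointersDiff() between the first and last pointer returns 3, i.e.
// VL.size() - 1, so the bundle is classified as one consecutive vector store.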
7884 case Instruction::Call: {
7885 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7886 TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
7887 auto *I = dyn_cast<Instruction>(V);
7888 return I && !I->isFast();
7889 }))
7890 return TreeEntry::NeedToGather;
7891 // Check if the calls are all to the same vectorizable intrinsic or
7892 // library function.
7893 CallInst *CI = cast<CallInst>(VL0);
7894 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7895
7896 VFShape Shape = VFShape::get(
7897 CI->getFunctionType(),
7898 ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
7899 false /*HasGlobalPred*/);
7900 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
7901
7902 if (!VecFunc && !isTriviallyVectorizable(ID)) {
7903 LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
7904 return TreeEntry::NeedToGather;
7905 }
7906 Function *F = CI->getCalledFunction();
7907 unsigned NumArgs = CI->arg_size();
7908 SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
7909 for (unsigned J = 0; J != NumArgs; ++J)
7910 if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
7911 ScalarArgs[J] = CI->getArgOperand(J);
7912 for (Value *V : VL) {
7913 CallInst *CI2 = dyn_cast<CallInst>(V);
7914 if (!CI2 || CI2->getCalledFunction() != F ||
7915 getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
7916 (VecFunc &&
7917 VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
7918 !CI->hasIdenticalOperandBundleSchema(*CI2)) {
7919 LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
7920 << "\n");
7921 return TreeEntry::NeedToGather;
7922 }
7923 // Some intrinsics have scalar arguments, and those must be the same across
7924 // the bundle for the calls to be vectorized.
7925 for (unsigned J = 0; J != NumArgs; ++J) {
7926 if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) {
7927 Value *A1J = CI2->getArgOperand(J);
7928 if (ScalarArgs[J] != A1J) {
7929 LLVM_DEBUG(dbgs()
7930 << "SLP: mismatched arguments in call:" << *CI
7931 << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
7932 return TreeEntry::NeedToGather;
7933 }
7934 }
7935 }
7936 // Verify that the bundle operands are identical between the two calls.
7937 if (CI->hasOperandBundles() &&
7938 !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
7939 CI->op_begin() + CI->getBundleOperandsEndIndex(),
7940 CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
7941 LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
7942 << "!=" << *V << '\n');
7943 return TreeEntry::NeedToGather;
7944 }
7945 }
7946
7947 return TreeEntry::Vectorize;
7948 }
7949 case Instruction::ShuffleVector: {
7950 if (!S.isAltShuffle()) {
7951 // REVEC can support a non-alternate shuffle.
7952 if (SLPReVec && getShufflevectorNumGroups(VL))
7953 return TreeEntry::Vectorize;
7954 // If this is not an alternate sequence of opcodes like add-sub,
7955 // then do not vectorize this instruction.
7956 LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
7957 return TreeEntry::NeedToGather;
7958 }
7959 if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
7960 LLVM_DEBUG(
7961 dbgs()
7962 << "SLP: ShuffleVector not vectorized, operands are buildvector and "
7963 "the whole alt sequence is not profitable.\n");
7964 return TreeEntry::NeedToGather;
7965 }
7966
7967 return TreeEntry::Vectorize;
7968 }
7969 default:
7970 LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
7971 return TreeEntry::NeedToGather;
7972 }
7973}
7974
7975namespace {
7976/// Correctly handles operands of PHI nodes, based on the \p Main
7977/// PHINode's order of incoming basic blocks/values.
7978class PHIHandler {
7979 DominatorTree &DT;
7980 PHINode *Main = nullptr;
7981 SmallVector<Value *> Phis;
7982 SmallVector<SmallVector<Value *>> Operands;
7983
7984public:
7985 PHIHandler() = delete;
7986 PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
7987 : DT(DT), Main(Main), Phis(Phis),
7988 Operands(Main->getNumIncomingValues(),
7989 SmallVector<Value *>(Phis.size(), nullptr)) {}
7990 void buildOperands() {
7991 constexpr unsigned FastLimit = 4;
7992 if (Main->getNumIncomingValues() <= FastLimit) {
7993 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
7994 BasicBlock *InBB = Main->getIncomingBlock(I);
7995 if (!DT.isReachableFromEntry(InBB)) {
7996 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
7997 continue;
7998 }
7999 // Prepare the operand vector.
8000 for (auto [Idx, V] : enumerate(Phis)) {
8001 auto *P = dyn_cast<PHINode>(V);
8002 if (!P) {
8003 assert(isa<PoisonValue>(V) &&
8004 "Expected isa instruction or poison value.");
8005 Operands[I][Idx] = V;
8006 continue;
8007 }
8008 if (P->getIncomingBlock(I) == InBB)
8009 Operands[I][Idx] = P->getIncomingValue(I);
8010 else
8011 Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
8012 }
8013 }
8014 return;
8015 }
8016 SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
8017 for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
8018 BasicBlock *InBB = Main->getIncomingBlock(I);
8019 if (!DT.isReachableFromEntry(InBB)) {
8020 Operands[I].assign(Phis.size(), PoisonValue::get(Main->getType()));
8021 continue;
8022 }
8023 Blocks.try_emplace(InBB).first->second.push_back(I);
8024 }
8025 for (auto [Idx, V] : enumerate(Phis)) {
8026 if (isa<PoisonValue>(V)) {
8027 for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
8028 Operands[I][Idx] = V;
8029 continue;
8030 }
8031 auto *P = cast<PHINode>(V);
8032 for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
8033 BasicBlock *InBB = P->getIncomingBlock(I);
8034 if (InBB == Main->getIncomingBlock(I)) {
8035 if (isa_and_nonnull<PoisonValue>(Operands[I][Idx]))
8036 continue;
8037 Operands[I][Idx] = P->getIncomingValue(I);
8038 continue;
8039 }
8040 auto It = Blocks.find(InBB);
8041 if (It == Blocks.end())
8042 continue;
8043 Operands[It->second.front()][Idx] = P->getIncomingValue(I);
8044 }
8045 }
8046 for (const auto &P : Blocks) {
8047 if (P.getSecond().size() <= 1)
8048 continue;
8049 unsigned BasicI = P.getSecond().front();
8050 for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
8051 assert(all_of(enumerate(Operands[I]),
8052 [&](const auto &Data) {
8053 return !Data.value() ||
8054 Data.value() == Operands[BasicI][Data.index()];
8055 }) &&
8056 "Expected empty operands list.");
8057 Operands[I] = Operands[BasicI];
8058 }
8059 }
8060 }
8061 ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
8062};
8063} // namespace
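// Illustrative example (annotation): PHIHandler groups incoming values by
// predecessor block rather than by operand position. Given
//   %p0 = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]
//   %p1 = phi i32 [ %d, %bb2 ], [ %c, %bb1 ]
// getOperands(0) yields {%a, %c} (the %bb1 values) and getOperands(1) yields
// {%b, %d}, even though the two PHIs list their incoming edges in different
// orders.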
8064
8065void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
8066 const EdgeInfo &UserTreeIdx,
8067 unsigned InterleaveFactor) {
8068 assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
8069
8070 SmallVector<int> ReuseShuffleIndices;
8071 SmallVector<Value *> UniqueValues;
8072 SmallVector<Value *> NonUniqueValueVL;
8073 auto TryToFindDuplicates = [&](const InstructionsState &S,
8074 bool DoNotFail = false) {
8075 // Check that every instruction appears once in this bundle.
8076 SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
8077 for (Value *V : VL) {
8078 if (isConstant(V)) {
8079 ReuseShuffleIndices.emplace_back(
8080 isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
8081 UniqueValues.emplace_back(V);
8082 continue;
8083 }
8084 auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
8085 ReuseShuffleIndices.emplace_back(Res.first->second);
8086 if (Res.second)
8087 UniqueValues.emplace_back(V);
8088 }
8089 size_t NumUniqueScalarValues = UniqueValues.size();
8090 bool IsFullVectors = hasFullVectorsOrPowerOf2(
8091 *TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
8092 if (NumUniqueScalarValues == VL.size() &&
8093 (VectorizeNonPowerOf2 || IsFullVectors)) {
8094 ReuseShuffleIndices.clear();
8095 } else {
8096 // FIXME: Reshuffling scalars is not supported yet for non-power-of-2 ops.
8097 if ((UserTreeIdx.UserTE &&
8098 UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
8099 !hasFullVectorsOrPowerOf2(*TTI, VL.front()->getType(), VL.size())) {
8100 LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
8101 "for nodes with padding.\n");
8102 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8103 return false;
8104 }
8105 LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
8106 if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
8107 (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
8108 return isa<UndefValue>(V) || !isConstant(V);
8109 }))) {
8110 if (DoNotFail && UniquePositions.size() > 1 &&
8111 NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
8112 all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
8113 // Find the number of elements, which forms full vectors.
8114 unsigned PWSz = getFullVectorNumberOfElements(
8115 *TTI, UniqueValues.front()->getType(), UniqueValues.size());
8116 if (PWSz == VL.size()) {
8117 ReuseShuffleIndices.clear();
8118 } else {
8119 NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
8120 NonUniqueValueVL.append(
8121 PWSz - UniqueValues.size(),
8122 PoisonValue::get(UniqueValues.front()->getType()));
8123 // Check that extended with poisons operations are still valid for
8124 // vectorization (div/rem are not allowed).
8125 if (!getSameOpcode(NonUniqueValueVL, *TLI).valid()) {
8126 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
8127 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8128 return false;
8129 }
8130 VL = NonUniqueValueVL;
8131 }
8132 return true;
8133 }
8134 LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
8135 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8136 return false;
8137 }
8138 VL = UniqueValues;
8139 }
8140 return true;
8141 };
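// Illustrative example (annotation): for VL = {%a, %b, %a, %c} the lambda
// above produces UniqueValues = {%a, %b, %c} and ReuseShuffleIndices =
// {0, 1, 0, 2}; the tree node is built from the unique scalars and the reuse
// mask later re-expands the vectorized result to the original lane order.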
8142
8143 InstructionsState S = getSameOpcode(VL, *TLI);
8144
8145 // Don't go into catchswitch blocks, which can happen with PHIs.
8146 // Such blocks can only have PHIs and the catchswitch. There is no
8147 // place to insert a shuffle if we need to, so just avoid that issue.
8148 if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
8149 LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
8150 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8151 return;
8152 }
8153
8154 // Check if this is a duplicate of another entry.
8155 if (S) {
8156 if (TreeEntry *E = getTreeEntry(S.getMainOp())) {
8157 LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp()
8158 << ".\n");
8159 if (GatheredLoadsEntriesFirst.has_value() || !E->isSame(VL)) {
8160 auto It = MultiNodeScalars.find(S.getMainOp());
8161 if (It != MultiNodeScalars.end()) {
8162 auto *TEIt = find_if(It->getSecond(),
8163 [&](TreeEntry *ME) { return ME->isSame(VL); });
8164 if (TEIt != It->getSecond().end())
8165 E = *TEIt;
8166 else
8167 E = nullptr;
8168 } else {
8169 E = nullptr;
8170 }
8171 }
8172 if (!E) {
8173 if (!doesNotNeedToBeScheduled(S.getMainOp())) {
8174 LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
8175 if (TryToFindDuplicates(S))
8176 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8177 ReuseShuffleIndices);
8178 return;
8179 }
8180 SmallPtrSet<const TreeEntry *, 4> Nodes;
8181 Nodes.insert(getTreeEntry(S.getMainOp()));
8182 for (const TreeEntry *E : MultiNodeScalars.lookup(S.getMainOp()))
8183 Nodes.insert(E);
8184 SmallPtrSet<Value *, 8> Values(VL.begin(), VL.end());
8185 if (any_of(Nodes, [&](const TreeEntry *E) {
8186 if (all_of(E->Scalars,
8187 [&](Value *V) { return Values.contains(V); }))
8188 return true;
8189 SmallPtrSet<Value *, 8> EValues(E->Scalars.begin(),
8190 E->Scalars.end());
8191 return (
8192 all_of(VL, [&](Value *V) { return EValues.contains(V); }));
8193 })) {
8194 LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
8195 if (TryToFindDuplicates(S))
8196 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8197 ReuseShuffleIndices);
8198 return;
8199 }
8200 } else {
8201 // Record the reuse of the tree node. FIXME, currently this is only
8202 // used to properly draw the graph rather than for the actual
8203 // vectorization.
8204 E->UserTreeIndices.push_back(UserTreeIdx);
8205 LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
8206 << ".\n");
8207 return;
8208 }
8209 }
8210 }
8211
8212 // Gather if we hit the RecursionMaxDepth, unless this is a load (or z/sext of
8213 // a load), in which case peek through to include it in the tree, without
8214 // ballooning over-budget.
8215 if (Depth >= RecursionMaxDepth &&
8216 !(S && !S.isAltShuffle() && VL.size() >= 4 &&
8217 (match(S.getMainOp(), m_Load(m_Value())) ||
8218 all_of(VL, [&S](const Value *I) {
8219 return match(I,
8220 m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
8221 cast<Instruction>(I)->getOpcode() == S.getOpcode();
8222 })))) {
8223 LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
8224 if (TryToFindDuplicates(S))
8225 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8226 ReuseShuffleIndices);
8227 return;
8228 }
8229
8230 // Don't handle scalable vectors
8231 if (S && S.getOpcode() == Instruction::ExtractElement &&
8232 isa<ScalableVectorType>(
8233 cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
8234 LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
8235 if (TryToFindDuplicates(S))
8236 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8237 ReuseShuffleIndices);
8238 return;
8239 }
8240
8241 // Don't handle vectors.
8242 if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
8243 LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
8244 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8245 return;
8246 }
8247
8248 // If all of the operands are identical or constant, we have a simple solution.
8249 // If we deal with insert/extract instructions, they all must have constant
8250 // indices; otherwise we should gather them, not try to vectorize.
8251 // If this is an alternate-opcode node with 2 elements and gathered operands,
8252 // do not vectorize it.
8253 auto &&NotProfitableForVectorization = [&S, this,
8254 Depth](ArrayRef<Value *> VL) {
8255 if (!S || !S.isAltShuffle() || VL.size() > 2)
8256 return false;
8257 if (VectorizableTree.size() < MinTreeSize)
8258 return false;
8259 if (Depth >= RecursionMaxDepth - 1)
8260 return true;
8261 // Check if all operands are extracts, part of vector node or can build a
8262 // regular vectorize node.
8263 SmallVector<unsigned, 8> InstsCount;
8264 for (Value *V : VL) {
8265 auto *I = cast<Instruction>(V);
8266 InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
8267 return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
8268 }));
8269 }
8270 bool IsCommutative =
8271 isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
8272 if ((IsCommutative &&
8273 std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
8274 (!IsCommutative &&
8275 all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
8276 return true;
8277 assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
8278 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
8279 auto *I1 = cast<Instruction>(VL.front());
8280 auto *I2 = cast<Instruction>(VL.back());
8281 for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
8282 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
8283 I2->getOperand(Op));
8284 if (static_cast<unsigned>(count_if(
8285 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
8286 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
8287 })) >= S.getMainOp()->getNumOperands() / 2)
8288 return false;
8289 if (S.getMainOp()->getNumOperands() > 2)
8290 return true;
8291 if (IsCommutative) {
8292 // Check permuted operands.
8293 Candidates.clear();
8294 for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
8295 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
8296 I2->getOperand((Op + 1) % E));
8297 if (any_of(
8298 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
8299 return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
8300 }))
8301 return false;
8302 }
8303 return true;
8304 };
8305 SmallVector<unsigned> SortedIndices;
8306 BasicBlock *BB = nullptr;
8307 bool IsScatterVectorizeUserTE =
8308 UserTreeIdx.UserTE &&
8309 UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
8310 bool AreAllSameBlock = S && allSameBlock(VL);
8311 bool AreScatterAllGEPSameBlock =
8312 (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
8313 VL.size() > 2 &&
8314 all_of(VL,
8315 [&BB](Value *V) {
8316 auto *I = dyn_cast<GetElementPtrInst>(V);
8317 if (!I)
8318 return doesNotNeedToBeScheduled(V);
8319 if (!BB)
8320 BB = I->getParent();
8321 return BB == I->getParent() && I->getNumOperands() == 2;
8322 }) &&
8323 BB &&
8324 sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
8325 SortedIndices));
8326 bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
8327 if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
8328 (S &&
8329 isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
8330 S.getMainOp()) &&
8331 !all_of(VL, isVectorLikeInstWithConstOps)) ||
8332 NotProfitableForVectorization(VL)) {
8333 LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
8334 if (TryToFindDuplicates(S))
8335 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8336 ReuseShuffleIndices);
8337 return;
8338 }
8339
8340 // Don't vectorize ephemeral values.
8341 if (S && !EphValues.empty()) {
8342 for (Value *V : VL) {
8343 if (EphValues.count(V)) {
8344 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
8345 << ") is ephemeral.\n");
8346 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8347 return;
8348 }
8349 }
8350 }
8351
8352 // We now know that this is a vector of instructions of the same type from
8353 // the same block.
8354
8355 // Check that none of the instructions in the bundle are already in the tree.
8356 for (Value *V : VL) {
8357 if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
8358 doesNotNeedToBeScheduled(V))
8359 continue;
8360 if (getTreeEntry(V)) {
8361 LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
8362 << ") is already in tree.\n");
8363 if (TryToFindDuplicates(S))
8364 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8365 ReuseShuffleIndices);
8366 return;
8367 }
8368 }
8369
8370 // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
8371 if (UserIgnoreList && !UserIgnoreList->empty()) {
8372 for (Value *V : VL) {
8373 if (UserIgnoreList->contains(V)) {
8374 LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
8375 if (TryToFindDuplicates(S))
8376 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8377 ReuseShuffleIndices);
8378 return;
8379 }
8380 }
8381 }
8382
8383 // Special processing for sorted pointers for a ScatterVectorize node with
8384 // constant indices only.
8385 if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
8386 assert(VL.front()->getType()->isPointerTy() &&
8387 count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
8388 "Expected pointers only.");
8389 // Reset S to make it GetElementPtr kind of node.
8390 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
8391 assert(It != VL.end() && "Expected at least one GEP.");
8392 S = getSameOpcode(*It, *TLI);
8393 }
8394
8395 // Check that all of the users of the scalars that we want to vectorize are
8396 // schedulable.
8397 Instruction *VL0 = S.getMainOp();
8398 BB = VL0->getParent();
8399
8400 if (S &&
8401 (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()) ||
8402 !DT->isReachableFromEntry(BB))) {
8403 // Don't go into unreachable blocks. They may contain instructions with
8404 // dependency cycles which confuse the final scheduling.
8405 // Do not vectorize EH and non-returning blocks, not profitable in most
8406 // cases.
8407 LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
8408 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
8409 return;
8410 }
8411
8412 // Check that every instruction appears once in this bundle.
8413 if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
8414 return;
8415
8416 // Perform specific checks for each particular instruction kind.
8417 OrdersType CurrentOrder;
8418 SmallVector<Value *> PointerOps;
8419 TreeEntry::EntryState State = getScalarsVectorizationState(
8420 S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
8421 if (State == TreeEntry::NeedToGather) {
8422 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8423 ReuseShuffleIndices);
8424 return;
8425 }
8426
8427 auto &BSRef = BlocksSchedules[BB];
8428 if (!BSRef)
8429 BSRef = std::make_unique<BlockScheduling>(BB);
8430
8431 BlockScheduling &BS = *BSRef;
8432
8433 std::optional<ScheduleData *> Bundle =
8434 BS.tryScheduleBundle(UniqueValues, this, S);
8435#ifdef EXPENSIVE_CHECKS
8436 // Make sure we didn't break any internal invariants
8437 BS.verify();
8438#endif
8439 if (!Bundle) {
8440 LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
8441 assert((!BS.getScheduleData(VL0) ||
8442 !BS.getScheduleData(VL0)->isPartOfBundle()) &&
8443 "tryScheduleBundle should cancelScheduling on failure");
8444 newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
8445 ReuseShuffleIndices);
8446 NonScheduledFirst.insert(VL.front());
8447 if (S.getOpcode() == Instruction::Load &&
8448 BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
8449 registerNonVectorizableLoads(VL);
8450 return;
8451 }
8452 LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
8453
8454 unsigned ShuffleOrOp =
8455 S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
8456 auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
8457 // Postpone the creation of PHI operand nodes.
8458 SmallVector<unsigned> PHIOps;
8459 for (unsigned I : seq<unsigned>(Operands.size())) {
8460 ArrayRef<Value *> Op = Operands[I];
8461 if (Op.empty())
8462 continue;
8463 InstructionsState S = getSameOpcode(Op, *TLI);
8464 if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
8465 buildTree_rec(Op, Depth + 1, {TE, I});
8466 else
8467 PHIOps.push_back(I);
8468 }
8469 for (unsigned I : PHIOps)
8470 buildTree_rec(Operands[I], Depth + 1, {TE, I});
8471 };
8472 switch (ShuffleOrOp) {
8473 case Instruction::PHI: {
8474 auto *PH = cast<PHINode>(VL0);
8475
8476 TreeEntry *TE =
8477 newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
8478 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
8479 TE->dump());
8480
8481 // Keeps the reordered operands to avoid code duplication.
8482 PHIHandler Handler(*DT, PH, VL);
8483 Handler.buildOperands();
8484 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
8485 TE->setOperand(I, Handler.getOperands(I));
8486 SmallVector<ArrayRef<Value *>> Operands(PH->getNumOperands());
8487 for (unsigned I : seq<unsigned>(PH->getNumOperands()))
8488 Operands[I] = Handler.getOperands(I);
8489 CreateOperandNodes(TE, Operands);
8490 return;
8491 }
8492 case Instruction::ExtractValue:
8493 case Instruction::ExtractElement: {
8494 if (CurrentOrder.empty()) {
8495 LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
8496 } else {
8497 LLVM_DEBUG({
8498 dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
8499 "with order";
8500 for (unsigned Idx : CurrentOrder)
8501 dbgs() << " " << Idx;
8502 dbgs() << "\n";
8503 });
8504 fixupOrderingIndices(CurrentOrder);
8505 }
8506 // Insert new order with initial value 0, if it does not exist,
8507 // otherwise return the iterator to the existing one.
8508 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8509 ReuseShuffleIndices, CurrentOrder);
8510 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
8511 "(ExtractValueInst/ExtractElementInst).\n";
8512 TE->dump());
8513 // This is a special case, as it does not gather, but at the same time
8514 // we are not extending buildTree_rec() towards the operands.
8515 TE->setOperand(*this);
8516 return;
8517 }
8518 case Instruction::InsertElement: {
8519 assert(ReuseShuffleIndices.empty() && "All inserts should be unique");
8520
8521 auto OrdCompare = [](const std::pair<int, int> &P1,
8522 const std::pair<int, int> &P2) {
8523 return P1.first > P2.first;
8524 };
8525 PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
8526 decltype(OrdCompare)>
8527 Indices(OrdCompare);
8528 for (int I = 0, E = VL.size(); I < E; ++I) {
8529 unsigned Idx = *getElementIndex(VL[I]);
8530 Indices.emplace(Idx, I);
8531 }
8532 OrdersType CurrentOrder(VL.size(), VL.size());
8533 bool IsIdentity = true;
8534 for (int I = 0, E = VL.size(); I < E; ++I) {
8535 CurrentOrder[Indices.top().second] = I;
8536 IsIdentity &= Indices.top().second == I;
8537 Indices.pop();
8538 }
8539 if (IsIdentity)
8540 CurrentOrder.clear();
8541 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8542 {}, CurrentOrder);
8543 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
8544 TE->dump());
8545
8546 TE->setOperand(*this);
8547 buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
8548 return;
8549 }
8550 case Instruction::Load: {
8551 // Check that a vectorized load would load the same memory as a scalar
8552 // load. For example, we don't want to vectorize loads that are smaller
8553 // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>}, LLVM
8554 // treats loading/storing it as an i8 struct. If we vectorize loads/stores
8555 // from such a struct, we read/write packed bits disagreeing with the
8556 // unvectorized version.
8557 TreeEntry *TE = nullptr;
8558 fixupOrderingIndices(CurrentOrder);
8559 switch (State) {
8560 case TreeEntry::Vectorize:
8561 TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8562 ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
8563 if (CurrentOrder.empty())
8564 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
8565 TE->dump());
8566 else
8567 LLVM_DEBUG(dbgs()
8568 << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
8569 TE->dump());
8570 break;
8571 case TreeEntry::StridedVectorize:
8572 // Vectorizing non-consecutive loads with a constant stride.
8573 TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
8574 UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
8575 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
8576 TE->dump());
8577 break;
8578 case TreeEntry::ScatterVectorize:
8579 // Vectorizing non-consecutive loads with `llvm.masked.gather`.
8580 TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
8581 UserTreeIdx, ReuseShuffleIndices);
8582 LLVM_DEBUG(
8583 dbgs()
8584 << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
8585 TE->dump());
8586 break;
8587 case TreeEntry::CombinedVectorize:
8588 case TreeEntry::NeedToGather:
8589 llvm_unreachable("Unexpected loads state.");
8590 }
8591 TE->setOperand(*this);
8592 if (State == TreeEntry::ScatterVectorize)
8593 buildTree_rec(PointerOps, Depth + 1, {TE, 0});
8594 return;
8595 }
8596 case Instruction::ZExt:
8597 case Instruction::SExt:
8598 case Instruction::FPToUI:
8599 case Instruction::FPToSI:
8600 case Instruction::FPExt:
8601 case Instruction::PtrToInt:
8602 case Instruction::IntToPtr:
8603 case Instruction::SIToFP:
8604 case Instruction::UIToFP:
8605 case Instruction::Trunc:
8606 case Instruction::FPTrunc:
8607 case Instruction::BitCast: {
8608 auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
8609 std::make_pair(std::numeric_limits<unsigned>::min(),
8610 std::numeric_limits<unsigned>::max()));
8611 if (ShuffleOrOp == Instruction::ZExt ||
8612 ShuffleOrOp == Instruction::SExt) {
8613 CastMaxMinBWSizes = std::make_pair(
8614 std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8615 PrevMaxBW),
8616 std::min<unsigned>(
8617 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8618 PrevMinBW));
8619 } else if (ShuffleOrOp == Instruction::Trunc) {
8620 CastMaxMinBWSizes = std::make_pair(
8621 std::max<unsigned>(
8622 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
8623 PrevMaxBW),
8624 std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
8625 PrevMinBW));
8626 }
8627 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8628 ReuseShuffleIndices);
8629 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
8630 TE->dump());
8631
8632 TE->setOperand(*this);
8633 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8634 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8635 if (ShuffleOrOp == Instruction::Trunc) {
8636 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8637 } else if (ShuffleOrOp == Instruction::SIToFP ||
8638 ShuffleOrOp == Instruction::UIToFP) {
8639 unsigned NumSignBits =
8640 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
8641 if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
8642 APInt Mask = DB->getDemandedBits(OpI);
8643 NumSignBits = std::max(NumSignBits, Mask.countl_zero());
8644 }
8645 if (NumSignBits * 2 >=
8646 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8647 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8648 }
8649 return;
8650 }
8651 case Instruction::ICmp:
8652 case Instruction::FCmp: {
8653 // Check that all of the compares have the same predicate.
8654 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
8655 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8656 ReuseShuffleIndices);
8657 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
8658 TE->dump());
8659
8660 ValueList Left, Right;
8661 VLOperands Ops(VL, S, *this);
8662 if (cast<CmpInst>(VL0)->isCommutative()) {
8663 // Commutative predicate - collect + sort operands of the instructions
8664 // so that each side is more likely to have the same opcode.
8665 assert(P0 == CmpInst::getSwappedPredicate(P0) &&
8666 "Commutative Predicate mismatch");
8667 Ops.reorder();
8668 Left = Ops.getVL(0);
8669 Right = Ops.getVL(1);
8670 } else {
8671 // Collect operands - commute if it uses the swapped predicate.
8672 for (Value *V : VL) {
8673 if (isa<PoisonValue>(V)) {
8674 Left.push_back(PoisonValue::get(VL0->getOperand(0)->getType()));
8675 Right.push_back(PoisonValue::get(VL0->getOperand(1)->getType()));
8676 continue;
8677 }
8678 auto *Cmp = cast<CmpInst>(V);
8679 Value *LHS = Cmp->getOperand(0);
8680 Value *RHS = Cmp->getOperand(1);
8681 if (Cmp->getPredicate() != P0)
8682 std::swap(LHS, RHS);
8683 Left.push_back(LHS);
8684 Right.push_back(RHS);
8685 }
8686 }
8687 TE->setOperand(0, Left);
8688 TE->setOperand(1, Right);
8689 buildTree_rec(Left, Depth + 1, {TE, 0});
8690 buildTree_rec(Right, Depth + 1, {TE, 1});
8691 if (ShuffleOrOp == Instruction::ICmp) {
8692 unsigned NumSignBits0 =
8693 ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
8694 if (NumSignBits0 * 2 >=
8695 DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
8696 ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
8697 unsigned NumSignBits1 =
8698 ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
8699 if (NumSignBits1 * 2 >=
8700 DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
8701 ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
8702 }
8703 return;
8704 }
8705 case Instruction::Select:
8706 case Instruction::FNeg:
8707 case Instruction::Add:
8708 case Instruction::FAdd:
8709 case Instruction::Sub:
8710 case Instruction::FSub:
8711 case Instruction::Mul:
8712 case Instruction::FMul:
8713 case Instruction::UDiv:
8714 case Instruction::SDiv:
8715 case Instruction::FDiv:
8716 case Instruction::URem:
8717 case Instruction::SRem:
8718 case Instruction::FRem:
8719 case Instruction::Shl:
8720 case Instruction::LShr:
8721 case Instruction::AShr:
8722 case Instruction::And:
8723 case Instruction::Or:
8724 case Instruction::Xor:
8725 case Instruction::Freeze: {
8726 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8727 ReuseShuffleIndices);
8728 LLVM_DEBUG(
8729 dbgs() << "SLP: added a new TreeEntry "
8730 "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
8731 TE->dump());
8732
8733 TE->setOperand(*this, isa<BinaryOperator>(VL0) && isCommutative(VL0));
8734 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8735 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8736 return;
8737 }
8738 case Instruction::GetElementPtr: {
8739 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8740 ReuseShuffleIndices);
8741 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
8742 TE->dump());
8743 SmallVector<ValueList, 2> Operands(2);
8744 // Prepare the operand vector for pointer operands.
8745 for (Value *V : VL) {
8746 auto *GEP = dyn_cast<GetElementPtrInst>(V);
8747 if (!GEP) {
8748 Operands.front().push_back(V);
8749 continue;
8750 }
8751 Operands.front().push_back(GEP->getPointerOperand());
8752 }
8753 TE->setOperand(0, Operands.front());
8754 // Need to cast all indices to the same type before vectorization to
8755 // avoid a crash.
8756 // Required to be able to find correct matches between different gather
8757 // nodes and reuse the vectorized values rather than trying to gather them
8758 // again.
8759 int IndexIdx = 1;
8760 Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
8761 Type *Ty = all_of(VL,
8762 [VL0Ty, IndexIdx](Value *V) {
8763 auto *GEP = dyn_cast<GetElementPtrInst>(V);
8764 if (!GEP)
8765 return true;
8766 return VL0Ty == GEP->getOperand(IndexIdx)->getType();
8767 })
8768 ? VL0Ty
8769 : DL->getIndexType(cast<GetElementPtrInst>(VL0)
8770 ->getPointerOperandType()
8771 ->getScalarType());
8772 // Prepare the operand vector.
8773 for (Value *V : VL) {
8774 auto *I = dyn_cast<GetElementPtrInst>(V);
8775 if (!I) {
8776 Operands.back().push_back(
8777 ConstantInt::get(Ty, 0, /*isSigned=*/false));
8778 continue;
8779 }
8780 auto *Op = I->getOperand(IndexIdx);
8781 auto *CI = dyn_cast<ConstantInt>(Op);
8782 if (!CI)
8783 Operands.back().push_back(Op);
8784 else
8785 Operands.back().push_back(ConstantFoldIntegerCast(
8786 CI, Ty, CI->getValue().isSignBitSet(), *DL));
8787 }
8788 TE->setOperand(IndexIdx, Operands.back());
8789
8790 for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
8791 buildTree_rec(Operands[I], Depth + 1, {TE, I});
8792 return;
8793 }
8794 case Instruction::Store: {
8795 bool Consecutive = CurrentOrder.empty();
8796 if (!Consecutive)
8797 fixupOrderingIndices(CurrentOrder);
8798 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8799 ReuseShuffleIndices, CurrentOrder);
8800 if (Consecutive)
8801 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
8802 TE->dump());
8803 else
8804 LLVM_DEBUG(
8805 dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
8806 TE->dump());
8807 TE->setOperand(*this);
8808 buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
8809 return;
8810 }
8811 case Instruction::Call: {
8812 // Check if the calls are all to the same vectorizable intrinsic or
8813 // library function.
8814 CallInst *CI = cast<CallInst>(VL0);
8815 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8816
8817 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8818 ReuseShuffleIndices);
8819 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
8820 TE->dump());
8821 TE->setOperand(*this, isCommutative(VL0));
8822 for (unsigned I : seq<unsigned>(CI->arg_size())) {
8823 // For scalar operands there is no need to create an entry, since they are
8824 // not vectorized.
8825 if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
8826 continue;
8827 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8828 }
8829 return;
8830 }
8831 case Instruction::ShuffleVector: {
8832 TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
8833 ReuseShuffleIndices);
8834 if (S.isAltShuffle()) {
8835 LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
8836 TE->dump());
8837 } else {
8838 assert(SLPReVec && "Only supported by REVEC.");
8839 LLVM_DEBUG(
8840 dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
8841 TE->dump());
8842 }
8843
8844 // Reorder operands if reordering would enable vectorization.
8845 auto *CI = dyn_cast<CmpInst>(VL0);
8846 if (CI && any_of(VL, [](Value *V) {
8847 return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
8848 })) {
8849 auto *MainCI = cast<CmpInst>(S.getMainOp());
8850 auto *AltCI = cast<CmpInst>(S.getAltOp());
8851 CmpInst::Predicate MainP = MainCI->getPredicate();
8852 CmpInst::Predicate AltP = AltCI->getPredicate();
8853 assert(MainP != AltP &&
8854 "Expected different main/alternate predicates.");
8856 // Collect operands - commute if it uses the swapped predicate or
8857 // alternate operation.
8858 for (Value *V : VL) {
8859 if (isa<PoisonValue>(V)) {
8860 Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType()));
8861 Right.push_back(PoisonValue::get(MainCI->getOperand(1)->getType()));
8862 continue;
8863 }
8864 auto *Cmp = cast<CmpInst>(V);
8865 Value *LHS = Cmp->getOperand(0);
8866 Value *RHS = Cmp->getOperand(1);
8867
8868 if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
8869 if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8870 std::swap(LHS, RHS);
8871 } else {
8872 if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
8873 std::swap(LHS, RHS);
8874 }
8875 Left.push_back(LHS);
8876 Right.push_back(RHS);
8877 }
8878 TE->setOperand(0, Left);
8879 TE->setOperand(1, Right);
8880 buildTree_rec(Left, Depth + 1, {TE, 0});
8881 buildTree_rec(Right, Depth + 1, {TE, 1});
8882 return;
8883 }
8884
8885 TE->setOperand(*this, isa<BinaryOperator>(VL0) || CI);
8886 for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
8887 buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
8888 return;
8889 }
8890 default:
8891 break;
8892 }
8893 llvm_unreachable("Unexpected vectorization of the instructions.");
8894}
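// Illustrative sketch (annotation): for a seed bundle of four consecutive
// stores whose stored values are adds of loads, buildTree_rec() creates a
// Vectorize node for the stores, recurses into the stored values to build an
// add node, then into its operands to build load (or gather) nodes, stopping
// at constants, at already-vectorized values, or at the recursion depth limit.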
8895
8896unsigned BoUpSLP::canMapToVector(Type *T) const {
8897 unsigned N = 1;
8898 Type *EltTy = T;
8899
8900 while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
8901 if (EltTy->isEmptyTy())
8902 return 0;
8903 if (auto *ST = dyn_cast<StructType>(EltTy)) {
8904 // Check that struct is homogeneous.
8905 for (const auto *Ty : ST->elements())
8906 if (Ty != *ST->element_begin())
8907 return 0;
8908 N *= ST->getNumElements();
8909 EltTy = *ST->element_begin();
8910 } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
8911 N *= AT->getNumElements();
8912 EltTy = AT->getElementType();
8913 } else {
8914 auto *VT = cast<FixedVectorType>(EltTy);
8915 N *= VT->getNumElements();
8916 EltTy = VT->getElementType();
8917 }
8918 }
8919
8920 if (!isValidElementType(EltTy))
8921 return 0;
8922 uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
8923 if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
8924 VTSize != DL->getTypeStoreSizeInBits(T))
8925 return 0;
8926 return N;
8927}
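// Illustrative example (annotation): for T = [4 x i32], or for a homogeneous
// struct { i32, i32, i32, i32 }, the loop above computes N = 4 and the
// function returns 4 as long as <4 x i32> fits within the configured vector
// register size; a mixed struct such as { i32, float } returns 0.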
8928
8929bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
8930 SmallVectorImpl<unsigned> &CurrentOrder,
8931 bool ResizeAllowed) const {
8932 const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
8933 assert(It != VL.end() && "Expected at least one extract instruction.");
8934 auto *E0 = cast<Instruction>(*It);
8935 assert(
8936 all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
8937 "Invalid opcode");
8938 // Check if all of the extracts come from the same vector and from the
8939 // correct offset.
8940 Value *Vec = E0->getOperand(0);
8941
8942 CurrentOrder.clear();
8943
8944 // We have to extract from a vector/aggregate with the same number of elements.
8945 unsigned NElts;
8946 if (E0->getOpcode() == Instruction::ExtractValue) {
8947 NElts = canMapToVector(Vec->getType());
8948 if (!NElts)
8949 return false;
8950 // Check if load can be rewritten as load of vector.
8951 LoadInst *LI = dyn_cast<LoadInst>(Vec);
8952 if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
8953 return false;
8954 } else {
8955 NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
8956 }
8957
8958 unsigned E = VL.size();
8959 if (!ResizeAllowed && NElts != E)
8960 return false;
8961 SmallVector<int> Indices(E, PoisonMaskElem);
8962 unsigned MinIdx = NElts, MaxIdx = 0;
8963 for (auto [I, V] : enumerate(VL)) {
8964 auto *Inst = dyn_cast<Instruction>(V);
8965 if (!Inst)
8966 continue;
8967 if (Inst->getOperand(0) != Vec)
8968 return false;
8969 if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
8970 if (isa<UndefValue>(EE->getIndexOperand()))
8971 continue;
8972 std::optional<unsigned> Idx = getExtractIndex(Inst);
8973 if (!Idx)
8974 return false;
8975 const unsigned ExtIdx = *Idx;
8976 if (ExtIdx >= NElts)
8977 continue;
8978 Indices[I] = ExtIdx;
8979 if (MinIdx > ExtIdx)
8980 MinIdx = ExtIdx;
8981 if (MaxIdx < ExtIdx)
8982 MaxIdx = ExtIdx;
8983 }
8984 if (MaxIdx - MinIdx + 1 > E)
8985 return false;
8986 if (MaxIdx + 1 <= E)
8987 MinIdx = 0;
8988
8989 // Check that all of the indices extract from the correct offset.
8990 bool ShouldKeepOrder = true;
8991 // Assign to all items the initial value E so we can check if the extract
8992 // instruction index was used already.
8993 // Also, later we can check that all the indices are used and that we have
8994 // consecutive accesses in the extract instructions, by checking that no
8995 // element of CurrentOrder still has the value E.
8996 CurrentOrder.assign(E, E);
8997 for (unsigned I = 0; I < E; ++I) {
8998 if (Indices[I] == PoisonMaskElem)
8999 continue;
9000 const unsigned ExtIdx = Indices[I] - MinIdx;
9001 if (CurrentOrder[ExtIdx] != E) {
9002 CurrentOrder.clear();
9003 return false;
9004 }
9005 ShouldKeepOrder &= ExtIdx == I;
9006 CurrentOrder[ExtIdx] = I;
9007 }
9008 if (ShouldKeepOrder)
9009 CurrentOrder.clear();
9010
9011 return ShouldKeepOrder;
9012}
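// Illustrative example (annotation): for extracts of lanes 2, 3, 0, 1 of the
// same <4 x i32> vector, CurrentOrder becomes {2, 3, 0, 1} and the function
// returns false (the source vector is reusable only after reordering); for
// lanes 0, 1, 2, 3 it returns true with an empty CurrentOrder, so the source
// vector can be reused directly.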
9013
9014bool BoUpSLP::areAllUsersVectorized(
9015 Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
9016 return (I->hasOneUse() && (!VectorizedVals || VectorizedVals->contains(I))) ||
9017 all_of(I->users(), [this](User *U) {
9018 return ScalarToTreeEntry.contains(U) ||
9019 isVectorLikeInstWithConstOps(U) ||
9020 (isa<ExtractElementInst>(U) && MustGather.contains(U));
9021 });
9022}
9023
9024static std::pair<InstructionCost, InstructionCost>
9025getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
9026 TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9027 ArrayRef<Type *> ArgTys) {
9028 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9029
9030 // Calculate the cost of the scalar and vector calls.
9031 FastMathFlags FMF;
9032 if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
9033 FMF = FPCI->getFastMathFlags();
9034 SmallVector<const Value *> Arguments(CI->args());
9035 IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys, FMF,
9036 dyn_cast<IntrinsicInst>(CI));
9037 auto IntrinsicCost =
9038 TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
9039
9040 auto Shape = VFShape::get(CI->getFunctionType(),
9041 ElementCount::getFixed(VecTy->getNumElements()),
9042 false /*HasGlobalPred*/);
9043 Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
9044 auto LibCost = IntrinsicCost;
9045 if (!CI->isNoBuiltin() && VecFunc) {
9046 // Calculate the cost of the vector library call.
9047 // If the corresponding vector call is cheaper, return its cost.
9048 LibCost =
9049 TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
9050 }
9051 return {IntrinsicCost, LibCost};
9052}
9053
9054void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
9055 const function_ref<bool(Instruction *)> IsAltOp, SmallVectorImpl<int> &Mask,
9056 SmallVectorImpl<Value *> *OpScalars,
9057 SmallVectorImpl<Value *> *AltScalars) const {
9058 unsigned Sz = Scalars.size();
9059 Mask.assign(Sz, PoisonMaskElem);
9060 SmallVector<int> OrderMask;
9061 if (!ReorderIndices.empty())
9062 inversePermutation(ReorderIndices, OrderMask);
9063 for (unsigned I = 0; I < Sz; ++I) {
9064 unsigned Idx = I;
9065 if (!ReorderIndices.empty())
9066 Idx = OrderMask[I];
9067 if (isa<PoisonValue>(Scalars[Idx]))
9068 continue;
9069 auto *OpInst = cast<Instruction>(Scalars[Idx]);
9070 if (IsAltOp(OpInst)) {
9071 Mask[I] = Sz + Idx;
9072 if (AltScalars)
9073 AltScalars->push_back(OpInst);
9074 } else {
9075 Mask[I] = Idx;
9076 if (OpScalars)
9077 OpScalars->push_back(OpInst);
9078 }
9079 }
9080 if (!ReuseShuffleIndices.empty()) {
9081 SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
9082 transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
9083 return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
9084 });
9085 Mask.swap(NewMask);
9086 }
9087}
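// Illustrative example (annotation): for Scalars = {add, sub, add, sub} with
// IsAltOp matching the subs and no reordering, the mask becomes {0, 5, 2, 7}:
// lanes 0 and 2 are taken from the main-opcode vector and lanes 1 and 3 from
// the alternate vector (indices >= Sz select the second shuffle input).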
9088
9089static bool isAlternateInstruction(const Instruction *I,
9090 const Instruction *MainOp,
9091 const Instruction *AltOp,
9092 const TargetLibraryInfo &TLI) {
9093 if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
9094 auto *AltCI = cast<CmpInst>(AltOp);
9095 CmpInst::Predicate MainP = MainCI->getPredicate();
9096 CmpInst::Predicate AltP = AltCI->getPredicate();
9097 assert(MainP != AltP && "Expected different main/alternate predicates.");
9098 auto *CI = cast<CmpInst>(I);
9099 if (isCmpSameOrSwapped(MainCI, CI, TLI))
9100 return false;
9101 if (isCmpSameOrSwapped(AltCI, CI, TLI))
9102 return true;
9103 CmpInst::Predicate P = CI->getPredicate();
9104 CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);
9105
9106 assert((MainP == P || AltP == P || MainP == SwappedP || AltP == SwappedP) &&
9107 "CmpInst expected to match either main or alternate predicate or "
9108 "their swap.");
9109 (void)AltP;
9110 return MainP != P && MainP != SwappedP;
9111 }
9112 return I->getOpcode() == AltOp->getOpcode();
9113}
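// Illustrative example (annotation): with MainOp = icmp eq and AltOp =
// icmp slt, an icmp slt in the bundle is classified as the alternate
// operation (true) and an icmp eq as the main one (false); for non-compare
// bundles the classification is a plain opcode comparison against AltOp.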
9114
9115TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
9116 assert(!Ops.empty());
9117 const auto *Op0 = Ops.front();
9118
9119 const bool IsConstant = all_of(Ops, [](Value *V) {
9120 // TODO: We should allow undef elements here
9121 return isConstant(V) && !isa<UndefValue>(V);
9122 });
9123 const bool IsUniform = all_of(Ops, [=](Value *V) {
9124 // TODO: We should allow undef elements here
9125 return V == Op0;
9126 });
9127 const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
9128 // TODO: We should allow undef elements here
9129 if (auto *CI = dyn_cast<ConstantInt>(V))
9130 return CI->getValue().isPowerOf2();
9131 return false;
9132 });
9133 const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
9134 // TODO: We should allow undef elements here
9135 if (auto *CI = dyn_cast<ConstantInt>(V))
9136 return CI->getValue().isNegatedPowerOf2();
9137 return false;
9138 });
9139
9140 TTI::OperandValueKind VK = TTI::OK_AnyValue;
9141 if (IsConstant && IsUniform)
9142 VK = TTI::OK_UniformConstantValue;
9143 else if (IsConstant)
9144 VK = TTI::OK_NonUniformConstantValue;
9145 else if (IsUniform)
9146 VK = TTI::OK_UniformValue;
9147
9148 TTI::OperandValueProperties VP = TTI::OP_None;
9149 VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
9150 VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;
9151
9152 return {VK, VP};
9153}
9154
9155namespace {
9156/// The base class for shuffle instruction emission and shuffle cost estimation.
9157class BaseShuffleAnalysis {
9158protected:
9159 Type *ScalarTy = nullptr;
9160
9161 BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}
9162
9163 /// V is expected to be a vectorized value.
9164 /// When REVEC is disabled, there is no difference between VF and
9165 /// VNumElements.
9166 /// When REVEC is enabled, VF is VNumElements / ScalarTyNumElements.
9167 /// e.g., if ScalarTy is <4 x Ty> and \p V is <8 x Ty>, 2 is returned instead
9168 /// of 8.
9169 unsigned getVF(Value *V) const {
9170 assert(V && "V cannot be nullptr");
9171 assert(isa<FixedVectorType>(V->getType()) &&
9172 "V does not have FixedVectorType");
9173 assert(ScalarTy && "ScalarTy cannot be nullptr");
9174 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
9175 unsigned VNumElements =
9176 cast<FixedVectorType>(V->getType())->getNumElements();
9177 assert(VNumElements > ScalarTyNumElements &&
9178 "the number of elements of V is not large enough");
9179 assert(VNumElements % ScalarTyNumElements == 0 &&
9180 "the number of elements of V is not a vectorized value");
9181 return VNumElements / ScalarTyNumElements;
9182 }
9183
9184 /// Checks if the mask is an identity mask.
9185 /// \param IsStrict if true, the function returns false if the mask size does
9186 /// not match the vector size.
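/// E.g., mask <0, 1, 2, 3> for a 4-element vector is an identity mask in both
/// modes, while mask <0, 1> for a 4-element vector is accepted only when
/// \p IsStrict is false (it is treated as an extract of the leading
/// subvector).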
9187 static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
9188 bool IsStrict) {
9189 int Limit = Mask.size();
9190 int VF = VecTy->getNumElements();
9191 int Index = -1;
9192 if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
9193 return true;
9194 if (!IsStrict) {
9195 // Consider extract subvector starting from index 0.
9196 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
9197 Index == 0)
9198 return true;
9199 // All VF-size submasks are identity (e.g.
9200 // <poison,poison,poison,poison,0,1,2,poison,poison,1,2,3> etc. for VF 4).
9201 if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
9202 ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
9203 return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
9204 ShuffleVectorInst::isIdentityMask(Slice, VF);
9205 }))
9206 return true;
9207 }
9208 return false;
9209 }
9210
9211 /// Tries to combine 2 different masks into a single one.
9212 /// \param LocalVF Vector length of the permuted input vector. \p Mask may
9213 /// change the size of the vector, \p LocalVF is the original size of the
9214 /// shuffled vector.
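/// E.g., for \p LocalVF 4, \p Mask <1, 0, 3, 2> and \p ExtMask <2, 3, 0, 1>
/// the resulting mask is <3, 2, 1, 0>, i.e. the composition of the two
/// permutations.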
9215 static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
9216 ArrayRef<int> ExtMask) {
9217 unsigned VF = Mask.size();
9218 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
9219 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
9220 if (ExtMask[I] == PoisonMaskElem)
9221 continue;
9222 int MaskedIdx = Mask[ExtMask[I] % VF];
9223 NewMask[I] =
9224 MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
9225 }
9226 Mask.swap(NewMask);
9227 }
9228
9229 /// Looks through shuffles trying to reduce the final number of shuffles in
9230 /// the code. The function looks through the previously emitted shuffle
9231 /// instructions and properly marks indices in the mask as undef.
9232 /// For example, given the code
9233 /// \code
9234 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
9235 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
9236 /// \endcode
9237 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
9238 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
9239 /// <0, 1, 2, 3> for the shuffle.
9240 /// If 2 operands are of different size, the smallest one will be resized and
9241 /// the mask recalculated properly.
9242 /// For example, given the code
9243 /// \code
9244 /// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
9245 /// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
9246 /// \endcode
9247 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
9248 /// look through %s1 and %s2 and select vectors %0 and %1 with mask
9249 /// <0, 1, 2, 3> for the shuffle.
9250 /// So, it tries to transform permutations to simple vector merge, if
9251 /// possible.
9252 /// \param V The input vector which must be shuffled using the given \p Mask.
9253 /// If the better candidate is found, \p V is set to this best candidate
9254 /// vector.
9255 /// \param Mask The input mask for the shuffle. If the best candidate is found
9256 /// during looking-through-shuffles attempt, it is updated accordingly.
9257 /// \param SinglePermute true if the shuffle operation is originally a
9258 /// single-value-permutation. In this case the look-through-shuffles procedure
9259 /// may look for resizing shuffles as the best candidates.
9260 /// \return true if the shuffle results in the non-resizing identity shuffle
9261 /// (and thus can be ignored), false - otherwise.
9262 static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
9263 bool SinglePermute) {
9264 Value *Op = V;
9265 ShuffleVectorInst *IdentityOp = nullptr;
9266 SmallVector<int> IdentityMask;
9267 while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
9268 // Exit if not a fixed vector type or changing size shuffle.
9269 auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
9270 if (!SVTy)
9271 break;
9272 // Remember the identity or broadcast mask, if it is not a resizing
9273 // shuffle. If no better candidates are found, this Op and Mask will be
9274 // used in the final shuffle.
9275 if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
9276 if (!IdentityOp || !SinglePermute ||
9277 (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
9278 !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
9279 IdentityMask.size()))) {
9280 IdentityOp = SV;
9281 // Store the current mask in IdentityMask so that we do not lose this
9282 // info later if IdentityOp is selected as the best candidate for the
9283 // permutation.
9284 IdentityMask.assign(Mask);
9285 }
9286 }
9287 // Remember the broadcast mask. If no better candidates are found, this Op
9288 // and Mask will be used in the final shuffle.
9289 // Zero splat can be used as identity too, since it might be used with
9290 // mask <0, 1, 2, ...>, i.e. identity mask without extra reshuffling.
9291 // E.g., if we need to shuffle the vector with the mask <3, 1, 2, 0>, which
9292 // is expensive, and the analysis finds out that the source vector is just
9293 // a broadcast, the original mask can be transformed into the identity mask
9294 // <0, 1, 2, 3>.
9295 // \code
9296 // %0 = shuffle %v, poison, zeroinitializer
9297 // %res = shuffle %0, poison, <3, 1, 2, 0>
9298 // \endcode
9299 // may be transformed to
9300 // \code
9301 // %0 = shuffle %v, poison, zeroinitializer
9302 // %res = shuffle %0, poison, <0, 1, 2, 3>
9303 // \endcode
9304 if (SV->isZeroEltSplat()) {
9305 IdentityOp = SV;
9306 IdentityMask.assign(Mask);
9307 }
9308 int LocalVF = Mask.size();
9309 if (auto *SVOpTy =
9310 dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
9311 LocalVF = SVOpTy->getNumElements();
9312 SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
9313 for (auto [Idx, I] : enumerate(Mask)) {
9314 if (I == PoisonMaskElem ||
9315 static_cast<unsigned>(I) >= SV->getShuffleMask().size())
9316 continue;
9317 ExtMask[Idx] = SV->getMaskValue(I);
9318 }
9319 bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
9320 SV->getOperand(0),
9321 buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
9322 .all();
9323 bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
9324 SV->getOperand(1),
9325 buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
9326 .all();
9327 if (!IsOp1Undef && !IsOp2Undef) {
9328 // Update mask and mark undef elems.
9329 for (int &I : Mask) {
9330 if (I == PoisonMaskElem)
9331 continue;
9332 if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
9333 PoisonMaskElem)
9334 I = PoisonMaskElem;
9335 }
9336 break;
9337 }
9338 SmallVector<int> ShuffleMask(SV->getShuffleMask());
9339 combineMasks(LocalVF, ShuffleMask, Mask);
9340 Mask.swap(ShuffleMask);
9341 if (IsOp2Undef)
9342 Op = SV->getOperand(0);
9343 else
9344 Op = SV->getOperand(1);
9345 }
9346 if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
9347 !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
9348 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
9349 if (IdentityOp) {
9350 V = IdentityOp;
9351 assert(Mask.size() == IdentityMask.size() &&
9352 "Expected masks of same sizes.");
9353 // Clear known poison elements.
9354 for (auto [I, Idx] : enumerate(Mask))
9355 if (Idx == PoisonMaskElem)
9356 IdentityMask[I] = PoisonMaskElem;
9357 Mask.swap(IdentityMask);
9358 auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
9359 return SinglePermute &&
9360 (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
9361 /*IsStrict=*/true) ||
9362 (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
9363 Shuffle->isZeroEltSplat() &&
9364 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
9365 }
9366 V = Op;
9367 return false;
9368 }
9369 V = Op;
9370 return true;
9371 }
9372
9373 /// Smart shuffle instruction emission, walks through shuffles trees and
9374 /// tries to find the best matching vector for the actual shuffle
9375 /// instruction.
9376 template <typename T, typename ShuffleBuilderTy>
9377 static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
9378 ShuffleBuilderTy &Builder) {
9379 assert(V1 && "Expected at least one vector value.");
9380 if (V2)
9381 Builder.resizeToMatch(V1, V2);
9382 int VF = Mask.size();
9383 if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
9384 VF = FTy->getNumElements();
9385 if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
9386 V2, buildUseMask(VF, Mask, UseMask::SecondArg))
9387 .all()) {
9388 // Peek through shuffles.
9389 Value *Op1 = V1;
9390 Value *Op2 = V2;
9391 int VF =
9392 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
9393 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
9394 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
9395 for (int I = 0, E = Mask.size(); I < E; ++I) {
9396 if (Mask[I] < VF)
9397 CombinedMask1[I] = Mask[I];
9398 else
9399 CombinedMask2[I] = Mask[I] - VF;
9400 }
9401 Value *PrevOp1;
9402 Value *PrevOp2;
9403 do {
9404 PrevOp1 = Op1;
9405 PrevOp2 = Op2;
9406 (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
9407 (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
9408 // Check if we have 2 resizing shuffles - need to peek through operands
9409 // again.
9410 if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
9411 if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
9412 SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
9413 for (auto [Idx, I] : enumerate(CombinedMask1)) {
9414 if (I == PoisonMaskElem)
9415 continue;
9416 ExtMask1[Idx] = SV1->getMaskValue(I);
9417 }
9418 SmallBitVector UseMask1 = buildUseMask(
9419 cast<FixedVectorType>(SV1->getOperand(1)->getType())
9420 ->getNumElements(),
9421 ExtMask1, UseMask::SecondArg);
9422 SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
9423 for (auto [Idx, I] : enumerate(CombinedMask2)) {
9424 if (I == PoisonMaskElem)
9425 continue;
9426 ExtMask2[Idx] = SV2->getMaskValue(I);
9427 }
9428 SmallBitVector UseMask2 = buildUseMask(
9429 cast<FixedVectorType>(SV2->getOperand(1)->getType())
9430 ->getNumElements(),
9431 ExtMask2, UseMask::SecondArg);
9432 if (SV1->getOperand(0)->getType() ==
9433 SV2->getOperand(0)->getType() &&
9434 SV1->getOperand(0)->getType() != SV1->getType() &&
9435 isUndefVector(SV1->getOperand(1), UseMask1).all() &&
9436 isUndefVector(SV2->getOperand(1), UseMask2).all()) {
9437 Op1 = SV1->getOperand(0);
9438 Op2 = SV2->getOperand(0);
9439 SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
9440 int LocalVF = ShuffleMask1.size();
9441 if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
9442 LocalVF = FTy->getNumElements();
9443 combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
9444 CombinedMask1.swap(ShuffleMask1);
9445 SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
9446 LocalVF = ShuffleMask2.size();
9447 if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
9448 LocalVF = FTy->getNumElements();
9449 combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
9450 CombinedMask2.swap(ShuffleMask2);
9451 }
9452 }
9453 } while (PrevOp1 != Op1 || PrevOp2 != Op2);
9454 Builder.resizeToMatch(Op1, Op2);
9455 VF = std::max(cast<VectorType>(Op1->getType())
9456 ->getElementCount()
9457 .getKnownMinValue(),
9458 cast<VectorType>(Op2->getType())
9459 ->getElementCount()
9460 .getKnownMinValue());
9461 for (int I = 0, E = Mask.size(); I < E; ++I) {
9462 if (CombinedMask2[I] != PoisonMaskElem) {
9463 assert(CombinedMask1[I] == PoisonMaskElem &&
9464 "Expected undefined mask element");
9465 CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
9466 }
9467 }
9468 if (Op1 == Op2 &&
9469 (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
9470 (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
9471 isa<ShuffleVectorInst>(Op1) &&
9472 cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
9473 ArrayRef(CombinedMask1))))
9474 return Builder.createIdentity(Op1);
9475 return Builder.createShuffleVector(
9476 Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
9477 CombinedMask1);
9478 }
9479 if (isa<PoisonValue>(V1))
9480 return Builder.createPoison(
9481 cast<VectorType>(V1->getType())->getElementType(), Mask.size());
9482 SmallVector<int> NewMask(Mask);
9483 bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
9484 assert(V1 && "Expected non-null value after looking through shuffles.");
9485
9486 if (!IsIdentity)
9487 return Builder.createShuffleVector(V1, NewMask);
9488 return Builder.createIdentity(V1);
9489 }
9490};
9491} // namespace
9492
9493/// Calculate the scalar and the vector costs from vectorizing set of GEPs.
9494 static std::pair<InstructionCost, InstructionCost>
9495 getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
9496 Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
9497 Type *ScalarTy, VectorType *VecTy) {
9498 InstructionCost ScalarCost = 0;
9499 InstructionCost VecCost = 0;
9500 // Here we differentiate two cases: (1) when Ptrs represent a regular
9501 // vectorization tree node (as they are pointer arguments of scattered
9502 // loads) or (2) when Ptrs are the arguments of loads or stores being
9503 // vectorized as a plain wide unit-stride load/store since all the
9504 // loads/stores are known to be from/to adjacent locations.
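 // E.g., for four consecutive scalar loads from %p, %p+1, %p+2 and %p+3
 // vectorized into a single wide load, only the base pointer %p feeds the
 // vector code, so the scalar GEPs of the other lanes can usually be removed
 // (unless they have uses outside of the vectorizable tree).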
9505 if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
9506 // Case 2: estimate costs for pointer related costs when vectorizing to
9507 // a wide load/store.
9508 // Scalar cost is estimated as a set of pointers with known relationship
9509 // between them.
9510 // For vector code we will use BasePtr as argument for the wide load/store
9511 // but we also need to account all the instructions which are going to
9512 // stay in vectorized code due to uses outside of these scalar
9513 // loads/stores.
9514 ScalarCost = TTI.getPointersChainCost(
9515 Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
9516 CostKind);
9517
9518 SmallVector<const Value *> PtrsRetainedInVecCode;
9519 for (Value *V : Ptrs) {
9520 if (V == BasePtr) {
9521 PtrsRetainedInVecCode.push_back(V);
9522 continue;
9523 }
9524 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
9525 // For simplicity assume Ptr stays in the vectorized code if it's not a
9526 // GEP instruction. We don't care since its cost is considered free.
9527 // TODO: We should check for any uses outside of vectorizable tree
9528 // rather than just single use.
9529 if (!Ptr || !Ptr->hasOneUse())
9530 PtrsRetainedInVecCode.push_back(V);
9531 }
9532
9533 if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
9534 // If all pointers stay in vectorized code then we don't have
9535 // any savings on that.
9536 return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
9537 }
9538 VecCost = TTI.getPointersChainCost(PtrsRetainedInVecCode, BasePtr,
9539 TTI::PointersChainInfo::getKnownStride(),
9540 VecTy, CostKind);
9541 } else {
9542 // Case 1: Ptrs are the arguments of loads that we are going to transform
9543 // into masked gather load intrinsic.
9544 // All the scalar GEPs will be removed as a result of vectorization.
9545 // For any external uses of some lanes extract element instructions will
9546 // be generated (which cost is estimated separately).
9547 TTI::PointersChainInfo PtrsInfo =
9548 all_of(Ptrs,
9549 [](const Value *V) {
9550 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
9551 return Ptr && !Ptr->hasAllConstantIndices();
9552 })
9553 ? TTI::PointersChainInfo::getUnknownStride()
9554 : TTI::PointersChainInfo::getKnownStride();
9555
9556 ScalarCost =
9557 TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
9558 auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
9559 if (!BaseGEP) {
9560 auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
9561 if (It != Ptrs.end())
9562 BaseGEP = cast<GEPOperator>(*It);
9563 }
9564 if (BaseGEP) {
9565 SmallVector<const Value *> Indices(BaseGEP->indices());
9566 VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
9567 BaseGEP->getPointerOperand(), Indices, VecTy,
9568 CostKind);
9569 }
9570 }
9571
9572 return std::make_pair(ScalarCost, VecCost);
9573}
9574
9575void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
9576 assert(TE.isGather() && TE.ReorderIndices.empty() &&
9577 "Expected gather node without reordering.");
9579 SmallSet<size_t, 2> LoadKeyUsed;
9580
9581 // Do not reorder nodes if the node is small (just 2 elements), all-constant,
9582 // or all instructions already have the same opcode.
9583 if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
9584 all_of(TE.Scalars, isConstant))
9585 return;
9586
9587 if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
9588 return VectorizableTree[Idx]->isSame(TE.Scalars);
9589 }))
9590 return;
9591
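 // Compute a clustering sub-key for a load: the key includes the load's parent
 // block, and a load whose pointer is at a known constant distance from (or is
 // compatible with) an already seen load gets that load's sub-key, so such
 // loads can later be grouped into a single vectorizable cluster.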
9592 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
9593 Key = hash_combine(hash_value(LI->getParent()), Key);
9594 Value *Ptr =
9595 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
9596 if (LoadKeyUsed.contains(Key)) {
9597 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
9598 if (LIt != LoadsMap.end()) {
9599 for (LoadInst *RLI : LIt->second) {
9600 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
9601 LI->getType(), LI->getPointerOperand(), *DL, *SE,
9602 /*StrictCheck=*/true))
9603 return hash_value(RLI->getPointerOperand());
9604 }
9605 for (LoadInst *RLI : LIt->second) {
9606 if (arePointersCompatible(RLI->getPointerOperand(),
9607 LI->getPointerOperand(), *TLI)) {
9608 hash_code SubKey = hash_value(RLI->getPointerOperand());
9609 return SubKey;
9610 }
9611 }
9612 if (LIt->second.size() > 2) {
9613 hash_code SubKey =
9614 hash_value(LIt->second.back()->getPointerOperand());
9615 return SubKey;
9616 }
9617 }
9618 }
9619 LoadKeyUsed.insert(Key);
9620 LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
9621 return hash_value(LI->getPointerOperand());
9622 };
9623 MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
9624 SmallDenseMap<Value *, SmallVector<unsigned>> KeyToIndex;
9625 bool IsOrdered = true;
9626 unsigned NumInstructions = 0;
9627 // Try to "cluster" scalar instructions, to be able to build extra vectorized
9628 // nodes.
9629 for (auto [I, V] : enumerate(TE.Scalars)) {
9630 size_t Key = 1, Idx = 1;
9631 if (auto *Inst = dyn_cast<Instruction>(V);
9632 Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
9633 !isDeleted(Inst) && !isVectorized(V)) {
9634 std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
9635 /*AllowAlternate=*/false);
9636 ++NumInstructions;
9637 }
9638 auto &Container = SortedValues[Key];
9639 if (IsOrdered && !KeyToIndex.contains(V) &&
9640 !(isa<Constant, ExtractElementInst>(V) ||
9641 isVectorLikeInstWithConstOps(V)) &&
9642 ((Container.contains(Idx) &&
9643 KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
9644 (!Container.empty() && !Container.contains(Idx) &&
9645 KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
9646 IsOrdered = false;
9647 auto &KTI = KeyToIndex[V];
9648 if (KTI.empty())
9649 Container[Idx].push_back(V);
9650 KTI.push_back(I);
9651 }
9652 SmallVector<std::pair<unsigned, unsigned>> SubVectors;
9653 APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
9654 if (!IsOrdered && NumInstructions > 1) {
9655 unsigned Cnt = 0;
9656 TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
9657 for (const auto &D : SortedValues) {
9658 for (const auto &P : D.second) {
9659 unsigned Sz = 0;
9660 for (Value *V : P.second) {
9661 ArrayRef<unsigned> Indices = KeyToIndex.at(V);
9662 for (auto [K, Idx] : enumerate(Indices)) {
9663 TE.ReorderIndices[Cnt + K] = Idx;
9664 TE.Scalars[Cnt + K] = V;
9665 }
9666 Sz += Indices.size();
9667 Cnt += Indices.size();
9668 }
9669 if (Sz > 1 && isa<Instruction>(P.second.front())) {
9670 const unsigned SubVF = getFloorFullVectorNumberOfElements(
9671 *TTI, TE.Scalars.front()->getType(), Sz);
9672 SubVectors.emplace_back(Cnt - Sz, SubVF);
9673 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
9674 DemandedElts.clearBit(I);
9675 } else if (!P.second.empty() && isConstant(P.second.front())) {
9676 for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
9677 DemandedElts.clearBit(I);
9678 }
9679 }
9680 }
9681 }
9682 // Reuses always require shuffles, so consider it as profitable.
9683 if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
9684 return;
9685 // Do simple cost estimation.
9686 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9687 InstructionCost Cost = 0;
9688 auto *ScalarTy = TE.Scalars.front()->getType();
9689 auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
9690 for (auto [Idx, Sz] : SubVectors) {
9691 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
9692 Idx, getWidenedType(ScalarTy, Sz));
9693 }
9694 if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
9695 assert(SLPReVec && "Only supported by REVEC.");
9696 // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
9697 // of CreateInsertElement.
9698 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
9699 for (unsigned I : seq<unsigned>(TE.Scalars.size()))
9700 if (DemandedElts[I])
9701 Cost +=
9702 TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, std::nullopt,
9703 CostKind, I * ScalarTyNumElements, FTy);
9704 } else {
9705 Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
9706 /*Extract=*/false, CostKind);
9707 }
9708 int Sz = TE.Scalars.size();
9709 SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
9710 TE.ReorderIndices.end());
9711 for (unsigned I : seq<unsigned>(Sz)) {
9712 Value *V = TE.getOrdered(I);
9713 if (isa<PoisonValue>(V)) {
9714 ReorderMask[I] = PoisonMaskElem;
9715 } else if (isConstant(V) || DemandedElts[I]) {
9716 ReorderMask[I] = I + TE.ReorderIndices.size();
9717 }
9718 }
9719 Cost += ::getShuffleCost(*TTI,
9720 any_of(ReorderMask, [&](int I) { return I >= Sz; })
9721 ? TTI::SK_PermuteTwoSrc
9722 : TTI::SK_PermuteSingleSrc,
9723 VecTy, ReorderMask);
9724 DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
9725 ReorderMask.assign(Sz, PoisonMaskElem);
9726 for (unsigned I : seq<unsigned>(Sz)) {
9727 Value *V = TE.getOrdered(I);
9728 if (isConstant(V)) {
9729 DemandedElts.clearBit(I);
9730 if (!isa<PoisonValue>(V))
9731 ReorderMask[I] = I;
9732 } else {
9733 ReorderMask[I] = I + Sz;
9734 }
9735 }
9736 InstructionCost BVCost = TTI->getScalarizationOverhead(
9737 VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
9738 if (!DemandedElts.isAllOnes())
9739 BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
9740 if (Cost >= BVCost) {
9741 SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
9742 reorderScalars(TE.Scalars, Mask);
9743 TE.ReorderIndices.clear();
9744 }
9745}
9746
9747 void BoUpSLP::transformNodes() {
9748 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
9749 BaseGraphSize = VectorizableTree.size();
9750 // Turn graph transforming mode on and off, when done.
9751 class GraphTransformModeRAAI {
9752 bool &SavedIsGraphTransformMode;
9753
9754 public:
9755 GraphTransformModeRAAI(bool &IsGraphTransformMode)
9756 : SavedIsGraphTransformMode(IsGraphTransformMode) {
9757 IsGraphTransformMode = true;
9758 }
9759 ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
9760 } TransformContext(IsGraphTransformMode);
9761 // Operands are profitable if they are:
9762 // 1. At least one constant
9763 // or
9764 // 2. Splats
9765 // or
9766 // 3. Results in good vectorization opportunity, i.e. may generate vector
9767 // nodes and reduce cost of the graph.
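 // E.g., for I1 = add %x, 5 and I2 = add %x, 7 the operands are profitable:
 // the first operands are the same value and the second operands are both
 // constants.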
9768 auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
9769 const InstructionsState &S) {
9770 SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
9771 for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
9772 Candidates.emplace_back().emplace_back(I1->getOperand(Op),
9773 I2->getOperand(Op));
9774 return all_of(
9775 Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
9776 return all_of(Cand,
9777 [](const std::pair<Value *, Value *> &P) {
9778 return isa<Constant>(P.first) ||
9779 isa<Constant>(P.second) || P.first == P.second;
9780 }) ||
9781 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
9782 });
9783 };
9784
9785 // Try to reorder gather nodes for better vectorization opportunities.
9786 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9787 TreeEntry &E = *VectorizableTree[Idx];
9788 if (E.isGather())
9789 reorderGatherNode(E);
9790 }
9791
9792 // The tree may grow here, so iterate over nodes, built before.
9793 for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
9794 TreeEntry &E = *VectorizableTree[Idx];
9795 if (E.isGather()) {
9796 ArrayRef<Value *> VL = E.Scalars;
9797 const unsigned Sz = getVectorElementSize(VL.front());
9798 unsigned MinVF = getMinVF(2 * Sz);
9799 // Do not try partial vectorization for small nodes (<= 2 elements), nodes
9800 // with the same opcode and the same parent block, or all-constant nodes.
9801 if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
9802 !(!E.hasState() || E.getOpcode() == Instruction::Load ||
9803 E.isAltShuffle() || !allSameBlock(VL)) ||
9804 allConstant(VL) || isSplat(VL))
9805 continue;
9806 // Try to find vectorizable sequences and transform them into a series of
9807 // insertvector instructions.
9808 unsigned StartIdx = 0;
9809 unsigned End = VL.size();
9810 for (unsigned VF = getFloorFullVectorNumberOfElements(
9811 *TTI, VL.front()->getType(), VL.size() - 1);
9812 VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
9813 *TTI, VL.front()->getType(), VF - 1)) {
9814 if (StartIdx + VF > End)
9815 continue;
9816 SmallVector<std::pair<unsigned, unsigned>> Slices;
9817 for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
9818 ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
9819 // If any instruction is vectorized already - do not try again.
9820 // Reuse the existing node, if it fully matches the slice.
9821 if (const TreeEntry *SE = getTreeEntry(Slice.front());
9822 SE || getTreeEntry(Slice.back())) {
9823 if (!SE)
9824 continue;
9825 if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9826 continue;
9827 }
9828 // Constant already handled effectively - skip.
9829 if (allConstant(Slice))
9830 continue;
9831 // Do not try to vectorize small splats (smaller than a vector register
9832 // and with only a single non-undef element).
9833 bool IsSplat = isSplat(Slice);
9834 if (Slices.empty() || !IsSplat ||
9835 (VF <= 2 && 2 * std::clamp(TTI->getNumberOfParts(getWidenedType(
9836 Slice.front()->getType(), VF)),
9837 1U, VF - 1) !=
9838 std::clamp(TTI->getNumberOfParts(getWidenedType(
9839 Slice.front()->getType(), 2 * VF)),
9840 1U, 2 * VF)) ||
9841 count(Slice, Slice.front()) ==
9842 static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
9843 : 1)) {
9844 if (IsSplat)
9845 continue;
9846 InstructionsState S = getSameOpcode(Slice, *TLI);
9847 if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
9848 (S.getOpcode() == Instruction::Load &&
9849 areKnownNonVectorizableLoads(Slice)) ||
9850 (S.getOpcode() != Instruction::Load &&
9851 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF)))
9852 continue;
9853 if (VF == 2) {
9854 // Try to vectorize reduced values or if all users are vectorized.
9855 // For expensive instructions extra extracts might be profitable.
9856 if ((!UserIgnoreList || E.Idx != 0) &&
9857 TTI->getInstructionCost(S.getMainOp(), CostKind) <
9858 TTI::TCC_Expensive &&
9859 !all_of(Slice, [&](Value *V) {
9860 if (isa<PoisonValue>(V))
9861 return true;
9862 return areAllUsersVectorized(cast<Instruction>(V),
9863 UserIgnoreList);
9864 }))
9865 continue;
9866 if (S.getOpcode() == Instruction::Load) {
9867 OrdersType Order;
9868 SmallVector<Value *> PointerOps;
9869 LoadsState Res =
9870 canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
9871 // Do not vectorize gathers.
9872 if (Res == LoadsState::ScatterVectorize ||
9873 Res == LoadsState::Gather) {
9874 if (Res == LoadsState::Gather) {
9875 registerNonVectorizableLoads(Slice);
9876 // If reductions and the scalars from the root node are
9877 // analyzed - mark as non-vectorizable reduction.
9878 if (UserIgnoreList && E.Idx == 0)
9879 analyzedReductionVals(Slice);
9880 }
9881 continue;
9882 }
9883 } else if (S.getOpcode() == Instruction::ExtractElement ||
9884 (TTI->getInstructionCost(S.getMainOp(), CostKind) <
9885 TTI::TCC_Expensive &&
9886 !CheckOperandsProfitability(
9887 S.getMainOp(),
9888 cast<Instruction>(*find_if(reverse(Slice),
9889 IsaPred<Instruction>)),
9890 S))) {
9891 // Do not vectorize extractelements (handled effectively
9892 // already). Do not vectorize non-profitable instructions (with
9893 // low cost and non-vectorizable operands).
9894 continue;
9895 }
9896 }
9897 }
9898 Slices.emplace_back(Cnt, Slice.size());
9899 }
9900 auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
9901 E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
9902 if (StartIdx == Cnt)
9903 StartIdx = Cnt + Sz;
9904 if (End == Cnt + Sz)
9905 End = Cnt;
9906 };
9907 for (auto [Cnt, Sz] : Slices) {
9908 ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
9909 // If any instruction is vectorized already - do not try again.
9910 if (TreeEntry *SE = getTreeEntry(Slice.front());
9911 SE || getTreeEntry(Slice.back())) {
9912 if (!SE)
9913 continue;
9914 if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
9915 continue;
9916 SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
9917 AddCombinedNode(SE->Idx, Cnt, Sz);
9918 continue;
9919 }
9920 unsigned PrevSize = VectorizableTree.size();
9921 [[maybe_unused]] unsigned PrevEntriesSize =
9922 LoadEntriesToVectorize.size();
9923 buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
9924 if (PrevSize + 1 == VectorizableTree.size() &&
9925 VectorizableTree[PrevSize]->isGather() &&
9926 VectorizableTree[PrevSize]->hasState() &&
9927 VectorizableTree[PrevSize]->getOpcode() !=
9928 Instruction::ExtractElement &&
9929 !isSplat(Slice)) {
9930 if (UserIgnoreList && E.Idx == 0 && VF == 2)
9931 analyzedReductionVals(Slice);
9932 VectorizableTree.pop_back();
9933 assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
9934 "LoadEntriesToVectorize expected to remain the same");
9935 continue;
9936 }
9937 AddCombinedNode(PrevSize, Cnt, Sz);
9938 }
9939 }
9940 // Restore ordering, if no extra vectorization happened.
9941 if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
9942 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
9943 reorderScalars(E.Scalars, Mask);
9944 E.ReorderIndices.clear();
9945 }
9946 }
9947 if (!E.hasState())
9948 continue;
9949 switch (E.getOpcode()) {
9950 case Instruction::Load: {
9951 // No need to reorder masked gather loads, just reorder the scalar
9952 // operands.
9953 if (E.State != TreeEntry::Vectorize)
9954 break;
9955 Type *ScalarTy = E.getMainOp()->getType();
9956 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
9957 Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
9958 // Check if profitable to represent consecutive load + reverse as strided
9959 // load with stride -1.
9960 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
9961 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
9962 SmallVector<int> Mask;
9963 inversePermutation(E.ReorderIndices, Mask);
9964 auto *BaseLI = cast<LoadInst>(E.Scalars.back());
9965 InstructionCost OriginalVecCost =
9966 TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
9967 BaseLI->getPointerAddressSpace(), CostKind,
9968 TTI::OperandValueInfo()) +
9969 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
9970 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
9971 Instruction::Load, VecTy, BaseLI->getPointerOperand(),
9972 /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
9973 if (StridedCost < OriginalVecCost)
9974 // Strided load is more profitable than consecutive load + reverse -
9975 // transform the node to strided load.
9976 E.State = TreeEntry::StridedVectorize;
9977 }
9978 break;
9979 }
9980 case Instruction::Store: {
9981 Type *ScalarTy =
9982 cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
9983 auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
9984 Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
9985 // Check if profitable to represent consecutive stores + reverse as a
9986 // strided store with stride -1.
9987 if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
9988 TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
9989 SmallVector<int> Mask;
9990 inversePermutation(E.ReorderIndices, Mask);
9991 auto *BaseSI = cast<StoreInst>(E.Scalars.back());
9992 InstructionCost OriginalVecCost =
9993 TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
9994 BaseSI->getPointerAddressSpace(), CostKind,
9995 TTI::OperandValueInfo()) +
9996 ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
9997 InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
9998 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
9999 /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
10000 if (StridedCost < OriginalVecCost)
10001 // Strided store is more profitable than reverse + consecutive store -
10002 // transform the node to strided store.
10003 E.State = TreeEntry::StridedVectorize;
10004 } else if (!E.ReorderIndices.empty()) {
10005 // Check for interleaved stores.
10006 auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
10007 auto *BaseSI = cast<StoreInst>(E.Scalars.front());
10008 assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
10009 if (Mask.size() < 4)
10010 return 0u;
10011 for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
10012 if (ShuffleVectorInst::isInterleaveMask(
10013 Mask, Factor, VecTy->getElementCount().getFixedValue()) &&
10014 TTI.isLegalInterleavedAccessType(
10015 VecTy, Factor, BaseSI->getAlign(),
10016 BaseSI->getPointerAddressSpace()))
10017 return Factor;
10018 }
10019
10020 return 0u;
10021 };
10022 SmallVector<int> Mask(E.ReorderIndices.begin(), E.ReorderIndices.end());
10023 unsigned InterleaveFactor = IsInterleaveMask(Mask);
10024 if (InterleaveFactor != 0)
10025 E.setInterleave(InterleaveFactor);
10026 }
10027 break;
10028 }
10029 case Instruction::Select: {
10030 if (E.State != TreeEntry::Vectorize)
10031 break;
10032 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
10033 if (MinMaxID == Intrinsic::not_intrinsic)
10034 break;
10035 // This node is a minmax node.
10036 E.CombinedOp = TreeEntry::MinMax;
10037 TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
10038 if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
10039 CondEntry->State == TreeEntry::Vectorize) {
10040 // The condition node is part of the combined minmax node.
10041 CondEntry->State = TreeEntry::CombinedVectorize;
10042 }
10043 break;
10044 }
10045 default:
10046 break;
10047 }
10048 }
10049
10050 if (LoadEntriesToVectorize.empty()) {
10051 // Single load node - exit.
10052 if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
10053 VectorizableTree.front()->getOpcode() == Instruction::Load)
10054 return;
10055 // Small graph with small VF - exit.
10056 constexpr unsigned SmallTree = 3;
10057 constexpr unsigned SmallVF = 2;
10058 if ((VectorizableTree.size() <= SmallTree &&
10059 VectorizableTree.front()->Scalars.size() == SmallVF) ||
10060 (VectorizableTree.size() <= 2 && UserIgnoreList))
10061 return;
10062
10063 if (VectorizableTree.front()->isNonPowOf2Vec() &&
10064 getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
10065 getCanonicalGraphSize() <= SmallTree &&
10066 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
10067 [](const std::unique_ptr<TreeEntry> &TE) {
10068 return TE->isGather() && TE->hasState() &&
10069 TE->getOpcode() == Instruction::Load &&
10070 !allSameBlock(TE->Scalars);
10071 }) == 1)
10072 return;
10073 }
10074
10075 // A list of loads to be gathered during the vectorization process. We can
10076 // try to vectorize them at the end, if profitable.
10079 GatheredLoads;
10080
10081 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
10082 TreeEntry &E = *TE;
10083 if (E.isGather() &&
10084 ((E.hasState() && E.getOpcode() == Instruction::Load) ||
10085 (!E.hasState() && any_of(E.Scalars,
10086 [&](Value *V) {
10087 return isa<LoadInst>(V) &&
10088 !isVectorized(V) &&
10089 !isDeleted(cast<Instruction>(V));
10090 }))) &&
10091 !isSplat(E.Scalars)) {
10092 for (Value *V : E.Scalars) {
10093 auto *LI = dyn_cast<LoadInst>(V);
10094 if (!LI)
10095 continue;
10096 if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
10097 continue;
10099 *this, V, *DL, *SE, *TTI,
10100 GatheredLoads[std::make_tuple(
10101 LI->getParent(),
10103 LI->getType())]);
10104 }
10105 }
10106 }
10107 // Try to vectorize gathered loads if this is not just a gather of loads.
10108 if (!GatheredLoads.empty())
10109 tryToVectorizeGatheredLoads(GatheredLoads);
10110}
10111
10112/// Merges shuffle masks and emits the final shuffle instruction, if required.
10113/// It supports shuffling of 2 input vectors. It implements lazy shuffle
10114/// emission: the actual shuffle instruction is generated only if it is really
10115/// required. Otherwise, the shuffle instruction emission is delayed until the
10116/// end of the process, to reduce the number of emitted instructions and to
10117/// enable further analysis/transformations.
10118class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
10119 bool IsFinalized = false;
10120 SmallVector<int> CommonMask;
10121 SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
10122 const TargetTransformInfo &TTI;
10123 InstructionCost Cost = 0;
10124 SmallDenseSet<Value *> VectorizedVals;
10125 BoUpSLP &R;
10126 SmallPtrSetImpl<Value *> &CheckedExtracts;
10127 constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
10128 /// While set, we are still trying to estimate the cost for the same nodes and
10129 /// can delay the actual cost estimation (virtual shuffle instruction emission).
10130 /// This may help to better estimate the cost if the same nodes must be permuted
10131 /// and allows moving most of the long shuffle cost estimation to TTI.
10132 bool SameNodesEstimated = true;
10133
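 /// Returns an all-ones constant of type \p Ty. Pointer (and vector-of-pointer)
 /// types are handled specially, building the all-ones pattern from the
 /// pointer's store size in bits and splatting it to the vector element count
 /// if needed.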
10134 static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
10135 if (Ty->getScalarType()->isPointerTy()) {
10139 DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
10140 Ty->getScalarType());
10141 if (auto *VTy = dyn_cast<VectorType>(Ty))
10142 Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
10143 return Res;
10144 }
10145 return Constant::getAllOnesValue(Ty);
10146 }
10147
10148 InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
10149 if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
10150 return TTI::TCC_Free;
10151 auto *VecTy = getWidenedType(ScalarTy, VL.size());
10152 InstructionCost GatherCost = 0;
10153 SmallVector<Value *> Gathers(VL);
10154 if (!Root && isSplat(VL)) {
10155 // Found the broadcasting of the single scalar, calculate the cost as
10156 // the broadcast.
10157 const auto *It = find_if_not(VL, IsaPred<UndefValue>);
10158 assert(It != VL.end() && "Expected at least one non-undef value.");
10159 // Add broadcast for non-identity shuffle only.
10160 bool NeedShuffle =
10161 count(VL, *It) > 1 &&
10162 (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
10163 if (!NeedShuffle) {
10164 if (isa<FixedVectorType>(ScalarTy)) {
10165 assert(SLPReVec && "FixedVectorType is not expected.");
10166 return TTI.getShuffleCost(
10167 TTI::SK_InsertSubvector, VecTy, {}, CostKind,
10168 std::distance(VL.begin(), It) * getNumElements(ScalarTy),
10169 cast<FixedVectorType>(ScalarTy));
10170 }
10171 return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
10172 CostKind, std::distance(VL.begin(), It),
10173 PoisonValue::get(VecTy), *It);
10174 }
10175
10176 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
10177 transform(VL, ShuffleMask.begin(), [](Value *V) {
10178 return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
10179 });
10180 InstructionCost InsertCost =
10181 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0,
10182 PoisonValue::get(VecTy), *It);
10183 return InsertCost + ::getShuffleCost(TTI,
10184 TTI::SK_Broadcast,
10185 VecTy, ShuffleMask, CostKind,
10186 /*Index=*/0, /*SubTp=*/nullptr,
10187 /*Args=*/*It);
10188 }
10189 return GatherCost +
10190 (all_of(Gathers, IsaPred<UndefValue>)
10191 ? TTI::TCC_Free
10192 : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
10193 ScalarTy));
10194 };
10195
10196 /// Compute the cost of creating a vector containing the extracted values from
10197 /// \p VL.
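 /// The mask is processed in register-sized chunks (\p NumParts parts): if a
 /// chunk is an identity over a single source register, that register can be
 /// reused directly; otherwise the cost of a per-register shuffle (plus, if
 /// needed, a subvector extract) is added, unless a plain whole-vector permute
 /// is estimated to be cheaper.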
10198 InstructionCost
10199 computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
10200 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10201 unsigned NumParts) {
10202 assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
10203 unsigned NumElts =
10204 std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
10205 auto *EE = dyn_cast<ExtractElementInst>(V);
10206 if (!EE)
10207 return Sz;
10208 auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
10209 if (!VecTy)
10210 return Sz;
10211 return std::max(Sz, VecTy->getNumElements());
10212 });
10213 // FIXME: this must be moved to TTI for better estimation.
10214 unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
10215 auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
10216 SmallVectorImpl<unsigned> &Indices)
10217 -> std::optional<TTI::ShuffleKind> {
10218 if (NumElts <= EltsPerVector)
10219 return std::nullopt;
10220 int OffsetReg0 =
10221 alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
10222 [](int S, int I) {
10223 if (I == PoisonMaskElem)
10224 return S;
10225 return std::min(S, I);
10226 }),
10227 EltsPerVector);
10228 int OffsetReg1 = OffsetReg0;
10229 DenseSet<int> RegIndices;
10230 // Check if we are trying to permute the same single or 2 input vectors.
10231 TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
10232 int FirstRegId = -1;
10233 Indices.assign(1, OffsetReg0);
10234 for (auto [Pos, I] : enumerate(Mask)) {
10235 if (I == PoisonMaskElem)
10236 continue;
10237 int Idx = I - OffsetReg0;
10238 int RegId =
10239 (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
10240 if (FirstRegId < 0)
10241 FirstRegId = RegId;
10242 RegIndices.insert(RegId);
10243 if (RegIndices.size() > 2)
10244 return std::nullopt;
10245 if (RegIndices.size() == 2) {
10246 ShuffleKind = TTI::SK_PermuteTwoSrc;
10247 if (Indices.size() == 1) {
10248 OffsetReg1 = alignDown(
10249 std::accumulate(
10250 std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
10251 [&](int S, int I) {
10252 if (I == PoisonMaskElem)
10253 return S;
10254 int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
10255 ((I - OffsetReg0) % NumElts) / EltsPerVector;
10256 if (RegId == FirstRegId)
10257 return S;
10258 return std::min(S, I);
10259 }),
10260 EltsPerVector);
10261 Indices.push_back(OffsetReg1 % NumElts);
10262 }
10263 Idx = I - OffsetReg1;
10264 }
10265 I = (Idx % NumElts) % EltsPerVector +
10266 (RegId == FirstRegId ? 0 : EltsPerVector);
10267 }
10268 return ShuffleKind;
10269 };
10270 InstructionCost Cost = 0;
10271
10272 // Process extracts in blocks of EltsPerVector to check if the source vector
10273 // operand can be re-used directly. If not, add the cost of creating a
10274 // shuffle to extract the values into a vector register.
10275 for (unsigned Part : seq<unsigned>(NumParts)) {
10276 if (!ShuffleKinds[Part])
10277 continue;
10278 ArrayRef<int> MaskSlice = Mask.slice(
10279 Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
10280 SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
10281 copy(MaskSlice, SubMask.begin());
10282 SmallVector<unsigned, 2> Indices;
10283 std::optional<TTI::ShuffleKind> RegShuffleKind =
10284 CheckPerRegistersShuffle(SubMask, Indices);
10285 if (!RegShuffleKind) {
10286 if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
10287 !ShuffleVectorInst::isIdentityMask(
10288 MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
10289 Cost +=
10290 ::getShuffleCost(TTI, *ShuffleKinds[Part],
10291 getWidenedType(ScalarTy, NumElts), MaskSlice);
10292 continue;
10293 }
10294 if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
10295 !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
10296 Cost +=
10297 ::getShuffleCost(TTI, *RegShuffleKind,
10298 getWidenedType(ScalarTy, EltsPerVector), SubMask);
10299 }
10300 const unsigned BaseVF = getFullVectorNumberOfElements(
10301 *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
10302 for (unsigned Idx : Indices) {
10303 assert((Idx + EltsPerVector) <= BaseVF &&
10304 "SK_ExtractSubvector index out of range");
10305 Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
10306 getWidenedType(ScalarTy, BaseVF), {}, CostKind,
10307 Idx, getWidenedType(ScalarTy, EltsPerVector));
10308 }
10309 // Second check to see if just a permute is estimated to be cheaper than
10310 // the subvector extracts.
10311 SubMask.assign(NumElts, PoisonMaskElem);
10312 copy(MaskSlice, SubMask.begin());
10313 InstructionCost OriginalCost = ::getShuffleCost(
10314 TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts), SubMask);
10315 if (OriginalCost < Cost)
10316 Cost = OriginalCost;
10317 }
10318 return Cost;
10319 }
10320 /// Transforms mask \p CommonMask per given \p Mask to make proper set after
10321 /// shuffle emission.
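 /// E.g., after a shuffle with \p CommonMask <2, 0, poison, 1> has been
 /// emitted, the mask becomes <0, 1, poison, 3>: lanes produced by the emitted
 /// shuffle now simply refer to themselves in the new vector.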
10322 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
10323 ArrayRef<int> Mask) {
10324 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10325 if (Mask[Idx] != PoisonMaskElem)
10326 CommonMask[Idx] = Idx;
10327 }
10328 /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given
10329 /// mask \p Mask, register number \p Part, that includes \p SliceSize
10330 /// elements.
10331 void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
10332 ArrayRef<int> Mask, unsigned Part,
10333 unsigned SliceSize) {
10334 if (SameNodesEstimated) {
10335 // Delay the cost estimation if the same nodes are reshuffling.
10336 // If we already requested the cost of reshuffling E1 and E2 before, there
10337 // is no need to estimate another cost with the sub-Mask; instead, include
10338 // this sub-Mask into the CommonMask to estimate it later and avoid double
10339 // cost estimation.
10340 if ((InVectors.size() == 2 &&
10341 cast<const TreeEntry *>(InVectors.front()) == &E1 &&
10342 cast<const TreeEntry *>(InVectors.back()) == E2) ||
10343 (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
10344 unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
10345 assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
10346 [](int Idx) { return Idx == PoisonMaskElem; }) &&
10347 "Expected all poisoned elements.");
10348 ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
10349 copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
10350 return;
10351 }
10352 // Found non-matching nodes - need to estimate the cost for the matched
10353 // and transform mask.
10354 Cost += createShuffle(InVectors.front(),
10355 InVectors.size() == 1 ? nullptr : InVectors.back(),
10356 CommonMask);
10357 transformMaskAfterShuffle(CommonMask, CommonMask);
10358 } else if (InVectors.size() == 2) {
10359 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10360 transformMaskAfterShuffle(CommonMask, CommonMask);
10361 }
10362 SameNodesEstimated = false;
10363 if (!E2 && InVectors.size() == 1) {
10364 unsigned VF = E1.getVectorFactor();
10365 if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
10366 VF = std::max(VF,
10367 cast<FixedVectorType>(V1->getType())->getNumElements());
10368 } else {
10369 const auto *E = cast<const TreeEntry *>(InVectors.front());
10370 VF = std::max(VF, E->getVectorFactor());
10371 }
10372 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10373 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
10374 CommonMask[Idx] = Mask[Idx] + VF;
10375 Cost += createShuffle(InVectors.front(), &E1, CommonMask);
10376 transformMaskAfterShuffle(CommonMask, CommonMask);
10377 } else {
10378 auto P = InVectors.front();
10379 Cost += createShuffle(&E1, E2, Mask);
10380 unsigned VF = Mask.size();
10381 if (Value *V1 = P.dyn_cast<Value *>()) {
10382 VF = std::max(VF,
10383 getNumElements(V1->getType()));
10384 } else {
10385 const auto *E = cast<const TreeEntry *>(P);
10386 VF = std::max(VF, E->getVectorFactor());
10387 }
10388 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10389 if (Mask[Idx] != PoisonMaskElem)
10390 CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
10391 Cost += createShuffle(P, InVectors.front(), CommonMask);
10392 transformMaskAfterShuffle(CommonMask, CommonMask);
10393 }
10394 }
10395
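 /// Shuffle builder used by createShuffle() to model shuffle emission for cost
 /// estimation only: empty and identity masks are free, single- and two-source
 /// permutes are priced via TTI, and no instructions are actually created.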
10396 class ShuffleCostBuilder {
10397 const TargetTransformInfo &TTI;
10398
10399 static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
10400 int Index = -1;
10401 return Mask.empty() ||
10402 (VF == Mask.size() &&
10403 ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
10404 (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
10405 Index == 0);
10406 }
10407
10408 public:
10409 ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
10410 ~ShuffleCostBuilder() = default;
10411 InstructionCost createShuffleVector(Value *V1, Value *,
10412 ArrayRef<int> Mask) const {
10413 // Empty mask or identity mask are free.
10414 unsigned VF =
10415 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
10416 if (isEmptyOrIdentity(Mask, VF))
10417 return TTI::TCC_Free;
10418 return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
10419 cast<VectorType>(V1->getType()), Mask);
10420 }
10421 InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
10422 // Empty mask or identity mask are free.
10423 unsigned VF =
10424 cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
10425 if (isEmptyOrIdentity(Mask, VF))
10426 return TTI::TCC_Free;
10427 return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
10428 cast<VectorType>(V1->getType()), Mask);
10429 }
10430 InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
10431 InstructionCost createPoison(Type *Ty, unsigned VF) const {
10432 return TTI::TCC_Free;
10433 }
10434 void resizeToMatch(Value *&, Value *&) const {}
10435 };
10436
10437 /// Smart shuffle instruction emission, walks through shuffles trees and
10438 /// tries to find the best matching vector for the actual shuffle
10439 /// instruction.
10440 InstructionCost
10441 createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
10442 const PointerUnion<Value *, const TreeEntry *> &P2,
10443 ArrayRef<int> Mask) {
10444 ShuffleCostBuilder Builder(TTI);
10445 SmallVector<int> CommonMask(Mask);
10446 Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
10447 unsigned CommonVF = Mask.size();
10448 InstructionCost ExtraCost = 0;
10449 auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
10450 unsigned VF) -> InstructionCost {
10451 if (E.isGather() && allConstant(E.Scalars))
10452 return TTI::TCC_Free;
10453 Type *EScalarTy = E.Scalars.front()->getType();
10454 bool IsSigned = true;
10455 if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
10456 EScalarTy = IntegerType::get(EScalarTy->getContext(), It->second.first);
10457 IsSigned = It->second.second;
10458 }
10459 if (EScalarTy != ScalarTy) {
10460 unsigned CastOpcode = Instruction::Trunc;
10461 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10462 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10463 if (DstSz > SrcSz)
10464 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10465 return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
10466 getWidenedType(EScalarTy, VF),
10467 TTI::CastContextHint::None, CostKind);
10468 }
10469 return TTI::TCC_Free;
10470 };
10471 auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
10472 if (isa<Constant>(V))
10473 return TTI::TCC_Free;
10474 auto *VecTy = cast<VectorType>(V->getType());
10475 Type *EScalarTy = VecTy->getElementType();
10476 if (EScalarTy != ScalarTy) {
10477 bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
10478 unsigned CastOpcode = Instruction::Trunc;
10479 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10480 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10481 if (DstSz > SrcSz)
10482 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10483 return TTI.getCastInstrCost(
10484 CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
10485 VecTy, TTI::CastContextHint::None, CostKind);
10486 }
10487 return TTI::TCC_Free;
10488 };
10489 if (!V1 && !V2 && !P2.isNull()) {
10490 // Shuffle 2 entry nodes.
10491 const TreeEntry *E = cast<const TreeEntry *>(P1);
10492 unsigned VF = E->getVectorFactor();
10493 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10494 CommonVF = std::max(VF, E2->getVectorFactor());
10495 assert(all_of(Mask,
10496 [=](int Idx) {
10497 return Idx < 2 * static_cast<int>(CommonVF);
10498 }) &&
10499 "All elements in mask must be less than 2 * CommonVF.");
10500 if (E->Scalars.size() == E2->Scalars.size()) {
10501 SmallVector<int> EMask = E->getCommonMask();
10502 SmallVector<int> E2Mask = E2->getCommonMask();
10503 if (!EMask.empty() || !E2Mask.empty()) {
10504 for (int &Idx : CommonMask) {
10505 if (Idx == PoisonMaskElem)
10506 continue;
10507 if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
10508 Idx = EMask[Idx];
10509 else if (Idx >= static_cast<int>(CommonVF))
10510 Idx = (E2Mask.empty() ? Idx - CommonVF : E2Mask[Idx - CommonVF]) +
10511 E->Scalars.size();
10512 }
10513 }
10514 CommonVF = E->Scalars.size();
10515 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
10516 GetNodeMinBWAffectedCost(*E2, CommonVF);
10517 } else {
10518 ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
10519 GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
10520 }
10521 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10522 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10523 } else if (!V1 && P2.isNull()) {
10524 // Shuffle single entry node.
10525 const TreeEntry *E = cast<const TreeEntry *>(P1);
10526 unsigned VF = E->getVectorFactor();
10527 CommonVF = VF;
10528 assert(
10529 all_of(Mask,
10530 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
10531 "All elements in mask must be less than CommonVF.");
10532 if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
10533 SmallVector<int> EMask = E->getCommonMask();
10534 assert(!EMask.empty() && "Expected non-empty common mask.");
10535 for (int &Idx : CommonMask) {
10536 if (Idx != PoisonMaskElem)
10537 Idx = EMask[Idx];
10538 }
10539 CommonVF = E->Scalars.size();
10540 } else if (unsigned Factor = E->getInterleaveFactor();
10541 Factor > 0 && E->Scalars.size() != Mask.size() &&
10542 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
10543 Factor)) {
10544 // Deinterleaved nodes are free.
10545 std::iota(CommonMask.begin(), CommonMask.end(), 0);
10546 }
10547 ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
10548 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10549 // Not identity/broadcast? Try to see if the original vector is better.
10550 if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
10551 CommonVF == CommonMask.size() &&
10552 any_of(enumerate(CommonMask),
10553 [](const auto &&P) {
10554 return P.value() != PoisonMaskElem &&
10555 static_cast<unsigned>(P.value()) != P.index();
10556 }) &&
10557 any_of(CommonMask,
10558 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
10559 SmallVector<int> ReorderMask;
10560 inversePermutation(E->ReorderIndices, ReorderMask);
10561 ::addMask(CommonMask, ReorderMask);
10562 }
10563 } else if (V1 && P2.isNull()) {
10564 // Shuffle single vector.
10565 ExtraCost += GetValueMinBWAffectedCost(V1);
10566 CommonVF = getVF(V1);
10567 assert(
10568 all_of(Mask,
10569 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
10570 "All elements in mask must be less than CommonVF.");
10571 } else if (V1 && !V2) {
10572 // Shuffle vector and tree node.
10573 unsigned VF = getVF(V1);
10574 const TreeEntry *E2 = cast<const TreeEntry *>(P2);
10575 CommonVF = std::max(VF, E2->getVectorFactor());
10576 assert(all_of(Mask,
10577 [=](int Idx) {
10578 return Idx < 2 * static_cast<int>(CommonVF);
10579 }) &&
10580 "All elements in mask must be less than 2 * CommonVF.");
10581 if (E2->Scalars.size() == VF && VF != CommonVF) {
10582 SmallVector<int> E2Mask = E2->getCommonMask();
10583 assert(!E2Mask.empty() && "Expected non-empty common mask.");
10584 for (int &Idx : CommonMask) {
10585 if (Idx == PoisonMaskElem)
10586 continue;
10587 if (Idx >= static_cast<int>(CommonVF))
10588 Idx = E2Mask[Idx - CommonVF] + VF;
10589 }
10590 CommonVF = VF;
10591 }
10592 ExtraCost += GetValueMinBWAffectedCost(V1);
10593 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10594 ExtraCost += GetNodeMinBWAffectedCost(
10595 *E2, std::min(CommonVF, E2->getVectorFactor()));
10596 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10597 } else if (!V1 && V2) {
10598 // Shuffle vector and tree node.
10599 unsigned VF = getVF(V2);
10600 const TreeEntry *E1 = cast<const TreeEntry *>(P1);
10601 CommonVF = std::max(VF, E1->getVectorFactor());
10602 assert(all_of(Mask,
10603 [=](int Idx) {
10604 return Idx < 2 * static_cast<int>(CommonVF);
10605 }) &&
10606 "All elements in mask must be less than 2 * CommonVF.");
10607 if (E1->Scalars.size() == VF && VF != CommonVF) {
10608 SmallVector<int> E1Mask = E1->getCommonMask();
10609 assert(!E1Mask.empty() && "Expected non-empty common mask.");
10610 for (int &Idx : CommonMask) {
10611 if (Idx == PoisonMaskElem)
10612 continue;
10613 if (Idx >= static_cast<int>(CommonVF))
10614 Idx = E1Mask[Idx - CommonVF] + VF;
10615 else
10616 Idx = E1Mask[Idx];
10617 }
10618 CommonVF = VF;
10619 }
10620 ExtraCost += GetNodeMinBWAffectedCost(
10621 *E1, std::min(CommonVF, E1->getVectorFactor()));
10622 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10623 ExtraCost += GetValueMinBWAffectedCost(V2);
10624 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10625 } else {
10626 assert(V1 && V2 && "Expected both vectors.");
10627 unsigned VF = getVF(V1);
10628 CommonVF = std::max(VF, getVF(V2));
10629 assert(all_of(Mask,
10630 [=](int Idx) {
10631 return Idx < 2 * static_cast<int>(CommonVF);
10632 }) &&
10633 "All elements in mask must be less than 2 * CommonVF.");
10634 ExtraCost +=
10635 GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
10636 if (V1->getType() != V2->getType()) {
10637 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10638 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10639 } else {
10640 if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
10641 V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
10642 if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
10643 V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
10644 }
10645 }
10646 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
10647 assert(SLPReVec && "FixedVectorType is not expected.");
10648 transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
10649 CommonMask);
10650 }
10651 InVectors.front() =
10652 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
10653 if (InVectors.size() == 2)
10654 InVectors.pop_back();
10655 return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
10656 V1, V2, CommonMask, Builder);
10657 }
10658
10659public:
10660 ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
10661 ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
10662 SmallPtrSetImpl<Value *> &CheckedExtracts)
10663 : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
10664 VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
10665 CheckedExtracts(CheckedExtracts) {}
10666 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
10667 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
10668 unsigned NumParts, bool &UseVecBaseAsInput) {
10669 UseVecBaseAsInput = false;
10670 if (Mask.empty())
10671 return nullptr;
10672 Value *VecBase = nullptr;
10673 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
10674 if (!E->ReorderIndices.empty()) {
10675 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
10676 E->ReorderIndices.end());
10677 reorderScalars(VL, ReorderMask);
10678 }
10679 // Check if this node can be considered reused if the same extractelements
10680 // were vectorized already.
10681 bool PrevNodeFound = any_of(
10682 ArrayRef(R.VectorizableTree).take_front(E->Idx),
10683 [&](const std::unique_ptr<TreeEntry> &TE) {
10684 return ((TE->hasState() && !TE->isAltShuffle() &&
10685 TE->getOpcode() == Instruction::ExtractElement) ||
10686 TE->isGather()) &&
10687 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
10688 return VL.size() > Data.index() &&
10689 (Mask[Data.index()] == PoisonMaskElem ||
10690 isa<UndefValue>(VL[Data.index()]) ||
10691 Data.value() == VL[Data.index()]);
10692 });
10693 });
10694 SmallPtrSet<Value *, 4> UniqueBases;
10695 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
10696 for (unsigned Part : seq<unsigned>(NumParts)) {
10697 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
10698 ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
10699 for (auto [I, V] :
10700 enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
10701 // Ignore non-extractelement scalars.
10702 if (isa<UndefValue>(V) ||
10703 (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
10704 continue;
10705 // If all users of instruction are going to be vectorized and this
10706 // instruction itself is not going to be vectorized, consider this
10707 // instruction as dead and remove its cost from the final cost of the
10708 // vectorized tree.
10709 // Also, avoid adjusting the cost for extractelements with multiple uses
10710 // in different graph entries.
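// Illustrative example: for
//   %x = extractelement <4 x i32> %v, i32 1
// whose users are all vectorized while %x itself is not, the scalar extract
// becomes dead after vectorization, so its cost is credited back here.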
10711 auto *EE = cast<ExtractElementInst>(V);
10712 VecBase = EE->getVectorOperand();
10713 UniqueBases.insert(VecBase);
10714 const TreeEntry *VE = R.getTreeEntry(V);
10715 if (!CheckedExtracts.insert(V).second ||
10716 !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
10717 any_of(EE->users(),
10718 [&](User *U) {
10719 return isa<GetElementPtrInst>(U) &&
10720 !R.areAllUsersVectorized(cast<Instruction>(U),
10721 &VectorizedVals);
10722 }) ||
10723 (VE && VE != E))
10724 continue;
10725 std::optional<unsigned> EEIdx = getExtractIndex(EE);
10726 if (!EEIdx)
10727 continue;
10728 unsigned Idx = *EEIdx;
10729 // Take credit for instruction that will become dead.
10730 if (EE->hasOneUse() || !PrevNodeFound) {
10731 Instruction *Ext = EE->user_back();
10732 if (isa<SExtInst, ZExtInst>(Ext) &&
10733 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
10734 // Use getExtractWithExtendCost() to calculate the cost of
10735 // extractelement/ext pair.
10736 Cost -=
10737 TTI.getExtractWithExtendCost(Ext->getOpcode(), Ext->getType(),
10738 EE->getVectorOperandType(), Idx);
10739 // Add back the cost of s|zext which is subtracted separately.
10740 Cost += TTI.getCastInstrCost(
10741 Ext->getOpcode(), Ext->getType(), EE->getType(),
10742 TTI::getCastContextHint(Ext), CostKind, Ext);
10743 continue;
10744 }
10745 }
10746 Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
10747 CostKind, Idx);
10748 }
10749 }
10750 // Check that the gather of extractelements can be represented as just a
10751 // shuffle of one or two vectors the scalars are extracted from.
10752 // We found a bunch of extractelement instructions that must be gathered
10753 // into a vector and can be represented as a permutation of elements in a
10754 // single input vector or in two input vectors.
10755 // Skipped if the same extractelements were vectorized already (reused).
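// Illustrative example: a gather of
//   extractelement <4 x float> %v1, i32 0
//   extractelement <4 x float> %v1, i32 2
//   extractelement <4 x float> %v2, i32 1
//   extractelement <4 x float> %v2, i32 3
// can be modeled as one two-source shuffle of %v1 and %v2 with mask
// <0, 2, 5, 7> instead of four scalar extracts plus an insertion sequence.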
10756 if (!PrevNodeFound)
10757 Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
10758 InVectors.assign(1, E);
10759 CommonMask.assign(Mask.begin(), Mask.end());
10760 transformMaskAfterShuffle(CommonMask, CommonMask);
10761 SameNodesEstimated = false;
10762 if (NumParts != 1 && UniqueBases.size() != 1) {
10763 UseVecBaseAsInput = true;
10764 VecBase =
10765 Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
10766 }
10767 return VecBase;
10768 }
10769 /// Checks if the specified entry \p E needs to be delayed because of its
10770 /// dependency nodes.
10771 std::optional<InstructionCost>
10772 needToDelay(const TreeEntry *,
10773 ArrayRef<SmallVector<const TreeEntry *>>) const {
10774 // No need to delay the cost estimation during analysis.
10775 return std::nullopt;
10776 }
10777 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
10778 if (&E1 == &E2) {
10779 assert(all_of(Mask,
10780 [&](int Idx) {
10781 return Idx < static_cast<int>(E1.getVectorFactor());
10782 }) &&
10783 "Expected single vector shuffle mask.");
10784 add(E1, Mask);
10785 return;
10786 }
10787 if (InVectors.empty()) {
10788 CommonMask.assign(Mask.begin(), Mask.end());
10789 InVectors.assign({&E1, &E2});
10790 return;
10791 }
10792 assert(!CommonMask.empty() && "Expected non-empty common mask.");
10793 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10794 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
10795 if (NumParts == 0 || NumParts >= Mask.size() ||
10796 MaskVecTy->getNumElements() % NumParts != 0 ||
10797 !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10798 MaskVecTy->getNumElements() / NumParts))
10799 NumParts = 1;
10800 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
10801 const auto *It =
10802 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
10803 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10804 estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
10805 }
10806 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
10807 if (InVectors.empty()) {
10808 CommonMask.assign(Mask.begin(), Mask.end());
10809 InVectors.assign(1, &E1);
10810 return;
10811 }
10812 assert(!CommonMask.empty() && "Expected non-empty common mask.");
10813 auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
10814 unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
10815 if (NumParts == 0 || NumParts >= Mask.size() ||
10816 MaskVecTy->getNumElements() % NumParts != 0 ||
10817 !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
10818 MaskVecTy->getNumElements() / NumParts))
10819 NumParts = 1;
10820 unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
10821 const auto *It =
10822 find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
10823 unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
10824 estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
10825 if (!SameNodesEstimated && InVectors.size() == 1)
10826 InVectors.emplace_back(&E1);
10827 }
10828 /// Adds 2 input vectors and the mask for their shuffling.
10829 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
10830 // May come only for shuffling of 2 vectors with extractelements, already
10831 // handled in adjustExtracts.
10832 assert(InVectors.size() == 1 &&
10833 all_of(enumerate(CommonMask),
10834 [&](auto P) {
10835 if (P.value() == PoisonMaskElem)
10836 return Mask[P.index()] == PoisonMaskElem;
10837 auto *EI = cast<ExtractElementInst>(
10838 cast<const TreeEntry *>(InVectors.front())
10839 ->getOrdered(P.index()));
10840 return EI->getVectorOperand() == V1 ||
10841 EI->getVectorOperand() == V2;
10842 }) &&
10843 "Expected extractelement vectors.");
10844 }
10845 /// Adds one more input vector and the mask for the shuffling.
10846 void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
10847 if (InVectors.empty()) {
10848 assert(CommonMask.empty() && !ForExtracts &&
10849 "Expected empty input mask/vectors.");
10850 CommonMask.assign(Mask.begin(), Mask.end());
10851 InVectors.assign(1, V1);
10852 return;
10853 }
10854 if (ForExtracts) {
10855 // No need to add vectors here, already handled them in adjustExtracts.
10856 assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
10857 !CommonMask.empty() &&
10858 all_of(enumerate(CommonMask),
10859 [&](auto P) {
10860 Value *Scalar = cast<const TreeEntry *>(InVectors[0])
10861 ->getOrdered(P.index());
10862 if (P.value() == PoisonMaskElem)
10863 return P.value() == Mask[P.index()] ||
10864 isa<UndefValue>(Scalar);
10865 if (isa<Constant>(V1))
10866 return true;
10867 auto *EI = cast<ExtractElementInst>(Scalar);
10868 return EI->getVectorOperand() == V1;
10869 }) &&
10870 "Expected only tree entry for extractelement vectors.");
10871 return;
10872 }
10873 assert(!InVectors.empty() && !CommonMask.empty() &&
10874 "Expected only tree entries from extracts/reused buildvectors.");
10875 unsigned VF = getVF(V1);
10876 if (InVectors.size() == 2) {
10877 Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
10878 transformMaskAfterShuffle(CommonMask, CommonMask);
10879 VF = std::max<unsigned>(VF, CommonMask.size());
10880 } else if (const auto *InTE =
10881 InVectors.front().dyn_cast<const TreeEntry *>()) {
10882 VF = std::max(VF, InTE->getVectorFactor());
10883 } else {
10884 VF = std::max(
10885 VF, cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
10886 ->getNumElements());
10887 }
10888 InVectors.push_back(V1);
10889 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10890 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
10891 CommonMask[Idx] = Mask[Idx] + VF;
10892 }
10893 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
10894 Value *Root = nullptr) {
10895 Cost += getBuildVectorCost(VL, Root);
10896 if (!Root) {
10897 // FIXME: Need to find a way to avoid use of getNullValue here.
10898 SmallVector<Constant *> Vals;
10899 unsigned VF = VL.size();
10900 if (MaskVF != 0)
10901 VF = std::min(VF, MaskVF);
10902 for (Value *V : VL.take_front(VF)) {
10903 if (isa<UndefValue>(V)) {
10904 Vals.push_back(cast<Constant>(V));
10905 continue;
10906 }
10907 Vals.push_back(Constant::getNullValue(V->getType()));
10908 }
10909 if (auto *VecTy = dyn_cast<FixedVectorType>(Vals.front()->getType())) {
10910 assert(SLPReVec && "FixedVectorType is not expected.");
10911 // When REVEC is enabled, we need to expand vector types into scalar
10912 // types.
10913 unsigned VecTyNumElements = VecTy->getNumElements();
10914 SmallVector<Constant *> NewVals(VF * VecTyNumElements, nullptr);
10915 for (auto [I, V] : enumerate(Vals)) {
10916 Type *ScalarTy = V->getType()->getScalarType();
10917 Constant *NewVal;
10918 if (isa<PoisonValue>(V))
10919 NewVal = PoisonValue::get(ScalarTy);
10920 else if (isa<UndefValue>(V))
10921 NewVal = UndefValue::get(ScalarTy);
10922 else
10923 NewVal = Constant::getNullValue(ScalarTy);
10924 std::fill_n(NewVals.begin() + I * VecTyNumElements, VecTyNumElements,
10925 NewVal);
10926 }
10927 Vals.swap(NewVals);
10928 }
10929 return ConstantVector::get(Vals);
10930 }
10931 return ConstantVector::getSplat(
10932 ElementCount::getFixed(
10933 cast<FixedVectorType>(Root->getType())->getNumElements()),
10934 getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
10935 }
10937 /// Finalize emission of the shuffles.
10938 InstructionCost
10939 finalize(ArrayRef<int> ExtMask,
10940 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
10941 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
10942 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
10943 IsFinalized = true;
10944 if (Action) {
10945 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10946 if (InVectors.size() == 2)
10947 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10948 else
10949 Cost += createShuffle(Vec, nullptr, CommonMask);
10950 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10951 if (CommonMask[Idx] != PoisonMaskElem)
10952 CommonMask[Idx] = Idx;
10953 assert(VF > 0 &&
10954 "Expected vector length for the final value before action.");
10955 Value *V = cast<Value *>(Vec);
10956 Action(V, CommonMask);
10957 InVectors.front() = V;
10958 }
10959 if (!SubVectors.empty()) {
10960 const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
10961 if (InVectors.size() == 2)
10962 Cost += createShuffle(Vec, InVectors.back(), CommonMask);
10963 else
10964 Cost += createShuffle(Vec, nullptr, CommonMask);
10965 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
10966 if (CommonMask[Idx] != PoisonMaskElem)
10967 CommonMask[Idx] = Idx;
10968 // Add subvectors permutation cost.
10969 if (!SubVectorsMask.empty()) {
10970 assert(SubVectorsMask.size() <= CommonMask.size() &&
10971 "Expected same size of masks for subvectors and common mask.");
10972 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
10973 copy(SubVectorsMask, SVMask.begin());
10974 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
10975 if (I2 != PoisonMaskElem) {
10976 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
10977 I1 = I2 + CommonMask.size();
10978 }
10979 }
10980 Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
10981 getWidenedType(ScalarTy, CommonMask.size()),
10982 SVMask, CostKind);
10983 }
10984 for (auto [E, Idx] : SubVectors) {
10985 Type *EScalarTy = E->Scalars.front()->getType();
10986 bool IsSigned = true;
10987 if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
10988 EScalarTy =
10989 IntegerType::get(EScalarTy->getContext(), It->second.first);
10990 IsSigned = It->second.second;
10991 }
10992 if (ScalarTy != EScalarTy) {
10993 unsigned CastOpcode = Instruction::Trunc;
10994 unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
10995 unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
10996 if (DstSz > SrcSz)
10997 CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
10998 Cost += TTI.getCastInstrCost(
10999 CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
11000 getWidenedType(EScalarTy, E->getVectorFactor()),
11001 TTI::CastContextHint::None, CostKind);
11002 }
11003 Cost += ::getShuffleCost(
11004 TTI, TTI::SK_InsertSubvector,
11005 getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
11006 getWidenedType(ScalarTy, E->getVectorFactor()));
11007 if (!CommonMask.empty()) {
11008 std::iota(std::next(CommonMask.begin(), Idx),
11009 std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
11010 Idx);
11011 }
11012 }
11013 }
11014
11015 if (!ExtMask.empty()) {
11016 if (CommonMask.empty()) {
11017 CommonMask.assign(ExtMask.begin(), ExtMask.end());
11018 } else {
11019 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
11020 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
11021 if (ExtMask[I] == PoisonMaskElem)
11022 continue;
11023 NewMask[I] = CommonMask[ExtMask[I]];
11024 }
11025 CommonMask.swap(NewMask);
11026 }
11027 }
11028 if (CommonMask.empty()) {
11029 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
11030 return Cost;
11031 }
11032 return Cost +
11033 createShuffle(InVectors.front(),
11034 InVectors.size() == 2 ? InVectors.back() : nullptr,
11035 CommonMask);
11036 }
11037
11038 ~ShuffleCostEstimator() {
11039 assert((IsFinalized || CommonMask.empty()) &&
11040 "Shuffle construction must be finalized.");
11041 }
11042};
11043
11044const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
11045 unsigned Idx) const {
11046 if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
11047 return VE;
11048 const auto *It =
11049 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
11050 return TE->isGather() &&
11051 find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
11052 return EI.EdgeIdx == Idx && EI.UserTE == E;
11053 }) != TE->UserTreeIndices.end();
11054 });
11055 assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
11056 return It->get();
11057}
11058
11059TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
11060 if (TE.State == TreeEntry::ScatterVectorize ||
11061 TE.State == TreeEntry::StridedVectorize)
11062 return TTI::CastContextHint::GatherScatter;
11063 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::Load &&
11064 !TE.isAltShuffle()) {
11065 if (TE.ReorderIndices.empty())
11066 return TTI::CastContextHint::Normal;
11067 SmallVector<int> Mask;
11068 inversePermutation(TE.ReorderIndices, Mask);
11069 if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
11070 return TTI::CastContextHint::Reversed;
11071 }
11072 return TTI::CastContextHint::None;
11073 }
11074
11075/// Builds the arguments types vector for the given call instruction with the
11076/// given \p ID for the specified vector factor.
11077 static SmallVector<Type *>
11078 buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID,
11079 const unsigned VF, unsigned MinBW,
11080 const TargetTransformInfo *TTI) {
11081 SmallVector<Type *> ArgTys;
11082 for (auto [Idx, Arg] : enumerate(CI->args())) {
11083 if (ID != Intrinsic::not_intrinsic) {
11084 if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx, TTI)) {
11085 ArgTys.push_back(Arg->getType());
11086 continue;
11087 }
11088 if (MinBW > 0) {
11089 ArgTys.push_back(
11090 getWidenedType(IntegerType::get(CI->getContext(), MinBW), VF));
11091 continue;
11092 }
11093 }
11094 ArgTys.push_back(getWidenedType(Arg->getType(), VF));
11095 }
11096 return ArgTys;
11097}
11098
11099 InstructionCost
11100 BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
11101 SmallPtrSetImpl<Value *> &CheckedExtracts) {
11102 ArrayRef<Value *> VL = E->Scalars;
11103
11104 Type *ScalarTy = getValueType(VL[0]);
11105 if (!isValidElementType(ScalarTy))
11106 return InstructionCost::getInvalid();
11108
11109 // If we have computed a smaller type for the expression, update VecTy so
11110 // that the costs will be accurate.
11111 auto It = MinBWs.find(E);
11112 Type *OrigScalarTy = ScalarTy;
11113 if (It != MinBWs.end()) {
11114 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
11115 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
11116 if (VecTy)
11117 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
11118 }
11119 auto *VecTy = getWidenedType(ScalarTy, VL.size());
11120 unsigned EntryVF = E->getVectorFactor();
11121 auto *FinalVecTy = getWidenedType(ScalarTy, EntryVF);
11122
11123 if (E->isGather()) {
11124 if (allConstant(VL))
11125 return 0;
11126 if (isa<InsertElementInst>(VL[0]))
11127 return InstructionCost::getInvalid();
11128 if (isa<CmpInst>(VL.front()))
11129 ScalarTy = VL.front()->getType();
11130 return processBuildVector<ShuffleCostEstimator, InstructionCost>(
11131 E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
11132 }
11133 InstructionCost CommonCost = 0;
11134 SmallVector<int> Mask;
11135 if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
11136 !isReverseOrder(E->ReorderIndices))) {
11137 SmallVector<int> NewMask;
11138 if (E->getOpcode() == Instruction::Store) {
11139 // For stores the order is actually a mask.
11140 NewMask.resize(E->ReorderIndices.size());
11141 copy(E->ReorderIndices, NewMask.begin());
11142 } else {
11143 inversePermutation(E->ReorderIndices, NewMask);
11144 }
11145 ::addMask(Mask, NewMask);
11146 }
11147 if (!E->ReuseShuffleIndices.empty())
11148 ::addMask(Mask, E->ReuseShuffleIndices);
11149 if (!Mask.empty() && !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
11150 CommonCost =
11151 ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, FinalVecTy, Mask);
11152 assert((E->State == TreeEntry::Vectorize ||
11153 E->State == TreeEntry::ScatterVectorize ||
11154 E->State == TreeEntry::StridedVectorize) &&
11155 "Unhandled state");
11156 assert(E->getOpcode() &&
11157 ((allSameType(VL) && allSameBlock(VL)) ||
11158 (E->getOpcode() == Instruction::GetElementPtr &&
11159 E->getMainOp()->getType()->isPointerTy())) &&
11160 "Invalid VL");
11161 Instruction *VL0 = E->getMainOp();
11162 unsigned ShuffleOrOp =
11163 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
11164 if (E->CombinedOp != TreeEntry::NotCombinedOp)
11165 ShuffleOrOp = E->CombinedOp;
11166 SmallSetVector<Value *, 16> UniqueValues(VL.begin(), VL.end());
11167 const unsigned Sz = UniqueValues.size();
11168 SmallBitVector UsedScalars(Sz, false);
11169 for (unsigned I = 0; I < Sz; ++I) {
11170 if (isa<Instruction>(UniqueValues[I]) && getTreeEntry(UniqueValues[I]) == E)
11171 continue;
11172 UsedScalars.set(I);
11173 }
11174 auto GetCastContextHint = [&](Value *V) {
11175 if (const TreeEntry *OpTE = getTreeEntry(V))
11176 return getCastContextHint(*OpTE);
11177 InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
11178 if (SrcState && SrcState.getOpcode() == Instruction::Load &&
11179 !SrcState.isAltShuffle())
11180 return TTI::CastContextHint::GatherScatter;
11181 return TTI::CastContextHint::None;
11182 };
11183 auto GetCostDiff =
11184 [=](function_ref<InstructionCost(unsigned)> ScalarEltCost,
11185 function_ref<InstructionCost(InstructionCost)> VectorCost) {
11186 // Calculate the cost of this instruction.
11187 InstructionCost ScalarCost = 0;
11188 if (isa<CastInst, CallInst>(VL0)) {
11189 // For some of the instructions no need to calculate cost for each
11190 // particular instruction, we can use the cost of the single
11191 // instruction x total number of scalar instructions.
11192 ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
11193 } else {
11194 for (unsigned I = 0; I < Sz; ++I) {
11195 if (UsedScalars.test(I))
11196 continue;
11197 ScalarCost += ScalarEltCost(I);
11198 }
11199 }
11200
11201 InstructionCost VecCost = VectorCost(CommonCost);
11202 // Check if the current node must be resized, if the parent node is not
11203 // resized.
11204 if (It != MinBWs.end() && !UnaryInstruction::isCast(E->getOpcode()) &&
11205 E->Idx != 0 &&
11206 (E->getOpcode() != Instruction::Load ||
11207 !E->UserTreeIndices.empty())) {
11208 const EdgeInfo &EI =
11209 *find_if(E->UserTreeIndices, [](const EdgeInfo &EI) {
11210 return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
11211 });
11212 if (EI.UserTE->getOpcode() != Instruction::Select ||
11213 EI.EdgeIdx != 0) {
11214 auto UserBWIt = MinBWs.find(EI.UserTE);
11215 Type *UserScalarTy =
11216 EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
11217 if (UserBWIt != MinBWs.end())
11218 UserScalarTy = IntegerType::get(ScalarTy->getContext(),
11219 UserBWIt->second.first);
11220 if (ScalarTy != UserScalarTy) {
11221 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
11222 unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
11223 unsigned VecOpcode;
11224 auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
11225 if (BWSz > SrcBWSz)
11226 VecOpcode = Instruction::Trunc;
11227 else
11228 VecOpcode =
11229 It->second.second ? Instruction::SExt : Instruction::ZExt;
11230 TTI::CastContextHint CCH = GetCastContextHint(VL0);
11231 VecCost += TTI->getCastInstrCost(VecOpcode, UserVecTy, VecTy, CCH,
11232 CostKind);
11233 }
11234 }
11235 }
11236 LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
11237 ScalarCost, "Calculated costs for Tree"));
11238 return VecCost - ScalarCost;
11239 };
11240 // Calculate cost difference from vectorizing set of GEPs.
11241 // Negative value means vectorizing is profitable.
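// Illustrative example: if 4 scalar GEPs cost roughly 4 * TCC_Basic while the
// vectorized form only needs a single base pointer computation, the returned
// difference is negative and the GEP part of the tree is considered
// profitable to vectorize.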
11242 auto GetGEPCostDiff = [=](ArrayRef<Value *> Ptrs, Value *BasePtr) {
11243 assert((E->State == TreeEntry::Vectorize ||
11244 E->State == TreeEntry::StridedVectorize) &&
11245 "Entry state expected to be Vectorize or StridedVectorize here.");
11246 InstructionCost ScalarCost = 0;
11247 InstructionCost VecCost = 0;
11248 std::tie(ScalarCost, VecCost) = getGEPCosts(
11249 *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
11250 LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
11251 "Calculated GEPs cost for Tree"));
11252
11253 return VecCost - ScalarCost;
11254 };
11255
11256 auto GetMinMaxCost = [&](Type *Ty, Instruction *VI = nullptr) {
11257 auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(VI ? VI : VL);
11258 if (MinMaxID == Intrinsic::not_intrinsic)
11259 return InstructionCost::getInvalid();
11260 Type *CanonicalType = Ty;
11261 if (CanonicalType->isPtrOrPtrVectorTy())
11262 CanonicalType = CanonicalType->getWithNewType(IntegerType::get(
11263 CanonicalType->getContext(),
11264 DL->getTypeSizeInBits(CanonicalType->getScalarType())));
11265
11266 IntrinsicCostAttributes CostAttrs(MinMaxID, CanonicalType,
11267 {CanonicalType, CanonicalType});
11268 InstructionCost IntrinsicCost =
11269 TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
11270 // If the selects are the only uses of the compares, they will be
11271 // dead and we can adjust the cost by removing their cost.
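// Illustrative example:
//   %c = icmp slt i32 %a, %b
//   %m = select i1 %c, i32 %a, i32 %b
// maps to a single smin intrinsic; if %c has no other users, the compare dies,
// so its cost is subtracted from the intrinsic cost estimate below.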
11272 if (VI && SelectOnly) {
11273 assert((!Ty->isVectorTy() || SLPReVec) &&
11274 "Expected only for scalar type.");
11275 auto *CI = cast<CmpInst>(VI->getOperand(0));
11276 IntrinsicCost -= TTI->getCmpSelInstrCost(
11277 CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
11278 CostKind, {TTI::OK_AnyValue, TTI::OP_None},
11279 {TTI::OK_AnyValue, TTI::OP_None}, CI);
11280 }
11281 return IntrinsicCost;
11282 };
11283 switch (ShuffleOrOp) {
11284 case Instruction::PHI: {
11285 // Count reused scalars.
11286 InstructionCost ScalarCost = 0;
11287 SmallPtrSet<const TreeEntry *, 4> CountedOps;
11288 for (Value *V : UniqueValues) {
11289 auto *PHI = dyn_cast<PHINode>(V);
11290 if (!PHI)
11291 continue;
11292
11293 ValueList Operands(PHI->getNumIncomingValues(), nullptr);
11294 for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
11295 Value *Op = PHI->getIncomingValue(I);
11296 Operands[I] = Op;
11297 }
11298 if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
11299 if (OpTE->isSame(Operands) && CountedOps.insert(OpTE).second)
11300 if (!OpTE->ReuseShuffleIndices.empty())
11301 ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
11302 OpTE->Scalars.size());
11303 }
11304
11305 return CommonCost - ScalarCost;
11306 }
11307 case Instruction::ExtractValue:
11308 case Instruction::ExtractElement: {
11309 auto GetScalarCost = [&](unsigned Idx) {
11310 if (isa<PoisonValue>(UniqueValues[Idx]))
11311 return InstructionCost(TTI::TCC_Free);
11312
11313 auto *I = cast<Instruction>(UniqueValues[Idx]);
11314 VectorType *SrcVecTy;
11315 if (ShuffleOrOp == Instruction::ExtractElement) {
11316 auto *EE = cast<ExtractElementInst>(I);
11317 SrcVecTy = EE->getVectorOperandType();
11318 } else {
11319 auto *EV = cast<ExtractValueInst>(I);
11320 Type *AggregateTy = EV->getAggregateOperand()->getType();
11321 unsigned NumElts;
11322 if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
11323 NumElts = ATy->getNumElements();
11324 else
11325 NumElts = AggregateTy->getStructNumElements();
11326 SrcVecTy = getWidenedType(OrigScalarTy, NumElts);
11327 }
11328 if (I->hasOneUse()) {
11329 Instruction *Ext = I->user_back();
11330 if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
11331 all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
11332 // Use getExtractWithExtendCost() to calculate the cost of
11333 // extractelement/ext pair.
11334 InstructionCost Cost = TTI->getExtractWithExtendCost(
11335 Ext->getOpcode(), Ext->getType(), SrcVecTy, *getExtractIndex(I));
11336 // Subtract the cost of s|zext which is subtracted separately.
11337 Cost -= TTI->getCastInstrCost(
11338 Ext->getOpcode(), Ext->getType(), I->getType(),
11339 TTI::getCastContextHint(Ext), CostKind, Ext);
11340 return Cost;
11341 }
11342 }
11343 return TTI->getVectorInstrCost(Instruction::ExtractElement, SrcVecTy,
11344 CostKind, *getExtractIndex(I));
11345 };
11346 auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
11347 return GetCostDiff(GetScalarCost, GetVectorCost);
11348 }
11349 case Instruction::InsertElement: {
11350 assert(E->ReuseShuffleIndices.empty() &&
11351 "Unique insertelements only are expected.");
11352 auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
11353 unsigned const NumElts = SrcVecTy->getNumElements();
11354 unsigned const NumScalars = VL.size();
11355
11356 unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy);
11357
11358 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
11359 unsigned OffsetBeg = *getElementIndex(VL.front());
11360 unsigned OffsetEnd = OffsetBeg;
11361 InsertMask[OffsetBeg] = 0;
11362 for (auto [I, V] : enumerate(VL.drop_front())) {
11363 unsigned Idx = *getElementIndex(V);
11364 if (OffsetBeg > Idx)
11365 OffsetBeg = Idx;
11366 else if (OffsetEnd < Idx)
11367 OffsetEnd = Idx;
11368 InsertMask[Idx] = I + 1;
11369 }
11370 unsigned VecScalarsSz = PowerOf2Ceil(NumElts);
11371 if (NumOfParts > 0 && NumOfParts < NumElts)
11372 VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
11373 unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
11374 VecScalarsSz;
11375 unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
11376 unsigned InsertVecSz = std::min<unsigned>(
11377 PowerOf2Ceil(OffsetEnd - OffsetBeg + 1),
11378 ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
11379 bool IsWholeSubvector =
11380 OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
11381 // Check if we can safely insert a subvector. If it is not possible, just
11382 // generate a whole-sized vector and shuffle the source vector and the new
11383 // subvector.
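// Illustrative example: inserting a 4-element subvector at offset 6 of an
// 8-wide vector does not fit (6 + 4 > 8), so the code below falls back to a
// whole-sized insert vector that is blended with the source vector by a
// two-source shuffle.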
11384 if (OffsetBeg + InsertVecSz > VecSz) {
11385 // Align OffsetBeg to generate correct mask.
11386 OffsetBeg = alignDown(OffsetBeg, VecSz, Offset);
11387 InsertVecSz = VecSz;
11388 }
11389
11390 APInt DemandedElts = APInt::getZero(NumElts);
11391 // TODO: Add support for Instruction::InsertValue.
11392 SmallVector<int> Mask;
11393 if (!E->ReorderIndices.empty()) {
11394 inversePermutation(E->ReorderIndices, Mask);
11395 Mask.append(InsertVecSz - Mask.size(), PoisonMaskElem);
11396 } else {
11397 Mask.assign(VecSz, PoisonMaskElem);
11398 std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
11399 }
11400 bool IsIdentity = true;
11401 SmallVector<int> PrevMask(InsertVecSz, PoisonMaskElem);
11402 Mask.swap(PrevMask);
11403 for (unsigned I = 0; I < NumScalars; ++I) {
11404 unsigned InsertIdx = *getElementIndex(VL[PrevMask[I]]);
11405 DemandedElts.setBit(InsertIdx);
11406 IsIdentity &= InsertIdx - OffsetBeg == I;
11407 Mask[InsertIdx - OffsetBeg] = I;
11408 }
11409 assert(Offset < NumElts && "Failed to find vector index offset");
11410
11411 InstructionCost Cost = 0;
11412 Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts,
11413 /*Insert*/ true, /*Extract*/ false,
11414 CostKind);
11415
11416 // First cost - resize to actual vector size if not identity shuffle or
11417 // need to shift the vector.
11418 // Do not calculate the cost if the actual size is the register size and
11419 // we can merge this shuffle with the following SK_Select.
11420 auto *InsertVecTy = getWidenedType(ScalarTy, InsertVecSz);
11421 if (!IsIdentity)
11422 Cost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
11423 InsertVecTy, Mask);
11424 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
11425 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
11426 }));
11427 // Second cost - permutation with subvector, if some elements are from the
11428 // initial vector or inserting a subvector.
11429 // TODO: Implement the analysis of the FirstInsert->getOperand(0)
11430 // subvector of ActualVecTy.
11431 SmallBitVector InMask =
11432 isUndefVector(FirstInsert->getOperand(0),
11433 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
11434 if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
11435 if (InsertVecSz != VecSz) {
11436 auto *ActualVecTy = getWidenedType(ScalarTy, VecSz);
11437 Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, ActualVecTy, {},
11438 CostKind, OffsetBeg - Offset, InsertVecTy);
11439 } else {
11440 for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I)
11441 Mask[I] = InMask.test(I) ? PoisonMaskElem : I;
11442 for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset;
11443 I <= End; ++I)
11444 if (Mask[I] != PoisonMaskElem)
11445 Mask[I] = I + VecSz;
11446 for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
11447 Mask[I] =
11448 ((I >= InMask.size()) || InMask.test(I)) ? PoisonMaskElem : I;
11449 Cost +=
11450 ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, InsertVecTy, Mask);
11451 }
11452 }
11453 return Cost;
11454 }
11455 case Instruction::ZExt:
11456 case Instruction::SExt:
11457 case Instruction::FPToUI:
11458 case Instruction::FPToSI:
11459 case Instruction::FPExt:
11460 case Instruction::PtrToInt:
11461 case Instruction::IntToPtr:
11462 case Instruction::SIToFP:
11463 case Instruction::UIToFP:
11464 case Instruction::Trunc:
11465 case Instruction::FPTrunc:
11466 case Instruction::BitCast: {
11467 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
11468 Type *SrcScalarTy = VL0->getOperand(0)->getType();
11469 auto *SrcVecTy = getWidenedType(SrcScalarTy, VL.size());
11470 unsigned Opcode = ShuffleOrOp;
11471 unsigned VecOpcode = Opcode;
11472 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
11473 (SrcIt != MinBWs.end() || It != MinBWs.end())) {
11474 // Check if the values are candidates to demote.
11475 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
11476 if (SrcIt != MinBWs.end()) {
11477 SrcBWSz = SrcIt->second.first;
11478 unsigned SrcScalarTyNumElements = getNumElements(SrcScalarTy);
11479 SrcScalarTy = IntegerType::get(F->getContext(), SrcBWSz);
11480 SrcVecTy =
11481 getWidenedType(SrcScalarTy, VL.size() * SrcScalarTyNumElements);
11482 }
11483 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
11484 if (BWSz == SrcBWSz) {
11485 VecOpcode = Instruction::BitCast;
11486 } else if (BWSz < SrcBWSz) {
11487 VecOpcode = Instruction::Trunc;
11488 } else if (It != MinBWs.end()) {
11489 assert(BWSz > SrcBWSz && "Invalid cast!");
11490 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
11491 } else if (SrcIt != MinBWs.end()) {
11492 assert(BWSz > SrcBWSz && "Invalid cast!");
11493 VecOpcode =
11494 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
11495 }
11496 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
11497 !SrcIt->second.second) {
11498 VecOpcode = Instruction::UIToFP;
11499 }
11500 auto GetScalarCost = [&](unsigned Idx) -> InstructionCost {
11501 assert(Idx == 0 && "Expected 0 index only");
11502 return TTI->getCastInstrCost(Opcode, VL0->getType(),
11503 VL0->getOperand(0)->getType(),
11504 TTI::getCastContextHint(VL0), CostKind, VL0);
11505 };
11506 auto GetVectorCost = [=](InstructionCost CommonCost) {
11507 // Do not count cost here if minimum bitwidth is in effect and it is just
11508 // a bitcast (here it is just a noop).
11509 if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
11510 return CommonCost;
11511 auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
11512 TTI::CastContextHint CCH = GetCastContextHint(VL0->getOperand(0));
11513
11514 bool IsArithmeticExtendedReduction =
11515 E->Idx == 0 && UserIgnoreList &&
11516 all_of(*UserIgnoreList, [](Value *V) {
11517 auto *I = cast<Instruction>(V);
11518 return is_contained({Instruction::Add, Instruction::FAdd,
11519 Instruction::Mul, Instruction::FMul,
11520 Instruction::And, Instruction::Or,
11521 Instruction::Xor},
11522 I->getOpcode());
11523 });
11524 if (IsArithmeticExtendedReduction &&
11525 (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
11526 return CommonCost;
11527 return CommonCost +
11528 TTI->getCastInstrCost(VecOpcode, VecTy, SrcVecTy, CCH, CostKind,
11529 VecOpcode == Opcode ? VI : nullptr);
11530 };
11531 return GetCostDiff(GetScalarCost, GetVectorCost);
11532 }
11533 case Instruction::FCmp:
11534 case Instruction::ICmp:
11535 case Instruction::Select: {
11536 CmpPredicate VecPred, SwappedVecPred;
11537 auto MatchCmp = m_Cmp(VecPred, m_Value(), m_Value());
11538 if (match(VL0, m_Select(MatchCmp, m_Value(), m_Value())) ||
11539 match(VL0, MatchCmp))
11540 SwappedVecPred = CmpInst::getSwappedPredicate(VecPred);
11541 else
11542 SwappedVecPred = VecPred = ScalarTy->isFloatingPointTy()
11543 ? CmpInst::BAD_FCMP_PREDICATE
11544 : CmpInst::BAD_ICMP_PREDICATE;
11545 auto GetScalarCost = [&](unsigned Idx) {
11546 if (isa<PoisonValue>(UniqueValues[Idx]))
11547 return InstructionCost(TTI::TCC_Free);
11548
11549 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11550 CmpPredicate CurrentPred = ScalarTy->isFloatingPointTy()
11551 ? CmpInst::BAD_FCMP_PREDICATE
11552 : CmpInst::BAD_ICMP_PREDICATE;
11553 auto MatchCmp = m_Cmp(CurrentPred, m_Value(), m_Value());
11554 if ((!match(VI, m_Select(MatchCmp, m_Value(), m_Value())) &&
11555 !match(VI, MatchCmp)) ||
11556 (CurrentPred != static_cast<CmpInst::Predicate>(VecPred) &&
11557 CurrentPred != static_cast<CmpInst::Predicate>(SwappedVecPred)))
11558 VecPred = SwappedVecPred = ScalarTy->isFloatingPointTy()
11559 ? CmpInst::BAD_FCMP_PREDICATE
11560 : CmpInst::BAD_ICMP_PREDICATE;
11561
11562 InstructionCost ScalarCost = TTI->getCmpSelInstrCost(
11563 E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
11564 CostKind, getOperandInfo(VI->getOperand(0)),
11565 getOperandInfo(VI->getOperand(1)), VI);
11566 InstructionCost IntrinsicCost = GetMinMaxCost(OrigScalarTy, VI);
11567 if (IntrinsicCost.isValid())
11568 ScalarCost = IntrinsicCost;
11569
11570 return ScalarCost;
11571 };
11572 auto GetVectorCost = [&](InstructionCost CommonCost) {
11573 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
11574
11575 InstructionCost VecCost =
11576 TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VecPred,
11577 CostKind, getOperandInfo(E->getOperand(0)),
11578 getOperandInfo(E->getOperand(1)), VL0);
11579 if (auto *SI = dyn_cast<SelectInst>(VL0)) {
11580 auto *CondType =
11581 getWidenedType(SI->getCondition()->getType(), VL.size());
11582 unsigned CondNumElements = CondType->getNumElements();
11583 unsigned VecTyNumElements = getNumElements(VecTy);
11584 assert(VecTyNumElements >= CondNumElements &&
11585 VecTyNumElements % CondNumElements == 0 &&
11586 "Cannot vectorize Instruction::Select");
11587 if (CondNumElements != VecTyNumElements) {
11588 // When the return type is i1 but the source is fixed vector type, we
11589 // need to duplicate the condition value.
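// Illustrative example (REVEC): with a <2 x i1> condition selecting between
// wider values that flatten to 4 lanes, createReplicatedMask(2, 2) produces
// <0, 0, 1, 1>, duplicating each condition bit for the lanes it guards.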
11590 VecCost += ::getShuffleCost(
11591 *TTI, TTI::SK_PermuteSingleSrc, CondType,
11592 createReplicatedMask(VecTyNumElements / CondNumElements,
11593 CondNumElements));
11594 }
11595 }
11596 return VecCost + CommonCost;
11597 };
11598 return GetCostDiff(GetScalarCost, GetVectorCost);
11599 }
11600 case TreeEntry::MinMax: {
11601 auto GetScalarCost = [&](unsigned Idx) {
11602 return GetMinMaxCost(OrigScalarTy);
11603 };
11604 auto GetVectorCost = [&](InstructionCost CommonCost) {
11605 InstructionCost VecCost = GetMinMaxCost(VecTy);
11606 return VecCost + CommonCost;
11607 };
11608 return GetCostDiff(GetScalarCost, GetVectorCost);
11609 }
11610 case Instruction::FNeg:
11611 case Instruction::Add:
11612 case Instruction::FAdd:
11613 case Instruction::Sub:
11614 case Instruction::FSub:
11615 case Instruction::Mul:
11616 case Instruction::FMul:
11617 case Instruction::UDiv:
11618 case Instruction::SDiv:
11619 case Instruction::FDiv:
11620 case Instruction::URem:
11621 case Instruction::SRem:
11622 case Instruction::FRem:
11623 case Instruction::Shl:
11624 case Instruction::LShr:
11625 case Instruction::AShr:
11626 case Instruction::And:
11627 case Instruction::Or:
11628 case Instruction::Xor: {
11629 auto GetScalarCost = [&](unsigned Idx) {
11630 if (isa<PoisonValue>(UniqueValues[Idx]))
11631 return InstructionCost(TTI::TCC_Free);
11632
11633 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11634 unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
11635 TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
11636 TTI::OperandValueInfo Op2Info =
11637 TTI::getOperandInfo(VI->getOperand(OpIdx));
11638 SmallVector<const Value *> Operands(VI->operand_values());
11639 return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
11640 Op1Info, Op2Info, Operands, VI);
11641 };
11642 auto GetVectorCost = [=](InstructionCost CommonCost) {
11643 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
11644 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
11645 ArrayRef<Value *> Ops = E->getOperand(I);
11646 if (all_of(Ops, [&](Value *Op) {
11647 auto *CI = dyn_cast<ConstantInt>(Op);
11648 return CI && CI->getValue().countr_one() >= It->second.first;
11649 }))
11650 return CommonCost;
11651 }
11652 }
11653 unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
11654 TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
11655 TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
11656 return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
11657 Op2Info, {}, nullptr, TLI) +
11658 CommonCost;
11659 };
11660 return GetCostDiff(GetScalarCost, GetVectorCost);
11661 }
11662 case Instruction::GetElementPtr: {
11663 return CommonCost + GetGEPCostDiff(VL, VL0);
11664 }
11665 case Instruction::Load: {
11666 auto GetScalarCost = [&](unsigned Idx) {
11667 auto *VI = cast<LoadInst>(UniqueValues[Idx]);
11668 return TTI->getMemoryOpCost(Instruction::Load, OrigScalarTy,
11669 VI->getAlign(), VI->getPointerAddressSpace(),
11670 CostKind, TTI::OperandValueInfo(), VI);
11671 };
11672 auto *LI0 = cast<LoadInst>(VL0);
11673 auto GetVectorCost = [&](InstructionCost CommonCost) {
11674 InstructionCost VecLdCost;
11675 switch (E->State) {
11676 case TreeEntry::Vectorize:
11677 if (unsigned Factor = E->getInterleaveFactor()) {
11678 VecLdCost = TTI->getInterleavedMemoryOpCost(
11679 Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
11680 LI0->getPointerAddressSpace(), CostKind);
11681
11682 } else {
11683 VecLdCost = TTI->getMemoryOpCost(
11684 Instruction::Load, VecTy, LI0->getAlign(),
11685 LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo());
11686 }
11687 break;
11688 case TreeEntry::StridedVectorize: {
11689 Align CommonAlignment =
11690 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11691 VecLdCost = TTI->getStridedMemoryOpCost(
11692 Instruction::Load, VecTy, LI0->getPointerOperand(),
11693 /*VariableMask=*/false, CommonAlignment, CostKind);
11694 break;
11695 }
11696 case TreeEntry::ScatterVectorize: {
11697 Align CommonAlignment =
11698 computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
11699 VecLdCost = TTI->getGatherScatterOpCost(
11700 Instruction::Load, VecTy, LI0->getPointerOperand(),
11701 /*VariableMask=*/false, CommonAlignment, CostKind);
11702 break;
11703 }
11704 case TreeEntry::CombinedVectorize:
11705 case TreeEntry::NeedToGather:
11706 llvm_unreachable("Unexpected vectorization state.");
11707 }
11708 return VecLdCost + CommonCost;
11709 };
11710
11711 InstructionCost Cost = GetCostDiff(GetScalarCost, GetVectorCost);
11713 // If this node generates a masked gather load then it is not a terminal node.
11714 // Hence the address operand cost is estimated separately.
11714 if (E->State == TreeEntry::ScatterVectorize)
11715 return Cost;
11716
11717 // Estimate cost of GEPs since this tree node is a terminator.
11718 SmallVector<Value *> PointerOps(VL.size());
11719 for (auto [I, V] : enumerate(VL))
11720 PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
11721 return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
11722 }
11723 case Instruction::Store: {
11724 bool IsReorder = !E->ReorderIndices.empty();
11725 auto GetScalarCost = [=](unsigned Idx) {
11726 auto *VI = cast<StoreInst>(VL[Idx]);
11727 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(VI->getValueOperand());
11728 return TTI->getMemoryOpCost(Instruction::Store, OrigScalarTy,
11729 VI->getAlign(), VI->getPointerAddressSpace(),
11730 CostKind, OpInfo, VI);
11731 };
11732 auto *BaseSI =
11733 cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
11734 auto GetVectorCost = [=](InstructionCost CommonCost) {
11735 // We know that we can merge the stores. Calculate the cost.
11736 InstructionCost VecStCost;
11737 if (E->State == TreeEntry::StridedVectorize) {
11738 Align CommonAlignment =
11739 computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
11740 VecStCost = TTI->getStridedMemoryOpCost(
11741 Instruction::Store, VecTy, BaseSI->getPointerOperand(),
11742 /*VariableMask=*/false, CommonAlignment, CostKind);
11743 } else {
11744 assert(E->State == TreeEntry::Vectorize &&
11745 "Expected either strided or consecutive stores.");
11746 if (unsigned Factor = E->getInterleaveFactor()) {
11747 assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
11748 "No reused shuffles expected");
11749 CommonCost = 0;
11750 VecStCost = TTI->getInterleavedMemoryOpCost(
11751 Instruction::Store, VecTy, Factor, std::nullopt,
11752 BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind);
11753 } else {
11754 TTI::OperandValueInfo OpInfo = getOperandInfo(E->getOperand(0));
11755 VecStCost = TTI->getMemoryOpCost(
11756 Instruction::Store, VecTy, BaseSI->getAlign(),
11757 BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
11758 }
11759 }
11760 return VecStCost + CommonCost;
11761 };
11762 SmallVector<Value *> PointerOps(VL.size());
11763 for (auto [I, V] : enumerate(VL)) {
11764 unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
11765 PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
11766 }
11767
11768 return GetCostDiff(GetScalarCost, GetVectorCost) +
11769 GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
11770 }
11771 case Instruction::Call: {
11772 auto GetScalarCost = [&](unsigned Idx) {
11773 auto *CI = cast<CallInst>(UniqueValues[Idx]);
11774 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
11775 if (ID != Intrinsic::not_intrinsic) {
11776 IntrinsicCostAttributes CostAttrs(ID, *CI, 1);
11777 return TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
11778 }
11779 return TTI->getCallInstrCost(CI->getCalledFunction(),
11780 CI->getFunctionType()->getReturnType(),
11781 CI->getFunctionType()->params(), CostKind);
11782 };
11783 auto GetVectorCost = [=](InstructionCost CommonCost) {
11784 auto *CI = cast<CallInst>(VL0);
11785 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
11786 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
11787 CI, ID, VecTy->getNumElements(),
11788 It != MinBWs.end() ? It->second.first : 0, TTI);
11789 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
11790 return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
11791 };
11792 return GetCostDiff(GetScalarCost, GetVectorCost);
11793 }
11794 case Instruction::ShuffleVector: {
11795 if (!SLPReVec || E->isAltShuffle())
11796 assert(E->isAltShuffle() &&
11797 ((Instruction::isBinaryOp(E->getOpcode()) &&
11798 Instruction::isBinaryOp(E->getAltOpcode())) ||
11799 (Instruction::isCast(E->getOpcode()) &&
11800 Instruction::isCast(E->getAltOpcode())) ||
11801 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
11802 "Invalid Shuffle Vector Operand");
11803 // Try to find the previous shuffle node with the same operands and same
11804 // main/alternate ops.
11805 auto TryFindNodeWithEqualOperands = [=]() {
11806 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
11807 if (TE.get() == E)
11808 break;
11809 if (TE->hasState() && TE->isAltShuffle() &&
11810 ((TE->getOpcode() == E->getOpcode() &&
11811 TE->getAltOpcode() == E->getAltOpcode()) ||
11812 (TE->getOpcode() == E->getAltOpcode() &&
11813 TE->getAltOpcode() == E->getOpcode())) &&
11814 TE->hasEqualOperands(*E))
11815 return true;
11816 }
11817 return false;
11818 };
11819 auto GetScalarCost = [&](unsigned Idx) {
11820 if (isa<PoisonValue>(UniqueValues[Idx]))
11821 return InstructionCost(TTI::TCC_Free);
11822
11823 auto *VI = cast<Instruction>(UniqueValues[Idx]);
11824 assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
11825 (void)E;
11826 return TTI->getInstructionCost(VI, CostKind);
11827 };
11828 // Need to clear CommonCost since the final shuffle cost is included into
11829 // vector cost.
11830 auto GetVectorCost = [&, &TTIRef = *TTI](InstructionCost) {
11831 // VecCost is equal to sum of the cost of creating 2 vectors
11832 // and the cost of creating shuffle.
11833 InstructionCost VecCost = 0;
11834 if (TryFindNodeWithEqualOperands()) {
11835 LLVM_DEBUG({
11836 dbgs() << "SLP: diamond match for alternate node found.\n";
11837 E->dump();
11838 });
11839 // No need to add new vector costs here since we're going to reuse
11840 // same main/alternate vector ops, just do different shuffling.
11841 } else if (Instruction::isBinaryOp(E->getOpcode())) {
11842 VecCost =
11843 TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
11844 VecCost +=
11845 TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
11846 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
11847 auto *MaskTy = getWidenedType(Builder.getInt1Ty(), VL.size());
11848 VecCost = TTIRef.getCmpSelInstrCost(
11849 E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
11850 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11851 VL0);
11852 VecCost += TTIRef.getCmpSelInstrCost(
11853 E->getOpcode(), VecTy, MaskTy,
11854 cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
11855 {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
11856 E->getAltOp());
11857 } else {
11858 Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
11859 auto *SrcTy = getWidenedType(SrcSclTy, VL.size());
11860 if (SrcSclTy->isIntegerTy() && ScalarTy->isIntegerTy()) {
11861 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
11862 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
11863 unsigned SrcBWSz =
11864 DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
11865 if (SrcIt != MinBWs.end()) {
11866 SrcBWSz = SrcIt->second.first;
11867 SrcSclTy = IntegerType::get(SrcSclTy->getContext(), SrcBWSz);
11868 SrcTy = getWidenedType(SrcSclTy, VL.size());
11869 }
11870 if (BWSz <= SrcBWSz) {
11871 if (BWSz < SrcBWSz)
11872 VecCost =
11873 TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
11874 TTI::CastContextHint::None, CostKind);
11875 LLVM_DEBUG({
11876 dbgs()
11877 << "SLP: alternate extension, which should be truncated.\n";
11878 E->dump();
11879 });
11880 return VecCost;
11881 }
11882 }
11883 VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
11884 TTI::CastContextHint::None, CostKind);
11885 VecCost +=
11886 TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
11887 TTI::CastContextHint::None, CostKind);
11888 }
11889 SmallVector<int> Mask;
11890 E->buildAltOpShuffleMask(
11891 [&](Instruction *I) {
11892 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
11893 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
11894 *TLI);
11895 },
11896 Mask);
11897 VecCost += ::getShuffleCost(TTIRef, TTI::SK_PermuteTwoSrc,
11898 FinalVecTy, Mask, CostKind);
11899 // Patterns like [fadd,fsub] can be combined into a single instruction
11900 // in x86. Reordering them into [fsub,fadd] blocks this pattern. So we
11901 // need to take into account their order when looking for the most used
11902 // order.
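// Illustrative example: on x86, an alternating pattern that subtracts in the
// even lanes and adds in the odd lanes (e.g. blending fsub/fadd results with
// mask <0, 3> for a 2-lane vector) maps to a single addsub instruction, so
// the lane order is kept when the target reports the pattern as legal.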
11903 unsigned Opcode0 = E->getOpcode();
11904 unsigned Opcode1 = E->getAltOpcode();
11905 SmallBitVector OpcodeMask(getAltInstrMask(E->Scalars, Opcode0, Opcode1));
11906 // If this pattern is supported by the target then we consider the
11907 // order.
11908 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
11909 InstructionCost AltVecCost = TTIRef.getAltInstrCost(
11910 VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
11911 return AltVecCost < VecCost ? AltVecCost : VecCost;
11912 }
11913 // TODO: Check the reverse order too.
11914 return VecCost;
11915 };
11916 if (SLPReVec && !E->isAltShuffle())
11917 return GetCostDiff(
11918 GetScalarCost, [&](InstructionCost) -> InstructionCost {
11919 // If a group uses its mask in order, the shufflevector can be
11920 // eliminated by instcombine, so the cost is 0.
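// Illustrative example: two shufflevectors extracting subvectors <0, 1> and
// <2, 3> from the same <4 x i32> source, used in that order, reassemble the
// original value, so instcombine removes them and no shuffle cost is charged.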
11921 assert(isa<ShuffleVectorInst>(VL.front()) &&
11922 "Not supported shufflevector usage.");
11923 auto *SV = cast<ShuffleVectorInst>(VL.front());
11924 unsigned SVNumElements =
11925 cast<FixedVectorType>(SV->getOperand(0)->getType())
11926 ->getNumElements();
11927 unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
11928 for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
11929 ArrayRef<Value *> Group = VL.slice(I, GroupSize);
11930 int NextIndex = 0;
11931 if (!all_of(Group, [&](Value *V) {
11932 assert(isa<ShuffleVectorInst>(V) &&
11933 "Not supported shufflevector usage.");
11934 auto *SV = cast<ShuffleVectorInst>(V);
11935 int Index;
11936 [[maybe_unused]] bool IsExtractSubvectorMask =
11937 SV->isExtractSubvectorMask(Index);
11938 assert(IsExtractSubvectorMask &&
11939 "Not supported shufflevector usage.");
11940 if (NextIndex != Index)
11941 return false;
11942 NextIndex += SV->getShuffleMask().size();
11943 return true;
11944 }))
11945 return ::getShuffleCost(
11947 calculateShufflevectorMask(E->Scalars));
11948 }
11949 return TTI::TCC_Free;
11950 });
11951 return GetCostDiff(GetScalarCost, GetVectorCost);
11952 }
11953 case Instruction::Freeze:
11954 return CommonCost;
11955 default:
11956 llvm_unreachable("Unknown instruction");
11957 }
11958}
11959
11960bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
11961 LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
11962 << VectorizableTree.size() << " is fully vectorizable.\n");
11963
11964 auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
11965 SmallVector<int> Mask;
11966 return TE->isGather() &&
11967 !any_of(TE->Scalars,
11968 [this](Value *V) { return EphValues.contains(V); }) &&
11969 (allConstant(TE->Scalars) || isSplat(TE->Scalars) ||
11970 TE->Scalars.size() < Limit ||
11971 (((TE->hasState() &&
11972 TE->getOpcode() == Instruction::ExtractElement) ||
11973 all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
11974 isFixedVectorShuffle(TE->Scalars, Mask, AC)) ||
11975 (TE->hasState() && TE->getOpcode() == Instruction::Load &&
11976 !TE->isAltShuffle()) ||
11977 any_of(TE->Scalars, IsaPred<LoadInst>));
11978 };
11979
11980 // We only handle trees of heights 1 and 2.
11981 if (VectorizableTree.size() == 1 &&
11982 (VectorizableTree[0]->State == TreeEntry::Vectorize ||
11983 VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
11984 (ForReduction &&
11985 AreVectorizableGathers(VectorizableTree[0].get(),
11986 VectorizableTree[0]->Scalars.size()) &&
11987 VectorizableTree[0]->getVectorFactor() > 2)))
11988 return true;
11989
11990 if (VectorizableTree.size() != 2)
11991 return false;
11992
11993 // Handle splat and all-constants stores. Also try to vectorize tiny trees
11994 // with the second gather node if it has fewer scalar operands than the
11995 // initial tree element (it may be profitable to shuffle the second gather),
11996 // or if its scalars are extractelements, which form a shuffle.
11998 if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
11999 AreVectorizableGathers(VectorizableTree[1].get(),
12000 VectorizableTree[0]->Scalars.size()))
12001 return true;
12002
12003 // Gathering cost would be too much for tiny trees.
12004 if (VectorizableTree[0]->isGather() ||
12005 (VectorizableTree[1]->isGather() &&
12006 VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
12007 VectorizableTree[0]->State != TreeEntry::StridedVectorize))
12008 return false;
12009
12010 return true;
12011}
12012
12013static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
12014 TargetTransformInfo *TTI,
12015 bool MustMatchOrInst) {
12016 // Look past the root to find a source value. Arbitrarily follow the
12017 // path through operand 0 of any 'or'. Also, peek through optional
12018 // shift-left-by-multiple-of-8-bits.
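// Illustrative example: for an i32 assembled from four byte loads,
//   (zext(l3) << 24) | (zext(l2) << 16) | (zext(l1) << 8) | zext(l0)
// following operand 0 of each 'or' and peeking through the byte-multiple
// shifts eventually reaches one of the zero-extended loads.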
12019 Value *ZextLoad = Root;
12020 const APInt *ShAmtC;
12021 bool FoundOr = false;
12022 while (!isa<ConstantExpr>(ZextLoad) &&
12023 (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
12024 (match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
12025 ShAmtC->urem(8) == 0))) {
12026 auto *BinOp = cast<BinaryOperator>(ZextLoad);
12027 ZextLoad = BinOp->getOperand(0);
12028 if (BinOp->getOpcode() == Instruction::Or)
12029 FoundOr = true;
12030 }
12031 // Check if the input is an extended load of the required or/shift expression.
12032 Value *Load;
12033 if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
12034 !match(ZextLoad, m_ZExt(m_Value(Load))) || !isa<LoadInst>(Load))
12035 return false;
12036
12037 // Require that the total load bit width is a legal integer type.
12038 // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
12039 // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
12040 Type *SrcTy = Load->getType();
12041 unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
12042 if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
12043 return false;
12044
12045 // Everything matched - assume that we can fold the whole sequence using
12046 // load combining.
12047 LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
12048 << *(cast<Instruction>(Root)) << "\n");
12049
12050 return true;
12051}
12052
12054 if (RdxKind != RecurKind::Or)
12055 return false;
12056
12057 unsigned NumElts = VectorizableTree[0]->Scalars.size();
12058 Value *FirstReduced = VectorizableTree[0]->Scalars[0];
12059 return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI,
12060 /* MatchOr */ false);
12061}
12062
12064 // Peek through a final sequence of stores and check if all operations are
12065 // likely to be load-combined.
12066 unsigned NumElts = Stores.size();
12067 for (Value *Scalar : Stores) {
12068 Value *X;
12069 if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
12070 !isLoadCombineCandidateImpl(X, NumElts, TTI, /* MatchOr */ true))
12071 return false;
12072 }
12073 return true;
12074}
12075
12076bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
12077 if (!DebugCounter::shouldExecute(VectorizedGraphs))
12078 return true;
12079
12080 // Graph is empty - do nothing.
12081 if (VectorizableTree.empty()) {
12082 assert(ExternalUses.empty() && "We shouldn't have any external users");
12083
12084 return true;
12085 }
12086
12087 // No need to vectorize inserts of gathered values.
12088 if (VectorizableTree.size() == 2 &&
12089 isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
12090 VectorizableTree[1]->isGather() &&
12091 (VectorizableTree[1]->getVectorFactor() <= 2 ||
12092 !(isSplat(VectorizableTree[1]->Scalars) ||
12093 allConstant(VectorizableTree[1]->Scalars))))
12094 return true;
12095
12096 // If the graph includes only PHI nodes and gathers, it is definitely not
12097 // profitable for vectorization and we can skip it, if the cost threshold is
12098 // the default one. The cost of vectorized PHI nodes is almost always 0 plus
12099 // the cost of gathers/buildvectors.
12100 constexpr int Limit = 4;
12101 if (!ForReduction && !SLPCostThreshold.getNumOccurrences() &&
12102 !VectorizableTree.empty() &&
12103 all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12104 return (TE->isGather() &&
12105 (!TE->hasState() ||
12106 TE->getOpcode() != Instruction::ExtractElement) &&
12107 count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
12108 (TE->hasState() && TE->getOpcode() == Instruction::PHI);
12109 }))
12110 return true;
12111
12112 // We can vectorize the tree if its size is greater than or equal to the
12113 // minimum size specified by the MinTreeSize command line option.
12114 if (VectorizableTree.size() >= MinTreeSize)
12115 return false;
12116
12117 // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
12118 // can vectorize it if we can prove it fully vectorizable.
12119 if (isFullyVectorizableTinyTree(ForReduction))
12120 return false;
12121
12122 // Check if any of the gather nodes forms an insertelement buildvector
12123 // somewhere.
12124 bool IsAllowedSingleBVNode =
12125 VectorizableTree.size() > 1 ||
12126 (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
12127 !VectorizableTree.front()->isAltShuffle() &&
12128 VectorizableTree.front()->getOpcode() != Instruction::PHI &&
12129 VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
12130 allSameBlock(VectorizableTree.front()->Scalars));
12131 if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
12132 return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
12133 return isa<ExtractElementInst, UndefValue>(V) ||
12134 (IsAllowedSingleBVNode &&
12135 !V->hasNUsesOrMore(UsesLimit) &&
12136 any_of(V->users(), IsaPred<InsertElementInst>));
12137 });
12138 }))
12139 return false;
12140
12141 if (VectorizableTree.back()->isGather() &&
12142 VectorizableTree.back()->hasState() &&
12143 VectorizableTree.back()->isAltShuffle() &&
12144 VectorizableTree.back()->getVectorFactor() > 2 &&
12145 allSameBlock(VectorizableTree.back()->Scalars) &&
12146 !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
12148 getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
12149 VectorizableTree.back()->getVectorFactor()),
12150 APInt::getAllOnes(VectorizableTree.back()->getVectorFactor()),
12151 /*Insert=*/true, /*Extract=*/false,
12153 return false;
12154
12155 // Otherwise, we can't vectorize the tree. It is both tiny and not fully
12156 // vectorizable.
12157 return true;
12158}
12159
12162 constexpr unsigned SmallTree = 3;
12163 if (VectorizableTree.front()->isNonPowOf2Vec() &&
12164 getCanonicalGraphSize() <= SmallTree &&
12165 count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
12166 [](const std::unique_ptr<TreeEntry> &TE) {
12167 return TE->isGather() && TE->hasState() &&
12168 TE->getOpcode() == Instruction::Load &&
12169 !allSameBlock(TE->Scalars);
12170 }) == 1)
12171 return true;
12172 return false;
12173 }
12174 bool Res = false;
12175 for (unsigned Idx : seq<unsigned>(getTreeSize())) {
12176 TreeEntry &E = *VectorizableTree[Idx];
12177 if (!E.isGather())
12178 continue;
12179 if (E.hasState() && E.getOpcode() != Instruction::Load)
12180 return false;
12181 if (isSplat(E.Scalars) || allConstant(E.Scalars))
12182 continue;
12183 Res = true;
12184 }
12185 return Res;
12186}
12187
12189 // Walk from the bottom of the tree to the top, tracking which values are
12190 // live. When we see a call instruction that is not part of our tree,
12191 // query TTI to see if there is a cost to keeping values live over it
12192 // (for example, if spills and fills are required).
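// A small sketch of the situation being costed (the names are illustrative):
//   %a = fadd float ...        ; part of the vectorized tree
//   call void @foo()           ; not part of the tree
//   %u = fmul float %a, ...    ; tree user after the call
// The vector register that will hold the lane of %a is live across the call,
// so getCostOfKeepingLiveOverCall() is charged for each such call below.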
12193 unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
12195
12197 Instruction *PrevInst = nullptr;
12198
12199 // The entries in VectorizableTree are not necessarily ordered by their
12200 // position in basic blocks. Collect them and order them by dominance so later
12201 // instructions are guaranteed to be visited first. For instructions in
12202 // different basic blocks, we only scan to the beginning of the block, so
12203 // their order does not matter, as long as all instructions in a basic block
12204 // are grouped together. Using dominance ensures a deterministic order.
12205 SmallVector<Instruction *, 16> OrderedScalars;
12206 for (const auto &TEPtr : VectorizableTree) {
12207 if (TEPtr->State != TreeEntry::Vectorize)
12208 continue;
12209 Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
12210 if (!Inst)
12211 continue;
12212 OrderedScalars.push_back(Inst);
12213 }
12214 llvm::sort(OrderedScalars, [&](Instruction *A, Instruction *B) {
12215 auto *NodeA = DT->getNode(A->getParent());
12216 auto *NodeB = DT->getNode(B->getParent());
12217 assert(NodeA && "Should only process reachable instructions");
12218 assert(NodeB && "Should only process reachable instructions");
12219 assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
12220 "Different nodes should have different DFS numbers");
12221 if (NodeA != NodeB)
12222 return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
12223 return B->comesBefore(A);
12224 });
12225
12226 for (Instruction *Inst : OrderedScalars) {
12227 if (!PrevInst) {
12228 PrevInst = Inst;
12229 continue;
12230 }
12231
12232 // Update LiveValues.
12233 LiveValues.erase(PrevInst);
12234 for (auto &J : PrevInst->operands()) {
12235 if (isa<Instruction>(&*J) && getTreeEntry(&*J))
12236 LiveValues.insert(cast<Instruction>(&*J));
12237 }
12238
12239 LLVM_DEBUG({
12240 dbgs() << "SLP: #LV: " << LiveValues.size();
12241 for (auto *X : LiveValues)
12242 dbgs() << " " << X->getName();
12243 dbgs() << ", Looking at ";
12244 Inst->dump();
12245 });
12246
12247 // Now find the sequence of instructions between PrevInst and Inst.
12248 unsigned NumCalls = 0;
12249 BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
12250 PrevInstIt =
12251 PrevInst->getIterator().getReverse();
12252 while (InstIt != PrevInstIt) {
12253 if (PrevInstIt == PrevInst->getParent()->rend()) {
12254 PrevInstIt = Inst->getParent()->rbegin();
12255 continue;
12256 }
12257
12258 auto NoCallIntrinsic = [this](Instruction *I) {
12259 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
12260 if (II->isAssumeLikeIntrinsic())
12261 return true;
12262 FastMathFlags FMF;
12264 for (auto &ArgOp : II->args())
12265 Tys.push_back(ArgOp->getType());
12266 if (auto *FPMO = dyn_cast<FPMathOperator>(II))
12267 FMF = FPMO->getFastMathFlags();
12268 IntrinsicCostAttributes ICA(II->getIntrinsicID(), II->getType(), Tys,
12269 FMF);
12270 InstructionCost IntrCost =
12273 nullptr, II->getType(), Tys, TTI::TCK_RecipThroughput);
12274 if (IntrCost < CallCost)
12275 return true;
12276 }
12277 return false;
12278 };
12279
12280 // Debug information does not impact spill cost.
12281 if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
12282 &*PrevInstIt != PrevInst)
12283 NumCalls++;
12284
12285 ++PrevInstIt;
12286 }
12287
12288 if (NumCalls) {
12290 for (auto *II : LiveValues) {
12291 auto *ScalarTy = II->getType();
12292 if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
12293 ScalarTy = VectorTy->getElementType();
12294 V.push_back(getWidenedType(ScalarTy, BundleWidth));
12295 }
12296 Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
12297 }
12298
12299 PrevInst = Inst;
12300 }
12301
12302 return Cost;
12303}
12304
12305 /// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
12306 /// the buildvector sequence.
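/// For example, in a buildvector chain like (illustrative IR)
///   %v0 = insertelement <4 x float> poison, float %a, i64 0
///   %v1 = insertelement <4 x float> %v0, float %b, i64 1
/// isFirstInsertElement(%v0, %v1) returns true: walking operand 0 from %v1
/// reaches %v0, i.e. %v0 comes first in the sequence.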
12308 const InsertElementInst *IE2) {
12309 if (IE1 == IE2)
12310 return false;
12311 const auto *I1 = IE1;
12312 const auto *I2 = IE2;
12313 const InsertElementInst *PrevI1;
12314 const InsertElementInst *PrevI2;
12315 unsigned Idx1 = *getElementIndex(IE1);
12316 unsigned Idx2 = *getElementIndex(IE2);
12317 do {
12318 if (I2 == IE1)
12319 return true;
12320 if (I1 == IE2)
12321 return false;
12322 PrevI1 = I1;
12323 PrevI2 = I2;
12324 if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
12325 getElementIndex(I1).value_or(Idx2) != Idx2)
12326 I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
12327 if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
12328 getElementIndex(I2).value_or(Idx1) != Idx1)
12329 I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
12330 } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
12331 llvm_unreachable("Two different buildvectors not expected.");
12332}
12333
12334namespace {
12335 /// Returns the incoming Value * if the requested type is Value * too, or a
12336 /// default value otherwise.
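/// It is used by performExtractsShuffleAction below to either forward the Base
/// value (when T is Value) or produce a default-constructed result, e.g. a
/// null TreeEntry pointer when the action is instantiated for cost estimation.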
12337struct ValueSelect {
12338 template <typename U>
12339 static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
12340 return V;
12341 }
12342 template <typename U>
12343 static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
12344 return U();
12345 }
12346};
12347} // namespace
12348
12349/// Does the analysis of the provided shuffle masks and performs the requested
12350/// actions on the vectors with the given shuffle masks. It tries to do it in
12351/// several steps.
12352 /// 1. If the Base vector is not an undef vector, resize the very first mask to
12353 /// have a common VF and perform the action for 2 input vectors (including the
12354 /// non-undef Base). Other shuffle masks are combined with the result of the
12355 /// first stage and processed as a shuffle of 2 elements.
12356 /// 2. If the Base is an undef vector and there is only 1 shuffle mask, perform
12357 /// the action only for 1 vector with the given mask, if it is not the identity
12358 /// mask.
12359 /// 3. If > 2 masks are used, perform the remaining shuffle actions for 2
12360 /// vectors, combining the masks properly between the steps.
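/// For example (a sketch with two inputs of VF 4 and an undef Base), a mask
/// <0, poison, 2, poison> over the first vector and <poison, 1, poison, 3>
/// over the second are merged into one two-source shuffle with the mask
/// <0, 5, 2, 7>, where indices >= 4 address lanes of the second vector.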
12361template <typename T>
12363 MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
12364 function_ref<unsigned(T *)> GetVF,
12365 function_ref<std::pair<T *, bool>(T *, ArrayRef<int>, bool)> ResizeAction,
12367 assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
12368 SmallVector<int> Mask(ShuffleMask.begin()->second);
12369 auto VMIt = std::next(ShuffleMask.begin());
12370 T *Prev = nullptr;
12371 SmallBitVector UseMask =
12372 buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
12373 SmallBitVector IsBaseUndef = isUndefVector(Base, UseMask);
12374 if (!IsBaseUndef.all()) {
12375 // Base is not undef, need to combine it with the next subvectors.
12376 std::pair<T *, bool> Res =
12377 ResizeAction(ShuffleMask.begin()->first, Mask, /*ForSingleMask=*/false);
12378 SmallBitVector IsBasePoison = isUndefVector<true>(Base, UseMask);
12379 for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
12380 if (Mask[Idx] == PoisonMaskElem)
12381 Mask[Idx] = IsBasePoison.test(Idx) ? PoisonMaskElem : Idx;
12382 else
12383 Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
12384 }
12385 auto *V = ValueSelect::get<T *>(Base);
12386 (void)V;
12387 assert((!V || GetVF(V) == Mask.size()) &&
12388 "Expected base vector of VF number of elements.");
12389 Prev = Action(Mask, {nullptr, Res.first});
12390 } else if (ShuffleMask.size() == 1) {
12391 // Base is undef and only 1 vector is shuffled - perform the action only for
12392 // single vector, if the mask is not the identity mask.
12393 std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
12394 /*ForSingleMask=*/true);
12395 if (Res.second)
12396 // Identity mask is found.
12397 Prev = Res.first;
12398 else
12399 Prev = Action(Mask, {ShuffleMask.begin()->first});
12400 } else {
12401 // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
12402 // shuffles step by step, combining shuffle between the steps.
12403 unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
12404 unsigned Vec2VF = GetVF(VMIt->first);
12405 if (Vec1VF == Vec2VF) {
12406 // No need to resize the input vectors since they are of the same size, we
12407 // can shuffle them directly.
12408 ArrayRef<int> SecMask = VMIt->second;
12409 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12410 if (SecMask[I] != PoisonMaskElem) {
12411 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12412 Mask[I] = SecMask[I] + Vec1VF;
12413 }
12414 }
12415 Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
12416 } else {
12417 // Vectors of different sizes - resize and reshuffle.
12418 std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
12419 /*ForSingleMask=*/false);
12420 std::pair<T *, bool> Res2 =
12421 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
12422 ArrayRef<int> SecMask = VMIt->second;
12423 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12424 if (Mask[I] != PoisonMaskElem) {
12425 assert(SecMask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12426 if (Res1.second)
12427 Mask[I] = I;
12428 } else if (SecMask[I] != PoisonMaskElem) {
12429 assert(Mask[I] == PoisonMaskElem && "Multiple uses of scalars.");
12430 Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
12431 }
12432 }
12433 Prev = Action(Mask, {Res1.first, Res2.first});
12434 }
12435 VMIt = std::next(VMIt);
12436 }
12437 bool IsBaseNotUndef = !IsBaseUndef.all();
12438 (void)IsBaseNotUndef;
12439 // Perform requested actions for the remaining masks/vectors.
12440 for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
12441 // Shuffle other input vectors, if any.
12442 std::pair<T *, bool> Res =
12443 ResizeAction(VMIt->first, VMIt->second, /*ForSingleMask=*/false);
12444 ArrayRef<int> SecMask = VMIt->second;
12445 for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
12446 if (SecMask[I] != PoisonMaskElem) {
12447 assert((Mask[I] == PoisonMaskElem || IsBaseNotUndef) &&
12448 "Multiple uses of scalars.");
12449 Mask[I] = (Res.second ? I : SecMask[I]) + VF;
12450 } else if (Mask[I] != PoisonMaskElem) {
12451 Mask[I] = I;
12452 }
12453 }
12454 Prev = Action(Mask, {Prev, Res.first});
12455 }
12456 return Prev;
12457}
12458
12459namespace {
12460/// Data type for handling buildvector sequences with the reused scalars from
12461/// other tree entries.
12462template <typename T> struct ShuffledInsertData {
12463 /// List of insertelements to be replaced by shuffles.
12464 SmallVector<InsertElementInst *> InsertElements;
12465 /// The parent vectors and shuffle mask for the given list of inserts.
12467};
12468} // namespace
12469
12472 LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
12473 << VectorizableTree.size() << ".\n");
12474
12475 unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
12476
12477 SmallPtrSet<Value *, 4> CheckedExtracts;
12478 for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
12479 TreeEntry &TE = *VectorizableTree[I];
12480 // No need to count the cost for combined entries; they are already combined,
12481 // so just skip their cost.
12482 if (TE.State == TreeEntry::CombinedVectorize) {
12483 LLVM_DEBUG(
12484 dbgs() << "SLP: Skipping cost for combined node that starts with "
12485 << *TE.Scalars[0] << ".\n";
12486 TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12487 continue;
12488 }
12489 if (TE.isGather() && TE.hasState()) {
12490 if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
12491 E && E->getVectorFactor() == TE.getVectorFactor() &&
12492 E->isSame(TE.Scalars)) {
12493 // Some gather nodes might be exactly the same as some vectorizable
12494 // nodes after reordering; this needs to be handled here.
12495 LLVM_DEBUG(dbgs() << "SLP: Adding cost 0 for bundle "
12496 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
12497 << "SLP: Current total cost = " << Cost << "\n");
12498 continue;
12499 }
12500 }
12501
12502 // Exclude the cost of gather load nodes which are not used. These nodes were
12503 // built as part of the final attempt to vectorize gathered loads.
12504 assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
12505 "Expected gather nodes with users only.");
12506
12507 InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts);
12508 Cost += C;
12509 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle "
12510 << shortBundleName(TE.Scalars, TE.Idx) << ".\n"
12511 << "SLP: Current total cost = " << Cost << "\n");
12512 }
12513
12514 SmallPtrSet<Value *, 16> ExtractCostCalculated;
12515 InstructionCost ExtractCost = 0;
12517 SmallVector<APInt> DemandedElts;
12518 SmallDenseSet<Value *, 4> UsedInserts;
12520 std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
12522 SmallPtrSet<Value *, 4> ScalarOpsFromCasts;
12523 // Keep track of the {Scalar, Index, User} tuples.
12524 // On AArch64, this helps in fusing a mov instruction, associated with
12525 // extractelement, with fmul in the backend so that extractelement is free.
12527 for (ExternalUser &EU : ExternalUses) {
12528 ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
12529 }
12530 for (ExternalUser &EU : ExternalUses) {
12531 // Uses by ephemeral values are free (because the ephemeral value will be
12532 // removed prior to code generation, and so the extraction will be
12533 // removed as well).
12534 if (EphValues.count(EU.User))
12535 continue;
12536
12537 // Skip users in unreachable blocks, in EH pads (rarely executed), or in
12538 // blocks terminated with an unreachable instruction.
12539 if (BasicBlock *UserParent =
12540 EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
12541 UserParent &&
12542 (!DT->isReachableFromEntry(UserParent) || UserParent->isEHPad() ||
12543 isa_and_present<UnreachableInst>(UserParent->getTerminator())))
12544 continue;
12545
12546 // We only add extract cost once for the same scalar.
12547 if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
12548 !ExtractCostCalculated.insert(EU.Scalar).second)
12549 continue;
12550
12551 // No extract cost for vector "scalar"
12552 if (isa<FixedVectorType>(EU.Scalar->getType()))
12553 continue;
12554
12555 // If the found user is an insertelement, do not calculate the extract cost
12556 // but try to detect it as a final shuffled/identity match.
12557 if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
12558 VU && VU->getOperand(1) == EU.Scalar) {
12559 if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
12560 if (!UsedInserts.insert(VU).second)
12561 continue;
12562 std::optional<unsigned> InsertIdx = getElementIndex(VU);
12563 if (InsertIdx) {
12564 const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
12565 auto *It = find_if(
12566 ShuffledInserts,
12567 [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
12568 // Checks if 2 insertelements are from the same buildvector.
12569 InsertElementInst *VecInsert = Data.InsertElements.front();
12571 VU, VecInsert, [this](InsertElementInst *II) -> Value * {
12572 Value *Op0 = II->getOperand(0);
12573 if (getTreeEntry(II) && !getTreeEntry(Op0))
12574 return nullptr;
12575 return Op0;
12576 });
12577 });
12578 int VecId = -1;
12579 if (It == ShuffledInserts.end()) {
12580 auto &Data = ShuffledInserts.emplace_back();
12581 Data.InsertElements.emplace_back(VU);
12582 DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
12583 VecId = ShuffledInserts.size() - 1;
12584 auto It = MinBWs.find(ScalarTE);
12585 if (It != MinBWs.end() &&
12586 VectorCasts
12587 .insert(std::make_pair(ScalarTE, FTy->getElementType()))
12588 .second) {
12589 unsigned BWSz = It->second.first;
12590 unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
12591 unsigned VecOpcode;
12592 if (DstBWSz < BWSz)
12593 VecOpcode = Instruction::Trunc;
12594 else
12595 VecOpcode =
12596 It->second.second ? Instruction::SExt : Instruction::ZExt;
12599 VecOpcode, FTy,
12600 getWidenedType(IntegerType::get(FTy->getContext(), BWSz),
12601 FTy->getNumElements()),
12603 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12604 << " for extending externally used vector with "
12605 "non-equal minimum bitwidth.\n");
12606 Cost += C;
12607 }
12608 } else {
12609 if (isFirstInsertElement(VU, It->InsertElements.front()))
12610 It->InsertElements.front() = VU;
12611 VecId = std::distance(ShuffledInserts.begin(), It);
12612 }
12613 int InIdx = *InsertIdx;
12614 SmallVectorImpl<int> &Mask =
12615 ShuffledInserts[VecId].ValueMasks[ScalarTE];
12616 if (Mask.empty())
12617 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
12618 Mask[InIdx] = EU.Lane;
12619 DemandedElts[VecId].setBit(InIdx);
12620 continue;
12621 }
12622 }
12623 }
12624
12626 // If we plan to rewrite the tree in a smaller type, we will need to sign
12627 // extend the extracted value back to the original type. Here, we account
12628 // for the extract and the added cost of the sign extend if needed.
12629 InstructionCost ExtraCost = TTI::TCC_Free;
12630 auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
12631 const TreeEntry *Entry = getTreeEntry(EU.Scalar);
12632 auto It = MinBWs.find(Entry);
12633 if (It != MinBWs.end()) {
12634 auto *MinTy = IntegerType::get(F->getContext(), It->second.first);
12635 unsigned Extend = isKnownNonNegative(EU.Scalar, SimplifyQuery(*DL))
12636 ? Instruction::ZExt
12637 : Instruction::SExt;
12638 VecTy = getWidenedType(MinTy, BundleWidth);
12639 ExtraCost = TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
12640 VecTy, EU.Lane);
12641 } else {
12642 ExtraCost =
12643 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
12644 EU.Lane, EU.Scalar, ScalarUserAndIdx);
12645 }
12646 // Leave the scalar instructions as is if they are cheaper than extracts.
12647 if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
12648 Entry->getOpcode() == Instruction::Load) {
12649 // Checks if the user of the external scalar is a phi in a loop body.
12650 auto IsPhiInLoop = [&](const ExternalUser &U) {
12651 if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
12652 auto *I = cast<Instruction>(U.Scalar);
12653 const Loop *L = LI->getLoopFor(Phi->getParent());
12654 return L && (Phi->getParent() == I->getParent() ||
12655 L == LI->getLoopFor(I->getParent()));
12656 }
12657 return false;
12658 };
12659 if (!ValueToExtUses) {
12660 ValueToExtUses.emplace();
12661 for_each(enumerate(ExternalUses), [&](const auto &P) {
12662 // Ignore phis in loops.
12663 if (IsPhiInLoop(P.value()))
12664 return;
12665
12666 ValueToExtUses->try_emplace(P.value().Scalar, P.index());
12667 });
12668 }
12669 // The original instruction can be used if none of its operands are
12670 // vectorized or they are already marked as externally used.
12671 auto *Inst = cast<Instruction>(EU.Scalar);
12672 InstructionCost ScalarCost = TTI->getInstructionCost(Inst, CostKind);
12673 auto OperandIsScalar = [&](Value *V) {
12674 if (!getTreeEntry(V)) {
12675 // Some extractelements might be not vectorized, but
12676 // transformed into a shuffle and removed from the function;
12677 // consider that here.
12678 if (auto *EE = dyn_cast<ExtractElementInst>(V))
12679 return !EE->hasOneUse() || !MustGather.contains(EE);
12680 return true;
12681 }
12682 return ValueToExtUses->contains(V);
12683 };
12684 bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
12685 bool CanBeUsedAsScalarCast = false;
12686 if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
12687 if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
12688 Op && all_of(Op->operands(), OperandIsScalar)) {
12689 InstructionCost OpCost =
12690 (getTreeEntry(Op) && !ValueToExtUses->contains(Op))
12692 : 0;
12693 if (ScalarCost + OpCost <= ExtraCost) {
12694 CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
12695 ScalarCost += OpCost;
12696 }
12697 }
12698 }
12699 if (CanBeUsedAsScalar) {
12700 bool KeepScalar = ScalarCost <= ExtraCost;
12701 // Try to keep the original scalar if the user is a phi node from the same
12702 // block as the root phis currently being vectorized. It allows keeping
12703 // better ordering info for the PHIs being vectorized.
12704 bool IsProfitablePHIUser =
12705 (KeepScalar || (ScalarCost - ExtraCost <= TTI::TCC_Basic &&
12706 VectorizableTree.front()->Scalars.size() > 2)) &&
12707 VectorizableTree.front()->getOpcode() == Instruction::PHI &&
12708 !Inst->hasNUsesOrMore(UsesLimit) &&
12709 none_of(Inst->users(),
12710 [&](User *U) {
12711 auto *PHIUser = dyn_cast<PHINode>(U);
12712 return (!PHIUser ||
12713 PHIUser->getParent() !=
12714 cast<Instruction>(
12715 VectorizableTree.front()->getMainOp())
12716 ->getParent()) &&
12717 !getTreeEntry(U);
12718 }) &&
12719 count_if(Entry->Scalars, [&](Value *V) {
12720 return ValueToExtUses->contains(V);
12721 }) <= 2;
12722 if (IsProfitablePHIUser) {
12723 KeepScalar = true;
12724 } else if (KeepScalar && ScalarCost != TTI::TCC_Free &&
12725 ExtraCost - ScalarCost <= TTI::TCC_Basic &&
12726 (!GatheredLoadsEntriesFirst.has_value() ||
12727 Entry->Idx < *GatheredLoadsEntriesFirst)) {
12728 unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
12729 return ValueToExtUses->contains(V);
12730 });
12731 auto It = ExtractsCount.find(Entry);
12732 if (It != ExtractsCount.end()) {
12733 assert(ScalarUsesCount >= It->getSecond().size() &&
12734 "Expected total number of external uses not less than "
12735 "number of scalar uses.");
12736 ScalarUsesCount -= It->getSecond().size();
12737 }
12738 // Keep the original scalar if the number of externally used instructions
12739 // in the same entry is not a power of 2. It may help to do some extra
12740 // vectorization for now.
12741 KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
12742 }
12743 if (KeepScalar) {
12744 ExternalUsesAsOriginalScalar.insert(EU.Scalar);
12745 for_each(Inst->operands(), [&](Value *V) {
12746 auto It = ValueToExtUses->find(V);
12747 if (It != ValueToExtUses->end()) {
12748 // Replace all uses to avoid compiler crash.
12749 ExternalUses[It->second].User = nullptr;
12750 }
12751 });
12752 ExtraCost = ScalarCost;
12753 if (!IsPhiInLoop(EU))
12754 ExtractsCount[Entry].insert(Inst);
12755 if (CanBeUsedAsScalarCast) {
12756 ScalarOpsFromCasts.insert(Inst->getOperand(0));
12757 // Update the users of the operands of the cast operand to avoid
12758 // compiler crash.
12759 if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
12760 for_each(IOp->operands(), [&](Value *V) {
12761 auto It = ValueToExtUses->find(V);
12762 if (It != ValueToExtUses->end()) {
12763 // Replace all uses to avoid compiler crash.
12764 ExternalUses[It->second].User = nullptr;
12765 }
12766 });
12767 }
12768 }
12769 }
12770 }
12771 }
12772
12773 ExtractCost += ExtraCost;
12774 }
12775 // Add external uses for the operands of casts that are to be emitted as
12776 // scalars instead of extractelements.
12777 for (Value *V : ScalarOpsFromCasts) {
12778 ExternalUsesAsOriginalScalar.insert(V);
12779 if (const TreeEntry *E = getTreeEntry(V)) {
12780 ExternalUses.emplace_back(V, nullptr, E->findLaneForValue(V));
12781 }
12782 }
12783 // Add reduced value cost, if resized.
12784 if (!VectorizedVals.empty()) {
12785 const TreeEntry &Root = *VectorizableTree.front();
12786 auto BWIt = MinBWs.find(&Root);
12787 if (BWIt != MinBWs.end()) {
12788 Type *DstTy = Root.Scalars.front()->getType();
12789 unsigned OriginalSz = DL->getTypeSizeInBits(DstTy->getScalarType());
12790 unsigned SrcSz =
12791 ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
12792 if (OriginalSz != SrcSz) {
12793 unsigned Opcode = Instruction::Trunc;
12794 if (OriginalSz > SrcSz)
12795 Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
12796 Type *SrcTy = IntegerType::get(DstTy->getContext(), SrcSz);
12797 if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
12798 assert(SLPReVec && "Only supported by REVEC.");
12799 SrcTy = getWidenedType(SrcTy, VecTy->getNumElements());
12800 }
12801 Cost += TTI->getCastInstrCost(Opcode, DstTy, SrcTy,
12804 }
12805 }
12806 }
12807
12808 InstructionCost SpillCost = getSpillCost();
12809 Cost += SpillCost + ExtractCost;
12810 auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
12811 bool) {
12812 InstructionCost C = 0;
12813 unsigned VF = Mask.size();
12814 unsigned VecVF = TE->getVectorFactor();
12815 if (VF != VecVF &&
12816 (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
12818 SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
12819 std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
12820 OrigMask.begin());
12822 getWidenedType(TE->getMainOp()->getType(), VecVF),
12823 OrigMask);
12824 LLVM_DEBUG(
12825 dbgs() << "SLP: Adding cost " << C
12826 << " for final shuffle of insertelement external users.\n";
12827 TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12828 Cost += C;
12829 return std::make_pair(TE, true);
12830 }
12831 return std::make_pair(TE, false);
12832 };
12833 // Calculate the cost of the reshuffled vectors, if any.
12834 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
12835 Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
12836 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
12837 unsigned VF = 0;
12838 auto EstimateShufflesCost = [&](ArrayRef<int> Mask,
12840 assert((TEs.size() == 1 || TEs.size() == 2) &&
12841 "Expected exactly 1 or 2 tree entries.");
12842 if (TEs.size() == 1) {
12843 if (VF == 0)
12844 VF = TEs.front()->getVectorFactor();
12845 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12846 if (!ShuffleVectorInst::isIdentityMask(Mask, VF) &&
12847 !all_of(enumerate(Mask), [=](const auto &Data) {
12848 return Data.value() == PoisonMaskElem ||
12849 (Data.index() < VF &&
12850 static_cast<int>(Data.index()) == Data.value());
12851 })) {
12854 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12855 << " for final shuffle of insertelement "
12856 "external users.\n";
12857 TEs.front()->dump();
12858 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12859 Cost += C;
12860 }
12861 } else {
12862 if (VF == 0) {
12863 if (TEs.front() &&
12864 TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
12865 VF = TEs.front()->getVectorFactor();
12866 else
12867 VF = Mask.size();
12868 }
12869 auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
12872 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
12873 << " for final shuffle of vector node and external "
12874 "insertelement users.\n";
12875 if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
12876 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12877 Cost += C;
12878 }
12879 VF = Mask.size();
12880 return TEs.back();
12881 };
12882 (void)performExtractsShuffleAction<const TreeEntry>(
12883 MutableArrayRef(Vector.data(), Vector.size()), Base,
12884 [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
12885 EstimateShufflesCost);
12887 cast<FixedVectorType>(
12888 ShuffledInserts[I].InsertElements.front()->getType()),
12889 DemandedElts[I],
12890 /*Insert*/ true, /*Extract*/ false, TTI::TCK_RecipThroughput);
12891 Cost -= InsertCost;
12892 }
12893
12894 // Add the cost for reduced value resize (if required).
12895 if (ReductionBitWidth != 0) {
12896 assert(UserIgnoreList && "Expected reduction tree.");
12897 const TreeEntry &E = *VectorizableTree.front();
12898 auto It = MinBWs.find(&E);
12899 if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
12900 unsigned SrcSize = It->second.first;
12901 unsigned DstSize = ReductionBitWidth;
12902 unsigned Opcode = Instruction::Trunc;
12903 if (SrcSize < DstSize) {
12904 bool IsArithmeticExtendedReduction =
12905 all_of(*UserIgnoreList, [](Value *V) {
12906 auto *I = cast<Instruction>(V);
12907 return is_contained({Instruction::Add, Instruction::FAdd,
12908 Instruction::Mul, Instruction::FMul,
12909 Instruction::And, Instruction::Or,
12910 Instruction::Xor},
12911 I->getOpcode());
12912 });
12913 if (IsArithmeticExtendedReduction)
12914 Opcode =
12915 Instruction::BitCast; // Handle it by getExtendedReductionCost
12916 else
12917 Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
12918 }
12919 if (Opcode != Instruction::BitCast) {
12920 auto *SrcVecTy =
12921 getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
12922 auto *DstVecTy =
12923 getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
12924 TTI::CastContextHint CCH = getCastContextHint(E);
12925 InstructionCost CastCost;
12926 switch (E.getOpcode()) {
12927 case Instruction::SExt:
12928 case Instruction::ZExt:
12929 case Instruction::Trunc: {
12930 const TreeEntry *OpTE = getOperandEntry(&E, 0);
12931 CCH = getCastContextHint(*OpTE);
12932 break;
12933 }
12934 default:
12935 break;
12936 }
12937 CastCost += TTI->getCastInstrCost(Opcode, DstVecTy, SrcVecTy, CCH,
12939 Cost += CastCost;
12940 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << CastCost
12941 << " for final resize for reduction from " << SrcVecTy
12942 << " to " << DstVecTy << "\n";
12943 dbgs() << "SLP: Current total cost = " << Cost << "\n");
12944 }
12945 }
12946 }
12947
12948#ifndef NDEBUG
12949 SmallString<256> Str;
12950 {
12952 OS << "SLP: Spill Cost = " << SpillCost << ".\n"
12953 << "SLP: Extract Cost = " << ExtractCost << ".\n"
12954 << "SLP: Total Cost = " << Cost << ".\n";
12955 }
12956 LLVM_DEBUG(dbgs() << Str);
12957 if (ViewSLPTree)
12958 ViewGraph(this, "SLP" + F->getName(), false, Str);
12959#endif
12960
12961 return Cost;
12962}
12963
12964 /// Tries to find extractelement instructions with constant indices from a
12965 /// fixed vector type and gathers such instructions into a bunch, which is
12966 /// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
12967 /// attempt was successful, the matched scalars are replaced by poison values
12968 /// in \p VL for future analysis.
12969std::optional<TTI::ShuffleKind>
12970BoUpSLP::tryToGatherSingleRegisterExtractElements(
12972 // Scan list of gathered scalars for extractelements that can be represented
12973 // as shuffles.
12975 SmallVector<int> UndefVectorExtracts;
12976 for (int I = 0, E = VL.size(); I < E; ++I) {
12977 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
12978 if (!EI) {
12979 if (isa<UndefValue>(VL[I]))
12980 UndefVectorExtracts.push_back(I);
12981 continue;
12982 }
12983 auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
12984 if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
12985 continue;
12986 std::optional<unsigned> Idx = getExtractIndex(EI);
12987 // Undefined index.
12988 if (!Idx) {
12989 UndefVectorExtracts.push_back(I);
12990 continue;
12991 }
12992 if (Idx >= VecTy->getNumElements()) {
12993 UndefVectorExtracts.push_back(I);
12994 continue;
12995 }
12996 SmallBitVector ExtractMask(VecTy->getNumElements(), true);
12997 ExtractMask.reset(*Idx);
12998 if (isUndefVector(EI->getVectorOperand(), ExtractMask).all()) {
12999 UndefVectorExtracts.push_back(I);
13000 continue;
13001 }
13002 VectorOpToIdx[EI->getVectorOperand()].push_back(I);
13003 }
13004 // Sort the vector operands by the maximum number of uses in extractelements.
13006 VectorOpToIdx.takeVector();
13007 stable_sort(Vectors, [](const auto &P1, const auto &P2) {
13008 return P1.second.size() > P2.second.size();
13009 });
13010 // Find the best pair of the vectors or a single vector.
13011 const int UndefSz = UndefVectorExtracts.size();
13012 unsigned SingleMax = 0;
13013 unsigned PairMax = 0;
13014 if (!Vectors.empty()) {
13015 SingleMax = Vectors.front().second.size() + UndefSz;
13016 if (Vectors.size() > 1) {
13017 auto *ItNext = std::next(Vectors.begin());
13018 PairMax = SingleMax + ItNext->second.size();
13019 }
13020 }
13021 if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
13022 return std::nullopt;
13023 // Check if it is better to perform a shuffle of 2 vectors or just of a
13024 // single vector.
13025 SmallVector<Value *> SavedVL(VL.begin(), VL.end());
13026 SmallVector<Value *> GatheredExtracts(
13027 VL.size(), PoisonValue::get(VL.front()->getType()));
13028 if (SingleMax >= PairMax && SingleMax) {
13029 for (int Idx : Vectors.front().second)
13030 std::swap(GatheredExtracts[Idx], VL[Idx]);
13031 } else if (!Vectors.empty()) {
13032 for (unsigned Idx : {0, 1})
13033 for (int Idx : Vectors[Idx].second)
13034 std::swap(GatheredExtracts[Idx], VL[Idx]);
13035 }
13036 // Add extracts from undefs too.
13037 for (int Idx : UndefVectorExtracts)
13038 std::swap(GatheredExtracts[Idx], VL[Idx]);
13039 // Check that gather of extractelements can be represented as just a
13040 // shuffle of a single/two vectors the scalars are extracted from.
13041 std::optional<TTI::ShuffleKind> Res =
13042 isFixedVectorShuffle(GatheredExtracts, Mask, AC);
13043 if (!Res || all_of(Mask, [](int Idx) { return Idx == PoisonMaskElem; })) {
13044 // TODO: try to check other subsets if possible.
13045 // Restore the original VL if attempt was not successful.
13046 copy(SavedVL, VL.begin());
13047 return std::nullopt;
13048 }
13049 // Restore unused scalars from mask, if some of the extractelements were not
13050 // selected for shuffle.
13051 for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
13052 if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
13053 isa<UndefValue>(GatheredExtracts[I])) {
13054 std::swap(VL[I], GatheredExtracts[I]);
13055 continue;
13056 }
13057 auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
13058 if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
13059 !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
13060 is_contained(UndefVectorExtracts, I))
13061 continue;
13062 }
13063 return Res;
13064}
13065
13066 /// Tries to find extractelement instructions with constant indices from a
13067 /// fixed vector type and gathers such instructions into a bunch, which is
13068 /// highly likely to be detected as a shuffle of 1 or 2 input vectors. If this
13069 /// attempt was successful, the matched scalars are replaced by poison values
13070 /// in \p VL for future analysis.
13072BoUpSLP::tryToGatherExtractElements(SmallVectorImpl<Value *> &VL,
13074 unsigned NumParts) const {
13075 assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
13076 SmallVector<std::optional<TTI::ShuffleKind>> ShufflesRes(NumParts);
13077 Mask.assign(VL.size(), PoisonMaskElem);
13078 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13079 for (unsigned Part : seq<unsigned>(NumParts)) {
13080 // Scan list of gathered scalars for extractelements that can be represented
13081 // as shuffles.
13083 Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
13084 SmallVector<int> SubMask;
13085 std::optional<TTI::ShuffleKind> Res =
13086 tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
13087 ShufflesRes[Part] = Res;
13088 copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
13089 }
13090 if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
13091 return Res.has_value();
13092 }))
13093 ShufflesRes.clear();
13094 return ShufflesRes;
13095}
13096
13097std::optional<TargetTransformInfo::ShuffleKind>
13098BoUpSLP::isGatherShuffledSingleRegisterEntry(
13099 const TreeEntry *TE, ArrayRef<Value *> VL, MutableArrayRef<int> Mask,
13100 SmallVectorImpl<const TreeEntry *> &Entries, unsigned Part, bool ForOrder) {
13101 Entries.clear();
13102 // TODO: currently checking only for Scalars in the tree entry, need to count
13103 // reused elements too for better cost estimation.
13104 const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get()
13105 ? EdgeInfo(const_cast<TreeEntry *>(TE), 0)
13106 : TE->UserTreeIndices.front();
13107 const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
13108 const BasicBlock *TEInsertBlock = nullptr;
13109 // Main node of PHI entries keeps the correct order of operands/incoming
13110 // blocks.
13111 if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
13112 TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
13113 TEInsertPt = TEInsertBlock->getTerminator();
13114 } else {
13115 TEInsertBlock = TEInsertPt->getParent();
13116 }
13117 if (!DT->isReachableFromEntry(TEInsertBlock))
13118 return std::nullopt;
13119 auto *NodeUI = DT->getNode(TEInsertBlock);
13120 assert(NodeUI && "Should only process reachable instructions");
13121 SmallPtrSet<Value *, 4> GatheredScalars(VL.begin(), VL.end());
13122 auto CheckOrdering = [&](const Instruction *InsertPt) {
13123 // Argument InsertPt is an instruction where vector code for some other
13124 // tree entry (one that shares one or more scalars with TE) is going to be
13125 // generated. This lambda returns true if the insertion point of vector code
13126 // for the TE dominates that point (otherwise the dependency is the other way
13127 // around). The other node is not limited to be of a gather kind. Gather
13128 // nodes are not scheduled and their vector code is inserted before their
13129 // first user. If the user is a PHI, that is supposed to be at the end of a
13130 // predecessor block. Otherwise it is the last instruction among scalars of
13131 // the user node. So, instead of checking dependency between instructions
13132 // themselves, we check dependency between their insertion points for vector
13133 // code (since each scalar instruction ends up as a lane of a vector
13134 // instruction).
13135 const BasicBlock *InsertBlock = InsertPt->getParent();
13136 auto *NodeEUI = DT->getNode(InsertBlock);
13137 if (!NodeEUI)
13138 return false;
13139 assert((NodeUI == NodeEUI) ==
13140 (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
13141 "Different nodes should have different DFS numbers");
13142 // Check the order of the gather nodes users.
13143 if (TEInsertPt->getParent() != InsertBlock &&
13144 (DT->dominates(NodeUI, NodeEUI) || !DT->dominates(NodeEUI, NodeUI)))
13145 return false;
13146 if (TEInsertPt->getParent() == InsertBlock &&
13147 TEInsertPt->comesBefore(InsertPt))
13148 return false;
13149 return true;
13150 };
13151 // Find all tree entries used by the gathered values. If no common entries
13152 // are found, it is not a shuffle.
13153 // Here we build a set of tree nodes for each gathered value and try to
13154 // find the intersection between these sets. If we have at least one common
13155 // tree node for each gathered value, we have just a permutation of a
13156 // single vector. If we have 2 different sets, we're in a situation where we
13157 // have a permutation of 2 input vectors.
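// For example (a sketch, entry names are illustrative): if the gathered
// scalars are {a, b, c, d} and a, c belong to vectorized entry E1 while b, d
// belong to entry E2, we end up with the two sets {E1} and {E2}, i.e. the
// gather is a permutation of two source vectors.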
13159 DenseMap<Value *, int> UsedValuesEntry;
13160 for (Value *V : VL) {
13161 if (isConstant(V))
13162 continue;
13163 // Build a list of tree entries where V is used.
13165 for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
13166 if (TEPtr == TE || TEPtr->Idx == 0)
13167 continue;
13168 assert(any_of(TEPtr->Scalars,
13169 [&](Value *V) { return GatheredScalars.contains(V); }) &&
13170 "Must contain at least single gathered value.");
13171 assert(TEPtr->UserTreeIndices.size() == 1 &&
13172 "Expected only single user of a gather node.");
13173 const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
13174
13175 PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
13176 const Instruction *InsertPt =
13177 UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator()
13178 : &getLastInstructionInBundle(UseEI.UserTE);
13179 if (TEInsertPt == InsertPt) {
13180 // If 2 gathers are operands of the same entry (regardless of whether
13181 // the user is a PHI or not), compare the operand indices and use the
13182 // earlier one as the base.
13183 if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
13184 continue;
13185 // If the user instruction is used for some reason in different
13186 // vectorized nodes, make the decision depend on the node index.
13187 if (TEUseEI.UserTE != UseEI.UserTE &&
13188 TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
13189 continue;
13190 }
13191
13192 // Check if the user node of the TE comes after user node of TEPtr,
13193 // otherwise TEPtr depends on TE.
13194 if ((TEInsertBlock != InsertPt->getParent() ||
13195 TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
13196 !CheckOrdering(InsertPt))
13197 continue;
13198 VToTEs.insert(TEPtr);
13199 }
13200 if (const TreeEntry *VTE = getTreeEntry(V)) {
13201 if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0)) {
13202 if (VTE->State != TreeEntry::Vectorize) {
13203 auto It = MultiNodeScalars.find(V);
13204 if (It == MultiNodeScalars.end())
13205 continue;
13206 VTE = *It->getSecond().begin();
13207 // Iterate through all vectorized nodes.
13208 auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
13209 return MTE->State == TreeEntry::Vectorize;
13210 });
13211 if (MIt == It->getSecond().end())
13212 continue;
13213 VTE = *MIt;
13214 }
13215 }
13216 if (none_of(TE->CombinedEntriesWithIndices,
13217 [&](const auto &P) { return P.first == VTE->Idx; })) {
13218 Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
13219 if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
13220 continue;
13221 }
13222 VToTEs.insert(VTE);
13223 }
13224 if (VToTEs.empty())
13225 continue;
13226 if (UsedTEs.empty()) {
13227 // On the first iteration, just insert the list of nodes into the vector.
13228 UsedTEs.push_back(VToTEs);
13229 UsedValuesEntry.try_emplace(V, 0);
13230 } else {
13231 // Need to check if there are any previously used tree nodes which use V.
13232 // If there are no such nodes, consider that we have another input
13233 // vector.
13234 SmallPtrSet<const TreeEntry *, 4> SavedVToTEs(VToTEs);
13235 unsigned Idx = 0;
13236 for (SmallPtrSet<const TreeEntry *, 4> &Set : UsedTEs) {
13237 // Do we have a non-empty intersection of previously listed tree entries
13238 // and tree entries using current V?
13239 set_intersect(VToTEs, Set);
13240 if (!VToTEs.empty()) {
13241 // Yes, write the new subset and continue analysis for the next
13242 // scalar.
13243 Set.swap(VToTEs);
13244 break;
13245 }
13246 VToTEs = SavedVToTEs;
13247 ++Idx;
13248 }
13249 // No non-empty intersection found - need to add a second set of possible
13250 // source vectors.
13251 if (Idx == UsedTEs.size()) {
13252 // If the number of input vectors is greater than 2, this is not a
13253 // permutation; fall back to the regular gather.
13254 // TODO: support multiple reshuffled nodes.
13255 if (UsedTEs.size() == 2)
13256 continue;
13257 UsedTEs.push_back(SavedVToTEs);
13258 Idx = UsedTEs.size() - 1;
13259 }
13260 UsedValuesEntry.try_emplace(V, Idx);
13261 }
13262 }
13263
13264 if (UsedTEs.empty()) {
13265 Entries.clear();
13266 return std::nullopt;
13267 }
13268
13269 unsigned VF = 0;
13270 if (UsedTEs.size() == 1) {
13271 // Keep the order to avoid non-determinism.
13272 SmallVector<const TreeEntry *> FirstEntries(UsedTEs.front().begin(),
13273 UsedTEs.front().end());
13274 sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
13275 return TE1->Idx < TE2->Idx;
13276 });
13277 // Try to find the perfect match in another gather node first.
13278 auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
13279 return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
13280 });
13281 if (It != FirstEntries.end() &&
13282 ((*It)->getVectorFactor() == VL.size() ||
13283 ((*It)->getVectorFactor() == TE->Scalars.size() &&
13284 TE->ReuseShuffleIndices.size() == VL.size() &&
13285 (*It)->isSame(TE->Scalars)))) {
13286 Entries.push_back(*It);
13287 if ((*It)->getVectorFactor() == VL.size()) {
13288 std::iota(std::next(Mask.begin(), Part * VL.size()),
13289 std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
13290 } else {
13291 SmallVector<int> CommonMask = TE->getCommonMask();
13292 copy(CommonMask, Mask.begin());
13293 }
13294 // Clear undef scalars.
13295 for (unsigned I : seq<unsigned>(VL.size()))
13296 if (isa<PoisonValue>(VL[I]))
13297 Mask[Part * VL.size() + I] = PoisonMaskElem;
13299 }
13300 // No perfect match found, so we will just shuffle; choose the first tree
13301 // node from the candidate list.
13302 Entries.push_back(FirstEntries.front());
13303 VF = FirstEntries.front()->getVectorFactor();
13304 } else {
13305 // Try to find nodes with the same vector factor.
13306 assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
13307 // Keep the order of tree nodes to avoid non-determinism.
13309 for (const TreeEntry *TE : UsedTEs.front()) {
13310 unsigned VF = TE->getVectorFactor();
13311 auto It = VFToTE.find(VF);
13312 if (It != VFToTE.end()) {
13313 if (It->second->Idx > TE->Idx)
13314 It->getSecond() = TE;
13315 continue;
13316 }
13317 VFToTE.try_emplace(VF, TE);
13318 }
13319 // Same, keep the order to avoid non-determinism.
13320 SmallVector<const TreeEntry *> SecondEntries(UsedTEs.back().begin(),
13321 UsedTEs.back().end());
13322 sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
13323 return TE1->Idx < TE2->Idx;
13324 });
13325 for (const TreeEntry *TE : SecondEntries) {
13326 auto It = VFToTE.find(TE->getVectorFactor());
13327 if (It != VFToTE.end()) {
13328 VF = It->first;
13329 Entries.push_back(It->second);
13330 Entries.push_back(TE);
13331 break;
13332 }
13333 }
13334 // No 2 source vectors with the same vector factor - just choose 2 with max
13335 // index.
13336 if (Entries.empty()) {
13337 Entries.push_back(*llvm::max_element(
13338 UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
13339 return TE1->Idx < TE2->Idx;
13340 }));
13341 Entries.push_back(SecondEntries.front());
13342 VF = std::max(Entries.front()->getVectorFactor(),
13343 Entries.back()->getVectorFactor());
13344 } else {
13345 VF = Entries.front()->getVectorFactor();
13346 }
13347 }
13348
13349 bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
13350 // Checks if the 2 PHIs are compatible, i.e. have a high possibility of
13351 // being vectorized together.
13352 auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
13353 auto *PHI = cast<PHINode>(V);
13354 auto *PHI1 = cast<PHINode>(V1);
13355 // Check that all incoming values are compatible/from the same parent (if
13356 // they are instructions).
13357 // The incoming values are compatible if they all are constants, or
13358 // instructions with the same/alternate opcodes from the same basic block.
13359 for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
13360 Value *In = PHI->getIncomingValue(I);
13361 Value *In1 = PHI1->getIncomingValue(I);
13362 if (isConstant(In) && isConstant(In1))
13363 continue;
13364 if (!getSameOpcode({In, In1}, *TLI))
13365 return false;
13366 if (cast<Instruction>(In)->getParent() !=
13367 cast<Instruction>(In1)->getParent())
13368 return false;
13369 }
13370 return true;
13371 };
13372 // Check if the value can be ignored during analysis for shuffled gathers.
13373 // We suppose it is better to ignore instructions which do not form splats,
13374 // are not vectorized/not extractelements (these instructions will be handled
13375 // by the extractelements processing) or may form a vector node in the future.
13376 auto MightBeIgnored = [=](Value *V) {
13377 auto *I = dyn_cast<Instruction>(V);
13378 return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
13380 !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
13381 };
13382 // Check that the neighbor instruction may form a full vector node with the
13383 // current instruction V. It is possible if they have the same/alternate
13384 // opcode and the same parent basic block.
13385 auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
13386 Value *V1 = VL[Idx];
13387 bool UsedInSameVTE = false;
13388 auto It = UsedValuesEntry.find(V1);
13389 if (It != UsedValuesEntry.end())
13390 UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
13391 return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
13392 getSameOpcode({V, V1}, *TLI) &&
13393 cast<Instruction>(V)->getParent() ==
13394 cast<Instruction>(V1)->getParent() &&
13395 (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
13396 };
13397 // Build a shuffle mask for better cost estimation and vector emission.
13398 SmallBitVector UsedIdxs(Entries.size());
13400 for (int I = 0, E = VL.size(); I < E; ++I) {
13401 Value *V = VL[I];
13402 auto It = UsedValuesEntry.find(V);
13403 if (It == UsedValuesEntry.end())
13404 continue;
13405 // Do not try to shuffle scalars if they are constants, or instructions
13406 // that can be vectorized as a result of the following buildvector
13407 // vectorization.
13408 if (isConstant(V) || (MightBeIgnored(V) &&
13409 ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
13410 (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
13411 continue;
13412 unsigned Idx = It->second;
13413 EntryLanes.emplace_back(Idx, I);
13414 UsedIdxs.set(Idx);
13415 }
13416 // Iterate through all shuffled scalars and select entries, which can be used
13417 // for final shuffle.
13419 for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
13420 if (!UsedIdxs.test(I))
13421 continue;
13422 // Fix the entry number for the given scalar. If it is the first entry, set
13423 // Pair.first to 0, otherwise to 1 (currently at most 2 nodes are selected).
13424 // These indices are used as the vector offset when calculating the final
13425 // shuffle mask.
13426 for (std::pair<unsigned, int> &Pair : EntryLanes)
13427 if (Pair.first == I)
13428 Pair.first = TempEntries.size();
13429 TempEntries.push_back(Entries[I]);
13430 }
13431 Entries.swap(TempEntries);
13432 if (EntryLanes.size() == Entries.size() &&
13433 !VL.equals(ArrayRef(TE->Scalars)
13434 .slice(Part * VL.size(),
13435 std::min<int>(VL.size(), TE->Scalars.size())))) {
13436 // We may have here 1 or 2 entries only. If the number of scalars is equal
13437 // to the number of entries, no need to do the analysis, it is not very
13438 // profitable. Since VL is not the same as TE->Scalars, it means we already
13439 // have some shuffles before. Cut off the unprofitable case.
13440 Entries.clear();
13441 return std::nullopt;
13442 }
13443 // Build the final mask, check for the identity shuffle, if possible.
13444 bool IsIdentity = Entries.size() == 1;
13445 // Pair.first is the offset to the vector, while Pair.second is the index of
13446 // scalar in the list.
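// E.g. (sketch), with VF == 4, a scalar found at lane 2 of the second entry
// (Pair.first == 1) gets the mask value 1 * 4 + 2 == 6.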
13447 for (const std::pair<unsigned, int> &Pair : EntryLanes) {
13448 unsigned Idx = Part * VL.size() + Pair.second;
13449 Mask[Idx] =
13450 Pair.first * VF +
13451 (ForOrder ? std::distance(
13452 Entries[Pair.first]->Scalars.begin(),
13453 find(Entries[Pair.first]->Scalars, VL[Pair.second]))
13454 : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
13455 IsIdentity &= Mask[Idx] == Pair.second;
13456 }
13457 if (ForOrder || IsIdentity || Entries.empty()) {
13458 switch (Entries.size()) {
13459 case 1:
13460 if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
13462 break;
13463 case 2:
13464 if (EntryLanes.size() > 2 || VL.size() <= 2)
13466 break;
13467 default:
13468 break;
13469 }
13470 } else if (!isa<VectorType>(VL.front()->getType()) &&
13471 (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
13472 // Do the cost estimation if the shuffle is more beneficial than buildvector.
13473 SmallVector<int> SubMask(std::next(Mask.begin(), Part * VL.size()),
13474 std::next(Mask.begin(), (Part + 1) * VL.size()));
13475 int MinElement = SubMask.front(), MaxElement = SubMask.front();
13476 for (int Idx : SubMask) {
13477 if (Idx == PoisonMaskElem)
13478 continue;
13479 if (MinElement == PoisonMaskElem || MinElement % VF > Idx % VF)
13480 MinElement = Idx;
13481 if (MaxElement == PoisonMaskElem || MaxElement % VF < Idx % VF)
13482 MaxElement = Idx;
13483 }
13484 assert(MaxElement >= 0 && MinElement >= 0 &&
13485 MaxElement % VF >= MinElement % VF &&
13486 "Expected at least single element.");
13487 unsigned NewVF = std::max<unsigned>(
13488 VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(),
13489 (MaxElement % VF) -
13490 (MinElement % VF) + 1));
13491 if (NewVF < VF) {
13492 for_each(SubMask, [&](int &Idx) {
13493 if (Idx == PoisonMaskElem)
13494 return;
13495 Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
13496 (Idx >= static_cast<int>(VF) ? NewVF : 0);
13497 });
13498 } else {
13499 NewVF = VF;
13500 }
13501
13502 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13503 auto *VecTy = getWidenedType(VL.front()->getType(), NewVF);
13504 auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
13505 auto GetShuffleCost = [&,
13506 &TTI = *TTI](ArrayRef<int> Mask,
13507 ArrayRef<const TreeEntry *> Entries,
13508 VectorType *VecTy) -> InstructionCost {
13509 if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
13510 ShuffleVectorInst::isDeInterleaveMaskOfFactor(
13511 Mask, Entries.front()->getInterleaveFactor()))
13512 return TTI::TCC_Free;
13513 return ::getShuffleCost(TTI,
13514 Entries.size() > 1 ? TTI::SK_PermuteTwoSrc
13515 : TTI::SK_PermuteSingleSrc,
13516 VecTy, Mask, CostKind);
13517 };
13518 InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
13519 InstructionCost FirstShuffleCost = 0;
13520 SmallVector<int> FirstMask(SubMask.begin(), SubMask.end());
13521 if (Entries.size() == 1 || !Entries[0]->isGather()) {
13522 FirstShuffleCost = ShuffleCost;
13523 } else {
13524 // Transform mask to include only first entry.
13525 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13526 bool IsIdentity = true;
13527 for (auto [I, Idx] : enumerate(FirstMask)) {
13528 if (Idx >= static_cast<int>(NewVF)) {
13529 Idx = PoisonMaskElem;
13530 } else {
13531 DemandedElts.clearBit(I);
13532 if (Idx != PoisonMaskElem)
13533 IsIdentity &= static_cast<int>(I) == Idx;
13534 }
13535 }
13536 if (!IsIdentity)
13537 FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
13538 FirstShuffleCost += TTI->getScalarizationOverhead(
13539 MaskVecTy, DemandedElts, /*Insert=*/true,
13540 /*Extract=*/false, CostKind);
13541 }
13542 InstructionCost SecondShuffleCost = 0;
13543 SmallVector<int> SecondMask(SubMask.begin(), SubMask.end());
13544 if (Entries.size() == 1 || !Entries[1]->isGather()) {
13545 SecondShuffleCost = ShuffleCost;
13546 } else {
13547 // Transform mask to include only the second entry.
13548 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13549 bool IsIdentity = true;
13550 for (auto [I, Idx] : enumerate(SecondMask)) {
13551 if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
13552 Idx = PoisonMaskElem;
13553 } else {
13554 DemandedElts.clearBit(I);
13555 if (Idx != PoisonMaskElem) {
13556 Idx -= NewVF;
13557 IsIdentity &= static_cast<int>(I) == Idx;
13558 }
13559 }
13560 }
13561 if (!IsIdentity)
13562 SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
13563 SecondShuffleCost += TTI->getScalarizationOverhead(
13564 MaskVecTy, DemandedElts, /*Insert=*/true,
13565 /*Extract=*/false, CostKind);
13566 }
13567 APInt DemandedElts = APInt::getAllOnes(SubMask.size());
13568 for (auto [I, Idx] : enumerate(SubMask))
13569 if (Idx == PoisonMaskElem)
13570 DemandedElts.clearBit(I);
13571 InstructionCost BuildVectorCost =
13572 TTI->getScalarizationOverhead(MaskVecTy, DemandedElts, /*Insert=*/true,
13573 /*Extract=*/false, CostKind);
13574 const TreeEntry *BestEntry = nullptr;
13575 if (FirstShuffleCost < ShuffleCost) {
13576 std::for_each(std::next(Mask.begin(), Part * VL.size()),
13577 std::next(Mask.begin(), (Part + 1) * VL.size()),
13578 [&](int &Idx) {
13579 if (Idx >= static_cast<int>(VF))
13580 Idx = PoisonMaskElem;
13581 });
13582 BestEntry = Entries.front();
13583 ShuffleCost = FirstShuffleCost;
13584 }
13585 if (SecondShuffleCost < ShuffleCost) {
13586 std::for_each(std::next(Mask.begin(), Part * VL.size()),
13587 std::next(Mask.begin(), (Part + 1) * VL.size()),
13588 [&](int &Idx) {
13589 if (Idx < static_cast<int>(VF))
13590 Idx = PoisonMaskElem;
13591 else
13592 Idx -= VF;
13593 });
13594 BestEntry = Entries[1];
13595 ShuffleCost = SecondShuffleCost;
13596 }
13597 if (BuildVectorCost >= ShuffleCost) {
13598 if (BestEntry) {
13599 Entries.clear();
13600 Entries.push_back(BestEntry);
13601 }
13602 return Entries.size() > 1 ? TargetTransformInfo::SK_PermuteTwoSrc
13603 : TargetTransformInfo::SK_PermuteSingleSrc;
13604 }
13605 }
13606 Entries.clear();
13607 // Clear the corresponding mask elements.
13608 std::fill(std::next(Mask.begin(), Part * VL.size()),
13609 std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem);
13610 return std::nullopt;
13611}
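// Illustrative sketch (annotation, not part of the original source): the mask
// built above encodes each selected lane as Pair.first * VF plus the lane
// inside the matched entry. With two matched entries of VF == 4, where lanes
// 0 and 2 come from Entries[0] and lanes 1 and 3 from Entries[1]:
// \code
//   Mask = {0 * 4 + 0, 1 * 4 + 1, 0 * 4 + 2, 1 * 4 + 3}; // == {0, 5, 2, 7}
// \endcode
// which corresponds to a two-source permute (TTI::SK_PermuteTwoSrc).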
13612
13613 SmallVector<std::optional<TargetTransformInfo::ShuffleKind>>
13614BoUpSLP::isGatherShuffledEntry(
13615 const TreeEntry *TE, ArrayRef<Value *> VL, SmallVectorImpl<int> &Mask,
13616 SmallVectorImpl<SmallVector<const TreeEntry *>> &Entries, unsigned NumParts,
13617 bool ForOrder) {
13618 assert(NumParts > 0 && NumParts < VL.size() &&
13619 "Expected positive number of registers.");
13620 Entries.clear();
13621 // No need to check for the topmost gather node.
13622 if (TE == VectorizableTree.front().get() &&
13623 (!GatheredLoadsEntriesFirst.has_value() ||
13624 none_of(ArrayRef(VectorizableTree).drop_front(),
13625 [](const std::unique_ptr<TreeEntry> &TE) {
13626 return !TE->isGather();
13627 })))
13628 return {};
13629 // FIXME: Gathering for non-power-of-2 (non-whole-register) nodes is not
13630 // implemented yet.
13631 if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
13632 return {};
13633 Mask.assign(VL.size(), PoisonMaskElem);
13634 assert((TE->UserTreeIndices.size() == 1 ||
13635 TE == VectorizableTree.front().get()) &&
13636 "Expected only single user of the gather node.");
13637 assert(VL.size() % NumParts == 0 &&
13638 "Number of scalars must be divisible by NumParts.");
13639 if (!TE->UserTreeIndices.empty() &&
13640 TE->UserTreeIndices.front().UserTE->isGather() &&
13641 TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
13642 assert(
13643 (TE->Idx == 0 ||
13644 (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
13645 isSplat(TE->Scalars)) &&
13646 "Expected splat or extractelements only node.");
13647 return {};
13648 }
13649 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
13650 SmallVector<std::optional<TTI::ShuffleKind>> Res;
13651 for (unsigned Part : seq<unsigned>(NumParts)) {
13652 ArrayRef<Value *> SubVL =
13653 VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part));
13654 SmallVectorImpl<const TreeEntry *> &SubEntries = Entries.emplace_back();
13655 std::optional<TTI::ShuffleKind> SubRes =
13656 isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
13657 ForOrder);
13658 if (!SubRes)
13659 SubEntries.clear();
13660 Res.push_back(SubRes);
13661 if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc &&
13662 SubEntries.front()->getVectorFactor() == VL.size() &&
13663 (SubEntries.front()->isSame(TE->Scalars) ||
13664 SubEntries.front()->isSame(VL))) {
13665 SmallVector<const TreeEntry *> LocalSubEntries;
13666 LocalSubEntries.swap(SubEntries);
13667 Entries.clear();
13668 Res.clear();
13669 std::iota(Mask.begin(), Mask.end(), 0);
13670 // Clear undef scalars.
13671 for (int I = 0, Sz = VL.size(); I < Sz; ++I)
13672 if (isa<PoisonValue>(VL[I]))
13673 Mask[I] = PoisonMaskElem;
13674 Entries.emplace_back(1, LocalSubEntries.front());
13675 Res.push_back(TTI::SK_PermuteSingleSrc);
13676 return Res;
13677 }
13678 }
13679 if (all_of(Res,
13680 [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
13681 Entries.clear();
13682 return {};
13683 }
13684 return Res;
13685}
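// Illustrative sketch (annotation, not part of the original source): the
// scalars are analyzed one register-sized slice at a time, and the result
// holds one optional shuffle kind per part. For example, with VL.size() == 8
// and NumParts == 2, where only the first part matches a vectorized node:
// \code
//   // Res == {TTI::SK_PermuteSingleSrc, std::nullopt}
//   // Mask[0..3] describe part 0; Mask[4..7] remain PoisonMaskElem.
// \endcode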
13686
13687InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
13688 Type *ScalarTy) const {
13689 auto *VecTy = getWidenedType(ScalarTy, VL.size());
13690 bool DuplicateNonConst = false;
13691 // Find the cost of inserting/extracting values from the vector.
13692 // Check if the same elements are inserted several times and count them as
13693 // shuffle candidates.
13694 APInt ShuffledElements = APInt::getZero(VL.size());
13695 DenseMap<Value *, unsigned> UniqueElements;
13696 constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
13697 InstructionCost Cost;
13698 auto EstimateInsertCost = [&](unsigned I, Value *V) {
13699 if (V->getType() != ScalarTy) {
13700 Cost += TTI->getCastInstrCost(Instruction::Trunc, ScalarTy, V->getType(),
13701 TTI::CastContextHint::None, CostKind);
13702 V = nullptr;
13703 }
13704 if (!ForPoisonSrc)
13705 Cost +=
13706 TTI->getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
13707 I, Constant::getNullValue(VecTy), V);
13708 };
13709 SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
13710 for (unsigned I = 0, E = VL.size(); I < E; ++I) {
13711 Value *V = VL[I];
13712 // No need to shuffle duplicates for constants.
13713 if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
13714 ShuffledElements.setBit(I);
13715 ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
13716 continue;
13717 }
13718
13719 auto Res = UniqueElements.try_emplace(V, I);
13720 if (Res.second) {
13721 EstimateInsertCost(I, V);
13722 ShuffleMask[I] = I;
13723 continue;
13724 }
13725
13726 DuplicateNonConst = true;
13727 ShuffledElements.setBit(I);
13728 ShuffleMask[I] = Res.first->second;
13729 }
13730 if (ForPoisonSrc) {
13731 if (isa<FixedVectorType>(ScalarTy)) {
13732 assert(SLPReVec && "Only supported by REVEC.");
13733 // We don't need to insert elements one by one. Instead, we can insert the
13734 // entire vector into the destination.
13735 Cost = 0;
13736 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
13737 for (unsigned I : seq<unsigned>(VL.size()))
13738 if (!ShuffledElements[I])
13739 Cost += TTI->getShuffleCost(
13740 TTI::SK_InsertSubvector, VecTy, std::nullopt, CostKind,
13741 I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
13742 } else {
13743 Cost = TTI->getScalarizationOverhead(VecTy,
13744 /*DemandedElts*/ ~ShuffledElements,
13745 /*Insert*/ true,
13746 /*Extract*/ false, CostKind, VL);
13747 }
13748 }
13749 if (DuplicateNonConst)
13750 Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
13751 VecTy, ShuffleMask);
13752 return Cost;
13753}
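// Illustrative sketch (annotation, not part of the original source): for a
// non-poison source and VL = {a, b, a, c}, the cost is one insertelement per
// unique non-constant value (a, b, c) plus, because 'a' is duplicated, one
// extra single-source permute:
// \code
//   // Cost += getShuffleCost(SK_PermuteSingleSrc, VecTy, {0, 1, 0, 3});
//   // i.e. lane 2 is replicated from lane 0 instead of being re-inserted.
// \endcode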
13754
13755Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
13756 auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
13757 if (Res)
13758 return *Res;
13759 // Get the basic block this bundle is in. All instructions in the bundle
13760 // should be in this block (except for extractelement-like instructions with
13761 // constant indices or gathered loads).
13762 auto *Front = E->getMainOp();
13763 auto *BB = Front->getParent();
13764 assert(((GatheredLoadsEntriesFirst.has_value() &&
13765 E->getOpcode() == Instruction::Load && E->isGather() &&
13766 E->Idx < *GatheredLoadsEntriesFirst) ||
13767 all_of(E->Scalars,
13768 [=](Value *V) -> bool {
13769 if (E->getOpcode() == Instruction::GetElementPtr &&
13770 !isa<GetElementPtrInst>(V))
13771 return true;
13772 auto *I = dyn_cast<Instruction>(V);
13773 return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
13774 isVectorLikeInstWithConstOps(I);
13775 })) &&
13776 "Expected gathered loads or GEPs or instructions from same basic "
13777 "block.");
13778
13779 auto FindLastInst = [&]() {
13780 Instruction *LastInst = Front;
13781 for (Value *V : E->Scalars) {
13782 auto *I = dyn_cast<Instruction>(V);
13783 if (!I)
13784 continue;
13785 if (LastInst->getParent() == I->getParent()) {
13786 if (LastInst->comesBefore(I))
13787 LastInst = I;
13788 continue;
13789 }
13790 assert(((E->getOpcode() == Instruction::GetElementPtr &&
13791 !isa<GetElementPtrInst>(I)) ||
13792 (isVectorLikeInstWithConstOps(LastInst) &&
13793 isVectorLikeInstWithConstOps(I)) ||
13794 (GatheredLoadsEntriesFirst.has_value() &&
13795 E->getOpcode() == Instruction::Load && E->isGather() &&
13796 E->Idx < *GatheredLoadsEntriesFirst)) &&
13797 "Expected vector-like or non-GEP in GEP node insts only.");
13798 if (!DT->isReachableFromEntry(LastInst->getParent())) {
13799 LastInst = I;
13800 continue;
13801 }
13802 if (!DT->isReachableFromEntry(I->getParent()))
13803 continue;
13804 auto *NodeA = DT->getNode(LastInst->getParent());
13805 auto *NodeB = DT->getNode(I->getParent());
13806 assert(NodeA && "Should only process reachable instructions");
13807 assert(NodeB && "Should only process reachable instructions");
13808 assert((NodeA == NodeB) ==
13809 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13810 "Different nodes should have different DFS numbers");
13811 if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
13812 LastInst = I;
13813 }
13814 BB = LastInst->getParent();
13815 return LastInst;
13816 };
13817
13818 auto FindFirstInst = [&]() {
13819 Instruction *FirstInst = Front;
13820 for (Value *V : E->Scalars) {
13821 auto *I = dyn_cast<Instruction>(V);
13822 if (!I)
13823 continue;
13824 if (FirstInst->getParent() == I->getParent()) {
13825 if (I->comesBefore(FirstInst))
13826 FirstInst = I;
13827 continue;
13828 }
13829 assert(((E->getOpcode() == Instruction::GetElementPtr &&
13830 !isa<GetElementPtrInst>(I)) ||
13831 (isVectorLikeInstWithConstOps(FirstInst) &&
13832 isVectorLikeInstWithConstOps(I))) &&
13833 "Expected vector-like or non-GEP in GEP node insts only.");
13834 if (!DT->isReachableFromEntry(FirstInst->getParent())) {
13835 FirstInst = I;
13836 continue;
13837 }
13838 if (!DT->isReachableFromEntry(I->getParent()))
13839 continue;
13840 auto *NodeA = DT->getNode(FirstInst->getParent());
13841 auto *NodeB = DT->getNode(I->getParent());
13842 assert(NodeA && "Should only process reachable instructions");
13843 assert(NodeB && "Should only process reachable instructions");
13844 assert((NodeA == NodeB) ==
13845 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
13846 "Different nodes should have different DFS numbers");
13847 if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
13848 FirstInst = I;
13849 }
13850 return FirstInst;
13851 };
13852
13853 // Set insertpoint for gathered loads to the very first load.
13854 if (GatheredLoadsEntriesFirst.has_value() &&
13855 E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
13856 E->getOpcode() == Instruction::Load) {
13857 Res = FindFirstInst();
13858 return *Res;
13859 }
13860
13861 // Set the insert point to the beginning of the basic block if the entry
13862 // should not be scheduled.
13863 if (doesNotNeedToSchedule(E->Scalars) ||
13864 (!E->isGather() && all_of(E->Scalars, isVectorLikeInstWithConstOps))) {
13865 if ((E->getOpcode() == Instruction::GetElementPtr &&
13866 any_of(E->Scalars,
13867 [](Value *V) {
13868 return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
13869 })) ||
13870 all_of(E->Scalars,
13871 [](Value *V) {
13872 return isa<PoisonValue>(V) ||
13873 (!isVectorLikeInstWithConstOps(V) &&
13874 isUsedOutsideBlock(V));
13875 }) ||
13876 (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
13877 return isa<ExtractElementInst, UndefValue>(V) ||
13878 areAllOperandsNonInsts(V);
13879 })))
13880 Res = FindLastInst();
13881 else
13882 Res = FindFirstInst();
13883 return *Res;
13884 }
13885
13886 // Find the last instruction. The common case should be that BB has been
13887 // scheduled, and the last instruction is VL.back(). So we start with
13888 // VL.back() and iterate over schedule data until we reach the end of the
13889 // bundle. The end of the bundle is marked by null ScheduleData.
13890 if (BlocksSchedules.count(BB) && !E->isGather()) {
13891 Value *V = E->isOneOf(E->Scalars.back());
13892 if (doesNotNeedToBeScheduled(V))
13893 V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled);
13894 auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
13895 if (Bundle && Bundle->isPartOfBundle())
13896 for (; Bundle; Bundle = Bundle->NextInBundle)
13897 Res = Bundle->Inst;
13898 }
13899
13900 // Res can still be null at this point if there's either not an entry
13901 // for BB in BlocksSchedules or there's no ScheduleData available for
13902 // VL.back(). This can be the case if buildTree_rec aborts for various
13903 // reasons (e.g., the maximum recursion depth is reached, the maximum region
13904 // size is reached, etc.). ScheduleData is initialized in the scheduling
13905 // "dry-run".
13906 //
13907 // If this happens, we can still find the last instruction by brute force. We
13908 // iterate forwards from Front (inclusive) until we either see all
13909 // instructions in the bundle or reach the end of the block. If Front is the
13910 // last instruction in program order, LastInst will be set to Front, and we
13911 // will visit all the remaining instructions in the block.
13912 //
13913 // One of the reasons we exit early from buildTree_rec is to place an upper
13914 // bound on compile-time. Thus, taking an additional compile-time hit here is
13915 // not ideal. However, this should be exceedingly rare since it requires that
13916 // we both exit early from buildTree_rec and that the bundle be out-of-order
13917 // (causing us to iterate all the way to the end of the block).
13918 if (!Res)
13919 Res = FindLastInst();
13920 assert(Res && "Failed to find last instruction in bundle");
13921 return *Res;
13922}
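// Illustrative sketch (annotation, not part of the original source): for a
// scheduled bundle {%a, %b} in a block where %b follows %a in program order,
// the walk over the bundle's ScheduleData ends at %b, which is returned:
// \code
//   %a = add i32 %x, 1
//   %b = add i32 %y, 1   ; <- last instruction in the bundle
// \endcode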
13923
13924void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
13925 auto *Front = E->getMainOp();
13926 Instruction *LastInst = &getLastInstructionInBundle(E);
13927 assert(LastInst && "Failed to find last instruction in bundle");
13928 BasicBlock::iterator LastInstIt = LastInst->getIterator();
13929 // If the instruction is PHI, set the insert point after all the PHIs.
13930 bool IsPHI = isa<PHINode>(LastInst);
13931 if (IsPHI)
13932 LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
13933 if (IsPHI || (!E->isGather() && doesNotNeedToSchedule(E->Scalars))) {
13934 Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
13935 } else {
13936 // Set the insertion point after the last instruction in the bundle. Set the
13937 // debug location to Front.
13938 Builder.SetInsertPoint(
13939 LastInst->getParent(),
13940 LastInst->getNextNonDebugInstruction()->getIterator());
13941 }
13942 Builder.SetCurrentDebugLocation(Front->getDebugLoc());
13943}
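// Illustrative sketch (annotation, not part of the original source): if the
// bundle's last instruction is a PHI, the builder is positioned at the first
// non-PHI slot of the block, otherwise right after the last bundled
// instruction:
// \code
//   %p0 = phi i32 [ 0, %pre ], [ %x, %latch ]
//   %p1 = phi i32 [ 1, %pre ], [ %y, %latch ]
//   ; insert point for a PHI bundle lands here (getFirstNonPHIIt())
// \endcode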
13944
13945Value *BoUpSLP::gather(
13946 ArrayRef<Value *> VL, Value *Root, Type *ScalarTy,
13947 function_ref<Value *(Value *, Value *, ArrayRef<int>)> CreateShuffle) {
13948 // List of instructions/lanes from the current block and/or the blocks which are
13949 // part of the current loop. These instructions will be inserted at the end to
13950 // make it possible to optimize loops and hoist invariant instructions out of
13951 // the loop's body with better chances for success.
13952 SmallVector<std::pair<Value *, unsigned>> PostponedInsts;
13953 SmallSet<int, 4> PostponedIndices;
13954 Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
13955 auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
13956 SmallPtrSet<BasicBlock *, 4> Visited;
13957 while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
13958 InsertBB = InsertBB->getSinglePredecessor();
13959 return InsertBB && InsertBB == InstBB;
13960 };
13961 for (int I = 0, E = VL.size(); I < E; ++I) {
13962 if (auto *Inst = dyn_cast<Instruction>(VL[I]))
13963 if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
13964 getTreeEntry(Inst) ||
13965 (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
13966 PostponedIndices.insert(I).second)
13967 PostponedInsts.emplace_back(Inst, I);
13968 }
13969
13970 auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
13971 Type *Ty) {
13972 Value *Scalar = V;
13973 if (Scalar->getType() != Ty) {
13974 assert(Scalar->getType()->isIntOrIntVectorTy() &&
13975 Ty->isIntOrIntVectorTy() && "Expected integer types only.");
13976 Value *V = Scalar;
13977 if (auto *CI = dyn_cast<CastInst>(Scalar);
13978 isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
13979 Value *Op = CI->getOperand(0);
13980 if (auto *IOp = dyn_cast<Instruction>(Op);
13981 !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
13982 V = Op;
13983 }
13984 Scalar = Builder.CreateIntCast(
13985 V, Ty, !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
13986 }
13987
13988 Instruction *InsElt;
13989 if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
13990 assert(SLPReVec && "FixedVectorType is not expected.");
13991 Vec =
13992 createInsertVector(Builder, Vec, Scalar, Pos * getNumElements(VecTy));
13993 auto *II = dyn_cast<IntrinsicInst>(Vec);
13994 if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
13995 return Vec;
13996 InsElt = II;
13997 } else {
13998 Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
13999 InsElt = dyn_cast<InsertElementInst>(Vec);
14000 if (!InsElt)
14001 return Vec;
14002 }
14003 GatherShuffleExtractSeq.insert(InsElt);
14004 CSEBlocks.insert(InsElt->getParent());
14005 // Add to our 'need-to-extract' list.
14006 if (isa<Instruction>(V)) {
14007 if (TreeEntry *Entry = getTreeEntry(V)) {
14008 // Find which lane we need to extract.
14009 User *UserOp = nullptr;
14010 if (Scalar != V) {
14011 if (auto *SI = dyn_cast<Instruction>(Scalar))
14012 UserOp = SI;
14013 } else {
14014 UserOp = InsElt;
14015 }
14016 if (UserOp) {
14017 unsigned FoundLane = Entry->findLaneForValue(V);
14018 ExternalUses.emplace_back(V, UserOp, FoundLane);
14019 }
14020 }
14021 }
14022 return Vec;
14023 };
14024 auto *VecTy = getWidenedType(ScalarTy, VL.size());
14025 Value *Vec = PoisonValue::get(VecTy);
14026 SmallVector<int> NonConsts;
14027 SmallVector<int> Mask(VL.size());
14028 std::iota(Mask.begin(), Mask.end(), 0);
14029 Value *OriginalRoot = Root;
14030 if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
14031 SV && isa<PoisonValue>(SV->getOperand(1)) &&
14032 SV->getOperand(0)->getType() == VecTy) {
14033 Root = SV->getOperand(0);
14034 Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
14035 }
14036 // Insert constant values at first.
14037 for (int I = 0, E = VL.size(); I < E; ++I) {
14038 if (PostponedIndices.contains(I))
14039 continue;
14040 if (!isConstant(VL[I])) {
14041 NonConsts.push_back(I);
14042 continue;
14043 }
14044 if (isa<PoisonValue>(VL[I]))
14045 continue;
14046 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
14047 Mask[I] = I + E;
14048 }
14049 if (Root) {
14050 if (isa<PoisonValue>(Vec)) {
14051 Vec = OriginalRoot;
14052 } else {
14053 Vec = CreateShuffle(Root, Vec, Mask);
14054 if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
14055 OI && OI->hasNUses(0) &&
14056 none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
14057 return TE->VectorizedValue == OI;
14058 }))
14059 eraseInstruction(OI);
14060 }
14061 }
14062 // Insert non-constant values.
14063 for (int I : NonConsts)
14064 Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
14065 // Append instructions, which are/may be part of the loop, in the end to make
14066 // it possible to hoist non-loop-based instructions.
14067 for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
14068 Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
14069
14070 return Vec;
14071}
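// Illustrative sketch (annotation, not part of the original source): gathering
// VL = {1, %x, 2, %inloop} with no root emits constants first, then the other
// non-constant values, and postpones values defined inside the current loop to
// the very end so the loop-invariant prefix of the chain can later be hoisted:
// \code
//   %v0 = insertelement <4 x i32> poison, i32 1, i32 0
//   %v1 = insertelement <4 x i32> %v0, i32 2, i32 2
//   %v2 = insertelement <4 x i32> %v1, i32 %x, i32 1
//   %v3 = insertelement <4 x i32> %v2, i32 %inloop, i32 3
// \endcode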
14072
14073 /// Merges shuffle masks and emits the final shuffle instruction, if required. It
14074 /// supports shuffling of 2 input vectors. It implements lazy shuffle emission:
14075 /// the actual shuffle instruction is generated only if it is really
14076 /// required. Otherwise, the shuffle instruction emission is delayed till the
14077 /// end of the process, to reduce the number of emitted instructions and enable
14078 /// further analysis/transformations.
14079 /// The class will also look through the previously emitted shuffle instructions
14080 /// and properly mark indices in the mask as undef.
14081/// For example, given the code
14082/// \code
14083/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0>
14084/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0>
14085/// \endcode
14086 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 3, 2>, it will
14087/// look through %s1 and %s2 and emit
14088/// \code
14089/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
14090/// \endcode
14091/// instead.
14092/// If 2 operands are of different size, the smallest one will be resized and
14093/// the mask recalculated properly.
14094/// For example, given the code
14095/// \code
14096/// %s1 = shufflevector <2 x ty> %0, poison, <1, 0, 1, 0>
14097/// %s2 = shufflevector <2 x ty> %1, poison, <1, 0, 1, 0>
14098/// \endcode
14099 /// and if we need to emit a shuffle of %s1 and %s2 with mask <1, 0, 5, 4>, it will
14100/// look through %s1 and %s2 and emit
14101/// \code
14102/// %res = shufflevector <2 x ty> %0, %1, <0, 1, 2, 3>
14103/// \endcode
14104/// instead.
14105class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis {
14106 bool IsFinalized = false;
14107 /// Combined mask for all applied operands and masks. It is built during
14108 /// analysis and actual emission of shuffle vector instructions.
14109 SmallVector<int> CommonMask;
14110 /// List of operands for the shuffle vector instruction. It holds at most 2
14111 /// operands. If the 3rd one is going to be added, the first 2 are combined into a
14112 /// shuffle with the \p CommonMask mask, the first operand is set to the
14113 /// resulting shuffle and the second operand is set to the newly added
14114 /// operand. The \p CommonMask is transformed in the proper way after that.
14115 SmallVector<Value *, 2> InVectors;
14116 IRBuilderBase &Builder;
14117 BoUpSLP &R;
14118
14119 class ShuffleIRBuilder {
14120 IRBuilderBase &Builder;
14121 /// Holds all of the instructions that we gathered.
14122 SetVector<Instruction *> &GatherShuffleExtractSeq;
14123 /// A list of blocks that we are going to CSE.
14124 DenseSet<BasicBlock *> &CSEBlocks;
14125 /// Data layout.
14126 const DataLayout &DL;
14127
14128 public:
14129 ShuffleIRBuilder(IRBuilderBase &Builder,
14130 SetVector<Instruction *> &GatherShuffleExtractSeq,
14131 DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
14132 : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
14133 CSEBlocks(CSEBlocks), DL(DL) {}
14134 ~ShuffleIRBuilder() = default;
14135 /// Creates shufflevector for the 2 operands with the given mask.
14136 Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
14137 if (V1->getType() != V2->getType()) {
14138 assert(V2->getType()->isIntOrIntVectorTy() &&
14139 V1->getType()->isIntOrIntVectorTy() &&
14140 "Expected integer vector types only.");
14141 if (V1->getType() != V2->getType()) {
14142 if (cast<VectorType>(V2->getType())
14143 ->getElementType()
14144 ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
14145 ->getElementType()
14146 ->getIntegerBitWidth())
14147 V2 = Builder.CreateIntCast(
14148 V2, V1->getType(), !isKnownNonNegative(V2, SimplifyQuery(DL)));
14149 else
14150 V1 = Builder.CreateIntCast(
14151 V1, V2->getType(), !isKnownNonNegative(V1, SimplifyQuery(DL)));
14152 }
14153 }
14154 Value *Vec = Builder.CreateShuffleVector(V1, V2, Mask);
14155 if (auto *I = dyn_cast<Instruction>(Vec)) {
14156 GatherShuffleExtractSeq.insert(I);
14157 CSEBlocks.insert(I->getParent());
14158 }
14159 return Vec;
14160 }
14161 /// Creates a permutation of the single vector operand with the given mask, if
14162 /// it is not an identity mask.
14163 Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
14164 if (Mask.empty())
14165 return V1;
14166 unsigned VF = Mask.size();
14167 unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
14168 if (VF == LocalVF && ShuffleVectorInst::isIdentityMask(Mask, VF))
14169 return V1;
14170 Value *Vec = Builder.CreateShuffleVector(V1, Mask);
14171 if (auto *I = dyn_cast<Instruction>(Vec)) {
14172 GatherShuffleExtractSeq.insert(I);
14173 CSEBlocks.insert(I->getParent());
14174 }
14175 return Vec;
14176 }
14177 Value *createIdentity(Value *V) { return V; }
14178 Value *createPoison(Type *Ty, unsigned VF) {
14179 return PoisonValue::get(getWidenedType(Ty, VF));
14180 }
14181 /// Resizes the 2 input vectors to match their sizes, if they are not equal
14182 /// yet. The smallest vector is resized to the size of the larger vector.
14183 void resizeToMatch(Value *&V1, Value *&V2) {
14184 if (V1->getType() == V2->getType())
14185 return;
14186 int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
14187 int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
14188 int VF = std::max(V1VF, V2VF);
14189 int MinVF = std::min(V1VF, V2VF);
14190 SmallVector<int> IdentityMask(VF, PoisonMaskElem);
14191 std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
14192 0);
14193 Value *&Op = MinVF == V1VF ? V1 : V2;
14194 Op = Builder.CreateShuffleVector(Op, IdentityMask);
14195 if (auto *I = dyn_cast<Instruction>(Op)) {
14196 GatherShuffleExtractSeq.insert(I);
14197 CSEBlocks.insert(I->getParent());
14198 }
14199 if (MinVF == V1VF)
14200 V1 = Op;
14201 else
14202 V2 = Op;
14203 }
14204 };
14205
14206 /// Smart shuffle instruction emission, walks through shuffles trees and
14207 /// tries to find the best matching vector for the actual shuffle
14208 /// instruction.
14209 Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
14210 assert(V1 && "Expected at least one vector value.");
14211 ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
14212 R.CSEBlocks, *R.DL);
14213 return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
14214 ShuffleBuilder);
14215 }
14216
14217 /// Transforms the mask \p CommonMask according to the given \p Mask to produce the
14218 /// proper mask after shuffle emission.
14219 static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
14220 ArrayRef<int> Mask) {
14221 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14222 if (Mask[Idx] != PoisonMaskElem)
14223 CommonMask[Idx] = Idx;
14224 }
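  // Illustrative sketch (annotation, not part of the original source): once a
  // shuffle applying CommonMask has been emitted, the already-shuffled lanes
  // must refer to their own positions in the new vector, e.g.:
  //   CommonMask = {3, -1, 1, 0}  ->  {0, -1, 2, 3}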
14225
14226 /// Cast value \p V to the vector type with the same number of elements, but
14227 /// the base type \p ScalarTy.
14228 Value *castToScalarTyElem(Value *V,
14229 std::optional<bool> IsSigned = std::nullopt) {
14230 auto *VecTy = cast<VectorType>(V->getType());
14231 assert(getNumElements(VecTy) % getNumElements(ScalarTy) == 0);
14232 if (VecTy->getElementType() == ScalarTy->getScalarType())
14233 return V;
14234 return Builder.CreateIntCast(
14235 V, VectorType::get(ScalarTy->getScalarType(), VecTy->getElementCount()),
14236 IsSigned.value_or(!isKnownNonNegative(V, SimplifyQuery(*R.DL))));
14237 }
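  // Illustrative sketch (annotation, not part of the original source): with
  // ScalarTy == i32,
  //   castToScalarTyElem(<4 x i8> %v)  ->  sext <4 x i8> %v to <4 x i32>
  // unless %v is known non-negative (or IsSigned is passed as false), in which
  // case a zext is emitted instead.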
14238
14239public:
14240 ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
14241 : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
14242
14243 /// Adjusts extractelements after reusing them.
14244 Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
14245 ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
14246 unsigned NumParts, bool &UseVecBaseAsInput) {
14247 UseVecBaseAsInput = false;
14248 SmallPtrSet<Value *, 4> UniqueBases;
14249 Value *VecBase = nullptr;
14250 SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
14251 if (!E->ReorderIndices.empty()) {
14252 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14253 E->ReorderIndices.end());
14254 reorderScalars(VL, ReorderMask);
14255 }
14256 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
14257 int Idx = Mask[I];
14258 if (Idx == PoisonMaskElem)
14259 continue;
14260 auto *EI = cast<ExtractElementInst>(VL[I]);
14261 VecBase = EI->getVectorOperand();
14262 if (const TreeEntry *TE = R.getTreeEntry(VecBase))
14263 VecBase = TE->VectorizedValue;
14264 assert(VecBase && "Expected vectorized value.");
14265 UniqueBases.insert(VecBase);
14266 // If the only use is vectorized, the extractelement itself can be
14267 // deleted.
14268 if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
14269 (NumParts != 1 && count(VL, EI) > 1) ||
14270 any_of(EI->users(), [&](User *U) {
14271 const TreeEntry *UTE = R.getTreeEntry(U);
14272 return !UTE || R.MultiNodeScalars.contains(U) ||
14273 (isa<GetElementPtrInst>(U) &&
14274 !R.areAllUsersVectorized(cast<Instruction>(U))) ||
14275 count_if(R.VectorizableTree,
14276 [&](const std::unique_ptr<TreeEntry> &TE) {
14277 return any_of(TE->UserTreeIndices,
14278 [&](const EdgeInfo &Edge) {
14279 return Edge.UserTE == UTE;
14280 }) &&
14281 is_contained(VL, EI);
14282 }) != 1;
14283 }))
14284 continue;
14285 R.eraseInstruction(EI);
14286 }
14287 if (NumParts == 1 || UniqueBases.size() == 1) {
14288 assert(VecBase && "Expected vectorized value.");
14289 return castToScalarTyElem(VecBase);
14290 }
14291 UseVecBaseAsInput = true;
14292 auto TransformToIdentity = [](MutableArrayRef<int> Mask) {
14293 for (auto [I, Idx] : enumerate(Mask))
14294 if (Idx != PoisonMaskElem)
14295 Idx = I;
14296 };
14297 // Perform multi-register vector shuffle, joining them into a single virtual
14298 // long vector.
14299 // Need to shuffle each part independently and then insert all these parts
14300 // into a long virtual vector register, forming the original vector.
14301 Value *Vec = nullptr;
14302 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
14303 unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
14304 for (unsigned Part : seq<unsigned>(NumParts)) {
14305 unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
14306 ArrayRef<Value *> SubVL = ArrayRef(VL).slice(Part * SliceSize, Limit);
14307 MutableArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
14308 constexpr int MaxBases = 2;
14309 SmallVector<Value *, MaxBases> Bases(MaxBases);
14310 auto VLMask = zip(SubVL, SubMask);
14311 const unsigned VF = std::accumulate(
14312 VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
14313 if (std::get<1>(D) == PoisonMaskElem)
14314 return S;
14315 Value *VecOp =
14316 cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
14317 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
14318 VecOp = TE->VectorizedValue;
14319 assert(VecOp && "Expected vectorized value.");
14320 const unsigned Size =
14321 cast<FixedVectorType>(VecOp->getType())->getNumElements();
14322 return std::max(S, Size);
14323 });
14324 for (const auto [V, I] : VLMask) {
14325 if (I == PoisonMaskElem)
14326 continue;
14327 Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
14328 if (const TreeEntry *TE = R.getTreeEntry(VecOp))
14329 VecOp = TE->VectorizedValue;
14330 assert(VecOp && "Expected vectorized value.");
14331 VecOp = castToScalarTyElem(VecOp);
14332 Bases[I / VF] = VecOp;
14333 }
14334 if (!Bases.front())
14335 continue;
14336 Value *SubVec;
14337 if (Bases.back()) {
14338 SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
14339 TransformToIdentity(SubMask);
14340 } else {
14341 SubVec = Bases.front();
14342 }
14343 if (!Vec) {
14344 Vec = SubVec;
14345 assert((Part == 0 || all_of(seq<unsigned>(0, Part),
14346 [&](unsigned P) {
14347 ArrayRef<int> SubMask =
14348 Mask.slice(P * SliceSize,
14349 getNumElems(Mask.size(),
14350 SliceSize, P));
14351 return all_of(SubMask, [](int Idx) {
14352 return Idx == PoisonMaskElem;
14353 });
14354 })) &&
14355 "Expected first part or all previous parts masked.");
14356 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14357 } else {
14358 unsigned NewVF =
14359 cast<FixedVectorType>(Vec->getType())->getNumElements();
14360 if (Vec->getType() != SubVec->getType()) {
14361 unsigned SubVecVF =
14362 cast<FixedVectorType>(SubVec->getType())->getNumElements();
14363 NewVF = std::max(NewVF, SubVecVF);
14364 }
14365 // Adjust SubMask.
14366 for (int &Idx : SubMask)
14367 if (Idx != PoisonMaskElem)
14368 Idx += NewVF;
14369 copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
14370 Vec = createShuffle(Vec, SubVec, VecMask);
14371 TransformToIdentity(VecMask);
14372 }
14373 }
14374 copy(VecMask, Mask.begin());
14375 return Vec;
14376 }
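  // Illustrative sketch (annotation, not part of the original source): when
  // the reused extractelements reference more than one vector base, each
  // register-sized part is shuffled separately and the parts are then folded
  // into one long virtual vector:
  //   SubVec0 = shuffle(BaseA, BaseB, SubMask0)
  //   Vec     = shuffle(SubVec0, SubVec1, VecMask) // part-1 lanes offset by NewVF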
14377 /// Checks if the specified entry \p E needs to be delayed because of its
14378 /// dependency nodes.
14379 std::optional<Value *>
14380 needToDelay(const TreeEntry *E,
14381 ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
14382 // No need to delay emission if all deps are ready.
14383 if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
14384 return all_of(
14385 TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
14386 }))
14387 return std::nullopt;
14388 // Postpone gather emission, will be emitted after the end of the
14389 // process to keep correct order.
14390 auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
14391 return Builder.CreateAlignedLoad(
14392 ResVecTy,
14393 PoisonValue::get(PointerType::getUnqual(ScalarTy->getContext())),
14394 MaybeAlign());
14395 }
14396 /// Adds 2 input vectors (in form of tree entries) and the mask for their
14397 /// shuffling.
14398 void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
14399 Value *V1 = E1.VectorizedValue;
14400 if (V1->getType()->isIntOrIntVectorTy())
14401 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
14402 if (isa<PoisonValue>(V))
14403 return false;
14404 return !isKnownNonNegative(
14405 V, SimplifyQuery(*R.DL));
14406 }));
14407 Value *V2 = E2.VectorizedValue;
14408 if (V2->getType()->isIntOrIntVectorTy())
14409 V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
14410 if (isa<PoisonValue>(V))
14411 return false;
14412 return !isKnownNonNegative(
14413 V, SimplifyQuery(*R.DL));
14414 }));
14415 add(V1, V2, Mask);
14416 }
14417 /// Adds single input vector (in form of tree entry) and the mask for its
14418 /// shuffling.
14419 void add(const TreeEntry &E1, ArrayRef<int> Mask) {
14420 Value *V1 = E1.VectorizedValue;
14421 if (V1->getType()->isIntOrIntVectorTy())
14422 V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
14423 if (isa<PoisonValue>(V))
14424 return false;
14425 return !isKnownNonNegative(
14426 V, SimplifyQuery(*R.DL));
14427 }));
14428 add(V1, Mask);
14429 }
14430 /// Adds 2 input vectors and the mask for their shuffling.
14431 void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
14432 assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
14433 assert(isa<FixedVectorType>(V1->getType()) &&
14434 isa<FixedVectorType>(V2->getType()) &&
14435 "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
14436 V1 = castToScalarTyElem(V1);
14437 V2 = castToScalarTyElem(V2);
14438 if (InVectors.empty()) {
14439 InVectors.push_back(V1);
14440 InVectors.push_back(V2);
14441 CommonMask.assign(Mask.begin(), Mask.end());
14442 return;
14443 }
14444 Value *Vec = InVectors.front();
14445 if (InVectors.size() == 2) {
14446 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14447 transformMaskAfterShuffle(CommonMask, CommonMask);
14448 } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
14449 Mask.size()) {
14450 Vec = createShuffle(Vec, nullptr, CommonMask);
14451 transformMaskAfterShuffle(CommonMask, CommonMask);
14452 }
14453 V1 = createShuffle(V1, V2, Mask);
14454 unsigned VF = std::max(getVF(V1), getVF(Vec));
14455 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14456 if (Mask[Idx] != PoisonMaskElem)
14457 CommonMask[Idx] = Idx + VF;
14458 InVectors.front() = Vec;
14459 if (InVectors.size() == 2)
14460 InVectors.back() = V1;
14461 else
14462 InVectors.push_back(V1);
14463 }
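  // Illustrative sketch (annotation, not part of the original source): if two
  // operands are already queued, adding a new (V1, V2, Mask) triple first folds
  // the queued pair through CommonMask, then rebases the new pair's lanes past
  // the resulting vector factor:
  //   Vec = createShuffle(In0, In1, CommonMask);
  //   V1  = createShuffle(V1, V2, Mask);
  //   CommonMask[Idx] = Idx + VF;   // for every lane taken from the new pair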
14464 /// Adds one more input vector and the mask for the shuffling.
14465 void add(Value *V1, ArrayRef<int> Mask, bool = false) {
14466 assert(isa<FixedVectorType>(V1->getType()) &&
14467 "castToScalarTyElem expects V1 to be FixedVectorType");
14468 V1 = castToScalarTyElem(V1);
14469 if (InVectors.empty()) {
14470 InVectors.push_back(V1);
14471 CommonMask.assign(Mask.begin(), Mask.end());
14472 return;
14473 }
14474 const auto *It = find(InVectors, V1);
14475 if (It == InVectors.end()) {
14476 if (InVectors.size() == 2 ||
14477 InVectors.front()->getType() != V1->getType()) {
14478 Value *V = InVectors.front();
14479 if (InVectors.size() == 2) {
14480 V = createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14481 transformMaskAfterShuffle(CommonMask, CommonMask);
14482 } else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
14483 CommonMask.size()) {
14484 V = createShuffle(InVectors.front(), nullptr, CommonMask);
14485 transformMaskAfterShuffle(CommonMask, CommonMask);
14486 }
14487 unsigned VF = std::max(CommonMask.size(), Mask.size());
14488 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14489 if (CommonMask[Idx] == PoisonMaskElem && Mask[Idx] != PoisonMaskElem)
14490 CommonMask[Idx] =
14491 V->getType() != V1->getType()
14492 ? Idx + VF
14493 : Mask[Idx] + cast<FixedVectorType>(V1->getType())
14494 ->getNumElements();
14495 if (V->getType() != V1->getType())
14496 V1 = createShuffle(V1, nullptr, Mask);
14497 InVectors.front() = V;
14498 if (InVectors.size() == 2)
14499 InVectors.back() = V1;
14500 else
14501 InVectors.push_back(V1);
14502 return;
14503 }
14504 // Check if the second vector is required, i.e. if some of the used elements
14505 // are not already provided by the first one.
14506 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14507 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) {
14508 InVectors.push_back(V1);
14509 break;
14510 }
14511 }
14512 unsigned VF = 0;
14513 for (Value *V : InVectors)
14514 VF = std::max(VF, getVF(V));
14515 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14516 if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
14517 CommonMask[Idx] = Mask[Idx] + (It == InVectors.begin() ? 0 : VF);
14518 }
14519 /// Adds one more input vector and the order used to build the shuffle mask.
14520 void addOrdered(Value *V1, ArrayRef<unsigned> Order) {
14521 SmallVector<int> NewMask;
14522 inversePermutation(Order, NewMask);
14523 add(V1, NewMask);
14524 }
14525 Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
14526 Value *Root = nullptr) {
14527 return R.gather(VL, Root, ScalarTy,
14528 [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
14529 return createShuffle(V1, V2, Mask);
14530 });
14531 }
14532 Value *createFreeze(Value *V) { return Builder.CreateFreeze(V); }
14533 /// Finalize emission of the shuffles.
14534 /// \param Action the action (if any) to be performed before final applying of
14535 /// the \p ExtMask mask.
14536 Value *
14537 finalize(ArrayRef<int> ExtMask,
14538 ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
14539 ArrayRef<int> SubVectorsMask, unsigned VF = 0,
14540 function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
14541 IsFinalized = true;
14542 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
14543 SmallVector<int> NewExtMask(ExtMask);
14544 if (ScalarTyNumElements != 1) {
14545 assert(SLPReVec && "FixedVectorType is not expected.");
14546 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, CommonMask);
14547 transformScalarShuffleIndiciesToVector(ScalarTyNumElements, NewExtMask);
14548 ExtMask = NewExtMask;
14549 }
14550 if (Action) {
14551 Value *Vec = InVectors.front();
14552 if (InVectors.size() == 2) {
14553 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14554 InVectors.pop_back();
14555 } else {
14556 Vec = createShuffle(Vec, nullptr, CommonMask);
14557 }
14558 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14559 if (CommonMask[Idx] != PoisonMaskElem)
14560 CommonMask[Idx] = Idx;
14561 assert(VF > 0 &&
14562 "Expected vector length for the final value before action.");
14563 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
14564 if (VecVF < VF) {
14565 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
14566 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
14567 Vec = createShuffle(Vec, nullptr, ResizeMask);
14568 }
14569 Action(Vec, CommonMask);
14570 InVectors.front() = Vec;
14571 }
14572 if (!SubVectors.empty()) {
14573 Value *Vec = InVectors.front();
14574 if (InVectors.size() == 2) {
14575 Vec = createShuffle(Vec, InVectors.back(), CommonMask);
14576 InVectors.pop_back();
14577 } else {
14578 Vec = createShuffle(Vec, nullptr, CommonMask);
14579 }
14580 for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
14581 if (CommonMask[Idx] != PoisonMaskElem)
14582 CommonMask[Idx] = Idx;
14583 auto CreateSubVectors = [&](Value *Vec,
14584 SmallVectorImpl<int> &CommonMask) {
14585 for (auto [E, Idx] : SubVectors) {
14586 Value *V = E->VectorizedValue;
14587 if (V->getType()->isIntOrIntVectorTy())
14588 V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
14589 if (isa<PoisonValue>(V))
14590 return false;
14591 return !isKnownNonNegative(
14592 V, SimplifyQuery(*R.DL));
14593 }));
14594 unsigned InsertionIndex = Idx * ScalarTyNumElements;
14595 Vec = createInsertVector(
14596 Builder, Vec, V, InsertionIndex,
14597 std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
14598 _3));
14599 if (!CommonMask.empty()) {
14600 std::iota(
14601 std::next(CommonMask.begin(), InsertionIndex),
14602 std::next(CommonMask.begin(),
14603 (Idx + E->getVectorFactor()) * ScalarTyNumElements),
14604 InsertionIndex);
14605 }
14606 }
14607 return Vec;
14608 };
14609 if (SubVectorsMask.empty()) {
14610 Vec = CreateSubVectors(Vec, CommonMask);
14611 } else {
14612 SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
14613 copy(SubVectorsMask, SVMask.begin());
14614 for (auto [I1, I2] : zip(SVMask, CommonMask)) {
14615 if (I2 != PoisonMaskElem) {
14616 assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
14617 I1 = I2 + CommonMask.size();
14618 }
14619 }
14620 Value *InsertVec =
14621 CreateSubVectors(PoisonValue::get(Vec->getType()), CommonMask);
14622 Vec = createShuffle(InsertVec, Vec, SVMask);
14623 for (unsigned I : seq<unsigned>(CommonMask.size())) {
14624 if (SVMask[I] != PoisonMaskElem)
14625 CommonMask[I] = I;
14626 }
14627 }
14628 InVectors.front() = Vec;
14629 }
14630
14631 if (!ExtMask.empty()) {
14632 if (CommonMask.empty()) {
14633 CommonMask.assign(ExtMask.begin(), ExtMask.end());
14634 } else {
14635 SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
14636 for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
14637 if (ExtMask[I] == PoisonMaskElem)
14638 continue;
14639 NewMask[I] = CommonMask[ExtMask[I]];
14640 }
14641 CommonMask.swap(NewMask);
14642 }
14643 }
14644 if (CommonMask.empty()) {
14645 assert(InVectors.size() == 1 && "Expected only one vector with no mask");
14646 return InVectors.front();
14647 }
14648 if (InVectors.size() == 2)
14649 return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
14650 return createShuffle(InVectors.front(), nullptr, CommonMask);
14651 }
14652
14653 ~ShuffleInstructionBuilder() {
14654 assert((IsFinalized || CommonMask.empty()) &&
14655 "Shuffle construction must be finalized.");
14656 }
14657};
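// Illustrative usage sketch (annotation, simplified; not part of the original
// source): a typical caller records operands and masks first and only
// materializes the shuffles when asking for the final value, mirroring the
// FinalShuffle lambdas used elsewhere in this file:
// \code
//   ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
//   ShuffleBuilder.add(V, Mask);                     // no IR emitted yet
//   Value *Vec = ShuffleBuilder.finalize(
//       /*ExtMask=*/{}, /*SubVectors=*/{}, /*SubVectorsMask=*/{});
// \endcode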
14658
14659BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
14660 unsigned NodeIdx) {
14661 ArrayRef<Value *> VL = E->getOperand(NodeIdx);
14662 InstructionsState S = getSameOpcode(VL, *TLI);
14663 // Special processing for GEPs bundle, which may include non-gep values.
14664 if (!S && VL.front()->getType()->isPointerTy()) {
14665 const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
14666 if (It != VL.end())
14667 S = getSameOpcode(*It, *TLI);
14668 }
14669 if (!S)
14670 return nullptr;
14671 auto CheckSameVE = [&](const TreeEntry *VE) {
14672 return VE->isSame(VL) &&
14673 (any_of(VE->UserTreeIndices,
14674 [E, NodeIdx](const EdgeInfo &EI) {
14675 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14676 }) ||
14677 any_of(VectorizableTree,
14678 [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
14679 return TE->isOperandGatherNode(
14680 {const_cast<TreeEntry *>(E), NodeIdx}) &&
14681 VE->isSame(TE->Scalars);
14682 }));
14683 };
14684 TreeEntry *VE = getTreeEntry(S.getMainOp());
14685 if (VE && CheckSameVE(VE))
14686 return VE;
14687 auto It = MultiNodeScalars.find(S.getMainOp());
14688 if (It != MultiNodeScalars.end()) {
14689 auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
14690 return TE != VE && CheckSameVE(TE);
14691 });
14692 if (I != It->getSecond().end())
14693 return *I;
14694 }
14695 return nullptr;
14696}
14697
14698Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
14699 bool PostponedPHIs) {
14700 ValueList &VL = E->getOperand(NodeIdx);
14701 const unsigned VF = VL.size();
14702 if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
14703 auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
14704 // V may be affected by MinBWs.
14705 // We want ShuffleInstructionBuilder to correctly support REVEC. The key
14706 // factor is the number of elements, not their type.
14707 Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
14708 unsigned NumElements = getNumElements(VL.front()->getType());
14709 ShuffleInstructionBuilder ShuffleBuilder(
14710 NumElements != 1 ? FixedVectorType::get(ScalarTy, NumElements)
14711 : ScalarTy,
14712 Builder, *this);
14713 ShuffleBuilder.add(V, Mask);
14714 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
14715 E->CombinedEntriesWithIndices.size());
14716 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14717 [&](const auto &P) {
14718 return std::make_pair(VectorizableTree[P.first].get(),
14719 P.second);
14720 });
14721 assert((E->CombinedEntriesWithIndices.empty() ||
14722 E->ReorderIndices.empty()) &&
14723 "Expected either combined subnodes or reordering");
14724 return ShuffleBuilder.finalize({}, SubVectors, {});
14725 };
14726 Value *V = vectorizeTree(VE, PostponedPHIs);
14727 if (VF * getNumElements(VL[0]->getType()) !=
14728 cast<FixedVectorType>(V->getType())->getNumElements()) {
14729 if (!VE->ReuseShuffleIndices.empty()) {
14730 // Reshuffle to get only unique values.
14731 // If some of the scalars are duplicated in the vectorization
14732 // tree entry, we do not vectorize them but instead generate a
14733 // mask for the reuses. But if there are several users of the
14734 // same entry, they may have different vectorization factors.
14735 // This is especially important for PHI nodes. In this case, we
14736 // need to adapt the resulting instruction for the user
14737 // vectorization factor and have to reshuffle it again to take
14738 // only the unique elements of the vector. Without this code the
14739 // function would incorrectly return a reduced vector instruction with
14740 // the same elements, not the unique ones.
14741
14742 // block:
14743 // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
14744 // %2 = shuffle <2 x > %phi, poison, <4 x > <1, 1, 0, 0>
14745 // ... (use %2)
14746 // %shuffle = shuffle <2 x> %2, poison, <2 x> {2, 0}
14747 // br %block
14748 SmallVector<int> Mask(VF, PoisonMaskElem);
14749 for (auto [I, V] : enumerate(VL)) {
14750 if (isa<PoisonValue>(V))
14751 continue;
14752 Mask[I] = VE->findLaneForValue(V);
14753 }
14754 V = FinalShuffle(V, Mask);
14755 } else {
14756 assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
14757 "Expected vectorization factor less "
14758 "than original vector size.");
14759 SmallVector<int> UniformMask(VF, 0);
14760 std::iota(UniformMask.begin(), UniformMask.end(), 0);
14761 V = FinalShuffle(V, UniformMask);
14762 }
14763 }
14764 // Need to update the operand gather node if the operand is actually not a
14765 // vectorized node but a buildvector/gather node that matches one of
14766 // the vectorized nodes.
14767 if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
14768 return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
14769 }) == VE->UserTreeIndices.end()) {
14770 auto *It =
14771 find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
14772 return TE->isGather() && TE->UserTreeIndices.front().UserTE == E &&
14773 TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
14774 });
14775 assert(It != VectorizableTree.end() && "Expected gather node operand.");
14776 (*It)->VectorizedValue = V;
14777 }
14778 return V;
14779 }
14780
14781 // Find the corresponding gather entry and vectorize it.
14782 // This allows us to be more accurate with tree/graph transformations and to check
14783 // the correctness of the transformations in many cases.
14784 auto *I = find_if(VectorizableTree,
14785 [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
14786 return TE->isOperandGatherNode({E, NodeIdx});
14787 });
14788 assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
14789 assert(I->get()->UserTreeIndices.size() == 1 &&
14790 "Expected only single user for the gather node.");
14791 assert(I->get()->isSame(VL) && "Expected same list of scalars.");
14792 return vectorizeTree(I->get(), PostponedPHIs);
14793}
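// Illustrative sketch (annotation, not part of the original source): if the
// matched entry was vectorized with a different factor than the operand
// requires, the result is re-shuffled lane by lane. For instance, a 2-element
// node reused for the 4-lane operand {a, a, b, b} is adjusted with:
// \code
//   Mask = {0, 0, 1, 1};   // VE->findLaneForValue() per operand scalar
//   V = FinalShuffle(V, Mask);
// \endcode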
14794
14795template <typename BVTy, typename ResTy, typename... Args>
14796ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
14797 Args &...Params) {
14798 assert(E->isGather() && "Expected gather node.");
14799 unsigned VF = E->getVectorFactor();
14800
14801 bool NeedFreeze = false;
14802 SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
14803 E->ReuseShuffleIndices.end());
14804 SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
14805 // Clear values, to be replaced by insertvector instructions.
14806 for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
14807 for_each(MutableArrayRef(GatheredScalars)
14808 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
14809 [&](Value *&V) { V = PoisonValue::get(V->getType()); });
14810 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
14811 E->CombinedEntriesWithIndices.size());
14812 transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
14813 [&](const auto &P) {
14814 return std::make_pair(VectorizableTree[P.first].get(), P.second);
14815 });
14816 // Build a mask out of the reorder indices and reorder scalars per this
14817 // mask.
14818 SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
14819 E->ReorderIndices.end());
14820 if (!ReorderMask.empty())
14821 reorderScalars(GatheredScalars, ReorderMask);
14822 SmallVector<int> SubVectorsMask;
14823 inversePermutation(E->ReorderIndices, SubVectorsMask);
14824 // Transform non-clustered elements in the mask to poison (-1).
14825 // "Clustered" operations will be reordered using this mask later.
14826 if (!SubVectors.empty() && !SubVectorsMask.empty()) {
14827 for (unsigned I : seq<unsigned>(GatheredScalars.size()))
14828 if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
14829 SubVectorsMask[ReorderMask[I]] = PoisonMaskElem;
14830 } else {
14831 SubVectorsMask.clear();
14832 }
14833 SmallVector<Value *> StoredGS(GatheredScalars);
14834 auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
14835 unsigned I, unsigned SliceSize,
14836 bool IsNotPoisonous) {
14837 if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
14838 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
14839 }))
14840 return false;
14841 TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
14842 unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
14843 if (UserTE->getNumOperands() != 2)
14844 return false;
14845 if (!IsNotPoisonous) {
14846 auto *It =
14847 find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
14848 return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
14849 return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
14850 }) != TE->UserTreeIndices.end();
14851 });
14852 if (It == VectorizableTree.end())
14853 return false;
14854 SmallVector<Value *> GS((*It)->Scalars.begin(), (*It)->Scalars.end());
14855 if (!(*It)->ReorderIndices.empty()) {
14856 inversePermutation((*It)->ReorderIndices, ReorderMask);
14857 reorderScalars(GS, ReorderMask);
14858 }
14859 if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
14860 Value *V0 = std::get<0>(P);
14861 Value *V1 = std::get<1>(P);
14862 return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
14863 (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
14864 is_contained(E->Scalars, V1));
14865 }))
14866 return false;
14867 }
14868 int Idx;
14869 if ((Mask.size() < InputVF &&
14870 ShuffleVectorInst::isExtractSubvectorMask(Mask, InputVF, Idx) &&
14871 Idx == 0) ||
14872 (Mask.size() == InputVF &&
14873 ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))) {
14874 std::iota(
14875 std::next(Mask.begin(), I * SliceSize),
14876 std::next(Mask.begin(),
14877 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
14878 0);
14879 } else {
14880 unsigned IVal =
14881 *find_if_not(Mask, [](int Idx) { return Idx == PoisonMaskElem; });
14882 std::fill(
14883 std::next(Mask.begin(), I * SliceSize),
14884 std::next(Mask.begin(),
14885 I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
14886 IVal);
14887 }
14888 return true;
14889 };
14890 BVTy ShuffleBuilder(ScalarTy, Params...);
14891 ResTy Res = ResTy();
14892 SmallVector<int> Mask;
14893 SmallVector<int> ExtractMask(GatheredScalars.size(), PoisonMaskElem);
14894 SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
14895 Value *ExtractVecBase = nullptr;
14896 bool UseVecBaseAsInput = false;
14897 SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
14898 SmallVector<SmallVector<const TreeEntry *>> Entries;
14899 Type *OrigScalarTy = GatheredScalars.front()->getType();
14900 auto *VecTy = getWidenedType(ScalarTy, GatheredScalars.size());
14901 unsigned NumParts = TTI->getNumberOfParts(VecTy);
14902 if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
14903 VecTy->getNumElements() % NumParts != 0 ||
14904 !hasFullVectorsOrPowerOf2(*TTI, VecTy->getElementType(),
14905 VecTy->getNumElements() / NumParts))
14906 NumParts = 1;
14907 if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
14908 // Check for gathered extracts.
14909 bool Resized = false;
14910 ExtractShuffles =
14911 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
14912 if (!ExtractShuffles.empty()) {
14913 SmallVector<const TreeEntry *> ExtractEntries;
14914 for (auto [Idx, I] : enumerate(ExtractMask)) {
14915 if (I == PoisonMaskElem)
14916 continue;
14917 if (const auto *TE = getTreeEntry(
14918 cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand()))
14919 ExtractEntries.push_back(TE);
14920 }
14921 if (std::optional<ResTy> Delayed =
14922 ShuffleBuilder.needToDelay(E, ExtractEntries)) {
14923 // Delay emission of gathers which are not ready yet.
14924 PostponedGathers.insert(E);
14925 // Postpone gather emission, will be emitted after the end of the
14926 // process to keep correct order.
14927 return *Delayed;
14928 }
14929 if (Value *VecBase = ShuffleBuilder.adjustExtracts(
14930 E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
14931 ExtractVecBase = VecBase;
14932 if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
14933 if (VF == VecBaseTy->getNumElements() &&
14934 GatheredScalars.size() != VF) {
14935 Resized = true;
14936 GatheredScalars.append(VF - GatheredScalars.size(),
14937 PoisonValue::get(OrigScalarTy));
14938 }
14939 }
14940 }
14941 // Gather extracts after we check for fully matched gathers only.
14942 if (!ExtractShuffles.empty() || !E->hasState() ||
14943 E->getOpcode() != Instruction::Load ||
14944 (((E->hasState() && E->getOpcode() == Instruction::Load) ||
14945 any_of(E->Scalars, IsaPred<LoadInst>)) &&
14946 any_of(E->Scalars,
14947 [this](Value *V) {
14948 return isa<LoadInst>(V) && getTreeEntry(V);
14949 })) ||
14950 (E->hasState() && E->isAltShuffle()) ||
14951 all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
14952 isSplat(E->Scalars) ||
14953 (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
14954 GatherShuffles =
14955 isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
14956 }
14957 if (!GatherShuffles.empty()) {
14958 if (std::optional<ResTy> Delayed =
14959 ShuffleBuilder.needToDelay(E, Entries)) {
14960 // Delay emission of gathers which are not ready yet.
14961 PostponedGathers.insert(E);
14962 // Postpone gather emission, will be emitted after the end of the
14963 // process to keep correct order.
14964 return *Delayed;
14965 }
14966 if (GatherShuffles.size() == 1 &&
14967 *GatherShuffles.front() == TTI::SK_PermuteSingleSrc &&
14968 Entries.front().front()->isSame(E->Scalars)) {
14969 // Perfect match in the graph, will reuse the previously vectorized
14970 // node. Cost is 0.
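 // (Informally, a "diamond" is a second user of an already-vectorized bundle:
 // the same scalars were vectorized for another tree node, so this gather can
 // simply reuse that vector, at most remapping lanes via the mask built below.)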
14971 LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
14972 << shortBundleName(E->Scalars, E->Idx) << ".\n");
14973 // Restore the mask for previous partially matched values.
14974 Mask.resize(E->Scalars.size());
14975 const TreeEntry *FrontTE = Entries.front().front();
14976 if (FrontTE->ReorderIndices.empty() &&
14977 ((FrontTE->ReuseShuffleIndices.empty() &&
14978 E->Scalars.size() == FrontTE->Scalars.size()) ||
14979 (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
14980 std::iota(Mask.begin(), Mask.end(), 0);
14981 } else {
14982 for (auto [I, V] : enumerate(E->Scalars)) {
14983 if (isa<PoisonValue>(V)) {
14984 Mask[I] = PoisonMaskElem;
14985 continue;
14986 }
14987 Mask[I] = FrontTE->findLaneForValue(V);
14988 }
14989 }
14990 ShuffleBuilder.add(*FrontTE, Mask);
14991 // Full matched entry found, no need to insert subvectors.
14992 Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
14993 return Res;
14994 }
14995 if (!Resized) {
14996 if (GatheredScalars.size() != VF &&
14997 any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
14998 return any_of(TEs, [&](const TreeEntry *TE) {
14999 return TE->getVectorFactor() == VF;
15000 });
15001 }))
15002 GatheredScalars.append(VF - GatheredScalars.size(),
15003 PoisonValue::get(OrigScalarTy));
15004 }
15005 // Remove shuffled elements from list of gathers.
15006 for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
15007 if (Mask[I] != PoisonMaskElem)
15008 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
15009 }
15010 }
15011 }
15012 auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
15013 SmallVectorImpl<int> &ReuseMask,
15014 bool IsRootPoison) {
15015 // For splats we can emit broadcasts instead of gathers, so try to find
15016 // such sequences.
15017 bool IsSplat = IsRootPoison && isSplat(Scalars) &&
15018 (Scalars.size() > 2 || Scalars.front() == Scalars.back());
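 // Illustrative example (not emitted verbatim): packing a bundle such as
 // {%x, %x, %x, %x} keeps a single copy of %x in lane 0 and records
 // ReuseMask = {0, 0, 0, 0}, so the final code can be one insertelement plus a
 // zero-mask shufflevector (a broadcast) instead of one insert per lane.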
15019 Scalars.append(VF - Scalars.size(), PoisonValue::get(OrigScalarTy));
15020 SmallVector<int> UndefPos;
15021 DenseMap<Value *, unsigned> UniquePositions;
15022 // Gather unique non-const values and all constant values.
15023 // For repeated values, just shuffle them.
15024 int NumNonConsts = 0;
15025 int SinglePos = 0;
15026 for (auto [I, V] : enumerate(Scalars)) {
15027 if (isa<UndefValue>(V)) {
15028 if (!isa<PoisonValue>(V)) {
15029 ReuseMask[I] = I;
15030 UndefPos.push_back(I);
15031 }
15032 continue;
15033 }
15034 if (isConstant(V)) {
15035 ReuseMask[I] = I;
15036 continue;
15037 }
15038 ++NumNonConsts;
15039 SinglePos = I;
15040 Value *OrigV = V;
15041 Scalars[I] = PoisonValue::get(OrigScalarTy);
15042 if (IsSplat) {
15043 Scalars.front() = OrigV;
15044 ReuseMask[I] = 0;
15045 } else {
15046 const auto Res = UniquePositions.try_emplace(OrigV, I);
15047 Scalars[Res.first->second] = OrigV;
15048 ReuseMask[I] = Res.first->second;
15049 }
15050 }
15051 if (NumNonConsts == 1) {
15052 // Restore single insert element.
15053 if (IsSplat) {
15054 ReuseMask.assign(VF, PoisonMaskElem);
15055 std::swap(Scalars.front(), Scalars[SinglePos]);
15056 if (!UndefPos.empty() && UndefPos.front() == 0)
15057 Scalars.front() = UndefValue::get(OrigScalarTy);
15058 }
15059 ReuseMask[SinglePos] = SinglePos;
15060 } else if (!UndefPos.empty() && IsSplat) {
15061 // For undef values, try to replace them with the simple broadcast.
15062 // We can do it if the broadcasted value is guaranteed to be
15063 // non-poisonous, or by freezing the incoming scalar value first.
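 // Sketch of the two outcomes handled below: if some lane holds a value known
 // not to be poison (or already vectorized), the undef lanes just reuse that
 // lane in the mask; otherwise the undef lanes become poison and NeedFreeze
 // forces a freeze of the whole gathered vector at the end, so the broadcast
 // cannot propagate poison.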
15064 auto *It = find_if(Scalars, [this, E](Value *V) {
15065 return !isa<UndefValue>(V) &&
15066 (getTreeEntry(V) || isGuaranteedNotToBePoison(V, AC) ||
15067 (E->UserTreeIndices.size() == 1 &&
15068 any_of(V->uses(), [E](const Use &U) {
15069 // Check if the value is already used in the same operation in
15070 // one of the nodes.
15071 return E->UserTreeIndices.front().EdgeIdx !=
15072 U.getOperandNo() &&
15073 is_contained(
15074 E->UserTreeIndices.front().UserTE->Scalars,
15075 U.getUser());
15076 })));
15077 });
15078 if (It != Scalars.end()) {
15079 // Replace undefs by the non-poisoned scalars and emit broadcast.
15080 int Pos = std::distance(Scalars.begin(), It);
15081 for (int I : UndefPos) {
15082 // Set the undef position to the non-poisoned scalar.
15083 ReuseMask[I] = Pos;
15084 // Replace the undef with poison; in the mask it has already been
15085 // replaced by the non-poisoned scalar.
15086 if (I != Pos)
15087 Scalars[I] = PoisonValue::get(OrigScalarTy);
15088 }
15089 } else {
15090 // Replace undefs by the poisons, emit broadcast and then emit
15091 // freeze.
15092 for (int I : UndefPos) {
15093 ReuseMask[I] = PoisonMaskElem;
15094 if (isa<UndefValue>(Scalars[I]))
15095 Scalars[I] = PoisonValue::get(OrigScalarTy);
15096 }
15097 NeedFreeze = true;
15098 }
15099 }
15100 };
15101 if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
15102 bool IsNonPoisoned = true;
15103 bool IsUsedInExpr = true;
15104 Value *Vec1 = nullptr;
15105 if (!ExtractShuffles.empty()) {
15106 // Gather of extractelements can be represented as just a shuffle of
15107 // a single/two vectors the scalars are extracted from.
15108 // Find input vectors.
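 // E.g. (illustration only): a gather of {extractelement %v, 0;
 // extractelement %v, 2; extractelement %w, 1} needs no insertelements at
 // all: it is a two-source shufflevector of %v and %w with the extract
 // indices folded into ExtractMask.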
15109 Value *Vec2 = nullptr;
15110 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
15111 if (!Mask.empty() && Mask[I] != PoisonMaskElem)
15112 ExtractMask[I] = PoisonMaskElem;
15113 }
15114 if (UseVecBaseAsInput) {
15115 Vec1 = ExtractVecBase;
15116 } else {
15117 for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
15118 if (ExtractMask[I] == PoisonMaskElem)
15119 continue;
15120 if (isa<UndefValue>(E->Scalars[I]))
15121 continue;
15122 auto *EI = cast<ExtractElementInst>(StoredGS[I]);
15123 Value *VecOp = EI->getVectorOperand();
15124 if (const auto *TE = getTreeEntry(VecOp))
15125 if (TE->VectorizedValue)
15126 VecOp = TE->VectorizedValue;
15127 if (!Vec1) {
15128 Vec1 = VecOp;
15129 } else if (Vec1 != VecOp) {
15130 assert((!Vec2 || Vec2 == VecOp) &&
15131 "Expected only 1 or 2 vectors shuffle.");
15132 Vec2 = VecOp;
15133 }
15134 }
15135 }
15136 if (Vec2) {
15137 IsUsedInExpr = false;
15138 IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1, AC) &&
15139 isGuaranteedNotToBePoison(Vec2, AC);
15140 ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
15141 } else if (Vec1) {
15142 bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1, AC);
15143 IsUsedInExpr &= FindReusedSplat(
15144 ExtractMask,
15145 cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
15146 ExtractMask.size(), IsNotPoisonedVec);
15147 ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
15148 IsNonPoisoned &= IsNotPoisonedVec;
15149 } else {
15150 IsUsedInExpr = false;
15151 ShuffleBuilder.add(PoisonValue::get(VecTy), ExtractMask,
15152 /*ForExtracts=*/true);
15153 }
15154 }
15155 if (!GatherShuffles.empty()) {
15156 unsigned SliceSize = getPartNumElems(E->Scalars.size(), NumParts);
15157 SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
15158 for (const auto [I, TEs] : enumerate(Entries)) {
15159 if (TEs.empty()) {
15160 assert(!GatherShuffles[I] &&
15161 "No shuffles with empty entries list expected.");
15162 continue;
15163 }
15164 assert((TEs.size() == 1 || TEs.size() == 2) &&
15165 "Expected shuffle of 1 or 2 entries.");
15166 unsigned Limit = getNumElems(Mask.size(), SliceSize, I);
15167 auto SubMask = ArrayRef(Mask).slice(I * SliceSize, Limit);
15168 VecMask.assign(VecMask.size(), PoisonMaskElem);
15169 copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
15170 if (TEs.size() == 1) {
15171 bool IsNotPoisonedVec =
15172 TEs.front()->VectorizedValue
15173 ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC)
15174 : true;
15175 IsUsedInExpr &=
15176 FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
15177 SliceSize, IsNotPoisonedVec);
15178 ShuffleBuilder.add(*TEs.front(), VecMask);
15179 IsNonPoisoned &= IsNotPoisonedVec;
15180 } else {
15181 IsUsedInExpr = false;
15182 ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
15183 if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
15184 IsNonPoisoned &=
15185 isGuaranteedNotToBePoison(TEs.front()->VectorizedValue, AC) &&
15186 isGuaranteedNotToBePoison(TEs.back()->VectorizedValue, AC);
15187 }
15188 }
15189 }
15190 // Try to figure out best way to combine values: build a shuffle and insert
15191 // elements or just build several shuffles.
15192 // Insert non-constant scalars.
15193 SmallVector<Value *> NonConstants(GatheredScalars);
15194 int EMSz = ExtractMask.size();
15195 int MSz = Mask.size();
15196 // Try to build constant vector and shuffle with it only if currently we
15197 // have a single permutation and more than 1 scalar constants.
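 // Example (illustration only): gathering {1, %a, 2, %b} with a single source
 // permutation can be emitted as the constant vector <1, poison, 2, poison>
 // combined with a gather of just the non-constant lanes %a and %b, instead
 // of inserting every element individually.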
15198 bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
15199 bool IsIdentityShuffle =
15200 ((UseVecBaseAsInput ||
15201 all_of(ExtractShuffles,
15202 [](const std::optional<TTI::ShuffleKind> &SK) {
15203 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
15204 TTI::SK_PermuteSingleSrc;
15205 })) &&
15206 none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
15207 ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
15208 (!GatherShuffles.empty() &&
15209 all_of(GatherShuffles,
15210 [](const std::optional<TTI::ShuffleKind> &SK) {
15211 return SK.value_or(TTI::SK_PermuteTwoSrc) ==
15212 TTI::SK_PermuteSingleSrc;
15213 }) &&
15214 none_of(Mask, [&](int I) { return I >= MSz; }) &&
15215 ShuffleVectorInst::isIdentityMask(Mask, MSz));
15216 bool EnoughConstsForShuffle =
15217 IsSingleShuffle &&
15218 (none_of(GatheredScalars,
15219 [](Value *V) {
15220 return isa<UndefValue>(V) && !isa<PoisonValue>(V);
15221 }) ||
15222 any_of(GatheredScalars,
15223 [](Value *V) {
15224 return isa<Constant>(V) && !isa<UndefValue>(V);
15225 })) &&
15226 (!IsIdentityShuffle ||
15227 (GatheredScalars.size() == 2 &&
15228 any_of(GatheredScalars,
15229 [](Value *V) { return !isa<UndefValue>(V); })) ||
15230 count_if(GatheredScalars, [](Value *V) {
15231 return isa<Constant>(V) && !isa<PoisonValue>(V);
15232 }) > 1);
15233 // NonConstants holds just the non-constant values; GatheredScalars holds
15234 // only the constants used to build the final vector and then shuffle.
15235 for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
15236 if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
15237 NonConstants[I] = PoisonValue::get(OrigScalarTy);
15238 else
15239 GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
15240 }
15241 // Generate constants for final shuffle and build a mask for them.
15242 if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
15243 SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
15244 TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
15245 Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
15246 ShuffleBuilder.add(BV, BVMask);
15247 }
15248 if (all_of(NonConstants, [=](Value *V) {
15249 return isa<PoisonValue>(V) ||
15250 (IsSingleShuffle && ((IsIdentityShuffle &&
15251 IsNonPoisoned) || IsUsedInExpr) && isa<UndefValue>(V));
15252 }))
15253 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15254 SubVectorsMask);
15255 else
15256 Res = ShuffleBuilder.finalize(
15257 E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
15258 [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
15259 TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
15260 Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
15261 });
15262 } else if (!allConstant(GatheredScalars)) {
15263 // Gather unique scalars and all constants.
15264 SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
15265 TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
15266 Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
15267 ShuffleBuilder.add(BV, ReuseMask);
15268 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15269 SubVectorsMask);
15270 } else {
15271 // Gather all constants.
15272 SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
15273 for (auto [I, V] : enumerate(GatheredScalars)) {
15274 if (!isa<PoisonValue>(V))
15275 Mask[I] = I;
15276 }
15277 Value *BV = ShuffleBuilder.gather(GatheredScalars);
15278 ShuffleBuilder.add(BV, Mask);
15279 Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
15280 SubVectorsMask);
15281 }
15282
15283 if (NeedFreeze)
15284 Res = ShuffleBuilder.createFreeze(Res);
15285 return Res;
15286}
15287
15288Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy,
15289 bool PostponedPHIs) {
15290 for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
15291 (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs);
15292 return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
15293 Builder, *this);
15294}
15295
15296/// \returns \p I after propagating metadata from \p VL only for instructions in
15297/// \p VL.
15298 static Instruction *propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
15299 SmallVector<Value *> Insts;
15300 for (Value *V : VL)
15301 if (isa<Instruction>(V))
15302 Insts.push_back(V);
15303 return llvm::propagateMetadata(Inst, Insts);
15304}
15305
15306Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
15307 IRBuilderBase::InsertPointGuard Guard(Builder);
15308
15309 if (E->VectorizedValue &&
15310 (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
15311 E->isAltShuffle())) {
15312 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
15313 return E->VectorizedValue;
15314 }
15315
15316 Value *V = E->Scalars.front();
15317 Type *ScalarTy = V->getType();
15318 if (!isa<CmpInst>(V))
15319 ScalarTy = getValueType(V);
15320 auto It = MinBWs.find(E);
15321 if (It != MinBWs.end()) {
15322 auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
15323 ScalarTy = IntegerType::get(F->getContext(), It->second.first);
15324 if (VecTy)
15325 ScalarTy = getWidenedType(ScalarTy, VecTy->getNumElements());
15326 }
15327 auto *VecTy = getWidenedType(ScalarTy, E->Scalars.size());
15328 if (E->isGather()) {
15329 // Set insert point for non-reduction initial nodes.
15330 if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
15331 setInsertPointAfterBundle(E);
15332 Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
15333 E->VectorizedValue = Vec;
15334 return Vec;
15335 }
15336
15337 bool IsReverseOrder =
15338 !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
15339 auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
15340 ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
15341 if (E->getOpcode() == Instruction::Store &&
15342 E->State == TreeEntry::Vectorize) {
15343 ArrayRef<int> Mask =
15344 ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
15345 E->ReorderIndices.size());
15346 ShuffleBuilder.add(V, Mask);
15347 } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
15348 ShuffleBuilder.addOrdered(V, {});
15349 } else {
15350 ShuffleBuilder.addOrdered(V, E->ReorderIndices);
15351 }
15352 SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
15353 E->CombinedEntriesWithIndices.size());
15354 transform(
15355 E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
15356 return std::make_pair(VectorizableTree[P.first].get(), P.second);
15357 });
15358 assert(
15359 (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
15360 "Expected either combined subnodes or reordering");
15361 return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
15362 };
15363
15364 assert(!E->isGather() && "Unhandled state");
15365 unsigned ShuffleOrOp =
15366 E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
15367 Instruction *VL0 = E->getMainOp();
15368 auto GetOperandSignedness = [&](unsigned Idx) {
15369 const TreeEntry *OpE = getOperandEntry(E, Idx);
15370 bool IsSigned = false;
15371 auto It = MinBWs.find(OpE);
15372 if (It != MinBWs.end())
15373 IsSigned = It->second.second;
15374 else
15375 IsSigned = any_of(OpE->Scalars, [&](Value *R) {
15376 if (isa<PoisonValue>(V))
15377 return false;
15378 return !isKnownNonNegative(R, SimplifyQuery(*DL));
15379 });
15380 return IsSigned;
15381 };
15382 switch (ShuffleOrOp) {
15383 case Instruction::PHI: {
15384 assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
15385 E != VectorizableTree.front().get() ||
15386 !E->UserTreeIndices.empty()) &&
15387 "PHI reordering is free.");
15388 if (PostponedPHIs && E->VectorizedValue)
15389 return E->VectorizedValue;
15390 auto *PH = cast<PHINode>(VL0);
15391 Builder.SetInsertPoint(PH->getParent(),
15392 PH->getParent()->getFirstNonPHIIt());
15393 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15394 if (PostponedPHIs || !E->VectorizedValue) {
15395 PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
15396 E->PHI = NewPhi;
15397 Value *V = NewPhi;
15398
15399 // Adjust insertion point once all PHI's have been generated.
15400 Builder.SetInsertPoint(PH->getParent(),
15401 PH->getParent()->getFirstInsertionPt());
15402 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15403
15404 V = FinalShuffle(V, E);
15405
15406 E->VectorizedValue = V;
15407 if (PostponedPHIs)
15408 return V;
15409 }
15410 PHINode *NewPhi = cast<PHINode>(E->PHI);
15411 // If phi node is fully emitted - exit.
15412 if (NewPhi->getNumIncomingValues() != 0)
15413 return NewPhi;
15414
15415 // PHINodes may have multiple entries from the same block. We want to
15416 // visit every block once.
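 // Note (informal): for a repeated predecessor block the vector PHI simply
 // reuses the incoming vector already computed for that block, mirroring the
 // requirement that scalar PHIs agree across duplicate edges.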
15418
15419 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
15421 BasicBlock *IBB = PH->getIncomingBlock(I);
15422
15423 // Stop emission if all incoming values are generated.
15424 if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
15425 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15426 return NewPhi;
15427 }
15428
15429 if (!VisitedBBs.insert(IBB).second) {
15430 NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
15431 continue;
15432 }
15433
15434 Builder.SetInsertPoint(IBB->getTerminator());
15435 Builder.SetCurrentDebugLocation(PH->getDebugLoc());
15436 Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
15437 if (VecTy != Vec->getType()) {
15438 assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
15439 MinBWs.contains(getOperandEntry(E, I))) &&
15440 "Expected item in MinBWs.");
15441 Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
15442 }
15443 NewPhi->addIncoming(Vec, IBB);
15444 }
15445
15446 assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
15447 "Invalid number of incoming values");
15448 assert(E->VectorizedValue && "Expected vectorized value.");
15449 return E->VectorizedValue;
15450 }
15451
15452 case Instruction::ExtractElement: {
15453 Value *V = E->getSingleOperand(0);
15454 if (const TreeEntry *TE = getTreeEntry(V))
15455 V = TE->VectorizedValue;
15456 setInsertPointAfterBundle(E);
15457 V = FinalShuffle(V, E);
15458 E->VectorizedValue = V;
15459 return V;
15460 }
15461 case Instruction::ExtractValue: {
15462 auto *LI = cast<LoadInst>(E->getSingleOperand(0));
15463 Builder.SetInsertPoint(LI);
15464 Value *Ptr = LI->getPointerOperand();
15465 LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
15466 Value *NewV = ::propagateMetadata(V, E->Scalars);
15467 NewV = FinalShuffle(NewV, E);
15468 E->VectorizedValue = NewV;
15469 return NewV;
15470 }
15471 case Instruction::InsertElement: {
15472 assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
15473 Builder.SetInsertPoint(cast<Instruction>(E->Scalars.back()));
15474 Value *V = vectorizeOperand(E, 1, PostponedPHIs);
15475 ArrayRef<Value *> Op = E->getOperand(1);
15476 Type *ScalarTy = Op.front()->getType();
15477 if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
15478 assert(ScalarTy->isIntegerTy() && "Expected item in MinBWs.");
15479 std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
15480 assert(Res.first > 0 && "Expected item in MinBWs.");
15481 V = Builder.CreateIntCast(
15482 V,
15483 getWidenedType(
15484 ScalarTy,
15485 cast<FixedVectorType>(V->getType())->getNumElements()),
15486 Res.second);
15487 }
15488
15489 // Create InsertVector shuffle if necessary
15490 auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
15491 return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
15492 }));
15493 const unsigned NumElts =
15494 cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
15495 const unsigned NumScalars = E->Scalars.size();
15496
15497 unsigned Offset = *getElementIndex(VL0);
15498 assert(Offset < NumElts && "Failed to find vector index offset");
15499
15500 // Create shuffle to resize vector
15501 SmallVector<int> Mask;
15502 if (!E->ReorderIndices.empty()) {
15503 inversePermutation(E->ReorderIndices, Mask);
15504 Mask.append(NumElts - NumScalars, PoisonMaskElem);
15505 } else {
15506 Mask.assign(NumElts, PoisonMaskElem);
15507 std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
15508 }
15509 // Create InsertVector shuffle if necessary
15510 bool IsIdentity = true;
15511 SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
15512 Mask.swap(PrevMask);
15513 for (unsigned I = 0; I < NumScalars; ++I) {
15514 Value *Scalar = E->Scalars[PrevMask[I]];
15515 unsigned InsertIdx = *getElementIndex(Scalar);
15516 IsIdentity &= InsertIdx - Offset == I;
15517 Mask[InsertIdx - Offset] = I;
15518 }
15519 if (!IsIdentity || NumElts != NumScalars) {
15520 Value *V2 = nullptr;
15521 bool IsVNonPoisonous =
15523 SmallVector<int> InsertMask(Mask);
15524 if (NumElts != NumScalars && Offset == 0) {
15525 // Follow all insert element instructions from the current buildvector
15526 // sequence.
15527 InsertElementInst *Ins = cast<InsertElementInst>(VL0);
15528 do {
15529 std::optional<unsigned> InsertIdx = getElementIndex(Ins);
15530 if (!InsertIdx)
15531 break;
15532 if (InsertMask[*InsertIdx] == PoisonMaskElem)
15533 InsertMask[*InsertIdx] = *InsertIdx;
15534 if (!Ins->hasOneUse())
15535 break;
15536 Ins = dyn_cast_or_null<InsertElementInst>(
15537 Ins->getUniqueUndroppableUser());
15538 } while (Ins);
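 // The walk above (informal summary): starting from the bundle's first
 // insertelement, follow the single-use chain of insertelements and record
 // every constant lane index it writes, so the later checks know which lanes
 // of the original destination vector are (re)defined by this sequence.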
15539 SmallBitVector UseMask =
15540 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15541 SmallBitVector IsFirstPoison =
15542 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15543 SmallBitVector IsFirstUndef =
15544 isUndefVector(FirstInsert->getOperand(0), UseMask);
15545 if (!IsFirstPoison.all()) {
15546 unsigned Idx = 0;
15547 for (unsigned I = 0; I < NumElts; I++) {
15548 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
15549 IsFirstUndef.test(I)) {
15550 if (IsVNonPoisonous) {
15551 InsertMask[I] = I < NumScalars ? I : 0;
15552 continue;
15553 }
15554 if (!V2)
15555 V2 = UndefValue::get(V->getType());
15556 if (Idx >= NumScalars)
15557 Idx = NumScalars - 1;
15558 InsertMask[I] = NumScalars + Idx;
15559 ++Idx;
15560 } else if (InsertMask[I] != PoisonMaskElem &&
15561 Mask[I] == PoisonMaskElem) {
15562 InsertMask[I] = PoisonMaskElem;
15563 }
15564 }
15565 } else {
15566 InsertMask = Mask;
15567 }
15568 }
15569 if (!V2)
15570 V2 = PoisonValue::get(V->getType());
15571 V = Builder.CreateShuffleVector(V, V2, InsertMask);
15572 if (auto *I = dyn_cast<Instruction>(V)) {
15573 GatherShuffleExtractSeq.insert(I);
15574 CSEBlocks.insert(I->getParent());
15575 }
15576 }
15577
15578 SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
15579 for (unsigned I = 0; I < NumElts; I++) {
15580 if (Mask[I] != PoisonMaskElem)
15581 InsertMask[Offset + I] = I;
15582 }
15583 SmallBitVector UseMask =
15584 buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
15585 SmallBitVector IsFirstUndef =
15586 isUndefVector(FirstInsert->getOperand(0), UseMask);
15587 if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
15588 NumElts != NumScalars) {
15589 if (IsFirstUndef.all()) {
15590 if (!ShuffleVectorInst::isIdentityMask(InsertMask, NumElts)) {
15591 SmallBitVector IsFirstPoison =
15592 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15593 if (!IsFirstPoison.all()) {
15594 for (unsigned I = 0; I < NumElts; I++) {
15595 if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I))
15596 InsertMask[I] = I + NumElts;
15597 }
15598 }
15599 V = Builder.CreateShuffleVector(
15600 V,
15601 IsFirstPoison.all() ? PoisonValue::get(V->getType())
15602 : FirstInsert->getOperand(0),
15603 InsertMask, cast<Instruction>(E->Scalars.back())->getName());
15604 if (auto *I = dyn_cast<Instruction>(V)) {
15605 GatherShuffleExtractSeq.insert(I);
15606 CSEBlocks.insert(I->getParent());
15607 }
15608 }
15609 } else {
15610 SmallBitVector IsFirstPoison =
15611 isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
15612 for (unsigned I = 0; I < NumElts; I++) {
15613 if (InsertMask[I] == PoisonMaskElem)
15614 InsertMask[I] = IsFirstPoison.test(I) ? PoisonMaskElem : I;
15615 else
15616 InsertMask[I] += NumElts;
15617 }
15618 V = Builder.CreateShuffleVector(
15619 FirstInsert->getOperand(0), V, InsertMask,
15620 cast<Instruction>(E->Scalars.back())->getName());
15621 if (auto *I = dyn_cast<Instruction>(V)) {
15622 GatherShuffleExtractSeq.insert(I);
15623 CSEBlocks.insert(I->getParent());
15624 }
15625 }
15626 }
15627
15628 ++NumVectorInstructions;
15629 E->VectorizedValue = V;
15630 return V;
15631 }
15632 case Instruction::ZExt:
15633 case Instruction::SExt:
15634 case Instruction::FPToUI:
15635 case Instruction::FPToSI:
15636 case Instruction::FPExt:
15637 case Instruction::PtrToInt:
15638 case Instruction::IntToPtr:
15639 case Instruction::SIToFP:
15640 case Instruction::UIToFP:
15641 case Instruction::Trunc:
15642 case Instruction::FPTrunc:
15643 case Instruction::BitCast: {
15644 setInsertPointAfterBundle(E);
15645
15646 Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
15647 if (E->VectorizedValue) {
15648 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15649 return E->VectorizedValue;
15650 }
15651
15652 auto *CI = cast<CastInst>(VL0);
15653 Instruction::CastOps VecOpcode = CI->getOpcode();
15654 Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
15655 auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
15656 if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
15657 (SrcIt != MinBWs.end() || It != MinBWs.end() ||
15658 SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
15659 // Check if the values are candidates to demote.
15660 unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
15661 if (SrcIt != MinBWs.end())
15662 SrcBWSz = SrcIt->second.first;
15663 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
15664 if (BWSz == SrcBWSz) {
15665 VecOpcode = Instruction::BitCast;
15666 } else if (BWSz < SrcBWSz) {
15667 VecOpcode = Instruction::Trunc;
15668 } else if (It != MinBWs.end()) {
15669 assert(BWSz > SrcBWSz && "Invalid cast!");
15670 VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
15671 } else if (SrcIt != MinBWs.end()) {
15672 assert(BWSz > SrcBWSz && "Invalid cast!");
15673 VecOpcode =
15674 SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
15675 }
15676 } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
15677 !SrcIt->second.second) {
15678 VecOpcode = Instruction::UIToFP;
15679 }
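 // Informal summary of the opcode selection above: equal demoted widths mean
 // no cast is needed at all (the operand vector is reused as-is below), a
 // narrower destination becomes a trunc, a wider one becomes sext/zext based
 // on the recorded signedness, and sitofp degrades to uitofp when the demoted
 // source is known to be unsigned.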
15680 Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
15681 ? InVec
15682 : Builder.CreateCast(VecOpcode, InVec, VecTy);
15683 V = FinalShuffle(V, E);
15684
15685 E->VectorizedValue = V;
15686 ++NumVectorInstructions;
15687 return V;
15688 }
15689 case Instruction::FCmp:
15690 case Instruction::ICmp: {
15691 setInsertPointAfterBundle(E);
15692
15693 Value *L = vectorizeOperand(E, 0, PostponedPHIs);
15694 if (E->VectorizedValue) {
15695 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15696 return E->VectorizedValue;
15697 }
15698 Value *R = vectorizeOperand(E, 1, PostponedPHIs);
15699 if (E->VectorizedValue) {
15700 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15701 return E->VectorizedValue;
15702 }
15703 if (L->getType() != R->getType()) {
15704 assert((getOperandEntry(E, 0)->isGather() ||
15705 getOperandEntry(E, 1)->isGather() ||
15706 MinBWs.contains(getOperandEntry(E, 0)) ||
15707 MinBWs.contains(getOperandEntry(E, 1))) &&
15708 "Expected item in MinBWs.");
15709 if (cast<VectorType>(L->getType())
15710 ->getElementType()
15711 ->getIntegerBitWidth() < cast<VectorType>(R->getType())
15712 ->getElementType()
15713 ->getIntegerBitWidth()) {
15714 Type *CastTy = R->getType();
15715 L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
15716 } else {
15717 Type *CastTy = L->getType();
15718 R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
15719 }
15720 }
15721
15722 CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
15723 Value *V = Builder.CreateCmp(P0, L, R);
15724 propagateIRFlags(V, E->Scalars, VL0);
15725 if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
15726 ICmp->setSameSign(/*B=*/false);
15727 // Do not cast for cmps.
15728 VecTy = cast<FixedVectorType>(V->getType());
15729 V = FinalShuffle(V, E);
15730
15731 E->VectorizedValue = V;
15732 ++NumVectorInstructions;
15733 return V;
15734 }
15735 case Instruction::Select: {
15736 setInsertPointAfterBundle(E);
15737
15738 Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
15739 if (E->VectorizedValue) {
15740 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15741 return E->VectorizedValue;
15742 }
15743 Value *True = vectorizeOperand(E, 1, PostponedPHIs);
15744 if (E->VectorizedValue) {
15745 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15746 return E->VectorizedValue;
15747 }
15748 Value *False = vectorizeOperand(E, 2, PostponedPHIs);
15749 if (E->VectorizedValue) {
15750 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15751 return E->VectorizedValue;
15752 }
15753 if (True->getType() != VecTy || False->getType() != VecTy) {
15754 assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
15755 getOperandEntry(E, 2)->isGather() ||
15756 MinBWs.contains(getOperandEntry(E, 1)) ||
15757 MinBWs.contains(getOperandEntry(E, 2))) &&
15758 "Expected item in MinBWs.");
15759 if (True->getType() != VecTy)
15760 True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
15761 if (False->getType() != VecTy)
15762 False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
15763 }
15764
15765 unsigned CondNumElements = getNumElements(Cond->getType());
15766 unsigned TrueNumElements = getNumElements(True->getType());
15767 assert(TrueNumElements >= CondNumElements &&
15768 TrueNumElements % CondNumElements == 0 &&
15769 "Cannot vectorize Instruction::Select");
15770 assert(TrueNumElements == getNumElements(False->getType()) &&
15771 "Cannot vectorize Instruction::Select");
15772 if (CondNumElements != TrueNumElements) {
15773 // When the condition has fewer elements than the selected values (e.g. an
15774 // i1 condition selecting revectorized bundles), replicate the condition.
15775 Cond = Builder.CreateShuffleVector(
15776 Cond, createReplicatedMask(TrueNumElements / CondNumElements,
15777 CondNumElements));
15778 }
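 // For instance (illustrative): with a 2-element condition selecting
 // 8-element values, the mask built above is <0,0,0,0,1,1,1,1>, so each
 // original condition bit governs its whole sub-vector.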
15779 assert(getNumElements(Cond->getType()) == TrueNumElements &&
15780 "Cannot vectorize Instruction::Select");
15781 Value *V = Builder.CreateSelect(Cond, True, False);
15782 V = FinalShuffle(V, E);
15783
15784 E->VectorizedValue = V;
15785 ++NumVectorInstructions;
15786 return V;
15787 }
15788 case Instruction::FNeg: {
15789 setInsertPointAfterBundle(E);
15790
15791 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
15792
15793 if (E->VectorizedValue) {
15794 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15795 return E->VectorizedValue;
15796 }
15797
15798 Value *V = Builder.CreateUnOp(
15799 static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
15800 propagateIRFlags(V, E->Scalars, VL0);
15801 if (auto *I = dyn_cast<Instruction>(V))
15802 V = ::propagateMetadata(I, E->Scalars);
15803
15804 V = FinalShuffle(V, E);
15805
15806 E->VectorizedValue = V;
15807 ++NumVectorInstructions;
15808
15809 return V;
15810 }
15811 case Instruction::Freeze: {
15812 setInsertPointAfterBundle(E);
15813
15814 Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
15815
15816 if (E->VectorizedValue) {
15817 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15818 return E->VectorizedValue;
15819 }
15820
15821 if (Op->getType() != VecTy) {
15822 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
15823 MinBWs.contains(getOperandEntry(E, 0))) &&
15824 "Expected item in MinBWs.");
15825 Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
15826 }
15827 Value *V = Builder.CreateFreeze(Op);
15828 V = FinalShuffle(V, E);
15829
15830 E->VectorizedValue = V;
15831 ++NumVectorInstructions;
15832
15833 return V;
15834 }
15835 case Instruction::Add:
15836 case Instruction::FAdd:
15837 case Instruction::Sub:
15838 case Instruction::FSub:
15839 case Instruction::Mul:
15840 case Instruction::FMul:
15841 case Instruction::UDiv:
15842 case Instruction::SDiv:
15843 case Instruction::FDiv:
15844 case Instruction::URem:
15845 case Instruction::SRem:
15846 case Instruction::FRem:
15847 case Instruction::Shl:
15848 case Instruction::LShr:
15849 case Instruction::AShr:
15850 case Instruction::And:
15851 case Instruction::Or:
15852 case Instruction::Xor: {
15853 setInsertPointAfterBundle(E);
15854
15855 Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
15856 if (E->VectorizedValue) {
15857 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15858 return E->VectorizedValue;
15859 }
15860 Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
15861 if (E->VectorizedValue) {
15862 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15863 return E->VectorizedValue;
15864 }
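 // Informal note on the special case below: once the bundle has been demoted
 // to It->second.first bits, an 'and' with constants whose low bits are all
 // ones over that width is a no-op, so the other operand can be forwarded
 // directly instead of emitting the vector 'and'.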
15865 if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
15866 for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
15867 ArrayRef<Value *> Ops = E->getOperand(I);
15868 if (all_of(Ops, [&](Value *Op) {
15869 auto *CI = dyn_cast<ConstantInt>(Op);
15870 return CI && CI->getValue().countr_one() >= It->second.first;
15871 })) {
15872 V = FinalShuffle(I == 0 ? RHS : LHS, E);
15873 E->VectorizedValue = V;
15874 ++NumVectorInstructions;
15875 return V;
15876 }
15877 }
15878 }
15879 if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
15880 assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
15881 getOperandEntry(E, 1)->isGather() ||
15882 MinBWs.contains(getOperandEntry(E, 0)) ||
15883 MinBWs.contains(getOperandEntry(E, 1))) &&
15884 "Expected item in MinBWs.");
15885 if (LHS->getType() != VecTy)
15886 LHS = Builder.CreateIntCast(LHS, VecTy, GetOperandSignedness(0));
15887 if (RHS->getType() != VecTy)
15888 RHS = Builder.CreateIntCast(RHS, VecTy, GetOperandSignedness(1));
15889 }
15890
15891 Value *V = Builder.CreateBinOp(
15892 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
15893 RHS);
15894 propagateIRFlags(V, E->Scalars, VL0, It == MinBWs.end());
15895 if (auto *I = dyn_cast<Instruction>(V)) {
15896 V = ::propagateMetadata(I, E->Scalars);
15897 // Drop nuw flags for abs(sub(commutative), true).
15898 if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
15899 any_of(E->Scalars, [](Value *V) {
15900 return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
15901 }))
15902 I->setHasNoUnsignedWrap(/*b=*/false);
15903 }
15904
15905 V = FinalShuffle(V, E);
15906
15907 E->VectorizedValue = V;
15908 ++NumVectorInstructions;
15909
15910 return V;
15911 }
15912 case Instruction::Load: {
15913 // Loads are inserted at the head of the tree because we don't want to
15914 // sink them all the way down past store instructions.
15915 setInsertPointAfterBundle(E);
15916
15917 LoadInst *LI = cast<LoadInst>(VL0);
15918 Instruction *NewLI;
15919 Value *PO = LI->getPointerOperand();
15920 if (E->State == TreeEntry::Vectorize) {
15921 NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
15922 } else if (E->State == TreeEntry::StridedVectorize) {
15923 Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
15924 Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
15925 PO = IsReverseOrder ? PtrN : Ptr0;
15926 std::optional<int> Diff = getPointersDiff(
15927 VL0->getType(), Ptr0, VL0->getType(), PtrN, *DL, *SE);
15928 Type *StrideTy = DL->getIndexType(PO->getType());
15929 Value *StrideVal;
15930 if (Diff) {
15931 int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
15932 StrideVal =
15933 ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
15934 DL->getTypeAllocSize(ScalarTy));
15935 } else {
15936 SmallVector<Value *> PointerOps(E->Scalars.size(), nullptr);
15937 transform(E->Scalars, PointerOps.begin(), [](Value *V) {
15938 return cast<LoadInst>(V)->getPointerOperand();
15939 });
15940 OrdersType Order;
15941 std::optional<Value *> Stride =
15942 calculateRtStride(PointerOps, ScalarTy, *DL, *SE, Order,
15943 &*Builder.GetInsertPoint());
15944 Value *NewStride =
15945 Builder.CreateIntCast(*Stride, StrideTy, /*isSigned=*/true);
15946 StrideVal = Builder.CreateMul(
15947 NewStride,
15948 ConstantInt::get(
15949 StrideTy,
15950 (IsReverseOrder ? -1 : 1) *
15951 static_cast<int>(DL->getTypeAllocSize(ScalarTy))));
15952 }
15953 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
15954 auto *Inst = Builder.CreateIntrinsic(
15955 Intrinsic::experimental_vp_strided_load,
15956 {VecTy, PO->getType(), StrideTy},
15957 {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
15958 Builder.getInt32(E->Scalars.size())});
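 // Roughly, for a reversed bundle of four i32 loads this emits something like
 // (illustrative IR only; exact name mangling may differ):
 //   call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
 //       ptr %base, i64 -4, <4 x i1> splat (i1 true), i32 4)
 // i.e. the base pointer, a byte stride (negative when the order is reversed),
 // an all-true mask and the explicit vector length.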
15959 Inst->addParamAttr(
15960 /*ArgNo=*/0,
15961 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
15962 NewLI = Inst;
15963 } else {
15964 assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
15965 Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
15966 if (E->VectorizedValue) {
15967 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
15968 return E->VectorizedValue;
15969 }
15970 if (isa<FixedVectorType>(ScalarTy)) {
15971 assert(SLPReVec && "FixedVectorType is not expected.");
15972 // CreateMaskedGather expects VecTy and VecPtr to have the same number of
15973 // elements, so we need to expand VecPtr if ScalarTy is a vector type.
15974 unsigned ScalarTyNumElements =
15975 cast<FixedVectorType>(ScalarTy)->getNumElements();
15976 unsigned VecTyNumElements =
15977 cast<FixedVectorType>(VecTy)->getNumElements();
15978 assert(VecTyNumElements % ScalarTyNumElements == 0 &&
15979 "Cannot expand getelementptr.");
15980 unsigned VF = VecTyNumElements / ScalarTyNumElements;
15981 SmallVector<Constant *> Indices(VecTyNumElements);
15982 transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
15983 return Builder.getInt64(I % ScalarTyNumElements);
15984 });
15985 VecPtr = Builder.CreateGEP(
15986 VecTy->getElementType(),
15987 Builder.CreateShuffleVector(
15988 VecPtr, createReplicatedMask(ScalarTyNumElements, VF)),
15989 ConstantVector::get(Indices));
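 // Illustrative shape of the expansion: for ScalarTy = <2 x i32> and
 // VecTy = <8 x i32>, each of the 4 pointers is replicated twice and then
 // offset by the per-lane indices 0,1,0,1,..., yielding 8 element pointers
 // suitable for an i32 masked gather.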
15990 }
15991 // Use the minimum alignment of the gathered loads.
15992 Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
15993 NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
15994 }
15995 Value *V = ::propagateMetadata(NewLI, E->Scalars);
15996
15997 V = FinalShuffle(V, E);
15998 E->VectorizedValue = V;
15999 ++NumVectorInstructions;
16000 return V;
16001 }
16002 case Instruction::Store: {
16003 auto *SI = cast<StoreInst>(VL0);
16004
16005 setInsertPointAfterBundle(E);
16006
16007 Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
16008 if (VecValue->getType() != VecTy)
16009 VecValue =
16010 Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
16011 VecValue = FinalShuffle(VecValue, E);
16012
16013 Value *Ptr = SI->getPointerOperand();
16014 Instruction *ST;
16015 if (E->State == TreeEntry::Vectorize) {
16016 ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
16017 } else {
16018 assert(E->State == TreeEntry::StridedVectorize &&
16019 "Expected either strided or consecutive stores.");
16020 if (!E->ReorderIndices.empty()) {
16021 SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
16022 Ptr = SI->getPointerOperand();
16023 }
16024 Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
16025 Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
16026 auto *Inst = Builder.CreateIntrinsic(
16027 Intrinsic::experimental_vp_strided_store,
16028 {VecTy, Ptr->getType(), StrideTy},
16029 {VecValue, Ptr,
16030 ConstantInt::get(
16031 StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
16032 Builder.getAllOnesMask(VecTy->getElementCount()),
16033 Builder.getInt32(E->Scalars.size())});
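 // This mirrors the strided-load path above: a negative byte stride stores
 // the reordered lanes backwards in memory, e.g. (illustrative IR only):
 //   call void @llvm.experimental.vp.strided.store.v4i32.p0.i64(
 //       <4 x i32> %vec, ptr %base, i64 -4, <4 x i1> splat (i1 true), i32 4)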
16034 Inst->addParamAttr(
16035 /*ArgNo=*/1,
16036 Attribute::getWithAlignment(Inst->getContext(), CommonAlignment));
16037 ST = Inst;
16038 }
16039
16040 Value *V = ::propagateMetadata(ST, E->Scalars);
16041
16042 E->VectorizedValue = V;
16043 ++NumVectorInstructions;
16044 return V;
16045 }
16046 case Instruction::GetElementPtr: {
16047 auto *GEP0 = cast<GetElementPtrInst>(VL0);
16048 setInsertPointAfterBundle(E);
16049
16050 Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
16051 if (E->VectorizedValue) {
16052 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16053 return E->VectorizedValue;
16054 }
16055
16056 SmallVector<Value *> OpVecs;
16057 for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
16058 Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
16059 if (E->VectorizedValue) {
16060 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16061 return E->VectorizedValue;
16062 }
16063 OpVecs.push_back(OpVec);
16064 }
16065
16066 Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
16067 if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
16068 SmallVector<Value *> GEPs;
16069 for (Value *V : E->Scalars) {
16070 if (isa<GetElementPtrInst>(V))
16071 GEPs.push_back(V);
16072 }
16073 V = ::propagateMetadata(I, GEPs);
16074 }
16075
16076 V = FinalShuffle(V, E);
16077
16078 E->VectorizedValue = V;
16079 ++NumVectorInstructions;
16080
16081 return V;
16082 }
16083 case Instruction::Call: {
16084 CallInst *CI = cast<CallInst>(VL0);
16085 setInsertPointAfterBundle(E);
16086
16087 Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
16088
16089 SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
16090 CI, ID, VecTy->getNumElements(),
16091 It != MinBWs.end() ? It->second.first : 0, TTI);
16092 auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
16093 bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
16094 VecCallCosts.first <= VecCallCosts.second;
16095
16096 Value *ScalarArg = nullptr;
16097 SmallVector<Value *> OpVecs;
16098 SmallVector<Type *, 2> TysForDecl;
16099 // Add return type if intrinsic is overloaded on it.
16100 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1, TTI))
16101 TysForDecl.push_back(VecTy);
16102 auto *CEI = cast<CallInst>(VL0);
16103 for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
16104 ValueList OpVL;
16105 // Some intrinsics have scalar arguments. This argument should not be
16106 // vectorized.
16107 if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I, TTI)) {
16108 ScalarArg = CEI->getArgOperand(I);
16109 // If we decided to reduce the bitwidth of the abs intrinsic, its second
16110 // argument must be set to false (do not return poison for signed-min input).
16111 if (ID == Intrinsic::abs && It != MinBWs.end() &&
16112 It->second.first < DL->getTypeSizeInBits(CEI->getType()))
16113 ScalarArg = Builder.getFalse();
16114 OpVecs.push_back(ScalarArg);
16116 TysForDecl.push_back(ScalarArg->getType());
16117 continue;
16118 }
16119
16120 Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
16121 if (E->VectorizedValue) {
16122 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16123 return E->VectorizedValue;
16124 }
16125 ScalarArg = CEI->getArgOperand(I);
16126 if (cast<VectorType>(OpVec->getType())->getElementType() !=
16127 ScalarArg->getType()->getScalarType() &&
16128 It == MinBWs.end()) {
16129 auto *CastTy =
16130 getWidenedType(ScalarArg->getType(), VecTy->getNumElements());
16131 OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
16132 } else if (It != MinBWs.end()) {
16133 OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
16134 }
16135 LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
16136 OpVecs.push_back(OpVec);
16137 if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, I, TTI))
16138 TysForDecl.push_back(OpVec->getType());
16139 }
16140
16141 Function *CF;
16142 if (!UseIntrinsic) {
16143 VFShape Shape =
16144 VFShape::get(CI->getFunctionType(),
16145 ElementCount::getFixed(
16146 static_cast<unsigned>(VecTy->getNumElements())),
16147 false /*HasGlobalPred*/);
16148 CF = VFDatabase(*CI).getVectorizedFunction(Shape);
16149 } else {
16150 CF = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, TysForDecl);
16151 }
16152
16153 SmallVector<OperandBundleDef, 1> OpBundles;
16154 CI->getOperandBundlesAsDefs(OpBundles);
16155 Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
16156
16157 propagateIRFlags(V, E->Scalars, VL0);
16158 V = FinalShuffle(V, E);
16159
16160 E->VectorizedValue = V;
16161 ++NumVectorInstructions;
16162 return V;
16163 }
16164 case Instruction::ShuffleVector: {
16165 Value *V;
16166 if (SLPReVec && !E->isAltShuffle()) {
16167 setInsertPointAfterBundle(E);
16168 Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
16169 if (E->VectorizedValue) {
16170 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16171 return E->VectorizedValue;
16172 }
16173 SmallVector<int> ThisMask(calculateShufflevectorMask(E->Scalars));
16174 if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
16175 assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
16176 "Not supported shufflevector usage.");
16177 SmallVector<int> NewMask(ThisMask.size());
16178 transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
16179 return SVSrc->getShuffleMask()[Mask];
16180 });
16181 V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
16182 } else {
16183 V = Builder.CreateShuffleVector(Src, ThisMask);
16184 }
16185 propagateIRFlags(V, E->Scalars, VL0);
16186 if (auto *I = dyn_cast<Instruction>(V))
16187 V = ::propagateMetadata(I, E->Scalars);
16188 V = FinalShuffle(V, E);
16189 } else {
16190 assert(E->isAltShuffle() &&
16191 ((Instruction::isBinaryOp(E->getOpcode()) &&
16192 Instruction::isBinaryOp(E->getAltOpcode())) ||
16193 (Instruction::isCast(E->getOpcode()) &&
16194 Instruction::isCast(E->getAltOpcode())) ||
16195 (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
16196 "Invalid Shuffle Vector Operand");
16197
16198 Value *LHS = nullptr, *RHS = nullptr;
16199 if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
16200 setInsertPointAfterBundle(E);
16201 LHS = vectorizeOperand(E, 0, PostponedPHIs);
16202 if (E->VectorizedValue) {
16203 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16204 return E->VectorizedValue;
16205 }
16206 RHS = vectorizeOperand(E, 1, PostponedPHIs);
16207 } else {
16208 setInsertPointAfterBundle(E);
16209 LHS = vectorizeOperand(E, 0, PostponedPHIs);
16210 }
16211 if (E->VectorizedValue) {
16212 LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
16213 return E->VectorizedValue;
16214 }
16215 if (LHS && RHS &&
16216 ((Instruction::isBinaryOp(E->getOpcode()) &&
16217 (LHS->getType() != VecTy || RHS->getType() != VecTy)) ||
16218 (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()))) {
16219 assert((It != MinBWs.end() ||
16220 getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
16221 getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
16222 MinBWs.contains(getOperandEntry(E, 0)) ||
16223 MinBWs.contains(getOperandEntry(E, 1))) &&
16224 "Expected item in MinBWs.");
16225 Type *CastTy = VecTy;
16226 if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
16227 if (cast<VectorType>(LHS->getType())
16228 ->getElementType()
16229 ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
16230 ->getElementType()
16231 ->getIntegerBitWidth())
16232 CastTy = RHS->getType();
16233 else
16234 CastTy = LHS->getType();
16235 }
16236 if (LHS->getType() != CastTy)
16237 LHS = Builder.CreateIntCast(LHS, CastTy, GetOperandSignedness(0));
16238 if (RHS->getType() != CastTy)
16239 RHS = Builder.CreateIntCast(RHS, CastTy, GetOperandSignedness(1));
16240 }
16241
16242 Value *V0, *V1;
16243 if (Instruction::isBinaryOp(E->getOpcode())) {
16244 V0 = Builder.CreateBinOp(
16245 static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
16246 V1 = Builder.CreateBinOp(
16247 static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
16248 } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
16249 V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
16250 auto *AltCI = cast<CmpInst>(E->getAltOp());
16251 CmpInst::Predicate AltPred = AltCI->getPredicate();
16252 V1 = Builder.CreateCmp(AltPred, LHS, RHS);
16253 } else {
16254 if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
16255 unsigned SrcBWSz = DL->getTypeSizeInBits(
16256 cast<VectorType>(LHS->getType())->getElementType());
16257 unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
16258 if (BWSz <= SrcBWSz) {
16259 if (BWSz < SrcBWSz)
16260 LHS = Builder.CreateIntCast(LHS, VecTy, It->second.first);
16261 assert(LHS->getType() == VecTy &&
16262 "Expected same type as operand.");
16263 if (auto *I = dyn_cast<Instruction>(LHS))
16264 LHS = ::propagateMetadata(I, E->Scalars);
16265 LHS = FinalShuffle(LHS, E);
16266 E->VectorizedValue = LHS;
16267 ++NumVectorInstructions;
16268 return LHS;
16269 }
16270 }
16271 V0 = Builder.CreateCast(
16272 static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
16273 V1 = Builder.CreateCast(
16274 static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
16275 }
16276 // Add V0 and V1 to later analysis to try to find and remove matching
16277 // instruction, if any.
16278 for (Value *V : {V0, V1}) {
16279 if (auto *I = dyn_cast<Instruction>(V)) {
16280 GatherShuffleExtractSeq.insert(I);
16281 CSEBlocks.insert(I->getParent());
16282 }
16283 }
16284
16285 // Create shuffle to take alternate operations from the vector.
16286 // Also, gather up main and alt scalar ops to propagate IR flags to
16287 // each vector operation.
16288 ValueList OpScalars, AltScalars;
16289 SmallVector<int> Mask;
16290 E->buildAltOpShuffleMask(
16291 [E, this](Instruction *I) {
16292 assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
16293 return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
16294 *TLI);
16295 },
16296 Mask, &OpScalars, &AltScalars);
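 // For an add/sub alternation over four lanes, for example, Mask comes back
 // as something like <0, 5, 2, 7> (illustration): lanes taken from the
 // main-opcode vector V0 keep their index, lanes taken from the alternate
 // vector V1 get index + VF, and the shufflevector below blends the two.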
16297
16298 propagateIRFlags(V0, OpScalars, E->getMainOp(), It == MinBWs.end());
16299 propagateIRFlags(V1, AltScalars, E->getAltOp(), It == MinBWs.end());
16300 auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
16301 // Drop nuw flags for abs(sub(commutative), true).
16302 if (auto *I = dyn_cast<Instruction>(Vec);
16303 I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
16304 any_of(E->Scalars, [](Value *V) {
16305 if (isa<PoisonValue>(V))
16306 return false;
16307 auto *IV = cast<Instruction>(V);
16308 return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
16309 }))
16310 I->setHasNoUnsignedWrap(/*b=*/false);
16311 };
16312 DropNuwFlag(V0, E->getOpcode());
16313 DropNuwFlag(V1, E->getAltOpcode());
16314
16315 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
16316 assert(SLPReVec && "FixedVectorType is not expected.");
16318 }
16319 V = Builder.CreateShuffleVector(V0, V1, Mask);
16320 if (auto *I = dyn_cast<Instruction>(V)) {
16321 V = ::propagateMetadata(I, E->Scalars);
16322 GatherShuffleExtractSeq.insert(I);
16323 CSEBlocks.insert(I->getParent());
16324 }
16325 }
16326
16327 E->VectorizedValue = V;
16328 ++NumVectorInstructions;
16329
16330 return V;
16331 }
16332 default:
16333 llvm_unreachable("unknown inst");
16334 }
16335 return nullptr;
16336}
16337
16339 ExtraValueToDebugLocsMap ExternallyUsedValues;
16340 return vectorizeTree(ExternallyUsedValues);
16341}
16342
16343Value *
16345 Instruction *ReductionRoot) {
16346 // All blocks must be scheduled before any instructions are inserted.
16347 for (auto &BSIter : BlocksSchedules) {
16348 scheduleBlock(BSIter.second.get());
16349 }
16350 // Clean Entry-to-LastInstruction table. It can be affected after scheduling,
16351 // need to rebuild it.
16352 EntryToLastInstruction.clear();
16353
16354 if (ReductionRoot)
16355 Builder.SetInsertPoint(ReductionRoot->getParent(),
16356 ReductionRoot->getIterator());
16357 else
16358 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16359
16360 // Emit gathered loads first to emit better code for the users of those
16361 // gathered loads.
16362 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
16363 if (GatheredLoadsEntriesFirst.has_value() &&
16364 TE->Idx >= *GatheredLoadsEntriesFirst &&
16365 (!TE->isGather() || !TE->UserTreeIndices.empty())) {
16366 assert((!TE->UserTreeIndices.empty() ||
16367 (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
16368 "Expected gathered load node.");
16369 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
16370 }
16371 }
16372 // Postpone emission of PHIs operands to avoid cyclic dependencies issues.
16373 (void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true);
16374 for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
16375 if (TE->State == TreeEntry::Vectorize &&
16376 TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
16377 TE->VectorizedValue)
16378 (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
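 // In other words (summary of the two passes above, not new behaviour): the
 // first walk materializes every vector PHI with no incoming values, and this
 // second walk fills the incoming values in, so a PHI whose operand is only
 // produced later in the same cycle still sees a defined vector value.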
16379 // Run through the list of postponed gathers and emit them, replacing the temp
16380 // emitted allocas with actual vector instructions.
16381 ArrayRef<const TreeEntry *> PostponedNodes = PostponedGathers.getArrayRef();
16383 for (const TreeEntry *E : PostponedNodes) {
16384 auto *TE = const_cast<TreeEntry *>(E);
16385 if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
16386 if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
16387 TE->UserTreeIndices.front().EdgeIdx)) &&
16388 VecTE->isSame(TE->Scalars))
16389 // Found gather node which is absolutely the same as one of the
16390 // vectorized nodes. It may happen after reordering.
16391 continue;
16392 auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
16393 TE->VectorizedValue = nullptr;
16394 auto *UserI =
16395 cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
16396 // If the user is a PHI node, its vector code has to be inserted right
16397 // before the block terminator. Since the node was delayed, there were some
16398 // unresolved dependencies at the moment the stub instruction was emitted.
16399 // If any of these dependencies turn out to be an operand of another PHI
16400 // coming from this same block, the position of the stub instruction becomes
16401 // invalid: the source vector that is supposed to feed this gather node was
16402 // inserted at the end of the block [after the stub instruction]. So we need
16403 // to adjust the insertion point to the end of the block again.
16404 if (isa<PHINode>(UserI)) {
16405 // Insert before all users.
16406 Instruction *InsertPt = PrevVec->getParent()->getTerminator();
16407 for (User *U : PrevVec->users()) {
16408 if (U == UserI)
16409 continue;
16410 auto *UI = dyn_cast<Instruction>(U);
16411 if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
16412 continue;
16413 if (UI->comesBefore(InsertPt))
16414 InsertPt = UI;
16415 }
16416 Builder.SetInsertPoint(InsertPt);
16417 } else {
16418 Builder.SetInsertPoint(PrevVec);
16419 }
16420 Builder.SetCurrentDebugLocation(UserI->getDebugLoc());
16421 Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false);
16422 if (auto *VecI = dyn_cast<Instruction>(Vec);
16423 VecI && VecI->getParent() == Builder.GetInsertBlock() &&
16424 Builder.GetInsertPoint()->comesBefore(VecI))
16425 VecI->moveBeforePreserving(*Builder.GetInsertBlock(),
16426 Builder.GetInsertPoint());
16427 if (Vec->getType() != PrevVec->getType()) {
16428 assert(Vec->getType()->isIntOrIntVectorTy() &&
16429 PrevVec->getType()->isIntOrIntVectorTy() &&
16430 "Expected integer vector types only.");
16431 std::optional<bool> IsSigned;
16432 for (Value *V : TE->Scalars) {
16433 if (const TreeEntry *BaseTE = getTreeEntry(V)) {
16434 auto It = MinBWs.find(BaseTE);
16435 if (It != MinBWs.end()) {
16436 IsSigned = IsSigned.value_or(false) || It->second.second;
16437 if (*IsSigned)
16438 break;
16439 }
16440 for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
16441 auto It = MinBWs.find(MNTE);
16442 if (It != MinBWs.end()) {
16443 IsSigned = IsSigned.value_or(false) || It->second.second;
16444 if (*IsSigned)
16445 break;
16446 }
16447 }
16448 if (IsSigned.value_or(false))
16449 break;
16450 // Scan through gather nodes.
16451 for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
16452 auto It = MinBWs.find(BVE);
16453 if (It != MinBWs.end()) {
16454 IsSigned = IsSigned.value_or(false) || It->second.second;
16455 if (*IsSigned)
16456 break;
16457 }
16458 }
16459 if (IsSigned.value_or(false))
16460 break;
16461 if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
16462 IsSigned =
16463 IsSigned.value_or(false) ||
16464 !isKnownNonNegative(EE->getVectorOperand(), SimplifyQuery(*DL));
16465 continue;
16466 }
16467 if (IsSigned.value_or(false))
16468 break;
16469 }
16470 }
16471 if (IsSigned.value_or(false)) {
16472 // Final attempt - check user node.
16473 auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
16474 if (It != MinBWs.end())
16475 IsSigned = It->second.second;
16476 }
16477 assert(IsSigned &&
16478 "Expected user node or perfect diamond match in MinBWs.");
16479 Vec = Builder.CreateIntCast(Vec, PrevVec->getType(), *IsSigned);
16480 }
16481 PrevVec->replaceAllUsesWith(Vec);
16482 PostponedValues.try_emplace(Vec).first->second.push_back(TE);
16483 // Replace the stub vector node, if it was used before for one of the
16484 // buildvector nodes already.
16485 auto It = PostponedValues.find(PrevVec);
16486 if (It != PostponedValues.end()) {
16487 for (TreeEntry *VTE : It->getSecond())
16488 VTE->VectorizedValue = Vec;
16489 }
16490 eraseInstruction(PrevVec);
16491 }
16492
16493 LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
16494 << " values .\n");
16495
16496 SmallVector<ShuffledInsertData<Value *>> ShuffledInserts;
16497 // Maps vector instruction to original insertelement instruction
16498 DenseMap<Value *, InsertElementInst *> VectorToInsertElement;
16499 // Maps extract Scalar to the corresponding extractelement instruction in the
16500 // basic block. Only one extractelement per block should be emitted.
16501 SmallDenseMap<Value *, SmallDenseMap<BasicBlock *, std::pair<Value *, Value *>>>
16502 ScalarToEEs;
16503 SmallDenseSet<Value *, 4> UsedInserts;
16504 DenseMap<std::pair<Value *, Type *>, Value *> VectorCasts;
16505 SmallDenseSet<Value *, 4> ScalarsWithNullptrUser;
16506 SmallDenseSet<ExtractElementInst *, 4> IgnoredExtracts;
16507 // Extract all of the elements with the external uses.
16508 for (const auto &ExternalUse : ExternalUses) {
16509 Value *Scalar = ExternalUse.Scalar;
16510 llvm::User *User = ExternalUse.User;
16511
16512 // Skip users that we have already RAUWed. This happens when one instruction
16513 // has multiple uses of the same value.
16514 if (User && !is_contained(Scalar->users(), User))
16515 continue;
16516 TreeEntry *E = getTreeEntry(Scalar);
16517 assert(E && "Invalid scalar");
16518 assert(!E->isGather() && "Extracting from a gather list");
16519 // Non-instruction pointers are not deleted, just skip them.
16520 if (E->getOpcode() == Instruction::GetElementPtr &&
16521 !isa<GetElementPtrInst>(Scalar))
16522 continue;
16523
16524 Value *Vec = E->VectorizedValue;
16525 assert(Vec && "Can't find vectorizable value");
16526
16527 Value *Lane = Builder.getInt32(ExternalUse.Lane);
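// Emits (or reuses) an extractelement of the requested lane from Vec and, if
// min-bitwidth analysis narrowed the vector element type, casts the extracted
// value back to the original scalar type. When the scalar already has the same
// vector type, the vectorized value itself is reused.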
16528 auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
16529 if (Scalar->getType() != Vec->getType()) {
16530 Value *Ex = nullptr;
16531 Value *ExV = nullptr;
16532 auto *Inst = dyn_cast<Instruction>(Scalar);
16533 bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
16534 auto It = ScalarToEEs.find(Scalar);
16535 if (It != ScalarToEEs.end()) {
16536 // No need to emit many extracts, just move the only one in the
16537 // current block.
16538 auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
16539 : Builder.GetInsertBlock());
16540 if (EEIt != It->second.end()) {
16541 Value *PrevV = EEIt->second.first;
16542 if (auto *I = dyn_cast<Instruction>(PrevV);
16543 I && !ReplaceInst &&
16544 Builder.GetInsertPoint() != Builder.GetInsertBlock()->end() &&
16545 Builder.GetInsertPoint()->comesBefore(I)) {
16546 I->moveBefore(*Builder.GetInsertPoint()->getParent(),
16547 Builder.GetInsertPoint());
16548 if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
16549 CI->moveAfter(I);
16550 }
16551 Ex = PrevV;
16552 ExV = EEIt->second.second ? EEIt->second.second : Ex;
16553 }
16554 }
16555 if (!Ex) {
16556 // "Reuse" the existing extract to improve final codegen.
16557 if (ReplaceInst) {
16558 // Leave the instruction as is if extracting is cheaper and all
16559 // operands are scalar.
16560 if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
16561 IgnoredExtracts.insert(EE);
16562 Ex = EE;
16563 } else {
16564 auto *CloneInst = Inst->clone();
16565 CloneInst->insertBefore(Inst);
16566 if (Inst->hasName())
16567 CloneInst->takeName(Inst);
16568 Ex = CloneInst;
16569 }
16570 } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
16571 ES && isa<Instruction>(Vec)) {
16572 Value *V = ES->getVectorOperand();
16573 auto *IVec = cast<Instruction>(Vec);
16574 if (const TreeEntry *ETE = getTreeEntry(V))
16575 V = ETE->VectorizedValue;
16576 if (auto *IV = dyn_cast<Instruction>(V);
16577 !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
16578 IV->comesBefore(IVec))
16579 Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
16580 else
16581 Ex = Builder.CreateExtractElement(Vec, Lane);
16582 } else if (auto *VecTy =
16583 dyn_cast<FixedVectorType>(Scalar->getType())) {
16584 assert(SLPReVec && "FixedVectorType is not expected.");
16585 unsigned VecTyNumElements = VecTy->getNumElements();
16586 // When REVEC is enabled, we need to extract a vector.
16587 // Note: The element size of Scalar may be different from the
16588 // element size of Vec.
16589 Ex = createExtractVector(Builder, Vec, VecTyNumElements,
16590 ExternalUse.Lane * VecTyNumElements);
16591 } else {
16592 Ex = Builder.CreateExtractElement(Vec, Lane);
16593 }
16594 // If necessary, sign-extend or zero-extend ScalarRoot
16595 // to the larger type.
16596 ExV = Ex;
16597 if (Scalar->getType() != Ex->getType())
16598 ExV = Builder.CreateIntCast(
16599 Ex, Scalar->getType(),
16600 !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
16601 auto *I = dyn_cast<Instruction>(Ex);
16602 ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
16603 : &F->getEntryBlock(),
16604 std::make_pair(Ex, ExV));
16605 }
16606 // The 'then' branch of the previous 'if' may produce constants, since
16607 // operand 0 might be a constant.
16608 if (auto *ExI = dyn_cast<Instruction>(Ex);
16609 ExI && !isa<PHINode>(ExI) && !mayHaveNonDefUseDependency(*ExI)) {
16610 GatherShuffleExtractSeq.insert(ExI);
16611 CSEBlocks.insert(ExI->getParent());
16612 }
16613 return ExV;
16614 }
16615 assert(isa<FixedVectorType>(Scalar->getType()) &&
16616 isa<InsertElementInst>(Scalar) &&
16617 "In-tree scalar of vector type is not insertelement?");
16618 auto *IE = cast<InsertElementInst>(Scalar);
16619 VectorToInsertElement.try_emplace(Vec, IE);
16620 return Vec;
16621 };
16622 // If User == nullptr, the Scalar remains as scalar in vectorized
16623 // instructions or is used as extra arg. Generate ExtractElement instruction
16624 // and update the record for this scalar in ExternallyUsedValues.
16625 if (!User) {
16626 if (!ScalarsWithNullptrUser.insert(Scalar).second)
16627 continue;
16628 assert((ExternallyUsedValues.count(Scalar) ||
16629 Scalar->hasNUsesOrMore(UsesLimit) ||
16630 ExternalUsesAsOriginalScalar.contains(Scalar) ||
16631 any_of(Scalar->users(),
16632 [&](llvm::User *U) {
16633 if (ExternalUsesAsOriginalScalar.contains(U))
16634 return true;
16635 TreeEntry *UseEntry = getTreeEntry(U);
16636 return UseEntry &&
16637 (UseEntry->State == TreeEntry::Vectorize ||
16638 UseEntry->State ==
16639 TreeEntry::StridedVectorize) &&
16640 (E->State == TreeEntry::Vectorize ||
16641 E->State == TreeEntry::StridedVectorize) &&
16642 doesInTreeUserNeedToExtract(
16643 Scalar, getRootEntryInstruction(*UseEntry),
16644 TLI, TTI);
16645 })) &&
16646 "Scalar with nullptr User must be registered in "
16647 "ExternallyUsedValues map or remain as scalar in vectorized "
16648 "instructions");
16649 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
16650 if (auto *PHI = dyn_cast<PHINode>(VecI)) {
16651 if (PHI->getParent()->isLandingPad())
16652 Builder.SetInsertPoint(
16653 PHI->getParent(),
16654 std::next(
16655 PHI->getParent()->getLandingPadInst()->getIterator()));
16656 else
16657 Builder.SetInsertPoint(PHI->getParent(),
16658 PHI->getParent()->getFirstNonPHIIt());
16659 } else {
16660 Builder.SetInsertPoint(VecI->getParent(),
16661 std::next(VecI->getIterator()));
16662 }
16663 } else {
16664 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16665 }
16666 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16667 // Required to update internally referenced instructions.
16668 if (Scalar != NewInst) {
16669 assert((!isa<ExtractElementInst>(Scalar) ||
16670 !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
16671 "Extractelements should not be replaced.");
16672 Scalar->replaceAllUsesWith(NewInst);
16673 }
16674 continue;
16675 }
16676
16677 if (auto *VU = dyn_cast<InsertElementInst>(User);
16678 VU && VU->getOperand(1) == Scalar) {
16679 // Skip if the scalar is another vector op or Vec is not an instruction.
16680 if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
16681 if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
16682 if (!UsedInserts.insert(VU).second)
16683 continue;
16684 // Need to use original vector, if the root is truncated.
16685 auto BWIt = MinBWs.find(E);
16686 if (BWIt != MinBWs.end() && Vec->getType() != VU->getType()) {
16687 auto *ScalarTy = FTy->getElementType();
16688 auto Key = std::make_pair(Vec, ScalarTy);
16689 auto VecIt = VectorCasts.find(Key);
16690 if (VecIt == VectorCasts.end()) {
16691 IRBuilderBase::InsertPointGuard Guard(Builder);
16692 if (auto *IVec = dyn_cast<PHINode>(Vec)) {
16693 if (IVec->getParent()->isLandingPad())
16694 Builder.SetInsertPoint(IVec->getParent(),
16695 std::next(IVec->getParent()
16696 ->getLandingPadInst()
16697 ->getIterator()));
16698 else
16699 Builder.SetInsertPoint(
16700 IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
16701 } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
16702 Builder.SetInsertPoint(IVec->getNextNonDebugInstruction());
16703 }
16704 Vec = Builder.CreateIntCast(
16705 Vec,
16706 getWidenedType(
16707 ScalarTy,
16708 cast<FixedVectorType>(Vec->getType())->getNumElements()),
16709 BWIt->second.second);
16710 VectorCasts.try_emplace(Key, Vec);
16711 } else {
16712 Vec = VecIt->second;
16713 }
16714 }
16715
16716 std::optional<unsigned> InsertIdx = getElementIndex(VU);
16717 if (InsertIdx) {
16718 auto *It = find_if(
16719 ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
16720 // Checks if 2 insertelements are from the same buildvector.
16721 InsertElementInst *VecInsert = Data.InsertElements.front();
16722 return areTwoInsertFromSameBuildVector(
16723 VU, VecInsert,
16724 [](InsertElementInst *II) { return II->getOperand(0); });
16725 });
16726 unsigned Idx = *InsertIdx;
16727 if (It == ShuffledInserts.end()) {
16728 (void)ShuffledInserts.emplace_back();
16729 It = std::next(ShuffledInserts.begin(),
16730 ShuffledInserts.size() - 1);
16731 }
16732 SmallVectorImpl<int> &Mask = It->ValueMasks[Vec];
16733 if (Mask.empty())
16734 Mask.assign(FTy->getNumElements(), PoisonMaskElem);
16735 Mask[Idx] = ExternalUse.Lane;
16736 It->InsertElements.push_back(cast<InsertElementInst>(User));
16737 continue;
16738 }
16739 }
16740 }
16741 }
16742
16743 // Generate extracts for out-of-tree users.
16744 // Find the insertion point for the extractelement lane.
16745 if (auto *VecI = dyn_cast<Instruction>(Vec)) {
16746 if (PHINode *PH = dyn_cast<PHINode>(User)) {
16747 for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
16748 if (PH->getIncomingValue(I) == Scalar) {
16749 Instruction *IncomingTerminator =
16750 PH->getIncomingBlock(I)->getTerminator();
16751 if (isa<CatchSwitchInst>(IncomingTerminator)) {
16752 Builder.SetInsertPoint(VecI->getParent(),
16753 std::next(VecI->getIterator()));
16754 } else {
16755 Builder.SetInsertPoint(PH->getIncomingBlock(I)->getTerminator());
16756 }
16757 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16758 PH->setOperand(I, NewInst);
16759 }
16760 }
16761 } else {
16762 Builder.SetInsertPoint(cast<Instruction>(User));
16763 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16764 User->replaceUsesOfWith(Scalar, NewInst);
16765 }
16766 } else {
16767 Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin());
16768 Value *NewInst = ExtractAndExtendIfNeeded(Vec);
16769 User->replaceUsesOfWith(Scalar, NewInst);
16770 }
16771
16772 LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
16773 }
16774
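// Creates a single shuffle from V1 (and optionally V2) according to Mask by
// splitting the combined mask into per-operand masks and delegating to
// ShuffleInstructionBuilder.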
16775 auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
16776 SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
16777 SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
16778 int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
16779 for (int I = 0, E = Mask.size(); I < E; ++I) {
16780 if (Mask[I] < VF)
16781 CombinedMask1[I] = Mask[I];
16782 else
16783 CombinedMask2[I] = Mask[I] - VF;
16784 }
16785 ShuffleInstructionBuilder ShuffleBuilder(
16786 cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
16787 ShuffleBuilder.add(V1, CombinedMask1);
16788 if (V2)
16789 ShuffleBuilder.add(V2, CombinedMask2);
16790 return ShuffleBuilder.finalize({}, {}, {});
16791 };
16792
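// Resizes Vec to the size of Mask when they differ: if the mask references
// lanes at or beyond the mask size, the mask itself is applied and reported as
// already used; otherwise (unless ForSingleMask) an identity-style resize mask
// is built. Returns the value and whether the original mask was already applied.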
16793 auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
16794 bool ForSingleMask) {
16795 unsigned VF = Mask.size();
16796 unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
16797 if (VF != VecVF) {
16798 if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
16799 Vec = CreateShuffle(Vec, nullptr, Mask);
16800 return std::make_pair(Vec, true);
16801 }
16802 if (!ForSingleMask) {
16803 SmallVector<int> ResizeMask(VF, PoisonMaskElem);
16804 for (unsigned I = 0; I < VF; ++I) {
16805 if (Mask[I] != PoisonMaskElem)
16806 ResizeMask[Mask[I]] = Mask[I];
16807 }
16808 Vec = CreateShuffle(Vec, nullptr, ResizeMask);
16809 }
16810 }
16811
16812 return std::make_pair(Vec, false);
16813 };
16814 // Perform shuffling of the vectorized tree entries for better handling of
16815 // external extracts.
16816 for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
16817 // Find the first and the last instruction in the list of insertelements.
16818 sort(ShuffledInserts[I].InsertElements, isFirstInsertElement);
16819 InsertElementInst *FirstInsert = ShuffledInserts[I].InsertElements.front();
16820 InsertElementInst *LastInsert = ShuffledInserts[I].InsertElements.back();
16821 Builder.SetInsertPoint(LastInsert);
16822 auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
16823 Value *NewInst = performExtractsShuffleAction<Value>(
16824 MutableArrayRef(Vector.data(), Vector.size()),
16825 FirstInsert->getOperand(0),
16826 [](Value *Vec) {
16827 return cast<VectorType>(Vec->getType())
16828 ->getElementCount()
16829 .getKnownMinValue();
16830 },
16831 ResizeToVF,
16832 [FirstInsert, &CreateShuffle](ArrayRef<int> Mask,
16833 ArrayRef<Value *> Vals) {
16834 assert((Vals.size() == 1 || Vals.size() == 2) &&
16835 "Expected exactly 1 or 2 input values.");
16836 if (Vals.size() == 1) {
16837 // Do not create shuffle if the mask is a simple identity
16838 // non-resizing mask.
16839 if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
16840 ->getNumElements() ||
16841 !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
16842 return CreateShuffle(Vals.front(), nullptr, Mask);
16843 return Vals.front();
16844 }
16845 return CreateShuffle(Vals.front() ? Vals.front()
16846 : FirstInsert->getOperand(0),
16847 Vals.back(), Mask);
16848 });
16849 auto It = ShuffledInserts[I].InsertElements.rbegin();
16850 // Rebuild buildvector chain.
16851 InsertElementInst *II = nullptr;
16852 if (It != ShuffledInserts[I].InsertElements.rend())
16853 II = *It;
16854 SmallVector<Instruction *> Inserts;
16855 while (It != ShuffledInserts[I].InsertElements.rend()) {
16856 assert(II && "Must be an insertelement instruction.");
16857 if (*It == II)
16858 ++It;
16859 else
16860 Inserts.push_back(cast<Instruction>(II));
16861 II = dyn_cast<InsertElementInst>(II->getOperand(0));
16862 }
16863 for (Instruction *II : reverse(Inserts)) {
16864 II->replaceUsesOfWith(II->getOperand(0), NewInst);
16865 if (auto *NewI = dyn_cast<Instruction>(NewInst))
16866 if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
16867 II->moveAfter(NewI);
16868 NewInst = II;
16869 }
16870 LastInsert->replaceAllUsesWith(NewInst);
16871 for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
16872 IE->replaceUsesOfWith(IE->getOperand(0),
16873 PoisonValue::get(IE->getOperand(0)->getType()));
16874 IE->replaceUsesOfWith(IE->getOperand(1),
16875 PoisonValue::get(IE->getOperand(1)->getType()));
16876 eraseInstruction(IE);
16877 }
16878 CSEBlocks.insert(LastInsert->getParent());
16879 }
16880
16881 SmallVector<Instruction *> RemovedInsts;
16882 // For each vectorized value:
16883 for (auto &TEPtr : VectorizableTree) {
16884 TreeEntry *Entry = TEPtr.get();
16885
16886 // No need to handle users of gathered values.
16887 if (Entry->isGather())
16888 continue;
16889
16890 assert(Entry->VectorizedValue && "Can't find vectorizable value");
16891
16892 // For each lane:
16893 for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
16894 Value *Scalar = Entry->Scalars[Lane];
16895
16896 if (Entry->getOpcode() == Instruction::GetElementPtr &&
16897 !isa<GetElementPtrInst>(Scalar))
16898 continue;
16899 if (auto *EE = dyn_cast<ExtractElementInst>(Scalar);
16900 EE && IgnoredExtracts.contains(EE))
16901 continue;
16902 if (isa<PoisonValue>(Scalar))
16903 continue;
16904#ifndef NDEBUG
16905 Type *Ty = Scalar->getType();
16906 if (!Ty->isVoidTy()) {
16907 for (User *U : Scalar->users()) {
16908 LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
16909
16910 // It is legal to delete users in the ignorelist.
16911 assert((getTreeEntry(U) ||
16912 (UserIgnoreList && UserIgnoreList->contains(U)) ||
16913 (isa_and_nonnull<Instruction>(U) &&
16914 isDeleted(cast<Instruction>(U)))) &&
16915 "Deleting out-of-tree value");
16916 }
16917 }
16918#endif
16919 LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
16920 auto *I = cast<Instruction>(Scalar);
16921 RemovedInsts.push_back(I);
16922 }
16923 }
16924
16925 // Merge the DIAssignIDs from the about-to-be-deleted instructions into the
16926 // new vector instruction.
16927 if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
16928 V->mergeDIAssignID(RemovedInsts);
16929
16930 // Clear up reduction references, if any.
16931 if (UserIgnoreList) {
16932 for (Instruction *I : RemovedInsts) {
16933 const TreeEntry *IE = getTreeEntry(I);
16934 if (IE->Idx != 0 &&
16935 !(VectorizableTree.front()->isGather() &&
16936 !IE->UserTreeIndices.empty() &&
16937 (ValueToGatherNodes.lookup(I).contains(
16938 VectorizableTree.front().get()) ||
16939 any_of(IE->UserTreeIndices,
16940 [&](const EdgeInfo &EI) {
16941 return EI.UserTE == VectorizableTree.front().get() &&
16942 EI.EdgeIdx == UINT_MAX;
16943 }))) &&
16944 !(GatheredLoadsEntriesFirst.has_value() &&
16945 IE->Idx >= *GatheredLoadsEntriesFirst &&
16946 VectorizableTree.front()->isGather() &&
16947 is_contained(VectorizableTree.front()->Scalars, I)))
16948 continue;
16949 SmallVector<SelectInst *> LogicalOpSelects;
16950 I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
16951 // Do not replace the condition of a logical op in the form of select <cond>.
16952 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
16953 (match(U.getUser(), m_LogicalAnd()) ||
16954 match(U.getUser(), m_LogicalOr())) &&
16955 U.getOperandNo() == 0;
16956 if (IsPoisoningLogicalOp) {
16957 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
16958 return false;
16959 }
16960 return UserIgnoreList->contains(U.getUser());
16961 });
16962 // Replace conditions of the poisoning logical ops with the non-poison
16963 // constant value.
16964 for (SelectInst *SI : LogicalOpSelects)
16965 SI->setCondition(Constant::getNullValue(SI->getCondition()->getType()));
16966 }
16967 }
16968 // Retain to-be-deleted instructions for some debug-info bookkeeping and alias
16969 // cache correctness.
16970 // NOTE: removeInstructionAndOperands only marks the instruction for deletion
16971 // - instructions are not deleted until later.
16972 removeInstructionsAndOperands(ArrayRef(RemovedInsts));
16973
16974 Builder.ClearInsertionPoint();
16975 InstrElementSize.clear();
16976
16977 const TreeEntry &RootTE = *VectorizableTree.front();
16978 Value *Vec = RootTE.VectorizedValue;
16979 if (auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
16980 It != MinBWs.end() &&
16981 ReductionBitWidth != It->second.first) {
16982 IRBuilder<>::InsertPointGuard Guard(Builder);
16983 Builder.SetInsertPoint(ReductionRoot->getParent(),
16984 ReductionRoot->getIterator());
16985 Vec = Builder.CreateIntCast(
16986 Vec,
16987 VectorType::get(Builder.getIntNTy(ReductionBitWidth),
16988 cast<VectorType>(Vec->getType())->getElementCount()),
16989 It->second.second);
16990 }
16991 return Vec;
16992}
16993
16994 void BoUpSLP::optimizeGatherSequence() {
16995 LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherShuffleExtractSeq.size()
16996 << " gather sequence instructions.\n");
16997 // LICM InsertElementInst sequences.
16998 for (Instruction *I : GatherShuffleExtractSeq) {
16999 if (isDeleted(I))
17000 continue;
17001
17002 // Check if this block is inside a loop.
17003 Loop *L = LI->getLoopFor(I->getParent());
17004 if (!L)
17005 continue;
17006
17007 // Check if it has a preheader.
17008 BasicBlock *PreHeader = L->getLoopPreheader();
17009 if (!PreHeader)
17010 continue;
17011
17012 // If the vector or the element that we insert into it are
17013 // instructions that are defined in this basic block then we can't
17014 // hoist this instruction.
17015 if (any_of(I->operands(), [L](Value *V) {
17016 auto *OpI = dyn_cast<Instruction>(V);
17017 return OpI && L->contains(OpI);
17018 }))
17019 continue;
17020
17021 // We can hoist this instruction. Move it to the pre-header.
17022 I->moveBefore(PreHeader->getTerminator());
17023 CSEBlocks.insert(PreHeader);
17024 }
17025
17026 // Make a list of all reachable blocks in our CSE queue.
17027 SmallVector<const DomTreeNode *, 8> CSEWorkList;
17028 CSEWorkList.reserve(CSEBlocks.size());
17029 for (BasicBlock *BB : CSEBlocks)
17030 if (DomTreeNode *N = DT->getNode(BB)) {
17031 assert(DT->isReachableFromEntry(N));
17032 CSEWorkList.push_back(N);
17033 }
17034
17035 // Sort blocks by domination. This ensures we visit a block after all blocks
17036 // dominating it are visited.
17037 llvm::sort(CSEWorkList, [](const DomTreeNode *A, const DomTreeNode *B) {
17038 assert((A == B) == (A->getDFSNumIn() == B->getDFSNumIn()) &&
17039 "Different nodes should have different DFS numbers");
17040 return A->getDFSNumIn() < B->getDFSNumIn();
17041 });
17042
17043 // Less defined shuffles can be replaced by the more defined copies.
17044 // Between two shuffles one is less defined if it has the same vector operands
17045 // and its mask indices are the same as in the first one or undefs. E.g.
17046 // shuffle %0, poison, <0, 0, 0, undef> is less defined than shuffle %0,
17047 // poison, <0, 0, 0, 0>.
17048 auto &&IsIdenticalOrLessDefined = [TTI = TTI](Instruction *I1,
17049 Instruction *I2,
17050 SmallVectorImpl<int> &NewMask) {
17051 if (I1->getType() != I2->getType())
17052 return false;
17053 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
17054 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
17055 if (!SI1 || !SI2)
17056 return I1->isIdenticalTo(I2);
17057 if (SI1->isIdenticalTo(SI2))
17058 return true;
17059 for (int I = 0, E = SI1->getNumOperands(); I < E; ++I)
17060 if (SI1->getOperand(I) != SI2->getOperand(I))
17061 return false;
17062 // Check if the second instruction is more defined than the first one.
17063 NewMask.assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
17064 ArrayRef<int> SM1 = SI1->getShuffleMask();
17065 // Count trailing undefs in the mask to check the final number of used
17066 // registers.
17067 unsigned LastUndefsCnt = 0;
17068 for (int I = 0, E = NewMask.size(); I < E; ++I) {
17069 if (SM1[I] == PoisonMaskElem)
17070 ++LastUndefsCnt;
17071 else
17072 LastUndefsCnt = 0;
17073 if (NewMask[I] != PoisonMaskElem && SM1[I] != PoisonMaskElem &&
17074 NewMask[I] != SM1[I])
17075 return false;
17076 if (NewMask[I] == PoisonMaskElem)
17077 NewMask[I] = SM1[I];
17078 }
17079 // Check if the last undefs actually change the final number of used vector
17080 // registers.
17081 return SM1.size() - LastUndefsCnt > 1 &&
17082 TTI->getNumberOfParts(SI1->getType()) ==
17083 TTI->getNumberOfParts(
17084 getWidenedType(SI1->getType()->getElementType(),
17085 SM1.size() - LastUndefsCnt));
17086 };
17087 // Perform O(N^2) search over the gather/shuffle sequences and merge identical
17088 // instructions. TODO: We can further optimize this scan if we split the
17089 // instructions into different buckets based on the insert lane.
17090 SmallVector<Instruction *, 16> Visited;
17091 for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
17092 assert(*I &&
17093 (I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
17094 "Worklist not sorted properly!");
17095 BasicBlock *BB = (*I)->getBlock();
17096 // For all instructions in blocks containing gather sequences:
17097 for (Instruction &In : llvm::make_early_inc_range(*BB)) {
17098 if (isDeleted(&In))
17099 continue;
17100 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
17101 !GatherShuffleExtractSeq.contains(&In))
17102 continue;
17103
17104 // Check if we can replace this instruction with any of the
17105 // visited instructions.
17106 bool Replaced = false;
17107 for (Instruction *&V : Visited) {
17108 SmallVector<int> NewMask;
17109 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
17110 DT->dominates(V->getParent(), In.getParent())) {
17111 In.replaceAllUsesWith(V);
17112 eraseInstruction(&In);
17113 if (auto *SI = dyn_cast<ShuffleVectorInst>(V))
17114 if (!NewMask.empty())
17115 SI->setShuffleMask(NewMask);
17116 Replaced = true;
17117 break;
17118 }
17119 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
17120 GatherShuffleExtractSeq.contains(V) &&
17121 IsIdenticalOrLessDefined(V, &In, NewMask) &&
17122 DT->dominates(In.getParent(), V->getParent())) {
17123 In.moveAfter(V);
17124 V->replaceAllUsesWith(&In);
17125 eraseInstruction(V);
17126 if (auto *SI = dyn_cast<ShuffleVectorInst>(&In))
17127 if (!NewMask.empty())
17128 SI->setShuffleMask(NewMask);
17129 V = &In;
17130 Replaced = true;
17131 break;
17132 }
17133 }
17134 if (!Replaced) {
17135 assert(!is_contained(Visited, &In));
17136 Visited.push_back(&In);
17137 }
17138 }
17139 }
17140 CSEBlocks.clear();
17141 GatherShuffleExtractSeq.clear();
17142}
17143
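// Links the ScheduleData of all schedulable values in VL into a single bundle
// chain and returns its head, which acts as the single scheduling entity.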
17144BoUpSLP::ScheduleData *
17145BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL) {
17146 ScheduleData *Bundle = nullptr;
17147 ScheduleData *PrevInBundle = nullptr;
17148 for (Value *V : VL) {
17149 if (doesNotNeedToBeScheduled(V))
17150 continue;
17151 ScheduleData *BundleMember = getScheduleData(V);
17152 assert(BundleMember &&
17153 "no ScheduleData for bundle member "
17154 "(maybe not in same basic block)");
17155 assert(BundleMember->isSchedulingEntity() &&
17156 "bundle member already part of other bundle");
17157 if (PrevInBundle) {
17158 PrevInBundle->NextInBundle = BundleMember;
17159 } else {
17160 Bundle = BundleMember;
17161 }
17162
17163 // Group the instructions into a bundle.
17164 BundleMember->FirstInBundle = Bundle;
17165 PrevInBundle = BundleMember;
17166 }
17167 assert(Bundle && "Failed to find schedule bundle");
17168 return Bundle;
17169}
17170
17171 // Groups the instructions into a bundle (which is then a single scheduling
17172 // entity) and schedules instructions until the bundle gets ready.
17173std::optional<BoUpSLP::ScheduleData *>
17174BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
17175 const InstructionsState &S) {
17176 // No need to schedule PHIs, insertelement, extractelement and extractvalue
17177 // instructions.
17178 if (isa<PHINode>(S.getMainOp()) ||
17179 isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL))
17180 return nullptr;
17181
17182 // Initialize the instruction bundle.
17183 Instruction *OldScheduleEnd = ScheduleEnd;
17184 LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n");
17185
17186 auto TryScheduleBundleImpl = [this, OldScheduleEnd, SLP](bool ReSchedule,
17187 ScheduleData *Bundle) {
17188 // The scheduling region got new instructions at the lower end (or it is a
17189 // new region for the first bundle). This makes it necessary to
17190 // recalculate all dependencies.
17191 // It is seldom that this needs to be done a second time after adding the
17192 // initial bundle to the region.
17193 if (ScheduleEnd != OldScheduleEnd) {
17194 for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode())
17195 if (ScheduleData *SD = getScheduleData(I))
17196 SD->clearDependencies();
17197 ReSchedule = true;
17198 }
17199 if (Bundle) {
17200 LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle
17201 << " in block " << BB->getName() << "\n");
17202 calculateDependencies(Bundle, /*InsertInReadyList=*/true, SLP);
17203 }
17204
17205 if (ReSchedule) {
17206 resetSchedule();
17207 initialFillReadyList(ReadyInsts);
17208 }
17209
17210 // Now try to schedule the new bundle or (if no bundle) just calculate
17211 // dependencies. As soon as the bundle is "ready" it means that there are no
17212 // cyclic dependencies and we can schedule it. Note that it's important that we
17213 // don't "schedule" the bundle yet (see cancelScheduling).
17214 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
17215 !ReadyInsts.empty()) {
17216 ScheduleData *Picked = ReadyInsts.pop_back_val();
17217 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
17218 "must be ready to schedule");
17219 schedule(Picked, ReadyInsts);
17220 }
17221 };
17222
17223 // Make sure that the scheduling region contains all
17224 // instructions of the bundle.
17225 for (Value *V : VL) {
17226 if (doesNotNeedToBeScheduled(V))
17227 continue;
17228 if (!extendSchedulingRegion(V, S)) {
17229 // If the scheduling region got new instructions at the lower end (or it
17230 // is a new region for the first bundle), it is necessary to recalculate
17231 // all dependencies.
17232 // Otherwise the compiler may crash trying to incorrectly calculate
17233 // dependencies and emit instructions in the wrong order at the actual
17234 // scheduling.
17235 TryScheduleBundleImpl(/*ReSchedule=*/false, nullptr);
17236 return std::nullopt;
17237 }
17238 }
17239
17240 bool ReSchedule = false;
17241 for (Value *V : VL) {
17242 if (doesNotNeedToBeScheduled(V))
17243 continue;
17244 ScheduleData *BundleMember = getScheduleData(V);
17245 assert(BundleMember &&
17246 "no ScheduleData for bundle member (maybe not in same basic block)");
17247
17248 // Make sure we don't leave the pieces of the bundle in the ready list when
17249 // whole bundle might not be ready.
17250 ReadyInsts.remove(BundleMember);
17251
17252 if (!BundleMember->IsScheduled)
17253 continue;
17254 // A bundle member was scheduled as a single instruction before and now
17255 // needs to be scheduled as part of the bundle. We just get rid of the
17256 // existing schedule.
17257 LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
17258 << " was already scheduled\n");
17259 ReSchedule = true;
17260 }
17261
17262 auto *Bundle = buildBundle(VL);
17263 TryScheduleBundleImpl(ReSchedule, Bundle);
17264 if (!Bundle->isReady()) {
17265 cancelScheduling(VL, S.getMainOp());
17266 return std::nullopt;
17267 }
17268 return Bundle;
17269}
17270
17271void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
17272 Value *OpValue) {
17273 if (isa<PHINode>(OpValue) || isVectorLikeInstWithConstOps(OpValue) ||
17274 doesNotNeedToSchedule(VL))
17275 return;
17276
17277 if (doesNotNeedToBeScheduled(OpValue))
17278 OpValue = *find_if_not(VL, doesNotNeedToBeScheduled);
17279 ScheduleData *Bundle = getScheduleData(OpValue);
17280 LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
17281 assert(!Bundle->IsScheduled &&
17282 "Can't cancel bundle which is already scheduled");
17283 assert(Bundle->isSchedulingEntity() &&
17284 (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) &&
17285 "tried to unbundle something which is not a bundle");
17286
17287 // Remove the bundle from the ready list.
17288 if (Bundle->isReady())
17289 ReadyInsts.remove(Bundle);
17290
17291 // Un-bundle: make single instructions out of the bundle.
17292 ScheduleData *BundleMember = Bundle;
17293 while (BundleMember) {
17294 assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
17295 BundleMember->FirstInBundle = BundleMember;
17296 ScheduleData *Next = BundleMember->NextInBundle;
17297 BundleMember->NextInBundle = nullptr;
17298 BundleMember->TE = nullptr;
17299 if (BundleMember->unscheduledDepsInBundle() == 0) {
17300 ReadyInsts.insert(BundleMember);
17301 }
17302 BundleMember = Next;
17303 }
17304}
17305
17306BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
17307 // Allocate a new ScheduleData for the instruction.
17308 if (ChunkPos >= ChunkSize) {
17309 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
17310 ChunkPos = 0;
17311 }
17312 return &(ScheduleDataChunks.back()[ChunkPos++]);
17313}
17314
17315bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
17316 Value *V, const InstructionsState &S) {
17317 Instruction *I = dyn_cast<Instruction>(V);
17318 assert(I && "bundle member must be an instruction");
17319 assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) &&
17321 "phi nodes/insertelements/extractelements/extractvalues don't need to "
17322 "be scheduled");
17323 if (getScheduleData(I))
17324 return true;
17325 if (!ScheduleStart) {
17326 // It's the first instruction in the new region.
17327 initScheduleData(I, I->getNextNode(), nullptr, nullptr);
17328 ScheduleStart = I;
17329 ScheduleEnd = I->getNextNode();
17330 assert(ScheduleEnd && "tried to vectorize a terminator?");
17331 LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
17332 return true;
17333 }
17334 // Search up and down at the same time, because we don't know if the new
17335 // instruction is above or below the existing scheduling region.
17336 // Ignore debug info (and other "AssumeLike" intrinsics) so they're not counted
17337 // against the budget. Otherwise debug info could affect codegen.
17338 BasicBlock::reverse_iterator UpIter =
17339 ++ScheduleStart->getIterator().getReverse();
17340 BasicBlock::reverse_iterator UpperEnd = BB->rend();
17341 BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
17342 BasicBlock::iterator LowerEnd = BB->end();
17343 auto IsAssumeLikeIntr = [](const Instruction &I) {
17344 if (auto *II = dyn_cast<IntrinsicInst>(&I))
17345 return II->isAssumeLikeIntrinsic();
17346 return false;
17347 };
17348 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17349 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17350 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter != I &&
17351 &*DownIter != I) {
17352 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
17353 LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
17354 return false;
17355 }
17356
17357 ++UpIter;
17358 ++DownIter;
17359
17360 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17361 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17362 }
17363 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter == I)) {
17364 assert(I->getParent() == ScheduleStart->getParent() &&
17365 "Instruction is in wrong basic block.");
17366 initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
17367 ScheduleStart = I;
17368 LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
17369 << "\n");
17370 return true;
17371 }
17372 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter == I)) &&
17373 "Expected to reach top of the basic block or instruction down the "
17374 "lower end.");
17375 assert(I->getParent() == ScheduleEnd->getParent() &&
17376 "Instruction is in wrong basic block.");
17377 initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
17378 nullptr);
17379 ScheduleEnd = I->getNextNode();
17380 assert(ScheduleEnd && "tried to vectorize a terminator?");
17381 LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n");
17382 return true;
17383}
17384
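// Initializes ScheduleData for every schedulable instruction in the half-open
// range [FromI, ToI) and threads memory-accessing instructions into the
// region's load/store chain.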
17385void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
17386 Instruction *ToI,
17387 ScheduleData *PrevLoadStore,
17388 ScheduleData *NextLoadStore) {
17389 ScheduleData *CurrentLoadStore = PrevLoadStore;
17390 for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
17391 // No need to allocate data for non-schedulable instructions.
17392 if (doesNotNeedToBeScheduled(I))
17393 continue;
17394 ScheduleData *SD = ScheduleDataMap.lookup(I);
17395 if (!SD) {
17396 SD = allocateScheduleDataChunks();
17397 ScheduleDataMap[I] = SD;
17398 }
17399 assert(!isInSchedulingRegion(SD) &&
17400 "new ScheduleData already in scheduling region");
17401 SD->init(SchedulingRegionID, I);
17402
17403 if (I->mayReadOrWriteMemory() &&
17404 (!isa<IntrinsicInst>(I) ||
17405 (cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
17406 cast<IntrinsicInst>(I)->getIntrinsicID() !=
17407 Intrinsic::pseudoprobe))) {
17408 // Update the linked list of memory accessing instructions.
17409 if (CurrentLoadStore) {
17410 CurrentLoadStore->NextLoadStore = SD;
17411 } else {
17412 FirstLoadStoreInRegion = SD;
17413 }
17414 CurrentLoadStore = SD;
17415 }
17416
17417 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17418 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17419 RegionHasStackSave = true;
17420 }
17421 if (NextLoadStore) {
17422 if (CurrentLoadStore)
17423 CurrentLoadStore->NextLoadStore = NextLoadStore;
17424 } else {
17425 LastLoadStoreInRegion = CurrentLoadStore;
17426 }
17427}
17428
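// Computes def-use, control and memory dependencies for the bundle rooted at
// SD (and, transitively, for any dependent bundle that does not yet have valid
// dependencies), optionally inserting bundles that become ready into the ready
// list.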
17429void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
17430 bool InsertInReadyList,
17431 BoUpSLP *SLP) {
17432 assert(SD->isSchedulingEntity());
17433
17434 SmallVector<ScheduleData *, 10> WorkList;
17435 WorkList.push_back(SD);
17436
17437 while (!WorkList.empty()) {
17438 ScheduleData *SD = WorkList.pop_back_val();
17439 for (ScheduleData *BundleMember = SD; BundleMember;
17440 BundleMember = BundleMember->NextInBundle) {
17441 assert(isInSchedulingRegion(BundleMember));
17442 if (BundleMember->hasValidDependencies())
17443 continue;
17444
17445 LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
17446 << "\n");
17447 BundleMember->Dependencies = 0;
17448 BundleMember->resetUnscheduledDeps();
17449
17450 // Handle def-use chain dependencies.
17451 for (User *U : BundleMember->Inst->users()) {
17452 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
17453 BundleMember->Dependencies++;
17454 ScheduleData *DestBundle = UseSD->FirstInBundle;
17455 if (!DestBundle->IsScheduled)
17456 BundleMember->incrementUnscheduledDeps(1);
17457 if (!DestBundle->hasValidDependencies())
17458 WorkList.push_back(DestBundle);
17459 }
17460 }
17461
17462 auto MakeControlDependent = [&](Instruction *I) {
17463 auto *DepDest = getScheduleData(I);
17464 assert(DepDest && "must be in schedule window");
17465 DepDest->ControlDependencies.push_back(BundleMember);
17466 BundleMember->Dependencies++;
17467 ScheduleData *DestBundle = DepDest->FirstInBundle;
17468 if (!DestBundle->IsScheduled)
17469 BundleMember->incrementUnscheduledDeps(1);
17470 if (!DestBundle->hasValidDependencies())
17471 WorkList.push_back(DestBundle);
17472 };
17473
17474 // Any instruction which isn't safe to speculate at the beginning of the
17475 // block is control dependent on any early exit or non-willreturn call
17476 // which precedes it.
17477 if (!isGuaranteedToTransferExecutionToSuccessor(BundleMember->Inst)) {
17478 for (Instruction *I = BundleMember->Inst->getNextNode();
17479 I != ScheduleEnd; I = I->getNextNode()) {
17480 if (isSafeToSpeculativelyExecute(I, &*BB->begin(), SLP->AC))
17481 continue;
17482
17483 // Add the dependency
17484 MakeControlDependent(I);
17485
17486 if (!isGuaranteedToTransferExecutionToSuccessor(I))
17487 // Everything past here must be control dependent on I.
17488 break;
17489 }
17490 }
17491
17492 if (RegionHasStackSave) {
17493 // If we have an inalloca alloca instruction, it needs to be scheduled
17494 // after any preceding stacksave. We also need to prevent any alloca
17495 // from reordering above a preceding stackrestore.
17496 if (match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
17497 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
17498 for (Instruction *I = BundleMember->Inst->getNextNode();
17499 I != ScheduleEnd; I = I->getNextNode()) {
17500 if (match(I, m_Intrinsic<Intrinsic::stacksave>()) ||
17501 match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17502 // Any allocas past here must be control dependent on I, and I
17503 // must be memory dependent on BundleMember->Inst.
17504 break;
17505
17506 if (!isa<AllocaInst>(I))
17507 continue;
17508
17509 // Add the dependency
17510 MakeControlDependent(I);
17511 }
17512 }
17513
17514 // In addition to the cases handled just above, we need to prevent
17515 // allocas and loads/stores from moving below a stacksave or a
17516 // stackrestore. Avoiding moving allocas below a stackrestore is currently
17517 // thought to be conservative. Moving loads/stores below a stackrestore
17518 // can lead to incorrect code.
17519 if (isa<AllocaInst>(BundleMember->Inst) ||
17520 BundleMember->Inst->mayReadOrWriteMemory()) {
17521 for (Instruction *I = BundleMember->Inst->getNextNode();
17522 I != ScheduleEnd; I = I->getNextNode()) {
17523 if (!match(I, m_Intrinsic<Intrinsic::stacksave>()) &&
17524 !match(I, m_Intrinsic<Intrinsic::stackrestore>()))
17525 continue;
17526
17527 // Add the dependency
17528 MakeControlDependent(I);
17529 break;
17530 }
17531 }
17532 }
17533
17534 // Handle the memory dependencies (if any).
17535 ScheduleData *DepDest = BundleMember->NextLoadStore;
17536 if (!DepDest)
17537 continue;
17538 Instruction *SrcInst = BundleMember->Inst;
17539 assert(SrcInst->mayReadOrWriteMemory() &&
17540 "NextLoadStore list for non memory effecting bundle?");
17541 MemoryLocation SrcLoc = getLocation(SrcInst);
17542 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
17543 unsigned NumAliased = 0;
17544 unsigned DistToSrc = 1;
17545
17546 for (; DepDest; DepDest = DepDest->NextLoadStore) {
17547 assert(isInSchedulingRegion(DepDest));
17548
17549 // We have two limits to reduce the complexity:
17550 // 1) AliasedCheckLimit: It's a small limit to reduce calls to
17551 // SLP->isAliased (which is the expensive part in this loop).
17552 // 2) MaxMemDepDistance: It's for very large blocks and it aborts
17553 // the whole loop (even if the loop is fast, it's quadratic).
17554 // It's important for the loop break condition (see below) to
17555 // check this limit even between two read-only instructions.
17556 if (DistToSrc >= MaxMemDepDistance ||
17557 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
17558 (NumAliased >= AliasedCheckLimit ||
17559 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
17560
17561 // We increment the counter only if the locations are aliased
17562 // (instead of counting all alias checks). This gives a better
17563 // balance between reduced runtime and accurate dependencies.
17564 NumAliased++;
17565
17566 DepDest->MemoryDependencies.push_back(BundleMember);
17567 BundleMember->Dependencies++;
17568 ScheduleData *DestBundle = DepDest->FirstInBundle;
17569 if (!DestBundle->IsScheduled) {
17570 BundleMember->incrementUnscheduledDeps(1);
17571 }
17572 if (!DestBundle->hasValidDependencies()) {
17573 WorkList.push_back(DestBundle);
17574 }
17575 }
17576
17577 // Example, explaining the loop break condition: Let's assume our
17578 // starting instruction is i0 and MaxMemDepDistance = 3.
17579 //
17580 // +--------v--v--v
17581 // i0,i1,i2,i3,i4,i5,i6,i7,i8
17582 // +--------^--^--^
17583 //
17584 // MaxMemDepDistance let us stop alias-checking at i3 and we add
17585 // dependencies from i0 to i3,i4,.. (even if they are not aliased).
17586 // Previously we already added dependencies from i3 to i6,i7,i8
17587 // (because of MaxMemDepDistance). As we added a dependency from
17588 // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
17589 // and we can abort this loop at i6.
17590 if (DistToSrc >= 2 * MaxMemDepDistance)
17591 break;
17592 DistToSrc++;
17593 }
17594 }
17595 if (InsertInReadyList && SD->isReady()) {
17596 ReadyInsts.insert(SD);
17597 LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
17598 << "\n");
17599 }
17600 }
17601}
17602
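// Marks all ScheduleData in the scheduling region as unscheduled and clears
// the ready list so the block can be scheduled again from scratch.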
17603void BoUpSLP::BlockScheduling::resetSchedule() {
17604 assert(ScheduleStart &&
17605 "tried to reset schedule on block which has not been scheduled");
17606 for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
17607 if (ScheduleData *SD = getScheduleData(I)) {
17608 assert(isInSchedulingRegion(SD) &&
17609 "ScheduleData not in scheduling region");
17610 SD->IsScheduled = false;
17611 SD->resetUnscheduledDeps();
17612 }
17613 }
17614 ReadyInsts.clear();
17615}
17616
17617void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
17618 if (!BS->ScheduleStart)
17619 return;
17620
17621 LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
17622
17623 // A key point - if we got here, pre-scheduling was able to find a valid
17624 // scheduling of the sub-graph of the scheduling window which consists
17625 // of all vector bundles and their transitive users. As such, we do not
17626 // need to reschedule anything *outside of* that subgraph.
17627
17628 BS->resetSchedule();
17629
17630 // For the real scheduling we use a more sophisticated ready-list: it is
17631 // sorted by the original instruction location. This lets the final schedule
17632 // be as close as possible to the original instruction order.
17633 // WARNING: If changing this order causes a correctness issue, that means
17634 // there is some missing dependence edge in the schedule data graph.
17635 struct ScheduleDataCompare {
17636 bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
17637 return SD2->SchedulingPriority < SD1->SchedulingPriority;
17638 }
17639 };
17640 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
17641
17642 // Ensure that all dependency data is updated (for nodes in the sub-graph)
17643 // and fill the ready-list with initial instructions.
17644 int Idx = 0;
17645 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
17646 I = I->getNextNode()) {
17647 if (ScheduleData *SD = BS->getScheduleData(I)) {
17648 TreeEntry *SDTE = getTreeEntry(SD->Inst);
17649 (void)SDTE;
17650 assert((isVectorLikeInstWithConstOps(SD->Inst) ||
17651 SD->isPartOfBundle() ==
17652 (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) &&
17653 "scheduler and vectorizer bundle mismatch");
17654 SD->FirstInBundle->SchedulingPriority = Idx++;
17655
17656 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
17657 BS->calculateDependencies(SD, false, this);
17658 }
17659 }
17660 BS->initialFillReadyList(ReadyInsts);
17661
17662 Instruction *LastScheduledInst = BS->ScheduleEnd;
17663
17664 // Do the "real" scheduling.
17665 while (!ReadyInsts.empty()) {
17666 ScheduleData *Picked = *ReadyInsts.begin();
17667 ReadyInsts.erase(ReadyInsts.begin());
17668
17669 // Move the scheduled instruction(s) to their dedicated places, if not
17670 // there yet.
17671 for (ScheduleData *BundleMember = Picked; BundleMember;
17672 BundleMember = BundleMember->NextInBundle) {
17673 Instruction *PickedInst = BundleMember->Inst;
17674 if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst)
17675 PickedInst->moveAfter(LastScheduledInst->getPrevNode());
17676 LastScheduledInst = PickedInst;
17677 }
17678
17679 BS->schedule(Picked, ReadyInsts);
17680 }
17681
17682 // Check that we didn't break any of our invariants.
17683#ifdef EXPENSIVE_CHECKS
17684 BS->verify();
17685#endif
17686
17687#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
17688 // Check that all schedulable entities got scheduled
17689 for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) {
17690 ScheduleData *SD = BS->getScheduleData(I);
17691 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
17692 assert(SD->IsScheduled && "must be scheduled at this point");
17693 }
17694#endif
17695
17696 // Avoid duplicate scheduling of the block.
17697 BS->ScheduleStart = nullptr;
17698}
17699
17700 unsigned BoUpSLP::getVectorElementSize(Value *V) {
17701 // If V is a store, just return the width of the stored value (or value
17702 // truncated just before storing) without traversing the expression tree.
17703 // This is the common case.
17704 if (auto *Store = dyn_cast<StoreInst>(V))
17705 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
17706
17707 if (auto *IEI = dyn_cast<InsertElementInst>(V))
17708 return getVectorElementSize(IEI->getOperand(1));
17709
17710 auto E = InstrElementSize.find(V);
17711 if (E != InstrElementSize.end())
17712 return E->second;
17713
17714 // If V is not a store, we can traverse the expression tree to find loads
17715 // that feed it. The type of the loaded value may indicate a more suitable
17716 // width than V's type. We want to base the vector element size on the width
17717 // of memory operations where possible.
17718 SmallVector<std::tuple<Instruction *, BasicBlock *, unsigned>> Worklist;
17719 SmallPtrSet<Instruction *, 16> Visited;
17720 if (auto *I = dyn_cast<Instruction>(V)) {
17721 Worklist.emplace_back(I, I->getParent(), 0);
17722 Visited.insert(I);
17723 }
17724
17725 // Traverse the expression tree in bottom-up order looking for loads. If we
17726 // encounter an instruction we don't yet handle, we give up.
17727 auto Width = 0u;
17728 Value *FirstNonBool = nullptr;
17729 while (!Worklist.empty()) {
17730 auto [I, Parent, Level] = Worklist.pop_back_val();
17731
17732 // We should only be looking at scalar instructions here. If the current
17733 // instruction has a vector type, skip.
17734 auto *Ty = I->getType();
17735 if (isa<VectorType>(Ty))
17736 continue;
17737 if (Ty != Builder.getInt1Ty() && !FirstNonBool)
17738 FirstNonBool = I;
17739 if (Level > RecursionMaxDepth)
17740 continue;
17741
17742 // If the current instruction is a load (or extract), update Width to reflect
17743 // the width of the loaded or extracted value.
17744 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(I))
17745 Width = std::max<unsigned>(Width, DL->getTypeSizeInBits(Ty));
17746
17747 // Otherwise, we need to visit the operands of the instruction. We only
17748 // handle the interesting cases from buildTree here. If an operand is an
17749 // instruction we haven't yet visited and from the same basic block as the
17750 // user or the use is a PHI node, we add it to the worklist.
17751 if (isa<PHINode, CastInst, GetElementPtrInst, CmpInst, SelectInst,
17752 BinaryOperator, UnaryOperator>(I)) {
17753 for (Use &U : I->operands()) {
17754 if (auto *J = dyn_cast<Instruction>(U.get()))
17755 if (Visited.insert(J).second &&
17756 (isa<PHINode>(I) || J->getParent() == Parent)) {
17757 Worklist.emplace_back(J, J->getParent(), Level + 1);
17758 continue;
17759 }
17760 if (!FirstNonBool && U.get()->getType() != Builder.getInt1Ty())
17761 FirstNonBool = U.get();
17762 }
17763 } else {
17764 break;
17765 }
17766 }
17767
17768 // If we didn't encounter a memory access in the expression tree, or if we
17769 // gave up for some reason, just return the width of V. Otherwise, return the
17770 // maximum width we found.
17771 if (!Width) {
17772 if (V->getType() == Builder.getInt1Ty() && FirstNonBool)
17773 V = FirstNonBool;
17774 Width = DL->getTypeSizeInBits(V->getType());
17775 }
17776
17777 for (Instruction *I : Visited)
17778 InstrElementSize[I] = Width;
17779
17780 return Width;
17781}
17782
17783bool BoUpSLP::collectValuesToDemote(
17784 const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
17785 SmallVectorImpl<unsigned> &ToDemote, DenseSet<const TreeEntry *> &Visited,
17786 const SmallDenseSet<unsigned, 8> &NodesToKeepBWs, unsigned &MaxDepthLevel,
17787 bool &IsProfitableToDemote, bool IsTruncRoot) const {
17788 // We can always demote constants.
17789 if (all_of(E.Scalars, IsaPred<Constant>))
17790 return true;
17791
17792 unsigned OrigBitWidth =
17793 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
17794 if (OrigBitWidth == BitWidth) {
17795 MaxDepthLevel = 1;
17796 return true;
17797 }
17798
17799 // Check if the node was analyzed already and must keep its original bitwidth.
17800 if (NodesToKeepBWs.contains(E.Idx))
17801 return false;
17802
17803 // If the value is not a vectorized instruction in the expression and not used
17804 // by the insertelement instruction and not used in multiple vector nodes, it
17805 // cannot be demoted.
17806 bool IsSignedNode = any_of(E.Scalars, [&](Value *R) {
17807 if (isa<PoisonValue>(R))
17808 return false;
17809 return !isKnownNonNegative(R, SimplifyQuery(*DL));
17810 });
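// Checks whether V fits into BitWidth bits, widening BitWidth as needed based
// on known sign bits and demanded bits; values in MultiNodeScalars are
// rejected, and the (possibly widened) width must still be at most half of the
// original bit width.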
17811 auto IsPotentiallyTruncated = [&](Value *V, unsigned &BitWidth) -> bool {
17812 if (isa<PoisonValue>(V))
17813 return true;
17814 if (MultiNodeScalars.contains(V))
17815 return false;
17816 // For the last shuffle of sext/zext with many uses, we need to check the extra
17817 // bit for unsigned values, otherwise we may have incorrect casting for reused
17818 // scalars.
17819 bool IsSignedVal = !isKnownNonNegative(V, SimplifyQuery(*DL));
17820 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth > BitWidth) {
17821 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
17822 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
17823 return true;
17824 }
17825 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
17826 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
17827 if (IsSignedNode)
17828 ++BitWidth1;
17829 if (auto *I = dyn_cast<Instruction>(V)) {
17830 APInt Mask = DB->getDemandedBits(I);
17831 unsigned BitWidth2 =
17832 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
17833 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
17834 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth2 - 1);
17835 if (MaskedValueIsZero(V, Mask, SimplifyQuery(*DL)))
17836 break;
17837 BitWidth2 *= 2;
17838 }
17839 BitWidth1 = std::min(BitWidth1, BitWidth2);
17840 }
17841 BitWidth = std::max(BitWidth, BitWidth1);
17842 return BitWidth > 0 && OrigBitWidth >= (BitWidth * 2);
17843 };
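// Final profitability check: all scalars must be potentially truncatable. A
// gather node is additionally recorded in ToDemote when it extracts from at
// most two bases or the narrowed vector uses the same number of registers as
// the original one.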
17844 auto FinalAnalysis = [&, TTI = TTI]() {
17845 if (!IsProfitableToDemote)
17846 return false;
17847 bool Res = all_of(
17848 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(BitWidth)));
17849 // Demote gathers.
17850 if (Res && E.isGather()) {
17851 // Check possible extractelement instruction bases and the final vector
17852 // length.
17853 SmallPtrSet<Value *, 4> UniqueBases;
17854 for (Value *V : E.Scalars) {
17855 auto *EE = dyn_cast<ExtractElementInst>(V);
17856 if (!EE)
17857 continue;
17858 UniqueBases.insert(EE->getVectorOperand());
17859 }
17860 const unsigned VF = E.Scalars.size();
17861 Type *OrigScalarTy = E.Scalars.front()->getType();
17862 if (UniqueBases.size() <= 2 ||
17863 TTI->getNumberOfParts(getWidenedType(OrigScalarTy, VF)) ==
17864 TTI->getNumberOfParts(getWidenedType(
17865 IntegerType::get(OrigScalarTy->getContext(), BitWidth), VF)))
17866 ToDemote.push_back(E.Idx);
17867 }
17868 return Res;
17869 };
17870 if (E.isGather() || !Visited.insert(&E).second ||
17871 any_of(E.Scalars, [&](Value *V) {
17872 return !isa<PoisonValue>(V) && all_of(V->users(), [&](User *U) {
17873 return isa<InsertElementInst>(U) && !getTreeEntry(U);
17874 });
17875 }))
17876 return FinalAnalysis();
17877
17878 if (any_of(E.Scalars, [&](Value *V) {
17879 return !all_of(V->users(), [=](User *U) {
17880 return getTreeEntry(U) ||
17881 (E.Idx == 0 && UserIgnoreList &&
17882 UserIgnoreList->contains(U)) ||
17883 (!isa<CmpInst>(U) && U->getType()->isSized() &&
17884 !U->getType()->isScalableTy() &&
17885 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
17886 }) && !IsPotentiallyTruncated(V, BitWidth);
17887 }))
17888 return false;
17889
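// Recursively analyzes the operand entries; on failure it gives up unless
// demotion is still profitable and the final analysis succeeds, and it tracks
// the maximum demotion depth reached across the operands.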
17890 auto ProcessOperands = [&](ArrayRef<const TreeEntry *> Operands,
17891 bool &NeedToExit) {
17892 NeedToExit = false;
17893 unsigned InitLevel = MaxDepthLevel;
17894 for (const TreeEntry *Op : Operands) {
17895 unsigned Level = InitLevel;
17896 if (!collectValuesToDemote(*Op, IsProfitableToDemoteRoot, BitWidth,
17897 ToDemote, Visited, NodesToKeepBWs, Level,
17898 IsProfitableToDemote, IsTruncRoot)) {
17899 if (!IsProfitableToDemote)
17900 return false;
17901 NeedToExit = true;
17902 if (!FinalAnalysis())
17903 return false;
17904 continue;
17905 }
17906 MaxDepthLevel = std::max(MaxDepthLevel, Level);
17907 }
17908 return true;
17909 };
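// AttemptCheckBitwidth (below) probes the candidate widths BitWidth,
// 2 * BitWidth, ... while they stay below OrigBitWidth until Checker accepts
// one.  For example (assumed numbers), starting from 8 with OrigBitWidth = 32
// it queries 8 and then 16.  If no width is accepted, it either restores
// OrigBitWidth and reports failure, or falls back to the smallest width at
// which FinalAnalysis() still succeeded and tells the caller to stop.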
17910 auto AttemptCheckBitwidth =
17911 [&](function_ref<bool(unsigned, unsigned)> Checker, bool &NeedToExit) {
17912 // Try all bitwidth < OrigBitWidth.
17913 NeedToExit = false;
17914 unsigned BestFailBitwidth = 0;
17915 for (; BitWidth < OrigBitWidth; BitWidth *= 2) {
17916 if (Checker(BitWidth, OrigBitWidth))
17917 return true;
17918 if (BestFailBitwidth == 0 && FinalAnalysis())
17919 BestFailBitwidth = BitWidth;
17920 }
17921 if (BitWidth >= OrigBitWidth) {
17922 if (BestFailBitwidth == 0) {
17923 BitWidth = OrigBitWidth;
17924 return false;
17925 }
17926 MaxDepthLevel = 1;
17927 BitWidth = BestFailBitwidth;
17928 NeedToExit = true;
17929 return true;
17930 }
17931 return false;
17932 };
17933 auto TryProcessInstruction =
17934 [&](unsigned &BitWidth, ArrayRef<const TreeEntry *> Operands = {},
17935 function_ref<bool(unsigned, unsigned)> Checker = {}) {
17936 if (Operands.empty()) {
17937 if (!IsTruncRoot)
17938 MaxDepthLevel = 1;
17939 (void)for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17940 std::ref(BitWidth)));
17941 } else {
17942 // Several vectorized uses? Check if we can truncate it, otherwise -
17943 // exit.
17944 if (E.UserTreeIndices.size() > 1 &&
17945 !all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17946 std::ref(BitWidth))))
17947 return false;
17948 bool NeedToExit = false;
17949 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
17950 return false;
17951 if (NeedToExit)
17952 return true;
17953 if (!ProcessOperands(Operands, NeedToExit))
17954 return false;
17955 if (NeedToExit)
17956 return true;
17957 }
17958
17959 ++MaxDepthLevel;
17960 // Record the entry that we can demote.
17961 ToDemote.push_back(E.Idx);
17962 return IsProfitableToDemote;
17963 };
17964 switch (E.getOpcode()) {
17965
17966 // We can always demote truncations and extensions. Since truncations can
17967 // seed additional demotion, we save the truncated value.
17968 case Instruction::Trunc:
17969 if (IsProfitableToDemoteRoot)
17970 IsProfitableToDemote = true;
17971 return TryProcessInstruction(BitWidth);
17972 case Instruction::ZExt:
17973 case Instruction::SExt:
17974 IsProfitableToDemote = true;
17975 return TryProcessInstruction(BitWidth);
17976
17977 // We can demote certain binary operations if we can demote both of their
17978 // operands.
17979 case Instruction::Add:
17980 case Instruction::Sub:
17981 case Instruction::Mul:
17982 case Instruction::And:
17983 case Instruction::Or:
17984 case Instruction::Xor: {
17985 return TryProcessInstruction(
17986 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
17987 }
17988 case Instruction::Freeze:
17989 return TryProcessInstruction(BitWidth, getOperandEntry(&E, 0));
17990 case Instruction::Shl: {
17991 // If we are truncating the result of this SHL, and if it's a shift of an
17992 // in-range amount, we can always perform a SHL in a smaller type.
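// For example (illustrative IR, not from any test):
//   %s = shl i32 %x, 4        ; shift amount known to be < 16
//   %t = trunc i32 %s to i16
// is equivalent to shifting the truncated value: shl i16 (trunc %x), 4,
// since every bit of %t comes from the low 16 bits of %x.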
17993 auto ShlChecker = [&](unsigned BitWidth, unsigned) {
17994 return all_of(E.Scalars, [&](Value *V) {
17995 if (isa<PoisonValue>(V))
17996 return true;
17997 auto *I = cast<Instruction>(V);
17998 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17999 return AmtKnownBits.getMaxValue().ult(BitWidth);
18000 });
18001 };
18002 return TryProcessInstruction(
18003 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
18004 }
18005 case Instruction::LShr: {
18006 // If this is a truncate of a logical shr, we can truncate it to a smaller
18007 // lshr iff we know that the bits we would otherwise be shifting in are
18008 // already zeros.
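// For example (illustrative numbers): with OrigBitWidth = 32 and
// BitWidth = 16,
//   %s = lshr i32 %x, 3
// can become an i16 lshr when the shift amount is known to be < 16 and bits
// 16..31 of %x are known zero, so no set bit is ever shifted into the kept
// low half.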
18009 auto LShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
18010 return all_of(E.Scalars, [&](Value *V) {
18011 if (isa<PoisonValue>(V))
18012 return true;
18013 auto *I = cast<Instruction>(V);
18014 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
18015 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
18016 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
18017 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
18018 SimplifyQuery(*DL));
18019 });
18020 };
18021 return TryProcessInstruction(
18022 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
18023 LShrChecker);
18024 }
18025 case Instruction::AShr: {
18026 // If this is a truncate of an arithmetic shr, we can truncate it to a
18027 // smaller ashr iff we know that all the bits from the sign bit of the
18028 // original type and the sign bit of the truncate type are similar.
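// For example (illustrative numbers): with OrigBitWidth = 32 and
// BitWidth = 16, %s = ashr i32 %x, 2 is demotable when the shift amount is
// known to be < 16 and %x has more than 16 sign bits, i.e. the 16 discarded
// high bits are sign copies that the narrow ashr reproduces.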
18029 auto AShrChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
18030 return all_of(E.Scalars, [&](Value *V) {
18031 if (isa<PoisonValue>(V))
18032 return true;
18033 auto *I = cast<Instruction>(V);
18034 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
18035 unsigned ShiftedBits = OrigBitWidth - BitWidth;
18036 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
18037 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
18038 nullptr, DT);
18039 });
18040 };
18041 return TryProcessInstruction(
18042 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
18043 AShrChecker);
18044 }
18045 case Instruction::UDiv:
18046 case Instruction::URem: {
18047 // UDiv and URem can be truncated if all the truncated bits are zero.
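// For example (illustrative numbers): an i32 udiv/urem whose operands both
// have their upper 16 bits known zero produces the same low 16 bits as the
// corresponding i16 operation, which is what the two MaskedValueIsZero
// queries below establish.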
18048 auto Checker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
18049 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
18050 return all_of(E.Scalars, [&](Value *V) {
18051 auto *I = cast<Instruction>(V);
18052 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
18053 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
18054 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
18055 });
18056 };
18057 return TryProcessInstruction(
18058 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
18059 }
18060
18061 // We can demote selects if we can demote their true and false values.
18062 case Instruction::Select: {
18063 return TryProcessInstruction(
18064 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
18065 }
18066
18067 // We can demote phis if we can demote all their incoming operands. Note that
18068 // we don't need to worry about cycles since we ensure single use above.
18069 case Instruction::PHI: {
18070 const unsigned NumOps = E.getNumOperands();
18071 SmallVector<const TreeEntry *> Ops(NumOps);
18072 transform(seq<unsigned>(0, NumOps), Ops.begin(),
18073 std::bind(&BoUpSLP::getOperandEntry, this, &E, _1));
18074
18075 return TryProcessInstruction(BitWidth, Ops);
18076 }
18077
18078 case Instruction::Call: {
18079 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
18080 if (!IC)
18081 break;
18082 Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
18083 if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
18084 ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
18085 break;
18086 SmallVector<const TreeEntry *, 2> Operands(1, getOperandEntry(&E, 0));
18087 function_ref<bool(unsigned, unsigned)> CallChecker;
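// The two checkers below mirror the scalar narrowing rules: umin/umax only
// need the truncated-away bits of both operands to be known zero, while
// smin/smax (and abs) must also preserve the sign of each operand, which is
// checked via ComputeNumSignBits and, where needed, a MaskedValueIsZero
// query on the bits from the narrow sign bit upward.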
18088 auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
18089 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
18090 return all_of(E.Scalars, [&](Value *V) {
18091 auto *I = cast<Instruction>(V);
18092 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
18093 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
18094 return MaskedValueIsZero(I->getOperand(0), Mask,
18095 SimplifyQuery(*DL)) &&
18096 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
18097 }
18098 assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
18099 "Expected min/max intrinsics only.");
18100 unsigned SignBits = OrigBitWidth - BitWidth;
18101 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
18102 unsigned Op0SignBits = ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
18103 nullptr, DT);
18104 unsigned Op1SignBits = ComputeNumSignBits(I->getOperand(1), *DL, 0, AC,
18105 nullptr, DT);
18106 return SignBits <= Op0SignBits &&
18107 ((SignBits != Op0SignBits &&
18108 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
18109 MaskedValueIsZero(I->getOperand(0), Mask,
18110 SimplifyQuery(*DL))) &&
18111 SignBits <= Op1SignBits &&
18112 ((SignBits != Op1SignBits &&
18113 !isKnownNonNegative(I->getOperand(1), SimplifyQuery(*DL))) ||
18114 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL)));
18115 });
18116 };
18117 auto AbsChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
18118 assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
18119 return all_of(E.Scalars, [&](Value *V) {
18120 auto *I = cast<Instruction>(V);
18121 unsigned SignBits = OrigBitWidth - BitWidth;
18122 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
18123 unsigned Op0SignBits =
18124 ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
18125 return SignBits <= Op0SignBits &&
18126 ((SignBits != Op0SignBits &&
18127 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
18128 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
18129 });
18130 };
18131 if (ID != Intrinsic::abs) {
18132 Operands.push_back(getOperandEntry(&E, 1));
18133 CallChecker = CompChecker;
18134 } else {
18135 CallChecker = AbsChecker;
18136 }
18137 InstructionCost BestCost =
18138 std::numeric_limits<InstructionCost::CostType>::max();
18139 unsigned BestBitWidth = BitWidth;
18140 unsigned VF = E.Scalars.size();
18141 // Choose the best bitwidth based on cost estimations.
18142 auto Checker = [&](unsigned BitWidth, unsigned) {
18143 unsigned MinBW = PowerOf2Ceil(BitWidth);
18144 SmallVector<Type *> ArgTys =
18145 buildIntrinsicArgTypes(IC, ID, VF, MinBW, TTI);
18146 auto VecCallCosts = getVectorCallCosts(
18147 IC, getWidenedType(IntegerType::get(IC->getContext(), MinBW), VF),
18148 TTI, TLI, ArgTys);
18149 InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
18150 if (Cost < BestCost) {
18151 BestCost = Cost;
18152 BestBitWidth = BitWidth;
18153 }
18154 return false;
18155 };
18156 [[maybe_unused]] bool NeedToExit;
18157 (void)AttemptCheckBitwidth(Checker, NeedToExit);
18158 BitWidth = BestBitWidth;
18159 return TryProcessInstruction(BitWidth, Operands, CallChecker);
18160 }
18161
18162 // Otherwise, conservatively give up.
18163 default:
18164 break;
18165 }
18166 MaxDepthLevel = 1;
18167 return FinalAnalysis();
18168}
18169
18170static RecurKind getRdxKind(Value *V);
18171
18172 void BoUpSLP::computeMinimumValueSizes() {
18173 // We only attempt to truncate integer expressions.
18174 bool IsStoreOrInsertElt =
18175 VectorizableTree.front()->hasState() &&
18176 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
18177 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
18178 if ((IsStoreOrInsertElt || UserIgnoreList) &&
18179 ExtraBitWidthNodes.size() <= 1 &&
18180 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
18181 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
18182 return;
18183
18184 unsigned NodeIdx = 0;
18185 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
18186 NodeIdx = 1;
18187
18188 // Ensure the roots of the vectorizable tree don't form a cycle.
18189 if (VectorizableTree[NodeIdx]->isGather() ||
18190 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
18191 (NodeIdx != 0 && any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18192 [NodeIdx](const EdgeInfo &EI) {
18193 return EI.UserTE->Idx > NodeIdx;
18194 })))
18195 return;
18196
18197 // If the first value node for a store/insertelement is sext/zext/trunc, skip
18198 // it and resize to the final type.
18199 bool IsTruncRoot = false;
18200 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
18201 SmallVector<unsigned> RootDemotes;
18202 SmallDenseSet<unsigned, 8> NodesToKeepBWs;
18203 if (NodeIdx != 0 &&
18204 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18205 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18206 assert(IsStoreOrInsertElt && "Expected store/insertelement seeded graph.");
18207 IsTruncRoot = true;
18208 RootDemotes.push_back(NodeIdx);
18209 IsProfitableToDemoteRoot = true;
18210 ++NodeIdx;
18211 }
18212
18213 // The reduction was already analyzed and found not profitable - exit.
18214 if (AnalyzedMinBWVals.contains(VectorizableTree[NodeIdx]->Scalars.front()))
18215 return;
18216
18217 SmallVector<unsigned> ToDemote;
18218 auto ComputeMaxBitWidth =
18219 [&](const TreeEntry &E, bool IsTopRoot, bool IsProfitableToDemoteRoot,
18220 unsigned Limit, bool IsTruncRoot, bool IsSignedCmp) -> unsigned {
18221 ToDemote.clear();
18222 // If the root is a trunc and the next node is a gather/buildvector, keep the
18223 // trunc in scalars, which is free in most cases.
18224 if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
18225 !NodesToKeepBWs.contains(E.Idx) &&
18226 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
18227 all_of(E.Scalars, [&](Value *V) {
18228 return V->hasOneUse() || isa<Constant>(V) ||
18229 (!V->hasNUsesOrMore(UsesLimit) &&
18230 none_of(V->users(), [&](User *U) {
18231 const TreeEntry *TE = getTreeEntry(U);
18232 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18233 if (TE == UserTE || !TE)
18234 return false;
18235 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18236 SelectInst>(U) ||
18237 !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18238 SelectInst>(UserTE->getMainOp()))
18239 return true;
18240 unsigned UserTESz = DL->getTypeSizeInBits(
18241 UserTE->Scalars.front()->getType());
18242 auto It = MinBWs.find(TE);
18243 if (It != MinBWs.end() && It->second.first > UserTESz)
18244 return true;
18245 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
18246 }));
18247 })) {
18248 ToDemote.push_back(E.Idx);
18249 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18250 auto It = MinBWs.find(UserTE);
18251 if (It != MinBWs.end())
18252 return It->second.first;
18253 unsigned MaxBitWidth =
18254 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
18255 MaxBitWidth = bit_ceil(MaxBitWidth);
18256 if (MaxBitWidth < 8 && MaxBitWidth > 1)
18257 MaxBitWidth = 8;
18258 return MaxBitWidth;
18259 }
18260
18261 if (!E.hasState())
18262 return 0u;
18263
18264 unsigned VF = E.getVectorFactor();
18265 Type *ScalarTy = E.Scalars.front()->getType();
18266 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
18267 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->getScalarType());
18268 if (!TreeRootIT)
18269 return 0u;
18270
18271 if (any_of(E.Scalars,
18272 [&](Value *V) { return AnalyzedMinBWVals.contains(V); }))
18273 return 0u;
18274
18275 unsigned NumParts = TTI->getNumberOfParts(
18276 getWidenedType(TreeRootIT, VF * ScalarTyNumElements));
18277
18278 // The maximum bit width required to represent all the values that can be
18279 // demoted without loss of precision. It would be safe to truncate the roots
18280 // of the expression to this width.
18281 unsigned MaxBitWidth = 1u;
18282
18283 // True if the roots can be zero-extended back to their original type,
18284 // rather than sign-extended. We know that if the leading bits are not
18285 // demanded, we can safely zero-extend. So we initialize IsKnownPositive to
18286 // True.
18287 // Determine if the sign bit of all the roots is known to be zero. If not,
18288 // IsKnownPositive is set to False.
18289 bool IsKnownPositive = !IsSignedCmp && all_of(E.Scalars, [&](Value *R) {
18290 if (isa<PoisonValue>(R))
18291 return true;
18292 KnownBits Known = computeKnownBits(R, *DL);
18293 return Known.isNonNegative();
18294 });
18295
18296 // We first check if all the bits of the roots are demanded. If they're not,
18297 // we can truncate the roots to this narrower type.
18298 for (Value *Root : E.Scalars) {
18299 if (isa<PoisonValue>(Root))
18300 continue;
18301 unsigned NumSignBits = ComputeNumSignBits(Root, *DL, 0, AC, nullptr, DT);
18302 TypeSize NumTypeBits =
18303 DL->getTypeSizeInBits(Root->getType()->getScalarType());
18304 unsigned BitWidth1 = NumTypeBits - NumSignBits;
18305 // If we can't prove that the sign bit is zero, we must add one to the
18306 // maximum bit width to account for the unknown sign bit. This preserves
18307 // the existing sign bit so we can safely sign-extend the root back to the
18308 // original type. Otherwise, if we know the sign bit is zero, we will
18309 // zero-extend the root instead.
18310 //
18311 // FIXME: This is somewhat suboptimal, as there will be cases where adding
18312 // one to the maximum bit width will yield a larger-than-necessary
18313 // type. In general, we need to add an extra bit only if we can't
18314 // prove that the upper bit of the original type is equal to the
18315 // upper bit of the proposed smaller type. If these two bits are
18316 // the same (either zero or one) we know that sign-extending from
18317 // the smaller type will result in the same value. Here, since we
18318 // can't yet prove this, we are just making the proposed smaller
18319 // type larger to ensure correctness.
18320 if (!IsKnownPositive)
18321 ++BitWidth1;
18322
18323 APInt Mask = DB->getDemandedBits(cast<Instruction>(Root));
18324 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18325 MaxBitWidth =
18326 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
18327 }
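// Worked example with assumed numbers: an i32 root with 25 known sign bits
// gives BitWidth1 = 7 (8 when the sign bit is not known to be zero); if only
// its low 6 bits are demanded, BitWidth2 = 6, so the root contributes
// min(BitWidth1, BitWidth2) = 6 to MaxBitWidth, which the clamp below then
// rounds up to 8.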
18328
18329 if (MaxBitWidth < 8 && MaxBitWidth > 1)
18330 MaxBitWidth = 8;
18331
18332 // If the original type is large but the reduced type does not improve the
18333 // register use - ignore it.
18334 if (NumParts > 1 &&
18335 NumParts ==
18336 TTI->getNumberOfParts(getWidenedType(
18337 IntegerType::get(F->getContext(), bit_ceil(MaxBitWidth)), VF)))
18338 return 0u;
18339
18340 unsigned Opcode = E.getOpcode();
18341 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
18342 Opcode == Instruction::SExt ||
18343 Opcode == Instruction::ZExt || NumParts > 1;
18344 // Conservatively determine if we can actually truncate the roots of the
18345 // expression. Collect the values that can be demoted in ToDemote and
18346 // additional roots that require investigating in Roots.
18347 DenseSet<const TreeEntry *> Visited;
18348 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
18349 bool NeedToDemote = IsProfitableToDemote;
18350
18351 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
18352 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
18353 NeedToDemote, IsTruncRoot) ||
18354 (MaxDepthLevel <= Limit &&
18355 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
18356 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
18357 DL->getTypeSizeInBits(TreeRootIT) /
18358 DL->getTypeSizeInBits(
18359 E.getMainOp()->getOperand(0)->getType()) >
18360 2)))))
18361 return 0u;
18362 // Round MaxBitWidth up to the next power-of-two.
18363 MaxBitWidth = bit_ceil(MaxBitWidth);
18364
18365 return MaxBitWidth;
18366 };
18367
18368 // If we can truncate the root, we must collect additional values that might
18369 // be demoted as a result. That is, those seeded by truncations we will
18370 // modify.
18371 // Add reduction ops sizes, if any.
18372 if (UserIgnoreList &&
18373 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
18374 // Convert vector_reduce_add(ZExt(<n x i1>)) to
18375 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
18376 if (all_of(*UserIgnoreList,
18377 [](Value *V) {
18378 return isa<PoisonValue>(V) ||
18379 cast<Instruction>(V)->getOpcode() == Instruction::Add;
18380 }) &&
18381 VectorizableTree.front()->State == TreeEntry::Vectorize &&
18382 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
18383 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
18384 Builder.getInt1Ty()) {
18385 ReductionBitWidth = 1;
18386 } else {
18387 for (Value *V : *UserIgnoreList) {
18388 if (isa<PoisonValue>(V))
18389 continue;
18390 unsigned NumSignBits = ComputeNumSignBits(V, *DL, 0, AC, nullptr, DT);
18391 TypeSize NumTypeBits = DL->getTypeSizeInBits(V->getType());
18392 unsigned BitWidth1 = NumTypeBits - NumSignBits;
18393 if (!isKnownNonNegative(V, SimplifyQuery(*DL)))
18394 ++BitWidth1;
18395 unsigned BitWidth2 = BitWidth1;
18396 if (!RecurrenceDescriptor::isIntMinMaxRecurrenceKind(::getRdxKind(V))) {
18397 APInt Mask = DB->getDemandedBits(cast<Instruction>(V));
18398 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18399 }
18400 ReductionBitWidth =
18401 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
18402 }
18403 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
18404 ReductionBitWidth = 8;
18405
18406 ReductionBitWidth = bit_ceil(ReductionBitWidth);
18407 }
18408 }
18409 bool IsTopRoot = NodeIdx == 0;
18410 while (NodeIdx < VectorizableTree.size() &&
18411 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18412 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18413 RootDemotes.push_back(NodeIdx);
18414 ++NodeIdx;
18415 IsTruncRoot = true;
18416 }
18417 bool IsSignedCmp = false;
18418 while (NodeIdx < VectorizableTree.size()) {
18419 ArrayRef<Value *> TreeRoot = VectorizableTree[NodeIdx]->Scalars;
18420 unsigned Limit = 2;
18421 if (IsTopRoot &&
18422 ReductionBitWidth ==
18423 DL->getTypeSizeInBits(
18424 VectorizableTree.front()->Scalars.front()->getType()))
18425 Limit = 3;
18426 unsigned MaxBitWidth = ComputeMaxBitWidth(
18427 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
18428 IsTruncRoot, IsSignedCmp);
18429 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.empty())) {
18430 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
18431 ReductionBitWidth = bit_ceil(MaxBitWidth);
18432 else if (MaxBitWidth == 0)
18433 ReductionBitWidth = 0;
18434 }
18435
18436 for (unsigned Idx : RootDemotes) {
18437 if (all_of(VectorizableTree[Idx]->Scalars, [&](Value *V) {
18438 uint32_t OrigBitWidth =
18439 DL->getTypeSizeInBits(V->getType()->getScalarType());
18440 if (OrigBitWidth > MaxBitWidth) {
18441 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, MaxBitWidth);
18442 return MaskedValueIsZero(V, Mask, SimplifyQuery(*DL));
18443 }
18444 return false;
18445 }))
18446 ToDemote.push_back(Idx);
18447 }
18448 RootDemotes.clear();
18449 IsTopRoot = false;
18450 IsProfitableToDemoteRoot = true;
18451
18452 if (ExtraBitWidthNodes.empty()) {
18453 NodeIdx = VectorizableTree.size();
18454 } else {
18455 unsigned NewIdx = 0;
18456 do {
18457 NewIdx = *ExtraBitWidthNodes.begin();
18458 ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
18459 } while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
18460 NodeIdx = NewIdx;
18461 IsTruncRoot =
18462 NodeIdx < VectorizableTree.size() &&
18463 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18464 [](const EdgeInfo &EI) {
18465 return EI.EdgeIdx == 0 &&
18466 EI.UserTE->getOpcode() == Instruction::Trunc &&
18467 !EI.UserTE->isAltShuffle();
18468 });
18469 IsSignedCmp =
18470 NodeIdx < VectorizableTree.size() &&
18471 any_of(
18472 VectorizableTree[NodeIdx]->UserTreeIndices,
18473 [&](const EdgeInfo &EI) {
18474 return (EI.UserTE->hasState() &&
18475 EI.UserTE->getOpcode() == Instruction::ICmp) &&
18476 any_of(EI.UserTE->Scalars, [&](Value *V) {
18477 auto *IC = dyn_cast<ICmpInst>(V);
18478 return IC &&
18479 (IC->isSigned() ||
18480 !isKnownNonNegative(IC->getOperand(0),
18481 SimplifyQuery(*DL)) ||
18482 !isKnownNonNegative(IC->getOperand(1),
18483 SimplifyQuery(*DL)));
18484 });
18485 });
18486 }
18487
18488 // If the maximum bit width we compute is less than the width of the roots'
18489 // type, we can proceed with the narrowing. Otherwise, do nothing.
18490 if (MaxBitWidth == 0 ||
18491 MaxBitWidth >=
18492 cast<IntegerType>(TreeRoot.front()->getType()->getScalarType())
18493 ->getBitWidth()) {
18494 if (UserIgnoreList)
18495 AnalyzedMinBWVals.insert(TreeRoot.begin(), TreeRoot.end());
18496 NodesToKeepBWs.insert(ToDemote.begin(), ToDemote.end());
18497 continue;
18498 }
18499
18500 // Finally, map the values we can demote to the maximum bit width we
18501 // computed.
18502 for (unsigned Idx : ToDemote) {
18503 TreeEntry *TE = VectorizableTree[Idx].get();
18504 if (MinBWs.contains(TE))
18505 continue;
18506 bool IsSigned = any_of(TE->Scalars, [&](Value *R) {
18507 if (isa<PoisonValue>(R))
18508 return false;
18509 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18510 });
18511 MinBWs.try_emplace(TE, MaxBitWidth, IsSigned);
18512 }
18513 }
18514}
18515
18516 PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
18517 auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
18518 auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
18519 auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
18520 auto *AA = &AM.getResult<AAManager>(F);
18521 auto *LI = &AM.getResult<LoopAnalysis>(F);
18522 auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
18523 auto *AC = &AM.getResult<AssumptionAnalysis>(F);
18524 auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
18525 auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
18526
18527 bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
18528 if (!Changed)
18529 return PreservedAnalyses::all();
18530
18531 PreservedAnalyses PA;
18532 PA.preserveSet<CFGAnalyses>();
18533 return PA;
18534}
18535
18536 bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
18537 TargetTransformInfo *TTI_,
18538 TargetLibraryInfo *TLI_, AAResults *AA_,
18539 LoopInfo *LI_, DominatorTree *DT_,
18540 AssumptionCache *AC_, DemandedBits *DB_,
18541 OptimizationRemarkEmitter *ORE_) {
18542 if (!RunSLPVectorization)
18543 return false;
18544 SE = SE_;
18545 TTI = TTI_;
18546 TLI = TLI_;
18547 AA = AA_;
18548 LI = LI_;
18549 DT = DT_;
18550 AC = AC_;
18551 DB = DB_;
18552 DL = &F.getDataLayout();
18553
18554 Stores.clear();
18555 GEPs.clear();
18556 bool Changed = false;
18557
18558 // If the target claims to have no vector registers don't attempt
18559 // vectorization.
18560 if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) {
18561 LLVM_DEBUG(
18562 dbgs() << "SLP: Didn't find any vector registers for target, abort.\n");
18563 return false;
18564 }
18565
18566 // Don't vectorize when the attribute NoImplicitFloat is used.
18567 if (F.hasFnAttribute(Attribute::NoImplicitFloat))
18568 return false;
18569
18570 LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
18571
18572 // Use the bottom up slp vectorizer to construct chains that start with
18573 // store instructions.
18574 BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
18575
18576 // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
18577 // delete instructions.
18578
18579 // Update DFS numbers now so that we can use them for ordering.
18580 DT->updateDFSNumbers();
18581
18582 // Scan the blocks in the function in post order.
18583 for (auto *BB : post_order(&F.getEntryBlock())) {
18584 if (BB->isEHPad() || isa_and_nonnull<UnreachableInst>(BB->getTerminator()))
18585 continue;
18586
18587 // Start new block - clear the list of reduction roots.
18588 R.clearReductionData();
18589 collectSeedInstructions(BB);
18590
18591 // Vectorize trees that end at stores.
18592 if (!Stores.empty()) {
18593 LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
18594 << " underlying objects.\n");
18595 Changed |= vectorizeStoreChains(R);
18596 }
18597
18598 // Vectorize trees that end at reductions.
18599 Changed |= vectorizeChainsInBlock(BB, R);
18600
18601 // Vectorize the index computations of getelementptr instructions. This
18602 // is primarily intended to catch gather-like idioms ending at
18603 // non-consecutive loads.
18604 if (!GEPs.empty()) {
18605 LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
18606 << " underlying objects.\n");
18607 Changed |= vectorizeGEPIndices(BB, R);
18608 }
18609 }
18610
18611 if (Changed) {
18612 R.optimizeGatherSequence();
18613 LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
18614 }
18615 return Changed;
18616}
18617
18618std::optional<bool>
18619SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
18620 unsigned Idx, unsigned MinVF,
18621 unsigned &Size) {
18622 Size = 0;
18623 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
18624 << "\n");
18625 const unsigned Sz = R.getVectorElementSize(Chain[0]);
18626 unsigned VF = Chain.size();
18627
18628 if (!has_single_bit(Sz) ||
18629 !hasFullVectorsOrPowerOf2(
18630 *TTI, cast<StoreInst>(Chain.front())->getValueOperand()->getType(),
18631 VF) ||
18632 VF < 2 || VF < MinVF) {
18633 // Check if vectorizing with a non-power-of-2 VF should be considered. At
18634 // the moment, only consider cases where VF + 1 is a power-of-2, i.e. almost
18635 // all vector lanes are used.
18636 if (!VectorizeNonPowerOf2 || (VF < MinVF && VF + 1 != MinVF))
18637 return false;
18638 }
18639
18640 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
18641 << "\n");
18642
18643 SetVector<Value *> ValOps;
18644 for (Value *V : Chain)
18645 ValOps.insert(cast<StoreInst>(V)->getValueOperand());
18646 // Operands are not the same/alternate opcodes or a non-power-of-2 set of unique values - exit.
18647 InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI);
18648 if (all_of(ValOps, IsaPred<Instruction>) && ValOps.size() > 1) {
18649 DenseSet<Value *> Stores(Chain.begin(), Chain.end());
18650 bool IsAllowedSize =
18651 hasFullVectorsOrPowerOf2(*TTI, ValOps.front()->getType(),
18652 ValOps.size()) ||
18653 (VectorizeNonPowerOf2 && has_single_bit(ValOps.size() + 1));
18654 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
18655 (!S.getMainOp()->isSafeToRemove() ||
18656 any_of(ValOps.getArrayRef(),
18657 [&](Value *V) {
18658 return !isa<ExtractElementInst>(V) &&
18659 (V->getNumUses() > Chain.size() ||
18660 any_of(V->users(), [&](User *U) {
18661 return !Stores.contains(U);
18662 }));
18663 }))) ||
18664 (ValOps.size() > Chain.size() / 2 && !S)) {
18665 Size = (!IsAllowedSize && S) ? 1 : 2;
18666 return false;
18667 }
18668 }
18669 if (R.isLoadCombineCandidate(Chain))
18670 return true;
18671 R.buildTree(Chain);
18672 // Check if the tree is tiny and the store itself or its value is not vectorized.
18673 if (R.isTreeTinyAndNotFullyVectorizable()) {
18674 if (R.isGathered(Chain.front()) ||
18675 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
18676 return std::nullopt;
18677 Size = R.getCanonicalGraphSize();
18678 return false;
18679 }
18680 R.reorderTopToBottom();
18681 R.reorderBottomToTop();
18682 R.transformNodes();
18683 R.buildExternalUses();
18684
18685 R.computeMinimumValueSizes();
18686
18687 Size = R.getCanonicalGraphSize();
18688 if (S && S.getOpcode() == Instruction::Load)
18689 Size = 2; // cut off masked gather small trees
18690 InstructionCost Cost = R.getTreeCost();
18691
18692 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF=" << VF << "\n");
18693 if (Cost < -SLPCostThreshold) {
18694 LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
18695
18696 using namespace ore;
18697
18698 R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
18699 cast<StoreInst>(Chain[0]))
18700 << "Stores SLP vectorized with cost " << NV("Cost", Cost)
18701 << " and with tree size "
18702 << NV("TreeSize", R.getTreeSize()));
18703
18704 R.vectorizeTree();
18705 return true;
18706 }
18707
18708 return false;
18709}
18710
18711 /// Checks if the quadratic mean deviation of the tree sizes is small compared to the mean size.
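/// For example (illustrative sizes): {4, 4, 4, 5} gives Mean = 17 / 4 = 4 and
/// Dev = (0 + 0 + 0 + 1) / 4 = 0, so 0 * 81 / 16 == 0 and the check passes;
/// {2, 8} gives Mean = 5 and Dev = 9, so 9 * 81 / 25 = 29 != 0 and the check
/// fails.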
18712static bool checkTreeSizes(ArrayRef<std::pair<unsigned, unsigned>> Sizes,
18713 bool First) {
18714 unsigned Num = 0;
18715 uint64_t Sum = std::accumulate(
18716 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
18717 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
18718 unsigned Size = First ? Val.first : Val.second;
18719 if (Size == 1)
18720 return V;
18721 ++Num;
18722 return V + Size;
18723 });
18724 if (Num == 0)
18725 return true;
18726 uint64_t Mean = Sum / Num;
18727 if (Mean == 0)
18728 return true;
18729 uint64_t Dev = std::accumulate(
18730 Sizes.begin(), Sizes.end(), static_cast<uint64_t>(0),
18731 [&](uint64_t V, const std::pair<unsigned, unsigned> &Val) {
18732 unsigned P = First ? Val.first : Val.second;
18733 if (P == 1)
18734 return V;
18735 return V + (P - Mean) * (P - Mean);
18736 }) /
18737 Num;
18738 return Dev * 81 / (Mean * Mean) == 0;
18739}
18740
18741bool SLPVectorizerPass::vectorizeStores(
18742 ArrayRef<StoreInst *> Stores, BoUpSLP &R,
18743 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
18744 &Visited) {
18745 // We may run into multiple chains that merge into a single chain. We mark the
18746 // stores that we vectorized so that we don't visit the same store twice.
18747 BoUpSLP::ValueSet VectorizedStores;
18748 bool Changed = false;
18749
18750 struct StoreDistCompare {
18751 bool operator()(const std::pair<unsigned, int> &Op1,
18752 const std::pair<unsigned, int> &Op2) const {
18753 return Op1.second < Op2.second;
18754 }
18755 };
18756 // A set of pairs (index of store in Stores array ref, Distance of the store
18757 // address relative to base store address in units).
18758 using StoreIndexToDistSet =
18759 std::set<std::pair<unsigned, int>, StoreDistCompare>;
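// For example (illustrative layout): for stores to %p, %p+2 and %p+1 (in
// that order in Stores) the set, ordered by distance, is
// {(0, 0), (2, 1), (1, 2)}, so a consecutive chain can be read off by
// walking the distances 0, 1, 2.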
18760 auto TryToVectorize = [&](const StoreIndexToDistSet &Set) {
18761 int PrevDist = -1;
18762 BoUpSLP::ValueList Operands;
18763 // Collect the chain into a list.
18764 for (auto [Idx, Data] : enumerate(Set)) {
18765 if (Operands.empty() || Data.second - PrevDist == 1) {
18766 Operands.push_back(Stores[Data.first]);
18767 PrevDist = Data.second;
18768 if (Idx != Set.size() - 1)
18769 continue;
18770 }
18771 auto E = make_scope_exit([&, &DataVar = Data]() {
18772 Operands.clear();
18773 Operands.push_back(Stores[DataVar.first]);
18774 PrevDist = DataVar.second;
18775 });
18776
18777 if (Operands.size() <= 1 ||
18778 !Visited
18779 .insert({Operands.front(),
18780 cast<StoreInst>(Operands.front())->getValueOperand(),
18781 Operands.back(),
18782 cast<StoreInst>(Operands.back())->getValueOperand(),
18783 Operands.size()})
18784 .second)
18785 continue;
18786
18787 unsigned MaxVecRegSize = R.getMaxVecRegSize();
18788 unsigned EltSize = R.getVectorElementSize(Operands[0]);
18789 unsigned MaxElts = llvm::bit_floor(MaxVecRegSize / EltSize);
18790
18791 unsigned MaxVF =
18792 std::min(R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
18793 auto *Store = cast<StoreInst>(Operands[0]);
18794 Type *StoreTy = Store->getValueOperand()->getType();
18795 Type *ValueTy = StoreTy;
18796 if (auto *Trunc = dyn_cast<TruncInst>(Store->getValueOperand()))
18797 ValueTy = Trunc->getSrcTy();
18798 unsigned MinVF = std::max<unsigned>(
18799 2, PowerOf2Ceil(TTI->getStoreMinimumVF(
18800 R.getMinVF(DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
18801 ValueTy)));
18802
18803 if (MaxVF < MinVF) {
18804 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
18805 << ") < "
18806 << "MinVF (" << MinVF << ")\n");
18807 continue;
18808 }
18809
18810 unsigned NonPowerOf2VF = 0;
18811 if (VectorizeNonPowerOf2) {
18812 // First try vectorizing with a non-power-of-2 VF. At the moment, only
18813 // consider cases where VF + 1 is a power-of-2, i.e. almost all vector
18814 // lanes are used.
18815 unsigned CandVF = std::clamp<unsigned>(Operands.size(), MinVF, MaxVF);
18816 if (has_single_bit(CandVF + 1)) {
18817 NonPowerOf2VF = CandVF;
18818 assert(NonPowerOf2VF != MaxVF &&
18819 "Non-power-of-2 VF should not be equal to MaxVF");
18820 }
18821 }
18822
18823 unsigned MaxRegVF = MaxVF;
18824 MaxVF = std::min<unsigned>(MaxVF, bit_floor(Operands.size()));
18825 if (MaxVF < MinVF) {
18826 LLVM_DEBUG(dbgs() << "SLP: Vectorization infeasible as MaxVF (" << MaxVF
18827 << ") < "
18828 << "MinVF (" << MinVF << ")\n");
18829 continue;
18830 }
18831
18832 unsigned Sz = 1 + Log2_32(MaxVF) - Log2_32(MinVF);
18833 SmallVector<unsigned> CandidateVFs(Sz + (NonPowerOf2VF > 0 ? 1 : 0));
18834 unsigned Size = MinVF;
18835 for_each(reverse(CandidateVFs), [&](unsigned &VF) {
18836 VF = Size > MaxVF ? NonPowerOf2VF : Size;
18837 Size *= 2;
18838 });
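// Worked example with assumed numbers: for 7 collected stores, MinVF = 2 and
// a register-limited VF of 8, the non-power-of-2 candidate is 7 (7 + 1 is a
// power of two, when such VFs are enabled), MaxVF is clamped to
// bit_floor(7) = 4, and CandidateVFs becomes {7, 4, 2}, tried in that order.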
18839 unsigned End = Operands.size();
18840 unsigned Repeat = 0;
18841 constexpr unsigned MaxAttempts = 4;
18842 OwningArrayRef<std::pair<unsigned, unsigned>> RangeSizes(Operands.size());
18843 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &P) {
18844 P.first = P.second = 1;
18845 });
18846 SmallDenseMap<Value *, std::pair<unsigned, unsigned>> NonSchedulable;
18847 auto IsNotVectorized = [](bool First,
18848 const std::pair<unsigned, unsigned> &P) {
18849 return First ? P.first > 0 : P.second > 0;
18850 };
18851 auto IsVectorized = [](bool First,
18852 const std::pair<unsigned, unsigned> &P) {
18853 return First ? P.first == 0 : P.second == 0;
18854 };
18855 auto VFIsProfitable = [](bool First, unsigned Size,
18856 const std::pair<unsigned, unsigned> &P) {
18857 return First ? Size >= P.first : Size >= P.second;
18858 };
18859 auto FirstSizeSame = [](unsigned Size,
18860 const std::pair<unsigned, unsigned> &P) {
18861 return Size == P.first;
18862 };
18863 while (true) {
18864 ++Repeat;
18865 bool RepeatChanged = false;
18866 bool AnyProfitableGraph = false;
18867 for (unsigned Size : CandidateVFs) {
18868 AnyProfitableGraph = false;
18869 unsigned StartIdx = std::distance(
18870 RangeSizes.begin(),
18871 find_if(RangeSizes, std::bind(IsNotVectorized, Size >= MaxRegVF,
18872 std::placeholders::_1)));
18873 while (StartIdx < End) {
18874 unsigned EndIdx =
18875 std::distance(RangeSizes.begin(),
18876 find_if(RangeSizes.drop_front(StartIdx),
18877 std::bind(IsVectorized, Size >= MaxRegVF,
18878 std::placeholders::_1)));
18879 unsigned Sz = EndIdx >= End ? End : EndIdx;
18880 for (unsigned Cnt = StartIdx; Cnt + Size <= Sz;) {
18881 if (!checkTreeSizes(RangeSizes.slice(Cnt, Size),
18882 Size >= MaxRegVF)) {
18883 ++Cnt;
18884 continue;
18885 }
18886 ArrayRef<Value *> Slice = ArrayRef(Operands).slice(Cnt, Size);
18887 assert(all_of(Slice,
18888 [&](Value *V) {
18889 return cast<StoreInst>(V)
18890 ->getValueOperand()
18891 ->getType() ==
18892 cast<StoreInst>(Slice.front())
18893 ->getValueOperand()
18894 ->getType();
18895 }) &&
18896 "Expected all operands of same type.");
18897 if (!NonSchedulable.empty()) {
18898 auto [NonSchedSizeMax, NonSchedSizeMin] =
18899 NonSchedulable.lookup(Slice.front());
18900 if (NonSchedSizeMax > 0 && NonSchedSizeMin <= Size) {
18901 Cnt += NonSchedSizeMax;
18902 continue;
18903 }
18904 }
18905 unsigned TreeSize;
18906 std::optional<bool> Res =
18907 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
18908 if (!Res) {
18909 NonSchedulable
18910 .try_emplace(Slice.front(), std::make_pair(Size, Size))
18911 .first->getSecond()
18912 .second = Size;
18913 } else if (*Res) {
18914 // Mark the vectorized stores so that we don't vectorize them
18915 // again.
18916 VectorizedStores.insert(Slice.begin(), Slice.end());
18919 AnyProfitableGraph = RepeatChanged = Changed = true;
18920 // If we vectorized initial block, no need to try to vectorize
18921 // it again.
18922 for_each(RangeSizes.slice(Cnt, Size),
18923 [](std::pair<unsigned, unsigned> &P) {
18924 P.first = P.second = 0;
18925 });
18926 if (Cnt < StartIdx + MinVF) {
18927 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
18928 [](std::pair<unsigned, unsigned> &P) {
18929 P.first = P.second = 0;
18930 });
18931 StartIdx = Cnt + Size;
18932 }
18933 if (Cnt > Sz - Size - MinVF) {
18934 for_each(RangeSizes.slice(Cnt + Size, Sz - (Cnt + Size)),
18935 [](std::pair<unsigned, unsigned> &P) {
18936 P.first = P.second = 0;
18937 });
18938 if (Sz == End)
18939 End = Cnt;
18940 Sz = Cnt;
18941 }
18942 Cnt += Size;
18943 continue;
18944 }
18945 if (Size > 2 && Res &&
18946 !all_of(RangeSizes.slice(Cnt, Size),
18947 std::bind(VFIsProfitable, Size >= MaxRegVF, TreeSize,
18948 std::placeholders::_1))) {
18949 Cnt += Size;
18950 continue;
18951 }
18952 // For very big VFs, check that we are not rebuilding the same trees, just
18953 // with a larger number of elements.
18954 if (Size > MaxRegVF && TreeSize > 1 &&
18955 all_of(RangeSizes.slice(Cnt, Size),
18956 std::bind(FirstSizeSame, TreeSize,
18957 std::placeholders::_1))) {
18958 Cnt += Size;
18959 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
18960 ++Cnt;
18961 continue;
18962 }
18963 if (TreeSize > 1)
18964 for_each(RangeSizes.slice(Cnt, Size),
18965 [&](std::pair<unsigned, unsigned> &P) {
18966 if (Size >= MaxRegVF)
18967 P.second = std::max(P.second, TreeSize);
18968 else
18969 P.first = std::max(P.first, TreeSize);
18970 });
18971 ++Cnt;
18972 AnyProfitableGraph = true;
18973 }
18974 if (StartIdx >= End)
18975 break;
18976 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
18977 AnyProfitableGraph = true;
18978 StartIdx = std::distance(
18979 RangeSizes.begin(),
18980 find_if(RangeSizes.drop_front(Sz),
18981 std::bind(IsNotVectorized, Size >= MaxRegVF,
18982 std::placeholders::_1)));
18983 }
18984 if (!AnyProfitableGraph && Size >= MaxRegVF && has_single_bit(Size))
18985 break;
18986 }
18987 // All values vectorized - exit.
18988 if (all_of(RangeSizes, [](const std::pair<unsigned, unsigned> &P) {
18989 return P.first == 0 && P.second == 0;
18990 }))
18991 break;
18992 // Check if we tried all attempts or there is no need for the last attempts at all.
18993 if (Repeat >= MaxAttempts ||
18994 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
18995 break;
18996 constexpr unsigned StoresLimit = 64;
18997 const unsigned MaxTotalNum = std::min<unsigned>(
18998 Operands.size(),
18999 static_cast<unsigned>(
19000 End -
19001 std::distance(
19002 RangeSizes.begin(),
19003 find_if(RangeSizes, std::bind(IsNotVectorized, true,
19004 std::placeholders::_1))) +
19005 1));
19006 unsigned VF = bit_ceil(CandidateVFs.front()) * 2;
19007 unsigned Limit =
19008 getFloorFullVectorNumberOfElements(*TTI, StoreTy, MaxTotalNum);
19009 CandidateVFs.clear();
19010 if (bit_floor(Limit) == VF)
19011 CandidateVFs.push_back(Limit);
19012 if (VF > MaxTotalNum || VF >= StoresLimit)
19013 break;
19014 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &P) {
19015 if (P.first != 0)
19016 P.first = std::max(P.second, P.first);
19017 });
19018 // Last attempt to vectorize max number of elements, if all previous
19019 // attempts were unsuccessful because of the cost issues.
19020 CandidateVFs.push_back(VF);
19021 }
19022 }
19023 };
19024
19025 // Stores a pair (first: index of the store in the Stores array ref whose
19026 // address is taken as the base; second: sorted set of pairs {index, dist},
19027 // i.e. indices of stores in the set and their store location distances
19028 // relative to the base address).
19029
19030 // We need to store the index of the very first store separately, since the
19031 // set may be reordered after an insertion and the first store may be moved.
19032 // This container reduces the number of getPointersDiff() calls.
19033 SmallVector<std::pair<unsigned, StoreIndexToDistSet>> SortedStores;
19034 // Inserts the specified store SI with the given index Idx into the set of
19035 // stores. If a store with the same distance is already present - stop the
19036 // insertion and try to vectorize the stores found so far. If some stores from
19037 // this sequence were not vectorized - try to vectorize them with the new
19038 // store later. This logic is applied only to the stores that come before the
19039 // previous store with the same distance.
19040 // Example:
19041 // 1. store x, %p
19042 // 2. store y, %p+1
19043 // 3. store z, %p+2
19044 // 4. store a, %p
19045 // 5. store b, %p+3
19046 // - Scan this from the last to first store. The very first bunch of stores is
19047 // {5, {{4, -3}, {2, -2}, {3, -1}, {5, 0}}} (the element in SortedStores
19048 // vector).
19049 // - The next store in the list - #1 - has the same distance from store #5 as
19050 // the store #4.
19051 // - Try to vectorize sequence of stores 4,2,3,5.
19052 // - If all these stores are vectorized - just drop them.
19053 // - If some of them are not vectorized (say, #3 and #5), do extra analysis.
19054 // - Start new stores sequence.
19055 // The new bunch of stores is {1, {1, 0}}.
19056 // - Add the stores from the previous sequence that were not vectorized.
19057 // Here we consider the stores in reverse order, rather than in the order they
19058 // appear in the IR (Stores is already reversed, see vectorizeStoreChains()).
19059 // Store #3 can be added - it comes after store #4 with the same distance as
19060 // store #1.
19061 // Store #5 cannot be added - it comes before store #4.
19062 // This logic improves compile time: we assume that stores which follow a
19063 // previous store with the same distance most likely have memory dependencies,
19064 // so there is no need to waste compile time trying to vectorize them.
19065 // - Try to vectorize the sequence {1, {1, 0}, {3, 2}}.
19066 auto FillStoresSet = [&](unsigned Idx, StoreInst *SI) {
19067 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
19068 std::optional<int> Diff = getPointersDiff(
19069 Stores[Set.first]->getValueOperand()->getType(),
19070 Stores[Set.first]->getPointerOperand(),
19071 SI->getValueOperand()->getType(), SI->getPointerOperand(), *DL, *SE,
19072 /*StrictCheck=*/true);
19073 if (!Diff)
19074 continue;
19075 auto It = Set.second.find(std::make_pair(Idx, *Diff));
19076 if (It == Set.second.end()) {
19077 Set.second.emplace(Idx, *Diff);
19078 return;
19079 }
19080 // Try to vectorize the first found set to avoid duplicate analysis.
19081 TryToVectorize(Set.second);
19082 unsigned ItIdx = It->first;
19083 int ItDist = It->second;
19084 StoreIndexToDistSet PrevSet;
19085 copy_if(Set.second, std::inserter(PrevSet, PrevSet.end()),
19086 [&](const std::pair<unsigned, int> &Pair) {
19087 return Pair.first > ItIdx;
19088 });
19089 Set.second.clear();
19090 Set.first = Idx;
19091 Set.second.emplace(Idx, 0);
19092 // Insert stores that followed previous match to try to vectorize them
19093 // with this store.
19094 unsigned StartIdx = ItIdx + 1;
19095 SmallBitVector UsedStores(Idx - StartIdx);
19096 // Distances to previously found dup store (or this store, since they
19097 // store to the same addresses).
19098 SmallVector<int> Dists(Idx - StartIdx, 0);
19099 for (const std::pair<unsigned, int> &Pair : reverse(PrevSet)) {
19100 // Do not try to vectorize sequences we have already tried.
19101 if (VectorizedStores.contains(Stores[Pair.first]))
19102 break;
19103 unsigned BI = Pair.first - StartIdx;
19104 UsedStores.set(BI);
19105 Dists[BI] = Pair.second - ItDist;
19106 }
19107 for (unsigned I = StartIdx; I < Idx; ++I) {
19108 unsigned BI = I - StartIdx;
19109 if (UsedStores.test(BI))
19110 Set.second.emplace(I, Dists[BI]);
19111 }
19112 return;
19113 }
19114 auto &Res = SortedStores.emplace_back();
19115 Res.first = Idx;
19116 Res.second.emplace(Idx, 0);
19117 };
19118 Type *PrevValTy = nullptr;
19119 for (auto [I, SI] : enumerate(Stores)) {
19120 if (R.isDeleted(SI))
19121 continue;
19122 if (!PrevValTy)
19123 PrevValTy = SI->getValueOperand()->getType();
19124 // Check that we do not try to vectorize stores of different types.
19125 if (PrevValTy != SI->getValueOperand()->getType()) {
19126 for (auto &Set : SortedStores)
19127 TryToVectorize(Set.second);
19128 SortedStores.clear();
19129 PrevValTy = SI->getValueOperand()->getType();
19130 }
19131 FillStoresSet(I, SI);
19132 }
19133
19134 // Final vectorization attempt.
19135 for (auto &Set : SortedStores)
19136 TryToVectorize(Set.second);
19137
19138 return Changed;
19139}
19140
19141void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
19142 // Initialize the collections. We will make a single pass over the block.
19143 Stores.clear();
19144 GEPs.clear();
19145
19146 // Visit the store and getelementptr instructions in BB and organize them in
19147 // Stores and GEPs according to the underlying objects of their pointer
19148 // operands.
19149 for (Instruction &I : *BB) {
19150 // Ignore store instructions that are volatile or have a pointer operand
19151 // that doesn't point to a scalar type.
19152 if (auto *SI = dyn_cast<StoreInst>(&I)) {
19153 if (!SI->isSimple())
19154 continue;
19155 if (!isValidElementType(SI->getValueOperand()->getType()))
19156 continue;
19157 Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
19158 }
19159
19160 // Ignore getelementptr instructions that have more than one index, a
19161 // constant index, or a pointer operand that doesn't point to a scalar
19162 // type.
19163 else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
19164 if (GEP->getNumIndices() != 1)
19165 continue;
19166 Value *Idx = GEP->idx_begin()->get();
19167 if (isa<Constant>(Idx))
19168 continue;
19169 if (!isValidElementType(Idx->getType()))
19170 continue;
19171 if (GEP->getType()->isVectorTy())
19172 continue;
19173 GEPs[GEP->getPointerOperand()].push_back(GEP);
19174 }
19175 }
19176}
19177
19178bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
19179 bool MaxVFOnly) {
19180 if (VL.size() < 2)
19181 return false;
19182
19183 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
19184 << VL.size() << ".\n");
19185
19186 // Check that all of the parts are instructions of the same type,
19187 // we permit an alternate opcode via InstructionsState.
19188 InstructionsState S = getSameOpcode(VL, *TLI);
19189 if (!S)
19190 return false;
19191
19192 Instruction *I0 = S.getMainOp();
19193 // Make sure invalid types (including vector type) are rejected before
19194 // determining vectorization factor for scalar instructions.
19195 for (Value *V : VL) {
19196 Type *Ty = V->getType();
19197 if (!isa<InsertElementInst>(V) && !isValidElementType(Ty)) {
19198 // NOTE: the following will give the user an internal LLVM type name, which
19199 // may not be useful.
19200 R.getORE()->emit([&]() {
19201 std::string TypeStr;
19202 llvm::raw_string_ostream rso(TypeStr);
19203 Ty->print(rso);
19204 return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
19205 << "Cannot SLP vectorize list: type "
19206 << TypeStr + " is unsupported by vectorizer";
19207 });
19208 return false;
19209 }
19210 }
19211
19212 Type *ScalarTy = getValueType(VL[0]);
19213 unsigned Sz = R.getVectorElementSize(I0);
19214 unsigned MinVF = R.getMinVF(Sz);
19215 unsigned MaxVF = std::max<unsigned>(
19216 getFloorFullVectorNumberOfElements(*TTI, ScalarTy, VL.size()), MinVF);
19217 MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
19218 if (MaxVF < 2) {
19219 R.getORE()->emit([&]() {
19220 return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
19221 << "Cannot SLP vectorize list: vectorization factor "
19222 << "less than 2 is not supported";
19223 });
19224 return false;
19225 }
19226
19227 bool Changed = false;
19228 bool CandidateFound = false;
19229 InstructionCost MinCost = SLPCostThreshold.getValue();
19230
19231 unsigned NextInst = 0, MaxInst = VL.size();
19232 for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
19233 VF = getFloorFullVectorNumberOfElements(*TTI, I0->getType(), VF - 1)) {
19234 // No actual vectorization should happen if the number of parts is the same
19235 // as the provided vectorization factor (i.e. the scalar type is used for
19236 // vector code during codegen).
19237 auto *VecTy = getWidenedType(ScalarTy, VF);
19238 if (TTI->getNumberOfParts(VecTy) == VF)
19239 continue;
19240 for (unsigned I = NextInst; I < MaxInst; ++I) {
19241 unsigned ActualVF = std::min(MaxInst - I, VF);
19242
19243 if (!hasFullVectorsOrPowerOf2(*TTI, ScalarTy, ActualVF))
19244 continue;
19245
19246 if (MaxVFOnly && ActualVF < MaxVF)
19247 break;
19248 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
19249 break;
19250
19251 SmallVector<Value *> Ops(ActualVF, nullptr);
19252 unsigned Idx = 0;
19253 for (Value *V : VL.drop_front(I)) {
19254 // Check that a previous iteration of this loop did not delete the
19255 // Value.
19256 if (auto *Inst = dyn_cast<Instruction>(V);
19257 !Inst || !R.isDeleted(Inst)) {
19258 Ops[Idx] = V;
19259 ++Idx;
19260 if (Idx == ActualVF)
19261 break;
19262 }
19263 }
19264 // Not enough vectorizable instructions - exit.
19265 if (Idx != ActualVF)
19266 break;
19267
19268 LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
19269 << "\n");
19270
19271 R.buildTree(Ops);
19272 if (R.isTreeTinyAndNotFullyVectorizable())
19273 continue;
19274 R.reorderTopToBottom();
19275 R.reorderBottomToTop(
19276 /*IgnoreReorder=*/!isa<InsertElementInst>(Ops.front()) &&
19277 !R.doesRootHaveInTreeUses());
19278 R.transformNodes();
19279 R.buildExternalUses();
19280
19281 R.computeMinimumValueSizes();
19282 InstructionCost Cost = R.getTreeCost();
19283 CandidateFound = true;
19284 MinCost = std::min(MinCost, Cost);
19285
19286 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
19287 << " for VF=" << ActualVF << "\n");
19288 if (Cost < -SLPCostThreshold) {
19289 LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
19290 R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
19291 cast<Instruction>(Ops[0]))
19292 << "SLP vectorized with cost " << ore::NV("Cost", Cost)
19293 << " and with tree size "
19294 << ore::NV("TreeSize", R.getTreeSize()));
19295
19296 R.vectorizeTree();
19297 // Move to the next bundle.
19298 I += VF - 1;
19299 NextInst = I + 1;
19300 Changed = true;
19301 }
19302 }
19303 }
19304
19305 if (!Changed && CandidateFound) {
19306 R.getORE()->emit([&]() {
19307 return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
19308 << "List vectorization was possible but not beneficial with cost "
19309 << ore::NV("Cost", MinCost) << " >= "
19310 << ore::NV("Treshold", -SLPCostThreshold);
19311 });
19312 } else if (!Changed) {
19313 R.getORE()->emit([&]() {
19314 return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
19315 << "Cannot SLP vectorize list: vectorization was impossible"
19316 << " with available vectorization factors";
19317 });
19318 }
19319 return Changed;
19320}
19321
19322bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
19323 if (!I)
19324 return false;
19325
19326 if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
19327 return false;
19328
19329 Value *P = I->getParent();
19330
19331 // Vectorize in current basic block only.
19332 auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
19333 auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
19334 if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
19335 R.isDeleted(Op0) || R.isDeleted(Op1))
19336 return false;
19337
19338 // First collect all possible candidates.
19339 SmallVector<std::pair<Value *, Value *>> Candidates;
19340 Candidates.emplace_back(Op0, Op1);
19341
19342 auto *A = dyn_cast<BinaryOperator>(Op0);
19343 auto *B = dyn_cast<BinaryOperator>(Op1);
19344 // Try to skip B.
19345 if (A && B && B->hasOneUse()) {
19346 auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
19347 auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
19348 if (B0 && B0->getParent() == P && !R.isDeleted(B0))
19349 Candidates.emplace_back(A, B0);
19350 if (B1 && B1->getParent() == P && !R.isDeleted(B1))
19351 Candidates.emplace_back(A, B1);
19352 }
19353 // Try to skip A.
19354 if (B && A && A->hasOneUse()) {
19355 auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
19356 auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
19357 if (A0 && A0->getParent() == P && !R.isDeleted(A0))
19358 Candidates.emplace_back(A0, B);
19359 if (A1 && A1->getParent() == P && !R.isDeleted(A1))
19360 Candidates.emplace_back(A1, B);
19361 }
19362
19363 if (Candidates.size() == 1)
19364 return tryToVectorizeList({Op0, Op1}, R);
19365
19366 // We have multiple options. Try to pick the single best.
19367 std::optional<int> BestCandidate = R.findBestRootPair(Candidates);
19368 if (!BestCandidate)
19369 return false;
19370 return tryToVectorizeList(
19371 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second}, R);
19372}
19373
19374namespace {
19375
19376/// Model horizontal reductions.
19377///
19378/// A horizontal reduction is a tree of reduction instructions that has values
19379/// that can be put into a vector as its leaves. For example:
19380///
19381/// mul mul mul mul
19382/// \ / \ /
19383/// + +
19384/// \ /
19385/// +
19386/// This tree has "mul" as its leaf values and "+" as its reduction
19387/// instructions. A reduction can feed into a store or a binary operation
19388/// feeding a phi.
19389/// ...
19390/// \ /
19391/// +
19392/// |
19393/// phi +=
19394///
19395/// Or:
19396/// ...
19397/// \ /
19398/// +
19399/// |
19400/// *p =
19401///
19402class HorizontalReduction {
19403 using ReductionOpsType = SmallVector<Value *, 16>;
19404 using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
19405 ReductionOpsListType ReductionOps;
19406 /// List of possibly reduced values.
19407 SmallVector<SmallVector<Value *>> ReducedVals;
19408 /// Maps reduced value to the corresponding reduction operation.
19409 DenseMap<Value *, SmallVector<Instruction *>> ReducedValsToOps;
19410 WeakTrackingVH ReductionRoot;
19411 /// The type of reduction operation.
19412 RecurKind RdxKind;
19413 /// Checks if the optimization of original scalar identity operations on
19414 /// matched horizontal reductions is enabled and allowed.
19415 bool IsSupportedHorRdxIdentityOp = false;
19416
19417 static bool isCmpSelMinMax(Instruction *I) {
19418 return match(I, m_Select(m_Cmp(), m_Value(), m_Value())) &&
19419 RecurrenceDescriptor::isMinMaxRecurrenceKind(getRdxKind(I));
19420 }
19421
19422 // And/or are potentially poison-safe logical patterns like:
19423 // select x, y, false
19424 // select x, true, y
19425 static bool isBoolLogicOp(Instruction *I) {
19426 return isa<SelectInst>(I) &&
19427 (match(I, m_LogicalAnd()) || match(I, m_LogicalOr()));
19428 }
19429
19430 /// Checks if instruction is associative and can be vectorized.
19431 static bool isVectorizable(RecurKind Kind, Instruction *I) {
19432 if (Kind == RecurKind::None)
19433 return false;
19434
19435 // Integer ops that map to select instructions or intrinsics are fine.
19437 isBoolLogicOp(I))
19438 return true;
19439
19440 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
19441 // FP min/max are associative except for NaN and -0.0. We do not
19442 // have to rule out -0.0 here because the intrinsic semantics do not
19443 // specify a fixed result for it.
19444 return I->getFastMathFlags().noNaNs();
19445 }
19446
19447 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
19448 return true;
19449
19450 return I->isAssociative();
19451 }
19452
19453 static Value *getRdxOperand(Instruction *I, unsigned Index) {
19454 // Poison-safe 'or' takes the form: select X, true, Y
19455 // To make that work with the normal operand processing, we skip the
19456 // true value operand.
19457 // TODO: Change the code and data structures to handle this without a hack.
19458 if (getRdxKind(I) == RecurKind::Or && isa<SelectInst>(I) && Index == 1)
19459 return I->getOperand(2);
19460 return I->getOperand(Index);
19461 }
19462
19463 /// Creates reduction operation with the current opcode.
19464 static Value *createOp(IRBuilderBase &Builder, RecurKind Kind, Value *LHS,
19465 Value *RHS, const Twine &Name, bool UseSelect) {
19466 switch (Kind) {
19467 case RecurKind::Or: {
19468 if (UseSelect &&
19469 LHS->getType()->isIntOrIntVectorTy(/*BitWidth=*/1))
19470 return Builder.CreateSelect(LHS, Builder.getTrue(), RHS, Name);
19471 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
19472 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19473 Name);
19474 }
19475 case RecurKind::And: {
19476 if (UseSelect &&
19477 LHS->getType()->isIntOrIntVectorTy(/*BitWidth=*/1))
19478 return Builder.CreateSelect(LHS, RHS, Builder.getFalse(), Name);
19479 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
19480 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19481 Name);
19482 }
19483 case RecurKind::Add:
19484 case RecurKind::Mul:
19485 case RecurKind::Xor:
19486 case RecurKind::FAdd:
19487 case RecurKind::FMul: {
19488 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(Kind);
19489 return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
19490 Name);
19491 }
19492 case RecurKind::SMax:
19493 case RecurKind::SMin:
19494 case RecurKind::UMax:
19495 case RecurKind::UMin:
19496 if (UseSelect) {
19497 CmpInst::Predicate Pred = llvm::getMinMaxReductionPredicate(Kind);
19498 Value *Cmp = Builder.CreateICmp(Pred, LHS, RHS, Name);
19499 return Builder.CreateSelect(Cmp, LHS, RHS, Name);
19500 }
19501 [[fallthrough]];
19502 case RecurKind::FMax:
19503 case RecurKind::FMin:
19504 case RecurKind::FMaximum:
19505 case RecurKind::FMinimum: {
19506 Intrinsic::ID Id = llvm::getMinMaxReductionIntrinsicOp(Kind);
19507 return Builder.CreateBinaryIntrinsic(Id, LHS, RHS);
19508 }
19509 default:
19510 llvm_unreachable("Unknown reduction operation.");
19511 }
19512 }
19513
19514 /// Creates reduction operation with the current opcode with the IR flags
19515 /// from \p ReductionOps, dropping nuw/nsw flags.
19516 static Value *createOp(IRBuilderBase &Builder, RecurKind RdxKind, Value *LHS,
19517 Value *RHS, const Twine &Name,
19518 const ReductionOpsListType &ReductionOps) {
19519 bool UseSelect = ReductionOps.size() == 2 ||
19520 // Logical or/and.
19521 (ReductionOps.size() == 1 &&
19522 any_of(ReductionOps.front(), IsaPred<SelectInst>));
19523 assert((!UseSelect || ReductionOps.size() != 2 ||
19524 isa<SelectInst>(ReductionOps[1][0])) &&
19525 "Expected cmp + select pairs for reduction");
19526 Value *Op = createOp(Builder, RdxKind, LHS, RHS, Name, UseSelect);
19527 if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(RdxKind)) {
19528 if (auto *Sel = dyn_cast<SelectInst>(Op)) {
19529 propagateIRFlags(Sel->getCondition(), ReductionOps[0], nullptr,
19530 /*IncludeWrapFlags=*/false);
19531 propagateIRFlags(Op, ReductionOps[1], nullptr,
19532 /*IncludeWrapFlags=*/false);
19533 return Op;
19534 }
19535 }
19536 propagateIRFlags(Op, ReductionOps[0], nullptr, /*IncludeWrapFlags=*/false);
19537 return Op;
19538 }
19539
19540public:
19541 static RecurKind getRdxKind(Value *V) {
19542 auto *I = dyn_cast<Instruction>(V);
19543 if (!I)
19544 return RecurKind::None;
19545 if (match(I, m_Add(m_Value(), m_Value())))
19546 return RecurKind::Add;
19547 if (match(I, m_Mul(m_Value(), m_Value())))
19548 return RecurKind::Mul;
19549 if (match(I, m_And(m_Value(), m_Value())) ||
19550 match(I, m_LogicalAnd(m_Value(), m_Value())))
19551 return RecurKind::And;
19552 if (match(I, m_Or(m_Value(), m_Value())) ||
19553 match(I, m_LogicalOr(m_Value(), m_Value())))
19554 return RecurKind::Or;
19555 if (match(I, m_Xor(m_Value(), m_Value())))
19556 return RecurKind::Xor;
19557 if (match(I, m_FAdd(m_Value(), m_Value())))
19558 return RecurKind::FAdd;
19559 if (match(I, m_FMul(m_Value(), m_Value())))
19560 return RecurKind::FMul;
19561
19562 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
19563 return RecurKind::FMax;
19564 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
19565 return RecurKind::FMin;
19566
19567 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(), m_Value())))
19568 return RecurKind::FMaximum;
19569 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(), m_Value())))
19570 return RecurKind::FMinimum;
19571 // This matches either cmp+select or intrinsics. SLP is expected to handle
19572 // either form.
19573 // TODO: If we are canonicalizing to intrinsics, we can remove several
19574 // special-case paths that deal with selects.
19575 if (match(I, m_SMax(m_Value(), m_Value())))
19576 return RecurKind::SMax;
19577 if (match(I, m_SMin(m_Value(), m_Value())))
19578 return RecurKind::SMin;
19579 if (match(I, m_UMax(m_Value(), m_Value())))
19580 return RecurKind::UMax;
19581 if (match(I, m_UMin(m_Value(), m_Value())))
19582 return RecurKind::UMin;
19583
19584 if (auto *Select = dyn_cast<SelectInst>(I)) {
19585 // Try harder: look for min/max pattern based on instructions producing
19586 // same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
19587 // During the intermediate stages of SLP, it's very common to have
19588 // pattern like this (since optimizeGatherSequence is run only once
19589 // at the end):
19590 // %1 = extractelement <2 x i32> %a, i32 0
19591 // %2 = extractelement <2 x i32> %a, i32 1
19592 // %cond = icmp sgt i32 %1, %2
19593 // %3 = extractelement <2 x i32> %a, i32 0
19594 // %4 = extractelement <2 x i32> %a, i32 1
19595 // %select = select i1 %cond, i32 %3, i32 %4
19596 CmpPredicate Pred;
19597 Instruction *L1;
19598 Instruction *L2;
19599
19600 Value *LHS = Select->getTrueValue();
19601 Value *RHS = Select->getFalseValue();
19602 Value *Cond = Select->getCondition();
19603
19604 // TODO: Support inverse predicates.
19605 if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
19606 if (!isa<ExtractElementInst>(RHS) ||
19607 !L2->isIdenticalTo(cast<Instruction>(RHS)))
19608 return RecurKind::None;
19609 } else if (match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Specific(RHS)))) {
19610 if (!isa<ExtractElementInst>(LHS) ||
19611 !L1->isIdenticalTo(cast<Instruction>(LHS)))
19612 return RecurKind::None;
19613 } else {
19614 if (!isa<ExtractElementInst>(LHS) || !isa<ExtractElementInst>(RHS))
19615 return RecurKind::None;
19616 if (!match(Cond, m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2))) ||
19617 !L1->isIdenticalTo(cast<Instruction>(LHS)) ||
19618 !L2->isIdenticalTo(cast<Instruction>(RHS)))
19619 return RecurKind::None;
19620 }
19621
19622 switch (Pred) {
19623 default:
19624 return RecurKind::None;
19625 case CmpInst::ICMP_SGT:
19626 case CmpInst::ICMP_SGE:
19627 return RecurKind::SMax;
19628 case CmpInst::ICMP_SLT:
19629 case CmpInst::ICMP_SLE:
19630 return RecurKind::SMin;
19631 case CmpInst::ICMP_UGT:
19632 case CmpInst::ICMP_UGE:
19633 return RecurKind::UMax;
19634 case CmpInst::ICMP_ULT:
19635 case CmpInst::ICMP_ULE:
19636 return RecurKind::UMin;
19637 }
19638 }
19639 return RecurKind::None;
19640 }
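// For example, getRdxKind of
//   %c = icmp sgt i32 %a, %b
//   %m = select i1 %c, i32 %a, i32 %b
// is RecurKind::SMax (matched by m_SMax above), while a plain 'add'
// instruction maps to RecurKind::Add; the extractelement-based fallback is
// only needed for the duplicated-operand shape shown in the comment.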
19641
19642 /// Get the index of the first operand.
19643 static unsigned getFirstOperandIndex(Instruction *I) {
19644 return isCmpSelMinMax(I) ? 1 : 0;
19645 }
19646
19647private:
19648 /// Total number of operands in the reduction operation.
19649 static unsigned getNumberOfOperands(Instruction *I) {
19650 return isCmpSelMinMax(I) ? 3 : 2;
19651 }
19652
19653 /// Checks if the instruction is in basic block \p BB.
19654 /// For a cmp+sel min/max reduction check that both ops are in \p BB.
19655 static bool hasSameParent(Instruction *I, BasicBlock *BB) {
19656 if (isCmpSelMinMax(I) || isBoolLogicOp(I)) {
19657 auto *Sel = cast<SelectInst>(I);
19658 auto *Cmp = dyn_cast<Instruction>(Sel->getCondition());
19659 return Sel->getParent() == BB && Cmp && Cmp->getParent() == BB;
19660 }
19661 return I->getParent() == BB;
19662 }
19663
19664 /// Expected number of uses for reduction operations/reduced values.
19665 static bool hasRequiredNumberOfUses(bool IsCmpSelMinMax, Instruction *I) {
19666 if (IsCmpSelMinMax) {
19667 // SelectInst must be used twice while the condition op must have single
19668 // use only.
19669 if (auto *Sel = dyn_cast<SelectInst>(I))
19670 return Sel->hasNUses(2) && Sel->getCondition()->hasOneUse();
19671 return I->hasNUses(2);
19672 }
19673
19674 // Arithmetic reduction operation must be used once only.
19675 return I->hasOneUse();
19676 }
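// E.g. in a cmp+select min/max chain
//   %c1 = icmp slt i32 %m0, %x2
//   %m1 = select i1 %c1, i32 %m0, i32 %x2
// the previous select %m0 is used by both %c1 and %m1 (two uses), while each
// compare feeds exactly one select, which is the shape checked here.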
19677
19678 /// Initializes the list of reduction operations.
19679 void initReductionOps(Instruction *I) {
19680 if (isCmpSelMinMax(I))
19681 ReductionOps.assign(2, ReductionOpsType());
19682 else
19683 ReductionOps.assign(1, ReductionOpsType());
19684 }
19685
19686 /// Add all reduction operations for the reduction instruction \p I.
19687 void addReductionOps(Instruction *I) {
19688 if (isCmpSelMinMax(I)) {
19689 ReductionOps[0].emplace_back(cast<SelectInst>(I)->getCondition());
19690 ReductionOps[1].emplace_back(I);
19691 } else {
19692 ReductionOps[0].emplace_back(I);
19693 }
19694 }
19695
19696 static bool isGoodForReduction(ArrayRef<Value *> Data) {
19697 int Sz = Data.size();
19698 auto *I = dyn_cast<Instruction>(Data.front());
19699 return Sz > 1 || isConstant(Data.front()) ||
19700 (I && !isa<LoadInst>(I) && isValidForAlternation(I->getOpcode()));
19701 }
19702
19703public:
19704 HorizontalReduction() = default;
19705
19706 /// Try to find a reduction tree.
19707 bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
19708 ScalarEvolution &SE, const DataLayout &DL,
19709 const TargetLibraryInfo &TLI) {
19710 RdxKind = HorizontalReduction::getRdxKind(Root);
19711 if (!isVectorizable(RdxKind, Root))
19712 return false;
19713
19714 // Analyze "regular" integer/FP types for reductions - no target-specific
19715 // types or pointers.
19716 Type *Ty = Root->getType();
19717 if (!isValidElementType(Ty) || Ty->isPointerTy())
19718 return false;
19719
19720 // Though the ultimate reduction may have multiple uses, its condition must
19721 // have only a single use.
19722 if (auto *Sel = dyn_cast<SelectInst>(Root))
19723 if (!Sel->getCondition()->hasOneUse())
19724 return false;
19725
19726 ReductionRoot = Root;
19727
19728 // Iterate through all the operands of the possible reduction tree and
19729 // gather all the reduced values, sorting them by their value id.
19730 BasicBlock *BB = Root->getParent();
19731 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
19732 SmallVector<std::pair<Instruction *, unsigned>> Worklist(
19733 1, std::make_pair(Root, 0));
19734 // Checks if the operands of the \p TreeN instruction are also reduction
19735 // operations or should be treated as reduced values or an extra argument,
19736 // which is not part of the reduction.
19737 auto CheckOperands = [&](Instruction *TreeN,
19738 SmallVectorImpl<Value *> &PossibleReducedVals,
19739 SmallVectorImpl<Instruction *> &ReductionOps,
19740 unsigned Level) {
19741 for (int I : reverse(seq<int>(getFirstOperandIndex(TreeN),
19742 getNumberOfOperands(TreeN)))) {
19743 Value *EdgeVal = getRdxOperand(TreeN, I);
19744 ReducedValsToOps[EdgeVal].push_back(TreeN);
19745 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
19746 // If the edge is not an instruction, or it is different from the main
19747 // reduction opcode or has too many uses - possible reduced value.
19748 // Also, do not try to reduce const values, if the operation is not
19749 // foldable.
19750 if (!EdgeInst || Level > RecursionMaxDepth ||
19751 getRdxKind(EdgeInst) != RdxKind ||
19752 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
19753 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
19754 !isVectorizable(RdxKind, EdgeInst) ||
19755 (R.isAnalyzedReductionRoot(EdgeInst) &&
19756 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
19757 PossibleReducedVals.push_back(EdgeVal);
19758 continue;
19759 }
19760 ReductionOps.push_back(EdgeInst);
19761 }
19762 };
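// E.g. for ((x0 + x1) + (x2 + x3)) + y, the inner adds stay on the worklist
// as reduction operations while x0..x3 and y end up in PossibleReducedVals
// (assuming x0..x3 and y are not themselves matching single-use adds).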
19763 // Try to regroup reduced values so that it gets more profitable to try to
19764 // reduce them. Values are grouped by their value ids, instructions - by
19765 // instruction op id and/or alternate op id, plus do extra analysis for
19766 // loads (grouping them by the distance between pointers) and cmp
19767 // instructions (grouping them by the predicate).
19768 SmallMapVector<
19769 size_t, SmallMapVector<size_t, SmallMapVector<Value *, unsigned, 2>, 2>,
19770 8>
19771 PossibleReducedVals;
19772 initReductionOps(Root);
19773 DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
19774 SmallSet<size_t, 2> LoadKeyUsed;
19775
19776 auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
19777 Key = hash_combine(hash_value(LI->getParent()), Key);
19778 Value *Ptr =
19779 getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
19780 if (!LoadKeyUsed.insert(Key).second) {
19781 auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
19782 if (LIt != LoadsMap.end()) {
19783 for (LoadInst *RLI : LIt->second) {
19784 if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
19785 LI->getType(), LI->getPointerOperand(), DL, SE,
19786 /*StrictCheck=*/true))
19787 return hash_value(RLI->getPointerOperand());
19788 }
19789 for (LoadInst *RLI : LIt->second) {
19790 if (arePointersCompatible(RLI->getPointerOperand(),
19791 LI->getPointerOperand(), TLI)) {
19792 hash_code SubKey = hash_value(RLI->getPointerOperand());
19793 return SubKey;
19794 }
19795 }
19796 if (LIt->second.size() > 2) {
19797 hash_code SubKey =
19798 hash_value(LIt->second.back()->getPointerOperand());
19799 return SubKey;
19800 }
19801 }
19802 }
19803 LoadsMap.try_emplace(std::make_pair(Key, Ptr))
19804 .first->second.push_back(LI);
19805 return hash_value(LI->getPointerOperand());
19806 };
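// The intent is that loads which are provably adjacent, or at least share a
// compatible base pointer, hash to the same subkey, so e.g. load %p[0],
// load %p[1], load %p[2] land in one candidate group instead of being
// scattered across buckets.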
19807
19808 while (!Worklist.empty()) {
19809 auto [TreeN, Level] = Worklist.pop_back_val();
19810 SmallVector<Value *> PossibleRedVals;
19811 SmallVector<Instruction *> PossibleReductionOps;
19812 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
19813 addReductionOps(TreeN);
19814 // Add reduction values. The values are sorted for better vectorization
19815 // results.
19816 for (Value *V : PossibleRedVals) {
19817 size_t Key, Idx;
19818 std::tie(Key, Idx) = generateKeySubkey(V, &TLI, GenerateLoadsSubkey,
19819 /*AllowAlternate=*/false);
19820 ++PossibleReducedVals[Key][Idx]
19821 .insert(std::make_pair(V, 0))
19822 .first->second;
19823 }
19824 for (Instruction *I : reverse(PossibleReductionOps))
19825 Worklist.emplace_back(I, I->getParent() == BB ? 0 : Level + 1);
19826 }
19827 auto PossibleReducedValsVect = PossibleReducedVals.takeVector();
19828 // Sort values by the total number of value kinds so that the reduction
19829 // starts from the longest possible sequences of reduced values.
19830 for (auto &PossibleReducedVals : PossibleReducedValsVect) {
19831 auto PossibleRedVals = PossibleReducedVals.second.takeVector();
19832 SmallVector<SmallVector<Value *>> PossibleRedValsVect;
19833 for (auto It = PossibleRedVals.begin(), E = PossibleRedVals.end();
19834 It != E; ++It) {
19835 PossibleRedValsVect.emplace_back();
19836 auto RedValsVect = It->second.takeVector();
19837 stable_sort(RedValsVect, llvm::less_second());
19838 for (const std::pair<Value *, unsigned> &Data : RedValsVect)
19839 PossibleRedValsVect.back().append(Data.second, Data.first);
19840 }
19841 stable_sort(PossibleRedValsVect, [](const auto &P1, const auto &P2) {
19842 return P1.size() > P2.size();
19843 });
19844 int NewIdx = -1;
19845 for (ArrayRef<Value *> Data : PossibleRedValsVect) {
19846 if (NewIdx < 0 ||
19847 (!isGoodForReduction(Data) &&
19848 (!isa<LoadInst>(Data.front()) ||
19849 !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
19850 getUnderlyingObject(
19851 cast<LoadInst>(Data.front())->getPointerOperand()) !=
19852 getUnderlyingObject(
19853 cast<LoadInst>(ReducedVals[NewIdx].front())
19854 ->getPointerOperand())))) {
19855 NewIdx = ReducedVals.size();
19856 ReducedVals.emplace_back();
19857 }
19858 ReducedVals[NewIdx].append(Data.rbegin(), Data.rend());
19859 }
19860 }
19861 // Sort the reduced values by number of same/alternate opcode and/or pointer
19862 // operand.
19863 stable_sort(ReducedVals, [](ArrayRef<Value *> P1, ArrayRef<Value *> P2) {
19864 return P1.size() > P2.size();
19865 });
19866 return true;
19867 }
19868
19869 /// Attempt to vectorize the tree found by matchAssociativeReduction.
19870 Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
19871 const TargetLibraryInfo &TLI, AssumptionCache *AC) {
19872 const unsigned ReductionLimit = VectorizeNonPowerOf2 ? 3 : 4;
19873 constexpr unsigned RegMaxNumber = 4;
19874 constexpr unsigned RedValsMaxNumber = 128;
19875 // If there are a sufficient number of reduction values, reduce
19876 // to a nearby power-of-2. We can safely generate oversized
19877 // vectors and rely on the backend to split them to legal sizes.
19878 if (unsigned NumReducedVals = std::accumulate(
19879 ReducedVals.begin(), ReducedVals.end(), 0,
19880 [](unsigned Num, ArrayRef<Value *> Vals) -> unsigned {
19881 if (!isGoodForReduction(Vals))
19882 return Num;
19883 return Num + Vals.size();
19884 });
19885 NumReducedVals < ReductionLimit &&
19886 all_of(ReducedVals, [](ArrayRef<Value *> RedV) {
19887 return RedV.size() < 2 || !allConstant(RedV) || !isSplat(RedV);
19888 })) {
19889 for (ReductionOpsType &RdxOps : ReductionOps)
19890 for (Value *RdxOp : RdxOps)
19891 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
19892 return nullptr;
19893 }
19894
19895 IRBuilder<TargetFolder> Builder(ReductionRoot->getContext(),
19896 TargetFolder(DL));
19897 Builder.SetInsertPoint(cast<Instruction>(ReductionRoot));
19898
19899 // Track the reduced values in case they are replaced by an extractelement
19900 // because of the vectorization.
19901 DenseMap<Value *, WeakTrackingVH> TrackedVals(ReducedVals.size() *
19902 ReducedVals.front().size());
19903
19904 // The compare instruction of a min/max is the insertion point for new
19905 // instructions and may be replaced with a new compare instruction.
19906 auto &&GetCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
19907 assert(isa<SelectInst>(RdxRootInst) &&
19908 "Expected min/max reduction to have select root instruction");
19909 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
19910 assert(isa<Instruction>(ScalarCond) &&
19911 "Expected min/max reduction to have compare condition");
19912 return cast<Instruction>(ScalarCond);
19913 };
19914
19915 bool AnyBoolLogicOp = any_of(ReductionOps.back(), [](Value *V) {
19916 return isBoolLogicOp(cast<Instruction>(V));
19917 });
19918 // Return new VectorizedTree, based on previous value.
19919 auto GetNewVectorizedTree = [&](Value *VectorizedTree, Value *Res) {
19920 if (VectorizedTree) {
19921 // Update the final value in the reduction.
19922 Builder.SetCurrentDebugLocation(
19923 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
19924 if (AnyBoolLogicOp) {
19925 auto It = ReducedValsToOps.find(VectorizedTree);
19926 auto It1 = ReducedValsToOps.find(Res);
19927 if ((It == ReducedValsToOps.end() && It1 == ReducedValsToOps.end()) ||
19928 isGuaranteedNotToBePoison(VectorizedTree, AC) ||
19929 (It != ReducedValsToOps.end() &&
19930 any_of(It->getSecond(), [&](Instruction *I) {
19931 return isBoolLogicOp(I) &&
19932 getRdxOperand(I, 0) == VectorizedTree;
19933 }))) {
19934 ;
19935 } else if (isGuaranteedNotToBePoison(Res, AC) ||
19936 (It1 != ReducedValsToOps.end() &&
19937 any_of(It1->getSecond(), [&](Instruction *I) {
19938 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
19939 }))) {
19940 std::swap(VectorizedTree, Res);
19941 } else {
19942 VectorizedTree = Builder.CreateFreeze(VectorizedTree);
19943 }
19944 }
19945
19946 return createOp(Builder, RdxKind, VectorizedTree, Res, "op.rdx",
19947 ReductionOps);
19948 }
19949 // Initialize the final value in the reduction.
19950 return Res;
19951 };
19952 SmallDenseSet<Value *> IgnoreList(ReductionOps.size() *
19953 ReductionOps.front().size());
19954 for (ReductionOpsType &RdxOps : ReductionOps)
19955 for (Value *RdxOp : RdxOps) {
19956 if (!RdxOp)
19957 continue;
19958 IgnoreList.insert(RdxOp);
19959 }
19960 // Intersect the fast-math-flags from all reduction operations.
19961 FastMathFlags RdxFMF;
19962 RdxFMF.set();
19963 for (Value *U : IgnoreList)
19964 if (auto *FPMO = dyn_cast<FPMathOperator>(U))
19965 RdxFMF &= FPMO->getFastMathFlags();
19966 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
19967
19968 // Need to track reduced vals, they may be changed during vectorization of
19969 // subvectors.
19970 for (ArrayRef<Value *> Candidates : ReducedVals)
19971 for (Value *V : Candidates)
19972 TrackedVals.try_emplace(V, V);
19973
19974 auto At = [](SmallMapVector<Value *, unsigned, 16> &MV,
19975 Value *V) -> unsigned & {
19976 auto *It = MV.find(V);
19977 assert(It != MV.end() && "Unable to find given key.");
19978 return It->second;
19979 };
19980
19981 DenseMap<Value *, unsigned> VectorizedVals(ReducedVals.size());
19982 // List of the values that were reduced in other trees as part of gather
19983 // nodes and thus requiring extract if fully vectorized in other trees.
19984 SmallPtrSet<Value *, 4> RequiredExtract;
19985 WeakTrackingVH VectorizedTree = nullptr;
19986 bool CheckForReusedReductionOps = false;
19987 // Try to vectorize elements based on their type.
19988 SmallVector<InstructionsState> States;
19989 for (ArrayRef<Value *> RV : ReducedVals)
19990 States.push_back(getSameOpcode(RV, TLI));
19991 for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
19992 ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
19993 InstructionsState S = States[I];
19994 SmallVector<Value *> Candidates;
19995 Candidates.reserve(2 * OrigReducedVals.size());
19996 DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
19997 for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
19998 Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
19999 // Check if the reduction value was not overridden by the extractelement
20000 // instruction because of the vectorization and exclude it, if it is not
20001 // compatible with other values.
20002 // Also check if the instruction was folded to constant/other value.
20003 auto *Inst = dyn_cast<Instruction>(RdxVal);
20004 if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
20005 (!S || !S.isOpcodeOrAlt(Inst))) ||
20006 (S && !Inst))
20007 continue;
20008 Candidates.push_back(RdxVal);
20009 TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
20010 }
20011 bool ShuffledExtracts = false;
20012 // Try to handle shuffled extractelements.
20013 if (S && S.getOpcode() == Instruction::ExtractElement &&
20014 !S.isAltShuffle() && I + 1 < E) {
20015 SmallVector<Value *> CommonCandidates(Candidates);
20016 for (Value *RV : ReducedVals[I + 1]) {
20017 Value *RdxVal = TrackedVals.at(RV);
20018 // Check if the reduction value was not overridden by the
20019 // extractelement instruction because of the vectorization and
20020 // exclude it, if it is not compatible with other values.
20021 auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
20022 if (!Inst)
20023 continue;
20024 CommonCandidates.push_back(RdxVal);
20025 TrackedToOrig.try_emplace(RdxVal, RV);
20026 }
20027 SmallVector<int> Mask;
20028 if (isFixedVectorShuffle(CommonCandidates, Mask, AC)) {
20029 ++I;
20030 Candidates.swap(CommonCandidates);
20031 ShuffledExtracts = true;
20032 }
20033 }
20034
20035 // Emit code for constant values.
20036 if (Candidates.size() > 1 && allConstant(Candidates)) {
20037 Value *Res = Candidates.front();
20038 Value *OrigV = TrackedToOrig.at(Candidates.front());
20039 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20040 for (Value *VC : ArrayRef(Candidates).drop_front()) {
20041 Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
20042 Value *OrigV = TrackedToOrig.at(VC);
20043 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20044 if (auto *ResI = dyn_cast<Instruction>(Res))
20045 V.analyzedReductionRoot(ResI);
20046 }
20047 VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
20048 continue;
20049 }
20050
20051 unsigned NumReducedVals = Candidates.size();
20052 if (NumReducedVals < ReductionLimit &&
20053 (NumReducedVals < 2 || !isSplat(Candidates)))
20054 continue;
20055
20056 // Check if we support repeated scalar values processing (optimization of
20057 // original scalar identity operations on matched horizontal reductions).
20058 IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
20059 RdxKind != RecurKind::FMul &&
20060 RdxKind != RecurKind::FMulAdd;
20061 // Gather same values.
20062 SmallMapVector<Value *, unsigned, 16> SameValuesCounter;
20063 if (IsSupportedHorRdxIdentityOp)
20064 for (Value *V : Candidates) {
20065 Value *OrigV = TrackedToOrig.at(V);
20066 ++SameValuesCounter.try_emplace(OrigV).first->second;
20067 }
20068 // Used to check if the reduced values are used the same number of times. In
20069 // this case the compiler may produce better code. E.g. if reduced values are
20070 // aabbccdd (8 x values), then the first node of the tree will have a node
20071 // for 4 x abcd + shuffle <4 x abcd>, <0, 0, 1, 1, 2, 2, 3, 3>.
20072 // Plus, the final reduction will be performed on <8 x aabbccdd>.
20073 // Instead compiler may build <4 x abcd> tree immediately, + reduction (4
20074 // x abcd) * 2.
20075 // Currently it only handles add/fadd/xor. and/or/min/max do not require
20076 // this analysis, other operations may require an extra estimation of
20077 // the profitability.
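// Concretely, for an add reduction of a+a+b+b+c+c+d+d the counters are
// {a:2, b:2, c:2, d:2}: every scalar is reused the same number of times, so
// it is enough to reduce <4 x {a,b,c,d}> once and multiply the result by 2
// (see emitScaleForReusedOps below).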
20078 bool SameScaleFactor = false;
20079 bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
20080 SameValuesCounter.size() != Candidates.size();
20081 BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
20082 if (OptReusedScalars) {
20083 SameScaleFactor =
20084 (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
20085 RdxKind == RecurKind::Xor) &&
20086 all_of(drop_begin(SameValuesCounter),
20087 [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
20088 return P.second == SameValuesCounter.front().second;
20089 });
20090 Candidates.resize(SameValuesCounter.size());
20091 transform(SameValuesCounter, Candidates.begin(),
20092 [&](const auto &P) { return TrackedVals.at(P.first); });
20093 NumReducedVals = Candidates.size();
20094 // Have a reduction of the same element.
20095 if (NumReducedVals == 1) {
20096 Value *OrigV = TrackedToOrig.at(Candidates.front());
20097 unsigned Cnt = At(SameValuesCounter, OrigV);
20098 Value *RedVal =
20099 emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
20100 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20101 VectorizedVals.try_emplace(OrigV, Cnt);
20102 ExternallyUsedValues.insert(OrigV);
20103 continue;
20104 }
20105 }
20106
20107 unsigned MaxVecRegSize = V.getMaxVecRegSize();
20108 unsigned EltSize = V.getVectorElementSize(Candidates[0]);
20109 const unsigned MaxElts = std::clamp<unsigned>(
20110 llvm::bit_floor(MaxVecRegSize / EltSize), RedValsMaxNumber,
20111 RegMaxNumber * RedValsMaxNumber);
20112
20113 unsigned ReduxWidth = NumReducedVals;
20114 auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
20115 unsigned NumParts, NumRegs;
20116 Type *ScalarTy = Candidates.front()->getType();
20117 ReduxWidth =
20118 getFloorFullVectorNumberOfElements(TTI, ScalarTy, ReduxWidth);
20119 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20120 NumParts = TTI.getNumberOfParts(Tp);
20121 NumRegs =
20122 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
20123 while (NumParts > NumRegs) {
20124 assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
20125 ReduxWidth = bit_floor(ReduxWidth - 1);
20126 VectorType *Tp = getWidenedType(ScalarTy, ReduxWidth);
20127 NumParts = TTI.getNumberOfParts(Tp);
20128 NumRegs =
20129 TTI.getNumberOfRegisters(TTI.getRegisterClassForType(true, Tp));
20130 }
20131 if (NumParts > NumRegs / 2)
20132 ReduxWidth = bit_floor(ReduxWidth);
20133 return ReduxWidth;
20134 };
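// In other words: round ReduxWidth down to a "full" vector element count,
// then, if the widened type would be legalized into more pieces than the
// target has vector registers of that class (e.g. asking for a very wide
// reduction on a machine with only a handful of 128-bit registers), step the
// width down to a smaller power of two until it fits.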
20135 if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
20136 ReduxWidth = GetVectorFactor(ReduxWidth);
20137 ReduxWidth = std::min(ReduxWidth, MaxElts);
20138
20139 unsigned Start = 0;
20140 unsigned Pos = Start;
20141 // Restarts vectorization attempt with lower vector factor.
20142 unsigned PrevReduxWidth = ReduxWidth;
20143 bool CheckForReusedReductionOpsLocal = false;
20144 auto AdjustReducedVals = [&](bool IgnoreVL = false) {
20145 bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
20146 if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
20147 // Check if any of the reduction ops are gathered. If so, it is worth
20148 // trying again with a smaller number of reduction ops.
20149 CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
20150 }
20151 ++Pos;
20152 if (Pos < NumReducedVals - ReduxWidth + 1)
20153 return IsAnyRedOpGathered;
20154 Pos = Start;
20155 --ReduxWidth;
20156 if (ReduxWidth > 1)
20157 ReduxWidth = GetVectorFactor(ReduxWidth);
20158 return IsAnyRedOpGathered;
20159 };
20160 bool AnyVectorized = false;
20161 SmallDenseSet<std::pair<unsigned, unsigned>, 8> IgnoredCandidates;
20162 while (Pos < NumReducedVals - ReduxWidth + 1 &&
20163 ReduxWidth >= ReductionLimit) {
20164 // Dependency in tree of the reduction ops - drop this attempt, try
20165 // later.
20166 if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
20167 Start == 0) {
20168 CheckForReusedReductionOps = true;
20169 break;
20170 }
20171 PrevReduxWidth = ReduxWidth;
20172 ArrayRef<Value *> VL(std::next(Candidates.begin(), Pos), ReduxWidth);
20173 // Been analyzed already - skip.
20174 if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
20175 (!has_single_bit(ReduxWidth) &&
20176 (IgnoredCandidates.contains(
20177 std::make_pair(Pos, bit_floor(ReduxWidth))) ||
20178 IgnoredCandidates.contains(
20179 std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
20180 bit_floor(ReduxWidth))))) ||
20181 V.areAnalyzedReductionVals(VL)) {
20182 (void)AdjustReducedVals(/*IgnoreVL=*/true);
20183 continue;
20184 }
20185 // Early exit if any of the reduction values were deleted during
20186 // previous vectorization attempts.
20187 if (any_of(VL, [&V](Value *RedVal) {
20188 auto *RedValI = dyn_cast<Instruction>(RedVal);
20189 if (!RedValI)
20190 return false;
20191 return V.isDeleted(RedValI);
20192 }))
20193 break;
20194 V.buildTree(VL, IgnoreList);
20195 if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
20196 if (!AdjustReducedVals())
20197 V.analyzedReductionVals(VL);
20198 continue;
20199 }
20200 if (V.isLoadCombineReductionCandidate(RdxKind)) {
20201 if (!AdjustReducedVals())
20202 V.analyzedReductionVals(VL);
20203 continue;
20204 }
20205 V.reorderTopToBottom();
20206 // No need to reorder the root node at all.
20207 V.reorderBottomToTop(/*IgnoreReorder=*/true);
20208 // Keep extracted other reduction values, if they are used in the
20209 // vectorization trees.
20210 BoUpSLP::ExtraValueToDebugLocsMap LocalExternallyUsedValues(
20211 ExternallyUsedValues);
20212 // The reduction root is used as the insertion point for new
20213 // instructions, so set it as externally used to prevent it from being
20214 // deleted.
20215 LocalExternallyUsedValues.insert(ReductionRoot);
20216 for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
20217 if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
20218 continue;
20219 for (Value *V : ReducedVals[Cnt])
20220 if (isa<Instruction>(V))
20221 LocalExternallyUsedValues.insert(TrackedVals[V]);
20222 }
20223 if (!IsSupportedHorRdxIdentityOp) {
20224 // Number of uses of the candidates in the vector of values.
20225 assert(SameValuesCounter.empty() &&
20226 "Reused values counter map is not empty");
20227 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20228 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20229 continue;
20230 Value *V = Candidates[Cnt];
20231 Value *OrigV = TrackedToOrig.at(V);
20232 ++SameValuesCounter.try_emplace(OrigV).first->second;
20233 }
20234 }
20235 V.transformNodes();
20236 SmallPtrSet<Value *, 4> VLScalars(VL.begin(), VL.end());
20237 // Gather externally used values.
20238 SmallPtrSet<Value *, 4> Visited;
20239 for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20240 if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20241 continue;
20242 Value *RdxVal = Candidates[Cnt];
20243 if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
20244 RdxVal = It->second;
20245 if (!Visited.insert(RdxVal).second)
20246 continue;
20247 // Check if the scalar was vectorized as part of the vectorization
20248 // tree but not the top node.
20249 if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
20250 LocalExternallyUsedValues.insert(RdxVal);
20251 continue;
20252 }
20253 Value *OrigV = TrackedToOrig.at(RdxVal);
20254 unsigned NumOps =
20255 VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
20256 if (NumOps != ReducedValsToOps.at(OrigV).size())
20257 LocalExternallyUsedValues.insert(RdxVal);
20258 }
20259 // Do not need the list of reused scalars in regular mode anymore.
20260 if (!IsSupportedHorRdxIdentityOp)
20261 SameValuesCounter.clear();
20262 for (Value *RdxVal : VL)
20263 if (RequiredExtract.contains(RdxVal))
20264 LocalExternallyUsedValues.insert(RdxVal);
20265 V.buildExternalUses(LocalExternallyUsedValues);
20266
20267 V.computeMinimumValueSizes();
20268
20269 // Estimate cost.
20270 InstructionCost TreeCost = V.getTreeCost(VL);
20271 InstructionCost ReductionCost =
20272 getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
20273 InstructionCost Cost = TreeCost + ReductionCost;
20274 LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
20275 << " for reduction\n");
20276 if (!Cost.isValid())
20277 break;
20278 if (Cost >= -SLPCostThreshold) {
20279 V.getORE()->emit([&]() {
20280 return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
20281 ReducedValsToOps.at(VL[0]).front())
20282 << "Vectorizing horizontal reduction is possible "
20283 << "but not beneficial with cost " << ore::NV("Cost", Cost)
20284 << " and threshold "
20285 << ore::NV("Threshold", -SLPCostThreshold);
20286 });
20287 if (!AdjustReducedVals()) {
20288 V.analyzedReductionVals(VL);
20289 unsigned Offset = Pos == Start ? Pos : Pos - 1;
20290 if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
20291 // Add subvectors of VL to the list of the analyzed values.
20292 for (unsigned VF = getFloorFullVectorNumberOfElements(
20293 *TTI, VL.front()->getType(), ReduxWidth - 1);
20294 VF >= ReductionLimit;
20295 VF = getFloorFullVectorNumberOfElements(
20296 *TTI, VL.front()->getType(), VF - 1)) {
20297 if (has_single_bit(VF) &&
20298 V.getCanonicalGraphSize() != V.getTreeSize())
20299 continue;
20300 for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
20301 IgnoredCandidates.insert(std::make_pair(Offset + Idx, VF));
20302 }
20303 }
20304 }
20305 continue;
20306 }
20307
20308 LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
20309 << Cost << ". (HorRdx)\n");
20310 V.getORE()->emit([&]() {
20311 return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
20312 ReducedValsToOps.at(VL[0]).front())
20313 << "Vectorized horizontal reduction with cost "
20314 << ore::NV("Cost", Cost) << " and with tree size "
20315 << ore::NV("TreeSize", V.getTreeSize());
20316 });
20317
20318 Builder.setFastMathFlags(RdxFMF);
20319
20320 // Emit a reduction. If the root is a select (min/max idiom), the insert
20321 // point is the compare condition of that select.
20322 Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
20323 Instruction *InsertPt = RdxRootInst;
20324 if (IsCmpSelMinMax)
20325 InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
20326
20327 // Vectorize a tree.
20328 Value *VectorizedRoot =
20329 V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
20330 // Update TrackedToOrig mapping, since the tracked values might be
20331 // updated.
20332 for (Value *RdxVal : Candidates) {
20333 Value *OrigVal = TrackedToOrig.at(RdxVal);
20334 Value *TransformedRdxVal = TrackedVals.at(OrigVal);
20335 if (TransformedRdxVal != RdxVal)
20336 TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
20337 }
20338
20339 Builder.SetInsertPoint(InsertPt);
20340
20341 // To prevent poison from leaking across what used to be sequential,
20342 // safe, scalar boolean logic operations, the reduction operand must be
20343 // frozen.
20344 if (AnyBoolLogicOp && !isGuaranteedNotToBePoison(VectorizedRoot, AC))
20345 VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
20346
20347 // Emit code to correctly handle reused reduced values, if required.
20348 if (OptReusedScalars && !SameScaleFactor) {
20349 VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
20350 SameValuesCounter, TrackedToOrig);
20351 }
20352
20353 Value *ReducedSubTree;
20354 Type *ScalarTy = VL.front()->getType();
20355 if (isa<FixedVectorType>(ScalarTy)) {
20356 assert(SLPReVec && "FixedVectorType is not expected.");
20357 unsigned ScalarTyNumElements = getNumElements(ScalarTy);
20358 ReducedSubTree = PoisonValue::get(FixedVectorType::get(
20359 VectorizedRoot->getType()->getScalarType(), ScalarTyNumElements));
20360 for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
20361 // Do reduction for each lane.
20362 // e.g., do reduce add for
20363 // VL[0] = <4 x Ty> <a, b, c, d>
20364 // VL[1] = <4 x Ty> <e, f, g, h>
20365 // Lane[0] = <2 x Ty> <a, e>
20366 // Lane[1] = <2 x Ty> <b, f>
20367 // Lane[2] = <2 x Ty> <c, g>
20368 // Lane[3] = <2 x Ty> <d, h>
20369 // result[0] = reduce add Lane[0]
20370 // result[1] = reduce add Lane[1]
20371 // result[2] = reduce add Lane[2]
20372 // result[3] = reduce add Lane[3]
20373 SmallVector<int, 16> Mask =
20374 createStrideMask(I, ScalarTyNumElements, VL.size());
20375 Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
20376 ReducedSubTree = Builder.CreateInsertElement(
20377 ReducedSubTree,
20378 emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
20379 }
20380 } else {
20381 ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
20382 RdxRootInst->getType());
20383 }
20384 if (ReducedSubTree->getType() != VL.front()->getType()) {
20385 assert(ReducedSubTree->getType() != VL.front()->getType() &&
20386 "Expected different reduction type.");
20387 ReducedSubTree =
20388 Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
20389 V.isSignedMinBitwidthRootNode());
20390 }
20391
20392 // Improved analysis for add/fadd/xor reductions with same scale factor
20393 // for all operands of reductions. We can emit scalar ops for them
20394 // instead.
20395 if (OptReusedScalars && SameScaleFactor)
20396 ReducedSubTree = emitScaleForReusedOps(
20397 ReducedSubTree, Builder, SameValuesCounter.front().second);
20398
20399 VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
20400 // Count vectorized reduced values to exclude them from final reduction.
20401 for (Value *RdxVal : VL) {
20402 Value *OrigV = TrackedToOrig.at(RdxVal);
20403 if (IsSupportedHorRdxIdentityOp) {
20404 VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
20405 continue;
20406 }
20407 ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20408 if (!V.isVectorized(RdxVal))
20409 RequiredExtract.insert(RdxVal);
20410 }
20411 Pos += ReduxWidth;
20412 Start = Pos;
20413 ReduxWidth = NumReducedVals - Pos;
20414 if (ReduxWidth > 1)
20415 ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
20416 AnyVectorized = true;
20417 }
20418 if (OptReusedScalars && !AnyVectorized) {
20419 for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
20420 Value *RdxVal = TrackedVals.at(P.first);
20421 Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
20422 VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20423 VectorizedVals.try_emplace(P.first, P.second);
20424 }
20425 continue;
20426 }
20427 }
20428 if (VectorizedTree) {
20429 // Reorder operands of bool logical op in the natural order to avoid
20430 // possible problem with poison propagation. If not possible to reorder
20431 // (both operands are originally RHS), emit an extra freeze instruction
20432 // for the LHS operand.
20433 // I.e., if we have original code like this:
20434 // RedOp1 = select i1 ?, i1 LHS, i1 false
20435 // RedOp2 = select i1 RHS, i1 ?, i1 false
20436
20437 // Then, we swap LHS/RHS to create a new op that matches the poison
20438 // semantics of the original code.
20439
20440 // If we have original code like this and both values could be poison:
20441 // RedOp1 = select i1 ?, i1 LHS, i1 false
20442 // RedOp2 = select i1 ?, i1 RHS, i1 false
20443
20444 // Then, we must freeze LHS in the new op.
20445 auto FixBoolLogicalOps = [&, VectorizedTree](Value *&LHS, Value *&RHS,
20446 Instruction *RedOp1,
20447 Instruction *RedOp2,
20448 bool InitStep) {
20449 if (!AnyBoolLogicOp)
20450 return;
20451 if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
20452 getRdxOperand(RedOp1, 0) == LHS ||
20453 isGuaranteedNotToBePoison(LHS, AC)))
20454 return;
20455 if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
20456 getRdxOperand(RedOp2, 0) == RHS ||
20457 isGuaranteedNotToBePoison(RHS, AC))) {
20458 std::swap(LHS, RHS);
20459 return;
20460 }
20461 if (LHS != VectorizedTree)
20462 LHS = Builder.CreateFreeze(LHS);
20463 };
20464 // Finish the reduction.
20465 // Need to add the extra arguments and the possible reduction values that
20466 // were not vectorized.
20467 // Try to avoid dependencies between the scalar remainders after
20468 // reductions.
20469 auto FinalGen =
20470 [&](ArrayRef<std::pair<Instruction *, Value *>> InstVals,
20471 bool InitStep) {
20472 unsigned Sz = InstVals.size();
20473 SmallVector<std::pair<Instruction *, Value *>> ExtraReds(Sz / 2 +
20474 Sz % 2);
20475 for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
20476 Instruction *RedOp = InstVals[I + 1].first;
20477 Builder.SetCurrentDebugLocation(RedOp->getDebugLoc());
20478 Value *RdxVal1 = InstVals[I].second;
20479 Value *StableRdxVal1 = RdxVal1;
20480 auto It1 = TrackedVals.find(RdxVal1);
20481 if (It1 != TrackedVals.end())
20482 StableRdxVal1 = It1->second;
20483 Value *RdxVal2 = InstVals[I + 1].second;
20484 Value *StableRdxVal2 = RdxVal2;
20485 auto It2 = TrackedVals.find(RdxVal2);
20486 if (It2 != TrackedVals.end())
20487 StableRdxVal2 = It2->second;
20488 // To prevent poison from leaking across what used to be
20489 // sequential, safe, scalar boolean logic operations, the
20490 // reduction operand must be frozen.
20491 FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
20492 RedOp, InitStep);
20493 Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
20494 StableRdxVal2, "op.rdx", ReductionOps);
20495 ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
20496 }
20497 if (Sz % 2 == 1)
20498 ExtraReds[Sz / 2] = InstVals.back();
20499 return ExtraReds;
20500 };
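// E.g. FinalGen({(r0, v0), (r1, v1), (r2, v2)}, ...) yields
// {(r0, v0 op v1), (r2, v2)}; repeated application below shrinks the list
// until a single value, the final reduction result, remains.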
20501 SmallVector<std::pair<Instruction *, Value *>> ExtraReductions;
20502 ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
20503 VectorizedTree);
20504 SmallPtrSet<Value *, 8> Visited;
20505 for (ArrayRef<Value *> Candidates : ReducedVals) {
20506 for (Value *RdxVal : Candidates) {
20507 if (!Visited.insert(RdxVal).second)
20508 continue;
20509 unsigned NumOps = VectorizedVals.lookup(RdxVal);
20510 for (Instruction *RedOp :
20511 ArrayRef(ReducedValsToOps.at(RdxVal)).drop_back(NumOps))
20512 ExtraReductions.emplace_back(RedOp, RdxVal);
20513 }
20514 }
20515 // Iterate through all not-vectorized reduction values/extra arguments.
20516 bool InitStep = true;
20517 while (ExtraReductions.size() > 1) {
20518 SmallVector<std::pair<Instruction *, Value *>> NewReds =
20519 FinalGen(ExtraReductions, InitStep);
20520 ExtraReductions.swap(NewReds);
20521 InitStep = false;
20522 }
20523 VectorizedTree = ExtraReductions.front().second;
20524
20525 ReductionRoot->replaceAllUsesWith(VectorizedTree);
20526
20527 // The original scalar reduction is expected to have no remaining
20528 // uses outside the reduction tree itself. Assert that we got this
20529 // correct, replace internal uses with undef, and mark for eventual
20530 // deletion.
20531#ifndef NDEBUG
20532 SmallSet<Value *, 4> IgnoreSet;
20533 for (ArrayRef<Value *> RdxOps : ReductionOps)
20534 IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
20535#endif
20536 for (ArrayRef<Value *> RdxOps : ReductionOps) {
20537 for (Value *Ignore : RdxOps) {
20538 if (!Ignore)
20539 continue;
20540#ifndef NDEBUG
20541 for (auto *U : Ignore->users()) {
20542 assert(IgnoreSet.count(U) &&
20543 "All users must be either in the reduction ops list.");
20544 }
20545#endif
20546 if (!Ignore->use_empty()) {
20547 Value *P = PoisonValue::get(Ignore->getType());
20548 Ignore->replaceAllUsesWith(P);
20549 }
20550 }
20551 V.removeInstructionsAndOperands(RdxOps);
20552 }
20553 } else if (!CheckForReusedReductionOps) {
20554 for (ReductionOpsType &RdxOps : ReductionOps)
20555 for (Value *RdxOp : RdxOps)
20556 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
20557 }
20558 return VectorizedTree;
20559 }
20560
20561private:
20562 /// Calculate the cost of a reduction.
20563 InstructionCost getReductionCost(TargetTransformInfo *TTI,
20564 ArrayRef<Value *> ReducedVals,
20565 bool IsCmpSelMinMax, FastMathFlags FMF,
20566 const BoUpSLP &R) {
20567 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
20568 Type *ScalarTy = ReducedVals.front()->getType();
20569 unsigned ReduxWidth = ReducedVals.size();
20570 FixedVectorType *VectorTy = R.getReductionType();
20571 InstructionCost VectorCost = 0, ScalarCost;
20572 // If all of the reduced values are constant, the vector cost is 0, since
20573 // the reduction value can be calculated at the compile time.
20574 bool AllConsts = allConstant(ReducedVals);
20575 auto EvaluateScalarCost = [&](function_ref<InstructionCost()> GenCostFn) {
20576 InstructionCost Cost = 0;
20577 // Scalar cost is repeated for N-1 elements.
20578 int Cnt = ReducedVals.size();
20579 for (Value *RdxVal : ReducedVals) {
20580 if (Cnt == 1)
20581 break;
20582 --Cnt;
20583 if (RdxVal->hasNUsesOrMore(IsCmpSelMinMax ? 3 : 2)) {
20584 Cost += GenCostFn();
20585 continue;
20586 }
20587 InstructionCost ScalarCost = 0;
20588 for (User *U : RdxVal->users()) {
20589 auto *RdxOp = cast<Instruction>(U);
20590 if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
20591 ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
20592 continue;
20593 }
20594 ScalarCost = InstructionCost::getInvalid();
20595 break;
20596 }
20597 if (ScalarCost.isValid())
20598 Cost += ScalarCost;
20599 else
20600 Cost += GenCostFn();
20601 }
20602 return Cost;
20603 };
20604 switch (RdxKind) {
20605 case RecurKind::Add:
20606 case RecurKind::Mul:
20607 case RecurKind::Or:
20608 case RecurKind::And:
20609 case RecurKind::Xor:
20610 case RecurKind::FAdd:
20611 case RecurKind::FMul: {
20612 unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
20613 if (!AllConsts) {
20614 if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20615 assert(SLPReVec && "FixedVectorType is not expected.");
20616 unsigned ScalarTyNumElements = VecTy->getNumElements();
20617 for (unsigned I : seq<unsigned>(ReducedVals.size())) {
20618 VectorCost += TTI->getShuffleCost(
20619 TTI::SK_PermuteSingleSrc, VectorTy,
20620 createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
20621 VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy, FMF,
20622 CostKind);
20623 }
20624 VectorCost += TTI->getScalarizationOverhead(
20625 VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
20626 /*Extract*/ false, TTI::TCK_RecipThroughput);
20627 } else {
20628 Type *RedTy = VectorTy->getElementType();
20629 auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
20630 std::make_pair(RedTy, true));
20631 if (RType == RedTy) {
20632 VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
20633 FMF, CostKind);
20634 } else {
20635 VectorCost = TTI->getExtendedReductionCost(
20636 RdxOpcode, !IsSigned, RedTy, getWidenedType(RType, ReduxWidth),
20637 FMF, CostKind);
20638 }
20639 }
20640 }
20641 ScalarCost = EvaluateScalarCost([&]() {
20642 return TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
20643 });
20644 break;
20645 }
20646 case RecurKind::FMax:
20647 case RecurKind::FMin:
20648 case RecurKind::FMaximum:
20649 case RecurKind::FMinimum:
20650 case RecurKind::SMax:
20651 case RecurKind::SMin:
20652 case RecurKind::UMax:
20653 case RecurKind::UMin: {
20654 Intrinsic::ID Id = getMinMaxReductionIntrinsicOp(RdxKind);
20655 if (!AllConsts)
20656 VectorCost = TTI->getMinMaxReductionCost(Id, VectorTy, FMF, CostKind);
20657 ScalarCost = EvaluateScalarCost([&]() {
20658 IntrinsicCostAttributes ICA(Id, ScalarTy, {ScalarTy, ScalarTy}, FMF);
20659 return TTI->getIntrinsicInstrCost(ICA, CostKind);
20660 });
20661 break;
20662 }
20663 default:
20664 llvm_unreachable("Expected arithmetic or min/max reduction operation");
20665 }
20666
20667 LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
20668 << " for reduction of " << shortBundleName(ReducedVals)
20669 << " (It is a splitting reduction)\n");
20670 return VectorCost - ScalarCost;
20671 }
20672
20673 /// Emit a horizontal reduction of the vectorized value.
20674 Value *emitReduction(Value *VectorizedValue, IRBuilderBase &Builder,
20675 const TargetTransformInfo *TTI, Type *DestTy) {
20676 assert(VectorizedValue && "Need to have a vectorized tree node");
20677 assert(RdxKind != RecurKind::FMulAdd &&
20678 "A call to the llvm.fmuladd intrinsic is not handled yet");
20679
20680 auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
20681 if (FTy->getScalarType() == Builder.getInt1Ty() &&
20682 RdxKind == RecurKind::Add &&
20683 DestTy->getScalarType() != FTy->getScalarType()) {
20684 // Convert vector_reduce_add(ZExt(<n x i1>)) to
20685 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
20686 Value *V = Builder.CreateBitCast(
20687 VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
20688 ++NumVectorInstructions;
20689 return Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, V);
20690 }
20691 ++NumVectorInstructions;
20692 return createSimpleReduction(Builder, VectorizedValue, RdxKind);
20693 }
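  // For instance, reducing <8 x i1> with RecurKind::Add becomes (roughly) a
  // popcount: ctpop(bitcast <8 x i1> to i8), which the caller later widens or
  // truncates to the destination scalar type.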
20694
20695 /// Emits optimized code for unique scalar value reused \p Cnt times.
20696 Value *emitScaleForReusedOps(Value *VectorizedValue, IRBuilderBase &Builder,
20697 unsigned Cnt) {
20698 assert(IsSupportedHorRdxIdentityOp &&
20699 "The optimization of matched scalar identity horizontal reductions "
20700 "must be supported.");
20701 if (Cnt == 1)
20702 return VectorizedValue;
20703 switch (RdxKind) {
20704 case RecurKind::Add: {
20705 // res = mul vv, n
20706 Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
20707 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Cnt << "of "
20708 << VectorizedValue << ". (HorRdx)\n");
20709 return Builder.CreateMul(VectorizedValue, Scale);
20710 }
20711 case RecurKind::Xor: {
20712 // res = n % 2 ? 0 : vv
20713 LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << "of " << VectorizedValue
20714 << ". (HorRdx)\n");
20715 if (Cnt % 2 == 0)
20716 return Constant::getNullValue(VectorizedValue->getType());
20717 return VectorizedValue;
20718 }
20719 case RecurKind::FAdd: {
20720 // res = fmul v, n
20721 Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
20722 LLVM_DEBUG(dbgs() << "SLP: FAdd (to-fmul) " << Cnt << "of "
20723 << VectorizedValue << ". (HorRdx)\n");
20724 return Builder.CreateFMul(VectorizedValue, Scale);
20725 }
20726 case RecurKind::And:
20727 case RecurKind::Or:
20728 case RecurKind::SMax:
20729 case RecurKind::SMin:
20730 case RecurKind::UMax:
20731 case RecurKind::UMin:
20732 case RecurKind::FMax:
20733 case RecurKind::FMin:
20734 case RecurKind::FMaximum:
20735 case RecurKind::FMinimum:
20736 // res = vv
20737 return VectorizedValue;
20738 case RecurKind::Mul:
20739 case RecurKind::FMul:
20740 case RecurKind::FMulAdd:
20741 case RecurKind::IAnyOf:
20742 case RecurKind::FAnyOf:
20743 case RecurKind::IFindLastIV:
20744 case RecurKind::FFindLastIV:
20745 case RecurKind::None:
20746 llvm_unreachable("Unexpected reduction kind for repeated scalar.");
20747 }
20748 return nullptr;
20749 }
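  // For example, with RdxKind == Add and Cnt == 3, emitScaleForReusedOps turns
  // v+v+v into mul v, 3; with RdxKind == Xor and Cnt == 2 the pairs cancel and
  // the result is 0, while min/max/and/or of a repeated value collapse to the
  // value itself.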
20750
20751 /// Emits actual operation for the scalar identity values, found during
20752 /// horizontal reduction analysis.
20753 Value *
20754 emitReusedOps(Value *VectorizedValue, IRBuilderBase &Builder, BoUpSLP &R,
20755 const SmallMapVector<Value *, unsigned, 16> &SameValuesCounter,
20756 const DenseMap<Value *, Value *> &TrackedToOrig) {
20757 assert(IsSupportedHorRdxIdentityOp &&
20758 "The optimization of matched scalar identity horizontal reductions "
20759 "must be supported.");
20760 ArrayRef<Value *> VL = R.getRootNodeScalars();
20761 auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
20762 if (VTy->getElementType() != VL.front()->getType()) {
20763 VectorizedValue = Builder.CreateIntCast(
20764 VectorizedValue,
20765 getWidenedType(VL.front()->getType(), VTy->getNumElements()),
20766 R.isSignedMinBitwidthRootNode());
20767 }
20768 switch (RdxKind) {
20769 case RecurKind::Add: {
20770 // root = mul prev_root, <1, 1, n, 1>
20771 SmallVector<Constant *> Vals;
20772 for (Value *V : VL) {
20773 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20774 Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
20775 }
20776 auto *Scale = ConstantVector::get(Vals);
20777 LLVM_DEBUG(dbgs() << "SLP: Add (to-mul) " << Scale << "of "
20778 << VectorizedValue << ". (HorRdx)\n");
20779 return Builder.CreateMul(VectorizedValue, Scale);
20780 }
20781 case RecurKind::And:
20782 case RecurKind::Or:
20783 // No need for multiple or/and(s).
20784 LLVM_DEBUG(dbgs() << "SLP: And/or of same " << VectorizedValue
20785 << ". (HorRdx)\n");
20786 return VectorizedValue;
20787 case RecurKind::SMax:
20788 case RecurKind::SMin:
20789 case RecurKind::UMax:
20790 case RecurKind::UMin:
20791 case RecurKind::FMax:
20792 case RecurKind::FMin:
20793 case RecurKind::FMaximum:
20794 case RecurKind::FMinimum:
20795 // No need for multiple min/max(s) of the same value.
20796 LLVM_DEBUG(dbgs() << "SLP: Max/min of same " << VectorizedValue
20797 << ". (HorRdx)\n");
20798 return VectorizedValue;
20799 case RecurKind::Xor: {
20800 // Replace values with even number of repeats with 0, since
20801 // x xor x = 0.
20802 // root = shuffle prev_root, zeroinitializer, <0, 1, 2, vf, 4, vf, 5, 6,
20803 // 7>, if the 4th and 6th elements have an even number of repeats.
20804 SmallVector<int> Mask(
20805 cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
20806 PoisonMaskElem);
20807 std::iota(Mask.begin(), Mask.end(), 0);
20808 bool NeedShuffle = false;
20809 for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
20810 Value *V = VL[I];
20811 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20812 if (Cnt % 2 == 0) {
20813 Mask[I] = VF;
20814 NeedShuffle = true;
20815 }
20816 }
20817 LLVM_DEBUG(dbgs() << "SLP: Xor <"; for (int I
20818 : Mask) dbgs()
20819 << I << " ";
20820 dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
20821 if (NeedShuffle)
20822 VectorizedValue = Builder.CreateShuffleVector(
20823 VectorizedValue,
20824 ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
20825 return VectorizedValue;
20826 }
20827 case RecurKind::FAdd: {
20828 // root = fmul prev_root, <1.0, 1.0, n.0, 1.0>
20829 SmallVector<Constant *> Vals;
20830 for (Value *V : VL) {
20831 unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20832 Vals.push_back(ConstantFP::get(V->getType(), Cnt));
20833 }
20834 auto *Scale = ConstantVector::get(Vals);
20835 return Builder.CreateFMul(VectorizedValue, Scale);
20836 }
20837 case RecurKind::Mul:
20838 case RecurKind::FMul:
20839 case RecurKind::FMulAdd:
20840 case RecurKind::IAnyOf:
20841 case RecurKind::FAnyOf:
20842 case RecurKind::IFindLastIV:
20843 case RecurKind::FFindLastIV:
20844 case RecurKind::None:
20845 llvm_unreachable("Unexpected reduction kind for reused scalars.");
20846 }
20847 return nullptr;
20848 }
20849};
20850} // end anonymous namespace
20851
20852/// Gets recurrence kind from the specified value.
20853 static RecurKind getRdxKind(Value *V) {
20854 return HorizontalReduction::getRdxKind(V);
20855}
20856static std::optional<unsigned> getAggregateSize(Instruction *InsertInst) {
20857 if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
20858 return cast<FixedVectorType>(IE->getType())->getNumElements();
20859
20860 unsigned AggregateSize = 1;
20861 auto *IV = cast<InsertValueInst>(InsertInst);
20862 Type *CurrentType = IV->getType();
20863 do {
20864 if (auto *ST = dyn_cast<StructType>(CurrentType)) {
20865 for (auto *Elt : ST->elements())
20866 if (Elt != ST->getElementType(0)) // check homogeneity
20867 return std::nullopt;
20868 AggregateSize *= ST->getNumElements();
20869 CurrentType = ST->getElementType(0);
20870 } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
20871 AggregateSize *= AT->getNumElements();
20872 CurrentType = AT->getElementType();
20873 } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
20874 AggregateSize *= VT->getNumElements();
20875 return AggregateSize;
20876 } else if (CurrentType->isSingleValueType()) {
20877 return AggregateSize;
20878 } else {
20879 return std::nullopt;
20880 }
20881 } while (true);
20882}
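// E.g. for an insertvalue chain building [2 x {float, float}] this returns 4
// (2 array elements x 2 homogeneous struct fields), while a non-homogeneous
// struct such as {float, i32} yields std::nullopt.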
20883
20884static void findBuildAggregate_rec(Instruction *LastInsertInst,
20885 TargetTransformInfo *TTI,
20886 SmallVectorImpl<Value *> &BuildVectorOpds,
20887 SmallVectorImpl<Value *> &InsertElts,
20888 unsigned OperandOffset, const BoUpSLP &R) {
20889 do {
20890 Value *InsertedOperand = LastInsertInst->getOperand(1);
20891 std::optional<unsigned> OperandIndex =
20892 getElementIndex(LastInsertInst, OperandOffset);
20893 if (!OperandIndex || R.isDeleted(LastInsertInst))
20894 return;
20895 if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
20896 findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
20897 BuildVectorOpds, InsertElts, *OperandIndex, R);
20898
20899 } else {
20900 BuildVectorOpds[*OperandIndex] = InsertedOperand;
20901 InsertElts[*OperandIndex] = LastInsertInst;
20902 }
20903 LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
20904 } while (LastInsertInst != nullptr &&
20905 isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
20906 LastInsertInst->hasOneUse());
20907}
20908
20909/// Recognize construction of vectors like
20910/// %ra = insertelement <4 x float> poison, float %s0, i32 0
20911/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
20912/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
20913/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
20914/// starting from the last insertelement or insertvalue instruction.
20915///
20916/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
20917/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
20918/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
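///
/// For example (illustration only), an insertvalue chain such as
///   %a = insertvalue { float, float } poison, float %s0, 0
///   %b = insertvalue { float, float } %a, float %s1, 1
/// is recognized as a 2-element build sequence.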
20919///
20920/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
20921///
20922/// \return true if it matches.
20923static bool findBuildAggregate(Instruction *LastInsertInst,
20924 TargetTransformInfo *TTI,
20925 SmallVectorImpl<Value *> &BuildVectorOpds,
20926 SmallVectorImpl<Value *> &InsertElts,
20927 const BoUpSLP &R) {
20928
20929 assert((isa<InsertElementInst>(LastInsertInst) ||
20930 isa<InsertValueInst>(LastInsertInst)) &&
20931 "Expected insertelement or insertvalue instruction!");
20932
20933 assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
20934 "Expected empty result vectors!");
20935
20936 std::optional<unsigned> AggregateSize = getAggregateSize(LastInsertInst);
20937 if (!AggregateSize)
20938 return false;
20939 BuildVectorOpds.resize(*AggregateSize);
20940 InsertElts.resize(*AggregateSize);
20941
20942 findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0,
20943 R);
20944 llvm::erase(BuildVectorOpds, nullptr);
20945 llvm::erase(InsertElts, nullptr);
20946 if (BuildVectorOpds.size() >= 2)
20947 return true;
20948
20949 return false;
20950}
20951
20952/// Try and get a reduction instruction from a phi node.
20953///
20954/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
20955/// if they come from either \p ParentBB or a containing loop latch.
20956///
20957/// \returns A candidate reduction value if possible, or \code nullptr \endcode
20958/// if not possible.
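///
/// For example (illustration only), given a loop with
///   %sum = phi float [ 0.0, %entry ], [ %add, %loop ]
///   %add = fadd fast float %sum, %val
/// the candidate returned for %sum is the %add instruction coming from the
/// loop latch.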
20959 static Instruction *getReductionInstr(const DominatorTree *DT, PHINode *P,
20960 BasicBlock *ParentBB, LoopInfo *LI) {
20961 // There are situations where the reduction value is not dominated by the
20962 // reduction phi. Vectorizing such cases has been reported to cause
20963 // miscompiles. See PR25787.
20964 auto DominatedReduxValue = [&](Value *R) {
20965 return isa<Instruction>(R) &&
20966 DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
20967 };
20968
20969 Instruction *Rdx = nullptr;
20970
20971 // Return the incoming value if it comes from the same BB as the phi node.
20972 if (P->getIncomingBlock(0) == ParentBB) {
20973 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20974 } else if (P->getIncomingBlock(1) == ParentBB) {
20975 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20976 }
20977
20978 if (Rdx && DominatedReduxValue(Rdx))
20979 return Rdx;
20980
20981 // Otherwise, check whether we have a loop latch to look at.
20982 Loop *BBL = LI->getLoopFor(ParentBB);
20983 if (!BBL)
20984 return nullptr;
20985 BasicBlock *BBLatch = BBL->getLoopLatch();
20986 if (!BBLatch)
20987 return nullptr;
20988
20989 // There is a loop latch, return the incoming value if it comes from
20990 // that. This reduction pattern occasionally turns up.
20991 if (P->getIncomingBlock(0) == BBLatch) {
20992 Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20993 } else if (P->getIncomingBlock(1) == BBLatch) {
20994 Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20995 }
20996
20997 if (Rdx && DominatedReduxValue(Rdx))
20998 return Rdx;
20999
21000 return nullptr;
21001}
21002
21003static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
21004 if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
21005 return true;
21006 if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
21007 return true;
21008 if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
21009 return true;
21010 if (match(I, m_Intrinsic<Intrinsic::maximum>(m_Value(V0), m_Value(V1))))
21011 return true;
21012 if (match(I, m_Intrinsic<Intrinsic::minimum>(m_Value(V0), m_Value(V1))))
21013 return true;
21014 if (match(I, m_Intrinsic<Intrinsic::smax>(m_Value(V0), m_Value(V1))))
21015 return true;
21016 if (match(I, m_Intrinsic<Intrinsic::smin>(m_Value(V0), m_Value(V1))))
21017 return true;
21018 if (match(I, m_Intrinsic<Intrinsic::umax>(m_Value(V0), m_Value(V1))))
21019 return true;
21020 if (match(I, m_Intrinsic<Intrinsic::umin>(m_Value(V0), m_Value(V1))))
21021 return true;
21022 return false;
21023}
21024
21025/// We could have an initial reduction that is not an add.
21026/// r *= v1 + v2 + v3 + v4
21027/// In such a case start looking for a tree rooted in the first '+'.
21028 /// \returns the new root if found, which may be nullptr if not an instruction.
21029 static Instruction *tryGetSecondaryReductionRoot(PHINode *Phi,
21030 Instruction *Root) {
21031 assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
21032 isa<IntrinsicInst>(Root)) &&
21033 "Expected binop, select, or intrinsic for reduction matching");
21034 Value *LHS =
21035 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
21036 Value *RHS =
21037 Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
21038 if (LHS == Phi)
21039 return dyn_cast<Instruction>(RHS);
21040 if (RHS == Phi)
21041 return dyn_cast<Instruction>(LHS);
21042 return nullptr;
21043}
21044
21045 /// \returns the first operand of \p I that does not match \p Phi. If the
21046 /// operand is not an instruction, returns nullptr.
21047 static Instruction *getNonPhiOperand(Instruction *I, PHINode *Phi) {
21048 Value *Op0 = nullptr;
21049 Value *Op1 = nullptr;
21050 if (!matchRdxBop(I, Op0, Op1))
21051 return nullptr;
21052 return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
21053}
21054
21055 /// \returns true if \p I is a candidate instruction for reduction vectorization.
21056 static bool isReductionCandidate(Instruction *I) {
21057 bool IsSelect = match(I, m_Select(m_Value(), m_Value(), m_Value()));
21058 Value *B0 = nullptr, *B1 = nullptr;
21059 bool IsBinop = matchRdxBop(I, B0, B1);
21060 return IsBinop || IsSelect;
21061}
21062
21063bool SLPVectorizerPass::vectorizeHorReduction(
21064 PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
21065 SmallVectorImpl<WeakTrackingVH> &PostponedInsts) {
21066 if (!ShouldVectorizeHor)
21067 return false;
21068 bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
21069
21070 if (Root->getParent() != BB || isa<PHINode>(Root))
21071 return false;
21072
21073 // If we can find a secondary reduction root, use that instead.
21074 auto SelectRoot = [&]() {
21075 if (TryOperandsAsNewSeeds && isReductionCandidate(Root) &&
21076 HorizontalReduction::getRdxKind(Root) != RecurKind::None)
21077 if (Instruction *NewRoot = tryGetSecondaryReductionRoot(P, Root))
21078 return NewRoot;
21079 return Root;
21080 };
21081
21082 // Start the analysis from the Root instruction. If a horizontal reduction
21083 // is found, try to vectorize it. If it is not a horizontal reduction, or
21084 // vectorization is not possible or not effective, and the currently
21085 // analyzed instruction is a binary operation, try to vectorize its
21086 // operands, using a pre-order DFS traversal order. If the operands were
21087 // not vectorized, repeat the same procedure considering each operand as a
21088 // possible root of a horizontal reduction.
21089 // Interrupt the process if the Root instruction itself was vectorized or
21090 // all sub-trees no deeper than RecursionMaxDepth were analyzed/vectorized.
21091 // If a horizontal reduction was not matched or vectorized, collect the
21092 // instructions for possible later vectorization attempts.
21093 std::queue<std::pair<Instruction *, unsigned>> Stack;
21094 Stack.emplace(SelectRoot(), 0);
21095 SmallPtrSet<Value *, 8> VisitedInstrs;
21096 bool Res = false;
21097 auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * {
21098 if (R.isAnalyzedReductionRoot(Inst))
21099 return nullptr;
21100 if (!isReductionCandidate(Inst))
21101 return nullptr;
21102 HorizontalReduction HorRdx;
21103 if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
21104 return nullptr;
21105 return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
21106 };
21107 auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
21108 if (TryOperandsAsNewSeeds && FutureSeed == Root) {
21109 FutureSeed = getNonPhiOperand(Root, P);
21110 if (!FutureSeed)
21111 return false;
21112 }
21113 // Do not collect CmpInst or InsertElementInst/InsertValueInst as their
21114 // analysis is done separately.
21115 if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
21116 PostponedInsts.push_back(FutureSeed);
21117 return true;
21118 };
21119
21120 while (!Stack.empty()) {
21121 Instruction *Inst;
21122 unsigned Level;
21123 std::tie(Inst, Level) = Stack.front();
21124 Stack.pop();
21125 // Do not try to analyze instruction that has already been vectorized.
21126 // This may happen when we vectorize instruction operands on a previous
21127 // iteration while stack was populated before that happened.
21128 if (R.isDeleted(Inst))
21129 continue;
21130 if (Value *VectorizedV = TryToReduce(Inst)) {
21131 Res = true;
21132 if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
21133 // Try to find another reduction.
21134 Stack.emplace(I, Level);
21135 continue;
21136 }
21137 if (R.isDeleted(Inst))
21138 continue;
21139 } else {
21140 // We could not vectorize `Inst` so try to use it as a future seed.
21141 if (!TryAppendToPostponedInsts(Inst)) {
21142 assert(Stack.empty() && "Expected empty stack");
21143 break;
21144 }
21145 }
21146
21147 // Try to vectorize operands.
21148 // Continue analysis for the instruction from the same basic block only to
21149 // save compile time.
21150 if (++Level < RecursionMaxDepth)
21151 for (auto *Op : Inst->operand_values())
21152 if (VisitedInstrs.insert(Op).second)
21153 if (auto *I = dyn_cast<Instruction>(Op))
21154 // Do not try to vectorize CmpInst operands, this is done
21155 // separately.
21156 if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
21157 !R.isDeleted(I) && I->getParent() == BB)
21158 Stack.emplace(I, Level);
21159 }
21160 return Res;
21161}
21162
21163bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root,
21164 BasicBlock *BB, BoUpSLP &R) {
21165 SmallVector<WeakTrackingVH> PostponedInsts;
21166 bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
21167 Res |= tryToVectorize(PostponedInsts, R);
21168 return Res;
21169}
21170
21171bool SLPVectorizerPass::tryToVectorize(ArrayRef<WeakTrackingVH> Insts,
21172 BoUpSLP &R) {
21173 bool Res = false;
21174 for (Value *V : Insts)
21175 if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
21176 Res |= tryToVectorize(Inst, R);
21177 return Res;
21178}
21179
21180bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
21181 BasicBlock *BB, BoUpSLP &R,
21182 bool MaxVFOnly) {
21183 if (!R.canMapToVector(IVI->getType()))
21184 return false;
21185
21186 SmallVector<Value *, 16> BuildVectorOpds;
21187 SmallVector<Value *, 16> BuildVectorInsts;
21188 if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
21189 return false;
21190
21191 if (MaxVFOnly && BuildVectorOpds.size() == 2) {
21192 R.getORE()->emit([&]() {
21193 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IVI)
21194 << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
21195 "trying reduction first.";
21196 });
21197 return false;
21198 }
21199 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
21200 // Aggregate value is unlikely to be processed in vector register.
21201 return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
21202}
21203
21204bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
21205 BasicBlock *BB, BoUpSLP &R,
21206 bool MaxVFOnly) {
21207 SmallVector<Value *, 16> BuildVectorInsts;
21208 SmallVector<Value *, 16> BuildVectorOpds;
21209 SmallVector<int> Mask;
21210 if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
21211 (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
21212 isFixedVectorShuffle(BuildVectorOpds, Mask, AC)))
21213 return false;
21214
21215 if (MaxVFOnly && BuildVectorInsts.size() == 2) {
21216 R.getORE()->emit([&]() {
21217 return OptimizationRemarkMissed(SV_NAME, "NotPossible", IEI)
21218 << "Cannot SLP vectorize list: only 2 elements of buildvector, "
21219 "trying reduction first.";
21220 });
21221 return false;
21222 }
21223 LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
21224 return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
21225}
21226
21227template <typename T>
21228 static bool tryToVectorizeSequence(
21229 SmallVectorImpl<T *> &Incoming, function_ref<bool(T *, T *)> Comparator,
21230 function_ref<bool(T *, T *)> AreCompatible,
21231 function_ref<bool(ArrayRef<T *>, bool)> TryToVectorizeHelper,
21232 bool MaxVFOnly, BoUpSLP &R) {
21233 bool Changed = false;
21234 // Sort by type, parent, operands.
21235 stable_sort(Incoming, Comparator);
21236
21237 // Try to vectorize elements based on their type.
21238 SmallVector<T *> Candidates;
21239 SmallVector<T *> VL;
21240 for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
21241 VL.clear()) {
21242 // Look for the next elements with the same type, parent and operand
21243 // kinds.
21244 auto *I = dyn_cast<Instruction>(*IncIt);
21245 if (!I || R.isDeleted(I)) {
21246 ++IncIt;
21247 continue;
21248 }
21249 auto *SameTypeIt = IncIt;
21250 while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
21251 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21252 AreCompatible(*SameTypeIt, *IncIt))) {
21253 auto *I = dyn_cast<Instruction>(*SameTypeIt);
21254 ++SameTypeIt;
21255 if (I && !R.isDeleted(I))
21256 VL.push_back(cast<T>(I));
21257 }
21258
21259 // Try to vectorize them.
21260 unsigned NumElts = VL.size();
21261 LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
21262 << NumElts << ")\n");
21263 // The vectorization is attempted in 3 stages:
21264 // 1. Try to vectorize instructions with the same/alternate opcodes with
21265 // the size of the maximal register at first.
21266 // 2. Try to vectorize the remaining instructions with the same type, if
21267 // possible. This may give better vectorization results than trying to
21268 // vectorize only instructions with the same/alternate opcodes.
21269 // 3. Finally, try to vectorize all instructions with the same/alternate
21270 // ops only; this may result in some extra final
21271 // vectorization.
21272 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
21273 // Success: start over because instructions might have been changed.
21274 Changed = true;
21275 VL.swap(Candidates);
21276 Candidates.clear();
21277 for (T *V : VL) {
21278 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
21279 Candidates.push_back(V);
21280 }
21281 } else {
21282 /// \Returns the minimum number of elements that we will attempt to
21283 /// vectorize.
21284 auto GetMinNumElements = [&R](Value *V) {
21285 unsigned EltSize = R.getVectorElementSize(V);
21286 return std::max(2U, R.getMaxVecRegSize() / EltSize);
21287 };
21288 if (NumElts < GetMinNumElements(*IncIt) &&
21289 (Candidates.empty() ||
21290 Candidates.front()->getType() == (*IncIt)->getType())) {
21291 for (T *V : VL) {
21292 if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
21293 Candidates.push_back(V);
21294 }
21295 }
21296 }
21297 // Final attempt to vectorize instructions with the same types.
21298 if (Candidates.size() > 1 &&
21299 (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
21300 if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
21301 // Success: start over because instructions might have been changed.
21302 Changed = true;
21303 } else if (MaxVFOnly) {
21304 // Try to vectorize using small vectors.
21305 SmallVector<T *> VL;
21306 for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
21307 VL.clear()) {
21308 auto *I = dyn_cast<Instruction>(*It);
21309 if (!I || R.isDeleted(I)) {
21310 ++It;
21311 continue;
21312 }
21313 auto *SameTypeIt = It;
21314 while (SameTypeIt != End &&
21315 (!isa<Instruction>(*SameTypeIt) ||
21316 R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21317 AreCompatible(*SameTypeIt, *It))) {
21318 auto *I = dyn_cast<Instruction>(*SameTypeIt);
21319 ++SameTypeIt;
21320 if (I && !R.isDeleted(I))
21321 VL.push_back(cast<T>(I));
21322 }
21323 unsigned NumElts = VL.size();
21324 if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
21325 /*MaxVFOnly=*/false))
21326 Changed = true;
21327 It = SameTypeIt;
21328 }
21329 }
21330 Candidates.clear();
21331 }
21332
21333 // Start over at the next instruction of a different type (or the end).
21334 IncIt = SameTypeIt;
21335 }
21336 return Changed;
21337}
21338
21339 /// Compare two cmp instructions. If IsCompatibility is true, the function
21340 /// returns true if the 2 cmps have the same/swapped predicates and compatible
21341 /// corresponding operands. If IsCompatibility is false, the function
21342 /// implements a strict weak ordering relation between two cmp instructions,
21343 /// returning true if the first instruction is "less" than the second, i.e.
21344 /// its predicate is less than the predicate of the second or its operand IDs
21345 /// are less than the operand IDs of the second cmp instruction.
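///
/// For example (illustration only), `%a = icmp slt i32 %x, %y` and
/// `%b = icmp sgt i32 %y, %x` have swapped predicates over the same operands,
/// so compareCmp<true> treats them as compatible.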
21346template <bool IsCompatibility>
21347static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI,
21348 const DominatorTree &DT) {
21349 assert(isValidElementType(V->getType()) &&
21350 isValidElementType(V2->getType()) &&
21351 "Expected valid element types only.");
21352 if (V == V2)
21353 return IsCompatibility;
21354 auto *CI1 = cast<CmpInst>(V);
21355 auto *CI2 = cast<CmpInst>(V2);
21356 if (CI1->getOperand(0)->getType()->getTypeID() <
21357 CI2->getOperand(0)->getType()->getTypeID())
21358 return !IsCompatibility;
21359 if (CI1->getOperand(0)->getType()->getTypeID() >
21360 CI2->getOperand(0)->getType()->getTypeID())
21361 return false;
21362 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
21363 CI2->getOperand(0)->getType()->getScalarSizeInBits())
21364 return !IsCompatibility;
21365 if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
21366 CI2->getOperand(0)->getType()->getScalarSizeInBits())
21367 return false;
21368 CmpInst::Predicate Pred1 = CI1->getPredicate();
21369 CmpInst::Predicate Pred2 = CI2->getPredicate();
21370 CmpInst::Predicate SwapPred1 = CmpInst::getSwappedPredicate(Pred1);
21371 CmpInst::Predicate SwapPred2 = CmpInst::getSwappedPredicate(Pred2);
21372 CmpInst::Predicate BasePred1 = std::min(Pred1, SwapPred1);
21373 CmpInst::Predicate BasePred2 = std::min(Pred2, SwapPred2);
21374 if (BasePred1 < BasePred2)
21375 return !IsCompatibility;
21376 if (BasePred1 > BasePred2)
21377 return false;
21378 // Compare operands.
21379 bool CI1Preds = Pred1 == BasePred1;
21380 bool CI2Preds = Pred2 == BasePred1;
21381 for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
21382 auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
21383 auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
21384 if (Op1 == Op2)
21385 continue;
21386 if (Op1->getValueID() < Op2->getValueID())
21387 return !IsCompatibility;
21388 if (Op1->getValueID() > Op2->getValueID())
21389 return false;
21390 if (auto *I1 = dyn_cast<Instruction>(Op1))
21391 if (auto *I2 = dyn_cast<Instruction>(Op2)) {
21392 if (IsCompatibility) {
21393 if (I1->getParent() != I2->getParent())
21394 return false;
21395 } else {
21396 // Try to compare nodes with same parent.
21397 DomTreeNodeBase<BasicBlock> *NodeI1 = DT.getNode(I1->getParent());
21398 DomTreeNodeBase<BasicBlock> *NodeI2 = DT.getNode(I2->getParent());
21399 if (!NodeI1)
21400 return NodeI2 != nullptr;
21401 if (!NodeI2)
21402 return false;
21403 assert((NodeI1 == NodeI2) ==
21404 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21405 "Different nodes should have different DFS numbers");
21406 if (NodeI1 != NodeI2)
21407 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21408 }
21409 InstructionsState S = getSameOpcode({I1, I2}, TLI);
21410 if (S && (IsCompatibility || !S.isAltShuffle()))
21411 continue;
21412 if (IsCompatibility)
21413 return false;
21414 if (I1->getOpcode() != I2->getOpcode())
21415 return I1->getOpcode() < I2->getOpcode();
21416 }
21417 }
21418 return IsCompatibility;
21419}
21420
21421template <typename ItT>
21422bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range<ItT> CmpInsts,
21423 BasicBlock *BB, BoUpSLP &R) {
21424 bool Changed = false;
21425 // Try to find reductions first.
21426 for (CmpInst *I : CmpInsts) {
21427 if (R.isDeleted(I))
21428 continue;
21429 for (Value *Op : I->operands())
21430 if (auto *RootOp = dyn_cast<Instruction>(Op)) {
21431 Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
21432 if (R.isDeleted(I))
21433 break;
21434 }
21435 }
21436 // Try to vectorize operands as vector bundles.
21437 for (CmpInst *I : CmpInsts) {
21438 if (R.isDeleted(I))
21439 continue;
21440 Changed |= tryToVectorize(I, R);
21441 }
21442 // Try to vectorize list of compares.
21443 // Sort by type, compare predicate, etc.
21444 auto CompareSorter = [&](Value *V, Value *V2) {
21445 if (V == V2)
21446 return false;
21447 return compareCmp<false>(V, V2, *TLI, *DT);
21448 };
21449
21450 auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
21451 if (V1 == V2)
21452 return true;
21453 return compareCmp<true>(V1, V2, *TLI, *DT);
21454 };
21455
21456 SmallVector<Value *> Vals;
21457 for (Instruction *V : CmpInsts)
21458 if (!R.isDeleted(V) && isValidElementType(getValueType(V)))
21459 Vals.push_back(V);
21460 if (Vals.size() <= 1)
21461 return Changed;
21462 Changed |= tryToVectorizeSequence<Value>(
21463 Vals, CompareSorter, AreCompatibleCompares,
21464 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
21465 // Exclude possible reductions from other blocks.
21466 bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
21467 return any_of(V->users(), [V](User *U) {
21468 auto *Select = dyn_cast<SelectInst>(U);
21469 return Select &&
21470 Select->getParent() != cast<Instruction>(V)->getParent();
21471 });
21472 });
21473 if (ArePossiblyReducedInOtherBlock)
21474 return false;
21475 return tryToVectorizeList(Candidates, R, MaxVFOnly);
21476 },
21477 /*MaxVFOnly=*/true, R);
21478 return Changed;
21479}
21480
21481bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
21482 BasicBlock *BB, BoUpSLP &R) {
21483 assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
21484 "This function only accepts Insert instructions");
21485 bool OpsChanged = false;
21486 SmallVector<WeakTrackingVH> PostponedInsts;
21487 for (auto *I : reverse(Instructions)) {
21488 // pass1 - try to match and vectorize a buildvector sequence for MaxVF only.
21489 if (R.isDeleted(I) || isa<CmpInst>(I))
21490 continue;
21491 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21492 OpsChanged |=
21493 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
21494 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21495 OpsChanged |=
21496 vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
21497 }
21498 // pass2 - try to vectorize reductions only
21499 if (R.isDeleted(I))
21500 continue;
21501 OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
21502 if (R.isDeleted(I) || isa<CmpInst>(I))
21503 continue;
21504 // pass3 - try to match and vectorize a buildvector sequence.
21505 if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21506 OpsChanged |=
21507 vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
21508 } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21509 OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
21510 /*MaxVFOnly=*/false);
21511 }
21512 }
21513 // Now try to vectorize postponed instructions.
21514 OpsChanged |= tryToVectorize(PostponedInsts, R);
21515
21516 Instructions.clear();
21517 return OpsChanged;
21518}
21519
21520bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
21521 bool Changed = false;
21522 SmallVector<Value *, 4> Incoming;
21523 SmallPtrSet<Value *, 16> VisitedInstrs;
21524 // Maps phi nodes to the non-phi nodes found in the use tree for each phi
21525 // node. This helps to better identify the chains that can be vectorized
21526 // in the best way.
21527 DenseMap<Value *, SmallVector<Value *, 4>> PHIToOpcodes;
21528 auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
21529 assert(isValidElementType(V1->getType()) &&
21530 isValidElementType(V2->getType()) &&
21531 "Expected vectorizable types only.");
21532 // It is fine to compare type IDs here, since we expect only vectorizable
21533 // types, like ints, floats and pointers, we don't care about other type.
21534 if (V1->getType()->getTypeID() < V2->getType()->getTypeID())
21535 return true;
21536 if (V1->getType()->getTypeID() > V2->getType()->getTypeID())
21537 return false;
21538 if (V1->getType()->getScalarSizeInBits() <
21539 V2->getType()->getScalarSizeInBits())
21540 return true;
21541 if (V1->getType()->getScalarSizeInBits() >
21542 V2->getType()->getScalarSizeInBits())
21543 return false;
21544 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
21545 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
21546 if (Opcodes1.size() < Opcodes2.size())
21547 return true;
21548 if (Opcodes1.size() > Opcodes2.size())
21549 return false;
21550 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
21551 {
21552 // Instructions come first.
21553 auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
21554 auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
21555 if (I1 && I2) {
21556 DomTreeNodeBase<BasicBlock> *NodeI1 = DT->getNode(I1->getParent());
21557 DomTreeNodeBase<BasicBlock> *NodeI2 = DT->getNode(I2->getParent());
21558 if (!NodeI1)
21559 return NodeI2 != nullptr;
21560 if (!NodeI2)
21561 return false;
21562 assert((NodeI1 == NodeI2) ==
21563 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21564 "Different nodes should have different DFS numbers");
21565 if (NodeI1 != NodeI2)
21566 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21567 InstructionsState S = getSameOpcode({I1, I2}, *TLI);
21568 if (S && !S.isAltShuffle())
21569 continue;
21570 return I1->getOpcode() < I2->getOpcode();
21571 }
21572 if (I1)
21573 return true;
21574 if (I2)
21575 return false;
21576 }
21577 {
21578 // Non-undef constants come next.
21579 bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
21580 bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
21581 if (C1 && C2)
21582 continue;
21583 if (C1)
21584 return true;
21585 if (C2)
21586 return false;
21587 }
21588 bool U1 = isa<UndefValue>(Opcodes1[I]);
21589 bool U2 = isa<UndefValue>(Opcodes2[I]);
21590 {
21591 // Non-constant non-instructions come next.
21592 if (!U1 && !U2) {
21593 auto ValID1 = Opcodes1[I]->getValueID();
21594 auto ValID2 = Opcodes2[I]->getValueID();
21595 if (ValID1 == ValID2)
21596 continue;
21597 if (ValID1 < ValID2)
21598 return true;
21599 if (ValID1 > ValID2)
21600 return false;
21601 }
21602 if (!U1)
21603 return true;
21604 if (!U2)
21605 return false;
21606 }
21607 // Undefs come last.
21608 assert(U1 && U2 && "The only thing left should be undef & undef.");
21609 }
21610 return false;
21611 };
21612 auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
21613 if (V1 == V2)
21614 return true;
21615 if (V1->getType() != V2->getType())
21616 return false;
21617 ArrayRef<Value *> Opcodes1 = PHIToOpcodes[V1];
21618 ArrayRef<Value *> Opcodes2 = PHIToOpcodes[V2];
21619 if (Opcodes1.size() != Opcodes2.size())
21620 return false;
21621 for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
21622 // Undefs are compatible with any other value.
21623 if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
21624 continue;
21625 if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
21626 if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
21627 if (R.isDeleted(I1) || R.isDeleted(I2))
21628 return false;
21629 if (I1->getParent() != I2->getParent())
21630 return false;
21631 if (getSameOpcode({I1, I2}, *TLI))
21632 continue;
21633 return false;
21634 }
21635 if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
21636 continue;
21637 if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
21638 return false;
21639 }
21640 return true;
21641 };
21642
21643 bool HaveVectorizedPhiNodes = false;
21644 do {
21645 // Collect the incoming values from the PHIs.
21646 Incoming.clear();
21647 for (Instruction &I : *BB) {
21648 auto *P = dyn_cast<PHINode>(&I);
21649 if (!P || P->getNumIncomingValues() > MaxPHINumOperands)
21650 break;
21651
21652 // No need to analyze deleted, vectorized and non-vectorizable
21653 // instructions.
21654 if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
21655 isValidElementType(P->getType()))
21656 Incoming.push_back(P);
21657 }
21658
21659 if (Incoming.size() <= 1)
21660 break;
21661
21662 // Find the corresponding non-phi nodes for better matching when trying to
21663 // build the tree.
21664 for (Value *V : Incoming) {
21665 SmallVectorImpl<Value *> &Opcodes =
21666 PHIToOpcodes.try_emplace(V).first->getSecond();
21667 if (!Opcodes.empty())
21668 continue;
21669 SmallVector<Value *, 4> Nodes(1, V);
21670 SmallPtrSet<PHINode *, 4> Visited;
21671 while (!Nodes.empty()) {
21672 auto *PHI = cast<PHINode>(Nodes.pop_back_val());
21673 if (!Visited.insert(PHI).second)
21674 continue;
21675 for (Value *V : PHI->incoming_values()) {
21676 if (auto *PHI1 = dyn_cast<PHINode>((V))) {
21677 Nodes.push_back(PHI1);
21678 continue;
21679 }
21680 Opcodes.emplace_back(V);
21681 }
21682 }
21683 }
21684
21685 HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
21686 Incoming, PHICompare, AreCompatiblePHIs,
21687 [this, &R](ArrayRef<Value *> Candidates, bool MaxVFOnly) {
21688 return tryToVectorizeList(Candidates, R, MaxVFOnly);
21689 },
21690 /*MaxVFOnly=*/true, R);
21691 Changed |= HaveVectorizedPhiNodes;
21692 if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
21693 auto *PHI = dyn_cast<PHINode>(P.first);
21694 return !PHI || R.isDeleted(PHI);
21695 }))
21696 PHIToOpcodes.clear();
21697 VisitedInstrs.insert(Incoming.begin(), Incoming.end());
21698 } while (HaveVectorizedPhiNodes);
21699
21700 VisitedInstrs.clear();
21701
21702 InstSetVector PostProcessInserts;
21703 SmallSetVector<CmpInst *, 8> PostProcessCmps;
21704 // Vectorizes Inserts in `PostProcessInserts` and, if `VectorizeCmps` is
21705 // true, also vectorizes `PostProcessCmps`.
21706 auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
21707 bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
21708 if (VectorizeCmps) {
21709 Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
21710 PostProcessCmps.clear();
21711 }
21712 PostProcessInserts.clear();
21713 return Changed;
21714 };
21715 // Returns true if `I` is in `PostProcessInserts` or `PostProcessCmps`.
21716 auto IsInPostProcessInstrs = [&](Instruction *I) {
21717 if (auto *Cmp = dyn_cast<CmpInst>(I))
21718 return PostProcessCmps.contains(Cmp);
21719 return isa<InsertElementInst, InsertValueInst>(I) &&
21720 PostProcessInserts.contains(I);
21721 };
21722 // Returns true if `I` is an instruction without users, like a terminator,
21723 // a store, or a function call with an ignored return value. Detection is
21724 // based on the instruction type, except for CallInst and InvokeInst.
21725 auto HasNoUsers = [](Instruction *I) {
21726 return I->use_empty() &&
21727 (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
21728 };
21729 for (BasicBlock::iterator It = BB->begin(), E = BB->end(); It != E; ++It) {
21730 // Skip instructions with a scalable type. The number of elements is
21731 // unknown at compile time for scalable types.
21732 if (isa<ScalableVectorType>(It->getType()))
21733 continue;
21734
21735 // Skip instructions marked for deletion.
21736 if (R.isDeleted(&*It))
21737 continue;
21738 // We may go through BB multiple times, so skip the ones we have already checked.
21739 if (!VisitedInstrs.insert(&*It).second) {
21740 if (HasNoUsers(&*It) &&
21741 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
21742 // We would like to start over since some instructions are deleted
21743 // and the iterator may have been invalidated.
21744 Changed = true;
21745 It = BB->begin();
21746 E = BB->end();
21747 }
21748 continue;
21749 }
21750
21751 if (isa<DbgInfoIntrinsic>(It))
21752 continue;
21753
21754 // Try to vectorize reductions that use PHINodes.
21755 if (PHINode *P = dyn_cast<PHINode>(It)) {
21756 // Check that the PHI is a reduction PHI.
21757 if (P->getNumIncomingValues() == 2) {
21758 // Try to match and vectorize a horizontal reduction.
21759 Instruction *Root = getReductionInstr(DT, P, BB, LI);
21760 if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
21761 Changed = true;
21762 It = BB->begin();
21763 E = BB->end();
21764 continue;
21765 }
21766 }
21767 // Try to vectorize the incoming values of the PHI, to catch reductions
21768 // that feed into PHIs.
21769 for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
21770 // Skip if the incoming block is the current BB for now. Also, bypass
21771 // unreachable IR for efficiency and to avoid crashing.
21772 // TODO: Collect the skipped incoming values and try to vectorize them
21773 // after processing BB.
21774 if (BB == P->getIncomingBlock(I) ||
21775 !DT->isReachableFromEntry(P->getIncomingBlock(I)))
21776 continue;
21777
21778 // Postponed instructions should not be vectorized here, delay their
21779 // vectorization.
21780 if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
21781 PI && !IsInPostProcessInstrs(PI)) {
21782 bool Res =
21783 vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
21784 Changed |= Res;
21785 if (Res && R.isDeleted(P)) {
21786 It = BB->begin();
21787 E = BB->end();
21788 break;
21789 }
21790 }
21791 }
21792 continue;
21793 }
21794
21795 if (HasNoUsers(&*It)) {
21796 bool OpsChanged = false;
21797 auto *SI = dyn_cast<StoreInst>(It);
21798 bool TryToVectorizeRoot = ShouldStartVectorizeHorAtStore || !SI;
21799 if (SI) {
21800 auto *I = Stores.find(getUnderlyingObject(SI->getPointerOperand()));
21801 // Try to vectorize chain in store, if this is the only store to the
21802 // address in the block.
21803 // TODO: This is just a temporary solution to save compile time. Need
21804 // to investigate if we can safely turn on slp-vectorize-hor-store
21805 // instead to allow lookup for reduction chains in all non-vectorized
21806 // stores (need to check side effects and compile time).
21807 TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
21808 SI->getValueOperand()->hasOneUse();
21809 }
21810 if (TryToVectorizeRoot) {
21811 for (auto *V : It->operand_values()) {
21812 // Postponed instructions should not be vectorized here, delay their
21813 // vectorization.
21814 if (auto *VI = dyn_cast<Instruction>(V);
21815 VI && !IsInPostProcessInstrs(VI))
21816 // Try to match and vectorize a horizontal reduction.
21817 OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
21818 }
21819 }
21820 // Start vectorization of post-process list of instructions from the
21821 // top-tree instructions to try to vectorize as many instructions as
21822 // possible.
21823 OpsChanged |=
21824 VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
21825 if (OpsChanged) {
21826 // We would like to start over since some instructions are deleted
21827 // and the iterator may have been invalidated.
21828 Changed = true;
21829 It = BB->begin();
21830 E = BB->end();
21831 continue;
21832 }
21833 }
21834
21835 if (isa<InsertElementInst, InsertValueInst>(It))
21836 PostProcessInserts.insert(&*It);
21837 else if (isa<CmpInst>(It))
21838 PostProcessCmps.insert(cast<CmpInst>(&*It));
21839 }
21840
21841 return Changed;
21842}
21843
21844bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
21845 auto Changed = false;
21846 for (auto &Entry : GEPs) {
21847 // If the getelementptr list has fewer than two elements, there's nothing
21848 // to do.
21849 if (Entry.second.size() < 2)
21850 continue;
21851
21852 LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
21853 << Entry.second.size() << ".\n");
21854
21855 // Process the GEP list in chunks suitable for the target's supported
21856 // vector size. If a vector register can't hold 1 element, we are done. We
21857 // are trying to vectorize the index computations, so the maximum number of
21858 // elements is based on the size of the index expression, rather than the
21859 // size of the GEP itself (the target's pointer size).
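// Example (for illustration only): with a 128-bit maximum vector register and
// i64 index expressions, MaxElts below is 128 / 64 = 2, so the list is
// processed in chunks of two getelementptrs.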
21860 auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
21861 return !R.isDeleted(GEP);
21862 });
21863 if (It == Entry.second.end())
21864 continue;
21865 unsigned MaxVecRegSize = R.getMaxVecRegSize();
21866 unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
21867 if (MaxVecRegSize < EltSize)
21868 continue;
21869
21870 unsigned MaxElts = MaxVecRegSize / EltSize;
21871 for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
21872 auto Len = std::min<unsigned>(BE - BI, MaxElts);
21873 ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
21874
21875 // Initialize a set of candidate getelementptrs. Note that we use a
21876 // SetVector here to preserve program order. If the index computations
21877 // are vectorizable and begin with loads, we want to minimize the chance
21878 // of having to reorder them later.
21879 SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
21880
21881 // Some of the candidates may have already been vectorized after we
21882 // initially collected them, or their index was optimized to a constant value.
21883 // If so, they are marked as deleted, so remove them from the set of
21884 // candidates.
21885 Candidates.remove_if([&R](Value *I) {
21886 return R.isDeleted(cast<Instruction>(I)) ||
21887 isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
21888 });
21889
21890 // Remove from the set of candidates all pairs of getelementptrs with
21891 // constant differences. Such getelementptrs are likely not good
21892 // candidates for vectorization in a bottom-up phase since one can be
21893 // computed from the other. We also ensure all candidate getelementptr
21894 // indices are unique.
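// Example (for illustration only): %g1 = getelementptr i32, ptr %p, i64 %i
// and %g2 = getelementptr i32, ptr %p, i64 %j with %j = %i + 1 have a
// constant SCEV difference, so both are dropped from the candidate set.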
21895 for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
21896 auto *GEPI = GEPList[I];
21897 if (!Candidates.count(GEPI))
21898 continue;
21899 const SCEV *SCEVI = SE->getSCEV(GEPList[I]);
21900 for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
21901 auto *GEPJ = GEPList[J];
21902 const SCEV *SCEVJ = SE->getSCEV(GEPList[J]);
21903 if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
21904 Candidates.remove(GEPI);
21905 Candidates.remove(GEPJ);
21906 } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
21907 Candidates.remove(GEPJ);
21908 }
21909 }
21910 }
21911
21912 // We break out of the above computation as soon as we know there are
21913 // fewer than two candidates remaining.
21914 if (Candidates.size() < 2)
21915 continue;
21916
21917 // Add the single, non-constant index of each candidate to the bundle. We
21918 // ensured the indices met these constraints when we originally collected
21919 // the getelementptrs.
21920 SmallVector<Value *, 16> Bundle(Candidates.size());
21921 auto BundleIndex = 0u;
21922 for (auto *V : Candidates) {
21923 auto *GEP = cast<GetElementPtrInst>(V);
21924 auto *GEPIdx = GEP->idx_begin()->get();
21925 assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
21926 Bundle[BundleIndex++] = GEPIdx;
21927 }
21928
21929 // Try and vectorize the indices. We are currently only interested in
21930 // gather-like cases of the form:
21931 //
21932 // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
21933 //
21934 // where the loads of "a", the loads of "b", and the subtractions can be
21935 // performed in parallel. It's likely that detecting this pattern in a
21936 // bottom-up phase will be simpler and less costly than building a
21937 // full-blown top-down phase beginning at the consecutive loads.
21938 Changed |= tryToVectorizeList(Bundle, R);
21939 }
21940 }
21941 return Changed;
21942}
21943
21944bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
21945 bool Changed = false;
21946 // Sort by type, base pointers and value operands. Value operands must be
21947 // compatible (have the same opcode, same parent), otherwise it is
21948 // definitely not profitable to try to vectorize them.
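// Example (for illustration only): stores of i32 values sort before stores of
// i64 values because type IDs and scalar sizes are compared first; two i32
// stores whose value operands are adds in the same block are treated as
// compatible by AreCompatibleStores below.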
21949 auto &&StoreSorter = [this](StoreInst *V, StoreInst *V2) {
21950 if (V->getValueOperand()->getType()->getTypeID() <
21951 V2->getValueOperand()->getType()->getTypeID())
21952 return true;
21953 if (V->getValueOperand()->getType()->getTypeID() >
21954 V2->getValueOperand()->getType()->getTypeID())
21955 return false;
21956 if (V->getPointerOperandType()->getTypeID() <
21957 V2->getPointerOperandType()->getTypeID())
21958 return true;
21959 if (V->getPointerOperandType()->getTypeID() >
21960 V2->getPointerOperandType()->getTypeID())
21961 return false;
21962 if (V->getValueOperand()->getType()->getScalarSizeInBits() <
21963 V2->getValueOperand()->getType()->getScalarSizeInBits())
21964 return true;
21965 if (V->getValueOperand()->getType()->getScalarSizeInBits() >
21966 V2->getValueOperand()->getType()->getScalarSizeInBits())
21967 return false;
21968 // UndefValues are compatible with all other values.
21969 if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
21970 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
21971 DomTreeNodeBase<BasicBlock> *NodeI1 =
21972 DT->getNode(I1->getParent());
21973 DomTreeNodeBase<BasicBlock> *NodeI2 =
21974 DT->getNode(I2->getParent());
21975 assert(NodeI1 && "Should only process reachable instructions");
21976 assert(NodeI2 && "Should only process reachable instructions");
21977 assert((NodeI1 == NodeI2) ==
21978 (NodeI1->getDFSNumIn() == NodeI2->getDFSNumIn()) &&
21979 "Different nodes should have different DFS numbers");
21980 if (NodeI1 != NodeI2)
21981 return NodeI1->getDFSNumIn() < NodeI2->getDFSNumIn();
21982 return I1->getOpcode() < I2->getOpcode();
21983 }
21984 return V->getValueOperand()->getValueID() <
21985 V2->getValueOperand()->getValueID();
21986 };
21987
21988 auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) {
21989 if (V1 == V2)
21990 return true;
21991 if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType())
21992 return false;
21993 if (V1->getPointerOperandType() != V2->getPointerOperandType())
21994 return false;
21995 // Undefs are compatible with any other value.
21996 if (isa<UndefValue>(V1->getValueOperand()) ||
21997 isa<UndefValue>(V2->getValueOperand()))
21998 return true;
21999 if (auto *I1 = dyn_cast<Instruction>(V1->getValueOperand()))
22000 if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
22001 if (I1->getParent() != I2->getParent())
22002 return false;
22003 return getSameOpcode({I1, I2}, *TLI).valid();
22004 }
22005 if (isa<Constant>(V1->getValueOperand()) &&
22006 isa<Constant>(V2->getValueOperand()))
22007 return true;
22008 return V1->getValueOperand()->getValueID() ==
22009 V2->getValueOperand()->getValueID();
22010 };
22011
22012 // Attempt to sort and vectorize each of the store-groups.
22013 DenseSet<std::pair<Value *, Value *>> Attempted;
22014 for (auto &Pair : Stores) {
22015 if (Pair.second.size() < 2)
22016 continue;
22017
22018 LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
22019 << Pair.second.size() << ".\n");
22020
22021 if (!isValidElementType(Pair.second.front()->getValueOperand()->getType()))
22022 continue;
22023
22024 // Reverse the stores to do a bottom-to-top analysis. This is important if
22025 // values are stored to the same address several times; in that case we need
22026 // to follow the store order (reversed to meet the memory dependencies).
22027 SmallVector<StoreInst *> ReversedStores(Pair.second.rbegin(),
22028 Pair.second.rend());
22029 Changed |= tryToVectorizeSequence<StoreInst>(
22030 ReversedStores, StoreSorter, AreCompatibleStores,
22031 [&](ArrayRef<StoreInst *> Candidates, bool) {
22032 return vectorizeStores(Candidates, R, Attempted);
22033 },
22034 /*MaxVFOnly=*/false, R);
22035 }
22036 return Changed;
22037}
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
Rewrite undef for PHI
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
basic Basic Alias true
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
Definition: Compiler.h:622
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Definition: DataLayout.cpp:920
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
Definition: DebugCounter.h:190
#define LLVM_DEBUG(...)
Definition: Debug.h:106
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
std::string Name
uint32_t Index
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Definition: HTTPClient.cpp:42
Hexagon Common GEP
#define _
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
Definition: IVUsers.cpp:48
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
ppc ctr loops verify
static bool IsSelect(MachineInstr &MI)
if(PassOpts->AAPipeline)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Correctly creates insert_subvector, checking that the index is multiple of the subvectors length.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing a set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
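A rough sketch, assuming a bundle of loads, of what the common alignment above amounts to; the real helper is templated over the access instruction type:
  Align CommonAlign = cast<LoadInst>(VL.front())->getAlign();
  for (Value *V : drop_begin(VL))
    CommonAlign = std::min(CommonAlign, cast<LoadInst>(V)->getAlign());
  // CommonAlign is now a conservative alignment for a vector access covering all lanes.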
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
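An illustrative sketch of the mask composition addMask performs once a mask has already been accumulated (assumed semantics; the real routine also handles the ExtendingManyInputs case):
  SmallVector<int> NewMask(SubMask.size(), PoisonMaskElem);
  for (unsigned I = 0, E = SubMask.size(); I != E; ++I)
    if (SubMask[I] != PoisonMaskElem)
      NewMask[I] = Mask[SubMask[I]];   // select through SubMask into the old Mask
  Mask.swap(NewMask);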
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned the special value (equal to its size), which is out of bounds.
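For example, an order such as {1, 3, Sz, Sz} (with Sz == 4) would be fixed up to the valid permutation {1, 3, 0, 2}. A small sketch of the assumed fix-up:
  const unsigned Sz = Order.size();
  SmallVector<bool> Used(Sz, false);
  for (unsigned Idx : Order)
    if (Idx < Sz)
      Used[Idx] = true;
  unsigned Free = 0;
  for (unsigned &Idx : Order)
    if (Idx == Sz) {
      while (Used[Free])
        ++Free;
      Idx = Free;
      Used[Free] = true;
    }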
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Correctly creates extract_subvector, checking that the index is multiple of the subvectors length.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
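A minimal equivalent check (a sketch that ignores the out-of-bounds placeholders the real helper has to tolerate):
  const unsigned Sz = Order.size();
  bool IsReverse = true;
  for (unsigned I = 0; I < Sz; ++I)
    IsReverse &= (Order[I] == Sz - 1 - I);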
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file contains some templates that are useful if you are working with the STL at all.
raw_pwrite_stream & OS
#define SV_NAME
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
Definition: VPlanSLP.cpp:154
Value * RHS
Value * LHS
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds one more input vector and the mask for its shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds one more input vector and the mask for its shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds one more input vector and the ordering to use when shuffling it.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds a single input vector (in the form of a tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in the form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
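A hypothetical usage sketch of the builder interface above; V1, V2, Mask and ExtMask are placeholder values, and the empty braces stand for the optional subvector arguments:
  ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, R);
  ShuffleBuilder.add(V1, V2, Mask);   // queue two sources plus the mask combining them
  Value *Vec = ShuffleBuilder.finalize(ExtMask, /*SubVectors=*/{}, /*SubVectorsMask=*/{});
  // finalize() emits whatever shuffles are still required and returns the vector value.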
A manager for alias analyses.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition: APInt.h:1407
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition: APInt.h:371
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition: APInt.h:380
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
Definition: APInt.cpp:1640
void clearAllBits()
Set every bit to 0.
Definition: APInt.h:1397
void setAllBits()
Set every bit to 1.
Definition: APInt.h:1319
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
Definition: APInt.h:1367
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
Definition: APInt.h:286
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition: APInt.h:239
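A small illustration (hypothetical usage, not code from this file) of the demanded-elements bookkeeping these APInt helpers support; VF stands for the vector width:
  APInt DemandedElts = APInt::getZero(VF);   // VF lanes, none demanded yet
  for (unsigned Lane : {0u, 2u, 3u})         // lanes picked arbitrarily for the example
    DemandedElts.setBit(Lane);
  DemandedElts.clearBit(3);                  // lane 3 turned out to be dead
  if (DemandedElts.isZero()) {
    // nothing left to extract or insert
  }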
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
Definition: PassManager.h:429
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Definition: PassManager.h:410
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
Definition: ArrayRef.h:190
const T & back() const
back - Get the last element.
Definition: ArrayRef.h:177
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
Definition: ArrayRef.h:231
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:207
const T & front() const
front - Get the first element.
Definition: ArrayRef.h:171
iterator end() const
Definition: ArrayRef.h:157
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:213
iterator begin() const
Definition: ArrayRef.h:156
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:198
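A quick example of the ArrayRef slicing operations listed above (values chosen arbitrarily):
  int Data[] = {0, 1, 2, 3, 4, 5};
  ArrayRef<int> M(Data);
  ArrayRef<int> Head = M.take_front(2);   // {0, 1}
  ArrayRef<int> Tail = M.drop_front(4);   // {4, 5}
  ArrayRef<int> Mid  = M.slice(1, 3);     // {1, 2, 3}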
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
Definition: Attributes.cpp:234
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:461
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:179
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
reverse_iterator rend()
Definition: BasicBlock.h:466
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
bool isEHPad() const
Return true if this basic block is an exception handling block.
Definition: BasicBlock.h:675
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition: Analysis.h:72
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1112
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
Definition: InstrTypes.h:1980
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Definition: InstrTypes.h:1875
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on...
Definition: InstrTypes.h:2117
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Definition: InstrTypes.h:1974
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
FunctionType * getFunctionType() const
Definition: InstrTypes.h:1199
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1277
unsigned arg_size() const
Definition: InstrTypes.h:1284
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
Definition: InstrTypes.h:1494
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Definition: InstrTypes.h:1971
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition: InstrTypes.h:444
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:661
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:980
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:702
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:697
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Definition: InstrTypes.h:825
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition: InstrTypes.h:787
Predicate getPredicate() const
Return the predicate for this instruction.
Definition: InstrTypes.h:763
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
Definition: CmpPredicate.h:22
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2307
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition: Constants.h:157
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
Definition: Constants.cpp:1472
static Constant * get(ArrayRef< Constant * > V)
Definition: Constants.cpp:1421
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:420
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:373
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:434
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
Definition: DataLayout.cpp:878
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:617
static bool shouldExecute(unsigned CounterName)
Definition: DebugCounter.h:87
An analysis that produces DemandedBits for a function.
Definition: DemandedBits.h:103
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: DenseMap.h:194
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition: DenseMap.h:226
bool erase(const KeyT &Val)
Definition: DenseMap.h:321
unsigned size() const
Definition: DenseMap.h:99
bool empty() const
Definition: DenseMap.h:98
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition: DenseMap.h:152
iterator end()
Definition: DenseMap.h:84
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
Definition: DenseMap.h:202
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:147
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: DenseMap.h:211
Implements a dense probed hash-table based set.
Definition: DenseSet.h:278
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
Definition: Dominators.cpp:321
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
Definition: Dominators.cpp:122
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
This instruction extracts a single (scalar) element from a VectorType value.
This instruction extracts a struct member or array element value from an aggregate value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
void set()
Definition: FMF.h:62
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:132
Type * getReturnType() const
Definition: DerivedTypes.h:126
bool empty() const
Definition: Function.h:859
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:113
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Definition: IRBuilder.h:1072
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2511
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition: IRBuilder.h:530
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1080
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2499
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:558
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1815
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition: IRBuilder.h:485
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1053
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:194
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition: IRBuilder.h:2574
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition: IRBuilder.h:2186
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:193
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:330
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:239
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1874
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:510
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Definition: IRBuilder.h:867
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1761
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
Definition: IRBuilder.cpp:889
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:900
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:505
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2404
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2435
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2152
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Definition: IRBuilder.cpp:881
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2533
ConstantInt * getFalse()
Get the constant value for i1 false.
Definition: IRBuilder.h:490
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2449
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1671
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:188
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2225
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:199
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1834
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2380
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition: IRBuilder.h:1614
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1404
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
Definition: IRBuilder.cpp:596
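A minimal sketch, with placeholder names (Builder, Scalars), of how a bundle of scalars is typically packed into a vector using the IRBuilder calls listed above:
  auto *VecTy = FixedVectorType::get(Scalars.front()->getType(), Scalars.size());
  Value *Vec = PoisonValue::get(VecTy);
  for (unsigned I = 0, E = Scalars.size(); I != E; ++I)
    Vec = Builder.CreateInsertElement(Vec, Scalars[I], Builder.getInt64(I));
  // A trailing CreateShuffleVector can then reorder or widen Vec as required.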
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2705
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool isCast() const
Definition: Instruction.h:283
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
Definition: Instruction.h:763
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:475
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
bool isBinaryOp() const
Definition: Instruction.h:279
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
bool isIntDivRem() const
Definition: Instruction.h:280
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
An instruction for reading from memory.
Definition: Instructions.h:176
Value * getPointerOperand()
Definition: Instructions.h:255
bool isSimple() const
Definition: Instructions.h:247
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:211
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
iterator end()
Definition: MapVector.h:71
VectorType takeVector()
Clear the MapVector and return the underlying vector.
Definition: MapVector.h:55
iterator find(const KeyT &Key)
Definition: MapVector.h:167
bool empty() const
Definition: MapVector.h:79
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
Definition: MapVector.h:118
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Definition: MapVector.h:141
ValueT lookup(const KeyT &Key) const
Definition: MapVector.h:110
size_type size() const
Definition: MapVector.h:60
std::pair< KeyT, ValueT > & front()
Definition: MapVector.h:83
void clear()
Definition: MapVector.h:88
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition: ArrayRef.h:310
T & front() const
front - Get the first element.
Definition: ArrayRef.h:366
iterator end() const
Definition: ArrayRef.h:360
iterator begin() const
Definition: ArrayRef.h:359
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition: ArrayRef.h:379
The optimization diagnostic interface.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
This is a MutableArrayRef that owns its array.
Definition: ArrayRef.h:452
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
Definition: Pass.h:94
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:686
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
Definition: PointerUnion.h:118
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
Definition: PointerUnion.h:142
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
Definition: PointerUnion.h:168
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
Definition: Constants.cpp:1878
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
void preserveSet()
Mark an analysis set as preserved.
Definition: Analysis.h:146
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience...
Definition: PriorityQueue.h:28
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may effect its v...
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition: SetVector.h:57
ArrayRef< value_type > getArrayRef() const
Definition: SetVector.h:84
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
const value_type & front() const
Return the first element of the SetVector.
Definition: SetVector.h:143
void clear()
Completely clear the SetVector.
Definition: SetVector.h:273
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
Definition: SetVector.h:254
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
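For example (small masks chosen for illustration):
  int Rev[] = {3, 2, 1, 0};
  bool IsRev = ShuffleVectorInst::isReverseMask(Rev, /*NumSrcElts=*/4);   // true
  int Ext[] = {2, 3};
  int Index;
  bool IsExt = ShuffleVectorInst::isExtractSubvectorMask(Ext, /*NumSrcElts=*/4, Index);
  // IsExt is true and Index is 2: the mask extracts the upper half.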
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
SmallBitVector & set()
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
SmallBitVector & reset()
bool none() const
Returns true if none of the bits are set.
Implements a dense probed hash-table based set with some number of buckets stored inline.
Definition: DenseSet.h:298
size_type size() const
Definition: SmallPtrSet.h:94
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
bool erase(PtrType Ptr)
Remove pointer from the set.
Definition: SmallPtrSet.h:401
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:452
iterator end() const
Definition: SmallPtrSet.h:477
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
iterator begin() const
Definition: SmallPtrSet.h:472
bool contains(ConstPtrType Ptr) const
Definition: SmallPtrSet.h:458
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:132
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition: SmallSet.h:175
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition: SmallSet.h:222
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:181
size_type size() const
Definition: SmallSet.h:170
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void assign(size_type NumElts, ValueParamT Elt)
Definition: SmallVector.h:704
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:937
void reserve(size_type N)
Definition: SmallVector.h:663
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:683
void swap(SmallVectorImpl &RHS)
Definition: SmallVector.h:968
void resize(size_type N)
Definition: SmallVector.h:638
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
Type * getPointerOperandType() const
Definition: Instructions.h:384
Value * getValueOperand()
Definition: Instructions.h:378
Value * getPointerOperand()
Definition: Instructions.h:381
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
TargetFolder - Create constants with target dependent folding.
Definition: TargetFolder.h:34
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
TypeSize getRegisterBitWidth(RegisterKind K) const
bool isLegalMaskedGather(Type *DataType, Align Alignment) const
Return true if the target supports masked gather.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, bool UseMaskForCond=false, bool UseMaskForGaps=false) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType=nullptr, TargetCostKind CostKind=TCK_SizeAndLatency) const
Estimate the cost of a GEP operation when lowered.
bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace) const
Return true if the target supports interleaved access for the given vector type VTy,...
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
Returns true if the target supports broadcasting a load to a vector of type <NumElements x ElementTy...
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of an extended reduction pattern, similar to getArithmeticReductionCost of a reduc...
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
bool forceScalarizeMaskedGather(VectorType *Type, Align Alignment) const
Return true if the target forces scalarizing of llvm.masked.gather intrinsics.
bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const
Return true if the target supports strided load.
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
OperandValueProperties
Additional properties of an operand's values.
InstructionCost getPointersChainCost(ArrayRef< const Value * > Ptrs, const Value *Base, const PointersChainInfo &Info, Type *AccessTy, TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Estimate the cost of a chain of pointers (typically pointer operands of a chain of loads or stores wi...
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool isTypeLegal(Type *Ty) const
Return true if this type is legal.
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const
InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
unsigned getMinVectorRegisterBitWidth() const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, const Instruction *I=nullptr) const
unsigned getNumberOfRegisters(unsigned ClassID) const
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, const SmallBitVector &OpcodeMask) const
Return true if this is an alternating opcode pattern that can be lowered to a single instruction on t...
bool isFPVectorizationPotentiallyUnsafe() const
Indicate that it is potentially unsafe to automatically vectorize floating-point operations because t...
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={}) const
Estimate the overhead of scalarizing an instruction.
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) const
unsigned getNumberOfParts(Type *Tp) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, Value *Op0=nullptr, Value *Op1=nullptr) const
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency) const
CastContextHint
Represents a hint about the context in which a cast is used.
@ Reversed
The cast is used with a reversed load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
@ GatherScatter
The cast is used with a gather/scatter.
OperandValueKind
Additional information about an operand's possible values.
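An illustrative cost comparison in the spirit of these queries; TTI is a placeholder TargetTransformInfo reference, Ctx a placeholder LLVMContext, and the pass's real cost model aggregates many more terms:
  Type *ScalarTy = Type::getFloatTy(Ctx);
  auto *VecTy = FixedVectorType::get(ScalarTy, 4);
  InstructionCost ScalarCost =
      TTI.getArithmeticInstrCost(Instruction::FAdd, ScalarTy,
                                 TargetTransformInfo::TCK_RecipThroughput);
  ScalarCost *= 4;   // four scalar adds versus one 4-wide vector add
  InstructionCost VecCost =
      TTI.getArithmeticInstrCost(Instruction::FAdd, VecTy,
                                 TargetTransformInfo::TCK_RecipThroughput);
  // Vectorization is attractive when VecCost - ScalarCost is low enough to beat -slp-threshold.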
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isX86_FP80Ty() const
Return true if this is x86 long double.
Definition: Type.h:159
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:243
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:264
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
Definition: Type.h:295
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
Definition: Type.h:165
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:267
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
TypeID getTypeID() const
Return the type id for the type.
Definition: Type.h:136
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1859
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:288
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Definition: User.cpp:21
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Definition: User.h:115
op_iterator op_begin()
Definition: User.h:280
Value * getOperand(unsigned i) const
Definition: User.h:228
unsigned getNumOperands() const
Definition: User.h:250
iterator_range< value_op_iterator > operand_values()
Definition: User.h:312
The Vector Function Database.
Definition: VectorUtils.h:31
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
Definition: VectorUtils.h:72
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition: Value.h:532
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition: Value.cpp:153
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition: Value.cpp:149
bool use_empty() const
Definition: Value.h:344
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1075
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:665
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:460
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
std::pair< iterator, bool > insert(const ValueT &V)
Definition: DenseSet.h:213
iterator find(const_arg_type_t< ValueT > V)
Definition: DenseSet.h:187
size_type size() const
Definition: DenseSet.h:81
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition: DenseSet.h:193
bool erase(const ValueT &V)
Definition: DenseSet.h:97
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
Definition: DenseSet.h:95
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
Definition: Hashing.h:75
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
CRTP base class for adapting an iterator to a different type.
Definition: iterator.h:237
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
Definition: raw_ostream.h:52
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
Definition: raw_ostream.h:661
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointer offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes...
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given loads sequence is known as not vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized load, a scatter-vectorized load, or just a simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the...
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the ...
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with narrower bitwidth at codegen and returns it signedn...
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions,...
unsigned canMapToVector(Type *T) const
Check if a homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and ...
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after minbitwidth analysis.
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty ord...
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates for the pair with the highest score...
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry + (possibl...
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
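Taken together, the BoUpSLP members listed above form a small driver loop: build the tree for a bundle of scalars, reorder it, compute a cost, and only then emit vector code. The sketch below is illustrative only; BoUpSLP is internal to SLPVectorizer.cpp, so this is not compilable outside that file, and the exact call sites, overloads and thresholds there are authoritative.
// Hedged sketch of the typical driver sequence; Threshold plays the role of
// an SLP cost threshold and is an assumption of this example.
static bool tryVectorizeBundleSketch(llvm::slpvectorizer::BoUpSLP &R,
                                     llvm::ArrayRef<llvm::Value *> VL,
                                     int Threshold) {
  llvm::SmallDenseSet<llvm::Value *> UserIgnoreList; // assumed empty here
  R.buildTree(VL, UserIgnoreList);
  if (R.isTreeTinyAndNotFullyVectorizable())
    return false;
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.buildExternalUses();
  R.computeMinimumValueSizes();
  llvm::InstructionCost Cost = R.getTreeCost();
  // A sufficiently negative cost means the vector form is cheaper than the
  // scalar form; only then is code actually emitted.
  if (!Cost.isValid() || Cost >= -Threshold)
    return false;
  R.vectorizeTree();
  return true;
}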
Function * getVectorizedFunction(const VFShape &Shape) const
Definition: VectorUtils.h:106
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ HorizontalReduction
Definition: ARMBaseInfo.h:425
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
Definition: BitmaskEnum.h:125
@ Entry
Definition: COFF.h:844
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Definition: Intrinsics.cpp:731
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:826
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:885
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:168
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
Definition: PatternMatch.h:299
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
Definition: PatternMatch.h:152
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
Definition: PatternMatch.h:239
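As a quick illustration of how the PatternMatch helpers listed above compose (this snippet is not taken from SLPVectorizer itself), the following matches an add of a loaded value and a constant and captures both pieces:
#include "llvm/ADT/APInt.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Returns true for "add (load %Ptr), C" and binds Ptr and C on success.
static bool matchAddOfLoadAndConstant(Value *V, Value *&Ptr, const APInt *&C) {
  return match(V, m_Add(m_Load(m_Value(Ptr)), m_APInt(C)));
}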
@ GS
Definition: X86.h:210
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path LLVM_LIFETIME_BOUND, Style style=Style::native)
Get begin iterator over path.
Definition: Path.cpp:226
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to...
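For example (a minimal sketch, assuming DL and SE are available from the enclosing pass), getPointersDiff can be used to test whether one load reads the element immediately after another:
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/IR/Instructions.h"
#include <optional>

using namespace llvm;

static bool areConsecutiveLoads(LoadInst *L0, LoadInst *L1,
                                const DataLayout &DL, ScalarEvolution &SE) {
  std::optional<int> Diff =
      getPointersDiff(L0->getType(), L0->getPointerOperand(), L1->getType(),
                      L1->getPointerOperand(), DL, SE, /*StrictCheck=*/true);
  // A distance of exactly one element means L1 directly follows L0 in memory.
  return Diff && *Diff == 1;
}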
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
Definition: LoopUtils.cpp:1278
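A hedged usage sketch (the builder's insertion point is assumed to be set up by the caller): createSimpleReduction emits the horizontal reduction of a whole vector, e.g. an integer add-reduction.
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/LoopUtils.h"

using namespace llvm;

static Value *emitAddReduction(IRBuilderBase &Builder, Value *VecSrc) {
  // For integer element types this lowers to llvm.vector.reduce.add.
  return createSimpleReduction(Builder, VecSrc, RecurKind::Add);
}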
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:854
void stable_sort(R &&Range)
Definition: STLExtras.h:2037
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1759
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1732
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
hash_code hash_value(const FixedPointSemantics &Val)
Definition: APFixedPoint.h:136
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:989
bool isEqual(const GCNRPTracker::LiveRegSet &S1, const GCNRPTracker::LiveRegSet &S2)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1697
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:546
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
Definition: ScopeExit.h:59
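Illustrative use of make_scope_exit: run a cleanup action when the current scope is left, regardless of which return path is taken.
#include "llvm/ADT/ScopeExit.h"

static void withTemporaryFlag(bool &Flag) {
  Flag = true;
  // The lambda runs when Cleanup goes out of scope, even on early returns.
  auto Cleanup = llvm::make_scope_exit([&Flag] { Flag = false; });
  // ... work that may return early ...
}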
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
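For instance (illustrative only), enumerate pairs each scalar in a bundle with its lane index:
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Debug.h"

using namespace llvm;

static void dumpLanes(ArrayRef<Value *> VL) {
  for (auto [Lane, Scalar] : enumerate(VL))
    dbgs() << "lane " << Lane << ": " << *Scalar << "\n";
}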
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
Definition: SetOperations.h:58
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
Definition: Verifier.cpp:7297
void salvageDebugInfo(const MachineRegisterInfo &MRI, MachineInstr &MI)
Assuming the instruction MI is going to be deleted, attempt to salvage debug users of MI by writing t...
Definition: Utils.cpp:1683
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A is contained in B (A is a subset of B).
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
Definition: STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value that is congruent to Skew modulo Align.
Definition: MathExtras.h:557
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition: bit.h:342
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1785
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition: MathExtras.h:396
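Worked values for the power-of-two helpers in this index (illustrative):
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

static void powerOfTwoExamples() {
  assert(llvm::bit_ceil(5u) == 8u);       // smallest power of two >= 5
  assert(llvm::PowerOf2Ceil(5) == 8u);    // same idea, uint64_t flavour
  assert(llvm::bit_floor(5u) == 4u);      // largest power of two <= 5
  assert(llvm::has_single_bit(8u));       // 8 is itself a power of two
  assert(llvm::Log2_32(32) == 5u);        // floor log base 2
}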
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
Definition: STLExtras.h:2107
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1952
constexpr bool has_single_bit(T Value) noexcept
Definition: bit.h:146
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition: Local.cpp:406
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:342
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:420
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
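Conceptually (a sketch of the idea, not the exact SLPVectorizer implementation), inverting a permutation builds a shuffle mask whose entry at position Indices[I] is I:
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"

static void inversePermutationSketch(llvm::ArrayRef<unsigned> Indices,
                                     llvm::SmallVectorImpl<int> &Mask) {
  Mask.assign(Indices.size(), -1);
  for (unsigned I = 0, E = Indices.size(); I != E; ++I)
    Mask[Indices[I]] = I; // e.g. Indices {2, 0, 1} -> Mask {1, 2, 0}
}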
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1664
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
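Both mask builders are easiest to understand from small worked examples (illustrative): createReplicatedMask(3, 2) produces {0, 0, 0, 1, 1, 1}, and createStrideMask(0, 2, 4), listed a few entries above, produces {0, 2, 4, 6}.
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
#include <cassert>

static void shuffleMaskExamples() {
  llvm::SmallVector<int, 16> Rep =
      llvm::createReplicatedMask(/*ReplicationFactor=*/3, /*VF=*/2);
  assert((Rep == llvm::SmallVector<int, 16>{0, 0, 0, 1, 1, 1}));
  llvm::SmallVector<int, 16> Strided =
      llvm::createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
  assert((Strided == llvm::SmallVector<int, 16>{0, 2, 4, 6}));
}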
auto find_if_not(R &&Range, UnaryPredicate P)
Definition: STLExtras.h:1771
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if widened type of Ty elements with size Sz represents full vector type,...
bool isPointerTy(const Type *T)
Definition: SPIRVUtils.h:256
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1753
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
Definition: Local.cpp:425
bool isModOrRefSet(const ModRefInfo MRI)
Definition: ModRef.h:42
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices,...
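A minimal sketch, assuming DL and SE come from the enclosing pass: sortPtrAccesses computes the increasing-address order of a bundle of pointers sharing the element type ElemTy.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"

using namespace llvm;

static bool computePointerOrder(ArrayRef<Value *> Ptrs, Type *ElemTy,
                                const DataLayout &DL, ScalarEvolution &SE,
                                SmallVectorImpl<unsigned> &Order) {
  Order.clear();
  // On success, Order is the permutation that sorts Ptrs by address; by
  // convention an empty Order means the pointers are already sorted.
  return sortPtrAccesses(Ptrs, ElemTy, DL, SE, Order);
}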
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition: Casting.h:548
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
Definition: LoopUtils.cpp:1368
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:405
@ Other
Any other memory.
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
TargetTransformInfo TTI
CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
Definition: LoopUtils.cpp:1054
RecurKind
These are the kinds of recurrences that we support.
Definition: IVDescriptors.h:33
@ Or
Bitwise or logical OR of integers.
@ None
Not a recurrence.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:155
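Worked arithmetic for the alignment helpers in this index (illustrative): divideCeil(10, 4) == 3, alignTo(10, Align(4)) == 12, and alignDown(10, 4) == 8.
#include "llvm/Support/Alignment.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

static void alignmentExamples() {
  assert(llvm::divideCeil(10, 4) == 3);
  assert(llvm::alignTo(10, llvm::Align(4)) == 12);
  assert(llvm::alignDown(10, 4) == 8);
}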
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition: STLExtras.h:1938
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition: STLExtras.h:2014
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
Definition: GraphWriter.h:427
OutputIt copy(R &&Range, OutputIt Out)
Definition: STLExtras.h:1841
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1945
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1766
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
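For illustration, the range helpers above replace explicit begin/end loops; e.g. checking that every scalar in a bundle is a load and that the bundle contains no duplicates (all_of and count are used here):
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

static bool isLoadBundleWithoutDuplicates(ArrayRef<Value *> VL) {
  if (!all_of(VL, [](Value *V) { return isa<LoadInst>(V); }))
    return false;
  // count() is linear, so this check is quadratic; fine for small bundles.
  return all_of(VL, [&VL](Value *V) { return count(VL, V) == 1; });
}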
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
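As an illustrative use (not the pass's exact minimum-bitwidth logic): the sign-bit count bounds how far a value can be truncated while preserving its signed value.
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"

using namespace llvm;

static unsigned minSignedBitsNeeded(const Value *V, const DataLayout &DL) {
  unsigned TotalBits = V->getType()->getScalarSizeInBits();
  // A 32-bit value with 25 redundant sign bits fits in 32 - 25 + 1 = 8 bits.
  return TotalBits - ComputeNumSignBits(V, DL) + 1;
}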
InstructionCost Cost
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition: Sequence.h:305
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition: Hashing.h:590
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition: bit.h:327
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrowe...
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the given value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through...
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
Definition: VectorUtils.cpp:46
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition: Hashing.h:468
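Illustrative only: combining a few fields and a value list into one hash key, the typical pattern for caching already-analyzed bundles.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/IR/Value.h"

static llvm::hash_code hashBundle(unsigned Opcode,
                                  llvm::ArrayRef<llvm::Value *> VL) {
  return llvm::hash_combine(
      Opcode, llvm::hash_combine_range(VL.begin(), VL.end()));
}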
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Used to keep track of an operand bundle.
Definition: InstrTypes.h:2138
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot...
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits ...
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
static nodes_iterator nodes_end(BoUpSLP *R)
Incoming for lane mask phi as machine instruction; incoming register Reg and incoming block Block are...
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:215
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Describe known properties for a set of pointers.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKin...
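A hedged sketch of how VFShape::get pairs with VFDatabase::getVectorizedFunction (listed earlier in this index) to look up a vector variant of a call; the VF of 4 here is an arbitrary example.
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

static Function *findVectorVariant(CallInst &CI) {
  VFShape Shape = VFShape::get(CI.getFunctionType(),
                               ElementCount::getFixed(4),
                               /*HasGlobalPred=*/false);
  // Returns nullptr if no vector mapping with this shape is available.
  return VFDatabase(CI).getVectorizedFunction(Shape);
}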
Function object to check whether the first component of a container supported by std::get (like std::...
Definition: STLExtras.h:1467
Function object to check whether the second component of a container supported by std::get (like std:...
Definition: STLExtras.h:1476
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.