#ifdef EXPENSIVE_CHECKS
#include "llvm/IR/Verifier.h"
#endif

using namespace slpvectorizer;
using namespace std::placeholders;

#define SV_NAME "slp-vectorizer"
#define DEBUG_TYPE "SLP"

STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
115 "Controls which SLP graphs should be vectorized.");
119 cl::desc(
"Run the SLP vectorization passes"));
123 cl::desc(
"Enable vectorization for wider vector utilization"));
127 cl::desc(
"Only vectorize if you gain more than this "
132 cl::desc(
"When true, SLP vectorizer bypasses profitability checks based on "
133 "heuristics and makes vectorization decision via cost modeling."));
137 cl::desc(
"Attempt to vectorize horizontal reductions"));
142 "Attempt to vectorize horizontal reductions feeding into a store"));
146 cl::desc(
"Attempt to vectorize for this register size in bits"));
150 cl::desc(
"Maximum SLP vectorization factor (0=unlimited)"));
158 cl::desc(
"Limit the size of the SLP scheduling region per block"));
162 cl::desc(
"Attempt to vectorize for this register size in bits"));
166 cl::desc(
"Limit the recursion depth when building a vectorizable tree"));
170 cl::desc(
"Only vectorize small trees if they are fully vectorizable"));
176 cl::desc(
"The maximum look-ahead depth for operand reordering scores"));
185 cl::desc(
"The maximum look-ahead depth for searching best rooting option"));
189 cl::desc(
"The minimum number of loads, which should be considered strided, "
190 "if the stride is > 1 or is runtime value"));
194 cl::desc(
"The maximum stride, considered to be profitable."));
198 cl::desc(
"Display the SLP trees with Graphviz"));
202 cl::desc(
"Try to vectorize with non-power-of-2 number of elements."));
  if (SLPReVec && isa<FixedVectorType>(Ty))
    Ty = Ty->getScalarType();
  return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
         !Ty->isPPC_FP128Ty();
  if (auto *SI = dyn_cast<StoreInst>(V))
    return SI->getValueOperand()->getType();
  if (auto *CI = dyn_cast<CmpInst>(V))
    return CI->getOperand(0)->getType();
  if (auto *IE = dyn_cast<InsertElementInst>(V))
    return IE->getOperand(1)->getType();
  assert(!isa<ScalableVectorType>(Ty) &&
         "ScalableVectorType is not supported.");
  if (auto *VecTy = dyn_cast<FixedVectorType>(Ty))
    return VecTy->getNumElements();
                                              Type *Ty, unsigned Sz) {
  if (NumParts == 0 || NumParts >= Sz)
    return bit_ceil(Sz);

  if (NumParts == 0 || NumParts >= Sz)
    return bit_floor(Sz);

  return (Sz / RegVF) * RegVF;
  for (unsigned I : seq<unsigned>(Mask.size()))
    for (auto [J, MaskV] : enumerate(MutableArrayRef(NewMask).slice(
             I * VecTyNumElements, VecTyNumElements)))
      MaskV = Mask[I] == PoisonMaskElem ? PoisonMaskElem
                                        : Mask[I] * VecTyNumElements + J;
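  // Example: with Mask = {1, 0} and VecTyNumElements = 2, the expanded mask
  // is {2, 3, 0, 1} -- each scalar index I becomes the run of vector-element
  // indices Mask[I] * VecTyNumElements + J for J in [0, VecTyNumElements).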
  if (!all_of(VL, IsaPred<ShuffleVectorInst>))
    return 0;
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned ShuffleMaskSize = SV->getShuffleMask().size();
  if (SVNumElements % ShuffleMaskSize != 0)
    return 0;
  unsigned GroupSize = SVNumElements / ShuffleMaskSize;
  if (GroupSize == 0 || (VL.size() % GroupSize) != 0)
    return 0;
  unsigned NumGroup = 0;
  for (size_t I = 0, E = VL.size(); I != E; I += GroupSize) {
    auto *SV = cast<ShuffleVectorInst>(VL[I]);
    Value *Src = SV->getOperand(0);

          auto *SV = cast<ShuffleVectorInst>(V);
          if (SV->getOperand(0) != Src)
            return false;
          if (!SV->isExtractSubvectorMask(Index))
            return false;
          ExpectedIndex.set(Index / ShuffleMaskSize);

    if (!ExpectedIndex.all())
      return 0;

  assert(NumGroup == (VL.size() / GroupSize) &&
         "Unexpected number of groups");
  auto *SV = cast<ShuffleVectorInst>(VL.front());
  unsigned SVNumElements =
      cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
  unsigned AccumulateLength = 0;
  for (Value *V : VL) {
    auto *SV = cast<ShuffleVectorInst>(V);
    for (int M : SV->getShuffleMask())
      Mask.push_back(M == PoisonMaskElem ? PoisonMaskElem
                                         : AccumulateLength + M);
    AccumulateLength += SVNumElements;
  }
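  // Example: for two shufflevectors reading from <4 x i32> sources with
  // masks {0, 1} and {0, 1}, the combined mask is {0, 1, 4, 5} -- each mask
  // is offset by the accumulated width of the sources already consumed.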
  return isa<Constant>(V) && !isa<ConstantExpr, GlobalValue>(V);
  if (!isa<InsertElementInst, ExtractElementInst>(V) &&
      !isa<ExtractValueInst, UndefValue>(V))
    return false;
  auto *I = dyn_cast<Instruction>(V);
  if (!I || isa<ExtractValueInst>(I))
    return true;
  if (!isa<FixedVectorType>(I->getOperand(0)->getType()))
    return false;
  if (isa<ExtractElementInst>(I))
    return isConstant(I->getOperand(1));
  assert(isa<InsertElementInst>(V) && "Expected only insertelement.");
  return std::min<unsigned>(PartNumElems, Size - Part * PartNumElems);
  OS << "Idx: " << Idx << ", ";
  OS << "n=" << VL.size() << " [" << *VL.front() << ", ..]";
  auto *It = find_if(VL, IsaPred<Instruction>);

    if (isa<PoisonValue>(V))
      continue;
    auto *II = dyn_cast<Instruction>(V);

    if (BB != II->getParent())
      return false;
  Value *FirstNonUndef = nullptr;
  for (Value *V : VL) {
    if (isa<UndefValue>(V))
      continue;
    if (!FirstNonUndef) {
      FirstNonUndef = V;
      continue;
    }
    if (V != FirstNonUndef)
      return false;
  }
  return FirstNonUndef != nullptr;
  if (auto *Cmp = dyn_cast<CmpInst>(I))
    return Cmp->isCommutative();
  if (auto *BO = dyn_cast<BinaryOperator>(I))
    return BO->isCommutative() ||
           (BO->getOpcode() == Instruction::Sub &&

                  if (match(U.getUser(),
                            m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
                      (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE))
                    return true;

                  return match(U.getUser(),
                               m_Intrinsic<Intrinsic::abs>(
                                   m_Specific(U.get()), m_ConstantInt(Flag))) &&
                         (!cast<Instruction>(U.get())->hasNoSignedWrap() ||

           (BO->getOpcode() == Instruction::FSub &&

              return match(U.getUser(),
                           m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get())));

  return I->isCommutative();
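// Example: a 'sub' is treated as commutative when every use is either
// 'icmp eq/ne (sub x, y), 0' or an 'abs' call that tolerates a sign flip,
// because swapping the operands only negates the result, which those users
// cannot observe.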
  static_assert(std::is_same_v<T, InsertElementInst> ||
                    std::is_same_v<T, ExtractElementInst>,
                "unsupported T");
  if (const auto *IE = dyn_cast<T>(Inst)) {
    const auto *VT = dyn_cast<FixedVectorType>(IE->getType());

    const auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2));

    if (CI->getValue().uge(VT->getNumElements()))
      return std::nullopt;
    Index *= VT->getNumElements();
    Index += CI->getZExtValue();
  }

  if (auto Index = getInsertExtractIndex<InsertElementInst>(Inst, Offset))
    return Index;
  if (auto Index = getInsertExtractIndex<ExtractElementInst>(Inst, Offset))
    return Index;

  const auto *IV = dyn_cast<InsertValueInst>(Inst);

  Type *CurrentType = IV->getType();
  for (unsigned I : IV->indices()) {
    if (const auto *ST = dyn_cast<StructType>(CurrentType)) {
      Index *= ST->getNumElements();
      CurrentType = ST->getElementType(I);
    } else if (const auto *AT = dyn_cast<ArrayType>(CurrentType)) {
      Index *= AT->getNumElements();
      CurrentType = AT->getElementType();
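    // Example: the loop linearizes nested aggregate indices row-major. For an
    // insertvalue into [4 x [2 x i32]] at indices {1, 0}, the running index is
    // scaled by 4 and then by 2, yielding flat element 1 * 2 + 0 = 2 (the
    // per-level addition of I is elided above).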
    if (MaskArg == UseMask::UndefsAsMask)

    if (MaskArg == UseMask::FirstArg && Value < VF)
      UseMask.reset(Value);
    else if (MaskArg == UseMask::SecondArg && Value >= VF)
      UseMask.reset(Value - VF);
template <bool IsPoisonOnly = false>

  using T = std::conditional_t<IsPoisonOnly, PoisonValue, UndefValue>;

  auto *VecTy = dyn_cast<FixedVectorType>(V->getType());

  auto *C = dyn_cast<Constant>(V);

    if (!UseMask.empty()) {
      while (auto *II = dyn_cast<InsertElementInst>(Base)) {
        if (isa<T>(II->getOperand(1)))

        if (*Idx < UseMask.size() && !UseMask.test(*Idx))

      Res &= isUndefVector<IsPoisonOnly>(Base, SubMask);

    for (unsigned I = 0, E = VecTy->getNumElements(); I != E; ++I) {
      if (Constant *Elem = C->getAggregateElement(I))
          (UseMask.empty() || (I < UseMask.size() && !UseMask.test(I))))
static std::optional<TargetTransformInfo::ShuffleKind>

  const auto *It = find_if(VL, IsaPred<ExtractElementInst>);

      std::accumulate(VL.begin(), VL.end(), 0u, [](unsigned S, Value *V) {
        auto *EI = dyn_cast<ExtractElementInst>(V);

        auto *VTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());

        return std::max(S, VTy->getNumElements());
      });

  Value *Vec1 = nullptr;
  Value *Vec2 = nullptr;

    auto *EE = dyn_cast<ExtractElementInst>(V);

    Value *Vec = EE->getVectorOperand();
    if (isa<UndefValue>(Vec))

  ShuffleMode CommonShuffleMode = Unknown;

  for (unsigned I = 0, E = VL.size(); I < E; ++I) {
    if (isa<UndefValue>(VL[I]))
      continue;
    auto *EI = cast<ExtractElementInst>(VL[I]);
    if (isa<ScalableVectorType>(EI->getVectorOperandType()))
      return std::nullopt;
    auto *Vec = EI->getVectorOperand();
    if (isUndefVector</*isPoisonOnly=*/true>(Vec).all())
      continue;
    if (isa<UndefValue>(Vec)) {

      if (isa<UndefValue>(EI->getIndexOperand()))
        continue;
      auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());

      unsigned IntIdx = Idx->getValue().getZExtValue();

    if (!Vec1 || Vec1 == Vec) {

    } else if (!Vec2 || Vec2 == Vec) {

    if (CommonShuffleMode == Permute)
      continue;
    if (Mask[I] % Size != I) {
      CommonShuffleMode = Permute;
      continue;
    }
    CommonShuffleMode = Select;
  }
  if (CommonShuffleMode == Select && Vec2)
    return TargetTransformInfo::SK_Select;
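// Example: extracts {a[0], a[1], b[2], b[3]} keep every element in its own
// lane (Mask[I] % Size == I for all I), so the pattern is classified as a
// select/blend of two vectors; any lane-crossing index, such as a[1] landing
// in lane 0, demotes the whole sequence to a permute.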
  assert((Opcode == Instruction::ExtractElement ||
          Opcode == Instruction::ExtractValue) &&
         "Expected extractelement or extractvalue instruction.");
  if (Opcode == Instruction::ExtractElement) {
    auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
    if (!CI)
      return std::nullopt;
    return CI->getZExtValue();
  }
  auto *EI = cast<ExtractValueInst>(E);
  if (EI->getNumIndices() != 1)
    return std::nullopt;
  return *EI->idx_begin();
class InstructionsState {

  Instruction *getMainOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return MainOp;
  }

  Instruction *getAltOp() const {
    assert(valid() && "InstructionsState is invalid.");
    return AltOp;
  }

  unsigned getOpcode() const { return getMainOp()->getOpcode(); }

  unsigned getAltOpcode() const { return getAltOp()->getOpcode(); }

  bool isAltShuffle() const { return getMainOp() != getAltOp(); }

  bool isOpcodeOrAlt(Instruction *I) const {
    unsigned CheckedOpcode = I->getOpcode();
    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
  }

  bool valid() const { return MainOp && AltOp; }

  explicit operator bool() const { return valid(); }

  InstructionsState() = delete;
  InstructionsState(Instruction *MainOp, Instruction *AltOp)
      : MainOp(MainOp), AltOp(AltOp) {}
  static InstructionsState invalid() { return {nullptr, nullptr}; }
};
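// Example: for the bundle {add, sub, add, sub}, MainOp is the first add and
// AltOp the first sub, so isAltShuffle() is true and the bundle can be
// emitted as a vector add and a vector sub blended together by a shuffle.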
         (!isa<Instruction>(BaseOp0) && !isa<Instruction>(Op0) &&
          !isa<Instruction>(BaseOp1) && !isa<Instruction>(Op1)) ||
         BaseOp0 == Op0 || BaseOp1 == Op1 ||

         "Assessing comparisons of different types?");

  return (BasePred == Pred &&

         (BasePred == SwappedPred &&
  if (!all_of(VL, IsaPred<Instruction, PoisonValue>))
    return InstructionsState::invalid();

  auto *It = find_if(VL, IsaPred<Instruction>);
  if (It == VL.end())
    return InstructionsState::invalid();

  unsigned InstCnt = std::count_if(It, VL.end(), IsaPred<Instruction>);
  if ((VL.size() > 2 && !isa<PHINode>(MainOp) && InstCnt < VL.size() / 2) ||
      (VL.size() == 2 && InstCnt < 2))
    return InstructionsState::invalid();

  bool IsCastOp = isa<CastInst>(MainOp);
  bool IsBinOp = isa<BinaryOperator>(MainOp);
  bool IsCmpOp = isa<CmpInst>(MainOp);

  unsigned AltOpcode = Opcode;

  bool SwappedPredsCompatible = IsCmpOp && [&]() {
    UniquePreds.insert(BasePred);
    UniqueNonSwappedPreds.insert(BasePred);
    for (Value *V : VL) {
      auto *I = dyn_cast<CmpInst>(V);

      UniqueNonSwappedPreds.insert(CurrentPred);
      if (!UniquePreds.contains(CurrentPred) &&
          !UniquePreds.contains(SwappedCurrentPred))
        UniquePreds.insert(CurrentPred);
    }
    return UniqueNonSwappedPreds.size() > 2 && UniquePreds.size() == 2;

  if (auto *CallBase = dyn_cast<CallInst>(MainOp)) {
      return InstructionsState::invalid();

  bool AnyPoison = InstCnt != VL.size();

    auto *I = dyn_cast<Instruction>(V);

    if (AnyPoison && (I->isIntDivRem() || I->isFPDivRem() || isa<CallInst>(I)))
      return InstructionsState::invalid();
    unsigned InstOpcode = I->getOpcode();
    if (IsBinOp && isa<BinaryOperator>(I)) {
      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;

      AltOpcode = InstOpcode;

    } else if (IsCastOp && isa<CastInst>(I)) {
      Value *Op1 = I->getOperand(0);

      if (InstOpcode == Opcode || InstOpcode == AltOpcode)
        continue;
      if (Opcode == AltOpcode) {
               "Cast isn't safe for alternation, logic needs to be updated!");
        AltOpcode = InstOpcode;
      }

    } else if (auto *Inst = dyn_cast<CmpInst>(I); Inst && IsCmpOp) {
      auto *BaseInst = cast<CmpInst>(MainOp);
      Type *Ty0 = BaseInst->getOperand(0)->getType();
      Type *Ty1 = Inst->getOperand(0)->getType();

        assert(InstOpcode == Opcode && "Expected same CmpInst opcode.");
        assert(InstOpcode == AltOpcode &&
               "Alternate instructions are only supported by BinaryOperator "

        if ((VL.size() == 2 || SwappedPredsCompatible) &&
            (BasePred == CurrentPred || BasePred == SwappedCurrentPred))
          continue;

        auto *AltInst = cast<CmpInst>(AltOp);
        if (MainOp != AltOp) {

        } else if (BasePred != CurrentPred) {
                 "CmpInst isn't safe for alternation, logic needs to be updated!");
        }

        if (BasePred == CurrentPred || BasePred == SwappedCurrentPred ||
            AltPred == CurrentPred || AltPred == SwappedCurrentPred)
          continue;

    } else if (InstOpcode == Opcode) {
      assert(InstOpcode == AltOpcode &&
             "Alternate instructions are only supported by BinaryOperator and "

      if (auto *Gep = dyn_cast<GetElementPtrInst>(I)) {
        if (Gep->getNumOperands() != 2 ||
          return InstructionsState::invalid();
      } else if (auto *EI = dyn_cast<ExtractElementInst>(I)) {
          return InstructionsState::invalid();
      } else if (auto *LI = dyn_cast<LoadInst>(I)) {
        auto *BaseLI = cast<LoadInst>(MainOp);
        if (!LI->isSimple() || !BaseLI->isSimple())
          return InstructionsState::invalid();
      } else if (auto *Call = dyn_cast<CallInst>(I)) {
        auto *CallBase = cast<CallInst>(MainOp);
          return InstructionsState::invalid();
        if (Call->hasOperandBundles() &&
            !std::equal(Call->op_begin() + Call->getBundleOperandsStartIndex(),
                        Call->op_begin() + Call->getBundleOperandsEndIndex(),
          return InstructionsState::invalid();

          return InstructionsState::invalid();
        if (Mappings.size() != BaseMappings.size() ||
            Mappings.front().ISA != BaseMappings.front().ISA ||
            Mappings.front().ScalarName != BaseMappings.front().ScalarName ||
            Mappings.front().VectorName != BaseMappings.front().VectorName ||
            Mappings.front().Shape.VF != BaseMappings.front().Shape.VF ||
            Mappings.front().Shape.Parameters !=
                BaseMappings.front().Shape.Parameters)
          return InstructionsState::invalid();
      }
      continue;
    }
    return InstructionsState::invalid();
  }
  return InstructionsState(MainOp, AltOp);
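// Example: a bundle {add, add, sub, add} yields MainOp = add with
// AltOp = sub (an alternate-opcode bundle), while mixing, say, a load into
// the same bundle makes the whole state InstructionsState::invalid().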
  unsigned Opcode = UserInst->getOpcode();

  case Instruction::Load: {
    LoadInst *LI = cast<LoadInst>(UserInst);
    return (LI->getPointerOperand() == Scalar);
  }
  case Instruction::Store: {
    StoreInst *SI = cast<StoreInst>(UserInst);
    return (SI->getPointerOperand() == Scalar);
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(UserInst);

      return isVectorIntrinsicWithScalarOpAtArg(ID, Arg.index(), TTI) &&
             Arg.value().get() == Scalar;
  if (LoadInst *LI = dyn_cast<LoadInst>(I))

  if (LoadInst *LI = dyn_cast<LoadInst>(I))
    return LI->isSimple();
  if (StoreInst *SI = dyn_cast<StoreInst>(I))
    return SI->isSimple();
  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
    return !MI->isVolatile();
static void addMask(SmallVectorImpl<int> &Mask, ArrayRef<int> SubMask,
                    bool ExtendingManyInputs = false) {
  if (SubMask.empty())
    return;
  assert(
      (!ExtendingManyInputs || SubMask.size() > Mask.size() ||

      "SubMask with many inputs support must be larger than the mask.");

  Mask.append(SubMask.begin(), SubMask.end());

  int TermValue = std::min(Mask.size(), SubMask.size());
  for (int I = 0, E = SubMask.size(); I < E; ++I) {
    if (SubMask[I] == PoisonMaskElem ||
        (!ExtendingManyInputs &&
         (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue)))
      continue;
    NewMask[I] = Mask[SubMask[I]];
  }
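// Example of composition: Mask = {3, 2, 1, 0} followed by SubMask =
// {1, 0, 3, 2} gives NewMask[I] = Mask[SubMask[I]] = {2, 3, 0, 1}, i.e. the
// sub-mask permutes the results of the previous shuffle.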
  const unsigned Sz = Order.size();

  for (unsigned I = 0; I < Sz; ++I) {
      UnusedIndices.reset(Order[I]);

      MaskedIndices.set(I);
  }
  if (MaskedIndices.none())
    return;

         "Non-synced masked/available indices.");

    assert(Idx >= 0 && "Indices must be synced.");
  Type *ScalarTy = VL[0]->getType();

  for (unsigned Lane : seq<unsigned>(VL.size())) {
    if (isa<PoisonValue>(VL[Lane]))
      continue;
    if (cast<Instruction>(VL[Lane])->getOpcode() == Opcode1)
      OpcodeMask.set(Lane * ScalarTyNumElements,
                     Lane * ScalarTyNumElements + ScalarTyNumElements);
  }
  const unsigned E = Indices.size();

  for (unsigned I = 0; I < E; ++I)
    Mask[Indices[I]] = I;
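// Example: inverting the permutation Indices = {2, 0, 1} produces
// Mask = {1, 2, 0}, since Mask[Indices[I]] = I for each position I.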
  assert(!Mask.empty() && "Expected non-empty mask.");

  for (unsigned I = 0, E = Prev.size(); I < E; ++I)
      Scalars[Mask[I]] = Prev[I];
  auto *I = dyn_cast<Instruction>(V);

    auto *IO = dyn_cast<Instruction>(V);

    return isa<PHINode>(IO) || IO->getParent() != I->getParent();

  auto *I = dyn_cast<Instruction>(V);

  return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) &&

           auto *IU = dyn_cast<Instruction>(U);

           return IU->getParent() != I->getParent() || isa<PHINode>(IU);

  return !VL.empty() &&

  return NumParts > 0 && NumParts < Sz && has_single_bit(Sz / NumParts) &&
namespace slpvectorizer {

  struct ScheduleData;

      : BatchAA(*Aa), F(Func), SE(Se), TTI(Tti), TLI(TLi), LI(Li), DT(Dt),
        AC(AC), DB(DB), DL(DL), ORE(ORE),
    return !VectorizableTree.empty() &&
           !VectorizableTree.front()->UserTreeIndices.empty();

    assert(!VectorizableTree.empty() && "No graph to get the first node from");
    return VectorizableTree.front()->Scalars;

    const TreeEntry &Root = *VectorizableTree.front().get();
    if (Root.State != TreeEntry::Vectorize || Root.isAltShuffle() ||
        !Root.Scalars.front()->getType()->isIntegerTy())
      return std::nullopt;
    auto It = MinBWs.find(&Root);
    if (It != MinBWs.end())

    if (Root.getOpcode() == Instruction::ZExt ||
        Root.getOpcode() == Instruction::SExt)
      return std::make_pair(cast<CastInst>(Root.getMainOp())->getSrcTy(),
                            Root.getOpcode() == Instruction::SExt);
    return std::nullopt;

    return MinBWs.at(VectorizableTree.front().get()).second;

    if (ReductionBitWidth == 0 ||
        !VectorizableTree.front()->Scalars.front()->getType()->isIntegerTy() ||
        ReductionBitWidth >=
            DL->getTypeSizeInBits(
                VectorizableTree.front()->Scalars.front()->getType()))
          VectorizableTree.front()->Scalars.front()->getType(),
          VectorizableTree.front()->getVectorFactor());

          VectorizableTree.front()->Scalars.front()->getContext(),
          VectorizableTree.front()->getVectorFactor());
    VectorizableTree.clear();
    ScalarToTreeEntry.clear();
    MultiNodeScalars.clear();
    NonScheduledFirst.clear();
    EntryToLastInstruction.clear();
    LoadEntriesToVectorize.clear();
    IsGraphTransformMode = false;
    GatheredLoadsEntriesFirst.reset();
    ExternalUses.clear();
    ExternalUsesAsOriginalScalar.clear();
    for (auto &Iter : BlocksSchedules) {
      BlockScheduling *BS = Iter.second.get();
    }
    ReductionBitWidth = 0;
    CastMaxMinBWSizes.reset();
    ExtraBitWidthNodes.clear();
    InstrElementSize.clear();
    UserIgnoreList = nullptr;
    PostponedGathers.clear();
    ValueToGatherNodes.clear();
    assert(!Order.empty() && "expected non-empty order");
    const unsigned Sz = Order.size();

      return P.value() == P.index() || P.value() == Sz;

    return MaxVecRegSize;

    return MinVecRegSize;

    unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
    return MaxVF ? MaxVF : UINT_MAX;

                       unsigned *BestVF = nullptr,
                       bool TryRecursiveCheck = true) const;
  template <typename T>

      OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
         << " EdgeIdx:" << EdgeIdx << "}";

        : TLI(TLI), DL(DL), SE(SE), R(R), NumLanes(NumLanes),
          MaxLevel(MaxLevel) {}
      if (isa<LoadInst>(V1)) {
        auto AllUsersAreInternal = [U1, U2, this](Value *V1, Value *V2) {
          auto AllUsersVectorized = [U1, U2, this](Value *V) {
              return U == U1 || U == U2 || R.getTreeEntry(U) != nullptr;
          };
          return AllUsersVectorized(V1) && AllUsersVectorized(V2);
        };
        if (R.TTI->isLegalBroadcastLoad(V1->getType(),
            ((int)V1->getNumUses() == NumLanes ||
             AllUsersAreInternal(V1, V2)))
      }

      auto CheckSameEntryOrFail = [&]() {
        if (const TreeEntry *TE1 = R.getTreeEntry(V1);
            TE1 && TE1 == R.getTreeEntry(V2))
      };

      auto *LI1 = dyn_cast<LoadInst>(V1);
      auto *LI2 = dyn_cast<LoadInst>(V2);

        if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
          return CheckSameEntryOrFail();

            LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
            LI2->getPointerOperand(), DL, SE, true);
        if (!Dist || *Dist == 0) {
              R.TTI->isLegalMaskedGather(
          return CheckSameEntryOrFail();
        }
        if (std::abs(*Dist) > NumLanes / 2)

      auto *C1 = dyn_cast<Constant>(V1);
      auto *C2 = dyn_cast<Constant>(V2);

        if (isa<UndefValue>(V2))
        Value *EV2 = nullptr;
        int Dist = Idx2 - Idx1;
        if (std::abs(Dist) == 0)
        if (std::abs(Dist) > NumLanes / 2)
        return CheckSameEntryOrFail();

      auto *I1 = dyn_cast<Instruction>(V1);
      auto *I2 = dyn_cast<Instruction>(V2);

        if (I1->getParent() != I2->getParent())
          return CheckSameEntryOrFail();

            (S.getMainOp()->getNumOperands() <= 2 || !MainAltOps.empty() ||
             !S.isAltShuffle()) &&
              return isa<PoisonValue>(V) ||
                     cast<Instruction>(V)->getNumOperands() ==
                         S.getMainOp()->getNumOperands();

      if (I1 && isa<PoisonValue>(V2))

      if (isa<UndefValue>(V2))

      return CheckSameEntryOrFail();
      int ShallowScoreAtThisLevel =

      auto *I1 = dyn_cast<Instruction>(LHS);
      auto *I2 = dyn_cast<Instruction>(RHS);
      if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
          (((isa<LoadInst>(I1) && isa<LoadInst>(I2)) ||
            (I1->getNumOperands() > 2 && I2->getNumOperands() > 2) ||
            (isa<ExtractElementInst>(I1) && isa<ExtractElementInst>(I2))) &&
           ShallowScoreAtThisLevel))
        return ShallowScoreAtThisLevel;
      assert(I1 && I2 && "Should have early exited.");

      for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
           OpIdx1 != NumOperands1; ++OpIdx1) {
        int MaxTmpScore = 0;
        unsigned MaxOpIdx2 = 0;
        bool FoundBest = false;

            ? I2->getNumOperands()
            : std::min(I2->getNumOperands(), OpIdx1 + 1);
        assert(FromIdx <= ToIdx && "Bad index");
        for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
          if (Op2Used.count(OpIdx2))
            continue;

              I1, I2, CurrLevel + 1, {});
              TmpScore > MaxTmpScore) {
            MaxTmpScore = TmpScore;
          }
        }

        Op2Used.insert(MaxOpIdx2);
        ShallowScoreAtThisLevel += MaxTmpScore;
      }
      return ShallowScoreAtThisLevel;
    struct OperandData {
      OperandData() = default;
      OperandData(Value *V, bool APO, bool IsUsed)
          : V(V), APO(APO), IsUsed(IsUsed) {}

      bool IsUsed = false;
    };

    enum class ReorderingMode {

    unsigned ArgSize = 0;

    const Loop *L = nullptr;
    OperandData &getData(unsigned OpIdx, unsigned Lane) {
      return OpsVec[OpIdx][Lane];
    }

    const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
      return OpsVec[OpIdx][Lane];
    }

      for (unsigned OpIdx = 0, NumOperands = getNumOperands();
           OpIdx != NumOperands; ++OpIdx)
        for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
             ++Lane)
          OpsVec[OpIdx][Lane].IsUsed = false;

    void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
      std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
    }
    int getSplatScore(unsigned Lane, unsigned OpIdx, unsigned Idx,
      Value *IdxLaneV = getData(Idx, Lane).V;
      if (!isa<Instruction>(IdxLaneV) || IdxLaneV == getData(OpIdx, Lane).V ||
          isa<ExtractElementInst>(IdxLaneV))

      for (unsigned Ln : seq<unsigned>(getNumLanes())) {

        Value *OpIdxLnV = getData(OpIdx, Ln).V;
        if (!isa<Instruction>(OpIdxLnV))

      unsigned UniquesCount = Uniques.size();
      auto IdxIt = Uniques.find(IdxLaneV);
      unsigned UniquesCntWithIdxLaneV =
          IdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;
      auto OpIdxIt = Uniques.find(OpIdxLaneV);
      unsigned UniquesCntWithOpIdxLaneV =
          OpIdxIt != Uniques.end() ? UniquesCount : UniquesCount + 1;
      if (UniquesCntWithIdxLaneV == UniquesCntWithOpIdxLaneV)

      return std::min(bit_ceil(UniquesCntWithOpIdxLaneV) -
                          UniquesCntWithOpIdxLaneV,
                      UniquesCntWithOpIdxLaneV -
             ((IdxIt != Uniques.end() && UsedLanes.test(IdxIt->second))
                  ? UniquesCntWithIdxLaneV - bit_floor(UniquesCntWithIdxLaneV)
                  : bit_ceil(UniquesCntWithIdxLaneV) - UniquesCntWithIdxLaneV);
    int getExternalUseScore(unsigned Lane, unsigned OpIdx, unsigned Idx) const {
      Value *IdxLaneV = getData(Idx, Lane).V;
      Value *OpIdxLaneV = getData(OpIdx, Lane).V;

      auto *IdxLaneI = dyn_cast<Instruction>(IdxLaneV);
      if (!IdxLaneI || !isa<Instruction>(OpIdxLaneV))

      return R.areAllUsersVectorized(IdxLaneI)

    static const int ScoreScaleFactor = 10;
                          int Lane, unsigned OpIdx, unsigned Idx,

      int SplatScore = getSplatScore(Lane, OpIdx, Idx, UsedLanes);
      if (Score <= -SplatScore) {

        Score += SplatScore;

        Score *= ScoreScaleFactor;
        Score += getExternalUseScore(Lane, OpIdx, Idx);
    std::optional<unsigned>
    getBestOperand(unsigned OpIdx, int Lane, int LastLane,

      unsigned NumOperands = getNumOperands();

      Value *OpLastLane = getData(OpIdx, LastLane).V;

      ReorderingMode RMode = ReorderingModes[OpIdx];
      if (RMode == ReorderingMode::Failed)
        return std::nullopt;

      bool OpIdxAPO = getData(OpIdx, Lane).APO;

      std::optional<unsigned> Idx;

          BestScoresPerLanes.try_emplace(std::make_pair(OpIdx, Lane), 0)

      bool IsUsed = RMode == ReorderingMode::Splat ||
                    RMode == ReorderingMode::Constant ||
                    RMode == ReorderingMode::Load;

      for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
        OperandData &OpData = getData(Idx, Lane);

        bool OpAPO = OpData.APO;

        if (OpAPO != OpIdxAPO)

        case ReorderingMode::Load:
        case ReorderingMode::Opcode: {
          bool LeftToRight = Lane > LastLane;
          Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
          Value *OpRight = (LeftToRight) ? Op : OpLastLane;
          int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
                                        OpIdx, Idx, IsUsed, UsedLanes);
          if (Score > static_cast<int>(BestOp.Score) ||
              (Score > 0 && Score == static_cast<int>(BestOp.Score) &&
            BestOp.Score = Score;
            BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
          }
        }
        case ReorderingMode::Constant:
          if (isa<Constant>(Op) ||
              (!BestOp.Score && L && L->isLoopInvariant(Op))) {
            if (isa<Constant>(Op)) {
              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
            }
          }
          if (isa<UndefValue>(Op) || !isa<Constant>(Op))
        case ReorderingMode::Splat:
          if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
            IsUsed = Op == OpLastLane;
            if (Op == OpLastLane) {
              BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
            }
          }
        case ReorderingMode::Failed:
      }

        getData(*BestOp.Idx, Lane).IsUsed = IsUsed;

      return std::nullopt;
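    // Example of the per-mode policy above: in Load mode a candidate whose
    // load is consecutive to the last lane's load scores best, so consecutive
    // loads end up in consecutive lanes; in Splat mode only a repeat of the
    // previously chosen value (or, failing that, a constant) is accepted.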
    unsigned getBestLaneToStartReordering() const {
      unsigned Min = UINT_MAX;
      unsigned SameOpNumber = 0;

      for (int I = getNumLanes(); I > 0; --I) {
        unsigned Lane = I - 1;
        OperandsOrderData NumFreeOpsHash =
            getMaxNumOperandsThatCanBeReordered(Lane);

        if (NumFreeOpsHash.NumOfAPOs < Min) {
          Min = NumFreeOpsHash.NumOfAPOs;
          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;

          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent < SameOpNumber) {

          SameOpNumber = NumFreeOpsHash.NumOpsWithSameOpcodeParent;
          HashMap[NumFreeOpsHash.Hash] = std::make_pair(1, Lane);
        } else if (NumFreeOpsHash.NumOfAPOs == Min &&
                   NumFreeOpsHash.NumOpsWithSameOpcodeParent == SameOpNumber) {
          auto [It, Inserted] =
              HashMap.try_emplace(NumFreeOpsHash.Hash, 1, Lane);
        }
      }

      unsigned BestLane = 0;
      unsigned CntMin = UINT_MAX;

        if (Data.second.first < CntMin) {
          CntMin = Data.second.first;
          BestLane = Data.second.second;
        }
    struct OperandsOrderData {
      unsigned NumOfAPOs = UINT_MAX;

      unsigned NumOpsWithSameOpcodeParent = 0;

    OperandsOrderData getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
      unsigned CntTrue = 0;
      unsigned NumOperands = getNumOperands();

      bool AllUndefs = true;
      unsigned NumOpsWithSameOpcodeParent = 0;

      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        const OperandData &OpData = getData(OpIdx, Lane);

        if (auto *I = dyn_cast<Instruction>(OpData.V)) {
              I->getParent() != Parent) {
            if (NumOpsWithSameOpcodeParent == 0) {
              NumOpsWithSameOpcodeParent = 1;

            Parent = I->getParent();

            --NumOpsWithSameOpcodeParent;

          ++NumOpsWithSameOpcodeParent;
        }
            Hash, hash_value((OpIdx + 1) * (OpData.V->getValueID() + 1)));
        AllUndefs = AllUndefs && isa<UndefValue>(OpData.V);
      }

      OperandsOrderData Data;
      Data.NumOfAPOs = std::max(CntTrue, NumOperands - CntTrue);
      Data.NumOpsWithSameOpcodeParent = NumOpsWithSameOpcodeParent;
      assert((empty() || VL.size() == getNumLanes()) &&
             "Expected same number of lanes");
      assert(S.valid() && "InstructionsState is invalid.");

      constexpr unsigned IntrinsicNumOperands = 2;
      ArgSize = isa<IntrinsicInst>(MainOp) ? IntrinsicNumOperands : NumOperands;
      OpsVec.resize(NumOperands);
      unsigned NumLanes = VL.size();
      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        OpsVec[OpIdx].resize(NumLanes);
        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
          assert((isa<Instruction>(VL[Lane]) || isa<PoisonValue>(VL[Lane])) &&
                 "Expected instruction or poison value");

          if (isa<PoisonValue>(VL[Lane])) {
            if (auto *EI = dyn_cast<ExtractElementInst>(MainOp)) {
              OpsVec[OpIdx][Lane] = {EI->getVectorOperand(), true, false};

            } else if (auto *EV = dyn_cast<ExtractValueInst>(MainOp)) {
              OpsVec[OpIdx][Lane] = {EV->getAggregateOperand(), true, false};
            }
            OpsVec[OpIdx][Lane] = {

          }
          bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
          bool APO = (OpIdx == 0) ? false : IsInverseOperation;
          OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
                                 APO, false};
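          // Note: APO flags operands of non-commutative (inverse) lanes. For
          // a 'sub' lane, operand 1 gets APO = true, and getBestOperand()
          // skips any candidate whose APO differs (`if (OpAPO != OpIdxAPO)`),
          // so reordering never swaps a subtrahend into a commutative slot.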
    unsigned getNumOperands() const { return ArgSize; }

    unsigned getNumLanes() const { return OpsVec[0].size(); }

    Value *getValue(unsigned OpIdx, unsigned Lane) const {
      return getData(OpIdx, Lane).V;
    }

    bool empty() const { return OpsVec.empty(); }

    void clear() { OpsVec.clear(); }
    bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
      assert(Op == getValue(OpIdx, Lane) &&
             "Op is expected to be getValue(OpIdx, Lane).");

      if (isa<LoadInst>(Op) && getNumLanes() == 2 && getNumOperands() == 2)

      bool OpAPO = getData(OpIdx, Lane).APO;
      bool IsInvariant = L && L->isLoopInvariant(Op);

      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {

        bool FoundCandidate = false;
        for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
          OperandData &Data = getData(OpI, Ln);
          if (Data.APO != OpAPO || Data.IsUsed)

          Value *OpILane = getValue(OpI, Lane);
          bool IsConstantOp = isa<Constant>(OpILane);

              ((Lns > 2 && isa<Constant>(Data.V)) ||

                isa<Constant>(Data.V)))) ||

              (IsInvariant && !isa<Constant>(Data.V) &&

               L->isLoopInvariant(Data.V))) {
            FoundCandidate = true;
          }
        }
        if (!FoundCandidate)
      }
      return getNumLanes() == 2 || Cnt > 1;
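    // Example: if a matching candidate (same APO, not yet used) shows up in
    // enough lanes, broadcasting the value as a splat is preferred over
    // per-lane gathering; with only two lanes a single additional match is
    // already enough, as the final condition above shows.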
    bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
      assert(Op == getValue(OpIdx, Lane) &&
             "Op is expected to be getValue(OpIdx, Lane).");
      bool OpAPO = getData(OpIdx, Lane).APO;
      for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {

        if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
              const OperandData &Data = getData(OpI, Ln);
              if (Data.APO != OpAPO || Data.IsUsed)

              Value *OpILn = getValue(OpI, Ln);
              return (L && L->isLoopInvariant(OpILn)) ||
        : TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
          L(R.LI->getLoopFor(S.getMainOp()->getParent())) {
      appendOperandsOfVL(RootVL, S);
    }

      assert(OpsVec[OpIdx].size() == getNumLanes() &&
             "Expected same num of lanes across all operands");
      for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
        OpVL[Lane] = OpsVec[OpIdx][Lane].V;
      unsigned NumOperands = getNumOperands();
      unsigned NumLanes = getNumLanes();

      unsigned FirstLane = getBestLaneToStartReordering();

      for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
        Value *OpLane0 = getValue(OpIdx, FirstLane);

        if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
          if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
              !canBeVectorized(OpILane0, OpIdx, FirstLane))
            ReorderingModes[OpIdx] = ReorderingMode::Splat;
          else if (isa<LoadInst>(OpILane0))
            ReorderingModes[OpIdx] = ReorderingMode::Load;
          else
            ReorderingModes[OpIdx] = ReorderingMode::Opcode;
        } else if (isa<Constant>(OpLane0)) {
          ReorderingModes[OpIdx] = ReorderingMode::Constant;
        } else if (isa<Argument>(OpLane0)) {
          ReorderingModes[OpIdx] = ReorderingMode::Splat;
        }
      }

      auto &&SkipReordering = [this]() {
        for (const OperandData &Data : Op0)

        if (any_of(Op, [&UniqueValues](const OperandData &Data) {

        return UniqueValues.size() != 2 &&
                   UniqueValues.size());

      if (SkipReordering())

      bool StrategyFailed = false;

      for (unsigned I = 0; I < NumOperands; ++I)
        MainAltOps[I].push_back(getData(I, FirstLane).V);

      UsedLanes.set(FirstLane);
      for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
        int Lane = FirstLane + Direction * Distance;
        if (Lane < 0 || Lane >= (int)NumLanes)
          continue;
        UsedLanes.set(Lane);
        assert(LastLane >= 0 && LastLane < (int)NumLanes &&

        for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
          std::optional<unsigned> BestIdx =
              getBestOperand(OpIdx, Lane, LastLane, ReorderingModes,
                             MainAltOps[OpIdx], UsedLanes);

            swap(OpIdx, *BestIdx, Lane);

            StrategyFailed = true;

          if (MainAltOps[OpIdx].size() != 2) {
            OperandData &AltOp = getData(OpIdx, Lane);
            InstructionsState OpS =
            if (OpS && OpS.isAltShuffle())
          }
        }
      }
      if (!StrategyFailed)
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
      case ReorderingMode::Load:
      case ReorderingMode::Opcode:
      case ReorderingMode::Constant:
      case ReorderingMode::Splat:
      case ReorderingMode::Failed:

      const unsigned Indent = 2;

        OS << "Operand " << Cnt++ << "\n";
        for (const OperandData &OpData : OpDataVec) {

          if (Value *V = OpData.V)

          OS << ", APO:" << OpData.APO << "}\n";
        }
    int BestScore = Limit;
    std::optional<int> Index;
    for (int I : seq<int>(0, Candidates.size())) {
          Candidates[I].second,
      if (Score > BestScore) {
    DeletedInstructions.insert(I);

  template <typename T>

    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      DeletedInstructions.insert(I);
    }
    for (T *V : DeadVals) {
      if (!V || !Processed.insert(V).second)

      auto *I = cast<Instruction>(V);

      if (const TreeEntry *Entry = getTreeEntry(I)) {
        Entries.push_back(Entry);
        auto It = MultiNodeScalars.find(I);
        if (It != MultiNodeScalars.end())
          Entries.append(It->second.begin(), It->second.end());
      }
      for (Use &U : I->operands()) {
        if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
            OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
            (Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
               return Entry->VectorizedValue == OpI;

      I->dropAllReferences();
    }
    for (T *V : DeadVals) {
      auto *I = cast<Instruction>(V);
      if (!I->getParent())

          cast<Instruction>(U.getUser()));
             "trying to erase instruction with users.");
      I->removeFromParent();
    }
    while (!DeadInsts.empty()) {
      if (!VI || !VI->getParent())
             "Live instruction found in dead worklist!");
      assert(VI->use_empty() && "Instructions with uses are not dead.");

      for (Use &OpU : VI->operands()) {
        Value *OpV = OpU.get();

        if (auto *OpI = dyn_cast<Instruction>(OpV))
          if (!DeletedInstructions.contains(OpI) &&

      VI->removeFromParent();
      DeletedInstructions.insert(VI);
    return AnalyzedReductionsRoots.count(I);

    AnalyzedReductionsRoots.insert(I);

    AnalyzedReductionsRoots.clear();
    AnalyzedReductionVals.clear();
    AnalyzedMinBWVals.clear();

    return NonScheduledFirst.contains(V);

  bool collectValuesToDemote(
      const TreeEntry &E, bool IsProfitableToDemoteRoot, unsigned &BitWidth,
      bool &IsProfitableToDemote, bool IsTruncRoot) const;
  canReorderOperands(TreeEntry *UserTE,

  void reorderNodeWithReuses(TreeEntry &TE, ArrayRef<int> Mask) const;

  TreeEntry *getVectorizedOperand(TreeEntry *UserTE, unsigned OpIdx) {
    TreeEntry *TE = nullptr;
      TE = getTreeEntry(V);
      if (TE && is_contained(TE->UserTreeIndices, EdgeInfo(UserTE, OpIdx)))

      auto It = MultiNodeScalars.find(V);
      if (It != MultiNodeScalars.end()) {
        for (TreeEntry *E : It->second) {
          if (is_contained(E->UserTreeIndices, EdgeInfo(UserTE, OpIdx))) {

    if (It != VL.end()) {
      assert(TE->isSame(VL) && "Expected same scalars.");

  const TreeEntry *getVectorizedOperand(const TreeEntry *UserTE,
                                        unsigned OpIdx) const {
    return const_cast<BoUpSLP *>(this)->getVectorizedOperand(
        const_cast<TreeEntry *>(UserTE), OpIdx);
  }

  bool areAllUsersVectorized(

  const TreeEntry *getOperandEntry(const TreeEntry *E, unsigned Idx) const;

  Instruction *getRootEntryInstruction(const TreeEntry &Entry) const;

  getCastContextHint(const TreeEntry &TE) const;

      const EdgeInfo &EI, unsigned InterleaveFactor = 0);

      bool ResizeAllowed = false) const;

  TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E, unsigned NodeIdx);
  const TreeEntry *getMatchedVectorizedOperand(const TreeEntry *E,
                                               unsigned NodeIdx) const {
    return const_cast<BoUpSLP *>(this)->getMatchedVectorizedOperand(E, NodeIdx);
  }

  Value *vectorizeOperand(TreeEntry *E, unsigned NodeIdx, bool PostponedPHIs);

  template <typename BVTy, typename ResTy, typename... Args>
  ResTy processBuildVector(const TreeEntry *E, Type *ScalarTy, Args &...Params);

  Value *createBuildVector(const TreeEntry *E, Type *ScalarTy,
                           bool PostponedPHIs);

  Instruction &getLastInstructionInBundle(const TreeEntry *E);

  std::optional<TargetTransformInfo::ShuffleKind>
      unsigned NumParts) const;

  std::optional<TargetTransformInfo::ShuffleKind>
  isGatherShuffledSingleRegisterEntry(

  isGatherShuffledEntry(
      unsigned NumParts, bool ForOrder = false);

      Type *ScalarTy) const;

  void setInsertPointAfterBundle(const TreeEntry *E);

  bool isFullyVectorizableTinyTree(bool ForReduction) const;

  void tryToVectorizeGatheredLoads(

  collectUserStores(const BoUpSLP::TreeEntry *TE) const;

  findExternalStoreUsersReorderIndices(TreeEntry *TE) const;

  void reorderGatherNode(TreeEntry &TE);
    TreeEntry(VecTreeTy &Container) : Container(Container) {}

                         [Scalars](Value *V, int Idx) {
                           return (isa<UndefValue>(V) &&
                                   Idx == PoisonMaskElem) ||
                                  (Idx != PoisonMaskElem && V == Scalars[Idx]);

      if (!ReorderIndices.empty()) {
        return IsSame(Scalars, Mask);
      if (VL.size() == ReuseShuffleIndices.size()) {
        return IsSame(Scalars, Mask);
      return IsSame(Scalars, ReuseShuffleIndices);

    bool isOperandGatherNode(const EdgeInfo &UserEI) const {
      return isGather() && !UserTreeIndices.empty() &&
             UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx &&
             UserTreeIndices.front().UserTE == UserEI.UserTE;
    }

    bool hasEqualOperands(const TreeEntry &TE) const {
      if (TE.getNumOperands() != getNumOperands())
        return false;
      for (unsigned I = 0, E = getNumOperands(); I < E; ++I) {
        unsigned PrevCount = Used.count();
        for (unsigned K = 0; K < E; ++K) {
          if (getOperand(K) == TE.getOperand(I)) {
          }
        }
        if (PrevCount == Used.count())
          return false;
      }

    unsigned getVectorFactor() const {
      if (!ReuseShuffleIndices.empty())
        return ReuseShuffleIndices.size();
      return Scalars.size();
    }

    bool isGather() const { return State == NeedToGather; }
    enum CombinedOpcode {
      NotCombinedOp = -1,
      MinMax = Instruction::OtherOpsEnd + 1,
    };
    CombinedOpcode CombinedOp = NotCombinedOp;

    VecTreeTy &Container;

    InstructionsState S = InstructionsState::invalid();

    unsigned InterleaveFactor = 0;

    unsigned getInterleaveFactor() const { return InterleaveFactor; }

    void setInterleave(unsigned Factor) { InterleaveFactor = Factor; }

      assert(Operands[OpIdx].empty() && "Already resized?");
             "Number of operands is greater than the number of scalars.");
    void setOperand(const BoUpSLP &R, bool RequireReorder = false) {
      VLOperands Ops(Scalars, S, R);

      for (unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands()))
        setOperand(I, Ops.getVL(I));
    }

    unsigned getNumOperands() const { return Operands.size(); }

    Value *getSingleOperand(unsigned OpIdx) const {
      assert(!Operands[OpIdx].empty() && "No operand available");
    }

    bool isAltShuffle() const { return S.isAltShuffle(); }

    bool isOpcodeOrAlt(Instruction *I) const { return S.isOpcodeOrAlt(I); }

      auto *I = dyn_cast<Instruction>(Op);
      if (I && isOpcodeOrAlt(I))

      return S.getMainOp();

    void setOperations(const InstructionsState &S) {
      assert(S && "InstructionsState is invalid.");
    }

    Instruction *getMainOp() const { return S.getMainOp(); }

    Instruction *getAltOp() const { return S.getAltOp(); }

    unsigned getOpcode() const { return S.getOpcode(); }

    unsigned getAltOpcode() const { return S.getAltOpcode(); }

    bool hasState() const { return S.valid(); }
    int findLaneForValue(Value *V) const {
      unsigned FoundLane = getVectorFactor();
      for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
           std::advance(It, 1)) {
        FoundLane = std::distance(Scalars.begin(), It);
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (!ReorderIndices.empty())
          FoundLane = ReorderIndices[FoundLane];
        assert(FoundLane < Scalars.size() && "Couldn't find extract lane");
        if (ReuseShuffleIndices.empty())

        if (auto *RIt = find(ReuseShuffleIndices, FoundLane);
            RIt != ReuseShuffleIndices.end()) {
          FoundLane = std::distance(ReuseShuffleIndices.begin(), RIt);
        }
      }
      assert(FoundLane < getVectorFactor() && "Unable to find given value.");

    bool isNonPowOf2Vec() const {
      return IsNonPowerOf2;

      assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) &&
             "Reshuffling not supported with non-power-of-2 vectors yet.");
      return IsNonPowerOf2;

    Value *getOrdered(unsigned Idx) const {
      assert(isGather() && "Must be used only for buildvectors/gathers.");
      if (ReorderIndices.empty())
        return Scalars[Idx];
      for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
        dbgs() << "Operand " << OpI << ":\n";
        for (const Value *V : Operands[OpI])
      }
      dbgs() << "Scalars: \n";
      for (Value *V : Scalars)
      dbgs() << "State: ";

        if (InterleaveFactor > 0) {
          dbgs() << "Vectorize with interleave factor " << InterleaveFactor
        } else {
          dbgs() << "Vectorize\n";
        }
      case ScatterVectorize:
        dbgs() << "ScatterVectorize\n";
      case StridedVectorize:
        dbgs() << "StridedVectorize\n";
        dbgs() << "NeedToGather\n";
      case CombinedVectorize:
        dbgs() << "CombinedVectorize\n";

        dbgs() << "MainOp: " << *S.getMainOp() << "\n";
        dbgs() << "AltOp: " << *S.getAltOp() << "\n";

        dbgs() << "MainOp: NULL\n";
        dbgs() << "AltOp: NULL\n";

      dbgs() << "VectorizedValue: ";
      if (VectorizedValue)
        dbgs() << *VectorizedValue << "\n";

      dbgs() << "ReuseShuffleIndices: ";
      if (ReuseShuffleIndices.empty())

      for (int ReuseIdx : ReuseShuffleIndices)
        dbgs() << ReuseIdx << ", ";

      dbgs() << "ReorderIndices: ";
      for (unsigned ReorderIdx : ReorderIndices)
        dbgs() << ReorderIdx << ", ";

      dbgs() << "UserTreeIndices: ";
      for (const auto &EInfo : UserTreeIndices)
        dbgs() << EInfo << ", ";

      if (!CombinedEntriesWithIndices.empty()) {
        dbgs() << "Combined entries: ";
          dbgs() << "Entry index " << P.first << " with offset " << P.second;
      }
  void dumpTreeCosts(const TreeEntry *E, InstructionCost ReuseShuffleCost,

    dbgs() << "SLP: " << Banner << ":\n";

    dbgs() << "SLP: Costs:\n";
    dbgs() << "SLP: ReuseShuffleCost = " << ReuseShuffleCost << "\n";
    dbgs() << "SLP: VectorCost = " << VecCost << "\n";
    dbgs() << "SLP: ScalarCost = " << ScalarCost << "\n";
    dbgs() << "SLP: ReuseShuffleCost + VecCost - ScalarCost = "
           << ReuseShuffleCost + VecCost - ScalarCost << "\n";
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,
                          unsigned InterleaveFactor = 0) {
    TreeEntry::EntryState EntryState =
        Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
    TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
                                ReuseShuffleIndices, ReorderIndices);
    if (E && InterleaveFactor > 0)
      E->setInterleave(InterleaveFactor);

                          TreeEntry::EntryState EntryState,
                          std::optional<ScheduleData *> Bundle,
                          const InstructionsState &S,
                          const EdgeInfo &UserTreeIdx,

    assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
            (Bundle && EntryState != TreeEntry::NeedToGather)) &&
           "Need to vectorize gather entry?");

    if (GatheredLoadsEntriesFirst.has_value() &&
        EntryState == TreeEntry::NeedToGather && S &&
        S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX &&
        !UserTreeIdx.UserTE)

    VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
    TreeEntry *Last = VectorizableTree.back().get();
    Last->Idx = VectorizableTree.size() - 1;
    Last->State = EntryState;

            ReuseShuffleIndices.empty()) &&
           "Reshuffling scalars not yet supported for nodes with padding");
    Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                     ReuseShuffleIndices.end());
    if (ReorderIndices.empty()) {

      Last->setOperations(S);
    } else {
      Last->Scalars.assign(VL.size(), nullptr);
        if (Idx >= VL.size())
          return UndefValue::get(VL.front()->getType());

      Last->setOperations(S);
      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
    }
    if (!Last->isGather()) {
      for (Value *V : VL) {
        const TreeEntry *TE = getTreeEntry(V);
               "Scalar already in tree!");

          MultiNodeScalars.try_emplace(V).first->getSecond().push_back(Last);

        ScalarToTreeEntry[V] = Last;
      }
      ScheduleData *BundleMember = *Bundle;
      assert((BundleMember || isa<PHINode>(S.getMainOp()) ||
             "Bundle and VL out of sync");

      for (Value *V : VL) {

        BundleMember->TE = Last;
        BundleMember = BundleMember->NextInBundle;
      }
      assert(!BundleMember && "Bundle and VL out of sync");
    } else {
      bool AllConstsOrCasts = true;
        auto *I = dyn_cast<CastInst>(V);
        AllConstsOrCasts &= I && I->getType()->isIntegerTy();
        if (UserTreeIdx.EdgeIdx != UINT_MAX || !UserTreeIdx.UserTE ||
            !UserTreeIdx.UserTE->isGather())

      if (AllConstsOrCasts)
          std::make_pair(std::numeric_limits<unsigned>::max(), 1);
      MustGather.insert(VL.begin(), VL.end());
    }

    if (UserTreeIdx.UserTE)
      Last->UserTreeIndices.push_back(UserTreeIdx);
  TreeEntry::VecTreeTy VectorizableTree;

    for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
      VectorizableTree[Id]->dump();
    }

  TreeEntry *getTreeEntry(Value *V) {
    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntry.lookup(V);
  }

  const TreeEntry *getTreeEntry(Value *V) const {
    assert(V && "V cannot be nullptr.");
    return ScalarToTreeEntry.lookup(V);
  }
  bool areAltOperandsProfitable(const InstructionsState &S,

  TreeEntry::EntryState
      bool IsScatterVectorizeUserTE,

  using ValueToGatherNodesMap =

  ValueToGatherNodesMap ValueToGatherNodes;

  bool IsGraphTransformMode = false;

  std::optional<unsigned> GatheredLoadsEntriesFirst;

  struct ExternalUser {

    AliasCacheKey Key = std::make_pair(Inst1, Inst2);
    auto It = AliasCache.find(Key);
    if (It != AliasCache.end())

    AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased);

  using AliasCacheKey = std::pair<Instruction *, Instruction *>;

  UserList ExternalUses;
  struct ScheduleData {
    enum { InvalidDeps = -1 };

    ScheduleData() = default;

      FirstInBundle = this;
      NextInBundle = nullptr;
      NextLoadStore = nullptr;
      IsScheduled = false;
      SchedulingRegionID = BlockSchedulingRegionID;
      clearDependencies();

      if (hasValidDependencies()) {
        assert(UnscheduledDeps <= Dependencies && "invariant");
      } else {
        assert(UnscheduledDeps == Dependencies && "invariant");
      }

        assert(isSchedulingEntity() && "unexpected scheduled state");
        for (const ScheduleData *BundleMember = this; BundleMember;
             BundleMember = BundleMember->NextInBundle) {
          assert(BundleMember->hasValidDependencies() &&
                 BundleMember->UnscheduledDeps == 0 &&
                 "unexpected scheduled state");
          assert((BundleMember == this || !BundleMember->IsScheduled) &&
                 "only bundle is marked scheduled");
        }

      assert(Inst->getParent() == FirstInBundle->Inst->getParent() &&
             "all bundle members must be in same basic block");

    bool hasValidDependencies() const { return Dependencies != InvalidDeps; }

    bool isSchedulingEntity() const { return FirstInBundle == this; }

    bool isPartOfBundle() const {
      return NextInBundle != nullptr || FirstInBundle != this || TE;
    }

    bool isReady() const {
      assert(isSchedulingEntity() &&
             "can't consider non-scheduling entity for ready list");
      return unscheduledDepsInBundle() == 0 && !IsScheduled;
    }

    int incrementUnscheduledDeps(int Incr) {
      assert(hasValidDependencies() &&
             "increment of unscheduled deps would be meaningless");
      UnscheduledDeps += Incr;
      return FirstInBundle->unscheduledDepsInBundle();
    }

    void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; }

    void clearDependencies() {
      Dependencies = InvalidDeps;
      resetUnscheduledDeps();
      MemoryDependencies.clear();
      ControlDependencies.clear();
    }

    int unscheduledDepsInBundle() const {
      assert(isSchedulingEntity() && "only meaningful on the bundle");
      for (const ScheduleData *BundleMember = this; BundleMember;
           BundleMember = BundleMember->NextInBundle) {
        if (BundleMember->UnscheduledDeps == InvalidDeps)
        Sum += BundleMember->UnscheduledDeps;
      }

      if (!isSchedulingEntity()) {
        os << "/ " << *Inst;
      } else if (NextInBundle) {
        ScheduleData *SD = NextInBundle;
          os << ';' << *SD->Inst;
          SD = SD->NextInBundle;
      }

    TreeEntry *TE = nullptr;

    ScheduleData *FirstInBundle = nullptr;

    ScheduleData *NextInBundle = nullptr;

    ScheduleData *NextLoadStore = nullptr;

    int SchedulingRegionID = 0;

    int SchedulingPriority = 0;

    int Dependencies = InvalidDeps;

    int UnscheduledDeps = InvalidDeps;

    bool IsScheduled = false;
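    // Note: a "bundle" is the chain of ScheduleData nodes linked through
    // NextInBundle that must be scheduled as a unit; FirstInBundle is the
    // single scheduling entity the ready list operates on, and a bundle
    // becomes ready only once unscheduledDepsInBundle() reaches zero.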
                                       const BoUpSLP::ScheduleData &SD) {

  struct BlockScheduling {
    BlockScheduling(BasicBlock *BB)
        : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}

      ScheduleStart = nullptr;
      ScheduleEnd = nullptr;
      FirstLoadStoreInRegion = nullptr;
      LastLoadStoreInRegion = nullptr;
      RegionHasStackSave = false;

      ScheduleRegionSizeLimit -= ScheduleRegionSize;

      ScheduleRegionSize = 0;

      ++SchedulingRegionID;

      if (BB != I->getParent())

      ScheduleData *SD = ScheduleDataMap.lookup(I);
      if (SD && isInSchedulingRegion(SD))

    ScheduleData *getScheduleData(Value *V) {
      if (auto *I = dyn_cast<Instruction>(V))
        return getScheduleData(I);
    }

    bool isInSchedulingRegion(ScheduleData *SD) const {
      return SD->SchedulingRegionID == SchedulingRegionID;
    }
    template <typename ReadyListType>
    void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
      SD->IsScheduled = true;

      for (ScheduleData *BundleMember = SD; BundleMember;
           BundleMember = BundleMember->NextInBundle) {

        auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
          ScheduleData *OpDef = getScheduleData(I);
          if (OpDef && OpDef->hasValidDependencies() &&
              OpDef->incrementUnscheduledDeps(-1) == 0) {

            ScheduleData *DepBundle = OpDef->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP: gets ready (def): " << *DepBundle << "\n");
          }
        };

        if (TreeEntry *TE = BundleMember->TE) {
          int Lane = std::distance(TE->Scalars.begin(),
                                   find(TE->Scalars, BundleMember->Inst));
          assert(Lane >= 0 && "Lane not set");

          auto *In = BundleMember->Inst;
                 (isa<ExtractValueInst, ExtractElementInst, IntrinsicInst>(In) ||
                  In->getNumOperands() == TE->getNumOperands()) &&
                 "Missed TreeEntry operands?");

          for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
               OpIdx != NumOperands; ++OpIdx)
            if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
              DecrUnsched(I);
        } else {
          for (Use &U : BundleMember->Inst->operands())
            if (auto *I = dyn_cast<Instruction>(U.get()))
              DecrUnsched(I);
        }

        for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
          if (MemoryDepSD->hasValidDependencies() &&
              MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {

            ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP: gets ready (mem): " << *DepBundle << "\n");
          }
        }
        for (ScheduleData *DepSD : BundleMember->ControlDependencies) {
          if (DepSD->incrementUnscheduledDeps(-1) == 0) {

            ScheduleData *DepBundle = DepSD->FirstInBundle;
            assert(!DepBundle->IsScheduled &&
                   "already scheduled bundle gets ready");
            ReadyList.insert(DepBundle);
            LLVM_DEBUG(dbgs()
                       << "SLP: gets ready (ctl): " << *DepBundle << "\n");
          }
        }
      }
    }

      assert(ScheduleStart->getParent() == ScheduleEnd->getParent() &&
             ScheduleStart->comesBefore(ScheduleEnd) &&
             "Not a valid scheduling region?");

      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        auto *SD = getScheduleData(I);

        assert(isInSchedulingRegion(SD) &&
               "primary schedule data not in window?");
        assert(isInSchedulingRegion(SD->FirstInBundle) &&
               "entire bundle in window!");
      }

      for (auto *SD : ReadyInsts) {
        assert(SD->isSchedulingEntity() && SD->isReady() &&
               "item in ready list not ready?");
      }
    template <typename ReadyListType>
    void initialFillReadyList(ReadyListType &ReadyList) {
      for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
        ScheduleData *SD = getScheduleData(I);
        if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies() &&
          ReadyList.insert(SD);
          LLVM_DEBUG(dbgs()
                     << "SLP: initially in ready list: " << *SD << "\n");
      }
    }

    std::optional<ScheduleData *>
                      const InstructionsState &S);

    ScheduleData *allocateScheduleDataChunks();

    bool extendSchedulingRegion(Value *V, const InstructionsState &S);

                          ScheduleData *PrevLoadStore,
                          ScheduleData *NextLoadStore);

    void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,

    void resetSchedule();

    ScheduleData *FirstLoadStoreInRegion = nullptr;

    ScheduleData *LastLoadStoreInRegion = nullptr;

    bool RegionHasStackSave = false;

    int ScheduleRegionSize = 0;

    int SchedulingRegionID = 1;
  };

  void scheduleBlock(BlockScheduling *BS);
  struct OrdersTypeDenseMapInfo {
    static unsigned getHashValue(const OrdersType &V) {

  unsigned MaxVecRegSize;
  unsigned MinVecRegSize;

  unsigned ReductionBitWidth = 0;

  unsigned BaseGraphSize = 1;

  std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;

  struct ChildIteratorType
          ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {

    return R.VectorizableTree[0].get();

    return {N->UserTreeIndices.begin(), N->Container};

    return {N->UserTreeIndices.end(), N->Container};

  class nodes_iterator {

    bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }

    return nodes_iterator(R->VectorizableTree.begin());

    return nodes_iterator(R->VectorizableTree.end());

  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
    OS << Entry->Idx << ".\n";

    for (auto *V : Entry->Scalars) {
      if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
            return EU.Scalar == V;
          }))
    }

    if (Entry->isGather())
    if (Entry->State == TreeEntry::ScatterVectorize ||
        Entry->State == TreeEntry::StridedVectorize)
      return "color=blue";
  for (auto *I : DeletedInstructions) {
    if (!I->getParent()) {

      if (isa<PHINode>(I))
        I->insertBefore(F->getEntryBlock(),
                        F->getEntryBlock().getFirstNonPHIIt());
      else
        I->insertBefore(F->getEntryBlock().getTerminator());
    }
    for (Use &U : I->operands()) {
      auto *Op = dyn_cast<Instruction>(U.get());
      if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
    }
    I->dropAllReferences();
  }
  for (auto *I : DeletedInstructions) {
           "trying to erase instruction with users.");
    I->eraseFromParent();
  }
4539#ifdef EXPENSIVE_CHECKS
4550 assert(!Mask.empty() && Reuses.
size() == Mask.size() &&
4551 "Expected non-empty mask.");
4554 for (
unsigned I = 0,
E = Prev.
size();
I <
E; ++
I)
4556 Reuses[Mask[
I]] = Prev[
I];
4564 bool BottomOrder =
false) {
4565 assert(!Mask.empty() &&
"Expected non-empty mask.");
4566 unsigned Sz = Mask.size();
4569 if (Order.
empty()) {
4571 std::iota(PrevOrder.
begin(), PrevOrder.
end(), 0);
4573 PrevOrder.
swap(Order);
4576 for (
unsigned I = 0;
I < Sz; ++
I)
4578 Order[
I] = PrevOrder[Mask[
I]];
4580 return Data.value() == Sz ||
Data.index() ==
Data.value();
4589 if (Order.
empty()) {
4591 std::iota(MaskOrder.
begin(), MaskOrder.
end(), 0);
4601 for (
unsigned I = 0;
I < Sz; ++
I)
4603 Order[MaskOrder[
I]] =
I;
4607std::optional<BoUpSLP::OrdersType>
4609 assert(TE.isGather() &&
"Expected gather node only.");
4613 Type *ScalarTy = GatheredScalars.
front()->getType();
4614 int NumScalars = GatheredScalars.
size();
4616 return std::nullopt;
4619 if (NumParts == 0 || NumParts >= NumScalars ||
4620 VecTy->getNumElements() % NumParts != 0 ||
4622 VecTy->getNumElements() / NumParts))
4628 tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
4630 isGatherShuffledEntry(&TE, GatheredScalars, Mask, Entries, NumParts,
4633 if (GatherShuffles.
empty() && ExtractShuffles.
empty())
4634 return std::nullopt;
4635 OrdersType CurrentOrder(NumScalars, NumScalars);
4636 if (GatherShuffles.
size() == 1 &&
4638 Entries.front().front()->isSame(TE.Scalars)) {
4641 std::iota(CurrentOrder.
begin(), CurrentOrder.
end(), 0);
4642 return CurrentOrder;
4646 return all_of(Mask, [&](
int I) {
4653 if ((ExtractShuffles.
empty() && IsSplatMask(Mask) &&
4654 (Entries.size() != 1 ||
4655 Entries.front().front()->ReorderIndices.empty())) ||
4656 (GatherShuffles.
empty() && IsSplatMask(ExtractMask)))
4657 return std::nullopt;
4662 for (
int I : seq<int>(0, NumParts)) {
4663 if (ShuffledSubMasks.
test(
I))
4665 const int VF = GetVF(
I);
4671 if (
any_of(Slice, [&](
int I) {
return I != NumScalars; })) {
4672 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4673 ShuffledSubMasks.
set(
I);
4677 int FirstMin = INT_MAX;
4678 int SecondVecFound =
false;
4679 for (
int K : seq<int>(Limit)) {
4680 int Idx = Mask[
I * PartSz + K];
4682 Value *V = GatheredScalars[
I * PartSz + K];
4684 SecondVecFound =
true;
4693 SecondVecFound =
true;
4697 FirstMin = (FirstMin / PartSz) * PartSz;
4699 if (SecondVecFound) {
4700 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4701 ShuffledSubMasks.
set(
I);
4704 for (
int K : seq<int>(Limit)) {
4705 int Idx = Mask[
I * PartSz + K];
4709 if (
Idx >= PartSz) {
4710 SecondVecFound =
true;
4713 if (CurrentOrder[
I * PartSz +
Idx] >
4714 static_cast<unsigned>(
I * PartSz + K) &&
4715 CurrentOrder[
I * PartSz +
Idx] !=
4716 static_cast<unsigned>(
I * PartSz +
Idx))
4717 CurrentOrder[
I * PartSz +
Idx] =
I * PartSz + K;
4720 if (SecondVecFound) {
4721 std::fill(Slice.
begin(), Slice.
end(), NumScalars);
4722 ShuffledSubMasks.
set(
I);
4728 if (!ExtractShuffles.
empty())
4729 TransformMaskToOrder(
4730 CurrentOrder, ExtractMask, PartSz, NumParts, [&](
unsigned I) {
4731 if (!ExtractShuffles[
I])
4734 unsigned Sz =
getNumElems(TE.getVectorFactor(), PartSz,
I);
4735 for (
unsigned Idx : seq<unsigned>(Sz)) {
4736 int K =
I * PartSz +
Idx;
4739 if (!TE.ReuseShuffleIndices.empty())
4740 K = TE.ReuseShuffleIndices[K];
4743 if (!TE.ReorderIndices.empty())
4744 K = std::distance(TE.ReorderIndices.begin(),
4745 find(TE.ReorderIndices, K));
4746 auto *EI = dyn_cast<ExtractElementInst>(TE.Scalars[K]);
4749 VF = std::max(VF, cast<VectorType>(EI->getVectorOperandType())
4751 .getKnownMinValue());
4756 if (GatherShuffles.
size() == 1 && NumParts != 1) {
4757 if (ShuffledSubMasks.
any())
4758 return std::nullopt;
4759 PartSz = NumScalars;
4762 if (!Entries.empty())
4763 TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](
unsigned I) {
4764 if (!GatherShuffles[
I])
4766 return std::max(Entries[
I].front()->getVectorFactor(),
4767 Entries[
I].back()->getVectorFactor());
4770 count_if(CurrentOrder, [&](
int Idx) {
return Idx == NumScalars; });
4771 if (ShuffledSubMasks.
all() || (NumScalars > 2 && NumUndefs >= NumScalars / 2))
4772 return std::nullopt;
4773 return std::move(CurrentOrder);
4778 bool CompareOpcodes =
true) {
4782 auto *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
4783 auto *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
4784 return (!GEP1 || GEP1->getNumOperands() == 2) &&
4785 (!GEP2 || GEP2->getNumOperands() == 2) &&
4786 (((!GEP1 ||
isConstant(GEP1->getOperand(1))) &&
4787 (!GEP2 ||
isConstant(GEP2->getOperand(1)))) ||
4790 getSameOpcode({GEP1->getOperand(1), GEP2->getOperand(1)}, TLI)));
4794template <
typename T>
4796 Align CommonAlignment = cast<T>(VL.
front())->getAlign();
4798 CommonAlignment = std::min(CommonAlignment, cast<T>(V)->
getAlign());
4799 return CommonAlignment;
4805 "Order is empty. Please check it before using isReverseOrder.");
4806 unsigned Sz = Order.
size();
4808 return Pair.value() == Sz || Sz - Pair.index() - 1 == Pair.value();
4819static std::optional<Value *>
4825 const SCEV *PtrSCEVLowest =
nullptr;
4826 const SCEV *PtrSCEVHighest =
nullptr;
4832 return std::nullopt;
4834 if (!PtrSCEVLowest && !PtrSCEVHighest) {
4835 PtrSCEVLowest = PtrSCEVHighest = PtrSCEV;
4839 if (isa<SCEVCouldNotCompute>(Diff))
4840 return std::nullopt;
4842 PtrSCEVLowest = PtrSCEV;
4846 if (isa<SCEVCouldNotCompute>(Diff1))
4847 return std::nullopt;
4849 PtrSCEVHighest = PtrSCEV;
4855 if (isa<SCEVCouldNotCompute>(Dist))
4856 return std::nullopt;
4857 int Size =
DL.getTypeStoreSize(ElemTy);
4858 auto TryGetStride = [&](
const SCEV *Dist,
4859 const SCEV *Multiplier) ->
const SCEV * {
4860 if (
const auto *M = dyn_cast<SCEVMulExpr>(Dist)) {
4861 if (M->getOperand(0) == Multiplier)
4862 return M->getOperand(1);
4863 if (M->getOperand(1) == Multiplier)
4864 return M->getOperand(0);
4867 if (Multiplier == Dist)
4872 const SCEV *Stride =
nullptr;
4873 if (
Size != 1 || SCEVs.
size() > 2) {
4875 Stride = TryGetStride(Dist, Sz);
4877 return std::nullopt;
4879 if (!Stride || isa<SCEVConstant>(Stride))
4880 return std::nullopt;
4883 using DistOrdPair = std::pair<int64_t, int>;
4885 std::set<DistOrdPair,
decltype(Compare)> Offsets(Compare);
4887 bool IsConsecutive =
true;
4888 for (
const SCEV *PtrSCEV : SCEVs) {
4890 if (PtrSCEV != PtrSCEVLowest) {
4892 const SCEV *Coeff = TryGetStride(Diff, Stride);
4894 return std::nullopt;
4895 const auto *SC = dyn_cast<SCEVConstant>(Coeff);
4896 if (!SC || isa<SCEVCouldNotCompute>(SC))
4897 return std::nullopt;
4901 return std::nullopt;
4902 Dist = SC->getAPInt().getZExtValue();
4906 return std::nullopt;
4907 auto Res = Offsets.emplace(Dist, Cnt);
4909 return std::nullopt;
4911 IsConsecutive = IsConsecutive && std::next(Res.first) == Offsets.end();
4914 if (Offsets.size() != SCEVs.
size())
4915 return std::nullopt;
4916 SortedIndices.
clear();
4917 if (!IsConsecutive) {
4921 for (
const std::pair<int64_t, int> &Pair : Offsets) {
4922 SortedIndices[Cnt] = Pair.second;
4932static std::pair<InstructionCost, InstructionCost>
4948 int NumSrcElts = Tp->getElementCount().getKnownMinValue();
4951 Mask, NumSrcElts, NumSubElts,
Index)) {
4952 if (
Index + NumSubElts > NumSrcElts &&
4953 Index + NumSrcElts <=
static_cast<int>(
Mask.size()))
4969 if (
Index % SubVecVF == 0) {
4977 std::iota(
Mask.begin(),
Mask.end(), 0);
4978 for (
unsigned I : seq<unsigned>(SubVecVF))
4981 Vec = Generator(Vec, V, Mask);
4985 std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), SubVecVF), 0);
4997 unsigned SubVecVF,
unsigned Index) {
4998 if (
Index % SubVecVF == 0) {
5006 std::iota(Mask.begin(), Mask.end(),
Index);
5014 unsigned *BestVF,
bool TryRecursiveCheck)
const {
5027 if (
DL->getTypeSizeInBits(ScalarTy) !=
DL->getTypeAllocSizeInBits(ScalarTy))
5033 const unsigned Sz = VL.
size();
5035 auto *POIter = PointerOps.
begin();
5036 for (
Value *V : VL) {
5037 auto *L = dyn_cast<LoadInst>(V);
5038 if (!L || !L->isSimple())
5040 *POIter = L->getPointerOperand();
5049 Align CommonAlignment = computeCommonAlignment<LoadInst>(VL);
5069 if (Order.
empty()) {
5070 Ptr0 = PointerOps.
front();
5071 PtrN = PointerOps.
back();
5073 Ptr0 = PointerOps[Order.
front()];
5074 PtrN = PointerOps[Order.
back()];
5076 std::optional<int> Diff =
5079 if (
static_cast<unsigned>(*Diff) == Sz - 1)
5085 bool IsPossibleStrided = *Diff % (Sz - 1) == 0;
5099 auto IsAnyPointerUsedOutGraph =
5100 IsPossibleStrided &&
any_of(PointerOps, [&](
Value *V) {
5101 return isa<Instruction>(V) &&
any_of(V->users(), [&](
User *U) {
5102 return !getTreeEntry(U) && !MustGather.contains(U);
5105 const unsigned AbsoluteDiff = std::abs(*Diff);
5106 if (IsPossibleStrided &&
5107 (IsAnyPointerUsedOutGraph ||
5108 (AbsoluteDiff > Sz &&
5111 AbsoluteDiff % Sz == 0 &&
has_single_bit(AbsoluteDiff / Sz)))) ||
5112 *Diff == -(
static_cast<int>(Sz) - 1))) {
5113 int Stride = *Diff /
static_cast<int>(Sz - 1);
5114 if (*Diff == Stride *
static_cast<int>(Sz - 1)) {
5126 else if (
Ptr != Ptr0)
5130 if (((Dist / Stride) * Stride) != Dist ||
5131 !Dists.
insert(Dist).second)
5134 if (Dists.
size() == Sz)
5143 auto CheckForShuffledLoads = [&, &
TTI = *
TTI](
Align CommonAlignment,
5145 bool ProfitableGatherPointers) {
5150 auto [ScalarGEPCost, VectorGEPCost] =
5152 Instruction::GetElementPtr,
CostKind, ScalarTy, VecTy);
5158 VecTy->getNumElements());
5159 if (
static_cast<unsigned>(
count_if(
5160 PointerOps, IsaPred<GetElementPtrInst>)) < PointerOps.
size() - 1 ||
5166 PtrVecTy, DemandedElts,
true,
false,
CostKind);
5185 false, CommonAlignment,
CostKind) +
5186 (ProfitableGatherPointers ? 0 : VectorGEPCost);
5193 constexpr unsigned ListLimit = 4;
5194 if (!TryRecursiveCheck || VL.
size() < ListLimit)
5203 unsigned Sz =
DL->getTypeSizeInBits(ScalarTy);
5213 for (
unsigned Cnt = 0,
End = VL.
size(); Cnt + VF <=
End; Cnt += VF) {
5226 DemandedElts.
setBits(Cnt, Cnt + VF);
5241 if (!DemandedElts.
isZero()) {
5246 for (
unsigned Idx : seq<unsigned>(VL.
size()))
5247 if (DemandedElts[
Idx])
5254 auto *LI0 = cast<LoadInst>(VL[
I * VF]);
5259 LI0->getPointerOperand(),
5260 Instruction::GetElementPtr,
CostKind, ScalarTy,
5264 if (
static_cast<unsigned>(
5265 count_if(PointerOps, IsaPred<GetElementPtrInst>)) <
5266 PointerOps.
size() - 1 ||
5286 LI0->getPointerAddressSpace(),
CostKind,
5292 LI0->getPointerOperand(),
5299 LI0->getPointerOperand(),
5309 for (
int Idx : seq<int>(0, VL.
size()))
5319 if (MaskedGatherCost >= VecLdCost &&
5332 bool ProfitableGatherPointers =
5333 L && Sz > 2 &&
static_cast<unsigned>(
count_if(PointerOps, [L](
Value *V) {
5334 return L->isLoopInvariant(V);
5336 if (ProfitableGatherPointers ||
all_of(PointerOps, [](
Value *
P) {
5337 auto *
GEP = dyn_cast<GetElementPtrInst>(
P);
5339 (
GEP &&
GEP->getNumOperands() == 2 &&
5340 isa<Constant, Instruction>(
GEP->getOperand(1)));
5347 if (!TryRecursiveCheck || !CheckForShuffledLoads(CommonAlignment, BestVF,
5348 ProfitableGatherPointers))
5361 "Expected list of pointer operands.");
5371 .first->second.emplace_back().emplace_back(VL.
front(), 0U, 0U);
5373 SortedIndices.
clear();
5375 auto Key = std::make_pair(BBs[Cnt + 1],
5379 std::optional<int> Diff = getPointersDiff(
5380 ElemTy, std::get<0>(Base.front()), ElemTy,
5386 Base.emplace_back(Ptr, *Diff, Cnt + 1);
5392 if (Bases.
size() > VL.
size() / 2 - 1)
5396 Bases.
find(Key)->second.emplace_back().emplace_back(
Ptr, 0, Cnt + 1);
5403 if (Bases.
size() == 1 && (Bases.
front().second.size() == 1 ||
5404 Bases.
front().second.size() == VL.
size()))
5409 auto ComparePointers = [](
Value *Ptr1,
Value *Ptr2) {
5418 FirstPointers.
insert(P1);
5419 SecondPointers.
insert(P2);
5425 "Unable to find matching root.");
5428 for (
auto &
Base : Bases) {
5429 for (
auto &Vec :
Base.second) {
5430 if (Vec.size() > 1) {
5431 stable_sort(Vec, [](
const std::tuple<Value *, int, unsigned> &
X,
5432 const std::tuple<Value *, int, unsigned> &
Y) {
5433 return std::get<1>(
X) < std::get<1>(
Y);
5435 int InitialOffset = std::get<1>(Vec[0]);
5436 bool AnyConsecutive =
5438 return std::get<1>(
P.value()) == int(
P.index()) + InitialOffset;
5442 if (!AnyConsecutive)
5447 return ComparePointers(std::get<0>(V1.front()), std::get<0>(V2.front()));
5451 for (
auto &
T : Bases)
5452 for (
const auto &Vec :
T.second)
5453 for (
const auto &
P : Vec)
5457 "Expected SortedIndices to be the size of VL");
5461std::optional<BoUpSLP::OrdersType>
5463 assert(TE.isGather() &&
"Expected gather node only.");
5464 Type *ScalarTy = TE.Scalars[0]->getType();
5467 Ptrs.
reserve(TE.Scalars.size());
5469 BBs.
reserve(TE.Scalars.size());
5470 for (
Value *V : TE.Scalars) {
5471 auto *L = dyn_cast<LoadInst>(V);
5472 if (!L || !L->isSimple())
5473 return std::nullopt;
5479 if (!LoadEntriesToVectorize.
contains(TE.Idx) &&
5481 return std::move(Order);
5482 return std::nullopt;
5493 if (VU->
getType() != V->getType())
5496 if (!VU->
hasOneUse() && !V->hasOneUse())
5502 if (Idx1 == std::nullopt || Idx2 == std::nullopt)
5508 cast<VectorType>(VU->
getType())->getElementCount().getKnownMinValue());
5509 bool IsReusedIdx =
false;
5511 if (IE2 == VU && !IE1)
5513 if (IE1 == V && !IE2)
5514 return V->hasOneUse();
5515 if (IE1 && IE1 != V) {
5517 IsReusedIdx |= ReusedIdx.
test(Idx1);
5518 ReusedIdx.
set(Idx1);
5519 if ((IE1 != VU && !IE1->
hasOneUse()) || IsReusedIdx)
5522 IE1 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE1));
5524 if (IE2 && IE2 != VU) {
5526 IsReusedIdx |= ReusedIdx.
test(Idx2);
5527 ReusedIdx.
set(Idx2);
5528 if ((IE2 != V && !IE2->hasOneUse()) || IsReusedIdx)
5531 IE2 = dyn_cast_or_null<InsertElementInst>(GetBaseOperand(IE2));
5533 }
while (!IsReusedIdx && (IE1 || IE2));
5537std::optional<BoUpSLP::OrdersType>
5541 if (!TE.ReuseShuffleIndices.empty()) {
5543 assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*
TTI) &&
5544 "Reshuffling scalars not yet supported for nodes with padding");
5547 return std::nullopt;
5555 unsigned Sz = TE.Scalars.size();
5556 if (TE.isGather()) {
5557 if (std::optional<OrdersType> CurrentOrder =
5562 ::addMask(Mask, TE.ReuseShuffleIndices);
5563 OrdersType Res(TE.getVectorFactor(), TE.getVectorFactor());
5564 unsigned Sz = TE.Scalars.size();
5565 for (
int K = 0,
E = TE.getVectorFactor() / Sz; K <
E; ++K) {
5568 Res[
Idx + K * Sz] =
I + K * Sz;
5570 return std::move(Res);
5573 if (Sz == 2 && TE.getVectorFactor() == 4 &&
5575 2 * TE.getVectorFactor())) == 1)
5576 return std::nullopt;
5580 if (TE.ReorderIndices.empty())
5581 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
5584 ::addMask(ReorderMask, TE.ReuseShuffleIndices);
5585 unsigned VF = ReorderMask.
size();
5589 for (
unsigned I = 0;
I < VF;
I += Sz) {
5591 unsigned UndefCnt = 0;
5592 unsigned Limit = std::min(Sz, VF -
I);
5601 Val >=
static_cast<int>(NumParts) || UsedVals.
test(Val) ||
5603 return std::nullopt;
5605 for (
unsigned K = 0; K < NumParts; ++K) {
5606 unsigned Idx = Val + Sz * K;
5608 ResOrder[
Idx] =
I + K;
5611 return std::move(ResOrder);
5613 unsigned VF = TE.getVectorFactor();
5616 TE.ReuseShuffleIndices.end());
5617 if (TE.hasState() && TE.getOpcode() == Instruction::ExtractElement &&
5619 if (isa<PoisonValue>(V))
5621 std::optional<unsigned> Idx = getExtractIndex(cast<Instruction>(V));
5622 return Idx && *Idx < Sz;
5624 assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported "
5625 "by BinaryOperator and CastInst.");
5627 if (TE.ReorderIndices.empty())
5628 std::iota(ReorderMask.
begin(), ReorderMask.
end(), 0);
5631 for (
unsigned I = 0;
I < VF; ++
I) {
5632 int &
Idx = ReusedMask[
I];
5635 Value *V = TE.Scalars[ReorderMask[
Idx]];
5637 Idx = std::distance(ReorderMask.
begin(),
find(ReorderMask, *EI));
5643 std::iota(ResOrder.
begin(), ResOrder.
end(), 0);
5644 auto *It = ResOrder.
begin();
5645 for (
unsigned K = 0; K < VF; K += Sz) {
5649 std::iota(SubMask.begin(), SubMask.end(), 0);
5651 transform(CurrentOrder, It, [K](
unsigned Pos) {
return Pos + K; });
5652 std::advance(It, Sz);
5655 return Data.index() ==
Data.value();
5657 return std::nullopt;
5658 return std::move(ResOrder);
5660 if (TE.State == TreeEntry::StridedVectorize && !TopToBottom &&
5661 any_of(TE.UserTreeIndices,
5663 return !Instruction::isBinaryOp(EI.UserTE->getOpcode());
5665 (TE.ReorderIndices.empty() ||
isReverseOrder(TE.ReorderIndices)))
5666 return std::nullopt;
5667 if ((TE.State == TreeEntry::Vectorize ||
5668 TE.State == TreeEntry::StridedVectorize) &&
5669 (isa<LoadInst, ExtractElementInst, ExtractValueInst>(TE.getMainOp()) ||
5670 (TopToBottom && isa<StoreInst, InsertElementInst>(TE.getMainOp())))) {
5671 assert(!TE.isAltShuffle() &&
"Alternate instructions are only supported by "
5672 "BinaryOperator and CastInst.");
5673 return TE.ReorderIndices;
5675 if (TE.State == TreeEntry::Vectorize && TE.getOpcode() == Instruction::PHI) {
5676 if (!TE.ReorderIndices.empty())
5677 return TE.ReorderIndices;
5680 for (
auto [
I, V] :
zip(UserBVHead, TE.Scalars)) {
5681 if (!V->hasNUsesOrMore(1))
5683 auto *
II = dyn_cast<InsertElementInst>(*V->user_begin());
5688 while (
II &&
II->hasOneUse() &&
II->getParent() == BB) {
5690 II = dyn_cast<InsertElementInst>(
II->getOperand(0));
5696 assert(BB1 != BB2 &&
"Expected different basic blocks.");
5697 auto *NodeA = DT->
getNode(BB1);
5698 auto *NodeB = DT->
getNode(BB2);
5699 assert(NodeA &&
"Should only process reachable instructions");
5700 assert(NodeB &&
"Should only process reachable instructions");
5701 assert((NodeA == NodeB) ==
5702 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
5703 "Different nodes should have different DFS numbers");
5704 return NodeA->getDFSNumIn() < NodeB->getDFSNumIn();
5706 auto PHICompare = [&](
unsigned I1,
unsigned I2) {
5707 Value *V1 = TE.Scalars[I1];
5708 Value *V2 = TE.Scalars[I2];
5709 if (V1 == V2 || (V1->
getNumUses() == 0 && V2->getNumUses() == 0))
5711 if (isa<PoisonValue>(V1))
5713 if (isa<PoisonValue>(V2))
5719 auto *FirstUserOfPhi1 = cast<Instruction>(*V1->
user_begin());
5720 auto *FirstUserOfPhi2 = cast<Instruction>(*V2->user_begin());
5721 if (FirstUserOfPhi1->getParent() != FirstUserOfPhi2->getParent())
5722 return CompareByBasicBlocks(FirstUserOfPhi1->getParent(),
5723 FirstUserOfPhi2->getParent());
5724 auto *IE1 = dyn_cast<InsertElementInst>(FirstUserOfPhi1);
5725 auto *IE2 = dyn_cast<InsertElementInst>(FirstUserOfPhi2);
5726 auto *EE1 = dyn_cast<ExtractElementInst>(FirstUserOfPhi1);
5727 auto *EE2 = dyn_cast<ExtractElementInst>(FirstUserOfPhi2);
5733 if (UserBVHead[I1] && !UserBVHead[I2])
5735 if (!UserBVHead[I1])
5737 if (UserBVHead[I1] == UserBVHead[I2])
5740 return CompareByBasicBlocks(UserBVHead[I1]->
getParent(),
5742 return UserBVHead[I1]->comesBefore(UserBVHead[I2]);
5749 auto *Inst1 = dyn_cast<Instruction>(EE1->getOperand(0));
5750 auto *Inst2 = dyn_cast<Instruction>(EE2->getOperand(0));
5751 auto *P1 = dyn_cast<Argument>(EE1->getOperand(0));
5752 auto *P2 = dyn_cast<Argument>(EE2->getOperand(0));
5755 if (EE1->getOperand(0) == EE2->getOperand(0))
5757 if (!Inst1 && Inst2)
5759 if (Inst1 && Inst2) {
5767 "Expected either instructions or arguments vector operands.");
5768 return P1->getArgNo() < P2->getArgNo();
5773 std::iota(Phis.
begin(), Phis.
end(), 0);
5776 return std::nullopt;
5777 return std::move(Phis);
5779 if (TE.isGather() && (!TE.hasState() || !TE.isAltShuffle()) &&
5783 if (((TE.hasState() && TE.getOpcode() == Instruction::ExtractElement) ||
5784 (
all_of(TE.Scalars, IsaPred<UndefValue, ExtractElementInst>) &&
5785 any_of(TE.Scalars, IsaPred<ExtractElementInst>))) &&
5787 auto *EE = dyn_cast<ExtractElementInst>(V);
5788 return !EE || isa<FixedVectorType>(EE->getVectorOperandType());
5794 canReuseExtract(TE.Scalars, CurrentOrder,
true);
5795 if (Reuse || !CurrentOrder.
empty())
5796 return std::move(CurrentOrder);
5804 int Sz = TE.Scalars.size();
5806 count_if(TE.Scalars, IsaPred<UndefValue>) == Sz - 1) {
5808 find_if(TE.Scalars, [](
Value *V) { return !isConstant(V); });
5809 if (It == TE.Scalars.begin())
5812 if (It != TE.Scalars.end()) {
5814 unsigned Idx = std::distance(TE.Scalars.begin(), It);
5829 if (InsertFirstCost + PermuteCost < InsertIdxCost) {
5832 return std::move(Order);
5837 return std::nullopt;
5838 if (TE.Scalars.size() >= 3)
5843 if (TE.hasState() && TE.getOpcode() == Instruction::Load) {
5847 CurrentOrder, PointerOps);
5849 return std::move(CurrentOrder);
5855 return CurrentOrder;
5857 return std::nullopt;
5867 for (
unsigned I = Sz,
E = Mask.size();
I <
E;
I += Sz) {
5869 if (Cluster != FirstCluster)
5875void BoUpSLP::reorderNodeWithReuses(TreeEntry &TE,
ArrayRef<int> Mask)
const {
5878 const unsigned Sz =
TE.Scalars.size();
5880 if (!
TE.isGather() ||
5887 addMask(NewMask,
TE.ReuseShuffleIndices);
5889 TE.ReorderIndices.clear();
5896 for (
auto *It =
TE.ReuseShuffleIndices.begin(),
5897 *
End =
TE.ReuseShuffleIndices.end();
5898 It !=
End; std::advance(It, Sz))
5899 std::iota(It, std::next(It, Sz), 0);
5905 "Expected same size of orders");
5906 unsigned Sz = Order.
size();
5908 for (
unsigned Idx : seq<unsigned>(0, Sz)) {
5909 if (Order[
Idx] != Sz)
5910 UsedIndices.
set(Order[
Idx]);
5912 if (SecondaryOrder.
empty()) {
5913 for (
unsigned Idx : seq<unsigned>(0, Sz))
5914 if (Order[
Idx] == Sz && !UsedIndices.
test(
Idx))
5917 for (
unsigned Idx : seq<unsigned>(0, Sz))
5918 if (SecondaryOrder[
Idx] != Sz && Order[
Idx] == Sz &&
5919 !UsedIndices.
test(SecondaryOrder[
Idx]))
5920 Order[
Idx] = SecondaryOrder[
Idx];
5940 ExternalUserReorderMap;
5945 const std::unique_ptr<TreeEntry> &TE) {
5948 findExternalStoreUsersReorderIndices(TE.get());
5949 if (!ExternalUserReorderIndices.
empty()) {
5950 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5952 std::move(ExternalUserReorderIndices));
5958 if (TE->hasState() && TE->isAltShuffle()) {
5961 unsigned Opcode0 = TE->getOpcode();
5962 unsigned Opcode1 = TE->getAltOpcode();
5965 if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
5966 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5972 if (std::optional<OrdersType> CurrentOrder =
5982 const TreeEntry *UserTE = TE.get();
5984 if (UserTE->UserTreeIndices.size() != 1)
5987 return EI.UserTE->State == TreeEntry::Vectorize &&
5988 EI.UserTE->isAltShuffle() && EI.UserTE->Idx != 0;
5991 UserTE = UserTE->UserTreeIndices.back().UserTE;
5994 VFToOrderedEntries[TE->getVectorFactor()].
insert(TE.get());
5995 if (!(TE->State == TreeEntry::Vectorize ||
5996 TE->State == TreeEntry::StridedVectorize) ||
5997 !TE->ReuseShuffleIndices.empty())
5998 GathersToOrders.
try_emplace(TE.get(), *CurrentOrder);
5999 if (TE->State == TreeEntry::Vectorize &&
6000 TE->getOpcode() == Instruction::PHI)
6001 PhisToOrders.
try_emplace(TE.get(), *CurrentOrder);
6006 for (
unsigned VF = VectorizableTree.front()->getVectorFactor();
6007 !VFToOrderedEntries.
empty() && VF > 1; VF -= 2 - (VF & 1U)) {
6008 auto It = VFToOrderedEntries.
find(VF);
6009 if (It == VFToOrderedEntries.
end())
6024 for (
const TreeEntry *OpTE : OrderedEntries) {
6027 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
6030 const auto &Order = [OpTE, &GathersToOrders, &AltShufflesToOrders,
6032 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty()) {
6033 auto It = GathersToOrders.find(OpTE);
6034 if (It != GathersToOrders.end())
6037 if (OpTE->hasState() && OpTE->isAltShuffle()) {
6038 auto It = AltShufflesToOrders.find(OpTE);
6039 if (It != AltShufflesToOrders.end())
6042 if (OpTE->State == TreeEntry::Vectorize &&
6043 OpTE->getOpcode() == Instruction::PHI) {
6044 auto It = PhisToOrders.
find(OpTE);
6045 if (It != PhisToOrders.
end())
6048 return OpTE->ReorderIndices;
6051 auto It = ExternalUserReorderMap.
find(OpTE);
6052 if (It != ExternalUserReorderMap.
end()) {
6053 const auto &ExternalUserReorderIndices = It->second;
6057 if (OpTE->getVectorFactor() != OpTE->Scalars.size()) {
6058 OrdersUses.insert(std::make_pair(
OrdersType(), 0)).first->second +=
6059 ExternalUserReorderIndices.size();
6061 for (
const OrdersType &ExtOrder : ExternalUserReorderIndices)
6062 ++OrdersUses.insert(std::make_pair(ExtOrder, 0)).first->second;
6069 if (OpTE->State == TreeEntry::Vectorize &&
6070 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6071 assert(!OpTE->isAltShuffle() &&
6072 "Alternate instructions are only supported by BinaryOperator "
6076 unsigned E = Order.size();
6079 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6082 ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
6084 ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
6087 if (OrdersUses.empty())
6090 unsigned IdentityCnt = 0;
6091 unsigned FilledIdentityCnt = 0;
6093 for (
auto &Pair : OrdersUses) {
6095 if (!Pair.first.empty())
6096 FilledIdentityCnt += Pair.second;
6097 IdentityCnt += Pair.second;
6102 unsigned Cnt = IdentityCnt;
6103 for (
auto &Pair : OrdersUses) {
6107 if (Cnt < Pair.second ||
6108 (Cnt == IdentityCnt && IdentityCnt == FilledIdentityCnt &&
6109 Cnt == Pair.second && !BestOrder.
empty() &&
6112 BestOrder = Pair.first;
6125 unsigned E = BestOrder.
size();
6127 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6130 for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6132 if (TE->Scalars.size() != VF) {
6133 if (TE->ReuseShuffleIndices.size() == VF) {
6139 return EI.UserTE->Scalars.size() == VF ||
6140 EI.UserTE->Scalars.size() ==
6143 "All users must be of VF size.");
6151 return isa<ShuffleVectorInst>(EI.UserTE->getMainOp());
6156 return isa<ShuffleVectorInst>(
6157 EI.UserTE->getMainOp());
6159 "Does not know how to reorder.");
6163 reorderNodeWithReuses(*TE, Mask);
6167 if ((TE->State == TreeEntry::Vectorize ||
6168 TE->State == TreeEntry::StridedVectorize) &&
6171 (
SLPReVec && isa<ShuffleVectorInst>(TE->getMainOp())))) {
6172 assert(!TE->isAltShuffle() &&
6173 "Alternate instructions are only supported by BinaryOperator "
6178 if (isa<InsertElementInst, StoreInst>(TE->getMainOp()))
6179 TE->reorderOperands(Mask);
6182 TE->reorderOperands(Mask);
6183 assert(TE->ReorderIndices.empty() &&
6184 "Expected empty reorder sequence.");
6187 if (!TE->ReuseShuffleIndices.empty()) {
6194 addMask(NewReuses, TE->ReuseShuffleIndices);
6195 TE->ReuseShuffleIndices.swap(NewReuses);
6201bool BoUpSLP::canReorderOperands(
6202 TreeEntry *UserTE,
SmallVectorImpl<std::pair<unsigned, TreeEntry *>> &Edges,
6205 for (
unsigned I = 0,
E = UserTE->getNumOperands();
I <
E; ++
I) {
6206 if (
any_of(Edges, [
I](
const std::pair<unsigned, TreeEntry *> &OpData) {
6207 return OpData.first ==
I &&
6208 (OpData.second->State == TreeEntry::Vectorize ||
6209 OpData.second->State == TreeEntry::StridedVectorize);
6212 if (TreeEntry *TE = getVectorizedOperand(UserTE,
I)) {
6214 if (
any_of(TE->UserTreeIndices,
6215 [UserTE](
const EdgeInfo &EI) { return EI.UserTE != UserTE; }))
6219 Edges.emplace_back(
I, TE);
6225 if (TE->State != TreeEntry::Vectorize &&
6226 TE->State != TreeEntry::StridedVectorize &&
6227 TE->ReuseShuffleIndices.empty() && TE->ReorderIndices.empty())
6231 TreeEntry *
Gather =
nullptr;
6233 [&
Gather, UserTE,
I](TreeEntry *TE) {
6234 assert(TE->State != TreeEntry::Vectorize &&
6235 TE->State != TreeEntry::StridedVectorize &&
6236 "Only non-vectorized nodes are expected.");
6237 if (
any_of(TE->UserTreeIndices,
6238 [UserTE,
I](
const EdgeInfo &EI) {
6239 return EI.UserTE == UserTE && EI.EdgeIdx == I;
6241 assert(TE->isSame(UserTE->getOperand(
I)) &&
6242 "Operand entry does not match operands.");
6263 for (
const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
6264 if (TE->State != TreeEntry::Vectorize &&
6265 TE->State != TreeEntry::StridedVectorize)
6267 if (std::optional<OrdersType> CurrentOrder =
6269 OrderedEntries.
insert(TE.get());
6270 if (!(TE->State == TreeEntry::Vectorize ||
6271 TE->State == TreeEntry::StridedVectorize) ||
6272 !TE->ReuseShuffleIndices.empty())
6273 GathersToOrders.
insert(TE.get());
6282 while (!OrderedEntries.
empty()) {
6287 for (TreeEntry *TE : OrderedEntries) {
6288 if (!(TE->State == TreeEntry::Vectorize ||
6289 TE->State == TreeEntry::StridedVectorize ||
6290 (TE->isGather() && GathersToOrders.
contains(TE))) ||
6291 TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6294 return EI.UserTE == TE->UserTreeIndices.front().UserTE;
6296 !Visited.
insert(TE).second) {
6302 for (
EdgeInfo &EI : TE->UserTreeIndices)
6306 for (TreeEntry *TE : Filtered)
6307 OrderedEntries.remove(TE);
6309 std::pair<TreeEntry *, SmallVector<std::pair<unsigned, TreeEntry *>>>>
6311 sort(UsersVec, [](
const auto &Data1,
const auto &Data2) {
6312 return Data1.first->Idx > Data2.first->Idx;
6314 for (
auto &
Data : UsersVec) {
6317 if (!canReorderOperands(
Data.first,
Data.second, NonVectorized,
6319 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
6320 OrderedEntries.remove(
Op.second);
6333 for (
const auto &
Op :
Data.second) {
6334 TreeEntry *OpTE =
Op.second;
6335 if (!VisitedOps.
insert(OpTE).second)
6337 if (!OpTE->ReuseShuffleIndices.empty() && !GathersToOrders.
count(OpTE))
6339 const auto Order = [&]() ->
const OrdersType {
6340 if (OpTE->isGather() || !OpTE->ReuseShuffleIndices.empty())
6343 return OpTE->ReorderIndices;
6347 if (Order.size() == 1)
6350 Data.second, [OpTE](
const std::pair<unsigned, TreeEntry *> &
P) {
6351 return P.second == OpTE;
6354 if (OpTE->State == TreeEntry::Vectorize &&
6355 OpTE->getOpcode() == Instruction::Store && !Order.empty()) {
6356 assert(!OpTE->isAltShuffle() &&
6357 "Alternate instructions are only supported by BinaryOperator "
6361 unsigned E = Order.size();
6364 return Idx == PoisonMaskElem ? E : static_cast<unsigned>(Idx);
6367 OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second +=
6370 OrdersUses.insert(std::make_pair(Order, 0)).first->second += NumOps;
6372 auto Res = OrdersUses.insert(std::make_pair(
OrdersType(), 0));
6373 const auto AllowsReordering = [&](
const TreeEntry *TE) {
6374 if (!TE->ReorderIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
6375 (TE->State == TreeEntry::Vectorize && TE->isAltShuffle()) ||
6376 (IgnoreReorder && TE->Idx == 0))
6378 if (TE->isGather()) {
6387 for (
const EdgeInfo &EI : OpTE->UserTreeIndices) {
6388 TreeEntry *UserTE = EI.
UserTE;
6389 if (!VisitedUsers.
insert(UserTE).second)
6394 if (AllowsReordering(UserTE))
6402 if (
static_cast<unsigned>(
count_if(
6403 Ops, [UserTE, &AllowsReordering](
6404 const std::pair<unsigned, TreeEntry *> &
Op) {
6405 return AllowsReordering(
Op.second) &&
6408 return EI.UserTE == UserTE;
6410 })) <= Ops.
size() / 2)
6411 ++Res.first->second;
6414 if (OrdersUses.empty()) {
6415 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
6416 OrderedEntries.remove(
Op.second);
6420 unsigned IdentityCnt = 0;
6421 unsigned VF =
Data.second.front().second->getVectorFactor();
6423 for (
auto &Pair : OrdersUses) {
6425 IdentityCnt += Pair.second;
6430 unsigned Cnt = IdentityCnt;
6431 for (
auto &Pair : OrdersUses) {
6435 if (Cnt < Pair.second) {
6437 BestOrder = Pair.first;
6445 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second)
6446 OrderedEntries.remove(
Op.second);
6455 unsigned E = BestOrder.
size();
6457 return I < E ? static_cast<int>(I) : PoisonMaskElem;
6459 for (
const std::pair<unsigned, TreeEntry *> &
Op :
Data.second) {
6460 TreeEntry *TE =
Op.second;
6461 OrderedEntries.remove(TE);
6462 if (!VisitedOps.
insert(TE).second)
6464 if (TE->ReuseShuffleIndices.size() == BestOrder.
size()) {
6465 reorderNodeWithReuses(*TE, Mask);
6469 if (TE->State != TreeEntry::Vectorize &&
6470 TE->State != TreeEntry::StridedVectorize &&
6471 (TE->State != TreeEntry::ScatterVectorize ||
6472 TE->ReorderIndices.empty()))
6474 assert((BestOrder.
size() == TE->ReorderIndices.size() ||
6475 TE->ReorderIndices.empty()) &&
6476 "Non-matching sizes of user/operand entries.");
6478 if (IgnoreReorder && TE == VectorizableTree.front().get())
6479 IgnoreReorder =
false;
6482 for (TreeEntry *
Gather : GatherOps) {
6484 "Unexpected reordering of gathers.");
6485 if (!
Gather->ReuseShuffleIndices.empty()) {
6491 OrderedEntries.remove(
Gather);
6495 if (
Data.first->State != TreeEntry::Vectorize ||
6496 !isa<ExtractElementInst, ExtractValueInst, LoadInst>(
6497 Data.first->getMainOp()) ||
6498 Data.first->isAltShuffle())
6499 Data.first->reorderOperands(Mask);
6500 if (!isa<InsertElementInst, StoreInst>(
Data.first->getMainOp()) ||
6501 Data.first->isAltShuffle() ||
6502 Data.first->State == TreeEntry::StridedVectorize) {
6506 if (
Data.first->ReuseShuffleIndices.empty() &&
6507 !
Data.first->ReorderIndices.empty() &&
6508 !
Data.first->isAltShuffle()) {
6511 OrderedEntries.insert(
Data.first);
6519 if (IgnoreReorder && !VectorizableTree.front()->ReorderIndices.empty() &&
6520 VectorizableTree.front()->ReuseShuffleIndices.empty())
6521 VectorizableTree.front()->ReorderIndices.clear();
6524Instruction *BoUpSLP::getRootEntryInstruction(
const TreeEntry &Entry)
const {
6525 if ((Entry.getOpcode() == Instruction::Store ||
6526 Entry.getOpcode() == Instruction::Load) &&
6527 Entry.State == TreeEntry::StridedVectorize &&
6528 !Entry.ReorderIndices.empty() &&
isReverseOrder(Entry.ReorderIndices))
6529 return dyn_cast<Instruction>(Entry.Scalars[Entry.ReorderIndices.front()]);
6530 return dyn_cast<Instruction>(Entry.Scalars.front());
6537 for (
auto &TEPtr : VectorizableTree) {
6538 TreeEntry *Entry = TEPtr.get();
6541 if (Entry->isGather())
6545 for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
6546 Value *Scalar = Entry->Scalars[Lane];
6547 if (!isa<Instruction>(Scalar))
6550 auto It = ScalarToExtUses.
find(Scalar);
6551 if (It != ScalarToExtUses.
end() && !ExternalUses[It->second].User)
6555 const auto ExtI = ExternallyUsedValues.
find(Scalar);
6556 if (ExtI != ExternallyUsedValues.
end()) {
6557 int FoundLane = Entry->findLaneForValue(Scalar);
6558 LLVM_DEBUG(
dbgs() <<
"SLP: Need to extract: Extra arg from lane "
6559 << FoundLane <<
" from " << *Scalar <<
".\n");
6560 ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size());
6561 ExternalUses.emplace_back(Scalar,
nullptr, FoundLane);
6564 for (
User *U : Scalar->users()) {
6572 if (UserIgnoreList && UserIgnoreList->contains(UserInst))
6576 if (TreeEntry *UseEntry = getTreeEntry(U)) {
6580 if (UseEntry->State == TreeEntry::ScatterVectorize ||
6582 Scalar, getRootEntryInstruction(*UseEntry), TLI,
TTI)) {
6583 LLVM_DEBUG(
dbgs() <<
"SLP: \tInternal user will be removed:" << *U
6585 assert(!UseEntry->isGather() &&
"Bad state");
6589 if (It != ScalarToExtUses.
end()) {
6590 ExternalUses[It->second].User =
nullptr;
6595 if (U && Scalar->hasNUsesOrMore(
UsesLimit))
6597 int FoundLane = Entry->findLaneForValue(Scalar);
6599 <<
" from lane " << FoundLane <<
" from " << *Scalar
6601 It = ScalarToExtUses.
try_emplace(Scalar, ExternalUses.size()).first;
6602 ExternalUses.emplace_back(Scalar, U, FoundLane);
6611BoUpSLP::collectUserStores(
const BoUpSLP::TreeEntry *TE)
const {
6615 for (
unsigned Lane : seq<unsigned>(0, TE->Scalars.size())) {
6616 Value *V = TE->Scalars[Lane];
6618 if (!isa<Instruction>(V))
6625 for (
User *U : V->users()) {
6626 auto *SI = dyn_cast<StoreInst>(U);
6629 if (SI ==
nullptr || !SI->isSimple() || SI->getFunction() !=
F ||
6633 if (getTreeEntry(U))
6638 auto &StoresVec = PtrToStoresMap[{SI->getParent(),
6639 SI->getValueOperand()->getType(),
Ptr}];
6642 if (StoresVec.size() > Lane)
6644 if (!StoresVec.empty()) {
6646 SI->getValueOperand()->getType(), SI->getPointerOperand(),
6647 SI->getValueOperand()->getType(),
6648 StoresVec.front()->getPointerOperand(), *
DL, *SE,
6654 StoresVec.push_back(SI);
6659 for (
auto &
P : PtrToStoresMap) {
6660 Res[
I].swap(
P.second);
6667 OrdersType &ReorderIndices)
const {
6678 for (
unsigned Idx : seq<unsigned>(1, StoresVec.
size())) {
6680 std::optional<int> Diff =
6682 SI->getPointerOperand(), *
DL, *SE,
6688 if (StoreOffsetVec.
size() != StoresVec.
size())
6690 sort(StoreOffsetVec,
6691 [](
const std::pair<int, unsigned> &L,
6692 const std::pair<int, unsigned> &R) {
return L.first <
R.first; });
6695 for (
const auto &
P : StoreOffsetVec) {
6696 if (
Idx > 0 &&
P.first != PrevDist + 1)
6704 ReorderIndices.assign(StoresVec.
size(), 0);
6705 bool IsIdentity =
true;
6707 ReorderIndices[
P.second] =
I;
6708 IsIdentity &=
P.second ==
I;
6714 ReorderIndices.clear();
6721 for (
unsigned Idx : Order)
6728BoUpSLP::findExternalStoreUsersReorderIndices(TreeEntry *TE)
const {
6729 unsigned NumLanes =
TE->Scalars.size();
6742 if (StoresVec.
size() != NumLanes)
6747 if (!canFormVector(StoresVec, ReorderIndices))
6752 ExternalReorderIndices.
push_back(ReorderIndices);
6754 return ExternalReorderIndices;
6760 UserIgnoreList = &UserIgnoreLst;
6763 buildTree_rec(Roots, 0,
EdgeInfo());
6770 buildTree_rec(Roots, 0,
EdgeInfo());
6779 bool AddNew =
true) {
6787 for (
Value *V : VL) {
6788 auto *LI = dyn_cast<LoadInst>(V);
6791 if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple())
6793 bool IsFound =
false;
6794 for (
auto [Map,
Data] :
zip(ClusteredDistToLoad, ClusteredLoads)) {
6795 assert(LI->getParent() ==
Data.front().first->getParent() &&
6796 LI->getType() ==
Data.front().first->getType() &&
6800 "Expected loads with the same type, same parent and same "
6801 "underlying pointer.");
6803 LI->getType(), LI->getPointerOperand(),
Data.front().first->getType(),
6804 Data.front().first->getPointerOperand(),
DL, SE,
6808 auto It = Map.find(*Dist);
6809 if (It != Map.end() && It->second != LI)
6811 if (It == Map.end()) {
6812 Data.emplace_back(LI, *Dist);
6813 Map.try_emplace(*Dist, LI);
6823 auto FindMatchingLoads =
6828 int &
Offset,
unsigned &Start) {
6830 return GatheredLoads.
end();
6840 std::optional<int> Dist =
6842 Data.front().first->getType(),
6843 Data.front().first->getPointerOperand(),
DL, SE,
6849 for (std::pair<LoadInst *, int>
P :
Data) {
6855 unsigned NumUniques = 0;
6856 for (
auto [Cnt, Pair] :
enumerate(Loads)) {
6857 bool Used = DataLoads.
contains(Pair.first);
6858 if (!Used && !DataDists.
contains(*Dist + Pair.second)) {
6862 Repeated.insert(Cnt);
6865 if (NumUniques > 0 &&
6866 (Loads.
size() == NumUniques ||
6867 (Loads.
size() - NumUniques >= 2 &&
6868 Loads.
size() - NumUniques >= Loads.
size() / 2 &&
6874 return std::next(GatheredLoads.
begin(),
Idx);
6878 return GatheredLoads.
end();
6880 for (
ArrayRef<std::pair<LoadInst *, int>>
Data : ClusteredLoads) {
6884 auto *It = FindMatchingLoads(
Data, GatheredLoads, LocalToAdd, Repeated,
6886 while (It != GatheredLoads.
end()) {
6887 assert(!LocalToAdd.
empty() &&
"Expected some elements to add.");
6888 for (
unsigned Idx : LocalToAdd)
6890 ToAdd.
insert(LocalToAdd.begin(), LocalToAdd.end());
6891 It = FindMatchingLoads(
Data, GatheredLoads, LocalToAdd, Repeated,
Offset,
6895 return !ToAdd.contains(Idx) && !Repeated.contains(Idx);
6899 for (
unsigned Idx : seq<unsigned>(
Data.size())) {
6908 GatheredLoads, [&](
ArrayRef<std::pair<LoadInst *, int>> PD) {
6909 return PD.front().first->getParent() == LI->
getParent() &&
6910 PD.front().first->getType() == LI->
getType();
6912 while (It != GatheredLoads.
end()) {
6915 std::next(It), GatheredLoads.
end(),
6916 [&](
ArrayRef<std::pair<LoadInst *, int>> PD) {
6917 return PD.front().first->getParent() == LI->getParent() &&
6918 PD.front().first->getType() == LI->getType();
6922 GatheredLoads.emplace_back().append(
Data.begin(),
Data.end());
6923 AddNewLoads(GatheredLoads.emplace_back());
6928void BoUpSLP::tryToVectorizeGatheredLoads(
6931 8> &GatheredLoads) {
6932 GatheredLoadsEntriesFirst = VectorizableTree.size();
6935 LoadEntriesToVectorize.
size());
6936 for (
auto [
Idx, Set] :
zip(LoadEntriesToVectorize, LoadSetsToVectorize))
6937 Set.insert(VectorizableTree[
Idx]->Scalars.begin(),
6938 VectorizableTree[
Idx]->Scalars.end());
6941 auto LoadSorter = [](
const std::pair<LoadInst *, int> &L1,
6942 const std::pair<LoadInst *, int> &L2) {
6943 return L1.second > L2.second;
6949 Align Alignment = computeCommonAlignment<LoadInst>(Values);
6950 auto *Ty =
getWidenedType(Loads.front()->getType(), Loads.size());
6958 bool Final,
unsigned MaxVF) {
6960 unsigned StartIdx = 0;
6965 *
TTI, Loads.
front()->getType(), MaxVF);
6967 *
TTI, Loads.
front()->getType(), NumElts - 1)) {
6973 if (Final && CandidateVFs.
empty())
6976 unsigned BestVF = Final ? CandidateVFs.
back() : 0;
6977 for (
unsigned NumElts : CandidateVFs) {
6978 if (Final && NumElts > BestVF)
6981 for (
unsigned Cnt = StartIdx,
E = Loads.
size(); Cnt <
E;
6985 if (VectorizedLoads.count(Slice.
front()) ||
6986 VectorizedLoads.count(Slice.
back()) ||
6992 bool AllowToVectorize =
false;
7000 if (LI->hasOneUse())
7006 if (
static_cast<unsigned int>(std::distance(
7007 LI->user_begin(), LI->user_end())) != LI->getNumUses())
7009 if (!IsLegalBroadcastLoad)
7013 for (
User *U : LI->users()) {
7014 if (
auto *UI = dyn_cast<Instruction>(U); UI &&
isDeleted(UI))
7016 if (
const TreeEntry *UTE = getTreeEntry(U)) {
7017 for (
int I : seq<int>(UTE->getNumOperands())) {
7018 if (
all_of(UTE->getOperand(
I),
7019 [LI](
Value *V) { return V == LI; }))
7028 AllowToVectorize = CheckIfAllowed(Slice);
7032 any_of(ValueToGatherNodes.at(Slice.front()),
7033 [=](
const TreeEntry *TE) {
7034 return TE->Scalars.size() == 2 &&
7035 ((TE->Scalars.front() == Slice.front() &&
7036 TE->Scalars.back() == Slice.back()) ||
7037 (TE->Scalars.front() == Slice.back() &&
7038 TE->Scalars.back() == Slice.front()));
7043 if (AllowToVectorize) {
7048 reinterpret_cast<Value *
const *
>(Slice.begin()), Slice.size());
7050 PointerOps, &BestVF);
7052 (BestVF > 1 &&
static_cast<unsigned>(NumElts) == 2 * BestVF)) {
7054 if (MaskedGatherVectorized.
empty() ||
7055 Cnt >= MaskedGatherVectorized.
back() + NumElts)
7060 Results.emplace_back(Values, LS);
7061 VectorizedLoads.insert(Slice.begin(), Slice.end());
7064 if (Cnt == StartIdx)
7065 StartIdx += NumElts;
7068 if (StartIdx >= Loads.
size())
7072 if (!MaskedGatherVectorized.
empty() &&
7073 Cnt < MaskedGatherVectorized.
back() + NumElts)
7079 if (!AllowToVectorize || BestVF == 0)
7083 for (
unsigned Cnt : MaskedGatherVectorized) {
7085 Cnt, std::min<unsigned>(NumElts, Loads.
size() - Cnt));
7089 VectorizedLoads.insert(Slice.
begin(), Slice.
end());
7091 if (Cnt == StartIdx)
7092 StartIdx += NumElts;
7096 if (!VectorizedLoads.contains(LI))
7097 NonVectorized.push_back(LI);
7101 auto ProcessGatheredLoads =
7104 bool Final =
false) {
7106 for (
ArrayRef<std::pair<LoadInst *, int>> LoadsDists : GatheredLoads) {
7107 if (LoadsDists.size() <= 1) {
7108 NonVectorized.
push_back(LoadsDists.back().first);
7113 transform(LoadsDists, OriginalLoads.begin(),
7114 [](
const std::pair<LoadInst *, int> &L) ->
LoadInst * {
7119 unsigned MaxConsecutiveDistance = 0;
7120 unsigned CurrentConsecutiveDist = 1;
7121 int LastDist = LocalLoadsDists.
front().second;
7122 bool AllowMaskedGather = IsMaskedGatherSupported(OriginalLoads);
7123 for (
const std::pair<LoadInst *, int> &L : LocalLoadsDists) {
7124 if (getTreeEntry(
L.first))
7126 assert(LastDist >=
L.second &&
7127 "Expected first distance always not less than second");
7128 if (
static_cast<unsigned>(LastDist -
L.second) ==
7129 CurrentConsecutiveDist) {
7130 ++CurrentConsecutiveDist;
7131 MaxConsecutiveDistance =
7132 std::max(MaxConsecutiveDistance, CurrentConsecutiveDist);
7136 if (!AllowMaskedGather && CurrentConsecutiveDist == 1 &&
7139 CurrentConsecutiveDist = 1;
7140 LastDist =
L.second;
7143 if (Loads.
size() <= 1)
7145 if (AllowMaskedGather)
7146 MaxConsecutiveDistance = Loads.
size();
7147 else if (MaxConsecutiveDistance < 2)
7152 GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized,
7153 Final, MaxConsecutiveDistance);
7155 OriginalLoads.size() == Loads.
size() &&
7156 MaxConsecutiveDistance == Loads.
size() &&
7161 VectorizedLoads.
clear();
7165 GetVectorizedRanges(OriginalLoads, VectorizedLoads,
7166 UnsortedNonVectorized, Final,
7167 OriginalLoads.size());
7168 if (SortedNonVectorized.
size() >= UnsortedNonVectorized.
size()) {
7169 SortedNonVectorized.
swap(UnsortedNonVectorized);
7170 Results.swap(UnsortedResults);
7175 << Slice.
size() <<
")\n");
7176 if (
any_of(Slice, [&](
Value *V) {
return getTreeEntry(V); })) {
7177 for (
Value *L : Slice)
7178 if (!getTreeEntry(L))
7179 SortedNonVectorized.
push_back(cast<LoadInst>(L));
7185 unsigned MaxVF = Slice.size();
7186 unsigned UserMaxVF = 0;
7187 unsigned InterleaveFactor = 0;
7192 std::optional<unsigned> InterleavedLoadsDistance = 0;
7194 std::optional<unsigned> CommonVF = 0;
7198 for (
const TreeEntry *
E : ValueToGatherNodes.at(V)) {
7199 UserMaxVF = std::max<unsigned>(UserMaxVF,
E->Scalars.size());
7202 UserMaxVF = std::max<unsigned>(UserMaxVF,
Idx - Pos + 1);
7204 if (*CommonVF == 0) {
7205 CommonVF =
E->Scalars.size();
7208 if (*CommonVF !=
E->Scalars.size())
7212 if (Pos !=
Idx && InterleavedLoadsDistance) {
7215 if (isa<Constant>(V))
7217 if (getTreeEntry(V))
7219 const auto &Nodes = ValueToGatherNodes.at(V);
7220 return (Nodes.size() != 1 || !Nodes.contains(E)) &&
7221 !is_contained(Slice, V);
7223 InterleavedLoadsDistance.reset();
7227 if (*InterleavedLoadsDistance == 0) {
7228 InterleavedLoadsDistance =
Idx - Pos;
7231 if ((
Idx - Pos) % *InterleavedLoadsDistance != 0 ||
7232 (
Idx - Pos) / *InterleavedLoadsDistance < Order)
7233 InterleavedLoadsDistance.reset();
7234 Order = (
Idx - Pos) / InterleavedLoadsDistance.value_or(1);
7238 DeinterleavedNodes.
clear();
7240 if (InterleavedLoadsDistance.value_or(0) > 1 &&
7241 CommonVF.value_or(0) != 0) {
7242 InterleaveFactor =
bit_ceil(*InterleavedLoadsDistance);
7243 unsigned VF = *CommonVF;
7247 if (InterleaveFactor <= Slice.size() &&
7251 cast<LoadInst>(Slice.front())->getAlign(),
7252 cast<LoadInst>(Slice.front())
7256 UserMaxVF = InterleaveFactor * VF;
7258 InterleaveFactor = 0;
7263 unsigned ConsecutiveNodesSize = 0;
7264 if (!LoadEntriesToVectorize.
empty() && InterleaveFactor == 0 &&
7265 any_of(
zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7266 [&, Slice = Slice](
const auto &
P) {
7268 return std::get<1>(
P).contains(V);
7270 if (It == Slice.end())
7273 VectorizableTree[std::get<0>(
P)]->Scalars;
7274 ConsecutiveNodesSize += VL.
size();
7275 unsigned Start = std::distance(Slice.begin(), It);
7276 unsigned Sz = Slice.size() - Start;
7277 return Sz < VL.
size() ||
7278 Slice.slice(std::distance(Slice.begin(), It),
7284 if (InterleaveFactor == 0 &&
7285 any_of(seq<unsigned>(Slice.size() / UserMaxVF),
7286 [&, Slice = Slice](
unsigned Idx) {
7288 SmallVector<Value *> PointerOps;
7289 return canVectorizeLoads(
7290 Slice.slice(Idx * UserMaxVF, UserMaxVF),
7291 Slice[Idx * UserMaxVF], Order,
7293 LoadsState::ScatterVectorize;
7296 if (Slice.size() != ConsecutiveNodesSize)
7297 MaxVF = std::min<unsigned>(MaxVF, UserMaxVF);
7299 for (
unsigned VF = MaxVF; VF >= 2; VF /= 2) {
7300 bool IsVectorized =
true;
7301 for (
unsigned I = 0,
E = Slice.size();
I <
E;
I += VF) {
7303 Slice.
slice(
I, std::min(VF,
E -
I));
7304 if (getTreeEntry(SubSlice.
front()))
7308 if (
any_of(
zip(LoadEntriesToVectorize, LoadSetsToVectorize),
7309 [&](
const auto &
P) {
7311 VectorizableTree[std::get<0>(
P)]
7316 unsigned Sz = VectorizableTree.size();
7317 buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor);
7318 if (Sz == VectorizableTree.size()) {
7319 IsVectorized =
false;
7322 if (InterleaveFactor > 0) {
7323 VF = 2 * (MaxVF / InterleaveFactor);
7324 InterleaveFactor = 0;
7333 NonVectorized.
append(SortedNonVectorized);
7335 return NonVectorized;
7337 for (
const auto &GLs : GatheredLoads) {
7338 const auto &
Ref = GLs.second;
7340 if (!
Ref.empty() && !NonVectorized.
empty() &&
7342 Ref.begin(),
Ref.end(), 0u,
7344 ArrayRef<std::pair<LoadInst *, int>> LoadsDists) ->
unsigned {
7345 return S + LoadsDists.size();
7346 }) != NonVectorized.
size() &&
7347 IsMaskedGatherSupported(NonVectorized)) {
7349 for (
LoadInst *LI : NonVectorized) {
7357 (void)ProcessGatheredLoads(FinalGatheredLoads,
true);
7361 for (
unsigned Idx : LoadEntriesToVectorize) {
7362 const TreeEntry &
E = *VectorizableTree[
Idx];
7365 if (!
E.ReorderIndices.empty()) {
7372 buildTree_rec(GatheredScalars, 0, EdgeInfo());
7376 if (
static_cast<unsigned>(*GatheredLoadsEntriesFirst) ==
7377 VectorizableTree.size())
7378 GatheredLoadsEntriesFirst.reset();
7385 Value *NeedsScheduling =
nullptr;
7386 for (
Value *V : VL) {
7389 if (!NeedsScheduling) {
7390 NeedsScheduling = V;
7395 return NeedsScheduling;
7406 bool AllowAlternate) {
7410 if (
auto *LI = dyn_cast<LoadInst>(V)) {
7413 SubKey =
hash_value(LoadsSubkeyGenerator(Key, LI));
7418 if (isa<ExtractElementInst, UndefValue>(V))
7420 if (
auto *EI = dyn_cast<ExtractElementInst>(V)) {
7422 !isa<UndefValue>(EI->getIndexOperand()))
7425 }
else if (
auto *
I = dyn_cast<Instruction>(V)) {
7428 if ((isa<BinaryOperator, CastInst>(
I)) &&
7438 : cast<CastInst>(
I)->getOperand(0)->getType()));
7440 if (isa<CastInst>(
I)) {
7441 std::pair<size_t, size_t> OpVals =
7447 }
else if (
auto *CI = dyn_cast<CmpInst>(
I)) {
7449 if (CI->isCommutative())
7455 }
else if (
auto *Call = dyn_cast<CallInst>(
I)) {
7469 }
else if (
auto *Gep = dyn_cast<GetElementPtrInst>(
I)) {
7470 if (Gep->getNumOperands() == 2 && isa<ConstantInt>(Gep->getOperand(1)))
7471 SubKey =
hash_value(Gep->getPointerOperand());
7475 !isa<ConstantInt>(
I->getOperand(1))) {
7483 return std::make_pair(Key, SubKey);
7493bool BoUpSLP::areAltOperandsProfitable(
const InstructionsState &S,
7495 unsigned Opcode0 = S.getOpcode();
7496 unsigned Opcode1 = S.getAltOpcode();
7500 Opcode0, Opcode1, OpcodeMask))
7503 for (
unsigned I : seq<unsigned>(S.getMainOp()->getNumOperands())) {
7506 for (
Value *V : VL) {
7507 if (isa<PoisonValue>(V)) {
7512 Operands.back().push_back(cast<Instruction>(V)->getOperand(
I));
7517 for (
unsigned I : seq<unsigned>(0, VL.size() - 1)) {
7523 switch (Res.value_or(0)) {
7538 constexpr unsigned NumAltInsts = 3;
7539 unsigned NonInstCnt = 0;
7542 unsigned UndefCnt = 0;
7544 unsigned ExtraShuffleInsts = 0;
7553 return is_contained(Operands.back(), V);
7556 ++ExtraShuffleInsts;
7573 if (isa<Constant, ExtractElementInst>(V) ||
7574 getTreeEntry(V) || (L &&
L->isLoopInvariant(V))) {
7575 if (isa<UndefValue>(V))
7581 if (!Res.second && Res.first->second == 1)
7582 ++ExtraShuffleInsts;
7583 ++Res.first->getSecond();
7584 if (
auto *
I = dyn_cast<Instruction>(V))
7585 UniqueOpcodes.
insert(
I->getOpcode());
7586 else if (Res.second)
7589 return none_of(Uniques, [&](
const auto &
P) {
7590 return P.first->hasNUsesOrMore(
P.second + 1) &&
7592 return getTreeEntry(U) || Uniques.contains(U);
7601 (UndefCnt < (VL.size() - 1) * S.getMainOp()->getNumOperands() &&
7602 (UniqueOpcodes.
size() + NonInstCnt + ExtraShuffleInsts +
7603 NumAltInsts) < S.getMainOp()->getNumOperands() * VL.size());
7606BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
7608 bool IsScatterVectorizeUserTE, OrdersType &CurrentOrder,
7611 "Expected instructions with same/alternate opcodes only.");
7613 unsigned ShuffleOrOp =
7614 S.isAltShuffle() ? (
unsigned)Instruction::ShuffleVector : S.getOpcode();
7616 switch (ShuffleOrOp) {
7617 case Instruction::PHI: {
7620 return TreeEntry::NeedToGather;
7622 for (
Value *V : VL) {
7623 auto *
PHI = dyn_cast<PHINode>(V);
7628 if (Term &&
Term->isTerminator()) {
7630 <<
"SLP: Need to swizzle PHINodes (terminator use).\n");
7631 return TreeEntry::NeedToGather;
7636 return TreeEntry::Vectorize;
7638 case Instruction::ExtractValue:
7639 case Instruction::ExtractElement: {
7640 bool Reuse = canReuseExtract(VL, CurrentOrder);
7644 return TreeEntry::NeedToGather;
7645 if (Reuse || !CurrentOrder.empty())
7646 return TreeEntry::Vectorize;
7648 return TreeEntry::NeedToGather;
7650 case Instruction::InsertElement: {
7654 for (
Value *V : VL) {
7655 SourceVectors.
insert(cast<Instruction>(V)->getOperand(0));
7657 "Non-constant or undef index?");
7661 return !SourceVectors.contains(V);
7664 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
7665 "different source vectors.\n");
7666 return TreeEntry::NeedToGather;
7671 return SourceVectors.contains(V) && !
V->hasOneUse();
7674 LLVM_DEBUG(
dbgs() <<
"SLP: Gather of insertelement vectors with "
7675 "multiple uses.\n");
7676 return TreeEntry::NeedToGather;
7679 return TreeEntry::Vectorize;
7681 case Instruction::Load: {
7690 return TreeEntry::Vectorize;
7692 if (!IsGraphTransformMode && !VectorizableTree.empty()) {
7694 LoadEntriesToVectorize.insert(VectorizableTree.size());
7695 return TreeEntry::NeedToGather;
7697 return TreeEntry::ScatterVectorize;
7699 if (!IsGraphTransformMode && VectorizableTree.size() > 1) {
7701 LoadEntriesToVectorize.insert(VectorizableTree.size());
7702 return TreeEntry::NeedToGather;
7704 return TreeEntry::StridedVectorize;
7708 if (
DL->getTypeSizeInBits(ScalarTy) !=
7709 DL->getTypeAllocSizeInBits(ScalarTy))
7710 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering loads of non-packed type.\n");
7712 auto *LI = dyn_cast<LoadInst>(V);
7713 return !LI || !LI->isSimple();
7720 return TreeEntry::NeedToGather;
7724 case Instruction::ZExt:
7725 case Instruction::SExt:
7726 case Instruction::FPToUI:
7727 case Instruction::FPToSI:
7728 case Instruction::FPExt:
7729 case Instruction::PtrToInt:
7730 case Instruction::IntToPtr:
7731 case Instruction::SIToFP:
7732 case Instruction::UIToFP:
7733 case Instruction::Trunc:
7734 case Instruction::FPTrunc:
7735 case Instruction::BitCast: {
7737 for (
Value *V : VL) {
7738 if (isa<PoisonValue>(V))
7740 Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
7743 dbgs() <<
"SLP: Gathering casts with different src types.\n");
7744 return TreeEntry::NeedToGather;
7747 return TreeEntry::Vectorize;
7749 case Instruction::ICmp:
7750 case Instruction::FCmp: {
7755 for (
Value *V : VL) {
7756 if (isa<PoisonValue>(V))
7758 auto *
Cmp = cast<CmpInst>(V);
7759 if ((
Cmp->getPredicate() != P0 &&
Cmp->getPredicate() != SwapP0) ||
7760 Cmp->getOperand(0)->getType() != ComparedTy) {
7761 LLVM_DEBUG(
dbgs() <<
"SLP: Gathering cmp with different predicate.\n");
7762 return TreeEntry::NeedToGather;
7765 return TreeEntry::Vectorize;
7767 case Instruction::Select:
7768 case Instruction::FNeg:
7769 case Instruction::Add:
7770 case Instruction::FAdd:
7771 case Instruction::Sub:
7772 case Instruction::FSub:
7773 case Instruction::Mul:
7774 case Instruction::FMul:
7775 case Instruction::UDiv:
7776 case Instruction::SDiv:
7777 case Instruction::FDiv:
7778 case Instruction::URem:
7779 case Instruction::SRem:
7780 case Instruction::FRem:
7781 case Instruction::Shl:
7782 case Instruction::LShr:
7783 case Instruction::AShr:
7784 case Instruction::And:
7785 case Instruction::Or:
7786 case Instruction::Xor:
7787 case Instruction::Freeze:
7788 if (S.getMainOp()->getType()->isFloatingPointTy() &&
7790 auto *
I = dyn_cast<Instruction>(V);
7791 return I &&
I->isBinaryOp() && !
I->isFast();
7793 return TreeEntry::NeedToGather;
7794 return TreeEntry::Vectorize;
7795 case Instruction::GetElementPtr: {
7797 for (
Value *V : VL) {
7798 auto *
I = dyn_cast<GetElementPtrInst>(V);
7801 if (
I->getNumOperands() != 2) {
7802 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (nested indexes).\n");
7803 return TreeEntry::NeedToGather;
7809 Type *Ty0 = cast<GEPOperator>(VL0)->getSourceElementType();
7810 for (
Value *V : VL) {
7811 auto *
GEP = dyn_cast<GEPOperator>(V);
7814 Type *CurTy =
GEP->getSourceElementType();
7816 LLVM_DEBUG(
dbgs() <<
"SLP: not-vectorizable GEP (different types).\n");
7817 return TreeEntry::NeedToGather;
7823 for (
Value *V : VL) {
7824 auto *
I = dyn_cast<GetElementPtrInst>(V);
7827 auto *
Op =
I->getOperand(1);
7828 if ((!IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
7829 (
Op->getType() != Ty1 &&
7830 ((IsScatterVectorizeUserTE && !isa<ConstantInt>(
Op)) ||
7831 Op->getType()->getScalarSizeInBits() >
7832 DL->getIndexSizeInBits(
7833 V->getType()->getPointerAddressSpace())))) {
7835 dbgs() <<
"SLP: not-vectorizable GEP (non-constant indexes).\n");
7836 return TreeEntry::NeedToGather;
7840 return TreeEntry::Vectorize;
  case Instruction::Store: {
    // Check if the stores are consecutive or if we need to swizzle them.
    llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
    // Avoid types that are padded when being allocated as scalars, while
    // being packed together in a vector (such as i1).
    if (DL->getTypeSizeInBits(ScalarTy) !=
        DL->getTypeAllocSizeInBits(ScalarTy)) {
      LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
      return TreeEntry::NeedToGather;
    }
    // Make sure all stores in the bundle are simple - we can't vectorize
    // atomic or volatile stores.
    for (Value *V : VL) {
      auto *SI = cast<StoreInst>(V);
      if (!SI->isSimple()) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
        return TreeEntry::NeedToGather;
      }
      PointerOps.push_back(SI->getPointerOperand());
    }
    // Check the order of pointer operands.
    if (llvm::sortPtrAccesses(PointerOps, ScalarTy, *DL, *SE, CurrentOrder)) {
      Value *Ptr0;
      Value *PtrN;
      if (CurrentOrder.empty()) {
        Ptr0 = PointerOps.front();
        PtrN = PointerOps.back();
      } else {
        Ptr0 = PointerOps[CurrentOrder.front()];
        PtrN = PointerOps[CurrentOrder.back()];
      }
      std::optional<int> Dist =
          getPointersDiff(ScalarTy, Ptr0, ScalarTy, PtrN, *DL, *SE);
      // Check that the sorted pointer operands are consecutive.
      if (static_cast<unsigned>(*Dist) == VL.size() - 1)
        return TreeEntry::Vectorize;
    }

    LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
    return TreeEntry::NeedToGather;
  }
  case Instruction::Call: {
    if (S.getMainOp()->getType()->isFloatingPointTy() &&
        TTI->isFPVectorizationPotentiallyUnsafe() && any_of(VL, [](Value *V) {
          auto *I = dyn_cast<Instruction>(V);
          return I && !I->isFast();
        }))
      return TreeEntry::NeedToGather;
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    VFShape Shape = VFShape::get(
        CI->getFunctionType(),
        ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
        false /*HasGlobalPred*/);
    Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);

    if (!VecFunc && !isTriviallyVectorizable(ID)) {
      LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
      return TreeEntry::NeedToGather;
    }
    Function *F = CI->getCalledFunction();
    unsigned NumArgs = CI->arg_size();
    SmallVector<Value *, 4> ScalarArgs(NumArgs, nullptr);
    for (unsigned J = 0; J != NumArgs; ++J)
      if (isVectorIntrinsicWithScalarOpAtArg(ID, J))
        ScalarArgs[J] = CI->getArgOperand(J);
    for (Value *V : VL) {
      CallInst *CI2 = dyn_cast<CallInst>(V);
      if (!CI2 || CI2->getCalledFunction() != F ||
          getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
          (VecFunc &&
           VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
          !CI->hasIdenticalOperandBundleSchema(*CI2)) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
                          << "\n");
        return TreeEntry::NeedToGather;
      }
      // Intrinsics with scalar arguments need those arguments to be the same
      // across the whole bundle.
      for (unsigned J = 0; J != NumArgs; ++J) {
        if (ScalarArgs[J]) {
          Value *A1J = CI2->getArgOperand(J);
          if (ScalarArgs[J] != A1J) {
            LLVM_DEBUG(dbgs()
                       << "SLP: mismatched arguments in call:" << *CI
                       << " argument " << ScalarArgs[J] << "!=" << A1J << "\n");
            return TreeEntry::NeedToGather;
          }
        }
      }
      // Verify that the bundle operands are identical between the two calls.
      if (CI->hasOperandBundles() &&
          !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
                      CI->op_begin() + CI->getBundleOperandsEndIndex(),
                      CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
        LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI
                          << "!=" << *V << '\n');
        return TreeEntry::NeedToGather;
      }
    }
    return TreeEntry::Vectorize;
  }
  case Instruction::ShuffleVector: {
    if (!S.isAltShuffle()) {
      // REVEC can support non alternate shuffle.
      if (SLPReVec && getShufflevectorNumGroups(VL))
        return TreeEntry::Vectorize;
      // If this is not an alternate sequence of opcode like add-sub
      // then do not vectorize this instruction.
      LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
      return TreeEntry::NeedToGather;
    }
    if (!SLPSkipEarlyProfitabilityCheck && !areAltOperandsProfitable(S, VL)) {
      LLVM_DEBUG(
          dbgs()
          << "SLP: ShuffleVector not vectorized, operands are buildvector and "
             "the whole alt sequence is not profitable.\n");
      return TreeEntry::NeedToGather;
    }

    return TreeEntry::Vectorize;
  }
  default:
    LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
    return TreeEntry::NeedToGather;
  }
}
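
namespace {
/// Helper that, for a bundle of PHI nodes led by \p Main, collects the
/// incoming values of every bundled PHI per incoming block, so the operand
/// lists can be built once and reused when creating the operand nodes.
} // end anonymous namespace (declaration context restored below)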
class PHIHandler {
  DominatorTree &DT;
  PHINode *Main = nullptr;
  SmallVector<Value *> Phis;
  SmallVector<SmallVector<Value *>> Operands;

public:
  PHIHandler() = delete;
  PHIHandler(DominatorTree &DT, PHINode *Main, ArrayRef<Value *> Phis)
      : DT(DT), Main(Main), Phis(Phis),
        Operands(Main->getNumIncomingValues(),
                 SmallVector<Value *>(Phis.size(), nullptr)) {}
  void buildOperands() {
    constexpr unsigned FastLimit = 4;
    if (Main->getNumIncomingValues() <= FastLimit) {
      for (unsigned I : seq<unsigned>(0, Main->getNumIncomingValues())) {
        BasicBlock *InBB = Main->getIncomingBlock(I);
        // Prepare the operand vector for this incoming block.
        for (auto [Idx, V] : enumerate(Phis)) {
          auto *P = dyn_cast<PHINode>(V);
          if (!P) {
            assert(isa<PoisonValue>(V) &&
                   "Expected isa instruction or poison value.");
            Operands[I][Idx] = V;
            continue;
          }
          if (P->getIncomingBlock(I) == InBB)
            Operands[I][Idx] = P->getIncomingValue(I);
          else
            Operands[I][Idx] = P->getIncomingValueForBlock(InBB);
        }
      }
      return;
    }
    SmallDenseMap<BasicBlock *, SmallVector<unsigned>, 4> Blocks;
    for (unsigned I : seq<unsigned>(Main->getNumIncomingValues())) {
      BasicBlock *InBB = Main->getIncomingBlock(I);
      Blocks.try_emplace(InBB).first->second.push_back(I);
    }
    for (auto [Idx, V] : enumerate(Phis)) {
      if (isa<PoisonValue>(V)) {
        for (unsigned I : seq<unsigned>(Main->getNumIncomingValues()))
          Operands[I][Idx] = V;
        continue;
      }
      auto *P = cast<PHINode>(V);
      for (unsigned I : seq<unsigned>(0, P->getNumIncomingValues())) {
        BasicBlock *InBB = P->getIncomingBlock(I);
        auto It = Blocks.find(InBB);
        if (It == Blocks.end())
          continue;
        Operands[It->second.front()][Idx] = P->getIncomingValue(I);
      }
    }
    for (const auto &P : Blocks) {
      if (P.getSecond().size() <= 1)
        continue;
      unsigned BasicI = P.getSecond().front();
      for (unsigned I : ArrayRef(P.getSecond()).drop_front()) {
        assert(all_of(enumerate(Operands[I]),
                      [&](const auto &Data) {
                        return !Data.value() ||
                               Data.value() == Operands[BasicI][Data.index()];
                      }) &&
               "Expected empty operands list.");
        Operands[I] = Operands[BasicI];
      }
    }
  }
  ArrayRef<Value *> getOperands(unsigned I) const { return Operands[I]; }
};
void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                            const EdgeInfo &UserTreeIdx,
                            unsigned InterleaveFactor) {
  assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");

  SmallVector<int> ReuseShuffleIndices;
  SmallVector<Value *> UniqueValues;
  SmallVector<Value *> NonUniqueValueVL;
  auto TryToFindDuplicates = [&](const InstructionsState &S,
                                 bool DoNotFail = false) {
    // Check that every instruction appears once in this bundle.
    SmallDenseMap<Value *, unsigned, 16> UniquePositions(VL.size());
    for (Value *V : VL) {
      if (isConstant(V)) {
        ReuseShuffleIndices.emplace_back(
            isa<PoisonValue>(V) ? PoisonMaskElem : UniqueValues.size());
        UniqueValues.emplace_back(V);
        continue;
      }
      auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
      ReuseShuffleIndices.emplace_back(Res.first->second);
      if (Res.second)
        UniqueValues.emplace_back(V);
    }
    size_t NumUniqueScalarValues = UniqueValues.size();
    bool IsFullVectors = hasFullVectorsOrPowerOf2(
        *TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues);
    if (NumUniqueScalarValues == VL.size() &&
        (VectorizeNonPowerOf2 || IsFullVectors)) {
      ReuseShuffleIndices.clear();
    } else {
      // FIXME: Reshuffling scalars is not supported yet for non-power-of-2
      // vectors.
      if ((UserTreeIdx.UserTE &&
           UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) ||
          !hasFullVectorsOrPowerOf2(*TTI, VL.front()->getType(), VL.size())) {
        LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported "
                             "for nodes with padding.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return false;
      }
      if (NumUniqueScalarValues <= 1 || !IsFullVectors ||
          (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) {
             return isa<UndefValue>(V) || !isConstant(V);
           }))) {
        if (DoNotFail && UniquePositions.size() > 1 &&
            NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() &&
            all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) {
          // Find the number of elements which forms full vectors.
          unsigned PWSz = getFullVectorNumberOfElements(
              *TTI, UniqueValues.front()->getType(), UniqueValues.size());
          if (PWSz == VL.size()) {
            ReuseShuffleIndices.clear();
          } else {
            NonUniqueValueVL.assign(UniqueValues.begin(), UniqueValues.end());
            NonUniqueValueVL.append(
                PWSz - UniqueValues.size(),
                PoisonValue::get(UniqueValues.front()->getType()));
            // Check that extended with poisons the operations are still valid
            // for vectorization (div/rem are not allowed).
            if (!getSameOpcode(NonUniqueValueVL, *TLI).valid()) {
              LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
              newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
              return false;
            }
            VL = NonUniqueValueVL;
          }
          return true;
        }
        LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return false;
      }
      VL = UniqueValues;
    }
    return true;
  };

  InstructionsState S = getSameOpcode(VL, *TLI);
  // Don't go into catchswitch blocks, which can happen with PHIs.
  // Such blocks can only have PHIs and the catchswitch. There is no
  // place to insert a shuffle if we need to, so just avoid that issue.
  if (S && isa<CatchSwitchInst>(S.getMainOp()->getParent()->getTerminator())) {
    LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }

  // Check if this is a duplicate of another entry.
  if (S) {
    if (TreeEntry *E = getTreeEntry(S.getMainOp())) {
      LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.getMainOp()
                        << ".\n");
      if (GatheredLoadsEntriesFirst.has_value() || !E->isSame(VL)) {
        auto It = MultiNodeScalars.find(S.getMainOp());
        if (It != MultiNodeScalars.end()) {
          auto *TEIt = find_if(It->getSecond(),
                               [&](TreeEntry *ME) { return ME->isSame(VL); });
          if (TEIt != It->getSecond().end())
            E = *TEIt;
          else
            E = nullptr;
        } else {
          E = nullptr;
        }
      }
      if (!E) {
        if (!doesNotNeedToBeScheduled(S.getMainOp())) {
          LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
          if (TryToFindDuplicates(S))
            newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                         ReuseShuffleIndices);
          return;
        }
        SmallPtrSet<const TreeEntry *, 4> Nodes;
        Nodes.insert(getTreeEntry(S.getMainOp()));
        for (const TreeEntry *E : MultiNodeScalars.lookup(S.getMainOp()))
          Nodes.insert(E);
        SmallPtrSet<Value *, 8> Values(VL.begin(), VL.end());
        if (any_of(Nodes, [&](const TreeEntry *E) {
              if (any_of(E->Scalars,
                         [&](Value *V) { return Values.contains(V); }))
                return true;
              SmallPtrSet<Value *, 8> EValues(E->Scalars.begin(),
                                              E->Scalars.end());
              return (
                  all_of(VL, [&](Value *V) { return EValues.contains(V); }));
            })) {
          LLVM_DEBUG(dbgs() << "SLP: Gathering due to full overlap.\n");
          if (TryToFindDuplicates(S))
            newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                         ReuseShuffleIndices);
          return;
        }
      } else {
        // Record the reuse of the tree node.
        E->UserTreeIndices.push_back(UserTreeIdx);
        LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.getMainOp()
                          << ".\n");
        return;
      }
    }
  }

  // Check that the depth of the tree is deep enough.
  if (Depth >= RecursionMaxDepth &&
      !(S && !S.isAltShuffle() && VL.size() >= 4 &&
        (match(S.getMainOp(), m_Load(m_Value())) ||
         all_of(VL, [&S](const Value *I) {
           return match(I,
                        m_OneUse(m_ZExtOrSExt(m_OneUse(m_Load(m_Value()))))) &&
                  cast<Instruction>(I)->getOpcode() == S.getOpcode();
         })))) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }

  // Don't handle scalable vectors.
  if (S && S.getOpcode() == Instruction::ExtractElement &&
      isa<ScalableVectorType>(
          cast<ExtractElementInst>(S.getMainOp())->getVectorOperandType())) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to scalable vector type.\n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }

  // Don't handle vectors.
  if (!SLPReVec && getValueType(VL.front())->isVectorTy()) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }
  // If all of the operands are identical or constant we have a simple solution.
  // If we deal with insert/extract instructions, they all must have constant
  // indices, otherwise we should gather them, not try to vectorize.
  // If alternate op node with 2 elements with gathered operands - do not
  // vectorize.
  auto &&NotProfitableForVectorization = [&S, this,
                                          Depth](ArrayRef<Value *> VL) {
    if (!S || !S.isAltShuffle() || VL.size() > 2)
      return false;
    if (VectorizableTree.size() < MinTreeSize)
      return false;
    if (Depth >= RecursionMaxDepth - 1)
      return true;
    // Check if all operands are extracts, part of vector node or can build a
    // regular vectorize node.
    SmallVector<unsigned, 8> InstsCount;
    for (Value *V : VL) {
      auto *I = cast<Instruction>(V);
      InstsCount.push_back(count_if(I->operand_values(), [](Value *Op) {
        return isa<Instruction>(Op) || isVectorLikeInstWithConstOps(Op);
      }));
    }
    bool IsCommutative =
        isCommutative(S.getMainOp()) || isCommutative(S.getAltOp());
    if ((IsCommutative &&
         std::accumulate(InstsCount.begin(), InstsCount.end(), 0) < 2) ||
        (!IsCommutative &&
         all_of(InstsCount, [](unsigned ICnt) { return ICnt < 2; })))
      return true;
    assert(VL.size() == 2 && "Expected only 2 alternate op instructions.");
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    auto *I1 = cast<Instruction>(VL.front());
    auto *I2 = cast<Instruction>(VL.back());
    for (int Op : seq<int>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    if (static_cast<unsigned>(count_if(
            Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
              return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
            })) >= S.getMainOp()->getNumOperands() / 2)
      return false;
    if (S.getMainOp()->getNumOperands() > 2)
      return true;
    if (IsCommutative) {
      // Check permuted operands.
      Candidates.clear();
      for (int Op = 0, E = S.getMainOp()->getNumOperands(); Op < E; ++Op)
        Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                               I2->getOperand((Op + 1) % E));
      if (any_of(
              Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
                return findBestRootPair(Cand, LookAheadHeuristics::ScoreSplat);
              }))
        return false;
    }
    return true;
  };
  SmallVector<unsigned> SortedIndices;
  BasicBlock *BB = nullptr;
  bool IsScatterVectorizeUserTE =
      UserTreeIdx.UserTE &&
      UserTreeIdx.UserTE->State == TreeEntry::ScatterVectorize;
  bool AreAllSameBlock = S && allSameBlock(VL);
  bool AreScatterAllGEPSameBlock =
      (IsScatterVectorizeUserTE && VL.front()->getType()->isPointerTy() &&
       VL.size() > 2 &&
       all_of(VL,
              [&BB](Value *V) {
                auto *I = dyn_cast<GetElementPtrInst>(V);
                if (!I)
                  return doesNotNeedToBeScheduled(V);
                if (!BB)
                  BB = I->getParent();
                return BB == I->getParent() && I->getNumOperands() == 2;
              }) &&
       BB &&
       sortPtrAccesses(VL, UserTreeIdx.UserTE->getMainOp()->getType(), *DL, *SE,
                       SortedIndices));
  bool AreAllSameInsts = AreAllSameBlock || AreScatterAllGEPSameBlock;
  if (!AreAllSameInsts || (!S && allConstant(VL)) || isSplat(VL) ||
      (S &&
       isa<InsertElementInst, ExtractValueInst, ExtractElementInst>(
           S.getMainOp()) &&
       !all_of(VL, isVectorLikeInstWithConstOps)) ||
      NotProfitableForVectorization(VL)) {
    LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n");
    if (TryToFindDuplicates(S))
      newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                   ReuseShuffleIndices);
    return;
  }

  // Don't vectorize ephemeral values.
  if (S && !EphValues.empty()) {
    for (Value *V : VL) {
      if (EphValues.count(V)) {
        LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                          << ") is ephemeral.\n");
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
        return;
      }
    }
  }

  // Check that none of the instructions in the bundle are already in the tree.
  for (Value *V : VL) {
    if ((!IsScatterVectorizeUserTE && !isa<Instruction>(V)) ||
        doesNotNeedToBeScheduled(V))
      continue;
    if (getTreeEntry(V)) {
      LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
                        << ") is already in tree.\n");
      if (TryToFindDuplicates(S))
        newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                     ReuseShuffleIndices);
      return;
    }
  }

  // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
  if (UserIgnoreList && !UserIgnoreList->empty()) {
    for (Value *V : VL) {
      if (UserIgnoreList->contains(V)) {
        LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
        if (TryToFindDuplicates(S))
          newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                       ReuseShuffleIndices);
        return;
      }
    }
  }

  // Special processing for sorted pointers for ScatterVectorize node with
  // constant indices only.
  if (!AreAllSameBlock && AreScatterAllGEPSameBlock) {
    assert(VL.front()->getType()->isPointerTy() &&
           count_if(VL, IsaPred<GetElementPtrInst>) >= 2 &&
           "Expected pointers only.");
    // Reset S to make it GetElementPtr kind of node.
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    assert(It != VL.end() && "Expected at least one GEP.");
    S = getSameOpcode(*It, *TLI);
  }

  // Check that all of the users of the scalars that we want to vectorize are
  // schedulable.
  Instruction *VL0 = S.getMainOp();
  BB = VL0->getParent();

  if (!DT->isReachableFromEntry(BB)) {
    // Don't go into unreachable blocks. They may contain instructions with
    // dependency cycles which confuse the final scheduling.
    LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx);
    return;
  }

  // Check that every instruction appears once in this bundle.
  if (!TryToFindDuplicates(S, /*DoNotFail=*/true))
    return;
  // Perform specific checks for each particular instruction kind.
  OrdersType CurrentOrder;
  SmallVector<Value *> PointerOps;
  TreeEntry::EntryState State = getScalarsVectorizationState(
      S, VL, IsScatterVectorizeUserTE, CurrentOrder, PointerOps);
  if (State == TreeEntry::NeedToGather) {
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                 ReuseShuffleIndices);
    return;
  }

  auto &BSRef = BlocksSchedules[BB];
  if (!BSRef)
    BSRef = std::make_unique<BlockScheduling>(BB);

  BlockScheduling &BS = *BSRef;

  std::optional<ScheduleData *> Bundle =
      BS.tryScheduleBundle(UniqueValues, this, S);
#ifdef EXPENSIVE_CHECKS
  // Make sure we didn't break any internal invariants.
  BS.verify();
#endif
  if (!Bundle) {
    LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
    assert((!BS.getScheduleData(VL0) ||
            !BS.getScheduleData(VL0)->isPartOfBundle()) &&
           "tryScheduleBundle should cancelScheduling on failure");
    newTreeEntry(VL, std::nullopt /*not vectorized*/, S, UserTreeIdx,
                 ReuseShuffleIndices);
    NonScheduledFirst.insert(VL.front());
    if (S.getOpcode() == Instruction::Load &&
        BS.ScheduleRegionSize < BS.ScheduleRegionSizeLimit)
      registerNonVectorizableLoads(VL);
    return;
  }
  LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");

  unsigned ShuffleOrOp =
      S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode();
  auto CreateOperandNodes = [&](TreeEntry *TE, const auto &Operands) {
    // Postpone PHI node creation.
    SmallVector<unsigned> PHIOps;
    for (unsigned I : seq<unsigned>(Operands.size())) {
      ArrayRef<Value *> Op = Operands[I];
      if (Op.empty())
        continue;
      InstructionsState S = getSameOpcode(Op, *TLI);
      if ((!S || S.getOpcode() != Instruction::PHI) || S.isAltShuffle())
        buildTree_rec(Op, Depth + 1, {TE, I});
      else
        PHIOps.push_back(I);
    }
    for (unsigned I : PHIOps)
      buildTree_rec(Operands[I], Depth + 1, {TE, I});
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    auto *PH = cast<PHINode>(VL0);

    TreeEntry *TE =
        newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (PHINode).\n";
               TE->dump());

    // Keeps the reordered operands to avoid code duplication.
    PHIHandler Handler(*DT, PH, VL);
    Handler.buildOperands();
    for (unsigned I : seq<unsigned>(PH->getNumOperands()))
      TE->setOperand(I, Handler.getOperands(I));
    SmallVector<ArrayRef<Value *>> Operands(PH->getNumOperands());
    for (unsigned I : seq<unsigned>(PH->getNumOperands()))
      Operands[I] = Handler.getOperands(I);
    CreateOperandNodes(TE, Operands);
    return;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    if (CurrentOrder.empty()) {
      LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
    } else {
      LLVM_DEBUG({
        dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
                  "with order";
        for (unsigned Idx : CurrentOrder)
          dbgs() << " " << Idx;
        dbgs() << "\n";
      });
    }
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry "
                         "(ExtractValueInst/ExtractElementInst).\n";
               TE->dump());
    // This is a special case, as it does not gather, but at the same time
    // we are not extending buildTree_rec() towards the operands.
    TE->setOperand(*this);
    return;
  }
  case Instruction::InsertElement: {
    assert(ReuseShuffleIndices.empty() && "All inserts should be unique");

    auto OrdCompare = [](const std::pair<int, int> &P1,
                         const std::pair<int, int> &P2) {
      return P1.first > P2.first;
    };
    PriorityQueue<std::pair<int, int>, SmallVector<std::pair<int, int>>,
                  decltype(OrdCompare)>
        Indices(OrdCompare);
    for (int I = 0, E = VL.size(); I < E; ++I) {
      unsigned Idx = *getElementIndex(VL[I]);
      Indices.emplace(Idx, I);
    }
    OrdersType CurrentOrder(VL.size(), VL.size());
    bool IsIdentity = true;
    for (int I = 0, E = VL.size(); I < E; ++I) {
      CurrentOrder[Indices.top().second] = I;
      IsIdentity &= Indices.top().second == I;
      Indices.pop();
    }
    if (IsIdentity)
      CurrentOrder.clear();
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 {}, CurrentOrder);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (InsertElementInst).\n";
               TE->dump());

    TE->setOperand(*this);
    buildTree_rec(TE->getOperand(1), Depth + 1, {TE, 1});
    return;
  }
  case Instruction::Load: {
    // Check that a vectorized load would load the same memory as a scalar
    // load. For example, we don't want to vectorize loads that are smaller
    // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
    // treats loading/storing it as an i8 struct. If we vectorize loads/stores
    // from such a struct, we read/write packed bits disagreeing with the
    // unvectorized version.
    TreeEntry *TE = nullptr;
    fixupOrderingIndices(CurrentOrder);
    switch (State) {
    case TreeEntry::Vectorize:
      TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                        ReuseShuffleIndices, CurrentOrder, InterleaveFactor);
      if (CurrentOrder.empty())
        LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (LoadInst).\n";
                   TE->dump());
      else
        LLVM_DEBUG(dbgs()
                       << "SLP: added a new TreeEntry (jumbled LoadInst).\n";
                   TE->dump());
      break;
    case TreeEntry::StridedVectorize:
      // Vectorizing non-consecutive loads with a strided load intrinsic.
      TE = newTreeEntry(VL, TreeEntry::StridedVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices, CurrentOrder);
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (strided LoadInst).\n";
                 TE->dump());
      break;
    case TreeEntry::ScatterVectorize:
      // Vectorizing non-consecutive loads with `llvm.masked.gather`.
      TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
                        UserTreeIdx, ReuseShuffleIndices);
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (non-consecutive LoadInst).\n";
          TE->dump());
      break;
    case TreeEntry::CombinedVectorize:
    case TreeEntry::NeedToGather:
      llvm_unreachable("Unexpected loads state.");
    }
    TE->setOperand(*this);
    if (State == TreeEntry::ScatterVectorize)
      buildTree_rec(PointerOps, Depth + 1, {TE, 0});
    return;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto [PrevMaxBW, PrevMinBW] = CastMaxMinBWSizes.value_or(
        std::make_pair(std::numeric_limits<unsigned>::min(),
                       std::numeric_limits<unsigned>::max()));
    if (ShuffleOrOp == Instruction::ZExt ||
        ShuffleOrOp == Instruction::SExt) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(DL->getTypeSizeInBits(VL0->getType()), PrevMaxBW),
          std::min<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMinBW));
    } else if (ShuffleOrOp == Instruction::Trunc) {
      CastMaxMinBWSizes = std::make_pair(
          std::max<unsigned>(
              DL->getTypeSizeInBits(VL0->getOperand(0)->getType()),
              PrevMaxBW),
          std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
                             PrevMinBW));
    }
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CastInst).\n";
               TE->dump());

    TE->setOperand(*this);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    if (ShuffleOrOp == Instruction::Trunc) {
      ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    } else if (ShuffleOrOp == Instruction::SIToFP ||
               ShuffleOrOp == Instruction::UIToFP) {
      unsigned NumSignBits =
          ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
      if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
        APInt Mask = DB->getDemandedBits(OpI);
        NumSignBits = std::max(NumSignBits, Mask.countl_zero());
      }
      if (NumSignBits * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
    }
    return;
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    // Check that all of the compares have the same predicate.
    CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CmpInst).\n";
               TE->dump());

    ValueList Left, Right;
    VLOperands Ops(VL, S, *this);
    if (cast<CmpInst>(VL0)->isCommutative()) {
      // Commutative predicate - collect + sort operands of the instructions
      // so that each side is more likely to have the same opcode.
      assert(P0 == CmpInst::getSwappedPredicate(P0) &&
             "Commutative Predicate mismatch");
      Ops.reorder();
      Left = Ops.getVL(0);
      Right = Ops.getVL(1);
    } else {
      // Collect operands - commute if it uses the swapped predicate.
      for (Value *V : VL) {
        if (isa<PoisonValue>(V)) {
          Left.push_back(PoisonValue::get(VL0->getOperand(0)->getType()));
          Right.push_back(PoisonValue::get(VL0->getOperand(1)->getType()));
          continue;
        }
        auto *Cmp = cast<CmpInst>(V);
        Value *LHS = Cmp->getOperand(0);
        Value *RHS = Cmp->getOperand(1);
        if (Cmp->getPredicate() != P0)
          std::swap(LHS, RHS);
        Left.push_back(LHS);
        Right.push_back(RHS);
      }
    }
    TE->setOperand(0, Left);
    TE->setOperand(1, Right);
    buildTree_rec(Left, Depth + 1, {TE, 0});
    buildTree_rec(Right, Depth + 1, {TE, 1});
    if (ShuffleOrOp == Instruction::ICmp) {
      unsigned NumSignBits0 =
          ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
      if (NumSignBits0 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
      unsigned NumSignBits1 =
          ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
      if (NumSignBits1 * 2 >=
          DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
        ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
    }
    return;
  }
  case Instruction::Select:
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::Freeze: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(
        dbgs() << "SLP: added a new TreeEntry "
                  "(SelectInst/UnaryOperator/BinaryOperator/FreezeInst).\n";
        TE->dump());

    TE->setOperand(*this, isa<BinaryOperator>(VL0) && isCommutative(VL0));
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  case Instruction::GetElementPtr: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (GetElementPtrInst).\n";
               TE->dump());
    SmallVector<ValueList, 2> Operands(2);
    // Prepare the operand vector for pointer operands.
    for (Value *V : VL) {
      auto *GEP = dyn_cast<GetElementPtrInst>(V);
      if (!GEP) {
        Operands.front().push_back(V);
        continue;
      }
      Operands.front().push_back(GEP->getPointerOperand());
    }
    // We need to cast all indices to the same type before vectorization to
    // avoid crash. This is required to be able to find correct matches
    // between different gather nodes and reuse the vectorized values rather
    // than trying to gather them again.
    int IndexIdx = 1;
    Type *VL0Ty = VL0->getOperand(IndexIdx)->getType();
    Type *Ty = all_of(VL,
                      [VL0Ty, IndexIdx](Value *V) {
                        auto *GEP = dyn_cast<GetElementPtrInst>(V);
                        if (!GEP)
                          return true;
                        return VL0Ty == GEP->getOperand(IndexIdx)->getType();
                      })
                   ? VL0Ty
                   : DL->getIndexType(cast<GetElementPtrInst>(VL0)
                                          ->getPointerOperandType()
                                          ->getScalarType());
    // Prepare the operand vector.
    for (Value *V : VL) {
      auto *I = dyn_cast<GetElementPtrInst>(V);
      if (!I) {
        Operands.back().push_back(
            ConstantInt::get(Ty, 0, /*isSigned=*/false));
        continue;
      }
      auto *Op = I->getOperand(IndexIdx);
      auto *CI = dyn_cast<ConstantInt>(Op);
      if (!CI)
        Operands.back().push_back(Op);
      else
        Operands.back().push_back(ConstantFoldIntegerCast(
            CI, Ty, CI->getValue().isSignBitSet(), *DL));
    }
    TE->setOperand(0, Operands.front());
    TE->setOperand(1, Operands.back());
    for (unsigned I = 0, Ops = Operands.size(); I < Ops; ++I)
      buildTree_rec(Operands[I], Depth + 1, {TE, I});
    return;
  }
  case Instruction::Store: {
    bool Consecutive = CurrentOrder.empty();
    if (!Consecutive)
      fixupOrderingIndices(CurrentOrder);
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices, CurrentOrder);
    if (Consecutive)
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (StoreInst).\n";
                 TE->dump());
    else
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (jumbled StoreInst).\n";
          TE->dump());
    TE->setOperand(*this);
    buildTree_rec(TE->getOperand(0), Depth + 1, {TE, 0});
    return;
  }
  case Instruction::Call: {
    // Check if the calls are all to the same vectorizable intrinsic or
    // library function.
    CallInst *CI = cast<CallInst>(VL0);
    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (CallInst).\n";
               TE->dump());
    TE->setOperand(*this, isCommutative(VL0));
    for (unsigned I : seq<unsigned>(CI->arg_size())) {
      // For scalar operands no need to create an entry since no need to
      // vectorize it.
      if (isVectorIntrinsicWithScalarOpAtArg(ID, I))
        continue;
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    }
    return;
  }
  case Instruction::ShuffleVector: {
    TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
                                 ReuseShuffleIndices);
    if (S.isAltShuffle()) {
      LLVM_DEBUG(dbgs() << "SLP: added a new TreeEntry (isAltShuffle).\n";
                 TE->dump());
    } else {
      assert(SLPReVec && "Only supported by REVEC.");
      LLVM_DEBUG(
          dbgs() << "SLP: added a new TreeEntry (ShuffleVectorInst).\n";
          TE->dump());
    }

    // Reorder operands if reordering would enable vectorization.
    auto *CI = dyn_cast<CmpInst>(VL0);
    if (CI && any_of(VL, [](Value *V) {
          return !isa<PoisonValue>(V) && !cast<CmpInst>(V)->isCommutative();
        })) {
      auto *MainCI = cast<CmpInst>(S.getMainOp());
      auto *AltCI = cast<CmpInst>(S.getAltOp());
      CmpInst::Predicate MainP = MainCI->getPredicate();
      CmpInst::Predicate AltP = AltCI->getPredicate();
      assert(MainP != AltP &&
             "Expected different main/alternate predicates.");
      ValueList Left, Right;
      // Collect operands - commute if it uses the swapped predicate or
      // alternate operation.
      for (Value *V : VL) {
        if (isa<PoisonValue>(V)) {
          Left.push_back(PoisonValue::get(MainCI->getOperand(0)->getType()));
          Right.push_back(PoisonValue::get(MainCI->getOperand(1)->getType()));
          continue;
        }
        auto *Cmp = cast<CmpInst>(V);
        Value *LHS = Cmp->getOperand(0);
        Value *RHS = Cmp->getOperand(1);
        if (isAlternateInstruction(Cmp, MainCI, AltCI, *TLI)) {
          if (AltP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
            std::swap(LHS, RHS);
        } else {
          if (MainP == CmpInst::getSwappedPredicate(Cmp->getPredicate()))
            std::swap(LHS, RHS);
        }
        Left.push_back(LHS);
        Right.push_back(RHS);
      }
      TE->setOperand(0, Left);
      TE->setOperand(1, Right);
      buildTree_rec(Left, Depth + 1, {TE, 0});
      buildTree_rec(Right, Depth + 1, {TE, 1});
      return;
    }

    TE->setOperand(*this, isa<BinaryOperator>(VL0) || CI);
    for (unsigned I : seq<unsigned>(VL0->getNumOperands()))
      buildTree_rec(TE->getOperand(I), Depth + 1, {TE, I});
    return;
  }
  default:
    break;
  }
  llvm_unreachable("Unexpected vectorization of the instructions.");
}
unsigned BoUpSLP::canMapToVector(Type *T) const {
  unsigned N = 1;
  Type *EltTy = T;

  while (isa<StructType, ArrayType, FixedVectorType>(EltTy)) {
    if (auto *ST = dyn_cast<StructType>(EltTy)) {
      // Check that struct is homogeneous.
      for (const auto *Ty : ST->elements())
        if (Ty != *ST->element_begin())
          return 0;
      N *= ST->getNumElements();
      EltTy = *ST->element_begin();
    } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
      N *= AT->getNumElements();
      EltTy = AT->getElementType();
    } else {
      auto *VT = cast<FixedVectorType>(EltTy);
      N *= VT->getNumElements();
      EltTy = VT->getElementType();
    }
  }

  if (!isValidElementType(EltTy))
    return 0;
  uint64_t VTSize = DL->getTypeStoreSizeInBits(getWidenedType(EltTy, N));
  if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize ||
      VTSize != DL->getTypeStoreSizeInBits(T))
    return 0;
  return N;
}
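
/// Checks whether the extract instructions in \p VL all read from a single
/// source vector at in-bounds indices, recording any required reordering in
/// \p CurrentOrder; returns true if the original order can be kept.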
bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL,
                              SmallVectorImpl<unsigned> &CurrentOrder,
                              bool ResizeAllowed) const {
  const auto *It = find_if(VL, IsaPred<ExtractElementInst, ExtractValueInst>);
  assert(It != VL.end() && "Expected at least one extract instruction.");
  auto *E0 = cast<Instruction>(*It);
  assert(
      all_of(VL, IsaPred<UndefValue, ExtractElementInst, ExtractValueInst>) &&
      "Invalid opcode");
  // Check if all of the extracts come from the same vector and from the
  // correct offset.
  Value *Vec = E0->getOperand(0);

  CurrentOrder.clear();

  // We have to extract from a vector/aggregate with the same number of
  // elements.
  unsigned NElts;
  if (E0->getOpcode() == Instruction::ExtractValue) {
    NElts = canMapToVector(Vec->getType());
    if (!NElts)
      return false;
    // Check if load can be rewritten as load of vector.
    LoadInst *LI = dyn_cast<LoadInst>(Vec);
    if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
      return false;
  } else {
    NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
  }

  unsigned E = VL.size();
  if (!ResizeAllowed && NElts != E)
    return false;
  SmallVector<int> Indices(E, PoisonMaskElem);
  unsigned MinIdx = NElts, MaxIdx = 0;
  for (auto [I, V] : enumerate(VL)) {
    auto *Inst = dyn_cast<Instruction>(V);
    if (!Inst)
      continue;
    if (Inst->getOperand(0) != Vec)
      return false;
    if (auto *EE = dyn_cast<ExtractElementInst>(Inst))
      if (isa<UndefValue>(EE->getIndexOperand()))
        continue;
    std::optional<unsigned> Idx = getExtractIndex(Inst);
    if (!Idx)
      return false;
    const unsigned ExtIdx = *Idx;
    if (ExtIdx >= NElts)
      continue;
    Indices[I] = ExtIdx;
    if (MinIdx > ExtIdx)
      MinIdx = ExtIdx;
    if (MaxIdx < ExtIdx)
      MaxIdx = ExtIdx;
  }
  if (MaxIdx - MinIdx + 1 > E)
    return false;
  if (MaxIdx + 1 <= E)
    MinIdx = 0;

  // Check that all of the indices extract from the correct offset.
  bool ShouldKeepOrder = true;
  // Assign to all items the initial value E so we can check if the extract
  // instruction index was used already. Later we also check that all the
  // indices are used and the access is consecutive, by checking that no
  // element of CurrentOrder still has value E.
  CurrentOrder.assign(E, E);
  for (unsigned I = 0; I < E; ++I) {
    if (Indices[I] == PoisonMaskElem)
      continue;
    const unsigned ExtIdx = Indices[I] - MinIdx;
    if (CurrentOrder[ExtIdx] != E) {
      CurrentOrder.clear();
      return false;
    }
    ShouldKeepOrder &= ExtIdx == I;
    CurrentOrder[ExtIdx] = I;
  }
  if (ShouldKeepOrder)
    CurrentOrder.clear();

  return ShouldKeepOrder;
}
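
/// Returns true if all users of \p I are either part of the tree, vector-like
/// instructions with constant operands, or extracts that are known gathers.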
bool BoUpSLP::areAllUsersVectorized(
    Instruction *I, const SmallDenseSet<Value *> *VectorizedVals) const {
  return (I->hasOneUse() &&
          (!VectorizedVals || VectorizedVals->contains(I))) ||
         all_of(I->users(), [this](User *U) {
           return ScalarToTreeEntry.contains(U) ||
                  isVectorLikeInstWithConstOps(U) ||
                  (isa<ExtractElementInst>(U) && MustGather.contains(U));
         });
}
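
/// Computes the cost of a vectorized call both as an intrinsic call and as a
/// vector-library call, so the caller can pick the cheaper lowering.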
static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
                   TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
                   ArrayRef<Type *> ArgTys) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

  // Calculate the cost of the scalar and vector calls.
  FastMathFlags FMF;
  if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
    FMF = FPCI->getFastMathFlags();
  IntrinsicCostAttributes CostAttrs(ID, VecTy, ArgTys, FMF,
                                    dyn_cast<IntrinsicInst>(CI));
  auto IntrinsicCost =
      TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);

  auto Shape = VFShape::get(CI->getFunctionType(),
                            ElementCount::getFixed(VecTy->getNumElements()),
                            false /*HasGlobalPred*/);
  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
  auto LibCost = IntrinsicCost;
  if (!CI->isNoBuiltin() && VecFunc) {
    // Calculate the cost of the vector library call.
    LibCost = TTI->getCallInstrCost(nullptr, VecTy, ArgTys,
                                    TTI::TCK_RecipThroughput);
  }
  return {IntrinsicCost, LibCost};
}
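
/// Builds the shuffle mask that blends the main and alternate opcodes of an
/// alternate-shuffle node, optionally collecting the scalars on each side.
/// E.g. (illustrative) for scalars <add, sub, add, sub> with Sz == 4 and
/// IsAltOp matching sub, the resulting mask is <0, 5, 2, 7>.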
void BoUpSLP::TreeEntry::buildAltOpShuffleMask(
    const function_ref<bool(Instruction *)> IsAltOp,
    SmallVectorImpl<int> &Mask, SmallVectorImpl<Value *> *OpScalars,
    SmallVectorImpl<Value *> *AltScalars) const {
  unsigned Sz = Scalars.size();
  Mask.assign(Sz, PoisonMaskElem);
  SmallVector<int> OrderMask;
  if (!ReorderIndices.empty())
    inversePermutation(ReorderIndices, OrderMask);
  for (unsigned I = 0; I < Sz; ++I) {
    unsigned Idx = I;
    if (!ReorderIndices.empty())
      Idx = OrderMask[I];
    if (isa<PoisonValue>(Scalars[Idx]))
      continue;
    auto *OpInst = cast<Instruction>(Scalars[Idx]);
    if (IsAltOp(OpInst)) {
      Mask[I] = Sz + Idx;
      if (AltScalars)
        AltScalars->push_back(OpInst);
    } else {
      Mask[I] = Idx;
      if (OpScalars)
        OpScalars->push_back(OpInst);
    }
  }
  if (!ReuseShuffleIndices.empty()) {
    SmallVector<int> NewMask(ReuseShuffleIndices.size(), PoisonMaskElem);
    transform(ReuseShuffleIndices, NewMask.begin(), [&Mask](int Idx) {
      return Idx != PoisonMaskElem ? Mask[Idx] : PoisonMaskElem;
    });
    Mask.swap(NewMask);
  }
}
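
/// Returns true if \p I should be treated as the alternate instruction of the
/// main/alt pair; compares are matched via their (possibly swapped)
/// predicates.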
static bool isAlternateInstruction(const Instruction *I,
                                   const Instruction *MainOp,
                                   const Instruction *AltOp,
                                   const TargetLibraryInfo &TLI) {
  if (auto *MainCI = dyn_cast<CmpInst>(MainOp)) {
    auto *AltCI = cast<CmpInst>(AltOp);
    CmpInst::Predicate MainP = MainCI->getPredicate();
    [[maybe_unused]] CmpInst::Predicate AltP = AltCI->getPredicate();
    assert(MainP != AltP && "Expected different main/alternate predicates.");
    auto *CI = cast<CmpInst>(I);
    if (isCmpSameOrSwapped(MainCI, CI, TLI))
      return false;
    if (isCmpSameOrSwapped(AltCI, CI, TLI))
      return true;
    CmpInst::Predicate P = CI->getPredicate();
    CmpInst::Predicate SwappedP = CmpInst::getSwappedPredicate(P);

    assert((MainP == P || AltP == P || MainP == SwappedP ||
            AltP == SwappedP) &&
           "CmpInst expected to match either main or alternate predicate or "
           "their swap.");
    return MainP != P && MainP != SwappedP;
  }
  return I->getOpcode() == AltOp->getOpcode();
}
TTI::OperandValueInfo BoUpSLP::getOperandInfo(ArrayRef<Value *> Ops) {
  assert(!Ops.empty());
  const auto *Op0 = Ops.front();

  const bool IsConstant = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here.
    return isConstant(V) && !isa<UndefValue>(V);
  });
  const bool IsUniform = all_of(Ops, [=](Value *V) {
    // TODO: We should allow undef elements here.
    return V == Op0;
  });
  const bool IsPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here.
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isPowerOf2();
    return false;
  });
  const bool IsNegatedPowerOfTwo = all_of(Ops, [](Value *V) {
    // TODO: We should allow undef elements here.
    if (auto *CI = dyn_cast<ConstantInt>(V))
      return CI->getValue().isNegatedPowerOf2();
    return false;
  });

  TTI::OperandValueKind VK = TTI::OK_AnyValue;
  if (IsConstant && IsUniform)
    VK = TTI::OK_UniformConstantValue;
  else if (IsConstant)
    VK = TTI::OK_NonUniformConstantValue;
  else if (IsUniform)
    VK = TTI::OK_UniformValue;

  TTI::OperandValueProperties VP = TTI::OP_None;
  VP = IsPowerOfTwo ? TTI::OP_PowerOf2 : VP;
  VP = IsNegatedPowerOfTwo ? TTI::OP_NegatedPowerOf2 : VP;

  return {VK, VP};
}
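
/// Base class shared by the shuffle-instruction emitter and the shuffle cost
/// estimator; it holds the mask-simplification logic common to both.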
class BaseShuffleAnalysis {
protected:
  Type *ScalarTy = nullptr;

  BaseShuffleAnalysis(Type *ScalarTy) : ScalarTy(ScalarTy) {}

  /// V is expected to be a vectorized value. When REVEC is disabled, there is
  /// no difference between VF and VNumElements; when it is enabled, VF is
  /// VNumElements / ScalarTyNumElements (e.g. if ScalarTy is <4 x Ty> and V is
  /// <8 x Ty>, 2 is returned instead of 8).
  unsigned getVF(Value *V) const {
    assert(V && "V cannot be nullptr");
    assert(isa<FixedVectorType>(V->getType()) &&
           "V does not have FixedVectorType");
    assert(ScalarTy && "ScalarTy cannot be nullptr");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    unsigned VNumElements =
        cast<FixedVectorType>(V->getType())->getNumElements();
    assert(VNumElements > ScalarTyNumElements &&
           "the number of elements of V is not large enough");
    assert(VNumElements % ScalarTyNumElements == 0 &&
           "the number of elements of V is not a vectorized value");
    return VNumElements / ScalarTyNumElements;
  }

  /// Checks if the mask is an identity mask.
  /// \param IsStrict if is true the function returns false if mask size does
  /// not match vector size.
  static bool isIdentityMask(ArrayRef<int> Mask, const FixedVectorType *VecTy,
                             bool IsStrict) {
    int Limit = Mask.size();
    int VF = VecTy->getNumElements();
    int Index = -1;
    if (VF == Limit && ShuffleVectorInst::isIdentityMask(Mask, Limit))
      return true;
    if (!IsStrict) {
      // Consider extract subvector starting from index 0.
      if (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
          Index == 0)
        return true;
      // All VF-size submasks are identity (e.g.
      // <poison,poison,poison,poison,0,1,2,poison> for VF 4).
      if (Limit % VF == 0 && all_of(seq<int>(0, Limit / VF), [=](int Idx) {
            ArrayRef<int> Slice = Mask.slice(Idx * VF, VF);
            return all_of(Slice, [](int I) { return I == PoisonMaskElem; }) ||
                   ShuffleVectorInst::isIdentityMask(Slice, VF);
          }))
        return true;
    }
    return false;
  }

  /// Tries to combine 2 different masks into a single one.
  /// \param LocalVF Vector length of the permuted input vector. \p Mask may
  /// change the size of the vector, \p LocalVF is the original size of the
  /// shuffled vector.
  static void combineMasks(unsigned LocalVF, SmallVectorImpl<int> &Mask,
                           ArrayRef<int> ExtMask) {
    unsigned VF = Mask.size();
    SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
    for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
      if (ExtMask[I] == PoisonMaskElem)
        continue;
      int MaskedIdx = Mask[ExtMask[I] % VF];
      NewMask[I] =
          MaskedIdx == PoisonMaskElem ? PoisonMaskElem : MaskedIdx % LocalVF;
    }
    Mask.swap(NewMask);
  }
  static bool peekThroughShuffles(Value *&V, SmallVectorImpl<int> &Mask,
                                  bool SinglePermute) {
    Value *Op = V;
    ShuffleVectorInst *IdentityOp = nullptr;
    SmallVector<int> IdentityMask;
    while (auto *SV = dyn_cast<ShuffleVectorInst>(Op)) {
      // Exit if not a fixed vector type or changing-size shuffle.
      auto *SVTy = dyn_cast<FixedVectorType>(SV->getType());
      if (!SVTy)
        break;
      // Remember the identity or broadcast mask, if it is not a resizing
      // shuffle. If no better candidates are found, this Op and Mask will be
      // used in the final shuffle.
      if (isIdentityMask(Mask, SVTy, /*IsStrict=*/false)) {
        if (!IdentityOp || !SinglePermute ||
            (isIdentityMask(Mask, SVTy, /*IsStrict=*/true) &&
             !ShuffleVectorInst::isZeroEltSplatMask(IdentityMask,
                                                    IdentityMask.size()))) {
          IdentityOp = SV;
          // Store the current mask so the info is not lost if IdentityOp is
          // selected as the best candidate for the permutation.
          IdentityMask.assign(Mask);
        }
      }
      // Remember the broadcast mask.
      if (SV->isZeroEltSplat()) {
        IdentityOp = SV;
        IdentityMask.assign(Mask);
      }
      int LocalVF = Mask.size();
      if (auto *SVOpTy =
              dyn_cast<FixedVectorType>(SV->getOperand(0)->getType()))
        LocalVF = SVOpTy->getNumElements();
      SmallVector<int> ExtMask(Mask.size(), PoisonMaskElem);
      for (auto [Idx, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem ||
            static_cast<unsigned>(I) >= SV->getShuffleMask().size())
          continue;
        ExtMask[Idx] = SV->getMaskValue(I);
      }
      bool IsOp1Undef = isUndefVector</*isPoisonOnly=*/true>(
                            SV->getOperand(0),
                            buildUseMask(LocalVF, ExtMask, UseMask::FirstArg))
                            .all();
      bool IsOp2Undef = isUndefVector</*isPoisonOnly=*/true>(
                            SV->getOperand(1),
                            buildUseMask(LocalVF, ExtMask, UseMask::SecondArg))
                            .all();
      if (!IsOp1Undef && !IsOp2Undef) {
        // Update mask and mark undef elems.
        for (int &I : Mask) {
          if (I == PoisonMaskElem)
            continue;
          if (SV->getMaskValue(I % SV->getShuffleMask().size()) ==
              PoisonMaskElem)
            I = PoisonMaskElem;
        }
        break;
      }
      SmallVector<int> ShuffleMask(SV->getShuffleMask());
      combineMasks(LocalVF, ShuffleMask, Mask);
      Mask.swap(ShuffleMask);
      if (IsOp2Undef)
        Op = SV->getOperand(0);
      else
        Op = SV->getOperand(1);
    }
    if (auto *OpTy = dyn_cast<FixedVectorType>(Op->getType());
        !OpTy || !isIdentityMask(Mask, OpTy, SinglePermute) ||
        ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())) {
      if (IdentityOp) {
        V = IdentityOp;
        assert(Mask.size() == IdentityMask.size() &&
               "Expected masks of same sizes.");
        // Clear known poison elements.
        for (auto [I, Idx] : enumerate(Mask))
          if (Idx == PoisonMaskElem)
            IdentityMask[I] = PoisonMaskElem;
        Mask.swap(IdentityMask);
        auto *Shuffle = dyn_cast<ShuffleVectorInst>(V);
        return SinglePermute &&
               (isIdentityMask(Mask, cast<FixedVectorType>(V->getType()),
                               /*IsStrict=*/true) ||
                (Shuffle && Mask.size() == Shuffle->getShuffleMask().size() &&
                 Shuffle->isZeroEltSplat() &&
                 ShuffleVectorInst::isZeroEltSplatMask(Mask, Mask.size())));
      }
      V = Op;
      return false;
    }
    V = Op;
    return true;
  }
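
  /// Smart shuffle instruction emission: walks through both input vectors'
  /// shuffle chains, trying to reduce the final number of emitted shuffles,
  /// then hands the simplified operands and mask to \p Builder.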
  template <typename T, typename ShuffleBuilderTy>
  static T createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask,
                         ShuffleBuilderTy &Builder) {
    assert(V1 && "Expected at least one vector value.");
    if (V2)
      Builder.resizeToMatch(V1, V2);
    int VF = Mask.size();
    if (auto *FTy = dyn_cast<FixedVectorType>(V1->getType()))
      VF = FTy->getNumElements();
    if (V2 && !isUndefVector</*IsPoisonOnly=*/true>(
                   V2, buildUseMask(VF, Mask, UseMask::SecondArg))
                   .all()) {
      // Peek through shuffles.
      Value *Op1 = V1;
      Value *Op2 = V2;
      int VF =
          cast<VectorType>(V1->getType())->getElementCount().getKnownMinValue();
      SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
      SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (Mask[I] < VF)
          CombinedMask1[I] = Mask[I];
        else
          CombinedMask2[I] = Mask[I] - VF;
      }
      Value *PrevOp1;
      Value *PrevOp2;
      do {
        PrevOp1 = Op1;
        PrevOp2 = Op2;
        (void)peekThroughShuffles(Op1, CombinedMask1, /*SinglePermute=*/false);
        (void)peekThroughShuffles(Op2, CombinedMask2, /*SinglePermute=*/false);
        // Check if we have 2 resizing shuffles - need to peek through operands
        // again.
        if (auto *SV1 = dyn_cast<ShuffleVectorInst>(Op1))
          if (auto *SV2 = dyn_cast<ShuffleVectorInst>(Op2)) {
            SmallVector<int> ExtMask1(Mask.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask1)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask1[Idx] = SV1->getMaskValue(I);
            }
            SmallBitVector UseMask1 = buildUseMask(
                cast<FixedVectorType>(SV1->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask1, UseMask::SecondArg);
            SmallVector<int> ExtMask2(CombinedMask2.size(), PoisonMaskElem);
            for (auto [Idx, I] : enumerate(CombinedMask2)) {
              if (I == PoisonMaskElem)
                continue;
              ExtMask2[Idx] = SV2->getMaskValue(I);
            }
            SmallBitVector UseMask2 = buildUseMask(
                cast<FixedVectorType>(SV2->getOperand(1)->getType())
                    ->getNumElements(),
                ExtMask2, UseMask::SecondArg);
            if (SV1->getOperand(0)->getType() ==
                    SV2->getOperand(0)->getType() &&
                SV1->getOperand(0)->getType() != SV1->getType() &&
                isUndefVector(SV1->getOperand(1), UseMask1).all() &&
                isUndefVector(SV2->getOperand(1), UseMask2).all()) {
              Op1 = SV1->getOperand(0);
              Op2 = SV2->getOperand(0);
              SmallVector<int> ShuffleMask1(SV1->getShuffleMask());
              int LocalVF = ShuffleMask1.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op1->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask1, CombinedMask1);
              CombinedMask1.swap(ShuffleMask1);
              SmallVector<int> ShuffleMask2(SV2->getShuffleMask());
              LocalVF = ShuffleMask2.size();
              if (auto *FTy = dyn_cast<FixedVectorType>(Op2->getType()))
                LocalVF = FTy->getNumElements();
              combineMasks(LocalVF, ShuffleMask2, CombinedMask2);
              CombinedMask2.swap(ShuffleMask2);
            }
          }
      } while (PrevOp1 != Op1 || PrevOp2 != Op2);
      Builder.resizeToMatch(Op1, Op2);
      VF = std::max(cast<VectorType>(Op1->getType())
                        ->getElementCount()
                        .getKnownMinValue(),
                    cast<VectorType>(Op2->getType())
                        ->getElementCount()
                        .getKnownMinValue());
      for (int I = 0, E = Mask.size(); I < E; ++I) {
        if (CombinedMask2[I] != PoisonMaskElem) {
          assert(CombinedMask1[I] == PoisonMaskElem &&
                 "Expected undefined mask element");
          CombinedMask1[I] = CombinedMask2[I] + (Op1 == Op2 ? 0 : VF);
        }
      }
      if (Op1 == Op2 &&
          (ShuffleVectorInst::isIdentityMask(CombinedMask1, VF) ||
           (ShuffleVectorInst::isZeroEltSplatMask(CombinedMask1, VF) &&
            isa<ShuffleVectorInst>(Op1) &&
            cast<ShuffleVectorInst>(Op1)->getShuffleMask() ==
                ArrayRef(CombinedMask1))))
        return Builder.createIdentity(Op1);
      return Builder.createShuffleVector(
          Op1, Op1 == Op2 ? PoisonValue::get(Op1->getType()) : Op2,
          CombinedMask1);
    }
    if (isa<PoisonValue>(V1))
      return Builder.createPoison(
          cast<VectorType>(V1->getType())->getElementType(), Mask.size());
    SmallVector<int> NewMask(Mask);
    bool IsIdentity = peekThroughShuffles(V1, NewMask, /*SinglePermute=*/true);
    assert(V1 && "Expected non-null value after looking through shuffles.");

    if (!IsIdentity)
      return Builder.createShuffleVector(V1, NewMask);
    return Builder.createIdentity(V1);
  }
};
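
/// Calculates the scalar and the vector costs of vectorizing a set of GEPs,
/// either as pointer arguments of wide loads/stores or of a masked gather.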
static std::pair<InstructionCost, InstructionCost>
getGEPCosts(const TargetTransformInfo &TTI, ArrayRef<Value *> Ptrs,
            Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind,
            Type *ScalarTy, VectorType *VecTy) {
  InstructionCost ScalarCost = 0;
  InstructionCost VecCost = 0;
  if (Opcode == Instruction::Load || Opcode == Instruction::Store) {
    // Ptrs feed a plain wide load/store: the scalar cost is a set of
    // pointers with a known relationship; for the vector code only BasePtr
    // and any pointers with outside uses remain.
    ScalarCost = TTI.getPointersChainCost(
        Ptrs, BasePtr, TTI::PointersChainInfo::getUnitStride(), ScalarTy,
        CostKind);

    SmallVector<const Value *> PtrsRetainedInVecCode;
    for (Value *V : Ptrs) {
      if (V == BasePtr) {
        PtrsRetainedInVecCode.push_back(V);
        continue;
      }
      auto *Ptr = dyn_cast<GetElementPtrInst>(V);
      // For simplicity, assume Ptr stays in vectorized code if it's not a
      // GEP instruction or has more than one use.
      if (!Ptr || !Ptr->hasOneUse())
        PtrsRetainedInVecCode.push_back(V);
    }

    if (PtrsRetainedInVecCode.size() == Ptrs.size()) {
      // If all pointers stay in vectorized code there are no savings.
      return std::make_pair(TTI::TCC_Free, TTI::TCC_Free);
    }
    VecCost = TTI.getPointersChainCost(
        PtrsRetainedInVecCode, BasePtr,
        TTI::PointersChainInfo::getKnownStride(), VecTy, CostKind);
  } else {
    // Ptrs are the arguments of loads to be turned into a masked gather;
    // all the scalar GEPs will be removed as a result of vectorization.
    TTI::PointersChainInfo PtrsInfo =
        all_of(Ptrs,
               [](const Value *V) {
                 auto *Ptr = dyn_cast<GetElementPtrInst>(V);
                 return Ptr && !Ptr->hasAllConstantIndices();
               })
            ? TTI::PointersChainInfo::getUnknownStride()
            : TTI::PointersChainInfo::getKnownStride();

    ScalarCost =
        TTI.getPointersChainCost(Ptrs, BasePtr, PtrsInfo, ScalarTy, CostKind);
    auto *BaseGEP = dyn_cast<GEPOperator>(BasePtr);
    if (!BaseGEP) {
      auto *It = find_if(Ptrs, IsaPred<GEPOperator>);
      if (It != Ptrs.end())
        BaseGEP = cast<GEPOperator>(*It);
    }
    if (BaseGEP) {
      SmallVector<const Value *> Indices(BaseGEP->indices());
      VecCost = TTI.getGEPCost(BaseGEP->getSourceElementType(),
                               BaseGEP->getPointerOperand(), Indices, VecTy,
                               CostKind);
    }
  }

  return std::make_pair(ScalarCost, VecCost);
}
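
/// Clusters the scalars of a gather node by instruction "keys" and, if the
/// resulting reordering plus subvector inserts is estimated cheaper than a
/// plain buildvector, applies the reorder to the node.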
void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
  assert(TE.isGather() && TE.ReorderIndices.empty() &&
         "Expected gather node without reordering.");
  DenseMap<std::pair<size_t, Value *>, SmallVector<LoadInst *>> LoadsMap;
  SmallSet<size_t, 2> LoadKeyUsed;

  // Do not reorder nodes if it is small (just 2 elements), all-constant or all
  // instructions have the same opcode already.
  if (TE.Scalars.size() == 2 || (TE.hasState() && !TE.isAltShuffle()) ||
      all_of(TE.Scalars, isConstant))
    return;

  if (any_of(seq<unsigned>(TE.Idx), [&](unsigned Idx) {
        return VectorizableTree[Idx]->isSame(TE.Scalars);
      }))
    return;

  auto GenerateLoadsSubkey = [&](size_t Key, LoadInst *LI) {
    Key = hash_combine(hash_value(LI->getParent()), Key);
    Value *Ptr =
        getUnderlyingObject(LI->getPointerOperand(), RecursionMaxDepth);
    if (LoadKeyUsed.contains(Key)) {
      auto LIt = LoadsMap.find(std::make_pair(Key, Ptr));
      if (LIt != LoadsMap.end()) {
        for (LoadInst *RLI : LIt->second) {
          if (getPointersDiff(RLI->getType(), RLI->getPointerOperand(),
                              LI->getType(), LI->getPointerOperand(), *DL, *SE,
                              /*StrictCheck=*/true))
            return hash_value(RLI->getPointerOperand());
        }
        for (LoadInst *RLI : LIt->second) {
          if (arePointersCompatible(RLI->getPointerOperand(),
                                    LI->getPointerOperand(), *TLI))
            return hash_value(RLI->getPointerOperand());
        }
        if (LIt->second.size() > 2)
          return hash_value(LIt->second.back()->getPointerOperand());
      }
    }
    LoadKeyUsed.insert(Key);
    LoadsMap.try_emplace(std::make_pair(Key, Ptr)).first->second.push_back(LI);
    return hash_value(LI->getPointerOperand());
  };
  MapVector<size_t, MapVector<size_t, SmallVector<Value *>>> SortedValues;
  SmallDenseMap<Value *, SmallVector<unsigned>, 8> KeyToIndex;
  bool IsOrdered = true;
  unsigned NumInstructions = 0;
  // Try to "cluster" scalar instructions, to be able to build extra vectorized
  // nodes.
  for (auto [I, V] : enumerate(TE.Scalars)) {
    size_t Key = 1, Idx = 1;
    if (auto *Inst = dyn_cast<Instruction>(V);
        Inst && !isa<ExtractElementInst, LoadInst, CastInst>(V) &&
        !isDeleted(Inst) && !isVectorized(V)) {
      std::tie(Key, Idx) = generateKeySubkey(V, TLI, GenerateLoadsSubkey,
                                             /*AllowAlternate=*/false);
      ++NumInstructions;
    }
    auto &Container = SortedValues[Key];
    if (IsOrdered && !KeyToIndex.contains(V) &&
        !(isa<Constant, ExtractElementInst>(V) ||
          isVectorLikeInstWithConstOps(V)) &&
        ((Container.contains(Idx) &&
          KeyToIndex.at(Container[Idx].back()).back() != I - 1) ||
         (!Container.empty() && !Container.contains(Idx) &&
          KeyToIndex.at(Container.back().second.back()).back() != I - 1)))
      IsOrdered = false;
    auto &KTI = KeyToIndex[V];
    if (KTI.empty())
      Container[Idx].push_back(V);
    KTI.push_back(I);
  }
  SmallVector<std::pair<unsigned, unsigned>> SubVectors;
  APInt DemandedElts = APInt::getAllOnes(TE.Scalars.size());
  if (!IsOrdered && NumInstructions > 1) {
    unsigned Cnt = 0;
    TE.ReorderIndices.resize(TE.Scalars.size(), TE.Scalars.size());
    for (const auto &D : SortedValues) {
      for (const auto &P : D.second) {
        unsigned Sz = 0;
        for (Value *V : P.second) {
          ArrayRef<unsigned> Indices = KeyToIndex.at(V);
          for (auto [K, Idx] : enumerate(Indices)) {
            TE.ReorderIndices[Cnt + K] = Idx;
            TE.Scalars[Cnt + K] = V;
          }
          Sz += Indices.size();
          Cnt += Indices.size();
        }
        if (Sz > 1 && isa<Instruction>(P.second.front())) {
          const unsigned SubVF = getFloorFullVectorNumberOfElements(
              *TTI, TE.Scalars.front()->getType(), Sz);
          SubVectors.emplace_back(Cnt - Sz, SubVF);
          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt - Sz + SubVF))
            DemandedElts.clearBit(I);
        } else if (!P.second.empty() && isConstant(P.second.front())) {
          for (unsigned I : seq<unsigned>(Cnt - Sz, Cnt))
            DemandedElts.clearBit(I);
        }
      }
    }
  }
  // Reuses always require shuffles, so consider this as profitable.
  if (!TE.ReuseShuffleIndices.empty() || TE.ReorderIndices.empty())
    return;
  // Do a simple cost estimation.
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  InstructionCost Cost = 0;
  auto *ScalarTy = TE.Scalars.front()->getType();
  auto *VecTy = getWidenedType(ScalarTy, TE.Scalars.size());
  for (auto [Idx, Sz] : SubVectors) {
    Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {},
                             CostKind, Idx, getWidenedType(ScalarTy, Sz));
  }
  if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
    assert(SLPReVec && "Only supported by REVEC.");
    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
    for (unsigned I : seq<unsigned>(TE.Scalars.size()))
      if (DemandedElts[I])
        Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, {},
                                    CostKind, I * ScalarTyNumElements, FTy);
  } else {
    Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
                                          /*Extract=*/false, CostKind);
  }
  int Sz = TE.Scalars.size();
  SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                               TE.ReorderIndices.end());
  for (unsigned I : seq<unsigned>(Sz)) {
    Value *V = TE.getOrdered(I);
    if (isa<PoisonValue>(V))
      ReorderMask[I] = PoisonMaskElem;
    else if (isConstant(V) || DemandedElts[I])
      ReorderMask[I] = I + TE.ReorderIndices.size();
  }
  Cost += ::getShuffleCost(*TTI,
                           any_of(ReorderMask, [&](int I) { return I >= Sz; })
                               ? TTI::SK_PermuteTwoSrc
                               : TTI::SK_PermuteSingleSrc,
                           VecTy, ReorderMask);
  DemandedElts = APInt::getAllOnes(VecTy->getNumElements());
  std::fill(ReorderMask.begin(), ReorderMask.end(), PoisonMaskElem);
  for (unsigned I : seq<unsigned>(Sz)) {
    Value *V = TE.getOrdered(I);
    if (isConstant(V)) {
      DemandedElts.clearBit(I);
      if (!isa<PoisonValue>(V))
        ReorderMask[I] = I;
    } else {
      ReorderMask[I] = I + Sz;
    }
  }
  InstructionCost BVCost = TTI->getScalarizationOverhead(
      VecTy, DemandedElts, /*Insert=*/true, /*Extract=*/false, CostKind);
  if (!DemandedElts.isAllOnes())
    BVCost += ::getShuffleCost(*TTI, TTI::SK_PermuteTwoSrc, VecTy, ReorderMask);
  if (Cost >= BVCost) {
    SmallVector<int> Mask(TE.ReorderIndices.begin(), TE.ReorderIndices.end());
    reorderScalars(TE.Scalars, Mask);
    TE.ReorderIndices.clear();
  }
}
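
/// Post-processes the built tree: reorders gather nodes, splits wide gathers
/// into vectorizable sub-nodes, converts loads/stores to strided or
/// interleaved forms and selects to combined min/max nodes where profitable,
/// and finally collects gathered loads for late vectorization.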
void BoUpSLP::transformNodes() {
  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  BaseGraphSize = VectorizableTree.size();
  // Turn graph transforming mode on and off, when done.
  class GraphTransformModeRAAI {
    bool &SavedIsGraphTransformMode;

  public:
    GraphTransformModeRAAI(bool &IsGraphTransformMode)
        : SavedIsGraphTransformMode(IsGraphTransformMode) {
      IsGraphTransformMode = true;
    }
    ~GraphTransformModeRAAI() { SavedIsGraphTransformMode = false; }
  } TransformContext(IsGraphTransformMode);
  // Operands are profitable if they are: (1) at least one constant, or
  // (2) splats, or (3) results in a good vectorization opportunity.
  auto CheckOperandsProfitability = [this](Instruction *I1, Instruction *I2,
                                           const InstructionsState &S) {
    SmallVector<SmallVector<std::pair<Value *, Value *>>> Candidates;
    for (unsigned Op : seq<unsigned>(S.getMainOp()->getNumOperands()))
      Candidates.emplace_back().emplace_back(I1->getOperand(Op),
                                             I2->getOperand(Op));
    return all_of(
        Candidates, [this](ArrayRef<std::pair<Value *, Value *>> Cand) {
          return all_of(Cand,
                        [](const std::pair<Value *, Value *> &P) {
                          return isa<Constant>(P.first) ||
                                 isa<Constant>(P.second) ||
                                 P.first == P.second;
                        }) ||
                 findBestRootPair(Cand, LookAheadHeuristics::ScoreSplatLoads);
        });
  };

  // Try to reorder gather nodes for better vectorization opportunities.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather())
      reorderGatherNode(E);
  }

  // The tree may grow here, so iterate only up to the initial size.
  for (unsigned Idx : seq<unsigned>(BaseGraphSize)) {
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.isGather()) {
      ArrayRef<Value *> VL = E.Scalars;
      const unsigned Sz = getVectorElementSize(VL.front());
      unsigned MinVF = getMinVF(2 * Sz);
      // Do not try partial vectorization for small nodes (<= 2), nodes with
      // the same opcode and same parent block, or all constants.
      if (VL.size() <= 2 || LoadEntriesToVectorize.contains(Idx) ||
          !(!E.hasState() || E.getOpcode() == Instruction::Load ||
            E.isAltShuffle() || !allSameBlock(VL)) ||
          allConstant(VL) || isSplat(VL))
        continue;
      // Try to find vectorizable sequences and transform them into a series
      // of insertvector instructions.
      unsigned StartIdx = 0;
      unsigned End = VL.size();
      for (unsigned VF = getFloorFullVectorNumberOfElements(
               *TTI, VL.front()->getType(), VL.size() - 1);
           VF >= MinVF; VF = getFloorFullVectorNumberOfElements(
                            *TTI, VL.front()->getType(), VF - 1)) {
        if (StartIdx + VF > End)
          continue;
        SmallVector<std::pair<unsigned, unsigned>> Slices;
        for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, VF);
          // If any instruction is vectorized already - do not try again.
          // Reuse the existing node, if it fully matches the slice.
          if (const TreeEntry *SE = getTreeEntry(Slice.front());
              SE || getTreeEntry(Slice.back())) {
            if (!SE)
              continue;
            if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
              continue;
          }
          // Constant slices are handled effectively already - skip. Also do
          // not try to vectorize small splats (with only a single non-undef
          // element).
          if (allConstant(Slice))
            continue;
          bool IsSplat = isSplat(Slice);
          if (Slices.empty() || !IsSplat ||
              count(Slice, Slice.front()) ==
                  static_cast<long>(isa<UndefValue>(Slice.front()) ? VF - 1
                                                                   : 1)) {
            if (IsSplat)
              continue;
            InstructionsState S = getSameOpcode(Slice, *TLI);
            if (!S || S.isAltShuffle() || !allSameBlock(Slice) ||
                (S.getOpcode() == Instruction::Load &&
                 areKnownNonVectorizableLoads(Slice)) ||
                (S.getOpcode() != Instruction::Load &&
                 !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(),
                                           VF)))
              continue;
            if (VF == 2) {
              // Try to vectorize reduced values, or only if all users are
              // going to be vectorized.
              if ((!UserIgnoreList || E.Idx != 0) &&
                  !all_of(Slice, [&](Value *V) {
                    if (isa<PoisonValue>(V))
                      return true;
                    return areAllUsersVectorized(cast<Instruction>(V),
                                                 UserIgnoreList);
                  }))
                continue;
              if (S.getOpcode() == Instruction::Load) {
                OrdersType Order;
                SmallVector<Value *> PointerOps;
                LoadsState Res =
                    canVectorizeLoads(Slice, Slice.front(), Order, PointerOps);
                // Do not vectorize gathers/scatters.
                if (Res == LoadsState::ScatterVectorize ||
                    Res == LoadsState::Gather) {
                  if (Res == LoadsState::Gather) {
                    registerNonVectorizableLoads(Slice);
                    // If reductions and the scalars from the root node are
                    // analyzed - mark as non-vectorizable reduction.
                    if (UserIgnoreList && E.Idx == 0)
                      analyzedReductionVals(Slice);
                  }
                  continue;
                }
              } else if (S.getOpcode() == Instruction::ExtractElement ||
                         (TTI->getInstructionCost(S.getMainOp(), CostKind) <
                              TTI::TCC_Expensive &&
                          !CheckOperandsProfitability(
                              S.getMainOp(),
                              cast<Instruction>(*find_if(reverse(Slice),
                                                         IsaPred<Instruction>)),
                              S))) {
                // Do not vectorize extractelements (handled effectively
                // already) or non-profitable instructions (with low cost and
                // non-vectorizable operands).
                continue;
              }
            }
          }
          Slices.emplace_back(Cnt, Slice.size());
        }
        auto AddCombinedNode = [&](unsigned Idx, unsigned Cnt, unsigned Sz) {
          E.CombinedEntriesWithIndices.emplace_back(Idx, Cnt);
          if (StartIdx == Cnt)
            StartIdx = Cnt + Sz;
          if (End == Cnt + Sz)
            End = Cnt;
        };
        for (auto [Cnt, Sz] : Slices) {
          ArrayRef<Value *> Slice = VL.slice(Cnt, Sz);
          // If any instruction is vectorized already - reuse the node.
          if (TreeEntry *SE = getTreeEntry(Slice.front());
              SE || getTreeEntry(Slice.back())) {
            if (!SE)
              continue;
            if (VF != SE->getVectorFactor() || !SE->isSame(Slice))
              continue;
            SE->UserTreeIndices.emplace_back(&E, UINT_MAX);
            AddCombinedNode(SE->Idx, Cnt, Sz);
            continue;
          }
          unsigned PrevSize = VectorizableTree.size();
          [[maybe_unused]] unsigned PrevEntriesSize =
              LoadEntriesToVectorize.size();
          buildTree_rec(Slice, 0, EdgeInfo(&E, UINT_MAX));
          if (PrevSize + 1 == VectorizableTree.size() &&
              VectorizableTree[PrevSize]->isGather() &&
              VectorizableTree[PrevSize]->hasState() &&
              VectorizableTree[PrevSize]->getOpcode() !=
                  Instruction::ExtractElement &&
              !isSplat(Slice)) {
            if (UserIgnoreList && E.Idx == 0 && VF == 2)
              analyzedReductionVals(Slice);
            VectorizableTree.pop_back();
            assert(PrevEntriesSize == LoadEntriesToVectorize.size() &&
                   "LoadEntriesToVectorize expected to remain the same");
            continue;
          }
          AddCombinedNode(PrevSize, Cnt, Sz);
        }
      }
      // Restore the order, if no extra vectorization happened.
      if (E.CombinedEntriesWithIndices.empty() && !E.ReorderIndices.empty()) {
        SmallVector<int> Mask(E.ReorderIndices.begin(),
                              E.ReorderIndices.end());
        reorderScalars(E.Scalars, Mask);
        E.ReorderIndices.clear();
      }
    }
    if (!E.hasState())
      continue;
    switch (E.getOpcode()) {
    case Instruction::Load: {
      // No need to handle masked gather vectorizations, do not reorder.
      if (E.State != TreeEntry::Vectorize)
        break;
      Type *ScalarTy = E.getMainOp()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E.Scalars);
      // Check if profitable to represent consecutive load + reverse as
      // strided load with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseLI = cast<LoadInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Load, VecTy, BaseLI->getAlign(),
                                 BaseLI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Load, VecTy, BaseLI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
        if (StridedCost < OriginalVecCost)
          // Strided load is more profitable than consecutive load + reverse,
          // so transform the node to a strided load.
          E.State = TreeEntry::StridedVectorize;
      }
      break;
    }
    case Instruction::Store: {
      Type *ScalarTy =
          cast<StoreInst>(E.getMainOp())->getValueOperand()->getType();
      auto *VecTy = getWidenedType(ScalarTy, E.Scalars.size());
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E.Scalars);
      // Check if profitable to represent consecutive store + reverse as
      // strided store with stride -1.
      if (!E.ReorderIndices.empty() && isReverseOrder(E.ReorderIndices) &&
          TTI->isLegalStridedLoadStore(VecTy, CommonAlignment)) {
        SmallVector<int> Mask;
        inversePermutation(E.ReorderIndices, Mask);
        auto *BaseSI = cast<StoreInst>(E.Scalars.back());
        InstructionCost OriginalVecCost =
            TTI->getMemoryOpCost(Instruction::Store, VecTy, BaseSI->getAlign(),
                                 BaseSI->getPointerAddressSpace(), CostKind,
                                 TTI::OperandValueInfo()) +
            ::getShuffleCost(*TTI, TTI::SK_Reverse, VecTy, Mask, CostKind);
        InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            /*VariableMask=*/false, CommonAlignment, CostKind, BaseSI);
        if (StridedCost < OriginalVecCost)
          // Strided store is more profitable than reverse + consecutive
          // store, so transform the node to a strided store.
          E.State = TreeEntry::StridedVectorize;
      } else if (!E.ReorderIndices.empty()) {
        // Check for interleaved stores.
        auto IsInterleaveMask = [&, &TTI = *TTI](ArrayRef<int> Mask) {
          auto *BaseSI = cast<StoreInst>(E.Scalars.front());
          assert(Mask.size() > 1 && "Expected mask greater than 1 element.");
          if (Mask.size() < 4)
            return 0u;
          for (unsigned Factor : seq<unsigned>(2, Mask.size() / 2 + 1)) {
            if (ShuffleVectorInst::isInterleaveMask(
                    Mask, Factor, VecTy->getNumElements()) &&
                TTI.isLegalInterleavedAccessType(
                    VecTy, Factor, BaseSI->getAlign(),
                    BaseSI->getPointerAddressSpace()))
              return Factor;
          }
          return 0u;
        };
        SmallVector<int> Mask(E.ReorderIndices.begin(),
                              E.ReorderIndices.end());
        unsigned InterleaveFactor = IsInterleaveMask(Mask);
        if (InterleaveFactor != 0)
          E.setInterleave(InterleaveFactor);
      }
      break;
    }
    case Instruction::Select: {
      if (E.State != TreeEntry::Vectorize)
        break;
      auto [MinMaxID, SelectOnly] = canConvertToMinOrMaxIntrinsic(E.Scalars);
      if (MinMaxID == Intrinsic::not_intrinsic)
        break;
      // This node is a minmax node.
      E.CombinedOp = TreeEntry::MinMax;
      TreeEntry *CondEntry = const_cast<TreeEntry *>(getOperandEntry(&E, 0));
      if (SelectOnly && CondEntry->UserTreeIndices.size() == 1 &&
          CondEntry->State == TreeEntry::Vectorize) {
        // The condition node is part of the combined minmax node.
        CondEntry->State = TreeEntry::CombinedVectorize;
      }
      break;
    }
    default:
      break;
    }
  }
  if (LoadEntriesToVectorize.empty()) {
    // Single load node - exit.
    if (VectorizableTree.size() <= 1 && VectorizableTree.front()->hasState() &&
        VectorizableTree.front()->getOpcode() == Instruction::Load)
      return;
    // Small graph with small VF - exit.
    constexpr unsigned SmallTree = 3;
    constexpr unsigned SmallVF = 2;
    if ((VectorizableTree.size() <= SmallTree &&
         VectorizableTree.front()->Scalars.size() == SmallVF) ||
        (VectorizableTree.size() <= 2 && UserIgnoreList))
      return;

    if (VectorizableTree.front()->isNonPowOf2Vec() &&
        getCanonicalGraphSize() != getTreeSize() && UserIgnoreList &&
        getCanonicalGraphSize() <= SmallTree &&
        count_if(ArrayRef(VectorizableTree).drop_front(getCanonicalGraphSize()),
                 [](const std::unique_ptr<TreeEntry> &TE) {
                   return TE->isGather() && TE->hasState() &&
                          TE->getOpcode() == Instruction::Load &&
                          !allSameBlock(TE->Scalars);
                 }) == 1)
      return;
  }

  // A list of loads to be gathered during the vectorization process. We can
  // try to vectorize them at the end, if profitable.
  SmallMapVector<std::tuple<BasicBlock *, Value *, Type *>,
                 SmallVector<SmallVector<std::pair<LoadInst *, int>>>, 8>
      GatheredLoads;

  for (std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    TreeEntry &E = *TE;
    if (E.isGather() &&
        ((E.hasState() && E.getOpcode() == Instruction::Load) ||
         (!E.hasState() && any_of(E.Scalars,
                                  [&](Value *V) {
                                    return isa<LoadInst>(V) &&
                                           !isVectorized(V) &&
                                           !isDeleted(cast<Instruction>(V));
                                  }))) &&
        !isSplat(E.Scalars)) {
      for (Value *V : E.Scalars) {
        auto *LI = dyn_cast<LoadInst>(V);
        if (!LI)
          continue;
        if (isDeleted(LI) || isVectorized(LI) || !LI->isSimple())
          continue;
        gatherPossiblyVectorizableLoads(
            *this, V, *DL, *SE, *TTI,
            GatheredLoads[std::make_tuple(
                LI->getParent(),
                getUnderlyingObject(LI->getPointerOperand(),
                                    RecursionMaxDepth),
                LI->getType())]);
      }
    }
  }
  // Try to vectorize gathered loads if this is not just a gather of loads.
  if (!GatheredLoads.empty())
    tryToVectorizeGatheredLoads(GatheredLoads);
}
/// Merges shuffle masks and estimates the cost of the final shuffle, if
/// required. It supports shuffling of 2 input vectors and emits the shuffles
/// lazily to avoid paying for pure poison/undef subvector shuffles.
class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
  bool IsFinalized = false;
  SmallVector<int> CommonMask;
  SmallVector<PointerUnion<Value *, const TreeEntry *>, 2> InVectors;
  const TargetTransformInfo &TTI;
  InstructionCost Cost = 0;
  SmallDenseSet<Value *> VectorizedVals;
  BoUpSLP &R;
  SmallPtrSetImpl<Value *> &CheckedExtracts;
  constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
  /// While set, we are still trying to estimate the cost for the same nodes
  /// and can delay actual cost estimation (virtual shuffle instruction
  /// emission), which may help to better estimate the cost when the same
  /// nodes must be permuted.
  bool SameNodesEstimated = true;

  static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) {
    if (Ty->getScalarType()->isPointerTy()) {
      Constant *Res = ConstantExpr::getIntToPtr(
          ConstantInt::getAllOnesValue(IntegerType::get(
              Ty->getContext(),
              DL.getTypeStoreSizeInBits(Ty->getScalarType()))),
          Ty->getScalarType());
      if (auto *VTy = dyn_cast<VectorType>(Ty))
        Res = ConstantVector::getSplat(VTy->getElementCount(), Res);
      return Res;
    }
    return Constant::getAllOnesValue(Ty);
  }

  InstructionCost getBuildVectorCost(ArrayRef<Value *> VL, Value *Root) {
    if ((!Root && allConstant(VL)) || all_of(VL, IsaPred<UndefValue>))
      return TTI::TCC_Free;
    auto *VecTy = getWidenedType(ScalarTy, VL.size());
    InstructionCost GatherCost = 0;
    SmallVector<Value *> Gathers(VL);
    if (!Root && isSplat(VL)) {
      // Found the broadcasting of the single scalar, calculate the cost as
      // the broadcast.
      const auto *It = find_if_not(VL, IsaPred<UndefValue>);
      assert(It != VL.end() && "Expected at least one non-undef value.");
      // Add broadcast for non-identity shuffle only.
      bool NeedShuffle =
          count(VL, *It) > 1 &&
          (VL.front() != *It || !all_of(VL.drop_front(), IsaPred<UndefValue>));
      if (!NeedShuffle) {
        if (isa<FixedVectorType>(ScalarTy)) {
          assert(SLPReVec && "FixedVectorType is not expected.");
          return TTI.getShuffleCost(
              TTI::SK_InsertSubvector, VecTy, {}, CostKind,
              std::distance(VL.begin(), It) * getNumElements(ScalarTy),
              cast<FixedVectorType>(ScalarTy));
        }
        return TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
                                      CostKind, std::distance(VL.begin(), It),
                                      PoisonValue::get(VecTy), *It);
      }
      SmallVector<int> ShuffleMask(VL.size(), PoisonMaskElem);
      transform(VL, ShuffleMask.begin(), [](Value *V) {
        return isa<PoisonValue>(V) ? PoisonMaskElem : 0;
      });
      InstructionCost InsertCost =
          TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
                                 0, PoisonValue::get(VecTy), *It);
      return InsertCost + ::getShuffleCost(TTI, TTI::SK_Broadcast, VecTy,
                                           ShuffleMask, CostKind,
                                           /*Index=*/0, /*SubTp=*/nullptr,
                                           /*Args=*/*It);
    }
    return GatherCost +
           (all_of(Gathers, IsaPred<UndefValue>)
                ? TTI::TCC_Free
                : R.getGatherCost(Gathers, !Root && VL.equals(Gathers),
                                  ScalarTy));
  };

  /// Compute the cost of creating a vector containing the extracted values
  /// from \p VL.
  InstructionCost
  computeExtractCost(ArrayRef<Value *> VL, ArrayRef<int> Mask,
                     ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                     unsigned NumParts) {
    assert(VL.size() > NumParts && "Unexpected scalarized shuffle.");
    unsigned NumElts =
        std::accumulate(VL.begin(), VL.end(), 0, [](unsigned Sz, Value *V) {
          auto *EE = dyn_cast<ExtractElementInst>(V);
          if (!EE)
            return Sz;
          auto *VecTy = dyn_cast<FixedVectorType>(EE->getVectorOperandType());
          if (!VecTy)
            return Sz;
          return std::max(Sz, VecTy->getNumElements());
        });
    // FIXME: this must be moved to TTI for better estimation.
    unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
    auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
                                        SmallVectorImpl<unsigned> &Indices)
        -> std::optional<TTI::ShuffleKind> {
      if (NumElts <= EltsPerVector)
        return std::nullopt;
      int OffsetReg0 =
          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
                                    [](int S, int I) {
                                      if (I == PoisonMaskElem)
                                        return S;
                                      return std::min(S, I);
                                    }),
                    EltsPerVector);
      int OffsetReg1 = OffsetReg0;
      DenseSet<int> RegIndices;
      // Check that we are trying to permute at most 2 input registers.
      TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
      int FirstRegId = -1;
      Indices.assign(1, OffsetReg0);
      for (auto [Pos, I] : enumerate(Mask)) {
        if (I == PoisonMaskElem)
          continue;
        int Idx = I - OffsetReg0;
        int RegId =
            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
        if (FirstRegId < 0)
          FirstRegId = RegId;
        RegIndices.insert(RegId);
        if (RegIndices.size() > 2)
          return std::nullopt;
        if (RegIndices.size() == 2) {
          ShuffleKind = TTI::SK_PermuteTwoSrc;
          if (Indices.size() == 1) {
            OffsetReg1 = alignDown(
                std::accumulate(
                    std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
                    [&](int S, int I) {
                      if (I == PoisonMaskElem)
                        return S;
                      int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
                                  ((I - OffsetReg0) % NumElts) / EltsPerVector;
                      if (RegId == FirstRegId)
                        return S;
                      return std::min(S, I);
                    }),
                EltsPerVector);
            Indices.push_back(OffsetReg1 % NumElts);
          }
          Idx = I - OffsetReg1;
        }
        I = (Idx % NumElts) % EltsPerVector +
            (RegId == FirstRegId ? 0 : EltsPerVector);
      }
      return ShuffleKind;
    };
    InstructionCost Cost = 0;

    // Process extracts in blocks of EltsPerVector to check if the source
    // vector operand can be re-used directly. If not, add the cost of
    // creating a shuffle to extract the values into a vector register.
    for (unsigned Part : seq<unsigned>(NumParts)) {
      if (!ShuffleKinds[Part])
        continue;
      ArrayRef<int> MaskSlice = Mask.slice(
          Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
      SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
      copy(MaskSlice, SubMask.begin());
      SmallVector<unsigned, 2> Indices;
      std::optional<TTI::ShuffleKind> RegShuffleKind =
          CheckPerRegistersShuffle(SubMask, Indices);
      if (!RegShuffleKind) {
        if (*ShuffleKinds[Part] != TTI::SK_PermuteSingleSrc ||
            !ShuffleVectorInst::isIdentityMask(
                MaskSlice, std::max<unsigned>(NumElts, MaskSlice.size())))
          Cost +=
              ::getShuffleCost(TTI, *ShuffleKinds[Part],
                               getWidenedType(ScalarTy, NumElts), MaskSlice);
        continue;
      }
      if (*RegShuffleKind != TTI::SK_PermuteSingleSrc ||
          !ShuffleVectorInst::isIdentityMask(SubMask, EltsPerVector)) {
        Cost +=
            ::getShuffleCost(TTI, *RegShuffleKind,
                             getWidenedType(ScalarTy, EltsPerVector), SubMask);
      }
      const unsigned BaseVF = getFullVectorNumberOfElements(
          *R.TTI, VL.front()->getType(), alignTo(NumElts, EltsPerVector));
      for (unsigned Idx : Indices) {
        assert((Idx + EltsPerVector) <= BaseVF &&
               "SK_ExtractSubvector index out of range");
        Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
                                 getWidenedType(ScalarTy, BaseVF), {},
                                 CostKind, Idx,
                                 getWidenedType(ScalarTy, EltsPerVector));
      }
      // Second attempt to check if just a permute is better estimated than
      // the subvector extract.
      SmallVector<int> OriginalMask(NumParts * EltsPerVector, PoisonMaskElem);
      copy(MaskSlice, OriginalMask.begin());
      InstructionCost OriginalCost = ::getShuffleCost(
          TTI, *ShuffleKinds[Part], getWidenedType(ScalarTy, NumElts),
          OriginalMask);
      if (OriginalCost < Cost)
        Cost = OriginalCost;
    }
    return Cost;
  }

  /// Transforms mask \p CommonMask per given \p Mask to make the proper set
  /// after shuffle emission.
  static void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                        ArrayRef<int> Mask) {
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx;
  }
  /// Adds the cost of reshuffling \p E1 and \p E2 (if present), delaying the
  /// estimation while the same nodes are being reshuffled.
  void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2,
                                ArrayRef<int> Mask, unsigned Part,
                                unsigned SliceSize) {
    if (SameNodesEstimated) {
      // Delay the cost estimation if the same nodes are reshuffling.
      // If we already requested the cost of reshuffling of E1 and E2 before,
      // no need to estimate another cost with the sub-Mask; instead, include
      // this sub-Mask into the CommonMask to estimate it later and avoid
      // double cost estimation.
      if ((InVectors.size() == 2 &&
           cast<const TreeEntry *>(InVectors.front()) == &E1 &&
           cast<const TreeEntry *>(InVectors.back()) == E2) ||
          (!E2 && cast<const TreeEntry *>(InVectors.front()) == &E1)) {
        unsigned Limit = getNumElems(Mask.size(), SliceSize, Part);
        assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, Limit),
                      [](int Idx) { return Idx == PoisonMaskElem; }) &&
               "Expected all poisoned elements.");
        ArrayRef<int> SubMask = ArrayRef(Mask).slice(Part * SliceSize, Limit);
        copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part));
        return;
      }
      // Found non-matching nodes - need to estimate the cost for the matched
      // ones and transform the mask.
      Cost += createShuffle(InVectors.front(),
                            InVectors.size() == 1 ? nullptr : InVectors.back(),
                            CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    SameNodesEstimated = false;
    if (!E2 && InVectors.size() == 1) {
      unsigned VF = E1.getVectorFactor();
      if (Value *V1 = InVectors.front().dyn_cast<Value *>()) {
        VF = std::max(VF,
                      cast<FixedVectorType>(V1->getType())->getNumElements());
      } else {
        const auto *E = cast<const TreeEntry *>(InVectors.front());
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
          CommonMask[Idx] = Mask[Idx] + VF;
      Cost += createShuffle(InVectors.front(), &E1, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else {
      auto P = InVectors.front();
      Cost += createShuffle(&E1, E2, Mask);
      unsigned VF = Mask.size();
      if (Value *V1 = P.dyn_cast<Value *>()) {
        VF = std::max(VF, getNumElements(V1->getType()));
      } else {
        const auto *E = cast<const TreeEntry *>(P);
        VF = std::max(VF, E->getVectorFactor());
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (Mask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx + (InVectors.empty() ? 0 : VF);
      Cost += createShuffle(P, InVectors.front(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
  }
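
  /// Adapter that makes BaseShuffleAnalysis::createShuffle produce shuffle
  /// costs instead of instructions; identity and empty masks are free.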
  class ShuffleCostBuilder {
    const TargetTransformInfo &TTI;

    static bool isEmptyOrIdentity(ArrayRef<int> Mask, unsigned VF) {
      int Index = -1;
      return Mask.empty() ||
             (VF == Mask.size() &&
              ShuffleVectorInst::isIdentityMask(Mask, VF)) ||
             (ShuffleVectorInst::isExtractSubvectorMask(Mask, VF, Index) &&
              Index == 0);
    }

  public:
    ShuffleCostBuilder(const TargetTransformInfo &TTI) : TTI(TTI) {}
    ~ShuffleCostBuilder() = default;
    InstructionCost createShuffleVector(Value *V1, Value *,
                                        ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF = cast<VectorType>(V1->getType())
                        ->getElementCount()
                        .getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createShuffleVector(Value *V1, ArrayRef<int> Mask) const {
      // Empty mask or identity mask are free.
      unsigned VF = cast<VectorType>(V1->getType())
                        ->getElementCount()
                        .getKnownMinValue();
      if (isEmptyOrIdentity(Mask, VF))
        return TTI::TCC_Free;
      return ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc,
                              cast<VectorType>(V1->getType()), Mask);
    }
    InstructionCost createIdentity(Value *) const { return TTI::TCC_Free; }
    InstructionCost createPoison(Type *, unsigned) const {
      return TTI::TCC_Free;
    }
    void resizeToMatch(Value *&, Value *&) const {}
  };
  InstructionCost
  createShuffle(const PointerUnion<Value *, const TreeEntry *> &P1,
                const PointerUnion<Value *, const TreeEntry *> &P2,
                ArrayRef<int> Mask) {
    ShuffleCostBuilder Builder(TTI);
    SmallVector<int> CommonMask(Mask.begin(), Mask.end());
    Value *V1 = P1.dyn_cast<Value *>(), *V2 = P2.dyn_cast<Value *>();
    unsigned CommonVF = Mask.size();
    InstructionCost ExtraCost = 0;
    auto GetNodeMinBWAffectedCost = [&](const TreeEntry &E,
                                        unsigned VF) -> InstructionCost {
      if (E.isGather() && allConstant(E.Scalars))
        return TTI::TCC_Free;
      Type *EScalarTy = E.Scalars.front()->getType();
      bool IsSigned = true;
      if (auto It = R.MinBWs.find(&E); It != R.MinBWs.end()) {
        EScalarTy =
            IntegerType::get(EScalarTy->getContext(), It->second.first);
        IsSigned = It->second.second;
      }
      if (EScalarTy != ScalarTy) {
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(CastOpcode, getWidenedType(ScalarTy, VF),
                                    getWidenedType(EScalarTy, VF),
                                    TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    auto GetValueMinBWAffectedCost = [&](const Value *V) -> InstructionCost {
      if (isa<Constant>(V))
        return TTI::TCC_Free;
      auto *VecTy = cast<VectorType>(V->getType());
      Type *EScalarTy = VecTy->getElementType();
      if (EScalarTy != ScalarTy) {
        bool IsSigned = !isKnownNonNegative(V, SimplifyQuery(*R.DL));
        unsigned CastOpcode = Instruction::Trunc;
        unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
        unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
        if (DstSz > SrcSz)
          CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
        return TTI.getCastInstrCost(
            CastOpcode, VectorType::get(ScalarTy, VecTy->getElementCount()),
            VecTy, TTI::CastContextHint::None, CostKind);
      }
      return TTI::TCC_Free;
    };
    if (!V1 && !V2 && !P2.isNull()) {
      // Shuffle 2 entry nodes.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E->Scalars.size() == E2->Scalars.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        SmallVector<int> E2Mask = E2->getCommonMask();
        if (!EMask.empty() || !E2Mask.empty()) {
          for (int &Idx : CommonMask) {
            if (Idx == PoisonMaskElem)
              continue;
            if (Idx < static_cast<int>(CommonVF) && !EMask.empty())
              Idx = EMask[Idx];
            else if (Idx >= static_cast<int>(CommonVF))
              Idx = (E2Mask.empty() ? Idx - CommonVF
                                    : E2Mask[Idx - CommonVF]) +
                    E->Scalars.size();
          }
        }
        CommonVF = E->Scalars.size();
        ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF) +
                     GetNodeMinBWAffectedCost(*E2, CommonVF);
      } else {
        ExtraCost += GetNodeMinBWAffectedCost(*E, E->getVectorFactor()) +
                     GetNodeMinBWAffectedCost(*E2, E2->getVectorFactor());
      }
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && P2.isNull()) {
      // Shuffle single entry node.
      const TreeEntry *E = cast<const TreeEntry *>(P1);
      unsigned VF = E->getVectorFactor();
      CommonVF = VF;
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
      if (E->Scalars.size() == Mask.size() && VF != Mask.size()) {
        SmallVector<int> EMask = E->getCommonMask();
        assert(!EMask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx != PoisonMaskElem)
            Idx = EMask[Idx];
        }
        CommonVF = E->Scalars.size();
      } else if (unsigned Factor = E->getInterleaveFactor();
                 Factor > 0 && E->Scalars.size() != Mask.size() &&
                 ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask,
                                                               Factor)) {
        // Deinterleaved nodes are free.
        std::iota(CommonMask.begin(), CommonMask.end(), 0);
      }
      ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      // Not identity/broadcast? Try to see if the original vector is better.
      if (!E->ReorderIndices.empty() && CommonVF == E->ReorderIndices.size() &&
          CommonVF == CommonMask.size() &&
          any_of(enumerate(CommonMask),
                 [](const auto &&P) {
                   return P.value() != PoisonMaskElem &&
                          static_cast<unsigned>(P.value()) != P.index();
                 }) &&
          any_of(CommonMask,
                 [](int Idx) { return Idx != PoisonMaskElem && Idx != 0; })) {
        SmallVector<int> ReorderMask;
        inversePermutation(E->ReorderIndices, ReorderMask);
        ::addMask(CommonMask, ReorderMask);
      }
    } else if (V1 && P2.isNull()) {
      // Shuffle single vector.
      ExtraCost += GetValueMinBWAffectedCost(V1);
      CommonVF = getVF(V1);
      assert(
          all_of(Mask,
                 [=](int Idx) { return Idx < static_cast<int>(CommonVF); }) &&
          "All elements in mask must be less than CommonVF.");
    } else if (V1 && !V2) {
      // Shuffle vector and tree node.
      unsigned VF = getVF(V1);
      const TreeEntry *E2 = cast<const TreeEntry *>(P2);
      CommonVF = std::max(VF, E2->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E2->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E2Mask = E2->getCommonMask();
        assert(!E2Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E2Mask[Idx - CommonVF] + VF;
        }
        CommonVF = VF;
      }
      ExtraCost += GetValueMinBWAffectedCost(V1);
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetNodeMinBWAffectedCost(
          *E2, std::min(CommonVF, E2->getVectorFactor()));
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else if (!V1 && V2) {
      // Shuffle tree node and vector.
      unsigned VF = getVF(V2);
      const TreeEntry *E1 = cast<const TreeEntry *>(P1);
      CommonVF = std::max(VF, E1->getVectorFactor());
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      if (E1->Scalars.size() == VF && VF != CommonVF) {
        SmallVector<int> E1Mask = E1->getCommonMask();
        assert(!E1Mask.empty() && "Expected non-empty common mask.");
        for (int &Idx : CommonMask) {
          if (Idx == PoisonMaskElem)
            continue;
          if (Idx >= static_cast<int>(CommonVF))
            Idx = E1Mask[Idx - CommonVF] + VF;
          else
            Idx = E1Mask[Idx];
        }
        CommonVF = VF;
      }
      ExtraCost += GetNodeMinBWAffectedCost(
          *E1, std::min(CommonVF, E1->getVectorFactor()));
      V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
      ExtraCost += GetValueMinBWAffectedCost(V2);
      V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
    } else {
      assert(V1 && V2 && "Expected both vectors.");
      unsigned VF = getVF(V1);
      CommonVF = std::max(VF, getVF(V2));
      assert(all_of(Mask,
                    [=](int Idx) {
                      return Idx < 2 * static_cast<int>(CommonVF);
                    }) &&
             "All elements in mask must be less than 2 * CommonVF.");
      ExtraCost +=
          GetValueMinBWAffectedCost(V1) + GetValueMinBWAffectedCost(V2);
      if (V1->getType() != V2->getType()) {
        V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      } else {
        if (cast<VectorType>(V1->getType())->getElementType() != ScalarTy)
          V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF));
        if (cast<VectorType>(V2->getType())->getElementType() != ScalarTy)
          V2 = getAllOnesValue(*R.DL, getWidenedType(ScalarTy, CommonVF));
      }
    }
    if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
      assert(SLPReVec && "FixedVectorType is not expected.");
      transformScalarShuffleIndiciesToVector(VecTy->getNumElements(),
                                             CommonMask);
    }
    InVectors.front() =
        Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    if (InVectors.size() == 2)
      InVectors.pop_back();
    return ExtraCost + BaseShuffleAnalysis::createShuffle<InstructionCost>(
                           V1, V2, CommonMask, Builder);
  }

public:
  ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI,
                       ArrayRef<Value *> VectorizedVals, BoUpSLP &R,
                       SmallPtrSetImpl<Value *> &CheckedExtracts)
      : BaseShuffleAnalysis(ScalarTy), TTI(TTI),
        VectorizedVals(VectorizedVals.begin(), VectorizedVals.end()), R(R),
        CheckedExtracts(CheckedExtracts) {}
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    if (Mask.empty())
      return nullptr;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    // Check if it can be considered reused if the same extractelements were
    // vectorized already.
    bool PrevNodeFound = any_of(
        ArrayRef(R.VectorizableTree).take_front(E->Idx),
        [&](const std::unique_ptr<TreeEntry> &TE) {
          return ((TE->hasState() && !TE->isAltShuffle() &&
                   TE->getOpcode() == Instruction::ExtractElement) ||
                  TE->isGather()) &&
                 all_of(enumerate(TE->Scalars), [&](auto &&Data) {
                   return VL.size() > Data.index() &&
                          (Mask[Data.index()] == PoisonMaskElem ||
                           isa<UndefValue>(VL[Data.index()]) ||
                           Data.value() == VL[Data.index()]);
                 });
        });
    SmallPtrSet<Value *, 4> UniqueBases;
    unsigned SliceSize = getPartNumElems(VL.size(), NumParts);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      unsigned Limit = getNumElems(VL.size(), SliceSize, Part);
      ArrayRef<int> SubMask = Mask.slice(Part * SliceSize, Limit);
      for (auto [I, V] :
           enumerate(ArrayRef(VL).slice(Part * SliceSize, Limit))) {
        // Ignore non-extractelement scalars.
        if (isa<UndefValue>(V) ||
            (!SubMask.empty() && SubMask[I] == PoisonMaskElem))
          continue;
        // If all users of the instruction are going to be vectorized and this
        // instruction itself is not going to be vectorized, consider this
        // extractelement to be free.
        auto *EE = cast<ExtractElementInst>(V);
        VecBase = EE->getVectorOperand();
        UniqueBases.insert(VecBase);
        const TreeEntry *VE = R.getTreeEntry(V);
        if (!CheckedExtracts.insert(V).second ||
            !R.areAllUsersVectorized(cast<Instruction>(V), &VectorizedVals) ||
            any_of(EE->users(),
                   [&](User *U) {
                     return isa<GetElementPtrInst>(U) &&
                            !R.areAllUsersVectorized(cast<Instruction>(U),
                                                     &VectorizedVals);
                   }) ||
            (VE && VE != E))
          continue;
        std::optional<unsigned> EEIdx = getExtractIndex(EE);
        if (!EEIdx)
          continue;
        unsigned Idx = *EEIdx;
        // Take credit for the instruction that will become dead.
        if (EE->hasOneUse() || !PrevNodeFound) {
          Instruction *Ext = EE->user_back();
          if (isa<SExtInst, ZExtInst>(Ext) &&
              all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
            // Use getExtractWithExtendCost() to calculate the cost of
            // extractelement/ext pair.
            Cost -= TTI.getExtractWithExtendCost(
                Ext->getOpcode(), Ext->getType(),
                EE->getVectorOperandType(), Idx);
            // Add back the cost of s|zext which is subtracted separately.
            Cost += TTI.getCastInstrCost(
                Ext->getOpcode(), Ext->getType(), EE->getType(),
                TTI::getCastContextHint(Ext), CostKind, Ext);
            continue;
          }
        }
        Cost -= TTI.getVectorInstrCost(*EE, EE->getVectorOperandType(),
                                       CostKind, Idx);
      }
    }
    // Check that the gather of extractelements can be represented as just a
    // shuffle of a single/two vectors the scalars are extracted from.
    // This is already done for reused nodes whose extractelements were
    // vectorized before.
    if (!PrevNodeFound)
      Cost += computeExtractCost(VL, Mask, ShuffleKinds, NumParts);
    InVectors.assign(1, E);
    CommonMask.assign(Mask.begin(), Mask.end());
    transformMaskAfterShuffle(CommonMask, CommonMask);
    SameNodesEstimated = false;
    if (NumParts != 1 && UniqueBases.size() != 1) {
      UseVecBaseAsInput = true;
      VecBase =
          Constant::getNullValue(getWidenedType(ScalarTy, CommonMask.size()));
    }
    return VecBase;
  }
  /// Checks if the specified entry needs to be delayed because of its
  /// dependency nodes.
  std::optional<InstructionCost>
  needToDelay(const TreeEntry *,
              ArrayRef<SmallVector<const TreeEntry *>>) const {
    // No need to delay the cost estimation during analysis.
    return std::nullopt;
  }
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    if (&E1 == &E2) {
      assert(all_of(Mask,
                    [&](int Idx) {
                      return Idx < static_cast<int>(E1.getVectorFactor());
                    }) &&
             "Expected single vector shuffle mask.");
      add(E1, Mask);
      return;
    }
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign({&E1, &E2});
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
    if (NumParts == 0 || NumParts >= Mask.size() ||
        MaskVecTy->getNumElements() % NumParts != 0 ||
        !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
                                  MaskVecTy->getNumElements() / NumParts))
      NumParts = 1;
    unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    if (InVectors.empty()) {
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, &E1);
      return;
    }
    assert(!CommonMask.empty() && "Expected non-empty common mask.");
    auto *MaskVecTy = getWidenedType(ScalarTy, Mask.size());
    unsigned NumParts = TTI.getNumberOfParts(MaskVecTy);
    if (NumParts == 0 || NumParts >= Mask.size() ||
        MaskVecTy->getNumElements() % NumParts != 0 ||
        !hasFullVectorsOrPowerOf2(TTI, MaskVecTy->getElementType(),
                                  MaskVecTy->getNumElements() / NumParts))
      NumParts = 1;
    unsigned SliceSize = getPartNumElems(Mask.size(), NumParts);
    const auto *It =
        find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; });
    unsigned Part = std::distance(Mask.begin(), It) / SliceSize;
    estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize);
    if (!SameNodesEstimated && InVectors.size() == 1)
      InVectors.emplace_back(&E1);
  }
  /// Adds 2 input vectors and the mask for their shuffling.
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    // May come only for shuffling of 2 vectors with extractelements, already
    // handled in adjustExtracts.
    assert(InVectors.size() == 1 &&
           all_of(enumerate(CommonMask),
                  [&](auto P) {
                    if (P.value() == PoisonMaskElem)
                      return Mask[P.index()] == PoisonMaskElem;
                    auto *EI = cast<ExtractElementInst>(
                        cast<const TreeEntry *>(InVectors.front())
                            ->getOrdered(P.index()));
                    return EI->getVectorOperand() == V1 ||
                           EI->getVectorOperand() == V2;
                  }) &&
           "Expected extractelement vectors.");
  }
  /// Adds another one input vector and the mask for the shuffling.
  void add(Value *V1, ArrayRef<int> Mask, bool ForExtracts = false) {
    if (InVectors.empty()) {
      assert(CommonMask.empty() && !ForExtracts &&
             "Expected empty input mask/vectors.");
      CommonMask.assign(Mask.begin(), Mask.end());
      InVectors.assign(1, V1);
      return;
    }
    if (ForExtracts) {
      // No need to add vectors here, already handled in adjustExtracts.
      assert(InVectors.size() == 1 && isa<const TreeEntry *>(InVectors[0]) &&
             !CommonMask.empty() &&
             all_of(enumerate(CommonMask),
                    [&](auto P) {
                      Value *Scalar = cast<const TreeEntry *>(InVectors[0])
                                          ->getOrdered(P.index());
                      if (P.value() == PoisonMaskElem)
                        return P.value() == Mask[P.index()] ||
                               isa<UndefValue>(Scalar);
                      if (isa<Constant>(V1))
                        return true;
                      auto *EI = cast<ExtractElementInst>(Scalar);
                      return EI->getVectorOperand() == V1;
                    }) &&
             "Expected only tree entry for extractelement vectors.");
      return;
    }
    assert(!InVectors.empty() && !CommonMask.empty() &&
           "Expected only tree entries from extracts/reused buildvectors.");
    unsigned VF = getVF(V1);
    if (InVectors.size() == 2) {
      Cost += createShuffle(InVectors.front(), InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
      VF = std::max<unsigned>(VF, CommonMask.size());
    } else if (const auto *InTE =
                   InVectors.front().dyn_cast<const TreeEntry *>()) {
      VF = std::max(VF, InTE->getVectorFactor());
    } else {
      VF = std::max(
          VF,
          cast<FixedVectorType>(cast<Value *>(InVectors.front())->getType())
              ->getNumElements());
    }
    InVectors.push_back(V1);
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem)
        CommonMask[Idx] = Mask[Idx] + VF;
  }
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    Cost += getBuildVectorCost(VL, Root);
    if (!Root) {
      // FIXME: Need to find a way to avoid use of getNullValue here.
      SmallVector<Constant *> Vals;
      unsigned VF = VL.size();
      if (MaskVF != 0)
        VF = std::min(VF, MaskVF);
      for (Value *V : VL.take_front(VF)) {
        if (isa<UndefValue>(V)) {
          Vals.push_back(cast<Constant>(V));
          continue;
        }
        Vals.push_back(Constant::getNullValue(V->getType()));
      }
      if (auto *VecTy = dyn_cast<FixedVectorType>(Vals.front()->getType())) {
        assert(SLPReVec && "FixedVectorType is not expected.");
        // When REVEC is enabled, expand vector types into scalar types.
        unsigned VecTyNumElements = VecTy->getNumElements();
        SmallVector<Constant *> NewVals(VF * VecTyNumElements, nullptr);
        for (auto [I, V] : enumerate(Vals)) {
          Type *ScalarTy = V->getType()->getScalarType();
          Constant *NewVal;
          if (isa<PoisonValue>(V))
            NewVal = PoisonValue::get(ScalarTy);
          else if (isa<UndefValue>(V))
            NewVal = UndefValue::get(ScalarTy);
          else
            NewVal = Constant::getNullValue(ScalarTy);
          std::fill_n(NewVals.begin() + I * VecTyNumElements, VecTyNumElements,
                      NewVal);
        }
        Vals.swap(NewVals);
      }
      return ConstantVector::get(Vals);
    }
    return ConstantVector::getSplat(
        ElementCount::getFixed(
            cast<FixedVectorType>(Root->getType())->getNumElements()),
        getAllOnesValue(*R.DL, ScalarTy->getScalarType()));
  }
  InstructionCost createFreeze(InstructionCost Cost) { return Cost; }
  /// Finalize emission of the shuffles.
  InstructionCost
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (Action) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx;
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      Value *V = cast<Value *>(Vec);
      Action(V, CommonMask);
      InVectors.front() = V;
    }
    if (!SubVectors.empty()) {
      const PointerUnion<Value *, const TreeEntry *> &Vec = InVectors.front();
      if (InVectors.size() == 2)
        Cost += createShuffle(Vec, InVectors.back(), CommonMask);
      else
        Cost += createShuffle(Vec, nullptr, CommonMask);
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        if (CommonMask[Idx] != PoisonMaskElem)
          CommonMask[Idx] = Idx;
      // Add subvectors permutation cost.
      if (!SubVectorsMask.empty()) {
        assert(SubVectorsMask.size() <= CommonMask.size() &&
               "Expected same size of masks for subvectors and common mask.");
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem) {
            assert(I1 == PoisonMaskElem && "Expected unused subvectors mask");
            I1 = I2 + CommonMask.size();
          }
        }
        Cost += ::getShuffleCost(TTI, TTI::SK_PermuteTwoSrc,
                                 getWidenedType(ScalarTy, CommonMask.size()),
                                 SVMask, CostKind);
      }
      for (auto [E, Idx] : SubVectors) {
        Type *EScalarTy = E->Scalars.front()->getType();
        bool IsSigned = true;
        if (auto It = R.MinBWs.find(E); It != R.MinBWs.end()) {
          EScalarTy =
              IntegerType::get(EScalarTy->getContext(), It->second.first);
          IsSigned = It->second.second;
        }
        if (ScalarTy != EScalarTy) {
          unsigned CastOpcode = Instruction::Trunc;
          unsigned DstSz = R.DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcSz = R.DL->getTypeSizeInBits(EScalarTy);
          if (DstSz > SrcSz)
            CastOpcode = IsSigned ? Instruction::SExt : Instruction::ZExt;
          Cost += TTI.getCastInstrCost(
              CastOpcode, getWidenedType(ScalarTy, E->getVectorFactor()),
              getWidenedType(EScalarTy, E->getVectorFactor()),
              TTI::CastContextHint::Normal, CostKind);
        }
        Cost += ::getShuffleCost(
            TTI, TTI::SK_InsertSubvector,
            getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx,
            getWidenedType(ScalarTy, E->getVectorFactor()));
        if (!CommonMask.empty()) {
          std::iota(std::next(CommonMask.begin(), Idx),
                    std::next(CommonMask.begin(), Idx + E->getVectorFactor()),
                    Idx);
        }
      }
    }

    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return Cost;
    }
    return Cost +
           createShuffle(InVectors.front(),
                         InVectors.size() == 2 ? InVectors.back() : nullptr,
                         CommonMask);
  }

  ~ShuffleCostEstimator() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
};
const BoUpSLP::TreeEntry *BoUpSLP::getOperandEntry(const TreeEntry *E,
                                                   unsigned Idx) const {
  if (const TreeEntry *VE = getMatchedVectorizedOperand(E, Idx))
    return VE;
  const auto *It =
      find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() &&
               find_if(TE->UserTreeIndices, [&](const EdgeInfo &EI) {
                 return EI.EdgeIdx == Idx && EI.UserTE == E;
               }) != TE->UserTreeIndices.end();
      });
  assert(It != VectorizableTree.end() && "Expected vectorizable entry.");
  return It->get();
}

TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
  if (TE.State == TreeEntry::ScatterVectorize ||
      TE.State == TreeEntry::StridedVectorize)
    return TTI::CastContextHint::GatherScatter;
  if (TE.State == TreeEntry::Vectorize &&
      TE.getOpcode() == Instruction::Load && !TE.isAltShuffle()) {
    if (TE.ReorderIndices.empty())
      return TTI::CastContextHint::Normal;
    SmallVector<int> Mask;
    inversePermutation(TE.ReorderIndices, Mask);
    if (ShuffleVectorInst::isReverseMask(Mask, Mask.size()))
      return TTI::CastContextHint::Reversed;
  }
  return TTI::CastContextHint::None;
}
                                  const unsigned VF, unsigned MinBW,
  auto It = MinBWs.find(E);
  Type *OrigScalarTy = ScalarTy;
  if (It != MinBWs.end()) {
    auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
  }
  unsigned EntryVF = E->getVectorFactor();
  if (E->isGather()) {
    if (isa<InsertElementInst>(VL[0]))
    if (isa<CmpInst>(VL.front()))
      ScalarTy = VL.front()->getType();
    return processBuildVector<ShuffleCostEstimator, InstructionCost>(
        E, ScalarTy, *TTI, VectorizedVals, *this, CheckedExtracts);
  }
  if (!E->ReorderIndices.empty() && (E->State != TreeEntry::StridedVectorize ||
    if (E->getOpcode() == Instruction::Store) {
      NewMask.resize(E->ReorderIndices.size());
      copy(E->ReorderIndices, NewMask.begin());
    }
  if (!E->ReuseShuffleIndices.empty())
    ::addMask(Mask, E->ReuseShuffleIndices);
  assert((E->State == TreeEntry::Vectorize ||
          E->State == TreeEntry::ScatterVectorize ||
          E->State == TreeEntry::StridedVectorize) &&
         "Unhandled state");
  assert(E->getOpcode() &&
          (E->getOpcode() == Instruction::GetElementPtr &&
           E->getMainOp()->getType()->isPointerTy())) &&
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  if (E->CombinedOp != TreeEntry::NotCombinedOp)
    ShuffleOrOp = E->CombinedOp;
  const unsigned Sz = UniqueValues.size();
  for (unsigned I = 0; I < Sz; ++I) {
    if (isa<Instruction>(UniqueValues[I]) && getTreeEntry(UniqueValues[I]) == E)
      UsedScalars.set(I);
  }
  auto GetCastContextHint = [&](Value *V) {
    if (const TreeEntry *OpTE = getTreeEntry(V))
      return getCastContextHint(*OpTE);
    InstructionsState SrcState = getSameOpcode(E->getOperand(0), *TLI);
    if (SrcState && SrcState.getOpcode() == Instruction::Load &&
        !SrcState.isAltShuffle())
    if (isa<CastInst, CallInst>(VL0)) {
      ScalarCost = (Sz - UsedScalars.count()) * ScalarEltCost(0);
      for (unsigned I = 0; I < Sz; ++I) {
        if (UsedScalars.test(I))
        ScalarCost += ScalarEltCost(I);
      }
        (E->getOpcode() != Instruction::Load ||
         !E->UserTreeIndices.empty())) {
      const EdgeInfo &EI =
          *find_if(E->UserTreeIndices, [](const EdgeInfo &EI) {
            return !EI.UserTE->isGather() || EI.EdgeIdx != UINT_MAX;
          });
      if (EI.UserTE->getOpcode() != Instruction::Select ||
        auto UserBWIt = MinBWs.find(EI.UserTE);
        Type *UserScalarTy =
            EI.UserTE->getOperand(EI.EdgeIdx).front()->getType();
        if (UserBWIt != MinBWs.end())
                                        UserBWIt->second.first);
        if (ScalarTy != UserScalarTy) {
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          unsigned SrcBWSz = DL->getTypeSizeInBits(UserScalarTy);
          unsigned VecOpcode;
          auto *UserVecTy = getWidenedType(UserScalarTy, E->Scalars.size());
          if (BWSz > SrcBWSz)
            VecOpcode = Instruction::Trunc;
              It->second.second ? Instruction::SExt : Instruction::ZExt;
        }
    LLVM_DEBUG(dumpTreeCosts(E, CommonCost, VecCost - CommonCost,
                             ScalarCost, "Calculated costs for Tree"));
    return VecCost - ScalarCost;
    assert((E->State == TreeEntry::Vectorize ||
            E->State == TreeEntry::StridedVectorize) &&
           "Entry state expected to be Vectorize or StridedVectorize here.");
        *TTI, Ptrs, BasePtr, E->getOpcode(), CostKind, OrigScalarTy, VecTy);
    LLVM_DEBUG(dumpTreeCosts(E, 0, VecCost, ScalarCost,
                             "Calculated GEPs cost for Tree"));
    return VecCost - ScalarCost;
    Type *CanonicalType = Ty;
                          {CanonicalType, CanonicalType});
    if (VI && SelectOnly) {
             "Expected only for scalar type.");
      auto *CI = cast<CmpInst>(VI->getOperand(0));
          CI->getOpcode(), Ty, Builder.getInt1Ty(), CI->getPredicate(),
          CostKind, {TTI::OK_AnyValue, TTI::OP_None},
          {TTI::OK_AnyValue, TTI::OP_None}, CI);
    }
    return IntrinsicCost;
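// Per-opcode cost modeling: each case builds a GetScalarCost/GetVectorCost
// pair and returns their difference via GetCostDiff, so a negative result
// means vectorizing this node is profitable.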
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    for (Value *V : UniqueValues) {
      auto *PHI = dyn_cast<PHINode>(V);
      for (unsigned I = 0, N = PHI->getNumIncomingValues(); I < N; ++I) {
        if (const TreeEntry *OpTE = getTreeEntry(Operands.front()))
          if (!OpTE->ReuseShuffleIndices.empty())
            ScalarCost += TTI::TCC_Basic * (OpTE->ReuseShuffleIndices.size() -
                                            OpTE->Scalars.size());
      }
    }
    return CommonCost - ScalarCost;
  }
  case Instruction::ExtractValue:
  case Instruction::ExtractElement: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
      auto *I = cast<Instruction>(UniqueValues[Idx]);
      if (ShuffleOrOp == Instruction::ExtractElement) {
        auto *EE = cast<ExtractElementInst>(I);
        SrcVecTy = EE->getVectorOperandType();
      } else {
        auto *EV = cast<ExtractValueInst>(I);
        Type *AggregateTy = EV->getAggregateOperand()->getType();
        if (auto *ATy = dyn_cast<ArrayType>(AggregateTy))
          NumElts = ATy->getNumElements();
      }
      if (I->hasOneUse()) {
        if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
            all_of(Ext->users(), IsaPred<GetElementPtrInst>)) {
              Ext->getOpcode(), Ext->getType(), I->getType(),
        }
      }
    };
    auto GetVectorCost = [](InstructionCost CommonCost) { return CommonCost; };
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
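// For insertelement bundles, cost only the subvector region
// [OffsetBeg, OffsetEnd] that is actually written, and recognize
// whole-subvector inserts that fold into identity shuffles.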
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() &&
           "Unique insertelements only are expected.");
    auto *SrcVecTy = cast<FixedVectorType>(VL0->getType());
    unsigned const NumElts = SrcVecTy->getNumElements();
    unsigned const NumScalars = VL.size();
    unsigned OffsetEnd = OffsetBeg;
    InsertMask[OffsetBeg] = 0;
      if (OffsetBeg > Idx)
      else if (OffsetEnd < Idx)
      InsertMask[Idx] = I + 1;
    if (NumOfParts > 0 && NumOfParts < NumElts)
      VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts);
    unsigned VecSz = (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) *
    unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz);
    unsigned InsertVecSz = std::min<unsigned>(
        ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * VecScalarsSz);
    bool IsWholeSubvector =
        OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0);
    if (OffsetBeg + InsertVecSz > VecSz) {
      InsertVecSz = VecSz;
    }
    if (!E->ReorderIndices.empty()) {
      std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0);
    }
    bool IsIdentity = true;
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      DemandedElts.setBit(InsertIdx);
      IsIdentity &= InsertIdx - OffsetBeg == I;
      Mask[InsertIdx - OffsetBeg] = I;
    }
    assert(Offset < NumElts && "Failed to find vector index offset");
                         InsertVecTy, Mask);
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask));
    if (!InMask.all() && NumScalars != NumElts && !IsWholeSubvector) {
      if (InsertVecSz != VecSz) {
        for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I)
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    unsigned Opcode = ShuffleOrOp;
    unsigned VecOpcode = Opcode;
        (SrcIt != MinBWs.end() || It != MinBWs.end())) {
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy->getScalarType());
      if (SrcIt != MinBWs.end()) {
        SrcBWSz = SrcIt->second.first;
      }
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
      assert(Idx == 0 && "Expected 0 index only");
      if (VecOpcode != Opcode && VecOpcode == Instruction::BitCast)
      auto *VI = VL0->getOpcode() == Opcode ? VL0 : nullptr;
      bool IsArithmeticExtendedReduction =
          E->Idx == 0 && UserIgnoreList &&
            auto *I = cast<Instruction>(V);
            return is_contained({Instruction::Add, Instruction::FAdd,
                                 Instruction::Mul, Instruction::FMul,
                                 Instruction::And, Instruction::Or,
      if (IsArithmeticExtendedReduction &&
          (VecOpcode == Instruction::ZExt || VecOpcode == Instruction::SExt))
      return CommonCost +
                                 VecOpcode == Opcode ? VI : nullptr);
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FCmp:
  case Instruction::ICmp:
  case Instruction::Select: {
        match(VL0, MatchCmp))
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
           !match(VI, MatchCmp)) ||
          E->getOpcode(), OrigScalarTy, Builder.getInt1Ty(), CurrentPred,
          CostKind, getOperandInfo(VI->getOperand(0)),
          getOperandInfo(VI->getOperand(1)), VI);
        ScalarCost = IntrinsicCost;
    };
          CostKind, getOperandInfo(E->getOperand(0)),
          getOperandInfo(E->getOperand(1)), VL0);
      if (auto *SI = dyn_cast<SelectInst>(VL0)) {
        unsigned CondNumElements = CondType->getNumElements();
        assert(VecTyNumElements >= CondNumElements &&
               VecTyNumElements % CondNumElements == 0 &&
               "Cannot vectorize Instruction::Select");
        if (CondNumElements != VecTyNumElements) {
        }
      }
      return VecCost + CommonCost;
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case TreeEntry::MinMax: {
    auto GetScalarCost = [&](unsigned Idx) {
      return GetMinMaxCost(OrigScalarTy);
    };
      return VecCost + CommonCost;
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::FNeg:
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
    };
      if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
        for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
            auto *CI = dyn_cast<ConstantInt>(Op);
            return CI && CI->getValue().countr_one() >= It->second.first;
        }
      }
      unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
                                     Op2Info, {}, nullptr, TLI) +
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::GetElementPtr: {
    return CommonCost + GetGEPCostDiff(VL, VL0);
  }
  case Instruction::Load: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *VI = cast<LoadInst>(UniqueValues[Idx]);
                                  VI->getAlign(), VI->getPointerAddressSpace(),
    };
    auto *LI0 = cast<LoadInst>(VL0);
      switch (E->State) {
      case TreeEntry::Vectorize:
        if (unsigned Factor = E->getInterleaveFactor()) {
              Instruction::Load, VecTy, Factor, std::nullopt, LI0->getAlign(),
              LI0->getPointerAddressSpace(), CostKind);
        } else {
              Instruction::Load, VecTy, LI0->getAlign(),
        }
      case TreeEntry::StridedVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            false, CommonAlignment, CostKind);
      }
      case TreeEntry::ScatterVectorize: {
        Align CommonAlignment =
            computeCommonAlignment<LoadInst>(UniqueValues.getArrayRef());
            Instruction::Load, VecTy, LI0->getPointerOperand(),
            false, CommonAlignment, CostKind);
      }
      case TreeEntry::CombinedVectorize:
      case TreeEntry::NeedToGather:
      }
      return VecLdCost + CommonCost;
    if (E->State == TreeEntry::ScatterVectorize)
      PointerOps[I] = cast<LoadInst>(V)->getPointerOperand();
    return Cost + GetGEPCostDiff(PointerOps, LI0->getPointerOperand());
  }
  case Instruction::Store: {
    bool IsReorder = !E->ReorderIndices.empty();
    auto GetScalarCost = [=](unsigned Idx) {
      auto *VI = cast<StoreInst>(VL[Idx]);
                                  VI->getAlign(), VI->getPointerAddressSpace(),
    };
        cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
      if (E->State == TreeEntry::StridedVectorize) {
        Align CommonAlignment =
            computeCommonAlignment<StoreInst>(UniqueValues.getArrayRef());
            Instruction::Store, VecTy, BaseSI->getPointerOperand(),
            false, CommonAlignment, CostKind);
      } else {
        assert(E->State == TreeEntry::Vectorize &&
               "Expected either strided or consecutive stores.");
        if (unsigned Factor = E->getInterleaveFactor()) {
          assert(E->ReuseShuffleIndices.empty() && !E->ReorderIndices.empty() &&
                 "No reused shuffles expected");
              Instruction::Store, VecTy, Factor, std::nullopt,
              BaseSI->getAlign(), BaseSI->getPointerAddressSpace(), CostKind);
        } else {
              Instruction::Store, VecTy, BaseSI->getAlign(),
              BaseSI->getPointerAddressSpace(), CostKind, OpInfo);
        }
      }
      return VecStCost + CommonCost;
      unsigned Idx = IsReorder ? E->ReorderIndices[I] : I;
      PointerOps[Idx] = cast<StoreInst>(V)->getPointerOperand();
    return GetCostDiff(GetScalarCost, GetVectorCost) +
           GetGEPCostDiff(PointerOps, BaseSI->getPointerOperand());
  }
  case Instruction::Call: {
    auto GetScalarCost = [&](unsigned Idx) {
      auto *CI = cast<CallInst>(UniqueValues[Idx]);
    };
      auto *CI = cast<CallInst>(VL0);
                         It != MinBWs.end() ? It->second.first : 0, TTI);
      return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::ShuffleVector: {
    if (!SLPReVec || E->isAltShuffle())
      assert(E->isAltShuffle() &&
              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");
    auto TryFindNodeWithEqualOperands = [=]() {
      for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
        if (TE->hasState() && TE->isAltShuffle() &&
            ((TE->getOpcode() == E->getOpcode() &&
              TE->getAltOpcode() == E->getAltOpcode()) ||
             (TE->getOpcode() == E->getAltOpcode() &&
              TE->getAltOpcode() == E->getOpcode())) &&
            TE->hasEqualOperands(*E))
      }
    };
    auto GetScalarCost = [&](unsigned Idx) {
      if (isa<PoisonValue>(UniqueValues[Idx]))
      auto *VI = cast<Instruction>(UniqueValues[Idx]);
      assert(E->isOpcodeOrAlt(VI) && "Unexpected main/alternate opcode");
    };
      if (TryFindNodeWithEqualOperands()) {
        LLVM_DEBUG({
          dbgs() << "SLP: diamond match for alternate node found.\n";
        });
            TTIRef.getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
            TTIRef.getArithmeticInstrCost(E->getAltOpcode(), VecTy, CostKind);
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        VecCost = TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy, CI0->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
        VecCost += TTIRef.getCmpSelInstrCost(
            E->getOpcode(), VecTy, MaskTy,
            cast<CmpInst>(E->getAltOp())->getPredicate(), CostKind,
            {TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None},
      } else {
        Type *SrcSclTy = E->getMainOp()->getOperand(0)->getType();
          auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
              DL->getTypeSizeInBits(E->getMainOp()->getOperand(0)->getType());
          if (SrcIt != MinBWs.end()) {
            SrcBWSz = SrcIt->second.first;
          }
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
                  TTIRef.getCastInstrCost(Instruction::Trunc, VecTy, SrcTy,
                << "SLP: alternate extension, which should be truncated.\n";
          }
        VecCost = TTIRef.getCastInstrCost(E->getOpcode(), VecTy, SrcTy,
            TTIRef.getCastInstrCost(E->getAltOpcode(), VecTy, SrcTy,
      }
      E->buildAltOpShuffleMask(
            assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
      unsigned Opcode0 = E->getOpcode();
      unsigned Opcode1 = E->getAltOpcode();
      if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
            VecTy, Opcode0, Opcode1, OpcodeMask, CostKind);
        return AltVecCost < VecCost ? AltVecCost : VecCost;
      }
    if (SLPReVec && !E->isAltShuffle())
      return GetCostDiff(
                   "Not supported shufflevector usage.");
            auto *SV = cast<ShuffleVectorInst>(VL.front());
            unsigned SVNumElements =
                cast<FixedVectorType>(SV->getOperand(0)->getType())
                    ->getNumElements();
            unsigned GroupSize = SVNumElements / SV->getShuffleMask().size();
            for (size_t I = 0, End = VL.size(); I != End; I += GroupSize) {
              assert(isa<ShuffleVectorInst>(V) &&
                     "Not supported shufflevector usage.");
              auto *SV = cast<ShuffleVectorInst>(V);
              [[maybe_unused]] bool IsExtractSubvectorMask =
                  SV->isExtractSubvectorMask(Index);
              assert(IsExtractSubvectorMask &&
                     "Not supported shufflevector usage.");
              if (NextIndex != Index)
              NextIndex += SV->getShuffleMask().size();
            }
            return ::getShuffleCost(
    return GetCostDiff(GetScalarCost, GetVectorCost);
  }
  case Instruction::Freeze:
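// A tiny tree (one or two entries) counts as fully vectorizable when its
// root is a real vectorized node and any companion node is a gather of
// extractelements or loads that the shuffle lowering can absorb.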
bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const {
                    << VectorizableTree.size() << " is fully vectorizable .\n");
  auto &&AreVectorizableGathers = [this](const TreeEntry *TE, unsigned Limit) {
    return TE->isGather() &&
                   [this](Value *V) { return EphValues.contains(V); }) &&
            TE->Scalars.size() < Limit ||
            (((TE->hasState() &&
               TE->getOpcode() == Instruction::ExtractElement) ||
              all_of(TE->Scalars, IsaPred<ExtractElementInst, UndefValue>)) &&
            (TE->hasState() && TE->getOpcode() == Instruction::Load &&
             !TE->isAltShuffle()) ||
            any_of(TE->Scalars, IsaPred<LoadInst>));
  };
  if (VectorizableTree.size() == 1 &&
      (VectorizableTree[0]->State == TreeEntry::Vectorize ||
       VectorizableTree[0]->State == TreeEntry::StridedVectorize ||
        AreVectorizableGathers(VectorizableTree[0].get(),
                               VectorizableTree[0]->Scalars.size()) &&
        VectorizableTree[0]->getVectorFactor() > 2)))
  if (VectorizableTree.size() != 2)
  if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
      AreVectorizableGathers(VectorizableTree[1].get(),
                             VectorizableTree[0]->Scalars.size()))
  if (VectorizableTree[0]->isGather() ||
      (VectorizableTree[1]->isGather() &&
       VectorizableTree[0]->State != TreeEntry::ScatterVectorize &&
       VectorizableTree[0]->State != TreeEntry::StridedVectorize))
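// Walks an or/shift chain of zext'ed loads upward from Root to detect
// patterns the backend's load combining already handles; vectorizing such
// chains would only pessimize the scalar code.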
                                       bool MustMatchOrInst) {
  Value *ZextLoad = Root;
  const APInt *ShAmtC;
  bool FoundOr = false;
  while (!isa<ConstantExpr>(ZextLoad) &&
                        ShAmtC->urem(8) == 0))) {
    auto *BinOp = cast<BinaryOperator>(ZextLoad);
    ZextLoad = BinOp->getOperand(0);
    if (BinOp->getOpcode() == Instruction::Or)
  }
  if ((MustMatchOrInst && !FoundOr) || ZextLoad == Root ||
  Type *SrcTy = Load->getType();
  LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
                    << *(cast<Instruction>(Root)) << "\n");

  unsigned NumElts = VectorizableTree[0]->Scalars.size();
  Value *FirstReduced = VectorizableTree[0]->Scalars[0];

  unsigned NumElts = Stores.size();
  for (Value *Scalar : Stores) {
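// Tree-size heuristics: empty trees, insertelement roots over trivial
// gathers, and extract/PHI-only graphs are rejected before the full cost
// model runs.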
  if (VectorizableTree.empty()) {
    assert(ExternalUses.empty() && "We shouldn't have any external users");
  }
  if (VectorizableTree.size() == 2 &&
      isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
      VectorizableTree[1]->isGather() &&
      (VectorizableTree[1]->getVectorFactor() <= 2 ||
       !(isSplat(VectorizableTree[1]->Scalars) ||
  constexpr int Limit = 4;
      !VectorizableTree.empty() &&
      all_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return (TE->isGather() &&
                (!TE->hasState() ||
                 TE->getOpcode() != Instruction::ExtractElement) &&
                count_if(TE->Scalars, IsaPred<ExtractElementInst>) <= Limit) ||
               (TE->hasState() && TE->getOpcode() == Instruction::PHI);
      }))
  if (isFullyVectorizableTinyTree(ForReduction))
  bool IsAllowedSingleBVNode =
      VectorizableTree.size() > 1 ||
      (VectorizableTree.size() == 1 && VectorizableTree.front()->hasState() &&
       !VectorizableTree.front()->isAltShuffle() &&
       VectorizableTree.front()->getOpcode() != Instruction::PHI &&
       VectorizableTree.front()->getOpcode() != Instruction::GetElementPtr &&
  if (any_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
        return TE->isGather() && all_of(TE->Scalars, [&](Value *V) {
          return isa<ExtractElementInst, UndefValue>(V) ||
                 (IsAllowedSingleBVNode &&
                  !V->hasNUsesOrMore(UsesLimit) &&
                  any_of(V->users(), IsaPred<InsertElementInst>));
        });
      }))
  if (VectorizableTree.back()->isGather() &&
      VectorizableTree.back()->hasState() &&
      VectorizableTree.back()->isAltShuffle() &&
      VectorizableTree.back()->getVectorFactor() > 2 &&
      !VectorizableTree.back()->Scalars.front()->getType()->isVectorTy() &&
          getWidenedType(VectorizableTree.back()->Scalars.front()->getType(),
                         VectorizableTree.back()->getVectorFactor()),
  constexpr unsigned SmallTree = 3;
  if (VectorizableTree.front()->isNonPowOf2Vec() &&
          [](const std::unique_ptr<TreeEntry> &TE) {
            return TE->isGather() && TE->hasState() &&
                   TE->getOpcode() == Instruction::Load &&
    TreeEntry &E = *VectorizableTree[Idx];
    if (E.hasState() && E.getOpcode() != Instruction::Load)

  unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
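// Spill-cost estimation: walk the vectorized entries in dominance (DFS)
// order, maintain the set of tree values live across each instruction, and
// charge a spill/fill for values that stay live over a real call (calls
// foldable to cheap intrinsics are exempted via NoCallIntrinsic).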
  for (const auto &TEPtr : VectorizableTree) {
    if (TEPtr->State != TreeEntry::Vectorize)
    Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
  }
    auto *NodeA = DT->getNode(A->getParent());
    auto *NodeB = DT->getNode(B->getParent());
    assert(NodeA && "Should only process reachable instructions");
    assert(NodeB && "Should only process reachable instructions");
    assert((NodeA == NodeB) == (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    if (NodeA != NodeB)
      return NodeA->getDFSNumIn() > NodeB->getDFSNumIn();
    return B->comesBefore(A);

    LiveValues.erase(PrevInst);
    for (auto &J : PrevInst->operands()) {
      if (isa<Instruction>(&*J) && getTreeEntry(&*J))
        LiveValues.insert(cast<Instruction>(&*J));
    }
      dbgs() << "SLP: #LV: " << LiveValues.size();
      for (auto *X : LiveValues)
        dbgs() << " " << X->getName();
      dbgs() << ", Looking at ";
    unsigned NumCalls = 0;
    while (InstIt != PrevInstIt) {
      if (PrevInstIt == PrevInst->getParent()->rend()) {
        PrevInstIt = Inst->getParent()->rbegin();
      }
      if (auto *II = dyn_cast<IntrinsicInst>(I)) {
        if (II->isAssumeLikeIntrinsic())
        for (auto &ArgOp : II->args())
          Tys.push_back(ArgOp->getType());
        if (auto *FPMO = dyn_cast<FPMathOperator>(II))
          FMF = FPMO->getFastMathFlags();
        if (IntrCost < CallCost)
      }
      if (isa<CallBase>(&*PrevInstIt) && !NoCallIntrinsic(&*PrevInstIt) &&
          &*PrevInstIt != PrevInst)
    }
    for (auto *II : LiveValues) {
      auto *ScalarTy = II->getType();
      if (auto *VectorTy = dyn_cast<FixedVectorType>(ScalarTy))
        ScalarTy = VectorTy->getElementType();
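// Walk two insertelement chains in lockstep through their base operands to
// decide whether they originate from the same buildvector sequence.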
  const auto *I1 = IE1;
  const auto *I2 = IE2;
    if (I1 && (I1 == IE1 || I1->hasOneUse()) &&
      I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
    if (I2 && ((I2 == IE2 || I2->hasOneUse())) &&
      I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
  } while ((I1 && PrevI1 != I1) || (I2 && PrevI2 != I2));
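// ValueSelect lets the shuffle-action template below be instantiated either
// for IR values (Value *) or for cost-model tokens: get<U> returns the value
// itself in the first case and a default-constructed U otherwise.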
struct ValueSelect {
  template <typename U>
  static std::enable_if_t<std::is_same_v<Value *, U>, Value *> get(Value *V) {
    return V;
  }
  template <typename U>
  static std::enable_if_t<!std::is_same_v<Value *, U>, U> get(Value *) {
    return U();
  }
template <typename T>
  assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
  auto VMIt = std::next(ShuffleMask.begin());
      buildUseMask(Mask.size(), Mask, UseMask::UndefsAsMask);
  if (!IsBaseUndef.all()) {
    std::pair<T *, bool> Res =
        ResizeAction(ShuffleMask.begin()->first, Mask, false);
    for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
        Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF;
    }
    auto *V = ValueSelect::get<T *>(Base);
    assert((!V || GetVF(V) == Mask.size()) &&
           "Expected base vector of VF number of elements.");
    Prev = Action(Mask, {nullptr, Res.first});
  } else if (ShuffleMask.size() == 1) {
    std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask,
    Prev = Action(Mask, {ShuffleMask.begin()->first});
  } else {
    unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
    unsigned Vec2VF = GetVF(VMIt->first);
    if (Vec1VF == Vec2VF) {
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
          Mask[I] = SecMask[I] + Vec1VF;
      }
      Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
    } else {
      std::pair<T *, bool> Res1 = ResizeAction(ShuffleMask.begin()->first, Mask,
      std::pair<T *, bool> Res2 =
          ResizeAction(VMIt->first, VMIt->second, false);
      for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
          Mask[I] = (Res2.second ? I : SecMask[I]) + VF;
      }
      Prev = Action(Mask, {Res1.first, Res2.first});
    }
    VMIt = std::next(VMIt);
  }
  bool IsBaseNotUndef = !IsBaseUndef.all();
  (void)IsBaseNotUndef;
  for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
    std::pair<T *, bool> Res =
        ResizeAction(VMIt->first, VMIt->second, false);
    for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
               "Multiple uses of scalars.");
        Mask[I] = (Res.second ? I : SecMask[I]) + VF;
    }
    Prev = Action(Mask, {Prev, Res.first});
  }
template <typename T> struct ShuffledInsertData {
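// Total tree cost: sum the per-entry costs, add the extract cost for every
// external use of a vectorized scalar, and subtract the cost of the
// insertelement sequences that the vectorized tree replaces outright.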
                    << VectorizableTree.size() << ".\n");

  unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
  for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
    TreeEntry &TE = *VectorizableTree[I];
    if (TE.State == TreeEntry::CombinedVectorize) {
          dbgs() << "SLP: Skipping cost for combined node that starts with "
                 << *TE.Scalars[0] << ".\n";
          TE.dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
    }
    if (TE.isGather() && TE.hasState()) {
      if (const TreeEntry *E = getTreeEntry(TE.getMainOp());
          E && E->getVectorFactor() == TE.getVectorFactor() &&
          E->isSame(TE.Scalars)) {
                   << "SLP: Current total cost = " << Cost << "\n");
      }
    }
    assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) &&
           "Expected gather nodes with users only.");
               << "SLP: Current total cost = " << Cost << "\n");
  }

  std::optional<DenseMap<Value *, unsigned>> ValueToExtUses;
  for (ExternalUser &EU : ExternalUses) {
    ScalarUserAndIdx.emplace_back(EU.Scalar, EU.User, EU.Lane);
  }
  for (ExternalUser &EU : ExternalUses) {
    if (EphValues.count(EU.User))
        EU.User ? cast<Instruction>(EU.User)->getParent() : nullptr;
         isa_and_present<UnreachableInst>(UserParent->getTerminator())))
    if (!isa_and_nonnull<InsertElementInst>(EU.User) &&
        !ExtractCostCalculated.insert(EU.Scalar).second)
    if (isa<FixedVectorType>(EU.Scalar->getType()))
    if (auto *VU = dyn_cast_or_null<InsertElementInst>(EU.User);
      if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
        if (!UsedInserts.insert(VU).second)
        const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
            [this, VU](const ShuffledInsertData<const TreeEntry *> &Data) {
              Value *Op0 = II->getOperand(0);
              if (getTreeEntry(II) && !getTreeEntry(Op0))
        if (It == ShuffledInserts.end()) {
          Data.InsertElements.emplace_back(VU);
          VecId = ShuffledInserts.size() - 1;
          auto It = MinBWs.find(ScalarTE);
          if (It != MinBWs.end() &&
                  .insert(std::make_pair(ScalarTE, FTy->getElementType()))
            unsigned BWSz = It->second.first;
            unsigned DstBWSz = DL->getTypeSizeInBits(FTy->getElementType());
            unsigned VecOpcode;
            if (DstBWSz < BWSz)
              VecOpcode = Instruction::Trunc;
                It->second.second ? Instruction::SExt : Instruction::ZExt;
                             FTy->getNumElements()),
                << " for extending externally used vector with "
                   "non-equal minimum bitwidth.\n");
        } else {
          It->InsertElements.front() = VU;
          VecId = std::distance(ShuffledInserts.begin(), It);
        }
        int InIdx = *InsertIdx;
            ShuffledInserts[VecId].ValueMasks[ScalarTE];
        Mask[InIdx] = EU.Lane;
        DemandedElts[VecId].setBit(InIdx);
      }
    }
    auto *VecTy = getWidenedType(EU.Scalar->getType(), BundleWidth);
    const TreeEntry *Entry = getTreeEntry(EU.Scalar);
    auto It = MinBWs.find(Entry);
    if (It != MinBWs.end()) {
                            ? Instruction::ZExt
                            : Instruction::SExt;
    }
          EU.Lane, EU.Scalar, ScalarUserAndIdx);
    if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
        Entry->getOpcode() == Instruction::Load) {
      auto IsPhiInLoop = [&](const ExternalUser &U) {
        if (auto *Phi = dyn_cast_if_present<PHINode>(U.User)) {
          auto *I = cast<Instruction>(U.Scalar);
          const Loop *L = LI->getLoopFor(Phi->getParent());
          return L && (Phi->getParent() == I->getParent() ||
                       L == LI->getLoopFor(I->getParent()));
        }
      };
      if (!ValueToExtUses) {
        ValueToExtUses.emplace();
          if (IsPhiInLoop(P.value()))
          ValueToExtUses->try_emplace(P.value().Scalar, P.index());
      }
      auto *Inst = cast<Instruction>(EU.Scalar);
      auto OperandIsScalar = [&](Value *V) {
        if (!getTreeEntry(V)) {
          if (auto *EE = dyn_cast<ExtractElementInst>(V))
            return !EE->hasOneUse() || !MustGather.contains(EE);
        }
        return ValueToExtUses->contains(V);
      };
      bool CanBeUsedAsScalar = all_of(Inst->operands(), OperandIsScalar);
      bool CanBeUsedAsScalarCast = false;
      if (auto *CI = dyn_cast<CastInst>(Inst); CI && !CanBeUsedAsScalar) {
        if (auto *Op = dyn_cast<Instruction>(CI->getOperand(0));
            Op && all_of(Op->operands(), OperandIsScalar)) {
              (getTreeEntry(Op) && !ValueToExtUses->contains(Op))
          if (ScalarCost + OpCost <= ExtraCost) {
            CanBeUsedAsScalar = CanBeUsedAsScalarCast = true;
            ScalarCost += OpCost;
          }
        }
      }
      if (CanBeUsedAsScalar) {
        bool KeepScalar = ScalarCost <= ExtraCost;
        bool IsProfitablePHIUser =
              VectorizableTree.front()->Scalars.size() > 2)) &&
            VectorizableTree.front()->getOpcode() == Instruction::PHI &&
              auto *PHIUser = dyn_cast<PHINode>(U);
              return (!PHIUser ||
                      PHIUser->getParent() !=
                          VectorizableTree.front()->getMainOp())
              return ValueToExtUses->contains(V);
        if (IsProfitablePHIUser) {
                   (!GatheredLoadsEntriesFirst.has_value() ||
                    Entry->Idx < *GatheredLoadsEntriesFirst)) {
          unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) {
            return ValueToExtUses->contains(V);
          });
          auto It = ExtractsCount.find(Entry);
          if (It != ExtractsCount.end()) {
            assert(ScalarUsesCount >= It->getSecond().size() &&
                   "Expected total number of external uses not less than "
                   "number of scalar uses.");
            ScalarUsesCount -= It->getSecond().size();
          }
          KeepScalar = ScalarUsesCount <= 1 || !has_single_bit(ScalarUsesCount);
        }
          ExternalUsesAsOriginalScalar.insert(EU.Scalar);
            auto It = ValueToExtUses->find(V);
            if (It != ValueToExtUses->end()) {
              ExternalUses[It->second].User = nullptr;
            }
          ExtraCost = ScalarCost;
          if (!IsPhiInLoop(EU))
            ExtractsCount[Entry].insert(Inst);
          if (CanBeUsedAsScalarCast) {
            ScalarOpsFromCasts.insert(Inst->getOperand(0));
            if (auto *IOp = dyn_cast<Instruction>(Inst->getOperand(0))) {
                auto It = ValueToExtUses->find(V);
                if (It != ValueToExtUses->end()) {
                  ExternalUses[It->second].User = nullptr;
                }
            }
          }
      }
    }
    ExtractCost += ExtraCost;
  }
  for (Value *V : ScalarOpsFromCasts) {
    ExternalUsesAsOriginalScalar.insert(V);
    if (const TreeEntry *E = getTreeEntry(V)) {
      ExternalUses.emplace_back(V, nullptr, E->findLaneForValue(V));
    }
  }
  if (!VectorizedVals.empty()) {
    const TreeEntry &Root = *VectorizableTree.front();
    auto BWIt = MinBWs.find(&Root);
    if (BWIt != MinBWs.end()) {
      Type *DstTy = Root.Scalars.front()->getType();
          ReductionBitWidth == 0 ? BWIt->second.first : ReductionBitWidth;
      if (OriginalSz != SrcSz) {
        unsigned Opcode = Instruction::Trunc;
        if (OriginalSz > SrcSz)
          Opcode = BWIt->second.second ? Instruction::SExt : Instruction::ZExt;
        if (auto *VecTy = dyn_cast<FixedVectorType>(DstTy)) {
        }
      }
    }
  }

  Cost += SpillCost + ExtractCost;
    unsigned VF = Mask.size();
    unsigned VecVF = TE->getVectorFactor();
        (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
      std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
        dbgs() << "SLP: Adding cost " << C
               << " for final shuffle of insertelement external users.\n";
        TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
      return std::make_pair(TE, true);
    }
    return std::make_pair(TE, false);
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    Value *Base = ShuffledInserts[I].InsertElements.front()->getOperand(0);
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
      assert((TEs.size() == 1 || TEs.size() == 2) &&
             "Expected exactly 1 or 2 tree entries.");
      if (TEs.size() == 1) {
          VF = TEs.front()->getVectorFactor();
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
              (Data.index() < VF &&
               static_cast<int>(Data.index()) == Data.value());
                     << " for final shuffle of insertelement "
                        "external users.\n";
                     TEs.front()->dump();
                     dbgs() << "SLP: Current total cost = " << Cost << "\n");
      } else {
            TEs.front()->getVectorFactor() == TEs.back()->getVectorFactor())
          VF = TEs.front()->getVectorFactor();
        auto *FTy = getWidenedType(TEs.back()->Scalars.front()->getType(), VF);
                     << " for final shuffle of vector node and external "
                        "insertelement users.\n";
                     if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
                     dbgs() << "SLP: Current total cost = " << Cost << "\n");
      }
    (void)performExtractsShuffleAction<const TreeEntry>(
        [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
        EstimateShufflesCost);
        cast<FixedVectorType>(
            ShuffledInserts[I].InsertElements.front()->getType()),
    Cost -= InsertCost;
  }
  if (ReductionBitWidth != 0) {
    assert(UserIgnoreList && "Expected reduction tree.");
    const TreeEntry &E = *VectorizableTree.front();
    auto It = MinBWs.find(&E);
    if (It != MinBWs.end() && It->second.first != ReductionBitWidth) {
      unsigned SrcSize = It->second.first;
      unsigned DstSize = ReductionBitWidth;
      unsigned Opcode = Instruction::Trunc;
      if (SrcSize < DstSize) {
        bool IsArithmeticExtendedReduction =
              auto *I = cast<Instruction>(V);
              return is_contained({Instruction::Add, Instruction::FAdd,
                                   Instruction::Mul, Instruction::FMul,
                                   Instruction::And, Instruction::Or,
        if (IsArithmeticExtendedReduction)
              Instruction::BitCast;
        else
          Opcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      }
      if (Opcode != Instruction::BitCast) {
            getWidenedType(Builder.getIntNTy(SrcSize), E.getVectorFactor());
            getWidenedType(Builder.getIntNTy(DstSize), E.getVectorFactor());
        switch (E.getOpcode()) {
        case Instruction::SExt:
        case Instruction::ZExt:
        case Instruction::Trunc: {
          const TreeEntry *OpTE = getOperandEntry(&E, 0);
          CCH = getCastContextHint(*OpTE);
        }
        }
                   << " for final resize for reduction from " << SrcVecTy
                   << " to " << DstVecTy << "\n";
                   dbgs() << "SLP: Current total cost = " << Cost << "\n");
      }
    }
  }

    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
       << "SLP: Extract Cost = " << ExtractCost << ".\n"
       << "SLP: Total Cost = " << Cost << ".\n";
    ViewGraph(this, "SLP" + F->getName(), false, Str);
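// Try to represent a single-register gather as one or two shuffles of the
// source vectors feeding its extractelement operands, counting undefs
// toward the best single-vector or vector-pair coverage.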
std::optional<TTI::ShuffleKind>
BoUpSLP::tryToGatherSingleRegisterExtractElements(
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
      if (isa<UndefValue>(VL[I]))
    auto *VecTy = dyn_cast<FixedVectorType>(EI->getVectorOperandType());
    if (!VecTy || !isa<ConstantInt, UndefValue>(EI->getIndexOperand()))
      ExtractMask.reset(*Idx);
    VectorOpToIdx[EI->getVectorOperand()].push_back(I);
  }
  stable_sort(Vectors, [](const auto &P1, const auto &P2) {
    return P1.second.size() > P2.second.size();
  });
  const int UndefSz = UndefVectorExtracts.size();
  unsigned SingleMax = 0;
  unsigned PairMax = 0;
  if (!Vectors.empty()) {
    SingleMax = Vectors.front().second.size() + UndefSz;
    if (Vectors.size() > 1) {
      auto *ItNext = std::next(Vectors.begin());
      PairMax = SingleMax + ItNext->second.size();
    }
  }
  if (SingleMax == 0 && PairMax == 0 && UndefSz == 0)
    return std::nullopt;
  if (SingleMax >= PairMax && SingleMax) {
    for (int Idx : Vectors.front().second)
  } else if (!Vectors.empty()) {
    for (unsigned Idx : {0, 1})
      for (int Idx : Vectors[Idx].second)
  }
  for (int Idx : UndefVectorExtracts)
  std::optional<TTI::ShuffleKind> Res =
    return std::nullopt;
  for (int I = 0, E = GatheredExtracts.size(); I < E; ++I) {
    if (Mask[I] == PoisonMaskElem && !isa<PoisonValue>(GatheredExtracts[I]) &&
        isa<UndefValue>(GatheredExtracts[I])) {
    }
    auto *EI = dyn_cast<ExtractElementInst>(VL[I]);
    if (!EI || !isa<FixedVectorType>(EI->getVectorOperandType()) ||
        !isa<ConstantInt, UndefValue>(EI->getIndexOperand()) ||
                                    unsigned NumParts) const {
  assert(NumParts > 0 && "NumParts expected be greater than or equal to 1.");
  for (unsigned Part : seq<unsigned>(NumParts)) {
    std::optional<TTI::ShuffleKind> Res =
        tryToGatherSingleRegisterExtractElements(SubVL, SubMask);
    ShufflesRes[Part] = Res;
    copy(SubMask, std::next(Mask.begin(), Part * SliceSize));
  }
  if (none_of(ShufflesRes, [](const std::optional<TTI::ShuffleKind> &Res) {
        return Res.has_value();
      }))
    ShufflesRes.clear();
  return ShufflesRes;
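// For one register-sized part of a gather, search earlier tree entries that
// already contain the requested scalars and can legally (by dominance and
// scheduling order) be reused as shuffle sources.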
std::optional<TargetTransformInfo::ShuffleKind>
BoUpSLP::isGatherShuffledSingleRegisterEntry(
  const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get()
                                ? EdgeInfo(const_cast<TreeEntry *>(TE), 0)
                                : TE->UserTreeIndices.front();
  const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE);
  if (auto *PHI = dyn_cast<PHINode>(TEUseEI.UserTE->getMainOp())) {
    TEInsertBlock = PHI->getIncomingBlock(TEUseEI.EdgeIdx);
  } else {
    TEInsertBlock = TEInsertPt->getParent();
  }
    return std::nullopt;
  auto *NodeUI = DT->getNode(TEInsertBlock);
  assert(NodeUI && "Should only process reachable instructions");
  auto CheckOrdering = [&](const Instruction *InsertPt) {
    auto *NodeEUI = DT->getNode(InsertBlock);
    assert((NodeUI == NodeEUI) ==
               (NodeUI->getDFSNumIn() == NodeEUI->getDFSNumIn()) &&
           "Different nodes should have different DFS numbers");
    if (TEInsertPt->getParent() != InsertBlock &&
    if (TEInsertPt->getParent() == InsertBlock &&
  };
  for (Value *V : VL) {
    for (const TreeEntry *TEPtr : ValueToGatherNodes.find(V)->second) {
      if (TEPtr == TE || TEPtr->Idx == 0)
                    [&](Value *V) { return GatheredScalars.contains(V); }) &&
             "Must contain at least single gathered value.");
      assert(TEPtr->UserTreeIndices.size() == 1 &&
             "Expected only single user of a gather node.");
      const EdgeInfo &UseEI = TEPtr->UserTreeIndices.front();
      PHINode *UserPHI = dyn_cast<PHINode>(UseEI.UserTE->getMainOp());
              : &getLastInstructionInBundle(UseEI.UserTE);
      if (TEInsertPt == InsertPt) {
        if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx)
        if (TEUseEI.UserTE != UseEI.UserTE &&
            TEUseEI.UserTE->Idx < UseEI.UserTE->Idx)
      }
      if ((TEInsertBlock != InsertPt->getParent() ||
           TEUseEI.EdgeIdx < UseEI.EdgeIdx || TEUseEI.UserTE != UseEI.UserTE) &&
          !CheckOrdering(InsertPt))
    }
    if (const TreeEntry *VTE = getTreeEntry(V)) {
      if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst.value_or(0)) {
        if (VTE->State != TreeEntry::Vectorize) {
          auto It = MultiNodeScalars.find(V);
          if (It == MultiNodeScalars.end())
          VTE = *It->getSecond().begin();
          auto *MIt = find_if(It->getSecond(), [](const TreeEntry *MTE) {
            return MTE->State == TreeEntry::Vectorize;
          });
          if (MIt == It->getSecond().end())
        }
      }
      if (none_of(TE->CombinedEntriesWithIndices,
                  [&](const auto &P) { return P.first == VTE->Idx; })) {
        Instruction &LastBundleInst = getLastInstructionInBundle(VTE);
        if (&LastBundleInst == TEInsertPt || !CheckOrdering(&LastBundleInst))
      }
    }
    if (VToTEs.empty())
    if (UsedTEs.empty()) {
    if (!VToTEs.empty()) {
      VToTEs = SavedVToTEs;
    }
    if (UsedTEs.size() == 2)
      UsedTEs.push_back(SavedVToTEs);
  }
  if (UsedTEs.empty()) {
    return std::nullopt;
  }
  if (UsedTEs.size() == 1) {
                                            UsedTEs.front().end());
    sort(FirstEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    auto *It = find_if(FirstEntries, [=](const TreeEntry *EntryPtr) {
      return EntryPtr->isSame(VL) || EntryPtr->isSame(TE->Scalars);
    });
    if (It != FirstEntries.end() &&
        ((*It)->getVectorFactor() == VL.size() ||
         ((*It)->getVectorFactor() == TE->Scalars.size() &&
          TE->ReuseShuffleIndices.size() == VL.size() &&
          (*It)->isSame(TE->Scalars)))) {
      Entries.push_back(*It);
      if ((*It)->getVectorFactor() == VL.size()) {
        std::iota(std::next(Mask.begin(), Part * VL.size()),
                  std::next(Mask.begin(), (Part + 1) * VL.size()), 0);
      }
      for (unsigned I : seq<unsigned>(VL.size()))
        if (isa<PoisonValue>(VL[I]))
    }
    Entries.push_back(FirstEntries.front());
    VF = FirstEntries.front()->getVectorFactor();
  } else {
    assert(UsedTEs.size() == 2 && "Expected at max 2 permuted entries.");
    for (const TreeEntry *TE : UsedTEs.front()) {
      unsigned VF = TE->getVectorFactor();
      auto It = VFToTE.find(VF);
      if (It != VFToTE.end()) {
        if (It->second->Idx > TE->Idx)
          It->getSecond() = TE;
      }
    }
                                             UsedTEs.back().end());
    sort(SecondEntries, [](const TreeEntry *TE1, const TreeEntry *TE2) {
      return TE1->Idx < TE2->Idx;
    });
    for (const TreeEntry *TE : SecondEntries) {
      auto It = VFToTE.find(TE->getVectorFactor());
      if (It != VFToTE.end()) {
        Entries.push_back(It->second);
        Entries.push_back(TE);
      }
    }
    if (Entries.empty()) {
          UsedTEs.front(), [](const TreeEntry *TE1, const TreeEntry *TE2) {
            return TE1->Idx < TE2->Idx;
          });
      Entries.push_back(SecondEntries.front());
      VF = std::max(Entries.front()->getVectorFactor(),
                    Entries.back()->getVectorFactor());
    } else {
      VF = Entries.front()->getVectorFactor();
    }
  }
  bool IsSplatOrUndefs = isSplat(VL) || all_of(VL, IsaPred<UndefValue>);
  auto AreCompatiblePHIs = [&](Value *V, Value *V1) {
    auto *PHI = cast<PHINode>(V);
    auto *PHI1 = cast<PHINode>(V1);
    for (int I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
      Value *In1 = PHI1->getIncomingValue(I);
      if (cast<Instruction>(In)->getParent() !=
    }
  };
  auto MightBeIgnored = [=](Value *V) {
    auto *I = dyn_cast<Instruction>(V);
    return I && !IsSplatOrUndefs && !ScalarToTreeEntry.count(I) &&
           !areAllUsersVectorized(I, UserIgnoreList) && isSimple(I);
  };
  auto NeighborMightBeIgnored = [&](Value *V, int Idx) {
    bool UsedInSameVTE = false;
    auto It = UsedValuesEntry.find(V1);
    if (It != UsedValuesEntry.end())
      UsedInSameVTE = It->second == UsedValuesEntry.find(V)->second;
    return V != V1 && MightBeIgnored(V1) && !UsedInSameVTE &&
           cast<Instruction>(V)->getParent() ==
               cast<Instruction>(V1)->getParent() &&
           (!isa<PHINode>(V1) || AreCompatiblePHIs(V, V1));
  };
  for (int I = 0, E = VL.size(); I < E; ++I) {
    auto It = UsedValuesEntry.find(V);
    if (It == UsedValuesEntry.end())
         ((I > 0 && NeighborMightBeIgnored(V, I - 1)) ||
          (I != E - 1 && NeighborMightBeIgnored(V, I + 1)))))
    unsigned Idx = It->second;
  }
  for (unsigned I = 0, Sz = Entries.size(); I < Sz; ++I) {
    if (!UsedIdxs.test(I))
    for (std::pair<unsigned, int> &Pair : EntryLanes)
      if (Pair.first == I)
        Pair.first = TempEntries.size();
  }
  Entries.swap(TempEntries);
  if (EntryLanes.size() == Entries.size() &&
          .slice(Part * VL.size(),
                 std::min<int>(VL.size(), TE->Scalars.size())))) {
    return std::nullopt;
  }
  bool IsIdentity = Entries.size() == 1;
  for (const std::pair<unsigned, int> &Pair : EntryLanes) {
    unsigned Idx = Part * VL.size() + Pair.second;
        (ForOrder ? std::distance(
                        Entries[Pair.first]->Scalars.begin(),
                        find(Entries[Pair.first]->Scalars, VL[Pair.second]))
                  : Entries[Pair.first]->findLaneForValue(VL[Pair.second]));
    IsIdentity &= Mask[Idx] == Pair.second;
  }
  if (ForOrder || IsIdentity || Entries.empty()) {
    switch (Entries.size()) {
      if (IsIdentity || EntryLanes.size() > 1 || VL.size() <= 2)
      if (EntryLanes.size() > 2 || VL.size() <= 2)
    }
  } else if (!isa<VectorType>(VL.front()->getType()) &&
             (EntryLanes.size() > Entries.size() || VL.size() <= 2)) {
                          std::next(Mask.begin(), (Part + 1) * VL.size()));
    int MinElement = SubMask.front(), MaxElement = SubMask.front();
    for (int Idx : SubMask) {
    }
    assert(MaxElement >= 0 && MinElement >= 0 &&
           MaxElement % VF >= MinElement % VF &&
           "Expected at least single element.");
    unsigned NewVF = std::max<unsigned>(
                         (MaxElement % VF) -
                             (MinElement % VF) + 1));
      Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF +
            (Idx >= static_cast<int>(VF) ? NewVF : 0);
    auto *MaskVecTy = getWidenedType(VL.front()->getType(), SubMask.size());
    auto GetShuffleCost = [&,
      if (Entries.size() == 1 && Entries.front()->getInterleaveFactor() > 0 &&
              Mask, Entries.front()->getInterleaveFactor()))
      return ::getShuffleCost(TTI,
    InstructionCost ShuffleCost = GetShuffleCost(SubMask, Entries, VecTy);
    if (Entries.size() == 1 || !Entries[0]->isGather()) {
      FirstShuffleCost = ShuffleCost;
    } else {
      bool IsIdentity = true;
        if (Idx >= static_cast<int>(NewVF)) {
        }
        IsIdentity &= static_cast<int>(I) == Idx;
      FirstShuffleCost = GetShuffleCost(FirstMask, Entries.front(), VecTy);
            MaskVecTy, DemandedElts, true,
    }
    if (Entries.size() == 1 || !Entries[1]->isGather()) {
      SecondShuffleCost = ShuffleCost;
    } else {
      bool IsIdentity = true;
        if (Idx < static_cast<int>(NewVF) && Idx >= 0) {
        }
        IsIdentity &= static_cast<int>(I) == Idx;
      SecondShuffleCost = GetShuffleCost(SecondMask, Entries[1], VecTy);
            MaskVecTy, DemandedElts, true,
    }
    const TreeEntry *BestEntry = nullptr;
    if (FirstShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx >= static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                    });
      BestEntry = Entries.front();
      ShuffleCost = FirstShuffleCost;
    }
    if (SecondShuffleCost < ShuffleCost) {
      std::for_each(std::next(Mask.begin(), Part * VL.size()),
                    std::next(Mask.begin(), (Part + 1) * VL.size()),
                    [&](int &Idx) {
                      if (Idx < static_cast<int>(VF))
                        Idx = PoisonMaskElem;
                    });
      BestEntry = Entries[1];
      ShuffleCost = SecondShuffleCost;
    }
    if (BuildVectorCost >= ShuffleCost) {
      Entries.push_back(BestEntry);
    }
  }
  std::fill(std::next(Mask.begin(), Part * VL.size()),
  return std::nullopt;
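// Per-register driver: split the gather into NumParts slices, query
// isGatherShuffledSingleRegisterEntry for each, and collapse to a single
// identity-shuffled entry when one node covers the whole bundle.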
BoUpSLP::isGatherShuffledEntry(
  assert(NumParts > 0 && NumParts < VL.size() &&
         "Expected positive number of registers.");
  if (TE == VectorizableTree.front().get() &&
      (!GatheredLoadsEntriesFirst.has_value() ||
               [](const std::unique_ptr<TreeEntry> &TE) {
                 return !TE->isGather();
  if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI))
  assert((TE->UserTreeIndices.size() == 1 ||
          TE == VectorizableTree.front().get()) &&
         "Expected only single user of the gather node.");
         "Number of scalars must be divisible by NumParts.");
  if (!TE->UserTreeIndices.empty() &&
      TE->UserTreeIndices.front().UserTE->isGather() &&
      TE->UserTreeIndices.front().EdgeIdx == UINT_MAX) {
            (TE->hasState() && TE->getOpcode() == Instruction::ExtractElement) ||
           "Expected splat or extractelements only node.");
  }
  for (unsigned Part : seq<unsigned>(NumParts)) {
    std::optional<TTI::ShuffleKind> SubRes =
        isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part,
      SubEntries.clear();
        SubEntries.front()->getVectorFactor() == VL.size() &&
        (SubEntries.front()->isSame(TE->Scalars) ||
         SubEntries.front()->isSame(VL))) {
      LocalSubEntries.swap(SubEntries);
      std::iota(Mask.begin(), Mask.end(), 0);
      for (int I = 0, Sz = VL.size(); I < Sz; ++I)
        if (isa<PoisonValue>(VL[I]))
      Entries.emplace_back(1, LocalSubEntries.front());
    }
  }
      [](const std::optional<TTI::ShuffleKind> &SK) { return !SK; })) {
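// Gather (buildvector) cost: one insert per unique non-constant scalar,
// plus a permute when a scalar is duplicated; with a poison source the
// inserts are costed per demanded element.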
return !SK; })) {
13688 Type *ScalarTy)
const {
13690 bool DuplicateNonConst =
false;
13698 auto EstimateInsertCost = [&](
unsigned I,
Value *
V) {
13699 if (
V->getType() != ScalarTy) {
13710 for (
unsigned I = 0, E = VL.
size();
I < E; ++
I) {
13713 if ((ForPoisonSrc &&
isConstant(V)) || isa<UndefValue>(V)) {
13721 EstimateInsertCost(
I, V);
13722 ShuffleMask[
I] =
I;
13726 DuplicateNonConst =
true;
13728 ShuffleMask[
I] = Res.first->second;
13730 if (ForPoisonSrc) {
13731 if (isa<FixedVectorType>(ScalarTy)) {
13737 for (
unsigned I : seq<unsigned>(VL.
size()))
13738 if (!ShuffledElements[
I])
13741 I * ScalarTyNumElements, cast<FixedVectorType>(ScalarTy));
13749 if (DuplicateNonConst)
13751 VecTy, ShuffleMask);
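// The last instruction of a bundle is resolved lazily and cached in
// EntryToLastInstruction: scheduled bundles take the scheduler's answer,
// otherwise the dominance-ordered last (or, for gathered loads, first)
// scalar instruction is used.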
Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) {
  auto &Res = EntryToLastInstruction.try_emplace(E).first->second;
  auto *Front = E->getMainOp();
  assert(((GatheredLoadsEntriesFirst.has_value() &&
           E->getOpcode() == Instruction::Load && E->isGather() &&
           E->Idx < *GatheredLoadsEntriesFirst) ||
                 [=](Value *V) -> bool {
                   if (E->getOpcode() == Instruction::GetElementPtr &&
                       !isa<GetElementPtrInst>(V))
                   auto *I = dyn_cast<Instruction>(V);
                   return !I || !E->isOpcodeOrAlt(I) || I->getParent() == BB ||
                          isVectorLikeInstWithConstOps(I);
                 })) &&
         "Expected gathered loads or GEPs or instructions from same basic "
  auto FindLastInst = [&]() {
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (LastInst->getParent() == I->getParent()) {
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
              (GatheredLoadsEntriesFirst.has_value() &&
               E->getOpcode() == Instruction::Load && E->isGather() &&
               E->Idx < *GatheredLoadsEntriesFirst)) &&
             "Expected vector-like or non-GEP in GEP node insts only.");
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn())
    }
  };
  auto FindFirstInst = [&]() {
    for (Value *V : E->Scalars) {
      auto *I = dyn_cast<Instruction>(V);
      if (FirstInst->getParent() == I->getParent()) {
        if (I->comesBefore(FirstInst))
      }
      assert(((E->getOpcode() == Instruction::GetElementPtr &&
               !isa<GetElementPtrInst>(I)) ||
             "Expected vector-like or non-GEP in GEP node insts only.");
      auto *NodeB = DT->getNode(I->getParent());
      assert(NodeA && "Should only process reachable instructions");
      assert(NodeB && "Should only process reachable instructions");
      assert((NodeA == NodeB) ==
                 (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
             "Different nodes should have different DFS numbers");
      if (NodeA->getDFSNumIn() > NodeB->getDFSNumIn())
    }
  };
  if (GatheredLoadsEntriesFirst.has_value() &&
      E->Idx >= *GatheredLoadsEntriesFirst && !E->isGather() &&
      E->getOpcode() == Instruction::Load) {
    Res = FindFirstInst();
  }
  if ((E->getOpcode() == Instruction::GetElementPtr &&
         return !isa<GetElementPtrInst>(V) && isa<Instruction>(V);
         return isa<PoisonValue>(V) ||
                (!isVectorLikeInstWithConstOps(V) &&
                 isUsedOutsideBlock(V));
      (E->isGather() && E->Idx == 0 && all_of(E->Scalars, [](Value *V) {
         return isa<ExtractElementInst, UndefValue>(V) ||
                areAllOperandsNonInsts(V);
       })))
    Res = FindLastInst();
  else
    Res = FindFirstInst();
  if (BlocksSchedules.count(BB) && !E->isGather()) {
    Value *V = E->isOneOf(E->Scalars.back());
    auto *Bundle = BlocksSchedules[BB]->getScheduleData(V);
    if (Bundle && Bundle->isPartOfBundle())
      for (; Bundle; Bundle = Bundle->NextInBundle)
        Res = Bundle->Inst;
  }
  Res = FindLastInst();
  assert(Res && "Failed to find last instruction in bundle");
void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
  auto *Front = E->getMainOp();
  Instruction *LastInst = &getLastInstructionInBundle(E);
  assert(LastInst && "Failed to find last instruction in bundle");
  bool IsPHI = isa<PHINode>(LastInst);
  if (IsPHI)
    LastInstIt = LastInst->getParent()->getFirstNonPHIIt();
    Builder.SetInsertPoint(LastInst->getParent(), LastInstIt);
    Builder.SetInsertPoint(
  Builder.SetCurrentDebugLocation(Front->getDebugLoc());
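// Emit a buildvector for VL. Inserts whose placement would sink them into a
// loop or ahead of their operands are postponed; a pre-existing shuffle
// root is folded back through its operand to avoid a redundant shuffle.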
Value *BoUpSLP::gather(
    Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
    while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
      InsertBB = InsertBB->getSinglePredecessor();
    return InsertBB && InsertBB == InstBB;
  for (int I = 0, E = VL.size(); I < E; ++I) {
    if (auto *Inst = dyn_cast<Instruction>(VL[I]))
      if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
           getTreeEntry(Inst) ||
           (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) &&
          PostponedIndices.insert(I).second)
  }
  auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos,
    if (Scalar->getType() != Ty) {
      if (auto *CI = dyn_cast<CastInst>(Scalar);
          isa_and_nonnull<SExtInst, ZExtInst>(CI)) {
        if (auto *IOp = dyn_cast<Instruction>(Op);
            !IOp || !(isDeleted(IOp) || getTreeEntry(IOp)))
      }
      Scalar = Builder.CreateIntCast(
    }
    if (auto *VecTy = dyn_cast<FixedVectorType>(Scalar->getType())) {
      auto *II = dyn_cast<IntrinsicInst>(Vec);
      if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
    } else {
      Vec = Builder.CreateInsertElement(Vec, Scalar, Builder.getInt32(Pos));
      InsElt = dyn_cast<InsertElementInst>(Vec);
    }
    GatherShuffleExtractSeq.insert(InsElt);
    if (isa<Instruction>(V)) {
      if (TreeEntry *Entry = getTreeEntry(V)) {
        User *UserOp = nullptr;
          if (auto *SI = dyn_cast<Instruction>(Scalar))
          unsigned FoundLane = Entry->findLaneForValue(V);
          ExternalUses.emplace_back(V, UserOp, FoundLane);
      }
    }
  };
  std::iota(Mask.begin(), Mask.end(), 0);
  Value *OriginalRoot = Root;
  if (auto *SV = dyn_cast_or_null<ShuffleVectorInst>(Root);
      SV && isa<PoisonValue>(SV->getOperand(1)) &&
      SV->getOperand(0)->getType() == VecTy) {
    Root = SV->getOperand(0);
    Mask.assign(SV->getShuffleMask().begin(), SV->getShuffleMask().end());
  }
  for (int I = 0, E = VL.size(); I < E; ++I) {
      if (isa<PoisonValue>(VL[I]))
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  }
    if (isa<PoisonValue>(Vec)) {
      Vec = OriginalRoot;
    } else {
      Vec = CreateShuffle(Root, Vec, Mask);
      if (auto *OI = dyn_cast<Instruction>(OriginalRoot);
          OI && OI->hasNUses(0) &&
          none_of(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->VectorizedValue == OI;
          }))
    }
  for (int I : NonConsts)
    Vec = CreateInsertElement(Vec, VL[I], I, ScalarTy);
  for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
    Vec = CreateInsertElement(Vec, Pair.first, Pair.second, ScalarTy);
  bool IsFinalized = false;
  // ...
  class ShuffleIRBuilder {
    // ...
  public:
    ShuffleIRBuilder(IRBuilderBase &Builder,
                     SetVector<Instruction *> &GatherShuffleExtractSeq,
                     DenseSet<BasicBlock *> &CSEBlocks, const DataLayout &DL)
        : Builder(Builder), GatherShuffleExtractSeq(GatherShuffleExtractSeq),
          CSEBlocks(CSEBlocks), DL(DL) {}
    ~ShuffleIRBuilder() = default;
    Value *createShuffleVector(Value *V1, Value *V2, ArrayRef<int> Mask) {
      if (V1->getType() != V2->getType()) {
        assert(V1->getType()->isIntOrIntVectorTy() &&
               V2->getType()->isIntOrIntVectorTy() &&
               "Expected integer vector types only.");
        if (V1->getType() != V2->getType()) {
          if (cast<VectorType>(V2->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(V1->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            // ...
        }
      }
      // ...
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      // ...
    }
    Value *createShuffleVector(Value *V1, ArrayRef<int> Mask) {
      unsigned VF = Mask.size();
      unsigned LocalVF = cast<FixedVectorType>(V1->getType())->getNumElements();
      // ...
      if (auto *I = dyn_cast<Instruction>(Vec)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      // ...
    }
    Value *createIdentity(Value *V) { return V; }
    Value *createPoison(Type *Ty, unsigned VF) {
      // ...
    }
    void resizeToMatch(Value *&V1, Value *&V2) {
      if (V1->getType() == V2->getType())
        return;
      int V1VF = cast<FixedVectorType>(V1->getType())->getNumElements();
      int V2VF = cast<FixedVectorType>(V2->getType())->getNumElements();
      int VF = std::max(V1VF, V2VF);
      int MinVF = std::min(V1VF, V2VF);
      SmallVector<int> IdentityMask(VF, PoisonMaskElem);
      std::iota(IdentityMask.begin(), std::next(IdentityMask.begin(), MinVF),
                0);
      Value *&Op = MinVF == V1VF ? V1 : V2;
      // ...
      if (auto *I = dyn_cast<Instruction>(Op)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
      // ...
    }
  };

  Value *createShuffle(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && "Expected at least one vector value.");
    ShuffleIRBuilder ShuffleBuilder(Builder, R.GatherShuffleExtractSeq,
                                    R.CSEBlocks, *R.DL);
    return BaseShuffleAnalysis::createShuffle<Value *>(V1, V2, Mask,
                                                       ShuffleBuilder);
  }
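
  // Illustrative note on the mask convention used throughout this builder:
  // for a two-operand shuffle of <4 x i32> vectors, mask elements 0..3 select
  // lanes of the first operand and 4..7 select lanes of the second, e.g.
  //
  //   %r = shufflevector <4 x i32> %a, <4 x i32> %b,
  //        <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  //
  // yields <a0, b1, a2, b3>; PoisonMaskElem (-1) lanes produce poison.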
  void transformMaskAfterShuffle(MutableArrayRef<int> CommonMask,
                                 ArrayRef<int> Mask) {
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      // ...
  }

  Value *castToScalarTyElem(Value *V,
                            std::optional<bool> IsSigned = std::nullopt) {
    auto *VecTy = cast<VectorType>(V->getType());
    // ...
  }

public:
  ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
      : BaseShuffleAnalysis(ScalarTy), Builder(Builder), R(R) {}
  Value *adjustExtracts(const TreeEntry *E, MutableArrayRef<int> Mask,
                        ArrayRef<std::optional<TTI::ShuffleKind>> ShuffleKinds,
                        unsigned NumParts, bool &UseVecBaseAsInput) {
    UseVecBaseAsInput = false;
    SmallPtrSet<Value *, 4> UniqueBases;
    Value *VecBase = nullptr;
    SmallVector<Value *> VL(E->Scalars.begin(), E->Scalars.end());
    if (!E->ReorderIndices.empty()) {
      SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                                   E->ReorderIndices.end());
      reorderScalars(VL, ReorderMask);
    }
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      // ...
      auto *EI = cast<ExtractElementInst>(VL[I]);
      VecBase = EI->getVectorOperand();
      if (const TreeEntry *TE = R.getTreeEntry(VecBase))
        VecBase = TE->VectorizedValue;
      assert(VecBase && "Expected vectorized value.");
      UniqueBases.insert(VecBase);
      // If the only use is vectorized, the extractelement itself can be
      // deleted.
      if (!EI->hasOneUse() || R.ExternalUsesAsOriginalScalar.contains(EI) ||
          (NumParts != 1 && count(VL, EI) > 1) ||
          any_of(EI->users(), [&](User *U) {
            const TreeEntry *UTE = R.getTreeEntry(U);
            return !UTE || R.MultiNodeScalars.contains(U) ||
                   (isa<GetElementPtrInst>(U) &&
                    !R.areAllUsersVectorized(cast<Instruction>(U))) ||
                   count_if(R.VectorizableTree,
                            [&](const std::unique_ptr<TreeEntry> &TE) {
                              return any_of(TE->UserTreeIndices,
                                            [&](const EdgeInfo &Edge) {
                                              return Edge.UserTE == UTE;
                                            }) &&
                                     is_contained(VL, EI);
                            }) != 1;
          }))
        continue;
      R.eraseInstruction(EI);
    }
    if (NumParts == 1 || UniqueBases.size() == 1) {
      assert(VecBase && "Expected vectorized value.");
      return castToScalarTyElem(VecBase);
    }
    UseVecBaseAsInput = true;
    // ...
    Value *Vec = nullptr;
    SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
    for (unsigned Part : seq<unsigned>(NumParts)) {
      // ...
      constexpr int MaxBases = 2;
      SmallVector<Value *, MaxBases> Bases(MaxBases);
      auto VLMask = zip(SubVL, SubMask);
      const unsigned VF = std::accumulate(
          VLMask.begin(), VLMask.end(), 0U, [&](unsigned S, const auto &D) {
            if (std::get<1>(D) == PoisonMaskElem)
              return S;
            Value *VecOp =
                cast<ExtractElementInst>(std::get<0>(D))->getVectorOperand();
            if (const TreeEntry *TE = R.getTreeEntry(VecOp))
              VecOp = TE->VectorizedValue;
            assert(VecOp && "Expected vectorized value.");
            const unsigned Size =
                cast<FixedVectorType>(VecOp->getType())->getNumElements();
            return std::max(S, Size);
          });
      for (const auto [V, I] : VLMask) {
        // ...
        Value *VecOp = cast<ExtractElementInst>(V)->getVectorOperand();
        if (const TreeEntry *TE = R.getTreeEntry(VecOp))
          VecOp = TE->VectorizedValue;
        assert(VecOp && "Expected vectorized value.");
        VecOp = castToScalarTyElem(VecOp);
        Bases[I / VF] = VecOp;
      }
      if (!Bases.front())
        continue;
      Value *SubVec;
      if (Bases.back()) {
        SubVec = createShuffle(Bases.front(), Bases.back(), SubMask);
        TransformToIdentity(SubMask);
      } else {
        SubVec = Bases.front();
      }
      if (!Vec) {
        Vec = SubVec;
        assert(Part == 0 /* ... */ &&
               "Expected first part or all previous parts masked.");
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
      } else {
        unsigned NewVF =
            cast<FixedVectorType>(Vec->getType())->getNumElements();
        unsigned SubVecVF =
            cast<FixedVectorType>(SubVec->getType())->getNumElements();
        NewVF = std::max(NewVF, SubVecVF);
        for (int &Idx : SubMask)
          if (Idx != PoisonMaskElem)
            Idx += NewVF;
        copy(SubMask, std::next(VecMask.begin(), Part * SliceSize));
        Vec = createShuffle(Vec, SubVec, VecMask);
        TransformToIdentity(VecMask);
      }
    }
    // ...
  }
  std::optional<Value *>
  needToDelay(const TreeEntry *E,
              ArrayRef<SmallVector<const TreeEntry *>> Deps) const {
    // No need to delay emission if all dependencies are already emitted.
    if (all_of(Deps, [](ArrayRef<const TreeEntry *> TEs) {
          return all_of(
              TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; });
        }))
      return std::nullopt;
    // ...
    auto *ResVecTy = getWidenedType(ScalarTy, E->getVectorFactor());
    // ...
  }
  void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
                                if (isa<PoisonValue>(V))
                                  return false;
                                return !isKnownNonNegative(
                                    V, SimplifyQuery(*R.DL));
                              }));
    Value *V2 = E2.VectorizedValue;
    if (V2->getType()->isIntOrIntVectorTy())
      V2 = castToScalarTyElem(V2, any_of(E2.Scalars, [&](Value *V) {
                                if (isa<PoisonValue>(V))
                                  return false;
                                return !isKnownNonNegative(
                                    V, SimplifyQuery(*R.DL));
                              }));
    add(V1, V2, Mask);
  }
  void add(const TreeEntry &E1, ArrayRef<int> Mask) {
    Value *V1 = E1.VectorizedValue;
    if (V1->getType()->isIntOrIntVectorTy())
      V1 = castToScalarTyElem(V1, any_of(E1.Scalars, [&](Value *V) {
                                if (isa<PoisonValue>(V))
                                  return false;
                                return !isKnownNonNegative(
                                    V, SimplifyQuery(*R.DL));
                              }));
    add(V1, Mask);
  }
  void add(Value *V1, Value *V2, ArrayRef<int> Mask) {
    assert(V1 && V2 && !Mask.empty() && "Expected non-empty input vectors.");
    assert(isa<FixedVectorType>(V1->getType()) &&
           isa<FixedVectorType>(V2->getType()) &&
           "castToScalarTyElem expects V1 and V2 to be FixedVectorType");
    V1 = castToScalarTyElem(V1);
    V2 = castToScalarTyElem(V2);
    if (InVectors.empty()) {
      // ...
      CommonMask.assign(Mask.begin(), Mask.end());
      return;
    }
    Value *Vec = InVectors.front();
    if (InVectors.size() == 2) {
      Vec = createShuffle(Vec, InVectors.back(), CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    } else if (cast<FixedVectorType>(Vec->getType())->getNumElements() !=
               Mask.size()) {
      Vec = createShuffle(Vec, nullptr, CommonMask);
      transformMaskAfterShuffle(CommonMask, CommonMask);
    }
    V1 = createShuffle(V1, V2, Mask);
    unsigned VF = std::max(getVF(V1), getVF(Vec));
    for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
      if (Mask[Idx] != PoisonMaskElem)
        CommonMask[Idx] = Idx + VF;
    InVectors.front() = Vec;
    if (InVectors.size() == 2)
      InVectors.back() = V1;
    else
      InVectors.push_back(V1);
  }
14467 "castToScalarTyElem expects V1 to be FixedVectorType");
14468 V1 = castToScalarTyElem(V1);
14469 if (InVectors.
empty()) {
14471 CommonMask.
assign(Mask.begin(), Mask.end());
14474 const auto *It =
find(InVectors, V1);
14475 if (It == InVectors.
end()) {
14476 if (InVectors.
size() == 2 ||
14479 if (InVectors.
size() == 2) {
14480 V = createShuffle(InVectors.
front(), InVectors.
back(), CommonMask);
14481 transformMaskAfterShuffle(CommonMask, CommonMask);
14482 }
else if (cast<FixedVectorType>(V->getType())->getNumElements() !=
14483 CommonMask.
size()) {
14484 V = createShuffle(InVectors.
front(),
nullptr, CommonMask);
14485 transformMaskAfterShuffle(CommonMask, CommonMask);
14487 unsigned VF = std::max(CommonMask.
size(), Mask.size());
14488 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14491 V->getType() != V1->
getType()
14493 : Mask[
Idx] + cast<FixedVectorType>(V1->
getType())
14494 ->getNumElements();
14495 if (V->getType() != V1->
getType())
14496 V1 = createShuffle(V1,
nullptr, Mask);
14497 InVectors.
front() = V;
14498 if (InVectors.
size() == 2)
14499 InVectors.
back() = V1;
14506 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14513 for (
Value *V : InVectors)
14514 VF = std::max(VF, getVF(V));
14515 for (
unsigned Idx = 0, Sz = CommonMask.
size();
Idx < Sz; ++
Idx)
14517 CommonMask[
Idx] = Mask[
Idx] + (It == InVectors.begin() ? 0 : VF);
  Value *gather(ArrayRef<Value *> VL, unsigned MaskVF = 0,
                Value *Root = nullptr) {
    return R.gather(VL, Root, ScalarTy,
                    [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
                      return createShuffle(V1, V2, Mask);
                    });
  }
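
  /// Finalizes the accumulated shuffles: runs the optional delayed-action
  /// callback, inserts the combined subvectors, folds \p ExtMask into the
  /// common mask, and emits the final shufflevector (or returns the single
  /// remaining input when no mask is left).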
  Value *
  finalize(ArrayRef<int> ExtMask,
           ArrayRef<std::pair<const TreeEntry *, unsigned>> SubVectors,
           ArrayRef<int> SubVectorsMask, unsigned VF = 0,
           function_ref<void(Value *&, SmallVectorImpl<int> &)> Action = {}) {
    IsFinalized = true;
    if (ScalarTyNumElements != 1) {
      // ...
      ExtMask = NewExtMask;
    }
    if (Action) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        // ...
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        // ...
      assert(VF > 0 &&
             "Expected vector length for the final value before action.");
      unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
      if (VecVF < VF) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0);
        Vec = createShuffle(Vec, nullptr, ResizeMask);
      }
      Action(Vec, CommonMask);
      InVectors.front() = Vec;
    }
    if (!SubVectors.empty()) {
      Value *Vec = InVectors.front();
      if (InVectors.size() == 2) {
        Vec = createShuffle(Vec, InVectors.back(), CommonMask);
        // ...
      } else {
        Vec = createShuffle(Vec, nullptr, CommonMask);
      }
      for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx)
        // ...
      auto CreateSubVectors = [&](Value *Vec,
                                  SmallVectorImpl<int> &CommonMask) {
        for (auto [E, Idx] : SubVectors) {
          Value *V = E->VectorizedValue;
          if (V->getType()->isIntOrIntVectorTy())
            V = castToScalarTyElem(V, any_of(E->Scalars, [&](Value *V) {
                                     if (isa<PoisonValue>(V))
                                       return false;
                                     return !isKnownNonNegative(
                                         V, SimplifyQuery(*R.DL));
                                   }));
          unsigned InsertionIndex = Idx * ScalarTyNumElements;
          Vec = createInsertVector(
              Builder, Vec, V, InsertionIndex,
              std::bind(&ShuffleInstructionBuilder::createShuffle, this, _1, _2,
                        _3));
          if (!CommonMask.empty()) {
            std::iota(
                std::next(CommonMask.begin(), InsertionIndex),
                std::next(CommonMask.begin(),
                          (Idx + E->getVectorFactor()) * ScalarTyNumElements),
                InsertionIndex);
          }
        }
        return Vec;
      };
      if (SubVectorsMask.empty()) {
        Vec = CreateSubVectors(Vec, CommonMask);
      } else {
        SmallVector<int> SVMask(CommonMask.size(), PoisonMaskElem);
        copy(SubVectorsMask, SVMask.begin());
        for (auto [I1, I2] : zip(SVMask, CommonMask)) {
          if (I2 != PoisonMaskElem)
            I1 = I2 + CommonMask.size();
        }
        // ...
        Vec = createShuffle(InsertVec, Vec, SVMask);
        for (unsigned I : seq<unsigned>(CommonMask.size())) {
          // ...
        }
      }
      InVectors.front() = Vec;
    }
    if (!ExtMask.empty()) {
      if (CommonMask.empty()) {
        CommonMask.assign(ExtMask.begin(), ExtMask.end());
      } else {
        SmallVector<int> NewMask(ExtMask.size(), PoisonMaskElem);
        for (int I = 0, Sz = ExtMask.size(); I < Sz; ++I) {
          if (ExtMask[I] == PoisonMaskElem)
            continue;
          NewMask[I] = CommonMask[ExtMask[I]];
        }
        CommonMask.swap(NewMask);
      }
    }
    if (CommonMask.empty()) {
      assert(InVectors.size() == 1 && "Expected only one vector with no mask");
      return InVectors.front();
    }
    if (InVectors.size() == 2)
      return createShuffle(InVectors.front(), InVectors.back(), CommonMask);
    return createShuffle(InVectors.front(), nullptr, CommonMask);
  }

  ~ShuffleInstructionBuilder() {
    assert((IsFinalized || CommonMask.empty()) &&
           "Shuffle construction must be finalized.");
  }
BoUpSLP::TreeEntry *BoUpSLP::getMatchedVectorizedOperand(const TreeEntry *E,
                                                         unsigned NodeIdx) {
  ArrayRef<Value *> VL = E->getOperand(NodeIdx);
  InstructionsState S = getSameOpcode(VL, *TLI);
  // Special processing for GEPs bundle, which may include non-gep values.
  if (!S && VL.front()->getType()->isPointerTy()) {
    const auto *It = find_if(VL, IsaPred<GetElementPtrInst>);
    if (It != VL.end())
      S = getSameOpcode(*It, *TLI);
  }
  if (!S)
    return nullptr;
  auto CheckSameVE = [&](const TreeEntry *VE) {
    return VE->isSame(VL) &&
           (any_of(VE->UserTreeIndices,
                   [E, NodeIdx](const EdgeInfo &EI) {
                     return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
                   }) ||
            any_of(VectorizableTree,
                   [E, NodeIdx, VE](const std::unique_ptr<TreeEntry> &TE) {
                     return TE->isOperandGatherNode(
                                {const_cast<TreeEntry *>(E), NodeIdx}) &&
                            VE->isSame(TE->Scalars);
                   }));
  };
  TreeEntry *VE = getTreeEntry(S.getMainOp());
  if (VE && CheckSameVE(VE))
    return VE;
  auto It = MultiNodeScalars.find(S.getMainOp());
  if (It != MultiNodeScalars.end()) {
    auto *I = find_if(It->getSecond(), [&](const TreeEntry *TE) {
      return TE != VE && CheckSameVE(TE);
    });
    if (I != It->getSecond().end())
      return *I;
  }
  return nullptr;
}
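
/// Vectorizes operand \p NodeIdx of \p E: reuses a matched vectorized entry
/// when one exists (reshuffling to the requested width if the vector factors
/// differ), otherwise vectorizes the corresponding operand gather node.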
Value *BoUpSLP::vectorizeOperand(TreeEntry *E, unsigned NodeIdx,
                                 bool PostponedPHIs) {
  ValueList &VL = E->getOperand(NodeIdx);
  const unsigned VF = VL.size();
  if (TreeEntry *VE = getMatchedVectorizedOperand(E, NodeIdx)) {
    auto FinalShuffle = [&](Value *V, ArrayRef<int> Mask) {
      Type *ScalarTy = cast<VectorType>(V->getType())->getElementType();
      ShuffleInstructionBuilder ShuffleBuilder(
          ScalarTy, Builder, *this);
      ShuffleBuilder.add(V, Mask);
      SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
          E->CombinedEntriesWithIndices.size());
      transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
                [&](const auto &P) {
                  return std::make_pair(VectorizableTree[P.first].get(),
                                        P.second);
                });
      assert((E->CombinedEntriesWithIndices.empty() ||
              E->ReorderIndices.empty()) &&
             "Expected either combined subnodes or reordering");
      return ShuffleBuilder.finalize({}, SubVectors, {});
    };
    Value *V = vectorizeTree(VE, PostponedPHIs);
    if (VF * getNumElements(VL[0]->getType()) !=
        cast<FixedVectorType>(V->getType())->getNumElements()) {
      if (!VE->ReuseShuffleIndices.empty()) {
        // ...
        SmallVector<int> Mask(VF, PoisonMaskElem);
        for (auto [I, V] : enumerate(VL)) {
          if (isa<PoisonValue>(V))
            continue;
          Mask[I] = VE->findLaneForValue(V);
        }
        V = FinalShuffle(V, Mask);
      } else {
        assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
               "Expected vectorization factor less "
               "than original vector size.");
        SmallVector<int> UniformMask(VF, 0);
        std::iota(UniformMask.begin(), UniformMask.end(), 0);
        V = FinalShuffle(V, UniformMask);
      }
    }
    // Update the operand gather node if the operand is actually matched by a
    // vectorized node rather than its own gather.
    if (find_if(VE->UserTreeIndices, [&](const EdgeInfo &EI) {
          return EI.UserTE == E && EI.EdgeIdx == NodeIdx;
        }) == VE->UserTreeIndices.end()) {
      auto *It =
          find_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) {
            return TE->isGather() && TE->UserTreeIndices.front().UserTE == E &&
                   TE->UserTreeIndices.front().EdgeIdx == NodeIdx;
          });
      assert(It != VectorizableTree.end() && "Expected gather node operand.");
      (*It)->VectorizedValue = V;
    }
    return V;
  }

  // Find the corresponding gather entry and vectorize it.
  auto *I = find_if(VectorizableTree,
                    [E, NodeIdx](const std::unique_ptr<TreeEntry> &TE) {
                      return TE->isOperandGatherNode({E, NodeIdx});
                    });
  assert(I != VectorizableTree.end() && "Gather node is not in the graph.");
  assert(I->get()->UserTreeIndices.size() == 1 &&
         "Expected only single user for the gather node.");
  assert(I->get()->isSame(VL) && "Expected same list of scalars.");
  return vectorizeTree(I->get(), PostponedPHIs);
}
template <typename BVTy, typename ResTy, typename... Args>
ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  Args &...Params) {
  assert(E->isGather() && "Expected gather node.");
  unsigned VF = E->getVectorFactor();

  bool NeedFreeze = false;
  SmallVector<int> ReuseShuffleIndices(E->ReuseShuffleIndices.begin(),
                                       E->ReuseShuffleIndices.end());
  SmallVector<Value *> GatheredScalars(E->Scalars.begin(), E->Scalars.end());
  // Clear values, to be replaced by insertvector instructions.
  for (auto [EIdx, Idx] : E->CombinedEntriesWithIndices)
    for_each(MutableArrayRef(GatheredScalars)
                 .slice(Idx, VectorizableTree[EIdx]->getVectorFactor()),
             [&](Value *&V) { V = PoisonValue::get(V->getType()); });
  SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
      E->CombinedEntriesWithIndices.size());
  transform(E->CombinedEntriesWithIndices, SubVectors.begin(),
            [&](const auto &P) {
              return std::make_pair(VectorizableTree[P.first].get(), P.second);
            });
  // Build a mask out of the reorder indices and reorder scalars per this mask.
  SmallVector<int> ReorderMask(E->ReorderIndices.begin(),
                               E->ReorderIndices.end());
  if (!ReorderMask.empty())
    reorderScalars(GatheredScalars, ReorderMask);
  SmallVector<int> SubVectorsMask;
  // ...
  if (!SubVectors.empty() && !SubVectorsMask.empty()) {
    for (unsigned I : seq<unsigned>(GatheredScalars.size()))
      if (E->Scalars[I] == GatheredScalars[ReorderMask[I]])
        SubVectorsMask[I] = PoisonMaskElem;
  } else {
    SubVectorsMask.clear();
  }
  SmallVector<Value *> StoredGS(GatheredScalars);
  auto FindReusedSplat = [&](MutableArrayRef<int> Mask, unsigned InputVF,
                             unsigned I, unsigned SliceSize,
                             bool IsNotPoisonous) {
    if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) {
          return isa<UndefValue>(V) && !isa<PoisonValue>(V);
        }))
      return false;
    TreeEntry *UserTE = E->UserTreeIndices.back().UserTE;
    unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx;
    if (UserTE->getNumOperands() != 2)
      return false;
    if (!IsNotPoisonous) {
      auto *It =
          find_if(VectorizableTree, [=](const std::unique_ptr<TreeEntry> &TE) {
            return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) {
                     return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx;
                   }) != TE->UserTreeIndices.end();
          });
      if (It == VectorizableTree.end())
        return false;
      // ...
      if (!(*It)->ReorderIndices.empty()) {
        // ...
      }
      if (!all_of(zip(GatheredScalars, GS), [&](const auto &P) {
            Value *V0 = std::get<0>(P);
            Value *V1 = std::get<1>(P);
            return !isa<UndefValue>(V0) || isa<PoisonValue>(V0) ||
                   (isa<UndefValue>(V0) && !isa<PoisonValue>(V0) &&
                    is_contained(E->Scalars, V1));
          }))
        return false;
    }
    if ((Mask.size() < InputVF &&
         // ...
         ) ||
        (Mask.size() == InputVF &&
         // ...
         )) {
      std::iota(
          std::next(Mask.begin(), I * SliceSize),
          std::next(Mask.begin(),
                    I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
          0);
    } else {
      // ...
      std::fill(
          std::next(Mask.begin(), I * SliceSize),
          std::next(Mask.begin(),
                    I * SliceSize + getNumElems(Mask.size(), SliceSize, I)),
          IVal);
    }
    return true;
  };
  BVTy ShuffleBuilder(ScalarTy, Params...);
  ResTy Res = ResTy();
  SmallVector<int> Mask;
  SmallVector<int> ExtractMask;
  Value *ExtractVecBase = nullptr;
  bool UseVecBaseAsInput = false;
  SmallVector<std::optional<TTI::ShuffleKind>> ExtractShuffles;
  SmallVector<std::optional<TTI::ShuffleKind>> GatherShuffles;
  SmallVector<SmallVector<const TreeEntry *>> Entries;
  Type *OrigScalarTy = GatheredScalars.front()->getType();
  // ...
  if (NumParts == 0 || NumParts >= GatheredScalars.size() ||
      // ...
      )
    NumParts = 1;
  bool Resized = false;
  if (!all_of(GatheredScalars, IsaPred<UndefValue>)) {
    // Check for gathered extracts.
    ExtractShuffles =
        tryToGatherExtractElements(GatheredScalars, ExtractMask, NumParts);
    if (!ExtractShuffles.empty()) {
      SmallVector<const TreeEntry *> ExtractEntries;
      for (auto [Idx, I] : enumerate(ExtractMask)) {
        // ...
        if (const auto *TE = getTreeEntry(
                cast<ExtractElementInst>(StoredGS[Idx])->getVectorOperand()))
          ExtractEntries.push_back(TE);
      }
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, ExtractEntries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        return *Delayed;
      }
      if (Value *VecBase = ShuffleBuilder.adjustExtracts(
              E, ExtractMask, ExtractShuffles, NumParts, UseVecBaseAsInput)) {
        ExtractVecBase = VecBase;
        if (auto *VecBaseTy = dyn_cast<FixedVectorType>(VecBase->getType()))
          if (VF == VecBaseTy->getNumElements() &&
              GatheredScalars.size() != VF) {
            Resized = true;
            GatheredScalars.append(VF - GatheredScalars.size(),
                                   PoisonValue::get(OrigScalarTy));
          }
      }
    }
    // Gather extracts after we check for full matched gathers only.
    if (!ExtractShuffles.empty() || !E->hasState() ||
        E->getOpcode() != Instruction::Load ||
        (((E->hasState() && E->getOpcode() == Instruction::Load) ||
          any_of(E->Scalars, IsaPred<LoadInst>)) &&
         any_of(E->Scalars,
                [this](Value *V) {
                  return isa<LoadInst>(V) && getTreeEntry(V);
                })) ||
        (E->hasState() && E->isAltShuffle()) ||
        all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) ||
        isSplat(E->Scalars) ||
        (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) {
      GatherShuffles =
          isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts);
    }
    if (!GatherShuffles.empty()) {
      if (std::optional<ResTy> Delayed =
              ShuffleBuilder.needToDelay(E, Entries)) {
        // Delay emission of gathers which are not ready yet.
        PostponedGathers.insert(E);
        return *Delayed;
      }
      if (GatherShuffles.size() == 1 &&
          // ...
          Entries.front().front()->isSame(E->Scalars)) {
        // Perfect match in the graph, will reuse the previously vectorized
        // node. Cost is 0.
        LLVM_DEBUG(dbgs() << "SLP: perfect diamond match for gather bundle "
                          // ...
                          << ".\n");
        Mask.resize(E->Scalars.size());
        const TreeEntry *FrontTE = Entries.front().front();
        if (FrontTE->ReorderIndices.empty() &&
            ((FrontTE->ReuseShuffleIndices.empty() &&
              E->Scalars.size() == FrontTE->Scalars.size()) ||
             (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) {
          std::iota(Mask.begin(), Mask.end(), 0);
        } else {
          for (auto [I, V] : enumerate(E->Scalars)) {
            if (isa<PoisonValue>(V)) {
              Mask[I] = PoisonMaskElem;
              continue;
            }
            Mask[I] = FrontTE->findLaneForValue(V);
          }
        }
        ShuffleBuilder.add(*FrontTE, Mask);
        Res = ShuffleBuilder.finalize(E->getCommonMask(), {}, {});
        return Res;
      }
      if (GatheredScalars.size() != VF &&
          any_of(Entries, [&](ArrayRef<const TreeEntry *> TEs) {
            return any_of(TEs, [&](const TreeEntry *TE) {
              return TE->getVectorFactor() == VF;
            });
          })) {
        Resized = true;
        GatheredScalars.append(VF - GatheredScalars.size(),
                               PoisonValue::get(OrigScalarTy));
      }
    }
    // Remove shuffled elements from the list of gathers.
    for (int I = 0, Sz = Mask.size(); I < Sz; ++I) {
      if (Mask[I] != PoisonMaskElem)
        GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
    }
  }
  auto TryPackScalars = [&](SmallVectorImpl<Value *> &Scalars,
                            SmallVectorImpl<int> &ReuseMask,
                            bool IsRootPoison) {
    // For splats we can emit broadcasts instead of gathers, so try to find
    // such sequences.
    bool IsSplat = IsRootPoison && isSplat(Scalars) &&
                   // ...
    // Gather unique non-constant values and all constant values; for repeated
    // values just reuse the mask slot.
    int NumNonConsts = 0;
    for (auto [I, V] : enumerate(Scalars)) {
      if (isa<UndefValue>(V)) {
        if (!isa<PoisonValue>(V)) {
          // ...
        }
        continue;
      }
      // ...
      if (IsSplat) {
        Scalars.front() = OrigV;
        // ...
      } else {
        const auto Res = UniquePositions.try_emplace(OrigV, I);
        Scalars[Res.first->second] = OrigV;
        ReuseMask[I] = Res.first->second;
      }
    }
    if (NumNonConsts == 1) {
      // Restore single insert element.
      // ...
      if (!UndefPos.empty() && UndefPos.front() == 0)
        // ...
      ReuseMask[SinglePos] = SinglePos;
    } else if (!UndefPos.empty() && IsSplat) {
      // For undef values, try to replace them with the simple broadcast of a
      // non-poisonous scalar.
      auto *It = find_if(Scalars, [this, E](Value *V) {
        return !isa<UndefValue>(V) &&
               // ...
               (E->UserTreeIndices.size() == 1 &&
                any_of(V->uses(), [E](const Use &U) {
                  // Check if the value is already used in the same operation
                  // in one of the nodes already.
                  return E->UserTreeIndices.front().EdgeIdx !=
                             U.getOperandNo() &&
                         is_contained(
                             E->UserTreeIndices.front().UserTE->Scalars,
                             U.getUser());
                }));
      });
      if (It != Scalars.end()) {
        // Replace undefs by the non-poisoned scalars and emit broadcast.
        int Pos = std::distance(Scalars.begin(), It);
        for (int I : UndefPos) {
          ReuseMask[I] = Pos;
          // ...
        }
      } else {
        // Replace undefs by poison, emit broadcast and then emit freeze.
        for (int I : UndefPos) {
          // ...
          if (isa<UndefValue>(Scalars[I]))
            Scalars[I] = PoisonValue::get(OrigScalarTy);
        }
        NeedFreeze = true;
      }
    }
  };
  if (!ExtractShuffles.empty() || !GatherShuffles.empty()) {
    bool IsNonPoisoned = true;
    bool IsUsedInExpr = true;
    Value *Vec1 = nullptr;
    if (!ExtractShuffles.empty()) {
      // Gather of extractelements can be represented as just a shuffle of
      // a single/two vectors the scalars are extracted from.
      Value *Vec2 = nullptr;
      for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
        // ...
      }
      if (UseVecBaseAsInput) {
        Vec1 = ExtractVecBase;
      } else {
        for (unsigned I = 0, Sz = ExtractMask.size(); I < Sz; ++I) {
          // ...
          if (isa<UndefValue>(E->Scalars[I]))
            continue;
          auto *EI = cast<ExtractElementInst>(StoredGS[I]);
          Value *VecOp = EI->getVectorOperand();
          if (const auto *TE = getTreeEntry(VecOp))
            if (TE->VectorizedValue)
              VecOp = TE->VectorizedValue;
          if (!Vec1) {
            Vec1 = VecOp;
          } else if (Vec1 != VecOp) {
            assert((!Vec2 || Vec2 == VecOp) &&
                   "Expected only 1 or 2 vectors shuffle.");
            Vec2 = VecOp;
          }
        }
      }
      if (Vec2) {
        IsUsedInExpr = false;
        // ...
        ShuffleBuilder.add(Vec1, Vec2, ExtractMask);
      } else if (Vec1) {
        bool IsNotPoisonedVec = isGuaranteedNotToBePoison(Vec1);
        IsUsedInExpr &= FindReusedSplat(
            ExtractMask,
            cast<FixedVectorType>(Vec1->getType())->getNumElements(), 0,
            ExtractMask.size(), IsNotPoisonedVec);
        ShuffleBuilder.add(Vec1, ExtractMask, /*ForExtracts=*/true);
        IsNonPoisoned &= IsNotPoisonedVec;
      } else {
        IsUsedInExpr = false;
        // ...
      }
    }
    if (!GatherShuffles.empty()) {
      // ...
      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
      for (const auto [I, TEs] : enumerate(Entries)) {
        if (TEs.empty()) {
          assert(!GatherShuffles[I] &&
                 "No shuffles with empty entries list expected.");
          continue;
        }
        assert((TEs.size() == 1 || TEs.size() == 2) &&
               "Expected shuffle of 1 or 2 entries.");
        // ...
        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
        if (TEs.size() == 1) {
          bool IsNotPoisonedVec =
              TEs.front()->VectorizedValue
                  ? isGuaranteedNotToBePoison(TEs.front()->VectorizedValue)
                  : true;
          IsUsedInExpr &=
              FindReusedSplat(VecMask, TEs.front()->getVectorFactor(), I,
                              SliceSize, IsNotPoisonedVec);
          ShuffleBuilder.add(*TEs.front(), VecMask);
          IsNonPoisoned &= IsNotPoisonedVec;
        } else {
          IsUsedInExpr = false;
          ShuffleBuilder.add(*TEs.front(), *TEs.back(), VecMask);
          if (TEs.front()->VectorizedValue && TEs.back()->VectorizedValue)
            // ...
        }
      }
    }
    // Try to figure out the best way to combine values: build a shuffle and
    // insert elements, or just build several shuffles.
    int EMSz = ExtractMask.size();
    int MSz = Mask.size();
    // Try to build a constant vector and shuffle with it only if currently we
    // have a single permutation and more than one scalar constant.
    bool IsSingleShuffle = ExtractShuffles.empty() || GatherShuffles.empty();
    bool IsIdentityShuffle =
        ((UseVecBaseAsInput ||
          all_of(ExtractShuffles,
                 [](const std::optional<TTI::ShuffleKind> &SK) {
                   return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                          TTI::SK_PermuteSingleSrc;
                 })) &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
        (!GatherShuffles.empty() &&
         all_of(GatherShuffles,
                [](const std::optional<TTI::ShuffleKind> &SK) {
                  return SK.value_or(TTI::SK_PermuteTwoSrc) ==
                         TTI::SK_PermuteSingleSrc;
                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask, MSz));
    bool EnoughConstsForShuffle =
        IsSingleShuffle &&
        (none_of(GatheredScalars,
                 [](Value *V) {
                   return isa<UndefValue>(V) && !isa<PoisonValue>(V);
                 }) ||
         any_of(GatheredScalars,
                [](Value *V) {
                  return isa<Constant>(V) && !isa<UndefValue>(V);
                })) &&
        (!IsIdentityShuffle ||
         (GatheredScalars.size() == 2 &&
          any_of(GatheredScalars,
                 [](Value *V) { return !isa<UndefValue>(V); })) ||
         count_if(GatheredScalars, [](Value *V) {
           return isa<Constant>(V) && !isa<PoisonValue>(V);
         }) > 1);
    // NonConstants array contains just non-constant values, GatheredScalars
    // contains only constants to build the final vector and then shuffle.
    for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) {
      if (EnoughConstsForShuffle && isa<Constant>(GatheredScalars[I]))
        NonConstants[I] = PoisonValue::get(OrigScalarTy);
      else
        GatheredScalars[I] = PoisonValue::get(OrigScalarTy);
    }
    // Generate constants for the final shuffle and build a mask for them.
    if (!all_of(GatheredScalars, IsaPred<PoisonValue>)) {
      SmallVector<int> BVMask(GatheredScalars.size(), PoisonMaskElem);
      TryPackScalars(GatheredScalars, BVMask, /*IsRootPoison=*/true);
      Value *BV = ShuffleBuilder.gather(GatheredScalars, BVMask.size());
      ShuffleBuilder.add(BV, BVMask);
    }
    if (all_of(NonConstants, [=](Value *V) {
          return isa<PoisonValue>(V) ||
                 (IsSingleShuffle &&
                  ((IsIdentityShuffle && IsNonPoisoned) || IsUsedInExpr) &&
                  isa<UndefValue>(V));
        }))
      Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                    SubVectorsMask);
    else
      Res = ShuffleBuilder.finalize(
          E->ReuseShuffleIndices, SubVectors, SubVectorsMask, E->Scalars.size(),
          [&](Value *&Vec, SmallVectorImpl<int> &Mask) {
            TryPackScalars(NonConstants, Mask, /*IsRootPoison=*/false);
            Vec = ShuffleBuilder.gather(NonConstants, Mask.size(), Vec);
          });
  } else if (!allConstant(GatheredScalars)) {
    // Gather unique scalars and all constants.
    SmallVector<int> ReuseMask(GatheredScalars.size(), PoisonMaskElem);
    TryPackScalars(GatheredScalars, ReuseMask, /*IsRootPoison=*/true);
    Value *BV = ShuffleBuilder.gather(GatheredScalars, ReuseMask.size());
    ShuffleBuilder.add(BV, ReuseMask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  } else {
    // Gather all constants.
    SmallVector<int> Mask(GatheredScalars.size(), PoisonMaskElem);
    for (auto [I, V] : enumerate(GatheredScalars)) {
      if (!isa<PoisonValue>(V))
        Mask[I] = I;
    }
    Value *BV = ShuffleBuilder.gather(GatheredScalars);
    ShuffleBuilder.add(BV, Mask);
    Res = ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors,
                                  SubVectorsMask);
  }

  if (NeedFreeze)
    Res = ShuffleBuilder.createFreeze(Res);
  return Res;
}
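
/// Thin wrapper over processBuildVector that first vectorizes the combined
/// sub-entries and then instantiates ShuffleInstructionBuilder to produce an
/// IR value for the gather node.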
Value *BoUpSLP::createBuildVector(const TreeEntry *E, Type *ScalarTy,
                                  bool PostponedPHIs) {
  for (auto [EIdx, _] : E->CombinedEntriesWithIndices)
    (void)vectorizeTree(VectorizableTree[EIdx].get(), PostponedPHIs);
  return processBuildVector<ShuffleInstructionBuilder, Value *>(E, ScalarTy,
                                                                Builder, *this);
}
  // ...
  for (Value *V : VL)
    if (isa<Instruction>(V))
      // ...

Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
  // ...
  if (E->VectorizedValue &&
      (E->State != TreeEntry::Vectorize || E->getOpcode() != Instruction::PHI ||
       E->isAltShuffle())) {
    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
    return E->VectorizedValue;
  }

  Value *V = E->Scalars.front();
  Type *ScalarTy = V->getType();
  if (!isa<CmpInst>(V))
    // ...
  auto It = MinBWs.find(E);
  if (It != MinBWs.end()) {
    auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy);
    // ...
  }
  // ...
  if (E->isGather()) {
    // ...
    if (E->hasState() && E->Idx == 0 && !UserIgnoreList)
      setInsertPointAfterBundle(E);
    Value *Vec = createBuildVector(E, ScalarTy, PostponedPHIs);
    E->VectorizedValue = Vec;
    return Vec;
  }

  bool IsReverseOrder =
      !E->ReorderIndices.empty() && isReverseOrder(E->ReorderIndices);
  auto FinalShuffle = [&](Value *V, const TreeEntry *E) {
    ShuffleInstructionBuilder ShuffleBuilder(ScalarTy, Builder, *this);
    if (E->getOpcode() == Instruction::Store &&
        E->State == TreeEntry::Vectorize) {
      ArrayRef<int> Mask =
          ArrayRef(reinterpret_cast<const int *>(E->ReorderIndices.begin()),
                   E->ReorderIndices.size());
      ShuffleBuilder.add(V, Mask);
    } else if (E->State == TreeEntry::StridedVectorize && IsReverseOrder) {
      ShuffleBuilder.addOrdered(V, {});
    } else {
      ShuffleBuilder.addOrdered(V, E->ReorderIndices);
    }
    SmallVector<std::pair<const TreeEntry *, unsigned>> SubVectors(
        E->CombinedEntriesWithIndices.size());
    transform(
        E->CombinedEntriesWithIndices, SubVectors.begin(), [&](const auto &P) {
          return std::make_pair(VectorizableTree[P.first].get(), P.second);
        });
    assert(
        (E->CombinedEntriesWithIndices.empty() || E->ReorderIndices.empty()) &&
        "Expected either combined subnodes or reordering");
    return ShuffleBuilder.finalize(E->ReuseShuffleIndices, SubVectors, {});
  };

  assert(!E->isGather() && "Unhandled state");
  unsigned ShuffleOrOp =
      E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
  Instruction *VL0 = E->getMainOp();
  auto GetOperandSignedness = [&](unsigned Idx) {
    const TreeEntry *OpE = getOperandEntry(E, Idx);
    bool IsSigned = false;
    auto It = MinBWs.find(OpE);
    if (It != MinBWs.end())
      IsSigned = It->second.second;
    else
      IsSigned = any_of(OpE->Scalars, [&](Value *R) {
        if (isa<PoisonValue>(V))
          return false;
        return !isKnownNonNegative(R, SimplifyQuery(*DL));
      });
    return IsSigned;
  };
  switch (ShuffleOrOp) {
  case Instruction::PHI: {
    assert((E->ReorderIndices.empty() || !E->ReuseShuffleIndices.empty() ||
            E != VectorizableTree.front().get() ||
            !E->UserTreeIndices.empty()) &&
           "PHI reordering is free.");
    if (PostponedPHIs && E->VectorizedValue)
      return E->VectorizedValue;
    auto *PH = cast<PHINode>(VL0);
    Builder.SetInsertPoint(PH->getParent(),
                           PH->getParent()->getFirstNonPHIIt());
    // ...
    if (PostponedPHIs || !E->VectorizedValue) {
      // ...
      Builder.SetInsertPoint(PH->getParent(),
                             PH->getParent()->getFirstInsertionPt());
      // ...
      V = FinalShuffle(V, E);

      E->VectorizedValue = V;
      if (PostponedPHIs)
        return V;
    }
    PHINode *NewPhi = cast<PHINode>(E->PHI);
    // ...
    for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
      // ...
      // Stop emission if all incoming values are generated.
      if (NewPhi->getNumIncomingValues() == PH->getNumIncomingValues()) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return V;
      }
      if (!VisitedBBs.insert(IBB).second) {
        NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
        continue;
      }
      // ...
      Value *Vec = vectorizeOperand(E, I, /*PostponedPHIs=*/true);
      if (VecTy != Vec->getType()) {
        assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() ||
                MinBWs.contains(getOperandEntry(E, I))) &&
               "Expected item in MinBWs.");
        Vec = Builder.CreateIntCast(Vec, VecTy, GetOperandSignedness(I));
      }
      NewPhi->addIncoming(Vec, IBB);
    }

    assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
           "Invalid number of incoming values");
    assert(E->VectorizedValue && "Expected vectorized value.");
    return E->VectorizedValue;
  }
  case Instruction::ExtractElement: {
    Value *V = E->getSingleOperand(0);
    if (const TreeEntry *TE = getTreeEntry(V))
      V = TE->VectorizedValue;
    setInsertPointAfterBundle(E);
    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    // ...
  }
  case Instruction::ExtractValue: {
    auto *LI = cast<LoadInst>(E->getSingleOperand(0));
    // ... (re-emit as a wide load of the vector type)
    NewV = FinalShuffle(NewV, E);
    E->VectorizedValue = NewV;
    // ...
  }
  case Instruction::InsertElement: {
    assert(E->ReuseShuffleIndices.empty() && "All inserts should be unique");
    // ...
    Value *V = vectorizeOperand(E, 1, PostponedPHIs);
    ArrayRef<Value *> Op = E->getOperand(1);
    Type *ScalarTy = Op.front()->getType();
    if (cast<VectorType>(V->getType())->getElementType() != ScalarTy) {
      // ...
      std::pair<unsigned, bool> Res = MinBWs.lookup(getOperandEntry(E, 1));
      assert(Res.first > 0 && "Expected item in MinBWs.");
      V = Builder.CreateIntCast(
          V,
          getWidenedType(ScalarTy,
                         cast<FixedVectorType>(V->getType())->getNumElements()),
          Res.second);
    }

    // Create an InsertVector shuffle if necessary.
    auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) {
      return !is_contained(E->Scalars, cast<Instruction>(V)->getOperand(0));
    }));
    const unsigned NumElts =
        cast<FixedVectorType>(FirstInsert->getType())->getNumElements();
    const unsigned NumScalars = E->Scalars.size();

    // ...
    assert(Offset < NumElts && "Failed to find vector index offset");

    // Create a shuffle to resize the vector.
    SmallVector<int> Mask;
    if (!E->ReorderIndices.empty()) {
      // ...
    } else {
      Mask.assign(NumElts, PoisonMaskElem);
      std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0);
    }
    bool IsIdentity = true;
    SmallVector<int> PrevMask(NumElts, PoisonMaskElem);
    Mask.swap(PrevMask);
    for (unsigned I = 0; I < NumScalars; ++I) {
      // ...
      IsIdentity &= InsertIdx - Offset == I;
      Mask[InsertIdx - Offset] = I;
    }
    if (!IsIdentity || NumElts != NumScalars) {
      Value *V2 = nullptr;
      bool IsVNonPoisonous =
          // ...
      SmallVector<int> InsertMask(Mask);
      if (NumElts != NumScalars && Offset == 0) {
        // Follow all insertelement instructions from the current buildvector
        // sequence.
        // ...
        do {
          // ...
          InsertMask[*InsertIdx] = *InsertIdx;
          if (!Ins->hasOneUse())
            break;
          Ins = dyn_cast_or_null<InsertElementInst>(
              Ins->getUniqueUndroppableUser());
        } while (Ins);
        SmallBitVector UseMask =
            buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        // ...
        if (!IsFirstPoison.all()) {
          unsigned Idx = 0;
          for (unsigned I = 0; I < NumElts; I++) {
            if (InsertMask[I] == PoisonMaskElem && !IsFirstPoison.test(I) &&
                IsFirstUndef.test(I)) {
              if (IsVNonPoisonous) {
                InsertMask[I] = I < NumScalars ? I : 0;
                continue;
              }
              // ...
              if (Idx >= NumScalars)
                Idx = NumScalars - 1;
              InsertMask[I] = NumScalars + Idx;
              ++Idx;
            }
            // ...
          }
        }
      }
      // ...
      if (auto *I = dyn_cast<Instruction>(V)) {
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    SmallVector<int> InsertMask(NumElts, PoisonMaskElem);
    for (unsigned I = 0; I < NumElts; I++) {
      // ...
    }
    SmallBitVector UseMask =
        buildUseMask(NumElts, InsertMask, UseMask::UndefsAsMask);
    // ...
    if ((!IsIdentity || Offset != 0 || !IsFirstUndef.all()) &&
        NumElts != NumScalars) {
      if (IsFirstUndef.all()) {
        // ...
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        if (!IsFirstPoison.all()) {
          for (unsigned I = 0; I < NumElts; I++) {
            // ...
            InsertMask[I] = I + NumElts;
          }
        }
        V = Builder.CreateShuffleVector(
            // ...
            InsertMask, cast<Instruction>(E->Scalars.back())->getName());
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      } else {
        SmallBitVector IsFirstPoison =
            isUndefVector<true>(FirstInsert->getOperand(0), UseMask);
        for (unsigned I = 0; I < NumElts; I++) {
          // ...
          InsertMask[I] += NumElts;
        }
        V = Builder.CreateShuffleVector(
            FirstInsert->getOperand(0), V, InsertMask,
            cast<Instruction>(E->Scalars.back())->getName());
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }
    }

    ++NumVectorInstructions;
    E->VectorizedValue = V;
    return V;
  }
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc:
  case Instruction::BitCast: {
    setInsertPointAfterBundle(E);

    Value *InVec = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    auto *CI = cast<CastInst>(VL0);
    Instruction::CastOps VecOpcode = CI->getOpcode();
    Type *SrcScalarTy = cast<VectorType>(InVec->getType())->getElementType();
    auto SrcIt = MinBWs.find(getOperandEntry(E, 0));
    if (!ScalarTy->isFPOrFPVectorTy() && !SrcScalarTy->isFPOrFPVectorTy() &&
        (SrcIt != MinBWs.end() || It != MinBWs.end() ||
         SrcScalarTy != CI->getOperand(0)->getType()->getScalarType())) {
      // Check if the values are candidates to demote.
      unsigned SrcBWSz = DL->getTypeSizeInBits(SrcScalarTy);
      if (SrcIt != MinBWs.end())
        SrcBWSz = SrcIt->second.first;
      unsigned BWSz = DL->getTypeSizeInBits(ScalarTy->getScalarType());
      if (BWSz == SrcBWSz) {
        VecOpcode = Instruction::BitCast;
      } else if (BWSz < SrcBWSz) {
        VecOpcode = Instruction::Trunc;
      } else if (It != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode = It->second.second ? Instruction::SExt : Instruction::ZExt;
      } else if (SrcIt != MinBWs.end()) {
        assert(BWSz > SrcBWSz && "Invalid cast!");
        VecOpcode =
            SrcIt->second.second ? Instruction::SExt : Instruction::ZExt;
      }
    } else if (VecOpcode == Instruction::SIToFP && SrcIt != MinBWs.end() &&
               !SrcIt->second.second) {
      VecOpcode = Instruction::UIToFP;
    }
    Value *V = (VecOpcode != ShuffleOrOp && VecOpcode == Instruction::BitCast)
                   ? InVec
                   : Builder.CreateCast(VecOpcode, InVec, VecTy);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FCmp:
  case Instruction::ICmp: {
    setInsertPointAfterBundle(E);

    Value *L = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *R = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (L->getType() != R->getType()) {
      assert((getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      if (cast<VectorType>(L->getType())
              ->getElementType()
              ->getIntegerBitWidth() < cast<VectorType>(R->getType())
                                           ->getElementType()
                                           ->getIntegerBitWidth()) {
        Type *CastTy = R->getType();
        L = Builder.CreateIntCast(L, CastTy, GetOperandSignedness(0));
      } else {
        Type *CastTy = L->getType();
        R = Builder.CreateIntCast(R, CastTy, GetOperandSignedness(1));
      }
    }
    // ...
    if (auto *ICmp = dyn_cast<ICmpInst>(V); ICmp && It == MinBWs.end())
      ICmp->setSameSign(/*B=*/false);
    // Do not cast for cmps.
    VecTy = cast<FixedVectorType>(V->getType());
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Select: {
    setInsertPointAfterBundle(E);

    Value *Cond = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *True = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *False = vectorizeOperand(E, 2, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (True->getType() != VecTy || False->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 1)->isGather() ||
              getOperandEntry(E, 2)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 1)) ||
              MinBWs.contains(getOperandEntry(E, 2))) &&
             "Expected item in MinBWs.");
      if (True->getType() != VecTy)
        True = Builder.CreateIntCast(True, VecTy, GetOperandSignedness(1));
      if (False->getType() != VecTy)
        False = Builder.CreateIntCast(False, VecTy, GetOperandSignedness(2));
    }
    // ...
    assert(TrueNumElements >= CondNumElements &&
           TrueNumElements % CondNumElements == 0 &&
           "Cannot vectorize Instruction::Select");
    assert(TrueNumElements == getNumElements(False->getType()) &&
           "Cannot vectorize Instruction::Select");
    if (CondNumElements != TrueNumElements) {
      // When the return type is i1 but the source is a fixed vector type, the
      // condition value has to be replicated.
      // ...
    }
    assert(getNumElements(Cond->getType()) == TrueNumElements &&
           "Cannot vectorize Instruction::Select");
    Value *V = Builder.CreateSelect(Cond, True, False);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::FNeg: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    // ...
    if (auto *I = dyn_cast<Instruction>(V))
      V = ::propagateMetadata(I, E->Scalars);

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Freeze: {
    setInsertPointAfterBundle(E);

    Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (Op->getType() != VecTy) {
      assert((It != MinBWs.end() ||
              MinBWs.contains(getOperandEntry(E, 0))) &&
             "Expected item in MinBWs.");
      Op = Builder.CreateIntCast(Op, VecTy, GetOperandSignedness(0));
    }
    Value *V = Builder.CreateFreeze(Op);
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;

    return V;
  }
  case Instruction::Add:
  case Instruction::FAdd:
  case Instruction::Sub:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::FDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    setInsertPointAfterBundle(E);

    Value *LHS = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    Value *RHS = vectorizeOperand(E, 1, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }
    if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
      for (unsigned I : seq<unsigned>(0, E->getNumOperands())) {
        // An 'and' with a mask that covers all demanded bits is a no-op
        // after minimal bitwidth demotion.
        if (all_of(E->getOperand(I), [&](Value *Op) {
              auto *CI = dyn_cast<ConstantInt>(Op);
              return CI && CI->getValue().countr_one() >= It->second.first;
            })) {
          Value *V = FinalShuffle(I == 0 ? RHS : LHS, E);
          E->VectorizedValue = V;
          ++NumVectorInstructions;
          return V;
        }
      }
    }
    if (LHS->getType() != VecTy || RHS->getType() != VecTy) {
      assert((It != MinBWs.end() || getOperandEntry(E, 0)->isGather() ||
              getOperandEntry(E, 1)->isGather() ||
              MinBWs.contains(getOperandEntry(E, 0)) ||
              MinBWs.contains(getOperandEntry(E, 1))) &&
             "Expected item in MinBWs.");
      // ...
    }
    // ...
    if (auto *I = dyn_cast<Instruction>(V)) {
      V = ::propagateMetadata(I, E->Scalars);
      // Drop nuw flags for abs(sub(commutative), true).
      if (!MinBWs.contains(E) && ShuffleOrOp == Instruction::Sub &&
          any_of(E->Scalars, [](Value *V) {
            return isa<PoisonValue>(V) || isCommutative(cast<Instruction>(V));
          }))
        I->setHasNoUnsignedWrap(/*b=*/false);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Load: {
    // Loads are inserted at the head of the tree because we don't want to
    // sink them all the way down past store instructions.
    setInsertPointAfterBundle(E);

    LoadInst *LI = cast<LoadInst>(VL0);
    Instruction *NewLI;
    Value *PO = LI->getPointerOperand();
    if (E->State == TreeEntry::Vectorize) {
      NewLI = Builder.CreateAlignedLoad(VecTy, PO, LI->getAlign());
    } else if (E->State == TreeEntry::StridedVectorize) {
      Value *Ptr0 = cast<LoadInst>(E->Scalars.front())->getPointerOperand();
      Value *PtrN = cast<LoadInst>(E->Scalars.back())->getPointerOperand();
      PO = IsReverseOrder ? PtrN : Ptr0;
      // ...
      Value *StrideVal;
      if (Diff) {
        int Stride = *Diff / (static_cast<int>(E->Scalars.size()) - 1);
        StrideVal =
            ConstantInt::get(StrideTy, (IsReverseOrder ? -1 : 1) * Stride *
                                           DL->getTypeAllocSize(ScalarTy));
      } else {
        SmallVector<Value *> PointerOps(E->Scalars.size());
        transform(E->Scalars, PointerOps.begin(), [](Value *V) {
          return cast<LoadInst>(V)->getPointerOperand();
        });
        std::optional<Value *> Stride =
            // ... (runtime stride computed from the pointer operands)
        StrideVal = Builder.CreateMul(
            // ...
            ConstantInt::get(StrideTy,
                             (IsReverseOrder ? -1 : 1) *
                                 static_cast<int>(
                                     DL->getTypeAllocSize(ScalarTy))));
      }
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_load,
          {VecTy, PO->getType(), StrideTy},
          {PO, StrideVal, Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      // ...
      NewLI = Inst;
    } else {
      assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
      Value *VecPtr = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      if (isa<FixedVectorType>(ScalarTy)) {
        // CreateMaskedGather expects VecTy and VecPtr to have the same size;
        // expand VecPtr if ScalarTy is itself a vector type.
        unsigned ScalarTyNumElements =
            cast<FixedVectorType>(ScalarTy)->getNumElements();
        unsigned VecTyNumElements =
            cast<FixedVectorType>(VecTy)->getNumElements();
        assert(VecTyNumElements % ScalarTyNumElements == 0 &&
               "Cannot expand getelementptr.");
        unsigned VF = VecTyNumElements / ScalarTyNumElements;
        SmallVector<Constant *> Indices(VecTyNumElements);
        transform(seq(VecTyNumElements), Indices.begin(), [=](unsigned I) {
          return Builder.getInt64(I % ScalarTyNumElements);
        });
        // ...
      }
      // Use the minimum alignment of the gathered loads.
      Align CommonAlignment = computeCommonAlignment<LoadInst>(E->Scalars);
      NewLI = Builder.CreateMaskedGather(VecTy, VecPtr, CommonAlignment);
    }
    Value *V = ::propagateMetadata(NewLI, E->Scalars);

    V = FinalShuffle(V, E);
    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
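  // Note on the strided intrinsics emitted above and below: the stride
  // operand of llvm.experimental.vp.strided.{load,store} is a byte distance
  // between consecutive elements, hence the multiplication by
  // DL->getTypeAllocSize(ScalarTy); a negative stride walks the pointers in
  // reverse order.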
  case Instruction::Store: {
    auto *SI = cast<StoreInst>(VL0);

    setInsertPointAfterBundle(E);

    Value *VecValue = vectorizeOperand(E, 0, PostponedPHIs);
    if (VecValue->getType() != VecTy)
      VecValue =
          Builder.CreateIntCast(VecValue, VecTy, GetOperandSignedness(0));
    VecValue = FinalShuffle(VecValue, E);

    Value *Ptr = SI->getPointerOperand();
    Instruction *ST;
    if (E->State == TreeEntry::Vectorize) {
      ST = Builder.CreateAlignedStore(VecValue, Ptr, SI->getAlign());
    } else {
      assert(E->State == TreeEntry::StridedVectorize &&
             "Expected either strided or consecutive stores.");
      if (!E->ReorderIndices.empty()) {
        SI = cast<StoreInst>(E->Scalars[E->ReorderIndices.front()]);
        Ptr = SI->getPointerOperand();
      }
      Align CommonAlignment = computeCommonAlignment<StoreInst>(E->Scalars);
      Type *StrideTy = DL->getIndexType(SI->getPointerOperandType());
      auto *Inst = Builder.CreateIntrinsic(
          Intrinsic::experimental_vp_strided_store,
          {VecTy, Ptr->getType(), StrideTy},
          {VecValue, Ptr,
           ConstantInt::get(
               StrideTy, -static_cast<int>(DL->getTypeAllocSize(ScalarTy))),
           Builder.getAllOnesMask(VecTy->getElementCount()),
           Builder.getInt32(E->Scalars.size())});
      // ...
      ST = Inst;
    }

    Value *V = ::propagateMetadata(ST, E->Scalars);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::GetElementPtr: {
    auto *GEP0 = cast<GetElementPtrInst>(VL0);
    setInsertPointAfterBundle(E);

    Value *Op0 = vectorizeOperand(E, 0, PostponedPHIs);
    if (E->VectorizedValue) {
      LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
      return E->VectorizedValue;
    }

    SmallVector<Value *> OpVecs;
    for (int J = 1, N = GEP0->getNumOperands(); J < N; ++J) {
      Value *OpVec = vectorizeOperand(E, J, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      OpVecs.push_back(OpVec);
    }

    Value *V = Builder.CreateGEP(GEP0->getSourceElementType(), Op0, OpVecs);
    if (Instruction *I = dyn_cast<GetElementPtrInst>(V)) {
      SmallVector<Value *> GEPs;
      for (Value *V : E->Scalars) {
        if (isa<GetElementPtrInst>(V))
          GEPs.push_back(V);
      }
      V = ::propagateMetadata(I, GEPs);
    }

    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::Call: {
    CallInst *CI = cast<CallInst>(VL0);
    setInsertPointAfterBundle(E);

    Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);

    SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(
        CI, ID, VecTy->getNumElements(),
        It != MinBWs.end() ? It->second.first : 0, TTI);
    auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
    bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
                        VecCallCosts.first <= VecCallCosts.second;

    Value *ScalarArg = nullptr;
    SmallVector<Value *> OpVecs;
    // ...
    auto *CEI = cast<CallInst>(VL0);
    for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
      // Some intrinsics have scalar arguments. These arguments must not be
      // vectorized.
      if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
        ScalarArg = CEI->getArgOperand(I);
        // ...
        if (ID == Intrinsic::abs && It != MinBWs.end() &&
            It->second.first < DL->getTypeSizeInBits(CEI->getType()))
          ScalarArg = Builder.getFalse();
        OpVecs.push_back(ScalarArg);
        // ...
        continue;
      }

      Value *OpVec = vectorizeOperand(E, I, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      ScalarArg = CEI->getArgOperand(I);
      if (cast<VectorType>(OpVec->getType())->getElementType() !=
              ScalarArg->getType()->getScalarType() &&
          It == MinBWs.end()) {
        // ...
        OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
      } else if (It != MinBWs.end()) {
        OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
      }
      // ...
      OpVecs.push_back(OpVec);
    }

    if (!UseIntrinsic) {
      // ... (emit a call to the vector library function instead)
    }
    // ...
    V = FinalShuffle(V, E);

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
  case Instruction::ShuffleVector: {
    Value *V;
    if (SLPReVec && !E->isAltShuffle()) {
      setInsertPointAfterBundle(E);
      Value *Src = vectorizeOperand(E, 0, PostponedPHIs);
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      // ...
      if (auto *SVSrc = dyn_cast<ShuffleVectorInst>(Src)) {
        assert(isa<PoisonValue>(SVSrc->getOperand(1)) &&
               "Not supported shufflevector usage.");
        // Compose the two masks into one.
        SmallVector<int> NewMask(ThisMask.size());
        transform(ThisMask, NewMask.begin(), [&SVSrc](int Mask) {
          return SVSrc->getShuffleMask()[Mask];
        });
        V = Builder.CreateShuffleVector(SVSrc->getOperand(0), NewMask);
      } else {
        V = Builder.CreateShuffleVector(Src, ThisMask);
      }
      // ...
      if (auto *I = dyn_cast<Instruction>(V))
        V = ::propagateMetadata(I, E->Scalars);
      V = FinalShuffle(V, E);
    } else {
      assert(E->isAltShuffle() &&
             ((Instruction::isBinaryOp(E->getOpcode()) &&
               Instruction::isBinaryOp(E->getAltOpcode())) ||
              (Instruction::isCast(E->getOpcode()) &&
               Instruction::isCast(E->getAltOpcode())) ||
              (isa<CmpInst>(VL0) && isa<CmpInst>(E->getAltOp()))) &&
             "Invalid Shuffle Vector Operand");

      Value *LHS = nullptr, *RHS = nullptr;
      if (Instruction::isBinaryOp(E->getOpcode()) || isa<CmpInst>(VL0)) {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0, PostponedPHIs);
        if (E->VectorizedValue) {
          LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
          return E->VectorizedValue;
        }
        RHS = vectorizeOperand(E, 1, PostponedPHIs);
      } else {
        setInsertPointAfterBundle(E);
        LHS = vectorizeOperand(E, 0, PostponedPHIs);
      }
      if (E->VectorizedValue) {
        LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
        return E->VectorizedValue;
      }
      if (LHS && RHS &&
          // ...
          ) {
        assert((It != MinBWs.end() ||
                getOperandEntry(E, 0)->State == TreeEntry::NeedToGather ||
                getOperandEntry(E, 1)->State == TreeEntry::NeedToGather ||
                MinBWs.contains(getOperandEntry(E, 0)) ||
                MinBWs.contains(getOperandEntry(E, 1))) &&
               "Expected item in MinBWs.");
        Type *CastTy = VecTy;
        if (isa<CmpInst>(VL0) && LHS->getType() != RHS->getType()) {
          if (cast<VectorType>(LHS->getType())
                  ->getElementType()
                  ->getIntegerBitWidth() < cast<VectorType>(RHS->getType())
                                               ->getElementType()
                                               ->getIntegerBitWidth())
            CastTy = RHS->getType();
          else
            CastTy = LHS->getType();
        }
        // ...
      }

      Value *V0, *V1;
      if (Instruction::isBinaryOp(E->getOpcode())) {
        // ...
      } else if (auto *CI0 = dyn_cast<CmpInst>(VL0)) {
        V0 = Builder.CreateCmp(CI0->getPredicate(), LHS, RHS);
        auto *AltCI = cast<CmpInst>(E->getAltOp());
        CmpInst::Predicate AltPred = AltCI->getPredicate();
        V1 = Builder.CreateCmp(AltPred, LHS, RHS);
      } else {
        if (LHS->getType()->isIntOrIntVectorTy() && ScalarTy->isIntegerTy()) {
          unsigned SrcBWSz = DL->getTypeSizeInBits(
              cast<VectorType>(LHS->getType())->getElementType());
          unsigned BWSz = DL->getTypeSizeInBits(ScalarTy);
          if (BWSz <= SrcBWSz) {
            if (BWSz < SrcBWSz)
              // ...
            assert(LHS->getType() == VecTy &&
                   "Expected same type as operand.");
            if (auto *I = dyn_cast<Instruction>(LHS))
              LHS = ::propagateMetadata(I, E->Scalars);
            LHS = FinalShuffle(LHS, E);
            E->VectorizedValue = LHS;
            ++NumVectorInstructions;
            return LHS;
          }
        }
        // ...
      }
      // Add V0 and V1 to later analysis to try to find and remove matching
      // instruction, if any.
      for (Value *V : {V0, V1}) {
        if (auto *I = dyn_cast<Instruction>(V)) {
          GatherShuffleExtractSeq.insert(I);
          CSEBlocks.insert(I->getParent());
        }
      }

      // Create a shuffle to take the alternate operations from the vector.
      SmallVector<int> Mask;
      E->buildAltOpShuffleMask(
          [E, this](Instruction *I) {
            assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
            return isAlternateInstruction(I, E->getMainOp(), E->getAltOp(),
                                          *TLI);
          },
          Mask, &OpScalars, &AltScalars);

      // ...
      auto DropNuwFlag = [&](Value *Vec, unsigned Opcode) {
        // Drop nuw flags for abs(sub(commutative), true).
        if (auto *I = dyn_cast<Instruction>(Vec);
            I && Opcode == Instruction::Sub && !MinBWs.contains(E) &&
            any_of(E->Scalars, [](Value *V) {
              if (isa<PoisonValue>(V))
                return false;
              auto *IV = cast<Instruction>(V);
              return IV->getOpcode() == Instruction::Sub && isCommutative(IV);
            }))
          I->setHasNoUnsignedWrap(/*b=*/false);
      };
      DropNuwFlag(V0, E->getOpcode());
      DropNuwFlag(V1, E->getAltOpcode());

      if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
        // ...
      }
      V = Builder.CreateShuffleVector(V0, V1, Mask);
      if (auto *I = dyn_cast<Instruction>(V)) {
        V = ::propagateMetadata(I, E->Scalars);
        GatherShuffleExtractSeq.insert(I);
        CSEBlocks.insert(I->getParent());
      }
    }

    E->VectorizedValue = V;
    ++NumVectorInstructions;
    return V;
  }
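
  // Past this point the tree has been emitted; what follows schedules the
  // blocks, revisits postponed PHIs and gather nodes, extracts external
  // uses, and finally erases the now-dead scalar instructions.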
  for (auto &BSIter : BlocksSchedules) {
    scheduleBlock(BSIter.second.get());
  }
  // ...
  EntryToLastInstruction.clear();
  // ...
  // Emit gathered loads first to emit better code for the users of those
  // gathered loads.
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) {
    if (GatheredLoadsEntriesFirst.has_value() &&
        TE->Idx >= *GatheredLoadsEntriesFirst &&
        (!TE->isGather() || !TE->UserTreeIndices.empty())) {
      assert((!TE->UserTreeIndices.empty() ||
              (TE->getOpcode() == Instruction::Load && !TE->isGather())) &&
             "Expected gathered load node.");
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
    }
  }
  // ...
  for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree)
    if (TE->State == TreeEntry::Vectorize &&
        TE->getOpcode() == Instruction::PHI && !TE->isAltShuffle() &&
        TE->VectorizedValue)
      (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false);
  // Emit postponed gather nodes.
  for (const TreeEntry *E : PostponedNodes) {
    auto *TE = const_cast<TreeEntry *>(E);
    if (auto *VecTE = getTreeEntry(TE->Scalars.front()))
      if (VecTE->isSame(TE->UserTreeIndices.front().UserTE->getOperand(
              TE->UserTreeIndices.front().EdgeIdx)) &&
          VecTE->isSame(TE->Scalars))
        // Found a gather node which is exactly the same as one of the
        // vectorized nodes. This may happen after reordering.
        continue;
    auto *PrevVec = cast<Instruction>(TE->VectorizedValue);
    TE->VectorizedValue = nullptr;
    auto *UserI =
        cast<Instruction>(TE->UserTreeIndices.front().UserTE->VectorizedValue);
    // ...
    if (isa<PHINode>(UserI)) {
      // ...
    }
    for (User *U : PrevVec->users()) {
      // ...
      auto *UI = dyn_cast<Instruction>(U);
      if (!UI || isa<PHINode>(UI) || UI->getParent() != InsertPt->getParent())
        continue;
      if (UI->comesBefore(InsertPt))
        InsertPt = UI;
    }
    // ...
    if (auto *VecI = dyn_cast<Instruction>(Vec);
        // ...
        )
      // ...
    if (Vec->getType() != PrevVec->getType()) {
      assert(Vec->getType()->isIntOrIntVectorTy() &&
             PrevVec->getType()->isIntOrIntVectorTy() &&
             "Expected integer vector types only.");
      std::optional<bool> IsSigned;
      for (Value *V : TE->Scalars) {
        if (const TreeEntry *BaseTE = getTreeEntry(V)) {
          auto It = MinBWs.find(BaseTE);
          if (It != MinBWs.end()) {
            IsSigned = IsSigned.value_or(false) || It->second.second;
            // ...
          }
          for (const TreeEntry *MNTE : MultiNodeScalars.lookup(V)) {
            auto It = MinBWs.find(MNTE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              // ...
            }
          }
          if (IsSigned.value_or(false))
            break;
          for (const TreeEntry *BVE : ValueToGatherNodes.lookup(V)) {
            auto It = MinBWs.find(BVE);
            if (It != MinBWs.end()) {
              IsSigned = IsSigned.value_or(false) || It->second.second;
              // ...
            }
          }
          if (IsSigned.value_or(false))
            break;
          if (auto *EE = dyn_cast<ExtractElementInst>(V)) {
            IsSigned =
                IsSigned.value_or(false) ||
                // ...
            continue;
          }
          if (IsSigned.value_or(false))
            break;
        }
      }
      if (IsSigned.value_or(false)) {
        // Final attempt to check the user node.
        auto It = MinBWs.find(TE->UserTreeIndices.front().UserTE);
        if (It != MinBWs.end())
          IsSigned = It->second.second;
      }
      assert(IsSigned &&
             "Expected user node or perfect diamond match in MinBWs.");
      // ...
    }
    // ...
    PostponedValues.try_emplace(Vec).first->second.push_back(TE);
    // Replace the stub vector node, if it was used before for a postponed
    // gather node.
    auto It = PostponedValues.find(PrevVec);
    if (It != PostponedValues.end()) {
      for (TreeEntry *VTE : It->getSecond())
        VTE->VectorizedValue = Vec;
    }
    // ...
  }
  // Extract all of the elements with external uses.
  for (const auto &ExternalUse : ExternalUses) {
    Value *Scalar = ExternalUse.Scalar;
    // ...
    TreeEntry *E = getTreeEntry(Scalar);
    assert(E && "Invalid scalar");
    assert(!E->isGather() && "Extracting from a gather list");
    // Non-instruction pointers are not deleted, just skip them.
    if (E->getOpcode() == Instruction::GetElementPtr &&
        !isa<GetElementPtrInst>(Scalar))
      continue;

    Value *Vec = E->VectorizedValue;
    assert(Vec && "Can't find vectorizable value");

    // ...
    auto ExtractAndExtendIfNeeded = [&](Value *Vec) {
      if (Scalar->getType() != Vec->getType()) {
        Value *Ex = nullptr;
        Value *ExV = nullptr;
        auto *Inst = dyn_cast<Instruction>(Scalar);
        bool ReplaceInst = Inst && ExternalUsesAsOriginalScalar.contains(Inst);
        auto It = ScalarToEEs.find(Scalar);
        if (It != ScalarToEEs.end()) {
          // No need to emit many extracts, just move the only one in the
          // current block.
          auto EEIt = It->second.find(ReplaceInst ? Inst->getParent()
                                                  : Builder.GetInsertBlock());
          if (EEIt != It->second.end()) {
            Value *PrevV = EEIt->second.first;
            if (auto *I = dyn_cast<Instruction>(PrevV);
                I && !ReplaceInst &&
                // ...
                ) {
              // ...
              if (auto *CI = dyn_cast<Instruction>(EEIt->second.second))
                CI->moveAfter(I);
            }
            Ex = PrevV;
            ExV = EEIt->second.second ? EEIt->second.second : Ex;
          }
        }
        if (!Ex) {
          // "Reuse" the existing extract to improve final codegen.
          if (ReplaceInst) {
            if (auto *EE = dyn_cast<ExtractElementInst>(Inst)) {
              IgnoredExtracts.insert(EE);
              Ex = EE;
            } else {
              auto *CloneInst = Inst->clone();
              CloneInst->insertBefore(Inst);
              if (Inst->hasName())
                CloneInst->takeName(Inst);
              Ex = CloneInst;
            }
          } else if (auto *ES = dyn_cast<ExtractElementInst>(Scalar);
                     ES && isa<Instruction>(Vec)) {
            Value *V = ES->getVectorOperand();
            auto *IVec = cast<Instruction>(Vec);
            if (const TreeEntry *ETE = getTreeEntry(V))
              V = ETE->VectorizedValue;
            if (auto *IV = dyn_cast<Instruction>(V);
                !IV || IV == Vec || IV->getParent() != IVec->getParent() ||
                IV->comesBefore(IVec))
              Ex = Builder.CreateExtractElement(V, ES->getIndexOperand());
            else
              Ex = Builder.CreateExtractElement(Vec, Lane);
          } else if (auto *VecTy =
                         dyn_cast<FixedVectorType>(Scalar->getType())) {
            // ...
            Ex = createExtractVector(Builder, Vec, VecTyNumElements,
                                     ExternalUse.Lane * VecTyNumElements);
          } else {
            Ex = Builder.CreateExtractElement(Vec, Lane);
          }
          // If necessary, sign- or zero-extend to the original scalar type.
          ExV = Ex;
          if (Scalar->getType() != Ex->getType())
            ExV = Builder.CreateIntCast(
                Ex, Scalar->getType(),
                !isKnownNonNegative(Scalar, SimplifyQuery(*DL)));
          auto *I = dyn_cast<Instruction>(Ex);
          ScalarToEEs[Scalar].try_emplace(I ? I->getParent()
                                            : &F->getEntryBlock(),
                                          std::make_pair(Ex, ExV));
        }
        if (auto *ExI = dyn_cast<Instruction>(Ex);
            ExI /* ... */) {
          GatherShuffleExtractSeq.insert(ExI);
          CSEBlocks.insert(ExI->getParent());
        }
        return ExV;
      }
      assert(isa<FixedVectorType>(Scalar->getType()) &&
             isa<InsertElementInst>(Scalar) &&
             "In-tree scalar of vector type is not insertelement?");
      auto *IE = cast<InsertElementInst>(Scalar);
      // ...
    };
    // In case of a "null" user (exists but is not an instruction to rewrite),
    // the scalar must stay in the function.
    if (!ScalarsWithNullptrUser.insert(Scalar).second)
      continue;
    assert((ExternalUsesAsOriginalScalar.contains(Scalar) ||
            any_of(Scalar->users(),
                   [&](llvm::User *U) {
                     if (ExternalUsesAsOriginalScalar.contains(U))
                       return true;
                     TreeEntry *UseEntry = getTreeEntry(U);
                     return UseEntry &&
                            (UseEntry->State == TreeEntry::Vectorize ||
                             UseEntry->State ==
                                 TreeEntry::StridedVectorize) &&
                            (E->State == TreeEntry::Vectorize ||
                             E->State == TreeEntry::StridedVectorize) &&
                            doesInTreeUserNeedToExtract(
                                Scalar, getRootEntryInstruction(*UseEntry),
                                TLI, TTI);
                   })) &&
           "Scalar with nullptr User must be registered in "
           "ExternallyUsedValues map or remain as scalar in vectorized "
           "instructions");
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (auto *PHI = dyn_cast<PHINode>(VecI)) {
        if (PHI->getParent()->isLandingPad())
          Builder.SetInsertPoint(
              PHI->getParent(),
              std::next(
                  PHI->getParent()->getLandingPadInst()->getIterator()));
        else
          Builder.SetInsertPoint(PHI->getParent(),
                                 PHI->getParent()->getFirstNonPHIIt());
      } else {
        Builder.SetInsertPoint(VecI->getParent(),
                               std::next(VecI->getIterator()));
      }
    } else {
      // ...
    }
    Value *NewInst = ExtractAndExtendIfNeeded(Vec);
    // Required to update internally referenced instructions.
    if (Scalar != NewInst) {
      assert((!isa<ExtractElementInst>(Scalar) ||
              !IgnoredExtracts.contains(cast<ExtractElementInst>(Scalar))) &&
             "Extractelements should not be replaced.");
      Scalar->replaceAllUsesWith(NewInst);
    }
    // ...

    if (auto *VU = dyn_cast<InsertElementInst>(User);
        VU /* ... */) {
      // Skip if the scalar is another vector op or Vec is not an instruction.
      if (!Scalar->getType()->isVectorTy() && isa<Instruction>(Vec)) {
        if (auto *FTy = dyn_cast<FixedVectorType>(User->getType())) {
          if (!UsedInserts.insert(VU).second)
            continue;
          // Need to use the original vector, if the root is truncated.
          auto BWIt = MinBWs.find(E);
          if (BWIt != MinBWs.end() /* ... */) {
            auto *ScalarTy = FTy->getElementType();
            auto Key = std::make_pair(Vec, ScalarTy);
            auto VecIt = VectorCasts.find(Key);
            if (VecIt == VectorCasts.end()) {
              // ...
              if (auto *IVec = dyn_cast<PHINode>(Vec)) {
                if (IVec->getParent()->isLandingPad())
                  Builder.SetInsertPoint(IVec->getParent(),
                                         std::next(IVec->getParent()
                                                       ->getLandingPadInst()
                                                       ->getIterator()));
                else
                  Builder.SetInsertPoint(
                      IVec->getParent()->getFirstNonPHIOrDbgOrLifetime());
              } else if (auto *IVec = dyn_cast<Instruction>(Vec)) {
                // ...
              }
              Vec = Builder.CreateIntCast(
                  Vec,
                  getWidenedType(
                      ScalarTy,
                      cast<FixedVectorType>(Vec->getType())->getNumElements()),
                  BWIt->second.second);
              // ...
            } else {
              Vec = VecIt->second;
            }
          }
          // ...
          auto *It = find_if(
              ShuffledInserts, [VU](const ShuffledInsertData<Value *> &Data) {
                // ...
              });
          unsigned Idx = *InsertIdx;
          if (It == ShuffledInserts.end()) {
            // ...
            It = std::next(ShuffledInserts.begin(),
                           ShuffledInserts.size() - 1);
          }
          // ...
          Mask[Idx] = ExternalUse.Lane;
          It->InsertElements.push_back(cast<InsertElementInst>(User));
          continue;
        }
      }
    }

    // Generate extracts for out-of-tree users.
    if (auto *VecI = dyn_cast<Instruction>(Vec)) {
      if (auto *PH = dyn_cast<PHINode>(User)) {
        for (unsigned I : seq<unsigned>(0, PH->getNumIncomingValues())) {
          if (PH->getIncomingValue(I) == Scalar) {
            Instruction *IncomingTerminator =
                PH->getIncomingBlock(I)->getTerminator();
            if (isa<CatchSwitchInst>(IncomingTerminator)) {
              Builder.SetInsertPoint(VecI->getParent(),
                                     std::next(VecI->getIterator()));
            } else {
              // ...
            }
            Value *NewInst = ExtractAndExtendIfNeeded(Vec);
            PH->setOperand(I, NewInst);
          }
        }
      } else {
        Value *NewInst = ExtractAndExtendIfNeeded(Vec);
        // ...
      }
    } else {
      Value *NewInst = ExtractAndExtendIfNeeded(Vec);
      // ...
    }
  }
  auto CreateShuffle = [&](Value *V1, Value *V2, ArrayRef<int> Mask) {
    SmallVector<int> CombinedMask1(Mask.size(), PoisonMaskElem);
    SmallVector<int> CombinedMask2(Mask.size(), PoisonMaskElem);
    int VF = cast<FixedVectorType>(V1->getType())->getNumElements();
    for (int I = 0, E = Mask.size(); I < E; ++I) {
      if (Mask[I] < VF)
        CombinedMask1[I] = Mask[I];
      else
        CombinedMask2[I] = Mask[I] - VF;
    }
    ShuffleInstructionBuilder ShuffleBuilder(
        cast<VectorType>(V1->getType())->getElementType(), Builder, *this);
    ShuffleBuilder.add(V1, CombinedMask1);
    if (V2)
      ShuffleBuilder.add(V2, CombinedMask2);
    return ShuffleBuilder.finalize({}, {}, {});
  };

  auto &&ResizeToVF = [&CreateShuffle](Value *Vec, ArrayRef<int> Mask,
                                       bool ForSingleMask) {
    unsigned VF = Mask.size();
    unsigned VecVF = cast<FixedVectorType>(Vec->getType())->getNumElements();
    if (VF != VecVF) {
      if (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); })) {
        Vec = CreateShuffle(Vec, nullptr, Mask);
        return std::make_pair(Vec, true);
      }
      if (!ForSingleMask) {
        SmallVector<int> ResizeMask(VF, PoisonMaskElem);
        for (unsigned I = 0; I < VF; ++I) {
          if (Mask[I] != PoisonMaskElem)
            ResizeMask[Mask[I]] = Mask[I];
        }
        Vec = CreateShuffle(Vec, nullptr, ResizeMask);
      }
    }
    return std::make_pair(Vec, false);
  };

  // Perform shuffling of the vectorized tree entries for better handling of
  // external extracts.
  for (int I = 0, E = ShuffledInserts.size(); I < E; ++I) {
    // ...
    auto Vector = ShuffledInserts[I].ValueMasks.takeVector();
    Value *NewInst = performExtractsShuffleAction<Value>(
        // ...
        [](Value *Vec) {
          return cast<VectorType>(Vec->getType())
              ->getElementCount()
              .getKnownMinValue();
        },
        ResizeToVF,
        [&](ArrayRef<int> Mask, ArrayRef<Value *> Vals) {
          assert((Vals.size() == 1 || Vals.size() == 2) &&
                 "Expected exactly 1 or 2 input values.");
          if (Vals.size() == 1) {
            // Do not create a shuffle if the mask is a simple identity
            // non-resizing mask.
            if (Mask.size() != cast<FixedVectorType>(Vals.front()->getType())
                                   ->getNumElements() ||
                !ShuffleVectorInst::isIdentityMask(Mask, Mask.size()))
              return CreateShuffle(Vals.front(), nullptr, Mask);
            return Vals.front();
          }
          return CreateShuffle(Vals.front() ? Vals.front()
                                            // ...
                                            ,
                               Vals.back(), Mask);
        });
    auto It = ShuffledInserts[I].InsertElements.rbegin();
    // Rebuild the chain of insertelement instructions on top of NewInst.
    InsertElementInst *II = nullptr;
    if (It != ShuffledInserts[I].InsertElements.rend())
      II = *It;
    // ...
    while (It != ShuffledInserts[I].InsertElements.rend()) {
      assert(II && "Must be an insertelement instruction.");
      // ...
      II = dyn_cast<InsertElementInst>(II->getOperand(0));
    }
    for (Instruction *II : reverse(Inserts)) {
      II->replaceUsesOfWith(II->getOperand(0), NewInst);
      if (auto *NewI = dyn_cast<Instruction>(NewInst))
        if (II->getParent() == NewI->getParent() && II->comesBefore(NewI))
          II->moveAfter(NewI);
      NewInst = II;
    }
    LastInsert->replaceAllUsesWith(NewInst);
    for (InsertElementInst *IE : reverse(ShuffledInserts[I].InsertElements)) {
      IE->replaceUsesOfWith(IE->getOperand(0),
                            PoisonValue::get(IE->getOperand(0)->getType()));
      IE->replaceUsesOfWith(IE->getOperand(1),
                            PoisonValue::get(IE->getOperand(1)->getType()));
      // ...
    }
    CSEBlocks.insert(LastInsert->getParent());
  }
16883 for (
auto &TEPtr : VectorizableTree) {
16884 TreeEntry *Entry = TEPtr.get();
16887 if (Entry->isGather())
16890 assert(Entry->VectorizedValue &&
"Can't find vectorizable value");
16893 for (
int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
16894 Value *Scalar = Entry->Scalars[Lane];
16896 if (Entry->getOpcode() == Instruction::GetElementPtr &&
16897 !isa<GetElementPtrInst>(Scalar))
16899 if (
auto *EE = dyn_cast<ExtractElementInst>(Scalar);
16900 EE && IgnoredExtracts.contains(EE))
16902 if (isa<PoisonValue>(Scalar))
16905 Type *Ty = Scalar->getType();
16907 for (
User *U : Scalar->users()) {
16911 assert((getTreeEntry(U) ||
16912 (UserIgnoreList && UserIgnoreList->contains(U)) ||
16913 (isa_and_nonnull<Instruction>(U) &&
16914 isDeleted(cast<Instruction>(U)))) &&
16915 "Deleting out-of-tree value");
16919 LLVM_DEBUG(
dbgs() <<
"SLP: \tErasing scalar:" << *Scalar <<
".\n");
16920 auto *
I = cast<Instruction>(Scalar);
16927 if (
auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
16928 V->mergeDIAssignID(RemovedInsts);
16931 if (UserIgnoreList) {
16933 const TreeEntry *
IE = getTreeEntry(
I);
16934 if (
IE->Idx != 0 &&
16935 !(VectorizableTree.front()->isGather() &&
16936 !
IE->UserTreeIndices.empty() &&
16937 (ValueToGatherNodes.lookup(
I).contains(
16938 VectorizableTree.front().get()) ||
16940 [&](
const EdgeInfo &EI) {
16941 return EI.UserTE == VectorizableTree.front().get() &&
16942 EI.EdgeIdx == UINT_MAX;
16944 !(GatheredLoadsEntriesFirst.has_value() &&
16945 IE->Idx >= *GatheredLoadsEntriesFirst &&
16946 VectorizableTree.front()->isGather() &&
16952 bool IsPoisoningLogicalOp = isa<SelectInst>(U.getUser()) &&
16953 (match(U.getUser(), m_LogicalAnd()) ||
16954 match(U.getUser(), m_LogicalOr())) &&
16955 U.getOperandNo() == 0;
16956 if (IsPoisoningLogicalOp) {
16957 LogicalOpSelects.push_back(cast<SelectInst>(U.getUser()));
16960 return UserIgnoreList->contains(
U.getUser());
16972 removeInstructionsAndOperands(
ArrayRef(RemovedInsts));
16975 InstrElementSize.
clear();
16977 const TreeEntry &RootTE = *VectorizableTree.front();
16978 Value *Vec = RootTE.VectorizedValue;
16979 if (
auto It = MinBWs.find(&RootTE); ReductionBitWidth != 0 &&
16980 It != MinBWs.end() &&
16981 ReductionBitWidth != It->second.first) {
16984 ReductionRoot->getIterator());
16988 cast<VectorType>(Vec->
getType())->getElementCount()),
16989 It->second.second);
16996 <<
" gather sequences instructions.\n");
17003 Loop *L = LI->getLoopFor(
I->getParent());
17008 BasicBlock *PreHeader = L->getLoopPreheader();
17016 auto *OpI = dyn_cast<Instruction>(V);
17017 return OpI && L->contains(OpI);
17023 CSEBlocks.
insert(PreHeader);
17038 assert((
A ==
B) == (
A->getDFSNumIn() ==
B->getDFSNumIn()) &&
17039 "Different nodes should have different DFS numbers");
17040 return A->getDFSNumIn() <
B->getDFSNumIn();
17051 if (I1->getType() != I2->getType())
17053 auto *SI1 = dyn_cast<ShuffleVectorInst>(I1);
17054 auto *SI2 = dyn_cast<ShuffleVectorInst>(I2);
17056 return I1->isIdenticalTo(I2);
17057 if (SI1->isIdenticalTo(SI2))
17059 for (
int I = 0, E = SI1->getNumOperands();
I < E; ++
I)
17060 if (SI1->getOperand(
I) != SI2->getOperand(
I))
17063 NewMask.
assign(SI2->getShuffleMask().begin(), SI2->getShuffleMask().end());
17067 unsigned LastUndefsCnt = 0;
17068 for (
int I = 0, E = NewMask.
size();
I < E; ++
I) {
17074 NewMask[
I] != SM1[
I])
17077 NewMask[
I] = SM1[
I];
17081 return SM1.
size() - LastUndefsCnt > 1 &&
17085 SM1.
size() - LastUndefsCnt));
17091 for (
auto I = CSEWorkList.
begin(), E = CSEWorkList.
end();
I != E; ++
I) {
17094 "Worklist not sorted properly!");
17100 if (!isa<InsertElementInst, ExtractElementInst, ShuffleVectorInst>(&In) &&
17101 !GatherShuffleExtractSeq.contains(&In))
17106 bool Replaced =
false;
17109 if (IsIdenticalOrLessDefined(&In, V, NewMask) &&
17110 DT->
dominates(V->getParent(), In.getParent())) {
17111 In.replaceAllUsesWith(V);
17113 if (
auto *SI = dyn_cast<ShuffleVectorInst>(V))
17114 if (!NewMask.
empty())
17115 SI->setShuffleMask(NewMask);
17119 if (isa<ShuffleVectorInst>(In) && isa<ShuffleVectorInst>(V) &&
17120 GatherShuffleExtractSeq.contains(V) &&
17121 IsIdenticalOrLessDefined(V, &In, NewMask) &&
17122 DT->
dominates(In.getParent(), V->getParent())) {
17124 V->replaceAllUsesWith(&In);
17126 if (
auto *SI = dyn_cast<ShuffleVectorInst>(&In))
17127 if (!NewMask.
empty())
17128 SI->setShuffleMask(NewMask);
17136 Visited.push_back(&In);
17141 GatherShuffleExtractSeq.clear();
17144BoUpSLP::ScheduleData *
17146 ScheduleData *Bundle =
nullptr;
17147 ScheduleData *PrevInBundle =
nullptr;
17148 for (
Value *V : VL) {
17151 ScheduleData *BundleMember = getScheduleData(V);
17153 "no ScheduleData for bundle member "
17154 "(maybe not in same basic block)");
17155 assert(BundleMember->isSchedulingEntity() &&
17156 "bundle member already part of other bundle");
17157 if (PrevInBundle) {
17158 PrevInBundle->NextInBundle = BundleMember;
17160 Bundle = BundleMember;
17164 BundleMember->FirstInBundle = Bundle;
17165 PrevInBundle = BundleMember;
17167 assert(Bundle &&
"Failed to find schedule bundle");
17173std::optional<BoUpSLP::ScheduleData *>
17175 const InstructionsState &S) {
17178 if (isa<PHINode>(S.getMainOp()) ||
17184 LLVM_DEBUG(
dbgs() <<
"SLP: bundle: " << *S.getMainOp() <<
"\n");
17186 auto TryScheduleBundleImpl = [
this, OldScheduleEnd, SLP](
bool ReSchedule,
17187 ScheduleData *Bundle) {
17193 if (ScheduleEnd != OldScheduleEnd) {
17194 for (
auto *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode())
17195 if (ScheduleData *SD = getScheduleData(
I))
17196 SD->clearDependencies();
17201 <<
" in block " << BB->
getName() <<
"\n");
17202 calculateDependencies(Bundle,
true, SLP);
17207 initialFillReadyList(ReadyInsts);
17214 while (((!Bundle && ReSchedule) || (Bundle && !Bundle->isReady())) &&
17215 !ReadyInsts.empty()) {
17216 ScheduleData *Picked = ReadyInsts.pop_back_val();
17217 assert(Picked->isSchedulingEntity() && Picked->isReady() &&
17218 "must be ready to schedule");
17219 schedule(Picked, ReadyInsts);
17225 for (
Value *V : VL) {
17228 if (!extendSchedulingRegion(V, S)) {
17235 TryScheduleBundleImpl(
false,
nullptr);
17236 return std::nullopt;
17240 bool ReSchedule =
false;
17241 for (
Value *V : VL) {
17244 ScheduleData *BundleMember = getScheduleData(V);
17246 "no ScheduleData for bundle member (maybe not in same basic block)");
17250 ReadyInsts.remove(BundleMember);
17252 if (!BundleMember->IsScheduled)
17257 LLVM_DEBUG(
dbgs() <<
"SLP: reset schedule because " << *BundleMember
17258 <<
" was already scheduled\n");
17262 auto *Bundle = buildBundle(VL);
17263 TryScheduleBundleImpl(ReSchedule, Bundle);
17264 if (!Bundle->isReady()) {
17265 cancelScheduling(VL, S.getMainOp());
17266 return std::nullopt;
17279 ScheduleData *Bundle = getScheduleData(OpValue);
17280 LLVM_DEBUG(
dbgs() <<
"SLP: cancel scheduling of " << *Bundle <<
"\n");
17281 assert(!Bundle->IsScheduled &&
17282 "Can't cancel bundle which is already scheduled");
17283 assert(Bundle->isSchedulingEntity() &&
17285 "tried to unbundle something which is not a bundle");
17288 if (Bundle->isReady())
17289 ReadyInsts.remove(Bundle);
17292 ScheduleData *BundleMember = Bundle;
17293 while (BundleMember) {
17294 assert(BundleMember->FirstInBundle == Bundle &&
"corrupt bundle links");
17295 BundleMember->FirstInBundle = BundleMember;
17296 ScheduleData *Next = BundleMember->NextInBundle;
17297 BundleMember->NextInBundle =
nullptr;
17298 BundleMember->TE =
nullptr;
17299 if (BundleMember->unscheduledDepsInBundle() == 0) {
17300 ReadyInsts.insert(BundleMember);
17302 BundleMember = Next;
17306BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
17308 if (ChunkPos >= ChunkSize) {
17309 ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
17312 return &(ScheduleDataChunks.back()[ChunkPos++]);
17315bool BoUpSLP::BlockScheduling::extendSchedulingRegion(
17316 Value *V,
const InstructionsState &S) {
17318 assert(
I &&
"bundle member must be an instruction");
17321 "phi nodes/insertelements/extractelements/extractvalues don't need to "
17323 if (getScheduleData(
I))
17325 if (!ScheduleStart) {
17327 initScheduleData(
I,
I->getNextNode(),
nullptr,
nullptr);
17329 ScheduleEnd =
I->getNextNode();
17330 assert(ScheduleEnd &&
"tried to vectorize a terminator?");
17331 LLVM_DEBUG(
dbgs() <<
"SLP: initialize schedule region to " << *
I <<
"\n");
17339 ++ScheduleStart->getIterator().getReverse();
17344 if (
auto *
II = dyn_cast<IntrinsicInst>(&
I))
17345 return II->isAssumeLikeIntrinsic();
17348 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17349 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17350 while (UpIter != UpperEnd && DownIter != LowerEnd && &*UpIter !=
I &&
17352 if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
17353 LLVM_DEBUG(
dbgs() <<
"SLP: exceeded schedule region size limit\n");
17360 UpIter = std::find_if_not(UpIter, UpperEnd, IsAssumeLikeIntr);
17361 DownIter = std::find_if_not(DownIter, LowerEnd, IsAssumeLikeIntr);
17363 if (DownIter == LowerEnd || (UpIter != UpperEnd && &*UpIter ==
I)) {
17364 assert(
I->getParent() == ScheduleStart->getParent() &&
17365 "Instruction is in wrong basic block.");
17366 initScheduleData(
I, ScheduleStart,
nullptr, FirstLoadStoreInRegion);
17372 assert((UpIter == UpperEnd || (DownIter != LowerEnd && &*DownIter ==
I)) &&
17373 "Expected to reach top of the basic block or instruction down the "
17375 assert(
I->getParent() == ScheduleEnd->getParent() &&
17376 "Instruction is in wrong basic block.");
17377 initScheduleData(ScheduleEnd,
I->getNextNode(), LastLoadStoreInRegion,
17379 ScheduleEnd =
I->getNextNode();
17380 assert(ScheduleEnd &&
"tried to vectorize a terminator?");
17381 LLVM_DEBUG(
dbgs() <<
"SLP: extend schedule region end to " << *
I <<
"\n");
17385void BoUpSLP::BlockScheduling::initScheduleData(
Instruction *FromI,
17387 ScheduleData *PrevLoadStore,
17388 ScheduleData *NextLoadStore) {
17389 ScheduleData *CurrentLoadStore = PrevLoadStore;
17394 ScheduleData *SD = ScheduleDataMap.lookup(
I);
17396 SD = allocateScheduleDataChunks();
17397 ScheduleDataMap[
I] = SD;
17399 assert(!isInSchedulingRegion(SD) &&
17400 "new ScheduleData already in scheduling region");
17401 SD->init(SchedulingRegionID,
I);
17403 if (
I->mayReadOrWriteMemory() &&
17404 (!isa<IntrinsicInst>(
I) ||
17405 (cast<IntrinsicInst>(
I)->getIntrinsicID() != Intrinsic::sideeffect &&
17406 cast<IntrinsicInst>(
I)->getIntrinsicID() !=
17407 Intrinsic::pseudoprobe))) {
17409 if (CurrentLoadStore) {
17410 CurrentLoadStore->NextLoadStore = SD;
17412 FirstLoadStoreInRegion = SD;
17414 CurrentLoadStore = SD;
17417 if (
match(
I, m_Intrinsic<Intrinsic::stacksave>()) ||
17418 match(
I, m_Intrinsic<Intrinsic::stackrestore>()))
17419 RegionHasStackSave =
true;
17421 if (NextLoadStore) {
17422 if (CurrentLoadStore)
17423 CurrentLoadStore->NextLoadStore = NextLoadStore;
17425 LastLoadStoreInRegion = CurrentLoadStore;
17429void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
17430 bool InsertInReadyList,
17432 assert(SD->isSchedulingEntity());
17437 while (!WorkList.
empty()) {
17439 for (ScheduleData *BundleMember = SD; BundleMember;
17440 BundleMember = BundleMember->NextInBundle) {
17441 assert(isInSchedulingRegion(BundleMember));
17442 if (BundleMember->hasValidDependencies())
17447 BundleMember->Dependencies = 0;
17448 BundleMember->resetUnscheduledDeps();
17451 for (
User *U : BundleMember->Inst->
users()) {
17452 if (ScheduleData *UseSD = getScheduleData(cast<Instruction>(U))) {
17453 BundleMember->Dependencies++;
17454 ScheduleData *DestBundle = UseSD->FirstInBundle;
17455 if (!DestBundle->IsScheduled)
17456 BundleMember->incrementUnscheduledDeps(1);
17457 if (!DestBundle->hasValidDependencies())
17463 auto *DepDest = getScheduleData(
I);
17464 assert(DepDest &&
"must be in schedule window");
17465 DepDest->ControlDependencies.push_back(BundleMember);
17466 BundleMember->Dependencies++;
17467 ScheduleData *DestBundle = DepDest->FirstInBundle;
17468 if (!DestBundle->IsScheduled)
17469 BundleMember->incrementUnscheduledDeps(1);
17470 if (!DestBundle->hasValidDependencies())
17478 for (
Instruction *
I = BundleMember->Inst->getNextNode();
17479 I != ScheduleEnd;
I =
I->getNextNode()) {
17484 MakeControlDependent(
I);
17492 if (RegionHasStackSave) {
17496 if (
match(BundleMember->Inst, m_Intrinsic<Intrinsic::stacksave>()) ||
17497 match(BundleMember->Inst, m_Intrinsic<Intrinsic::stackrestore>())) {
17498 for (
Instruction *
I = BundleMember->Inst->getNextNode();
17499 I != ScheduleEnd;
I =
I->getNextNode()) {
17500 if (
match(
I, m_Intrinsic<Intrinsic::stacksave>()) ||
17501 match(
I, m_Intrinsic<Intrinsic::stackrestore>()))
17506 if (!isa<AllocaInst>(
I))
17510 MakeControlDependent(
I);
17519 if (isa<AllocaInst>(BundleMember->Inst) ||
17520 BundleMember->Inst->mayReadOrWriteMemory()) {
17521 for (
Instruction *
I = BundleMember->Inst->getNextNode();
17522 I != ScheduleEnd;
I =
I->getNextNode()) {
17523 if (!
match(
I, m_Intrinsic<Intrinsic::stacksave>()) &&
17524 !
match(
I, m_Intrinsic<Intrinsic::stackrestore>()))
17528 MakeControlDependent(
I);
17535 ScheduleData *DepDest = BundleMember->NextLoadStore;
17540 "NextLoadStore list for non memory effecting bundle?");
17542 bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
17543 unsigned NumAliased = 0;
17544 unsigned DistToSrc = 1;
17546 for (; DepDest; DepDest = DepDest->NextLoadStore) {
17547 assert(isInSchedulingRegion(DepDest));
17557 ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
17559 SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
17566 DepDest->MemoryDependencies.push_back(BundleMember);
17567 BundleMember->Dependencies++;
17568 ScheduleData *DestBundle = DepDest->FirstInBundle;
17569 if (!DestBundle->IsScheduled) {
17570 BundleMember->incrementUnscheduledDeps(1);
17572 if (!DestBundle->hasValidDependencies()) {
17595 if (InsertInReadyList && SD->isReady()) {
17596 ReadyInsts.insert(SD);
17603void BoUpSLP::BlockScheduling::resetSchedule() {
17605 "tried to reset schedule on block which has not been scheduled");
17606 for (
Instruction *
I = ScheduleStart;
I != ScheduleEnd;
I =
I->getNextNode()) {
17607 if (ScheduleData *SD = getScheduleData(
I)) {
17608 assert(isInSchedulingRegion(SD) &&
17609 "ScheduleData not in scheduling region");
17610 SD->IsScheduled =
false;
17611 SD->resetUnscheduledDeps();
17614 ReadyInsts.clear();
17617void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
17618 if (!BS->ScheduleStart)
17621 LLVM_DEBUG(
dbgs() <<
"SLP: schedule block " << BS->BB->getName() <<
"\n");
17628 BS->resetSchedule();
17635 struct ScheduleDataCompare {
17636 bool operator()(ScheduleData *SD1, ScheduleData *SD2)
const {
17637 return SD2->SchedulingPriority < SD1->SchedulingPriority;
17640 std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
17645 for (
auto *
I = BS->ScheduleStart;
I != BS->ScheduleEnd;
17646 I =
I->getNextNode()) {
17647 if (ScheduleData *SD = BS->getScheduleData(
I)) {
17648 TreeEntry *SDTE = getTreeEntry(SD->Inst);
17651 SD->isPartOfBundle() ==
17653 "scheduler and vectorizer bundle mismatch");
17654 SD->FirstInBundle->SchedulingPriority =
Idx++;
17656 if (SD->isSchedulingEntity() && SD->isPartOfBundle())
17657 BS->calculateDependencies(SD,
false,
this);
17660 BS->initialFillReadyList(ReadyInsts);
17662 Instruction *LastScheduledInst = BS->ScheduleEnd;
17665 while (!ReadyInsts.empty()) {
17666 ScheduleData *Picked = *ReadyInsts.begin();
17667 ReadyInsts.erase(ReadyInsts.begin());
17671 for (ScheduleData *BundleMember = Picked; BundleMember;
17672 BundleMember = BundleMember->NextInBundle) {
17676 LastScheduledInst = PickedInst;
17679 BS->schedule(Picked, ReadyInsts);
17683#ifdef EXPENSIVE_CHECKS
17687#if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS)
17689 for (
auto *
I = BS->ScheduleStart;
I != BS->ScheduleEnd;
I =
I->getNextNode()) {
17690 ScheduleData *SD = BS->getScheduleData(
I);
17691 if (SD && SD->isSchedulingEntity() && SD->hasValidDependencies())
17692 assert(SD->IsScheduled &&
"must be scheduled at this point");
17697 BS->ScheduleStart =
nullptr;
17704 if (
auto *Store = dyn_cast<StoreInst>(V))
17705 return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
17707 if (
auto *IEI = dyn_cast<InsertElementInst>(V))
17710 auto E = InstrElementSize.
find(V);
17711 if (E != InstrElementSize.
end())
17720 if (
auto *
I = dyn_cast<Instruction>(V)) {
17728 Value *FirstNonBool =
nullptr;
17729 while (!Worklist.
empty()) {
17734 auto *Ty =
I->getType();
17735 if (isa<VectorType>(Ty))
17737 if (Ty != Builder.
getInt1Ty() && !FirstNonBool)
17744 if (isa<LoadInst, ExtractElementInst, ExtractValueInst>(
I))
17745 Width = std::max<unsigned>(Width,
DL->getTypeSizeInBits(Ty));
17753 for (
Use &U :
I->operands()) {
17754 if (
auto *J = dyn_cast<Instruction>(U.get()))
17755 if (Visited.
insert(J).second &&
17756 (isa<PHINode>(
I) || J->getParent() == Parent)) {
17760 if (!FirstNonBool && U.get()->getType() != Builder.
getInt1Ty())
17761 FirstNonBool = U.get();
17772 if (V->getType() == Builder.
getInt1Ty() && FirstNonBool)
17774 Width =
DL->getTypeSizeInBits(V->getType());
17778 InstrElementSize[
I] = Width;
17783bool BoUpSLP::collectValuesToDemote(
17784 const TreeEntry &E,
bool IsProfitableToDemoteRoot,
unsigned &
BitWidth,
17787 bool &IsProfitableToDemote,
bool IsTruncRoot)
const {
17789 if (
all_of(E.Scalars, IsaPred<Constant>))
17792 unsigned OrigBitWidth =
17793 DL->getTypeSizeInBits(E.Scalars.front()->getType()->getScalarType());
17800 if (NodesToKeepBWs.
contains(E.Idx))
17806 bool IsSignedNode =
any_of(E.Scalars, [&](
Value *R) {
17807 if (isa<PoisonValue>(R))
17809 return !isKnownNonNegative(R, SimplifyQuery(*DL));
17811 auto IsPotentiallyTruncated = [&](
Value *V,
unsigned &
BitWidth) ->
bool {
17812 if (isa<PoisonValue>(V))
17820 if ((!IsSignedNode || IsSignedVal) && OrigBitWidth >
BitWidth) {
17826 unsigned BitWidth1 = OrigBitWidth - NumSignBits;
17829 if (
auto *
I = dyn_cast<Instruction>(V)) {
17831 unsigned BitWidth2 =
17832 std::max<unsigned>(1, Mask.getBitWidth() - Mask.countl_zero());
17833 while (!IsSignedNode && BitWidth2 < OrigBitWidth) {
17839 BitWidth1 = std::min(BitWidth1, BitWidth2);
17844 auto FinalAnalysis = [&,
TTI =
TTI]() {
17845 if (!IsProfitableToDemote)
17848 E.Scalars, std::bind(IsPotentiallyTruncated, _1, std::ref(
BitWidth)));
17850 if (Res && E.isGather()) {
17854 for (
Value *V : E.Scalars) {
17855 auto *EE = dyn_cast<ExtractElementInst>(V);
17858 UniqueBases.
insert(EE->getVectorOperand());
17860 const unsigned VF = E.Scalars.size();
17861 Type *OrigScalarTy = E.Scalars.front()->getType();
17862 if (UniqueBases.
size() <= 2 ||
17870 if (E.isGather() || !Visited.
insert(&E).second ||
17872 return !isa<PoisonValue>(V) && all_of(V->users(), [&](User *U) {
17873 return isa<InsertElementInst>(U) && !getTreeEntry(U);
17876 return FinalAnalysis();
17879 return !all_of(V->users(), [=](User *U) {
17880 return getTreeEntry(U) ||
17881 (E.Idx == 0 && UserIgnoreList &&
17882 UserIgnoreList->contains(U)) ||
17883 (!isa<CmpInst>(U) && U->getType()->isSized() &&
17884 !U->getType()->isScalableTy() &&
17885 DL->getTypeSizeInBits(U->getType()) <= BitWidth);
17886 }) && !IsPotentiallyTruncated(V,
BitWidth);
17891 bool &NeedToExit) {
17892 NeedToExit =
false;
17893 unsigned InitLevel = MaxDepthLevel;
17895 unsigned Level = InitLevel;
17896 if (!collectValuesToDemote(*
Op, IsProfitableToDemoteRoot,
BitWidth,
17897 ToDemote, Visited, NodesToKeepBWs, Level,
17898 IsProfitableToDemote, IsTruncRoot)) {
17899 if (!IsProfitableToDemote)
17902 if (!FinalAnalysis())
17906 MaxDepthLevel = std::max(MaxDepthLevel, Level);
17910 auto AttemptCheckBitwidth =
17913 NeedToExit =
false;
17914 unsigned BestFailBitwidth = 0;
17916 if (Checker(
BitWidth, OrigBitWidth))
17918 if (BestFailBitwidth == 0 && FinalAnalysis())
17922 if (BestFailBitwidth == 0) {
17933 auto TryProcessInstruction =
17939 (void)
for_each(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17944 if (E.UserTreeIndices.size() > 1 &&
17945 !
all_of(E.Scalars, std::bind(IsPotentiallyTruncated, _1,
17948 bool NeedToExit =
false;
17949 if (Checker && !AttemptCheckBitwidth(Checker, NeedToExit))
17953 if (!ProcessOperands(
Operands, NeedToExit))
17962 return IsProfitableToDemote;
17964 switch (E.getOpcode()) {
17968 case Instruction::Trunc:
17969 if (IsProfitableToDemoteRoot)
17970 IsProfitableToDemote =
true;
17971 return TryProcessInstruction(
BitWidth);
17972 case Instruction::ZExt:
17973 case Instruction::SExt:
17974 IsProfitableToDemote =
true;
17975 return TryProcessInstruction(
BitWidth);
17979 case Instruction::Add:
17980 case Instruction::Sub:
17981 case Instruction::Mul:
17982 case Instruction::And:
17983 case Instruction::Or:
17984 case Instruction::Xor: {
17985 return TryProcessInstruction(
17986 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)});
17988 case Instruction::Freeze:
17989 return TryProcessInstruction(
BitWidth, getOperandEntry(&E, 0));
17990 case Instruction::Shl: {
17995 if (isa<PoisonValue>(V))
17997 auto *I = cast<Instruction>(V);
17998 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
17999 return AmtKnownBits.getMaxValue().ult(BitWidth);
18002 return TryProcessInstruction(
18003 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, ShlChecker);
18005 case Instruction::LShr: {
18009 auto LShrChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
18011 if (isa<PoisonValue>(V))
18013 auto *I = cast<Instruction>(V);
18014 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
18015 APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
18016 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
18017 MaskedValueIsZero(I->getOperand(0), ShiftedBits,
18018 SimplifyQuery(*DL));
18021 return TryProcessInstruction(
18022 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
18025 case Instruction::AShr: {
18029 auto AShrChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
18031 if (isa<PoisonValue>(V))
18033 auto *I = cast<Instruction>(V);
18034 KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
18035 unsigned ShiftedBits = OrigBitWidth - BitWidth;
18036 return AmtKnownBits.getMaxValue().ult(BitWidth) &&
18037 ShiftedBits < ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
18041 return TryProcessInstruction(
18042 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)},
18045 case Instruction::UDiv:
18046 case Instruction::URem: {
18048 auto Checker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
18051 auto *I = cast<Instruction>(V);
18052 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
18053 return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
18054 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
18057 return TryProcessInstruction(
18058 BitWidth, {getOperandEntry(&E, 0), getOperandEntry(&E, 1)}, Checker);
18062 case Instruction::Select: {
18063 return TryProcessInstruction(
18064 BitWidth, {getOperandEntry(&E, 1), getOperandEntry(&E, 2)});
18069 case Instruction::PHI: {
18070 const unsigned NumOps = E.getNumOperands();
18073 std::bind(&BoUpSLP::getOperandEntry,
this, &E, _1));
18075 return TryProcessInstruction(
BitWidth, Ops);
18078 case Instruction::Call: {
18079 auto *IC = dyn_cast<IntrinsicInst>(E.getMainOp());
18083 if (
ID != Intrinsic::abs &&
ID != Intrinsic::smin &&
18084 ID != Intrinsic::smax &&
ID != Intrinsic::umin &&
ID != Intrinsic::umax)
18088 auto CompChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
18091 auto *I = cast<Instruction>(V);
18092 if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
18093 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
18094 return MaskedValueIsZero(I->getOperand(0), Mask,
18095 SimplifyQuery(*DL)) &&
18096 MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
18098 assert((
ID == Intrinsic::smin ||
ID == Intrinsic::smax) &&
18099 "Expected min/max intrinsics only.");
18100 unsigned SignBits = OrigBitWidth -
BitWidth;
18106 return SignBits <= Op0SignBits &&
18107 ((SignBits != Op0SignBits &&
18111 SignBits <= Op1SignBits &&
18112 ((SignBits != Op1SignBits &&
18117 auto AbsChecker = [&](
unsigned BitWidth,
unsigned OrigBitWidth) {
18120 auto *I = cast<Instruction>(V);
18121 unsigned SignBits = OrigBitWidth - BitWidth;
18122 APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth - 1);
18123 unsigned Op0SignBits =
18124 ComputeNumSignBits(I->getOperand(0), *DL, 0, AC, nullptr, DT);
18125 return SignBits <= Op0SignBits &&
18126 ((SignBits != Op0SignBits &&
18127 !isKnownNonNegative(I->getOperand(0), SimplifyQuery(*DL))) ||
18128 MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)));
18131 if (
ID != Intrinsic::abs) {
18132 Operands.push_back(getOperandEntry(&E, 1));
18133 CallChecker = CompChecker;
18135 CallChecker = AbsChecker;
18138 std::numeric_limits<InstructionCost::CostType>::max();
18140 unsigned VF = E.Scalars.size();
18150 if (
Cost < BestCost) {
18156 [[maybe_unused]]
bool NeedToExit;
18157 (void)AttemptCheckBitwidth(Checker, NeedToExit);
18167 return FinalAnalysis();
18174 bool IsStoreOrInsertElt =
18175 VectorizableTree.front()->hasState() &&
18176 (VectorizableTree.front()->getOpcode() == Instruction::Store ||
18177 VectorizableTree.front()->getOpcode() == Instruction::InsertElement);
18178 if ((IsStoreOrInsertElt || UserIgnoreList) &&
18179 ExtraBitWidthNodes.
size() <= 1 &&
18180 (!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
18181 CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
18184 unsigned NodeIdx = 0;
18185 if (IsStoreOrInsertElt && !VectorizableTree.front()->isGather())
18189 if (VectorizableTree[NodeIdx]->
isGather() ||
18190 (NodeIdx == 0 && !VectorizableTree[NodeIdx]->UserTreeIndices.empty()) ||
18191 (NodeIdx != 0 &&
any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18193 return EI.
UserTE->Idx > NodeIdx;
18199 bool IsTruncRoot =
false;
18200 bool IsProfitableToDemoteRoot = !IsStoreOrInsertElt;
18203 if (NodeIdx != 0 &&
18204 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18205 VectorizableTree[NodeIdx]->
getOpcode() == Instruction::Trunc) {
18206 assert(IsStoreOrInsertElt &&
"Expected store/insertelement seeded graph.");
18207 IsTruncRoot =
true;
18209 IsProfitableToDemoteRoot =
true;
18214 if (AnalyzedMinBWVals.
contains(VectorizableTree[NodeIdx]->Scalars.front()))
18218 auto ComputeMaxBitWidth =
18219 [&](
const TreeEntry &E,
bool IsTopRoot,
bool IsProfitableToDemoteRoot,
18220 unsigned Limit,
bool IsTruncRoot,
bool IsSignedCmp) ->
unsigned {
18224 if (E.isGather() && IsTruncRoot && E.UserTreeIndices.size() == 1 &&
18225 !NodesToKeepBWs.
contains(E.Idx) &&
18226 E.Idx > (IsStoreOrInsertElt ? 2u : 1u) &&
18228 return V->hasOneUse() || isa<Constant>(V) ||
18231 const TreeEntry *TE = getTreeEntry(U);
18232 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18233 if (TE == UserTE || !TE)
18235 if (!isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18237 !isa<CastInst, BinaryOperator, FreezeInst, PHINode,
18238 SelectInst>(UserTE->getMainOp()))
18240 unsigned UserTESz = DL->getTypeSizeInBits(
18241 UserTE->Scalars.front()->getType());
18242 auto It = MinBWs.find(TE);
18243 if (It != MinBWs.end() && It->second.first > UserTESz)
18245 return DL->getTypeSizeInBits(U->getType()) > UserTESz;
18249 const TreeEntry *UserTE = E.UserTreeIndices.back().UserTE;
18250 auto It = MinBWs.
find(UserTE);
18251 if (It != MinBWs.
end())
18252 return It->second.first;
18253 unsigned MaxBitWidth =
18254 DL->getTypeSizeInBits(UserTE->Scalars.front()->getType());
18255 MaxBitWidth =
bit_ceil(MaxBitWidth);
18256 if (MaxBitWidth < 8 && MaxBitWidth > 1)
18258 return MaxBitWidth;
18264 unsigned VF = E.getVectorFactor();
18265 Type *ScalarTy = E.Scalars.front()->getType();
18267 auto *TreeRootIT = dyn_cast<IntegerType>(ScalarTy->
getScalarType());
18272 [&](
Value *V) { return AnalyzedMinBWVals.contains(V); }))
18281 unsigned MaxBitWidth = 1u;
18289 bool IsKnownPositive = !IsSignedCmp &&
all_of(E.Scalars, [&](
Value *R) {
18290 if (isa<PoisonValue>(R))
18292 KnownBits Known = computeKnownBits(R, *DL);
18293 return Known.isNonNegative();
18298 for (
Value *Root : E.Scalars) {
18299 if (isa<PoisonValue>(Root))
18304 unsigned BitWidth1 = NumTypeBits - NumSignBits;
18320 if (!IsKnownPositive)
18324 unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18326 std::max<unsigned>(std::min(BitWidth1, BitWidth2), MaxBitWidth);
18329 if (MaxBitWidth < 8 && MaxBitWidth > 1)
18334 if (NumParts > 1 &&
18340 unsigned Opcode = E.getOpcode();
18341 bool IsProfitableToDemote = Opcode == Instruction::Trunc ||
18342 Opcode == Instruction::SExt ||
18343 Opcode == Instruction::ZExt || NumParts > 1;
18348 unsigned MaxDepthLevel = IsTruncRoot ? Limit : 1;
18349 bool NeedToDemote = IsProfitableToDemote;
18351 if (!collectValuesToDemote(E, IsProfitableToDemoteRoot, MaxBitWidth,
18352 ToDemote, Visited, NodesToKeepBWs, MaxDepthLevel,
18353 NeedToDemote, IsTruncRoot) ||
18354 (MaxDepthLevel <= Limit &&
18355 !(((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
18356 (!IsTopRoot || !(IsStoreOrInsertElt || UserIgnoreList) ||
18357 DL->getTypeSizeInBits(TreeRootIT) /
18358 DL->getTypeSizeInBits(
18359 E.getMainOp()->getOperand(0)->getType()) >
18363 MaxBitWidth =
bit_ceil(MaxBitWidth);
18365 return MaxBitWidth;
18372 if (UserIgnoreList &&
18373 isa<IntegerType>(VectorizableTree.front()->Scalars.front()->getType())) {
18376 if (
all_of(*UserIgnoreList,
18378 return isa<PoisonValue>(V) ||
18379 cast<Instruction>(V)->getOpcode() == Instruction::Add;
18381 VectorizableTree.front()->State == TreeEntry::Vectorize &&
18382 VectorizableTree.front()->getOpcode() == Instruction::ZExt &&
18383 cast<CastInst>(VectorizableTree.front()->getMainOp())->getSrcTy() ==
18385 ReductionBitWidth = 1;
18387 for (
Value *V : *UserIgnoreList) {
18388 if (isa<PoisonValue>(V))
18391 TypeSize NumTypeBits =
DL->getTypeSizeInBits(V->getType());
18392 unsigned BitWidth1 = NumTypeBits - NumSignBits;
18395 unsigned BitWidth2 = BitWidth1;
18398 BitWidth2 = Mask.getBitWidth() - Mask.countl_zero();
18400 ReductionBitWidth =
18401 std::max(std::min(BitWidth1, BitWidth2), ReductionBitWidth);
18403 if (ReductionBitWidth < 8 && ReductionBitWidth > 1)
18404 ReductionBitWidth = 8;
18406 ReductionBitWidth =
bit_ceil(ReductionBitWidth);
18409 bool IsTopRoot = NodeIdx == 0;
18410 while (NodeIdx < VectorizableTree.size() &&
18411 VectorizableTree[NodeIdx]->State == TreeEntry::Vectorize &&
18412 VectorizableTree[NodeIdx]->getOpcode() == Instruction::Trunc) {
18415 IsTruncRoot =
true;
18417 bool IsSignedCmp =
false;
18418 while (NodeIdx < VectorizableTree.size()) {
18420 unsigned Limit = 2;
18422 ReductionBitWidth ==
18423 DL->getTypeSizeInBits(
18424 VectorizableTree.front()->Scalars.front()->getType()))
18426 unsigned MaxBitWidth = ComputeMaxBitWidth(
18427 *VectorizableTree[NodeIdx], IsTopRoot, IsProfitableToDemoteRoot, Limit,
18428 IsTruncRoot, IsSignedCmp);
18429 if (ReductionBitWidth != 0 && (IsTopRoot || !RootDemotes.
empty())) {
18430 if (MaxBitWidth != 0 && ReductionBitWidth < MaxBitWidth)
18431 ReductionBitWidth =
bit_ceil(MaxBitWidth);
18432 else if (MaxBitWidth == 0)
18433 ReductionBitWidth = 0;
18436 for (
unsigned Idx : RootDemotes) {
18439 DL->getTypeSizeInBits(V->getType()->getScalarType());
18440 if (OrigBitWidth > MaxBitWidth) {
18448 RootDemotes.clear();
18450 IsProfitableToDemoteRoot =
true;
18452 if (ExtraBitWidthNodes.
empty()) {
18453 NodeIdx = VectorizableTree.size();
18455 unsigned NewIdx = 0;
18457 NewIdx = *ExtraBitWidthNodes.
begin();
18458 ExtraBitWidthNodes.
erase(ExtraBitWidthNodes.
begin());
18459 }
while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.
empty());
18462 NodeIdx < VectorizableTree.size() &&
18463 any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
18466 EI.
UserTE->getOpcode() == Instruction::Trunc &&
18467 !EI.
UserTE->isAltShuffle();
18470 NodeIdx < VectorizableTree.size() &&
18472 VectorizableTree[NodeIdx]->UserTreeIndices,
18474 return (EI.
UserTE->hasState() &&
18475 EI.
UserTE->getOpcode() == Instruction::ICmp) &&
18477 auto *IC = dyn_cast<ICmpInst>(V);
18480 !isKnownNonNegative(IC->getOperand(0),
18481 SimplifyQuery(*DL)) ||
18482 !isKnownNonNegative(IC->getOperand(1),
18483 SimplifyQuery(*DL)));
18490 if (MaxBitWidth == 0 ||
18492 cast<IntegerType>(TreeRoot.
front()->getType()->getScalarType())
18494 if (UserIgnoreList)
18502 for (
unsigned Idx : ToDemote) {
18503 TreeEntry *TE = VectorizableTree[
Idx].get();
18506 bool IsSigned =
any_of(TE->Scalars, [&](
Value *R) {
18507 if (isa<PoisonValue>(R))
18509 return !isKnownNonNegative(R, SimplifyQuery(*DL));
18527 bool Changed =
runImpl(
F, SE,
TTI, TLI, AA, LI, DT, AC, DB, ORE);
18552 DL = &
F.getDataLayout();
18556 bool Changed =
false;
18562 dbgs() <<
"SLP: Didn't find any vector registers for target, abort.\n");
18567 if (
F.hasFnAttribute(Attribute::NoImplicitFloat))
18570 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing blocks in " <<
F.getName() <<
".\n");
18574 BoUpSLP R(&
F, SE,
TTI, TLI, AA, LI, DT, AC, DB,
DL, ORE_);
18583 for (
auto *BB :
post_order(&
F.getEntryBlock())) {
18588 R.clearReductionData();
18589 collectSeedInstructions(BB);
18592 if (!Stores.
empty()) {
18594 <<
" underlying objects.\n");
18595 Changed |= vectorizeStoreChains(R);
18599 Changed |= vectorizeChainsInBlock(BB, R);
18604 if (!GEPs.
empty()) {
18606 <<
" underlying objects.\n");
18607 Changed |= vectorizeGEPIndices(BB, R);
18612 R.optimizeGatherSequence();
18620 unsigned Idx,
unsigned MinVF,
18625 const unsigned Sz = R.getVectorElementSize(Chain[0]);
18626 unsigned VF = Chain.
size();
18630 *
TTI, cast<StoreInst>(Chain.
front())->getValueOperand()->getType(),
18632 VF < 2 || VF < MinVF) {
18644 for (
Value *V : Chain)
18645 ValOps.
insert(cast<StoreInst>(V)->getValueOperand());
18648 if (
all_of(ValOps, IsaPred<Instruction>) && ValOps.
size() > 1) {
18650 bool IsAllowedSize =
18654 if ((!IsAllowedSize && S && S.getOpcode() != Instruction::Load &&
18655 (!S.getMainOp()->isSafeToRemove() ||
18658 return !isa<ExtractElementInst>(V) &&
18659 (V->getNumUses() > Chain.size() ||
18660 any_of(V->users(), [&](User *U) {
18661 return !Stores.contains(U);
18664 (ValOps.
size() > Chain.size() / 2 && !S)) {
18665 Size = (!IsAllowedSize && S) ? 1 : 2;
18669 if (
R.isLoadCombineCandidate(Chain))
18671 R.buildTree(Chain);
18673 if (
R.isTreeTinyAndNotFullyVectorizable()) {
18674 if (
R.isGathered(Chain.front()) ||
18675 R.isNotScheduled(cast<StoreInst>(Chain.front())->getValueOperand()))
18676 return std::nullopt;
18677 Size =
R.getCanonicalGraphSize();
18680 R.reorderTopToBottom();
18681 R.reorderBottomToTop();
18682 R.transformNodes();
18683 R.buildExternalUses();
18685 R.computeMinimumValueSizes();
18687 Size =
R.getCanonicalGraphSize();
18688 if (S && S.getOpcode() == Instruction::Load)
18696 using namespace ore;
18699 cast<StoreInst>(Chain[0]))
18700 <<
"Stores SLP vectorized with cost " <<
NV(
"Cost",
Cost)
18701 <<
" and with tree size "
18702 <<
NV(
"TreeSize",
R.getTreeSize()));
18716 Sizes.begin(), Sizes.end(),
static_cast<uint64_t>(0),
18717 [&](
uint64_t V,
const std::pair<unsigned, unsigned> &Val) {
18718 unsigned Size = First ? Val.first : Val.second;
18730 Sizes.begin(), Sizes.end(),
static_cast<uint64_t>(0),
18731 [&](
uint64_t V,
const std::pair<unsigned, unsigned> &Val) {
18732 unsigned P = First ? Val.first : Val.second;
18735 return V + (P - Mean) * (P - Mean);
18738 return Dev * 81 / (Mean * Mean) == 0;
18741bool SLPVectorizerPass::vectorizeStores(
18743 DenseSet<std::tuple<Value *, Value *, Value *, Value *, unsigned>>
18748 bool Changed =
false;
18750 struct StoreDistCompare {
18751 bool operator()(
const std::pair<unsigned, int> &Op1,
18752 const std::pair<unsigned, int> &Op2)
const {
18753 return Op1.second < Op2.second;
18758 using StoreIndexToDistSet =
18759 std::set<std::pair<unsigned, int>, StoreDistCompare>;
18760 auto TryToVectorize = [&](
const StoreIndexToDistSet &
Set) {
18765 if (
Operands.empty() ||
Data.second - PrevDist == 1) {
18767 PrevDist =
Data.second;
18768 if (
Idx !=
Set.size() - 1)
18773 Operands.push_back(Stores[DataVar.first]);
18774 PrevDist = DataVar.second;
18779 .
insert({Operands.front(),
18780 cast<StoreInst>(Operands.front())->getValueOperand(),
18782 cast<StoreInst>(Operands.back())->getValueOperand(),
18787 unsigned MaxVecRegSize =
R.getMaxVecRegSize();
18788 unsigned EltSize =
R.getVectorElementSize(
Operands[0]);
18792 std::min(
R.getMaximumVF(EltSize, Instruction::Store), MaxElts);
18794 Type *StoreTy =
Store->getValueOperand()->getType();
18795 Type *ValueTy = StoreTy;
18796 if (
auto *Trunc = dyn_cast<TruncInst>(
Store->getValueOperand()))
18797 ValueTy = Trunc->getSrcTy();
18798 unsigned MinVF = std::max<unsigned>(
18800 R.getMinVF(
DL->getTypeStoreSizeInBits(StoreTy)), StoreTy,
18803 if (MaxVF < MinVF) {
18804 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorization infeasible as MaxVF (" << MaxVF
18806 <<
"MinVF (" << MinVF <<
")\n");
18810 unsigned NonPowerOf2VF = 0;
18815 unsigned CandVF = std::clamp<unsigned>(
Operands.size(), MinVF, MaxVF);
18817 NonPowerOf2VF = CandVF;
18818 assert(NonPowerOf2VF != MaxVF &&
18819 "Non-power-of-2 VF should not be equal to MaxVF");
18823 unsigned MaxRegVF = MaxVF;
18825 if (MaxVF < MinVF) {
18826 LLVM_DEBUG(
dbgs() <<
"SLP: Vectorization infeasible as MaxVF (" << MaxVF
18828 <<
"MinVF (" << MinVF <<
")\n");
18834 unsigned Size = MinVF;
18836 VF =
Size > MaxVF ? NonPowerOf2VF :
Size;
18840 unsigned Repeat = 0;
18841 constexpr unsigned MaxAttempts = 4;
18843 for_each(RangeSizes, [](std::pair<unsigned, unsigned> &
P) {
18844 P.first =
P.second = 1;
18847 auto IsNotVectorized = [](
bool First,
18848 const std::pair<unsigned, unsigned> &
P) {
18849 return First ?
P.first > 0 :
P.second > 0;
18851 auto IsVectorized = [](
bool First,
18852 const std::pair<unsigned, unsigned> &
P) {
18853 return First ?
P.first == 0 :
P.second == 0;
18855 auto VFIsProfitable = [](
bool First,
unsigned Size,
18856 const std::pair<unsigned, unsigned> &
P) {
18859 auto FirstSizeSame = [](
unsigned Size,
18860 const std::pair<unsigned, unsigned> &
P) {
18861 return Size ==
P.first;
18865 bool RepeatChanged =
false;
18866 bool AnyProfitableGraph =
false;
18867 for (
unsigned Size : CandidateVFs) {
18868 AnyProfitableGraph =
false;
18869 unsigned StartIdx = std::distance(
18870 RangeSizes.begin(),
18871 find_if(RangeSizes, std::bind(IsNotVectorized,
Size >= MaxRegVF,
18872 std::placeholders::_1)));
18873 while (StartIdx <
End) {
18875 std::distance(RangeSizes.begin(),
18876 find_if(RangeSizes.drop_front(StartIdx),
18877 std::bind(IsVectorized,
Size >= MaxRegVF,
18878 std::placeholders::_1)));
18879 unsigned Sz = EndIdx >=
End ?
End : EndIdx;
18880 for (
unsigned Cnt = StartIdx; Cnt +
Size <= Sz;) {
18882 Size >= MaxRegVF)) {
18889 return cast<StoreInst>(V)
18890 ->getValueOperand()
18892 cast<StoreInst>(Slice.
front())
18893 ->getValueOperand()
18896 "Expected all operands of same type.");
18897 if (!NonSchedulable.empty()) {
18898 auto [NonSchedSizeMax, NonSchedSizeMin] =
18899 NonSchedulable.lookup(Slice.
front());
18900 if (NonSchedSizeMax > 0 && NonSchedSizeMin <=
Size) {
18901 Cnt += NonSchedSizeMax;
18906 std::optional<bool> Res =
18907 vectorizeStoreChain(Slice, R, Cnt, MinVF, TreeSize);
18911 .first->getSecond()
18919 AnyProfitableGraph = RepeatChanged = Changed =
true;
18923 [](std::pair<unsigned, unsigned> &
P) {
18924 P.first = P.second = 0;
18926 if (Cnt < StartIdx + MinVF) {
18927 for_each(RangeSizes.slice(StartIdx, Cnt - StartIdx),
18928 [](std::pair<unsigned, unsigned> &
P) {
18929 P.first = P.second = 0;
18931 StartIdx = Cnt +
Size;
18933 if (Cnt > Sz -
Size - MinVF) {
18935 [](std::pair<unsigned, unsigned> &
P) {
18936 P.first = P.second = 0;
18945 if (
Size > 2 && Res &&
18947 std::bind(VFIsProfitable,
Size >= MaxRegVF, TreeSize,
18948 std::placeholders::_1))) {
18954 if (
Size > MaxRegVF && TreeSize > 1 &&
18956 std::bind(FirstSizeSame, TreeSize,
18957 std::placeholders::_1))) {
18959 while (Cnt != Sz && RangeSizes[Cnt].first == TreeSize)
18965 [&](std::pair<unsigned, unsigned> &
P) {
18966 if (Size >= MaxRegVF)
18967 P.second = std::max(P.second, TreeSize);
18969 P.first = std::max(P.first, TreeSize);
18972 AnyProfitableGraph =
true;
18974 if (StartIdx >=
End)
18976 if (Sz - StartIdx < Size && Sz - StartIdx >= MinVF)
18977 AnyProfitableGraph =
true;
18978 StartIdx = std::distance(
18979 RangeSizes.begin(),
18980 find_if(RangeSizes.drop_front(Sz),
18981 std::bind(IsNotVectorized,
Size >= MaxRegVF,
18982 std::placeholders::_1)));
18988 if (
all_of(RangeSizes, [](
const std::pair<unsigned, unsigned> &
P) {
18989 return P.first == 0 &&
P.second == 0;
18993 if (Repeat >= MaxAttempts ||
18994 (Repeat > 1 && (RepeatChanged || !AnyProfitableGraph)))
18996 constexpr unsigned StoresLimit = 64;
18997 const unsigned MaxTotalNum = std::min<unsigned>(
18999 static_cast<unsigned>(
19002 RangeSizes.begin(),
19003 find_if(RangeSizes, std::bind(IsNotVectorized,
true,
19004 std::placeholders::_1))) +
19006 unsigned VF =
bit_ceil(CandidateVFs.front()) * 2;
19009 CandidateVFs.clear();
19011 CandidateVFs.push_back(Limit);
19012 if (VF > MaxTotalNum || VF >= StoresLimit)
19014 for_each(RangeSizes, [&](std::pair<unsigned, unsigned> &
P) {
19016 P.first = std::max(
P.second,
P.first);
19020 CandidateVFs.push_back(VF);
19067 for (std::pair<unsigned, StoreIndexToDistSet> &Set : SortedStores) {
19069 Stores[
Set.first]->getValueOperand()->getType(),
19070 Stores[
Set.first]->getPointerOperand(),
19071 SI->getValueOperand()->getType(),
SI->getPointerOperand(), *
DL, *SE,
19075 auto It =
Set.second.find(std::make_pair(
Idx, *Diff));
19076 if (It ==
Set.second.end()) {
19077 Set.second.emplace(
Idx, *Diff);
19081 TryToVectorize(
Set.second);
19082 unsigned ItIdx = It->first;
19083 int ItDist = It->second;
19084 StoreIndexToDistSet PrevSet;
19085 copy_if(
Set.second, std::inserter(PrevSet, PrevSet.end()),
19086 [&](
const std::pair<unsigned, int> &Pair) {
19087 return Pair.first > ItIdx;
19089 Set.second.clear();
19091 Set.second.emplace(
Idx, 0);
19094 unsigned StartIdx = ItIdx + 1;
19099 for (
const std::pair<unsigned, int> &Pair :
reverse(PrevSet)) {
19101 if (VectorizedStores.
contains(Stores[Pair.first]))
19103 unsigned BI = Pair.first - StartIdx;
19104 UsedStores.set(BI);
19105 Dists[BI] = Pair.second - ItDist;
19107 for (
unsigned I = StartIdx;
I <
Idx; ++
I) {
19108 unsigned BI =
I - StartIdx;
19109 if (UsedStores.test(BI))
19110 Set.second.emplace(
I, Dists[BI]);
19114 auto &Res = SortedStores.emplace_back();
19116 Res.second.emplace(
Idx, 0);
19118 Type *PrevValTy =
nullptr;
19120 if (
R.isDeleted(SI))
19123 PrevValTy =
SI->getValueOperand()->getType();
19125 if (PrevValTy !=
SI->getValueOperand()->getType()) {
19126 for (
auto &Set : SortedStores)
19127 TryToVectorize(
Set.second);
19128 SortedStores.clear();
19129 PrevValTy =
SI->getValueOperand()->getType();
19131 FillStoresSet(
I, SI);
19135 for (
auto &Set : SortedStores)
19136 TryToVectorize(
Set.second);
19141void SLPVectorizerPass::collectSeedInstructions(
BasicBlock *BB) {
19152 if (
auto *SI = dyn_cast<StoreInst>(&
I)) {
19153 if (!
SI->isSimple())
19163 else if (
auto *
GEP = dyn_cast<GetElementPtrInst>(&
I)) {
19164 if (
GEP->getNumIndices() != 1)
19167 if (isa<Constant>(
Idx))
19171 if (
GEP->getType()->isVectorTy())
19183 LLVM_DEBUG(
dbgs() <<
"SLP: Trying to vectorize a list of length = "
19184 << VL.
size() <<
".\n");
19195 for (
Value *V : VL) {
19196 Type *Ty =
V->getType();
19200 R.getORE()->emit([&]() {
19201 std::string TypeStr;
19205 <<
"Cannot SLP vectorize list: type "
19206 << TypeStr +
" is unsupported by vectorizer";
19213 unsigned Sz =
R.getVectorElementSize(I0);
19214 unsigned MinVF =
R.getMinVF(Sz);
19215 unsigned MaxVF = std::max<unsigned>(
19217 MaxVF = std::min(
R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
19219 R.getORE()->emit([&]() {
19221 <<
"Cannot SLP vectorize list: vectorization factor "
19222 <<
"less than 2 is not supported";
19227 bool Changed =
false;
19228 bool CandidateFound =
false;
19231 unsigned NextInst = 0, MaxInst = VL.size();
19232 for (
unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF;
19240 for (
unsigned I = NextInst;
I < MaxInst; ++
I) {
19241 unsigned ActualVF = std::min(MaxInst -
I, VF);
19246 if (MaxVFOnly && ActualVF < MaxVF)
19248 if ((VF > MinVF && ActualVF < VF) || (VF == MinVF && ActualVF < 2))
19253 for (
Value *V : VL.drop_front(
I)) {
19256 if (
auto *Inst = dyn_cast<Instruction>(V);
19257 !Inst || !
R.isDeleted(Inst)) {
19260 if (
Idx == ActualVF)
19265 if (
Idx != ActualVF)
19268 LLVM_DEBUG(
dbgs() <<
"SLP: Analyzing " << ActualVF <<
" operations "
19272 if (
R.isTreeTinyAndNotFullyVectorizable())
19274 R.reorderTopToBottom();
19275 R.reorderBottomToTop(
19276 !isa<InsertElementInst>(Ops.
front()) &&
19277 !
R.doesRootHaveInTreeUses());
19278 R.transformNodes();
19279 R.buildExternalUses();
19281 R.computeMinimumValueSizes();
19283 CandidateFound =
true;
19284 MinCost = std::min(MinCost,
Cost);
19287 <<
" for VF=" << ActualVF <<
"\n");
19291 cast<Instruction>(Ops[0]))
19292 <<
"SLP vectorized with cost " <<
ore::NV(
"Cost",
Cost)
19293 <<
" and with tree size "
19294 <<
ore::NV(
"TreeSize",
R.getTreeSize()));
19305 if (!Changed && CandidateFound) {
19306 R.getORE()->emit([&]() {
19308 <<
"List vectorization was possible but not beneficial with cost "
19309 <<
ore::NV(
"Cost", MinCost) <<
" >= "
19312 }
else if (!Changed) {
19313 R.getORE()->emit([&]() {
19315 <<
"Cannot SLP vectorize list: vectorization was impossible"
19316 <<
" with available vectorization factors";
19326 if (!isa<BinaryOperator, CmpInst>(
I) || isa<VectorType>(
I->getType()))
19332 auto *Op0 = dyn_cast<Instruction>(
I->getOperand(0));
19333 auto *Op1 = dyn_cast<Instruction>(
I->getOperand(1));
19334 if (!Op0 || !Op1 || Op0->getParent() !=
P || Op1->getParent() !=
P ||
19335 R.isDeleted(Op0) ||
R.isDeleted(Op1))
19342 auto *
A = dyn_cast<BinaryOperator>(Op0);
19343 auto *
B = dyn_cast<BinaryOperator>(Op1);
19345 if (
A &&
B &&
B->hasOneUse()) {
19346 auto *B0 = dyn_cast<BinaryOperator>(
B->getOperand(0));
19347 auto *B1 = dyn_cast<BinaryOperator>(
B->getOperand(1));
19348 if (B0 && B0->getParent() ==
P && !
R.isDeleted(B0))
19350 if (B1 && B1->getParent() ==
P && !
R.isDeleted(B1))
19354 if (
B &&
A &&
A->hasOneUse()) {
19355 auto *A0 = dyn_cast<BinaryOperator>(
A->getOperand(0));
19356 auto *A1 = dyn_cast<BinaryOperator>(
A->getOperand(1));
19357 if (A0 && A0->getParent() ==
P && !
R.isDeleted(A0))
19359 if (A1 && A1->getParent() ==
P && !
R.isDeleted(A1))
19363 if (Candidates.
size() == 1)
19364 return tryToVectorizeList({Op0, Op1},
R);
19367 std::optional<int> BestCandidate =
R.findBestRootPair(Candidates);
19368 if (!BestCandidate)
19370 return tryToVectorizeList(
19371 {Candidates[*BestCandidate].first, Candidates[*BestCandidate].second},
R);
19405 ReductionOpsListType ReductionOps;
19415 bool IsSupportedHorRdxIdentityOp =
false;
19426 return isa<SelectInst>(
I) &&
19432 if (Kind == RecurKind::None)
19440 if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
19444 return I->getFastMathFlags().noNaNs();
19447 if (Kind == RecurKind::FMaximum || Kind == RecurKind::FMinimum)
19450 return I->isAssociative();
19459 return I->getOperand(2);
19460 return I->getOperand(
Index);
19467 case RecurKind::Or: {
19475 case RecurKind::And: {
19483 case RecurKind::Add:
19484 case RecurKind::Mul:
19485 case RecurKind::Xor:
19486 case RecurKind::FAdd:
19487 case RecurKind::FMul: {
19492 case RecurKind::SMax:
19493 case RecurKind::SMin:
19494 case RecurKind::UMax:
19495 case RecurKind::UMin:
19502 case RecurKind::FMax:
19503 case RecurKind::FMin:
19504 case RecurKind::FMaximum:
19505 case RecurKind::FMinimum: {
19518 const ReductionOpsListType &ReductionOps) {
19519 bool UseSelect = ReductionOps.size() == 2 ||
19521 (ReductionOps.size() == 1 &&
19522 any_of(ReductionOps.front(), IsaPred<SelectInst>));
19523 assert((!UseSelect || ReductionOps.size() != 2 ||
19524 isa<SelectInst>(ReductionOps[1][0])) &&
19525 "Expected cmp + select pairs for reduction");
19528 if (
auto *Sel = dyn_cast<SelectInst>(
Op)) {
19542 auto *
I = dyn_cast<Instruction>(V);
19544 return RecurKind::None;
19546 return RecurKind::Add;
19548 return RecurKind::Mul;
19551 return RecurKind::And;
19554 return RecurKind::Or;
19556 return RecurKind::Xor;
19558 return RecurKind::FAdd;
19560 return RecurKind::FMul;
19563 return RecurKind::FMax;
19565 return RecurKind::FMin;
19568 return RecurKind::FMaximum;
19570 return RecurKind::FMinimum;
19576 return RecurKind::SMax;
19578 return RecurKind::SMin;
19580 return RecurKind::UMax;
19582 return RecurKind::UMin;
19584 if (
auto *
Select = dyn_cast<SelectInst>(
I)) {
19606 if (!isa<ExtractElementInst>(
RHS) ||
19608 return RecurKind::None;
19610 if (!isa<ExtractElementInst>(
LHS) ||
19612 return RecurKind::None;
19614 if (!isa<ExtractElementInst>(
LHS) || !isa<ExtractElementInst>(
RHS))
19615 return RecurKind::None;
19619 return RecurKind::None;
19624 return RecurKind::None;
19627 return RecurKind::SMax;
19630 return RecurKind::SMin;
19633 return RecurKind::UMax;
19636 return RecurKind::UMin;
19639 return RecurKind::None;
19643 static unsigned getFirstOperandIndex(
Instruction *
I) {
19644 return isCmpSelMinMax(
I) ? 1 : 0;
19650 return isCmpSelMinMax(
I) ? 3 : 2;
19656 if (isCmpSelMinMax(
I) || isBoolLogicOp(
I)) {
19657 auto *Sel = cast<SelectInst>(
I);
19658 auto *
Cmp = dyn_cast<Instruction>(Sel->getCondition());
19659 return Sel->getParent() == BB &&
Cmp &&
Cmp->getParent() == BB;
19661 return I->getParent() == BB;
19665 static bool hasRequiredNumberOfUses(
bool IsCmpSelMinMax,
Instruction *
I) {
19666 if (IsCmpSelMinMax) {
19669 if (
auto *Sel = dyn_cast<SelectInst>(
I))
19670 return Sel->
hasNUses(2) && Sel->getCondition()->hasOneUse();
19671 return I->hasNUses(2);
19675 return I->hasOneUse();
19680 if (isCmpSelMinMax(
I))
19681 ReductionOps.assign(2, ReductionOpsType());
19683 ReductionOps.assign(1, ReductionOpsType());
19688 if (isCmpSelMinMax(
I)) {
19689 ReductionOps[0].emplace_back(cast<SelectInst>(
I)->getCondition());
19690 ReductionOps[1].emplace_back(
I);
19692 ReductionOps[0].emplace_back(
I);
19697 int Sz = Data.size();
19698 auto *
I = dyn_cast<Instruction>(Data.front());
19699 return Sz > 1 ||
isConstant(Data.front()) ||
19710 RdxKind = HorizontalReduction::getRdxKind(Root);
19711 if (!isVectorizable(RdxKind, Root))
19722 if (
auto *Sel = dyn_cast<SelectInst>(Root))
19723 if (!Sel->getCondition()->hasOneUse())
19726 ReductionRoot = Root;
19731 bool IsCmpSelMinMax = isCmpSelMinMax(Root);
19733 1, std::make_pair(Root, 0));
19741 for (
int I :
reverse(seq<int>(getFirstOperandIndex(TreeN),
19742 getNumberOfOperands(TreeN)))) {
19743 Value *EdgeVal = getRdxOperand(TreeN,
I);
19744 ReducedValsToOps[EdgeVal].push_back(TreeN);
19745 auto *EdgeInst = dyn_cast<Instruction>(EdgeVal);
19752 IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
19753 !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
19754 !isVectorizable(RdxKind, EdgeInst) ||
19755 (
R.isAnalyzedReductionRoot(EdgeInst) &&
19756 all_of(EdgeInst->operands(), IsaPred<Constant>))) {
19757 PossibleReducedVals.push_back(EdgeVal);
19760 ReductionOps.push_back(EdgeInst);
19771 PossibleReducedVals;
19772 initReductionOps(Root);
19776 auto GenerateLoadsSubkey = [&](
size_t Key,
LoadInst *LI) {
19780 if (!LoadKeyUsed.
insert(Key).second) {
19781 auto LIt = LoadsMap.
find(std::make_pair(Key,
Ptr));
19782 if (LIt != LoadsMap.
end()) {
19783 for (
LoadInst *RLI : LIt->second) {
19789 for (
LoadInst *RLI : LIt->second) {
19796 if (LIt->second.size() > 2) {
19798 hash_value(LIt->second.back()->getPointerOperand());
19804 .first->second.push_back(LI);
19808 while (!Worklist.empty()) {
19809 auto [TreeN, Level] = Worklist.pop_back_val();
19812 CheckOperands(TreeN, PossibleRedVals, PossibleReductionOps, Level);
19813 addReductionOps(TreeN);
19816 for (
Value *V : PossibleRedVals) {
19820 ++PossibleReducedVals[
Key][
Idx]
19821 .
insert(std::make_pair(V, 0))
19825 Worklist.emplace_back(
I,
I->getParent() == BB ? 0 : Level + 1);
19827 auto PossibleReducedValsVect = PossibleReducedVals.
takeVector();
19830 for (
auto &PossibleReducedVals : PossibleReducedValsVect) {
19831 auto PossibleRedVals = PossibleReducedVals.second.
takeVector();
19833 for (
auto It = PossibleRedVals.begin(),
E = PossibleRedVals.end();
19836 auto RedValsVect = It->second.takeVector();
19838 for (
const std::pair<Value *, unsigned> &Data : RedValsVect)
19839 PossibleRedValsVect.
back().append(Data.second, Data.first);
19841 stable_sort(PossibleRedValsVect, [](
const auto &P1,
const auto &P2) {
19842 return P1.size() > P2.size();
19847 (!isGoodForReduction(Data) &&
19848 (!isa<LoadInst>(Data.front()) ||
19849 !isa<LoadInst>(ReducedVals[NewIdx].front()) ||
19851 cast<LoadInst>(Data.front())->getPointerOperand()) !=
19853 cast<LoadInst>(ReducedVals[NewIdx].front())
19855 NewIdx = ReducedVals.
size();
19858 ReducedVals[NewIdx].
append(Data.rbegin(), Data.rend());
19873 constexpr unsigned RegMaxNumber = 4;
19874 constexpr unsigned RedValsMaxNumber = 128;
19878 if (
unsigned NumReducedVals = std::accumulate(
19879 ReducedVals.
begin(), ReducedVals.
end(), 0,
19881 if (!isGoodForReduction(Vals))
19883 return Num + Vals.size();
19885 NumReducedVals < ReductionLimit &&
19889 for (ReductionOpsType &RdxOps : ReductionOps)
19890 for (
Value *RdxOp : RdxOps)
19891 V.analyzedReductionRoot(cast<Instruction>(RdxOp));
19902 ReducedVals.
front().size());
19906 auto &&GetCmpForMinMaxReduction = [](
Instruction *RdxRootInst) {
19907 assert(isa<SelectInst>(RdxRootInst) &&
19908 "Expected min/max reduction to have select root instruction");
19909 Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
19910 assert(isa<Instruction>(ScalarCond) &&
19911 "Expected min/max reduction to have compare condition");
19912 return cast<Instruction>(ScalarCond);
19915 bool AnyBoolLogicOp =
any_of(ReductionOps.back(), [](
Value *V) {
19916 return isBoolLogicOp(cast<Instruction>(V));
19919 auto GetNewVectorizedTree = [&](
Value *VectorizedTree,
Value *Res) {
19920 if (VectorizedTree) {
19923 cast<Instruction>(ReductionOps.front().front())->getDebugLoc());
19924 if (AnyBoolLogicOp) {
19925 auto It = ReducedValsToOps.
find(VectorizedTree);
19926 auto It1 = ReducedValsToOps.
find(Res);
19927 if ((It == ReducedValsToOps.
end() && It1 == ReducedValsToOps.
end()) ||
19929 (It != ReducedValsToOps.
end() &&
19931 return isBoolLogicOp(I) &&
19932 getRdxOperand(I, 0) == VectorizedTree;
19936 (It1 != ReducedValsToOps.
end() &&
19938 return isBoolLogicOp(I) && getRdxOperand(I, 0) == Res;
19942 VectorizedTree = Builder.
CreateFreeze(VectorizedTree);
19946 return createOp(Builder, RdxKind, VectorizedTree, Res,
"op.rdx",
19953 ReductionOps.front().size());
19954 for (ReductionOpsType &RdxOps : ReductionOps)
19955 for (
Value *RdxOp : RdxOps) {
19958 IgnoreList.insert(RdxOp);
19963 for (
Value *U : IgnoreList)
19964 if (
auto *FPMO = dyn_cast<FPMathOperator>(U))
19965 RdxFMF &= FPMO->getFastMathFlags();
19966 bool IsCmpSelMinMax = isCmpSelMinMax(cast<Instruction>(ReductionRoot));
19971 for (
Value *V : Candidates)
19972 TrackedVals.try_emplace(V, V);
19975 Value *
V) ->
unsigned & {
19976 auto *It = MV.
find(V);
19977 assert(It != MV.
end() &&
"Unable to find given key.");
19986   bool CheckForReusedReductionOps = false;
19991   for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
19993     InstructionsState S = States[I];
19997     for (unsigned Cnt = 0, Sz = OrigReducedVals.size(); Cnt < Sz; ++Cnt) {
19998       Value *RdxVal = TrackedVals.at(OrigReducedVals[Cnt]);
20003       auto *Inst = dyn_cast<Instruction>(RdxVal);
20005           (!S || !S.isOpcodeOrAlt(Inst))) ||
20009       TrackedToOrig.try_emplace(RdxVal, OrigReducedVals[Cnt]);
20011     bool ShuffledExtracts = false;
20013     if (S && S.getOpcode() == Instruction::ExtractElement &&
20014         !S.isAltShuffle() && I + 1 < E) {
20016       for (Value *RV : ReducedVals[I + 1]) {
20017         Value *RdxVal = TrackedVals.at(RV);
20021         auto *Inst = dyn_cast<ExtractElementInst>(RdxVal);
20024         CommonCandidates.push_back(RdxVal);
20025         TrackedToOrig.try_emplace(RdxVal, RV);
20030         Candidates.swap(CommonCandidates);
20031         ShuffledExtracts = true;
20038       Value *OrigV = TrackedToOrig.at(Candidates.front());
20039       ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20041         Res = createOp(Builder, RdxKind, Res, VC, "const.rdx", ReductionOps);
20042         Value *OrigV = TrackedToOrig.at(VC);
20043         ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20044       if (auto *ResI = dyn_cast<Instruction>(Res))
20045         V.analyzedReductionRoot(ResI);
20047       VectorizedTree = GetNewVectorizedTree(VectorizedTree, Res);
20051     unsigned NumReducedVals = Candidates.size();
20052     if (NumReducedVals < ReductionLimit &&
20053         (NumReducedVals < 2 || !isSplat(Candidates)))
20058     IsSupportedHorRdxIdentityOp = RdxKind != RecurKind::Mul &&
20059                                   RdxKind != RecurKind::FMul &&
20060                                   RdxKind != RecurKind::FMulAdd;
20063     if (IsSupportedHorRdxIdentityOp)
20064       for (Value *V : Candidates) {
20065         Value *OrigV = TrackedToOrig.at(V);
20066         ++SameValuesCounter.try_emplace(OrigV).first->second;
20078     bool SameScaleFactor = false;
20079     bool OptReusedScalars = IsSupportedHorRdxIdentityOp &&
20080                             SameValuesCounter.size() != Candidates.size();
20082     if (OptReusedScalars) {
20084           (RdxKind == RecurKind::Add || RdxKind == RecurKind::FAdd ||
20085            RdxKind == RecurKind::Xor) &&
20087           [&SameValuesCounter](const std::pair<Value *, unsigned> &P) {
20088             return P.second == SameValuesCounter.front().second;
20090       Candidates.resize(SameValuesCounter.size());
20091       transform(SameValuesCounter, Candidates.begin(),
20092                 [&](const auto &P) { return TrackedVals.at(P.first); });
20093       NumReducedVals = Candidates.size();
20095       if (NumReducedVals == 1) {
20096         Value *OrigV = TrackedToOrig.at(Candidates.front());
20097         unsigned Cnt = At(SameValuesCounter, OrigV);
20099             emitScaleForReusedOps(Candidates.front(), Builder, Cnt);
20100         VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20101         VectorizedVals.try_emplace(OrigV, Cnt);
20102         ExternallyUsedValues.insert(OrigV);
20107     unsigned MaxVecRegSize = V.getMaxVecRegSize();
20108     unsigned EltSize = V.getVectorElementSize(Candidates[0]);
20109     const unsigned MaxElts = std::clamp<unsigned>(
20111         RegMaxNumber * RedValsMaxNumber);
20113     unsigned ReduxWidth = NumReducedVals;
20114     auto GetVectorFactor = [&, &TTI = *TTI](unsigned ReduxWidth) {
20115       unsigned NumParts, NumRegs;
20116       Type *ScalarTy = Candidates.front()->getType();
20123       while (NumParts > NumRegs) {
20124         assert(ReduxWidth > 0 && "ReduxWidth is unexpectedly 0.");
20125         ReduxWidth = bit_floor(ReduxWidth - 1);
20131       if (NumParts > NumRegs / 2)
20136     ReduxWidth = GetVectorFactor(ReduxWidth);
20137     ReduxWidth = std::min(ReduxWidth, MaxElts);
20139     unsigned Start = 0;
20140     unsigned Pos = Start;
20142     unsigned PrevReduxWidth = ReduxWidth;
20143     bool CheckForReusedReductionOpsLocal = false;
20144     auto AdjustReducedVals = [&](bool IgnoreVL = false) {
20145       bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
20146       if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
20149         CheckForReusedReductionOpsLocal |= IsAnyRedOpGathered;
20152       if (Pos < NumReducedVals - ReduxWidth + 1)
20153         return IsAnyRedOpGathered;
20156       if (ReduxWidth > 1)
20157         ReduxWidth = GetVectorFactor(ReduxWidth);
20158       return IsAnyRedOpGathered;
20160     bool AnyVectorized = false;
20162     while (Pos < NumReducedVals - ReduxWidth + 1 &&
20163            ReduxWidth >= ReductionLimit) {
20166       if (CheckForReusedReductionOpsLocal && PrevReduxWidth != ReduxWidth &&
20168         CheckForReusedReductionOps = true;
20171       PrevReduxWidth = ReduxWidth;
20174       if (IgnoredCandidates.contains(std::make_pair(Pos, ReduxWidth)) ||
20177               std::make_pair(Pos, bit_floor(ReduxWidth))) ||
20179               std::make_pair(Pos + (ReduxWidth - bit_floor(ReduxWidth)),
20181           V.areAnalyzedReductionVals(VL)) {
20182         (void)AdjustReducedVals(/*IgnoreVL=*/true);
20188         auto *RedValI = dyn_cast<Instruction>(RedVal);
20191         return V.isDeleted(RedValI);
20194       V.buildTree(VL, IgnoreList);
20195       if (V.isTreeTinyAndNotFullyVectorizable(/*ForReduction=*/true)) {
20196         if (!AdjustReducedVals())
20197           V.analyzedReductionVals(VL);
20200       if (V.isLoadCombineReductionCandidate(RdxKind)) {
20201         if (!AdjustReducedVals())
20202           V.analyzedReductionVals(VL);
20205       V.reorderTopToBottom();
20207       V.reorderBottomToTop(/*IgnoreReorder=*/true);
20211                                                        ExternallyUsedValues);
20215       LocalExternallyUsedValues.insert(ReductionRoot);
20216       for (unsigned Cnt = 0, Sz = ReducedVals.size(); Cnt < Sz; ++Cnt) {
20217         if (Cnt == I || (ShuffledExtracts && Cnt == I - 1))
20219         for (Value *V : ReducedVals[Cnt])
20220           if (isa<Instruction>(V))
20221             LocalExternallyUsedValues.insert(TrackedVals[V]);
20223       if (!IsSupportedHorRdxIdentityOp) {
20226                "Reused values counter map is not empty");
20227         for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20228           if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20230           Value *V = Candidates[Cnt];
20231           Value *OrigV = TrackedToOrig.at(V);
20232           ++SameValuesCounter.try_emplace(OrigV).first->second;
20235       V.transformNodes();
20239       for (unsigned Cnt = 0; Cnt < NumReducedVals; ++Cnt) {
20240         if (Cnt >= Pos && Cnt < Pos + ReduxWidth)
20242         Value *RdxVal = Candidates[Cnt];
20243         if (auto It = TrackedVals.find(RdxVal); It != TrackedVals.end())
20244           RdxVal = It->second;
20245         if (!Visited.insert(RdxVal).second)
20249         if (!VLScalars.contains(RdxVal) && V.isVectorized(RdxVal)) {
20250           LocalExternallyUsedValues.insert(RdxVal);
20253         Value *OrigV = TrackedToOrig.at(RdxVal);
20255             VectorizedVals.lookup(OrigV) + At(SameValuesCounter, OrigV);
20256         if (NumOps != ReducedValsToOps.at(OrigV).size())
20257           LocalExternallyUsedValues.insert(RdxVal);
20260       if (!IsSupportedHorRdxIdentityOp)
20261         SameValuesCounter.clear();
20262       for (Value *RdxVal : VL)
20263         if (RequiredExtract.contains(RdxVal))
20264           LocalExternallyUsedValues.insert(RdxVal);
20265       V.buildExternalUses(LocalExternallyUsedValues);
20267       V.computeMinimumValueSizes();
20272           getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
20275                           << " for reduction\n");
20279         V.getORE()->emit([&]() {
20281                                           ReducedValsToOps.at(VL[0]).front())
20282                  << "Vectorizing horizontal reduction is possible "
20283                  << "but not beneficial with cost " << ore::NV("Cost", Cost)
20284                  << " and threshold "
20287         if (!AdjustReducedVals()) {
20288           V.analyzedReductionVals(VL);
20289           unsigned Offset = Pos == Start ? Pos : Pos - 1;
20290           if (ReduxWidth > ReductionLimit && V.isTreeNotExtendable()) {
20293                      *TTI, VL.front()->getType(), ReduxWidth - 1);
20294                  VF >= ReductionLimit;
20296                      *TTI, VL.front()->getType(), VF - 1)) {
20298                 V.getCanonicalGraphSize() != V.getTreeSize())
20300               for (unsigned Idx : seq<unsigned>(ReduxWidth - VF))
20308       LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
20309                         << Cost << ". (HorRdx)\n");
20310       V.getORE()->emit([&]() {
20312                                   ReducedValsToOps.at(VL[0]).front())
20313                << "Vectorized horizontal reduction with cost "
20314                << ore::NV("Cost", Cost) << " and with tree size "
20315                << ore::NV("TreeSize", V.getTreeSize());
20322       Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
20324       if (IsCmpSelMinMax)
20325         InsertPt = GetCmpForMinMaxReduction(RdxRootInst);
20328       Value *VectorizedRoot =
20329           V.vectorizeTree(LocalExternallyUsedValues, InsertPt);
20332       for (Value *RdxVal : Candidates) {
20333         Value *OrigVal = TrackedToOrig.at(RdxVal);
20334         Value *TransformedRdxVal = TrackedVals.at(OrigVal);
20335         if (TransformedRdxVal != RdxVal)
20336           TrackedToOrig.try_emplace(TransformedRdxVal, OrigVal);
20345         VectorizedRoot = Builder.CreateFreeze(VectorizedRoot);
20348       if (OptReusedScalars && !SameScaleFactor) {
20349         VectorizedRoot = emitReusedOps(VectorizedRoot, Builder, V,
20350                                        SameValuesCounter, TrackedToOrig);
20353       Value *ReducedSubTree;
20354       Type *ScalarTy = VL.front()->getType();
20355       if (isa<FixedVectorType>(ScalarTy)) {
20360         for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
20378               emitReduction(Lane, Builder, TTI, RdxRootInst->getType()), I);
20381         ReducedSubTree = emitReduction(VectorizedRoot, Builder, TTI,
20384       if (ReducedSubTree->getType() != VL.front()->getType()) {
20385         assert(ReducedSubTree->getType() != VL.front()->getType() &&
20386                "Expected different reduction type.");
20388             Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
20389                                   V.isSignedMinBitwidthRootNode());
20395       if (OptReusedScalars && SameScaleFactor)
20396         ReducedSubTree = emitScaleForReusedOps(
20397             ReducedSubTree, Builder, SameValuesCounter.front().second);
20399       VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
20401       for (Value *RdxVal : VL) {
20402         Value *OrigV = TrackedToOrig.at(RdxVal);
20403         if (IsSupportedHorRdxIdentityOp) {
20404           VectorizedVals.try_emplace(OrigV, At(SameValuesCounter, OrigV));
20407         ++VectorizedVals.try_emplace(OrigV).first->getSecond();
20408         if (!V.isVectorized(RdxVal))
20409           RequiredExtract.insert(RdxVal);
20413       ReduxWidth = NumReducedVals - Pos;
20414       if (ReduxWidth > 1)
20415         ReduxWidth = GetVectorFactor(NumReducedVals - Pos);
20416       AnyVectorized = true;
20418     if (OptReusedScalars && !AnyVectorized) {
20419       for (const std::pair<Value *, unsigned> &P : SameValuesCounter) {
20420         Value *RdxVal = TrackedVals.at(P.first);
20421         Value *RedVal = emitScaleForReusedOps(RdxVal, Builder, P.second);
20422         VectorizedTree = GetNewVectorizedTree(VectorizedTree, RedVal);
20423         VectorizedVals.try_emplace(P.first, P.second);
20428   if (VectorizedTree) {
20449       if (!AnyBoolLogicOp)
20451       if (isBoolLogicOp(RedOp1) && ((!InitStep && LHS == VectorizedTree) ||
20452                                     getRdxOperand(RedOp1, 0) == LHS ||
20455       if (isBoolLogicOp(RedOp2) && ((!InitStep && RHS == VectorizedTree) ||
20456                                     getRdxOperand(RedOp2, 0) == RHS ||
20461       if (LHS != VectorizedTree)
20472       unsigned Sz = InstVals.size();
20475       for (unsigned I = 0, E = (Sz / 2) * 2; I < E; I += 2) {
20478         Value *RdxVal1 = InstVals[I].second;
20479         Value *StableRdxVal1 = RdxVal1;
20480         auto It1 = TrackedVals.find(RdxVal1);
20481         if (It1 != TrackedVals.end())
20482           StableRdxVal1 = It1->second;
20483         Value *RdxVal2 = InstVals[I + 1].second;
20484         Value *StableRdxVal2 = RdxVal2;
20485         auto It2 = TrackedVals.find(RdxVal2);
20486         if (It2 != TrackedVals.end())
20487           StableRdxVal2 = It2->second;
20491         FixBoolLogicalOps(StableRdxVal1, StableRdxVal2, InstVals[I].first,
20493         Value *ExtraRed = createOp(Builder, RdxKind, StableRdxVal1,
20494                                    StableRdxVal2, "op.rdx", ReductionOps);
20495         ExtraReds[I / 2] = std::make_pair(InstVals[I].first, ExtraRed);
20498         ExtraReds[Sz / 2] = InstVals.back();
20502     ExtraReductions.emplace_back(cast<Instruction>(ReductionRoot),
20506     for (Value *RdxVal : Candidates) {
20507       if (!Visited.insert(RdxVal).second)
20509       unsigned NumOps = VectorizedVals.lookup(RdxVal);
20516     bool InitStep = true;
20517     while (ExtraReductions.size() > 1) {
20519           FinalGen(ExtraReductions, InitStep);
20520       ExtraReductions.swap(NewReds);
20523     VectorizedTree = ExtraReductions.front().second;
20525     ReductionRoot->replaceAllUsesWith(VectorizedTree);
20534       IgnoreSet.insert(RdxOps.begin(), RdxOps.end());
20541         for (auto *U : Ignore->users()) {
20543                  "All users must be either in the reduction ops list.");
20546         if (!Ignore->use_empty()) {
20548           Ignore->replaceAllUsesWith(P);
20551     V.removeInstructionsAndOperands(RdxOps);
20553   } else if (!CheckForReusedReductionOps) {
20554     for (ReductionOpsType &RdxOps : ReductionOps)
20555       for (Value *RdxOp : RdxOps)
20556         V.analyzedReductionRoot(cast<Instruction>(RdxOp));
20558   return VectorizedTree;
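// Hedged example of the overall effect of tryToReduce() (simplified and
// target-independent; the actual outcome depends on the cost model):
//
//   %r0 = fadd fast float %a0, %a1        ; scalar reduction chain
//   %r1 = fadd fast float %r0, %a2
//   %r2 = fadd fast float %r1, %a3
//
// may be replaced by a build of <4 x float> from %a0..%a3 followed by
//
//   %r2 = call fast float @llvm.vector.reduce.fadd.v4f32(
//             float -0.0, <4 x float> %v)
//
// with leftover scalars (if the width did not divide evenly) folded back in
// via the "op.rdx" chain created by GetNewVectorizedTree above.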
20568   Type *ScalarTy = ReducedVals.front()->getType();
20569   unsigned ReduxWidth = ReducedVals.size();
20578     int Cnt = ReducedVals.size();
20579     for (Value *RdxVal : ReducedVals) {
20584         Cost += GenCostFn();
20589         auto *RdxOp = cast<Instruction>(U);
20590         if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
20598           Cost += ScalarCost;
20600           Cost += GenCostFn();
20605   case RecurKind::Add:
20606   case RecurKind::Mul:
20607   case RecurKind::Or:
20608   case RecurKind::And:
20609   case RecurKind::Xor:
20610   case RecurKind::FAdd:
20611   case RecurKind::FMul: {
20614     if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
20617       for (unsigned I : seq<unsigned>(ReducedVals.size())) {
20629       auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
20630           std::make_pair(RedTy, true));
20631       if (RType == RedTy) {
20641     ScalarCost = EvaluateScalarCost([&]() {
20646   case RecurKind::FMax:
20647   case RecurKind::FMin:
20648   case RecurKind::FMaximum:
20649   case RecurKind::FMinimum:
20650   case RecurKind::SMax:
20651   case RecurKind::SMin:
20652   case RecurKind::UMax:
20653   case RecurKind::UMin: {
20657     ScalarCost = EvaluateScalarCost([&]() {
20667   LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
20669                     << " (It is a splitting reduction)\n");
20670   return VectorCost - ScalarCost;
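// The return value follows the usual SLP convention: negative means the
// vector form is cheaper. A hedged numeric sketch (invented costs, for
// illustration only): if one vector reduce operation costs 2 and each of the
// four scalar fadds it replaces costs 1, then
//   VectorCost - ScalarCost = 2 - 4 = -2
// and the reduction is reported as profitable against SLPCostThreshold.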
20676   assert(VectorizedValue && "Need to have a vectorized tree node");
20677   assert(RdxKind != RecurKind::FMulAdd &&
20678          "A call to the llvm.fmuladd intrinsic is not handled yet");
20680   auto *FTy = cast<FixedVectorType>(VectorizedValue->getType());
20681   if (FTy->getScalarType() == Builder.getInt1Ty() &&
20682       RdxKind == RecurKind::Add &&
20687         VectorizedValue, Builder.getIntNTy(FTy->getNumElements()));
20688     ++NumVectorInstructions;
20691   ++NumVectorInstructions;
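// Hedged note on the i1 special case above: an add reduction of <N x i1> can
// be emitted as a population count over the mask bits rather than a generic
// reduce call. Illustrative IR, assuming N == 8:
//   %int = bitcast <8 x i1> %mask to i8
//   %cnt = call i8 @llvm.ctpop.i8(i8 %int)
// followed by a cast of %cnt to the requested reduction type.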
20698   assert(IsSupportedHorRdxIdentityOp &&
20699          "The optimization of matched scalar identity horizontal reductions "
20700          "must be supported.");
20702     return VectorizedValue;
20704   case RecurKind::Add: {
20706     Value *Scale = ConstantInt::get(VectorizedValue->getType(), Cnt);
20708                       << VectorizedValue << ". (HorRdx)\n");
20709     return Builder.CreateMul(VectorizedValue, Scale);
20711   case RecurKind::Xor: {
20713     LLVM_DEBUG(dbgs() << "SLP: Xor " << Cnt << " of " << VectorizedValue
20714                       << ". (HorRdx)\n");
20717     return VectorizedValue;
20719   case RecurKind::FAdd: {
20721     Value *Scale = ConstantFP::get(VectorizedValue->getType(), Cnt);
20723                       << VectorizedValue << ". (HorRdx)\n");
20724     return Builder.CreateFMul(VectorizedValue, Scale);
20726   case RecurKind::And:
20727   case RecurKind::Or:
20728   case RecurKind::SMax:
20729   case RecurKind::SMin:
20730   case RecurKind::UMax:
20731   case RecurKind::UMin:
20732   case RecurKind::FMax:
20733   case RecurKind::FMin:
20734   case RecurKind::FMaximum:
20735   case RecurKind::FMinimum:
20737     return VectorizedValue;
20738   case RecurKind::Mul:
20739   case RecurKind::FMul:
20740   case RecurKind::FMulAdd:
20741   case RecurKind::IAnyOf:
20742   case RecurKind::FAnyOf:
20743   case RecurKind::IFindLastIV:
20744   case RecurKind::FFindLastIV:
20745   case RecurKind::None:
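// Hedged example for emitScaleForReusedOps() above: when the same scalar x
// is reduced Cnt times, identity-friendly kinds fold the repetition
// algebraically instead of materializing Cnt lanes:
//   add : x + x + ... + x  ==>  mul  x, Cnt
//   fadd: x + x + ... + x  ==>  fmul x, (double)Cnt   (needs fast-math)
//   xor : even Cnt yields 0, odd Cnt yields x
//   and/or/min/max: duplicates are idempotent and simply dropped.
// Mul/FMul/FMulAdd and the AnyOf/FindLastIV kinds fall through to the
// unsupported path, matching the IsSupportedHorRdxIdentityOp predicate.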
20757   assert(IsSupportedHorRdxIdentityOp &&
20758          "The optimization of matched scalar identity horizontal reductions "
20759          "must be supported.");
20761   auto *VTy = cast<FixedVectorType>(VectorizedValue->getType());
20762   if (VTy->getElementType() != VL.front()->getType()) {
20766         R.isSignedMinBitwidthRootNode());
20769   case RecurKind::Add: {
20772     for (Value *V : VL) {
20773       unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20774       Vals.push_back(ConstantInt::get(V->getType(), Cnt, /*IsSigned=*/false));
20778                       << VectorizedValue << ". (HorRdx)\n");
20779     return Builder.CreateMul(VectorizedValue, Scale);
20781   case RecurKind::And:
20782   case RecurKind::Or:
20785                       << ". (HorRdx)\n");
20786     return VectorizedValue;
20787   case RecurKind::SMax:
20788   case RecurKind::SMin:
20789   case RecurKind::UMax:
20790   case RecurKind::UMin:
20791   case RecurKind::FMax:
20792   case RecurKind::FMin:
20793   case RecurKind::FMaximum:
20794   case RecurKind::FMinimum:
20797                       << ". (HorRdx)\n");
20798     return VectorizedValue;
20799   case RecurKind::Xor: {
20805         cast<FixedVectorType>(VectorizedValue->getType())->getNumElements(),
20807     std::iota(Mask.begin(), Mask.end(), 0);
20808     bool NeedShuffle = false;
20809     for (unsigned I = 0, VF = VL.size(); I < VF; ++I) {
20811       unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20812       if (Cnt % 2 == 0) {
20814         NeedShuffle = true;
20820                dbgs() << "> of " << VectorizedValue << ". (HorRdx)\n");
20824           ConstantVector::getNullValue(VectorizedValue->getType()), Mask);
20825     return VectorizedValue;
20827   case RecurKind::FAdd: {
20830     for (Value *V : VL) {
20831       unsigned Cnt = SameValuesCounter.lookup(TrackedToOrig.at(V));
20832       Vals.push_back(ConstantFP::get(V->getType(), Cnt));
20835     return Builder.CreateFMul(VectorizedValue, Scale);
20837   case RecurKind::Mul:
20838   case RecurKind::FMul:
20839   case RecurKind::FMulAdd:
20840   case RecurKind::IAnyOf:
20841   case RecurKind::FAnyOf:
20842   case RecurKind::IFindLastIV:
20843   case RecurKind::FFindLastIV:
20844   case RecurKind::None:
20854 return HorizontalReduction::getRdxKind(V);
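// Hedged example for emitReusedOps() above: unlike the uniform-scale case,
// each lane may repeat a different number of times, so a constant vector of
// per-lane counts is built. Reducing {a, a, a, b} as an integer add becomes
//   %scaled = mul <2 x i32> %vec, <i32 3, i32 1>
// followed by an ordinary vector reduce over %scaled; for xor, lanes with an
// even repeat count are zeroed out via the shuffle mask instead.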
20857   if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
20858     return cast<FixedVectorType>(IE->getType())->getNumElements();
20860   unsigned AggregateSize = 1;
20861   auto *IV = cast<InsertValueInst>(InsertInst);
20862   Type *CurrentType = IV->getType();
20864     if (auto *ST = dyn_cast<StructType>(CurrentType)) {
20865       for (auto *Elt : ST->elements())
20866         if (Elt != ST->getElementType(0))
20867           return std::nullopt;
20868       AggregateSize *= ST->getNumElements();
20869       CurrentType = ST->getElementType(0);
20870     } else if (auto *AT = dyn_cast<ArrayType>(CurrentType)) {
20871       AggregateSize *= AT->getNumElements();
20872       CurrentType = AT->getElementType();
20873     } else if (auto *VT = dyn_cast<FixedVectorType>(CurrentType)) {
20874       AggregateSize *= VT->getNumElements();
20875       return AggregateSize;
20877       return AggregateSize;
20879   return std::nullopt;
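// Hedged example for getAggregateSize(): the loop above multiplies the
// element counts of nested homogeneous aggregates, so for an insertvalue of
// type [2 x { float, float }] it returns 2 * 2 = 4, while a struct whose
// elements are not all of the same type yields std::nullopt.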
20888                                    unsigned OperandOffset, const BoUpSLP &R) {
20891     std::optional<unsigned> OperandIndex =
20893     if (!OperandIndex || R.isDeleted(LastInsertInst))
20895     if (isa<InsertElementInst, InsertValueInst>(InsertedOperand)) {
20897                             BuildVectorOpds, InsertElts, *OperandIndex, R);
20900       BuildVectorOpds[*OperandIndex] = InsertedOperand;
20901       InsertElts[*OperandIndex] = LastInsertInst;
20903     LastInsertInst = dyn_cast<Instruction>(LastInsertInst->getOperand(0));
20904   } while (LastInsertInst != nullptr &&
20905            isa<InsertValueInst, InsertElementInst>(LastInsertInst) &&
20929   assert((isa<InsertElementInst>(LastInsertInst) ||
20930           isa<InsertValueInst>(LastInsertInst)) &&
20931          "Expected insertelement or insertvalue instruction!");
20934          "Expected empty result vectors!");
20937   if (!AggregateSize)
20939   BuildVectorOpds.resize(*AggregateSize);
20940   InsertElts.resize(*AggregateSize);
20946   if (BuildVectorOpds.size() >= 2)
20964   auto DominatedReduxValue = [&](Value *R) {
20965     return isa<Instruction>(R) &&
20966            DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
20972   if (P->getIncomingBlock(0) == ParentBB) {
20973     Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20974   } else if (P->getIncomingBlock(1) == ParentBB) {
20975     Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20978   if (Rdx && DominatedReduxValue(Rdx))
20991   if (P->getIncomingBlock(0) == BBLatch) {
20992     Rdx = dyn_cast<Instruction>(P->getIncomingValue(0));
20993   } else if (P->getIncomingBlock(1) == BBLatch) {
20994     Rdx = dyn_cast<Instruction>(P->getIncomingValue(1));
20997   if (Rdx && DominatedReduxValue(Rdx))
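// Hedged sketch of the pattern getReductionInstr() matches: a two-operand
// phi whose in-loop incoming value is the reduction update, e.g.
//   loop:
//     %sum = phi float [ 0.0, %entry ], [ %sum.next, %loop ]
//     ...
//     %sum.next = fadd fast float %sum, %elt
// Here %sum.next is returned as the candidate reduction instruction, subject
// to the dominance check performed by DominatedReduxValue above.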
21031   assert((isa<BinaryOperator>(Root) || isa<SelectInst>(Root) ||
21032           isa<IntrinsicInst>(Root)) &&
21033          "Expected binop, select, or intrinsic for reduction matching");
21035       Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root));
21037       Root->getOperand(HorizontalReduction::getFirstOperandIndex(Root) + 1);
21039     return dyn_cast<Instruction>(RHS);
21041     return dyn_cast<Instruction>(LHS);
21048   Value *Op0 = nullptr;
21049   Value *Op1 = nullptr;
21052   return dyn_cast<Instruction>(Op0 == Phi ? Op1 : Op0);
21058   Value *B0 = nullptr, *B1 = nullptr;
21063bool SLPVectorizerPass::vectorizeHorReduction(
21068   bool TryOperandsAsNewSeeds = P && isa<BinaryOperator>(Root);
21070   if (Root->getParent() != BB || isa<PHINode>(Root))
21074   auto SelectRoot = [&]() {
21093   std::queue<std::pair<Instruction *, unsigned>> Stack;
21094   Stack.emplace(SelectRoot(), 0);
21098     if (R.isAnalyzedReductionRoot(Inst))
21103     if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
21105     return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
21107   auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
21108     if (TryOperandsAsNewSeeds && FutureSeed == Root) {
21115     if (!isa<CmpInst, InsertElementInst, InsertValueInst>(FutureSeed))
21120   while (!Stack.empty()) {
21123     std::tie(Inst, Level) = Stack.front();
21128     if (R.isDeleted(Inst))
21130     if (Value *VectorizedV = TryToReduce(Inst)) {
21132       if (auto *I = dyn_cast<Instruction>(VectorizedV)) {
21134         Stack.emplace(I, Level);
21137       if (R.isDeleted(Inst))
21141       if (!TryAppendToPostponedInsts(Inst)) {
21152       if (VisitedInstrs.insert(Op).second)
21153         if (auto *I = dyn_cast<Instruction>(Op))
21156           if (!isa<PHINode, CmpInst, InsertElementInst, InsertValueInst>(I) &&
21157               !R.isDeleted(I) && I->getParent() == BB)
21158             Stack.emplace(I, Level);
21166   bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts);
21167   Res |= tryToVectorize(PostponedInsts, R);
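// Hedged summary of the traversal above: starting from a seed such as
//   store float %r, ptr %p
//   %r = fadd fast float %x, %y        ; %x, %y are themselves fadd chains
// the fadd is first tried as a horizontal-reduction root; if matching fails,
// its instruction operands are pushed on Stack as new seeds (bounded by
// RecursionMaxDepth levels), while cmp/insertelement/insertvalue candidates
// are postponed and handed to regular bundle vectorization afterwards.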
21174   for (Value *V : Insts)
21175     if (auto *Inst = dyn_cast<Instruction>(V); Inst && !R.isDeleted(Inst))
21176       Res |= tryToVectorize(Inst, R);
21180bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
21183   if (!R.canMapToVector(IVI->getType()))
21191   if (MaxVFOnly && BuildVectorOpds.size() == 2) {
21192     R.getORE()->emit([&]() {
21194              << "Cannot SLP vectorize list: only 2 elements of buildvalue, "
21195                 "trying reduction first.";
21199   LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
21201   return tryToVectorizeList(BuildVectorOpds, R, MaxVFOnly);
21211       (all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
21215   if (MaxVFOnly && BuildVectorInsts.size() == 2) {
21216     R.getORE()->emit([&]() {
21218              << "Cannot SLP vectorize list: only 2 elements of buildvector, "
21219                 "trying reduction first.";
21223   LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IEI << "\n");
21224   return tryToVectorizeList(BuildVectorInsts, R, MaxVFOnly);
21227template <typename T>
21232                                   bool MaxVFOnly, BoUpSLP &R) {
21233   bool Changed = false;
21244     auto *I = dyn_cast<Instruction>(*IncIt);
21245     if (!I || R.isDeleted(I)) {
21249     auto *SameTypeIt = IncIt;
21250     while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
21251                                R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21252                                AreCompatible(*SameTypeIt, *IncIt))) {
21253       auto *I = dyn_cast<Instruction>(*SameTypeIt);
21255       if (I && !R.isDeleted(I))
21260     unsigned NumElts = VL.size();
21261     LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
21262                       << NumElts << ")\n");
21272     if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
21275       VL.swap(Candidates);
21276       Candidates.clear();
21278         if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
21284       auto GetMinNumElements = [&R](Value *V) {
21285         unsigned EltSize = R.getVectorElementSize(V);
21286         return std::max(2U, R.getMaxVecRegSize() / EltSize);
21288       if (NumElts < GetMinNumElements(*IncIt) &&
21289           (Candidates.empty() ||
21290            Candidates.front()->getType() == (*IncIt)->getType())) {
21292         if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
21298     if (Candidates.size() > 1 &&
21299         (SameTypeIt == E || (*SameTypeIt)->getType() != (*IncIt)->getType())) {
21300       if (TryToVectorizeHelper(Candidates, /*MaxVFOnly=*/false)) {
21303       } else if (MaxVFOnly) {
21306         for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
21308           auto *I = dyn_cast<Instruction>(*It);
21309           if (!I || R.isDeleted(I)) {
21313           auto *SameTypeIt = It;
21314           while (SameTypeIt != End &&
21315                  (!isa<Instruction>(*SameTypeIt) ||
21316                   R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
21317                   AreCompatible(*SameTypeIt, *It))) {
21318             auto *I = dyn_cast<Instruction>(*SameTypeIt);
21320             if (I && !R.isDeleted(I))
21323           unsigned NumElts = VL.size();
21324           if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
21330         Candidates.clear();
21334     IncIt = SameTypeIt;
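// Hedged walk-through of the helper above: Incoming is pre-sorted with
// Comparator, so AreCompatible-equivalent instructions form contiguous runs.
// Each run is tried as one bundle; e.g. for a sorted worklist
// {add, add, add, sub, sub} the run {add, add, add} is attempted first
// (honoring MaxVFOnly), while runs too short to fill a register accumulate
// in Candidates and are retried together once the element type changes.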
21346template <bool IsCompatibility>
21351          "Expected valid element types only.");
21353     return IsCompatibility;
21354   auto *CI1 = cast<CmpInst>(V);
21355   auto *CI2 = cast<CmpInst>(V2);
21356   if (CI1->getOperand(0)->getType()->getTypeID() <
21358     return !IsCompatibility;
21359   if (CI1->getOperand(0)->getType()->getTypeID() >
21362   if (CI1->getOperand(0)->getType()->getScalarSizeInBits() <
21364     return !IsCompatibility;
21365   if (CI1->getOperand(0)->getType()->getScalarSizeInBits() >
21374   if (BasePred1 < BasePred2)
21375     return !IsCompatibility;
21376   if (BasePred1 > BasePred2)
21379   bool CI1Preds = Pred1 == BasePred1;
21380   bool CI2Preds = Pred2 == BasePred1;
21381   for (int I = 0, E = CI1->getNumOperands(); I < E; ++I) {
21382     auto *Op1 = CI1->getOperand(CI1Preds ? I : E - I - 1);
21383     auto *Op2 = CI2->getOperand(CI2Preds ? I : E - I - 1);
21387       return !IsCompatibility;
21390     if (auto *I1 = dyn_cast<Instruction>(Op1))
21391       if (auto *I2 = dyn_cast<Instruction>(Op2)) {
21392         if (IsCompatibility) {
21393           if (I1->getParent() != I2->getParent())
21400           return NodeI2 != nullptr;
21403         assert((NodeI1 == NodeI2) ==
21405                "Different nodes should have different DFS numbers");
21406         if (NodeI1 != NodeI2)
21410         if (S && (IsCompatibility || !S.isAltShuffle()))
21412         if (IsCompatibility)
21414         if (I1->getOpcode() != I2->getOpcode())
21415           return I1->getOpcode() < I2->getOpcode();
21418   return IsCompatibility;
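// Hedged usage note: the single template above serves two roles, selected by
// the IsCompatibility parameter, as seen at the call sites below:
//   compareCmp<false>(V, V2, TLI, DT)   // strict-weak ordering for sorting
//   compareCmp<true>(V1, V2, TLI, DT)   // equivalence test for bundling
// Returning IsCompatibility on equality lets the same code mean "not less
// than" when sorting and "compatible" when grouping candidates.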
21421template <typename ItT>
21424   bool Changed = false;
21427     if (R.isDeleted(I))
21430     if (auto *RootOp = dyn_cast<Instruction>(Op)) {
21431       Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R);
21432       if (R.isDeleted(I))
21438     if (R.isDeleted(I))
21440     Changed |= tryToVectorize(I, R);
21447     return compareCmp<false>(V, V2, *TLI, *DT);
21450   auto AreCompatibleCompares = [&](Value *V1, Value *V2) {
21453     return compareCmp<true>(V1, V2, *TLI, *DT);
21460   if (Vals.size() <= 1)
21462   Changed |= tryToVectorizeSequence<Value>(
21463       Vals, CompareSorter, AreCompatibleCompares,
21466         bool ArePossiblyReducedInOtherBlock = any_of(Candidates, [](Value *V) {
21468           auto *Select = dyn_cast<SelectInst>(U);
21470                  Select->getParent() != cast<Instruction>(V)->getParent();
21473         if (ArePossiblyReducedInOtherBlock)
21475         return tryToVectorizeList(Candidates, R, MaxVFOnly);
21481bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions,
21483   assert(all_of(Instructions, IsaPred<InsertElementInst, InsertValueInst>) &&
21484          "This function only accepts Insert instructions");
21485   bool OpsChanged = false;
21487   for (auto *I : reverse(Instructions)) {
21489     if (R.isDeleted(I) || isa<CmpInst>(I))
21491     if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21493           vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/true);
21494     } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21496           vectorizeInsertElementInst(LastInsertElem, BB, R, /*MaxVFOnly=*/true);
21499     if (R.isDeleted(I))
21501     OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts);
21502     if (R.isDeleted(I) || isa<CmpInst>(I))
21505     if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I)) {
21507           vectorizeInsertValueInst(LastInsertValue, BB, R, /*MaxVFOnly=*/false);
21508     } else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I)) {
21509       OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R,
21514   OpsChanged |= tryToVectorize(PostponedInsts, R);
21521   bool Changed = false;
21528   auto PHICompare = [this, &PHIToOpcodes](Value *V1, Value *V2) {
21531          "Expected vectorizable types only.");
21539         V2->getType()->getScalarSizeInBits())
21542         V2->getType()->getScalarSizeInBits())
21546     if (Opcodes1.size() < Opcodes2.size())
21548     if (Opcodes1.size() > Opcodes2.size())
21550     for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
21553       auto *I1 = dyn_cast<Instruction>(Opcodes1[I]);
21554       auto *I2 = dyn_cast<Instruction>(Opcodes2[I]);
21559         return NodeI2 != nullptr;
21562       assert((NodeI1 == NodeI2) ==
21564              "Different nodes should have different DFS numbers");
21565       if (NodeI1 != NodeI2)
21568       if (S && !S.isAltShuffle())
21570       return I1->getOpcode() < I2->getOpcode();
21579       bool C1 = isa<Constant>(Opcodes1[I]) && !isa<UndefValue>(Opcodes1[I]);
21580       bool C2 = isa<Constant>(Opcodes2[I]) && !isa<UndefValue>(Opcodes2[I]);
21588       bool U1 = isa<UndefValue>(Opcodes1[I]);
21589       bool U2 = isa<UndefValue>(Opcodes2[I]);
21593       auto ValID1 = Opcodes1[I]->getValueID();
21594       auto ValID2 = Opcodes2[I]->getValueID();
21595       if (ValID1 == ValID2)
21597       if (ValID1 < ValID2)
21599       if (ValID1 > ValID2)
21608       assert(U1 && U2 && "The only thing left should be undef & undef.");
21612   auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
21615     if (V1->getType() != V2->getType())
21619     if (Opcodes1.size() != Opcodes2.size())
21621     for (int I = 0, E = Opcodes1.size(); I < E; ++I) {
21623       if (isa<UndefValue>(Opcodes1[I]) || isa<UndefValue>(Opcodes2[I]))
21625       if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
21626         if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
21627           if (R.isDeleted(I1) || R.isDeleted(I2))
21629           if (I1->getParent() != I2->getParent())
21635       if (isa<Constant>(Opcodes1[I]) && isa<Constant>(Opcodes2[I]))
21637       if (Opcodes1[I]->getValueID() != Opcodes2[I]->getValueID())
21643   bool HaveVectorizedPhiNodes = false;
21648       auto *P = dyn_cast<PHINode>(&I);
21654       if (!VisitedInstrs.count(P) && !R.isDeleted(P) &&
21667       if (!Opcodes.empty())
21671       while (!Nodes.empty()) {
21672         auto *PHI = cast<PHINode>(Nodes.pop_back_val());
21675         for (Value *V : PHI->incoming_values()) {
21676           if (auto *PHI1 = dyn_cast<PHINode>((V))) {
21677             Nodes.push_back(PHI1);
21685     HaveVectorizedPhiNodes = tryToVectorizeSequence<Value>(
21686         Incoming, PHICompare, AreCompatiblePHIs,
21688           return tryToVectorizeList(Candidates, R, MaxVFOnly);
21691     Changed |= HaveVectorizedPhiNodes;
21692     if (HaveVectorizedPhiNodes && any_of(PHIToOpcodes, [&](const auto &P) {
21693           auto *PHI = dyn_cast<PHINode>(P.first);
21694           return !PHI || R.isDeleted(PHI);
21696       PHIToOpcodes.clear();
21698   } while (HaveVectorizedPhiNodes);
21700   VisitedInstrs.clear();
21702   InstSetVector PostProcessInserts;
21706   auto VectorizeInsertsAndCmps = [&](bool VectorizeCmps) {
21707     bool Changed = vectorizeInserts(PostProcessInserts, BB, R);
21708     if (VectorizeCmps) {
21709       Changed |= vectorizeCmpInsts(reverse(PostProcessCmps), BB, R);
21710       PostProcessCmps.clear();
21712     PostProcessInserts.clear();
21717     if (auto *Cmp = dyn_cast<CmpInst>(I))
21718       return PostProcessCmps.contains(Cmp);
21719     return isa<InsertElementInst, InsertValueInst>(I) &&
21720            PostProcessInserts.contains(I);
21726     return I->use_empty() &&
21727            (I->getType()->isVoidTy() || isa<CallInst, InvokeInst>(I));
21732     if (isa<ScalableVectorType>(It->getType()))
21736     if (R.isDeleted(&*It))
21739     if (!VisitedInstrs.insert(&*It).second) {
21740       if (HasNoUsers(&*It) &&
21741           VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator())) {
21751     if (isa<DbgInfoIntrinsic>(It))
21755     if (PHINode *P = dyn_cast<PHINode>(It)) {
21757       if (P->getNumIncomingValues() == 2) {
21760         if (Root && vectorizeRootInstruction(P, Root, BB, R)) {
21769       for (unsigned I : seq<unsigned>(P->getNumIncomingValues())) {
21774         if (BB == P->getIncomingBlock(I) ||
21780         if (auto *PI = dyn_cast<Instruction>(P->getIncomingValue(I));
21781             PI && !IsInPostProcessInstrs(PI)) {
21783               vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R);
21785           if (Res && R.isDeleted(P)) {
21795     if (HasNoUsers(&*It)) {
21796       bool OpsChanged = false;
21797       auto *SI = dyn_cast<StoreInst>(It);
21807       TryToVectorizeRoot |= (I == Stores.end() || I->second.size() == 1) &&
21808                             SI->getValueOperand()->hasOneUse();
21810       if (TryToVectorizeRoot) {
21811         for (auto *V : It->operand_values()) {
21814           if (auto *VI = dyn_cast<Instruction>(V);
21815               VI && !IsInPostProcessInstrs(VI))
21817             OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
21824           VectorizeInsertsAndCmps(/*VectorizeCmps=*/It->isTerminator());
21835     if (isa<InsertElementInst, InsertValueInst>(It))
21836       PostProcessInserts.insert(&*It);
21837     else if (isa<CmpInst>(It))
21838       PostProcessCmps.insert(cast<CmpInst>(&*It));
21845   auto Changed = false;
21846   for (auto &Entry : GEPs) {
21849     if (Entry.second.size() < 2)
21852     LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
21853                       << Entry.second.size() << ".\n");
21861       return !R.isDeleted(GEP);
21863     if (It == Entry.second.end())
21865     unsigned MaxVecRegSize = R.getMaxVecRegSize();
21866     unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
21867     if (MaxVecRegSize < EltSize)
21870     unsigned MaxElts = MaxVecRegSize / EltSize;
21871     for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
21872       auto Len = std::min<unsigned>(BE - BI, MaxElts);
21885       Candidates.remove_if([&R](Value *I) {
21886         return R.isDeleted(cast<Instruction>(I)) ||
21887                isa<Constant>(cast<GetElementPtrInst>(I)->idx_begin()->get());
21895       for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
21896         auto *GEPI = GEPList[I];
21897         if (!Candidates.count(GEPI))
21900         for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
21901           auto *GEPJ = GEPList[J];
21903           if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
21904             Candidates.remove(GEPI);
21905             Candidates.remove(GEPJ);
21906           } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
21907             Candidates.remove(GEPJ);
21914       if (Candidates.size() < 2)
21921       auto BundleIndex = 0u;
21922       for (auto *V : Candidates) {
21923         auto *GEP = cast<GetElementPtrInst>(V);
21924         auto *GEPIdx = GEP->idx_begin()->get();
21925         assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
21926         Bundle[BundleIndex++] = GEPIdx;
21938       Changed |= tryToVectorizeList(Bundle, R);
21944bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
21945   bool Changed = false;
21950     if (V->getValueOperand()->getType()->getTypeID() <
21951         V2->getValueOperand()->getType()->getTypeID())
21953     if (V->getValueOperand()->getType()->getTypeID() >
21954         V2->getValueOperand()->getType()->getTypeID())
21956     if (V->getPointerOperandType()->getTypeID() <
21957         V2->getPointerOperandType()->getTypeID())
21959     if (V->getPointerOperandType()->getTypeID() >
21960         V2->getPointerOperandType()->getTypeID())
21962     if (V->getValueOperand()->getType()->getScalarSizeInBits() <
21963         V2->getValueOperand()->getType()->getScalarSizeInBits())
21965     if (V->getValueOperand()->getType()->getScalarSizeInBits() >
21966         V2->getValueOperand()->getType()->getScalarSizeInBits())
21969     if (auto *I1 = dyn_cast<Instruction>(V->getValueOperand()))
21970       if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
21974             DT->getNode(I2->getParent());
21975         assert(NodeI1 && "Should only process reachable instructions");
21976         assert(NodeI2 && "Should only process reachable instructions");
21977         assert((NodeI1 == NodeI2) ==
21979                "Different nodes should have different DFS numbers");
21980         if (NodeI1 != NodeI2)
21982         return I1->getOpcode() < I2->getOpcode();
21984     return V->getValueOperand()->getValueID() <
21985            V2->getValueOperand()->getValueID();
21997         isa<UndefValue>(V2->getValueOperand()))
22000     if (auto *I2 = dyn_cast<Instruction>(V2->getValueOperand())) {
22001       if (I1->getParent() != I2->getParent())
22006         isa<Constant>(V2->getValueOperand()))
22009            V2->getValueOperand()->getValueID();
22014   for (auto &Pair : Stores) {
22015     if (Pair.second.size() < 2)
22019                       << Pair.second.size() << ".\n");
22028                                         Pair.second.rend());
22029     Changed |= tryToVectorizeSequence<StoreInst>(
22030         ReversedStores, StoreSorter, AreCompatibleStores,
22032           return vectorizeStores(Candidates, R, Attempted);
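// Hedged end-to-end example of the store-chain driver above, seen from the
// source level (illustrative C, subject to cost modeling and target width):
//   void f(int *restrict a, const int *restrict b) {
//     a[0] = b[0] + 1; a[1] = b[1] + 1;
//     a[2] = b[2] + 1; a[3] = b[3] + 1;
//   }
// The four stores sort into one compatible group via StoreSorter /
// AreCompatibleStores, and vectorizeStores() can then emit a single
// <4 x i32> load, add, and store sequence.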
static bool isConstant(const MachineInstr &MI)
AMDGPU Lower Kernel Arguments
AMDGPU Register Bank Select
ReachingDefAnalysis InstSet InstSet & Ignore
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
#define LLVM_DUMP_METHOD
Mark debug helper function definitions like dump() that should not be stripped from debug builds.
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
static APInt getElementIndex(TypeSize ElemSize, APInt &Offset)
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides an implementation of debug counters.
#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
DenseMap< Block *, BlockRelaxAux > Blocks
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
static bool runImpl(Function &F, const TargetLowering &TLI)
This is the interface for a simple mod/ref and alias analysis over globals.
static const HTTPClientCleanup Cleanup
Module.h This file contains the declarations for the Module class.
This defines the Use class.
iv Induction Variable Users
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
static bool isSplat(Value *V)
Return true if V is a splat of a value (which is used when multiplying a matrix with a scalar).
mir Rename Register Operands
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
static bool IsSelect(MachineInstr &MI)
This file defines the PriorityQueue class.
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts, TargetTransformInfo *TTI, bool MustMatchOrInst)
static cl::opt< bool > RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden, cl::desc("Run the SLP vectorization passes"))
static FixedVectorType * getWidenedType(Type *ScalarTy, unsigned VF)
static bool isVectorLikeInstWithConstOps(Value *V)
Checks if V is one of vector-like instructions, i.e.
static std::optional< Value * > calculateRtStride(ArrayRef< Value * > PointerOps, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices, Instruction *Inst=nullptr)
Checks if the provided list of pointers Pointers represents the strided pointers for type ElemTy.
static bool isRepeatedNonIdentityClusteredMask(ArrayRef< int > Mask, unsigned Sz)
Checks if the given mask is a "clustered" mask with the same clusters of size Sz, which are not ident...
static const unsigned MaxPHINumOperands
Maximum allowed number of operands in the PHI nodes.
static cl::opt< int > MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static cl::opt< unsigned > MaxProfitableLoadStride("slp-max-stride", cl::init(8), cl::Hidden, cl::desc("The maximum stride, considered to be profitable."))
static bool findBuildAggregate(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, const BoUpSLP &R)
Recognize construction of vectors like ra = insertelement <4 x float> poison, float s0,...
static bool needToScheduleSingleInstruction(ArrayRef< Value * > VL)
static bool clusterSortPtrAccesses(ArrayRef< Value * > VL, ArrayRef< BasicBlock * > BBs, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
static unsigned getNumElements(Type *Ty)
static SmallBitVector buildUseMask(int VF, ArrayRef< int > Mask, UseMask MaskArg)
Prepares a use bitset for the given mask either for the first argument or for the second.
static bool areCompatibleCmpOps(Value *BaseOp0, Value *BaseOp1, Value *Op0, Value *Op1, const TargetLibraryInfo &TLI)
Checks if the provided operands of 2 cmp instructions are compatible, i.e.
static Value * createInsertVector(IRBuilderBase &Builder, Value *Vec, Value *V, unsigned Index, function_ref< Value *(Value *, Value *, ArrayRef< int >)> Generator={})
Correctly creates insert_subvector, checking that the index is multiple of the subvectors length.
static unsigned getNumElems(unsigned Size, unsigned PartNumElems, unsigned Part)
Returns correct remaining number of elements, considering total amount Size, (power-of-2 number) of e...
static InstructionCost getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={})
Returns the cost of the shuffle instructions with the given Kind, vector type Tp and optional Mask.
static void findBuildAggregate_rec(Instruction *LastInsertInst, TargetTransformInfo *TTI, SmallVectorImpl< Value * > &BuildVectorOpds, SmallVectorImpl< Value * > &InsertElts, unsigned OperandOffset, const BoUpSLP &R)
static bool isSimple(Instruction *I)
static const int MinScheduleRegionSize
If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling regions to be handled.
static cl::opt< unsigned > MinProfitableStridedLoads("slp-min-strided-loads", cl::init(2), cl::Hidden, cl::desc("The minimum number of loads, which should be considered strided, " "if the stride is > 1 or is runtime value"))
static bool isFirstInsertElement(const InsertElementInst *IE1, const InsertElementInst *IE2)
Checks if the IE1 instructions is followed by IE2 instruction in the buildvector sequence.
static SmallBitVector getAltInstrMask(ArrayRef< Value * > VL, unsigned Opcode0, unsigned Opcode1)
static cl::opt< int > LookAheadMaxDepth("slp-max-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for operand reordering scores"))
static cl::opt< unsigned > MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden, cl::desc("Maximum SLP vectorization factor (0=unlimited)"))
static void reorderReuses(SmallVectorImpl< int > &Reuses, ArrayRef< int > Mask)
Reorders the given Reuses mask according to the given Mask.
static void combineOrders(MutableArrayRef< unsigned > Order, ArrayRef< unsigned > SecondaryOrder)
static const unsigned MaxMemDepDistance
static cl::opt< bool > ViewSLPTree("view-slp-tree", cl::Hidden, cl::desc("Display the SLP trees with Graphviz"))
static bool doesInTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst, TargetLibraryInfo *TLI, const TargetTransformInfo *TTI)
static cl::opt< bool > VectorizeNonPowerOf2("slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements."))
static cl::opt< unsigned > MinTreeSize("slp-min-tree-size", cl::init(3), cl::Hidden, cl::desc("Only vectorize small trees if they are fully vectorizable"))
static void reorderOrder(SmallVectorImpl< unsigned > &Order, ArrayRef< int > Mask, bool BottomOrder=false)
Reorders the given Order according to the given Mask.
static unsigned getFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not less than Sz, which forms type,...
static T * performExtractsShuffleAction(MutableArrayRef< std::pair< T *, SmallVector< int > > > ShuffleMask, Value *Base, function_ref< unsigned(T *)> GetVF, function_ref< std::pair< T *, bool >(T *, ArrayRef< int >, bool)> ResizeAction, function_ref< T *(ArrayRef< int >, ArrayRef< T * >)> Action)
Does the analysis of the provided shuffle masks and performs the requested actions on the vectors wit...
static cl::opt< bool > ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions"))
static bool isConstant(Value *V)
static bool isSplat(ArrayRef< Value * > VL)
static cl::opt< int > SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, cl::desc("Only vectorize if you gain more than this " "number "))
static unsigned getPartNumElems(unsigned Size, unsigned NumParts)
Returns power-of-2 number of elements in a single register (part), given the total number of elements...
static bool allConstant(ArrayRef< Value * > VL)
static constexpr int UsesLimit
static std::optional< unsigned > getElementIndex(const Value *Inst, unsigned Offset=0)
static bool isReductionCandidate(Instruction *I)
\Returns true if I is a candidate instruction for reduction vectorization.
static bool checkTreeSizes(ArrayRef< std::pair< unsigned, unsigned > > Sizes, bool First)
Checks if the quadratic mean deviation is less than 90% of the mean size.
static unsigned getShufflevectorNumGroups(ArrayRef< Value * > VL)
static bool isCmpSameOrSwapped(const CmpInst *BaseCI, const CmpInst *CI, const TargetLibraryInfo &TLI)
static cl::opt< bool > SLPSkipEarlyProfitabilityCheck("slp-skip-early-profitability-check", cl::init(false), cl::Hidden, cl::desc("When true, SLP vectorizer bypasses profitability checks based on " "heuristics and makes vectorization decision via cost modeling."))
static std::pair< size_t, size_t > generateKeySubkey(Value *V, const TargetLibraryInfo *TLI, function_ref< hash_code(size_t, LoadInst *)> LoadsSubkeyGenerator, bool AllowAlternate)
Generates key/subkey pair for the given value to provide effective sorting of the values and better d...
static cl::opt< bool > ShouldStartVectorizeHorAtStore("slp-vectorize-hor-store", cl::init(false), cl::Hidden, cl::desc("Attempt to vectorize horizontal reductions feeding into a store"))
static std::pair< InstructionCost, InstructionCost > getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, ArrayRef< Type * > ArgTys)
static void transformScalarShuffleIndiciesToVector(unsigned VecTyNumElements, SmallVectorImpl< int > &Mask)
static cl::opt< bool > SLPReVec("slp-revec", cl::init(false), cl::Hidden, cl::desc("Enable vectorization for wider vector utilization"))
static bool isValidForAlternation(unsigned Opcode)
static SmallVector< Type * > buildIntrinsicArgTypes(const CallInst *CI, const Intrinsic::ID ID, const unsigned VF, unsigned MinBW, const TargetTransformInfo *TTI)
Builds the arguments types vector for the given call instruction with the given ID for the specified ...
static std::optional< unsigned > getExtractIndex(Instruction *E)
static cl::opt< int > RootLookAheadMaxDepth("slp-max-root-look-ahead-depth", cl::init(2), cl::Hidden, cl::desc("The maximum look-ahead depth for searching best rooting option"))
static const unsigned AliasedCheckLimit
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
static void gatherPossiblyVectorizableLoads(const BoUpSLP &R, ArrayRef< Value * > VL, const DataLayout &DL, ScalarEvolution &SE, const TargetTransformInfo &TTI, SmallVectorImpl< SmallVector< std::pair< LoadInst *, int > > > &GatheredLoads, bool AddNew=true)
Tries to find subvector of loads and builds new vector of only loads if can be profitable.
static std::string shortBundleName(ArrayRef< Value * > VL, int Idx=-1)
Print a short descriptor of the instruction bundle suitable for debug output.
static LLVM_DUMP_METHOD void dumpOrder(const BoUpSLP::OrdersType &Order)
static bool isValidElementType(Type *Ty)
Predicate for the element types that the SLP vectorizer supports.
static Instruction * getReductionInstr(const DominatorTree *DT, PHINode *P, BasicBlock *ParentBB, LoopInfo *LI)
Try and get a reduction instruction from a phi node.
static SmallVector< int > calculateShufflevectorMask(ArrayRef< Value * > VL)
static bool allSameType(ArrayRef< Value * > VL)
static MemoryLocation getLocation(Instruction *I)
static bool isCommutative(Instruction *I)
static bool allSameBlock(ArrayRef< Value * > VL)
static unsigned getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns the number of elements of the given type Ty, not greater than Sz, which forms type,...
static bool areTwoInsertFromSameBuildVector(InsertElementInst *VU, InsertElementInst *V, function_ref< Value *(InsertElementInst *)> GetBaseOperand)
Check if two insertelement instructions are from the same buildvector.
static bool arePointersCompatible(Value *Ptr1, Value *Ptr2, const TargetLibraryInfo &TLI, bool CompareOpcodes=true)
static std::pair< InstructionCost, InstructionCost > getGEPCosts(const TargetTransformInfo &TTI, ArrayRef< Value * > Ptrs, Value *BasePtr, unsigned Opcode, TTI::TargetCostKind CostKind, Type *ScalarTy, VectorType *VecTy)
Calculate the scalar and the vector costs from vectorizing set of GEPs.
static SmallBitVector isUndefVector(const Value *V, const SmallBitVector &UseMask={})
Checks if the given value is actually an undefined constant vector.
static bool tryToVectorizeSequence(SmallVectorImpl< T * > &Incoming, function_ref< bool(T *, T *)> Comparator, function_ref< bool(T *, T *)> AreCompatible, function_ref< bool(ArrayRef< T * >, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R)
static InstructionsState getSameOpcode(ArrayRef< Value * > VL, const TargetLibraryInfo &TLI)
static cl::opt< int > ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden, cl::desc("Limit the size of the SLP scheduling region per block"))
Limits the size of scheduling regions in a block.
static Instruction * tryGetSecondaryReductionRoot(PHINode *Phi, Instruction *Root)
We could have an initial reduction that is not an add.
static RecurKind getRdxKind(Value *V)
Gets recurrence kind from the specified value.
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1)
static cl::opt< int > MinVectorRegSizeOption("slp-min-reg-size", cl::init(128), cl::Hidden, cl::desc("Attempt to vectorize for this register size in bits"))
static std::optional< TargetTransformInfo::ShuffleKind > isFixedVectorShuffle(ArrayRef< Value * > VL, SmallVectorImpl< int > &Mask, AssumptionCache *AC)
Checks if the vector of instructions can be represented as a shuffle, like: x0 = extractelement <4 x ...
static std::optional< unsigned > getAggregateSize(Instruction *InsertInst)
static std::optional< unsigned > getInsertExtractIndex(const Value *Inst, unsigned Offset)
static cl::opt< unsigned > RecursionMaxDepth("slp-recursion-max-depth", cl::init(12), cl::Hidden, cl::desc("Limit the recursion depth when building a vectorizable tree"))
static Align computeCommonAlignment(ArrayRef< Value * > VL)
Calculates minimal alignment as a common alignment.
static void addMask(SmallVectorImpl< int > &Mask, ArrayRef< int > SubMask, bool ExtendingManyInputs=false)
Shuffles Mask in accordance with the given SubMask.
static void fixupOrderingIndices(MutableArrayRef< unsigned > Order)
Order may have elements assigned special value (size) which is out of bounds.
static Value * createExtractVector(IRBuilderBase &Builder, Value *Vec, unsigned SubVecVF, unsigned Index)
Correctly creates extract_subvector, checking that the index is multiple of the subvectors length.
static Instruction * getNonPhiOperand(Instruction *I, PHINode *Phi)
Returns the first operand of I that does not match Phi.
static bool compareCmp(Value *V, Value *V2, TargetLibraryInfo &TLI, const DominatorTree &DT)
Compare two cmp instructions.
static bool isReverseOrder(ArrayRef< unsigned > Order)
Check if Order represents reverse order.
static bool isAlternateInstruction(const Instruction *I, const Instruction *MainOp, const Instruction *AltOp, const TargetLibraryInfo &TLI)
Checks if the specified instruction I is an alternate operation for the given MainOp and AltOp instru...
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file implements the SmallBitVector class.
This file defines the SmallPtrSet class.
This file defines the SmallSet class.
This file defines the SmallString class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static SymbolRef::Type getType(const Symbol *Sym)
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
static SmallVector< VPValue *, 4 > getOperands(ArrayRef< VPValue * > Values, unsigned OperandIndex)
static const uint32_t IV[8]
Merges shuffle masks and emits final shuffle instruction, if required.
ShuffleCostEstimator(Type *ScalarTy, TargetTransformInfo &TTI, ArrayRef< Value * > VectorizedVals, BoUpSLP &R, SmallPtrSetImpl< Value * > &CheckedExtracts)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
std::optional< InstructionCost > needToDelay(const TreeEntry *, ArrayRef< SmallVector< const TreeEntry * > >) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
InstructionCost finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
InstructionCost createFreeze(InstructionCost Cost)
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
void add(Value *V1, ArrayRef< int > Mask, bool ForExtracts=false)
Adds another one input vector and the mask for the shuffling.
Merges shuffle masks and emits final shuffle instruction, if required.
void add(Value *V1, ArrayRef< int > Mask, bool=false)
Adds another one input vector and the mask for the shuffling.
void addOrdered(Value *V1, ArrayRef< unsigned > Order)
Adds another one input vector and the mask for the shuffling.
std::optional< Value * > needToDelay(const TreeEntry *E, ArrayRef< SmallVector< const TreeEntry * > > Deps) const
Checks if the specified entry E needs to be delayed because of its dependency nodes.
void add(const TreeEntry &E1, ArrayRef< int > Mask)
Adds single input vector (in form of tree entry) and the mask for its shuffling.
Value * gather(ArrayRef< Value * > VL, unsigned MaskVF=0, Value *Root=nullptr)
void add(Value *V1, Value *V2, ArrayRef< int > Mask)
Adds 2 input vectors and the mask for their shuffling.
Value * createFreeze(Value *V)
ShuffleInstructionBuilder(Type *ScalarTy, IRBuilderBase &Builder, BoUpSLP &R)
void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef< int > Mask)
Adds 2 input vectors (in form of tree entries) and the mask for their shuffling.
Value * adjustExtracts(const TreeEntry *E, MutableArrayRef< int > Mask, ArrayRef< std::optional< TTI::ShuffleKind > > ShuffleKinds, unsigned NumParts, bool &UseVecBaseAsInput)
Adjusts extractelements after reusing them.
Value * finalize(ArrayRef< int > ExtMask, ArrayRef< std::pair< const TreeEntry *, unsigned > > SubVectors, ArrayRef< int > SubVectorsMask, unsigned VF=0, function_ref< void(Value *&, SmallVectorImpl< int > &)> Action={})
Finalize emission of the shuffles.
~ShuffleInstructionBuilder()
A manager for alias analyses.
Class for arbitrary precision integers.
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
void clearBit(unsigned BitPosition)
Set a given bit to 0.
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
APInt urem(const APInt &RHS) const
Unsigned remainder operation.
void clearAllBits()
Set every bit to 0.
void setAllBits()
Set every bit to 1.
void setBits(unsigned loBit, unsigned hiBit)
Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
static APInt getBitsSetFrom(unsigned numBits, unsigned loBit)
Constructs an APInt value that has a contiguous range of bits set.
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
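As a quick orientation for the APInt entries above, a minimal sketch of the listed bit-manipulation calls (the function name apintDemo is illustrative, not from this file):

#include "llvm/ADT/APInt.h"
using namespace llvm;

void apintDemo() {
  APInt Mask = APInt::getZero(8);   // 0b00000000
  Mask.setBits(2, 5);               // 0b00011100: bits [2,5) set
  Mask.setBit(7);                   // 0b10011100
  Mask.clearBit(3);                 // 0b10010100
  APInt All = APInt::getAllOnes(8); // 0b11111111
  APInt Rem = All.urem(Mask);       // unsigned remainder
  bool Empty = Mask.isZero();       // false
  (void)Rem; (void)Empty;
}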
A container for analyses that lazily runs them and caches their results.
PassT::Result * getCachedResult(IRUnitT &IR) const
Get the cached result of an analysis pass for a given IR unit.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
bool equals(ArrayRef RHS) const
equals - Check for element-wise equality.
const T & back() const
back - Get the last element.
ArrayRef< T > take_front(size_t N=1) const
Return a copy of *this with only the first N elements.
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
const T & front() const
front - Get the first element.
size_t size() const
size - Get the array size.
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
bool empty() const
empty - Check if the array is empty.
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
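The ArrayRef accessors above compose as in this small sketch (arrayRefDemo is an illustrative name):

#include "llvm/ADT/ArrayRef.h"
using namespace llvm;

int arrayRefDemo() {
  int Storage[] = {1, 2, 3, 4, 5};
  ArrayRef<int> VL(Storage);
  ArrayRef<int> Tail = VL.drop_front(2); // {3, 4, 5}
  ArrayRef<int> Mid = VL.slice(1, 3);    // {2, 3, 4}
  return VL.front() + Tail.back() + int(Mid.size()); // 1 + 5 + 3 = 9
}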
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
static Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
bool isEHPad() const
Return true if this basic block is an exception handling block.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well formed.
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR changes between queries.
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Base class for all callable instructions (InvokeInst and CallInst); holds everything related to calling a function.
unsigned getBundleOperandsEndIndex() const
Return the index of the last bundle operand in the Use array.
void getOperandBundlesAsDefs(SmallVectorImpl< OperandBundleDef > &Defs) const
Return the list of operand bundles attached to this instruction as a vector of OperandBundleDefs.
bool isNoBuiltin() const
Return true if the call should not be treated as a call to a builtin.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
bool hasIdenticalOperandBundleSchema(const CallBase &Other) const
Return true if Other has the same sequence of operand bundle tags with the same number of operands on each one.
unsigned getBundleOperandsStartIndex() const
Return the index of the first bundle operand in the Use array.
Value * getArgOperand(unsigned i) const
FunctionType * getFunctionType() const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned arg_size() const
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool hasOperandBundles() const
Return true if this User has any operand bundles.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
This class is the base class for the comparison instructions.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ ICMP_UGE
unsigned greater or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ ICMP_ULT
unsigned less than
@ ICMP_SGE
signed greater or equal
@ ICMP_ULE
unsigned less or equal
Predicate getSwappedPredicate() const
For example, EQ->EQ, SLE->SGE, ULT->UGT, OEQ->OEQ, ULE->UGE, OLT->OGT, etc.
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, OGT -> OLE, OLT -> UGE, etc.
Predicate getPredicate() const
Return the predicate for this instruction.
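Swapping and inverting a predicate are distinct operations, as this minimal sketch using the static overloads shows (predDemo is an illustrative name):

#include "llvm/IR/InstrTypes.h"
#include <cassert>
using namespace llvm;

void predDemo() {
  CmpInst::Predicate P = CmpInst::ICMP_SLT;               // a < b
  CmpInst::Predicate S = CmpInst::getSwappedPredicate(P); // b > a
  CmpInst::Predicate I = CmpInst::getInversePredicate(P); // !(a < b)
  assert(S == CmpInst::ICMP_SGT && I == CmpInst::ICMP_SGE);
}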
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign information.
static Constant * getIntToPtr(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate for the type of this constant.
static Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
static Constant * getAllOnesValue(Type *Ty)
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a multiple of 8.
IntegerType * getIndexType(LLVMContext &C, unsigned AddressSpace) const
Returns the type of a GEP index in AddressSpace.
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
static bool shouldExecute(unsigned CounterName)
An analysis that produces DemandedBits for a function.
APInt getDemandedBits(Instruction *I)
Return the bits demanded from instruction I.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
iterator find(const_arg_type_t< KeyT > Val)
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
bool erase(const KeyT &Val)
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
const ValueT & at(const_arg_type_t< KeyT > Val) const
at - Return the entry for the specified key, or abort if no such entry exists.
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
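A sketch of the DenseMap-style calls listed above (mapDemo is an illustrative name):

#include "llvm/ADT/DenseMap.h"
using namespace llvm;

void mapDemo() {
  DenseMap<int, int> M;
  M.try_emplace(1, 10);     // constructs the value in place if the key is new
  M.insert({2, 20});
  int V = M.lookup(3);      // 0: default-constructed value for a missing key
  bool Has = M.contains(2); // true
  M.erase(1);
  (void)V; (void)Has;
}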
Implements a dense probed hash-table based set.
unsigned getDFSNumIn() const
getDFSNumIn/getDFSNumOut - These return the DFS visitation order for nodes in the dominator tree.
Analysis pass which computes a DominatorTree.
void updateDFSNumbers() const
updateDFSNumbers - Assign In and Out numbers to the nodes while walking dominator tree in dfs order.
DomTreeNodeBase< NodeT > * getNode(const NodeT *BB) const
getNode - return the (Post)DominatorTree node for the specified basic block.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
ArrayRef< Type * > params() const
Type * getReturnType() const
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
nodes_iterator operator++()
nodes_iterator(const ItTy &It2)
bool operator!=(const nodes_iterator &N2) const
Common base class shared among various IRBuilders.
CallInst * CreateExtractVector(Type *DstType, Value *SrcVec, Value *Idx, const Twine &Name="")
Create a call to the vector.extract intrinsic.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
ConstantInt * getTrue()
Get the constant value for i1 true.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
BasicBlock::iterator GetInsertPoint() const
Value * CreateFreeze(Value *V, const Twine &Name="")
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
BasicBlock * GetInsertBlock() const
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Value * getAllOnesMask(ElementCount NumElts)
Return an all true boolean vector (mask) with NumElts lanes.
Value * CreateUnOp(Instruction::UnaryOps Opc, Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinaryIntrinsic(Intrinsic::ID ID, Value *LHS, Value *RHS, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 2 operands which is mangled on the first type.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
ConstantInt * getFalse()
Get the constant value for i1 false.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateMaskedGather(Type *Ty, Value *Ptrs, Align Alignment, Value *Mask=nullptr, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Gather intrinsic.
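A compact sketch exercising a few of the creation methods above; buildDemo and the function name "demo" are illustrative, and the single-source CreateShuffleVector overload taking a constant mask is assumed:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
using namespace llvm;

Function *buildDemo(Module &M) {
  LLVMContext &Ctx = M.getContext();
  auto *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  auto *FTy = FunctionType::get(VecTy, {VecTy}, /*isVarArg=*/false);
  Function *F = Function::Create(FTy, Function::ExternalLinkage, "demo", M);
  IRBuilder<> Builder(BasicBlock::Create(Ctx, "entry", F));
  // Overwrite lane 0 with the constant 7, then reverse the four lanes.
  Value *Ins = Builder.CreateInsertElement(F->getArg(0), Builder.getInt32(7),
                                           Builder.getInt64(0));
  Value *Rev = Builder.CreateShuffleVector(Ins, {3, 2, 1, 0});
  Builder.CreateRet(Rev);
  return F;
}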
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
This instruction inserts a single (scalar) element into a VectorType value.
VectorType * getType() const
Overload to return most specific vector type.
This instruction inserts a struct field of array element value into an aggregate value.
static InstructionCost getInvalid(CostType Val=0)
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
void moveAfter(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos lives in, right after MovePos.
bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instruction comes before Other.
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this', or nullptr if no such instruction exists.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdenticalTo(const Instruction *I) const LLVM_READONLY
Return true if the specified instruction is exactly identical to the current one.
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
An instruction for reading from memory.
Value * getPointerOperand()
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
Represents a single loop in the control flow graph.
This class implements a map that also provides access to all stored values in a deterministic order.
VectorType takeVector()
Clear the MapVector and return the underlying vector.
iterator find(const KeyT &Key)
std::pair< iterator, bool > try_emplace(const KeyT &Key, Ts &&...Args)
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
ValueT lookup(const KeyT &Key) const
std::pair< KeyT, ValueT > & front()
This is the common base class for memset/memcpy/memmove.
Representation for a specific memory location.
static MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
const Value * Ptr
The address of the start of the location.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
T & front() const
front - Get the first element.
MutableArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
This is a MutableArrayRef that owns its array.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Value * getIncomingValueForBlock(const BasicBlock *BB) const
BasicBlock * getIncomingBlock(unsigned i) const
Return incoming basic block number i.
unsigned getNumIncomingValues() const
Return the number of incoming edges.
Pass interface - Implemented by all 'passes'.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
A discriminated union of two or more pointer types, with the discriminator in the low bit of the poin...
bool isNull() const
Test if the pointer held in the union is null, regardless of which type it is.
T dyn_cast() const
Returns the current pointer if it is of the specified pointer type, otherwise returns null.
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
void preserveSet()
Mark an analysis set as preserved.
PriorityQueue - This class behaves like std::priority_queue and provides a few additional convenience functions.
unsigned getOpcode() const
static bool isIntMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is an integer min/max kind.
static bool isMinMaxRecurrenceKind(RecurKind Kind)
Returns true if the recurrence kind is any min/max kind.
This class uses information about analyze scalars to rewrite expressions in canonical form.
Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
This class represents an analyzed expression in the program.
bool isZero() const
Return true if the expression is a constant zero.
bool isNonConstantNegative() const
Return true if the specified scev is negated, but not a constant.
Type * getType() const
Return the LLVM type of this SCEV expression.
Analysis pass that exposes the ScalarEvolution for a function.
The main scalar evolution driver.
const SCEV * getConstant(ConstantInt *V)
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetValue(Value *V)
This method should be called by the client when it has changed a value in a way that may affect its value, or which may disconnect it from a previously indicated induction variable.
const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
const SCEV * getUDivExactExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
const SCEV * getAddExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
ArrayRef< value_type > getArrayRef() const
size_type size() const
Determine the number of elements in the SetVector.
const value_type & front() const
Return the first element of the SetVector.
void clear()
Completely clear the SetVector.
bool empty() const
Determine if the SetVector is empty or not.
bool insert(const value_type &X)
Insert a new element into the SetVector.
bool contains(const key_type &key) const
Check if the SetVector contains the given key.
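SetVector combines set membership with deterministic, insertion-ordered iteration, which the sketch below illustrates (setVectorDemo is an illustrative name):

#include "llvm/ADT/SetVector.h"
using namespace llvm;

void setVectorDemo() {
  SetVector<int> SV;
  SV.insert(3);
  SV.insert(1);
  bool Inserted = SV.insert(3); // false: duplicates are rejected
  // Iteration visits elements in insertion order: 3, then 1.
  ArrayRef<int> AsArray = SV.getArrayRef();
  (void)Inserted; (void)AsArray;
}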
This instruction constructs a fixed permutation of two input vectors.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exactly one source vector.
static bool isOneUseSingleSourceMask(ArrayRef< int > Mask, int VF)
Return true if this shuffle mask represents "clustered" mask of size VF, i.e.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossings.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
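The mask predicates above operate on plain index vectors, independent of any particular shuffle instruction; a small sketch (maskDemo is an illustrative name):

#include "llvm/IR/Instructions.h"
using namespace llvm;

void maskDemo() {
  // For a <4 x i32> source, lanes {2, 3} form an extract-subvector mask.
  int Extract[] = {2, 3};
  int Index = -1;
  bool IsExtract =
      ShuffleVectorInst::isExtractSubvectorMask(Extract, /*NumSrcElts=*/4, Index);
  // IsExtract == true, Index == 2.
  int Identity[] = {0, 1, 2, 3};
  bool IsIdentity = ShuffleVectorInst::isIdentityMask(Identity, /*NumSrcElts=*/4);
  (void)IsExtract; (void)IsIdentity;
}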
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
int find_first() const
Returns the index of the first set bit, -1 if none of the bits are set.
bool test(unsigned Idx) const
int find_next(unsigned Prev) const
Returns the index of the next set bit following the "Prev" bit.
bool all() const
Returns true if all bits are set.
size_type size() const
Returns the number of bits in this bitvector.
bool any() const
Returns true if any bit is set.
size_type count() const
Returns the number of bits which are set.
bool none() const
Returns true if none of the bits are set.
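find_first/find_next support the usual set-bit iteration idiom (bitsDemo is an illustrative name):

#include "llvm/ADT/SmallBitVector.h"
using namespace llvm;

void bitsDemo() {
  SmallBitVector BV(8); // 8 bits, all initially clear
  BV.set(1);
  BV.set(5);
  for (int I = BV.find_first(); I != -1; I = BV.find_next(I)) {
    // Visits the set bits in order: 1, then 5.
  }
  bool Any = BV.any(); // true
  bool All = BV.all(); // false
  (void)Any; (void)All;
}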
Implements a dense probed hash-table based set with some number of buckets stored inline.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all SmallPtrSet instances.
bool erase(PtrType Ptr)
Remove pointer from the set.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
A SetVector that performs no allocations if smaller than a certain size.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
bool contains(const T &V) const
Check if the SmallSet contains the given element.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string (e.g. operator+ etc).
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
reverse_iterator rbegin()
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
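A sketch of the growth-related calls above (vecDemo is an illustrative name):

#include "llvm/ADT/SmallVector.h"
#include <iterator>
using namespace llvm;

void vecDemo() {
  SmallVector<int, 8> V; // inline storage for 8 elements; no heap yet
  V.reserve(16);         // may move to heap storage up front
  V.push_back(1);
  V.emplace_back(2);
  int More[] = {3, 4};
  V.append(std::begin(More), std::end(More)); // V == {1, 2, 3, 4}
}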
An instruction for storing to memory.
Type * getPointerOperandType() const
Value * getValueOperand()
Value * getPointerOperand()
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
TargetFolder - Create constants with target dependent folding.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
The instances of the Type class are immutable: once they are created, they are never changed.
unsigned getIntegerBitWidth() const
bool isVectorTy() const
True if this is an instance of VectorType.
bool isX86_FP80Ty() const
Return true if this is x86 long double.
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
bool isPointerTy() const
True if this is an instance of PointerType.
bool isEmptyTy() const
Return true if this type is empty, that is, it has no elements or all of its elements are empty.
unsigned getStructNumElements() const
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
bool isSingleValueType() const
Return true if the type is a valid type for a register in codegen.
void print(raw_ostream &O, bool IsForDebug=false, bool NoDetails=false) const
Print the current type.
bool isPPC_FP128Ty() const
Return true if this is powerpc long double.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
TypeID getTypeID() const
Return the type id for the type.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
bool isVoidTy() const
Return true if this is 'void'.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
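The Type queries above are commonly chained as in this sketch (typeDemo is an illustrative name):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

void typeDemo(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  auto *Vec = FixedVectorType::get(I32, 4);                   // <4 x i32>
  bool IsIntVec = Vec->isIntOrIntVectorTy();                  // true
  Type *Scalar = Vec->getScalarType();                        // i32
  Type *AsFloat = Vec->getWithNewType(Type::getFloatTy(Ctx)); // <4 x float>
  (void)IsIntVec; (void)Scalar; (void)AsFloat;
}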
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
User(Type *ty, unsigned vty, AllocInfo AllocInfo)
Value * getOperand(unsigned i) const
unsigned getNumOperands() const
iterator_range< value_op_iterator > operand_values()
The Vector Function Database.
static SmallVector< VFInfo, 8 > getMappings(const CallInst &CI)
Retrieve all the VFInfo instances associated to the CallInst CI.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
unsigned getValueID() const
Return an ID for the concrete type of this object.
bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
LLVMContext & getContext() const
All values hold a context through their type.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
void takeName(Value *V)
Transfer the name from V to this value.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Type * getElementType() const
Value handle that is nullable, but tries to track the Value.
std::pair< iterator, bool > insert(const ValueT &V)
iterator find(const_arg_type_t< ValueT > V)
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
bool erase(const ValueT &V)
size_type count(const_arg_type_t< ValueT > V) const
Return 1 if the specified key is in the set, 0 otherwise.
constexpr ScalarTy getFixedValue() const
An efficient, type-erasing, non-owning reference to a callable.
An opaque object representing a hash code.
const ParentTy * getParent() const
self_iterator getIterator()
CRTP base class for adapting an iterator to a different type.
A range adaptor for a pair of iterators.
This class implements an extremely fast bulk output stream that can only output to a stream.
raw_ostream & indent(unsigned NumSpaces)
indent - Insert 'NumSpaces' spaces.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to an SmallVector or SmallString.
A helper class used for scoring candidates for two consecutive lanes.
static const int ScoreConsecutiveExtracts
ExtractElementInst from same vector and consecutive indexes.
int getShallowScore(Value *V1, Value *V2, Instruction *U1, Instruction *U2, ArrayRef< Value * > MainAltOps) const
static const int ScoreAllUserVectorized
Score if all users are vectorized.
static const int ScoreSameOpcode
Instructions with the same opcode.
static const int ScoreUndef
Matching with an undef is preferable to failing.
int getScoreAtLevelRec(Value *LHS, Value *RHS, Instruction *U1, Instruction *U2, int CurrLevel, ArrayRef< Value * > MainAltOps) const
Go through the operands of LHS and RHS recursively until MaxLevel, and return the cumulative score.
static const int ScoreFail
Score for failing to find a decent match.
static const int ScoreMaskedGatherCandidate
A load candidate for masked gather.
static const int ScoreSplat
Identical instructions (a.k.a. splat or broadcast).
LookAheadHeuristics(const TargetLibraryInfo &TLI, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R, int NumLanes, int MaxLevel)
static const int ScoreSplatLoads
The same load multiple times.
static const int ScoreReversedLoads
Loads from reversed memory addresses, e.g. load(A[i+1]), load(A[i]).
static const int ScoreConstants
Constants.
static const int ScoreAltOpcodes
Instructions with alt opcodes (e.g, add + sub).
static const int ScoreConsecutiveLoads
Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreReversedExtracts
ExtractElementInst from same vector and reversed indices.
A helper data structure to hold the operands of a vector of instructions.
ValueList getVL(unsigned OpIdx) const
Returns a value vector with the operands across all lanes for the operand at OpIdx.
static LLVM_DUMP_METHOD StringRef getModeStr(ReorderingMode RMode)
VLOperands(ArrayRef< Value * > RootVL, const InstructionsState &S, const BoUpSLP &R)
Initialize with all the operands of the instruction vector RootVL.
static LLVM_DUMP_METHOD void dumpMode(ReorderingMode RMode)
Debug print.
LLVM_DUMP_METHOD void dump() const
Debug print.
friend raw_ostream & operator<<(raw_ostream &OS, ReorderingMode RMode)
static LLVM_DUMP_METHOD raw_ostream & printMode(ReorderingMode RMode, raw_ostream &OS)
LLVM_DUMP_METHOD raw_ostream & print(raw_ostream &OS) const
Bottom Up SLP Vectorizer.
SmallVector< unsigned, 4 > OrdersType
std::optional< std::pair< Type *, bool > > getRootNodeTypeWithNoCast() const
Returns the type/is-signed info for the root node in the graph without casting.
std::optional< OrdersType > findPartiallyOrderedLoads(const TreeEntry &TE)
Sort loads into increasing pointers offsets to allow greater clustering.
LoadsState
Tracks the state we can represent the loads in the given sequence.
friend raw_ostream & operator<<(raw_ostream &os, const BoUpSLP::ScheduleData &SD)
void reorderTopToBottom()
Reorders the current graph to the most profitable order starting from the root node to the leaf nodes.
void reorderBottomToTop(bool IgnoreReorder=false)
Reorders the current graph to the most profitable order starting from leaves to the root.
void registerNonVectorizableLoads(ArrayRef< T * > VL)
Registers non-vectorizable sequence of loads.
unsigned getTreeSize() const
bool areKnownNonVectorizableLoads(ArrayRef< T * > VL) const
Checks if the given sequence of loads is already known to be non-vectorizable.
unsigned getCanonicalGraphSize() const
Returns the base graph size, before any transformations.
bool areAnalyzedReductionVals(ArrayRef< Value * > VL) const
Checks if the provided list of reduced values was checked already for vectorization.
LoadsState canVectorizeLoads(ArrayRef< Value * > VL, const Value *VL0, SmallVectorImpl< unsigned > &Order, SmallVectorImpl< Value * > &PointerOps, unsigned *BestVF=nullptr, bool TryRecursiveCheck=true) const
Checks if the given array of loads can be represented as a vectorized, scatter or just simple gather.
bool isLoadCombineCandidate(ArrayRef< Value * > Stores) const
Assume that a vector of stores of bitwise-or/shifted/zexted loaded values can be load combined in the backend.
void analyzedReductionVals(ArrayRef< Value * > VL)
Adds the list of reduced values to list of already checked values for the vectorization.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const
Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values can be load combined in the backend.
unsigned getVectorElementSize(Value *V)
bool isSignedMinBitwidthRootNode() const
Checks if the root graph node can be emitted with a narrower bitwidth at codegen and returns its signedness.
void analyzedReductionRoot(Instruction *I)
Register given instruction as already analyzed for being possible reduction root.
ArrayRef< Value * > getRootNodeScalars() const
Return the scalars of the root node.
void computeMinimumValueSizes()
Compute the minimum type sizes required to represent the entries in a vectorizable tree.
void deleteTree()
Clear the internal data structures that are created by 'buildTree'.
InstructionCost getTreeCost(ArrayRef< Value * > VectorizedVals={})
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
SmallPtrSet< Value *, 16 > ValueSet
SmallDenseSet< Value *, 4 > ExtraValueToDebugLocsMap
BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, const DataLayout *DL, OptimizationRemarkEmitter *ORE)
bool isNotScheduled(const Value *V) const
Checks if the specified value was not scheduled.
void transformNodes()
Transforms graph nodes to target specific representations, if profitable.
bool isDeleted(Instruction *I) const
Checks if the instruction is marked for deletion.
void buildExternalUses(const ExtraValueToDebugLocsMap &ExternallyUsedValues={})
Builds external uses of the vectorized scalars, i.e. the list of vectorized scalars to be extracted, their lanes and their scalar users.
bool isTreeTinyAndNotFullyVectorizable(bool ForReduction=false) const
void removeInstructionsAndOperands(ArrayRef< T * > DeadVals)
Remove instructions from the parent function and clear the operands of DeadVals instructions, marking trivially dead operands for deletion.
unsigned canMapToVector(Type *T) const
Check if homogeneous aggregate is isomorphic to some VectorType.
unsigned getMinVF(unsigned Sz) const
bool isAnalyzedReductionRoot(Instruction *I) const
Checks if the instruction was already analyzed for being possible reduction root.
std::optional< OrdersType > getReorderingData(const TreeEntry &TE, bool TopToBottom)
Gets reordering data for the given tree entry.
void eraseInstruction(Instruction *I)
Removes an instruction from its block and eventually deletes it.
bool doesRootHaveInTreeUses() const
Returns whether the root node has in-tree uses.
OptimizationRemarkEmitter * getORE()
bool isAnyGathered(const SmallDenseSet< Value * > &Vals) const
Checks if the given value is gathered in one of the nodes.
SmallVector< Value *, 8 > ValueList
void buildTree(ArrayRef< Value * > Roots, const SmallDenseSet< Value * > &UserIgnoreLst)
Construct a vectorizable tree that starts at Roots, ignoring users for the purpose of scheduling and extraction in the UserIgnoreLst.
bool isTreeNotExtendable() const
Checks if the graph and all its subgraphs cannot be better vectorized.
FixedVectorType * getReductionType() const
Returns the reduction type after min-bitwidth analysis.
unsigned getMaxVecRegSize() const
bool isVectorized(Value *V) const
Check if the value is vectorized in the tree.
bool isIdentityOrder(ArrayRef< unsigned > Order) const
Does this non-empty order represent an identity order? Identity should be represented as an empty order.
bool isGathered(const Value *V) const
Checks if the given value is gathered in one of the nodes.
InstructionCost getSpillCost() const
unsigned getMinVecRegSize() const
Value * vectorizeTree()
Vectorize the tree that starts with the elements in VL.
std::optional< int > findBestRootPair(ArrayRef< std::pair< Value *, Value * > > Candidates, int Limit=LookAheadHeuristics::ScoreFail) const
Evaluate each pair in Candidates and return the index into Candidates of the pair with the highest score.
std::optional< OrdersType > findReusedOrderedScalars(const TreeEntry &TE)
Checks if the specified gather tree entry TE can be represented as a shuffled vector entry plus (possibly) a permutation with other gathers.
void clearReductionData()
Clear the list of the analyzed reduction root instructions.
void optimizeGatherSequence()
Perform LICM and CSE on the newly generated gather sequences.
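The entry points above are typically driven in the sequence sketched below. vectorizeCandidate is a hypothetical wrapper, error paths and most profitability heuristics are omitted, and SLPCostThreshold stands for the pass's cost-threshold option:

void vectorizeCandidate(BoUpSLP &R, ArrayRef<Value *> Ops) {
  R.buildTree(Ops);
  if (R.isTreeTinyAndNotFullyVectorizable())
    return;
  R.reorderTopToBottom();
  R.reorderBottomToTop();
  R.buildExternalUses();
  R.computeMinimumValueSizes();
  InstructionCost Cost = R.getTreeCost();
  if (Cost < -SLPCostThreshold) // negative cost == expected gain
    R.vectorizeTree();
}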
Function * getVectorizedFunction(const VFShape &Shape) const
This provides a very simple, boring adaptor for a begin and end iterator into a range type.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
TwoOps_match< ValueOpTy, PointerOpTy, Instruction::Store > m_Store(const ValueOpTy &ValueOp, const PointerOpTy &PointerOp)
Matches StoreInst.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FAdd > m_FAdd(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
OneUse_match< T > m_OneUse(const T &SubPattern)
auto m_LogicalOr()
Matches L || R where L and R are arbitrary values.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
MaxMin_match< ICmpInst, LHS, RHS, umax_pred_ty > m_UMax(const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
apint_match m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
auto m_Undef()
Match an arbitrary undef constant.
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
MaxMin_match< ICmpInst, LHS, RHS, umin_pred_ty > m_UMin(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
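The m_* combinators above compose into declarative matchers; a minimal sketch (isShlByConstant is an illustrative name):

#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

bool isShlByConstant(Value *V, Value *&Base) {
  // Matches (shl Base, C) for any ConstantInt C, binding Base on success.
  return match(V, m_Shl(m_Value(Base), m_ConstantInt()));
}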
initializer< Ty > init(const Ty &Val)
DiagnosticInfoOptimizationBase::Argument NV
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< int > getPointersDiff(Type *ElemTyA, Value *PtrA, Type *ElemTyB, Value *PtrB, const DataLayout &DL, ScalarEvolution &SE, bool StrictCheck=false, bool CheckType=true)
Returns the distance between the pointers PtrA and PtrB iff they are compatible and it is possible to calculate the distance between them.
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
Value * createSimpleReduction(IRBuilderBase &B, Value *Src, RecurKind RdxKind)
Create a reduction of the given vector.
static bool doesNotNeedToBeScheduled(Value *V)
Checks if the specified value does not require scheduling.
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
hash_code hash_value(const FixedPointSemantics &Val)
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
static void reorderScalars(SmallVectorImpl< Value * > &Scalars, ArrayRef< int > Mask)
Reorders the list of scalars in accordance with the given Mask.
detail::scope_exit< std::decay_t< Callable > > make_scope_exit(Callable &&F)
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B Identical to set_intersection, except that it works on set<>...
bool verifyFunction(const Function &F, raw_ostream *OS=nullptr)
Check a function for errors, useful for use when debugging a pass.
static bool isUsedOutsideBlock(Value *V)
Checks if the provided value does not require scheduling.
std::pair< Intrinsic::ID, bool > canConvertToMinOrMaxIntrinsic(ArrayRef< Value * > VL)
Check if the values in VL are select instructions that can be converted to a min or max (vector) intr...
bool set_is_subset(const S1Ty &S1, const S2Ty &S2)
set_is_subset(A, B) - Return true iff A in B
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts, or llvm.threadlocal.address intrinsics from the specified value, returning the original object being addressed.
void interleaveComma(const Container &c, StreamT &os, UnaryFunctor each_fn)
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
iterator_range< po_iterator< T > > post_order(const T &G)
MaybeAlign getAlign(const Function &F, unsigned Index)
Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P)
Provide wrappers to std::copy_if which take ranges instead of having to pass begin/end explicitly.
bool isGather(IntrinsicInst *IntInst)
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
void erase(Container &C, ValueType V)
Wrapper function to remove a value from a container:
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
constexpr bool has_single_bit(T Value) noexcept
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
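Worked values for the arithmetic helpers listed above (mathDemo is an illustrative name):

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
using namespace llvm;

void mathDemo() {
  unsigned A = alignDown(37, 8); // 32
  uint64_t B = PowerOf2Ceil(33); // 64
  unsigned C = bit_ceil(33u);    // 64
  bool P = has_single_bit(64u);  // true
  unsigned D = Log2_32(64);      // 6
  (void)A; (void)B; (void)C; (void)P; (void)D;
}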
llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
auto reverse(ContainerTy &&C)
static void inversePermutation(ArrayRef< unsigned > Indices, SmallVectorImpl< int > &Mask)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
void sort(IteratorTy Start, IteratorTy End)
llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
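Concrete outputs for the two mask builders above (maskHelpersDemo is an illustrative name):

#include "llvm/Analysis/VectorUtils.h"
using namespace llvm;

void maskHelpersDemo() {
  // Even lanes of an interleaved group: {0, 2, 4, 6}.
  SmallVector<int, 16> Stride =
      createStrideMask(/*Start=*/0, /*Stride=*/2, /*VF=*/4);
  // Each of 4 lanes repeated twice: {0, 0, 1, 1, 2, 2, 3, 3}.
  SmallVector<int, 16> Rep =
      createReplicatedMask(/*ReplicationFactor=*/2, /*VF=*/4);
  (void)Stride; (void)Rep;
}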
auto find_if_not(R &&Range, UnaryPredicate P)
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
static bool hasFullVectorsOrPowerOf2(const TargetTransformInfo &TTI, Type *Ty, unsigned Sz)
Returns true if the widened type of Ty elements with size Sz represents a full vector type, i.e. adding an extra element results in extra parts upon type legalization.
bool isPointerTy(const Type *T)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
bool wouldInstructionBeTriviallyDead(const Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction would have no side effects if it was not used.
bool isModOrRefSet(const ModRefInfo MRI)
bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true)
Return true if the instruction does not have any effects besides calculating the result and does not have undefined behavior.
bool sortPtrAccesses(ArrayRef< Value * > VL, Type *ElemTy, const DataLayout &DL, ScalarEvolution &SE, SmallVectorImpl< unsigned > &SortedIndices)
Attempt to sort the pointers in VL and return the sorted indices in SortedIndices, if reordering is required.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) that will be converted into a vector (I).
constexpr int PoisonMaskElem
@ Ref
The access may reference the value stored in memory.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
CmpInst::Predicate getMinMaxReductionPredicate(RecurKind RK)
Returns the comparison predicate used when expanding a min/max reduction.
RecurKind
These are the kinds of recurrences that we support.
@ Or
Bitwise or logical OR of integers.
bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
static bool areAllOperandsNonInsts(Value *V)
Checks if the provided value does not require scheduling.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range.
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly.
void ViewGraph(const GraphType &G, const Twine &Name, bool ShortNames=false, const Twine &Title="", GraphProgram::Name Program=GraphProgram::DOT)
ViewGraph - Emit a dot graph, run 'dot', run gv on the postscript file, then cleanup.
OutputIt copy(R &&Range, OutputIt Out)
static bool doesNotNeedToSchedule(ArrayRef< Value * > VL)
Checks if the specified array of instructions does not require scheduling.
constexpr unsigned BitWidth
bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one of its successors.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Return the number of times the sign bit of the register is replicated into the other bits.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
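seq pairs naturally with the range-based algorithm wrappers listed above; a small sketch (allSmall and seqDemo are illustrative names):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
using namespace llvm;

bool allSmall(ArrayRef<int> Vals) {
  // Range wrapper: no explicit begin()/end() pair needed.
  return all_of(Vals, [](int V) { return V < 16; });
}

void seqDemo() {
  for (unsigned I : seq<unsigned>(0, 4)) {
    (void)I; // Visits 0, 1, 2, 3; the end bound is excluded.
  }
}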
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
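hash_combine is order-sensitive and accepts mixed hashable types (hashPair is an illustrative name):

#include "llvm/ADT/Hashing.h"
using namespace llvm;

hash_code hashPair(unsigned Opcode, const void *Ty) {
  // hash_combine(a, b) != hash_combine(b, a) in general.
  return hash_combine(Opcode, Ty);
}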
bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Constant * ConstantFoldIntegerCast(Constant *C, Type *DestTy, bool IsSigned, const DataLayout &DL)
Constant fold a zext, sext or trunc, depending on IsSigned and whether the DestTy is wider or narrower than C's type.
bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
bool mayHaveNonDefUseDependency(const Instruction &I)
Returns true if the result or effects of the given instruction I depend on values not reachable through the def-use chains.
bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdIdx.
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Used to keep track of an operand bundle.
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
BoUpSLP::TreeEntry TreeEntry
std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R)
static std::string getNodeAttributes(const TreeEntry *Entry, const BoUpSLP *)
DOTGraphTraits(bool IsSimple=false)
DOTGraphTraits - Template class that can be specialized to customize how graphs are converted to 'dot' graphs.
DefaultDOTGraphTraits - This class provides the default implementations of all of the DOTGraphTraits functions.
Used in the streaming interface as the general argument type.
ChildIteratorType(SmallVector< BoUpSLP::EdgeInfo, 1 >::iterator W, ContainerTy &VT)
ContainerTy & VectorizableTree
static ChildIteratorType child_end(NodeRef N)
static NodeRef getEntryNode(BoUpSLP &R)
static ChildIteratorType child_begin(NodeRef N)
static nodes_iterator nodes_begin(BoUpSLP *R)
TreeEntry * NodeRef
NodeRef has to be a pointer per the GraphWriter.
static unsigned size(BoUpSLP *R)
BoUpSLP::TreeEntry TreeEntry
static nodes_iterator nodes_end(BoUpSLP *R)
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AAResults *AA_, LoopInfo *LI_, DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, OptimizationRemarkEmitter *ORE_)
A MapVector that performs no allocations if smaller than a certain size.
Contains the information about the kind of vectorization available.
static VFShape get(const FunctionType *FTy, ElementCount EC, bool HasGlobalPred)
Retrieve the basic vectorization shape of the function, where all parameters are mapped to VFParamKind::Vector with EC lanes.
Function object to check whether the first component of a container supported by std::get (like std::pair or std::tuple) compares less than the first component of another container.
Function object to check whether the second component of a container supported by std::get (like std::pair or std::tuple) compares less than the second component of another container.
This structure holds any data we need about the edges being traversed during buildTree_rec().
unsigned EdgeIdx
The operand index of the use.
EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
LLVM_DUMP_METHOD void dump() const
TreeEntry * UserTE
The user TreeEntry.
friend raw_ostream & operator<<(raw_ostream &OS, const BoUpSLP::EdgeInfo &EI)
void dump(raw_ostream &OS) const
Debug print.