1//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8///
9/// \file
10/// This file implements a set of utility VPlan to VPlan transformations.
11///
12//===----------------------------------------------------------------------===//
13
14#include "VPlanTransforms.h"
15#include "VPRecipeBuilder.h"
16#include "VPlan.h"
17#include "VPlanAnalysis.h"
18#include "VPlanCFG.h"
19#include "VPlanDominatorTree.h"
20#include "VPlanHelpers.h"
21#include "VPlanPatternMatch.h"
22#include "VPlanUtils.h"
23#include "VPlanVerifier.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/STLExtras.h"
28#include "llvm/ADT/SetVector.h"
30#include "llvm/ADT/TypeSwitch.h"
38#include "llvm/IR/Intrinsics.h"
39#include "llvm/IR/MDBuilder.h"
40#include "llvm/IR/Metadata.h"
44
45using namespace llvm;
46using namespace VPlanPatternMatch;
47using namespace SCEVPatternMatch;
48
49bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
50 VPlan &Plan,
51 function_ref<const InductionDescriptor *(PHINode *)>
52 GetIntOrFpInductionDescriptor,
53 const TargetLibraryInfo &TLI) {
54
55 ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
56 Plan.getVectorLoopRegion());
57 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
58 // Skip blocks outside region
59 if (!VPBB->getParent())
60 break;
61 VPRecipeBase *Term = VPBB->getTerminator();
62 auto EndIter = Term ? Term->getIterator() : VPBB->end();
63 // Introduce each ingredient into VPlan.
64 for (VPRecipeBase &Ingredient :
65 make_early_inc_range(make_range(VPBB->begin(), EndIter))) {
66
67 VPValue *VPV = Ingredient.getVPSingleValue();
68 if (!VPV->getUnderlyingValue())
69 continue;
70
71 Instruction *Inst = cast<Instruction>(VPV->getUnderlyingValue());
72
73 VPRecipeBase *NewRecipe = nullptr;
74 if (auto *PhiR = dyn_cast<VPPhi>(&Ingredient)) {
75 auto *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
76 const auto *II = GetIntOrFpInductionDescriptor(Phi);
77 if (!II) {
78 NewRecipe = new VPWidenPHIRecipe(Phi, nullptr, PhiR->getDebugLoc());
79 for (VPValue *Op : PhiR->operands())
80 NewRecipe->addOperand(Op);
81 } else {
82 VPValue *Start = Plan.getOrAddLiveIn(II->getStartValue());
83 VPValue *Step =
85 // It is always safe to copy over the NoWrap and FastMath flags. In
86 // particular, when folding tail by masking, the masked-off lanes are
87 // never used, so it is safe.
89 NewRecipe = new VPWidenIntOrFpInductionRecipe(
90 Phi, Start, Step, &Plan.getVF(), *II, Flags,
91 Ingredient.getDebugLoc());
92 }
93 } else {
94 auto *VPI = cast<VPInstruction>(&Ingredient);
95 assert(!isa<PHINode>(Inst) && "phis should be handled above");
96 // Create VPWidenMemoryRecipe for loads and stores.
97 if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
98 NewRecipe = new VPWidenLoadRecipe(
99 *Load, Ingredient.getOperand(0), nullptr /*Mask*/,
100 false /*Consecutive*/, false /*Reverse*/, *VPI,
101 Ingredient.getDebugLoc());
102 } else if (StoreInst *Store = dyn_cast<StoreInst>(Inst)) {
103 NewRecipe = new VPWidenStoreRecipe(
104 *Store, Ingredient.getOperand(1), Ingredient.getOperand(0),
105 nullptr /*Mask*/, false /*Consecutive*/, false /*Reverse*/, *VPI,
106 Ingredient.getDebugLoc());
107 } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
108 NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands(), *VPI,
109 Ingredient.getDebugLoc());
110 } else if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
111 Intrinsic::ID VectorID = getVectorIntrinsicIDForCall(CI, &TLI);
112 if (VectorID == Intrinsic::not_intrinsic)
113 return false;
114 NewRecipe = new VPWidenIntrinsicRecipe(
115 *CI, getVectorIntrinsicIDForCall(CI, &TLI),
116 drop_end(Ingredient.operands()), CI->getType(), VPIRFlags(*CI),
117 *VPI, CI->getDebugLoc());
118 } else if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
119 NewRecipe = new VPWidenSelectRecipe(SI, Ingredient.operands(), *VPI,
120 *VPI, Ingredient.getDebugLoc());
121 } else if (auto *CI = dyn_cast<CastInst>(Inst)) {
122 NewRecipe = new VPWidenCastRecipe(
123 CI->getOpcode(), Ingredient.getOperand(0), CI->getType(), CI,
124 VPIRFlags(*CI), VPIRMetadata(*CI));
125 } else {
126 NewRecipe = new VPWidenRecipe(*Inst, Ingredient.operands(), *VPI,
127 *VPI, Ingredient.getDebugLoc());
128 }
129 }
130
131 NewRecipe->insertBefore(&Ingredient);
132 if (NewRecipe->getNumDefinedValues() == 1)
133 VPV->replaceAllUsesWith(NewRecipe->getVPSingleValue());
134 else
135 assert(NewRecipe->getNumDefinedValues() == 0 &&
136 "Only recpies with zero or one defined values expected");
137 Ingredient.eraseFromParent();
138 }
139 }
140 return true;
141}
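// Editor's illustration (not part of the upstream source): after the loop
// above, a plain VPInstruction wrapping an IR load, roughly
//   EMIT ir<%l> = load ir<%addr>
// has been replaced by a widening memory recipe along the lines of
//   WIDEN ir<%l> = load ir<%addr>
// with mask, consecutive and reverse information initially conservative
// (nullptr/false) and refined by later transforms. The textual VPlan syntax
// here is only an approximation.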
142
143/// Helper for extra no-alias checks via a set of known-safe recipes and SCEV, used when sinking stores.
144class SinkStoreInfo {
145 const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes;
146 VPReplicateRecipe &GroupLeader;
147 PredicatedScalarEvolution &PSE;
148 const Loop &L;
149 VPTypeAnalysis &TypeInfo;
150
151 // Return true if \p A and \p B are known to not alias for all VFs in the
152 // plan, checked via the distance between the accesses
153 bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const {
154 if (A->getOpcode() != Instruction::Store ||
155 B->getOpcode() != Instruction::Store)
156 return false;
157
158 VPValue *AddrA = A->getOperand(1);
159 const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, PSE, &L);
160 VPValue *AddrB = B->getOperand(1);
161 const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, PSE, &L);
162 if (isa<SCEVCouldNotCompute>(SCEVA) || isa<SCEVCouldNotCompute>(SCEVB))
163 return false;
164
165 const APInt *Distance;
166 ScalarEvolution &SE = *PSE.getSE();
167 if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance)))
168 return false;
169
170 const DataLayout &DL = SE.getDataLayout();
171 Type *TyA = TypeInfo.inferScalarType(A->getOperand(0));
172 uint64_t SizeA = DL.getTypeStoreSize(TyA);
173 Type *TyB = TypeInfo.inferScalarType(B->getOperand(0));
174 uint64_t SizeB = DL.getTypeStoreSize(TyB);
175
176 // Use the maximum store size to ensure no overlap from either direction.
177 // Currently only handles fixed sizes, as it is only used for
178 // replicating VPReplicateRecipes.
179 uint64_t MaxStoreSize = std::max(SizeA, SizeB);
180
181 auto VFs = B->getParent()->getPlan()->vectorFactors();
183 return Distance->abs().uge(
184 MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue());
185 }
186
187public:
188 SinkStoreInfo(const SmallPtrSetImpl<VPRecipeBase *> &ExcludeRecipes,
189 VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE,
190 const Loop &L, VPTypeAnalysis &TypeInfo)
191 : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), PSE(PSE),
192 L(L), TypeInfo(TypeInfo) {}
193
194 /// Return true if \p R should be skipped during alias checking, either
195 /// because it's in the exclude set or because no-alias can be proven via
196 /// SCEV.
197 bool shouldSkip(VPRecipeBase &R) const {
198 auto *Store = dyn_cast<VPReplicateRecipe>(&R);
199 return ExcludeRecipes.contains(&R) ||
200 (Store && isNoAliasViaDistance(Store, &GroupLeader));
201 }
202};
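// Editor's worked example for the distance check above (illustration only):
// for two scalar i32 stores whose addresses differ by a constant Distance and
// a plan with fixed VFs {2, 4}, MaxVF is 4 and MaxStoreSize is 4 bytes, so the
// stores are only treated as no-alias if |Distance| >= 4 * 4 = 16 bytes.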
203
204/// Check if a memory operation doesn't alias with memory operations in blocks
205/// between \p FirstBB and \p LastBB using scoped noalias metadata. If
206/// \p SinkInfo is std::nullopt, only recipes that may write to memory are
207/// checked (for load hoisting). Otherwise recipes that both read and write
208/// memory are checked, and SCEV is used to prove no-alias between the group
209/// leader and other replicate recipes (for store sinking).
210static bool
212 VPBasicBlock *FirstBB, VPBasicBlock *LastBB,
213 std::optional<SinkStoreInfo> SinkInfo = {}) {
214 bool CheckReads = SinkInfo.has_value();
215 if (!MemLoc.AATags.Scope)
216 return false;
217
218 const AAMDNodes &MemAA = MemLoc.AATags;
219
220 for (VPBlockBase *Block = FirstBB; Block;
221 Block = Block->getSingleSuccessor()) {
222 assert(Block->getNumSuccessors() <= 1 &&
223 "Expected at most one successor in block chain");
224 auto *VPBB = cast<VPBasicBlock>(Block);
225 for (VPRecipeBase &R : *VPBB) {
226 if (SinkInfo && SinkInfo->shouldSkip(R))
227 continue;
228
229 // Skip recipes that don't need checking.
230 if (!R.mayWriteToMemory() && !(CheckReads && R.mayReadFromMemory()))
231 continue;
232
234 if (!Loc)
235 // Conservatively assume aliasing for memory operations without
236 // location.
237 return false;
238
239 // For reads, check if they don't alias in the reverse direction and
240 // skip if so.
241 if (CheckReads && R.mayReadFromMemory() &&
243 MemAA.NoAlias))
244 continue;
245
246 // Check if the memory operations may alias in the forward direction.
248 Loc->AATags.NoAlias))
249 return false;
250 }
251
252 if (Block == LastBB)
253 break;
254 }
255 return true;
256}
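// Editor's note (illustration): the walk above relies on scoped noalias
// metadata, e.g. a load carrying !alias.scope while the intervening stores
// carry matching !noalias scopes; if the queried location has no scope
// metadata, or a recipe in the chain has no memory location, the function
// conservatively returns false.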
257
258/// Return true if we do not know how to (mechanically) hoist or sink \p R out
259/// of a loop region.
260static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R) {
261 // Assume intrinsics don't alias anything or throw; as long as they're
262 // guaranteed to execute, they're safe to hoist.
264 return false;
265
266 // TODO: Relax checks in the future, e.g. we could also hoist reads, if their
267 // memory location is not modified in the vector loop.
268 if (R.mayHaveSideEffects() || R.mayReadFromMemory() || R.isPhi())
269 return true;
270
271 // Allocas cannot be hoisted.
272 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
273 return RepR && RepR->getOpcode() == Instruction::Alloca;
274}
275
276static bool sinkScalarOperands(VPlan &Plan) {
277 auto Iter = vp_depth_first_deep(Plan.getEntry());
278 bool ScalarVFOnly = Plan.hasScalarVFOnly();
279 bool Changed = false;
280
281 SetVector<std::pair<VPBasicBlock *, VPSingleDefRecipe *>> WorkList;
282 auto InsertIfValidSinkCandidate = [ScalarVFOnly, &WorkList](
283 VPBasicBlock *SinkTo, VPValue *Op) {
284 auto *Candidate =
285 dyn_cast_or_null<VPSingleDefRecipe>(Op->getDefiningRecipe());
286 if (!Candidate)
287 return;
288
289 // We only know how to sink VPReplicateRecipes and VPScalarIVStepsRecipes
290 // for now.
291 if (!isa<VPReplicateRecipe, VPScalarIVStepsRecipe>(Candidate))
292 return;
293
294 if (Candidate->getParent() == SinkTo || cannotHoistOrSinkRecipe(*Candidate))
295 return;
296
297 if (auto *RepR = dyn_cast<VPReplicateRecipe>(Candidate))
298 if (!ScalarVFOnly && RepR->isSingleScalar())
299 return;
300
301 WorkList.insert({SinkTo, Candidate});
302 };
303
304 // First, collect the operands of all recipes in replicate blocks as seeds for
305 // sinking.
306 for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly<VPRegionBlock>(Iter)) {
307 VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock();
308 if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2)
309 continue;
310 VPBasicBlock *VPBB = cast<VPBasicBlock>(EntryVPBB->getSuccessors().front());
311 if (VPBB->getSingleSuccessor() != VPR->getExitingBasicBlock())
312 continue;
313 for (auto &Recipe : *VPBB)
314 for (VPValue *Op : Recipe.operands())
315 InsertIfValidSinkCandidate(VPBB, Op);
316 }
317
318 // Try to sink each replicate or scalar IV steps recipe in the worklist.
319 for (unsigned I = 0; I != WorkList.size(); ++I) {
320 VPBasicBlock *SinkTo;
321 VPSingleDefRecipe *SinkCandidate;
322 std::tie(SinkTo, SinkCandidate) = WorkList[I];
323
324 // All recipe users of SinkCandidate must be in the same block SinkTo or all
325 // users outside of SinkTo must only use the first lane of SinkCandidate. In
326 // the latter case, we need to duplicate SinkCandidate.
327 auto UsersOutsideSinkTo =
328 make_filter_range(SinkCandidate->users(), [SinkTo](VPUser *U) {
329 return cast<VPRecipeBase>(U)->getParent() != SinkTo;
330 });
331 if (any_of(UsersOutsideSinkTo, [SinkCandidate](VPUser *U) {
332 return !U->usesFirstLaneOnly(SinkCandidate);
333 }))
334 continue;
335 bool NeedsDuplicating = !UsersOutsideSinkTo.empty();
336
337 if (NeedsDuplicating) {
338 if (ScalarVFOnly)
339 continue;
340 VPSingleDefRecipe *Clone;
341 if (auto *SinkCandidateRepR =
342 dyn_cast<VPReplicateRecipe>(SinkCandidate)) {
343 // TODO: Handle converting to uniform recipes as separate transform,
344 // then cloning should be sufficient here.
345 Instruction *I = SinkCandidate->getUnderlyingInstr();
346 Clone = new VPReplicateRecipe(I, SinkCandidate->operands(), true,
347 nullptr /*Mask*/, *SinkCandidateRepR,
348 *SinkCandidateRepR);
349 // TODO: add ".cloned" suffix to name of Clone's VPValue.
350 } else {
351 Clone = SinkCandidate->clone();
352 }
353
354 Clone->insertBefore(SinkCandidate);
355 SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) {
356 return cast<VPRecipeBase>(&U)->getParent() != SinkTo;
357 });
358 }
359 SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
360 for (VPValue *Op : SinkCandidate->operands())
361 InsertIfValidSinkCandidate(SinkTo, Op);
362 Changed = true;
363 }
364 return Changed;
365}
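// Editor's sketch of sinkScalarOperands (approximate VPlan syntax): a scalar
// address computation defined in the loop body but only used inside a
// predicated replicate block, e.g.
//   vector.body:   CLONE ir<%gep> = getelementptr ...
//   pred.store.if: REPLICATE store ir<%v>, ir<%gep>
// is moved (or first cloned, if it has users outside the target block) into
// pred.store.if, so it only executes for the masked-on lanes.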
366
367/// If \p R is a region with a VPBranchOnMaskRecipe in the entry block, return
368/// the mask.
370 auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
371 if (!EntryBB || EntryBB->size() != 1 ||
372 !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
373 return nullptr;
374
375 return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
376}
377
378/// If \p R is a triangle region, return the 'then' block of the triangle.
380 auto *EntryBB = cast<VPBasicBlock>(R->getEntry());
381 if (EntryBB->getNumSuccessors() != 2)
382 return nullptr;
383
384 auto *Succ0 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[0]);
385 auto *Succ1 = dyn_cast<VPBasicBlock>(EntryBB->getSuccessors()[1]);
386 if (!Succ0 || !Succ1)
387 return nullptr;
388
389 if (Succ0->getNumSuccessors() + Succ1->getNumSuccessors() != 1)
390 return nullptr;
391 if (Succ0->getSingleSuccessor() == Succ1)
392 return Succ0;
393 if (Succ1->getSingleSuccessor() == Succ0)
394 return Succ1;
395 return nullptr;
396}
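// Editor's illustration of the triangle shape matched above:
//
//     entry          (terminates with BranchOnMask %mask)
//     /    \
//   then    |
//     \    /
//     merge
//
// getPredicatedMask returns %mask and getPredicatedThenBlock returns 'then'.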
397
398// Merge replicate regions into their successor regions, if a replicate region
399// is connected to a successor replicate region with the same predicate by a
400// single, empty VPBasicBlock.
402 SmallPtrSet<VPRegionBlock *, 4> TransformedRegions;
403
404 // Collect replicate regions followed by an empty block, followed by another
405 // replicate region with matching masks, to be processed afterwards. This
406 // avoids iterator invalidation issues while merging regions.
409 vp_depth_first_deep(Plan.getEntry()))) {
410 if (!Region1->isReplicator())
411 continue;
412 auto *MiddleBasicBlock =
413 dyn_cast_or_null<VPBasicBlock>(Region1->getSingleSuccessor());
414 if (!MiddleBasicBlock || !MiddleBasicBlock->empty())
415 continue;
416
417 auto *Region2 =
418 dyn_cast_or_null<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
419 if (!Region2 || !Region2->isReplicator())
420 continue;
421
422 VPValue *Mask1 = getPredicatedMask(Region1);
423 VPValue *Mask2 = getPredicatedMask(Region2);
424 if (!Mask1 || Mask1 != Mask2)
425 continue;
426
427 assert(Mask1 && Mask2 && "both regions must have conditions");
428 WorkList.push_back(Region1);
429 }
430
431 // Move recipes from Region1 to its successor region, if both are triangles.
432 for (VPRegionBlock *Region1 : WorkList) {
433 if (TransformedRegions.contains(Region1))
434 continue;
435 auto *MiddleBasicBlock = cast<VPBasicBlock>(Region1->getSingleSuccessor());
436 auto *Region2 = cast<VPRegionBlock>(MiddleBasicBlock->getSingleSuccessor());
437
438 VPBasicBlock *Then1 = getPredicatedThenBlock(Region1);
439 VPBasicBlock *Then2 = getPredicatedThenBlock(Region2);
440 if (!Then1 || !Then2)
441 continue;
442
443 // Note: No fusion-preventing memory dependencies are expected in either
444 // region. Such dependencies should be rejected during earlier dependence
445 // checks, which guarantee accesses can be re-ordered for vectorization.
446 //
447 // Move recipes to the successor region.
448 for (VPRecipeBase &ToMove : make_early_inc_range(reverse(*Then1)))
449 ToMove.moveBefore(*Then2, Then2->getFirstNonPhi());
450
451 auto *Merge1 = cast<VPBasicBlock>(Then1->getSingleSuccessor());
452 auto *Merge2 = cast<VPBasicBlock>(Then2->getSingleSuccessor());
453
454 // Move VPPredInstPHIRecipes from the merge block to the successor region's
455 // merge block. Update all users inside the successor region to use the
456 // original values.
457 for (VPRecipeBase &Phi1ToMove : make_early_inc_range(reverse(*Merge1))) {
458 VPValue *PredInst1 =
459 cast<VPPredInstPHIRecipe>(&Phi1ToMove)->getOperand(0);
460 VPValue *Phi1ToMoveV = Phi1ToMove.getVPSingleValue();
461 Phi1ToMoveV->replaceUsesWithIf(PredInst1, [Then2](VPUser &U, unsigned) {
462 return cast<VPRecipeBase>(&U)->getParent() == Then2;
463 });
464
465 // Remove phi recipes that are unused after merging the regions.
466 if (Phi1ToMove.getVPSingleValue()->getNumUsers() == 0) {
467 Phi1ToMove.eraseFromParent();
468 continue;
469 }
470 Phi1ToMove.moveBefore(*Merge2, Merge2->begin());
471 }
472
473 // Remove the dead recipes in Region1's entry block.
474 for (VPRecipeBase &R :
475 make_early_inc_range(reverse(*Region1->getEntryBasicBlock())))
476 R.eraseFromParent();
477
478 // Finally, remove the first region.
479 for (VPBlockBase *Pred : make_early_inc_range(Region1->getPredecessors())) {
480 VPBlockUtils::disconnectBlocks(Pred, Region1);
481 VPBlockUtils::connectBlocks(Pred, MiddleBasicBlock);
482 }
483 VPBlockUtils::disconnectBlocks(Region1, MiddleBasicBlock);
484 TransformedRegions.insert(Region1);
485 }
486
487 return !TransformedRegions.empty();
488}
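// Editor's sketch of the merge above (approximate structure): two replicate
// regions guarded by the same mask and separated only by an empty block,
//   region1 { entry1 -> then1 -> merge1 } -> empty -> region2 { entry2 -> then2 -> merge2 }
// are combined by moving then1's recipes to the top of then2, moving merge1's
// VPPredInstPHIRecipes into merge2, and disconnecting region1 entirely.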
489
490static VPRegionBlock *createReplicateRegion(VPReplicateRecipe *PredRecipe,
491 VPlan &Plan) {
492 Instruction *Instr = PredRecipe->getUnderlyingInstr();
493 // Build the triangular if-then region.
494 std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
495 assert(Instr->getParent() && "Predicated instruction not in any basic block");
496 auto *BlockInMask = PredRecipe->getMask();
497 auto *MaskDef = BlockInMask->getDefiningRecipe();
498 auto *BOMRecipe = new VPBranchOnMaskRecipe(
499 BlockInMask, MaskDef ? MaskDef->getDebugLoc() : DebugLoc::getUnknown());
500 auto *Entry =
501 Plan.createVPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
502
503 // Replace predicated replicate recipe with a replicate recipe without a
504 // mask but in the replicate region.
505 auto *RecipeWithoutMask = new VPReplicateRecipe(
506 PredRecipe->getUnderlyingInstr(), drop_end(PredRecipe->operands()),
507 PredRecipe->isSingleScalar(), nullptr /*Mask*/, *PredRecipe, *PredRecipe,
508 PredRecipe->getDebugLoc());
509 auto *Pred =
510 Plan.createVPBasicBlock(Twine(RegionName) + ".if", RecipeWithoutMask);
511
512 VPPredInstPHIRecipe *PHIRecipe = nullptr;
513 if (PredRecipe->getNumUsers() != 0) {
514 PHIRecipe = new VPPredInstPHIRecipe(RecipeWithoutMask,
515 RecipeWithoutMask->getDebugLoc());
516 PredRecipe->replaceAllUsesWith(PHIRecipe);
517 PHIRecipe->setOperand(0, RecipeWithoutMask);
518 }
519 PredRecipe->eraseFromParent();
520 auto *Exiting =
521 Plan.createVPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
522 VPRegionBlock *Region =
523 Plan.createReplicateRegion(Entry, Exiting, RegionName);
524
525 // Note: first set Entry as region entry and then connect successors starting
526 // from it in order, to propagate the "parent" of each VPBasicBlock.
527 VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
528 VPBlockUtils::connectBlocks(Pred, Exiting);
529
530 return Region;
531}
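// Editor's illustration of the region built above for a masked replicate
// recipe %r with mask %m (approximate VPlan syntax):
//   pred.<opcode>.entry:    BranchOnMask %m
//   pred.<opcode>.if:       %r replicated without its mask
//   pred.<opcode>.continue: VPPredInstPHI of %r (only if %r has users)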
532
533static void addReplicateRegions(VPlan &Plan) {
536 vp_depth_first_deep(Plan.getEntry()))) {
537 for (VPRecipeBase &R : *VPBB)
538 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
539 if (RepR->isPredicated())
540 WorkList.push_back(RepR);
541 }
542 }
543
544 unsigned BBNum = 0;
545 for (VPReplicateRecipe *RepR : WorkList) {
546 VPBasicBlock *CurrentBlock = RepR->getParent();
547 VPBasicBlock *SplitBlock = CurrentBlock->splitAt(RepR->getIterator());
548
549 BasicBlock *OrigBB = RepR->getUnderlyingInstr()->getParent();
550 SplitBlock->setName(
551 OrigBB->hasName() ? OrigBB->getName() + "." + Twine(BBNum++) : "");
552 // Record predicated instructions for above packing optimizations.
553 VPRegionBlock *Region = createReplicateRegion(RepR, Plan);
554 Region->setParent(CurrentBlock->getParent());
556
557 VPRegionBlock *ParentRegion = Region->getParent();
558 if (ParentRegion && ParentRegion->getExiting() == CurrentBlock)
559 ParentRegion->setExiting(SplitBlock);
560 }
561}
562
563/// Remove redundant VPBasicBlocks by merging them into their predecessor if
564/// the predecessor has a single successor.
568 vp_depth_first_deep(Plan.getEntry()))) {
569 // Don't fold the blocks in the skeleton of the Plan into their single
570 // predecessors for now.
571 // TODO: Remove restriction once more of the skeleton is modeled in VPlan.
572 if (!VPBB->getParent())
573 continue;
574 auto *PredVPBB =
575 dyn_cast_or_null<VPBasicBlock>(VPBB->getSinglePredecessor());
576 if (!PredVPBB || PredVPBB->getNumSuccessors() != 1 ||
577 isa<VPIRBasicBlock>(PredVPBB))
578 continue;
579 WorkList.push_back(VPBB);
580 }
581
582 for (VPBasicBlock *VPBB : WorkList) {
583 VPBasicBlock *PredVPBB = cast<VPBasicBlock>(VPBB->getSinglePredecessor());
584 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
585 R.moveBefore(*PredVPBB, PredVPBB->end());
586 VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
587 auto *ParentRegion = VPBB->getParent();
588 if (ParentRegion && ParentRegion->getExiting() == VPBB)
589 ParentRegion->setExiting(PredVPBB);
590 for (auto *Succ : to_vector(VPBB->successors())) {
592 VPBlockUtils::connectBlocks(PredVPBB, Succ);
593 }
594 // VPBB is now dead and will be cleaned up when the plan gets destroyed.
595 }
596 return !WorkList.empty();
597}
598
599void VPlanTransforms::createAndOptimizeReplicateRegions(VPlan &Plan) {
600 // Convert masked VPReplicateRecipes to if-then region blocks.
601 addReplicateRegions(Plan);
602
603 bool ShouldSimplify = true;
604 while (ShouldSimplify) {
605 ShouldSimplify = sinkScalarOperands(Plan);
606 ShouldSimplify |= mergeReplicateRegionsIntoSuccessors(Plan);
607 ShouldSimplify |= mergeBlocksIntoPredecessors(Plan);
608 }
609}
610
611/// Remove redundant casts of inductions.
612///
613/// Such redundant casts are casts of induction variables that can be ignored,
614/// because we already proved that the casted phi is equal to the uncasted phi
615/// in the vectorized loop. There is no need to vectorize the cast - the same
616/// value can be used for both the phi and casts in the vector loop.
618 for (auto &Phi : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
619 auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
620 if (!IV || IV->getTruncInst())
621 continue;
622
623 // A sequence of IR Casts has potentially been recorded for IV, which
624 // *must be bypassed* when the IV is vectorized, because the vectorized IV
625 // will produce the desired casted value. This sequence forms a def-use
626 // chain and is provided in reverse order, ending with the cast that uses
627 // the IV phi. Search for the recipe of the last cast in the chain and
628 // replace it with the original IV. Note that only the final cast is
629 // expected to have users outside the cast-chain and the dead casts left
630 // over will be cleaned up later.
631 ArrayRef<Instruction *> Casts = IV->getInductionDescriptor().getCastInsts();
632 VPValue *FindMyCast = IV;
633 for (Instruction *IRCast : reverse(Casts)) {
634 VPSingleDefRecipe *FoundUserCast = nullptr;
635 for (auto *U : FindMyCast->users()) {
636 auto *UserCast = dyn_cast<VPSingleDefRecipe>(U);
637 if (UserCast && UserCast->getUnderlyingValue() == IRCast) {
638 FoundUserCast = UserCast;
639 break;
640 }
641 }
642 FindMyCast = FoundUserCast;
643 }
644 FindMyCast->replaceAllUsesWith(IV);
645 }
646}
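// Editor's example of the cast chain handled above: for an IV %iv with
// recorded casts
//   %c1 = sext i32 %iv to i64
//   %c2 = trunc i64 %c1 to i32
// the recipe of the final cast in the chain is located by walking the def-use
// chain and all of its uses are rewired to the widened IV itself; the dead
// cast recipes are removed by later dead-recipe elimination.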
647
648/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV
649/// recipe, if it exists.
651 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
652 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
653 VPWidenCanonicalIVRecipe *WidenNewIV = nullptr;
654 for (VPUser *U : CanonicalIV->users()) {
655 WidenNewIV = dyn_cast<VPWidenCanonicalIVRecipe>(U);
656 if (WidenNewIV)
657 break;
658 }
659
660 if (!WidenNewIV)
661 return;
662
663 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
664 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
665 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
666
667 if (!WidenOriginalIV || !WidenOriginalIV->isCanonical())
668 continue;
669
670 // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides
671 // everything WidenNewIV's users need. That is, WidenOriginalIV will
672 // generate a vector phi or all users of WidenNewIV demand the first lane
673 // only.
674 if (!vputils::onlyScalarValuesUsed(WidenOriginalIV) ||
675 vputils::onlyFirstLaneUsed(WidenNewIV)) {
676 // We are replacing a wide canonical IV with a suitable wide induction.
677 // This is used to compute the header mask, hence all lanes will be used and
678 // we need to drop wrap flags only applying to lanes guaranteed to execute
679 // in the original scalar loop.
680 WidenOriginalIV->dropPoisonGeneratingFlags();
681 WidenNewIV->replaceAllUsesWith(WidenOriginalIV);
682 WidenNewIV->eraseFromParent();
683 return;
684 }
685 }
686}
687
688/// Returns true if \p R is dead and can be removed.
689static bool isDeadRecipe(VPRecipeBase &R) {
690 // Do remove conditional assume instructions as their conditions may be
691 // flattened.
692 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
693 bool IsConditionalAssume = RepR && RepR->isPredicated() &&
695 if (IsConditionalAssume)
696 return true;
697
698 if (R.mayHaveSideEffects())
699 return false;
700
701 // Recipe is dead if no user keeps the recipe alive.
702 return all_of(R.definedValues(),
703 [](VPValue *V) { return V->getNumUsers() == 0; });
704}
705
708 vp_post_order_deep(Plan.getEntry()))) {
709 // The recipes in the block are processed in reverse order, to catch chains
710 // of dead recipes.
711 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
712 if (isDeadRecipe(R)) {
713 R.eraseFromParent();
714 continue;
715 }
716
717 // Check if R is a dead VPPhi <-> update cycle and remove it.
718 auto *PhiR = dyn_cast<VPPhi>(&R);
719 if (!PhiR || PhiR->getNumOperands() != 2)
720 continue;
721 VPUser *PhiUser = PhiR->getSingleUser();
722 if (!PhiUser)
723 continue;
724 VPValue *Incoming = PhiR->getOperand(1);
725 if (PhiUser != Incoming->getDefiningRecipe() ||
726 Incoming->getNumUsers() != 1)
727 continue;
728 PhiR->replaceAllUsesWith(PhiR->getOperand(0));
729 PhiR->eraseFromParent();
730 Incoming->getDefiningRecipe()->eraseFromParent();
731 }
732 }
733}
734
737 Instruction::BinaryOps InductionOpcode,
738 FPMathOperator *FPBinOp, Instruction *TruncI,
739 VPValue *StartV, VPValue *Step, DebugLoc DL,
740 VPBuilder &Builder) {
741 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
742 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
743 VPCanonicalIVPHIRecipe *CanonicalIV = LoopRegion->getCanonicalIV();
744 VPSingleDefRecipe *BaseIV = Builder.createDerivedIV(
745 Kind, FPBinOp, StartV, CanonicalIV, Step, "offset.idx");
746
747 // Truncate base induction if needed.
748 VPTypeAnalysis TypeInfo(Plan);
749 Type *ResultTy = TypeInfo.inferScalarType(BaseIV);
750 if (TruncI) {
751 Type *TruncTy = TruncI->getType();
752 assert(ResultTy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits() &&
753 "Not truncating.");
754 assert(ResultTy->isIntegerTy() && "Truncation requires an integer type");
755 BaseIV = Builder.createScalarCast(Instruction::Trunc, BaseIV, TruncTy, DL);
756 ResultTy = TruncTy;
757 }
758
759 // Truncate step if needed.
760 Type *StepTy = TypeInfo.inferScalarType(Step);
761 if (ResultTy != StepTy) {
762 assert(StepTy->getScalarSizeInBits() > ResultTy->getScalarSizeInBits() &&
763 "Not truncating.");
764 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
765 auto *VecPreheader =
767 VPBuilder::InsertPointGuard Guard(Builder);
768 Builder.setInsertPoint(VecPreheader);
769 Step = Builder.createScalarCast(Instruction::Trunc, Step, ResultTy, DL);
770 }
771 return Builder.createScalarIVSteps(InductionOpcode, FPBinOp, BaseIV, Step,
772 &Plan.getVF(), DL);
773}
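// Editor's sketch of the recipes built above (approximate VPlan syntax), for
// an induction with start %start and step %step:
//   DERIVED-IV vp<%offset.idx> = %start + canonical-IV * %step
//   SCALAR-STEPS vp<%steps>    = vp<%offset.idx> + {0, 1, ...} * %step
// with scalar truncations of the base IV and/or step inserted when the
// requested result type is narrower.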
774
777 for (unsigned I = 0; I != Users.size(); ++I) {
779 if (isa<VPHeaderPHIRecipe>(Cur))
780 continue;
781 for (VPValue *V : Cur->definedValues())
782 Users.insert_range(V->users());
783 }
784 return Users.takeVector();
785}
786
787/// Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd
788/// (IndStart, ScalarIVSteps (0, Step)). This is used when the recipe only
789/// generates scalar values.
790static VPValue *
792 VPlan &Plan, VPBuilder &Builder) {
794 VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
795 VPValue *StepV = PtrIV->getOperand(1);
797 Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
798 nullptr, StartV, StepV, PtrIV->getDebugLoc(), Builder);
799
800 return Builder.createPtrAdd(PtrIV->getStartValue(), Steps,
801 PtrIV->getDebugLoc(), "next.gep");
802}
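// Editor's example of the rewrite above: a pointer induction with start %p and
// step %s whose vector values are unused becomes, roughly,
//   vp<%steps>    = scalar IV steps starting at 0 with step %s
//   vp<%next.gep> = ptradd %p, vp<%steps>
// so only scalar pointers are generated.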
803
804/// Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd
805/// (IndStart, ScalarIVSteps (0, Step)) if only its scalar values are used, as
806/// VPWidenPointerInductionRecipe will generate vectors only. If some users
807/// require vectors while others require scalars, the scalar uses need to extract
808/// the scalars from the generated vectors (Note that this is different to how
809/// int/fp inductions are handled). Legalize extract-from-ends using uniform
810/// VPReplicateRecipe of wide inductions to use regular VPReplicateRecipe, so
811/// the correct end value is available. Also optimize
812/// VPWidenIntOrFpInductionRecipe, if any of its users needs scalar values, by
813/// providing them scalar steps built on the canonical scalar IV and update the
814/// original IV's users. This is an optional optimization to reduce the need
815/// for vector extracts.
818 bool HasOnlyVectorVFs = !Plan.hasScalarVFOnly();
819 VPBuilder Builder(HeaderVPBB, HeaderVPBB->getFirstNonPhi());
820 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
821 auto *PhiR = dyn_cast<VPWidenInductionRecipe>(&Phi);
822 if (!PhiR)
823 continue;
824
825 // Try to narrow wide and replicating recipes to uniform recipes, based on
826 // VPlan analysis.
827 // TODO: Apply to all recipes in the future, to replace legacy uniformity
828 // analysis.
829 auto Users = collectUsersRecursively(PhiR);
830 for (VPUser *U : reverse(Users)) {
831 auto *Def = dyn_cast<VPRecipeWithIRFlags>(U);
832 auto *RepR = dyn_cast<VPReplicateRecipe>(U);
833 // Skip recipes that shouldn't be narrowed.
834 if (!Def || !isa<VPReplicateRecipe, VPWidenRecipe>(Def) ||
835 Def->getNumUsers() == 0 || !Def->getUnderlyingValue() ||
836 (RepR && (RepR->isSingleScalar() || RepR->isPredicated())))
837 continue;
838
839 // Skip recipes that may use lanes other than their first.
841 continue;
842
843 auto *Clone = new VPReplicateRecipe(Def->getUnderlyingInstr(),
844 Def->operands(), /*IsUniform*/ true,
845 /*Mask*/ nullptr, /*Flags*/ *Def);
846 Clone->insertAfter(Def);
847 Def->replaceAllUsesWith(Clone);
848 }
849
850 // Replace wide pointer inductions which have only their scalars used, by
851 // PtrAdd(IndStart, ScalarIVSteps (0, Step)).
852 if (auto *PtrIV = dyn_cast<VPWidenPointerInductionRecipe>(&Phi)) {
853 if (!Plan.hasScalarVFOnly() &&
854 !PtrIV->onlyScalarsGenerated(Plan.hasScalableVF()))
855 continue;
856
857 VPValue *PtrAdd = scalarizeVPWidenPointerInduction(PtrIV, Plan, Builder);
858 PtrIV->replaceAllUsesWith(PtrAdd);
859 continue;
860 }
861
862 // Replace widened induction with scalar steps for users that only use
863 // scalars.
864 auto *WideIV = cast<VPWidenIntOrFpInductionRecipe>(&Phi);
865 if (HasOnlyVectorVFs && none_of(WideIV->users(), [WideIV](VPUser *U) {
866 return U->usesScalars(WideIV);
867 }))
868 continue;
869
870 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
872 Plan, ID.getKind(), ID.getInductionOpcode(),
873 dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
874 WideIV->getTruncInst(), WideIV->getStartValue(), WideIV->getStepValue(),
875 WideIV->getDebugLoc(), Builder);
876
877 // Update scalar users of IV to use Step instead.
878 if (!HasOnlyVectorVFs) {
879 assert(!Plan.hasScalableVF() &&
880 "plans containing a scalar VF cannot also include scalable VFs");
881 WideIV->replaceAllUsesWith(Steps);
882 } else {
883 bool HasScalableVF = Plan.hasScalableVF();
884 WideIV->replaceUsesWithIf(Steps,
885 [WideIV, HasScalableVF](VPUser &U, unsigned) {
886 if (HasScalableVF)
887 return U.usesFirstLaneOnly(WideIV);
888 return U.usesScalars(WideIV);
889 });
890 }
891 }
892}
893
894/// Check if \p VPV is an untruncated wide induction, either before or after the
895/// increment. If so return the header IV (before the increment), otherwise
896/// return null.
899 auto *WideIV = dyn_cast<VPWidenInductionRecipe>(VPV);
900 if (WideIV) {
901 // VPV itself is a wide induction, separately compute the end value for exit
902 // users if it is not a truncated IV.
903 auto *IntOrFpIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
904 return (IntOrFpIV && IntOrFpIV->getTruncInst()) ? nullptr : WideIV;
905 }
906
907 // Check if VPV is an optimizable induction increment.
908 VPRecipeBase *Def = VPV->getDefiningRecipe();
909 if (!Def || Def->getNumOperands() != 2)
910 return nullptr;
911 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(0));
912 if (!WideIV)
913 WideIV = dyn_cast<VPWidenInductionRecipe>(Def->getOperand(1));
914 if (!WideIV)
915 return nullptr;
916
917 auto IsWideIVInc = [&]() {
918 auto &ID = WideIV->getInductionDescriptor();
919
920 // Check if VPV increments the induction by the induction step.
921 VPValue *IVStep = WideIV->getStepValue();
922 switch (ID.getInductionOpcode()) {
923 case Instruction::Add:
924 return match(VPV, m_c_Add(m_Specific(WideIV), m_Specific(IVStep)));
925 case Instruction::FAdd:
927 m_Specific(IVStep)));
928 case Instruction::FSub:
929 return match(VPV, m_Binary<Instruction::FSub>(m_Specific(WideIV),
930 m_Specific(IVStep)));
931 case Instruction::Sub: {
932 // IVStep will be the negated step of the subtraction. Check if Step == -1
933 // * IVStep.
934 VPValue *Step;
935 if (!match(VPV, m_Sub(m_VPValue(), m_VPValue(Step))))
936 return false;
937 const SCEV *IVStepSCEV = vputils::getSCEVExprForVPValue(IVStep, PSE);
938 const SCEV *StepSCEV = vputils::getSCEVExprForVPValue(Step, PSE);
939 ScalarEvolution &SE = *PSE.getSE();
940 return !isa<SCEVCouldNotCompute>(IVStepSCEV) &&
941 !isa<SCEVCouldNotCompute>(StepSCEV) &&
942 IVStepSCEV == SE.getNegativeSCEV(StepSCEV);
943 }
944 default:
945 return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
946 match(VPV, m_GetElementPtr(m_Specific(WideIV),
947 m_Specific(WideIV->getStepValue())));
948 }
949 llvm_unreachable("should have been covered by switch above");
950 };
951 return IsWideIVInc() ? WideIV : nullptr;
952}
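// Editor's examples for the matcher above, given a wide induction recipe %iv
// with step %step:
//   %iv.next = add %iv, %step      -> returns the recipe for %iv
//   %iv.next = sub %iv, %neg.step  -> returns it if SCEV proves %neg.step == -%step
//   a truncated wide IV            -> nullptr (truncated IVs are not optimized here)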
953
954/// Attempts to optimize the induction variable exit values for users in the
955/// early exit block.
957 VPTypeAnalysis &TypeInfo,
958 VPBlockBase *PredVPBB,
959 VPValue *Op,
961 VPValue *Incoming, *Mask;
964 return nullptr;
965
966 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
967 if (!WideIV)
968 return nullptr;
969
970 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
971 if (WideIntOrFp && WideIntOrFp->getTruncInst())
972 return nullptr;
973
974 // Calculate the final index.
975 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
976 auto *CanonicalIV = LoopRegion->getCanonicalIV();
977 Type *CanonicalIVType = LoopRegion->getCanonicalIVType();
978 VPBuilder B(cast<VPBasicBlock>(PredVPBB));
979
980 DebugLoc DL = cast<VPInstruction>(Op)->getDebugLoc();
981 VPValue *FirstActiveLane =
982 B.createNaryOp(VPInstruction::FirstActiveLane, Mask, DL);
983 Type *FirstActiveLaneType = TypeInfo.inferScalarType(FirstActiveLane);
984 FirstActiveLane = B.createScalarZExtOrTrunc(FirstActiveLane, CanonicalIVType,
985 FirstActiveLaneType, DL);
986 VPValue *EndValue =
987 B.createNaryOp(Instruction::Add, {CanonicalIV, FirstActiveLane}, DL);
988
989 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
990 // changed it means the exit is using the incremented value, so we need to
991 // add the step.
992 if (Incoming != WideIV) {
993 VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
994 EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
995 }
996
997 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
998 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
999 VPValue *Start = WideIV->getStartValue();
1000 VPValue *Step = WideIV->getStepValue();
1001 EndValue = B.createDerivedIV(
1002 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
1003 Start, EndValue, Step);
1004 }
1005
1006 return EndValue;
1007}
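// Editor's worked example for the early-exit computation above: if the
// canonical IV is 8 at the start of the vector iteration and the first active
// lane of the exit mask is 3, the exit value of a canonical IV is 8 + 3 = 11,
// plus one more step if the user consumed the post-increment value;
// non-canonical IVs are additionally mapped through a derived IV
// (start + index * step).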
1008
1009/// Attempts to optimize the induction variable exit values for users in the
1010/// exit block coming from the latch in the original scalar loop.
1012 VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op,
1016 return nullptr;
1017
1018 auto *WideIV = getOptimizableIVOf(Incoming, PSE);
1019 if (!WideIV)
1020 return nullptr;
1021
1022 VPValue *EndValue = EndValues.lookup(WideIV);
1023 assert(EndValue && "end value must have been pre-computed");
1024
1025 // `getOptimizableIVOf()` always returns the pre-incremented IV, so if it
1026 // changed it means the exit is using the incremented value, so we don't
1027 // need to subtract the step.
1028 if (Incoming != WideIV)
1029 return EndValue;
1030
1031 // Otherwise, subtract the step from the EndValue.
1032 VPBuilder B(cast<VPBasicBlock>(PredVPBB)->getTerminator());
1033 VPValue *Step = WideIV->getStepValue();
1034 Type *ScalarTy = TypeInfo.inferScalarType(WideIV);
1035 if (ScalarTy->isIntegerTy())
1036 return B.createNaryOp(Instruction::Sub, {EndValue, Step},
1037 DebugLoc::getUnknown(), "ind.escape");
1038 if (ScalarTy->isPointerTy()) {
1039 Type *StepTy = TypeInfo.inferScalarType(Step);
1040 auto *Zero = Plan.getConstantInt(StepTy, 0);
1041 return B.createPtrAdd(EndValue,
1042 B.createNaryOp(Instruction::Sub, {Zero, Step}),
1043 DebugLoc::getUnknown(), "ind.escape");
1044 }
1045 if (ScalarTy->isFloatingPointTy()) {
1046 const auto &ID = WideIV->getInductionDescriptor();
1047 return B.createNaryOp(
1048 ID.getInductionBinOp()->getOpcode() == Instruction::FAdd
1049 ? Instruction::FSub
1050 : Instruction::FAdd,
1051 {EndValue, Step}, {ID.getInductionBinOp()->getFastMathFlags()});
1052 }
1053 llvm_unreachable("all possible induction types must be handled");
1054 return nullptr;
1055}
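// Editor's summary of the latch-exit case above: when the exit user consumes
// the pre-incremented IV, the escape value is EndValue - Step (a ptradd with
// the negated step for pointer IVs, or an FSub/FAdd carrying the induction's
// fast-math flags for FP IVs); when it consumes the incremented IV, the
// pre-computed EndValue is used directly.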
1056
1058 VPlan &Plan, DenseMap<VPValue *, VPValue *> &EndValues,
1060 VPBlockBase *MiddleVPBB = Plan.getMiddleBlock();
1061 VPTypeAnalysis TypeInfo(Plan);
1062 for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
1063 for (VPRecipeBase &R : ExitVPBB->phis()) {
1064 auto *ExitIRI = cast<VPIRPhi>(&R);
1065
1066 for (auto [Idx, PredVPBB] : enumerate(ExitVPBB->getPredecessors())) {
1067 VPValue *Escape = nullptr;
1068 if (PredVPBB == MiddleVPBB)
1069 Escape = optimizeLatchExitInductionUser(Plan, TypeInfo, PredVPBB,
1070 ExitIRI->getOperand(Idx),
1071 EndValues, PSE);
1072 else
1074 Plan, TypeInfo, PredVPBB, ExitIRI->getOperand(Idx), PSE);
1075 if (Escape)
1076 ExitIRI->setOperand(Idx, Escape);
1077 }
1078 }
1079 }
1080}
1081
1082/// Remove redundant ExpandSCEVRecipes in \p Plan's entry block by replacing
1083/// them with already existing recipes expanding the same SCEV expression.
1086
1087 for (VPRecipeBase &R :
1089 auto *ExpR = dyn_cast<VPExpandSCEVRecipe>(&R);
1090 if (!ExpR)
1091 continue;
1092
1093 const auto &[V, Inserted] = SCEV2VPV.try_emplace(ExpR->getSCEV(), ExpR);
1094 if (Inserted)
1095 continue;
1096 ExpR->replaceAllUsesWith(V->second);
1097 ExpR->eraseFromParent();
1098 }
1099}
1100
1102 SmallVector<VPValue *> WorkList;
1104 WorkList.push_back(V);
1105
1106 while (!WorkList.empty()) {
1107 VPValue *Cur = WorkList.pop_back_val();
1108 if (!Seen.insert(Cur).second)
1109 continue;
1110 VPRecipeBase *R = Cur->getDefiningRecipe();
1111 if (!R)
1112 continue;
1113 if (!isDeadRecipe(*R))
1114 continue;
1115 append_range(WorkList, R->operands());
1116 R->eraseFromParent();
1117 }
1118}
1119
1120/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1121/// Returns an optional pair, where the first element indicates whether it is
1122/// an intrinsic ID.
1123static std::optional<std::pair<bool, unsigned>>
1125 return TypeSwitch<const VPSingleDefRecipe *,
1126 std::optional<std::pair<bool, unsigned>>>(R)
1129 [](auto *I) { return std::make_pair(false, I->getOpcode()); })
1130 .Case<VPWidenIntrinsicRecipe>([](auto *I) {
1131 return std::make_pair(true, I->getVectorIntrinsicID());
1132 })
1133 .Case<VPVectorPointerRecipe, VPPredInstPHIRecipe>([](auto *I) {
1134 // For recipes that do not directly map to LLVM IR instructions,
1135 // assign opcodes after the last VPInstruction opcode (which is also
1136 // after the last IR Instruction opcode), based on the VPDefID.
1137 return std::make_pair(false,
1138 VPInstruction::OpsEnd + 1 + I->getVPDefID());
1139 })
1140 .Default([](auto *) { return std::nullopt; });
1141}
1142
1143/// Try to fold \p R using InstSimplifyFolder. Will succeed and return a
1144/// non-nullptr VPValue for a handled opcode or intrinsic ID if corresponding \p
1145/// Operands are foldable live-ins.
1147 ArrayRef<VPValue *> Operands,
1148 const DataLayout &DL,
1149 VPTypeAnalysis &TypeInfo) {
1150 auto OpcodeOrIID = getOpcodeOrIntrinsicID(&R);
1151 if (!OpcodeOrIID)
1152 return nullptr;
1153
1155 for (VPValue *Op : Operands) {
1156 if (!Op->isLiveIn() || !Op->getLiveInIRValue())
1157 return nullptr;
1158 Ops.push_back(Op->getLiveInIRValue());
1159 }
1160
1161 auto FoldToIRValue = [&]() -> Value * {
1162 InstSimplifyFolder Folder(DL);
1163 if (OpcodeOrIID->first) {
1164 if (R.getNumOperands() != 2)
1165 return nullptr;
1166 unsigned ID = OpcodeOrIID->second;
1167 return Folder.FoldBinaryIntrinsic(ID, Ops[0], Ops[1],
1168 TypeInfo.inferScalarType(&R));
1169 }
1170 unsigned Opcode = OpcodeOrIID->second;
1171 if (Instruction::isBinaryOp(Opcode))
1172 return Folder.FoldBinOp(static_cast<Instruction::BinaryOps>(Opcode),
1173 Ops[0], Ops[1]);
1174 if (Instruction::isCast(Opcode))
1175 return Folder.FoldCast(static_cast<Instruction::CastOps>(Opcode), Ops[0],
1176 TypeInfo.inferScalarType(R.getVPSingleValue()));
1177 switch (Opcode) {
1179 return Folder.FoldSelect(Ops[0], Ops[1],
1181 case VPInstruction::Not:
1182 return Folder.FoldBinOp(Instruction::BinaryOps::Xor, Ops[0],
1184 case Instruction::Select:
1185 return Folder.FoldSelect(Ops[0], Ops[1], Ops[2]);
1186 case Instruction::ICmp:
1187 case Instruction::FCmp:
1188 return Folder.FoldCmp(cast<VPRecipeWithIRFlags>(R).getPredicate(), Ops[0],
1189 Ops[1]);
1190 case Instruction::GetElementPtr: {
1191 auto &RFlags = cast<VPRecipeWithIRFlags>(R);
1192 auto *GEP = cast<GetElementPtrInst>(RFlags.getUnderlyingInstr());
1193 return Folder.FoldGEP(GEP->getSourceElementType(), Ops[0],
1194 drop_begin(Ops), RFlags.getGEPNoWrapFlags());
1195 }
1198 return Folder.FoldGEP(IntegerType::getInt8Ty(TypeInfo.getContext()),
1199 Ops[0], Ops[1],
1200 cast<VPRecipeWithIRFlags>(R).getGEPNoWrapFlags());
1201 // An extract of a live-in is an extract of a broadcast, so return the
1202 // broadcasted element.
1203 case Instruction::ExtractElement:
1204 assert(!Ops[0]->getType()->isVectorTy() && "Live-ins should be scalar");
1205 return Ops[0];
1206 }
1207 return nullptr;
1208 };
1209
1210 if (Value *V = FoldToIRValue())
1211 return R.getParent()->getPlan()->getOrAddLiveIn(V);
1212 return nullptr;
1213}
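// Editor's illustration of the folding above: a recipe computing, say,
//   add i64 ir<16>, ir<4>
// with both operands live-in constants is folded through InstSimplifyFolder
// to the constant 20, which is added as a live-in VPValue; the caller then
// replaces all users of the recipe with that live-in.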
1214
1215/// Try to simplify VPSingleDefRecipe \p Def.
1217 VPlan *Plan = Def->getParent()->getPlan();
1218
1219 // Simplification of live-in IR values for SingleDef recipes using
1220 // InstSimplifyFolder.
1221 const DataLayout &DL =
1223 if (VPValue *V = tryToFoldLiveIns(*Def, Def->operands(), DL, TypeInfo))
1224 return Def->replaceAllUsesWith(V);
1225
1226 // Fold PredPHI LiveIn -> LiveIn.
1227 if (auto *PredPHI = dyn_cast<VPPredInstPHIRecipe>(Def)) {
1228 VPValue *Op = PredPHI->getOperand(0);
1229 if (Op->isLiveIn())
1230 PredPHI->replaceAllUsesWith(Op);
1231 }
1232
1233 VPBuilder Builder(Def);
1234 VPValue *A;
1235 if (match(Def, m_Trunc(m_ZExtOrSExt(m_VPValue(A))))) {
1236 Type *TruncTy = TypeInfo.inferScalarType(Def);
1237 Type *ATy = TypeInfo.inferScalarType(A);
1238 if (TruncTy == ATy) {
1239 Def->replaceAllUsesWith(A);
1240 } else {
1241 // Don't replace a scalarizing recipe with a widened cast.
1242 if (isa<VPReplicateRecipe>(Def))
1243 return;
1244 if (ATy->getScalarSizeInBits() < TruncTy->getScalarSizeInBits()) {
1245
1246 unsigned ExtOpcode = match(Def->getOperand(0), m_SExt(m_VPValue()))
1247 ? Instruction::SExt
1248 : Instruction::ZExt;
1249 auto *Ext = Builder.createWidenCast(Instruction::CastOps(ExtOpcode), A,
1250 TruncTy);
1251 if (auto *UnderlyingExt = Def->getOperand(0)->getUnderlyingValue()) {
1252 // UnderlyingExt has distinct return type, used to retain legacy cost.
1253 Ext->setUnderlyingValue(UnderlyingExt);
1254 }
1255 Def->replaceAllUsesWith(Ext);
1256 } else if (ATy->getScalarSizeInBits() > TruncTy->getScalarSizeInBits()) {
1257 auto *Trunc = Builder.createWidenCast(Instruction::Trunc, A, TruncTy);
1258 Def->replaceAllUsesWith(Trunc);
1259 }
1260 }
1261#ifndef NDEBUG
1262 // Verify that the cached type info for both A and its users is still
1263 // accurate by comparing it to freshly computed types.
1264 VPTypeAnalysis TypeInfo2(*Plan);
1265 assert(TypeInfo.inferScalarType(A) == TypeInfo2.inferScalarType(A));
1266 for (VPUser *U : A->users()) {
1267 auto *R = cast<VPRecipeBase>(U);
1268 for (VPValue *VPV : R->definedValues())
1269 assert(TypeInfo.inferScalarType(VPV) == TypeInfo2.inferScalarType(VPV));
1270 }
1271#endif
1272 }
1273
1274 // Simplify (X && Y) || (X && !Y) -> X.
1275 // TODO: Split up into simpler, modular combines: (X && Y) || (X && Z) into X
1276 // && (Y || Z) and (X || !X) into true. This requires queuing newly created
1277 // recipes to be visited during simplification.
1278 VPValue *X, *Y, *Z;
1279 if (match(Def,
1282 Def->replaceAllUsesWith(X);
1283 Def->eraseFromParent();
1284 return;
1285 }
1286
1287 // x | 1 -> 1
1288 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_AllOnes())))
1289 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1290
1291 // x | 0 -> x
1292 if (match(Def, m_c_BinaryOr(m_VPValue(X), m_ZeroInt())))
1293 return Def->replaceAllUsesWith(X);
1294
1295 // x & 0 -> 0
1296 if (match(Def, m_c_BinaryAnd(m_VPValue(X), m_ZeroInt())))
1297 return Def->replaceAllUsesWith(Def->getOperand(Def->getOperand(0) == X));
1298
1299 // x && false -> false
1300 if (match(Def, m_LogicalAnd(m_VPValue(X), m_False())))
1301 return Def->replaceAllUsesWith(Def->getOperand(1));
1302
1303 // (x && y) || (x && z) -> x && (y || z)
1306 // Simplify only if one of the operands has one use to avoid creating an
1307 // extra recipe.
1308 (!Def->getOperand(0)->hasMoreThanOneUniqueUser() ||
1309 !Def->getOperand(1)->hasMoreThanOneUniqueUser()))
1310 return Def->replaceAllUsesWith(
1311 Builder.createLogicalAnd(X, Builder.createOr(Y, Z)));
1312
1313 // x && !x -> 0
1315 return Def->replaceAllUsesWith(Plan->getFalse());
1316
1317 if (match(Def, m_Select(m_VPValue(), m_VPValue(X), m_Deferred(X))))
1318 return Def->replaceAllUsesWith(X);
1319
1320 // select !c, x, y -> select c, y, x
1321 VPValue *C;
1322 if (match(Def, m_Select(m_Not(m_VPValue(C)), m_VPValue(X), m_VPValue(Y)))) {
1323 Def->setOperand(0, C);
1324 Def->setOperand(1, Y);
1325 Def->setOperand(2, X);
1326 return;
1327 }
1328
1329 // Reassociate (x && y) && z -> x && (y && z) if x has multiple users. With
1330 // tail folding it is likely that x is a header mask and can be simplified
1331 // further.
1333 m_VPValue(Z))) &&
1334 X->hasMoreThanOneUniqueUser())
1335 return Def->replaceAllUsesWith(
1336 Builder.createLogicalAnd(X, Builder.createLogicalAnd(Y, Z)));
1337
1338 if (match(Def, m_c_Add(m_VPValue(A), m_ZeroInt())))
1339 return Def->replaceAllUsesWith(A);
1340
1341 if (match(Def, m_c_Mul(m_VPValue(A), m_One())))
1342 return Def->replaceAllUsesWith(A);
1343
1344 if (match(Def, m_c_Mul(m_VPValue(A), m_ZeroInt())))
1345 return Def->replaceAllUsesWith(
1346 Def->getOperand(0) == A ? Def->getOperand(1) : Def->getOperand(0));
1347
1348 if (match(Def, m_Not(m_VPValue(A)))) {
1349 if (match(A, m_Not(m_VPValue(A))))
1350 return Def->replaceAllUsesWith(A);
1351
1352 // Try to fold Not into compares by adjusting the predicate in-place.
1353 CmpPredicate Pred;
1354 if (match(A, m_Cmp(Pred, m_VPValue(), m_VPValue()))) {
1355 auto *Cmp = cast<VPRecipeWithIRFlags>(A);
1356 if (all_of(Cmp->users(),
1358 m_Not(m_Specific(Cmp)),
1359 m_Select(m_Specific(Cmp), m_VPValue(), m_VPValue()))))) {
1360 Cmp->setPredicate(CmpInst::getInversePredicate(Pred));
1361 for (VPUser *U : to_vector(Cmp->users())) {
1362 auto *R = cast<VPSingleDefRecipe>(U);
1363 if (match(R, m_Select(m_Specific(Cmp), m_VPValue(X), m_VPValue(Y)))) {
1364 // select (cmp pred), x, y -> select (cmp inv_pred), y, x
1365 R->setOperand(1, Y);
1366 R->setOperand(2, X);
1367 } else {
1368 // not (cmp pred) -> cmp inv_pred
1369 assert(match(R, m_Not(m_Specific(Cmp))) && "Unexpected user");
1370 R->replaceAllUsesWith(Cmp);
1371 }
1372 }
1373 // If Cmp doesn't have a debug location, use the one from the negation,
1374 // to preserve the location.
1375 if (!Cmp->getDebugLoc() && Def->getDebugLoc())
1376 Cmp->setDebugLoc(Def->getDebugLoc());
1377 }
1378 }
1379 }
1380
1381 // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
1382 // any-of (fcmp uno %A, %B), ...
1383 if (match(Def, m_AnyOf())) {
1385 VPRecipeBase *UnpairedCmp = nullptr;
1386 for (VPValue *Op : Def->operands()) {
1387 VPValue *X;
1388 if (Op->getNumUsers() > 1 ||
1390 m_Deferred(X)))) {
1391 NewOps.push_back(Op);
1392 } else if (!UnpairedCmp) {
1393 UnpairedCmp = Op->getDefiningRecipe();
1394 } else {
1395 NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
1396 UnpairedCmp->getOperand(0), X));
1397 UnpairedCmp = nullptr;
1398 }
1399 }
1400
1401 if (UnpairedCmp)
1402 NewOps.push_back(UnpairedCmp->getVPSingleValue());
1403
1404 if (NewOps.size() < Def->getNumOperands()) {
1405 VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
1406 return Def->replaceAllUsesWith(NewAnyOf);
1407 }
1408 }
1409
1410 // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
1411 // This is useful for fmax/fmin without fast-math flags, where we need to
1412 // check if any operand is NaN.
1414 m_Deferred(X)),
1416 m_Deferred(Y))))) {
1417 VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y);
1418 return Def->replaceAllUsesWith(NewCmp);
1419 }
1420
1421 // Remove redundant DerivedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0.
1422 if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) ||
1423 match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) &&
1424 TypeInfo.inferScalarType(Def->getOperand(1)) ==
1425 TypeInfo.inferScalarType(Def))
1426 return Def->replaceAllUsesWith(Def->getOperand(1));
1427
1429 m_One()))) {
1430 Type *WideStepTy = TypeInfo.inferScalarType(Def);
1431 if (TypeInfo.inferScalarType(X) != WideStepTy)
1432 X = Builder.createWidenCast(Instruction::Trunc, X, WideStepTy);
1433 Def->replaceAllUsesWith(X);
1434 return;
1435 }
1436
1437 // For i1 vp.merges produced by AnyOf reductions:
1438 // vp.merge true, (or x, y), x, evl -> vp.merge y, true, x, evl
1440 m_VPValue(X), m_VPValue())) &&
1442 TypeInfo.inferScalarType(Def)->isIntegerTy(1)) {
1443 Def->setOperand(1, Def->getOperand(0));
1444 Def->setOperand(0, Y);
1445 return;
1446 }
1447
1448 if (auto *Phi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Def)) {
1449 if (Phi->getOperand(0) == Phi->getOperand(1))
1450 Phi->replaceAllUsesWith(Phi->getOperand(0));
1451 return;
1452 }
1453
1454 // Look through ExtractLastLane.
1455 if (match(Def, m_ExtractLastLane(m_VPValue(A)))) {
1456 if (match(A, m_BuildVector())) {
1457 auto *BuildVector = cast<VPInstruction>(A);
1458 Def->replaceAllUsesWith(
1459 BuildVector->getOperand(BuildVector->getNumOperands() - 1));
1460 return;
1461 }
1462 if (Plan->hasScalarVFOnly())
1463 return Def->replaceAllUsesWith(A);
1464 }
1465
1466 // Look through ExtractPenultimateElement (BuildVector ....).
1468 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1469 Def->replaceAllUsesWith(
1470 BuildVector->getOperand(BuildVector->getNumOperands() - 2));
1471 return;
1472 }
1473
1474 uint64_t Idx;
1476 auto *BuildVector = cast<VPInstruction>(Def->getOperand(0));
1477 Def->replaceAllUsesWith(BuildVector->getOperand(Idx));
1478 return;
1479 }
1480
1481 if (match(Def, m_BuildVector()) && all_equal(Def->operands())) {
1482 Def->replaceAllUsesWith(
1483 Builder.createNaryOp(VPInstruction::Broadcast, Def->getOperand(0)));
1484 return;
1485 }
1486
1487 // Look through broadcast of single-scalar when used as select conditions; in
1488 // that case the scalar condition can be used directly.
1489 if (match(Def,
1492 "broadcast operand must be single-scalar");
1493 Def->setOperand(0, C);
1494 return;
1495 }
1496
1497 if (auto *Phi = dyn_cast<VPPhi>(Def)) {
1498 if (Phi->getNumOperands() == 1)
1499 Phi->replaceAllUsesWith(Phi->getOperand(0));
1500 return;
1501 }
1502
1503 // Some simplifications can only be applied after unrolling. Perform them
1504 // below.
1505 if (!Plan->isUnrolled())
1506 return;
1507
1508 // Hoist an invariant increment Y of a phi X, by having X start at Y.
1509 if (match(Def, m_c_Add(m_VPValue(X), m_VPValue(Y))) && Y->isLiveIn() &&
1510 isa<VPPhi>(X)) {
1511 auto *Phi = cast<VPPhi>(X);
1512 if (Phi->getOperand(1) != Def && match(Phi->getOperand(0), m_ZeroInt()) &&
1513 Phi->getSingleUser() == Def) {
1514 Phi->setOperand(0, Y);
1515 Def->replaceAllUsesWith(Phi);
1516 return;
1517 }
1518 }
1519
1520 // Simplify unrolled VectorPointer without offset, or with zero offset, to
1521 // just the pointer operand.
1522 if (auto *VPR = dyn_cast<VPVectorPointerRecipe>(Def))
1523 if (!VPR->getOffset() || match(VPR->getOffset(), m_ZeroInt()))
1524 return VPR->replaceAllUsesWith(VPR->getOperand(0));
1525
1526 // VPScalarIVSteps for part 0 can be replaced by their start value, if only
1527 // the first lane is demanded.
1528 if (auto *Steps = dyn_cast<VPScalarIVStepsRecipe>(Def)) {
1529 if (Steps->isPart0() && vputils::onlyFirstLaneUsed(Steps)) {
1530 Steps->replaceAllUsesWith(Steps->getOperand(0));
1531 return;
1532 }
1533 }
1534 // Simplify redundant ReductionStartVector recipes after unrolling.
1535 VPValue *StartV;
1537 m_VPValue(StartV), m_VPValue(), m_VPValue()))) {
1538 Def->replaceUsesWithIf(StartV, [](const VPUser &U, unsigned Idx) {
1539 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&U);
1540 return PhiR && PhiR->isInLoop();
1541 });
1542 return;
1543 }
1544
1546 Def->replaceAllUsesWith(A);
1547 return;
1548 }
1549
1550 if (match(Def, m_ExtractLastLane(m_VPValue(A))) &&
1553 cast<VPReplicateRecipe>(A)->isSingleScalar())) &&
1554 all_of(A->users(),
1555 [Def, A](VPUser *U) { return U->usesScalars(A) || Def == U; })) {
1556 return Def->replaceAllUsesWith(A);
1557 }
1558
1559 if (Plan->getUF() == 1 && match(Def, m_ExtractLastPart(m_VPValue(A))))
1560 return Def->replaceAllUsesWith(A);
1561}
1562
1565 Plan.getEntry());
1566 VPTypeAnalysis TypeInfo(Plan);
1568 for (VPRecipeBase &R : make_early_inc_range(*VPBB))
1569 if (auto *Def = dyn_cast<VPSingleDefRecipe>(&R))
1570 simplifyRecipe(Def, TypeInfo);
1571 }
1572}
1573
1575 if (Plan.hasScalarVFOnly())
1576 return;
1577
1578 // Try to narrow wide and replicating recipes to single scalar recipes,
1579 // based on VPlan analysis. Only process blocks in the loop region for now,
1580 // without traversing into nested regions, as recipes in replicate regions
1581 // cannot be converted yet.
1584 for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
1587 continue;
1588 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
1589 if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
1590 continue;
1591
1592 // Convert an unmasked scatter with a uniform address into
1593 // extract-last-lane + scalar store.
1594 // TODO: Add a profitability check comparing the cost of a scatter vs.
1595 // extract + scalar store.
1596 auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
1597 if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
1598 !WidenStoreR->isConsecutive()) {
1599 assert(!WidenStoreR->isReverse() &&
1600 "Not consecutive memory recipes shouldn't be reversed");
1601 VPValue *Mask = WidenStoreR->getMask();
1602
1603 // Only convert the scatter to a scalar store if it is unmasked.
1604 // TODO: Support converting scatter masked by the header mask to scalar
1605 // store.
1606 if (Mask)
1607 continue;
1608
1610 {WidenStoreR->getOperand(1)});
1611 Extract->insertBefore(WidenStoreR);
1612
1613 // TODO: Sink the scalar store recipe to middle block if possible.
1614 auto *ScalarStore = new VPReplicateRecipe(
1615 &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
1616 true /*IsSingleScalar*/, nullptr /*Mask*/, {},
1617 *WidenStoreR /*Metadata*/);
1618 ScalarStore->insertBefore(WidenStoreR);
1619 WidenStoreR->eraseFromParent();
1620 continue;
1621 }
1622
1623 auto *RepOrWidenR = dyn_cast<VPRecipeWithIRFlags>(&R);
1624 if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
1625 vputils::isSingleScalar(RepR->getOperand(1))) {
1626 auto *Clone = new VPReplicateRecipe(
1627 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1628 true /*IsSingleScalar*/, nullptr /*Mask*/, *RepR /*Flags*/,
1629 *RepR /*Metadata*/, RepR->getDebugLoc());
1630 Clone->insertBefore(RepOrWidenR);
1631 VPBuilder Builder(Clone);
1632 VPValue *ExtractOp = Clone->getOperand(0);
1633 if (vputils::isUniformAcrossVFsAndUFs(RepR->getOperand(1)))
1634 ExtractOp =
1635 Builder.createNaryOp(VPInstruction::ExtractLastPart, ExtractOp);
1636 ExtractOp =
1637 Builder.createNaryOp(VPInstruction::ExtractLastLane, ExtractOp);
1638 Clone->setOperand(0, ExtractOp);
1639 RepR->eraseFromParent();
1640 continue;
1641 }
1642
1643 // Skip recipes that aren't single scalars.
1644 if (!RepOrWidenR || !vputils::isSingleScalar(RepOrWidenR))
1645 continue;
1646
1647 // Skip recipes for which conversion to single-scalar would introduce
1648 // additional broadcasts. No extra broadcasts are needed if either only
1649 // the scalars of the recipe are used, or at least one of the operands
1650 // would require a broadcast anyway. In the latter case, the single scalar
1651 // may need to be broadcast, but another broadcast is removed.
1652 if (!all_of(RepOrWidenR->users(),
1653 [RepOrWidenR](const VPUser *U) {
1654 if (auto *VPI = dyn_cast<VPInstruction>(U)) {
1655 unsigned Opcode = VPI->getOpcode();
1656 if (Opcode == VPInstruction::ExtractLastLane ||
1657 Opcode == VPInstruction::ExtractLastPart ||
1658 Opcode == VPInstruction::ExtractPenultimateElement)
1659 return true;
1660 }
1661
1662 return U->usesScalars(RepOrWidenR);
1663 }) &&
1664 none_of(RepOrWidenR->operands(), [RepOrWidenR](VPValue *Op) {
1665 if (Op->getSingleUser() != RepOrWidenR)
1666 return false;
1667 // Non-constant live-ins require broadcasts, while constants do not
1668 // need explicit broadcasts.
1669 bool LiveInNeedsBroadcast =
1670 Op->isLiveIn() && !isa<Constant>(Op->getLiveInIRValue());
1671 auto *OpR = dyn_cast<VPReplicateRecipe>(Op);
1672 return LiveInNeedsBroadcast || (OpR && OpR->isSingleScalar());
1673 }))
1674 continue;
1675
1676 auto *Clone = new VPReplicateRecipe(
1677 RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
1678 true /*IsSingleScalar*/, nullptr, *RepOrWidenR);
1679 Clone->insertBefore(RepOrWidenR);
1680 RepOrWidenR->replaceAllUsesWith(Clone);
1681 if (isDeadRecipe(*RepOrWidenR))
1682 RepOrWidenR->eraseFromParent();
1683 }
1684 }
1685}
1686
1687/// Check if all of \p Blend's masks share a common value that is logically
1688/// and'ed in, and if so, remove it from the masks.
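/// A rough sketch with illustrative names, where vp<%cm> is the common edge
/// mask:
///   BLEND ir<%a>/(vp<%cm> && vp<%m0>), ir<%b>/(vp<%cm> && vp<%m1>)
/// becomes
///   BLEND ir<%a>/vp<%m0>, ir<%b>/vp<%m1>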
1690 if (Blend->isNormalized())
1691 return;
1692 VPValue *CommonEdgeMask;
1693 if (!match(Blend->getMask(0),
1694 m_LogicalAnd(m_VPValue(CommonEdgeMask), m_VPValue())))
1695 return;
1696 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1697 if (!match(Blend->getMask(I),
1698 m_LogicalAnd(m_Specific(CommonEdgeMask), m_VPValue())))
1699 return;
1700 for (unsigned I = 0; I < Blend->getNumIncomingValues(); I++)
1701 Blend->setMask(I, Blend->getMask(I)->getDefiningRecipe()->getOperand(1));
1702}
1703
1704/// Normalize and simplify VPBlendRecipes. Should be run after simplifyRecipes
1705/// to make sure the masks are simplified.
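/// For example, a rough sketch with illustrative names: the non-normalized
/// blend
///   BLEND ir<%a>/vp<%m0>, ir<%b>/vp<%m1>
/// is normalized to
///   BLEND ir<%a>, ir<%b>/vp<%m1>
/// so ir<%a> becomes the initial value, and vp<%m0> can be dead-coded if it
/// has no other users.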
1706static void simplifyBlends(VPlan &Plan) {
1709 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
1710 auto *Blend = dyn_cast<VPBlendRecipe>(&R);
1711 if (!Blend)
1712 continue;
1713
1714 removeCommonBlendMask(Blend);
1715
1716 // Try to remove redundant blend recipes.
1717 SmallPtrSet<VPValue *, 4> UniqueValues;
1718 if (Blend->isNormalized() || !match(Blend->getMask(0), m_False()))
1719 UniqueValues.insert(Blend->getIncomingValue(0));
1720 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
1721 if (!match(Blend->getMask(I), m_False()))
1722 UniqueValues.insert(Blend->getIncomingValue(I));
1723
1724 if (UniqueValues.size() == 1) {
1725 Blend->replaceAllUsesWith(*UniqueValues.begin());
1726 Blend->eraseFromParent();
1727 continue;
1728 }
1729
1730 if (Blend->isNormalized())
1731 continue;
1732
1733 // Normalize the blend so its first incoming value is used as the initial
1734 // value with the others blended into it.
1735
1736 unsigned StartIndex = 0;
1737 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1738 // If a value's mask is used only by the blend then it can be dead-coded.
1739 // TODO: Find the most expensive mask that can be dead-coded, or a mask
1740 // that's used by multiple blends where it can be removed from them all.
1741 VPValue *Mask = Blend->getMask(I);
1742 if (Mask->getNumUsers() == 1 && !match(Mask, m_False())) {
1743 StartIndex = I;
1744 break;
1745 }
1746 }
1747
1748 SmallVector<VPValue *, 4> OperandsWithMask;
1749 OperandsWithMask.push_back(Blend->getIncomingValue(StartIndex));
1750
1751 for (unsigned I = 0; I != Blend->getNumIncomingValues(); ++I) {
1752 if (I == StartIndex)
1753 continue;
1754 OperandsWithMask.push_back(Blend->getIncomingValue(I));
1755 OperandsWithMask.push_back(Blend->getMask(I));
1756 }
1757
1758 auto *NewBlend =
1759 new VPBlendRecipe(cast_or_null<PHINode>(Blend->getUnderlyingValue()),
1760 OperandsWithMask, Blend->getDebugLoc());
1761 NewBlend->insertBefore(&R);
1762
1763 VPValue *DeadMask = Blend->getMask(StartIndex);
1764 Blend->replaceAllUsesWith(NewBlend);
1765 Blend->eraseFromParent();
1767
1768 /// Simplify BLEND %a, %b, Not(%mask) -> BLEND %b, %a, %mask.
1769 VPValue *NewMask;
1770 if (NewBlend->getNumOperands() == 3 &&
1771 match(NewBlend->getMask(1), m_Not(m_VPValue(NewMask)))) {
1772 VPValue *Inc0 = NewBlend->getOperand(0);
1773 VPValue *Inc1 = NewBlend->getOperand(1);
1774 VPValue *OldMask = NewBlend->getOperand(2);
1775 NewBlend->setOperand(0, Inc1);
1776 NewBlend->setOperand(1, Inc0);
1777 NewBlend->setOperand(2, NewMask);
1778 if (OldMask->getNumUsers() == 0)
1779 cast<VPInstruction>(OldMask)->eraseFromParent();
1780 }
1781 }
1782 }
1783}
1784
1785/// Optimize the width of vector induction variables in \p Plan based on a known
1786/// constant trip count, \p BestVF and \p BestUF.
1788 ElementCount BestVF,
1789 unsigned BestUF) {
1790 // Only proceed if we have not completely removed the vector region.
1791 if (!Plan.getVectorLoopRegion())
1792 return false;
1793
1794 const APInt *TC;
1795 if (!BestVF.isFixed() || !match(Plan.getTripCount(), m_APInt(TC)))
1796 return false;
1797
1798 // Calculate the minimum power-of-2 bit width that can fit the known TC, VF
1799 // and UF. Returns at least 8.
1800 auto ComputeBitWidth = [](APInt TC, uint64_t Align) {
1801 APInt AlignedTC =
1804 APInt MaxVal = AlignedTC - 1;
1805 return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8);
1806 };
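  // A worked example (sketch): for TC = 100, VF = 4 and UF = 2, the trip count
  // is aligned up to 104, so the largest IV value is 103, which needs 7 active
  // bits; rounded up to the next power of 2 and clamped to at least 8, the IV
  // can be narrowed to an 8-bit integer.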
1807 unsigned NewBitWidth =
1808 ComputeBitWidth(*TC, BestVF.getKnownMinValue() * BestUF);
1809
1810 LLVMContext &Ctx = Plan.getContext();
1811 auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth);
1812
1813 bool MadeChange = false;
1814
1815 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
1816 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
1817 auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
1818
1819 // Currently we only handle canonical IVs, as it is trivial to replace the
1820 // start and stop values, and we only perform the optimization when the IV
1821 // has a single unique user.
1822 if (!WideIV || !WideIV->isCanonical() ||
1823 WideIV->hasMoreThanOneUniqueUser() ||
1824 NewIVTy == WideIV->getScalarType())
1825 continue;
1826
1827 // Currently only handle cases where the single user is a header-mask
1828 // comparison with the backedge-taken-count.
1829 VPUser *SingleUser = WideIV->getSingleUser();
1830 if (!SingleUser ||
1831 !match(SingleUser, m_ICmp(m_Specific(WideIV),
1834 continue;
1835
1836 // Update IV operands and comparison bound to use new narrower type.
1837 auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
1838 WideIV->setStartValue(NewStart);
1839 auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
1840 WideIV->setStepValue(NewStep);
1841
1842 auto *NewBTC = new VPWidenCastRecipe(
1843 Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy);
1844 Plan.getVectorPreheader()->appendRecipe(NewBTC);
1845 auto *Cmp = cast<VPInstruction>(WideIV->getSingleUser());
1846 Cmp->setOperand(1, NewBTC);
1847
1848 MadeChange = true;
1849 }
1850
1851 return MadeChange;
1852}
1853
1854/// Return true if \p Cond is known to be true for the given \p BestVF and
1855/// \p BestUF.
1857 ElementCount BestVF, unsigned BestUF,
1860 return any_of(Cond->getDefiningRecipe()->operands(), [&Plan, BestVF, BestUF,
1861 &PSE](VPValue *C) {
1862 return isConditionTrueViaVFAndUF(C, Plan, BestVF, BestUF, PSE);
1863 });
1864
1865 auto *CanIV = Plan.getVectorLoopRegion()->getCanonicalIV();
1867 m_Specific(CanIV->getBackedgeValue()),
1868 m_Specific(&Plan.getVectorTripCount()))))
1869 return false;
1870
1871 // The compare checks CanIV + VFxUF == vector trip count. The vector trip
1872 // count is not conveniently available as SCEV so far, so we compare directly
1873 // against the original trip count. This is stricter than necessary, as we
1874 // will only return true if the trip count == vector trip count.
1875 const SCEV *VectorTripCount =
1877 if (isa<SCEVCouldNotCompute>(VectorTripCount))
1878 VectorTripCount = vputils::getSCEVExprForVPValue(Plan.getTripCount(), PSE);
1879 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
1880 "Trip count SCEV must be computable");
1881 ScalarEvolution &SE = *PSE.getSE();
1882 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
1883 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
1884 return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
1885}
1886
1887/// Try to replace multiple active lane masks used for control flow with
1888/// a single, wide active lane mask instruction followed by multiple
1889/// extract subvector intrinsics. This applies to the active lane mask
1890/// instructions both in the loop and in the preheader.
1891/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
1892/// new extracts from the first active lane mask, which has its last
1893/// operand (multiplier) set to UF.
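/// A rough sketch of the resulting shape, assuming UF = 2 (value names are
/// illustrative):
///
/// vector.ph:
///   %WideEntryALM = active-lane-mask %EntryInc, %TC, UF
///   %EntryExt0 = vector-extract %WideEntryALM, 0
///   %EntryExt1 = vector-extract %WideEntryALM, VF
///
/// vector.body:
///   %P0 = active-lane-mask-phi [ %EntryExt0, %vector.ph ], [ %Ext0, %vector.body ]
///   %P1 = active-lane-mask-phi [ %EntryExt1, %vector.ph ], [ %Ext1, %vector.body ]
///   ...
///   %WideALM = active-lane-mask %InLoopInc, %TC, UF
///   %Ext0 = vector-extract %WideALM, 0
///   %Ext1 = vector-extract %WideALM, VF
///   branch-on-cond (not %Ext0)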
1895 unsigned UF) {
1896 if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
1897 return false;
1898
1899 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1900 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1901 auto *Term = &ExitingVPBB->back();
1902
1903 using namespace llvm::VPlanPatternMatch;
1905 m_VPValue(), m_VPValue(), m_VPValue())))))
1906 return false;
1907
1908 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
1909 LLVMContext &Ctx = Plan.getContext();
1910
1911 auto ExtractFromALM = [&](VPInstruction *ALM,
1912 SmallVectorImpl<VPValue *> &Extracts) {
1913 DebugLoc DL = ALM->getDebugLoc();
1914 for (unsigned Part = 0; Part < UF; ++Part) {
1916 Ops.append({ALM, Plan.getOrAddLiveIn(
1917 ConstantInt::get(IntegerType::getInt64Ty(Ctx),
1918 VF.getKnownMinValue() * Part))});
1919 auto *Ext =
1920 new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
1921 IntegerType::getInt1Ty(Ctx), {}, {}, DL);
1922 Extracts[Part] = Ext;
1923 Ext->insertAfter(ALM);
1924 }
1925 };
1926
1927 // Create a list of each active lane mask phi, ordered by unroll part.
1929 for (VPRecipeBase &R : Header->phis()) {
1931 if (!Phi)
1932 continue;
1933 VPValue *Index = nullptr;
1934 match(Phi->getBackedgeValue(),
1936 assert(Index && "Expected index from ActiveLaneMask instruction");
1937
1938 uint64_t Part;
1939 if (match(Index,
1941 m_VPValue(), m_ConstantInt(Part))))
1942 Phis[Part] = Phi;
1943 else
1944 // Anything other than a CanonicalIVIncrementForPart is part 0
1945 Phis[0] = Phi;
1946 }
1947
1948 assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
1949 "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
1950
1951 auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
1952 auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
1953
1954 assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
1955 LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
1956 "Expected incoming values of Phi to be ActiveLaneMasks");
1957
1958 // When using wide lane masks, the return type of the get.active.lane.mask
1959 // intrinsic is VF x UF elements wide; the multiplier (last operand) is UF.
1960 VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
1961 EntryALM->setOperand(2, ALMMultiplier);
1962 LoopALM->setOperand(2, ALMMultiplier);
1963
1964 // Create UF x extract vectors and insert into preheader.
1965 SmallVector<VPValue *> EntryExtracts(UF);
1966 ExtractFromALM(EntryALM, EntryExtracts);
1967
1968 // Create UF x extract vectors and insert before the loop compare & branch,
1969 // updating the compare to use the first extract.
1970 SmallVector<VPValue *> LoopExtracts(UF);
1971 ExtractFromALM(LoopALM, LoopExtracts);
1972 VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
1973 Not->setOperand(0, LoopExtracts[0]);
1974
1975 // Update the incoming values of active lane mask phis.
1976 for (unsigned Part = 0; Part < UF; ++Part) {
1977 Phis[Part]->setStartValue(EntryExtracts[Part]);
1978 Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
1979 }
1980
1981 return true;
1982}
1983
1984/// Try to simplify the branch condition of \p Plan. This may restrict the
1985/// resulting plan to \p BestVF and \p BestUF.
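/// For example (a sketch): if the vector trip count is known to be at most
/// VF * UF, the latch terminator
///   branch-on-count %CanIV.next, %VectorTC
/// is replaced with
///   branch-on-cond true
/// and, if all header phis can be rewritten, the enclosing loop region is
/// dissolved entirely.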
1987 unsigned BestUF,
1989 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
1990 VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
1991 auto *Term = &ExitingVPBB->back();
1992 VPValue *Cond;
1993 if (match(Term, m_BranchOnCount()) ||
1995 m_VPValue(), m_VPValue(), m_VPValue()))))) {
1996 // Try to simplify the branch condition if VectorTC <= VF * UF when the
1997 // latch terminator is BranchOnCount or BranchOnCond(Not(ActiveLaneMask)).
1998 const SCEV *VectorTripCount =
2000 if (isa<SCEVCouldNotCompute>(VectorTripCount))
2001 VectorTripCount =
2003 assert(!isa<SCEVCouldNotCompute>(VectorTripCount) &&
2004 "Trip count SCEV must be computable");
2005 ScalarEvolution &SE = *PSE.getSE();
2006 ElementCount NumElements = BestVF.multiplyCoefficientBy(BestUF);
2007 const SCEV *C = SE.getElementCount(VectorTripCount->getType(), NumElements);
2008 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, VectorTripCount, C))
2009 return false;
2010 } else if (match(Term, m_BranchOnCond(m_VPValue(Cond))) ||
2012 // For BranchOnCond, check if we can prove the condition to be true using VF
2013 // and UF.
2014 if (!isConditionTrueViaVFAndUF(Cond, Plan, BestVF, BestUF, PSE))
2015 return false;
2016 } else {
2017 return false;
2018 }
2019
2020 // The vector loop region only executes once. If possible, completely remove
2021 // the region, otherwise replace the terminator controlling the latch with
2022 // (BranchOnCond true).
2023 // TODO: VPWidenIntOrFpInductionRecipe is only partially supported; add
2024 // support for other non-canonical widen induction recipes (e.g.,
2025 // VPWidenPointerInductionRecipe).
2026 // TODO: fold branch-on-constant after dissolving region.
2027 auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
2028 if (all_of(Header->phis(), [](VPRecipeBase &Phi) {
2029 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi))
2030 return R->isCanonical();
2031 return isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
2032 VPFirstOrderRecurrencePHIRecipe, VPPhi>(&Phi);
2033 })) {
2034 for (VPRecipeBase &HeaderR : make_early_inc_range(Header->phis())) {
2035 if (auto *R = dyn_cast<VPWidenIntOrFpInductionRecipe>(&HeaderR)) {
2036 VPBuilder Builder(Plan.getVectorPreheader());
2037 VPValue *StepV = Builder.createNaryOp(VPInstruction::StepVector, {},
2038 R->getScalarType());
2039 HeaderR.getVPSingleValue()->replaceAllUsesWith(StepV);
2040 HeaderR.eraseFromParent();
2041 continue;
2042 }
2043 auto *Phi = cast<VPPhiAccessors>(&HeaderR);
2044 HeaderR.getVPSingleValue()->replaceAllUsesWith(Phi->getIncomingValue(0));
2045 HeaderR.eraseFromParent();
2046 }
2047
2048 VPBlockBase *Preheader = VectorRegion->getSinglePredecessor();
2049 SmallVector<VPBlockBase *> Exits = to_vector(VectorRegion->getSuccessors());
2050 VPBlockUtils::disconnectBlocks(Preheader, VectorRegion);
2051 for (VPBlockBase *Exit : Exits)
2052 VPBlockUtils::disconnectBlocks(VectorRegion, Exit);
2053
2054 for (VPBlockBase *B : vp_depth_first_shallow(VectorRegion->getEntry()))
2055 B->setParent(nullptr);
2056
2057 VPBlockUtils::connectBlocks(Preheader, Header);
2058
2059 for (VPBlockBase *Exit : Exits)
2060 VPBlockUtils::connectBlocks(ExitingVPBB, Exit);
2061
2062 // Replace terminating branch-on-two-conds with branch-on-cond to early
2063 // exit.
2064 if (Exits.size() != 1) {
2065 assert(match(Term, m_BranchOnTwoConds()) && Exits.size() == 2 &&
2066 "BranchOnTwoConds needs 2 remaining exits");
2068 Term->getOperand(0));
2069 }
2071 } else {
2072 // The vector region contains header phis for which we cannot remove the
2073 // loop region yet.
2074
2075 // For BranchOnTwoConds, set the latch exit condition to true directly.
2076 if (match(Term, m_BranchOnTwoConds())) {
2077 Term->setOperand(1, Plan.getTrue());
2078 return true;
2079 }
2080
2081 auto *BOC = new VPInstruction(VPInstruction::BranchOnCond, {Plan.getTrue()},
2082 {}, {}, Term->getDebugLoc());
2083 ExitingVPBB->appendRecipe(BOC);
2084 }
2085
2086 Term->eraseFromParent();
2087
2088 return true;
2089}
2090
2091/// From the definition of llvm.experimental.get.vector.length,
2092/// VPInstruction::ExplicitVectorLength(%AVL) = %AVL when %AVL <= VF.
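/// For example (a sketch): if SCEV proves the AVL is at most VF, e.g. an AVL
/// known to be 7 with VF = 8, then
///   EXPLICIT-VECTOR-LENGTH %AVL
/// can be replaced by %AVL itself, zero-extended or truncated to i32.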
2096 vp_depth_first_deep(Plan.getEntry()))) {
2097 for (VPRecipeBase &R : *VPBB) {
2098 VPValue *AVL;
2099 if (!match(&R, m_EVL(m_VPValue(AVL))))
2100 continue;
2101
2102 const SCEV *AVLSCEV = vputils::getSCEVExprForVPValue(AVL, PSE);
2103 if (isa<SCEVCouldNotCompute>(AVLSCEV))
2104 continue;
2105 ScalarEvolution &SE = *PSE.getSE();
2106 const SCEV *VFSCEV = SE.getElementCount(AVLSCEV->getType(), VF);
2107 if (!SE.isKnownPredicate(CmpInst::ICMP_ULE, AVLSCEV, VFSCEV))
2108 continue;
2109
2111 AVL, Type::getInt32Ty(Plan.getContext()), AVLSCEV->getType(),
2112 R.getDebugLoc());
2113 R.getVPSingleValue()->replaceAllUsesWith(Trunc);
2114 return true;
2115 }
2116 }
2117 return false;
2118}
2119
2121 unsigned BestUF,
2123 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
2124 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
2125
2126 bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF);
2127 MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE);
2128 MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF);
2129 MadeChange |= simplifyKnownEVL(Plan, BestVF, PSE);
2130
2131 if (MadeChange) {
2132 Plan.setVF(BestVF);
2133 assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF");
2134 }
2135}
2136
2137/// Sink users of \p FOR after the recipe defining the previous value \p
2138/// Previous of the recurrence. \returns true if all users of \p FOR could be
2139/// re-arranged as needed or false if it is not possible.
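/// A rough sketch with illustrative names: given
///   %for = FIRST-ORDER-RECURRENCE-PHI [ %init ], [ %prev ]
///   %use = add %for, 1     ; user of the recurrence phi
///   %prev = ...            ; previous value, defined after its user
/// the user %use (and transitively its users) is moved after %prev, so the
/// recurrence can later be modeled as a splice of %for and %prev.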
2140static bool
2142 VPRecipeBase *Previous,
2143 VPDominatorTree &VPDT) {
2144 // Collect recipes that need sinking.
2147 Seen.insert(Previous);
2148 auto TryToPushSinkCandidate = [&](VPRecipeBase *SinkCandidate) {
2149 // The previous value must not depend on the users of the recurrence phi;
2150 // otherwise, FOR would not be a fixed-order recurrence.
2151 if (SinkCandidate == Previous)
2152 return false;
2153
2154 if (isa<VPHeaderPHIRecipe>(SinkCandidate) ||
2155 !Seen.insert(SinkCandidate).second ||
2156 VPDT.properlyDominates(Previous, SinkCandidate))
2157 return true;
2158
2159 if (cannotHoistOrSinkRecipe(*SinkCandidate))
2160 return false;
2161
2162 WorkList.push_back(SinkCandidate);
2163 return true;
2164 };
2165
2166 // Recursively sink users of FOR after Previous.
2167 WorkList.push_back(FOR);
2168 for (unsigned I = 0; I != WorkList.size(); ++I) {
2169 VPRecipeBase *Current = WorkList[I];
2170 assert(Current->getNumDefinedValues() == 1 &&
2171 "only recipes with a single defined value expected");
2172
2173 for (VPUser *User : Current->getVPSingleValue()->users()) {
2174 if (!TryToPushSinkCandidate(cast<VPRecipeBase>(User)))
2175 return false;
2176 }
2177 }
2178
2179 // Keep recipes to sink ordered by dominance so earlier instructions are
2180 // processed first.
2181 sort(WorkList, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2182 return VPDT.properlyDominates(A, B);
2183 });
2184
2185 for (VPRecipeBase *SinkCandidate : WorkList) {
2186 if (SinkCandidate == FOR)
2187 continue;
2188
2189 SinkCandidate->moveAfter(Previous);
2190 Previous = SinkCandidate;
2191 }
2192 return true;
2193}
2194
2195/// Try to hoist \p Previous and its operands before all users of \p FOR.
2197 VPRecipeBase *Previous,
2198 VPDominatorTree &VPDT) {
2199 if (cannotHoistOrSinkRecipe(*Previous))
2200 return false;
2201
2202 // Collect recipes that need hoisting.
2203 SmallVector<VPRecipeBase *> HoistCandidates;
2205 VPRecipeBase *HoistPoint = nullptr;
2206 // Find the closest hoist point by looking at all users of FOR and selecting
2207 // the recipe dominating all other users.
2208 for (VPUser *U : FOR->users()) {
2209 auto *R = cast<VPRecipeBase>(U);
2210 if (!HoistPoint || VPDT.properlyDominates(R, HoistPoint))
2211 HoistPoint = R;
2212 }
2213 assert(all_of(FOR->users(),
2214 [&VPDT, HoistPoint](VPUser *U) {
2215 auto *R = cast<VPRecipeBase>(U);
2216 return HoistPoint == R ||
2217 VPDT.properlyDominates(HoistPoint, R);
2218 }) &&
2219 "HoistPoint must dominate all users of FOR");
2220
2221 auto NeedsHoisting = [HoistPoint, &VPDT,
2222 &Visited](VPValue *HoistCandidateV) -> VPRecipeBase * {
2223 VPRecipeBase *HoistCandidate = HoistCandidateV->getDefiningRecipe();
2224 if (!HoistCandidate)
2225 return nullptr;
2226 VPRegionBlock *EnclosingLoopRegion =
2227 HoistCandidate->getParent()->getEnclosingLoopRegion();
2228 assert((!HoistCandidate->getRegion() ||
2229 HoistCandidate->getRegion() == EnclosingLoopRegion) &&
2230 "CFG in VPlan should still be flat, without replicate regions");
2231 // Hoist candidate was already visited, no need to hoist.
2232 if (!Visited.insert(HoistCandidate).second)
2233 return nullptr;
2234
2235 // The candidate is outside the loop region or a header phi; it already
2236 // dominates the users of FOR without hoisting.
2237 if (!EnclosingLoopRegion || isa<VPHeaderPHIRecipe>(HoistCandidate))
2238 return nullptr;
2239
2240 // If we reached a recipe that dominates HoistPoint, we don't need to
2241 // hoist the recipe.
2242 if (VPDT.properlyDominates(HoistCandidate, HoistPoint))
2243 return nullptr;
2244 return HoistCandidate;
2245 };
2246
2247 if (!NeedsHoisting(Previous->getVPSingleValue()))
2248 return true;
2249
2250 // Recursively try to hoist Previous and its operands before all users of FOR.
2251 HoistCandidates.push_back(Previous);
2252
2253 for (unsigned I = 0; I != HoistCandidates.size(); ++I) {
2254 VPRecipeBase *Current = HoistCandidates[I];
2255 assert(Current->getNumDefinedValues() == 1 &&
2256 "only recipes with a single defined value expected");
2257 if (cannotHoistOrSinkRecipe(*Current))
2258 return false;
2259
2260 for (VPValue *Op : Current->operands()) {
2261 // If we reach FOR, it means the original Previous depends on some other
2262 // recurrence that in turn depends on FOR. If that is the case, we would
2263 // also need to hoist recipes involving the other FOR, which may break
2264 // dependencies.
2265 if (Op == FOR)
2266 return false;
2267
2268 if (auto *R = NeedsHoisting(Op)) {
2269 // Bail out if the recipe defines multiple values.
2270 // TODO: Hoisting such recipes requires additional handling.
2271 if (R->getNumDefinedValues() != 1)
2272 return false;
2273 HoistCandidates.push_back(R);
2274 }
2275 }
2276 }
2277
2278 // Order recipes to hoist by dominance so earlier instructions are processed
2279 // first.
2280 sort(HoistCandidates, [&VPDT](const VPRecipeBase *A, const VPRecipeBase *B) {
2281 return VPDT.properlyDominates(A, B);
2282 });
2283
2284 for (VPRecipeBase *HoistCandidate : HoistCandidates) {
2285 HoistCandidate->moveBefore(*HoistPoint->getParent(),
2286 HoistPoint->getIterator());
2287 }
2288
2289 return true;
2290}
2291
2293 VPBuilder &LoopBuilder) {
2294 VPDominatorTree VPDT(Plan);
2295
2297 for (VPRecipeBase &R :
2300 RecurrencePhis.push_back(FOR);
2301
2302 for (VPFirstOrderRecurrencePHIRecipe *FOR : RecurrencePhis) {
2304 VPRecipeBase *Previous = FOR->getBackedgeValue()->getDefiningRecipe();
2305 // Fixed-order recurrences do not contain cycles, so this loop is guaranteed
2306 // to terminate.
2307 while (auto *PrevPhi =
2309 assert(PrevPhi->getParent() == FOR->getParent());
2310 assert(SeenPhis.insert(PrevPhi).second);
2311 Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
2312 }
2313
2314 if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT) &&
2315 !hoistPreviousBeforeFORUsers(FOR, Previous, VPDT))
2316 return false;
2317
2318 // Introduce a recipe to combine the incoming and previous values of a
2319 // fixed-order recurrence.
2320 VPBasicBlock *InsertBlock = Previous->getParent();
2321 if (isa<VPHeaderPHIRecipe>(Previous))
2322 LoopBuilder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
2323 else
2324 LoopBuilder.setInsertPoint(InsertBlock,
2325 std::next(Previous->getIterator()));
2326
2327 auto *RecurSplice =
2329 {FOR, FOR->getBackedgeValue()});
2330
2331 FOR->replaceAllUsesWith(RecurSplice);
2332 // Set the first operand of RecurSplice to FOR again, after replacing
2333 // all users.
2334 RecurSplice->setOperand(0, FOR);
2335
2336 // Check for users extracting at the penultimate active lane of the FOR.
2337 // If only a single lane is active in the current iteration, we need to
2338 // select the last element from the previous iteration (from the FOR phi
2339 // directly).
2340 for (VPUser *U : RecurSplice->users()) {
2342 m_Specific(RecurSplice))))
2343 continue;
2344
2346 VPValue *LastActiveLane = cast<VPInstruction>(U)->getOperand(0);
2347 Type *I64Ty = Type::getInt64Ty(Plan.getContext());
2348 VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 0));
2349 VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(I64Ty, 1));
2350 VPValue *PenultimateIndex =
2351 B.createNaryOp(Instruction::Sub, {LastActiveLane, One});
2352 VPValue *PenultimateLastIter =
2353 B.createNaryOp(VPInstruction::ExtractLane,
2354 {PenultimateIndex, FOR->getBackedgeValue()});
2355 VPValue *LastPrevIter =
2356 B.createNaryOp(VPInstruction::ExtractLastLane, FOR);
2357
2358 VPValue *Cmp = B.createICmp(CmpInst::ICMP_EQ, LastActiveLane, Zero);
2359 VPValue *Sel = B.createSelect(Cmp, LastPrevIter, PenultimateLastIter);
2360 cast<VPInstruction>(U)->replaceAllUsesWith(Sel);
2361 }
2362 }
2363 return true;
2364}
2365
2367 for (VPRecipeBase &R :
2369 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
2370 if (!PhiR)
2371 continue;
2372 RecurKind RK = PhiR->getRecurrenceKind();
2373 if (RK != RecurKind::Add && RK != RecurKind::Mul && RK != RecurKind::Sub &&
2375 continue;
2376
2377 for (VPUser *U : collectUsersRecursively(PhiR))
2378 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(U)) {
2379 RecWithFlags->dropPoisonGeneratingFlags();
2380 }
2381 }
2382}
2383
2384namespace {
2385struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
2386 static bool isSentinel(const VPSingleDefRecipe *Def) {
2387 return Def == getEmptyKey() || Def == getTombstoneKey();
2388 }
2389
2390 /// If recipe \p R will lower to a GEP with a non-i8 source element type,
2391 /// return that source element type.
2392 static Type *getGEPSourceElementType(const VPSingleDefRecipe *R) {
2393 // All VPInstructions that lower to GEPs must have the i8 source element
2394 // type (as they are PtrAdds), so we omit it.
2396 .Case<VPReplicateRecipe>([](auto *I) -> Type * {
2397 if (auto *GEP = dyn_cast<GetElementPtrInst>(I->getUnderlyingValue()))
2398 return GEP->getSourceElementType();
2399 return nullptr;
2400 })
2401 .Case<VPVectorPointerRecipe, VPWidenGEPRecipe>(
2402 [](auto *I) { return I->getSourceElementType(); })
2403 .Default([](auto *) { return nullptr; });
2404 }
2405
2406 /// Returns true if recipe \p Def can be safely handled for CSE.
2407 static bool canHandle(const VPSingleDefRecipe *Def) {
2408 // We can extend the list of handled recipes in the future,
2409 // provided we account for the data embedded in them while checking for
2410 // equality or hashing.
2411 auto C = getOpcodeOrIntrinsicID(Def);
2412
2413 // The issue with (Insert|Extract)Value is that the index of the
2414 // insert/extract is not a proper operand in LLVM IR, and hence also not in
2415 // VPlan.
2416 if (!C || (!C->first && (C->second == Instruction::InsertValue ||
2417 C->second == Instruction::ExtractValue)))
2418 return false;
2419
2420 // During CSE, we can only handle recipes that don't read from memory: if
2421 // they read from memory, there could be an intervening write to memory
2422 // before the next instance is CSE'd, leading to an incorrect result.
2423 return !Def->mayReadFromMemory();
2424 }
2425
2426 /// Hash the underlying data of \p Def.
2427 static unsigned getHashValue(const VPSingleDefRecipe *Def) {
2428 const VPlan *Plan = Def->getParent()->getPlan();
2429 VPTypeAnalysis TypeInfo(*Plan);
2430 hash_code Result = hash_combine(
2431 Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
2432 getGEPSourceElementType(Def), TypeInfo.inferScalarType(Def),
2434 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
2435 if (RFlags->hasPredicate())
2436 return hash_combine(Result, RFlags->getPredicate());
2437 return Result;
2438 }
2439
2440 /// Check equality of underlying data of \p L and \p R.
2441 static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
2442 if (isSentinel(L) || isSentinel(R))
2443 return L == R;
2444 if (L->getVPDefID() != R->getVPDefID() ||
2446 getGEPSourceElementType(L) != getGEPSourceElementType(R) ||
2448 !equal(L->operands(), R->operands()))
2449 return false;
2451 "must have valid opcode info for both recipes");
2452 if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
2453 if (LFlags->hasPredicate() &&
2454 LFlags->getPredicate() !=
2455 cast<VPRecipeWithIRFlags>(R)->getPredicate())
2456 return false;
2457 // Recipes in replicate regions implicitly depend on predicate. If either
2458 // recipe is in a replicate region, only consider them equal if both have
2459 // the same parent.
2460 const VPRegionBlock *RegionL = L->getRegion();
2461 const VPRegionBlock *RegionR = R->getRegion();
2462 if (((RegionL && RegionL->isReplicator()) ||
2463 (RegionR && RegionR->isReplicator())) &&
2464 L->getParent() != R->getParent())
2465 return false;
2466 const VPlan *Plan = L->getParent()->getPlan();
2467 VPTypeAnalysis TypeInfo(*Plan);
2468 return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
2469 }
2470};
2471} // end anonymous namespace
2472
2473/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
2474/// Plan.
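/// For example (a sketch with illustrative names): given two identical recipes
///   vp<%a> = add ir<%x>, ir<%y>
///   ...
///   vp<%b> = add ir<%x>, ir<%y>
/// where the first dominates the second, all uses of vp<%b> are replaced by
/// vp<%a>, and only the IR flags common to both recipes are kept.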
2476 VPDominatorTree VPDT(Plan);
2478
2480 vp_depth_first_deep(Plan.getEntry()))) {
2481 for (VPRecipeBase &R : *VPBB) {
2482 auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
2483 if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
2484 continue;
2485 if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
2486 // V must dominate Def for a valid replacement.
2487 if (!VPDT.dominates(V->getParent(), VPBB))
2488 continue;
2489 // Only keep flags present on both V and Def.
2490 if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
2491 RFlags->intersectFlags(*cast<VPRecipeWithIRFlags>(Def));
2492 Def->replaceAllUsesWith(V);
2493 continue;
2494 }
2495 CSEMap[Def] = Def;
2496 }
2497 }
2498}
2499
2500/// Move loop-invariant recipes out of the vector loop region in \p Plan.
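/// For example (a sketch): a hoistable recipe inside the vector loop region
/// whose operands are all defined outside the region, such as
///   vp<%inv> = mul ir<%n>, ir<%m>    ; both operands are live-ins
/// is moved to the vector preheader, where it executes only once.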
2501static void licm(VPlan &Plan) {
2502 VPBasicBlock *Preheader = Plan.getVectorPreheader();
2503
2504 // Hoist any loop invariant recipes from the vector loop region to the
2505 // preheader. Perform a shallow traversal of the vector loop region to
2506 // exclude recipes in replicate regions. Since the top-level blocks in the
2507 // vector loop region are guaranteed to execute if the vector pre-header is,
2508 // we don't need to check speculation safety.
2509 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2510 assert(Preheader->getSingleSuccessor() == LoopRegion &&
2511 "Expected vector prehader's successor to be the vector loop region");
2513 vp_depth_first_shallow(LoopRegion->getEntry()))) {
2514 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2516 continue;
2517 if (any_of(R.operands(), [](VPValue *Op) {
2518 return !Op->isDefinedOutsideLoopRegions();
2519 }))
2520 continue;
2521 R.moveBefore(*Preheader, Preheader->end());
2522 }
2523 }
2524}
2525
2527 VPlan &Plan, const MapVector<Instruction *, uint64_t> &MinBWs) {
2528 if (Plan.hasScalarVFOnly())
2529 return;
2530 // Keep track of created truncates, so they can be re-used. Note that we
2531 // cannot use RAUW after creating a new truncate, as this could make
2532 // other uses have different types for their operands, making them invalidly
2533 // typed.
2535 VPTypeAnalysis TypeInfo(Plan);
2536 VPBasicBlock *PH = Plan.getVectorPreheader();
2539 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
2542 &R))
2543 continue;
2544
2545 VPValue *ResultVPV = R.getVPSingleValue();
2546 auto *UI = cast_or_null<Instruction>(ResultVPV->getUnderlyingValue());
2547 unsigned NewResSizeInBits = MinBWs.lookup(UI);
2548 if (!NewResSizeInBits)
2549 continue;
2550
2551 // If the value wasn't vectorized, we must maintain the original scalar
2552 // type. Skip those here, after incrementing NumProcessedRecipes. Also
2553 // skip casts which do not need to be handled explicitly here, as
2554 // redundant casts will be removed during recipe simplification.
2556 continue;
2557
2558 Type *OldResTy = TypeInfo.inferScalarType(ResultVPV);
2559 unsigned OldResSizeInBits = OldResTy->getScalarSizeInBits();
2560 assert(OldResTy->isIntegerTy() && "only integer types supported");
2561 (void)OldResSizeInBits;
2562
2563 auto *NewResTy = IntegerType::get(Plan.getContext(), NewResSizeInBits);
2564
2565 // Any wrapping introduced by shrinking this operation shouldn't be
2566 // considered undefined behavior. So, we can't unconditionally copy
2567 // arithmetic wrapping flags to VPW.
2568 if (auto *VPW = dyn_cast<VPRecipeWithIRFlags>(&R))
2569 VPW->dropPoisonGeneratingFlags();
2570
2571 if (OldResSizeInBits != NewResSizeInBits &&
2572 !match(&R, m_ICmp(m_VPValue(), m_VPValue()))) {
2573 // Extend result to original width.
2574 auto *Ext =
2575 new VPWidenCastRecipe(Instruction::ZExt, ResultVPV, OldResTy);
2576 Ext->insertAfter(&R);
2577 ResultVPV->replaceAllUsesWith(Ext);
2578 Ext->setOperand(0, ResultVPV);
2579 assert(OldResSizeInBits > NewResSizeInBits && "Nothing to shrink?");
2580 } else {
2581 assert(match(&R, m_ICmp(m_VPValue(), m_VPValue())) &&
2582 "Only ICmps should not need extending the result.");
2583 }
2584
2585 assert(!isa<VPWidenStoreRecipe>(&R) && "stores cannot be narrowed");
2587 continue;
2588
2589 // Shrink operands by introducing truncates as needed.
2590 unsigned StartIdx = isa<VPWidenSelectRecipe>(&R) ? 1 : 0;
2591 for (unsigned Idx = StartIdx; Idx != R.getNumOperands(); ++Idx) {
2592 auto *Op = R.getOperand(Idx);
2593 unsigned OpSizeInBits =
2595 if (OpSizeInBits == NewResSizeInBits)
2596 continue;
2597 assert(OpSizeInBits > NewResSizeInBits && "nothing to truncate");
2598 auto [ProcessedIter, IterIsEmpty] = ProcessedTruncs.try_emplace(Op);
2599 if (!IterIsEmpty) {
2600 R.setOperand(Idx, ProcessedIter->second);
2601 continue;
2602 }
2603
2604 VPBuilder Builder;
2605 if (Op->isLiveIn())
2606 Builder.setInsertPoint(PH);
2607 else
2608 Builder.setInsertPoint(&R);
2609 VPWidenCastRecipe *NewOp =
2610 Builder.createWidenCast(Instruction::Trunc, Op, NewResTy);
2611 ProcessedIter->second = NewOp;
2612 R.setOperand(Idx, NewOp);
2613 }
2614
2615 }
2616 }
2617}
2618
2622 VPValue *Cond;
2623 // Skip blocks that are not terminated by BranchOnCond.
2624 if (VPBB->empty() || !match(&VPBB->back(), m_BranchOnCond(m_VPValue(Cond))))
2625 continue;
2626
2627 assert(VPBB->getNumSuccessors() == 2 &&
2628 "Two successors expected for BranchOnCond");
2629 unsigned RemovedIdx;
2630 if (match(Cond, m_True()))
2631 RemovedIdx = 1;
2632 else if (match(Cond, m_False()))
2633 RemovedIdx = 0;
2634 else
2635 continue;
2636
2637 VPBasicBlock *RemovedSucc =
2638 cast<VPBasicBlock>(VPBB->getSuccessors()[RemovedIdx]);
2639 assert(count(RemovedSucc->getPredecessors(), VPBB) == 1 &&
2640 "There must be a single edge between VPBB and its successor");
2641 // Values coming from VPBB into phi recipes of RemovedSucc are removed from
2642 // these recipes.
2643 for (VPRecipeBase &R : RemovedSucc->phis())
2644 cast<VPPhiAccessors>(&R)->removeIncomingValueFor(VPBB);
2645
2646 // Disconnect blocks and remove the terminator. RemovedSucc will be deleted
2647 // automatically on VPlan destruction if it becomes unreachable.
2648 VPBlockUtils::disconnectBlocks(VPBB, RemovedSucc);
2649 VPBB->back().eraseFromParent();
2650 }
2651}
2652
2672
2673// Add a VPActiveLaneMaskPHIRecipe and related recipes to \p Plan and replace
2674// the loop terminator with a branch-on-cond recipe with the negated
2675// active-lane-mask as operand. Note that this turns the loop into an
2676// uncountable one. Only the existing terminator is replaced; all other existing
2677// recipes/users remain unchanged, except for poison-generating flags being
2678// dropped from the canonical IV increment. Return the created
2679// VPActiveLaneMaskPHIRecipe.
2680//
2681// The function uses the following definitions:
2682//
2683// %TripCount = DataWithControlFlowWithoutRuntimeCheck ?
2684// calculate-trip-count-minus-VF (original TC) : original TC
2685// %IncrementValue = DataWithControlFlowWithoutRuntimeCheck ?
2686// CanonicalIVPhi : CanonicalIVIncrement
2687// %StartV is the canonical induction start value.
2688//
2689// The function adds the following recipes:
2690//
2691// vector.ph:
2692// %TripCount = calculate-trip-count-minus-VF (original TC)
2693// [if DataWithControlFlowWithoutRuntimeCheck]
2694// %EntryInc = canonical-iv-increment-for-part %StartV
2695// %EntryALM = active-lane-mask %EntryInc, %TripCount
2696//
2697// vector.body:
2698// ...
2699// %P = active-lane-mask-phi [ %EntryALM, %vector.ph ], [ %ALM, %vector.body ]
2700// ...
2701// %InLoopInc = canonical-iv-increment-for-part %IncrementValue
2702// %ALM = active-lane-mask %InLoopInc, %TripCount
2703// %Negated = Not %ALM
2704// branch-on-cond %Negated
2705//
2708 VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
2709 VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
2710 auto *CanonicalIVPHI = TopRegion->getCanonicalIV();
2711 VPValue *StartV = CanonicalIVPHI->getStartValue();
2712
2713 auto *CanonicalIVIncrement =
2714 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
2715 // TODO: Check if dropping the flags is needed if
2716 // !DataAndControlFlowWithoutRuntimeCheck.
2717 CanonicalIVIncrement->dropPoisonGeneratingFlags();
2718 DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
2719 // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
2720 // we have to take unrolling into account. Each part needs to start at
2721 // Part * VF.
2722 auto *VecPreheader = Plan.getVectorPreheader();
2723 VPBuilder Builder(VecPreheader);
2724
2725 // Create the ActiveLaneMask instruction using the correct start values.
2726 VPValue *TC = Plan.getTripCount();
2727
2728 VPValue *TripCount, *IncrementValue;
2730 // When the loop is guarded by a runtime overflow check for the loop
2731 // induction variable increment by VF, we can increment the value before
2732 // the get.active.lane.mask intrinsic and use the unmodified trip count.
2733 IncrementValue = CanonicalIVIncrement;
2734 TripCount = TC;
2735 } else {
2736 // When avoiding a runtime check, the active.lane.mask inside the loop
2737 // uses a modified trip count and the induction variable increment is
2738 // done after the active.lane.mask intrinsic is called.
2739 IncrementValue = CanonicalIVPHI;
2740 TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
2741 {TC}, DL);
2742 }
2743 auto *EntryIncrement = Builder.createOverflowingOp(
2744 VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
2745 "index.part.next");
2746
2747 // Create the active lane mask instruction in the VPlan preheader.
2748 VPValue *ALMMultiplier =
2749 Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
2750 auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2751 {EntryIncrement, TC, ALMMultiplier}, DL,
2752 "active.lane.mask.entry");
2753
2754 // Now create the ActiveLaneMaskPhi recipe in the main loop using the
2755 // preheader ActiveLaneMask instruction.
2756 auto *LaneMaskPhi =
2758 LaneMaskPhi->insertAfter(CanonicalIVPHI);
2759
2760 // Create the active lane mask for the next iteration of the loop before the
2761 // original terminator.
2762 VPRecipeBase *OriginalTerminator = EB->getTerminator();
2763 Builder.setInsertPoint(OriginalTerminator);
2764 auto *InLoopIncrement =
2765 Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
2766 {IncrementValue}, {false, false}, DL);
2767 auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
2768 {InLoopIncrement, TripCount, ALMMultiplier},
2769 DL, "active.lane.mask.next");
2770 LaneMaskPhi->addOperand(ALM);
2771
2772 // Replace the original terminator with BranchOnCond. We have to invert the
2773 // mask here because a true condition means jumping to the exit block.
2774 auto *NotMask = Builder.createNot(ALM, DL);
2775 Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
2776 OriginalTerminator->eraseFromParent();
2777 return LaneMaskPhi;
2778}
2779
2780/// Collect the header mask with the pattern:
2781/// (ICMP_ULE, WideCanonicalIV, backedge-taken-count)
2782/// TODO: Introduce explicit recipe for header-mask instead of searching
2783/// for the header-mask pattern manually.
2785 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2786 SmallVector<VPValue *> WideCanonicalIVs;
2787 auto *FoundWidenCanonicalIVUser = find_if(
2789 assert(count_if(LoopRegion->getCanonicalIV()->users(),
2791 "Must have at most one VPWideCanonicalIVRecipe");
2792 if (FoundWidenCanonicalIVUser !=
2793 LoopRegion->getCanonicalIV()->users().end()) {
2794 auto *WideCanonicalIV =
2795 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2796 WideCanonicalIVs.push_back(WideCanonicalIV);
2797 }
2798
2799 // Also include VPWidenIntOrFpInductionRecipes that represent a widened
2800 // version of the canonical induction.
2801 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
2802 for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
2803 auto *WidenOriginalIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
2804 if (WidenOriginalIV && WidenOriginalIV->isCanonical())
2805 WideCanonicalIVs.push_back(WidenOriginalIV);
2806 }
2807
2808 // Walk users of wide canonical IVs and find the single compare of the form
2809 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count).
2810 VPSingleDefRecipe *HeaderMask = nullptr;
2811 for (auto *Wide : WideCanonicalIVs) {
2812 for (VPUser *U : SmallVector<VPUser *>(Wide->users())) {
2813 auto *VPI = dyn_cast<VPInstruction>(U);
2814 if (!VPI || !vputils::isHeaderMask(VPI, Plan))
2815 continue;
2816
2817 assert(VPI->getOperand(0) == Wide &&
2818 "WidenCanonicalIV must be the first operand of the compare");
2819 assert(!HeaderMask && "Multiple header masks found?");
2820 HeaderMask = VPI;
2821 }
2822 }
2823 return HeaderMask;
2824}
2825
2827 VPlan &Plan, bool UseActiveLaneMaskForControlFlow,
2830 UseActiveLaneMaskForControlFlow) &&
2831 "DataAndControlFlowWithoutRuntimeCheck implies "
2832 "UseActiveLaneMaskForControlFlow");
2833
2834 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2835 auto *FoundWidenCanonicalIVUser = find_if(
2837 assert(FoundWidenCanonicalIVUser &&
2838 "Must have widened canonical IV when tail folding!");
2839 VPSingleDefRecipe *HeaderMask = findHeaderMask(Plan);
2840 auto *WideCanonicalIV =
2841 cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
2842 VPSingleDefRecipe *LaneMask;
2843 if (UseActiveLaneMaskForControlFlow) {
2846 } else {
2847 VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
2848 VPValue *ALMMultiplier = Plan.getOrAddLiveIn(
2849 ConstantInt::get(LoopRegion->getCanonicalIVType(), 1));
2850 LaneMask =
2851 B.createNaryOp(VPInstruction::ActiveLaneMask,
2852 {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier},
2853 nullptr, "active.lane.mask");
2854 }
2855
2856 // Walk users of WideCanonicalIV and replace the header mask of the form
2857 // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an active-lane-mask,
2858 // removing the old one to ensure there is always only a single header mask.
2859 HeaderMask->replaceAllUsesWith(LaneMask);
2860 HeaderMask->eraseFromParent();
2861}
2862
2863template <typename Op0_t, typename Op1_t> struct RemoveMask_match {
2864 Op0_t In;
2866
2867 RemoveMask_match(const Op0_t &In, Op1_t &Out) : In(In), Out(Out) {}
2868
2869 template <typename OpTy> bool match(OpTy *V) const {
2870 if (m_Specific(In).match(V)) {
2871 Out = nullptr;
2872 return true;
2873 }
2874 return m_LogicalAnd(m_Specific(In), m_VPValue(Out)).match(V);
2875 }
2876};
2877
2878/// Match a specific mask \p In, or a combination of it (logical-and In, Out).
2879/// Binds the remaining part to \p Out on a match, or nullptr otherwise.
2880template <typename Op0_t, typename Op1_t>
2881static inline RemoveMask_match<Op0_t, Op1_t> m_RemoveMask(const Op0_t &In,
2882 Op1_t &Out) {
2883 return RemoveMask_match<Op0_t, Op1_t>(In, Out);
2884}
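// Usage sketch (mirroring the select case below): Mask is bound to the
// remaining mask, or to nullptr if the matched value is HeaderMask itself.
//   VPValue *Mask, *LHS, *RHS;
//   if (match(&R, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
//                          m_VPValue(RHS))))
//     ...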
2885
2886/// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding
2887/// EVL-based recipe without the header mask. Returns nullptr if no EVL-based
2888/// recipe could be created.
2889/// \p HeaderMask The header mask.
2890/// \p CurRecipe The recipe to be transformed.
2891/// \p TypeInfo VPlan-based type analysis.
2892/// \p EVL The explicit vector length parameter of vector-predication
2893/// intrinsics.
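/// For example (a sketch, with illustrative names and print format): a load
/// masked only by the header mask
///   WIDEN ir<%v> = load vp<%addr>, vp<%header.mask>
/// becomes an EVL-based load without the mask
///   WIDEN ir<%v> = vp.load vp<%addr>, vp<%evl>
/// and a select on the header mask similarly becomes a vp.merge on the EVL.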
2895 VPRecipeBase &CurRecipe,
2896 VPTypeAnalysis &TypeInfo, VPValue &EVL) {
2897 VPlan *Plan = CurRecipe.getParent()->getPlan();
2898 DebugLoc DL = CurRecipe.getDebugLoc();
2899 VPValue *Addr, *Mask, *EndPtr;
2900
2901 /// Adjust any end pointers so that they point to the end of the EVL lanes, not VF.
2902 auto AdjustEndPtr = [&CurRecipe, &EVL](VPValue *EndPtr) {
2903 auto *EVLEndPtr = cast<VPVectorEndPointerRecipe>(EndPtr)->clone();
2904 EVLEndPtr->insertBefore(&CurRecipe);
2905 EVLEndPtr->setOperand(1, &EVL);
2906 return EVLEndPtr;
2907 };
2908
2909 if (match(&CurRecipe,
2910 m_MaskedLoad(m_VPValue(Addr), m_RemoveMask(HeaderMask, Mask))) &&
2911 !cast<VPWidenLoadRecipe>(CurRecipe).isReverse())
2912 return new VPWidenLoadEVLRecipe(cast<VPWidenLoadRecipe>(CurRecipe), Addr,
2913 EVL, Mask);
2914
2915 VPValue *ReversedVal;
2916 if (match(&CurRecipe, m_Reverse(m_VPValue(ReversedVal))) &&
2917 match(ReversedVal,
2918 m_MaskedLoad(m_VPValue(EndPtr), m_RemoveMask(HeaderMask, Mask))) &&
2919 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2920 cast<VPWidenLoadRecipe>(ReversedVal)->isReverse()) {
2921 auto *LoadR = new VPWidenLoadEVLRecipe(
2922 *cast<VPWidenLoadRecipe>(ReversedVal), AdjustEndPtr(EndPtr), EVL, Mask);
2923 LoadR->insertBefore(&CurRecipe);
2924 return new VPWidenIntrinsicRecipe(
2925 Intrinsic::experimental_vp_reverse, {LoadR, Plan->getTrue(), &EVL},
2926 TypeInfo.inferScalarType(LoadR), {}, {}, DL);
2927 }
2928
2929 VPValue *StoredVal;
2930 if (match(&CurRecipe, m_MaskedStore(m_VPValue(Addr), m_VPValue(StoredVal),
2931 m_RemoveMask(HeaderMask, Mask))) &&
2932 !cast<VPWidenStoreRecipe>(CurRecipe).isReverse())
2933 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe), Addr,
2934 StoredVal, EVL, Mask);
2935
2936 if (match(&CurRecipe,
2937 m_MaskedStore(m_VPValue(EndPtr), m_Reverse(m_VPValue(ReversedVal)),
2938 m_RemoveMask(HeaderMask, Mask))) &&
2939 match(EndPtr, m_VecEndPtr(m_VPValue(Addr), m_Specific(&Plan->getVF()))) &&
2940 cast<VPWidenStoreRecipe>(CurRecipe).isReverse()) {
2941 auto *NewReverse = new VPWidenIntrinsicRecipe(
2942 Intrinsic::experimental_vp_reverse,
2943 {ReversedVal, Plan->getTrue(), &EVL},
2944 TypeInfo.inferScalarType(ReversedVal), {}, {}, DL);
2945 NewReverse->insertBefore(&CurRecipe);
2946 return new VPWidenStoreEVLRecipe(cast<VPWidenStoreRecipe>(CurRecipe),
2947 AdjustEndPtr(EndPtr), NewReverse, EVL,
2948 Mask);
2949 }
2950
2951 if (auto *Rdx = dyn_cast<VPReductionRecipe>(&CurRecipe))
2952 if (Rdx->isConditional() &&
2953 match(Rdx->getCondOp(), m_RemoveMask(HeaderMask, Mask)))
2954 return new VPReductionEVLRecipe(*Rdx, EVL, Mask);
2955
2956 if (auto *Interleave = dyn_cast<VPInterleaveRecipe>(&CurRecipe))
2957 if (Interleave->getMask() &&
2958 match(Interleave->getMask(), m_RemoveMask(HeaderMask, Mask)))
2959 return new VPInterleaveEVLRecipe(*Interleave, EVL, Mask);
2960
2961 VPValue *LHS, *RHS;
2962 if (match(&CurRecipe,
2963 m_Select(m_Specific(HeaderMask), m_VPValue(LHS), m_VPValue(RHS))))
2964 return new VPWidenIntrinsicRecipe(
2965 Intrinsic::vp_merge, {Plan->getTrue(), LHS, RHS, &EVL},
2966 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2967
2968 if (match(&CurRecipe, m_Select(m_RemoveMask(HeaderMask, Mask), m_VPValue(LHS),
2969 m_VPValue(RHS))))
2970 return new VPWidenIntrinsicRecipe(
2971 Intrinsic::vp_merge, {Mask, LHS, RHS, &EVL},
2972 TypeInfo.inferScalarType(LHS), {}, {}, DL);
2973
2974 if (match(&CurRecipe, m_LastActiveLane(m_Specific(HeaderMask)))) {
2975 Type *Ty = TypeInfo.inferScalarType(CurRecipe.getVPSingleValue());
2976 VPValue *ZExt =
2977 VPBuilder(&CurRecipe).createScalarCast(Instruction::ZExt, &EVL, Ty, DL);
2978 return new VPInstruction(Instruction::Sub,
2979 {ZExt, Plan->getConstantInt(Ty, 1)}, {}, {}, DL);
2980 }
2981
2982 return nullptr;
2983}
2984
2985/// Replace recipes with their EVL variants.
2987 VPTypeAnalysis TypeInfo(Plan);
2988 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
2989 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
2990
2991 assert(all_of(Plan.getVF().users(),
2994 "User of VF that we can't transform to EVL.");
2995 Plan.getVF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
2997 });
2998
2999 assert(all_of(Plan.getVFxUF().users(),
3000 [&LoopRegion, &Plan](VPUser *U) {
3001 return match(U,
3002 m_c_Add(m_Specific(LoopRegion->getCanonicalIV()),
3003 m_Specific(&Plan.getVFxUF()))) ||
3004 isa<VPWidenPointerInductionRecipe>(U);
3005 }) &&
3006 "Only users of VFxUF should be VPWidenPointerInductionRecipe and the "
3007 "increment of the canonical induction.");
3008 Plan.getVFxUF().replaceUsesWithIf(&EVL, [](VPUser &U, unsigned Idx) {
3009 // Only replace uses in VPWidenPointerInductionRecipe; the increment of the
3010 // canonical induction must not be updated.
3012 });
3013
3014 // Defer erasing recipes till the end so that we don't invalidate the
3015 // VPTypeAnalysis cache.
3017
3018 // Create a scalar phi to track the previous EVL if the plan contains a
3019 // fixed-order recurrence.
3020 bool ContainsFORs =
3022 if (ContainsFORs) {
3023 // TODO: Use VPInstruction::ExplicitVectorLength to get maximum EVL.
3024 VPValue *MaxEVL = &Plan.getVF();
3025 // Emit VPScalarCastRecipe in the preheader if VF is not a 32-bit integer.
3026 VPBuilder Builder(LoopRegion->getPreheaderVPBB());
3027 MaxEVL = Builder.createScalarZExtOrTrunc(
3028 MaxEVL, Type::getInt32Ty(Plan.getContext()),
3029 TypeInfo.inferScalarType(MaxEVL), DebugLoc::getUnknown());
3030
3031 Builder.setInsertPoint(Header, Header->getFirstNonPhi());
3032 VPValue *PrevEVL = Builder.createScalarPhi(
3033 {MaxEVL, &EVL}, DebugLoc::getUnknown(), "prev.evl");
3034
3037 for (VPRecipeBase &R : *VPBB) {
3038 VPValue *V1, *V2;
3039 if (!match(&R,
3041 m_VPValue(V1), m_VPValue(V2))))
3042 continue;
3043 VPValue *Imm = Plan.getOrAddLiveIn(
3046 Intrinsic::experimental_vp_splice,
3047 {V1, V2, Imm, Plan.getTrue(), PrevEVL, &EVL},
3048 TypeInfo.inferScalarType(R.getVPSingleValue()), {}, {},
3049 R.getDebugLoc());
3050 VPSplice->insertBefore(&R);
3051 R.getVPSingleValue()->replaceAllUsesWith(VPSplice);
3052 ToErase.push_back(&R);
3053 }
3054 }
3055 }
3056
3057 VPValue *HeaderMask = findHeaderMask(Plan);
3058 if (!HeaderMask)
3059 return;
3060
3061 // Replace header masks with a mask equivalent to predicating by EVL:
3062 //
3063 // icmp ule widen-canonical-iv backedge-taken-count
3064 // ->
3065 // icmp ult step-vector, EVL
3066 VPRecipeBase *EVLR = EVL.getDefiningRecipe();
3067 VPBuilder Builder(EVLR->getParent(), std::next(EVLR->getIterator()));
3068 Type *EVLType = TypeInfo.inferScalarType(&EVL);
3069 VPValue *EVLMask = Builder.createICmp(
3071 Builder.createNaryOp(VPInstruction::StepVector, {}, EVLType), &EVL);
3072 HeaderMask->replaceAllUsesWith(EVLMask);
3073 ToErase.push_back(HeaderMask->getDefiningRecipe());
3074
3075 // Try to optimize header mask recipes away to their EVL variants.
3076 // TODO: Split optimizeMaskToEVL out and move into
3077 // VPlanTransforms::optimize. transformRecipestoEVLRecipes should be run in
3078 // tryToBuildVPlanWithVPRecipes beforehand.
3079 for (VPUser *U : collectUsersRecursively(EVLMask)) {
3080 auto *CurRecipe = cast<VPRecipeBase>(U);
3081 VPRecipeBase *EVLRecipe =
3082 optimizeMaskToEVL(EVLMask, *CurRecipe, TypeInfo, EVL);
3083 if (!EVLRecipe)
3084 continue;
3085
3086 unsigned NumDefVal = EVLRecipe->getNumDefinedValues();
3087 assert(NumDefVal == CurRecipe->getNumDefinedValues() &&
3088 "New recipe must define the same number of values as the "
3089 "original.");
3090 EVLRecipe->insertBefore(CurRecipe);
3092 EVLRecipe)) {
3093 for (unsigned I = 0; I < NumDefVal; ++I) {
3094 VPValue *CurVPV = CurRecipe->getVPValue(I);
3095 CurVPV->replaceAllUsesWith(EVLRecipe->getVPValue(I));
3096 }
3097 }
3098 ToErase.push_back(CurRecipe);
3099 }
3100 // Remove dead EVL mask.
3101 if (EVLMask->getNumUsers() == 0)
3102 ToErase.push_back(EVLMask->getDefiningRecipe());
3103
3104 for (VPRecipeBase *R : reverse(ToErase)) {
3105 SmallVector<VPValue *> PossiblyDead(R->operands());
3106 R->eraseFromParent();
3107 for (VPValue *Op : PossiblyDead)
3109 }
3110}
3111
3112/// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and
3113/// replace all uses of VPCanonicalIVPHIRecipe, except the canonical IV
3114/// increment, with a VPEVLBasedIVPHIRecipe. After this transformation, the
3115/// VPCanonicalIVPHIRecipe is only used for counting loop iterations.
3116///
3117/// The function uses the following definitions:
3118/// %StartV is the canonical induction start value.
3119///
3120/// The function adds the following recipes:
3121///
3122/// vector.ph:
3123/// ...
3124///
3125/// vector.body:
3126/// ...
3127/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
3128/// [ %NextEVLIV, %vector.body ]
3129/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3130/// %VPEVL = EXPLICIT-VECTOR-LENGTH %AVL
3131/// ...
3132/// %OpEVL = cast i32 %VPEVL to IVSize
3133/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
3134/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3135/// ...
3136///
3137/// If MaxSafeElements is provided, the function adds the following recipes:
3138/// vector.ph:
3139/// ...
3140///
3141/// vector.body:
3142/// ...
3143/// %EVLPhi = EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI [ %StartV, %vector.ph ],
3144/// [ %NextEVLIV, %vector.body ]
3145/// %AVL = phi [ trip-count, %vector.ph ], [ %NextAVL, %vector.body ]
3146/// %cmp = cmp ult %AVL, MaxSafeElements
3147/// %SAFE_AVL = select %cmp, %AVL, MaxSafeElements
3148/// %VPEVL = EXPLICIT-VECTOR-LENGTH %SAFE_AVL
3149/// ...
3150/// %OpEVL = cast i32 %VPEVL to IVSize
3151/// %NextEVLIV = add IVSize %OpEVL, %EVLPhi
3152/// %NextAVL = sub IVSize nuw %AVL, %OpEVL
3153/// ...
3154///
3156 VPlan &Plan, const std::optional<unsigned> &MaxSafeElements) {
3157 if (Plan.hasScalarVFOnly())
3158 return;
3159 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
3160 VPBasicBlock *Header = LoopRegion->getEntryBasicBlock();
3161
3162 auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
3163 auto *CanIVTy = LoopRegion->getCanonicalIVType();
3164 VPValue *StartV = CanonicalIVPHI->getStartValue();
3165
3166 // Create the ExplicitVectorLengthPhi recipe in the main loop.
3167 auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc::getUnknown());
3168 EVLPhi->insertAfter(CanonicalIVPHI);
3169 VPBuilder Builder(Header, Header->getFirstNonPhi());
3170 // Create the AVL (application vector length), starting from TC -> 0 in steps
3171 // of EVL.
3172 VPPhi *AVLPhi = Builder.createScalarPhi(
3173 {Plan.getTripCount()}, DebugLoc::getCompilerGenerated(), "avl");
3174 VPValue *AVL = AVLPhi;
3175
3176 if (MaxSafeElements) {
3177 // Cap the AVL at MaxSafeElements for correct loop emission.
3178 VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
3179 VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
3180 AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
3181 "safe_avl");
3182 }
3183 auto *VPEVL = Builder.createNaryOp(VPInstruction::ExplicitVectorLength, AVL,
3185
3186 auto *CanonicalIVIncrement =
3187 cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
3188 Builder.setInsertPoint(CanonicalIVIncrement);
3189 VPValue *OpVPEVL = VPEVL;
3190
3191 auto *I32Ty = Type::getInt32Ty(Plan.getContext());
3192 OpVPEVL = Builder.createScalarZExtOrTrunc(
3193 OpVPEVL, CanIVTy, I32Ty, CanonicalIVIncrement->getDebugLoc());
3194
3195 auto *NextEVLIV = Builder.createOverflowingOp(
3196 Instruction::Add, {OpVPEVL, EVLPhi},
3197 {CanonicalIVIncrement->hasNoUnsignedWrap(),
3198 CanonicalIVIncrement->hasNoSignedWrap()},
3199 CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
3200 EVLPhi->addOperand(NextEVLIV);
3201
3202 VPValue *NextAVL = Builder.createOverflowingOp(
3203 Instruction::Sub, {AVLPhi, OpVPEVL}, {/*hasNUW=*/true, /*hasNSW=*/false},
3204 DebugLoc::getCompilerGenerated(), "avl.next");
3205 AVLPhi->addOperand(NextAVL);
3206
3207 transformRecipestoEVLRecipes(Plan, *VPEVL);
3208
3209 // Replace all uses of VPCanonicalIVPHIRecipe by
3210 // VPEVLBasedIVPHIRecipe except for the canonical IV increment.
3211 CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
3212 CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
3213 // TODO: support unroll factor > 1.
3214 Plan.setUF(1);
3215}
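// Illustrative sketch (not part of the transform above): ignoring the actual
// vectorization, the recipes documented above model the following scalar loop
// structure, where MaxVL stands in for the target's maximum vector length and
// MaxSafeElements is the optional cap described in the comment.
static void evlStripmineSketch(uint64_t TripCount, uint64_t MaxVL,
                               uint64_t MaxSafeElements) {
  uint64_t AVL = TripCount; // %AVL phi
  uint64_t EVLIV = 0;       // %EVLPhi
  while (AVL > 0) {
    uint64_t SafeAVL = AVL < MaxSafeElements ? AVL : MaxSafeElements; // %SAFE_AVL
    uint64_t EVL = SafeAVL < MaxVL ? SafeAVL : MaxVL; // EXPLICIT-VECTOR-LENGTH
    // ... process EVL elements starting at index EVLIV ...
    EVLIV += EVL; // %NextEVLIV
    AVL -= EVL;   // %NextAVL
  }
  (void)EVLIV;
}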
3216
3217void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
3218  // Find EVL loop entries by locating VPEVLBasedIVPHIRecipe.
3219 // There should be only one EVL PHI in the entire plan.
3220 VPEVLBasedIVPHIRecipe *EVLPhi = nullptr;
3221
3224 for (VPRecipeBase &R : VPBB->phis())
3225 if (auto *PhiR = dyn_cast<VPEVLBasedIVPHIRecipe>(&R)) {
3226 assert(!EVLPhi && "Found multiple EVL PHIs. Only one expected");
3227 EVLPhi = PhiR;
3228 }
3229
3230 // Early return if no EVL PHI is found.
3231 if (!EVLPhi)
3232 return;
3233
3234 VPBasicBlock *HeaderVPBB = EVLPhi->getParent();
3235 VPValue *EVLIncrement = EVLPhi->getBackedgeValue();
3236 VPValue *AVL;
3237 [[maybe_unused]] bool FoundAVL =
3238 match(EVLIncrement,
3239 m_c_Add(m_ZExtOrSelf(m_EVL(m_VPValue(AVL))), m_Specific(EVLPhi)));
3240 assert(FoundAVL && "Didn't find AVL?");
3241
3242 // The AVL may be capped to a safe distance.
3243 VPValue *SafeAVL;
3244 if (match(AVL, m_Select(m_VPValue(), m_VPValue(SafeAVL), m_VPValue())))
3245 AVL = SafeAVL;
3246
3247 VPValue *AVLNext;
3248 [[maybe_unused]] bool FoundAVLNext =
3250 m_Specific(Plan.getTripCount()), m_VPValue(AVLNext)));
3251 assert(FoundAVLNext && "Didn't find AVL backedge?");
3252
3253 // Convert EVLPhi to concrete recipe.
3254 auto *ScalarR =
3255 VPBuilder(EVLPhi).createScalarPhi({EVLPhi->getStartValue(), EVLIncrement},
3256 EVLPhi->getDebugLoc(), "evl.based.iv");
3257 EVLPhi->replaceAllUsesWith(ScalarR);
3258 EVLPhi->eraseFromParent();
3259
3260 // Replace CanonicalIVInc with EVL-PHI increment.
3261 auto *CanonicalIV = cast<VPPhi>(&*HeaderVPBB->begin());
3262 VPValue *Backedge = CanonicalIV->getIncomingValue(1);
3263 assert(match(Backedge, m_c_Add(m_Specific(CanonicalIV),
3264 m_Specific(&Plan.getVFxUF()))) &&
3265 "Unexpected canonical iv");
3266 Backedge->replaceAllUsesWith(EVLIncrement);
3267
3268 // Remove unused phi and increment.
3269 VPRecipeBase *CanonicalIVIncrement = Backedge->getDefiningRecipe();
3270 CanonicalIVIncrement->eraseFromParent();
3271 CanonicalIV->eraseFromParent();
3272
3273 // Replace the use of VectorTripCount in the latch-exiting block.
3274 // Before: (branch-on-cond (icmp eq EVLIVInc, VectorTripCount))
3275  // After: (branch-on-cond (icmp eq AVLNext, 0))
3276 VPBasicBlock *LatchExiting =
3277 HeaderVPBB->getPredecessors()[1]->getEntryBasicBlock();
3278 auto *LatchExitingBr = cast<VPInstruction>(LatchExiting->getTerminator());
3279 if (match(LatchExitingBr, m_BranchOnCond(m_True())))
3280 return;
3281
3282 assert(match(LatchExitingBr, m_BranchOnCond(m_SpecificCmp(
3283 CmpInst::ICMP_EQ, m_VPValue(EVLIncrement),
3284 m_Specific(&Plan.getVectorTripCount())))) &&
3285 "Expected BranchOnCond with ICmp comparing EVL increment with vector "
3286 "trip count");
3287
3288 Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
3289 VPBuilder Builder(LatchExitingBr);
3290 LatchExitingBr->setOperand(0,
3291 Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
3292 Plan.getConstantInt(AVLTy, 0)));
3293}
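// Sketch of the equivalence used above (illustration only): because the AVL
// counts the remaining elements and is decremented by EVL every iteration,
// checking the remaining count against zero exits on the same iteration as
// comparing the EVL-based IV increment against the vector trip count.
static bool latchExitTakenSketch(uint64_t AVL, uint64_t EVL) {
  uint64_t AVLNext = AVL - EVL; // %avl.next (nuw)
  return AVLNext == 0;          // replaces: icmp eq EVLIVInc, VectorTripCount
}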
3294
3296 VPlan &Plan, PredicatedScalarEvolution &PSE,
3297 const DenseMap<Value *, const SCEV *> &StridesMap) {
3298  // Replace VPValues for known constant strides guaranteed by predicated
3299  // scalar evolution.
3300 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
3301 auto *R = cast<VPRecipeBase>(&U);
3302 return R->getRegion() ||
3303 R->getParent() == Plan.getVectorLoopRegion()->getSinglePredecessor();
3304 };
3305 ValueToSCEVMapTy RewriteMap;
3306 for (const SCEV *Stride : StridesMap.values()) {
3307 using namespace SCEVPatternMatch;
3308 auto *StrideV = cast<SCEVUnknown>(Stride)->getValue();
3309 const APInt *StrideConst;
3310 if (!match(PSE.getSCEV(StrideV), m_scev_APInt(StrideConst)))
3311 // Only handle constant strides for now.
3312 continue;
3313
3314 auto *CI = Plan.getConstantInt(*StrideConst);
3315 if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
3316 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3317
3318 // The versioned value may not be used in the loop directly but through a
3319 // sext/zext. Add new live-ins in those cases.
3320 for (Value *U : StrideV->users()) {
3322 continue;
3323 VPValue *StrideVPV = Plan.getLiveIn(U);
3324 if (!StrideVPV)
3325 continue;
3326 unsigned BW = U->getType()->getScalarSizeInBits();
3327 APInt C =
3328 isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
3329 VPValue *CI = Plan.getConstantInt(C);
3330 StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
3331 }
3332 RewriteMap[StrideV] = PSE.getSCEV(StrideV);
3333 }
3334
3335 for (VPRecipeBase &R : *Plan.getEntry()) {
3336 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
3337 if (!ExpSCEV)
3338 continue;
3339 const SCEV *ScevExpr = ExpSCEV->getSCEV();
3340 auto *NewSCEV =
3341 SCEVParameterRewriter::rewrite(ScevExpr, *PSE.getSE(), RewriteMap);
3342 if (NewSCEV != ScevExpr) {
3343 VPValue *NewExp = vputils::getOrCreateVPValueForSCEVExpr(Plan, NewSCEV);
3344 ExpSCEV->replaceAllUsesWith(NewExp);
3345 if (Plan.getTripCount() == ExpSCEV)
3346 Plan.resetTripCount(NewExp);
3347 }
3348 }
3349}
3350
3352 VPlan &Plan,
3353 const std::function<bool(BasicBlock *)> &BlockNeedsPredication) {
3354 // Collect recipes in the backward slice of `Root` that may generate a poison
3355 // value that is used after vectorization.
3357 auto CollectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
3359 Worklist.push_back(Root);
3360
3361 // Traverse the backward slice of Root through its use-def chain.
3362 while (!Worklist.empty()) {
3363 VPRecipeBase *CurRec = Worklist.pop_back_val();
3364
3365 if (!Visited.insert(CurRec).second)
3366 continue;
3367
3368 // Prune search if we find another recipe generating a widen memory
3369 // instruction. Widen memory instructions involved in address computation
3370 // will lead to gather/scatter instructions, which don't need to be
3371 // handled.
3373 VPHeaderPHIRecipe>(CurRec))
3374 continue;
3375
3376 // This recipe contributes to the address computation of a widen
3377 // load/store. If the underlying instruction has poison-generating flags,
3378 // drop them directly.
3379 if (auto *RecWithFlags = dyn_cast<VPRecipeWithIRFlags>(CurRec)) {
3380 VPValue *A, *B;
3381 // Dropping disjoint from an OR may yield incorrect results, as some
3382 // analysis may have converted it to an Add implicitly (e.g. SCEV used
3383 // for dependence analysis). Instead, replace it with an equivalent Add.
3384 // This is possible as all users of the disjoint OR only access lanes
3385 // where the operands are disjoint or poison otherwise.
3386 if (match(RecWithFlags, m_BinaryOr(m_VPValue(A), m_VPValue(B))) &&
3387 RecWithFlags->isDisjoint()) {
3388 VPBuilder Builder(RecWithFlags);
3389 VPInstruction *New = Builder.createOverflowingOp(
3390 Instruction::Add, {A, B}, {false, false},
3391 RecWithFlags->getDebugLoc());
3392 New->setUnderlyingValue(RecWithFlags->getUnderlyingValue());
3393 RecWithFlags->replaceAllUsesWith(New);
3394 RecWithFlags->eraseFromParent();
3395 CurRec = New;
3396 } else
3397 RecWithFlags->dropPoisonGeneratingFlags();
3398 } else {
3401 (void)Instr;
3402 assert((!Instr || !Instr->hasPoisonGeneratingFlags()) &&
3403 "found instruction with poison generating flags not covered by "
3404 "VPRecipeWithIRFlags");
3405 }
3406
3407 // Add new definitions to the worklist.
3408 for (VPValue *Operand : CurRec->operands())
3409 if (VPRecipeBase *OpDef = Operand->getDefiningRecipe())
3410 Worklist.push_back(OpDef);
3411 }
3412 });
3413
3414 // Traverse all the recipes in the VPlan and collect the poison-generating
3415 // recipes in the backward slice starting at the address of a VPWidenRecipe or
3416 // VPInterleaveRecipe.
3417 auto Iter = vp_depth_first_deep(Plan.getEntry());
3419 for (VPRecipeBase &Recipe : *VPBB) {
3420 if (auto *WidenRec = dyn_cast<VPWidenMemoryRecipe>(&Recipe)) {
3421 Instruction &UnderlyingInstr = WidenRec->getIngredient();
3422 VPRecipeBase *AddrDef = WidenRec->getAddr()->getDefiningRecipe();
3423 if (AddrDef && WidenRec->isConsecutive() &&
3424 BlockNeedsPredication(UnderlyingInstr.getParent()))
3425 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3426 } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
3427 VPRecipeBase *AddrDef = InterleaveRec->getAddr()->getDefiningRecipe();
3428 if (AddrDef) {
3429 // Check if any member of the interleave group needs predication.
3430 const InterleaveGroup<Instruction> *InterGroup =
3431 InterleaveRec->getInterleaveGroup();
3432 bool NeedPredication = false;
3433 for (int I = 0, NumMembers = InterGroup->getNumMembers();
3434 I < NumMembers; ++I) {
3435 Instruction *Member = InterGroup->getMember(I);
3436 if (Member)
3437 NeedPredication |= BlockNeedsPredication(Member->getParent());
3438 }
3439
3440 if (NeedPredication)
3441 CollectPoisonGeneratingInstrsInBackwardSlice(AddrDef);
3442 }
3443 }
3444 }
3445 }
3446}
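// Small sketch of why the disjoint OR above is replaced by an ADD instead of
// merely dropping the flag: when the operands share no set bits, OR and ADD
// produce the same value, which is exactly what analyses relying on the
// disjoint flag may already have assumed.
static uint64_t disjointOrAsAddSketch(uint64_t A, uint64_t B) {
  assert((A & B) == 0 && "operands assumed disjoint");
  return A + B; // identical to A | B for disjoint operands
}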
3447
3449 VPlan &Plan,
3451 &InterleaveGroups,
3452 VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed) {
3453 if (InterleaveGroups.empty())
3454 return;
3455
3456 // Interleave memory: for each Interleave Group we marked earlier as relevant
3457 // for this VPlan, replace the Recipes widening its memory instructions with a
3458 // single VPInterleaveRecipe at its insertion point.
3459 VPDominatorTree VPDT(Plan);
3460 for (const auto *IG : InterleaveGroups) {
3461 auto *Start =
3462 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
3463 VPIRMetadata InterleaveMD(*Start);
3464 SmallVector<VPValue *, 4> StoredValues;
3465 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(Start))
3466 StoredValues.push_back(StoreR->getStoredValue());
3467 for (unsigned I = 1; I < IG->getFactor(); ++I) {
3468 Instruction *MemberI = IG->getMember(I);
3469 if (!MemberI)
3470 continue;
3471 VPWidenMemoryRecipe *MemoryR =
3472 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(MemberI));
3473 if (auto *StoreR = dyn_cast<VPWidenStoreRecipe>(MemoryR))
3474 StoredValues.push_back(StoreR->getStoredValue());
3475 InterleaveMD.intersect(*MemoryR);
3476 }
3477
3478 bool NeedsMaskForGaps =
3479 (IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed) ||
3480 (!StoredValues.empty() && !IG->isFull());
3481
3482 Instruction *IRInsertPos = IG->getInsertPos();
3483 auto *InsertPos =
3484 cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));
3485
3487 if (auto *Gep = dyn_cast<GetElementPtrInst>(
3488 getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
3489 NW = Gep->getNoWrapFlags().withoutNoUnsignedWrap();
3490
3491 // Get or create the start address for the interleave group.
3492 VPValue *Addr = Start->getAddr();
3493 VPRecipeBase *AddrDef = Addr->getDefiningRecipe();
3494 if (AddrDef && !VPDT.properlyDominates(AddrDef, InsertPos)) {
3495 // We cannot re-use the address of member zero because it does not
3496 // dominate the insert position. Instead, use the address of the insert
3497 // position and create a PtrAdd adjusting it to the address of member
3498 // zero.
3499 // TODO: Hoist Addr's defining recipe (and any operands as needed) to
3500 // InsertPos or sink loads above zero members to join it.
3501 assert(IG->getIndex(IRInsertPos) != 0 &&
3502 "index of insert position shouldn't be zero");
3503 auto &DL = IRInsertPos->getDataLayout();
3504 APInt Offset(32,
3505 DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
3506 IG->getIndex(IRInsertPos),
3507 /*IsSigned=*/true);
3508 VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
3509 VPBuilder B(InsertPos);
3510 Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
3511 }
3512 // If the group is reverse, adjust the index to refer to the last vector
3513 // lane instead of the first. We adjust the index from the first vector
3514 // lane, rather than directly getting the pointer for lane VF - 1, because
3515 // the pointer operand of the interleaved access is supposed to be uniform.
3516 if (IG->isReverse()) {
3517 auto *ReversePtr = new VPVectorEndPointerRecipe(
3518 Addr, &Plan.getVF(), getLoadStoreType(IRInsertPos),
3519 -(int64_t)IG->getFactor(), NW, InsertPos->getDebugLoc());
3520 ReversePtr->insertBefore(InsertPos);
3521 Addr = ReversePtr;
3522 }
3523 auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
3524 InsertPos->getMask(), NeedsMaskForGaps,
3525 InterleaveMD, InsertPos->getDebugLoc());
3526 VPIG->insertBefore(InsertPos);
3527
3528 unsigned J = 0;
3529 for (unsigned i = 0; i < IG->getFactor(); ++i)
3530 if (Instruction *Member = IG->getMember(i)) {
3531 VPRecipeBase *MemberR = RecipeBuilder.getRecipe(Member);
3532 if (!Member->getType()->isVoidTy()) {
3533 VPValue *OriginalV = MemberR->getVPSingleValue();
3534 OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
3535 J++;
3536 }
3537 MemberR->eraseFromParent();
3538 }
3539 }
3540}
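// Sketch of the address adjustment above (illustrative; MemberIndex and
// AllocSize are assumed inputs): when the insert position is group member
// MemberIndex with an element size of AllocSize bytes, the group's base
// address is recovered by stepping back from the insert position's address by
// its byte offset within the group.
static int64_t offsetToMemberZeroSketch(uint64_t MemberIndex,
                                        uint64_t AllocSize) {
  return -static_cast<int64_t>(MemberIndex * AllocSize); // value of OffsetVPV
}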
3541
3542/// Expand a VPWidenIntOrFpInduction into executable recipes, for the initial
3543/// value, phi and backedge value. In the following example:
3544///
3545/// vector.ph:
3546/// Successor(s): vector loop
3547///
3548/// <x1> vector loop: {
3549/// vector.body:
3550/// WIDEN-INDUCTION %i = phi %start, %step, %vf
3551/// ...
3552/// EMIT branch-on-count ...
3553/// No successors
3554/// }
3555///
3556/// WIDEN-INDUCTION will get expanded to:
3557///
3558/// vector.ph:
3559/// ...
3560/// vp<%induction.start> = ...
3561/// vp<%induction.increment> = ...
3562///
3563/// Successor(s): vector loop
3564///
3565/// <x1> vector loop: {
3566/// vector.body:
3567/// ir<%i> = WIDEN-PHI vp<%induction.start>, vp<%vec.ind.next>
3568/// ...
3569/// vp<%vec.ind.next> = add ir<%i>, vp<%induction.increment>
3570/// EMIT branch-on-count ...
3571/// No successors
3572/// }
3573static void
3575 VPTypeAnalysis &TypeInfo) {
3576 VPlan *Plan = WidenIVR->getParent()->getPlan();
3577 VPValue *Start = WidenIVR->getStartValue();
3578 VPValue *Step = WidenIVR->getStepValue();
3579 VPValue *VF = WidenIVR->getVFValue();
3580 DebugLoc DL = WidenIVR->getDebugLoc();
3581
3582 // The value from the original loop to which we are mapping the new induction
3583 // variable.
3584 Type *Ty = TypeInfo.inferScalarType(WidenIVR);
3585
3586 const InductionDescriptor &ID = WidenIVR->getInductionDescriptor();
3589 VPIRFlags Flags = *WidenIVR;
3590 if (ID.getKind() == InductionDescriptor::IK_IntInduction) {
3591 AddOp = Instruction::Add;
3592 MulOp = Instruction::Mul;
3593 } else {
3594 AddOp = ID.getInductionOpcode();
3595 MulOp = Instruction::FMul;
3596 }
3597
3598 // If the phi is truncated, truncate the start and step values.
3599 VPBuilder Builder(Plan->getVectorPreheader());
3600 Type *StepTy = TypeInfo.inferScalarType(Step);
3601 if (Ty->getScalarSizeInBits() < StepTy->getScalarSizeInBits()) {
3602 assert(StepTy->isIntegerTy() && "Truncation requires an integer type");
3603 Step = Builder.createScalarCast(Instruction::Trunc, Step, Ty, DL);
3604 Start = Builder.createScalarCast(Instruction::Trunc, Start, Ty, DL);
3605 // Truncation doesn't preserve WrapFlags.
3606 Flags.dropPoisonGeneratingFlags();
3607 StepTy = Ty;
3608 }
3609
3610 // Construct the initial value of the vector IV in the vector loop preheader.
3611 Type *IVIntTy =
3613 VPValue *Init = Builder.createNaryOp(VPInstruction::StepVector, {}, IVIntTy);
3614 if (StepTy->isFloatingPointTy())
3615 Init = Builder.createWidenCast(Instruction::UIToFP, Init, StepTy);
3616
3617 VPValue *SplatStart = Builder.createNaryOp(VPInstruction::Broadcast, Start);
3618 VPValue *SplatStep = Builder.createNaryOp(VPInstruction::Broadcast, Step);
3619
3620 Init = Builder.createNaryOp(MulOp, {Init, SplatStep}, Flags);
3621 Init = Builder.createNaryOp(AddOp, {SplatStart, Init}, Flags,
3622 DebugLoc::getUnknown(), "induction");
3623
3624 // Create the widened phi of the vector IV.
3625 auto *WidePHI = new VPWidenPHIRecipe(WidenIVR->getPHINode(), Init,
3626 WidenIVR->getDebugLoc(), "vec.ind");
3627 WidePHI->insertBefore(WidenIVR);
3628
3629 // Create the backedge value for the vector IV.
3630 VPValue *Inc;
3631 VPValue *Prev;
3632 // If unrolled, use the increment and prev value from the operands.
3633 if (auto *SplatVF = WidenIVR->getSplatVFValue()) {
3634 Inc = SplatVF;
3635 Prev = WidenIVR->getLastUnrolledPartOperand();
3636 } else {
3637 if (VPRecipeBase *R = VF->getDefiningRecipe())
3638 Builder.setInsertPoint(R->getParent(), std::next(R->getIterator()));
3639 // Multiply the vectorization factor by the step using integer or
3640 // floating-point arithmetic as appropriate.
3641 if (StepTy->isFloatingPointTy())
3642 VF = Builder.createScalarCast(Instruction::CastOps::UIToFP, VF, StepTy,
3643 DL);
3644 else
3645 VF = Builder.createScalarZExtOrTrunc(VF, StepTy,
3646 TypeInfo.inferScalarType(VF), DL);
3647
3648 Inc = Builder.createNaryOp(MulOp, {Step, VF}, Flags);
3649 Inc = Builder.createNaryOp(VPInstruction::Broadcast, Inc);
3650 Prev = WidePHI;
3651 }
3652
3654 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3655 auto *Next = Builder.createNaryOp(AddOp, {Prev, Inc}, Flags,
3656 WidenIVR->getDebugLoc(), "vec.ind.next");
3657
3658 WidePHI->addOperand(Next);
3659
3660 WidenIVR->replaceAllUsesWith(WidePHI);
3661}
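// Scalar sketch of the expansion above for an integer induction (illustration
// only): in iteration I of the vector loop (counting from zero), lane L of the
// widened IV holds Start + (I * VF + L) * Step; the recipes maintain this with
// a vector phi plus a loop-invariant increment of VF * Step.
static int64_t widenedIVLaneSketch(int64_t Start, int64_t Step, unsigned VF,
                                   unsigned Iteration, unsigned Lane) {
  return Start + (int64_t)(Iteration * VF + Lane) * Step;
}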
3662
3663/// Expand a VPWidenPointerInductionRecipe into executable recipes, for the
3664/// initial value, phi and backedge value. In the following example:
3665///
3666/// <x1> vector loop: {
3667/// vector.body:
3668/// EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION %start, %step, %vf
3669/// ...
3670/// EMIT branch-on-count ...
3671/// }
3672///
3673/// WIDEN-POINTER-INDUCTION will get expanded to:
3674///
3675/// <x1> vector loop: {
3676/// vector.body:
3677/// EMIT-SCALAR %pointer.phi = phi %start, %ptr.ind
3678/// EMIT %mul = mul %stepvector, %step
3679/// EMIT %vector.gep = wide-ptradd %pointer.phi, %mul
3680/// ...
3681/// EMIT %ptr.ind = ptradd %pointer.phi, %vf
3682/// EMIT branch-on-count ...
3683/// }
3685 VPTypeAnalysis &TypeInfo) {
3686 VPlan *Plan = R->getParent()->getPlan();
3687 VPValue *Start = R->getStartValue();
3688 VPValue *Step = R->getStepValue();
3689 VPValue *VF = R->getVFValue();
3690
3691 assert(R->getInductionDescriptor().getKind() ==
3693 "Not a pointer induction according to InductionDescriptor!");
3694 assert(TypeInfo.inferScalarType(R)->isPointerTy() && "Unexpected type.");
3695 assert(!R->onlyScalarsGenerated(Plan->hasScalableVF()) &&
3696 "Recipe should have been replaced");
3697
3698 VPBuilder Builder(R);
3699 DebugLoc DL = R->getDebugLoc();
3700
3701 // Build a scalar pointer phi.
3702 VPPhi *ScalarPtrPhi = Builder.createScalarPhi(Start, DL, "pointer.phi");
3703
3704 // Create actual address geps that use the pointer phi as base and a
3705 // vectorized version of the step value (<step*0, ..., step*N>) as offset.
3706 Builder.setInsertPoint(R->getParent(), R->getParent()->getFirstNonPhi());
3707 Type *StepTy = TypeInfo.inferScalarType(Step);
3708 VPValue *Offset = Builder.createNaryOp(VPInstruction::StepVector, {}, StepTy);
3709 Offset = Builder.createOverflowingOp(Instruction::Mul, {Offset, Step});
3710 VPValue *PtrAdd = Builder.createNaryOp(
3711 VPInstruction::WidePtrAdd, {ScalarPtrPhi, Offset}, DL, "vector.gep");
3712 R->replaceAllUsesWith(PtrAdd);
3713
3714 // Create the backedge value for the scalar pointer phi.
3716 Builder.setInsertPoint(ExitingBB, ExitingBB->getTerminator()->getIterator());
3717 VF = Builder.createScalarZExtOrTrunc(VF, StepTy, TypeInfo.inferScalarType(VF),
3718 DL);
3719 VPValue *Inc = Builder.createOverflowingOp(Instruction::Mul, {Step, VF});
3720
3721 VPValue *InductionGEP =
3722 Builder.createPtrAdd(ScalarPtrPhi, Inc, DL, "ptr.ind");
3723 ScalarPtrPhi->addOperand(InductionGEP);
3724}
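// Scalar sketch of the pointer-induction expansion above (illustration only;
// assumes Step is the pointer step in bytes): in iteration I, the scalar
// pointer phi holds Start + I * VF * Step, and lane L of the wide ptradd
// addresses Start + (I * VF + L) * Step.
static const char *widenedPtrIVLaneSketch(const char *Start, int64_t Step,
                                          unsigned VF, unsigned Iteration,
                                          unsigned Lane) {
  const char *PointerPhi = Start + (int64_t)Iteration * VF * Step; // %pointer.phi
  return PointerPhi + (int64_t)Lane * Step;                        // %vector.gep, lane L
}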
3725
3727  // Replace loop regions with explicit CFG.
3728 SmallVector<VPRegionBlock *> LoopRegions;
3730 vp_depth_first_deep(Plan.getEntry()))) {
3731 if (!R->isReplicator())
3732 LoopRegions.push_back(R);
3733 }
3734 for (VPRegionBlock *R : LoopRegions)
3735 R->dissolveToCFGLoop();
3736}
3737
3740 // The transform runs after dissolving loop regions, so all VPBasicBlocks
3741 // terminated with BranchOnTwoConds are reached via a shallow traversal.
3744 if (!VPBB->empty() && match(&VPBB->back(), m_BranchOnTwoConds()))
3745 WorkList.push_back(cast<VPInstruction>(&VPBB->back()));
3746 }
3747
3748 // Expand BranchOnTwoConds instructions into explicit CFG with
3749 // single-condition branches, by introducing a new branch in VPBB that jumps
3750 // to a new intermediate block if either condition is true and to the
3751 // third successor otherwise. The intermediate block jumps to the first or
3752 // second successor, depending on the first condition.
3753 for (VPInstruction *Br : WorkList) {
3754 assert(Br->getNumOperands() == 2 &&
3755 "BranchOnTwoConds must have exactly 2 conditions");
3756 DebugLoc DL = Br->getDebugLoc();
3757 VPBasicBlock *Latch = Br->getParent();
3758 const auto Successors = to_vector(Latch->getSuccessors());
3759 assert(Successors.size() == 3 &&
3760 "BranchOnTwoConds must have exactly 3 successors");
3761
3762 for (VPBlockBase *Succ : Successors)
3763 VPBlockUtils::disconnectBlocks(Latch, Succ);
3764
3765 VPValue *EarlyExitingCond = Br->getOperand(0);
3766 VPValue *LateExitingCond = Br->getOperand(1);
3767 VPBlockBase *EarlyExitBB = Successors[0];
3768 VPBlockBase *LateExitBB = Successors[1];
3769 VPBlockBase *Header = Successors[2];
3770
3771 VPBasicBlock *MiddleSplit = Plan.createVPBasicBlock("middle.split");
3772 MiddleSplit->setParent(LateExitBB->getParent());
3773
3774 VPBuilder Builder(Latch);
3775 VPValue *AnyExitTaken = Builder.createNaryOp(
3776 Instruction::Or, {EarlyExitingCond, LateExitingCond}, DL);
3777 Builder.createNaryOp(VPInstruction::BranchOnCond, {AnyExitTaken}, DL);
3778 VPBlockUtils::connectBlocks(Latch, MiddleSplit);
3779 VPBlockUtils::connectBlocks(Latch, Header);
3780
3781 VPBuilder(MiddleSplit)
3782 .createNaryOp(VPInstruction::BranchOnCond, {EarlyExitingCond}, DL);
3783 VPBlockUtils::connectBlocks(MiddleSplit, EarlyExitBB);
3784 VPBlockUtils::connectBlocks(MiddleSplit, LateExitBB);
3785
3786 Br->eraseFromParent();
3787 }
3788}
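// Control-flow sketch of the expansion above: the three-way terminator on
// (EarlyExitingCond, LateExitingCond) becomes two single-condition branches.
// The return value is the index of the successor taken: [0] early exit,
// [1] late exit, [2] header.
static unsigned branchOnTwoCondsSketch(bool EarlyExitingCond,
                                       bool LateExitingCond) {
  if (EarlyExitingCond || LateExitingCond) // latch: branch to middle.split
    return EarlyExitingCond ? 0 : 1;       // middle.split: pick the exit block
  return 2;                                // otherwise continue to the header
}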
3789
3791 VPTypeAnalysis TypeInfo(Plan);
3794 vp_depth_first_deep(Plan.getEntry()))) {
3795 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
3796 if (auto *WidenIVR = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
3797 expandVPWidenIntOrFpInduction(WidenIVR, TypeInfo);
3798 ToRemove.push_back(WidenIVR);
3799 continue;
3800 }
3801
3802 if (auto *WidenIVR = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
3803 // If the recipe only generates scalars, scalarize it instead of
3804 // expanding it.
3805 if (WidenIVR->onlyScalarsGenerated(Plan.hasScalableVF())) {
3806 VPBuilder Builder(WidenIVR);
3807 VPValue *PtrAdd =
3808 scalarizeVPWidenPointerInduction(WidenIVR, Plan, Builder);
3809 WidenIVR->replaceAllUsesWith(PtrAdd);
3810 ToRemove.push_back(WidenIVR);
3811 continue;
3812 }
3813 expandVPWidenPointerInduction(WidenIVR, TypeInfo);
3814 ToRemove.push_back(WidenIVR);
3815 continue;
3816 }
3817
3818 // Expand VPBlendRecipe into VPInstruction::Select.
3819 VPBuilder Builder(&R);
3820 if (auto *Blend = dyn_cast<VPBlendRecipe>(&R)) {
3821 VPValue *Select = Blend->getIncomingValue(0);
3822 for (unsigned I = 1; I != Blend->getNumIncomingValues(); ++I)
3823 Select = Builder.createSelect(Blend->getMask(I),
3824 Blend->getIncomingValue(I), Select,
3825 R.getDebugLoc(), "predphi");
3826 Blend->replaceAllUsesWith(Select);
3827 ToRemove.push_back(Blend);
3828 }
3829
3830 if (auto *Expr = dyn_cast<VPExpressionRecipe>(&R)) {
3831 Expr->decompose();
3832 ToRemove.push_back(Expr);
3833 }
3834
3835 // Expand LastActiveLane into Not + FirstActiveLane + Sub.
3836 auto *LastActiveL = dyn_cast<VPInstruction>(&R);
3837 if (LastActiveL &&
3838 LastActiveL->getOpcode() == VPInstruction::LastActiveLane) {
3839 // Create Not(Mask) for all operands.
3841 for (VPValue *Op : LastActiveL->operands()) {
3842 VPValue *NotMask = Builder.createNot(Op, LastActiveL->getDebugLoc());
3843 NotMasks.push_back(NotMask);
3844 }
3845
3846 // Create FirstActiveLane on the inverted masks.
3847 VPValue *FirstInactiveLane = Builder.createNaryOp(
3849 LastActiveL->getDebugLoc(), "first.inactive.lane");
3850
3851 // Subtract 1 to get the last active lane.
3852 VPValue *One = Plan.getOrAddLiveIn(
3853 ConstantInt::get(Type::getInt64Ty(Plan.getContext()), 1));
3854 VPValue *LastLane = Builder.createNaryOp(
3855 Instruction::Sub, {FirstInactiveLane, One},
3856 LastActiveL->getDebugLoc(), "last.active.lane");
3857
3858 LastActiveL->replaceAllUsesWith(LastLane);
3859 ToRemove.push_back(LastActiveL);
3860 continue;
3861 }
3862
3863 // Lower BranchOnCount to ICmp + BranchOnCond.
3864 VPValue *IV, *TC;
3865 if (match(&R, m_BranchOnCount(m_VPValue(IV), m_VPValue(TC)))) {
3866 auto *BranchOnCountInst = cast<VPInstruction>(&R);
3867 DebugLoc DL = BranchOnCountInst->getDebugLoc();
3868 VPValue *Cond = Builder.createICmp(CmpInst::ICMP_EQ, IV, TC, DL);
3869 Builder.createNaryOp(VPInstruction::BranchOnCond, Cond, DL);
3870 ToRemove.push_back(BranchOnCountInst);
3871 continue;
3872 }
3873
3874 VPValue *VectorStep;
3875 VPValue *ScalarStep;
3877 m_VPValue(VectorStep), m_VPValue(ScalarStep))))
3878 continue;
3879
3880 // Expand WideIVStep.
3881 auto *VPI = cast<VPInstruction>(&R);
3882 Type *IVTy = TypeInfo.inferScalarType(VPI);
3883 if (TypeInfo.inferScalarType(VectorStep) != IVTy) {
3885 ? Instruction::UIToFP
3886 : Instruction::Trunc;
3887 VectorStep = Builder.createWidenCast(CastOp, VectorStep, IVTy);
3888 }
3889
3890 assert(!match(ScalarStep, m_One()) && "Expected non-unit scalar-step");
3891 if (TypeInfo.inferScalarType(ScalarStep) != IVTy) {
3892 ScalarStep =
3893 Builder.createWidenCast(Instruction::Trunc, ScalarStep, IVTy);
3894 }
3895
3896 VPIRFlags Flags;
3897 if (IVTy->isFloatingPointTy())
3898 Flags = {VPI->getFastMathFlags()};
3899
3900 unsigned MulOpc =
3901 IVTy->isFloatingPointTy() ? Instruction::FMul : Instruction::Mul;
3902 VPInstruction *Mul = Builder.createNaryOp(
3903 MulOpc, {VectorStep, ScalarStep}, Flags, R.getDebugLoc());
3904 VectorStep = Mul;
3905 VPI->replaceAllUsesWith(VectorStep);
3906 ToRemove.push_back(VPI);
3907 }
3908 }
3909
3910 for (VPRecipeBase *R : ToRemove)
3911 R->eraseFromParent();
3912}
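// Sketch of the LastActiveLane lowering above, shown for a single boolean mask
// (assumes the mask is an all-active prefix with at least one active lane, as
// produced by tail folding).
static unsigned lastActiveLaneSketch(const bool *Mask, unsigned NumLanes) {
  unsigned FirstInactive = 0; // FirstActiveLane(Not(Mask))
  while (FirstInactive < NumLanes && Mask[FirstInactive])
    ++FirstInactive;
  return FirstInactive - 1; // subtract 1 to get the last active lane
}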
3913
3915 VPBasicBlock *EarlyExitVPBB,
3916 VPlan &Plan,
3917 VPBasicBlock *HeaderVPBB,
3918 VPBasicBlock *LatchVPBB) {
3919 auto *MiddleVPBB = cast<VPBasicBlock>(LatchVPBB->getSuccessors()[0]);
3920 if (!EarlyExitVPBB->getSinglePredecessor() &&
3921 EarlyExitVPBB->getPredecessors()[1] == MiddleVPBB) {
3922 assert(EarlyExitVPBB->getNumPredecessors() == 2 &&
3923 EarlyExitVPBB->getPredecessors()[0] == EarlyExitingVPBB &&
3924 "unsupported early exit VPBB");
3925 // Early exit operand should always be last phi operand. If EarlyExitVPBB
3926 // has two predecessors and EarlyExitingVPBB is the first, swap the operands
3927 // of the phis.
3928 for (VPRecipeBase &R : EarlyExitVPBB->phis())
3929 cast<VPIRPhi>(&R)->swapOperands();
3930 }
3931
3932 VPBuilder Builder(LatchVPBB->getTerminator());
3933 VPBlockBase *TrueSucc = EarlyExitingVPBB->getSuccessors()[0];
3934 assert(match(EarlyExitingVPBB->getTerminator(), m_BranchOnCond()) &&
3935         "Terminator must be BranchOnCond");
3936 VPValue *CondOfEarlyExitingVPBB =
3937 EarlyExitingVPBB->getTerminator()->getOperand(0);
3938 auto *CondToEarlyExit = TrueSucc == EarlyExitVPBB
3939 ? CondOfEarlyExitingVPBB
3940 : Builder.createNot(CondOfEarlyExitingVPBB);
3941
3942 // Create a BranchOnTwoConds in the latch that branches to:
3943 // [0] vector.early.exit, [1] middle block, [2] header (continue looping).
3944 VPValue *IsEarlyExitTaken =
3945 Builder.createNaryOp(VPInstruction::AnyOf, {CondToEarlyExit});
3946 VPBasicBlock *VectorEarlyExitVPBB =
3947 Plan.createVPBasicBlock("vector.early.exit");
3948 VectorEarlyExitVPBB->setParent(EarlyExitVPBB->getParent());
3949
3950 VPBlockUtils::connectBlocks(VectorEarlyExitVPBB, EarlyExitVPBB);
3951
3952 // Update the exit phis in the early exit block.
3953 VPBuilder MiddleBuilder(MiddleVPBB);
3954 VPBuilder EarlyExitB(VectorEarlyExitVPBB);
3955 for (VPRecipeBase &R : EarlyExitVPBB->phis()) {
3956 auto *ExitIRI = cast<VPIRPhi>(&R);
3957 // Early exit operand should always be last, i.e., 0 if EarlyExitVPBB has
3958 // a single predecessor and 1 if it has two.
3959 unsigned EarlyExitIdx = ExitIRI->getNumOperands() - 1;
3960 if (ExitIRI->getNumOperands() != 1) {
3961 // The first of two operands corresponds to the latch exit, via MiddleVPBB
3962 // predecessor. Extract its final lane.
3963 ExitIRI->extractLastLaneOfLastPartOfFirstOperand(MiddleBuilder);
3964 }
3965
3966 VPValue *IncomingFromEarlyExit = ExitIRI->getOperand(EarlyExitIdx);
3967 if (!IncomingFromEarlyExit->isLiveIn()) {
3968 // Update the incoming value from the early exit.
3969 VPValue *FirstActiveLane = EarlyExitB.createNaryOp(
3970 VPInstruction::FirstActiveLane, {CondToEarlyExit},
3971 DebugLoc::getUnknown(), "first.active.lane");
3972 IncomingFromEarlyExit = EarlyExitB.createNaryOp(
3973 VPInstruction::ExtractLane, {FirstActiveLane, IncomingFromEarlyExit},
3974 DebugLoc::getUnknown(), "early.exit.value");
3975 ExitIRI->setOperand(EarlyExitIdx, IncomingFromEarlyExit);
3976 }
3977 }
3978
3979 // Replace the conditional branch controlling the latch exit from the vector
3980 // loop with a multi-conditional branch exiting to vector early exit if the
3981 // early exit has been taken, exiting to middle block if the original
3982 // condition of the vector latch is true, otherwise continuing back to header.
3983 auto *LatchExitingBranch = cast<VPInstruction>(LatchVPBB->getTerminator());
3984 assert(LatchExitingBranch->getOpcode() == VPInstruction::BranchOnCount &&
3985 "Unexpected terminator");
3986 auto *IsLatchExitTaken =
3987 Builder.createICmp(CmpInst::ICMP_EQ, LatchExitingBranch->getOperand(0),
3988 LatchExitingBranch->getOperand(1));
3989
3990 DebugLoc LatchDL = LatchExitingBranch->getDebugLoc();
3991 LatchExitingBranch->eraseFromParent();
3992
3993 Builder.setInsertPoint(LatchVPBB);
3994 Builder.createNaryOp(VPInstruction::BranchOnTwoConds,
3995 {IsEarlyExitTaken, IsLatchExitTaken}, LatchDL);
3996 LatchVPBB->clearSuccessors();
3997 LatchVPBB->setSuccessors({VectorEarlyExitVPBB, MiddleVPBB, HeaderVPBB});
3998 VectorEarlyExitVPBB->setPredecessors({LatchVPBB});
3999}
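// Sketch of the exit-value selection wired up above (illustration only): the
// value leaving through the early exit comes from the first lane whose
// early-exit condition is true.
static int earlyExitValueSketch(const bool *CondToEarlyExit, const int *Lanes,
                                unsigned VF) {
  for (unsigned L = 0; L < VF; ++L)
    if (CondToEarlyExit[L]) // FirstActiveLane
      return Lanes[L];      // ExtractLane
  return Lanes[VF - 1];     // not reached when the early exit was taken
}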
4000
4001/// This function tries to convert extended in-loop reductions into a
4002/// VPExpressionRecipe and clamps the \p Range if doing so is beneficial and
4003/// valid. The created recipe must be decomposed to its constituent
4004/// recipes before execution.
4005static VPExpressionRecipe *
4007 VFRange &Range) {
4008 Type *RedTy = Ctx.Types.inferScalarType(Red);
4009 VPValue *VecOp = Red->getVecOp();
4010
4011 // Clamp the range if using extended-reduction is profitable.
4012 auto IsExtendedRedValidAndClampRange =
4013 [&](unsigned Opcode, Instruction::CastOps ExtOpc, Type *SrcTy) -> bool {
4015 [&](ElementCount VF) {
4016 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4018
4019 InstructionCost ExtRedCost;
4020 InstructionCost ExtCost =
4021 cast<VPWidenCastRecipe>(VecOp)->computeCost(VF, Ctx);
4022 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4023
4024 if (Red->isPartialReduction()) {
4027 // FIXME: Move partial reduction creation, costing and clamping
4028 // here from LoopVectorize.cpp.
4029 ExtRedCost = Ctx.TTI.getPartialReductionCost(
4030 Opcode, SrcTy, nullptr, RedTy, VF, ExtKind,
4031 llvm::TargetTransformInfo::PR_None, std::nullopt, Ctx.CostKind);
4032 } else {
4033 ExtRedCost = Ctx.TTI.getExtendedReductionCost(
4034 Opcode, ExtOpc == Instruction::CastOps::ZExt, RedTy, SrcVecTy,
4035 Red->getFastMathFlags(), CostKind);
4036 }
4037 return ExtRedCost.isValid() && ExtRedCost < ExtCost + RedCost;
4038 },
4039 Range);
4040 };
4041
4042 VPValue *A;
4043 // Match reduce(ext)).
4044 if (match(VecOp, m_ZExtOrSExt(m_VPValue(A))) &&
4045 IsExtendedRedValidAndClampRange(
4046 RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind()),
4047 cast<VPWidenCastRecipe>(VecOp)->getOpcode(),
4048 Ctx.Types.inferScalarType(A)))
4049 return new VPExpressionRecipe(cast<VPWidenCastRecipe>(VecOp), Red);
4050
4051 return nullptr;
4052}
4053
4054/// This function tries to convert in-loop multiply-accumulate reductions into
4055/// a VPExpressionRecipe and clamps the \p Range if doing so is beneficial
4056/// and valid. The created VPExpressionRecipe must be decomposed to its
4057/// constituent recipes before execution. Patterns of the
4058/// VPExpressionRecipe:
4059/// reduce.add(mul(...)),
4060/// reduce.add(mul(ext(A), ext(B))),
4061/// reduce.add(ext(mul(ext(A), ext(B)))).
4062static VPExpressionRecipe *
4064 VPCostContext &Ctx, VFRange &Range) {
4065 unsigned Opcode = RecurrenceDescriptor::getOpcode(Red->getRecurrenceKind());
4066 if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
4067 return nullptr;
4068
4069 Type *RedTy = Ctx.Types.inferScalarType(Red);
4070
4071 // Clamp the range if using multiply-accumulate-reduction is profitable.
4072 auto IsMulAccValidAndClampRange =
4074 VPWidenCastRecipe *OuterExt) -> bool {
4076 [&](ElementCount VF) {
4078 Type *SrcTy =
4079 Ext0 ? Ctx.Types.inferScalarType(Ext0->getOperand(0)) : RedTy;
4080 InstructionCost MulAccCost;
4081
4082 if (Red->isPartialReduction()) {
4083 Type *SrcTy2 =
4084 Ext1 ? Ctx.Types.inferScalarType(Ext1->getOperand(0)) : nullptr;
4085 // FIXME: Move partial reduction creation, costing and clamping
4086 // here from LoopVectorize.cpp.
4087 MulAccCost = Ctx.TTI.getPartialReductionCost(
4088 Opcode, SrcTy, SrcTy2, RedTy, VF,
4090 Ext0->getOpcode())
4093 Ext1->getOpcode())
4095 Mul->getOpcode(), CostKind);
4096 } else {
4097 // Only partial reductions support mixed extends at the moment.
4098 if (Ext0 && Ext1 && Ext0->getOpcode() != Ext1->getOpcode())
4099 return false;
4100
4101 bool IsZExt =
4102 !Ext0 || Ext0->getOpcode() == Instruction::CastOps::ZExt;
4103 auto *SrcVecTy = cast<VectorType>(toVectorTy(SrcTy, VF));
4104 MulAccCost = Ctx.TTI.getMulAccReductionCost(IsZExt, Opcode, RedTy,
4105 SrcVecTy, CostKind);
4106 }
4107
4108 InstructionCost MulCost = Mul->computeCost(VF, Ctx);
4109 InstructionCost RedCost = Red->computeCost(VF, Ctx);
4110 InstructionCost ExtCost = 0;
4111 if (Ext0)
4112 ExtCost += Ext0->computeCost(VF, Ctx);
4113 if (Ext1)
4114 ExtCost += Ext1->computeCost(VF, Ctx);
4115 if (OuterExt)
4116 ExtCost += OuterExt->computeCost(VF, Ctx);
4117
4118 return MulAccCost.isValid() &&
4119 MulAccCost < ExtCost + MulCost + RedCost;
4120 },
4121 Range);
4122 };
4123
4124 VPValue *VecOp = Red->getVecOp();
4125 VPRecipeBase *Sub = nullptr;
4126 VPValue *A, *B;
4127 VPValue *Tmp = nullptr;
4128 // Sub reductions could have a sub between the add reduction and vec op.
4129 if (match(VecOp, m_Sub(m_ZeroInt(), m_VPValue(Tmp)))) {
4130 Sub = VecOp->getDefiningRecipe();
4131 VecOp = Tmp;
4132 }
4133
4134 // If ValB is a constant and can be safely extended, truncate it to the same
4135 // type as ExtA's operand, then extend it to the same type as ExtA. This
4136 // creates two uniform extends that can more easily be matched by the rest of
4137 // the bundling code. The ExtB reference, ValB and operand 1 of Mul are all
4138 // replaced with the new extend of the constant.
4139 auto ExtendAndReplaceConstantOp = [&Ctx](VPWidenCastRecipe *ExtA,
4140 VPWidenCastRecipe *&ExtB,
4141 VPValue *&ValB, VPWidenRecipe *Mul) {
4142 if (!ExtA || ExtB || !ValB->isLiveIn())
4143 return;
4144 Type *NarrowTy = Ctx.Types.inferScalarType(ExtA->getOperand(0));
4145 Instruction::CastOps ExtOpc = ExtA->getOpcode();
4146 const APInt *Const;
4147 if (!match(ValB, m_APInt(Const)) ||
4149 Const, NarrowTy, TTI::getPartialReductionExtendKind(ExtOpc)))
4150 return;
4151 // The truncate ensures that the type of each extended operand is the
4152 // same, and it's been proven that the constant can be extended from
4153 // NarrowTy safely. Necessary since ExtA's extended operand would be
4154 // e.g. an i8, while the const will likely be an i32. This will be
4155 // elided by later optimisations.
4156 VPBuilder Builder(Mul);
4157 auto *Trunc =
4158 Builder.createWidenCast(Instruction::CastOps::Trunc, ValB, NarrowTy);
4159 Type *WideTy = Ctx.Types.inferScalarType(ExtA);
4160 ValB = ExtB = Builder.createWidenCast(ExtOpc, Trunc, WideTy);
4161 Mul->setOperand(1, ExtB);
4162 };
4163
4164 // Try to match reduce.add(mul(...)).
4165 if (match(VecOp, m_Mul(m_VPValue(A), m_VPValue(B)))) {
4168 auto *Mul = cast<VPWidenRecipe>(VecOp);
4169
4170 // Convert reduce.add(mul(ext, const)) to reduce.add(mul(ext, ext(const)))
4171 ExtendAndReplaceConstantOp(RecipeA, RecipeB, B, Mul);
4172
4173 // Match reduce.add/sub(mul(ext, ext)).
4174 if (RecipeA && RecipeB && match(RecipeA, m_ZExtOrSExt(m_VPValue())) &&
4175 match(RecipeB, m_ZExtOrSExt(m_VPValue())) &&
4176 IsMulAccValidAndClampRange(Mul, RecipeA, RecipeB, nullptr)) {
4177 if (Sub)
4178 return new VPExpressionRecipe(RecipeA, RecipeB, Mul,
4179 cast<VPWidenRecipe>(Sub), Red);
4180 return new VPExpressionRecipe(RecipeA, RecipeB, Mul, Red);
4181 }
4182 // TODO: Add an expression type for this variant with a negated mul
4183 if (!Sub && IsMulAccValidAndClampRange(Mul, nullptr, nullptr, nullptr))
4184 return new VPExpressionRecipe(Mul, Red);
4185 }
4186 // TODO: Add an expression type for negated versions of other expression
4187 // variants.
4188 if (Sub)
4189 return nullptr;
4190
4191 // Match reduce.add(ext(mul(A, B))).
4192 if (match(VecOp, m_ZExtOrSExt(m_Mul(m_VPValue(A), m_VPValue(B))))) {
4193 auto *Ext = cast<VPWidenCastRecipe>(VecOp);
4194 auto *Mul = cast<VPWidenRecipe>(Ext->getOperand(0));
4197
4198 // reduce.add(ext(mul(ext, const)))
4199 // -> reduce.add(ext(mul(ext, ext(const))))
4200 ExtendAndReplaceConstantOp(Ext0, Ext1, B, Mul);
4201
4202 // reduce.add(ext(mul(ext(A), ext(B))))
4203 // -> reduce.add(mul(wider_ext(A), wider_ext(B)))
4204 // The inner extends must either have the same opcode as the outer extend or
4205 // be the same, in which case the multiply can never result in a negative
4206 // value and the outer extend can be folded away by doing wider
4207 // extends for the operands of the mul.
4208 if (Ext0 && Ext1 &&
4209 (Ext->getOpcode() == Ext0->getOpcode() || Ext0 == Ext1) &&
4210 Ext0->getOpcode() == Ext1->getOpcode() &&
4211 IsMulAccValidAndClampRange(Mul, Ext0, Ext1, Ext) && Mul->hasOneUse()) {
4212 auto *NewExt0 = new VPWidenCastRecipe(
4213 Ext0->getOpcode(), Ext0->getOperand(0), Ext->getResultType(), nullptr,
4214 *Ext0, *Ext0, Ext0->getDebugLoc());
4215 NewExt0->insertBefore(Ext0);
4216
4217 VPWidenCastRecipe *NewExt1 = NewExt0;
4218 if (Ext0 != Ext1) {
4219 NewExt1 = new VPWidenCastRecipe(Ext1->getOpcode(), Ext1->getOperand(0),
4220 Ext->getResultType(), nullptr, *Ext1,
4221 *Ext1, Ext1->getDebugLoc());
4222 NewExt1->insertBefore(Ext1);
4223 }
4224 Mul->setOperand(0, NewExt0);
4225 Mul->setOperand(1, NewExt1);
4226 Red->setOperand(1, Mul);
4227 return new VPExpressionRecipe(NewExt0, NewExt1, Mul, Red);
4228 }
4229 }
4230 return nullptr;
4231}
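// Scalar sketch of the outer-extend folding above, using sign extension of i8
// operands as an example: the i16 product of two sign-extended i8 values never
// overflows i16, so sign-extending that product to i64 yields the same result
// as multiplying the operands after extending them directly to i64.
static int64_t widerExtendMulSketch(int8_t A, int8_t B) {
  int64_t Folded = (int64_t)A * (int64_t)B;            // mul(sext_i64(A), sext_i64(B))
  int16_t Narrow = (int16_t)((int16_t)A * (int16_t)B); // mul(sext_i16(A), sext_i16(B))
  assert(Folded == (int64_t)Narrow && "outer sext folds into wider extends");
  return Folded;
}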
4232
4233/// This function tries to create abstract recipes from the reduction recipe
4234/// for subsequent optimizations and cost estimation.
4236 VPCostContext &Ctx,
4237 VFRange &Range) {
4238 VPExpressionRecipe *AbstractR = nullptr;
4239 auto IP = std::next(Red->getIterator());
4240 auto *VPBB = Red->getParent();
4241 if (auto *MulAcc = tryToMatchAndCreateMulAccumulateReduction(Red, Ctx, Range))
4242 AbstractR = MulAcc;
4243 else if (auto *ExtRed = tryToMatchAndCreateExtendedReduction(Red, Ctx, Range))
4244 AbstractR = ExtRed;
4245  // Cannot create abstract in-loop reduction recipes.
4246 if (!AbstractR)
4247 return;
4248
4249 AbstractR->insertBefore(*VPBB, IP);
4250 Red->replaceAllUsesWith(AbstractR);
4251}
4252
4263
4265 if (Plan.hasScalarVFOnly())
4266 return;
4267
4268#ifndef NDEBUG
4269 VPDominatorTree VPDT(Plan);
4270#endif
4271
4272 SmallVector<VPValue *> VPValues;
4275 append_range(VPValues, Plan.getLiveIns());
4276 for (VPRecipeBase &R : *Plan.getEntry())
4277 append_range(VPValues, R.definedValues());
4278
4279 auto *VectorPreheader = Plan.getVectorPreheader();
4280 for (VPValue *VPV : VPValues) {
4282 (VPV->isLiveIn() && VPV->getLiveInIRValue() &&
4283 isa<Constant>(VPV->getLiveInIRValue())))
4284 continue;
4285
4286 // Add explicit broadcast at the insert point that dominates all users.
4287 VPBasicBlock *HoistBlock = VectorPreheader;
4288 VPBasicBlock::iterator HoistPoint = VectorPreheader->end();
4289 for (VPUser *User : VPV->users()) {
4290 if (User->usesScalars(VPV))
4291 continue;
4292 if (cast<VPRecipeBase>(User)->getParent() == VectorPreheader)
4293 HoistPoint = HoistBlock->begin();
4294 else
4295 assert(VPDT.dominates(VectorPreheader,
4296 cast<VPRecipeBase>(User)->getParent()) &&
4297 "All users must be in the vector preheader or dominated by it");
4298 }
4299
4300 VPBuilder Builder(cast<VPBasicBlock>(HoistBlock), HoistPoint);
4301 auto *Broadcast = Builder.createNaryOp(VPInstruction::Broadcast, {VPV});
4302 VPV->replaceUsesWithIf(Broadcast,
4303 [VPV, Broadcast](VPUser &U, unsigned Idx) {
4304 return Broadcast != &U && !U.usesScalars(VPV);
4305 });
4306 }
4307}
4308
4310 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4311
4312 // Collect candidate loads with invariant addresses and noalias scopes
4313 // metadata and memory-writing recipes with noalias metadata.
4317 vp_depth_first_shallow(LoopRegion->getEntry()))) {
4318 for (VPRecipeBase &R : *VPBB) {
4319 // Only handle single-scalar replicated loads with invariant addresses.
4320 if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
4321 if (RepR->isPredicated() || !RepR->isSingleScalar() ||
4322 RepR->getOpcode() != Instruction::Load)
4323 continue;
4324
4325 VPValue *Addr = RepR->getOperand(0);
4326 if (Addr->isDefinedOutsideLoopRegions()) {
4328 if (!Loc.AATags.Scope)
4329 continue;
4330 CandidateLoads.push_back({RepR, Loc});
4331 }
4332 }
4333 if (R.mayWriteToMemory()) {
4335 if (!Loc || !Loc->AATags.Scope || !Loc->AATags.NoAlias)
4336 return;
4337 Stores.push_back(*Loc);
4338 }
4339 }
4340 }
4341
4342 VPBasicBlock *Preheader = Plan.getVectorPreheader();
4343 for (auto &[LoadRecipe, LoadLoc] : CandidateLoads) {
4344 // Hoist the load to the preheader if it doesn't alias with any stores
4345 // according to the noalias metadata. Other loads should have been hoisted
4346    // by other passes.
4347 const AAMDNodes &LoadAA = LoadLoc.AATags;
4348 if (all_of(Stores, [&](const MemoryLocation &StoreLoc) {
4350 LoadAA.Scope, StoreLoc.AATags.NoAlias);
4351 })) {
4352 LoadRecipe->moveBefore(*Preheader, Preheader->getFirstNonPhi());
4353 }
4354 }
4355}
4356
4357// Collect common metadata from a group of replicate recipes by intersecting
4358// metadata from all recipes in the group.
4360 VPIRMetadata CommonMetadata = *Recipes.front();
4361 for (VPReplicateRecipe *Recipe : drop_begin(Recipes))
4362 CommonMetadata.intersect(*Recipe);
4363 return CommonMetadata;
4364}
4365
4366template <unsigned Opcode>
4370 const Loop *L) {
4371 static_assert(Opcode == Instruction::Load || Opcode == Instruction::Store,
4372 "Only Load and Store opcodes supported");
4373 constexpr bool IsLoad = (Opcode == Instruction::Load);
4374 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4375 VPTypeAnalysis TypeInfo(Plan);
4376
4377 // Group predicated operations by their address SCEV.
4379 for (VPBlockBase *Block : vp_depth_first_shallow(LoopRegion->getEntry())) {
4380 auto *VPBB = cast<VPBasicBlock>(Block);
4381 for (VPRecipeBase &R : *VPBB) {
4382 auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
4383 if (!RepR || RepR->getOpcode() != Opcode || !RepR->isPredicated())
4384 continue;
4385
4386 // For loads, operand 0 is address; for stores, operand 1 is address.
4387 VPValue *Addr = RepR->getOperand(IsLoad ? 0 : 1);
4388 const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, PSE, L);
4389 if (!isa<SCEVCouldNotCompute>(AddrSCEV))
4390 RecipesByAddress[AddrSCEV].push_back(RepR);
4391 }
4392 }
4393
4394 // For each address, collect operations with the same or complementary masks.
4396 auto GetLoadStoreValueType = [&](VPReplicateRecipe *Recipe) {
4397 return TypeInfo.inferScalarType(IsLoad ? Recipe : Recipe->getOperand(0));
4398 };
4399 for (auto &[Addr, Recipes] : RecipesByAddress) {
4400 if (Recipes.size() < 2)
4401 continue;
4402
4403 // Collect groups with the same or complementary masks.
4404 for (VPReplicateRecipe *&RecipeI : Recipes) {
4405 if (!RecipeI)
4406 continue;
4407
4408 VPValue *MaskI = RecipeI->getMask();
4409 Type *TypeI = GetLoadStoreValueType(RecipeI);
4411 Group.push_back(RecipeI);
4412 RecipeI = nullptr;
4413
4414 // Find all operations with the same or complementary masks.
4415 bool HasComplementaryMask = false;
4416 for (VPReplicateRecipe *&RecipeJ : Recipes) {
4417 if (!RecipeJ)
4418 continue;
4419
4420 VPValue *MaskJ = RecipeJ->getMask();
4421 Type *TypeJ = GetLoadStoreValueType(RecipeJ);
4422 if (TypeI == TypeJ) {
4423 // Check if any operation in the group has a complementary mask with
4424 // another, that is M1 == NOT(M2) or M2 == NOT(M1).
4425 HasComplementaryMask |= match(MaskI, m_Not(m_Specific(MaskJ))) ||
4426 match(MaskJ, m_Not(m_Specific(MaskI)));
4427 Group.push_back(RecipeJ);
4428 RecipeJ = nullptr;
4429 }
4430 }
4431
4432 if (HasComplementaryMask) {
4433 assert(Group.size() >= 2 && "must have at least 2 entries");
4434 AllGroups.push_back(std::move(Group));
4435 }
4436 }
4437 }
4438
4439 return AllGroups;
4440}
4441
4442// Find the recipe with minimum alignment in the group.
4443template <typename InstType>
4444static VPReplicateRecipe *
4446 return *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4447 return cast<InstType>(A->getUnderlyingInstr())->getAlign() <
4448 cast<InstType>(B->getUnderlyingInstr())->getAlign();
4449 });
4450}
4451
4454 const Loop *L) {
4455 auto Groups =
4457 if (Groups.empty())
4458 return;
4459
4460 VPDominatorTree VPDT(Plan);
4461
4462 // Process each group of loads.
4463 for (auto &Group : Groups) {
4464 // Sort loads by dominance order, with earliest (most dominating) first.
4465 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4466 return VPDT.properlyDominates(A, B);
4467 });
4468
4469 // Try to use the earliest (most dominating) load to replace all others.
4470 VPReplicateRecipe *EarliestLoad = Group[0];
4471 VPBasicBlock *FirstBB = EarliestLoad->getParent();
4472 VPBasicBlock *LastBB = Group.back()->getParent();
4473
4474 // Check that the load doesn't alias with stores between first and last.
4475 auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad);
4476 if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB))
4477 continue;
4478
4479 // Collect common metadata from all loads in the group.
4480 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4481
4482 // Find the load with minimum alignment to use.
4483 auto *LoadWithMinAlign = findRecipeWithMinAlign<LoadInst>(Group);
4484
4485 // Create an unpredicated version of the earliest load with common
4486 // metadata.
4487 auto *UnpredicatedLoad = new VPReplicateRecipe(
4488 LoadWithMinAlign->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
4489 /*IsSingleScalar=*/false, /*Mask=*/nullptr, *EarliestLoad,
4490 CommonMetadata);
4491
4492 UnpredicatedLoad->insertBefore(EarliestLoad);
4493
4494 // Replace all loads in the group with the unpredicated load.
4495 for (VPReplicateRecipe *Load : Group) {
4496 Load->replaceAllUsesWith(UnpredicatedLoad);
4497 Load->eraseFromParent();
4498 }
4499 }
4500}
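// Scalar sketch of the merge above: under complementary masks exactly one of
// the predicated loads executes per element, so a single unconditional load
// from the shared address is equivalent, given the no-alias check against
// stores between the first and last load.
static int mergeComplementaryLoadsSketch(bool Mask, const int *Addr) {
  // Before: V0 = Mask ? *Addr : poison;  V1 = !Mask ? *Addr : poison;
  // After: one unconditional load replaces both V0 and V1.
  (void)Mask;
  return *Addr;
}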
4501
4502static bool
4504 PredicatedScalarEvolution &PSE, const Loop &L,
4505 VPTypeAnalysis &TypeInfo) {
4506 auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front());
4507 if (!StoreLoc || !StoreLoc->AATags.Scope)
4508 return false;
4509
4510 // When sinking a group of stores, all members of the group alias each other.
4511 // Skip them during the alias checks.
4512 SmallPtrSet<VPRecipeBase *, 4> StoresToSinkSet(StoresToSink.begin(),
4513 StoresToSink.end());
4514
4515 VPBasicBlock *FirstBB = StoresToSink.front()->getParent();
4516 VPBasicBlock *LastBB = StoresToSink.back()->getParent();
4517 SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], PSE, L, TypeInfo);
4518 return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo);
4519}
4520
4523 const Loop *L) {
4524 auto Groups =
4526 if (Groups.empty())
4527 return;
4528
4529 VPDominatorTree VPDT(Plan);
4530 VPTypeAnalysis TypeInfo(Plan);
4531
4532 for (auto &Group : Groups) {
4533 sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
4534 return VPDT.properlyDominates(A, B);
4535 });
4536
4537 if (!canSinkStoreWithNoAliasCheck(Group, PSE, *L, TypeInfo))
4538 continue;
4539
4540 // Use the last (most dominated) store's location for the unconditional
4541 // store.
4542 VPReplicateRecipe *LastStore = Group.back();
4543 VPBasicBlock *InsertBB = LastStore->getParent();
4544
4545 // Collect common alias metadata from all stores in the group.
4546 VPIRMetadata CommonMetadata = getCommonMetadata(Group);
4547
4548 // Build select chain for stored values.
4549 VPValue *SelectedValue = Group[0]->getOperand(0);
4550 VPBuilder Builder(InsertBB, LastStore->getIterator());
4551
4552 for (unsigned I = 1; I < Group.size(); ++I) {
4553 VPValue *Mask = Group[I]->getMask();
4554 VPValue *Value = Group[I]->getOperand(0);
4555 SelectedValue = Builder.createSelect(Mask, Value, SelectedValue,
4556 Group[I]->getDebugLoc());
4557 }
4558
4559 // Find the store with minimum alignment to use.
4560 auto *StoreWithMinAlign = findRecipeWithMinAlign<StoreInst>(Group);
4561
4562 // Create unconditional store with selected value and common metadata.
4563 auto *UnpredicatedStore =
4564 new VPReplicateRecipe(StoreWithMinAlign->getUnderlyingInstr(),
4565 {SelectedValue, LastStore->getOperand(1)},
4566 /*IsSingleScalar=*/false,
4567 /*Mask=*/nullptr, *LastStore, CommonMetadata);
4568 UnpredicatedStore->insertBefore(*InsertBB, LastStore->getIterator());
4569
4570 // Remove all predicated stores from the group.
4571 for (VPReplicateRecipe *Store : Group)
4572 Store->eraseFromParent();
4573 }
4574}
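// Scalar sketch of the select chain built above, shown for two stores guarded
// by complementary masks: exactly one of them executes per element, so storing
// a selected value unconditionally at the last store's position is equivalent.
static void sinkComplementaryStoresSketch(bool Mask0, int V0, int V1,
                                          int *Addr) {
  // Before: if (Mask0) *Addr = V0;  if (!Mask0) *Addr = V1;
  int Selected = Mask0 ? V0 : V1; // select(not(Mask0), V1, V0)
  *Addr = Selected;               // single unconditional store
}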
4575
4577 VPlan &Plan, ElementCount BestVF, unsigned BestUF,
4579 assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
4580 assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
4581
4582 VPValue *TC = Plan.getTripCount();
4583 // Skip cases for which the trip count may be non-trivial to materialize.
4584  // I.e., when there is no scalar tail (due to tail folding), or when a scalar
4585 // tail is required.
4586 if (!Plan.hasScalarTail() ||
4588 Plan.getScalarPreheader() ||
4589 !TC->isLiveIn())
4590 return;
4591
4592 // Materialize vector trip counts for constants early if it can simply
4593 // be computed as (Original TC / VF * UF) * VF * UF.
4594 // TODO: Compute vector trip counts for loops requiring a scalar epilogue and
4595 // tail-folded loops.
4596 ScalarEvolution &SE = *PSE.getSE();
4597 auto *TCScev = SE.getSCEV(TC->getLiveInIRValue());
4598 if (!isa<SCEVConstant>(TCScev))
4599 return;
4600 const SCEV *VFxUF = SE.getElementCount(TCScev->getType(), BestVF * BestUF);
4601 auto VecTCScev = SE.getMulExpr(SE.getUDivExpr(TCScev, VFxUF), VFxUF);
4602 if (auto *ConstVecTC = dyn_cast<SCEVConstant>(VecTCScev))
4603 Plan.getVectorTripCount().setUnderlyingValue(ConstVecTC->getValue());
4604}
4605
4607 VPBasicBlock *VectorPH) {
4609 if (BTC->getNumUsers() == 0)
4610 return;
4611
4612 VPBuilder Builder(VectorPH, VectorPH->begin());
4613 auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4614 auto *TCMO = Builder.createNaryOp(
4615 Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)},
4616 DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
4617 BTC->replaceAllUsesWith(TCMO);
4618}
4619
4621 if (Plan.hasScalarVFOnly())
4622 return;
4623
4624 VPTypeAnalysis TypeInfo(Plan);
4625 VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
4626 auto VPBBsOutsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4628 auto VPBBsInsideLoopRegion = VPBlockUtils::blocksOnly<VPBasicBlock>(
4629 vp_depth_first_shallow(LoopRegion->getEntry()));
4630 // Materialize Build(Struct)Vector for all replicating VPReplicateRecipes and
4631 // VPInstructions, excluding ones in replicate regions. Those are not
4632  // materialized explicitly yet; their vector users are still handled in
4633 // VPReplicateRegion::execute(), via shouldPack().
4634 // TODO: materialize build vectors for replicating recipes in replicating
4635 // regions.
4636 for (VPBasicBlock *VPBB :
4637 concat<VPBasicBlock *>(VPBBsOutsideLoopRegion, VPBBsInsideLoopRegion)) {
4638 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4640 continue;
4641 auto *DefR = cast<VPRecipeWithIRFlags>(&R);
4642 auto UsesVectorOrInsideReplicateRegion = [DefR, LoopRegion](VPUser *U) {
4643 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4644 return !U->usesScalars(DefR) || ParentRegion != LoopRegion;
4645 };
4646 if ((isa<VPReplicateRecipe>(DefR) &&
4647 cast<VPReplicateRecipe>(DefR)->isSingleScalar()) ||
4648 (isa<VPInstruction>(DefR) &&
4650 !cast<VPInstruction>(DefR)->doesGeneratePerAllLanes())) ||
4651 none_of(DefR->users(), UsesVectorOrInsideReplicateRegion))
4652 continue;
4653
4654 Type *ScalarTy = TypeInfo.inferScalarType(DefR);
4655 unsigned Opcode = ScalarTy->isStructTy()
4658 auto *BuildVector = new VPInstruction(Opcode, {DefR});
4659 BuildVector->insertAfter(DefR);
4660
4661 DefR->replaceUsesWithIf(
4662 BuildVector, [BuildVector, &UsesVectorOrInsideReplicateRegion](
4663 VPUser &U, unsigned) {
4664 return &U != BuildVector && UsesVectorOrInsideReplicateRegion(&U);
4665 });
4666 }
4667 }
4668
4669 // Create explicit VPInstructions to convert vectors to scalars. The current
4670 // implementation is conservative - it may miss some cases that may or may not
4671 // be vector values. TODO: introduce Unpacks speculatively - remove them later
4672 // if they are known to operate on scalar values.
4673 for (VPBasicBlock *VPBB : VPBBsInsideLoopRegion) {
4674 for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
4677 continue;
4678 for (VPValue *Def : R.definedValues()) {
4679 // Skip recipes that are single-scalar or only have their first lane
4680 // used.
4681 // TODO: The Defs skipped here may or may not be vector values.
4682 // Introduce Unpacks, and remove them later, if they are guaranteed to
4683 // produce scalar values.
4685 continue;
4686
4687 // At the moment, we create unpacks only for scalar users outside
4688 // replicate regions. Recipes inside replicate regions still extract the
4689 // required lanes implicitly.
4690 // TODO: Remove once replicate regions are unrolled completely.
4691 auto IsCandidateUnpackUser = [Def](VPUser *U) {
4692 VPRegionBlock *ParentRegion = cast<VPRecipeBase>(U)->getRegion();
4693 return U->usesScalars(Def) &&
4694 (!ParentRegion || !ParentRegion->isReplicator());
4695 };
4696 if (none_of(Def->users(), IsCandidateUnpackUser))
4697 continue;
4698
4699 auto *Unpack = new VPInstruction(VPInstruction::Unpack, {Def});
4700 if (R.isPhi())
4701 Unpack->insertBefore(*VPBB, VPBB->getFirstNonPhi());
4702 else
4703 Unpack->insertAfter(&R);
4704 Def->replaceUsesWithIf(Unpack,
4705 [&IsCandidateUnpackUser](VPUser &U, unsigned) {
4706 return IsCandidateUnpackUser(&U);
4707 });
4708 }
4709 }
4710 }
4711}
4712
4714 VPBasicBlock *VectorPHVPBB,
4715 bool TailByMasking,
4716 bool RequiresScalarEpilogue) {
4717 VPValue &VectorTC = Plan.getVectorTripCount();
4718 assert(VectorTC.isLiveIn() && "vector-trip-count must be a live-in");
4719 // There's nothing to do if there are no users of the vector trip count or its
4720 // IR value has already been set.
4721 if (VectorTC.getNumUsers() == 0 || VectorTC.getLiveInIRValue())
4722 return;
4723
4724 VPValue *TC = Plan.getTripCount();
4725 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(TC);
4726 VPBuilder Builder(VectorPHVPBB, VectorPHVPBB->begin());
4727 VPValue *Step = &Plan.getVFxUF();
4728
4729 // If the tail is to be folded by masking, round the number of iterations N
4730 // up to a multiple of Step instead of rounding down. This is done by first
4731 // adding Step-1 and then rounding down. Note that it's ok if this addition
4732 // overflows: the vector induction variable will eventually wrap to zero given
4733 // that it starts at zero and its Step is a power of two; the loop will then
4734 // exit, with the last early-exit vector comparison also producing all-true.
4735 // For scalable vectors the VF is not guaranteed to be a power of 2, but this
4736 // is accounted for in emitIterationCountCheck that adds an overflow check.
4737 if (TailByMasking) {
4738 TC = Builder.createNaryOp(
4739 Instruction::Add,
4740 {TC, Builder.createNaryOp(Instruction::Sub,
4741 {Step, Plan.getConstantInt(TCTy, 1)})},
4742 DebugLoc::getCompilerGenerated(), "n.rnd.up");
4743 }
4744
4745 // Now we need to generate the expression for the part of the loop that the
4746 // vectorized body will execute. This is equal to N - (N % Step) if scalar
4747 // iterations are not required for correctness, or N - Step, otherwise. Step
4748 // is equal to the vectorization factor (number of SIMD elements) times the
4749 // unroll factor (number of SIMD instructions).
4750 VPValue *R =
4751 Builder.createNaryOp(Instruction::URem, {TC, Step},
4752 DebugLoc::getCompilerGenerated(), "n.mod.vf");
4753
4754 // There are cases where we *must* run at least one iteration in the remainder
4755 // loop. See the cost model for when this can happen. If the step evenly
4756 // divides the trip count, we set the remainder to be equal to the step. If
4757 // the step does not evenly divide the trip count, no adjustment is necessary
4758 // since there will already be scalar iterations. Note that the minimum
4759 // iterations check ensures that N >= Step.
4760 if (RequiresScalarEpilogue) {
4761 assert(!TailByMasking &&
4762 "requiring scalar epilogue is not supported with fail folding");
4763 VPValue *IsZero =
4764 Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
4765 R = Builder.createSelect(IsZero, Step, R);
4766 }
4767
4768 VPValue *Res = Builder.createNaryOp(
4769 Instruction::Sub, {TC, R}, DebugLoc::getCompilerGenerated(), "n.vec");
4770 VectorTC.replaceAllUsesWith(Res);
4771}
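// Example (illustrative): for trip count %N with VF=4 and UF=2 (Step=8) and
// neither tail folding nor a required scalar epilogue, the code above emits in
// the vector preheader roughly:
//   %n.mod.vf = urem i64 %N, 8
//   %n.vec    = sub i64 %N, %n.mod.vf
// With tail folding, %n.rnd.up = add i64 %N, 7 is computed first and the
// urem/sub operate on it; with a required scalar epilogue, a select forces the
// remainder to Step when %n.mod.vf is 0.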
4772
4774 ElementCount VFEC) {
4775 VPBuilder Builder(VectorPH, VectorPH->begin());
4776 Type *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
4777 VPValue &VF = Plan.getVF();
4778 VPValue &VFxUF = Plan.getVFxUF();
4779 // Note that after the transform, Plan.getVF and Plan.getVFxUF should not be
4780 // used.
4781 // TODO: Assert that they aren't used.
4782
4783 // If there are no users of the runtime VF, compute VFxUF by constant folding
4784 // the multiplication of VF and UF.
4785 if (VF.getNumUsers() == 0) {
4786 VPValue *RuntimeVFxUF =
4787 Builder.createElementCount(TCTy, VFEC * Plan.getUF());
4788 VFxUF.replaceAllUsesWith(RuntimeVFxUF);
4789 return;
4790 }
4791
4792 // For users of the runtime VF, compute it as VF * vscale, and VFxUF as (VF *
4793 // vscale) * UF.
4794 VPValue *RuntimeVF = Builder.createElementCount(TCTy, VFEC);
4795 if (any_of(VF.users(), [&VF](VPUser *U) { return !U->usesScalars(&VF); })) {
4796 VPValue *BC = Builder.createNaryOp(VPInstruction::Broadcast, RuntimeVF);
4797 VF.replaceUsesWithIf(
4798 BC, [&VF](VPUser &U, unsigned) { return !U.usesScalars(&VF); });
4799 }
4800 VF.replaceAllUsesWith(RuntimeVF);
4801
4802 VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF());
4803 VPValue *MulByUF = Builder.createOverflowingOp(
4804 Instruction::Mul, {RuntimeVF, UF}, {true, false});
4805 VFxUF.replaceAllUsesWith(MulByUF);
4806}
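// Example (illustrative): for VF = vscale x 4 and UF = 2, if the runtime VF
// has users the code above materializes in the vector preheader roughly:
//   %vf    = mul i64 vscale, 4      ; runtime VF
//   %vfxuf = mul nuw i64 %vf, 2     ; VF * UF
// plus a Broadcast of %vf for users that need it as a vector. If VF has no
// users, only the constant-folded element count for VF * UF (vscale x 8) is
// materialized and used for VFxUF.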
4807
4810 SCEVExpander Expander(SE, "induction", /*PreserveLCSSA=*/false);
4811
4812 auto *Entry = cast<VPIRBasicBlock>(Plan.getEntry());
4813 BasicBlock *EntryBB = Entry->getIRBasicBlock();
4814 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
4815 for (VPRecipeBase &R : make_early_inc_range(*Entry)) {
4817 continue;
4818 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(&R);
4819 if (!ExpSCEV)
4820 break;
4821 const SCEV *Expr = ExpSCEV->getSCEV();
4822 Value *Res =
4823 Expander.expandCodeFor(Expr, Expr->getType(), EntryBB->getTerminator());
4824 ExpandedSCEVs[ExpSCEV->getSCEV()] = Res;
4825 VPValue *Exp = Plan.getOrAddLiveIn(Res);
4826 ExpSCEV->replaceAllUsesWith(Exp);
4827 if (Plan.getTripCount() == ExpSCEV)
4828 Plan.resetTripCount(Exp);
4829 ExpSCEV->eraseFromParent();
4830 }
4832 "VPExpandSCEVRecipes must be at the beginning of the entry block, "
4833 "after any VPIRInstructions");
4834 // Add IR instructions in the entry basic block but not in the VPIRBasicBlock
4835 // to the VPIRBasicBlock.
4836 auto EI = Entry->begin();
4837 for (Instruction &I : drop_end(*EntryBB)) {
4838 if (EI != Entry->end() && isa<VPIRInstruction>(*EI) &&
4839 &cast<VPIRInstruction>(&*EI)->getInstruction() == &I) {
4840 EI++;
4841 continue;
4842 }
4844 }
4845
4846 return ExpandedSCEVs;
4847}
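// Example (illustrative): a VPExpandSCEVRecipe for the SCEV (1 + %n) is
// expanded by SCEVExpander into IR before the entry block's terminator, e.g.
//   %0 = add i64 %n, 1
// its uses are rewired to the new live-in %0 and the recipe is erased. The
// returned SCEV-to-Value map lets callers look up the expanded values later.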
4848
4849/// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
4850/// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
4851/// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
4852/// the same interleave group at index 0. A VPWidenLoadRecipe can be narrowed to
4853/// an index-independent load if it feeds all wide ops at all indices (\p OpV
4854 /// must be the operand at index \p OpIdx of both the checked recipe and the recipe at lane 0, \p
4855/// WideMember0). A VPInterleaveRecipe can be narrowed to a wide load, if \p V
4856/// is defined at \p Idx of a load interleave group.
4857static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx,
4858 VPValue *OpV, unsigned Idx) {
4859 VPValue *Member0Op = WideMember0->getOperand(OpIdx);
4860 VPRecipeBase *Member0OpR = Member0Op->getDefiningRecipe();
4861 if (!Member0OpR)
4862 return Member0Op == OpV;
4863 if (auto *W = dyn_cast<VPWidenLoadRecipe>(Member0OpR))
4864 return !W->getMask() && Member0Op == OpV;
4865 if (auto *IR = dyn_cast<VPInterleaveRecipe>(Member0OpR))
4866 return IR->getInterleaveGroup()->isFull() && IR->getVPValue(Idx) == OpV;
4867 return false;
4868}
4869
4870 /// Returns true if \p InterleaveR is a full interleave group with factor and number of
4871/// members both equal to \p VF. The interleave group must also access the full
4872/// vector width \p VectorRegWidth.
4873 static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR,
4874 ElementCount VF,
4875 VPTypeAnalysis &TypeInfo,
4876 TypeSize VectorRegWidth) {
4877 if (!InterleaveR || InterleaveR->getMask())
4878 return false;
4879
4880 Type *GroupElementTy = nullptr;
4881 if (InterleaveR->getStoredValues().empty()) {
4882 GroupElementTy = TypeInfo.inferScalarType(InterleaveR->getVPValue(0));
4883 if (!all_of(InterleaveR->definedValues(),
4884 [&TypeInfo, GroupElementTy](VPValue *Op) {
4885 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4886 }))
4887 return false;
4888 } else {
4889 GroupElementTy =
4890 TypeInfo.inferScalarType(InterleaveR->getStoredValues()[0]);
4891 if (!all_of(InterleaveR->getStoredValues(),
4892 [&TypeInfo, GroupElementTy](VPValue *Op) {
4893 return TypeInfo.inferScalarType(Op) == GroupElementTy;
4894 }))
4895 return false;
4896 }
4897
4898 unsigned VFMin = VF.getKnownMinValue();
4899 TypeSize GroupSize = TypeSize::get(
4900 GroupElementTy->getScalarSizeInBits() * VFMin, VF.isScalable());
4901 const auto *IG = InterleaveR->getInterleaveGroup();
4902 return IG->getFactor() == VFMin && IG->getNumMembers() == VFMin &&
4903 GroupSize == VectorRegWidth;
4904}
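// Example (illustrative): with VF = 4, i32 group members and 128-bit vector
// registers, an unmasked interleave group with factor 4 and 4 members is
// "consecutive" here, since its size is 4 * 32 = 128 bits, exactly one vector
// register; a group with only 2 members, or with i64 members (4 * 64 = 256
// bits), would not be.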
4905
4906 /// Returns true if \p VPV is a narrow VPValue.
4907static bool isAlreadyNarrow(VPValue *VPV) {
4908 if (VPV->isLiveIn())
4909 return true;
4910 auto *RepR = dyn_cast<VPReplicateRecipe>(VPV);
4911 return RepR && RepR->isSingleScalar();
4912}
4913
4914// Convert a wide recipe defining a VPValue \p V feeding an interleave group to
4915// a narrow variant.
4916 static VPValue *
4917 narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl<VPValue *> &NarrowedOps) {
4918 auto *R = V->getDefiningRecipe();
4919 if (!R || NarrowedOps.contains(V))
4920 return V;
4921
4922 if (isAlreadyNarrow(V))
4923 return V;
4924
4925 if (auto *WideMember0 = dyn_cast<VPWidenRecipe>(R)) {
4926 for (unsigned Idx = 0, E = WideMember0->getNumOperands(); Idx != E; ++Idx)
4927 WideMember0->setOperand(
4928 Idx,
4929 narrowInterleaveGroupOp(WideMember0->getOperand(Idx), NarrowedOps));
4930 return V;
4931 }
4932
4933 if (auto *LoadGroup = dyn_cast<VPInterleaveRecipe>(R)) {
4934 // Narrow interleave group to wide load, as the transformed VPlan will only
4935 // process one original iteration.
4936 auto *LI = cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos());
4937 auto *L = new VPWidenLoadRecipe(
4938 *LI, LoadGroup->getAddr(), LoadGroup->getMask(), /*Consecutive=*/true,
4939 /*Reverse=*/false, {}, LoadGroup->getDebugLoc());
4940 L->insertBefore(LoadGroup);
4941 NarrowedOps.insert(L);
4942 return L;
4943 }
4944
4945 if (auto *RepR = dyn_cast<VPReplicateRecipe>(R)) {
4946 assert(RepR->isSingleScalar() &&
4947 isa<LoadInst>(RepR->getUnderlyingInstr()) &&
4948 "must be a single scalar load");
4949 NarrowedOps.insert(RepR);
4950 return RepR;
4951 }
4952
4953 auto *WideLoad = cast<VPWidenLoadRecipe>(R);
4954 VPValue *PtrOp = WideLoad->getAddr();
4955 if (auto *VecPtr = dyn_cast<VPVectorPointerRecipe>(PtrOp))
4956 PtrOp = VecPtr->getOperand(0);
4957 // Narrow wide load to uniform scalar load, as the transformed VPlan will only
4958 // process one original iteration.
4959 auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(), {PtrOp},
4960 /*IsUniform*/ true,
4961 /*Mask*/ nullptr, {}, *WideLoad);
4962 N->insertBefore(WideLoad);
4963 NarrowedOps.insert(N);
4964 return N;
4965}
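// Example (illustrative): for a value stored by a store interleave group, the
// recursion above rewrites its feeding recipes in place, roughly as follows:
//  * a full load interleave group becomes a single consecutive widened load,
//  * a plain widened load becomes a uniform, single-scalar replicate load of
//    its scalar address,
//  * a widened arithmetic recipe keeps its opcode but now operates on the
//    narrowed operands,
// because, as noted in the comments above, the transformed VPlan only
// processes one original iteration.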
4966
4968 TypeSize VectorRegWidth) {
4969 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
4970 if (!VectorLoop || VectorLoop->getEntry()->getNumSuccessors() != 0)
4971 return;
4972
4973 VPTypeAnalysis TypeInfo(Plan);
4974
4976 for (auto &R : *VectorLoop->getEntryBasicBlock()) {
4978 continue;
4979
4982 continue;
4983
4984 // Bail out on recipes not supported at the moment:
4985 // * phi recipes other than the canonical induction
4986 // * recipes writing to memory except interleave groups
4987 // Only support plans with a canonical induction phi.
4988 if (R.isPhi())
4989 return;
4990
4991 auto *InterleaveR = dyn_cast<VPInterleaveRecipe>(&R);
4992 if (R.mayWriteToMemory() && !InterleaveR)
4993 return;
4994
4995 // Do not narrow interleave groups if there are VectorPointer recipes and
4996 // the plan was unrolled. The recipe implicitly uses VF from
4997 // VPTransformState.
4998 // TODO: Remove restriction once the VF for the VectorPointer offset is
4999 // modeled explicitly as operand.
5000 if (isa<VPVectorPointerRecipe>(&R) && Plan.getUF() > 1)
5001 return;
5002
5003 // All other ops are allowed, but we reject uses that cannot be converted
5004 // when checking all allowed consumers (store interleave groups) below.
5005 if (!InterleaveR)
5006 continue;
5007
5008 // Bail out on non-consecutive interleave groups.
5009 if (!isConsecutiveInterleaveGroup(InterleaveR, VF, TypeInfo,
5010 VectorRegWidth))
5011 return;
5012
5013 // Skip read interleave groups.
5014 if (InterleaveR->getStoredValues().empty())
5015 continue;
5016
5017 // Narrow interleave groups, if all operands are already matching narrow
5018 // ops.
5019 auto *Member0 = InterleaveR->getStoredValues()[0];
5020 if (isAlreadyNarrow(Member0) &&
5021 all_of(InterleaveR->getStoredValues(),
5022 [Member0](VPValue *VPV) { return Member0 == VPV; })) {
5023 StoreGroups.push_back(InterleaveR);
5024 continue;
5025 }
5026
5027 // For now, we only support full interleave groups storing load interleave
5028 // groups.
5029 if (all_of(enumerate(InterleaveR->getStoredValues()), [](auto Op) {
5030 VPRecipeBase *DefR = Op.value()->getDefiningRecipe();
5031 if (!DefR)
5032 return false;
5033 auto *IR = dyn_cast<VPInterleaveRecipe>(DefR);
5034 return IR && IR->getInterleaveGroup()->isFull() &&
5035 IR->getVPValue(Op.index()) == Op.value();
5036 })) {
5037 StoreGroups.push_back(InterleaveR);
5038 continue;
5039 }
5040
5041 // Check if all values feeding InterleaveR are matching wide recipes, whose
5042 // operands can be narrowed.
5043 auto *WideMember0 =
5044 dyn_cast_or_null<VPWidenRecipe>(InterleaveR->getStoredValues()[0]);
5045 if (!WideMember0)
5046 return;
5047 for (const auto &[I, V] : enumerate(InterleaveR->getStoredValues())) {
5049 if (!R || R->getOpcode() != WideMember0->getOpcode() ||
5050 R->getNumOperands() > 2)
5051 return;
5052 if (any_of(enumerate(R->operands()),
5053 [WideMember0, Idx = I](const auto &P) {
5054 const auto &[OpIdx, OpV] = P;
5055 return !canNarrowLoad(WideMember0, OpIdx, OpV, Idx);
5056 }))
5057 return;
5058 }
5059 StoreGroups.push_back(InterleaveR);
5060 }
5061
5062 if (StoreGroups.empty())
5063 return;
5064
5065 // Convert InterleaveGroup \p R to a single VPWidenLoadRecipe.
5066 SmallPtrSet<VPValue *, 4> NarrowedOps;
5067 // Narrow operation tree rooted at store groups.
5068 for (auto *StoreGroup : StoreGroups) {
5069 VPValue *Res =
5070 narrowInterleaveGroupOp(StoreGroup->getStoredValues()[0], NarrowedOps);
5071 auto *SI =
5072 cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos());
5073 auto *S = new VPWidenStoreRecipe(
5074 *SI, StoreGroup->getAddr(), Res, nullptr, /*Consecutive=*/true,
5075 /*Reverse=*/false, {}, StoreGroup->getDebugLoc());
5076 S->insertBefore(StoreGroup);
5077 StoreGroup->eraseFromParent();
5078 }
5079
5080 // Adjust induction to reflect that the transformed plan only processes one
5081 // original iteration.
5082 auto *CanIV = VectorLoop->getCanonicalIV();
5083 auto *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
5084 VPBuilder PHBuilder(Plan.getVectorPreheader());
5085
5086 VPValue *UF = Plan.getOrAddLiveIn(
5087 ConstantInt::get(VectorLoop->getCanonicalIVType(), 1 * Plan.getUF()));
5088 if (VF.isScalable()) {
5089 VPValue *VScale = PHBuilder.createElementCount(
5090 CanIV->getScalarType(), ElementCount::getScalable(1));
5091 VPValue *VScaleUF = PHBuilder.createOverflowingOp(
5092 Instruction::Mul, {VScale, UF}, {true, false});
5093 Inc->setOperand(1, VScaleUF);
5094 Plan.getVF().replaceAllUsesWith(VScale);
5095 } else {
5096 Inc->setOperand(1, UF);
5097 Plan.getVF().replaceAllUsesWith(
5098 Plan.getConstantInt(CanIV->getScalarType(), 1));
5099 }
5100 removeDeadRecipes(Plan);
5101}
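// Example (illustrative): for a loop copying interleaved pairs, e.g.
//   for (i) { dst[i].x = src[i].x; dst[i].y = src[i].y; }
// vectorized with VF=2 and 2-member i64 interleave groups on 128-bit vectors,
// the transform above replaces the load/store interleave groups with plain
// consecutive wide loads/stores of <2 x i64> and changes the canonical IV
// increment from VF * UF to UF (or vscale * UF for scalable VFs), so each
// vector iteration covers one original iteration per unrolled part.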
5102
5103/// Add branch weight metadata, if the \p Plan's middle block is terminated by a
5104/// BranchOnCond recipe.
5106 VPlan &Plan, ElementCount VF, std::optional<unsigned> VScaleForTuning) {
5107 VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock();
5108 auto *MiddleTerm =
5110 // Only add branch metadata if there is a (conditional) terminator.
5111 if (!MiddleTerm)
5112 return;
5113
5114 assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond &&
5115 "must have a BranchOnCond");
5116 // Assume that `TripCount % VectorStep` is equally distributed.
5117 unsigned VectorStep = Plan.getUF() * VF.getKnownMinValue();
5118 if (VF.isScalable() && VScaleForTuning.has_value())
5119 VectorStep *= *VScaleForTuning;
5120 assert(VectorStep > 0 && "trip count should not be zero");
5121 MDBuilder MDB(Plan.getContext());
5122 MDNode *BranchWeights =
5123 MDB.createBranchWeights({1, VectorStep - 1}, /*IsExpected=*/false);
5124 MiddleTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
5125}
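// Example (illustrative): with VF=4, UF=2 and no vscale tuning, VectorStep is
// 8, so the middle block's BranchOnCond gets branch weights {1, 7}, modelling
// `TripCount % VectorStep` as uniformly distributed over its 8 possible
// values.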
5126
5127/// Compute and return the end value for \p WideIV, unless it is truncated. If
5128/// the induction recipe is not canonical, creates a VPDerivedIVRecipe to
5129/// compute the end value of the induction.
5130 static VPValue *tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV,
5131 VPBuilder &VectorPHBuilder,
5132 VPTypeAnalysis &TypeInfo,
5133 VPValue *VectorTC) {
5134 auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(WideIV);
5135 // Truncated wide inductions resume from the last lane of their vector value
5136 // in the last vector iteration which is handled elsewhere.
5137 if (WideIntOrFp && WideIntOrFp->getTruncInst())
5138 return nullptr;
5139
5140 VPValue *Start = WideIV->getStartValue();
5141 VPValue *Step = WideIV->getStepValue();
5142 const InductionDescriptor &ID = WideIV->getInductionDescriptor();
5143 VPValue *EndValue = VectorTC;
5144 if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
5145 EndValue = VectorPHBuilder.createDerivedIV(
5146 ID.getKind(), dyn_cast_or_null<FPMathOperator>(ID.getInductionBinOp()),
5147 Start, VectorTC, Step);
5148 }
5149
5150 // EndValue is derived from the vector trip count (which has the same type as
5151 // the widest induction) and thus may be wider than the induction here.
5152 Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(WideIV);
5153 if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(EndValue)) {
5154 EndValue = VectorPHBuilder.createScalarCast(Instruction::Trunc, EndValue,
5155 ScalarTypeOfWideIV,
5156 WideIV->getDebugLoc());
5157 }
5158
5159 return EndValue;
5160}
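// Example (illustrative): for a non-canonical integer induction starting at
// %start with step 2, the end value materialized in the vector preheader is a
// derived IV computing roughly %start + %n.vec * 2; if the induction is
// narrower than the vector trip count, the result is truncated to the
// induction's scalar type.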
5161
5163 VPlan &Plan, DenseMap<VPValue *, VPValue *> &IVEndValues) {
5164 VPTypeAnalysis TypeInfo(Plan);
5165 auto *ScalarPH = Plan.getScalarPreheader();
5166 auto *MiddleVPBB = cast<VPBasicBlock>(ScalarPH->getPredecessors()[0]);
5167 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5168 VPBuilder VectorPHBuilder(
5169 cast<VPBasicBlock>(VectorRegion->getSinglePredecessor()));
5170 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5171 for (VPRecipeBase &PhiR : Plan.getScalarPreheader()->phis()) {
5172 auto *ResumePhiR = cast<VPPhi>(&PhiR);
5173
5174 // TODO: Extract final value from induction recipe initially, optimize to
5175 // pre-computed end value together in optimizeInductionExitUsers.
5176 auto *VectorPhiR = cast<VPHeaderPHIRecipe>(ResumePhiR->getOperand(0));
5177 if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(VectorPhiR)) {
5178 if (VPValue *EndValue = tryToComputeEndValueForInduction(
5179 WideIVR, VectorPHBuilder, TypeInfo, &Plan.getVectorTripCount())) {
5180 IVEndValues[WideIVR] = EndValue;
5181 ResumePhiR->setOperand(0, EndValue);
5182 ResumePhiR->setName("bc.resume.val");
5183 continue;
5184 }
5185 // TODO: Also handle truncated inductions here. Computing end-values
5186 // separately should be done as VPlan-to-VPlan optimization, after
5187 // legalizing all resume values to use the last lane from the loop.
5188 assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
5189 "should only skip truncated wide inductions");
5190 continue;
5191 }
5192
5193 // The backedge value provides the value to resume coming out of a loop,
5194 // which for FORs is a vector whose last element needs to be extracted. The
5195 // start value provides the value if the loop is bypassed.
5196 bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(VectorPhiR);
5197 auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
5198 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5199 "Cannot handle loops with uncountable early exits");
5200 if (IsFOR) {
5201 auto *ExtractPart = MiddleBuilder.createNaryOp(
5202 VPInstruction::ExtractLastPart, ResumeFromVectorLoop);
5203 ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
5205 "vector.recur.extract");
5206 }
5207 ResumePhiR->setName(IsFOR ? "scalar.recur.init" : "bc.merge.rdx");
5208 ResumePhiR->setOperand(0, ResumeFromVectorLoop);
5209 }
5210}
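// Example (illustrative): after the loop above, the scalar preheader's resume
// phis are renamed and their vector-loop incoming values rewired, roughly:
//   bc.resume.val     <- end value of the widened induction
//   bc.merge.rdx      <- reduction value coming out of the vector loop
//   scalar.recur.init <- last element of the recurrence's last vector part
// while the incoming values used when the vector loop is bypassed are left
// unchanged.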
5211
5213 VFRange &Range) {
5214 VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
5215 auto *ScalarPHVPBB = Plan.getScalarPreheader();
5216 auto *MiddleVPBB = Plan.getMiddleBlock();
5217 VPBuilder ScalarPHBuilder(ScalarPHVPBB);
5218 VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
5219
5220 auto IsScalableOne = [](ElementCount VF) -> bool {
5221 return VF == ElementCount::getScalable(1);
5222 };
5223
5224 for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
5225 auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&HeaderPhi);
5226 if (!FOR)
5227 continue;
5228
5229 assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
5230 "Cannot handle loops with uncountable early exits");
5231
5232 // This is the second phase of vectorizing first-order recurrences, creating
5233 // extracts for users outside the loop. An overview of the transformation is
5234 // described below. Suppose we have the following loop with some use after
5235 // the loop of the last a[i-1],
5236 //
5237 // for (int i = 0; i < n; ++i) {
5238 // t = a[i - 1];
5239 // b[i] = a[i] - t;
5240 // }
5241 // use t;
5242 //
5243 // There is a first-order recurrence on "a". For this loop, the shorthand
5244 // scalar IR looks like:
5245 //
5246 // scalar.ph:
5247 // s.init = a[-1]
5248 // br scalar.body
5249 //
5250 // scalar.body:
5251 // i = phi [0, scalar.ph], [i+1, scalar.body]
5252 // s1 = phi [s.init, scalar.ph], [s2, scalar.body]
5253 // s2 = a[i]
5254 // b[i] = s2 - s1
5255 // br cond, scalar.body, exit.block
5256 //
5257 // exit.block:
5258 // use = lcssa.phi [s1, scalar.body]
5259 //
5260 // In this example, s1 is a recurrence because its value depends on the
5261 // previous iteration. In the first phase of vectorization, we created a
5262 // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
5263 // for users in the scalar preheader and exit block.
5264 //
5265 // vector.ph:
5266 // v_init = vector(..., ..., ..., a[-1])
5267 // br vector.body
5268 //
5269 // vector.body
5270 // i = phi [0, vector.ph], [i+4, vector.body]
5271 // v1 = phi [v_init, vector.ph], [v2, vector.body]
5272 // v2 = a[i, i+1, i+2, i+3]
5273 // b[i] = v2 - v1
5274 // // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
5275 // b[i, i+1, i+2, i+3] = v2 - v1
5276 // br cond, vector.body, middle.block
5277 //
5278 // middle.block:
5279 // vector.recur.extract.for.phi = v2(2)
5280 // vector.recur.extract = v2(3)
5281 // br cond, scalar.ph, exit.block
5282 //
5283 // scalar.ph:
5284 // scalar.recur.init = phi [vector.recur.extract, middle.block],
5285 // [s.init, otherwise]
5286 // br scalar.body
5287 //
5288 // scalar.body:
5289 // i = phi [0, scalar.ph], [i+1, scalar.body]
5290 // s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
5291 // s2 = a[i]
5292 // b[i] = s2 - s1
5293 // br cond, scalar.body, exit.block
5294 //
5295 // exit.block:
5296 // lo = lcssa.phi [s1, scalar.body],
5297 // [vector.recur.extract.for.phi, middle.block]
5298 //
5299 // Now update VPIRInstructions modeling LCSSA phis in the exit block.
5300 // Extract the penultimate value of the recurrence and use it as operand for
5301 // the VPIRInstruction modeling the phi.
5303 make_range(MiddleVPBB->getFirstNonPhi(), MiddleVPBB->end()))) {
5305 continue;
5306
5307 // For VF vscale x 1, if vscale = 1, we are unable to extract the
5308 // penultimate value of the recurrence. Instead we rely on the existing
5309 // extract of the last element from the result of
5310 // VPInstruction::FirstOrderRecurrenceSplice.
5311 // TODO: Consider vscale_range info and UF.
5312 if (LoopVectorizationPlanner::getDecisionAndClampRange(IsScalableOne,
5313 Range))
5314 return;
5315 VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
5316 VPInstruction::ExtractPenultimateElement, FOR->getBackedgeValue(), {},
5317 "vector.recur.extract.for.phi");
5318 cast<VPInstruction>(&R)->replaceAllUsesWith(PenultimateElement);
5319 }
5320 }
5321}
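// Note (illustrative): the vscale x 1 bail-out above covers e.g. a
// <vscale x 1 x i32> recurrence with vscale = 1, where each part holds a
// single element and thus has no penultimate element; such exit users keep
// using the last element extracted from the FirstOrderRecurrenceSplice result
// instead.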
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
This file implements a class to represent arbitrary precision integral constant values and operations...
ReachingDefInfo InstSet & ToRemove
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool isEqual(const Function &Caller, const Function &Callee)
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static bool isSentinel(const DWARFDebugNames::AttributeEncoding &AE)
@ Default
Hexagon Common GEP
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
licm
Definition LICM.cpp:383
Legalize the Machine IR a function s Machine IR
Definition Legalizer.cpp:80
static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU, ScalarEvolution &SE)
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
This file provides utility analysis objects describing memory locations.
This file contains the declarations for metadata subclasses.
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
#define P(N)
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
const SmallVectorImpl< MachineOperand > & Cond
This file contains some templates that are useful if you are working with the STL at all.
This is the interface for a metadata-based scoped no-alias analysis.
This file defines generic set operations that may be used on set's of different types,...
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file implements the TypeSwitch template, which mimics a switch() statement whose cases are type ...
This file implements dominator tree analysis for a single level of a VPlan's H-CFG.
This file contains the declarations of different VPlan-related auxiliary helpers.
static SmallVector< SmallVector< VPReplicateRecipe *, 4 > > collectComplementaryPredicatedMemOps(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
static void removeCommonBlendMask(VPBlendRecipe *Blend)
Try to see if all of Blend's masks share a common value logically and'ed and remove it from the masks...
static void tryToCreateAbstractReductionRecipe(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries to create abstract recipes from the reduction recipe for following optimizations ...
static VPReplicateRecipe * findRecipeWithMinAlign(ArrayRef< VPReplicateRecipe * > Group)
static bool sinkScalarOperands(VPlan &Plan)
static bool cannotHoistOrSinkRecipe(const VPRecipeBase &R)
Return true if we do not know how to (mechanically) hoist or sink R out of a loop region.
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Try to simplify the branch condition of Plan.
static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo)
Try to simplify VPSingleDefRecipe Def.
static void removeRedundantInductionCasts(VPlan &Plan)
Remove redundant casts of inductions.
static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Return true if Cond is known to be true for given BestVF and BestUF.
static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF, unsigned UF)
Try to replace multiple active lane masks used for control flow with a single, wide active lane mask ...
static std::optional< std::pair< bool, unsigned > > getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R)
Get any instruction opcode or intrinsic ID data embedded in recipe R.
static VPExpressionRecipe * tryToMatchAndCreateExtendedReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static VPScalarIVStepsRecipe * createScalarIVSteps(VPlan &Plan, InductionDescriptor::InductionKind Kind, Instruction::BinaryOps InductionOpcode, FPMathOperator *FPBinOp, Instruction *TruncI, VPValue *StartV, VPValue *Step, DebugLoc DL, VPBuilder &Builder)
static RemoveMask_match< Op0_t, Op1_t > m_RemoveMask(const Op0_t &In, Op1_t &Out)
Match a specific mask In, or a combination of it (logical-and In, Out).
static VPIRMetadata getCommonMetadata(ArrayRef< VPReplicateRecipe * > Recipes)
static VPValue * getPredicatedMask(VPRegionBlock *R)
If R is a region with a VPBranchOnMaskRecipe in the entry block, return the mask.
static bool sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Sink users of FOR after the recipe defining the previous value Previous of the recurrence.
static bool mergeReplicateRegionsIntoSuccessors(VPlan &Plan)
static VPWidenInductionRecipe * getOptimizableIVOf(VPValue *VPV, PredicatedScalarEvolution &PSE)
Check if VPV is an untruncated wide induction, either before or after the increment.
static VPActiveLaneMaskPHIRecipe * addVPLaneMaskPhiAndUpdateExitBranch(VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck)
static void expandVPWidenPointerInduction(VPWidenPointerInductionRecipe *R, VPTypeAnalysis &TypeInfo)
Expand a VPWidenPointerInductionRecipe into executable recipes, for the initial value,...
static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL)
Replace recipes with their EVL variants.
static bool isDeadRecipe(VPRecipeBase &R)
Returns true if R is dead and can be removed.
static void legalizeAndOptimizeInductions(VPlan &Plan)
Legalize VPWidenPointerInductionRecipe, by replacing it with a PtrAdd (IndStart, ScalarIVSteps (0,...
static void addReplicateRegions(VPlan &Plan)
static VPValue * tryToFoldLiveIns(VPSingleDefRecipe &R, ArrayRef< VPValue * > Operands, const DataLayout &DL, VPTypeAnalysis &TypeInfo)
Try to fold R using InstSimplifyFolder.
static VPValue * tryToComputeEndValueForInduction(VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC)
Compute and return the end value for WideIV, unless it is truncated.
static void removeRedundantExpandSCEVRecipes(VPlan &Plan)
Remove redundant EpxandSCEVRecipes in Plan's entry block by replacing them with already existing reci...
static bool simplifyKnownEVL(VPlan &Plan, ElementCount VF, PredicatedScalarEvolution &PSE)
From the definition of llvm.experimental.get.vector.length, VPInstruction::ExplicitVectorLength(AVL) ...
static bool hoistPreviousBeforeFORUsers(VPFirstOrderRecurrencePHIRecipe *FOR, VPRecipeBase *Previous, VPDominatorTree &VPDT)
Try to hoist Previous and its operands before all users of FOR.
static VPValue * scalarizeVPWidenPointerInduction(VPWidenPointerInductionRecipe *PtrIV, VPlan &Plan, VPBuilder &Builder)
Scalarize a VPWidenPointerInductionRecipe by replacing it with a PtrAdd (IndStart,...
static SmallVector< VPUser * > collectUsersRecursively(VPValue *V)
static VPValue * optimizeEarlyExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the early exit block.
static void recursivelyDeleteDeadRecipes(VPValue *V)
static bool canSinkStoreWithNoAliasCheck(ArrayRef< VPReplicateRecipe * > StoresToSink, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
static VPRegionBlock * createReplicateRegion(VPReplicateRecipe *PredRecipe, VPlan &Plan)
static VPBasicBlock * getPredicatedThenBlock(VPRegionBlock *R)
If R is a triangle region, return the 'then' block of the triangle.
static VPValue * narrowInterleaveGroupOp(VPValue *V, SmallPtrSetImpl< VPValue * > &NarrowedOps)
static bool canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, std::optional< SinkStoreInfo > SinkInfo={})
Check if a memory operation doesn't alias with memory operations in blocks between FirstBB and LastBB...
static void simplifyBlends(VPlan &Plan)
Normalize and simplify VPBlendRecipes.
static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *InterleaveR, ElementCount VF, VPTypeAnalysis &TypeInfo, TypeSize VectorRegWidth)
Returns true if IR is a full interleave group with factor and number of members both equal to VF.
static VPRecipeBase * optimizeMaskToEVL(VPValue *HeaderMask, VPRecipeBase &CurRecipe, VPTypeAnalysis &TypeInfo, VPValue &EVL)
Try to optimize a CurRecipe masked by HeaderMask to a corresponding EVL-based recipe without the head...
static bool isAlreadyNarrow(VPValue *VPV)
Returns true if VPValue is a narrow VPValue.
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF)
Optimize the width of vector induction variables in Plan based on a known constant Trip Count,...
static VPExpressionRecipe * tryToMatchAndCreateMulAccumulateReduction(VPReductionRecipe *Red, VPCostContext &Ctx, VFRange &Range)
This function tries convert extended in-loop reductions to VPExpressionRecipe and clamp the Range if ...
static void expandVPWidenIntOrFpInduction(VPWidenIntOrFpInductionRecipe *WidenIVR, VPTypeAnalysis &TypeInfo)
Expand a VPWidenIntOrFpInduction into executable recipes, for the initial value, phi and backedge val...
static VPSingleDefRecipe * findHeaderMask(VPlan &Plan)
Collect the header mask with the pattern: (ICMP_ULE, WideCanonicalIV, backedge-taken-count) TODO: Int...
static void removeRedundantCanonicalIVs(VPlan &Plan)
Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV recipe, if it exists.
static bool canNarrowLoad(VPWidenRecipe *WideMember0, unsigned OpIdx, VPValue *OpV, unsigned Idx)
Returns true if V is VPWidenLoadRecipe or VPInterleaveRecipe that can be converted to a narrower reci...
static void narrowToSingleScalarRecipes(VPlan &Plan)
static VPValue * optimizeLatchExitInductionUser(VPlan &Plan, VPTypeAnalysis &TypeInfo, VPBlockBase *PredVPBB, VPValue *Op, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
Attempts to optimize the induction variable exit values for users in the exit block coming from the l...
This file provides utility VPlan to VPlan transformations.
This file declares the class VPlanVerifier, which contains utility functions to check the consistency...
This file contains the declarations of the Vectorization Plan base classes:
static const X86InstrFMA3Group Groups[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static const uint32_t IV[8]
Definition blake3_impl.h:83
Helper for extra no-alias checks via known-safe recipe and SCEV.
SinkStoreInfo(const SmallPtrSetImpl< VPRecipeBase * > &ExcludeRecipes, VPReplicateRecipe &GroupLeader, PredicatedScalarEvolution &PSE, const Loop &L, VPTypeAnalysis &TypeInfo)
bool shouldSkip(VPRecipeBase &R) const
Return true if R should be skipped during alias checking, either because it's in the exclude set or b...
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1012
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1513
APInt abs() const
Get the absolute value.
Definition APInt.h:1796
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1489
LLVM_ABI APInt sext(unsigned width) const
Sign extend to a new width.
Definition APInt.cpp:985
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1222
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & back() const
back - Get the last element.
Definition ArrayRef.h:151
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
iterator end() const
Definition ArrayRef.h:131
iterator begin() const
Definition ArrayRef.h:130
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this basic block belongs to.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:686
Predicate getInversePredicate() const
For example, EQ -> NE, UGT -> ULE, SLT -> SGE, OEQ -> UNE, UGT -> OLE, OLT -> UGE,...
Definition InstrTypes.h:789
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=true)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:138
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:123
static DebugLoc getCompilerGenerated()
Definition DebugLoc.h:162
static DebugLoc getUnknown()
Definition DebugLoc.h:161
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition DenseMap.h:205
std::pair< iterator, bool > try_emplace(KeyT &&Key, Ts &&...Args)
Definition DenseMap.h:256
bool dominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
dominates - Returns true iff A dominates B.
constexpr bool isVector() const
One or more elements.
Definition TypeSize.h:324
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
Represents flags for the getelementptr instruction/expression.
GEPNoWrapFlags withoutNoUnsignedWrap() const
static GEPNoWrapFlags none()
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
A struct for saving information about induction variables.
InductionKind
This enum represents the kinds of inductions that we support.
@ IK_PtrInduction
Pointer induction var. Step = C.
@ IK_IntInduction
Integer induction variable. Step = C.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
bool isCast() const
bool isBinaryOp() const
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
The group of interleaved loads/stores sharing the same stride and close to each other.
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
uint32_t getNumMembers() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static bool getDecisionAndClampRange(const std::function< bool(ElementCount)> &Predicate, VFRange &Range)
Test a Predicate on a Range of VF's.
Definition VPlan.cpp:1541
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight, bool IsExpected=false)
Return metadata containing two branch weights.
Definition MDBuilder.cpp:38
Metadata node.
Definition Metadata.h:1078
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
ValueT lookup(const KeyT &Key) const
Definition MapVector.h:108
Representation for a specific memory location.
AAMDNodes AATags
The metadata nodes which describes the aliasing of the location (each member is null if that kind of ...
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
LLVM_ABI const SCEV * getSCEV(Value *V)
Returns the SCEV expression of V, in the context of the current SCEV predicate.
static LLVM_ABI unsigned getOpcode(RecurKind Kind)
Returns the opcode corresponding to the RecurrenceKind.
RegionT * getParent() const
Get the parent of the Region.
Definition RegionInfo.h:362
This class uses information about analyze scalars to rewrite expressions in canonical form.
LLVM_ABI Value * expandCodeFor(const SCEV *SH, Type *Ty, BasicBlock::iterator I)
Insert code to directly compute the specified SCEV expression into the program.
static const SCEV * rewrite(const SCEV *Scev, ScalarEvolution &SE, ValueToSCEVMapTy &Map)
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
const DataLayout & getDataLayout() const
Return the DataLayout associated with the module this SCEV instance is operating on.
LLVM_ABI const SCEV * getNegativeSCEV(const SCEV *V, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
Return the SCEV object corresponding to -V.
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI const SCEV * getUDivExpr(const SCEV *LHS, const SCEV *RHS)
Get a canonical unsigned division expression, or something simpler if possible.
LLVM_ABI const SCEV * getElementCount(Type *Ty, ElementCount EC, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap)
LLVM_ABI const SCEV * getMinusSCEV(const SCEV *LHS, const SCEV *RHS, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Return LHS-RHS.
LLVM_ABI const SCEV * getMulExpr(SmallVectorImpl< const SCEV * > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical multiply expression, or something simpler if possible.
LLVM_ABI bool isKnownPredicate(CmpPredicate Pred, const SCEV *LHS, const SCEV *RHS)
Test if the given expression is known to satisfy the condition described by Pred, LHS,...
static LLVM_ABI bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias)
This class represents the LLVM 'select' instruction.
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
size_type size() const
Definition SmallPtrSet.h:99
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Provides information about what library functions are available for the current target.
static LLVM_ABI PartialReductionExtendKind getPartialReductionExtendKind(Instruction *I)
Get the kind of extension that an instruction represents.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize get(ScalarTy Quantity, bool Scalable)
Definition TypeSize.h:340
This class implements a switch-like dispatch statement for a value of 'T' using dyn_cast functionalit...
Definition TypeSwitch.h:88
TypeSwitch< T, ResultT > & Case(CallableT &&caseFn)
Add a case on the given type.
Definition TypeSwitch.h:97
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
op_range operands()
Definition User.h:293
A recipe for generating the active lane mask for the vector loop that is used to predicate the vector...
Definition VPlan.h:3629
VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph.
Definition VPlan.h:3990
void appendRecipe(VPRecipeBase *Recipe)
Augment the existing recipes of a VPBasicBlock with an additional Recipe as the last recipe.
Definition VPlan.h:4065
RecipeListTy::iterator iterator
Instruction iterators...
Definition VPlan.h:4017
iterator end()
Definition VPlan.h:4027
iterator begin()
Recipe iterator methods.
Definition VPlan.h:4025
iterator_range< iterator > phis()
Returns an iterator range over the PHI-like recipes in the block.
Definition VPlan.h:4078
iterator getFirstNonPhi()
Return the position of the first non-phi node recipe in the block.
Definition VPlan.cpp:216
VPRegionBlock * getEnclosingLoopRegion()
Definition VPlan.cpp:578
VPBasicBlock * splitAt(iterator SplitAt)
Split current block at SplitAt by inserting a new block between the current block and its successors ...
Definition VPlan.cpp:550
VPRecipeBase * getTerminator()
If the block has multiple successors, return the branch recipe terminating the block.
Definition VPlan.cpp:624
const VPRecipeBase & back() const
Definition VPlan.h:4039
A recipe for vectorizing a phi-node as a sequence of mask-based select instructions.
Definition VPlan.h:2538
VPValue * getMask(unsigned Idx) const
Return mask number Idx.
Definition VPlan.h:2572
unsigned getNumIncomingValues() const
Return the number of incoming values, taking into account when normalized the first incoming value wi...
Definition VPlan.h:2562
void setMask(unsigned Idx, VPValue *V)
Set mask number Idx to V.
Definition VPlan.h:2578
bool isNormalized() const
A normalized blend is one that has an odd number of operands, whereby the first operand does not have...
Definition VPlan.h:2558
VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
Definition VPlan.h:81
void setSuccessors(ArrayRef< VPBlockBase * > NewSuccs)
Set each VPBasicBlock in NewSuccss as successor of this VPBlockBase.
Definition VPlan.h:300
VPRegionBlock * getParent()
Definition VPlan.h:173
const VPBasicBlock * getExitingBasicBlock() const
Definition VPlan.cpp:186
size_t getNumSuccessors() const
Definition VPlan.h:219
size_t getNumPredecessors() const
Definition VPlan.h:220
void setPredecessors(ArrayRef< VPBlockBase * > NewPreds)
Set each VPBasicBlock in NewPreds as predecessor of this VPBlockBase.
Definition VPlan.h:291
const VPBlocksTy & getPredecessors() const
Definition VPlan.h:204
VPlan * getPlan()
Definition VPlan.cpp:161
void clearSuccessors()
Remove all the successors of this block.
Definition VPlan.h:310
VPBlockBase * getSinglePredecessor() const
Definition VPlan.h:215
const VPBasicBlock * getEntryBasicBlock() const
Definition VPlan.cpp:166
void setParent(VPRegionBlock *P)
Definition VPlan.h:184
VPBlockBase * getSingleHierarchicalPredecessor()
Definition VPlan.h:264
VPBlockBase * getSingleSuccessor() const
Definition VPlan.h:209
const VPBlocksTy & getSuccessors() const
Definition VPlan.h:198
static auto blocksOnly(const T &Range)
Return an iterator range over Range which only includes BlockTy blocks.
Definition VPlanUtils.h:221
static void insertOnEdge(VPBlockBase *From, VPBlockBase *To, VPBlockBase *BlockPtr)
Inserts BlockPtr on the edge between From and To.
Definition VPlanUtils.h:242
static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse, VPBlockBase *BlockPtr)
Insert disconnected VPBlockBases IfTrue and IfFalse after BlockPtr.
Definition VPlanUtils.h:154
static void connectBlocks(VPBlockBase *From, VPBlockBase *To, unsigned PredIdx=-1u, unsigned SuccIdx=-1u)
Connect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:173
static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To)
Disconnect VPBlockBases From and To bi-directionally.
Definition VPlanUtils.h:191
A recipe for generating conditional branches on the bits of a mask.
Definition VPlan.h:3042
RAII object that stores the current insertion point and restores it when the object is destroyed.
VPlan-based builder utility analogous to IRBuilder.
VPValue * createScalarZExtOrTrunc(VPValue *Op, Type *ResultTy, Type *SrcTy, DebugLoc DL)
VPValue * createElementCount(Type *Ty, ElementCount EC)
VPInstruction * createScalarCast(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, DebugLoc DL, const VPIRFlags &Flags={}, const VPIRMetadata &Metadata={})
VPDerivedIVRecipe * createDerivedIV(InductionDescriptor::InductionKind Kind, FPMathOperator *FPBinOp, VPValue *Start, VPValue *Current, VPValue *Step, const Twine &Name="")
Convert the input value Current to the corresponding value of an induction with Start and Step values...
static VPBuilder getToInsertAfter(VPRecipeBase *R)
Create a VPBuilder to insert after R.
VPInstruction * createOverflowingOp(unsigned Opcode, ArrayRef< VPValue * > Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags={false, false}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
VPPhi * createScalarPhi(ArrayRef< VPValue * > IncomingValues, DebugLoc DL, const Twine &Name="")
void setInsertPoint(VPBasicBlock *TheBB)
This specifies that created VPInstructions should be appended to the end of the specified block.
VPInstruction * createNaryOp(unsigned Opcode, ArrayRef< VPValue * > Operands, Instruction *Inst=nullptr, const VPIRFlags &Flags={}, const VPIRMetadata &MD={}, DebugLoc DL=DebugLoc::getUnknown(), const Twine &Name="")
Create an N-ary operation with Opcode, Operands and set Inst as its underlying Instruction.
Canonical scalar induction phi of the vector loop.
Definition VPlan.h:3573
unsigned getNumDefinedValues() const
Returns the number of values defined by the VPDef.
Definition VPlanValue.h:426
ArrayRef< VPValue * > definedValues()
Returns an ArrayRef of the values defined by the VPDef.
Definition VPlanValue.h:421
VPValue * getVPSingleValue()
Returns the only VPValue defined by the VPDef.
Definition VPlanValue.h:399
VPValue * getVPValue(unsigned I)
Returns the VPValue with index I defined by the VPDef.
Definition VPlanValue.h:411
A recipe for converting the input value IV value to the corresponding value of an IV with different s...
Definition VPlan.h:3740
Template specialization of the standard LLVM dominator tree utility for VPBlockBases.
bool properlyDominates(const VPRecipeBase *A, const VPRecipeBase *B)
A recipe for generating the phi node for the current index of elements, adjusted in accordance with E...
Definition VPlan.h:3661
A recipe to combine multiple recipes into a single 'expression' recipe, which should be considered a ...
Definition VPlan.h:3087
A pure virtual base class for all recipes modeling header phis, including phis for first order recurr...
Definition VPlan.h:2064
virtual VPValue * getBackedgeValue()
Returns the incoming value from the loop backedge.
Definition VPlan.h:2107
VPValue * getStartValue()
Returns the start value of the phi, if one is set.
Definition VPlan.h:2096
A special type of VPBasicBlock that wraps an existing IR basic block.
Definition VPlan.h:4143
BasicBlock * getIRBasicBlock() const
Definition VPlan.h:4167
Class to record and manage LLVM IR flags.
Definition VPlan.h:609
static LLVM_ABI_FOR_TEST VPIRInstruction * create(Instruction &I)
Create a new VPIRPhi for \I , if it is a PHINode, otherwise create a VPIRInstruction.
Helper to manage IR metadata for recipes.
Definition VPlan.h:982
void intersect(const VPIRMetadata &MD)
Intersect this VPIRMetadata object with MD, keeping only metadata nodes that are common to both.
This is a concrete Recipe that models a single VPlan-level instruction.
Definition VPlan.h:1036
@ ExtractLane
Extracts a single lane (first operand) from a set of vector operands.
Definition VPlan.h:1138
@ Unpack
Extracts all lanes from its (non-scalable) vector operand.
Definition VPlan.h:1080
@ BuildVector
Creates a fixed-width vector containing all operands.
Definition VPlan.h:1075
@ BuildStructVector
Given operands of (the same) struct type, creates a struct of fixed- width vectors each containing a ...
Definition VPlan.h:1072
@ CanonicalIVIncrementForPart
Definition VPlan.h:1056
const InterleaveGroup< Instruction > * getInterleaveGroup() const
Definition VPlan.h:2680
VPValue * getMask() const
Return the mask used by this recipe.
Definition VPlan.h:2672
ArrayRef< VPValue * > getStoredValues() const
Return the VPValues stored by this interleave group.
Definition VPlan.h:2701
A recipe for interleaved memory operations with vector-predication intrinsics.
Definition VPlan.h:2754
VPInterleaveRecipe is a recipe for transforming an interleave group of load or stores into one wide l...
Definition VPlan.h:2712
VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when control converges back from ...
Definition VPlan.h:3229
VPRecipeBase is a base class modeling a sequence of one or more output IR instructions.
Definition VPlan.h:387
VPRegionBlock * getRegion()
Definition VPlan.h:4295
VPBasicBlock * getParent()
Definition VPlan.h:408
DebugLoc getDebugLoc() const
Returns the debug location of the recipe.
Definition VPlan.h:479
void moveBefore(VPBasicBlock &BB, iplist< VPRecipeBase >::iterator I)
Unlink this recipe and insert into BB before I.
void insertBefore(VPRecipeBase *InsertPos)
Insert an unlinked recipe into a basic block immediately before the specified recipe.
void insertAfter(VPRecipeBase *InsertPos)
Insert an unlinked Recipe into a basic block immediately after the specified Recipe.
iplist< VPRecipeBase >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Helper class to create VPRecipies from IR instructions.
VPRecipeBase * getRecipe(Instruction *I)
Return the recipe created for given ingredient.
A recipe to represent inloop reduction operations with vector-predication intrinsics,...
Definition VPlan.h:2916
A recipe to represent inloop, ordered or partial reduction operations.
Definition VPlan.h:2805
VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks which form a Single-Entry-S...
Definition VPlan.h:4178
const VPBlockBase * getEntry() const
Definition VPlan.h:4214
Type * getCanonicalIVType()
Return the type of the canonical IV for loop regions.
Definition VPlan.h:4289
bool isReplicator() const
An indicator whether this region is to generate multiple replicated instances of output IR correspond...
Definition VPlan.h:4246
void setExiting(VPBlockBase *ExitingBlock)
Set ExitingBlock as the exiting VPBlockBase of this VPRegionBlock.
Definition VPlan.h:4231
VPCanonicalIVPHIRecipe * getCanonicalIV()
Returns the canonical induction recipe of the region.
Definition VPlan.h:4276
const VPBlockBase * getExiting() const
Definition VPlan.h:4226
VPBasicBlock * getPreheaderVPBB()
Returns the pre-header VPBasicBlock of the loop region.
Definition VPlan.h:4239
VPReplicateRecipe replicates a given instruction producing multiple scalar copies of the original sca...
Definition VPlan.h:2961
bool isSingleScalar() const
Definition VPlan.h:3002
VPValue * getMask()
Return the mask of a predicated VPReplicateRecipe.
Definition VPlan.h:3026
A recipe for handling phi nodes of integer and floating-point inductions, producing their scalar valu...
Definition VPlan.h:3810
VPSingleDef is a base class for recipes for modeling a sequence of one or more output IR that define ...
Definition VPlan.h:531
Instruction * getUnderlyingInstr()
Returns the underlying instruction.
Definition VPlan.h:595
VPSingleDefRecipe * clone() override=0
Clone the current recipe.
An analysis for type-inference for VPValues.
LLVMContext & getContext()
Return the LLVMContext used by the analysis.
Type * inferScalarType(const VPValue *V)
Infer the type of V. Returns the scalar type of V.
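VPTypeAnalysis is typically constructed once and then queried per value. A sketch under the assumption that the analysis can be built directly from a plan; Plan and V are hypothetical objects:
VPTypeAnalysis TypeInfo(Plan);
// Infer the scalar type of a VPValue, e.g. to build matching constants.
Type *ScalarTy = TypeInfo.inferScalarType(V);
IntegerType *I1Ty = IntegerType::get(TypeInfo.getContext(), 1);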
This class augments VPValue with operands which provide the inverse def-use edges from VPValue's user...
Definition VPlanValue.h:202
operand_range operands()
Definition VPlanValue.h:270
void setOperand(unsigned I, VPValue *New)
Definition VPlanValue.h:246
VPValue * getOperand(unsigned N) const
Definition VPlanValue.h:241
void addOperand(VPValue *Operand)
Definition VPlanValue.h:235
This is the base class of the VPlan Def/Use graph, used for modeling the data flow into,...
Definition VPlanValue.h:46
bool isDefinedOutsideLoopRegions() const
Returns true if the VPValue is defined outside any loop.
Definition VPlan.cpp:1374
VPRecipeBase * getDefiningRecipe()
Returns the recipe defining this VPValue or nullptr if it is not defined by a recipe,...
Definition VPlan.cpp:131
Value * getLiveInIRValue() const
Returns the underlying IR value, if this VPValue is defined outside the scope of VPlan.
Definition VPlanValue.h:181
Value * getUnderlyingValue() const
Return the underlying Value attached to this VPValue.
Definition VPlanValue.h:83
void setUnderlyingValue(Value *Val)
Definition VPlanValue.h:191
void replaceAllUsesWith(VPValue *New)
Definition VPlan.cpp:1377
unsigned getNumUsers() const
Definition VPlanValue.h:111
bool isLiveIn() const
Returns true if this VPValue is a live-in, i.e. defined outside the VPlan.
Definition VPlanValue.h:176
void replaceUsesWithIf(VPValue *New, llvm::function_ref< bool(VPUser &U, unsigned Idx)> ShouldReplace)
Go through the uses list for this VPValue and make each use point to New if the callback ShouldReplac...
Definition VPlan.cpp:1381
user_range users()
Definition VPlanValue.h:132
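The def-use helpers above are the workhorses of most rewrites. A minimal sketch, with OldV and NewV as hypothetical VPValues:
// Rewire every user of OldV to use NewV instead.
OldV->replaceAllUsesWith(NewV);
// Or rewrite only selected uses, e.g. those whose user is a VPInstruction.
OldV->replaceUsesWithIf(NewV, [](VPUser &U, unsigned /*Idx*/) {
  return isa<VPInstruction>(&U);
});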
A recipe to compute a pointer to the last element of each part of a widened memory access for widened...
Definition VPlan.h:1923
A Recipe for widening the canonical induction variable of the vector loop.
Definition VPlan.h:3703
VPWidenCastRecipe is a recipe to create vector cast instructions.
Definition VPlan.h:1564
Instruction::CastOps getOpcode() const
Definition VPlan.h:1600
A recipe for handling GEP instructions.
Definition VPlan.h:1860
Base class for widened induction (VPWidenIntOrFpInductionRecipe and VPWidenPointerInductionRecipe),...
Definition VPlan.h:2131
PHINode * getPHINode() const
Definition VPlan.h:2173
VPValue * getStepValue()
Returns the step value of the induction.
Definition VPlan.h:2159
const InductionDescriptor & getInductionDescriptor() const
Returns the induction descriptor for the recipe.
Definition VPlan.h:2176
A recipe for handling phi nodes of integer and floating-point inductions, producing their vector valu...
Definition VPlan.h:2207
VPValue * getLastUnrolledPartOperand()
Returns the VPValue representing the value of this induction at the last unrolled part,...
Definition VPlan.h:2283
A recipe for widening vector intrinsics.
Definition VPlan.h:1614
A common base class for widening memory operations.
Definition VPlan.h:3272
A recipe for widened phis.
Definition VPlan.h:2341
VPWidenRecipe is a recipe for producing a widened instruction using the opcode and operands of the re...
Definition VPlan.h:1524
VPlan models a candidate for vectorization, encoding various decisions taken to produce efficient outp...
Definition VPlan.h:4308
bool hasVF(ElementCount VF) const
Definition VPlan.h:4505
LLVMContext & getContext() const
Definition VPlan.h:4493
VPBasicBlock * getEntry()
Definition VPlan.h:4397
VPValue & getVectorTripCount()
The vector trip count.
Definition VPlan.h:4484
bool hasScalableVF() const
Definition VPlan.h:4506
VPValue & getVFxUF()
Returns VF * UF of the vector loop region.
Definition VPlan.h:4491
VPValue & getVF()
Returns the VF of the vector loop region.
Definition VPlan.h:4487
VPValue * getTripCount() const
The trip count of the original loop.
Definition VPlan.h:4455
VPValue * getTrue()
Return a VPValue wrapping i1 true.
Definition VPlan.h:4561
VPValue * getOrCreateBackedgeTakenCount()
The backedge taken count of the original loop.
Definition VPlan.h:4476
unsigned getUF() const
Definition VPlan.h:4525
VPRegionBlock * createReplicateRegion(VPBlockBase *Entry, VPBlockBase *Exiting, const std::string &Name="")
Create a new replicate region with Entry, Exiting and Name.
Definition VPlan.h:4631
auto getLiveIns() const
Return the list of live-in VPValues available in the VPlan.
Definition VPlan.h:4586
bool hasUF(unsigned UF) const
Definition VPlan.h:4523
ArrayRef< VPIRBasicBlock * > getExitBlocks() const
Return an ArrayRef containing VPIRBasicBlocks wrapping the exit blocks of the original scalar loop.
Definition VPlan.h:4445
VPValue * getConstantInt(Type *Ty, uint64_t Val, bool IsSigned=false)
Return a VPValue wrapping a ConstantInt with the given type and value.
Definition VPlan.h:4567
void setVF(ElementCount VF)
Definition VPlan.h:4499
bool isUnrolled() const
Returns true if the VPlan already has been unrolled, i.e.
Definition VPlan.h:4538
LLVM_ABI_FOR_TEST VPRegionBlock * getVectorLoopRegion()
Returns the VPRegionBlock of the vector loop.
Definition VPlan.cpp:1011
void resetTripCount(VPValue *NewTripCount)
Resets the trip count for the VPlan.
Definition VPlan.h:4469
VPBasicBlock * getMiddleBlock()
Returns the 'middle' block of the plan, that is the block that selects whether to execute the scalar ...
Definition VPlan.h:4422
VPBasicBlock * createVPBasicBlock(const Twine &Name, VPRecipeBase *Recipe=nullptr)
Create a new VPBasicBlock with Name and containing Recipe if present.
Definition VPlan.h:4609
VPValue * getFalse()
Return a VPValue wrapping i1 false.
Definition VPlan.h:4564
VPValue * getOrAddLiveIn(Value *V)
Gets the live-in VPValue for V or adds a new live-in (if none exists yet) for V.
Definition VPlan.h:4547
bool hasScalarVFOnly() const
Definition VPlan.h:4516
VPBasicBlock * getScalarPreheader() const
Return the VPBasicBlock for the preheader of the scalar loop.
Definition VPlan.h:4436
VPIRBasicBlock * getScalarHeader() const
Return the VPIRBasicBlock wrapping the header of the scalar loop.
Definition VPlan.h:4441
VPValue * getLiveIn(Value *V) const
Return the live-in VPValue for V, if there is one or nullptr otherwise.
Definition VPlan.h:4583
VPBasicBlock * getVectorPreheader()
Returns the preheader of the vector loop region, if one exists, or null otherwise.
Definition VPlan.h:4402
void setUF(unsigned UF)
Definition VPlan.h:4530
bool hasScalarTail() const
Returns true if the scalar tail may execute after the vector loop.
Definition VPlan.h:4663
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
iterator_range< user_iterator > users()
Definition Value.h:426
bool hasName() const
Definition Value.h:262
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:216
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr LeafTy multiplyCoefficientBy(ScalarTy RHS) const
Definition TypeSize.h:256
constexpr bool isFixed() const
Returns true if the quantity is not scaled by vscale.
Definition TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
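ElementCount and TypeSize model both fixed and scalable quantities. A small sketch, with VF as a hypothetical ElementCount:
if (VF.isScalable()) {
  // Only the minimum lane count is known statically for scalable VFs.
  unsigned MinLanes = VF.getKnownMinValue();
  (void)MinLanes;
} else {
  unsigned Lanes = VF.getFixedValue();
  (void)Lanes;
}
// Scaling the coefficient preserves the fixed/scalable property.
ElementCount DoubleVF = VF.multiplyCoefficientBy(2);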
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt RoundingUDiv(const APInt &A, const APInt &B, APInt::Rounding RM)
Return the unsigned division of A by B, rounded by the given rounding mode.
Definition APInt.cpp:2763
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedStore(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedStore Intrinsic.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
LogicalOp_match< LHS, RHS, Instruction::And > m_LogicalAnd(const LHS &L, const RHS &R)
Matches L && R either in the form of L & R or L ?
match_combine_or< CastInst_match< OpTy, ZExtInst >, OpTy > m_ZExtOrSelf(const OpTy &Op)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
m_Intrinsic_Ty< Opnd0, Opnd1, Opnd2 >::Ty m_MaskedLoad(const Opnd0 &Op0, const Opnd1 &Op1, const Opnd2 &Op2)
Matches MaskedLoad Intrinsic.
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
SpecificCmpClass_match< LHS, RHS, CmpInst > m_SpecificCmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
auto m_LogicalAnd()
Matches L && R where L and R are arbitrary values.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
MatchFunctor< Val, Pattern > match_fn(const Pattern &P)
A match functor that can be used as a UnaryPredicate in functional algorithms like all_of.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
bind_cst_ty m_scev_APInt(const APInt *&C)
Match an SCEV constant and bind it to an APInt.
bool match(const SCEV *S, const Pattern &P)
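The llvm::PatternMatch and SCEVPatternMatch entries above share the same match() idiom. A sketch, with V a hypothetical llvm::Value* and S a hypothetical const SCEV*:
{
  using namespace llvm::PatternMatch;
  Value *X;
  const APInt *C;
  // True iff V multiplies some value (bound to X) by a constant (bound to C).
  bool IsMulByConst = match(V, m_Mul(m_Value(X), m_APInt(C)));
  (void)IsMulByConst;
}
{
  using namespace llvm::SCEVPatternMatch;
  const APInt *CS;
  // True iff S is a constant SCEV; its value is bound to CS.
  bool IsConstSCEV = match(S, m_scev_APInt(CS));
  (void)IsConstSCEV;
}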
VPInstruction_match< VPInstruction::ExtractLastLane, VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > > m_ExtractLastLaneOfLastPart(const Op0_t &Op0)
AllRecipe_commutative_match< Instruction::And, Op0_t, Op1_t > m_c_BinaryAnd(const Op0_t &Op0, const Op1_t &Op1)
Match a binary AND operation.
AllRecipe_match< Instruction::Or, Op0_t, Op1_t > m_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
Match a binary OR operation.
VPInstruction_match< VPInstruction::AnyOf > m_AnyOf()
AllRecipe_commutative_match< Opcode, Op0_t, Op1_t > m_c_Binary(const Op0_t &Op0, const Op1_t &Op1)
AllRecipe_commutative_match< Instruction::Or, Op0_t, Op1_t > m_c_BinaryOr(const Op0_t &Op0, const Op1_t &Op1)
GEPLikeRecipe_match< Op0_t, Op1_t > m_GetElementPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::BranchOnTwoConds > m_BranchOnTwoConds()
AllRecipe_match< Opcode, Op0_t, Op1_t > m_Binary(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::LastActiveLane, Op0_t > m_LastActiveLane(const Op0_t &Op0)
VPInstruction_match< Instruction::ExtractElement, Op0_t, Op1_t > m_ExtractElement(const Op0_t &Op0, const Op1_t &Op1)
specific_intval< 1 > m_False()
VPDerivedIV_match< Op0_t, Op1_t, Op2_t > m_DerivedIV(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::ExtractLastLane, Op0_t > m_ExtractLastLane(const Op0_t &Op0)
VPInstruction_match< VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t > m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2)
VPInstruction_match< VPInstruction::BranchOnCount > m_BranchOnCount()
specific_intval< 1 > m_True()
VectorEndPointerRecipe_match< Op0_t, Op1_t > m_VecEndPtr(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::ExtractLastPart, Op0_t > m_ExtractLastPart(const Op0_t &Op0)
VPInstruction_match< VPInstruction::Broadcast, Op0_t > m_Broadcast(const Op0_t &Op0)
class_match< VPValue > m_VPValue()
Match an arbitrary VPValue and ignore it.
VPInstruction_match< VPInstruction::ExplicitVectorLength, Op0_t > m_EVL(const Op0_t &Op0)
VPInstruction_match< VPInstruction::BuildVector > m_BuildVector()
BuildVector matches only its opcode, without matching its operands, as the number of operands is not fi...
VPInstruction_match< VPInstruction::ExtractPenultimateElement, Op0_t > m_ExtractPenultimateElement(const Op0_t &Op0)
VPInstruction_match< VPInstruction::FirstActiveLane, Op0_t > m_FirstActiveLane(const Op0_t &Op0)
bind_ty< VPInstruction > m_VPInstruction(VPInstruction *&V)
Match a VPInstruction, capturing if we match.
VPInstruction_match< VPInstruction::BranchOnCond > m_BranchOnCond()
VPInstruction_match< VPInstruction::ExtractLane, Op0_t, Op1_t > m_ExtractLane(const Op0_t &Op0, const Op1_t &Op1)
VPInstruction_match< VPInstruction::Reverse, Op0_t > m_Reverse(const Op0_t &Op0)
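The VPlanPatternMatch matchers above follow the same idiom but operate on recipes and VPValues. A sketch, with R as a hypothetical terminator recipe of a block in Plan:
using namespace llvm::VPlanPatternMatch;
// Detect a conditional branch whose condition is the constant i1 true;
// such branches are what removeBranchOnConst folds away.
bool AlwaysTaken =
    match(&R, m_BranchOnCond()) && R.getOperand(0) == Plan.getTrue();
(void)AlwaysTaken;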
NodeAddr< DefNode * > Def
Definition RDFGraph.h:384
bool isSingleScalar(const VPValue *VPV)
Returns true if VPV is a single scalar, either because it produces the same value for all lanes or on...
bool isUniformAcrossVFsAndUFs(VPValue *V)
Checks if V is uniform across all VF lanes and UF parts.
VPValue * getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr)
Get or create a VPValue that corresponds to the expansion of Expr.
std::optional< MemoryLocation > getMemoryLocation(const VPRecipeBase &R)
Return a MemoryLocation for R with noalias metadata populated from R, if the recipe is supported and ...
bool onlyFirstLaneUsed(const VPValue *Def)
Returns true if only the first lane of Def is used.
VPIRFlags getFlagsFromIndDesc(const InductionDescriptor &ID)
Extracts and returns NoWrap and FastMath flags from the induction binop in ID.
Definition VPlanUtils.h:93
bool onlyScalarValuesUsed(const VPValue *Def)
Returns true if only scalar values of Def are used by all users.
const SCEV * getSCEVExprForVPValue(const VPValue *V, PredicatedScalarEvolution &PSE, const Loop *L=nullptr)
Return the SCEV expression for V.
bool isHeaderMask(const VPValue *V, const VPlan &Plan)
Return true if V is a header mask in Plan.
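The vputils helpers above answer per-value questions shared by many transforms. A sketch only, assuming V is a hypothetical VPValue, PSE a PredicatedScalarEvolution for the original loop, and that an unknown value yields SCEVCouldNotCompute:
using namespace llvm::vputils;
if (isSingleScalar(V) && onlyFirstLaneUsed(V)) {
  // V can stay scalar; no broadcast is required.
}
const SCEV *Expr = getSCEVExprForVPValue(V, PSE);
if (!isa<SCEVCouldNotCompute>(Expr)) {
  // V has a SCEV form that can be reasoned about symbolically.
}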
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
auto min_element(R &&Range)
Provide wrappers to std::min_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2068
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
DenseMap< const Value *, const SCEV * > ValueToSCEVMapTy
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2530
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2184
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
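make_early_inc_range enables the erase-while-iterating pattern used by many of these transforms. A sketch, with VPBB as a hypothetical VPBasicBlock*:
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
  // The underlying iterator is advanced before R is visited, so R may be
  // unlinked and deleted inside the loop body without invalidating it.
  if (auto *VPI = dyn_cast<VPInstruction>(&R))
    if (VPI->getNumUsers() == 0 && !VPI->mayHaveSideEffects())
      VPI->eraseFromParent();
}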
auto cast_or_null(const Y &Val)
Definition Casting.h:714
iterator_range< df_iterator< VPBlockShallowTraversalWrapper< VPBlockBase * > > > vp_depth_first_shallow(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order.
Definition VPlanCFG.h:216
iterator_range< df_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_depth_first_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in depth-first order while traversing t...
Definition VPlanCFG.h:243
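The two traversals differ only in whether they descend into nested regions. A sketch, with Plan as a hypothetical VPlan:
// Shallow traversal treats each VPRegionBlock as a single node.
for (VPBlockBase *VPB : vp_depth_first_shallow(Plan.getEntry()))
  (void)VPB;
// Deep traversal also walks the blocks contained in every region.
for (VPBlockBase *VPB : vp_depth_first_deep(Plan.getEntry()))
  if (auto *VPBB = dyn_cast<VPBasicBlock>(VPB)) {
    // Visits every VPBasicBlock, including those nested in loop regions.
    (void)VPBB;
  }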
detail::concat_range< ValueT, RangeTs... > concat(RangeTs &&...Ranges)
Returns a concatenated range across two or more ranges.
Definition STLExtras.h:1150
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1744
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
iterator_range< po_iterator< VPBlockDeepTraversalWrapper< VPBlockBase * > > > vp_post_order_deep(VPBlockBase *G)
Returns an iterator range to traverse the graph starting at G in post order while traversing through ...
Definition VPlanCFG.h:236
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1634
LLVM_ABI_FOR_TEST cl::opt< bool > EnableWideActiveLaneMask
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:550
bool canConstantBeExtended(const APInt *C, Type *NarrowType, TTI::PartialReductionExtendKind ExtKind)
Check if a constant C can be safely treated as having been extended from a narrower type with the gi...
Definition VPlan.cpp:1718
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
auto drop_end(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the last N elements excluded.
Definition STLExtras.h:323
RecurKind
These are the kinds of recurrences that we support.
@ Mul
Product of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
Definition STLExtras.h:2002
DWARFExpression::Operation Op
auto max_element(R &&Range)
Provide wrappers to std::max_element which take ranges instead of having to pass begin/end explicitly...
Definition STLExtras.h:2078
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2009
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
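isa, cast and dyn_cast work on the VPlan recipe hierarchy exactly as they do on IR. A sketch, with R as a hypothetical VPRecipeBase reference:
// Query the dynamic type without converting.
bool IsMemory = isa<VPWidenMemoryRecipe>(&R);
(void)IsMemory;
// dyn_cast yields nullptr when R is not a VPReplicateRecipe.
if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R))
  (void)RepR->isSingleScalar();
// cast asserts that the conversion is valid.
auto *WidenR = cast<VPWidenRecipe>(&R);
(void)WidenR;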
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1770
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2156
@ DataAndControlFlowWithoutRuntimeCheck
Use predicate to control both data and control flow, but modify the trip count so that a runtime over...
hash_code hash_combine(const Ts &...args)
Combine values into a single hash_code.
Definition Hashing.h:592
bool equal(L &&LRange, R &&RRange)
Wrapper function around std::equal to detect if pair-wise elements between two ranges are the same.
Definition STLExtras.h:2136
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
constexpr detail::IsaCheckPredicate< Types... > IsaPred
Function object wrapper for the llvm::isa type check.
Definition Casting.h:866
hash_code hash_combine_range(InputIteratorT first, InputIteratorT last)
Compute a hash_code for a sequence of values.
Definition Hashing.h:466
#define N
RemoveMask_match(const Op0_t &In, Op1_t &Out)
bool match(OpTy *V) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:761
MDNode * Scope
The tag for alias scope specification (used with noalias).
Definition Metadata.h:784
MDNode * NoAlias
The tag specifying the noalias scope.
Definition Metadata.h:787
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
An information struct used to provide DenseMap with the various necessary components for a given valu...
Incoming for lane mask phi as machine instruction, incoming register Reg and incoming block Block are...
A range of powers-of-2 vectorization factors with fixed start and adjustable end.
Struct to hold various analysis needed for cost computations.
A recipe for handling first-order recurrence phis.
Definition VPlan.h:2383
A recipe for widening load operations with vector-predication intrinsics, using the address to load f...
Definition VPlan.h:3404
A recipe for widening load operations, using the address to load from and an optional mask.
Definition VPlan.h:3363
A recipe for widening select instructions.
Definition VPlan.h:1813
A recipe for widening store operations with vector-predication intrinsics, using the value to store,...
Definition VPlan.h:3488
A recipe for widening store operations, using the stored value, the address to store to and an option...
Definition VPlan.h:3445
static void materializeBroadcasts(VPlan &Plan)
Add explicit broadcasts for live-ins and VPValues defined in Plan's entry block if they are used as v...
static void materializePacksAndUnpacks(VPlan &Plan)
Add explicit Build[Struct]Vector recipes to Pack multiple scalar values into vectors and Unpack recip...
static void optimizeInductionExitUsers(VPlan &Plan, DenseMap< VPValue *, VPValue * > &EndValues, PredicatedScalarEvolution &PSE)
If there's a single exit block, optimize its phi recipes that use exiting IV values by feeding them p...
static void materializeBackedgeTakenCount(VPlan &Plan, VPBasicBlock *VectorPH)
Materialize the backedge-taken count to be computed explicitly using VPInstructions.
static void hoistInvariantLoads(VPlan &Plan)
Hoist single-scalar loads with invariant addresses out of the vector loop to the preheader,...
static void canonicalizeEVLLoops(VPlan &Plan)
Transform EVL loops to use variable-length stepping after region dissolution.
static void dropPoisonGeneratingRecipes(VPlan &Plan, const std::function< bool(BasicBlock *)> &BlockNeedsPredication)
Drop poison flags from recipes that may generate a poison value that is used after vectorization,...
static void createAndOptimizeReplicateRegions(VPlan &Plan)
Wrap predicated VPReplicateRecipes with a mask operand in an if-then region block and remove the mask...
static void createInterleaveGroups(VPlan &Plan, const SmallPtrSetImpl< const InterleaveGroup< Instruction > * > &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed)
static bool runPass(bool(*Transform)(VPlan &, ArgsTy...), VPlan &Plan, typename std::remove_reference< ArgsTy >::type &...Args)
Helper to run a VPlan transform Transform on VPlan, forwarding extra arguments to the transform.
static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, std::optional< unsigned > VScaleForTuning)
Add branch weight metadata, if the Plan's middle block is terminated by a BranchOnCond recipe.
static void narrowInterleaveGroups(VPlan &Plan, ElementCount VF, TypeSize VectorRegWidth)
Try to convert a plan with interleave groups with VF elements to a plan with the interleave groups re...
static DenseMap< const SCEV *, Value * > expandSCEVs(VPlan &Plan, ScalarEvolution &SE)
Expand VPExpandSCEVRecipes in Plan's entry block.
static void convertToConcreteRecipes(VPlan &Plan)
Lower abstract recipes to concrete ones, that can be codegen'd.
static void expandBranchOnTwoConds(VPlan &Plan)
Expand BranchOnTwoConds instructions into explicit CFG with BranchOnCond instructions.
static void hoistPredicatedLoads(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Hoist predicated loads from the same address to the loop entry block, if they are guaranteed to execu...
static void convertToAbstractRecipes(VPlan &Plan, VPCostContext &Ctx, VFRange &Range)
This function converts initial recipes to the abstract recipes and clamps Range based on cost model f...
static void materializeConstantVectorTripCount(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
static LLVM_ABI_FOR_TEST bool tryToConvertVPInstructionsToVPRecipes(VPlan &Plan, function_ref< const InductionDescriptor *(PHINode *)> GetIntOrFpInductionDescriptor, const TargetLibraryInfo &TLI)
Replaces the VPInstructions in Plan with corresponding widen recipes.
static void addExitUsersForFirstOrderRecurrences(VPlan &Plan, VFRange &Range)
Handle users in the exit block for first-order recurrences in the original exit block.
static void addExplicitVectorLength(VPlan &Plan, const std::optional< unsigned > &MaxEVLSafeElements)
Add a VPEVLBasedIVPHIRecipe and related recipes to Plan and replaces all uses except the canonical IV...
static void replaceSymbolicStrides(VPlan &Plan, PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &StridesMap)
Replace symbolic strides from StridesMap in Plan with constants when possible.
static void removeBranchOnConst(VPlan &Plan)
Remove BranchOnCond recipes with true or false conditions together with removing dead edges to their ...
static void removeDeadRecipes(VPlan &Plan)
Remove dead recipes from Plan.
static void materializeVectorTripCount(VPlan &Plan, VPBasicBlock *VectorPHVPBB, bool TailByMasking, bool RequiresScalarEpilogue)
Materialize vector trip count computations to a set of VPInstructions.
static void simplifyRecipes(VPlan &Plan)
Perform instcombine-like simplifications on recipes in Plan.
static void sinkPredicatedStores(VPlan &Plan, PredicatedScalarEvolution &PSE, const Loop *L)
Sink predicated stores to the same address with complementary predicates (P and NOT P) to an uncondit...
static void handleUncountableEarlyExit(VPBasicBlock *EarlyExitingVPBB, VPBasicBlock *EarlyExitVPBB, VPlan &Plan, VPBasicBlock *HeaderVPBB, VPBasicBlock *LatchVPBB)
Update Plan to account for the uncountable early exit from EarlyExitingVPBB to EarlyExitVPBB by intro...
static void clearReductionWrapFlags(VPlan &Plan)
Clear NSW/NUW flags from reduction instructions if necessary.
static void cse(VPlan &Plan)
Perform common-subexpression-elimination on Plan.
static void addActiveLaneMask(VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck)
Replace (ICMP_ULE, wide canonical IV, backedge-taken-count) checks with an (active-lane-mask recipe,...
static LLVM_ABI_FOR_TEST void optimize(VPlan &Plan)
Apply VPlan-to-VPlan optimizations to Plan, including induction recipe optimizations,...
static void dissolveLoopRegions(VPlan &Plan)
Replace loop regions with explicit CFG.
static void truncateToMinimalBitwidths(VPlan &Plan, const MapVector< Instruction *, uint64_t > &MinBWs)
Insert truncates and extends for any truncated recipe.
static bool adjustFixedOrderRecurrences(VPlan &Plan, VPBuilder &Builder)
Try to have all users of fixed-order recurrences appear after the recipe defining their previous valu...
static void optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, unsigned BestUF, PredicatedScalarEvolution &PSE)
Optimize Plan based on BestVF and BestUF.
static void materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH, ElementCount VF)
Materialize VF and VFxUF to be computed explicitly using VPInstructions.
static void updateScalarResumePhis(VPlan &Plan, DenseMap< VPValue *, VPValue * > &IVEndValues)
Update the resume phis in the scalar preheader after creating wide recipes for first-order recurrence...
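The static members listed above form the transform pipeline applied to each candidate plan. A minimal sketch of chaining two of the public entry points, assuming Plan is a hypothetical, fully constructed VPlan at the appropriate pipeline stage:
// Run the generic VPlan-to-VPlan optimizations.
VPlanTransforms::optimize(Plan);
// Later, replace loop regions with an explicit CFG before code generation.
VPlanTransforms::dissolveLoopRegions(Plan);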