LLVM 20.0.0git
LoopInterchange.cpp
Go to the documentation of this file.
1//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This Pass handles loop interchange transform.
10// This pass interchanges loops to provide a more cache-friendly memory access
11// patterns.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/STLExtras.h"
18#include "llvm/ADT/Statistic.h"
19#include "llvm/ADT/StringRef.h"
20#include "llvm/ADT/StringSet.h"
29#include "llvm/IR/BasicBlock.h"
31#include "llvm/IR/Dominators.h"
32#include "llvm/IR/Function.h"
33#include "llvm/IR/InstrTypes.h"
34#include "llvm/IR/Instruction.h"
36#include "llvm/IR/User.h"
37#include "llvm/IR/Value.h"
40#include "llvm/Support/Debug.h"
46#include <cassert>
47#include <utility>
48#include <vector>
49
50using namespace llvm;
51
52#define DEBUG_TYPE "loop-interchange"
53
54STATISTIC(LoopsInterchanged, "Number of loops interchanged");
55
57 "loop-interchange-threshold", cl::init(0), cl::Hidden,
58 cl::desc("Interchange if you gain more than this number"));
59
60// Maximum number of load-stores that can be handled in the dependency matrix.
62 "loop-interchange-max-meminstr-count", cl::init(64), cl::Hidden,
64 "Maximum number of load-store instructions that should be handled "
65 "in the dependency matrix. Higher value may lead to more interchanges "
66 "at the cost of compile-time"));
67
68namespace {
69
70using LoopVector = SmallVector<Loop *, 8>;
71
72// TODO: Check if we can use a sparse matrix here.
73using CharMatrix = std::vector<std::vector<char>>;
74
75} // end anonymous namespace
76
77// Maximum loop depth supported.
78static const unsigned MaxLoopNestDepth = 10;
79
80#ifndef NDEBUG
81static void printDepMatrix(CharMatrix &DepMatrix) {
82 for (auto &Row : DepMatrix) {
83 for (auto D : Row)
84 LLVM_DEBUG(dbgs() << D << " ");
85 LLVM_DEBUG(dbgs() << "\n");
86 }
87}
88#endif
89
90static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
91 Loop *L, DependenceInfo *DI,
94 using ValueVector = SmallVector<Value *, 16>;
95
96 ValueVector MemInstr;
97
98 // For each block.
99 for (BasicBlock *BB : L->blocks()) {
100 // Scan the BB and collect legal loads and stores.
101 for (Instruction &I : *BB) {
102 if (!isa<Instruction>(I))
103 return false;
104 if (auto *Ld = dyn_cast<LoadInst>(&I)) {
105 if (!Ld->isSimple())
106 return false;
107 MemInstr.push_back(&I);
108 } else if (auto *St = dyn_cast<StoreInst>(&I)) {
109 if (!St->isSimple())
110 return false;
111 MemInstr.push_back(&I);
112 }
113 }
114 }
115
116 LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
117 << " Loads and Stores to analyze\n");
118 if (MemInstr.size() > MaxMemInstrCount) {
119 LLVM_DEBUG(dbgs() << "The transform doesn't support more than "
120 << MaxMemInstrCount << " load/stores in a loop\n");
121 ORE->emit([&]() {
122 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedLoop",
123 L->getStartLoc(), L->getHeader())
124 << "Number of loads/stores exceeded, the supported maximum "
125 "can be increased with option "
126 "-loop-interchange-maxmeminstr-count.";
127 });
128 return false;
129 }
130 ValueVector::iterator I, IE, J, JE;
131 StringSet<> Seen;
132
133 for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
134 for (J = I, JE = MemInstr.end(); J != JE; ++J) {
135 std::vector<char> Dep;
136 Instruction *Src = cast<Instruction>(*I);
137 Instruction *Dst = cast<Instruction>(*J);
138 // Ignore Input dependencies.
139 if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
140 continue;
141 // Track Output, Flow, and Anti dependencies.
142 if (auto D = DI->depends(Src, Dst, true)) {
143 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
144 // If the direction vector is negative, normalize it to
145 // make it non-negative.
146 if (D->normalize(SE))
147 LLVM_DEBUG(dbgs() << "Negative dependence vector normalized.\n");
148 LLVM_DEBUG(StringRef DepType =
149 D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
150 dbgs() << "Found " << DepType
151 << " dependency between Src and Dst\n"
152 << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
153 unsigned Levels = D->getLevels();
154 char Direction;
155 for (unsigned II = 1; II <= Levels; ++II) {
156 unsigned Dir = D->getDirection(II);
158 Direction = '<';
159 else if (Dir == Dependence::DVEntry::GT ||
161 Direction = '>';
162 else if (Dir == Dependence::DVEntry::EQ)
163 Direction = '=';
164 else
165 Direction = '*';
166 Dep.push_back(Direction);
167 }
168 while (Dep.size() != Level) {
169 Dep.push_back('I');
170 }
171
172 // Make sure we only add unique entries to the dependency matrix.
173 if (Seen.insert(StringRef(Dep.data(), Dep.size())).second)
174 DepMatrix.push_back(Dep);
175 }
176 }
177 }
178
179 return true;
180}
181
182// A loop is moved from index 'from' to an index 'to'. Update the Dependence
183// matrix by exchanging the two columns.
184static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
185 unsigned ToIndx) {
186 for (unsigned I = 0, E = DepMatrix.size(); I < E; ++I)
187 std::swap(DepMatrix[I][ToIndx], DepMatrix[I][FromIndx]);
188}
189
190// After interchanging, check if the direction vector is valid.
191// [Theorem] A permutation of the loops in a perfect nest is legal if and only
192// if the direction matrix, after the same permutation is applied to its
193// columns, has no ">" direction as the leftmost non-"=" direction in any row.
194static bool isLexicographicallyPositive(std::vector<char> &DV) {
195 for (unsigned char Direction : DV) {
196 if (Direction == '<')
197 return true;
198 if (Direction == '>' || Direction == '*')
199 return false;
200 }
201 return true;
202}
203
204// Checks if it is legal to interchange 2 loops.
205static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
206 unsigned InnerLoopId,
207 unsigned OuterLoopId) {
208 unsigned NumRows = DepMatrix.size();
209 std::vector<char> Cur;
210 // For each row check if it is valid to interchange.
211 for (unsigned Row = 0; Row < NumRows; ++Row) {
212 // Create temporary DepVector check its lexicographical order
213 // before and after swapping OuterLoop vs InnerLoop
214 Cur = DepMatrix[Row];
216 return false;
217 std::swap(Cur[InnerLoopId], Cur[OuterLoopId]);
219 return false;
220 }
221 return true;
222}
223
224static void populateWorklist(Loop &L, LoopVector &LoopList) {
225 LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
226 << L.getHeader()->getParent()->getName() << " Loop: %"
227 << L.getHeader()->getName() << '\n');
228 assert(LoopList.empty() && "LoopList should initially be empty!");
229 Loop *CurrentLoop = &L;
230 const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
231 while (!Vec->empty()) {
232 // The current loop has multiple subloops in it hence it is not tightly
233 // nested.
234 // Discard all loops above it added into Worklist.
235 if (Vec->size() != 1) {
236 LoopList = {};
237 return;
238 }
239
240 LoopList.push_back(CurrentLoop);
241 CurrentLoop = Vec->front();
242 Vec = &CurrentLoop->getSubLoops();
243 }
244 LoopList.push_back(CurrentLoop);
245}
246
248 unsigned LoopNestDepth = LoopList.size();
249 if (LoopNestDepth < 2) {
250 LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
251 return false;
252 }
253 return true;
254}
255namespace {
256
257/// LoopInterchangeLegality checks if it is legal to interchange the loop.
258class LoopInterchangeLegality {
259public:
260 LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
262 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
263
264 /// Check if the loops can be interchanged.
265 bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
266 CharMatrix &DepMatrix);
267
268 /// Discover induction PHIs in the header of \p L. Induction
269 /// PHIs are added to \p Inductions.
270 bool findInductions(Loop *L, SmallVectorImpl<PHINode *> &Inductions);
271
272 /// Check if the loop structure is understood. We do not handle triangular
273 /// loops for now.
274 bool isLoopStructureUnderstood();
275
276 bool currentLimitations();
277
278 const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
279 return OuterInnerReductions;
280 }
281
282 const SmallVectorImpl<PHINode *> &getInnerLoopInductions() const {
283 return InnerLoopInductions;
284 }
285
286private:
287 bool tightlyNested(Loop *Outer, Loop *Inner);
288 bool containsUnsafeInstructions(BasicBlock *BB);
289
290 /// Discover induction and reduction PHIs in the header of \p L. Induction
291 /// PHIs are added to \p Inductions, reductions are added to
292 /// OuterInnerReductions. When the outer loop is passed, the inner loop needs
293 /// to be passed as \p InnerLoop.
294 bool findInductionAndReductions(Loop *L,
295 SmallVector<PHINode *, 8> &Inductions,
296 Loop *InnerLoop);
297
298 Loop *OuterLoop;
299 Loop *InnerLoop;
300
301 ScalarEvolution *SE;
302
303 /// Interface to emit optimization remarks.
305
306 /// Set of reduction PHIs taking part of a reduction across the inner and
307 /// outer loop.
308 SmallPtrSet<PHINode *, 4> OuterInnerReductions;
309
310 /// Set of inner loop induction PHIs
311 SmallVector<PHINode *, 8> InnerLoopInductions;
312};
313
314/// LoopInterchangeProfitability checks if it is profitable to interchange the
315/// loop.
316class LoopInterchangeProfitability {
317public:
318 LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
320 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
321
322 /// Check if the loop interchange is profitable.
323 bool isProfitable(const Loop *InnerLoop, const Loop *OuterLoop,
324 unsigned InnerLoopId, unsigned OuterLoopId,
325 CharMatrix &DepMatrix,
327 std::unique_ptr<CacheCost> &CC);
328
329private:
330 int getInstrOrderCost();
331 std::optional<bool> isProfitablePerLoopCacheAnalysis(
333 std::unique_ptr<CacheCost> &CC);
334 std::optional<bool> isProfitablePerInstrOrderCost();
335 std::optional<bool> isProfitableForVectorization(unsigned InnerLoopId,
336 unsigned OuterLoopId,
337 CharMatrix &DepMatrix);
338 Loop *OuterLoop;
339 Loop *InnerLoop;
340
341 /// Scev analysis.
342 ScalarEvolution *SE;
343
344 /// Interface to emit optimization remarks.
346};
347
348/// LoopInterchangeTransform interchanges the loop.
349class LoopInterchangeTransform {
350public:
351 LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
352 LoopInfo *LI, DominatorTree *DT,
353 const LoopInterchangeLegality &LIL)
354 : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), LIL(LIL) {}
355
356 /// Interchange OuterLoop and InnerLoop.
357 bool transform();
358 void restructureLoops(Loop *NewInner, Loop *NewOuter,
359 BasicBlock *OrigInnerPreHeader,
360 BasicBlock *OrigOuterPreHeader);
361 void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
362
363private:
364 bool adjustLoopLinks();
365 bool adjustLoopBranches();
366
367 Loop *OuterLoop;
368 Loop *InnerLoop;
369
370 /// Scev analysis.
371 ScalarEvolution *SE;
372
373 LoopInfo *LI;
374 DominatorTree *DT;
375
376 const LoopInterchangeLegality &LIL;
377};
378
379struct LoopInterchange {
380 ScalarEvolution *SE = nullptr;
381 LoopInfo *LI = nullptr;
382 DependenceInfo *DI = nullptr;
383 DominatorTree *DT = nullptr;
384 std::unique_ptr<CacheCost> CC = nullptr;
385
386 /// Interface to emit optimization remarks.
388
389 LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
390 DominatorTree *DT, std::unique_ptr<CacheCost> &CC,
392 : SE(SE), LI(LI), DI(DI), DT(DT), CC(std::move(CC)), ORE(ORE) {}
393
394 bool run(Loop *L) {
395 if (L->getParentLoop())
396 return false;
397 SmallVector<Loop *, 8> LoopList;
398 populateWorklist(*L, LoopList);
399 return processLoopList(LoopList);
400 }
401
402 bool run(LoopNest &LN) {
403 SmallVector<Loop *, 8> LoopList(LN.getLoops());
404 for (unsigned I = 1; I < LoopList.size(); ++I)
405 if (LoopList[I]->getParentLoop() != LoopList[I - 1])
406 return false;
407 return processLoopList(LoopList);
408 }
409
410 bool isComputableLoopNest(ArrayRef<Loop *> LoopList) {
411 for (Loop *L : LoopList) {
412 const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
413 if (isa<SCEVCouldNotCompute>(ExitCountOuter)) {
414 LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
415 return false;
416 }
417 if (L->getNumBackEdges() != 1) {
418 LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
419 return false;
420 }
421 if (!L->getExitingBlock()) {
422 LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
423 return false;
424 }
425 }
426 return true;
427 }
428
429 unsigned selectLoopForInterchange(ArrayRef<Loop *> LoopList) {
430 // TODO: Add a better heuristic to select the loop to be interchanged based
431 // on the dependence matrix. Currently we select the innermost loop.
432 return LoopList.size() - 1;
433 }
434
435 bool processLoopList(SmallVectorImpl<Loop *> &LoopList) {
436 bool Changed = false;
437
438 // Ensure minimum loop nest depth.
439 assert(hasMinimumLoopDepth(LoopList) && "Loop nest does not meet minimum depth.");
440
441 unsigned LoopNestDepth = LoopList.size();
442 if (LoopNestDepth > MaxLoopNestDepth) {
443 LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than "
444 << MaxLoopNestDepth << "\n");
445 return false;
446 }
447 if (!isComputableLoopNest(LoopList)) {
448 LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
449 return false;
450 }
451
452 LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth
453 << "\n");
454
455 CharMatrix DependencyMatrix;
456 Loop *OuterMostLoop = *(LoopList.begin());
457 if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
458 OuterMostLoop, DI, SE, ORE)) {
459 LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
460 return false;
461 }
462
463 LLVM_DEBUG(dbgs() << "Dependency matrix before interchange:\n";
464 printDepMatrix(DependencyMatrix));
465
466 // Get the Outermost loop exit.
467 BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
468 if (!LoopNestExit) {
469 LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block");
470 return false;
471 }
472
473 unsigned SelecLoopId = selectLoopForInterchange(LoopList);
474 // Obtain the loop vector returned from loop cache analysis beforehand,
475 // and put each <Loop, index> pair into a map for constant time query
476 // later. Indices in loop vector reprsent the optimal order of the
477 // corresponding loop, e.g., given a loopnest with depth N, index 0
478 // indicates the loop should be placed as the outermost loop and index N
479 // indicates the loop should be placed as the innermost loop.
480 //
481 // For the old pass manager CacheCost would be null.
483 if (CC != nullptr) {
484 const auto &LoopCosts = CC->getLoopCosts();
485 for (unsigned i = 0; i < LoopCosts.size(); i++) {
486 CostMap[LoopCosts[i].first] = i;
487 }
488 }
489 // We try to achieve the globally optimal memory access for the loopnest,
490 // and do interchange based on a bubble-sort fasion. We start from
491 // the innermost loop, move it outwards to the best possible position
492 // and repeat this process.
493 for (unsigned j = SelecLoopId; j > 0; j--) {
494 bool ChangedPerIter = false;
495 for (unsigned i = SelecLoopId; i > SelecLoopId - j; i--) {
496 bool Interchanged = processLoop(LoopList[i], LoopList[i - 1], i, i - 1,
497 DependencyMatrix, CostMap);
498 if (!Interchanged)
499 continue;
500 // Loops interchanged, update LoopList accordingly.
501 std::swap(LoopList[i - 1], LoopList[i]);
502 // Update the DependencyMatrix
503 interChangeDependencies(DependencyMatrix, i, i - 1);
504
505 LLVM_DEBUG(dbgs() << "Dependency matrix after interchange:\n";
506 printDepMatrix(DependencyMatrix));
507
508 ChangedPerIter |= Interchanged;
509 Changed |= Interchanged;
510 }
511 // Early abort if there was no interchange during an entire round of
512 // moving loops outwards.
513 if (!ChangedPerIter)
514 break;
515 }
516 return Changed;
517 }
518
519 bool processLoop(Loop *InnerLoop, Loop *OuterLoop, unsigned InnerLoopId,
520 unsigned OuterLoopId,
521 std::vector<std::vector<char>> &DependencyMatrix,
522 const DenseMap<const Loop *, unsigned> &CostMap) {
523 LLVM_DEBUG(dbgs() << "Processing InnerLoopId = " << InnerLoopId
524 << " and OuterLoopId = " << OuterLoopId << "\n");
525 LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
526 if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
527 LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
528 return false;
529 }
530 LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
531 LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
532 if (!LIP.isProfitable(InnerLoop, OuterLoop, InnerLoopId, OuterLoopId,
533 DependencyMatrix, CostMap, CC)) {
534 LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
535 return false;
536 }
537
538 ORE->emit([&]() {
539 return OptimizationRemark(DEBUG_TYPE, "Interchanged",
540 InnerLoop->getStartLoc(),
541 InnerLoop->getHeader())
542 << "Loop interchanged with enclosing loop.";
543 });
544
545 LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LIL);
546 LIT.transform();
547 LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
548 LoopsInterchanged++;
549
550 llvm::formLCSSARecursively(*OuterLoop, *DT, LI, SE);
551 return true;
552 }
553};
554
555} // end anonymous namespace
556
557bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) {
558 return any_of(*BB, [](const Instruction &I) {
559 return I.mayHaveSideEffects() || I.mayReadFromMemory();
560 });
561}
562
563bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
564 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
565 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
566 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
567
568 LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n");
569
570 // A perfectly nested loop will not have any branch in between the outer and
571 // inner block i.e. outer header will branch to either inner preheader and
572 // outerloop latch.
573 BranchInst *OuterLoopHeaderBI =
574 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
575 if (!OuterLoopHeaderBI)
576 return false;
577
578 for (BasicBlock *Succ : successors(OuterLoopHeaderBI))
579 if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&
580 Succ != OuterLoopLatch)
581 return false;
582
583 LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
584 // We do not have any basic block in between now make sure the outer header
585 // and outer loop latch doesn't contain any unsafe instructions.
586 if (containsUnsafeInstructions(OuterLoopHeader) ||
587 containsUnsafeInstructions(OuterLoopLatch))
588 return false;
589
590 // Also make sure the inner loop preheader does not contain any unsafe
591 // instructions. Note that all instructions in the preheader will be moved to
592 // the outer loop header when interchanging.
593 if (InnerLoopPreHeader != OuterLoopHeader &&
594 containsUnsafeInstructions(InnerLoopPreHeader))
595 return false;
596
597 BasicBlock *InnerLoopExit = InnerLoop->getExitBlock();
598 // Ensure the inner loop exit block flows to the outer loop latch possibly
599 // through empty blocks.
600 const BasicBlock &SuccInner =
601 LoopNest::skipEmptyBlockUntil(InnerLoopExit, OuterLoopLatch);
602 if (&SuccInner != OuterLoopLatch) {
603 LLVM_DEBUG(dbgs() << "Inner loop exit block " << *InnerLoopExit
604 << " does not lead to the outer loop latch.\n";);
605 return false;
606 }
607 // The inner loop exit block does flow to the outer loop latch and not some
608 // other BBs, now make sure it contains safe instructions, since it will be
609 // moved into the (new) inner loop after interchange.
610 if (containsUnsafeInstructions(InnerLoopExit))
611 return false;
612
613 LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
614 // We have a perfect loop nest.
615 return true;
616}
617
618bool LoopInterchangeLegality::isLoopStructureUnderstood() {
619 BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
620 for (PHINode *InnerInduction : InnerLoopInductions) {
621 unsigned Num = InnerInduction->getNumOperands();
622 for (unsigned i = 0; i < Num; ++i) {
623 Value *Val = InnerInduction->getOperand(i);
624 if (isa<Constant>(Val))
625 continue;
626 Instruction *I = dyn_cast<Instruction>(Val);
627 if (!I)
628 return false;
629 // TODO: Handle triangular loops.
630 // e.g. for(int i=0;i<N;i++)
631 // for(int j=i;j<N;j++)
632 unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
633 if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
634 InnerLoopPreheader &&
635 !OuterLoop->isLoopInvariant(I)) {
636 return false;
637 }
638 }
639 }
640
641 // TODO: Handle triangular loops of another form.
642 // e.g. for(int i=0;i<N;i++)
643 // for(int j=0;j<i;j++)
644 // or,
645 // for(int i=0;i<N;i++)
646 // for(int j=0;j*i<N;j++)
647 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
648 BranchInst *InnerLoopLatchBI =
649 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
650 if (!InnerLoopLatchBI->isConditional())
651 return false;
652 if (CmpInst *InnerLoopCmp =
653 dyn_cast<CmpInst>(InnerLoopLatchBI->getCondition())) {
654 Value *Op0 = InnerLoopCmp->getOperand(0);
655 Value *Op1 = InnerLoopCmp->getOperand(1);
656
657 // LHS and RHS of the inner loop exit condition, e.g.,
658 // in "for(int j=0;j<i;j++)", LHS is j and RHS is i.
659 Value *Left = nullptr;
660 Value *Right = nullptr;
661
662 // Check if V only involves inner loop induction variable.
663 // Return true if V is InnerInduction, or a cast from
664 // InnerInduction, or a binary operator that involves
665 // InnerInduction and a constant.
666 std::function<bool(Value *)> IsPathToInnerIndVar;
667 IsPathToInnerIndVar = [this, &IsPathToInnerIndVar](const Value *V) -> bool {
668 if (llvm::is_contained(InnerLoopInductions, V))
669 return true;
670 if (isa<Constant>(V))
671 return true;
672 const Instruction *I = dyn_cast<Instruction>(V);
673 if (!I)
674 return false;
675 if (isa<CastInst>(I))
676 return IsPathToInnerIndVar(I->getOperand(0));
677 if (isa<BinaryOperator>(I))
678 return IsPathToInnerIndVar(I->getOperand(0)) &&
679 IsPathToInnerIndVar(I->getOperand(1));
680 return false;
681 };
682
683 // In case of multiple inner loop indvars, it is okay if LHS and RHS
684 // are both inner indvar related variables.
685 if (IsPathToInnerIndVar(Op0) && IsPathToInnerIndVar(Op1))
686 return true;
687
688 // Otherwise we check if the cmp instruction compares an inner indvar
689 // related variable (Left) with a outer loop invariant (Right).
690 if (IsPathToInnerIndVar(Op0) && !isa<Constant>(Op0)) {
691 Left = Op0;
692 Right = Op1;
693 } else if (IsPathToInnerIndVar(Op1) && !isa<Constant>(Op1)) {
694 Left = Op1;
695 Right = Op0;
696 }
697
698 if (Left == nullptr)
699 return false;
700
701 const SCEV *S = SE->getSCEV(Right);
702 if (!SE->isLoopInvariant(S, OuterLoop))
703 return false;
704 }
705
706 return true;
707}
708
709// If SV is a LCSSA PHI node with a single incoming value, return the incoming
710// value.
711static Value *followLCSSA(Value *SV) {
712 PHINode *PHI = dyn_cast<PHINode>(SV);
713 if (!PHI)
714 return SV;
715
716 if (PHI->getNumIncomingValues() != 1)
717 return SV;
718 return followLCSSA(PHI->getIncomingValue(0));
719}
720
721// Check V's users to see if it is involved in a reduction in L.
723 // Reduction variables cannot be constants.
724 if (isa<Constant>(V))
725 return nullptr;
726
727 for (Value *User : V->users()) {
728 if (PHINode *PHI = dyn_cast<PHINode>(User)) {
729 if (PHI->getNumIncomingValues() == 1)
730 continue;
733 // Detect floating point reduction only when it can be reordered.
734 if (RD.getExactFPMathInst() != nullptr)
735 return nullptr;
736 return PHI;
737 }
738 return nullptr;
739 }
740 }
741
742 return nullptr;
743}
744
745bool LoopInterchangeLegality::findInductionAndReductions(
746 Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
747 if (!L->getLoopLatch() || !L->getLoopPredecessor())
748 return false;
749 for (PHINode &PHI : L->getHeader()->phis()) {
752 Inductions.push_back(&PHI);
753 else {
754 // PHIs in inner loops need to be part of a reduction in the outer loop,
755 // discovered when checking the PHIs of the outer loop earlier.
756 if (!InnerLoop) {
757 if (!OuterInnerReductions.count(&PHI)) {
758 LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions "
759 "across the outer loop.\n");
760 return false;
761 }
762 } else {
763 assert(PHI.getNumIncomingValues() == 2 &&
764 "Phis in loop header should have exactly 2 incoming values");
765 // Check if we have a PHI node in the outer loop that has a reduction
766 // result from the inner loop as an incoming value.
767 Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
768 PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V);
769 if (!InnerRedPhi ||
770 !llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) {
772 dbgs()
773 << "Failed to recognize PHI as an induction or reduction.\n");
774 return false;
775 }
776 OuterInnerReductions.insert(&PHI);
777 OuterInnerReductions.insert(InnerRedPhi);
778 }
779 }
780 }
781 return true;
782}
783
784// This function indicates the current limitations in the transform as a result
785// of which we do not proceed.
786bool LoopInterchangeLegality::currentLimitations() {
787 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
788
789 // transform currently expects the loop latches to also be the exiting
790 // blocks.
791 if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
792 OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
793 !isa<BranchInst>(InnerLoopLatch->getTerminator()) ||
794 !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) {
796 dbgs() << "Loops where the latch is not the exiting block are not"
797 << " supported currently.\n");
798 ORE->emit([&]() {
799 return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
800 OuterLoop->getStartLoc(),
801 OuterLoop->getHeader())
802 << "Loops where the latch is not the exiting block cannot be"
803 " interchange currently.";
804 });
805 return true;
806 }
807
808 SmallVector<PHINode *, 8> Inductions;
809 if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
811 dbgs() << "Only outer loops with induction or reduction PHI nodes "
812 << "are supported currently.\n");
813 ORE->emit([&]() {
814 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
815 OuterLoop->getStartLoc(),
816 OuterLoop->getHeader())
817 << "Only outer loops with induction or reduction PHI nodes can be"
818 " interchanged currently.";
819 });
820 return true;
821 }
822
823 Inductions.clear();
824 // For multi-level loop nests, make sure that all phi nodes for inner loops
825 // at all levels can be recognized as a induction or reduction phi. Bail out
826 // if a phi node at a certain nesting level cannot be properly recognized.
827 Loop *CurLevelLoop = OuterLoop;
828 while (!CurLevelLoop->getSubLoops().empty()) {
829 // We already made sure that the loop nest is tightly nested.
830 CurLevelLoop = CurLevelLoop->getSubLoops().front();
831 if (!findInductionAndReductions(CurLevelLoop, Inductions, nullptr)) {
833 dbgs() << "Only inner loops with induction or reduction PHI nodes "
834 << "are supported currently.\n");
835 ORE->emit([&]() {
836 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
837 CurLevelLoop->getStartLoc(),
838 CurLevelLoop->getHeader())
839 << "Only inner loops with induction or reduction PHI nodes can be"
840 " interchange currently.";
841 });
842 return true;
843 }
844 }
845
846 // TODO: Triangular loops are not handled for now.
847 if (!isLoopStructureUnderstood()) {
848 LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
849 ORE->emit([&]() {
850 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
851 InnerLoop->getStartLoc(),
852 InnerLoop->getHeader())
853 << "Inner loop structure not understood currently.";
854 });
855 return true;
856 }
857
858 return false;
859}
860
861bool LoopInterchangeLegality::findInductions(
862 Loop *L, SmallVectorImpl<PHINode *> &Inductions) {
863 for (PHINode &PHI : L->getHeader()->phis()) {
866 Inductions.push_back(&PHI);
867 }
868 return !Inductions.empty();
869}
870
871// We currently only support LCSSA PHI nodes in the inner loop exit, if their
872// users are either reduction PHIs or PHIs outside the outer loop (which means
873// the we are only interested in the final value after the loop).
874static bool
877 BasicBlock *InnerExit = OuterL->getUniqueExitBlock();
878 for (PHINode &PHI : InnerExit->phis()) {
879 // Reduction lcssa phi will have only 1 incoming block that from loop latch.
880 if (PHI.getNumIncomingValues() > 1)
881 return false;
882 if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
883 PHINode *PN = dyn_cast<PHINode>(U);
884 return !PN ||
885 (!Reductions.count(PN) && OuterL->contains(PN->getParent()));
886 })) {
887 return false;
888 }
889 }
890 return true;
891}
892
893// We currently support LCSSA PHI nodes in the outer loop exit, if their
894// incoming values do not come from the outer loop latch or if the
895// outer loop latch has a single predecessor. In that case, the value will
896// be available if both the inner and outer loop conditions are true, which
897// will still be true after interchanging. If we have multiple predecessor,
898// that may not be the case, e.g. because the outer loop latch may be executed
899// if the inner loop is not executed.
900static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
901 BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
902 for (PHINode &PHI : LoopNestExit->phis()) {
903 for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {
904 Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
905 if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
906 continue;
907
908 // The incoming value is defined in the outer loop latch. Currently we
909 // only support that in case the outer loop latch has a single predecessor.
910 // This guarantees that the outer loop latch is executed if and only if
911 // the inner loop is executed (because tightlyNested() guarantees that the
912 // outer loop header only branches to the inner loop or the outer loop
913 // latch).
914 // FIXME: We could weaken this logic and allow multiple predecessors,
915 // if the values are produced outside the loop latch. We would need
916 // additional logic to update the PHI nodes in the exit block as
917 // well.
918 if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
919 return false;
920 }
921 }
922 return true;
923}
924
925// In case of multi-level nested loops, it may occur that lcssa phis exist in
926// the latch of InnerLoop, i.e., when defs of the incoming values are further
927// inside the loopnest. Sometimes those incoming values are not available
928// after interchange, since the original inner latch will become the new outer
929// latch which may have predecessor paths that do not include those incoming
930// values.
931// TODO: Handle transformation of lcssa phis in the InnerLoop latch in case of
932// multi-level loop nests.
933static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
934 if (InnerLoop->getSubLoops().empty())
935 return true;
936 // If the original outer latch has only one predecessor, then values defined
937 // further inside the looploop, e.g., in the innermost loop, will be available
938 // at the new outer latch after interchange.
939 if (OuterLoop->getLoopLatch()->getUniquePredecessor() != nullptr)
940 return true;
941
942 // The outer latch has more than one predecessors, i.e., the inner
943 // exit and the inner header.
944 // PHI nodes in the inner latch are lcssa phis where the incoming values
945 // are defined further inside the loopnest. Check if those phis are used
946 // in the original inner latch. If that is the case then bail out since
947 // those incoming values may not be available at the new outer latch.
948 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
949 for (PHINode &PHI : InnerLoopLatch->phis()) {
950 for (auto *U : PHI.users()) {
951 Instruction *UI = cast<Instruction>(U);
952 if (InnerLoopLatch == UI->getParent())
953 return false;
954 }
955 }
956 return true;
957}
958
959bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
960 unsigned OuterLoopId,
961 CharMatrix &DepMatrix) {
962 if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
963 LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
964 << " and OuterLoopId = " << OuterLoopId
965 << " due to dependence\n");
966 ORE->emit([&]() {
967 return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
968 InnerLoop->getStartLoc(),
969 InnerLoop->getHeader())
970 << "Cannot interchange loops due to dependences.";
971 });
972 return false;
973 }
974 // Check if outer and inner loop contain legal instructions only.
975 for (auto *BB : OuterLoop->blocks())
977 if (CallInst *CI = dyn_cast<CallInst>(&I)) {
978 // readnone functions do not prevent interchanging.
979 if (CI->onlyWritesMemory())
980 continue;
982 dbgs() << "Loops with call instructions cannot be interchanged "
983 << "safely.");
984 ORE->emit([&]() {
985 return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
986 CI->getDebugLoc(),
987 CI->getParent())
988 << "Cannot interchange loops due to call instruction.";
989 });
990
991 return false;
992 }
993
994 if (!findInductions(InnerLoop, InnerLoopInductions)) {
995 LLVM_DEBUG(dbgs() << "Could not find inner loop induction variables.\n");
996 return false;
997 }
998
999 if (!areInnerLoopLatchPHIsSupported(OuterLoop, InnerLoop)) {
1000 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop latch.\n");
1001 ORE->emit([&]() {
1002 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedInnerLatchPHI",
1003 InnerLoop->getStartLoc(),
1004 InnerLoop->getHeader())
1005 << "Cannot interchange loops because unsupported PHI nodes found "
1006 "in inner loop latch.";
1007 });
1008 return false;
1009 }
1010
1011 // TODO: The loops could not be interchanged due to current limitations in the
1012 // transform module.
1013 if (currentLimitations()) {
1014 LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
1015 return false;
1016 }
1017
1018 // Check if the loops are tightly nested.
1019 if (!tightlyNested(OuterLoop, InnerLoop)) {
1020 LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
1021 ORE->emit([&]() {
1022 return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
1023 InnerLoop->getStartLoc(),
1024 InnerLoop->getHeader())
1025 << "Cannot interchange loops because they are not tightly "
1026 "nested.";
1027 });
1028 return false;
1029 }
1030
1031 if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop,
1032 OuterInnerReductions)) {
1033 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
1034 ORE->emit([&]() {
1035 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1036 InnerLoop->getStartLoc(),
1037 InnerLoop->getHeader())
1038 << "Found unsupported PHI node in loop exit.";
1039 });
1040 return false;
1041 }
1042
1043 if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
1044 LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
1045 ORE->emit([&]() {
1046 return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
1047 OuterLoop->getStartLoc(),
1048 OuterLoop->getHeader())
1049 << "Found unsupported PHI node in loop exit.";
1050 });
1051 return false;
1052 }
1053
1054 return true;
1055}
1056
1057int LoopInterchangeProfitability::getInstrOrderCost() {
1058 unsigned GoodOrder, BadOrder;
1059 BadOrder = GoodOrder = 0;
1060 for (BasicBlock *BB : InnerLoop->blocks()) {
1061 for (Instruction &Ins : *BB) {
1062 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
1063 unsigned NumOp = GEP->getNumOperands();
1064 bool FoundInnerInduction = false;
1065 bool FoundOuterInduction = false;
1066 for (unsigned i = 0; i < NumOp; ++i) {
1067 // Skip operands that are not SCEV-able.
1068 if (!SE->isSCEVable(GEP->getOperand(i)->getType()))
1069 continue;
1070
1071 const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
1072 const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
1073 if (!AR)
1074 continue;
1075
1076 // If we find the inner induction after an outer induction e.g.
1077 // for(int i=0;i<N;i++)
1078 // for(int j=0;j<N;j++)
1079 // A[i][j] = A[i-1][j-1]+k;
1080 // then it is a good order.
1081 if (AR->getLoop() == InnerLoop) {
1082 // We found an InnerLoop induction after OuterLoop induction. It is
1083 // a good order.
1084 FoundInnerInduction = true;
1085 if (FoundOuterInduction) {
1086 GoodOrder++;
1087 break;
1088 }
1089 }
1090 // If we find the outer induction after an inner induction e.g.
1091 // for(int i=0;i<N;i++)
1092 // for(int j=0;j<N;j++)
1093 // A[j][i] = A[j-1][i-1]+k;
1094 // then it is a bad order.
1095 if (AR->getLoop() == OuterLoop) {
1096 // We found an OuterLoop induction after InnerLoop induction. It is
1097 // a bad order.
1098 FoundOuterInduction = true;
1099 if (FoundInnerInduction) {
1100 BadOrder++;
1101 break;
1102 }
1103 }
1104 }
1105 }
1106 }
1107 }
1108 return GoodOrder - BadOrder;
1109}
1110
1111std::optional<bool>
1112LoopInterchangeProfitability::isProfitablePerLoopCacheAnalysis(
1113 const DenseMap<const Loop *, unsigned> &CostMap,
1114 std::unique_ptr<CacheCost> &CC) {
1115 // This is the new cost model returned from loop cache analysis.
1116 // A smaller index means the loop should be placed an outer loop, and vice
1117 // versa.
1118 if (CostMap.contains(InnerLoop) && CostMap.contains(OuterLoop)) {
1119 unsigned InnerIndex = 0, OuterIndex = 0;
1120 InnerIndex = CostMap.find(InnerLoop)->second;
1121 OuterIndex = CostMap.find(OuterLoop)->second;
1122 LLVM_DEBUG(dbgs() << "InnerIndex = " << InnerIndex
1123 << ", OuterIndex = " << OuterIndex << "\n");
1124 if (InnerIndex < OuterIndex)
1125 return std::optional<bool>(true);
1126 assert(InnerIndex != OuterIndex && "CostMap should assign unique "
1127 "numbers to each loop");
1128 if (CC->getLoopCost(*OuterLoop) == CC->getLoopCost(*InnerLoop))
1129 return std::nullopt;
1130 return std::optional<bool>(false);
1131 }
1132 return std::nullopt;
1133}
1134
1135std::optional<bool>
1136LoopInterchangeProfitability::isProfitablePerInstrOrderCost() {
1137 // Legacy cost model: this is rough cost estimation algorithm. It counts the
1138 // good and bad order of induction variables in the instruction and allows
1139 // reordering if number of bad orders is more than good.
1140 int Cost = getInstrOrderCost();
1141 LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
1143 return std::optional<bool>(true);
1144
1145 return std::nullopt;
1146}
1147
1148std::optional<bool> LoopInterchangeProfitability::isProfitableForVectorization(
1149 unsigned InnerLoopId, unsigned OuterLoopId, CharMatrix &DepMatrix) {
1150 for (auto &Row : DepMatrix) {
1151 // If the inner loop is loop independent or doesn't carry any dependency
1152 // it is not profitable to move this to outer position, since we are
1153 // likely able to do inner loop vectorization already.
1154 if (Row[InnerLoopId] == 'I' || Row[InnerLoopId] == '=')
1155 return std::optional<bool>(false);
1156
1157 // If the outer loop is not loop independent it is not profitable to move
1158 // this to inner position, since doing so would not enable inner loop
1159 // parallelism.
1160 if (Row[OuterLoopId] != 'I' && Row[OuterLoopId] != '=')
1161 return std::optional<bool>(false);
1162 }
1163 // If inner loop has dependence and outer loop is loop independent then it
1164 // is/ profitable to interchange to enable inner loop parallelism.
1165 // If there are no dependences, interchanging will not improve anything.
1166 return std::optional<bool>(!DepMatrix.empty());
1167}
1168
1169bool LoopInterchangeProfitability::isProfitable(
1170 const Loop *InnerLoop, const Loop *OuterLoop, unsigned InnerLoopId,
1171 unsigned OuterLoopId, CharMatrix &DepMatrix,
1172 const DenseMap<const Loop *, unsigned> &CostMap,
1173 std::unique_ptr<CacheCost> &CC) {
1174 // isProfitable() is structured to avoid endless loop interchange.
1175 // If loop cache analysis could decide the profitability then,
1176 // profitability check will stop and return the analysis result.
1177 // If cache analysis failed to analyze the loopnest (e.g.,
1178 // due to delinearization issues) then only check whether it is
1179 // profitable for InstrOrderCost. Likewise, if InstrOrderCost failed to
1180 // analysis the profitability then only, isProfitableForVectorization
1181 // will decide.
1182 std::optional<bool> shouldInterchange =
1183 isProfitablePerLoopCacheAnalysis(CostMap, CC);
1184 if (!shouldInterchange.has_value()) {
1185 shouldInterchange = isProfitablePerInstrOrderCost();
1186 if (!shouldInterchange.has_value())
1187 shouldInterchange =
1188 isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix);
1189 }
1190 if (!shouldInterchange.has_value()) {
1191 ORE->emit([&]() {
1192 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1193 InnerLoop->getStartLoc(),
1194 InnerLoop->getHeader())
1195 << "Insufficient information to calculate the cost of loop for "
1196 "interchange.";
1197 });
1198 return false;
1199 } else if (!shouldInterchange.value()) {
1200 ORE->emit([&]() {
1201 return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
1202 InnerLoop->getStartLoc(),
1203 InnerLoop->getHeader())
1204 << "Interchanging loops is not considered to improve cache "
1205 "locality nor vectorization.";
1206 });
1207 return false;
1208 }
1209 return true;
1210}
1211
1212void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
1213 Loop *InnerLoop) {
1214 for (Loop *L : *OuterLoop)
1215 if (L == InnerLoop) {
1216 OuterLoop->removeChildLoop(L);
1217 return;
1218 }
1219 llvm_unreachable("Couldn't find loop");
1220}
1221
1222/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
1223/// new inner and outer loop after interchanging: NewInner is the original
1224/// outer loop and NewOuter is the original inner loop.
1225///
1226/// Before interchanging, we have the following structure
1227/// Outer preheader
1228// Outer header
1229// Inner preheader
1230// Inner header
1231// Inner body
1232// Inner latch
1233// outer bbs
1234// Outer latch
1235//
1236// After interchanging:
1237// Inner preheader
1238// Inner header
1239// Outer preheader
1240// Outer header
1241// Inner body
1242// outer bbs
1243// Outer latch
1244// Inner latch
1245void LoopInterchangeTransform::restructureLoops(
1246 Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
1247 BasicBlock *OrigOuterPreHeader) {
1248 Loop *OuterLoopParent = OuterLoop->getParentLoop();
1249 // The original inner loop preheader moves from the new inner loop to
1250 // the parent loop, if there is one.
1251 NewInner->removeBlockFromLoop(OrigInnerPreHeader);
1252 LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
1253
1254 // Switch the loop levels.
1255 if (OuterLoopParent) {
1256 // Remove the loop from its parent loop.
1257 removeChildLoop(OuterLoopParent, NewInner);
1258 removeChildLoop(NewInner, NewOuter);
1259 OuterLoopParent->addChildLoop(NewOuter);
1260 } else {
1261 removeChildLoop(NewInner, NewOuter);
1262 LI->changeTopLevelLoop(NewInner, NewOuter);
1263 }
1264 while (!NewOuter->isInnermost())
1265 NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
1266 NewOuter->addChildLoop(NewInner);
1267
1268 // BBs from the original inner loop.
1269 SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
1270
1271 // Add BBs from the original outer loop to the original inner loop (excluding
1272 // BBs already in inner loop)
1273 for (BasicBlock *BB : NewInner->blocks())
1274 if (LI->getLoopFor(BB) == NewInner)
1275 NewOuter->addBlockEntry(BB);
1276
1277 // Now remove inner loop header and latch from the new inner loop and move
1278 // other BBs (the loop body) to the new inner loop.
1279 BasicBlock *OuterHeader = NewOuter->getHeader();
1280 BasicBlock *OuterLatch = NewOuter->getLoopLatch();
1281 for (BasicBlock *BB : OrigInnerBBs) {
1282 // Nothing will change for BBs in child loops.
1283 if (LI->getLoopFor(BB) != NewOuter)
1284 continue;
1285 // Remove the new outer loop header and latch from the new inner loop.
1286 if (BB == OuterHeader || BB == OuterLatch)
1287 NewInner->removeBlockFromLoop(BB);
1288 else
1289 LI->changeLoopFor(BB, NewInner);
1290 }
1291
1292 // The preheader of the original outer loop becomes part of the new
1293 // outer loop.
1294 NewOuter->addBlockEntry(OrigOuterPreHeader);
1295 LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
1296
1297 // Tell SE that we move the loops around.
1298 SE->forgetLoop(NewOuter);
1299}
1300
1301bool LoopInterchangeTransform::transform() {
1302 bool Transformed = false;
1303
1304 if (InnerLoop->getSubLoops().empty()) {
1305 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1306 LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
1307 auto &InductionPHIs = LIL.getInnerLoopInductions();
1308 if (InductionPHIs.empty()) {
1309 LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
1310 return false;
1311 }
1312
1313 SmallVector<Instruction *, 8> InnerIndexVarList;
1314 for (PHINode *CurInductionPHI : InductionPHIs) {
1315 if (CurInductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
1316 InnerIndexVarList.push_back(
1317 dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(1)));
1318 else
1319 InnerIndexVarList.push_back(
1320 dyn_cast<Instruction>(CurInductionPHI->getIncomingValue(0)));
1321 }
1322
1323 // Create a new latch block for the inner loop. We split at the
1324 // current latch's terminator and then move the condition and all
1325 // operands that are not either loop-invariant or the induction PHI into the
1326 // new latch block.
1327 BasicBlock *NewLatch =
1328 SplitBlock(InnerLoop->getLoopLatch(),
1329 InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
1330
1332 unsigned i = 0;
1333 auto MoveInstructions = [&i, &WorkList, this, &InductionPHIs, NewLatch]() {
1334 for (; i < WorkList.size(); i++) {
1335 // Duplicate instruction and move it the new latch. Update uses that
1336 // have been moved.
1337 Instruction *NewI = WorkList[i]->clone();
1338 NewI->insertBefore(NewLatch->getFirstNonPHI());
1339 assert(!NewI->mayHaveSideEffects() &&
1340 "Moving instructions with side-effects may change behavior of "
1341 "the loop nest!");
1342 for (Use &U : llvm::make_early_inc_range(WorkList[i]->uses())) {
1343 Instruction *UserI = cast<Instruction>(U.getUser());
1344 if (!InnerLoop->contains(UserI->getParent()) ||
1345 UserI->getParent() == NewLatch ||
1346 llvm::is_contained(InductionPHIs, UserI))
1347 U.set(NewI);
1348 }
1349 // Add operands of moved instruction to the worklist, except if they are
1350 // outside the inner loop or are the induction PHI.
1351 for (Value *Op : WorkList[i]->operands()) {
1352 Instruction *OpI = dyn_cast<Instruction>(Op);
1353 if (!OpI ||
1354 this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
1355 llvm::is_contained(InductionPHIs, OpI))
1356 continue;
1357 WorkList.insert(OpI);
1358 }
1359 }
1360 };
1361
1362 // FIXME: Should we interchange when we have a constant condition?
1363 Instruction *CondI = dyn_cast<Instruction>(
1364 cast<BranchInst>(InnerLoop->getLoopLatch()->getTerminator())
1365 ->getCondition());
1366 if (CondI)
1367 WorkList.insert(CondI);
1368 MoveInstructions();
1369 for (Instruction *InnerIndexVar : InnerIndexVarList)
1370 WorkList.insert(cast<Instruction>(InnerIndexVar));
1371 MoveInstructions();
1372 }
1373
1374 // Ensure the inner loop phi nodes have a separate basic block.
1375 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1376 if (InnerLoopHeader->getFirstNonPHI() != InnerLoopHeader->getTerminator()) {
1377 SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
1378 LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
1379 }
1380
1381 // Instructions in the original inner loop preheader may depend on values
1382 // defined in the outer loop header. Move them there, because the original
1383 // inner loop preheader will become the entry into the interchanged loop nest.
1384 // Currently we move all instructions and rely on LICM to move invariant
1385 // instructions outside the loop nest.
1386 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1387 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1388 if (InnerLoopPreHeader != OuterLoopHeader) {
1390 for (Instruction &I :
1391 make_early_inc_range(make_range(InnerLoopPreHeader->begin(),
1392 std::prev(InnerLoopPreHeader->end()))))
1393 I.moveBeforePreserving(OuterLoopHeader->getTerminator());
1394 }
1395
1396 Transformed |= adjustLoopLinks();
1397 if (!Transformed) {
1398 LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
1399 return false;
1400 }
1401
1402 return true;
1403}
1404
1405/// \brief Move all instructions except the terminator from FromBB right before
1406/// InsertBefore
1407static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
1408 BasicBlock *ToBB = InsertBefore->getParent();
1409
1410 ToBB->splice(InsertBefore->getIterator(), FromBB, FromBB->begin(),
1411 FromBB->getTerminator()->getIterator());
1412}
1413
1414/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
1415static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
1416 // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
1417 // from BB1 afterwards.
1418 auto Iter = map_range(*BB1, [](Instruction &I) { return &I; });
1419 SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end()));
1420 for (Instruction *I : TempInstrs)
1421 I->removeFromParent();
1422
1423 // Move instructions from BB2 to BB1.
1424 moveBBContents(BB2, BB1->getTerminator());
1425
1426 // Move instructions from TempInstrs to BB2.
1427 for (Instruction *I : TempInstrs)
1428 I->insertBefore(BB2->getTerminator());
1429}
1430
1431// Update BI to jump to NewBB instead of OldBB. Records updates to the
1432// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
1433// \p OldBB is exactly once in BI's successor list.
1434static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,
1435 BasicBlock *NewBB,
1436 std::vector<DominatorTree::UpdateType> &DTUpdates,
1437 bool MustUpdateOnce = true) {
1438 assert((!MustUpdateOnce ||
1440 [OldBB](BasicBlock *BB) {
1441 return BB == OldBB;
1442 }) == 1) && "BI must jump to OldBB exactly once.");
1443 bool Changed = false;
1444 for (Use &Op : BI->operands())
1445 if (Op == OldBB) {
1446 Op.set(NewBB);
1447 Changed = true;
1448 }
1449
1450 if (Changed) {
1451 DTUpdates.push_back(
1452 {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});
1453 DTUpdates.push_back(
1454 {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});
1455 }
1456 assert(Changed && "Expected a successor to be updated");
1457}
1458
1459// Move Lcssa PHIs to the right place.
1460static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
1461 BasicBlock *InnerLatch, BasicBlock *OuterHeader,
1462 BasicBlock *OuterLatch, BasicBlock *OuterExit,
1463 Loop *InnerLoop, LoopInfo *LI) {
1464
1465 // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
1466 // defined either in the header or latch. Those blocks will become header and
1467 // latch of the new outer loop, and the only possible users can PHI nodes
1468 // in the exit block of the loop nest or the outer loop header (reduction
1469 // PHIs, in that case, the incoming value must be defined in the inner loop
1470 // header). We can just substitute the user with the incoming value and remove
1471 // the PHI.
1472 for (PHINode &P : make_early_inc_range(InnerExit->phis())) {
1473 assert(P.getNumIncomingValues() == 1 &&
1474 "Only loops with a single exit are supported!");
1475
1476 // Incoming values are guaranteed be instructions currently.
1477 auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch));
1478 // In case of multi-level nested loops, follow LCSSA to find the incoming
1479 // value defined from the innermost loop.
1480 auto IncIInnerMost = cast<Instruction>(followLCSSA(IncI));
1481 // Skip phis with incoming values from the inner loop body, excluding the
1482 // header and latch.
1483 if (IncIInnerMost->getParent() != InnerLatch &&
1484 IncIInnerMost->getParent() != InnerHeader)
1485 continue;
1486
1487 assert(all_of(P.users(),
1488 [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
1489 return (cast<PHINode>(U)->getParent() == OuterHeader &&
1490 IncI->getParent() == InnerHeader) ||
1491 cast<PHINode>(U)->getParent() == OuterExit;
1492 }) &&
1493 "Can only replace phis iff the uses are in the loop nest exit or "
1494 "the incoming value is defined in the inner header (it will "
1495 "dominate all loop blocks after interchanging)");
1496 P.replaceAllUsesWith(IncI);
1497 P.eraseFromParent();
1498 }
1499
1500 SmallVector<PHINode *, 8> LcssaInnerExit;
1501 for (PHINode &P : InnerExit->phis())
1502 LcssaInnerExit.push_back(&P);
1503
1504 SmallVector<PHINode *, 8> LcssaInnerLatch;
1505 for (PHINode &P : InnerLatch->phis())
1506 LcssaInnerLatch.push_back(&P);
1507
1508 // Lcssa PHIs for values used outside the inner loop are in InnerExit.
1509 // If a PHI node has users outside of InnerExit, it has a use outside the
1510 // interchanged loop and we have to preserve it. We move these to
1511 // InnerLatch, which will become the new exit block for the innermost
1512 // loop after interchanging.
1513 for (PHINode *P : LcssaInnerExit)
1514 P->moveBefore(InnerLatch->getFirstNonPHI());
1515
1516 // If the inner loop latch contains LCSSA PHIs, those come from a child loop
1517 // and we have to move them to the new inner latch.
1518 for (PHINode *P : LcssaInnerLatch)
1519 P->moveBefore(InnerExit->getFirstNonPHI());
1520
1521 // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
1522 // incoming values defined in the outer loop, we have to add a new PHI
1523 // in the inner loop latch, which became the exit block of the outer loop,
1524 // after interchanging.
1525 if (OuterExit) {
1526 for (PHINode &P : OuterExit->phis()) {
1527 if (P.getNumIncomingValues() != 1)
1528 continue;
1529 // Skip Phis with incoming values defined in the inner loop. Those should
1530 // already have been updated.
1531 auto I = dyn_cast<Instruction>(P.getIncomingValue(0));
1532 if (!I || LI->getLoopFor(I->getParent()) == InnerLoop)
1533 continue;
1534
1535 PHINode *NewPhi = dyn_cast<PHINode>(P.clone());
1536 NewPhi->setIncomingValue(0, P.getIncomingValue(0));
1537 NewPhi->setIncomingBlock(0, OuterLatch);
1538 // We might have incoming edges from other BBs, i.e., the original outer
1539 // header.
1540 for (auto *Pred : predecessors(InnerLatch)) {
1541 if (Pred == OuterLatch)
1542 continue;
1543 NewPhi->addIncoming(P.getIncomingValue(0), Pred);
1544 }
1545 NewPhi->insertBefore(InnerLatch->getFirstNonPHI());
1546 P.setIncomingValue(0, NewPhi);
1547 }
1548 }
1549
1550 // Now adjust the incoming blocks for the LCSSA PHIs.
1551 // For PHIs moved from Inner's exit block, we need to replace Inner's latch
1552 // with the new latch.
1553 InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
1554}
1555
1556bool LoopInterchangeTransform::adjustLoopBranches() {
1557 LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
1558 std::vector<DominatorTree::UpdateType> DTUpdates;
1559
1560 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1561 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1562
1563 assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
1564 InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
1565 InnerLoopPreHeader && "Guaranteed by loop-simplify form");
1566 // Ensure that both preheaders do not contain PHI nodes and have single
1567 // predecessors. This allows us to move them easily. We use
1568 // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
1569 // preheaders do not satisfy those conditions.
1570 if (isa<PHINode>(OuterLoopPreHeader->begin()) ||
1571 !OuterLoopPreHeader->getUniquePredecessor())
1572 OuterLoopPreHeader =
1573 InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true);
1574 if (InnerLoopPreHeader == OuterLoop->getHeader())
1575 InnerLoopPreHeader =
1576 InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true);
1577
1578 // Adjust the loop preheader
1579 BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
1580 BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
1581 BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
1582 BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
1583 BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
1584 BasicBlock *InnerLoopLatchPredecessor =
1585 InnerLoopLatch->getUniquePredecessor();
1586 BasicBlock *InnerLoopLatchSuccessor;
1587 BasicBlock *OuterLoopLatchSuccessor;
1588
1589 BranchInst *OuterLoopLatchBI =
1590 dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
1591 BranchInst *InnerLoopLatchBI =
1592 dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
1593 BranchInst *OuterLoopHeaderBI =
1594 dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
1595 BranchInst *InnerLoopHeaderBI =
1596 dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
1597
1598 if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
1599 !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
1600 !InnerLoopHeaderBI)
1601 return false;
1602
1603 BranchInst *InnerLoopLatchPredecessorBI =
1604 dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
1605 BranchInst *OuterLoopPredecessorBI =
1606 dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
1607
1608 if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
1609 return false;
1610 BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
1611 if (!InnerLoopHeaderSuccessor)
1612 return false;
1613
1614 // Adjust Loop Preheader and headers.
1615 // The branches in the outer loop predecessor and the outer loop header can
1616 // be unconditional branches or conditional branches with duplicates. Consider
1617 // this when updating the successors.
1618 updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
1619 InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
1620 // The outer loop header might or might not branch to the outer latch.
1621 // We are guaranteed to branch to the inner loop preheader.
1622 if (llvm::is_contained(OuterLoopHeaderBI->successors(), OuterLoopLatch)) {
1623 // In this case the outerLoopHeader should branch to the InnerLoopLatch.
1624 updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, InnerLoopLatch,
1625 DTUpdates,
1626 /*MustUpdateOnce=*/false);
1627 }
1628 updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
1629 InnerLoopHeaderSuccessor, DTUpdates,
1630 /*MustUpdateOnce=*/false);
1631
1632 // Adjust reduction PHI's now that the incoming block has changed.
1633 InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader,
1634 OuterLoopHeader);
1635
1636 updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
1637 OuterLoopPreHeader, DTUpdates);
1638
1639 // -------------Adjust loop latches-----------
1640 if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
1641 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
1642 else
1643 InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
1644
1645 updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
1646 InnerLoopLatchSuccessor, DTUpdates);
1647
1648 if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
1649 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
1650 else
1651 OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
1652
1653 updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
1654 OuterLoopLatchSuccessor, DTUpdates);
1655 updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
1656 DTUpdates);
1657
1658 DT->applyUpdates(DTUpdates);
1659 restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
1660 OuterLoopPreHeader);
1661
1662 moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
1663 OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(),
1664 InnerLoop, LI);
1665 // For PHIs in the exit block of the outer loop, outer's latch has been
1666 // replaced by Inners'.
1667 OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
1668
1669 auto &OuterInnerReductions = LIL.getOuterInnerReductions();
1670 // Now update the reduction PHIs in the inner and outer loop headers.
1671 SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
1672 for (PHINode &PHI : InnerLoopHeader->phis())
1673 if (OuterInnerReductions.contains(&PHI))
1674 InnerLoopPHIs.push_back(&PHI);
1675
1676 for (PHINode &PHI : OuterLoopHeader->phis())
1677 if (OuterInnerReductions.contains(&PHI))
1678 OuterLoopPHIs.push_back(&PHI);
1679
1680 // Now move the remaining reduction PHIs from outer to inner loop header and
1681 // vice versa. The PHI nodes must be part of a reduction across the inner and
1682 // outer loop and all the remains to do is and updating the incoming blocks.
1683 for (PHINode *PHI : OuterLoopPHIs) {
1684 LLVM_DEBUG(dbgs() << "Outer loop reduction PHIs:\n"; PHI->dump(););
1685 PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());
1686 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
1687 }
1688 for (PHINode *PHI : InnerLoopPHIs) {
1689 LLVM_DEBUG(dbgs() << "Inner loop reduction PHIs:\n"; PHI->dump(););
1690 PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
1691 assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
1692 }
1693
1694 // Update the incoming blocks for moved PHI nodes.
1695 OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader);
1696 OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch);
1697 InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);
1698 InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
1699
1700 // Values defined in the outer loop header could be used in the inner loop
1701 // latch. In that case, we need to create LCSSA phis for them, because after
1702 // interchanging they will be defined in the new inner loop and used in the
1703 // new outer loop.
1704 SmallVector<Instruction *, 4> MayNeedLCSSAPhis;
1705 for (Instruction &I :
1706 make_range(OuterLoopHeader->begin(), std::prev(OuterLoopHeader->end())))
1707 MayNeedLCSSAPhis.push_back(&I);
1708 formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE);
1709
1710 return true;
1711}
1712
1713bool LoopInterchangeTransform::adjustLoopLinks() {
1714 // Adjust all branches in the inner and outer loop.
1715 bool Changed = adjustLoopBranches();
1716 if (Changed) {
1717 // We have interchanged the preheaders so we need to interchange the data in
1718 // the preheaders as well. This is because the content of the inner
1719 // preheader was previously executed inside the outer loop.
1720 BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
1721 BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
1722 swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader);
1723 }
1724 return Changed;
1725}
1726
1730 LPMUpdater &U) {
1731 Function &F = *LN.getParent();
1732 SmallVector<Loop *, 8> LoopList(LN.getLoops());
1733
1734 if (MaxMemInstrCount < 1) {
1735 LLVM_DEBUG(dbgs() << "MaxMemInstrCount should be at least 1");
1736 return PreservedAnalyses::all();
1737 }
1738
1739 // Ensure minimum depth of the loop nest to do the interchange.
1740 if (!hasMinimumLoopDepth(LoopList))
1741 return PreservedAnalyses::all();
1742 DependenceInfo DI(&F, &AR.AA, &AR.SE, &AR.LI);
1743 std::unique_ptr<CacheCost> CC =
1746 if (!LoopInterchange(&AR.SE, &AR.LI, &DI, &AR.DT, CC, &ORE).run(LN))
1747 return PreservedAnalyses::all();
1748 U.markLoopNestChanged(true);
1750}
Rewrite undef for PHI
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Given that RA is a live propagate it s liveness to any other values it uses(according to Uses). void DeadArgumentEliminationPass
#define LLVM_DEBUG(...)
Definition: Debug.h:106
Hexagon Common GEP
This file defines the interface for the loop cache analysis.
static bool hasMinimumLoopDepth(SmallVectorImpl< Loop * > &LoopList)
static cl::opt< unsigned int > MaxMemInstrCount("loop-interchange-max-meminstr-count", cl::init(64), cl::Hidden, cl::desc("Maximum number of load-store instructions that should be handled " "in the dependency matrix. Higher value may lead to more interchanges " "at the cost of compile-time"))
static cl::opt< int > LoopInterchangeCostThreshold("loop-interchange-threshold", cl::init(0), cl::Hidden, cl::desc("Interchange if you gain more than this number"))
static PHINode * findInnerReductionPhi(Loop *L, Value *V)
static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore)
Move all instructions except the terminator from FromBB right before InsertBefore.
static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx, unsigned ToIndx)
static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader, BasicBlock *InnerLatch, BasicBlock *OuterHeader, BasicBlock *OuterLatch, BasicBlock *OuterExit, Loop *InnerLoop, LoopInfo *LI)
static const unsigned MaxLoopNestDepth
static void printDepMatrix(CharMatrix &DepMatrix)
static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2)
Swap instructions between BB1 and BB2 but keep terminators intact.
static bool areInnerLoopLatchPHIsSupported(Loop *OuterLoop, Loop *InnerLoop)
static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix, unsigned InnerLoopId, unsigned OuterLoopId)
static bool areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL, SmallPtrSetImpl< PHINode * > &Reductions)
#define DEBUG_TYPE
static Value * followLCSSA(Value *SV)
static void populateWorklist(Loop &L, LoopVector &LoopList)
static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level, Loop *L, DependenceInfo *DI, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB, BasicBlock *NewBB, std::vector< DominatorTree::UpdateType > &DTUpdates, bool MustUpdateOnce=true)
static bool isLexicographicallyPositive(std::vector< char > &DV)
This file defines the interface for the loop nest analysis.
This header provides classes for managing a pipeline of passes over loops in LLVM IR.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
uint64_t IntrinsicInst * II
#define P(N)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
static bool isProfitable(const SmallVector< std::unique_ptr< StableFunctionMap::StableFunctionEntry > > &SFS)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition: Statistic.h:166
StringSet - A set-like wrapper for the StringMap.
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:461
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448
iterator_range< const_phi_iterator > phis() const
Returns a range that iterates over the phis in the basic block.
Definition: BasicBlock.h:517
iterator_range< filter_iterator< BasicBlock::const_iterator, std::function< bool(const Instruction &)> > > instructionsWithoutDebug(bool SkipPseudoOp=true) const
Return a const iterator range over the instructions in the block, skipping any debug instructions.
Definition: BasicBlock.cpp:250
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:367
const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
Definition: BasicBlock.cpp:497
void replacePhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block to refer to basic block New instead of basic block Old.
Definition: BasicBlock.cpp:651
const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
Definition: BasicBlock.cpp:467
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:239
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition: BasicBlock.h:631
Conditional or Unconditional Branch instruction.
iterator_range< succ_op_iterator > successors()
bool isConditional() const
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
static std::unique_ptr< CacheCost > getCacheCost(Loop &Root, LoopStandardAnalysisResults &AR, DependenceInfo &DI, std::optional< unsigned > TRT=std::nullopt)
Create a CacheCost for the loop nest rooted by Root.
This class represents a function call, abstracting a target machine's calling convention.
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:661
This class represents an Operation in the Expression.
iterator find(const_arg_type_t< KeyT > Val)
Definition: DenseMap.h:156
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition: DenseMap.h:147
DependenceInfo - This class is the main dependence-analysis driver.
std::unique_ptr< Dependence > depends(Instruction *Src, Instruction *Dst, bool PossiblyLoopIndependent)
depends - Tests for a dependence between the Src and Dst instructions.
void applyUpdates(ArrayRef< UpdateType > Updates)
Inform the dominator tree about a sequence of CFG edge insertions and deletions and perform a batch u...
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:933
A struct for saving information about induction variables.
static bool isInductionPHI(PHINode *Phi, const Loop *L, ScalarEvolution *SE, InductionDescriptor &D, const SCEV *Expr=nullptr, SmallVectorImpl< Instruction * > *CastsToIgnore=nullptr)
Returns true if Phi is an induction in the loop L.
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
Definition: Instruction.cpp:99
bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
This class provides an interface for updating the loop pass manager based on mutations to the loop ne...
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
BlockT * getLoopLatch() const
If there is a single latch block for this loop, return it.
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
void removeBlockFromLoop(BlockT *BB)
This removes the specified basic block from the current loop, updating the Blocks as appropriate.
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
BlockT * getHeader() const
iterator_range< block_iterator > blocks() const
void addChildLoop(LoopT *NewChild)
Add the specified loop to be a child of this loop.
void addBlockEntry(BlockT *BB)
This adds a basic block directly to the basic block list.
BlockT * getExitBlock() const
If getExitBlocks would return exactly one block, return that block.
BlockT * getLoopPreheader() const
If there is a preheader for this loop, return it.
BlockT * getExitingBlock() const
If getExitingBlocks would return exactly one block, return that block.
LoopT * getParentLoop() const
Return the parent loop if it exists or nullptr for top level loops.
iterator begin() const
BlockT * getUniqueExitBlock() const
If getUniqueExitBlocks would return exactly one block, return that block.
LoopT * removeChildLoop(iterator I)
This removes the specified child from being a subloop of this loop.
void changeTopLevelLoop(LoopT *OldLoop, LoopT *NewLoop)
Replace the specified loop in the top-level loops list with the indicated loop.
void changeLoopFor(BlockT *BB, LoopT *L)
Change the top-level loop that contains BB to the specified loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
static const BasicBlock & skipEmptyBlockUntil(const BasicBlock *From, const BasicBlock *End, bool CheckUniquePred=false)
Recursivelly traverse all empty 'single successor' basic blocks of From (if there are any).
ArrayRef< Loop * > getLoops() const
Get the loops in the nest.
Function * getParent() const
Return the function to which the loop-nest belongs.
Loop & getOutermostLoop() const
Return the outermost loop in the loop nest.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
DebugLoc getStartLoc() const
Return the debug location of the start of this loop.
Definition: LoopInfo.cpp:632
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:61
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for missed-optimization remarks.
Diagnostic information for applied optimization remarks.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
op_range incoming_values()
void setIncomingBlock(unsigned i, BasicBlock *BB)
void setIncomingValue(unsigned i, Value *V)
static unsigned getIncomingValueNumForOperand(unsigned i)
A set of analyses that are preserved following a run of a transformation pass.
Definition: Analysis.h:111
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: Analysis.h:117
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Definition: IVDescriptors.h:77
Instruction * getExactFPMathInst() const
Returns 1st non-reassociative FP instruction in the PHI node's use-chain.
static bool isReductionPHI(PHINode *Phi, Loop *TheLoop, RecurrenceDescriptor &RedDes, DemandedBits *DB=nullptr, AssumptionCache *AC=nullptr, DominatorTree *DT=nullptr, ScalarEvolution *SE=nullptr)
Returns true if Phi is a reduction in TheLoop.
This node represents a polynomial recurrence on the trip count of the specified loop.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
void forgetLoop(const Loop *L)
This method should be called by the client when it has changed a loop in a way that may effect Scalar...
bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
bool isSCEVable(Type *Ty) const
Test if values of the given type are analyzable within the SCEV framework.
size_type size() const
Determine the number of elements in the SetVector.
Definition: SetVector.h:98
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition: SetVector.h:162
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:363
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
A SetVector that performs no allocations if smaller than a certain size.
Definition: SetVector.h:370
bool empty() const
Definition: SmallVector.h:81
size_t size() const
Definition: SmallVector.h:78
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:51
StringSet - A wrapper for StringMap that provides set-like functionality.
Definition: StringSet.h:23
std::pair< typename Base::iterator, bool > insert(StringRef key)
Definition: StringSet.h:38
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
op_range operands()
Definition: User.h:288
LLVM Value Representation.
Definition: Value.h:74
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
BasicBlock * InsertPreheaderForLoop(Loop *L, DominatorTree *DT, LoopInfo *LI, MemorySSAUpdater *MSSAU, bool PreserveLCSSA)
InsertPreheaderForLoop - Once we discover that a loop doesn't have a preheader, this method is called...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
auto successors(const MachineBasicBlock *BB)
bool formLCSSARecursively(Loop &L, const DominatorTree &DT, const LoopInfo *LI, ScalarEvolution *SE)
Put a loop nest into LCSSA form.
Definition: LCSSA.cpp:465
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:657
auto map_range(ContainerTy &&C, FuncTy F)
Definition: STLExtras.h:377
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition: STLExtras.h:1952
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1873
bool formLCSSAForInstructions(SmallVectorImpl< Instruction * > &Worklist, const DominatorTree &DT, const LoopInfo &LI, ScalarEvolution *SE, SmallVectorImpl< PHINode * > *PHIsToRemove=nullptr, SmallVectorImpl< PHINode * > *InsertedPHIs=nullptr)
Ensures LCSSA form for every instruction from the Worklist in the scope of innermost containing loop.
Definition: LCSSA.cpp:325
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition: STLExtras.h:1945
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
PreservedAnalyses getLoopPassPreservedAnalyses()
Returns the minimum set of Analyses that all loop passes must preserve.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition: STLExtras.h:1903
Implement std::hash so that hash_code can be used in STL containers.
Definition: BitVector.h:858
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition: BitVector.h:860
PreservedAnalyses run(LoopNest &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U)
The adaptor from a function pass to a loop pass computes these analyses and makes them available to t...
Direction
An enum for the direction of the loop.
Definition: LoopInfo.h:215