BasicTTIImpl.h
1//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file provides a helper that implements much of the TTI interface in
11/// terms of the target-independent code generator and TargetLowering
12/// interfaces.
13//
14//===----------------------------------------------------------------------===//
15
16#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17#define LLVM_CODEGEN_BASICTTIIMPL_H
18
19#include "llvm/ADT/APInt.h"
20#include "llvm/ADT/BitVector.h"
33#include "llvm/IR/BasicBlock.h"
34#include "llvm/IR/Constant.h"
35#include "llvm/IR/Constants.h"
36#include "llvm/IR/DataLayout.h"
38#include "llvm/IR/InstrTypes.h"
39#include "llvm/IR/Instruction.h"
41#include "llvm/IR/Intrinsics.h"
42#include "llvm/IR/Operator.h"
43#include "llvm/IR/Type.h"
44#include "llvm/IR/Value.h"
52#include <algorithm>
53#include <cassert>
54#include <cstdint>
55#include <limits>
56#include <optional>
57#include <utility>
58
59namespace llvm {
60
61class Function;
62class GlobalValue;
63class LLVMContext;
64class ScalarEvolution;
65class SCEV;
66class TargetMachine;
67
68extern cl::opt<unsigned> PartialUnrollingThreshold;
69
70/// Base class which can be used to help build a TTI implementation.
71///
72/// This class provides as much implementation of the TTI interface as is
73/// possible using the target independent parts of the code generator.
74///
75/// In order to subclass it, your class must implement a getST() method to
76/// return the subtarget, and a getTLI() method to return the target lowering.
77/// We need these methods implemented in the derived class so that this class
78/// doesn't have to duplicate storage for them.
79template <typename T>
80class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
81private:
82 using BaseT = TargetTransformInfoImplCRTPBase<T>;
83 using TTI = TargetTransformInfo;
84
85 /// Helper function to access this as a T.
86 T *thisT() { return static_cast<T *>(this); }
87
88 /// Estimate a cost of Broadcast as an extract and sequence of insert
89 /// operations.
90 InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy,
93 // Broadcast cost is equal to the cost of extracting the zero'th element
94 // plus the cost of inserting it into every element of the result vector.
95 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
96 CostKind, 0, nullptr, nullptr);
97
98 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
99 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
100 CostKind, i, nullptr, nullptr);
101 }
102 return Cost;
103 }
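  // A minimal worked example, assuming a (hypothetical) target where each
  // vector extract and insert costs 1: broadcasting a <4 x float> is modeled
  // as 1 (extract lane 0) + 4 (inserts into lanes 0..3) = 5. Targets with a
  // cheap native splat are expected to override getShuffleCost and return
  // something smaller.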
104
105 /// Estimate a cost of shuffle as a sequence of extract and insert
106 /// operations.
107 InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy,
110 // Shuffle cost is equal to the cost of extracting the elements from its
111 // arguments plus the cost of inserting them into the result vector.
112
113 // e.g. <4 x float> has a mask of <0,5,2,7>, i.e. we need to extract from
114 // index 0 of the first vector, index 1 of the second vector, index 2 of the
115 // first vector and finally index 3 of the second vector, and insert them at
116 // indices <0,1,2,3> of the result vector.
117 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
118 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
119 CostKind, i, nullptr, nullptr);
120 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
121 CostKind, i, nullptr, nullptr);
122 }
123 return Cost;
124 }
125
126 /// Estimate a cost of subvector extraction as a sequence of extract and
127 /// insert operations.
128 InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
130 int Index,
131 FixedVectorType *SubVTy) {
132 assert(VTy && SubVTy &&
133 "Can only extract subvectors from vectors");
134 int NumSubElts = SubVTy->getNumElements();
135 assert((!isa<FixedVectorType>(VTy) ||
136 (Index + NumSubElts) <=
137 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
138 "SK_ExtractSubvector index out of range");
139
141 // Subvector extraction cost is equal to the cost of extracting the elements
142 // from the source vector plus the cost of inserting them into the result
143 // vector type.
144 for (int i = 0; i != NumSubElts; ++i) {
145 Cost +=
146 thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
147 CostKind, i + Index, nullptr, nullptr);
148 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
149 CostKind, i, nullptr, nullptr);
150 }
151 return Cost;
152 }
153
154 /// Estimate a cost of subvector insertion as a sequence of extract and
155 /// insert operations.
156 InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
158 int Index,
159 FixedVectorType *SubVTy) {
160 assert(VTy && SubVTy &&
161 "Can only insert subvectors into vectors");
162 int NumSubElts = SubVTy->getNumElements();
163 assert((!isa<FixedVectorType>(VTy) ||
164 (Index + NumSubElts) <=
165 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
166 "SK_InsertSubvector index out of range");
167
169 // Subvector insertion cost is equal to the cost of extracting the elements
170 // from the subvector plus the cost of inserting them into the result vector
171 // type.
172 for (int i = 0; i != NumSubElts; ++i) {
173 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
174 CostKind, i, nullptr, nullptr);
175 Cost +=
176 thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
177 i + Index, nullptr, nullptr);
178 }
179 return Cost;
180 }
181
182 /// Local query method delegates up to T which *must* implement this!
183 const TargetSubtargetInfo *getST() const {
184 return static_cast<const T *>(this)->getST();
185 }
186
187 /// Local query method delegates up to T which *must* implement this!
188 const TargetLoweringBase *getTLI() const {
189 return static_cast<const T *>(this)->getTLI();
190 }
191
192 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
193 switch (M) {
194 case TTI::MIM_Unindexed:
195 return ISD::UNINDEXED;
196 case TTI::MIM_PreInc:
197 return ISD::PRE_INC;
198 case TTI::MIM_PreDec:
199 return ISD::PRE_DEC;
200 case TTI::MIM_PostInc:
201 return ISD::POST_INC;
202 case TTI::MIM_PostDec:
203 return ISD::POST_DEC;
204 }
205 llvm_unreachable("Unexpected MemIndexedMode");
206 }
207
208 InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
209 Align Alignment,
210 bool VariableMask,
211 bool IsGatherScatter,
213 unsigned AddressSpace = 0) {
214 // We cannot scalarize scalable vectors, so return Invalid.
215 if (isa<ScalableVectorType>(DataTy))
216 return InstructionCost::getInvalid();
217
218 auto *VT = cast<FixedVectorType>(DataTy);
219 unsigned VF = VT->getNumElements();
220
221 // Assume the target does not have support for gather/scatter operations
222 // and provide a rough estimate.
223 //
224 // First, compute the cost of the individual memory operations.
225 InstructionCost AddrExtractCost =
226 IsGatherScatter ? getScalarizationOverhead(
228 PointerType::get(VT->getContext(), 0), VF),
229 /*Insert=*/false, /*Extract=*/true, CostKind)
230 : 0;
231
232 // The cost of the scalar loads/stores.
233 InstructionCost MemoryOpCost =
234 VF * thisT()->getMemoryOpCost(Opcode, VT->getElementType(), Alignment,
236
237 // Next, compute the cost of packing the result in a vector.
238 InstructionCost PackingCost =
239 getScalarizationOverhead(VT, Opcode != Instruction::Store,
240 Opcode == Instruction::Store, CostKind);
241
242 InstructionCost ConditionalCost = 0;
243 if (VariableMask) {
244 // Compute the cost of conditionally executing the memory operations with
245 // variable masks. This includes extracting the individual conditions,
246 // branches, and PHIs to combine the results.
247 // NOTE: Estimating the cost of conditionally executing the memory
248 // operations accurately is quite difficult and the current solution
249 // provides a very rough estimate only.
250 ConditionalCost =
253 /*Insert=*/false, /*Extract=*/true, CostKind) +
254 VF * (thisT()->getCFInstrCost(Instruction::Br, CostKind) +
255 thisT()->getCFInstrCost(Instruction::PHI, CostKind));
256 }
257
258 return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost;
259 }
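  // A rough worked example, assuming a <4 x i32> gather with a variable mask
  // and (hypothetical) unit costs for every scalar piece:
  //   AddrExtractCost = 4 extracts from the <4 x ptr> of addresses
  //   MemoryOpCost    = 4 scalar i32 loads
  //   PackingCost     = 4 inserts to rebuild the <4 x i32> result
  //   ConditionalCost = 4 mask extracts + 4 * (branch + PHI) = 12
  // for a total of roughly 4 + 4 + 4 + 12 = 24 under those unit costs.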
260
261 /// Checks if the provided mask \p Mask is a splat mask, i.e. it contains only
262 /// -1 or the same non -1 index value, and that index value occurs at least
263 /// twice. So, mask <0, -1, -1, -1> is not considered a splat (it is just an
264 /// identity), same for <-1, 0, -1, -1> (just a slide), while <2, -1, 2, -1> is
265 /// a splat with \p Index=2.
266 static bool isSplatMask(ArrayRef<int> Mask, unsigned NumSrcElts, int &Index) {
267 // Check that the broadcast index occurs at least twice.
268 bool IsCompared = false;
269 if (int SplatIdx = PoisonMaskElem;
270 all_of(enumerate(Mask), [&](const auto &P) {
271 if (P.value() == PoisonMaskElem)
272 return P.index() != Mask.size() - 1 || IsCompared;
273 if (static_cast<unsigned>(P.value()) >= NumSrcElts * 2)
274 return false;
275 if (SplatIdx == PoisonMaskElem) {
276 SplatIdx = P.value();
277 return P.index() != Mask.size() - 1;
278 }
279 IsCompared = true;
280 return SplatIdx == P.value();
281 })) {
282 Index = SplatIdx;
283 return true;
284 }
285 return false;
286 }
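  // Illustrative calls, mirroring the examples in the comment above:
  //   int Idx;
  //   isSplatMask({2, -1, 2, -1}, /*NumSrcElts=*/4, Idx);  // true, Idx == 2
  //   isSplatMask({0, -1, -1, -1}, /*NumSrcElts=*/4, Idx); // false (identity)
  //   isSplatMask({-1, 0, -1, -1}, /*NumSrcElts=*/4, Idx); // false (slide)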
287
288protected:
289 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
290 : BaseT(DL) {}
291 virtual ~BasicTTIImplBase() = default;
292
294
295public:
296 /// \name Scalar TTI Implementations
297 /// @{
299 unsigned AddressSpace, Align Alignment,
300 unsigned *Fast) const {
301 EVT E = EVT::getIntegerVT(Context, BitWidth);
302 return getTLI()->allowsMisalignedMemoryAccesses(
304 }
305
306 bool areInlineCompatible(const Function *Caller,
307 const Function *Callee) const {
308 const TargetMachine &TM = getTLI()->getTargetMachine();
309
310 const FeatureBitset &CallerBits =
311 TM.getSubtargetImpl(*Caller)->getFeatureBits();
312 const FeatureBitset &CalleeBits =
313 TM.getSubtargetImpl(*Callee)->getFeatureBits();
314
315 // Inline a callee if its target-features are a subset of the caller's
316 // target-features.
317 return (CallerBits & CalleeBits) == CalleeBits;
318 }
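  // For example, with hypothetical feature sets: a caller built with
  // {+sse4.2, +avx2} may inline a callee built with {+sse4.2}, since the
  // callee's bits are a subset of the caller's; the reverse is rejected
  // because the caller might run on hardware lacking +avx2.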
319
320 bool hasBranchDivergence(const Function *F = nullptr) { return false; }
321
322 bool isSourceOfDivergence(const Value *V) { return false; }
323
324 bool isAlwaysUniform(const Value *V) { return false; }
325
326 bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
327 return false;
328 }
329
330 bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
331 return true;
332 }
333
335 // Return an invalid address space.
336 return -1;
337 }
338
340 Intrinsic::ID IID) const {
341 return false;
342 }
343
344 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
345 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
346 }
347
348 unsigned getAssumedAddrSpace(const Value *V) const {
349 return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
350 }
351
352 bool isSingleThreaded() const {
353 return getTLI()->getTargetMachine().Options.ThreadModel ==
354 ThreadModel::Single;
355 }
356
357 std::pair<const Value *, unsigned>
359 return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
360 }
361
363 Value *NewV) const {
364 return nullptr;
365 }
366
367 bool isLegalAddImmediate(int64_t imm) {
368 return getTLI()->isLegalAddImmediate(imm);
369 }
370
371 bool isLegalAddScalableImmediate(int64_t Imm) {
372 return getTLI()->isLegalAddScalableImmediate(Imm);
373 }
374
375 bool isLegalICmpImmediate(int64_t imm) {
376 return getTLI()->isLegalICmpImmediate(imm);
377 }
378
379 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
380 bool HasBaseReg, int64_t Scale, unsigned AddrSpace,
381 Instruction *I = nullptr,
382 int64_t ScalableOffset = 0) {
384 AM.BaseGV = BaseGV;
385 AM.BaseOffs = BaseOffset;
386 AM.HasBaseReg = HasBaseReg;
387 AM.Scale = Scale;
388 AM.ScalableOffset = ScalableOffset;
389 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
390 }
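  // A typical query, e.g. from LoopStrengthReduce: "is  base + 4*index + 16  a
  // legal address for an i32 access in this address space?" becomes (Int32Ty
  // standing in for the access type)
  //   isLegalAddressingMode(Int32Ty, /*BaseGV=*/nullptr, /*BaseOffset=*/16,
  //                         /*HasBaseReg=*/true, /*Scale=*/4, AddrSpace);
  // and the answer comes directly from TargetLowering::isLegalAddressingMode
  // on the AddrMode built above.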
391
392 int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) {
393 return getTLI()->getPreferredLargeGEPBaseOffset(MinOffset, MaxOffset);
394 }
395
396 unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
397 Type *ScalarValTy) const {
398 auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
399 auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
400 EVT VT = getTLI()->getValueType(DL, SrcTy);
401 if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
402 getTLI()->isOperationCustom(ISD::STORE, VT))
403 return true;
404
405 EVT ValVT =
406 getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
407 EVT LegalizedVT =
408 getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
409 return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
410 };
411 while (VF > 2 && IsSupportedByTarget(VF))
412 VF /= 2;
413 return VF;
414 }
415
417 const DataLayout &DL) const {
418 EVT VT = getTLI()->getValueType(DL, Ty);
419 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
420 }
421
423 const DataLayout &DL) const {
424 EVT VT = getTLI()->getValueType(DL, Ty);
425 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
426 }
427
430 }
431
434 }
435
438 }
439
442 }
443
445 StackOffset BaseOffset, bool HasBaseReg,
446 int64_t Scale, unsigned AddrSpace) {
448 AM.BaseGV = BaseGV;
449 AM.BaseOffs = BaseOffset.getFixed();
450 AM.HasBaseReg = HasBaseReg;
451 AM.Scale = Scale;
452 AM.ScalableOffset = BaseOffset.getScalable();
453 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
454 return 0;
455 return -1;
456 }
457
458 bool isTruncateFree(Type *Ty1, Type *Ty2) {
459 return getTLI()->isTruncateFree(Ty1, Ty2);
460 }
461
463 return getTLI()->isProfitableToHoist(I);
464 }
465
466 bool useAA() const { return getST()->useAA(); }
467
468 bool isTypeLegal(Type *Ty) {
469 EVT VT = getTLI()->getValueType(DL, Ty, /*AllowUnknown=*/true);
470 return getTLI()->isTypeLegal(VT);
471 }
472
473 unsigned getRegUsageForType(Type *Ty) {
474 EVT ETy = getTLI()->getValueType(DL, Ty);
475 return getTLI()->getNumRegisters(Ty->getContext(), ETy);
476 }
477
481 return BaseT::getGEPCost(PointeeType, Ptr, Operands, AccessType, CostKind);
482 }
483
485 unsigned &JumpTableSize,
487 BlockFrequencyInfo *BFI) {
488 /// Try to find the estimated number of clusters. Note that the number of
489 /// clusters identified in this function could be different from the actual
490 /// numbers found in lowering. This function ignores switches that are
491 /// lowered with a mix of jump table / bit test / BTree. This function was
492 /// initially intended to be used when estimating the cost of a switch in the
493 /// inline cost heuristic, but it's a generic cost model to be used in other
494 /// places (e.g., in loop unrolling).
495 unsigned N = SI.getNumCases();
496 const TargetLoweringBase *TLI = getTLI();
497 const DataLayout &DL = this->getDataLayout();
498
499 JumpTableSize = 0;
500 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
501
502 // Early exit if neither a jump table nor a bit test is allowed.
503 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
504 return N;
505
506 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
507 APInt MinCaseVal = MaxCaseVal;
508 for (auto CI : SI.cases()) {
509 const APInt &CaseVal = CI.getCaseValue()->getValue();
510 if (CaseVal.sgt(MaxCaseVal))
511 MaxCaseVal = CaseVal;
512 if (CaseVal.slt(MinCaseVal))
513 MinCaseVal = CaseVal;
514 }
515
516 // Check if suitable for a bit test
517 if (N <= DL.getIndexSizeInBits(0u)) {
519 for (auto I : SI.cases())
520 Dests.insert(I.getCaseSuccessor());
521
522 if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
523 DL))
524 return 1;
525 }
526
527 // Check if suitable for a jump table.
528 if (IsJTAllowed) {
529 if (N < 2 || N < TLI->getMinimumJumpTableEntries())
530 return N;
532 (MaxCaseVal - MinCaseVal)
533 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
534 // Check whether a range of clusters is dense enough for a jump table
535 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
536 JumpTableSize = Range;
537 return 1;
538 }
539 }
540 return N;
541 }
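  // A rough worked example, assuming a 64-bit index width so the bit-test path
  // does not apply: a switch with N = 100 cases spanning case values 0..119
  // gives Range = 120. If the target allows jump tables and considers 100
  // cases over a range of 120 dense enough, this returns 1 cluster and sets
  // JumpTableSize = 120; otherwise it returns N = 100 individual clusters.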
542
544 const TargetLoweringBase *TLI = getTLI();
545 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
546 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
547 }
548
550 const TargetMachine &TM = getTLI()->getTargetMachine();
551 // If non-PIC mode, do not generate a relative lookup table.
552 if (!TM.isPositionIndependent())
553 return false;
554
555 /// Relative lookup table entries consist of 32-bit offsets.
556 /// Do not generate relative lookup tables for large code models
557 /// in 64-bit architectures where 32-bit offsets might not be enough.
558 if (TM.getCodeModel() == CodeModel::Medium ||
559 TM.getCodeModel() == CodeModel::Large)
560 return false;
561
562 const Triple &TargetTriple = TM.getTargetTriple();
563 if (!TargetTriple.isArch64Bit())
564 return false;
565
566 // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
567 // there.
568 if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
569 return false;
570
571 return true;
572 }
573
574 bool haveFastSqrt(Type *Ty) {
575 const TargetLoweringBase *TLI = getTLI();
576 EVT VT = TLI->getValueType(DL, Ty);
577 return TLI->isTypeLegal(VT) &&
579 }
580
582 return true;
583 }
584
586 // Check whether FADD is available, as a proxy for floating-point in
587 // general.
588 const TargetLoweringBase *TLI = getTLI();
589 EVT VT = TLI->getValueType(DL, Ty);
593 }
594
596 const Function &Fn) const {
597 switch (Inst.getOpcode()) {
598 default:
599 break;
600 case Instruction::SDiv:
601 case Instruction::SRem:
602 case Instruction::UDiv:
603 case Instruction::URem: {
604 if (!isa<ConstantInt>(Inst.getOperand(1)))
605 return false;
606 EVT VT = getTLI()->getValueType(DL, Inst.getType());
607 return !getTLI()->isIntDivCheap(VT, Fn.getAttributes());
608 }
609 };
610
611 return false;
612 }
613
614 unsigned getInliningThresholdMultiplier() const { return 1; }
615 unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
616 unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const {
617 return 0;
618 }
619
620 int getInlinerVectorBonusPercent() const { return 150; }
621
625 // This unrolling functionality is target independent, but to provide some
626 // motivation for its intended use, for x86:
627
628 // According to the Intel 64 and IA-32 Architectures Optimization Reference
629 // Manual, Intel Core models and later have a loop stream detector (and
630 // associated uop queue) that can benefit from partial unrolling.
631 // The relevant requirements are:
632 // - The loop must have no more than 4 (8 for Nehalem and later) branches
633 // taken, and none of them may be calls.
634 // - The loop can have no more than 18 (28 for Nehalem and later) uops.
635
636 // According to the Software Optimization Guide for AMD Family 15h
637 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
638 // and loop buffer which can benefit from partial unrolling.
639 // The relevant requirements are:
640 // - The loop must have fewer than 16 branches
641 // - The loop must have less than 40 uops in all executed loop branches
642
643 // The number of taken branches in a loop is hard to estimate here, and
644 // benchmarking has revealed that it is better not to be conservative when
645 // estimating the branch count. As a result, we'll ignore the branch limits
646 // until someone finds a case where it matters in practice.
647
648 unsigned MaxOps;
649 const TargetSubtargetInfo *ST = getST();
650 if (PartialUnrollingThreshold.getNumOccurrences() > 0)
652 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
653 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
654 else
655 return;
656
657 // Scan the loop: don't unroll loops with calls.
658 for (BasicBlock *BB : L->blocks()) {
659 for (Instruction &I : *BB) {
660 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
661 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
662 if (!thisT()->isLoweredToCall(F))
663 continue;
664 }
665
666 if (ORE) {
667 ORE->emit([&]() {
668 return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
669 L->getHeader())
670 << "advising against unrolling the loop because it "
671 "contains a "
672 << ore::NV("Call", &I);
673 });
674 }
675 return;
676 }
677 }
678 }
679
680 // Enable runtime and partial unrolling up to the specified size.
681 // Enable using trip count upper bound to unroll loops.
682 UP.Partial = UP.Runtime = UP.UpperBound = true;
683 UP.PartialThreshold = MaxOps;
684
685 // Avoid unrolling when optimizing for size.
686 UP.OptSizeThreshold = 0;
688
689 // Set the number of instructions optimized when the "back edge"
690 // becomes a "fall through" to the default value of 2.
691 UP.BEInsns = 2;
692 }
693
696 PP.PeelCount = 0;
697 PP.AllowPeeling = true;
698 PP.AllowLoopNestsPeeling = false;
699 PP.PeelProfiledIterations = true;
700 }
701
703 AssumptionCache &AC,
704 TargetLibraryInfo *LibInfo,
705 HardwareLoopInfo &HWLoopInfo) {
706 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
707 }
708
711 }
712
715 }
716
718 getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) {
719 return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
720 }
721
722 std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
723 IntrinsicInst &II) {
725 }
726
727 std::optional<Value *>
729 APInt DemandedMask, KnownBits &Known,
730 bool &KnownBitsComputed) {
731 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
732 KnownBitsComputed);
733 }
734
736 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
737 APInt &UndefElts2, APInt &UndefElts3,
738 std::function<void(Instruction *, unsigned, APInt, APInt &)>
739 SimplifyAndSetOp) {
741 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
742 SimplifyAndSetOp);
743 }
744
745 virtual std::optional<unsigned>
747 return std::optional<unsigned>(
748 getST()->getCacheSize(static_cast<unsigned>(Level)));
749 }
750
751 virtual std::optional<unsigned>
753 std::optional<unsigned> TargetResult =
754 getST()->getCacheAssociativity(static_cast<unsigned>(Level));
755
756 if (TargetResult)
757 return TargetResult;
758
759 return BaseT::getCacheAssociativity(Level);
760 }
761
762 virtual unsigned getCacheLineSize() const {
763 return getST()->getCacheLineSize();
764 }
765
766 virtual unsigned getPrefetchDistance() const {
767 return getST()->getPrefetchDistance();
768 }
769
770 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
771 unsigned NumStridedMemAccesses,
772 unsigned NumPrefetches,
773 bool HasCall) const {
774 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
775 NumPrefetches, HasCall);
776 }
777
778 virtual unsigned getMaxPrefetchIterationsAhead() const {
779 return getST()->getMaxPrefetchIterationsAhead();
780 }
781
782 virtual bool enableWritePrefetching() const {
783 return getST()->enableWritePrefetching();
784 }
785
786 virtual bool shouldPrefetchAddressSpace(unsigned AS) const {
787 return getST()->shouldPrefetchAddressSpace(AS);
788 }
789
790 /// @}
791
792 /// \name Vector TTI Implementations
793 /// @{
794
796 return TypeSize::getFixed(32);
797 }
798
799 std::optional<unsigned> getMaxVScale() const { return std::nullopt; }
800 std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; }
801 bool isVScaleKnownToBeAPowerOfTwo() const { return false; }
802
803 /// Estimate the overhead of scalarizing an instruction. Insert and Extract
804 /// are set if the demanded result elements need to be inserted and/or
805 /// extracted from vectors.
807 const APInt &DemandedElts,
808 bool Insert, bool Extract,
810 ArrayRef<Value *> VL = {}) {
811 /// FIXME: a bitfield is not a reasonable abstraction for talking about
812 /// which elements are needed from a scalable vector
813 if (isa<ScalableVectorType>(InTy))
814 return InstructionCost::getInvalid();
815 auto *Ty = cast<FixedVectorType>(InTy);
816
817 assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
818 (VL.empty() || VL.size() == Ty->getNumElements()) &&
819 "Vector size mismatch");
820
822
823 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
824 if (!DemandedElts[i])
825 continue;
826 if (Insert) {
827 Value *InsertedVal = VL.empty() ? nullptr : VL[i];
828 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
829 CostKind, i, nullptr, InsertedVal);
830 }
831 if (Extract)
832 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
833 CostKind, i, nullptr, nullptr);
834 }
835
836 return Cost;
837 }
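  // A minimal usage sketch (I32Ty and CostKind are placeholders): the cost of
  // materialising only lanes 0 and 2 of a <4 x i32> via scalar inserts could
  // be queried as
  //   APInt Demanded(/*numBits=*/4, /*val=*/0b0101);
  //   getScalarizationOverhead(FixedVectorType::get(I32Ty, 4), Demanded,
  //                            /*Insert=*/true, /*Extract=*/false, CostKind);
  // which sums getVectorInstrCost for InsertElement at indices 0 and 2 only.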
838
840 return false;
841 }
842
844 unsigned ScalarOpdIdx) const {
845 return false;
846 }
847
849 int OpdIdx) const {
850 return OpdIdx == -1;
851 }
852
854 int RetIdx) const {
855 return RetIdx == 0;
856 }
857
858 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
860 bool Extract,
862 if (isa<ScalableVectorType>(InTy))
863 return InstructionCost::getInvalid();
864 auto *Ty = cast<FixedVectorType>(InTy);
865
866 APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
867 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
868 CostKind);
869 }
870
871 /// Estimate the overhead of scalarizing an instruction's unique
872 /// non-constant operands. The (potentially vector) types to use for each
873 /// argument are passed via Tys.
878 assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
879
881 SmallPtrSet<const Value*, 4> UniqueOperands;
882 for (int I = 0, E = Args.size(); I != E; I++) {
883 // Disregard things like metadata arguments.
884 const Value *A = Args[I];
885 Type *Ty = Tys[I];
886 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
887 !Ty->isPtrOrPtrVectorTy())
888 continue;
889
890 if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
891 if (auto *VecTy = dyn_cast<VectorType>(Ty))
892 Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
893 /*Extract*/ true, CostKind);
894 }
895 }
896
897 return Cost;
898 }
899
900 /// Estimate the overhead of scalarizing the inputs and outputs of an
901 /// instruction, with return type RetTy and arguments Args of type Tys. If
902 /// Args are unknown (empty), then the cost associated with one argument is
903 /// added as a heuristic.
909 RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
910 if (!Args.empty())
912 else
913 // When no information on arguments is provided, we add the cost
914 // associated with one argument as a heuristic.
915 Cost += getScalarizationOverhead(RetTy, /*Insert*/ false,
916 /*Extract*/ true, CostKind);
917
918 return Cost;
919 }
920
921 /// Estimate the cost of type-legalization and the legalized type.
922 std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const {
923 LLVMContext &C = Ty->getContext();
924 EVT MTy = getTLI()->getValueType(DL, Ty);
925
926 InstructionCost Cost = 1;
927 // We keep legalizing the type until we find a legal kind. We assume that
928 // the only operation that costs anything is the split. After splitting
929 // we need to handle two types.
930 while (true) {
932
934 // Ensure we return a sensible simple VT here, since many callers of
935 // this function require it.
936 MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64;
937 return std::make_pair(InstructionCost::getInvalid(), VT);
938 }
939
940 if (LK.first == TargetLoweringBase::TypeLegal)
941 return std::make_pair(Cost, MTy.getSimpleVT());
942
943 if (LK.first == TargetLoweringBase::TypeSplitVector ||
944 LK.first == TargetLoweringBase::TypeExpandInteger)
945 Cost *= 2;
946
947 // Do not loop with f128 type.
948 if (MTy == LK.second)
949 return std::make_pair(Cost, MTy.getSimpleVT());
950
951 // Keep legalizing the type.
952 MTy = LK.second;
953 }
954 }
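  // A worked example, assuming a target whose widest legal vector is 128 bits:
  // legalizing <16 x i32> (512 bits) splits it twice, 16 -> 8 -> 4 elements,
  // doubling the accumulated cost at each split, so this returns
  // {Cost = 4, MVT::v4i32}. Callers then scale per-operation costs by LT.first.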
955
956 unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }
957
959 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
962 ArrayRef<const Value *> Args = {}, const Instruction *CxtI = nullptr) {
963 // Check if any of the operands are vector operands.
964 const TargetLoweringBase *TLI = getTLI();
965 int ISD = TLI->InstructionOpcodeToISD(Opcode);
966 assert(ISD && "Invalid opcode");
967
968 // TODO: Handle more cost kinds.
969 if (CostKind != TTI::TCK_RecipThroughput)
970 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
971 Opd1Info, Opd2Info,
972 Args, CxtI);
973
974 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
975
976 bool IsFloat = Ty->isFPOrFPVectorTy();
977 // Assume that floating point arithmetic operations cost twice as much as
978 // integer operations.
979 InstructionCost OpCost = (IsFloat ? 2 : 1);
980
981 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
982 // The operation is legal. Assume it costs 1.
983 // TODO: Once we have extract/insert subvector cost we need to use them.
984 return LT.first * OpCost;
985 }
986
987 if (!TLI->isOperationExpand(ISD, LT.second)) {
988 // If the operation is custom lowered, then assume that the code is twice
989 // as expensive.
990 return LT.first * 2 * OpCost;
991 }
992
993 // An 'Expand' of URem and SRem is special because it may default
994 // to expanding the operation into a sequence of sub-operations
995 // i.e. X % Y -> X-(X/Y)*Y.
996 if (ISD == ISD::UREM || ISD == ISD::SREM) {
997 bool IsSigned = ISD == ISD::SREM;
998 if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
999 LT.second) ||
1000 TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
1001 LT.second)) {
1002 unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
1003 InstructionCost DivCost = thisT()->getArithmeticInstrCost(
1004 DivOpc, Ty, CostKind, Opd1Info, Opd2Info);
1005 InstructionCost MulCost =
1006 thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
1007 InstructionCost SubCost =
1008 thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
1009 return DivCost + MulCost + SubCost;
1010 }
1011 }
1012
1013 // We cannot scalarize scalable vectors, so return Invalid.
1014 if (isa<ScalableVectorType>(Ty))
1015 return InstructionCost::getInvalid();
1016
1017 // Else, assume that we need to scalarize this op.
1018 // TODO: If one of the types get legalized by splitting, handle this
1019 // similarly to what getCastInstrCost() does.
1020 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1021 InstructionCost Cost = thisT()->getArithmeticInstrCost(
1022 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
1023 Args, CxtI);
1024 // Return the cost of multiple scalar invocations plus the cost of
1025 // inserting and extracting the values.
1026 SmallVector<Type *> Tys(Args.size(), Ty);
1027 return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1028 VTy->getNumElements() * Cost;
1029 }
1030
1031 // We don't know anything about this scalar instruction.
1032 return OpCost;
1033 }
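  // As a worked example of the URem/SRem special case above: on a target with
  // a legal (or custom) divide for the type but no cheap remainder, the cost
  // of  x % y  is modeled as Cost(div) + Cost(mul) + Cost(sub), matching the
  // X - (X / Y) * Y  expansion rather than the generic scalarization fallback.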
1034
1036 ArrayRef<int> Mask,
1037 VectorType *Ty, int &Index,
1038 VectorType *&SubTy) const {
1039 if (Mask.empty())
1040 return Kind;
1041 int NumSrcElts = Ty->getElementCount().getKnownMinValue();
1042 switch (Kind) {
1044 if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts))
1045 return TTI::SK_Reverse;
1046 if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts))
1047 return TTI::SK_Broadcast;
1048 if (isSplatMask(Mask, NumSrcElts, Index))
1049 return TTI::SK_Broadcast;
1050 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) &&
1051 (Index + Mask.size()) <= (size_t)NumSrcElts) {
1052 SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size());
1054 }
1055 break;
1056 }
1057 case TTI::SK_PermuteTwoSrc: {
1058 int NumSubElts;
1059 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask(
1060 Mask, NumSrcElts, NumSubElts, Index)) {
1061 if (Index + NumSubElts > NumSrcElts)
1062 return Kind;
1063 SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts);
1065 }
1066 if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts))
1067 return TTI::SK_Select;
1068 if (ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts))
1069 return TTI::SK_Transpose;
1070 if (ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index))
1071 return TTI::SK_Splice;
1072 break;
1073 }
1074 case TTI::SK_Select:
1075 case TTI::SK_Reverse:
1076 case TTI::SK_Broadcast:
1077 case TTI::SK_Transpose:
1080 case TTI::SK_Splice:
1081 break;
1082 }
1083 return Kind;
1084 }
1085
1087 ArrayRef<int> Mask,
1089 VectorType *SubTp,
1090 ArrayRef<const Value *> Args = {},
1091 const Instruction *CxtI = nullptr) {
1092 switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) {
1093 case TTI::SK_Broadcast:
1094 if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
1095 return getBroadcastShuffleOverhead(FVT, CostKind);
1097 case TTI::SK_Select:
1098 case TTI::SK_Splice:
1099 case TTI::SK_Reverse:
1100 case TTI::SK_Transpose:
1103 if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
1104 return getPermuteShuffleOverhead(FVT, CostKind);
1107 return getExtractSubvectorOverhead(Tp, CostKind, Index,
1108 cast<FixedVectorType>(SubTp));
1110 return getInsertSubvectorOverhead(Tp, CostKind, Index,
1111 cast<FixedVectorType>(SubTp));
1112 }
1113 llvm_unreachable("Unknown TTI::ShuffleKind");
1114 }
1115
1116 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1119 const Instruction *I = nullptr) {
1120 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
1121 return 0;
1122
1123 const TargetLoweringBase *TLI = getTLI();
1124 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1125 assert(ISD && "Invalid opcode");
1126 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1127 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1128
1129 TypeSize SrcSize = SrcLT.second.getSizeInBits();
1130 TypeSize DstSize = DstLT.second.getSizeInBits();
1131 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
1132 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
1133
1134 switch (Opcode) {
1135 default:
1136 break;
1137 case Instruction::Trunc:
1138 // Check for NOOP conversions.
1139 if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
1140 return 0;
1141 [[fallthrough]];
1142 case Instruction::BitCast:
1143 // Bitcast between types that are legalized to the same type are free and
1144 // assume int to/from ptr of the same size is also free.
1145 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
1146 SrcSize == DstSize)
1147 return 0;
1148 break;
1149 case Instruction::FPExt:
1150 if (I && getTLI()->isExtFree(I))
1151 return 0;
1152 break;
1153 case Instruction::ZExt:
1154 if (TLI->isZExtFree(SrcLT.second, DstLT.second))
1155 return 0;
1156 [[fallthrough]];
1157 case Instruction::SExt:
1158 if (I && getTLI()->isExtFree(I))
1159 return 0;
1160
1161 // If this is a zext/sext of a load, return 0 if the corresponding
1162 // extending load exists on target and the result type is legal.
1163 if (CCH == TTI::CastContextHint::Normal) {
1164 EVT ExtVT = EVT::getEVT(Dst);
1165 EVT LoadVT = EVT::getEVT(Src);
1166 unsigned LType =
1167 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
1168 if (DstLT.first == SrcLT.first &&
1169 TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
1170 return 0;
1171 }
1172 break;
1173 case Instruction::AddrSpaceCast:
1174 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
1175 Dst->getPointerAddressSpace()))
1176 return 0;
1177 break;
1178 }
1179
1180 auto *SrcVTy = dyn_cast<VectorType>(Src);
1181 auto *DstVTy = dyn_cast<VectorType>(Dst);
1182
1183 // If the cast is marked as legal (or promote) then assume low cost.
1184 if (SrcLT.first == DstLT.first &&
1185 TLI->isOperationLegalOrPromote(ISD, DstLT.second))
1186 return SrcLT.first;
1187
1188 // Handle scalar conversions.
1189 if (!SrcVTy && !DstVTy) {
1190 // Just check the op cost. If the operation is legal then assume it costs
1191 // 1.
1192 if (!TLI->isOperationExpand(ISD, DstLT.second))
1193 return 1;
1194
1195 // Assume that illegal scalar instructions are expensive.
1196 return 4;
1197 }
1198
1199 // Check vector-to-vector casts.
1200 if (DstVTy && SrcVTy) {
1201 // If the cast is between same-sized registers, then the check is simple.
1202 if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
1203
1204 // Assume that Zext is done using AND.
1205 if (Opcode == Instruction::ZExt)
1206 return SrcLT.first;
1207
1208 // Assume that sext is done using SHL and SRA.
1209 if (Opcode == Instruction::SExt)
1210 return SrcLT.first * 2;
1211
1212 // Just check the op cost. If the operation is legal then assume it
1213 // costs 1 and multiply by the type-legalization overhead.
1215 if (!TLI->isOperationExpand(ISD, DstLT.second))
1216 return SrcLT.first * 1;
1217 }
1218
1219 // If we are legalizing by splitting, query the concrete TTI for the cost
1220 // of casting the original vector twice. We also need to factor in the
1221 // cost of the split itself. Count that as 1, to be consistent with
1222 // getTypeLegalizationCost().
1223 bool SplitSrc =
1224 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
1226 bool SplitDst =
1227 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
1229 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
1230 DstVTy->getElementCount().isVector()) {
1231 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
1232 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1233 T *TTI = static_cast<T *>(this);
1234 // If both types need to be split then the split is free.
1235 InstructionCost SplitCost =
1236 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
1237 return SplitCost +
1238 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
1239 CostKind, I));
1240 }
1241
1242 // Scalarization cost is Invalid, can't assume any num elements.
1243 if (isa<ScalableVectorType>(DstVTy))
1244 return InstructionCost::getInvalid();
1245
1246 // In other cases where the source or destination are illegal, assume
1247 // the operation will get scalarized.
1248 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1249 InstructionCost Cost = thisT()->getCastInstrCost(
1250 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
1251
1252 // Return the cost of multiple scalar invocations plus the cost of
1253 // inserting and extracting the values.
1254 return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true,
1255 CostKind) +
1256 Num * Cost;
1257 }
1258
1259 // We already handled vector-to-vector and scalar-to-scalar conversions.
1260 // This is where we handle bitcast between vectors and scalars. We need to
1261 // assume that the conversion is scalarized in one way or another.
1263 if (Opcode == Instruction::BitCast) {
1264 // Illegal bitcasts are done by storing and loading from a stack slot.
1265 return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false,
1266 /*Extract*/ true, CostKind)
1267 : 0) +
1268 (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true,
1269 /*Extract*/ false, CostKind)
1270 : 0);
1271 }
1272
1273 llvm_unreachable("Unhandled cast");
1274 }
1275
1277 VectorType *VecTy, unsigned Index) {
1279 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1280 CostKind, Index, nullptr, nullptr) +
1281 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1283 }
1284
1286 const Instruction *I = nullptr) {
1287 return BaseT::getCFInstrCost(Opcode, CostKind, I);
1288 }
1289
1291 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
1295 const Instruction *I = nullptr) {
1296 const TargetLoweringBase *TLI = getTLI();
1297 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1298 assert(ISD && "Invalid opcode");
1299
1300 // TODO: Handle other cost kinds.
1301 if (CostKind != TTI::TCK_RecipThroughput)
1302 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1303 Op1Info, Op2Info, I);
1304
1305 // Selects on vectors are actually vector selects.
1306 if (ISD == ISD::SELECT) {
1307 assert(CondTy && "CondTy must exist");
1308 if (CondTy->isVectorTy())
1309 ISD = ISD::VSELECT;
1310 }
1311 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1312
1313 if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1314 !TLI->isOperationExpand(ISD, LT.second)) {
1315 // The operation is legal. Assume it costs 1. Multiply
1316 // by the type-legalization overhead.
1317 return LT.first * 1;
1318 }
1319
1320 // Otherwise, assume that the cmp/select is scalarized.
1321 // TODO: If one of the types get legalized by splitting, handle this
1322 // similarly to what getCastInstrCost() does.
1323 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1324 if (isa<ScalableVectorType>(ValTy))
1325 return InstructionCost::getInvalid();
1326
1327 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1328 if (CondTy)
1329 CondTy = CondTy->getScalarType();
1330 InstructionCost Cost =
1331 thisT()->getCmpSelInstrCost(Opcode, ValVTy->getScalarType(), CondTy,
1332 VecPred, CostKind, Op1Info, Op2Info, I);
1333
1334 // Return the cost of multiple scalar invocations plus the cost of
1335 // inserting and extracting the values.
1336 return getScalarizationOverhead(ValVTy, /*Insert*/ true,
1337 /*Extract*/ false, CostKind) +
1338 Num * Cost;
1339 }
1340
1341 // Unknown scalar opcode.
1342 return 1;
1343 }
1344
1347 unsigned Index, Value *Op0, Value *Op1) {
1348 return getRegUsageForType(Val->getScalarType());
1349 }
1350
1351 /// \param ScalarUserAndIdx encodes the information about extracts from a
1352 /// vector with 'Scalar' being the value being extracted, 'User' being the
1353 /// user of the extract (nullptr if the user is not known before
1354 /// vectorization) and 'Idx' being the extract lane.
1356 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
1357 Value *Scalar,
1358 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx) {
1359 return thisT()->getVectorInstrCost(Opcode, Val, CostKind, Index, nullptr,
1360 nullptr);
1361 }
1362
1365 unsigned Index) {
1366 Value *Op0 = nullptr;
1367 Value *Op1 = nullptr;
1368 if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
1369 Op0 = IE->getOperand(0);
1370 Op1 = IE->getOperand(1);
1371 }
1372 return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0,
1373 Op1);
1374 }
1375
1376 InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
1377 int VF,
1378 const APInt &DemandedDstElts,
1380 assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
1381 "Unexpected size of DemandedDstElts.");
1382
1384
1385 auto *SrcVT = FixedVectorType::get(EltTy, VF);
1386 auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
1387
1388 // The Mask shuffling cost is the cost of extracting all the elements of the
1389 // Mask and inserting each of them Factor times into the wide vector:
1390 //
1391 // E.g. an interleaved group with factor 3:
1392 // %mask = icmp ult <8 x i32> %vec1, %vec2
1393 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1394 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1395 // The cost is estimated as extract all mask elements from the <8xi1> mask
1396 // vector and insert them factor times into the <24xi1> shuffled mask
1397 // vector.
1398 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
1399 Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
1400 /*Insert*/ false,
1401 /*Extract*/ true, CostKind);
1402 Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
1403 /*Insert*/ true,
1404 /*Extract*/ false, CostKind);
1405
1406 return Cost;
1407 }
1408
1410 getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
1413 const Instruction *I = nullptr) {
1414 assert(!Src->isVoidTy() && "Invalid type");
1415 // Assume types, such as structs, are expensive.
1416 if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
1417 return 4;
1418 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1419
1420 // Assuming that all loads of legal types cost 1.
1421 InstructionCost Cost = LT.first;
1422 if (CostKind != TTI::TCK_RecipThroughput)
1423 return Cost;
1424
1425 const DataLayout &DL = this->getDataLayout();
1426 if (Src->isVectorTy() &&
1427 // In practice it's not currently possible to have a change in lane
1428 // length for extending loads or truncating stores so both types should
1429 // have the same scalable property.
1431 LT.second.getSizeInBits())) {
1432 // This is a vector load that legalizes to a larger type than the vector
1433 // itself. Unless the corresponding extending load or truncating store is
1434 // legal, then this will scalarize.
1436 EVT MemVT = getTLI()->getValueType(DL, Src);
1437 if (Opcode == Instruction::Store)
1438 LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1439 else
1440 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1441
1442 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1443 // This is a vector load/store for some illegal type that is scalarized.
1444 // We must account for the cost of building or decomposing the vector.
1446 cast<VectorType>(Src), Opcode != Instruction::Store,
1447 Opcode == Instruction::Store, CostKind);
1448 }
1449 }
1450
1451 return Cost;
1452 }
1453
1455 Align Alignment, unsigned AddressSpace,
1457 // TODO: Pass on AddressSpace when we have test coverage.
1458 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1459 CostKind);
1460 }
1461
1463 const Value *Ptr, bool VariableMask,
1464 Align Alignment,
1466 const Instruction *I = nullptr) {
1467 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1468 true, CostKind);
1469 }
1470
1472 const Value *Ptr, bool VariableMask,
1473 Align Alignment,
1475 const Instruction *I) {
1476 // For a target without strided memory operations (or for an illegal
1477 // operation type on one which does), assume we lower to a gather/scatter
1478 // operation. (Which may in turn be scalarized.)
1479 return thisT()->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1480 Alignment, CostKind, I);
1481 }
1482
1484 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1485 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1486 bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1487
1488 // We cannot scalarize scalable vectors, so return Invalid.
1489 if (isa<ScalableVectorType>(VecTy))
1490 return InstructionCost::getInvalid();
1491
1492 auto *VT = cast<FixedVectorType>(VecTy);
1493
1494 unsigned NumElts = VT->getNumElements();
1495 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1496
1497 unsigned NumSubElts = NumElts / Factor;
1498 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1499
1500 // Firstly, the cost of load/store operation.
1502 if (UseMaskForCond || UseMaskForGaps)
1503 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1505 else
1506 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1507 CostKind);
1508
1509 // Legalize the vector type, and get the legalized and unlegalized type
1510 // sizes.
1511 MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
1512 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1513 unsigned VecTyLTSize = VecTyLT.getStoreSize();
1514
1515 // Scale the cost of the memory operation by the fraction of legalized
1516 // instructions that will actually be used. We shouldn't account for the
1517 // cost of dead instructions since they will be removed.
1518 //
1519 // E.g., An interleaved load of factor 8:
1520 // %vec = load <16 x i64>, <16 x i64>* %ptr
1521 // %v0 = shufflevector %vec, undef, <0, 8>
1522 //
1523 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1524 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1525 // type). The other loads are unused.
1526 //
1527 // TODO: Note that legalization can turn masked loads/stores into unmasked
1528 // (legalized) loads/stores. This can be reflected in the cost.
1529 if (Cost.isValid() && VecTySize > VecTyLTSize) {
1530 // The number of loads of a legal type it will take to represent a load
1531 // of the unlegalized vector type.
1532 unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1533
1534 // The number of elements of the unlegalized type that correspond to a
1535 // single legal instruction.
1536 unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1537
1538 // Determine which legal instructions will be used.
1539 BitVector UsedInsts(NumLegalInsts, false);
1540 for (unsigned Index : Indices)
1541 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1542 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1543
1544 // Scale the cost of the load by the fraction of legal instructions that
1545 // will be used.
1546 Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
1547 }
1548
1549 // Then add the cost of the interleave operation.
1550 assert(Indices.size() <= Factor &&
1551 "Interleaved memory op has too many members");
1552
1553 const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
1554 const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
1555
1556 APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
1557 for (unsigned Index : Indices) {
1558 assert(Index < Factor && "Invalid index for interleaved memory op");
1559 for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1560 DemandedLoadStoreElts.setBit(Index + Elm * Factor);
1561 }
1562
1563 if (Opcode == Instruction::Load) {
1564 // The interleave cost is similar to extracting the sub-vectors' elements
1565 // from the wide vector and inserting them into the sub-vectors.
1566 //
1567 // E.g. An interleaved load of factor 2 (with one member of index 0):
1568 // %vec = load <8 x i32>, <8 x i32>* %ptr
1569 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0
1570 // The cost is estimated as extracting elements at 0, 2, 4, 6 from the
1571 // <8 x i32> vector and insert them into a <4 x i32> vector.
1572 InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
1573 SubVT, DemandedAllSubElts,
1574 /*Insert*/ true, /*Extract*/ false, CostKind);
1575 Cost += Indices.size() * InsSubCost;
1576 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1577 /*Insert*/ false,
1578 /*Extract*/ true, CostKind);
1579 } else {
1580 // The interleave cost is extracting elements from the sub-vectors and
1581 // inserting them into the wide vector.
1582 //
1583 // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
1584 // (using VF=4):
1585 // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
1586 // %gaps.mask = <true, true, false, true, true, false,
1587 // true, true, false, true, true, false>
1588 // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
1589 // i32 Align, <12 x i1> %gaps.mask
1590 // The cost is estimated as extracting all elements (of actual members,
1591 // excluding gaps) from both <4 x i32> vectors and inserting them into the
1592 // <12 x i32> vector.
1593 InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
1594 SubVT, DemandedAllSubElts,
1595 /*Insert*/ false, /*Extract*/ true, CostKind);
1596 Cost += ExtSubCost * Indices.size();
1597 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1598 /*Insert*/ true,
1599 /*Extract*/ false, CostKind);
1600 }
1601
1602 if (!UseMaskForCond)
1603 return Cost;
1604
1605 Type *I8Type = Type::getInt8Ty(VT->getContext());
1606
1607 Cost += thisT()->getReplicationShuffleCost(
1608 I8Type, Factor, NumSubElts,
1609 UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
1610 CostKind);
1611
1612 // The Gaps mask is invariant and created outside the loop, therefore the
1613 // cost of creating it is not accounted for here. However if we have both
1614 // a MaskForGaps and some other mask that guards the execution of the
1615 // memory access, we need to account for the cost of And-ing the two masks
1616 // inside the loop.
1617 if (UseMaskForGaps) {
1618 auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1619 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1620 CostKind);
1621 }
1622
1623 return Cost;
1624 }
1625
1626 /// Get intrinsic cost based on arguments.
1629 // Check for generically free intrinsics.
1631 return 0;
1632
1633 // Assume that target intrinsics are cheap.
1634 Intrinsic::ID IID = ICA.getID();
1637
1638 // VP Intrinsics should have the same cost as their non-vp counterpart.
1639 // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp
1640 // counterpart when the vector length argument is smaller than the maximum
1641 // vector length.
1642 // TODO: Support other kinds of VPIntrinsics
1643 if (VPIntrinsic::isVPIntrinsic(ICA.getID())) {
1644 std::optional<unsigned> FOp =
1646 if (FOp) {
1647 if (ICA.getID() == Intrinsic::vp_load) {
1648 Align Alignment;
1649 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1650 Alignment = VPI->getPointerAlignment().valueOrOne();
1651 unsigned AS = 0;
1652 if (ICA.getArgTypes().size() > 1)
1653 if (auto *PtrTy = dyn_cast<PointerType>(ICA.getArgTypes()[0]))
1654 AS = PtrTy->getAddressSpace();
1655 return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment,
1656 AS, CostKind);
1657 }
1658 if (ICA.getID() == Intrinsic::vp_store) {
1659 Align Alignment;
1660 if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
1661 Alignment = VPI->getPointerAlignment().valueOrOne();
1662 unsigned AS = 0;
1663 if (ICA.getArgTypes().size() >= 2)
1664 if (auto *PtrTy = dyn_cast<PointerType>(ICA.getArgTypes()[1]))
1665 AS = PtrTy->getAddressSpace();
1666 return thisT()->getMemoryOpCost(*FOp, ICA.getArgTypes()[0], Alignment,
1667 AS, CostKind);
1668 }
1670 return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(),
1671 CostKind);
1672 }
1673 if (VPCastIntrinsic::isVPCast(ICA.getID())) {
1674 return thisT()->getCastInstrCost(
1675 *FOp, ICA.getReturnType(), ICA.getArgTypes()[0],
1677 }
1678 if (VPCmpIntrinsic::isVPCmp(ICA.getID())) {
1679 // We can only handle vp_cmp intrinsics with underlying instructions.
1680 if (ICA.getInst()) {
1681 assert(FOp);
1682 auto *UI = cast<VPCmpIntrinsic>(ICA.getInst());
1683 return thisT()->getCmpSelInstrCost(*FOp, ICA.getArgTypes()[0],
1684 ICA.getReturnType(),
1685 UI->getPredicate(), CostKind);
1686 }
1687 }
1688 }
1689
1690 std::optional<Intrinsic::ID> FID =
1692 if (FID) {
1693 // Non-vp version will have same arg types except mask and vector
1694 // length.
1695 assert(ICA.getArgTypes().size() >= 2 &&
1696 "Expected VPIntrinsic to have Mask and Vector Length args and "
1697 "types");
1699
1700 // VPReduction intrinsics have a start value argument that their non-vp
1701 // counterparts do not have, except for the fadd and fmul non-vp
1702 // counterpart.
1704 *FID != Intrinsic::vector_reduce_fadd &&
1705 *FID != Intrinsic::vector_reduce_fmul)
1706 NewTys = NewTys.drop_front();
1707
1708 IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewTys,
1709 ICA.getFlags());
1710 return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
1711 }
1712 }
1713
1714 if (ICA.isTypeBasedOnly())
1716
1717 Type *RetTy = ICA.getReturnType();
1718
1719 ElementCount RetVF =
1720 (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1722 const IntrinsicInst *I = ICA.getInst();
1723 const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1724 FastMathFlags FMF = ICA.getFlags();
1725 switch (IID) {
1726 default:
1727 break;
1728
1729 case Intrinsic::powi:
1730 if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
1731 bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
1732 if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
1733 ShouldOptForSize)) {
1734 // The cost is modeled on the expansion performed by ExpandPowI in
1735 // SelectionDAGBuilder.
1736 APInt Exponent = RHSC->getValue().abs();
1737 unsigned ActiveBits = Exponent.getActiveBits();
1738 unsigned PopCount = Exponent.popcount();
1739 InstructionCost Cost = (ActiveBits + PopCount - 2) *
1740 thisT()->getArithmeticInstrCost(
1741 Instruction::FMul, RetTy, CostKind);
1742 if (RHSC->isNegative())
1743 Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
1744 CostKind);
1745 return Cost;
1746 }
1747 }
1748 break;
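    // Worked example for the powi expansion cost above: powi(x, 11) has
    // |exponent| = 11 = 0b1011, so ActiveBits = 4 and PopCount = 3, giving
    // (4 + 3 - 2) = 5 fmuls (three squarings producing x^2, x^4, x^8, plus two
    // multiplies to combine x, x^2 and x^8); a negative exponent adds one fdiv.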
1749 case Intrinsic::cttz:
1750 // FIXME: If necessary, this should go in target-specific overrides.
1751 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy))
1753 break;
1754
1755 case Intrinsic::ctlz:
1756 // FIXME: If necessary, this should go in target-specific overrides.
1757 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy))
1759 break;
1760
1761 case Intrinsic::memcpy:
1762 return thisT()->getMemcpyCost(ICA.getInst());
1763
1764 case Intrinsic::masked_scatter: {
1765 const Value *Mask = Args[3];
1766 bool VarMask = !isa<Constant>(Mask);
1767 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1768 return thisT()->getGatherScatterOpCost(Instruction::Store,
1769 ICA.getArgTypes()[0], Args[1],
1770 VarMask, Alignment, CostKind, I);
1771 }
1772 case Intrinsic::masked_gather: {
1773 const Value *Mask = Args[2];
1774 bool VarMask = !isa<Constant>(Mask);
1775 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1776 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1777 VarMask, Alignment, CostKind, I);
1778 }
1779 case Intrinsic::experimental_vp_strided_store: {
1780 const Value *Data = Args[0];
1781 const Value *Ptr = Args[1];
1782 const Value *Mask = Args[3];
1783 const Value *EVL = Args[4];
1784 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
1785 Type *EltTy = cast<VectorType>(Data->getType())->getElementType();
1786 Align Alignment =
1787 I->getParamAlign(1).value_or(thisT()->DL.getABITypeAlign(EltTy));
1788 return thisT()->getStridedMemoryOpCost(Instruction::Store,
1789 Data->getType(), Ptr, VarMask,
1790 Alignment, CostKind, I);
1791 }
1792 case Intrinsic::experimental_vp_strided_load: {
1793 const Value *Ptr = Args[0];
1794 const Value *Mask = Args[2];
1795 const Value *EVL = Args[3];
1796 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL);
1797 Type *EltTy = cast<VectorType>(RetTy)->getElementType();
1798 Align Alignment =
1799 I->getParamAlign(0).value_or(thisT()->DL.getABITypeAlign(EltTy));
1800 return thisT()->getStridedMemoryOpCost(Instruction::Load, RetTy, Ptr,
1801 VarMask, Alignment, CostKind, I);
1802 }
1803 case Intrinsic::stepvector: {
1804 if (isa<ScalableVectorType>(RetTy))
1806 // The cost of materialising a constant integer vector.
1808 }
1809 case Intrinsic::vector_extract: {
1810 // FIXME: Handle case where a scalable vector is extracted from a scalable
1811 // vector
1812 if (isa<ScalableVectorType>(RetTy))
1814 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1815 return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
1816 cast<VectorType>(Args[0]->getType()), {},
1817 CostKind, Index, cast<VectorType>(RetTy));
1818 }
1819 case Intrinsic::vector_insert: {
1820 // FIXME: Handle case where a scalable vector is inserted into a scalable
1821 // vector
1822 if (isa<ScalableVectorType>(Args[1]->getType()))
1824 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1825 return thisT()->getShuffleCost(
1826 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), {},
1827 CostKind, Index, cast<VectorType>(Args[1]->getType()));
1828 }
1829 case Intrinsic::vector_reverse: {
1830 return thisT()->getShuffleCost(TTI::SK_Reverse,
1831 cast<VectorType>(Args[0]->getType()), {},
1832 CostKind, 0, cast<VectorType>(RetTy));
1833 }
1834 case Intrinsic::vector_splice: {
1835 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1836 return thisT()->getShuffleCost(TTI::SK_Splice,
1837 cast<VectorType>(Args[0]->getType()), {},
1838 CostKind, Index, cast<VectorType>(RetTy));
1839 }
1840 case Intrinsic::vector_reduce_add:
1841 case Intrinsic::vector_reduce_mul:
1842 case Intrinsic::vector_reduce_and:
1843 case Intrinsic::vector_reduce_or:
1844 case Intrinsic::vector_reduce_xor:
1845 case Intrinsic::vector_reduce_smax:
1846 case Intrinsic::vector_reduce_smin:
1847 case Intrinsic::vector_reduce_fmax:
1848 case Intrinsic::vector_reduce_fmin:
1849 case Intrinsic::vector_reduce_fmaximum:
1850 case Intrinsic::vector_reduce_fminimum:
1851 case Intrinsic::vector_reduce_umax:
1852 case Intrinsic::vector_reduce_umin: {
1853 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1855 }
1856 case Intrinsic::vector_reduce_fadd:
1857 case Intrinsic::vector_reduce_fmul: {
1859 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1861 }
1862 case Intrinsic::fshl:
1863 case Intrinsic::fshr: {
1864 const Value *X = Args[0];
1865 const Value *Y = Args[1];
1866 const Value *Z = Args[2];
1867      const TTI::OperandValueInfo OpInfoX = TTI::getOperandInfo(X);
1868      const TTI::OperandValueInfo OpInfoY = TTI::getOperandInfo(Y);
1869      const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z);
1870      const TTI::OperandValueInfo OpInfoBW =
1871          {TTI::OK_UniformConstantValue,
1872 isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1873 : TTI::OP_None};
1874
1875 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1876 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
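      // For example (cost sketch): a funnel shift with a constant splat shift
      // amount is modelled as or + sub + shl + lshr; a variable amount adds a
      // urem for the modulo, and a non-rotate (X != Y) further adds an
      // icmp + select for the shift-by-zero case.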
1877      InstructionCost Cost = 0;
1878      Cost +=
1879 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1880 Cost +=
1881 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1882 Cost += thisT()->getArithmeticInstrCost(
1883 BinaryOperator::Shl, RetTy, CostKind, OpInfoX,
1884 {OpInfoZ.Kind, TTI::OP_None});
1885 Cost += thisT()->getArithmeticInstrCost(
1886 BinaryOperator::LShr, RetTy, CostKind, OpInfoY,
1887 {OpInfoZ.Kind, TTI::OP_None});
1888      // Non-constant shift amounts require a modulo.
1889 if (!OpInfoZ.isConstant())
1890 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1891 CostKind, OpInfoZ, OpInfoBW);
1892 // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1893 if (X != Y) {
1894 Type *CondTy = RetTy->getWithNewBitWidth(1);
1895 Cost +=
1896 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1898 Cost +=
1899 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1901 }
1902 return Cost;
1903 }
1904 case Intrinsic::get_active_lane_mask: {
1905 EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
1906 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1907
1908 // If we're not expanding the intrinsic then we assume this is cheap
1909 // to implement.
1910 if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) {
1911 return getTypeLegalizationCost(RetTy).first;
1912 }
1913
1914 // Create the expanded types that will be used to calculate the uadd_sat
1915 // operation.
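      // Sketch of the assumed expansion being costed here:
      //   %lanes = uadd.sat(splat(%base), stepvector)   ; in the index type
      //   %mask  = icmp ult %lanes, splat(%n)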
1916 Type *ExpRetTy = VectorType::get(
1917 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1918 IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
1920 thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1921 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
1923 return Cost;
1924 }
1925 case Intrinsic::experimental_cttz_elts: {
1926 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1927
1928 // If we're not expanding the intrinsic then we assume this is cheap
1929 // to implement.
1930 if (!getTLI()->shouldExpandCttzElements(ArgType))
1931 return getTypeLegalizationCost(RetTy).first;
1932
1933 // TODO: The costs below reflect the expansion code in
1934 // SelectionDAGBuilder, but we may want to sacrifice some accuracy in
1935 // favour of compile time.
1936
1937 // Find the smallest "sensible" element type to use for the expansion.
1938 bool ZeroIsPoison = !cast<ConstantInt>(Args[1])->isZero();
1939 ConstantRange VScaleRange(APInt(64, 1), APInt::getZero(64));
1940 if (isa<ScalableVectorType>(ICA.getArgTypes()[0]) && I && I->getCaller())
1941 VScaleRange = getVScaleRange(I->getCaller(), 64);
1942
1943 unsigned EltWidth = getTLI()->getBitWidthForCttzElements(
1944 RetTy, ArgType.getVectorElementCount(), ZeroIsPoison, &VScaleRange);
1945 Type *NewEltTy = IntegerType::getIntNTy(RetTy->getContext(), EltWidth);
1946
1947 // Create the new vector type & get the vector length
1948 Type *NewVecTy = VectorType::get(
1949 NewEltTy, cast<VectorType>(Args[0]->getType())->getElementCount());
1950
1951 IntrinsicCostAttributes StepVecAttrs(Intrinsic::stepvector, NewVecTy, {},
1952 FMF);
1954 thisT()->getIntrinsicInstrCost(StepVecAttrs, CostKind);
1955
1956 Cost +=
1957 thisT()->getArithmeticInstrCost(Instruction::Sub, NewVecTy, CostKind);
1958 Cost += thisT()->getCastInstrCost(Instruction::SExt, NewVecTy,
1959 Args[0]->getType(),
1961 Cost +=
1962 thisT()->getArithmeticInstrCost(Instruction::And, NewVecTy, CostKind);
1963
1964 IntrinsicCostAttributes ReducAttrs(Intrinsic::vector_reduce_umax,
1965 NewEltTy, NewVecTy, FMF, I, 1);
1966 Cost += thisT()->getTypeBasedIntrinsicInstrCost(ReducAttrs, CostKind);
1967 Cost +=
1968 thisT()->getArithmeticInstrCost(Instruction::Sub, NewEltTy, CostKind);
1969
1970 return Cost;
1971 }
1972 case Intrinsic::experimental_vector_match:
1973 return thisT()->getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1974 }
1975
1976    // Assume that we need to scalarize this intrinsic.
1977 // Compute the scalarization overhead based on Args for a vector
1978 // intrinsic.
1979 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1980 if (RetVF.isVector() && !RetVF.isScalable()) {
1981 ScalarizationCost = 0;
1982 if (!RetTy->isVoidTy())
1983 ScalarizationCost += getScalarizationOverhead(
1984 cast<VectorType>(RetTy),
1985 /*Insert*/ true, /*Extract*/ false, CostKind);
1986      ScalarizationCost +=
1987          getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
1988 }
1989
1990 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1991 ScalarizationCost);
1992 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1993 }
1994
1995 /// Get intrinsic cost based on argument types.
1996 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1997 /// cost of scalarizing the arguments and the return value will be computed
1998 /// based on types.
1999    InstructionCost
2000    getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
2001                                   TTI::TargetCostKind CostKind) {
2002      Intrinsic::ID IID = ICA.getID();
2003 Type *RetTy = ICA.getReturnType();
2004 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
2005 FastMathFlags FMF = ICA.getFlags();
2006 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
2007 bool SkipScalarizationCost = ICA.skipScalarizationCost();
2008
2009 VectorType *VecOpTy = nullptr;
2010 if (!Tys.empty()) {
2011 // The vector reduction operand is operand 0 except for fadd/fmul.
2012 // Their operand 0 is a scalar start value, so the vector op is operand 1.
2013 unsigned VecTyIndex = 0;
2014 if (IID == Intrinsic::vector_reduce_fadd ||
2015 IID == Intrinsic::vector_reduce_fmul)
2016 VecTyIndex = 1;
2017 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
2018 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
2019 }
2020
2021 // Library call cost - other than size, make it expensive.
2022 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
2023 unsigned ISD = 0;
2024 switch (IID) {
2025 default: {
2026 // Scalable vectors cannot be scalarized, so return Invalid.
2027 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2028 return isa<ScalableVectorType>(Ty);
2029 }))
2031
2032 // Assume that we need to scalarize this intrinsic.
2033 InstructionCost ScalarizationCost =
2034 SkipScalarizationCost ? ScalarizationCostPassed : 0;
2035 unsigned ScalarCalls = 1;
2036 Type *ScalarRetTy = RetTy;
2037 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2038 if (!SkipScalarizationCost)
2039 ScalarizationCost = getScalarizationOverhead(
2040 RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind);
2041 ScalarCalls = std::max(ScalarCalls,
2042 cast<FixedVectorType>(RetVTy)->getNumElements());
2043 ScalarRetTy = RetTy->getScalarType();
2044 }
2045 SmallVector<Type *, 4> ScalarTys;
2046 for (Type *Ty : Tys) {
2047 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
2048 if (!SkipScalarizationCost)
2049 ScalarizationCost += getScalarizationOverhead(
2050 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
2051 ScalarCalls = std::max(ScalarCalls,
2052 cast<FixedVectorType>(VTy)->getNumElements());
2053 Ty = Ty->getScalarType();
2054 }
2055 ScalarTys.push_back(Ty);
2056 }
2057 if (ScalarCalls == 1)
2058 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
2059
2060 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
2061 InstructionCost ScalarCost =
2062 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
2063
2064 return ScalarCalls * ScalarCost + ScalarizationCost;
2065 }
2066 // Look for intrinsics that can be lowered directly or turned into a scalar
2067 // intrinsic call.
2068 case Intrinsic::sqrt:
2069 ISD = ISD::FSQRT;
2070 break;
2071 case Intrinsic::sin:
2072 ISD = ISD::FSIN;
2073 break;
2074 case Intrinsic::cos:
2075 ISD = ISD::FCOS;
2076 break;
2077 case Intrinsic::sincos:
2078 ISD = ISD::FSINCOS;
2079 break;
2080 case Intrinsic::tan:
2081 ISD = ISD::FTAN;
2082 break;
2083 case Intrinsic::asin:
2084 ISD = ISD::FASIN;
2085 break;
2086 case Intrinsic::acos:
2087 ISD = ISD::FACOS;
2088 break;
2089 case Intrinsic::atan:
2090 ISD = ISD::FATAN;
2091 break;
2092 case Intrinsic::atan2:
2093 ISD = ISD::FATAN2;
2094 break;
2095 case Intrinsic::sinh:
2096 ISD = ISD::FSINH;
2097 break;
2098 case Intrinsic::cosh:
2099 ISD = ISD::FCOSH;
2100 break;
2101 case Intrinsic::tanh:
2102 ISD = ISD::FTANH;
2103 break;
2104 case Intrinsic::exp:
2105 ISD = ISD::FEXP;
2106 break;
2107 case Intrinsic::exp2:
2108 ISD = ISD::FEXP2;
2109 break;
2110 case Intrinsic::exp10:
2111 ISD = ISD::FEXP10;
2112 break;
2113 case Intrinsic::log:
2114 ISD = ISD::FLOG;
2115 break;
2116 case Intrinsic::log10:
2117 ISD = ISD::FLOG10;
2118 break;
2119 case Intrinsic::log2:
2120 ISD = ISD::FLOG2;
2121 break;
2122 case Intrinsic::fabs:
2123 ISD = ISD::FABS;
2124 break;
2125 case Intrinsic::canonicalize:
2126 ISD = ISD::FCANONICALIZE;
2127 break;
2128 case Intrinsic::minnum:
2129 ISD = ISD::FMINNUM;
2130 break;
2131 case Intrinsic::maxnum:
2132 ISD = ISD::FMAXNUM;
2133 break;
2134 case Intrinsic::minimum:
2135 ISD = ISD::FMINIMUM;
2136 break;
2137 case Intrinsic::maximum:
2138 ISD = ISD::FMAXIMUM;
2139 break;
2140 case Intrinsic::minimumnum:
2141 ISD = ISD::FMINIMUMNUM;
2142 break;
2143 case Intrinsic::maximumnum:
2144 ISD = ISD::FMAXIMUMNUM;
2145 break;
2146 case Intrinsic::copysign:
2147 ISD = ISD::FCOPYSIGN;
2148 break;
2149 case Intrinsic::floor:
2150 ISD = ISD::FFLOOR;
2151 break;
2152 case Intrinsic::ceil:
2153 ISD = ISD::FCEIL;
2154 break;
2155 case Intrinsic::trunc:
2156 ISD = ISD::FTRUNC;
2157 break;
2158 case Intrinsic::nearbyint:
2159 ISD = ISD::FNEARBYINT;
2160 break;
2161 case Intrinsic::rint:
2162 ISD = ISD::FRINT;
2163 break;
2164 case Intrinsic::lrint:
2165 ISD = ISD::LRINT;
2166 break;
2167 case Intrinsic::llrint:
2168 ISD = ISD::LLRINT;
2169 break;
2170 case Intrinsic::round:
2171 ISD = ISD::FROUND;
2172 break;
2173 case Intrinsic::roundeven:
2174 ISD = ISD::FROUNDEVEN;
2175 break;
2176 case Intrinsic::pow:
2177 ISD = ISD::FPOW;
2178 break;
2179 case Intrinsic::fma:
2180 ISD = ISD::FMA;
2181 break;
2182 case Intrinsic::fmuladd:
2183 ISD = ISD::FMA;
2184 break;
2185 case Intrinsic::experimental_constrained_fmuladd:
2186 ISD = ISD::STRICT_FMA;
2187 break;
2188 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
2189 case Intrinsic::lifetime_start:
2190 case Intrinsic::lifetime_end:
2191 case Intrinsic::sideeffect:
2192 case Intrinsic::pseudoprobe:
2193 case Intrinsic::arithmetic_fence:
2194 return 0;
2195 case Intrinsic::masked_store: {
2196 Type *Ty = Tys[0];
2197 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2198 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
2199 CostKind);
2200 }
2201 case Intrinsic::masked_load: {
2202 Type *Ty = RetTy;
2203 Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
2204 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
2205 CostKind);
2206 }
2207 case Intrinsic::vector_reduce_add:
2208 case Intrinsic::vector_reduce_mul:
2209 case Intrinsic::vector_reduce_and:
2210 case Intrinsic::vector_reduce_or:
2211 case Intrinsic::vector_reduce_xor:
2212 return thisT()->getArithmeticReductionCost(
2213 getArithmeticReductionInstruction(IID), VecOpTy, std::nullopt,
2214 CostKind);
2215 case Intrinsic::vector_reduce_fadd:
2216 case Intrinsic::vector_reduce_fmul:
2217 return thisT()->getArithmeticReductionCost(
2218 getArithmeticReductionInstruction(IID), VecOpTy, FMF, CostKind);
2219 case Intrinsic::vector_reduce_smax:
2220 case Intrinsic::vector_reduce_smin:
2221 case Intrinsic::vector_reduce_umax:
2222 case Intrinsic::vector_reduce_umin:
2223 case Intrinsic::vector_reduce_fmax:
2224 case Intrinsic::vector_reduce_fmin:
2225 case Intrinsic::vector_reduce_fmaximum:
2226 case Intrinsic::vector_reduce_fminimum:
2227 return thisT()->getMinMaxReductionCost(getMinMaxReductionIntrinsicOp(IID),
2228 VecOpTy, ICA.getFlags(), CostKind);
2229 case Intrinsic::experimental_vector_match: {
2230 auto *SearchTy = cast<VectorType>(ICA.getArgTypes()[0]);
2231 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
2232 unsigned SearchSize = NeedleTy->getNumElements();
2233
2234 // If we're not expanding the intrinsic then we assume this is cheap to
2235 // implement.
2236 EVT SearchVT = getTLI()->getValueType(DL, SearchTy);
2237 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize))
2238 return getTypeLegalizationCost(RetTy).first;
2239
2240 // Approximate the cost based on the expansion code in
2241 // SelectionDAGBuilder.
2243 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, NeedleTy,
2244 CostKind, 1, nullptr, nullptr);
2245 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SearchTy,
2246 CostKind, 0, nullptr, nullptr);
2247 Cost += thisT()->getShuffleCost(TTI::SK_Broadcast, SearchTy, std::nullopt,
2248 CostKind, 0, nullptr);
2249 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SearchTy, RetTy,
2251 Cost +=
2252 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
2253 Cost *= SearchSize;
2254 Cost +=
2255 thisT()->getArithmeticInstrCost(BinaryOperator::And, RetTy, CostKind);
2256 return Cost;
2257 }
2258 case Intrinsic::abs:
2259 ISD = ISD::ABS;
2260 break;
2261 case Intrinsic::smax:
2262 ISD = ISD::SMAX;
2263 break;
2264 case Intrinsic::smin:
2265 ISD = ISD::SMIN;
2266 break;
2267 case Intrinsic::umax:
2268 ISD = ISD::UMAX;
2269 break;
2270 case Intrinsic::umin:
2271 ISD = ISD::UMIN;
2272 break;
2273 case Intrinsic::sadd_sat:
2274 ISD = ISD::SADDSAT;
2275 break;
2276 case Intrinsic::ssub_sat:
2277 ISD = ISD::SSUBSAT;
2278 break;
2279 case Intrinsic::uadd_sat:
2280 ISD = ISD::UADDSAT;
2281 break;
2282 case Intrinsic::usub_sat:
2283 ISD = ISD::USUBSAT;
2284 break;
2285 case Intrinsic::smul_fix:
2286 ISD = ISD::SMULFIX;
2287 break;
2288 case Intrinsic::umul_fix:
2289 ISD = ISD::UMULFIX;
2290 break;
2291 case Intrinsic::sadd_with_overflow:
2292 ISD = ISD::SADDO;
2293 break;
2294 case Intrinsic::ssub_with_overflow:
2295 ISD = ISD::SSUBO;
2296 break;
2297 case Intrinsic::uadd_with_overflow:
2298 ISD = ISD::UADDO;
2299 break;
2300 case Intrinsic::usub_with_overflow:
2301 ISD = ISD::USUBO;
2302 break;
2303 case Intrinsic::smul_with_overflow:
2304 ISD = ISD::SMULO;
2305 break;
2306 case Intrinsic::umul_with_overflow:
2307 ISD = ISD::UMULO;
2308 break;
2309 case Intrinsic::fptosi_sat:
2310 ISD = ISD::FP_TO_SINT_SAT;
2311 break;
2312 case Intrinsic::fptoui_sat:
2313 ISD = ISD::FP_TO_UINT_SAT;
2314 break;
2315 case Intrinsic::ctpop:
2316 ISD = ISD::CTPOP;
2317 // In case of legalization use TCC_Expensive. This is cheaper than a
2318 // library call but still not a cheap instruction.
2319 SingleCallCost = TargetTransformInfo::TCC_Expensive;
2320 break;
2321 case Intrinsic::ctlz:
2322 ISD = ISD::CTLZ;
2323 break;
2324 case Intrinsic::cttz:
2325 ISD = ISD::CTTZ;
2326 break;
2327 case Intrinsic::bswap:
2328 ISD = ISD::BSWAP;
2329 break;
2330 case Intrinsic::bitreverse:
2331 ISD = ISD::BITREVERSE;
2332 break;
2333 case Intrinsic::ucmp:
2334 ISD = ISD::UCMP;
2335 break;
2336 case Intrinsic::scmp:
2337 ISD = ISD::SCMP;
2338 break;
2339 }
2340
2341 auto *ST = dyn_cast<StructType>(RetTy);
2342 Type *LegalizeTy = ST ? ST->getContainedType(0) : RetTy;
2343 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(LegalizeTy);
2344
2345 const TargetLoweringBase *TLI = getTLI();
2346
2347 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
2348 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
2349 TLI->isFAbsFree(LT.second)) {
2350 return 0;
2351 }
2352
2353 // The operation is legal. Assume it costs 1.
2354 // If the type is split to multiple registers, assume that there is some
2355 // overhead to this.
2356 // TODO: Once we have extract/insert subvector cost we need to use them.
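      // For example: if LT.first == 2 (say an op on <8 x float> split into two
      // <4 x float> registers on a hypothetical 128-bit target), the cost
      // returned is 2 * 2 == 4, while an unsplit legal op costs 1.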
2357 if (LT.first > 1)
2358 return (LT.first * 2);
2359 else
2360 return (LT.first * 1);
2361 } else if (!TLI->isOperationExpand(ISD, LT.second)) {
2362 // If the operation is custom lowered then assume
2363 // that the code is twice as expensive.
2364 return (LT.first * 2);
2365 }
2366
2367 switch (IID) {
2368 case Intrinsic::fmuladd: {
2369      // If we can't lower fmuladd into an FMA, estimate the cost as a floating
2370      // point mul followed by an add.
2371
2372 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
2373 CostKind) +
2374 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
2375 CostKind);
2376 }
2377 case Intrinsic::experimental_constrained_fmuladd: {
2378 IntrinsicCostAttributes FMulAttrs(
2379 Intrinsic::experimental_constrained_fmul, RetTy, Tys);
2380 IntrinsicCostAttributes FAddAttrs(
2381 Intrinsic::experimental_constrained_fadd, RetTy, Tys);
2382 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
2383 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
2384 }
2385 case Intrinsic::smin:
2386 case Intrinsic::smax:
2387 case Intrinsic::umin:
2388 case Intrinsic::umax: {
2389 // minmax(X,Y) = select(icmp(X,Y),X,Y)
2390 Type *CondTy = RetTy->getWithNewBitWidth(1);
2391 bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
2392 CmpInst::Predicate Pred =
2393 IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
2395 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2396 Pred, CostKind);
2397 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2398 Pred, CostKind);
2399 return Cost;
2400 }
2401 case Intrinsic::sadd_with_overflow:
2402 case Intrinsic::ssub_with_overflow: {
2403 Type *SumTy = RetTy->getContainedType(0);
2404 Type *OverflowTy = RetTy->getContainedType(1);
2405 unsigned Opcode = IID == Intrinsic::sadd_with_overflow
2406 ? BinaryOperator::Add
2407 : BinaryOperator::Sub;
2408
2409 // Add:
2410 // Overflow -> (Result < LHS) ^ (RHS < 0)
2411 // Sub:
2412 // Overflow -> (Result < LHS) ^ (RHS > 0)
2414 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2415 Cost +=
2416 2 * thisT()->getCmpSelInstrCost(Instruction::ICmp, SumTy, OverflowTy,
2418 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
2419 CostKind);
2420 return Cost;
2421 }
2422 case Intrinsic::uadd_with_overflow:
2423 case Intrinsic::usub_with_overflow: {
2424 Type *SumTy = RetTy->getContainedType(0);
2425 Type *OverflowTy = RetTy->getContainedType(1);
2426 unsigned Opcode = IID == Intrinsic::uadd_with_overflow
2427 ? BinaryOperator::Add
2428 : BinaryOperator::Sub;
2429 CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
2432
2434 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2435 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy,
2436 OverflowTy, Pred, CostKind);
2437 return Cost;
2438 }
2439 case Intrinsic::smul_with_overflow:
2440 case Intrinsic::umul_with_overflow: {
2441 Type *MulTy = RetTy->getContainedType(0);
2442 Type *OverflowTy = RetTy->getContainedType(1);
2443 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
2444 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
2445 bool IsSigned = IID == Intrinsic::smul_with_overflow;
2446
2447 unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
2449
2451 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
2452 Cost +=
2453 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2454 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
2455 CCH, CostKind);
2456 Cost += thisT()->getArithmeticInstrCost(
2457 Instruction::LShr, ExtTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2459
2460 if (IsSigned)
2461 Cost += thisT()->getArithmeticInstrCost(
2462 Instruction::AShr, MulTy, CostKind,
2465
2466 Cost += thisT()->getCmpSelInstrCost(
2467 BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
2468 return Cost;
2469 }
2470 case Intrinsic::sadd_sat:
2471 case Intrinsic::ssub_sat: {
2472 // Assume a default expansion.
2473 Type *CondTy = RetTy->getWithNewBitWidth(1);
2474
2475 Type *OpTy = StructType::create({RetTy, CondTy});
2476 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
2477 ? Intrinsic::sadd_with_overflow
2478 : Intrinsic::ssub_with_overflow;
2480
2481 // SatMax -> Overflow && SumDiff < 0
2482 // SatMin -> Overflow && SumDiff >= 0
2484 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2485 nullptr, ScalarizationCostPassed);
2486 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2487 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2488 Pred, CostKind);
2489 Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
2490 CondTy, Pred, CostKind);
2491 return Cost;
2492 }
2493 case Intrinsic::uadd_sat:
2494 case Intrinsic::usub_sat: {
2495 Type *CondTy = RetTy->getWithNewBitWidth(1);
2496
2497 Type *OpTy = StructType::create({RetTy, CondTy});
2498 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
2499 ? Intrinsic::uadd_with_overflow
2500 : Intrinsic::usub_with_overflow;
2501
2503 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
2504 nullptr, ScalarizationCostPassed);
2505 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2506 Cost +=
2507 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2509 return Cost;
2510 }
2511 case Intrinsic::smul_fix:
2512 case Intrinsic::umul_fix: {
2513 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
2514 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
2515
2516 unsigned ExtOp =
2517 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
2519
2521 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
2522 Cost +=
2523 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2524 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
2525 CCH, CostKind);
2526 Cost += thisT()->getArithmeticInstrCost(
2527 Instruction::LShr, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2529 Cost += thisT()->getArithmeticInstrCost(
2530 Instruction::Shl, RetTy, CostKind, {TTI::OK_AnyValue, TTI::OP_None},
2532 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
2533 return Cost;
2534 }
2535 case Intrinsic::abs: {
2536 // abs(X) = select(icmp(X,0),X,sub(0,X))
2537 Type *CondTy = RetTy->getWithNewBitWidth(1);
2540 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
2541 Pred, CostKind);
2542 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2543 Pred, CostKind);
2544 // TODO: Should we add an OperandValueProperties::OP_Zero property?
2545 Cost += thisT()->getArithmeticInstrCost(
2546 BinaryOperator::Sub, RetTy, CostKind,
2548 return Cost;
2549 }
2550 case Intrinsic::fptosi_sat:
2551 case Intrinsic::fptoui_sat: {
2552 if (Tys.empty())
2553 break;
2554 Type *FromTy = Tys[0];
2555 bool IsSigned = IID == Intrinsic::fptosi_sat;
2556
2558 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy,
2559 {FromTy, FromTy});
2560 Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind);
2561 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy,
2562 {FromTy, FromTy});
2563 Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind);
2564 Cost += thisT()->getCastInstrCost(
2565 IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy,
2567 if (IsSigned) {
2568 Type *CondTy = RetTy->getWithNewBitWidth(1);
2569 Cost += thisT()->getCmpSelInstrCost(
2570 BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2571 Cost += thisT()->getCmpSelInstrCost(
2572 BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2573 }
2574 return Cost;
2575 }
2576 case Intrinsic::ucmp:
2577 case Intrinsic::scmp: {
2578 Type *CmpTy = Tys[0];
2579 Type *CondTy = RetTy->getWithNewBitWidth(1);
2581 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CmpTy, CondTy,
2583 CostKind) +
2584 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, CmpTy, CondTy,
2586 CostKind);
2587
2588 EVT VT = TLI->getValueType(DL, CmpTy, true);
2589 if (TLI->shouldExpandCmpUsingSelects(VT)) {
2590 // x < y ? -1 : (x > y ? 1 : 0)
2591 Cost += 2 * thisT()->getCmpSelInstrCost(
2592 BinaryOperator::Select, RetTy, CondTy,
2594 } else {
2595 // zext(x > y) - zext(x < y)
2596 Cost +=
2597 2 * thisT()->getCastInstrCost(CastInst::ZExt, RetTy, CondTy,
2599 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy,
2600 CostKind);
2601 }
2602 return Cost;
2603 }
2604 default:
2605 break;
2606 }
2607
2608 // Else, assume that we need to scalarize this intrinsic. For math builtins
2609 // this will emit a costly libcall, adding call overhead and spills. Make it
2610 // very expensive.
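    // For example: a <4 x float> math call that must be scalarized is costed
    // as 4 scalar intrinsic calls (typically SingleCallCost each) plus the
    // insert/extract scalarization overhead computed below.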
2611 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2612 // Scalable vectors cannot be scalarized, so return Invalid.
2613 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2614 return isa<ScalableVectorType>(Ty);
2615 }))
2617
2618 InstructionCost ScalarizationCost =
2619 SkipScalarizationCost
2620 ? ScalarizationCostPassed
2621 : getScalarizationOverhead(RetVTy, /*Insert*/ true,
2622 /*Extract*/ false, CostKind);
2623
2624 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
2625 SmallVector<Type *, 4> ScalarTys;
2626 for (Type *Ty : Tys) {
2627 if (Ty->isVectorTy())
2628 Ty = Ty->getScalarType();
2629 ScalarTys.push_back(Ty);
2630 }
2631 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
2632 InstructionCost ScalarCost =
2633 thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2634 for (Type *Ty : Tys) {
2635 if (auto *VTy = dyn_cast<VectorType>(Ty)) {
2636 if (!ICA.skipScalarizationCost())
2637 ScalarizationCost += getScalarizationOverhead(
2638 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
2639 ScalarCalls = std::max(ScalarCalls,
2640 cast<FixedVectorType>(VTy)->getNumElements());
2641 }
2642 }
2643 return ScalarCalls * ScalarCost + ScalarizationCost;
2644 }
2645
2646 // This is going to be turned into a library call, make it expensive.
2647 return SingleCallCost;
2648 }
2649
2650 /// Compute a cost of the given call instruction.
2651 ///
2652 /// Compute the cost of calling function F with return type RetTy and
2653 /// argument types Tys. F might be nullptr, in this case the cost of an
2654 /// arbitrary call with the specified signature will be returned.
2655 /// This is used, for instance, when we estimate call of a vector
2656 /// counterpart of the given function.
2657 /// \param F Called function, might be nullptr.
2658 /// \param RetTy Return value types.
2659 /// \param Tys Argument types.
2660 /// \returns The cost of Call instruction.
2661  InstructionCost getCallInstrCost(Function *F, Type *RetTy,
2662                                   ArrayRef<Type *> Tys,
2663                                   TTI::TargetCostKind CostKind) {
2664 return 10;
2665 }
2666
2667 unsigned getNumberOfParts(Type *Tp) {
2668 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
2669 if (!LT.first.isValid())
2670 return 0;
2671    // Try to find the actual number of parts for non-power-of-2 elements as
2672 // ceil(num-of-elements/num-of-subtype-elements).
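    // For example (assumed legalization): a <12 x i32> whose legal register
    // type is <4 x i32> is reported as ceil(12 / 4) == 3 parts, instead of
    // the 4 parts implied by first widening to <16 x i32>.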
2673 if (auto *FTp = dyn_cast<FixedVectorType>(Tp);
2674 Tp && LT.second.isFixedLengthVector() &&
2675 !has_single_bit(FTp->getNumElements())) {
2676 if (auto *SubTp = dyn_cast_if_present<FixedVectorType>(
2677 EVT(LT.second).getTypeForEVT(Tp->getContext()));
2678 SubTp && SubTp->getElementType() == FTp->getElementType())
2679 return divideCeil(FTp->getNumElements(), SubTp->getNumElements());
2680 }
2681 return *LT.first.getValue();
2682 }
2683
2684  InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
2685                                            const SCEV *) {
2686 return 0;
2687 }
2688
2689 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
2690/// We assume that the reduction operation is performed in the following way:
2691 ///
2692 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
2693/// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n - 1, i32 undef, ..., i32 undef>
2694 /// \----------------v-------------/ \----------v------------/
2695 /// n/2 elements n/2 elements
2696 /// %red1 = op <n x t> %val, <n x t> val1
2697 /// After this operation we have a vector %red1 where only the first n/2
2698 /// elements are meaningful, the second n/2 elements are undefined and can be
2699 /// dropped. All other operations are actually working with the vector of
2700 /// length n/2, not n, though the real vector length is still n.
2701 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2702/// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2 - 1, i32 undef, ..., i32 undef>
2703 /// \----------------v-------------/ \----------v------------/
2704 /// n/4 elements 3*n/4 elements
2705 /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of
2706 /// length n/2, the resulting vector has length n/4 etc.
2707 ///
2708 /// The cost model should take into account that the actual length of the
2709 /// vector is reduced on each iteration.
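  /// For example (illustrative, assuming <4 x i32> is the widest legal type):
  /// an add reduction of <8 x i32> is costed as one extract-subvector shuffle
  /// plus one <4 x i32> add to reach the legal width, then log2(4) == 2 more
  /// shuffle + add levels on <4 x i32>, plus a final extractelement of lane 0.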
2710  InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
2711                                       TTI::TargetCostKind CostKind) {
2712    // Targets must implement a default value for the scalable case, since
2713 // we don't know how many lanes the vector has.
2714    if (isa<ScalableVectorType>(Ty))
2715      return InstructionCost::getInvalid();
2716
2717 Type *ScalarTy = Ty->getElementType();
2718 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2719 if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2720 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2721 NumVecElts >= 2) {
2722 // Or reduction for i1 is represented as:
2723 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2724 // %res = cmp ne iReduxWidth %val, 0
2725 // And reduction for i1 is represented as:
2726 // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2727      // %res = cmp eq iReduxWidth %val, -1 (i.e. all bits set)
2728 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2729 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2731 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2734 }
2735 unsigned NumReduxLevels = Log2_32(NumVecElts);
2736 InstructionCost ArithCost = 0;
2737 InstructionCost ShuffleCost = 0;
2738 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2739 unsigned LongVectorCount = 0;
2740 unsigned MVTLen =
2741 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2742 while (NumVecElts > MVTLen) {
2743 NumVecElts /= 2;
2744 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2745 ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
2746 CostKind, NumVecElts, SubTy);
2747 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2748 Ty = SubTy;
2749 ++LongVectorCount;
2750 }
2751
2752 NumReduxLevels -= LongVectorCount;
2753
2754 // The minimal length of the vector is limited by the real length of vector
2755 // operations performed on the current platform. That's why several final
2756 // reduction operations are performed on the vectors with the same
2757 // architecture-dependent length.
2758
2759 // By default reductions need one shuffle per reduction level.
2760 ShuffleCost +=
2761 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2762 {}, CostKind, 0, Ty);
2763 ArithCost +=
2764 NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
2765 return ShuffleCost + ArithCost +
2766 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2767 CostKind, 0, nullptr, nullptr);
2768 }
2769
2770 /// Try to calculate the cost of performing strict (in-order) reductions,
2771 /// which involves doing a sequence of floating point additions in lane
2772 /// order, starting with an initial value. For example, consider a scalar
2773 /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2774 ///
2775 /// Vector = <float %v0, float %v1, float %v2, float %v3>
2776 ///
2777 /// %add1 = %InitVal + %v0
2778 /// %add2 = %add1 + %v1
2779 /// %add3 = %add2 + %v2
2780 /// %add4 = %add3 + %v3
2781 ///
2782 /// As a simple estimate we can say the cost of such a reduction is 4 times
2783 /// the cost of a scalar FP addition. We can only estimate the costs for
2784 /// fixed-width vectors here because for scalable vectors we do not know the
2785 /// runtime number of operations.
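  /// In other words, for a fixed-width vector the estimate below is roughly
  ///   NumElements * (scalar extract + scalar arithmetic op).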
2786  InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
2787                                          TTI::TargetCostKind CostKind) {
2788    // Targets must implement a default value for the scalable case, since
2789 // we don't know how many lanes the vector has.
2790    if (isa<ScalableVectorType>(Ty))
2791      return InstructionCost::getInvalid();
2792
2793 auto *VTy = cast<FixedVectorType>(Ty);
2794    InstructionCost ExtractCost = getScalarizationOverhead(
2795        VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
2796 InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2797 Opcode, VTy->getElementType(), CostKind);
2798 ArithCost *= VTy->getNumElements();
2799
2800 return ExtractCost + ArithCost;
2801 }
2802
2803  InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2804                                             std::optional<FastMathFlags> FMF,
2805                                             TTI::TargetCostKind CostKind) {
2806 assert(Ty && "Unknown reduction vector type");
2807    if (TTI::requiresOrderedReduction(FMF))
2808      return getOrderedReductionCost(Opcode, Ty, CostKind);
2809 return getTreeReductionCost(Opcode, Ty, CostKind);
2810 }
2811
2812 /// Try to calculate op costs for min/max reduction operations.
2813 /// \param CondTy Conditional type for the Select instruction.
2814  InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
2815                                         FastMathFlags FMF,
2816                                         TTI::TargetCostKind CostKind) {
2817 // Targets must implement a default value for the scalable case, since
2818 // we don't know how many lanes the vector has.
2819    if (isa<ScalableVectorType>(Ty))
2820      return InstructionCost::getInvalid();
2821
2822 Type *ScalarTy = Ty->getElementType();
2823 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2824 unsigned NumReduxLevels = Log2_32(NumVecElts);
2825 InstructionCost MinMaxCost = 0;
2826 InstructionCost ShuffleCost = 0;
2827 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2828 unsigned LongVectorCount = 0;
2829 unsigned MVTLen =
2830 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2831 while (NumVecElts > MVTLen) {
2832 NumVecElts /= 2;
2833 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2834
2835 ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, {},
2836 CostKind, NumVecElts, SubTy);
2837
2838 IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF);
2839 MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind);
2840 Ty = SubTy;
2841 ++LongVectorCount;
2842 }
2843
2844 NumReduxLevels -= LongVectorCount;
2845
2846 // The minimal length of the vector is limited by the real length of vector
2847 // operations performed on the current platform. That's why several final
2848    // reduction operations are performed on the vectors with the same
2849 // architecture-dependent length.
2850 ShuffleCost +=
2851 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2852 {}, CostKind, 0, Ty);
2853 IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF);
2854 MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind);
2855 // The last min/max should be in vector registers and we counted it above.
2856    // So we just need a single extractelement.
2857 return ShuffleCost + MinMaxCost +
2858 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2859 CostKind, 0, nullptr, nullptr);
2860 }
2861
2862 InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,
2863 Type *ResTy, VectorType *Ty,
2864 FastMathFlags FMF,
2866 if (auto *FTy = dyn_cast<FixedVectorType>(Ty);
2867 FTy && IsUnsigned && Opcode == Instruction::Add &&
2868 FTy->getElementType() == IntegerType::getInt1Ty(Ty->getContext())) {
2869 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2870 // ZExtOrTrunc(ctpop(bitcast <n x i1> to in)).
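      // For example (sketch): vector_reduce_add(zext <8 x i1> %m to <8 x i32>)
      // is costed as a bitcast of <8 x i1> %m to i8 plus a ctpop(i8); the final
      // zext/trunc to the result width is not costed separately here.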
2871 auto *IntTy =
2872 IntegerType::get(ResTy->getContext(), FTy->getNumElements());
2873 IntrinsicCostAttributes ICA(Intrinsic::ctpop, IntTy, {IntTy}, FMF);
2874 return thisT()->getCastInstrCost(Instruction::BitCast, IntTy, FTy,
2876 thisT()->getIntrinsicInstrCost(ICA, CostKind);
2877 }
2878 // Without any native support, this is equivalent to the cost of
2879 // vecreduce.opcode(ext(Ty A)).
2880 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2881 InstructionCost RedCost =
2882 thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind);
2883 InstructionCost ExtCost = thisT()->getCastInstrCost(
2884 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2886
2887 return RedCost + ExtCost;
2888 }
2889
2890  InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
2891                                         VectorType *Ty,
2892                                         TTI::TargetCostKind CostKind) {
2893 // Without any native support, this is equivalent to the cost of
2894 // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
2895 // vecreduce.add(mul(A, B)).
2896 VectorType *ExtTy = VectorType::get(ResTy, Ty);
2897 InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2898 Instruction::Add, ExtTy, std::nullopt, CostKind);
2899 InstructionCost ExtCost = thisT()->getCastInstrCost(
2900 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2902
2903 InstructionCost MulCost =
2904 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2905
2906 return RedCost + MulCost + 2 * ExtCost;
2907 }
2908
2909  InstructionCost getVectorSplitCost() { return 1; }
2910
2911 /// @}
2912};
2913
2914/// Concrete BasicTTIImpl that can be used if no further customization
2915/// is needed.
2916class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2918
2919 friend class BasicTTIImplBase<BasicTTIImpl>;
2920
2921 const TargetSubtargetInfo *ST;
2922 const TargetLoweringBase *TLI;
2923
2924 const TargetSubtargetInfo *getST() const { return ST; }
2925 const TargetLoweringBase *getTLI() const { return TLI; }
2926
2927public:
2928 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2929};
2930
2931} // end namespace llvm
2932
2933#endif // LLVM_CODEGEN_BASICTTIIMPL_H
This file implements a class to represent arbitrary precision integral constant values and operations...
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
uint32_t Index
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V)
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
#define P(N)
if(PassOpts->AAPipeline)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getNumElements(Type *Ty)
static Type * getValueType(Value *V)
Returns the type of the given value/instruction V.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This file provides helpers for the implementation of a TargetTransformInfo-conforming class.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition: APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition: APInt.h:234
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition: APInt.h:1330
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition: APInt.h:1201
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition: APInt.h:1468
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition: APInt.h:1130
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition: APInt.h:200
an instruction to allocate memory on the stack
Definition: Instructions.h:63
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
ArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition: ArrayRef.h:207
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
ArrayRef< T > drop_back(size_t N=1) const
Drop the last N elements of the array.
Definition: ArrayRef.h:213
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
Base class which can be used to help build a TTI implementation.
Definition: BasicTTIImpl.h:80
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Scalar, ArrayRef< std::tuple< Value *, User *, int > > ScalarUserAndIdx)
bool isTypeLegal(Type *Ty)
Definition: BasicTTIImpl.h:468
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const
Definition: BasicTTIImpl.h:326
virtual unsigned getPrefetchDistance() const
Definition: BasicTTIImpl.h:766
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:622
bool preferToKeepConstantsAttached(const Instruction &Inst, const Function &Fn) const
Definition: BasicTTIImpl.h:595
unsigned getMaxInterleaveFactor(ElementCount VF)
Definition: BasicTTIImpl.h:956
unsigned getNumberOfParts(Type *Tp)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
Definition: BasicTTIImpl.h:795
std::optional< unsigned > getVScaleForTuning() const
Definition: BasicTTIImpl.h:800
InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind)
Try to calculate the cost of performing strict (in-order) reductions, which involves doing a sequence...
bool isTruncateFree(Type *Ty1, Type *Ty2)
Definition: BasicTTIImpl.h:458
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo)
Definition: BasicTTIImpl.h:702
InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind)
Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
Definition: BasicTTIImpl.h:713
virtual bool shouldPrefetchAddressSpace(unsigned AS) const
Definition: BasicTTIImpl.h:786
InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I)
bool isLegalICmpImmediate(int64_t imm)
Definition: BasicTTIImpl.h:375
bool isProfitableToHoist(Instruction *I)
Definition: BasicTTIImpl.h:462
virtual unsigned getMaxPrefetchIterationsAhead() const
Definition: BasicTTIImpl.h:778
InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, TTI::TargetCostKind CostKind, unsigned Index)
std::optional< unsigned > getMaxVScale() const
Definition: BasicTTIImpl.h:799
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
unsigned getRegUsageForType(Type *Ty)
Definition: BasicTTIImpl.h:473
bool shouldBuildRelLookupTables() const
Definition: BasicTTIImpl.h:549
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const
Definition: BasicTTIImpl.h:616
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JumpTableSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
Definition: BasicTTIImpl.h:484
bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const
Definition: BasicTTIImpl.h:416
bool shouldDropLSRSolutionIfLessProfitable() const
Definition: BasicTTIImpl.h:436
bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2)
Definition: BasicTTIImpl.h:428
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed)
Definition: BasicTTIImpl.h:728
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const
Definition: BasicTTIImpl.h:770
bool hasBranchDivergence(const Function *F=nullptr)
Definition: BasicTTIImpl.h:320
bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const
Definition: BasicTTIImpl.h:422
unsigned getAssumedAddrSpace(const Value *V) const
Definition: BasicTTIImpl.h:348
InstructionCost getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instructions unique non-constant operands.
Definition: BasicTTIImpl.h:875
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *)
unsigned getEpilogueVectorizationMinVF()
Definition: BasicTTIImpl.h:709
int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset)
Definition: BasicTTIImpl.h:392
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:478
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty)
Definition: BasicTTIImpl.h:581
bool isTargetIntrinsicTriviallyScalarizable(Intrinsic::ID ID) const
Definition: BasicTTIImpl.h:839
virtual std::optional< unsigned > getCacheSize(TargetTransformInfo::CacheLevel Level) const
Definition: BasicTTIImpl.h:746
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace)
Definition: BasicTTIImpl.h:444
bool isTargetIntrinsicWithStructReturnOverloadAtField(Intrinsic::ID ID, int RetIdx) const
Definition: BasicTTIImpl.h:853
bool isAlwaysUniform(const Value *V)
Definition: BasicTTIImpl.h:324
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true)
Definition: BasicTTIImpl.h:718
bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace, Align Alignment, unsigned *Fast) const
Definition: BasicTTIImpl.h:298
unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const
Definition: BasicTTIImpl.h:396
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: BasicTTIImpl.h:306
InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
Definition: BasicTTIImpl.h:859
virtual std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
Definition: BasicTTIImpl.h:752
virtual bool enableWritePrefetching() const
Definition: BasicTTIImpl.h:782
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
Definition: BasicTTIImpl.h:362
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:694
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
Definition: BasicTTIImpl.h:339
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getFPOpCost(Type *Ty)
Definition: BasicTTIImpl.h:585
InstructionCost getVectorSplitCost()
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:922
bool haveFastSqrt(Type *Ty)
Definition: BasicTTIImpl.h:574
std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const
Definition: BasicTTIImpl.h:358
unsigned getInliningThresholdMultiplier() const
Definition: BasicTTIImpl.h:614
InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind)
virtual ~BasicTTIImplBase()=default
bool isLegalAddScalableImmediate(int64_t Imm)
Definition: BasicTTIImpl.h:371
InstructionCost getScalarizationOverhead(VectorType *RetTy, ArrayRef< const Value * > Args, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing the inputs and outputs of an instruction, with return type RetTy...
Definition: BasicTTIImpl.h:904
bool isVScaleKnownToBeAPowerOfTwo() const
Definition: BasicTTIImpl.h:801
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II)
Definition: BasicTTIImpl.h:722
bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const
Definition: BasicTTIImpl.h:330
bool isLegalAddImmediate(int64_t imm)
Definition: BasicTTIImpl.h:367
unsigned getFlatAddressSpace()
Definition: BasicTTIImpl.h:334
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:806
virtual unsigned getCacheLineSize() const
Definition: BasicTTIImpl.h:762
bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const
Definition: BasicTTIImpl.h:344
bool isSourceOfDivergence(const Value *V)
Definition: BasicTTIImpl.h:322
bool isTargetIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx) const
Definition: BasicTTIImpl.h:843
int getInlinerVectorBonusPercent() const
Definition: BasicTTIImpl.h:620
InstructionCost getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on argument types.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:958
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp)
Definition: BasicTTIImpl.h:735
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:379
bool isSingleThreaded() const
Definition: BasicTTIImpl.h:352
BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
Definition: BasicTTIImpl.h:289
bool isTargetIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx) const
Definition: BasicTTIImpl.h:848
unsigned adjustInliningThreshold(const CallBase *CB)
Definition: BasicTTIImpl.h:615
bool isProfitableLSRChainElement(Instruction *I)
Definition: BasicTTIImpl.h:440
Concrete BasicTTIImpl that can be used if no further customization is needed.
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
BitVector & set()
Definition: BitVector.h:351
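A minimal usage sketch of the two BitVector members listed above; the bit count and the helper function name are illustrative only:
#include "llvm/ADT/BitVector.h"
#include <cassert>

void bitVectorSketch() {
  llvm::BitVector BV(16);   // 16 bits, all initially clear
  BV.set();                 // set every bit
  assert(BV.count() == 16); // count() reports the number of set bits
}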
BlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation to estimate IR basic block frequen...
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1112
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Definition: InstrTypes.h:980
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:696
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:700
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:698
@ ICMP_EQ
equal
Definition: InstrTypes.h:694
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
CmpInst::Predicate getLTPredicate() const
CmpInst::Predicate getGTPredicate() const
This class represents a range of values.
Definition: ConstantRange.h:47
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:63
TypeSize getTypeStoreSizeInBits(Type *Ty) const
Returns the maximum number of bits that may be overwritten by storing the specified type; always a mu...
Definition: DataLayout.h:434
Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Definition: DataLayout.cpp:843
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
Definition: DataLayout.h:369
constexpr bool isVector() const
One or more elements.
Definition: TypeSize.h:326
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:322
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Container class for subtarget features.
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
unsigned getNumElements() const
Definition: DerivedTypes.h:606
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:353
The core instruction combiner logic.
Definition: InstCombiner.h:48
static InstructionCost getInvalid(CostType Val=0)
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:311
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
InstructionCost getScalarizationCost() const
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
virtual bool shouldPrefetchAddressSpace(unsigned AS) const
virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const
Return the minimum stride necessary to trigger software prefetching.
virtual bool enableWritePrefetching() const
virtual unsigned getMaxPrefetchIterationsAhead() const
Return the maximum prefetch distance in terms of loop iterations.
virtual unsigned getPrefetchDistance() const
Return the preferred prefetch distance in terms of instructions.
virtual std::optional< unsigned > getCacheAssociativity(unsigned Level) const
Return the cache associativity for the given level of cache.
virtual std::optional< unsigned > getCacheLineSize(unsigned Level) const
Return the target cache line size in bytes at a given level.
Machine Value Type.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
The optimization diagnostic interface.
void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Analysis providing profile information.
This class represents an analyzed expression in the program.
The main scalar evolution driver.
static bool isZeroEltSplatMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses all elements with the same value as the first element of exa...
static bool isSpliceMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is a splice mask, concatenating the two inputs together and then ext...
static bool isSelectMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from its source vectors without lane crossings.
static bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static bool isReverseMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask swaps the order of elements from exactly one source vector.
static bool isTransposeMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask is a transpose mask.
static bool isInsertSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &NumSubElts, int &Index)
Return true if this shuffle mask is an insert subvector mask.
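A hedged sketch exercising the static mask classifiers listed above with concrete masks; the masks, element counts, and function name are illustrative only:
#include "llvm/IR/Instructions.h"
#include <cassert>

void shuffleMaskSketch() {
  using llvm::ShuffleVectorInst;
  // <3,2,1,0> reverses a single 4-element source.
  int Rev[] = {3, 2, 1, 0};
  assert(ShuffleVectorInst::isReverseMask(Rev, /*NumSrcElts=*/4));
  // <0,0,0,0> broadcasts element 0 of one source.
  int Splat[] = {0, 0, 0, 0};
  assert(ShuffleVectorInst::isZeroEltSplatMask(Splat, /*NumSrcElts=*/4));
  // <2,3> extracts a contiguous subvector starting at index 2.
  int Sub[] = {2, 3};
  int Index = 0;
  assert(ShuffleVectorInst::isExtractSubvectorMask(Sub, /*NumSrcElts=*/4, Index) &&
         Index == 2);
}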
size_type size() const
Definition: SmallPtrSet.h:94
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:384
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:519
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
void push_back(const T &Elt)
Definition: SmallVector.h:413
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:43
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:42
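A minimal sketch of constructing the two StackOffset flavors listed above; the byte values and function name are arbitrary:
#include "llvm/Support/TypeSize.h"

void stackOffsetSketch() {
  // 16 fixed bytes, known at compile time.
  llvm::StackOffset Fixed = llvm::StackOffset::getFixed(16);
  // 8 bytes scaled by the runtime value of vscale.
  llvm::StackOffset Scalable = llvm::StackOffset::getScalable(8);
  (void)Fixed;
  (void)Scalable;
}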
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:612
Multiway switch.
Provides information about what library functions are available for the current target.
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
bool isIndexedStoreLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed store is legal on this target.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
LegalizeAction
This enum indicates whether operations are valid for a target, and if not, what action should be used...
virtual bool isLegalICmpImmediate(int64_t) const
Return true if the specified immediate is legal icmp immediate, that is the target has icmp instructi...
const TargetMachine & getTargetMachine() const
virtual bool isZExtFree(Type *FromTy, Type *ToTy) const
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
virtual bool isSuitableForJumpTable(const SwitchInst *SI, uint64_t NumCases, uint64_t Range, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) const
Return true if lowering to a jump table is suitable for a set of case clusters which may contain NumC...
virtual bool areJTsAllowed(const Function *Fn) const
Return true if lowering to a jump table is allowed.
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
virtual bool isCheapToSpeculateCttz(Type *Ty) const
Return true if it is cheap to speculate a call to intrinsic cttz.
bool isTruncStoreLegal(EVT ValVT, EVT MemVT) const
Return true if the specified store with truncation is legal on this target.
unsigned getBitWidthForCttzElements(Type *RetTy, ElementCount EC, bool ZeroIsPoison, const ConstantRange *VScaleRange) const
Return the minimum number of bits required to hold the maximum possible number of trailing zero vecto...
virtual bool shouldExpandCmpUsingSelects(EVT VT) const
Should we expand [US]CMP nodes using two selects and two compares, or by doing arithmetic on boolean ...
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
virtual bool isTruncateFree(Type *FromTy, Type *ToTy) const
Return true if it's free to truncate a value of type FromTy to type ToTy.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps, const APInt &Low, const APInt &High, const DataLayout &DL) const
Return true if lowering to a bit test is suitable for a set of case clusters which contains NumDests ...
virtual bool isLegalAddImmediate(int64_t) const
Return true if the specified immediate is legal add immediate, that is the target has add instruction...
virtual bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
LegalizeAction getTruncStoreAction(EVT ValVT, EVT MemVT) const
Return how this store with truncation should be treated: either it is legal, needs to be promoted to ...
LegalizeAction getLoadExtAction(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return how this load with extension should be treated: either it is legal, needs to be promoted to a ...
virtual bool isIntDivCheap(EVT VT, AttributeList Attr) const
Return true if integer divide is usually cheaper than a sequence of several shifts,...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isProfitableToHoist(Instruction *I) const
bool isIndexedLoadLegal(unsigned IdxMode, EVT VT) const
Return true if the specified indexed load is legal on this target.
bool isLoadExtLegal(unsigned ExtType, EVT ValVT, EVT MemVT) const
Return true if the specified load with extension is legal on this target.
virtual bool isCheapToSpeculateCtlz(Type *Ty) const
Return true if it is cheap to speculate a call to intrinsic ctlz.
virtual int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const
Return the preferred common base offset.
LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const
Return pair that represents the legalization kind (first) that needs to happen to EVT (second) in ord...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
virtual bool isLegalAddScalableImmediate(int64_t) const
Return true if adding the specified scalable immediate is legal, that is the target has add instructi...
bool isBeneficialToExpandPowI(int64_t Exponent, bool OptForSize) const
Return true if it is beneficial to expand an @llvm.powi.
virtual bool isFAbsFree(EVT VT) const
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
virtual bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AddrSpace, Instruction *I=nullptr) const
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
bool isOperationLegalOrCustomOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
virtual std::pair< const Value *, unsigned > getPredicatedAddrSpace(const Value *V) const
If the specified predicate checks whether a generic pointer falls within a specified address space,...
virtual bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const
Returns true if a cast between SrcAS and DestAS is a noop.
virtual unsigned getAssumedAddrSpace(const Value *V) const
If the specified generic pointer could be assumed as a pointer to a specific address space,...
TargetOptions Options
ThreadModel::Model ThreadModel
ThreadModel - This flag specifies the type of threading model to assume for things like atomics.
TargetSubtargetInfo - Generic base class for all target subtargets.
virtual bool useAA() const
Enable use of alias analysis during code generation (during MI scheduling, DAGCombine,...
const DataLayout & getDataLayout() const
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isProfitableLSRChainElement(Instruction *I) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I) const
std::optional< unsigned > getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info, TTI::OperandValueInfo Opd2Info, ArrayRef< const Value * > Args, const Instruction *CxtI=nullptr) const
bool isLoweredToCall(const Function *F) const
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) const
TailFoldingStyle getPreferredTailFoldingStyle(bool IVUpdateMayOverflow=true) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const
std::optional< Value * > simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, const Instruction *I) const
CRTP base class for use as a mix-in that aids implementing a TargetTransformInfo-compatible class.
InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef< const Value * > Operands, Type *AccessType, TTI::TargetCostKind CostKind)
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
@ TCC_Basic
The cost of a typical 'add' instruction.
MemIndexedMode
The type of load/store indexing.
@ MIM_PostInc
Post-incrementing.
@ MIM_PostDec
Post-decrementing.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
CacheLevel
The possible cache levels.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition: Triple.h:395
bool isArch64Bit() const
Test whether the architecture is 64-bit.
Definition: Triple.cpp:1714
bool isOSDarwin() const
Is this a "Darwin" OS (macOS, iOS, tvOS, watchOS, XROS, or DriverKit).
Definition: Triple.h:585
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:243
static IntegerType * getInt1Ty(LLVMContext &C)
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
static IntegerType * getInt8Ty(LLVMContext &C)
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:267
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:225
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
Value * getOperand(unsigned i) const
Definition: User.h:228
static bool isVPBinOp(Intrinsic::ID ID)
static bool isVPCast(Intrinsic::ID ID)
static bool isVPCmp(Intrinsic::ID ID)
static std::optional< unsigned > getFunctionalOpcodeForVP(Intrinsic::ID ID)
static std::optional< Intrinsic::ID > getFunctionalIntrinsicIDForVP(Intrinsic::ID ID)
static bool isVPIntrinsic(Intrinsic::ID)
static bool isVPReduction(Intrinsic::ID ID)
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
static VectorType * getHalfElementsVectorType(VectorType *VTy)
This static method returns a VectorType with half as many elements as the input type and the same ele...
Definition: DerivedTypes.h:531
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:665
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
Definition: DerivedTypes.h:460
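A hedged sketch combining the vector-type constructors listed above; Type::getFloatTy and ElementCount::getScalable are assumed from the wider IR API rather than taken from this listing, and the function name is illustrative:
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>

void vectorTypeSketch() {
  llvm::LLVMContext Ctx;
  llvm::Type *F32 = llvm::Type::getFloatTy(Ctx);
  // Fixed-width <4 x float>.
  auto *V4F32 = llvm::FixedVectorType::get(F32, 4);
  assert(V4F32->getNumElements() == 4);
  // Scalable <vscale x 4 x float> via an ElementCount.
  auto *NXV4F32 =
      llvm::VectorType::get(F32, llvm::ElementCount::getScalable(4));
  assert(NXV4F32->getElementCount().isScalable());
}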
static constexpr bool isKnownLT(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition: TypeSize.h:218
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
APInt ScaleBitMask(const APInt &A, unsigned NewBitWidth, bool MatchAllBits=false)
Splat/Merge neighboring bits to widen/narrow the bitmask represented by A to NewBitWidth bits.
Definition: APInt.cpp:2982
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition: CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ BSWAP
Byte Swap and Counting operators.
Definition: ISDOpcodes.h:744
@ SMULFIX
RESULT = [US]MULFIX(LHS, RHS, SCALE) - Perform fixed point multiplication on 2 integers with the same...
Definition: ISDOpcodes.h:374
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition: ISDOpcodes.h:498
@ FATAN2
FATAN2 - atan2, inspired by libm.
Definition: ISDOpcodes.h:999
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition: ISDOpcodes.h:717
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition: ISDOpcodes.h:262
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
Definition: ISDOpcodes.h:1059
@ SSUBO
Overflow-aware nodes for subtraction (see SADDO).
Definition: ISDOpcodes.h:334
@ BRIND
BRIND - Indirect branch.
Definition: ISDOpcodes.h:1123
@ BR_JT
BR_JT - Jumptable branch.
Definition: ISDOpcodes.h:1127
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition: ISDOpcodes.h:515
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition: ISDOpcodes.h:356
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:757
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition: ISDOpcodes.h:330
@ FP_TO_UINT_SAT
Definition: ISDOpcodes.h:907
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values.
Definition: ISDOpcodes.h:1031
@ SMULO
Overflow-aware nodes for multiplication (see SADDO).
Definition: ISDOpcodes.h:338
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition: ISDOpcodes.h:697
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition: ISDOpcodes.h:766
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
Definition: ISDOpcodes.h:1050
@ SCMP
[US]CMP - 3-way comparison of signed or unsigned integers.
Definition: ISDOpcodes.h:705
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition: ISDOpcodes.h:906
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition: ISDOpcodes.h:508
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition: ISDOpcodes.h:347
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
Definition: ISDOpcodes.h:1055
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
Definition: ISDOpcodes.h:1559
ID ArrayRef< Type * > Tys
Definition: Intrinsics.h:102
bool isTargetIntrinsic(ID IID)
isTargetIntrinsic - Returns true if IID is an intrinsic specific to a certain target.
Definition: Intrinsics.cpp:617
DiagnosticInfoOptimizationBase::Argument NV
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1739
Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
Definition: LoopUtils.cpp:989
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2448
unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID)
Returns the arithmetic instruction opcode used when expanding a reduction.
Definition: LoopUtils.cpp:960
constexpr bool has_single_bit(T Value) noexcept
Definition: bit.h:146
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1746
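A minimal sketch of the range helpers listed above (all_of, any_of, enumerate); the input values and function name are arbitrary:
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>

void rangeHelperSketch() {
  llvm::SmallVector<int, 4> Vals = {1, 2, 3, 4};
  // Predicate wrappers over whole ranges instead of begin/end pairs.
  assert(llvm::all_of(Vals, [](int V) { return V > 0; }));
  assert(llvm::any_of(Vals, [](int V) { return V == 3; }));
  // enumerate() pairs each element with its index.
  for (const auto &[Idx, V] : llvm::enumerate(Vals))
    assert(V == static_cast<int>(Idx) + 1);
}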
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:342
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:293
ConstantRange getVScaleRange(const Function *F, unsigned BitWidth)
Determine the possible constant range of vscale with the given bit width, based on the vscale_range f...
constexpr int PoisonMaskElem
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:405
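A minimal sketch of the math helpers listed above; the argument values and function name are arbitrary:
#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>

void mathHelperSketch() {
  assert(llvm::isPowerOf2_32(64));      // 64 is a power of two > 0
  assert(!llvm::isPowerOf2_32(0));      // zero is not
  assert(llvm::Log2_32(32) == 5);       // floor(log2(32))
  assert(llvm::has_single_bit(8u));     // exactly one bit set
  assert(llvm::divideCeil(10, 4) == 3); // integer ceiling division
}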
constexpr unsigned BitWidth
Definition: BitmaskEnum.h:217
InstructionCost Cost
cl::opt< unsigned > PartialUnrollingThreshold
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Extended Value Type.
Definition: ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:137
ElementCount getVectorElementCount() const
Definition: ValueTypes.h:345
static EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:289
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:311
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition: ValueTypes.h:65
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:210
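A hedged sketch tying together the EVT helpers listed above (getIntegerVT, getTypeForEVT, getEVT, isSimple, getSimpleVT); the function name is illustrative:
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include <cassert>

void evtSketch() {
  llvm::LLVMContext Ctx;
  // Build an i64 EVT, lower it to an IR type, and convert back.
  llvm::EVT VT = llvm::EVT::getIntegerVT(Ctx, 64);
  llvm::Type *Ty = VT.getTypeForEVT(Ctx);
  assert(llvm::EVT::getEVT(Ty) == VT);
  // i64 maps to a simple (MVT-backed) value type.
  if (VT.isSimple())
    (void)VT.getSimpleVT();
}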
Attributes of a target dependent hardware loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
bool AllowPeeling
Allow peeling off loop iterations.
bool AllowLoopNestsPeeling
Allow peeling off loop iterations for loop nests.
bool PeelProfiledIterations
Allow peeling based on profile.
unsigned PeelCount
A forced peeling factor (the number of bodies of the original loop that should be peeled off before t...
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
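A hedged sketch of how a target hook might fill in the unrolling and peeling knobs listed above; tuneLoopPreferences and the specific values are illustrative, not part of this file:
#include "llvm/Analysis/TargetTransformInfo.h"

void tuneLoopPreferences(llvm::TargetTransformInfo::UnrollingPreferences &UP,
                         llvm::TargetTransformInfo::PeelingPreferences &PP) {
  UP.Partial = true;              // allow partial unrolling
  UP.Runtime = true;              // allow runtime-trip-count unrolling
  UP.PartialThreshold = 200;      // cost budget for partial/runtime unrolling
  UP.PartialOptSizeThreshold = 0; // disable it when optimizing for size
  UP.UpperBound = true;           // may unroll using the trip-count upper bound

  PP.AllowPeeling = true;         // permit peeling a few iterations
  PP.PeelCount = 0;               // no forced peel count; leave it to the heuristic
}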