LLVM 20.0.0git
SystemZTargetTransformInfo.cpp
1//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements a TargetTransformInfo analysis pass specific to the
10// SystemZ target machine. It uses the target's detailed information to provide
11// more precise answers to certain TTI queries, while letting the target
12// independent and default TTI implementations handle the rest.
13//
14//===----------------------------------------------------------------------===//
15
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/Support/Debug.h"
26
27using namespace llvm;
28
29#define DEBUG_TYPE "systemztti"
30
31//===----------------------------------------------------------------------===//
32//
33// SystemZ cost model.
34//
35//===----------------------------------------------------------------------===//
36
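// Return true if V (looked through bitcasts and GEPs) is used as the source
// operand of a non-volatile memcpy. OtherUse is set to true if V also has
// uses of any other kind.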
37static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse) {
38 bool UsedAsMemCpySource = false;
39 for (const User *U : V->users())
40 if (const Instruction *User = dyn_cast<Instruction>(U)) {
41 if (isa<BitCastInst>(User) || isa<GetElementPtrInst>(User)) {
42 UsedAsMemCpySource |= isUsedAsMemCpySource(User, OtherUse);
43 continue;
44 }
45 if (const MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(User)) {
46 if (Memcpy->getOperand(1) == V && !Memcpy->isVolatile()) {
47 UsedAsMemCpySource = true;
48 continue;
49 }
50 }
51 OtherUse = true;
52 }
53 return UsedAsMemCpySource;
54}
55
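// Count the non-volatile loads and stores of Ptr (including accesses through
// GEPs based on it) that appear inside function F.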
56static void countNumMemAccesses(const Value *Ptr, unsigned &NumStores,
57 unsigned &NumLoads, const Function *F) {
58 if (!isa<PointerType>(Ptr->getType()))
59 return;
60 for (const User *U : Ptr->users())
61 if (const Instruction *User = dyn_cast<Instruction>(U)) {
62 if (User->getParent()->getParent() == F) {
63 if (const auto *SI = dyn_cast<StoreInst>(User)) {
64 if (SI->getPointerOperand() == Ptr && !SI->isVolatile())
65 NumStores++;
66 } else if (const auto *LI = dyn_cast<LoadInst>(User)) {
67 if (LI->getPointerOperand() == Ptr && !LI->isVolatile())
68 NumLoads++;
69 } else if (const auto *GEP = dyn_cast<GetElementPtrInst>(User)) {
70 if (GEP->getPointerOperand() == Ptr)
71 countNumMemAccesses(GEP, NumStores, NumLoads, F);
72 }
73 }
74 }
75}
76
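// Give a large inlining bonus when inlining is expected to remove expensive
// cross-function accesses: an argument used only as a memcpy source, a global
// accessed heavily by both caller and callee, or a caller alloca accessed
// heavily by the callee.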
77unsigned SystemZTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
78 unsigned Bonus = 0;
79 const Function *Caller = CB->getParent()->getParent();
80 const Function *Callee = CB->getCalledFunction();
81 if (!Callee)
82 return 0;
83 const Module *M = Caller->getParent();
84
85 // Increase the threshold if an incoming argument is used only as a memcpy
86 // source.
87 for (const Argument &Arg : Callee->args()) {
88 bool OtherUse = false;
89 if (isUsedAsMemCpySource(&Arg, OtherUse) && !OtherUse) {
90 Bonus = 1000;
91 break;
92 }
93 }
94
95 // Give a bonus for globals that are used heavily in both the caller and the callee.
96 std::set<const GlobalVariable *> CalleeGlobals;
97 std::set<const GlobalVariable *> CallerGlobals;
98 for (const GlobalVariable &Global : M->globals())
99 for (const User *U : Global.users())
100 if (const Instruction *User = dyn_cast<Instruction>(U)) {
101 if (User->getParent()->getParent() == Callee)
102 CalleeGlobals.insert(&Global);
103 if (User->getParent()->getParent() == Caller)
104 CallerGlobals.insert(&Global);
105 }
106 for (auto *GV : CalleeGlobals)
107 if (CallerGlobals.count(GV)) {
108 unsigned CalleeStores = 0, CalleeLoads = 0;
109 unsigned CallerStores = 0, CallerLoads = 0;
110 countNumMemAccesses(GV, CalleeStores, CalleeLoads, Callee);
111 countNumMemAccesses(GV, CallerStores, CallerLoads, Caller);
112 if ((CalleeStores + CalleeLoads) > 10 &&
113 (CallerStores + CallerLoads) > 10) {
114 Bonus = 1000;
115 break;
116 }
117 }
118
119 // Give a bonus when the callee heavily accesses an alloca of the caller.
120 unsigned NumStores = 0;
121 unsigned NumLoads = 0;
122 for (unsigned OpIdx = 0; OpIdx != Callee->arg_size(); ++OpIdx) {
123 Value *CallerArg = CB->getArgOperand(OpIdx);
124 Argument *CalleeArg = Callee->getArg(OpIdx);
125 if (isa<AllocaInst>(CallerArg))
126 countNumMemAccesses(CalleeArg, NumStores, NumLoads, Callee);
127 }
128 if (NumLoads > 10)
129 Bonus += NumLoads * 50;
130 if (NumStores > 10)
131 Bonus += NumStores * 50;
132 Bonus = std::min(Bonus, unsigned(1000));
133
134 LLVM_DEBUG(if (Bonus)
135 dbgs() << "++ SZTTI Adding inlining bonus: " << Bonus << "\n";);
136 return Bonus;
137}
138
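// Cost of materializing an integer immediate. Values that fit a single
// lgfi/llilf/llihf cost one instruction; other 64-bit values cost two. For
// example, under the model below 0xffffffff is one instruction (llilf) while
// 0x123456789a needs two.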
139InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
140 TTI::TargetCostKind CostKind) {
141 assert(Ty->isIntegerTy());
142
143 unsigned BitSize = Ty->getPrimitiveSizeInBits();
144 // There is no cost model for constants with a bit size of 0. Return TCC_Free
145 // here, so that constant hoisting will ignore this constant.
146 if (BitSize == 0)
147 return TTI::TCC_Free;
148 // No cost model implemented yet for operations on integers wider than 128 bits.
149 if ((!ST->hasVector() && BitSize > 64) || BitSize > 128)
150 return TTI::TCC_Free;
151
152 if (Imm == 0)
153 return TTI::TCC_Free;
154
155 if (Imm.getBitWidth() <= 64) {
156 // Constants loaded via lgfi.
157 if (isInt<32>(Imm.getSExtValue()))
158 return TTI::TCC_Basic;
159 // Constants loaded via llilf.
160 if (isUInt<32>(Imm.getZExtValue()))
161 return TTI::TCC_Basic;
162 // Constants loaded via llihf:
163 if ((Imm.getZExtValue() & 0xffffffff) == 0)
164 return TTI::TCC_Basic;
165
166 return 2 * TTI::TCC_Basic;
167 }
168
169 // i128 immediates are loaded from the constant pool.
170 return 2 * TTI::TCC_Basic;
171}
172
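// Cost of an integer immediate used as operand Idx of an instruction with the
// given Opcode. Immediates that fit the instruction's immediate field (e.g.
// 16-bit stores via mvhi, 32-bit compares via cgfi/clgfi) are free.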
173InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
174 const APInt &Imm, Type *Ty,
175 TTI::TargetCostKind CostKind,
176 Instruction *Inst) {
177 assert(Ty->isIntegerTy());
178
179 unsigned BitSize = Ty->getPrimitiveSizeInBits();
180 // There is no cost model for constants with a bit size of 0. Return TCC_Free
181 // here, so that constant hoisting will ignore this constant.
182 if (BitSize == 0)
183 return TTI::TCC_Free;
184 // No cost model implemented yet for operations on integers wider than 64 bits.
185 if (BitSize > 64)
186 return TTI::TCC_Free;
187
188 switch (Opcode) {
189 default:
190 return TTI::TCC_Free;
191 case Instruction::GetElementPtr:
192 // Always hoist the base address of a GetElementPtr. This prevents the
193 // creation of new constants for every base constant that gets constant
194 // folded with the offset.
195 if (Idx == 0)
196 return 2 * TTI::TCC_Basic;
197 return TTI::TCC_Free;
198 case Instruction::Store:
199 if (Idx == 0 && Imm.getBitWidth() <= 64) {
200 // Any 8-bit immediate store can be implemented via mvi.
201 if (BitSize == 8)
202 return TTI::TCC_Free;
203 // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
204 if (isInt<16>(Imm.getSExtValue()))
205 return TTI::TCC_Free;
206 }
207 break;
208 case Instruction::ICmp:
209 if (Idx == 1 && Imm.getBitWidth() <= 64) {
210 // Comparisons against signed 32-bit immediates implemented via cgfi.
211 if (isInt<32>(Imm.getSExtValue()))
212 return TTI::TCC_Free;
213 // Comparisons against unsigned 32-bit immediates implemented via clgfi.
214 if (isUInt<32>(Imm.getZExtValue()))
215 return TTI::TCC_Free;
216 }
217 break;
218 case Instruction::Add:
219 case Instruction::Sub:
220 if (Idx == 1 && Imm.getBitWidth() <= 64) {
221 // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
222 if (isUInt<32>(Imm.getZExtValue()))
223 return TTI::TCC_Free;
224 // Or their negation, by swapping addition vs. subtraction.
225 if (isUInt<32>(-Imm.getSExtValue()))
226 return TTI::TCC_Free;
227 }
228 break;
229 case Instruction::Mul:
230 if (Idx == 1 && Imm.getBitWidth() <= 64) {
231 // We use msgfi to multiply by 32-bit signed immediates.
232 if (isInt<32>(Imm.getSExtValue()))
233 return TTI::TCC_Free;
234 }
235 break;
236 case Instruction::Or:
237 case Instruction::Xor:
238 if (Idx == 1 && Imm.getBitWidth() <= 64) {
239 // Masks supported by oilf/xilf.
240 if (isUInt<32>(Imm.getZExtValue()))
241 return TTI::TCC_Free;
242 // Masks supported by oihf/xihf.
243 if ((Imm.getZExtValue() & 0xffffffff) == 0)
244 return TTI::TCC_Free;
245 }
246 break;
247 case Instruction::And:
248 if (Idx == 1 && Imm.getBitWidth() <= 64) {
249 // Any 32-bit AND operation can be implemented via nilf.
250 if (BitSize <= 32)
251 return TTI::TCC_Free;
252 // 64-bit masks supported by nilf.
253 if (isUInt<32>(~Imm.getZExtValue()))
254 return TTI::TCC_Free;
255 // 64-bit masks supported by nilh.
256 if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
257 return TTI::TCC_Free;
258 // Some 64-bit AND operations can be implemented via risbg.
259 const SystemZInstrInfo *TII = ST->getInstrInfo();
260 unsigned Start, End;
261 if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
262 return TTI::TCC_Free;
263 }
264 break;
265 case Instruction::Shl:
266 case Instruction::LShr:
267 case Instruction::AShr:
268 // Always return TCC_Free for the shift value of a shift instruction.
269 if (Idx == 1)
270 return TTI::TCC_Free;
271 break;
272 case Instruction::UDiv:
273 case Instruction::SDiv:
274 case Instruction::URem:
275 case Instruction::SRem:
276 case Instruction::Trunc:
277 case Instruction::ZExt:
278 case Instruction::SExt:
279 case Instruction::IntToPtr:
280 case Instruction::PtrToInt:
281 case Instruction::BitCast:
282 case Instruction::PHI:
283 case Instruction::Call:
284 case Instruction::Select:
285 case Instruction::Ret:
286 case Instruction::Load:
287 break;
288 }
289
291}
292
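// Cost of an integer immediate used as operand Idx of intrinsic IID.
// Immediates that fold into the expanded add/sub/mul, stackmap or patchpoint
// are free.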
293InstructionCost
294SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
295 const APInt &Imm, Type *Ty,
296 TTI::TargetCostKind CostKind) {
297 assert(Ty->isIntegerTy());
298
299 unsigned BitSize = Ty->getPrimitiveSizeInBits();
300 // There is no cost model for constants with a bit size of 0. Return TCC_Free
301 // here, so that constant hoisting will ignore this constant.
302 if (BitSize == 0)
303 return TTI::TCC_Free;
304 // No cost model implemented yet for operations on integers wider than 64 bits.
305 if (BitSize > 64)
306 return TTI::TCC_Free;
307
308 switch (IID) {
309 default:
310 return TTI::TCC_Free;
311 case Intrinsic::sadd_with_overflow:
312 case Intrinsic::uadd_with_overflow:
313 case Intrinsic::ssub_with_overflow:
314 case Intrinsic::usub_with_overflow:
315 // These get expanded to include a normal addition/subtraction.
316 if (Idx == 1 && Imm.getBitWidth() <= 64) {
317 if (isUInt<32>(Imm.getZExtValue()))
318 return TTI::TCC_Free;
319 if (isUInt<32>(-Imm.getSExtValue()))
320 return TTI::TCC_Free;
321 }
322 break;
323 case Intrinsic::smul_with_overflow:
324 case Intrinsic::umul_with_overflow:
325 // These get expanded to include a normal multiplication.
326 if (Idx == 1 && Imm.getBitWidth() <= 64) {
327 if (isInt<32>(Imm.getSExtValue()))
328 return TTI::TCC_Free;
329 }
330 break;
331 case Intrinsic::experimental_stackmap:
332 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
333 return TTI::TCC_Free;
334 break;
335 case Intrinsic::experimental_patchpoint_void:
336 case Intrinsic::experimental_patchpoint:
337 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
338 return TTI::TCC_Free;
339 break;
340 }
342}
343
346 assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
347 if (ST->hasPopulationCount() && TyWidth <= 64)
348 return TTI::PSK_FastHardware;
349 return TTI::PSK_Software;
350}
351
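// Unrolling preferences: cap the unroll count so that the unrolled loop body
// contains roughly at most 12 stores (to avoid exhausting the z13 store tags),
// and disable partial/runtime unrolling when the loop contains calls.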
352void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
353 TTI::UnrollingPreferences &UP,
354 OptimizationRemarkEmitter *ORE) {
355 // Find out if L contains a call, and estimate how many stores it
356 // executes (counting memcpy/memset as one store each).
357 bool HasCall = false;
358 InstructionCost NumStores = 0;
359 for (auto &BB : L->blocks())
360 for (auto &I : *BB) {
361 if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
362 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
363 if (isLoweredToCall(F))
364 HasCall = true;
365 if (F->getIntrinsicID() == Intrinsic::memcpy ||
366 F->getIntrinsicID() == Intrinsic::memset)
367 NumStores++;
368 } else { // indirect call.
369 HasCall = true;
370 }
371 }
372 if (isa<StoreInst>(&I)) {
373 Type *MemAccessTy = I.getOperand(0)->getType();
374 NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy,
375 std::nullopt, 0, TTI::TCK_RecipThroughput);
376 }
377 }
378
379 // The z13 processor will run out of store tags if too many stores
380 // are fed into it too quickly. Therefore make sure there are not
381 // too many stores in the resulting unrolled loop.
382 unsigned const NumStoresVal = *NumStores.getValue();
383 unsigned const Max = (NumStoresVal ? (12 / NumStoresVal) : UINT_MAX);
384
385 if (HasCall) {
386 // If the loop contains calls, allow only full unrolling and no partial unrolling.
387 UP.FullUnrollMaxCount = Max;
388 UP.MaxCount = 1;
389 return;
390 }
391
392 UP.MaxCount = Max;
393 if (UP.MaxCount <= 1)
394 return;
395
396 // Allow partial and runtime trip count unrolling.
397 UP.Partial = UP.Runtime = true;
398
399 UP.PartialThreshold = 75;
401
402 // Allow expensive instructions in the pre-header of the loop.
403 UP.AllowExpensiveTripCount = true;
404
405 UP.Force = true;
406}
407
408void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
409 TTI::PeelingPreferences &PP) {
410 BaseT::getPeelingPreferences(L, SE, PP);
411}
412
415 // SystemZ specific: check instruction count (first), and don't care about
416 // ImmCost, since offsets are checked explicitly.
417 return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
418 C1.NumIVMuls, C1.NumBaseAdds,
419 C1.ScaleCost, C1.SetupCost) <
420 std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
421 C2.NumIVMuls, C2.NumBaseAdds,
422 C2.ScaleCost, C2.SetupCost);
423}
424
425unsigned SystemZTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
426 bool Vector = (ClassID == 1);
427 if (!Vector)
428 // Discount the stack pointer. Also leave out %r0, since it can't
429 // be used in an address.
430 return 14;
431 if (ST->hasVector())
432 return 32;
433 return 0;
434}
435
436TypeSize
437SystemZTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
438 switch (K) {
439 case TargetTransformInfo::RGK_Scalar:
440 return TypeSize::getFixed(64);
441 case TargetTransformInfo::RGK_FixedWidthVector:
442 return TypeSize::getFixed(ST->hasVector() ? 128 : 0);
443 case TargetTransformInfo::RGK_ScalableVector:
444 return TypeSize::getScalable(0);
445 }
446
447 llvm_unreachable("Unsupported register kind");
448}
449
450unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
451 unsigned NumStridedMemAccesses,
452 unsigned NumPrefetches,
453 bool HasCall) const {
454 // Don't prefetch a loop with many far apart accesses.
455 if (NumPrefetches > 16)
456 return UINT_MAX;
457
458 // Emit prefetch instructions for smaller strides in cases where we think
459 // the hardware prefetcher might not be able to keep up.
460 if (NumStridedMemAccesses > 32 && !HasCall &&
461 (NumMemAccesses - NumStridedMemAccesses) * 32 <= NumStridedMemAccesses)
462 return 1;
463
464 return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
465}
466
467bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
468 EVT VT = TLI->getValueType(DL, DataType);
469 return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
470}
471
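// A load with a single user that is not a store can be inserted into a vector
// element for free (vle loads the element directly). If its only user is a
// store, an MVC is preferred instead, so do not treat the load as free.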
472static bool isFreeEltLoad(Value *Op) {
473 if (isa<LoadInst>(Op) && Op->hasOneUse()) {
474 const Instruction *UserI = cast<Instruction>(*Op->user_begin());
475 return !isa<StoreInst>(UserI); // Prefer MVC
476 }
477 return false;
478}
479
481 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
483 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
484 InstructionCost Cost = 0;
485
486 if (Insert && Ty->isIntOrIntVectorTy(64)) {
487 // VLVGP will insert two GPRs with one instruction, while VLE will load
488 // an element directly with no extra cost
489 assert((VL.empty() || VL.size() == NumElts) &&
490 "Type does not match the number of values.");
491 InstructionCost CurrVectorCost = 0;
492 for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
493 if (DemandedElts[Idx] && !(VL.size() && isFreeEltLoad(VL[Idx])))
494 ++CurrVectorCost;
495 if (Idx % 2 == 1) {
496 Cost += std::min(InstructionCost(1), CurrVectorCost);
497 CurrVectorCost = 0;
498 }
499 }
500 Insert = false;
501 }
502
503 Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
504 CostKind, VL);
505 return Cost;
506}
507
508// Return the bit size for the scalar type or vector element
509// type. getScalarSizeInBits() returns 0 for a pointer type.
510static unsigned getScalarSizeInBits(Type *Ty) {
511 unsigned Size =
512 (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
513 assert(Size > 0 && "Element must have non-zero size.");
514 return Size;
515}
516
517// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
518// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
519// 3.
520static unsigned getNumVectorRegs(Type *Ty) {
521 auto *VTy = cast<FixedVectorType>(Ty);
522 unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
523 assert(WideBits > 0 && "Could not compute size of vector");
524 return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
525}
526
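// Arithmetic cost model. As a worked example of the constants below: a scalar
// 'sdiv i64 %a, %b' is costed DivInstrCost (20), 'sdiv i64 %a, 7' is
// DivMulSeqCost (10, multiply-and-shift sequence), and 'sdiv i64 %a, 8' is
// SDivPow2Cost (4, shift sequence).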
528 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
531 const Instruction *CxtI) {
532
533 // TODO: Handle more cost kinds.
534 if (CostKind != TTI::TCK_RecipThroughput)
535 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
536 Op2Info, Args, CxtI);
537
538 // TODO: return a good value for BB-VECTORIZER that includes the
539 // immediate loads, which we do not want to count for the loop
540 // vectorizer, since they are hopefully hoisted out of the loop. This
541 // would require a new parameter 'InLoop', but not sure if constant
542 // args are common enough to motivate this.
543
544 unsigned ScalarBits = Ty->getScalarSizeInBits();
545
546 // There are three cases of division and remainder: Dividing by a register
547 // needs a divide instruction. A divisor which is a power-of-two constant
548 // can be implemented with a sequence of shifts. Any other constant needs a
549 // multiply and shifts.
550 const unsigned DivInstrCost = 20;
551 const unsigned DivMulSeqCost = 10;
552 const unsigned SDivPow2Cost = 4;
553
554 bool SignedDivRem =
555 Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
556 bool UnsignedDivRem =
557 Opcode == Instruction::UDiv || Opcode == Instruction::URem;
558
559 // Check for a constant divisor.
560 bool DivRemConst = false;
561 bool DivRemConstPow2 = false;
562 if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
563 if (const Constant *C = dyn_cast<Constant>(Args[1])) {
564 const ConstantInt *CVal =
565 (C->getType()->isVectorTy()
566 ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
567 : dyn_cast<const ConstantInt>(C));
568 if (CVal && (CVal->getValue().isPowerOf2() ||
569 CVal->getValue().isNegatedPowerOf2()))
570 DivRemConstPow2 = true;
571 else
572 DivRemConst = true;
573 }
574 }
575
576 if (!Ty->isVectorTy()) {
577 // These FP operations are supported with a dedicated instruction for
578 // float, double and fp128 (base implementation assumes float generally
579 // costs 2).
580 if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
581 Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
582 return 1;
583
584 // There is no native support for FRem.
585 if (Opcode == Instruction::FRem)
586 return LIBCALL_COST;
587
588 // Give a discount for some combined logical operations if supported.
589 if (Args.size() == 2) {
590 if (Opcode == Instruction::Xor) {
591 for (const Value *A : Args) {
592 if (const Instruction *I = dyn_cast<Instruction>(A))
593 if (I->hasOneUse() &&
594 (I->getOpcode() == Instruction::Or ||
595 I->getOpcode() == Instruction::And ||
596 I->getOpcode() == Instruction::Xor))
597 if ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
598 (isInt128InVR(Ty) &&
599 (I->getOpcode() == Instruction::Or || ST->hasVectorEnhancements1())))
600 return 0;
601 }
602 }
603 else if (Opcode == Instruction::And || Opcode == Instruction::Or) {
604 for (const Value *A : Args) {
605 if (const Instruction *I = dyn_cast<Instruction>(A))
606 if ((I->hasOneUse() && I->getOpcode() == Instruction::Xor) &&
607 ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) ||
608 (isInt128InVR(Ty) &&
609 (Opcode == Instruction::And || ST->hasVectorEnhancements1()))))
610 return 0;
611 }
612 }
613 }
614
615 // Or requires one instruction, although it has custom handling for i64.
616 if (Opcode == Instruction::Or)
617 return 1;
618
619 if (Opcode == Instruction::Xor && ScalarBits == 1) {
620 if (ST->hasLoadStoreOnCond2())
621 return 5; // 2 * (li 0; loc 1); xor
622 return 7; // 2 * ipm sequences ; xor ; shift ; compare
623 }
624
625 if (DivRemConstPow2)
626 return (SignedDivRem ? SDivPow2Cost : 1);
627 if (DivRemConst)
628 return DivMulSeqCost;
629 if (SignedDivRem || UnsignedDivRem)
630 return DivInstrCost;
631 }
632 else if (ST->hasVector()) {
633 auto *VTy = cast<FixedVectorType>(Ty);
634 unsigned VF = VTy->getNumElements();
635 unsigned NumVectors = getNumVectorRegs(Ty);
636
637 // These vector operations are custom handled, but are still supported
638 // with one instruction per vector, regardless of element size.
639 if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
640 Opcode == Instruction::AShr) {
641 return NumVectors;
642 }
643
644 if (DivRemConstPow2)
645 return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
646 if (DivRemConst) {
647 SmallVector<Type *> Tys(Args.size(), Ty);
648 return VF * DivMulSeqCost +
650 }
651 if (SignedDivRem || UnsignedDivRem) {
652 if (ST->hasVectorEnhancements3() && ScalarBits >= 32)
653 return NumVectors * DivInstrCost;
654 else if (VF > 4)
655 // Temporary hack: disable high vectorization factors with integer
656 // division/remainder, which will get scalarized and handled with
657 // GR128 registers. The mischeduler is not clever enough to avoid
658 // spilling yet.
659 return 1000;
660 }
661
662 // These FP operations are supported with a single vector instruction for
663 // double (base implementation assumes float generally costs 2). For
664 // FP128, the scalar cost is 1, and there is no overhead since the values
665 // are already in scalar registers.
666 if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
667 Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
668 switch (ScalarBits) {
669 case 32: {
670 // The vector enhancements facility 1 provides v4f32 instructions.
671 if (ST->hasVectorEnhancements1())
672 return NumVectors;
673 // Return the cost of multiple scalar invocations plus the cost of
674 // inserting and extracting the values.
675 InstructionCost ScalarCost =
677 SmallVector<Type *> Tys(Args.size(), Ty);
679 (VF * ScalarCost) +
681 // FIXME: VF 2 for these FP operations is currently just as
682 // expensive as for VF 4.
683 if (VF == 2)
684 Cost *= 2;
685 return Cost;
686 }
687 case 64:
688 case 128:
689 return NumVectors;
690 default:
691 break;
692 }
693 }
694
695 // There is no native support for FRem.
696 if (Opcode == Instruction::FRem) {
697 SmallVector<Type *> Tys(Args.size(), Ty);
699 (VF * LIBCALL_COST) +
701 // FIXME: VF 2 for float is currently just as expensive as for VF 4.
702 if (VF == 2 && ScalarBits == 32)
703 Cost *= 2;
704 return Cost;
705 }
706 }
707
708 // Fall back to the default implementation.
709 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
710 Args, CxtI);
711}
712
715 TTI::TargetCostKind CostKind, int Index, VectorType *SubTp,
716 ArrayRef<const Value *> Args, const Instruction *CxtI) {
717 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
718 if (ST->hasVector()) {
719 unsigned NumVectors = getNumVectorRegs(Tp);
720
721 // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
722
723 // FP128 values are always in scalar registers, so there is no work
724 // involved with a shuffle, except for broadcast. In that case register
725 // moves are done with a single instruction per element.
726 if (Tp->getScalarType()->isFP128Ty())
727 return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
728
729 switch (Kind) {
730 case TargetTransformInfo::SK_ExtractSubvector:
731 // ExtractSubvector Index indicates start offset.
732
733 // Extracting a subvector from first index is a noop.
734 return (Index == 0 ? 0 : NumVectors);
735
736 case TargetTransformInfo::SK_Broadcast:
737 // Loop vectorizer calls here to figure out the extra cost of
738 // broadcasting a loaded value to all elements of a vector. Since vlrep
739 // loads and replicates with a single instruction, adjust the returned
740 // value.
741 return NumVectors - 1;
742
743 default:
744
745 // SystemZ supports single instruction permutation / replication.
746 return NumVectors;
747 }
748 }
749
750 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
751}
752
753// Return the log2 difference of the element sizes of the two vector types.
754static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
755 unsigned Bits0 = Ty0->getScalarSizeInBits();
756 unsigned Bits1 = Ty1->getScalarSizeInBits();
757
758 if (Bits1 > Bits0)
759 return (Log2_32(Bits1) - Log2_32(Bits0));
760
761 return (Log2_32(Bits0) - Log2_32(Bits1));
762}
763
764// Return the number of instructions needed to truncate SrcTy to DstTy.
766getVectorTruncCost(Type *SrcTy, Type *DstTy) {
767 assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
770 "Packing must reduce size of vector type.");
771 assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
772 cast<FixedVectorType>(DstTy)->getNumElements() &&
773 "Packing should not change number of elements.");
774
775 // TODO: Since fp32 is expanded, the extract cost should always be 0.
776
777 unsigned NumParts = getNumVectorRegs(SrcTy);
778 if (NumParts <= 2)
779 // Up to 2 vector registers can be truncated efficiently with pack or
780 // permute. The latter requires an immediate mask to be loaded, which
781 // typically gets hoisted out of a loop. TODO: return a good value for
782 // BB-VECTORIZER that includes the immediate loads, which we do not want
783 // to count for the loop vectorizer.
784 return 1;
785
786 unsigned Cost = 0;
787 unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
788 unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements();
789 for (unsigned P = 0; P < Log2Diff; ++P) {
790 if (NumParts > 1)
791 NumParts /= 2;
792 Cost += NumParts;
793 }
794
795 // Currently, a general mix of permutes and pack instructions is output by
796 // isel, which follows the cost computation above, except for this case,
797 // which is one instruction less:
798 if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
799 DstTy->getScalarSizeInBits() == 8)
800 Cost--;
801
802 return Cost;
803}
804
805// Return the cost of converting a vector bitmask produced by a compare
806// (SrcTy) to the type of the select or extend instruction (DstTy).
809 assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
810 "Should only be called with vector types.");
811
812 unsigned PackCost = 0;
813 unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
814 unsigned DstScalarBits = DstTy->getScalarSizeInBits();
815 unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
816 if (SrcScalarBits > DstScalarBits)
817 // The bitmask will be truncated.
818 PackCost = getVectorTruncCost(SrcTy, DstTy);
819 else if (SrcScalarBits < DstScalarBits) {
820 unsigned DstNumParts = getNumVectorRegs(DstTy);
821 // Each vector select needs its part of the bitmask unpacked.
822 PackCost = Log2Diff * DstNumParts;
823 // Extra cost for moving part of mask before unpacking.
824 PackCost += DstNumParts - 1;
825 }
826
827 return PackCost;
828}
829
830// Return the type of the compared operands. This is needed to compute the
831// cost for a Select / ZExt or SExt instruction.
832static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
833 Type *OpTy = nullptr;
834 if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
835 OpTy = CI->getOperand(0)->getType();
836 else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
837 if (LogicI->getNumOperands() == 2)
838 if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
839 if (isa<CmpInst>(LogicI->getOperand(1)))
840 OpTy = CI0->getOperand(0)->getType();
841
842 if (OpTy != nullptr) {
843 if (VF == 1) {
844 assert (!OpTy->isVectorTy() && "Expected scalar type");
845 return OpTy;
846 }
847 // Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
848 // be either scalar or already vectorized with the same or a lesser VF.
849 Type *ElTy = OpTy->getScalarType();
850 return FixedVectorType::get(ElTy, VF);
851 }
852
853 return nullptr;
854}
855
856// Get the cost of converting a boolean vector to a vector with the same width
857// and element size as Dst, plus the cost of zero-extending if needed.
859getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
860 const Instruction *I) {
861 auto *DstVTy = cast<FixedVectorType>(Dst);
862 unsigned VF = DstVTy->getNumElements();
863 unsigned Cost = 0;
864 // If we know the widths of the compared operands, get any cost of
865 // converting them to match Dst. Otherwise, assume the widths are the same.
866 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
867 if (CmpOpTy != nullptr)
868 Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
869 if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
870 // One 'vn' per dst vector with an immediate mask.
871 Cost += getNumVectorRegs(Dst);
872 return Cost;
873}
874
876 Type *Src,
879 const Instruction *I) {
880 // FIXME: Can the logic below also be used for these cost kinds?
881 if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) {
882 auto BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
883 return BaseCost == 0 ? BaseCost : 1;
884 }
885
886 unsigned DstScalarBits = Dst->getScalarSizeInBits();
887 unsigned SrcScalarBits = Src->getScalarSizeInBits();
888
889 if (!Src->isVectorTy()) {
890 assert (!Dst->isVectorTy());
891
892 if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
893 if (Src->isIntegerTy(128))
894 return LIBCALL_COST;
895 if (SrcScalarBits >= 32 ||
896 (I != nullptr && isa<LoadInst>(I->getOperand(0))))
897 return 1;
898 return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
899 }
900
901 if ((Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) &&
902 Dst->isIntegerTy(128))
903 return LIBCALL_COST;
904
905 if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt)) {
906 if (Src->isIntegerTy(1)) {
907 if (DstScalarBits == 128) {
908 if (Opcode == Instruction::SExt && ST->hasVectorEnhancements3())
909 return 0;/*VCEQQ*/
910 return 5 /*branch seq.*/;
911 }
912
913 if (ST->hasLoadStoreOnCond2())
914 return 2; // li 0; loc 1
915
916 // This should be the extension of a compare i1 result, which is done with
917 // ipm and a varying sequence of instructions.
918 unsigned Cost = 0;
919 if (Opcode == Instruction::SExt)
920 Cost = (DstScalarBits < 64 ? 3 : 4);
921 if (Opcode == Instruction::ZExt)
922 Cost = 3;
923 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
924 if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
925 // If operands of an FP type were compared, this costs +1.
926 Cost++;
927 return Cost;
928 }
929 else if (isInt128InVR(Dst)) {
930 // Extensions from a GPR to i128 (in a VR) typically cost two instructions,
931 // but a zero-extending load would be just one extra instruction.
932 if (Opcode == Instruction::ZExt && I != nullptr)
933 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
934 if (Ld->hasOneUse())
935 return 1;
936 return 2;
937 }
938 }
939
940 if (Opcode == Instruction::Trunc && isInt128InVR(Src) && I != nullptr) {
941 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
942 if (Ld->hasOneUse())
943 return 0; // Will be converted to GPR load.
944 bool OnlyTruncatingStores = true;
945 for (const User *U : I->users())
946 if (!isa<StoreInst>(U)) {
947 OnlyTruncatingStores = false;
948 break;
949 }
950 if (OnlyTruncatingStores)
951 return 0;
952 return 2; // Vector element extraction.
953 }
954 }
955 else if (ST->hasVector()) {
956 // Vector to scalar cast.
957 auto *SrcVecTy = cast<FixedVectorType>(Src);
958 auto *DstVecTy = dyn_cast<FixedVectorType>(Dst);
959 if (!DstVecTy) {
960 // TODO: tune vector-to-scalar cast.
961 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
962 }
963 unsigned VF = SrcVecTy->getNumElements();
964 unsigned NumDstVectors = getNumVectorRegs(Dst);
965 unsigned NumSrcVectors = getNumVectorRegs(Src);
966
967 if (Opcode == Instruction::Trunc) {
968 if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
969 return 0; // Check for NOOP conversions.
970 return getVectorTruncCost(Src, Dst);
971 }
972
973 if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
974 if (SrcScalarBits >= 8) {
975 // ZExt will use either a single unpack or a vector permute.
976 if (Opcode == Instruction::ZExt)
977 return NumDstVectors;
978
979 // SExt will be handled with one unpack per doubling of width.
980 unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);
981
982 // For types that span multiple vector registers, some additional
983 // instructions are used to set up the unpacking.
984 unsigned NumSrcVectorOps =
985 (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
986 : (NumDstVectors / 2));
987
988 return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
989 }
990 else if (SrcScalarBits == 1)
991 return getBoolVecToIntConversionCost(Opcode, Dst, I);
992 }
993
994 if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
995 Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
996 // TODO: Fix base implementation which could simplify things a bit here
997 // (seems to miss on differentiating on scalar/vector types).
998
999 // Only 64-bit vector conversions are natively supported before z15.
1000 if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
1001 if (SrcScalarBits == DstScalarBits)
1002 return NumDstVectors;
1003
1004 if (SrcScalarBits == 1)
1005 return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
1006 }
1007
1008 // Return the cost of multiple scalar invocations plus the cost of
1009 // inserting and extracting the values. The base implementation does not
1010 // realize that float->int gets scalarized.
1011 InstructionCost ScalarCost = getCastInstrCost(
1012 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind);
1013 InstructionCost TotCost = VF * ScalarCost;
1014 bool NeedsInserts = true, NeedsExtracts = true;
1015 // FP128 registers do not get inserted or extracted.
1016 if (DstScalarBits == 128 &&
1017 (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
1018 NeedsInserts = false;
1019 if (SrcScalarBits == 128 &&
1020 (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
1021 NeedsExtracts = false;
1022
1023 TotCost += BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
1024 NeedsExtracts, CostKind);
1025 TotCost += BaseT::getScalarizationOverhead(DstVecTy, NeedsInserts,
1026 /*Extract*/ false, CostKind);
1027
1028 // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
1029 if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
1030 TotCost *= 2;
1031
1032 return TotCost;
1033 }
1034
1035 if (Opcode == Instruction::FPTrunc) {
1036 if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
1037 return VF /*ldxbr/lexbr*/ +
1038 BaseT::getScalarizationOverhead(DstVecTy, /*Insert*/ true,
1039 /*Extract*/ false, CostKind);
1040 else // double -> float
1041 return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
1042 }
1043
1044 if (Opcode == Instruction::FPExt) {
1045 if (SrcScalarBits == 32 && DstScalarBits == 64) {
1046 // float -> double is very rare and currently unoptimized. Instead of
1047 // using vldeb, which can do two at a time, all conversions are
1048 // scalarized.
1049 return VF * 2;
1050 }
1051 // -> fp128. VF * lxdb/lxeb + extraction of elements.
1052 return VF + BaseT::getScalarizationOverhead(SrcVecTy, /*Insert*/ false,
1053 /*Extract*/ true, CostKind);
1054 }
1055 }
1056
1057 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1058}
1059
1060// Scalar i8 / i16 operations are typically performed after first extending
1061// the operands to i32.
1062static unsigned getOperandsExtensionCost(const Instruction *I) {
1063 unsigned ExtCost = 0;
1064 for (Value *Op : I->operands())
1065 // A load of i8 or i16 sign/zero extends to i32.
1066 if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
1067 ExtCost++;
1068
1069 return ExtCost;
1070}
1071
1073 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
1075 TTI::OperandValueInfo Op2Info, const Instruction *I) {
1076 if (CostKind != TTI::TCK_RecipThroughput)
1077 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1078 Op1Info, Op2Info);
1079
1080 if (!ValTy->isVectorTy()) {
1081 switch (Opcode) {
1082 case Instruction::ICmp: {
1083 // A loaded value compared with 0 with multiple users becomes Load and
1084 // Test. The load is then not foldable, so return 0 cost for the ICmp.
1085 unsigned ScalarBits = ValTy->getScalarSizeInBits();
1086 if (I != nullptr && (ScalarBits == 32 || ScalarBits == 64))
1087 if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
1088 if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
1089 if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
1090 C->isZero())
1091 return 0;
1092
1093 unsigned Cost = 1;
1094 if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
1095 Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
1096 return Cost;
1097 }
1098 case Instruction::Select:
1099 if (ValTy->isFloatingPointTy())
1100 return 4; // No LOC for FP - costs a conditional jump.
1101
1102 // When selecting based on an i128 comparison, LOC / VSEL is possible
1103 // if i128 comparisons are directly supported.
1104 if (I != nullptr)
1105 if (ICmpInst *CI = dyn_cast<ICmpInst>(I->getOperand(0)))
1106 if (CI->getOperand(0)->getType()->isIntegerTy(128))
1107 return ST->hasVectorEnhancements3() ? 1 : 4;
1108
1109 // Load On Condition / Select Register available, except for i128.
1110 return !isInt128InVR(ValTy) ? 1 : 4;
1111 }
1112 }
1113 else if (ST->hasVector()) {
1114 unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements();
1115
1116 // Called with a compare instruction.
1117 if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
1118 unsigned PredicateExtraCost = 0;
1119 if (I != nullptr) {
1120 // Some predicates cost one or two extra instructions.
1121 switch (cast<CmpInst>(I)->getPredicate()) {
1122 case CmpInst::Predicate::ICMP_NE:
1123 case CmpInst::Predicate::ICMP_UGE:
1124 case CmpInst::Predicate::ICMP_ULE:
1125 case CmpInst::Predicate::ICMP_SGE:
1126 case CmpInst::Predicate::ICMP_SLE:
1127 PredicateExtraCost = 1;
1128 break;
1129 case CmpInst::Predicate::FCMP_ONE:
1130 case CmpInst::Predicate::FCMP_ORD:
1131 case CmpInst::Predicate::FCMP_UEQ:
1132 case CmpInst::Predicate::FCMP_UNO:
1133 PredicateExtraCost = 2;
1134 break;
1135 default:
1136 break;
1137 }
1138 }
1139
1140 // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
1141 // floats. FIXME: <2 x float> generates the same code as <4 x float>.
1142 unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
1143 unsigned NumVecs_cmp = getNumVectorRegs(ValTy);
1144
1145 unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
1146 return Cost;
1147 }
1148 else { // Called with a select instruction.
1149 assert (Opcode == Instruction::Select);
1150
1151 // We can figure out the extra cost of packing / unpacking if the
1152 // instruction was passed and the compare instruction is found.
1153 unsigned PackCost = 0;
1154 Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
1155 if (CmpOpTy != nullptr)
1156 PackCost =
1157 getVectorBitmaskConversionCost(CmpOpTy, ValTy);
1158
1159 return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
1160 }
1161 }
1162
1163 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1164 Op1Info, Op2Info);
1165}
1166
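// Cost of inserting an element into or extracting an element from a vector
// register lane (element loads and paired 64-bit insertions via vlvgp are
// handled specially below).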
1169 unsigned Index, Value *Op0,
1170 Value *Op1) {
1171 if (Opcode == Instruction::InsertElement) {
1172 // Vector Element Load.
1173 if (Op1 != nullptr && isFreeEltLoad(Op1))
1174 return 0;
1175
1176 // vlvgp will insert two GPRs into a vector register, so count half the
1177 // number of instructions as an estimate when we don't have the full
1178 // picture (as in getScalarizationOverhead()).
1179 if (Val->isIntOrIntVectorTy(64))
1180 return ((Index % 2 == 0) ? 1 : 0);
1181 }
1182
1183 if (Opcode == Instruction::ExtractElement) {
1184 int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
1185
1186 // Give a slight penalty for moving out of vector pipeline to FXU unit.
1187 if (Index == 0 && Val->isIntOrIntVectorTy())
1188 Cost += 1;
1189
1190 return Cost;
1191 }
1192
1193 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
1194}
1195
1196// Check if a load may be folded as a memory operand in its user.
1198isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
1199 if (!Ld->hasOneUse())
1200 return false;
1201 FoldedValue = Ld;
1202 const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
1203 unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
1204 unsigned TruncBits = 0;
1205 unsigned SExtBits = 0;
1206 unsigned ZExtBits = 0;
1207 if (UserI->hasOneUse()) {
1208 unsigned UserBits = UserI->getType()->getScalarSizeInBits();
1209 if (isa<TruncInst>(UserI))
1210 TruncBits = UserBits;
1211 else if (isa<SExtInst>(UserI))
1212 SExtBits = UserBits;
1213 else if (isa<ZExtInst>(UserI))
1214 ZExtBits = UserBits;
1215 }
1216 if (TruncBits || SExtBits || ZExtBits) {
1217 FoldedValue = UserI;
1218 UserI = cast<Instruction>(*UserI->user_begin());
1219 // Load (single use) -> trunc/extend (single use) -> UserI
1220 }
1221 if ((UserI->getOpcode() == Instruction::Sub ||
1222 UserI->getOpcode() == Instruction::SDiv ||
1223 UserI->getOpcode() == Instruction::UDiv) &&
1224 UserI->getOperand(1) != FoldedValue)
1225 return false; // Not commutative, only RHS foldable.
1226 // LoadOrTruncBits holds the number of effectively loaded bits, but is 0 if
1227 // the load was extended.
1228 unsigned LoadOrTruncBits =
1229 ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
1230 switch (UserI->getOpcode()) {
1231 case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
1232 case Instruction::Sub:
1233 case Instruction::ICmp:
1234 if (LoadedBits == 32 && ZExtBits == 64)
1235 return true;
1236 [[fallthrough]];
1237 case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
1238 if (UserI->getOpcode() != Instruction::ICmp) {
1239 if (LoadedBits == 16 &&
1240 (SExtBits == 32 ||
1241 (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
1242 return true;
1243 if (LoadOrTruncBits == 16)
1244 return true;
1245 }
1246 [[fallthrough]];
1247 case Instruction::SDiv:// SE: 32->64
1248 if (LoadedBits == 32 && SExtBits == 64)
1249 return true;
1250 [[fallthrough]];
1251 case Instruction::UDiv:
1252 case Instruction::And:
1253 case Instruction::Or:
1254 case Instruction::Xor:
1255 // This also makes sense for float operations, but is disabled for now due
1256 // to regressions.
1257 // case Instruction::FCmp:
1258 // case Instruction::FAdd:
1259 // case Instruction::FSub:
1260 // case Instruction::FMul:
1261 // case Instruction::FDiv:
1262
1263 // All possible extensions of the loaded value were checked above.
1264
1265 // Comparison between memory and immediate.
1266 if (UserI->getOpcode() == Instruction::ICmp)
1267 if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
1268 if (CI->getValue().isIntN(16))
1269 return true;
1270 return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
1271 break;
1272 }
1273 return false;
1274}
1275
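// Return true if V is a call to the llvm.bswap intrinsic.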
1276static bool isBswapIntrinsicCall(const Value *V) {
1277 if (const Instruction *I = dyn_cast<Instruction>(V))
1278 if (auto *CI = dyn_cast<CallInst>(I))
1279 if (auto *F = CI->getCalledFunction())
1280 if (F->getIntrinsicID() == Intrinsic::bswap)
1281 return true;
1282 return false;
1283}
1284
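// Cost of a load or store. A load that folds into its single user as a memory
// operand is modeled as free, as is a load or store whose only job is to feed
// or consume a bswap (it becomes a load/store-reversed instruction).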
1286 MaybeAlign Alignment,
1287 unsigned AddressSpace,
1289 TTI::OperandValueInfo OpInfo,
1290 const Instruction *I) {
1291 assert(!Src->isVoidTy() && "Invalid type");
1292
1293 // TODO: Handle other cost kinds.
1294 if (CostKind != TTI::TCK_RecipThroughput)
1295 return 1;
1296
1297 if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
1298 // Store the load or its truncated or extended value in FoldedValue.
1299 const Instruction *FoldedValue = nullptr;
1300 if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
1301 const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
1302 assert (UserI->getNumOperands() == 2 && "Expected a binop.");
1303
1304 // UserI can't fold two loads, so in that case return 0 cost only
1305 // half of the time.
1306 for (unsigned i = 0; i < 2; ++i) {
1307 if (UserI->getOperand(i) == FoldedValue)
1308 continue;
1309
1310 if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))){
1311 LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
1312 if (!OtherLoad &&
1313 (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
1314 isa<ZExtInst>(OtherOp)))
1315 OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
1316 if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
1317 return i == 0; // Both operands foldable.
1318 }
1319 }
1320
1321 return 0; // Only I is foldable in user.
1322 }
1323 }
1324
1325 // Type legalization (via getNumberOfParts) can't handle structs
1326 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1327 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1328 CostKind);
1329
1330 // FP128 is a legal type but kept in a register pair on older CPUs.
1331 if (Src->isFP128Ty() && !ST->hasVectorEnhancements1())
1332 return 2;
1333
1334 unsigned NumOps =
1335 (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
1336
1337 // A reversed (byte-swapping) load/store saves one instruction.
1338 if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
1339 I != nullptr) {
1340 if (Opcode == Instruction::Load && I->hasOneUse()) {
1341 const Instruction *LdUser = cast<Instruction>(*I->user_begin());
1342 // In case of load -> bswap -> store, return normal cost for the load.
1343 if (isBswapIntrinsicCall(LdUser) &&
1344 (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
1345 return 0;
1346 }
1347 else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
1348 const Value *StoredVal = SI->getValueOperand();
1349 if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
1350 return 0;
1351 }
1352 }
1353
1354 return NumOps;
1355}
1356
1357// The generic implementation of getInterleavedMemoryOpCost() is based on
1358// adding costs of the memory operations plus all the extracts and inserts
1359// needed for using / defining the vector operands. The SystemZ version does
1360// roughly the same but bases the computations on vector permutations
1361// instead.
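// For example, under this model a fully used 4-way interleaved load of
// <16 x i32> (VF 4) costs 4 vector loads plus 3 vperms per member, i.e. 16.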
1363 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1364 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1365 bool UseMaskForCond, bool UseMaskForGaps) {
1366 if (UseMaskForCond || UseMaskForGaps)
1367 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1368 Alignment, AddressSpace, CostKind,
1369 UseMaskForCond, UseMaskForGaps);
1370 assert(isa<VectorType>(VecTy) &&
1371 "Expect a vector type for interleaved memory op");
1372
1373 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
1374 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1375 unsigned VF = NumElts / Factor;
1376 unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
1377 unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
1378 unsigned NumPermutes = 0;
1379
1380 if (Opcode == Instruction::Load) {
1381 // Loading interleave groups may have gaps, which may mean fewer
1382 // loads. Find out how many vectors will be loaded in total, and how many
1383 // of them each value will be in.
1384 BitVector UsedInsts(NumVectorMemOps, false);
1385 std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
1386 for (unsigned Index : Indices)
1387 for (unsigned Elt = 0; Elt < VF; ++Elt) {
1388 unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
1389 UsedInsts.set(Vec);
1390 ValueVecs[Index].set(Vec);
1391 }
1392 NumVectorMemOps = UsedInsts.count();
1393
1394 for (unsigned Index : Indices) {
1395 // Estimate that each loaded source vector containing this Index
1396 // requires one operation, except that vperm can handle two input
1397 // registers the first time for each dst vector.
1398 unsigned NumSrcVecs = ValueVecs[Index].count();
1399 unsigned NumDstVecs = divideCeil(VF * getScalarSizeInBits(VecTy), 128U);
1400 assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
1401 NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
1402 }
1403 } else {
1404 // Estimate the permutes for each stored vector as the smaller of the
1405 // number of elements and the number of source vectors. Subtract one per
1406 // dst vector for vperm (see above).
1407 unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
1408 unsigned NumDstVecs = NumVectorMemOps;
1409 NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
1410 }
1411
1412 // Cost of load/store operations and the permutations needed.
1413 return NumVectorMemOps + NumPermutes;
1414}
1415
1416InstructionCost getIntAddReductionCost(unsigned NumVec, unsigned ScalarBits) {
1417 InstructionCost Cost = 0;
1418 // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
1419 Cost += NumVec - 1;
1420 // For integer adds, VSUM creates shorter reductions on the final vector.
1421 Cost += (ScalarBits < 32) ? 3 : 2;
1422 return Cost;
1423}
1424
1425InstructionCost getFastReductionCost(unsigned NumVec, unsigned NumElems,
1426 unsigned ScalarBits) {
1427 unsigned NumEltsPerVecReg = (SystemZ::VectorBits / ScalarBits);
1428 InstructionCost Cost = 0;
1429 // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
1430 Cost += NumVec - 1;
1431 // For each shuffle / arithmetic layer, we need 2 instructions, and we need
1432 // log2(Elements in Last Vector) layers.
1433 Cost += 2 * Log2_32_Ceil(std::min(NumElems, NumEltsPerVecReg));
1434 return Cost;
1435}
1436
1437inline bool customCostReductions(unsigned Opcode) {
1438 return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
1439 Opcode == Instruction::Add || Opcode == Instruction::Mul;
1440}
1441
1444 std::optional<FastMathFlags> FMF,
1446 unsigned ScalarBits = Ty->getScalarSizeInBits();
1447 // The following is only for subtargets with vector math, non-ordered
1448 // reductions, and reasonable scalar sizes for int and fp add/mul.
1449 if (customCostReductions(Opcode) && ST->hasVector() &&
1451 ScalarBits <= SystemZ::VectorBits) {
1452 unsigned NumVectors = getNumVectorRegs(Ty);
1453 unsigned NumElems = ((FixedVectorType *)Ty)->getNumElements();
1454 // Integer Add uses custom code gen, which needs to be accounted for.
1455 if (Opcode == Instruction::Add)
1456 return getIntAddReductionCost(NumVectors, ScalarBits);
1457 // The base cost is the same across all other arithmetic instructions.
1458 InstructionCost Cost =
1459 getFastReductionCost(NumVectors, NumElems, ScalarBits);
1460 // But we need to account for the final op involving the scalar operand.
1461 if ((Opcode == Instruction::FAdd) || (Opcode == Instruction::FMul))
1462 Cost += 1;
1463 return Cost;
1464 }
1465 // Otherwise, fall back to the standard implementation.
1466 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1467}
1468
1471 FastMathFlags FMF,
1473 // Return custom costs only on subtargets with vector enhancements.
1474 if (ST->hasVectorEnhancements1()) {
1475 unsigned NumVectors = getNumVectorRegs(Ty);
1476 unsigned NumElems = ((FixedVectorType *)Ty)->getNumElements();
1477 unsigned ScalarBits = Ty->getScalarSizeInBits();
1478 InstructionCost Cost = 0;
1479 // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total.
1480 Cost += NumVectors - 1;
1481 // For the final vector, we need shuffle + min/max operations, and
1482 // we need #Elements - 1 of them.
1483 Cost += 2 * (std::min(NumElems, SystemZ::VectorBits / ScalarBits) - 1);
1484 return Cost;
1485 }
1486 // For other targets, fall back to the standard implementation
1487 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1488}
1489
1490static int
1492 const SmallVectorImpl<Type *> &ParamTys) {
1493 if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
1494 return getNumVectorRegs(RetTy); // VPERM
1495
1496 return -1;
1497}
1498
1499InstructionCost
1500SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1501 TTI::TargetCostKind CostKind) {
1502 InstructionCost Cost = getVectorIntrinsicInstrCost(
1503 ICA.getID(), ICA.getReturnType(), ICA.getArgTypes());
1504 if (Cost != -1)
1505 return Cost;
1506 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1507}
1508
1509bool SystemZTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
1510 // Always expand on subtargets without vector instructions.
1511 if (!ST->hasVector())
1512 return true;
1513
1514 // Whether or not to expand is a per-intrinsic decision.
1515 switch (II->getIntrinsicID()) {
1516 default:
1517 return true;
1518 // Do not expand vector.reduce.add...
1519 case Intrinsic::vector_reduce_add:
1520 auto *VType = cast<FixedVectorType>(II->getOperand(0)->getType());
1521 // ...unless the scalar size is i64 or larger,
1522 // or the operand vector is not full, since the
1523 // performance benefit is dubious in those cases.
1524 return VType->getScalarSizeInBits() >= 64 ||
1525 VType->getPrimitiveSizeInBits() < SystemZ::VectorBits;
1526 }
1527}
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(...)
Definition: Debug.h:106
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
Hexagon Common GEP
const HexagonInstrInfo * TII
This file defines an InstructionCost class that is used when calculating the cost of an instruction,...
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
static const Function * getCalledFunction(const Value *V)
uint64_t IntrinsicInst * II
#define P(N)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getNumElements(Type *Ty)
bool customCostReductions(unsigned Opcode)
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1)
static bool isBswapIntrinsicCall(const Value *V)
InstructionCost getIntAddReductionCost(unsigned NumVec, unsigned ScalarBits)
static void countNumMemAccesses(const Value *Ptr, unsigned &NumStores, unsigned &NumLoads, const Function *F)
static unsigned getOperandsExtensionCost(const Instruction *I)
static Type * getCmpOpsType(const Instruction *I, unsigned VF=1)
static unsigned getScalarSizeInBits(Type *Ty)
static bool isFreeEltLoad(Value *Op)
InstructionCost getFastReductionCost(unsigned NumVec, unsigned NumElems, unsigned ScalarBits)
static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, const SmallVectorImpl< Type * > &ParamTys)
static bool isUsedAsMemCpySource(const Value *V, bool &OtherUse)
static unsigned getNumVectorRegs(Type *Ty)
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
Class for arbitrary precision integers.
Definition: APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:449
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:440
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:168
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:163
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:694
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:806
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:958
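These BasicTTIImplBase hooks are the fallbacks a target implementation defers to. A sketch (assumed call pattern, not code from this file) of reaching one of them through the public TargetTransformInfo interface:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;

// Ask the cost model what a vector bswap intrinsic on VecTy would cost in
// terms of reciprocal throughput.
static InstructionCost bswapCost(const TargetTransformInfo &TTI, Type *VecTy) {
  IntrinsicCostAttributes ICA(Intrinsic::bswap, VecTy, {VecTy});
  return TTI.getIntrinsicInstrCost(ICA, TargetTransformInfo::TCK_RecipThroughput);
}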
size_type count() const
count - Returns the number of bits which are set.
Definition: BitVector.h:162
BitVector & set()
Definition: BitVector.h:351
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1112
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1341
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1286
This class is the base class for the comparison instructions.
Definition: InstrTypes.h:661
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:673
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:703
@ ICMP_UGE
unsigned greater or equal
Definition: InstrTypes.h:697
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition: InstrTypes.h:681
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition: InstrTypes.h:684
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition: InstrTypes.h:682
@ ICMP_NE
not equal
Definition: InstrTypes.h:695
@ ICMP_SGE
signed greater or equal
Definition: InstrTypes.h:701
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:699
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition: InstrTypes.h:683
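The FCMP_* predicates above encode the ordered/unordered truth bits. A small illustrative check (hypothetical helper, not from this file):

#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// True for the two unordered predicates listed above: "unordered or equal"
// and "unordered (isnan(X) | isnan(Y))".
static bool isListedUnorderedPred(CmpInst::Predicate P) {
  return P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNO;
}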
This is the shared class of boolean and integer constants.
Definition: Constants.h:83
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:148
This is an important base class in LLVM.
Definition: Constant.h:42
This class represents an Operation in the Expression.
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:563
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:791
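An illustrative one-liner for the factory above; <4 x i32> happens to fill exactly one 128-bit SystemZ vector register:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Create the fixed-width vector type <4 x i32> in the given context.
static FixedVectorType *getV4I32(LLVMContext &Ctx) {
  return FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
}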
This instruction compares its operands according to the predicate given to the constructor.
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
const SmallVectorImpl< Type * > & getArgTypes() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
An instruction for reading from memory.
Definition: Instructions.h:176
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
This class wraps the llvm.memcpy intrinsic.
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
The optimization diagnostic interface.
The main scalar evolution driver.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1196
An instruction for storing to memory.
Definition: Instructions.h:292
const SystemZInstrInfo * getInstrInfo() const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
bool shouldExpandReduction(const IntrinsicInst *II) const
unsigned getNumberOfRegisters(unsigned ClassID) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst, const Instruction *I)
unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const override
unsigned adjustInliningThreshold(const CallBase *CB) const
unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, ArrayRef< Value * > VL={})
unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
bool hasDivRemOp(Type *DataType, bool IsSigned)
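IR-level passes do not call these SystemZTTIImpl members directly; they go through the TargetTransformInfo analysis wrapper, which dispatches to the target implementation. A sketch of one such query (the surrounding pass context is assumed):

#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

// Returns true if the target reports a native signed div/rem pair for Ty,
// e.g. when deciding whether a div+rem idiom should be left alone.
static bool hasNativeSDivRem(const TargetTransformInfo &TTI, Type *Ty) {
  return TTI.hasDivRemOp(Ty, /*IsSigned=*/true);
}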
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
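A common pattern in target cost hooks, sketched here with assumed names, is to translate the IR type into an EVT and ask the lowering information whether it is natively supported:

#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
using namespace llvm;

// Legal types usually map to a single machine operation and so get a small
// cost; everything else is expanded or scalarized.
static bool isNativelySupported(const TargetLoweringBase *TLI,
                                const DataLayout &DL, Type *Ty) {
  EVT VT = TLI->getValueType(DL, Ty, /*AllowUnknown=*/true);
  return VT.isSimple() && TLI->isTypeLegal(VT);
}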
bool isLoweredToCall(const Function *F) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
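The same instruction can be costed under any of these models; callers pick the kind that matches what they are optimizing for. An illustrative query (assumed pass context):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// TCK_RecipThroughput is what vectorizers typically ask for; size-oriented
// clients use TCK_CodeSize or TCK_SizeAndLatency instead.
static InstructionCost throughputCost(const TargetTransformInfo &TTI,
                                      const Instruction *I) {
  return TTI.getInstructionCost(I, TargetTransformInfo::TCK_RecipThroughput);
}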
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:348
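Register-width hooks return a TypeSize so that scalable-vector targets can be described through the same interface; a fixed-width target only needs the fixed form. A hedged sketch of the usual shape (hypothetical helper, not the target's exact logic):

#include "llvm/Support/TypeSize.h"
using namespace llvm;

// Fixed 128-bit vector registers when the vector facility is available,
// otherwise report zero.
static TypeSize vectorRegisterWidth(bool HasVector) {
  return TypeSize::getFixed(HasVector ? 128 : 0);
}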
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:270
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition: Type.h:243
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition: Type.h:153
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:162
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:267
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:237
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:355
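These Type queries are the vocabulary most of the cost functions above are written in. A tiny composed example (hypothetical helper):

#include "llvm/IR/Type.h"
using namespace llvm;

// True for integer types, or vectors of integers, whose elements are at
// most 64 bits wide.
static bool isSmallIntOrIntVector(Type *Ty) {
  return Ty->isIntOrIntVectorTy() && Ty->getScalarSizeInBits() <= 64;
}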
Value * getOperand(unsigned i) const
Definition: User.h:228
unsigned getNumOperands() const
Definition: User.h:250
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
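hasOneUse and the user iteration behind user_begin() are the building blocks of use-scanning cost heuristics. A generic sketch of the pattern (hypothetical helper):

#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"
using namespace llvm;

// A value feeding exactly one instruction can often be folded into that
// user; checks of this shape are where folding heuristics usually start.
static bool feedsSingleInstruction(const Value *V) {
  return V->hasOneUse() && isa<Instruction>(*V->user_begin());
}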
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
constexpr ScalarTy getFixedValue() const
Definition: TypeSize.h:202
const ParentTy * getParent() const
Definition: ilist_node.h:32
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
const unsigned VectorBits
Definition: SystemZ.h:154
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition: MathExtras.h:355
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:342
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:293
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:405
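These MathExtras helpers carry most of the integer arithmetic in a cost model. For example (assuming 128-bit vector registers; both helpers are hypothetical):

#include "llvm/Support/MathExtras.h"
#include <cassert>
using namespace llvm;

// Number of 128-bit vector registers needed to hold TotalBits bits.
static unsigned numVectorRegs(unsigned TotalBits) {
  return divideCeil(TotalBits, 128u);
}

// Log2 of the ratio between two power-of-two element widths, e.g.
// truncating i64 elements to i16 elements gives 2.
static unsigned elementSizeLog2Diff(unsigned FromBits, unsigned ToBits) {
  assert(isPowerOf2_32(FromBits) && isPowerOf2_32(ToBits) && FromBits >= ToBits);
  return Log2_32(FromBits) - Log2_32(ToBits);
}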
InstructionCost Cost
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Extended Value Type.
Definition: ValueTypes.h:35
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition: ValueTypes.h:157
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
unsigned Insns
TODO: Some of these could be merged.
Parameters that control the generic loop unrolling transformation.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned FullUnrollMaxCount
Set the maximum unrolling factor for full unrolling.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
bool AllowExpensiveTripCount
Allow emitting expensive instructions (such as divisions) when computing the trip count of a loop for...
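To make the UnrollingPreferences fields above concrete, here is a hedged sketch of how a target's getUnrollingPreferences typically fills them in; the specific values are illustrative, not SystemZ's actual tuning:

#include "llvm/Analysis/TargetTransformInfo.h"
using namespace llvm;

static void tuneUnrolling(TargetTransformInfo::UnrollingPreferences &UP) {
  UP.Partial = true;                 // allow partial unrolling
  UP.Runtime = true;                 // allow runtime-trip-count unrolling
  UP.DefaultUnrollRuntimeCount = 4;  // illustrative default runtime unroll count
  UP.AllowExpensiveTripCount = false;
  UP.FullUnrollMaxCount = 16;        // illustrative cap for full unrolling
  UP.Force = false;                  // do not override the unroller's own gating
}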