//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Frontend/OpenMP/OMPConstants.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"

#include <cstdint>
#include <optional>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));
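
// For illustration: both options above are ordinary cl::opt flags, so they can
// be toggled from the clang driver via -mllvm (hypothetical invocation, not
// part of this file):
//
//   clang -fopenmp -mllvm -openmp-ir-builder-optimistic-attributes \
//         -mllvm -openmp-ir-builder-unroll-threshold-factor=2.0 foo.c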

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. whether inserting
/// instructions at position IP1 may change the meaning of IP2 or vice-versa.
/// This is because an InsertPoint stores the instruction before something is
/// inserted. For instance, if both point to the same instruction, two
/// IRBuilders alternately creating instructions will cause the instructions
/// to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}
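
// A minimal sketch of the hazard checked above (assumed usage, not code from
// this file): two builders saved at the same point interleave their output.
//
//   IRBuilder<> B1(Ctx), B2(Ctx);
//   B1.SetInsertPoint(BB, It);   // Both insert points name the same
//   B2.SetInsertPoint(BB, It);   // instruction, so they conflict.
//   B1.CreateCall(F);            // Alternating between B1 and B2 now
//   B2.CreateCall(G);            // interleaves the emitted instructions.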

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
  case OMPScheduleType::OrderedDistributeChunked:
  case OMPScheduleType::OrderedDistribute:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif
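
// Example of what the validation above rejects (illustrative only): a type
// carrying both monotonicity bits at once, e.g.
//   OMPScheduleType::UnorderedDynamicChunked |
//       OMPScheduleType::ModifierMonotonic |
//       OMPScheduleType::ModifierNonmonotonic
// is not a schedule the runtime can honor, so the helper returns false.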

/// This is a wrapper over IRBuilderBase::restoreIP that also restores the
/// current debug location to the last instruction in the specified basic
/// block if the insert point points to the end of the block.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder,
                                 llvm::IRBuilderBase::InsertPoint IP) {
  Builder.restoreIP(IP);
  llvm::BasicBlock *BB = Builder.GetInsertBlock();
  llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
  if (!BB->empty() && I == BB->end())
    Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
}

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  if (T.isSPIRV())
    return omp::SPIRVGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use from the schedule clause
/// arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasDistScheduleChunks) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  case OMP_SCHEDULE_Distribute:
    return HasDistScheduleChunks ? OMPScheduleType::BaseDistributeChunked
                                 : OMPScheduleType::BaseDistribute;
  }
  llvm_unreachable("unhandled schedule clause argument");
}
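
// For instance (values per the switch above): schedule(static) maps to
// BaseStatic, schedule(static, N) to BaseStaticChunked, schedule(dynamic, N)
// to BaseDynamicChunked, and schedule(guided) with a simd modifier to
// BaseGuidedSimd. Ordering and monotonicity bits are OR'ed in afterwards by
// the helpers below.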

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations.
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // Monotonic is the default in the OpenMP runtime library, so there is
      // no need to set it explicitly.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}
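
// Concretely, under the OpenMP 5.1 rule implemented above: schedule(static)
// and any schedule combined with an ordered clause stay monotonic (the
// runtime's default, so no bit is set), while e.g. schedule(dynamic, 4)
// without an explicit modifier picks up ModifierNonmonotonic.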

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause,
                          bool HasDistScheduleChunks) {
  OMPScheduleType BaseSchedule = getOpenMPBaseScheduleType(
      ClauseKind, HasChunks, HasSimdModifier, HasDistScheduleChunks);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result));
  return Result;
}
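
// Worked example of the three-stage composition above (values follow from the
// helpers, shown here for illustration): for schedule(dynamic, 4) with no
// modifiers and no ordered clause,
//   getOpenMPBaseScheduleType         -> BaseDynamicChunked
//   getOpenMPOrderingScheduleType     -> BaseDynamicChunked |
//                                        ModifierUnordered
//                                        (== UnorderedDynamicChunked)
//   getOpenMPMonotonicityScheduleType -> UnorderedDynamicChunked |
//                                        ModifierNonmonotonic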

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}
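
// CFG sketch of the two cases handled above (illustrative):
//
//   redirect branch:  Source --br--> Succ      becomes  Source --br--> Target
//                     (Succ's PHIs drop their Source entry)
//
//   degenerate block: Source (no terminator)   becomes  Source --br--> Target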

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch, DebugLoc DL) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  // If the `Old` block is empty then there are no instructions to move. But in
  // the new debug scheme, it could have trailing debug records which would be
  // moved to `New` in `spliceDebugInfoEmptyBlock`. We don't want that, for two
  // reasons:
  // 1. If `New` is also empty, `BasicBlock::splice` crashes.
  // 2. Even if `New` is not empty, the rationale for moving those records to
  //    `New` (in `spliceDebugInfoEmptyBlock`) does not apply here. That
  //    function assumes that `Old` is optimized out and is going away. This is
  //    not the case here: the `Old` block is still being used, e.g. a branch
  //    instruction is added to it later in this function.
  // So we call `BasicBlock::splice` only when `Old` is not empty.
  if (!Old->empty())
    New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch) {
    auto *NewBr = BranchInst::Create(New, Old);
    NewBr->setDebugLoc(DL);
  }
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          DebugLoc DL, llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch, DL);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
static Value *createFakeIntVal(IRBuilderBase &Builder,
                               OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                               SmallVectorImpl<Instruction *> &ToBeDeleted,
                               OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                               const Twine &Name = "", bool AsPtr = true,
                               bool Is64Bit = false) {
  Builder.restoreIP(OuterAllocaIP);
  IntegerType *IntTy = Is64Bit ? Builder.getInt64Ty() : Builder.getInt32Ty();
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(IntTy, nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal = Builder.CreateLoad(IntTy, FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value.
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal = Builder.CreateLoad(IntTy, FakeVal, Name + ".use");
  } else {
    UseFakeVal = cast<BinaryOperator>(Builder.CreateAdd(
        FakeVal, Is64Bit ? Builder.getInt64(10) : Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}
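
// A minimal usage sketch (hypothetical caller; names are illustrative): the
// fake value forces the outliner to materialize an argument slot, and the
// recorded instructions are erased again once the real argument is wired up.
//
//   SmallVector<Instruction *, 4> ToBeDeleted;
//   Value *TripCount = createFakeIntVal(Builder, OuterAllocaIP, ToBeDeleted,
//                                       InnerAllocaIP, "trip_count",
//                                       /*AsPtr=*/false, /*Is64Bit=*/true);
//   // ... outline the region so TripCount becomes a parameter ...
//   for (Instruction *I : llvm::reverse(ToBeDeleted))
//     I->eraseFromParent();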

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}
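
// The setters above are plain set/clear bitmask updates over the enum values
// defined earlier. For example (illustrative):
//
//   OpenMPIRBuilderConfig Config;
//   Config.setHasRequiresUnifiedSharedMemory(true);   // flags = 0x008
//   Config.setHasRequiresReverseOffload(true);        // flags = 0x00a
//   Config.setHasRequiresUnifiedSharedMemory(false);  // flags = 0x002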

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  constexpr size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));

  Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait);

  Value *DynCGroupMemFallbackFlag =
      Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback));
  DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2);
  Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag);

  assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
    NumThreads3D =
        Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGroupMem};
}

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }
    Fn->setCallingConv(Config.getRuntimeCC());
    // Add information if the runtime function takes a callback function.
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        // - The callback callee is argument number 2 (microtask).
        // - The first two arguments of the callback callee are unknown (-1).
        // - All variadic arguments to the runtime function are passed to the
        //   callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }
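
    // The resulting annotation reads roughly like this in textual IR (format
    // per the LLVM LangRef section on callback metadata):
    //
    //   declare !callback !0 void @__kmpc_fork_call(ptr, i32, ptr, ...)
    //   !0 = !{!1}
    //   !1 = !{i64 2, i64 -1, i64 -1, i1 true}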

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}

Expected<BasicBlock *>
OpenMPIRBuilder::FinalizationInfo::getFiniBB(IRBuilderBase &Builder) {
  if (!FiniBB) {
    Function *ParentFunc = Builder.GetInsertBlock()->getParent();
    IRBuilderBase::InsertPointGuard Guard(Builder);
    FiniBB = BasicBlock::Create(Builder.getContext(), ".fini", ParentFunc);
    Builder.SetInsertPoint(FiniBB);
    // FiniCB adds the branch to the exit stub.
    if (Error Err = FiniCB(Builder.saveIP()))
      return Err;
  }
  return FiniBB;
}

Error OpenMPIRBuilder::FinalizationInfo::mergeFiniBB(IRBuilderBase &Builder,
                                                     BasicBlock *OtherFiniBB) {
  // Simple case: FiniBB does not exist yet: re-use OtherFiniBB.
  if (!FiniBB) {
    FiniBB = OtherFiniBB;

    Builder.SetInsertPoint(FiniBB->getFirstNonPHIIt());
    if (Error Err = FiniCB(Builder.saveIP()))
      return Err;

    return Error::success();
  }

  // Move instructions from FiniBB to the start of OtherFiniBB.
  auto EndIt = FiniBB->end();
  if (FiniBB->size() >= 1)
    if (auto Prev = std::prev(EndIt); Prev->isTerminator())
      EndIt = Prev;
  OtherFiniBB->splice(OtherFiniBB->getFirstNonPHIIt(), FiniBB, FiniBB->begin(),
                      EndIt);

  FiniBB->replaceAllUsesWith(OtherFiniBB);
  FiniBB->eraseFromParent();
  FiniBB = OtherFiniBB;
  return Error::success();
}

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

CallInst *OpenMPIRBuilder::createRuntimeFunctionCall(FunctionCallee Callee,
                                                     ArrayRef<Value *> Args,
                                                     StringRef Name) {
  CallInst *Call = Builder.CreateCall(Callee, Args, Name);
  Call->setCallingConv(Config.getRuntimeCC());
  return Call;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

// Move static allocas in \p Block to the entry block of its function so that
// later passes do not mistake them for dynamic stack allocations.
static void hoistStaticAllocasToEntryBlock(llvm::BasicBlock &Block) {
  llvm::SmallVector<llvm::AllocaInst *> AllocasToMove;

  auto ShouldHoistAlloca = [](const llvm::AllocaInst &AllocaInst) {
    // TODO: For now, we support simple static allocations; we might need to
    // move non-static ones as well. However, this will need further analysis
    // to move the length arguments as well.
    return AllocaInst.isStaticAlloca();
  };

  for (llvm::Instruction &Inst : Block)
    if (auto *AllocaInst = llvm::dyn_cast<llvm::AllocaInst>(&Inst))
      if (ShouldHoistAlloca(*AllocaInst))
        AllocasToMove.push_back(AllocaInst);

  auto InsertPoint =
      Block.getParent()->getEntryBlock().getTerminator()->getIterator();

  for (llvm::Instruction *AllocaInst : AllocasToMove)
    AllocaInst->moveBefore(InsertPoint);
}

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions
    // which are used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock */ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn =
        Extractor.extractCodeRegion(CEAC, OI.Inputs, OI.Outputs);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "   Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away; we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator()) {
          // Absorb any debug value that the terminator may have.
          if (OI.EntryBB->getTerminator())
            OI.EntryBB->getTerminator()->adoptDbgRecords(
                &ArtificialEntry, I.getIterator(), false);
          continue;
        }

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->hasNUses(1));

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);

    if (OI.FixUpNonEntryAllocas) {
      PostDominatorTree PostDomTree(*OutlinedFn);
      for (llvm::BasicBlock &BB : *OutlinedFn)
        if (PostDomTree.properlyDominates(&BB, &OutlinedFn->getEntryBlock()))
          hoistStaticAllocasToEntryBlock(BB);
    }
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target
  // region, which may inject allocas that need to be moved to the entry block
  // of our target or risk malformed optimisations by later passes. This is
  // only relevant for the device pass, which appears to be a little more
  // delicate when it comes to optimisations (however, we do not block on that
  // here; it's up to the inserter to the list to do so). This notably has to
  // occur after the OutlinedInfo candidates have been extracted, so we have an
  // end product that will not be implicitly adversely affected by any raises
  // unless intentionally appended to the list.
  // NOTE: This only does so for ConstantData; it could be extended to
  // ConstantExpr's with further effort, however, they should largely be
  // folded when they get here. Extending it to runtime defined/read+writeable
  // allocation sizes would be non-trivial (we would need to factor in
  // movement of any stores to variables the allocation size depends on, as
  // well as the usual loads, otherwise it'll yield the wrong result after
  // movement) and would likely be more suitable as an LLVM optimisation pass.
  for (Function *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }

  IsFinalized = true;
}

bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}
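
// The global emitted above has this shape in textual IR (illustrative name
// and value):
//
//   @__omp_rtl_debug_kind = weak_odr hidden constant i32 0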

void OpenMPIRBuilder::emitUsed(StringRef Name, ArrayRef<WeakTrackingVH> List) {
  if (List.empty())
    return;

  // Convert List to what ConstantArray needs.
  SmallVector<Constant *, 8> UsedArray;
  UsedArray.resize(List.size());
  for (unsigned I = 0, E = List.size(); I != E; ++I)
    UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
        cast<Constant>(&*List[I]), Builder.getPtrTy());

  if (UsedArray.empty())
    return;
  ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());

  auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                ConstantArray::get(ATy, UsedArray), Name);

  GV->setSection("llvm.metadata");
}

GlobalVariable *
OpenMPIRBuilder::emitKernelExecutionMode(StringRef KernelName,
                                         omp::OMPTgtExecModeFlags Mode) {
  auto *Int8Ty = Builder.getInt8Ty();
  auto *GVMode = new GlobalVariable(
      M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
      ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
  GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
  return GVMode;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};

    size_t SrcLocStrArgIdx = 4;
    if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
            ->getPointerAddressSpace() !=
        IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
      IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
          SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}
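
// For reference, the emitted ident_t global has this shape (illustrative
// values; field order matches IdentData above: reserved, flags, reserved,
// source-string size, source string):
//
//   @0 = private unnamed_addr constant %struct.ident_t
//            { i32 0, i32 2, i32 0, i32 22, ptr @.str }, align 8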

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalString(
        LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
        &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
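
// The buffer built above yields the ";file;function;line;column;;" format the
// runtime expects; e.g. getOrCreateSrcLocStr("foo", "test.c", 12, 3, Size)
// produces the string ";test.c;foo;12;3;;" (matching the "unknown" default
// below).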

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return createRuntimeFunctionCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result = createRuntimeFunctionCall(
      getOrCreateRuntimeFunctionPtr(UseCancelBarrier
                                        ? OMPRTL___kmpc_cancel_barrier
                                        : OMPRTL___kmpc_barrier),
      Args);

  if (UseCancelBarrier && CheckCancelFlag)
    if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
      return Err;

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition) {
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);

    // Even if the if condition evaluates to false, this should count as a
    // cancellation point.
    Builder.SetInsertPoint(ElseTI);
    auto ElseIP = Builder.saveIP();

    InsertPointOrErrorTy IPOrErr = createCancellationPoint(
        LocationDescription{ElseIP, Loc.DL}, CanceledDirective);
    if (!IPOrErr)
      return IPOrErr;
  }

  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = createRuntimeFunctionCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancellationPoint(const LocationDescription &Loc,
                                         omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();
  Builder.SetInsertPoint(UI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = createRuntimeFunctionCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  updateToLocation(Loc);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = createRuntimeFunctionCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Value *OutlinedFnID,
    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep it alive and could therefore inline the
  // host function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads, so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}

Error OpenMPIRBuilder::emitCancelationCheckImpl(
    Value *CancelFlag, omp::Directive CanceledDirective) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  auto &FI = FinalizationStack.back();
  Expected<BasicBlock *> FiniBBOrErr = FI.getFiniBB(Builder);
  if (!FiniBBOrErr)
    return FiniBBOrErr.takeError();
  Builder.SetInsertPoint(CancellationBlock);
  Builder.CreateBr(*FiniBBOrErr);

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
  return Error::success();
}

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the device.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP DeviceRTL runtime function
// (kmpc_parallel_60).
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add alloca for kernel args.
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add address space cast if array for storing arguments is not allocated
  // in address space 0.
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_60.
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build kmpc_parallel_60 call.
  Value *Parallel60CallArgs[] = {
      /* identifier */ Ident,
      /* global thread num */ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* proc bind */ Builder.getInt32(-1),
      /* outlined function */ &OutlinedFn,
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars),
      /* strict for number of threads */ Builder.getInt32(0)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_60);

  OMPIRBuilder->createRuntimeFunctionCall(RTLFn, Parallel60CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_60 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
1488
1489// Callback used to create OpenMP runtime calls to support
1490// omp parallel clause for the host.
1491// We need to use this callback to replace call to the OutlinedFn in OuterFn
1492// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if])
1493static void
1494hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
1495 Function *OuterFn, Value *Ident, Value *IfCondition,
1496 Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1497 const SmallVector<Instruction *, 4> &ToBeDeleted) {
1498 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1499 FunctionCallee RTLFn;
1500 if (IfCondition) {
1501 RTLFn =
1502 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
1503 } else {
1504 RTLFn =
1505 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
1506 }
1507 if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
1508 if (!F->hasMetadata(LLVMContext::MD_callback)) {
1509 LLVMContext &Ctx = F->getContext();
1510 MDBuilder MDB(Ctx);
1511 // Annotate the callback behavior of the __kmpc_fork_call:
1512 // - The callback callee is argument number 2 (microtask).
1513 // - The first two arguments of the callback callee are unknown (-1).
1514 // - All variadic arguments to the __kmpc_fork_call are passed to the
1515 // callback callee.
1516 F->addMetadata(LLVMContext::MD_callback,
1518 2, {-1, -1},
1519 /* VarArgsArePassed */ true)}));
1520 }
1521 }
1522 // Add some known attributes.
1523 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1524 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1525 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1526
1527 assert(OutlinedFn.arg_size() >= 2 &&
1528 "Expected at least tid and bounded tid as arguments");
1529 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1530
1531 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1532 CI->getParent()->setName("omp_parallel");
1533 Builder.SetInsertPoint(CI);
1534
1535 // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
1536 Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
1537 &OutlinedFn};
1538
1539 SmallVector<Value *, 16> RealArgs;
1540 RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
1541 if (IfCondition) {
1542 Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
1543 RealArgs.push_back(Cond);
1544 }
1545 RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
1546
1547 // __kmpc_fork_call_if always expects a void ptr as the last argument
1548 // If there are no arguments, pass a null pointer.
1549 auto PtrTy = OMPIRBuilder->VoidPtr;
1550 if (IfCondition && NumCapturedVars == 0) {
1551 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1552 RealArgs.push_back(NullPtrValue);
1553 }
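// E.g., with two captured pointers %a and %b and no if clause, the emitted
// call looks roughly like:
//   call void (ptr, i32, ptr, ...)
//       @__kmpc_fork_call(ptr %ident, i32 2, ptr @foo..omp_par, ptr %a, ptr %b)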
1554
1555 OMPIRBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
1556
1557 LLVM_DEBUG(dbgs() << "With fork_call placed: "
1558 << *Builder.GetInsertBlock()->getParent() << "\n");
1559
1560 // Initialize the local TID stack location with the argument value.
1561 Builder.SetInsertPoint(PrivTID);
1562 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1563 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1564 PrivTIDAddr);
1565
1566 // Remove redundant call to the outlined function.
1567 CI->eraseFromParent();
1568
1569 for (Instruction *I : ToBeDeleted) {
1570 I->eraseFromParent();
1571 }
1572}
1573
1574OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
1575 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1576 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1577 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1578 omp::ProcBindKind ProcBind, bool IsCancellable) {
1579 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1580
1581 if (!updateToLocation(Loc))
1582 return Loc.IP;
1583
1584 uint32_t SrcLocStrSize;
1585 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1586 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1587 Value *ThreadID = getOrCreateThreadID(Ident);
1588 // If we generate code for the target device, we need to allocate the
1589 // struct for aggregate params in the device's default alloca address
1590 // space. The OpenMP runtime requires that the params of the extracted
1591 // functions are passed as zero-address-space pointers. This flag ensures
1592 // that extracted function arguments are declared in the zero address space.
1593 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1594
1595 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1596 // only if we compile for host side.
1597 if (NumThreads && !Config.isTargetDevice()) {
1598 Value *Args[] = {
1599 Ident, ThreadID,
1600 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1601 createRuntimeFunctionCall(
1602 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1603 }
1604
1605 if (ProcBind != OMP_PROC_BIND_default) {
1606 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1607 Value *Args[] = {
1608 Ident, ThreadID,
1609 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1610 createRuntimeFunctionCall(
1611 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1612 }
1613
1614 BasicBlock *InsertBB = Builder.GetInsertBlock();
1615 Function *OuterFn = InsertBB->getParent();
1616
1617 // Save the outer alloca block because the insertion iterator may get
1618 // invalidated and we still need this later.
1619 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1620
1621 // Vector to remember instructions we used only during the modeling but which
1622 // we want to delete at the end.
1623 SmallVector<Instruction *, 4> ToBeDeleted;
1624
1625 // Change the location to the outer alloca insertion point to create and
1626 // initialize the allocas we pass into the parallel region.
1627 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1628 Builder.restoreIP(NewOuter);
1629 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1630 AllocaInst *ZeroAddrAlloca =
1631 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1632 Instruction *TIDAddr = TIDAddrAlloca;
1633 Instruction *ZeroAddr = ZeroAddrAlloca;
1634 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1635 // Add additional casts to enforce pointers in zero address space
1636 TIDAddr = new AddrSpaceCastInst(
1637 TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
1638 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1639 ToBeDeleted.push_back(TIDAddr);
1640 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1641 PointerType::get(M.getContext(), 0),
1642 "zero.addr.ascast");
1643 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1644 ToBeDeleted.push_back(ZeroAddr);
1645 }
1646
1647 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1648 // associated arguments in the outlined function, so we delete them later.
1649 ToBeDeleted.push_back(TIDAddrAlloca);
1650 ToBeDeleted.push_back(ZeroAddrAlloca);
1651
1652 // Create an artificial insertion point that will also ensure the blocks we
1653 // are about to split are not degenerated.
1654 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1655
1656 BasicBlock *EntryBB = UI->getParent();
1657 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1658 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1659 BasicBlock *PRegPreFiniBB =
1660 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1661 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1662
1663 auto FiniCBWrapper = [&](InsertPointTy IP) {
1664 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1665 // target to the region exit block.
1666 if (IP.getBlock()->end() == IP.getPoint()) {
1667 IRBuilder<>::InsertPointGuard IPG(Builder);
1668 Builder.restoreIP(IP);
1669 Instruction *I = Builder.CreateBr(PRegExitBB);
1670 IP = InsertPointTy(I->getParent(), I->getIterator());
1671 }
1672 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1673 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1674 "Unexpected insertion point for finalization call!");
1675 return FiniCB(IP);
1676 };
1677
1678 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1679
1680 // Generate the privatization allocas in the block that will become the entry
1681 // of the outlined function.
1682 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1683 InsertPointTy InnerAllocaIP = Builder.saveIP();
1684
1685 AllocaInst *PrivTIDAddr =
1686 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1687 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1688
1689 // Add some fake uses for OpenMP provided arguments.
1690 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1691 Instruction *ZeroAddrUse =
1692 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1693 ToBeDeleted.push_back(ZeroAddrUse);
1694
1695 // EntryBB
1696 // |
1697 // V
1698 // PRegionEntryBB <- Privatization allocas are placed here.
1699 // |
1700 // V
1701 // PRegionBodyBB <- BodyGen is invoked here.
1702 // |
1703 // V
1704 // PRegPreFiniBB <- The block we will start finalization from.
1705 // |
1706 // V
1707 // PRegionExitBB <- A common exit to simplify block collection.
1708 //
1709
1710 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1711
1712 // Let the caller create the body.
1713 assert(BodyGenCB && "Expected body generation callback!");
1714 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1715 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1716 return Err;
1717
1718 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1719
1720 OutlineInfo OI;
1721 if (Config.isTargetDevice()) {
1722 // Generate OpenMP target specific runtime call
1723 OI.PostOutlineCB = [=, ToBeDeletedVec =
1724 std::move(ToBeDeleted)](Function &OutlinedFn) {
1725 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1726 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1727 ThreadID, ToBeDeletedVec);
1728 };
1729 OI.FixUpNonEntryAllocas = true;
1730 } else {
1731 // Generate OpenMP host runtime call
1732 OI.PostOutlineCB = [=, ToBeDeletedVec =
1733 std::move(ToBeDeleted)](Function &OutlinedFn) {
1734 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1735 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1736 };
1737 OI.FixUpNonEntryAllocas = true;
1738 }
1739
1740 OI.OuterAllocaBB = OuterAllocaBlock;
1741 OI.EntryBB = PRegEntryBB;
1742 OI.ExitBB = PRegExitBB;
1743
1744 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1745 SmallVector<BasicBlock *, 32> Blocks;
1746 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1747
1748 CodeExtractorAnalysisCache CEAC(*OuterFn);
1749 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1750 /* AggregateArgs */ false,
1751 /* BlockFrequencyInfo */ nullptr,
1752 /* BranchProbabilityInfo */ nullptr,
1753 /* AssumptionCache */ nullptr,
1754 /* AllowVarArgs */ true,
1755 /* AllowAlloca */ true,
1756 /* AllocationBlock */ OuterAllocaBlock,
1757 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1758
1759 // Find inputs to, outputs from the code region.
1760 BasicBlock *CommonExit = nullptr;
1761 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1762 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1763
1764 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1765 /*CollectGlobalInputs=*/true);
1766
1767 Inputs.remove_if([&](Value *I) {
1768 if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
1769 return GV->getValueType() == OpenMPIRBuilder::Ident;
1770
1771 return false;
1772 });
1773
1774 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1775
1776 FunctionCallee TIDRTLFn =
1777 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1778
1779 auto PrivHelper = [&](Value &V) -> Error {
1780 if (&V == TIDAddr || &V == ZeroAddr) {
1781 OI.ExcludeArgsFromAggregate.push_back(&V);
1782 return Error::success();
1783 }
1784
1785 SetVector<Use *> Uses;
1786 for (Use &U : V.uses())
1787 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1788 if (ParallelRegionBlockSet.count(UserI->getParent()))
1789 Uses.insert(&U);
1790
1791 // __kmpc_fork_call expects extra arguments as pointers. If the input
1792 // already has a pointer type, everything is fine. Otherwise, store the
1793 // value onto stack and load it back inside the to-be-outlined region. This
1794 // will ensure only the pointer will be passed to the function.
1795 // FIXME: if there are more than 15 trailing arguments, they must be
1796 // additionally packed in a struct.
1797 Value *Inner = &V;
1798 if (!V.getType()->isPointerTy()) {
1799 IRBuilder<>::InsertPointGuard Guard(Builder);
1800 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1801
1802 Builder.restoreIP(OuterAllocaIP);
1803 Value *Ptr =
1804 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1805
1806 // Store to stack at end of the block that currently branches to the entry
1807 // block of the to-be-outlined region.
1808 Builder.SetInsertPoint(InsertBB,
1809 InsertBB->getTerminator()->getIterator());
1810 Builder.CreateStore(&V, Ptr);
1811
1812 // Load back next to allocations in the to-be-outlined region.
1813 Builder.restoreIP(InnerAllocaIP);
1814 Inner = Builder.CreateLoad(V.getType(), Ptr);
1815 }
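// E.g., a non-pointer input %x of type i32 is forwarded roughly as:
//   %x.reloaded = alloca i32            ; at the outer alloca IP
//   store i32 %x, ptr %x.reloaded       ; before branching into the region
//   %0 = load i32, ptr %x.reloaded      ; at the inner alloca IP; %0 is
//                                       ; what the outlined region now uses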
1816
1817 Value *ReplacementValue = nullptr;
1818 CallInst *CI = dyn_cast<CallInst>(&V);
1819 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1820 ReplacementValue = PrivTID;
1821 } else {
1822 InsertPointOrErrorTy AfterIP =
1823 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1824 if (!AfterIP)
1825 return AfterIP.takeError();
1826 Builder.restoreIP(*AfterIP);
1827 InnerAllocaIP = {
1828 InnerAllocaIP.getBlock(),
1829 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1830
1831 assert(ReplacementValue &&
1832 "Expected copy/create callback to set replacement value!");
1833 if (ReplacementValue == &V)
1834 return Error::success();
1835 }
1836
1837 for (Use *UPtr : Uses)
1838 UPtr->set(ReplacementValue);
1839
1840 return Error::success();
1841 };
1842
1843 // Reset the inner alloca insertion as it will be used for loading the values
1844 // wrapped into pointers before passing them into the to-be-outlined region.
1845 // Configure it to insert immediately after the fake use of the zero address
1846 // so that the loaded values are available in the generated body and the
1847 // OpenMP-related values (thread ID and zero address pointers) remain leading
1848 // in the argument list.
1849 InnerAllocaIP = IRBuilder<>::InsertPoint(
1850 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1851
1852 // Reset the outer alloca insertion point to the entry of the relevant block
1853 // in case it was invalidated.
1854 OuterAllocaIP = IRBuilder<>::InsertPoint(
1855 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1856
1857 for (Value *Input : Inputs) {
1858 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1859 if (Error Err = PrivHelper(*Input))
1860 return Err;
1861 }
1862 LLVM_DEBUG({
1863 for (Value *Output : Outputs)
1864 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1865 });
1866 assert(Outputs.empty() &&
1867 "OpenMP outlining should not produce live-out values!");
1868
1869 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1870 LLVM_DEBUG({
1871 for (auto *BB : Blocks)
1872 dbgs() << " PBR: " << BB->getName() << "\n";
1873 });
1874
1875 // Adjust the finalization stack, verify the adjustment, and call the
1876 // finalize function a last time to finalize values between the pre-fini
1877 // block and the exit block if we left the parallel "the normal way".
1878 auto FiniInfo = FinalizationStack.pop_back_val();
1879 (void)FiniInfo;
1880 assert(FiniInfo.DK == OMPD_parallel &&
1881 "Unexpected finalization stack state!");
1882
1883 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1884
1885 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1886 Expected<BasicBlock *> FiniBBOrErr = FiniInfo.getFiniBB(Builder);
1887 if (!FiniBBOrErr)
1888 return FiniBBOrErr.takeError();
1889 {
1890 IRBuilderBase::InsertPointGuard Guard(Builder);
1891 Builder.restoreIP(PreFiniIP);
1892 Builder.CreateBr(*FiniBBOrErr);
1893 // There's currently a branch to omp.par.exit. Delete it. We will get there
1894 // via the fini block.
1895 if (Instruction *Term = Builder.GetInsertBlock()->getTerminator())
1896 Term->eraseFromParent();
1897 }
1898
1899 // Register the outlined info.
1900 addOutlineInfo(std::move(OI));
1901
1902 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1903 UI->eraseFromParent();
1904
1905 return AfterIP;
1906}
1907
1908void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1909 // Build call void __kmpc_flush(ident_t *loc)
1910 uint32_t SrcLocStrSize;
1911 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1912 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1913
1914 createRuntimeFunctionCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush),
1915 Args);
1916}
1917
1918void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1919 if (!updateToLocation(Loc))
1920 return;
1921 emitFlush(Loc);
1922}
1923
1924void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1925 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1926 // global_tid);
1927 uint32_t SrcLocStrSize;
1928 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1929 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1930 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1931
1932 // Ignore return result until untied tasks are supported.
1933 createRuntimeFunctionCall(
1934 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait), Args);
1935}
1936
1937void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1938 if (!updateToLocation(Loc))
1939 return;
1940 emitTaskwaitImpl(Loc);
1941}
1942
1943void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1944 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1945 uint32_t SrcLocStrSize;
1946 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1947 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1948 Constant *I32Null = ConstantInt::getNullValue(Int32);
1949 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1950
1951 createRuntimeFunctionCall(
1952 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield), Args);
1953}
1954
1955void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1956 if (!updateToLocation(Loc))
1957 return;
1958 emitTaskyieldImpl(Loc);
1959}
1960
1961// Processes the dependencies in Dependencies and does the following
1962// - Allocates space on the stack of an array of DependInfo objects
1963// - Populates each DependInfo object with relevant information of
1964// the corresponding dependence.
1965// - All code is inserted in the entry block of the current function.
1966 static Value *emitTaskDependencies(
1967 OpenMPIRBuilder &OMPBuilder,
1968 const SmallVectorImpl<OpenMPIRBuilder::DependData> &Dependencies) {
1969 // Early return if we have no dependencies to process
1970 if (Dependencies.empty())
1971 return nullptr;
1972
1973 // Given a vector of DependData objects, in this function we create an
1974 // array on the stack that holds kmp_dep_info objects corresponding
1975 // to each dependency. This is then passed to the OpenMP runtime.
1976 // For example, if there are 'n' dependencies then the following pseudo
1977 // code is generated. Assume the first dependence is on a variable 'a'.
1978 //
1979 // \code{c}
1980 // DepArray = alloc(n x sizeof(kmp_depend_info));
1981 // idx = 0;
1982 // DepArray[idx].base_addr = ptrtoint(&a);
1983 // DepArray[idx].len = 8;
1984 // DepArray[idx].flags = Dep.DepKind; /* (See OMPConstants.h for DepKind) */
1985 // ++idx;
1986 // DepArray[idx].base_addr = ...;
1987 // \endcode
1988
1989 IRBuilderBase &Builder = OMPBuilder.Builder;
1990 Type *DependInfo = OMPBuilder.DependInfo;
1991 Module &M = OMPBuilder.M;
1992
1993 Value *DepArray = nullptr;
1994 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1995 Builder.SetInsertPoint(
1996 OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
1997
1998 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1999 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
2000
2001 Builder.restoreIP(OldIP);
2002
2003 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
2004 Value *Base =
2005 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
2006 // Store the pointer to the variable
2007 Value *Addr = Builder.CreateStructGEP(
2008 DependInfo, Base,
2009 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
2010 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
2011 Builder.CreateStore(DepValPtr, Addr);
2012 // Store the size of the variable
2013 Value *Size = Builder.CreateStructGEP(
2014 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
2015 Builder.CreateStore(
2016 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
2017 Size);
2018 // Store the dependency kind
2019 Value *Flags = Builder.CreateStructGEP(
2020 DependInfo, Base,
2021 static_cast<unsigned int>(RTLDependInfoFields::Flags));
2022 Builder.CreateStore(
2023 ConstantInt::get(Builder.getInt8Ty(),
2024 static_cast<unsigned int>(Dep.DepKind)),
2025 Flags);
2026 }
2027 return DepArray;
2028}
2029
2030/// Create the task duplication function passed to kmpc_taskloop.
2031Expected<Value *> OpenMPIRBuilder::createTaskDuplicationFunction(
2032 Type *PrivatesTy, int32_t PrivatesIndex, TaskDupCallbackTy DupCB) {
2033 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
2034 if (!DupCB)
2035 return ConstantPointerNull::get(
2036 PointerType::get(Builder.getContext(), ProgramAddressSpace));
2037
2038 // From OpenMP Runtime p_task_dup_t:
2039 // Routine optionally generated by the compiler for setting the lastprivate
2040 // flag and calling needed constructors for private/firstprivate objects (used
2041 // to form taskloop tasks from the pattern task). Parameters: dest task, src task,
2042 // lastprivate flag.
2043 // typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
2044
2045 auto *VoidPtrTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
2046
2047 FunctionType *DupFuncTy = FunctionType::get(
2048 Builder.getVoidTy(), {VoidPtrTy, VoidPtrTy, Builder.getInt32Ty()},
2049 /*isVarArg=*/false);
2050
2051 Function *DupFunction = Function::Create(DupFuncTy, Function::InternalLinkage,
2052 "omp_taskloop_dup", M);
2053 Value *DestTaskArg = DupFunction->getArg(0);
2054 Value *SrcTaskArg = DupFunction->getArg(1);
2055 Value *LastprivateFlagArg = DupFunction->getArg(2);
2056 DestTaskArg->setName("dest_task");
2057 SrcTaskArg->setName("src_task");
2058 LastprivateFlagArg->setName("lastprivate_flag");
2059
2060 IRBuilderBase::InsertPointGuard Guard(Builder);
2061 Builder.SetInsertPoint(
2062 BasicBlock::Create(Builder.getContext(), "entry", DupFunction));
2063
2064 auto GetTaskContextPtrFromArg = [&](Value *Arg) -> Value * {
2065 Type *TaskWithPrivatesTy =
2066 StructType::get(Builder.getContext(), {Task, PrivatesTy});
2067 Value *TaskPrivates = Builder.CreateGEP(
2068 TaskWithPrivatesTy, Arg, {Builder.getInt32(0), Builder.getInt32(1)});
2069 Value *ContextPtr = Builder.CreateGEP(
2070 PrivatesTy, TaskPrivates,
2071 {Builder.getInt32(0), Builder.getInt32(PrivatesIndex)});
2072 return ContextPtr;
2073 };
2074
2075 Value *DestTaskContextPtr = GetTaskContextPtrFromArg(DestTaskArg);
2076 Value *SrcTaskContextPtr = GetTaskContextPtrFromArg(SrcTaskArg);
2077
2078 DestTaskContextPtr->setName("destPtr");
2079 SrcTaskContextPtr->setName("srcPtr");
2080
2081 InsertPointTy AllocaIP(&DupFunction->getEntryBlock(),
2082 DupFunction->getEntryBlock().begin());
2083 InsertPointTy CodeGenIP = Builder.saveIP();
2084 InsertPointOrErrorTy AfterIPOrError =
2085 DupCB(AllocaIP, CodeGenIP, DestTaskContextPtr, SrcTaskContextPtr);
2086 if (!AfterIPOrError)
2087 return AfterIPOrError.takeError();
2088 Builder.restoreIP(*AfterIPOrError);
2089
2090 Builder.CreateRetVoid();
2091
2092 return DupFunction;
2093}
2094
2095OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
2096 const LocationDescription &Loc, InsertPointTy AllocaIP,
2097 BodyGenCallbackTy BodyGenCB,
2098 function_ref<Expected<CanonicalLoopInfo *>()> LoopInfoCB,
2099 Value *LBVal, Value *UBVal, Value *StepVal, bool Tied,
2100 TaskDupCallbackTy DupCB, Value *TaskContextStructPtrVal) {
2101
2102 if (!updateToLocation(Loc))
2103 return InsertPointTy();
2104
2105 uint32_t SrcLocStrSize;
2106 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2107 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2108
2109 BasicBlock *TaskloopExitBB =
2110 splitBB(Builder, /*CreateBranch=*/true, "taskloop.exit");
2111 BasicBlock *TaskloopBodyBB =
2112 splitBB(Builder, /*CreateBranch=*/true, "taskloop.body");
2113 BasicBlock *TaskloopAllocaBB =
2114 splitBB(Builder, /*CreateBranch=*/true, "taskloop.alloca");
2115
2116 InsertPointTy TaskloopAllocaIP =
2117 InsertPointTy(TaskloopAllocaBB, TaskloopAllocaBB->begin());
2118 InsertPointTy TaskloopBodyIP =
2119 InsertPointTy(TaskloopBodyBB, TaskloopBodyBB->begin());
2120
2121 if (Error Err = BodyGenCB(TaskloopAllocaIP, TaskloopBodyIP))
2122 return Err;
2123
2124 Expected<CanonicalLoopInfo *> result = LoopInfoCB();
2125 if (!result) {
2126 return result.takeError();
2127 }
2128
2129 llvm::CanonicalLoopInfo *CLI = result.get();
2130 OutlineInfo OI;
2131 OI.EntryBB = TaskloopAllocaBB;
2132 OI.OuterAllocaBB = AllocaIP.getBlock();
2133 OI.ExitBB = TaskloopExitBB;
2134
2135 // Add the thread ID argument.
2136 SmallVector<Instruction *> ToBeDeleted;
2137 // dummy instruction to be used as a fake argument
2138 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
2139 Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, "global.tid", false));
2140 Value *FakeLB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2141 TaskloopAllocaIP, "lb", false, true);
2142 Value *FakeUB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2143 TaskloopAllocaIP, "ub", false, true);
2144 Value *FakeStep = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2145 TaskloopAllocaIP, "step", false, true);
2146 // For Taskloop, we want to force the bounds to be the first 3 inputs in
2147 // the aggregate struct.
2148 OI.Inputs.insert(FakeLB);
2149 OI.Inputs.insert(FakeUB);
2150 OI.Inputs.insert(FakeStep);
2151 if (TaskContextStructPtrVal)
2152 OI.Inputs.insert(TaskContextStructPtrVal);
2153 assert(
2154 ((TaskContextStructPtrVal && DupCB) ||
2155 (!TaskContextStructPtrVal && !DupCB)) &&
2156 "Task context struct ptr and duplication callback must be both set "
2157 "or both null");
2158
2159 // It isn't safe to run the duplication bodygen callback inside the
2160 // post-outlining callback, so it has to be run now, before we know the
2161 // real task shareds structure type.
2162 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
2163 Type *PointerTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
2164 Type *FakeSharedsTy = StructType::get(
2165 Builder.getContext(),
2166 {FakeLB->getType(), FakeUB->getType(), FakeStep->getType(), PointerTy});
2167 Expected<Value *> TaskDupFnOrErr = createTaskDuplicationFunction(
2168 FakeSharedsTy,
2169 /*PrivatesIndex: the pointer after the three indices above*/ 3, DupCB);
2170 if (!TaskDupFnOrErr) {
2171 return TaskDupFnOrErr.takeError();
2172 }
2173 Value *TaskDupFn = *TaskDupFnOrErr;
2174
2175 OI.PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Tied,
2176 TaskloopAllocaBB, CLI, Loc, TaskDupFn, ToBeDeleted,
2177 FakeLB, FakeUB, FakeStep](Function &OutlinedFn) mutable {
2178 // Replace the Stale CI by appropriate RTL function call.
2179 assert(OutlinedFn.hasOneUse() &&
2180 "there must be a single user for the outlined function");
2181 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2182
2183 // Create casts of the bounds values that are used when outlining to
2184 // replace the uses of the fake values with the real ones.
2185 BasicBlock *CodeReplBB = StaleCI->getParent();
2186 IRBuilderBase::InsertPoint CurrentIp = Builder.saveIP();
2187 Builder.SetInsertPoint(CodeReplBB->getFirstInsertionPt());
2188 Value *CastedLBVal =
2189 Builder.CreateIntCast(LBVal, Builder.getInt64Ty(), true, "lb64");
2190 Value *CastedUBVal =
2191 Builder.CreateIntCast(UBVal, Builder.getInt64Ty(), true, "ub64");
2192 Value *CastedStepVal =
2193 Builder.CreateIntCast(StepVal, Builder.getInt64Ty(), true, "step64");
2194 Builder.restoreIP(CurrentIp);
2195
2196 Builder.SetInsertPoint(StaleCI);
2197
2198 // Gather the arguments for emitting the runtime call for
2199 // @__kmpc_omp_task_alloc
2200 Function *TaskAllocFn =
2201 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2202
2203 Value *ThreadID = getOrCreateThreadID(Ident);
2204
2205 // Emit runtime call for @__kmpc_taskgroup
2206 Function *TaskgroupFn =
2207 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2208 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2209
2210 // The flags are set to 1 if the task is tied, 0 otherwise.
2211 Value *Flags = Builder.getInt32(Tied);
2212
2213 Value *TaskSize = Builder.getInt64(
2214 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2215
2216 AllocaInst *ArgStructAlloca =
2217 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
2218 assert(ArgStructAlloca &&
2219 "Unable to find the alloca instruction corresponding to arguments "
2220 "for extracted function");
2221 StructType *ArgStructType =
2222 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
2223 assert(ArgStructType && "Unable to find struct type corresponding to "
2224 "arguments for extracted function");
2225 Value *SharedsSize =
2226 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
2227
2228 // Emit the @__kmpc_omp_task_alloc runtime call
2229 // The runtime call returns a pointer to an area where the task captured
2230 // variables must be copied before the task is run (TaskData)
2231 CallInst *TaskData = Builder.CreateCall(
2232 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2233 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2234 /*task_func=*/&OutlinedFn});
2235
2236 Value *Shareds = StaleCI->getArgOperand(1);
2237 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2238 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2239 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2240 SharedsSize);
2241 // Get the pointers to the loop lb, ub, and step from the task ptr
2242 // and set up the lower bound, upper bound, and step values.
2243 llvm::Value *Lb = Builder.CreateGEP(
2244 ArgStructType, TaskShareds, {Builder.getInt32(0), Builder.getInt32(0)});
2245
2246 llvm::Value *Ub = Builder.CreateGEP(
2247 ArgStructType, TaskShareds, {Builder.getInt32(0), Builder.getInt32(1)});
2248
2249 llvm::Value *Step = Builder.CreateGEP(
2250 ArgStructType, TaskShareds, {Builder.getInt32(0), Builder.getInt32(2)});
2251 llvm::Value *Loadstep = Builder.CreateLoad(Builder.getInt64Ty(), Step);
2252
2253 // Set up the arguments for emitting the kmpc_taskloop runtime call,
2254 // setting default values for ifval, nogroup, sched, grainsize, task_dup.
2255 Value *IfVal = Builder.getInt32(1);
2256 Value *NoGroup = Builder.getInt32(1);
2257 Value *Sched = Builder.getInt32(0);
2258 Value *GrainSize = Builder.getInt64(0);
2259 Value *TaskDup = TaskDupFn;
2260
2261 Value *Args[] = {Ident, ThreadID, TaskData, IfVal, Lb, Ub,
2262 Loadstep, NoGroup, Sched, GrainSize, TaskDup};
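// For reference, the runtime entry point is declared in the OpenMP runtime
// roughly as follows (see kmp.h; exact integer widths may differ by ABI):
//   void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task,
//                        int if_val, kmp_uint64 *lb, kmp_uint64 *ub,
//                        kmp_int64 st, int nogroup, int sched,
//                        kmp_uint64 grainsize, void *task_dup);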
2263
2264 // taskloop runtime call
2265 Function *TaskloopFn =
2266 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskloop);
2267 Builder.CreateCall(TaskloopFn, Args);
2268
2269 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2270 Function *EndTaskgroupFn =
2271 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2272 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2273
2274 StaleCI->eraseFromParent();
2275
2276 Builder.SetInsertPoint(TaskloopAllocaBB, TaskloopAllocaBB->begin());
2277
2278 LoadInst *SharedsOutlined =
2279 Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2280 OutlinedFn.getArg(1)->replaceUsesWithIf(
2281 SharedsOutlined,
2282 [SharedsOutlined](Use &U) { return U.getUser() != SharedsOutlined; });
2283
2284 Value *IV = CLI->getIndVar();
2285 Type *IVTy = IV->getType();
2286 Constant *One = ConstantInt::get(Builder.getInt64Ty(), 1);
2287
2288 // When outlining, CodeExtractor will create GEPs to the LowerBound and
2289 // UpperBound. These GEPs can be reused for loading the task's respective
2290 // bounds.
2291 Value *TaskLB = nullptr;
2292 Value *TaskUB = nullptr;
2293 Value *LoadTaskLB = nullptr;
2294 Value *LoadTaskUB = nullptr;
2295 for (Instruction &I : *TaskloopAllocaBB) {
2296 if (I.getOpcode() == Instruction::GetElementPtr) {
2297 auto &Gep = cast<GetElementPtrInst>(I);
2298 if (ConstantInt *CI = dyn_cast<ConstantInt>(Gep.getOperand(2))) {
2299 switch (CI->getZExtValue()) {
2300 case 0:
2301 TaskLB = &I;
2302 break;
2303 case 1:
2304 TaskUB = &I;
2305 break;
2306 }
2307 }
2308 } else if (I.getOpcode() == Instruction::Load) {
2309 auto &Load = cast<LoadInst>(I);
2310 if (Load.getPointerOperand() == TaskLB) {
2311 assert(TaskLB != nullptr && "Expected value for TaskLB");
2312 LoadTaskLB = &I;
2313 } else if (Load.getPointerOperand() == TaskUB) {
2314 assert(TaskUB != nullptr && "Expected value for TaskUB");
2315 LoadTaskUB = &I;
2316 }
2317 }
2318 }
2319
2320 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
2321
2322 assert(LoadTaskLB != nullptr && "Expected value for LoadTaskLB");
2323 assert(LoadTaskUB != nullptr && "Expected value for LoadTaskUB");
2324 Value *TripCountMinusOne =
2325 Builder.CreateSDiv(Builder.CreateSub(LoadTaskUB, LoadTaskLB), FakeStep);
2326 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One, "trip_cnt");
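// Worked example: if a task receives lb = 0, ub = 8 (inclusive) and
// step = 2, the trip count is (8 - 0) / 2 + 1 = 5, covering iterations
// 0, 2, 4, 6, 8.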
2327 Value *CastedTripCount = Builder.CreateIntCast(TripCount, IVTy, true);
2328 Value *CastedTaskLB = Builder.CreateIntCast(LoadTaskLB, IVTy, true);
2329 // Set the trip count in the CLI.
2330 CLI->setTripCount(CastedTripCount);
2331
2332 Builder.SetInsertPoint(CLI->getBody(),
2333 CLI->getBody()->getFirstInsertionPt());
2334
2335 // The canonical loop is generated with a fixed lower bound. We need to
2336 // update the index calculation code to use the task's lower bound. The
2337 // generated code looks like this:
2338 // %omp_loop.iv = phi ...
2339 // ...
2340 // %tmp = mul [type] %omp_loop.iv, step
2341 // %user_index = add [type] tmp, lb
2342 // OpenMPIRBuilder constructs canonical loops to have exactly three uses of
2343 // the normalised induction variable:
2344 // 1. This one: converting the normalised IV to the user IV
2345 // 2. The increment (add)
2346 // 3. The comparison against the trip count (icmp)
2347 // (1) is the only use that is a mul followed by an add so this cannot match
2348 // other IR.
2349 assert(CLI->getIndVar()->getNumUses() == 3 &&
2350 "Canonical loop should have exactly three uses of the ind var");
2351 for (User *IVUser : CLI->getIndVar()->users()) {
2352 if (auto *Mul = dyn_cast<BinaryOperator>(IVUser)) {
2353 if (Mul->getOpcode() == Instruction::Mul) {
2354 for (User *MulUser : Mul->users()) {
2355 if (auto *Add = dyn_cast<BinaryOperator>(MulUser)) {
2356 if (Add->getOpcode() == Instruction::Add) {
2357 Add->setOperand(1, CastedTaskLB);
2358 }
2359 }
2360 }
2361 }
2362 }
2363 }
2364
2365 FakeLB->replaceAllUsesWith(CastedLBVal);
2366 FakeUB->replaceAllUsesWith(CastedUBVal);
2367 FakeStep->replaceAllUsesWith(CastedStepVal);
2368 for (Instruction *I : llvm::reverse(ToBeDeleted)) {
2369 I->eraseFromParent();
2370 }
2371 };
2372
2373 addOutlineInfo(std::move(OI));
2374 Builder.SetInsertPoint(TaskloopExitBB, TaskloopExitBB->begin());
2375 return Builder.saveIP();
2376}
2377
2378OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
2379 const LocationDescription &Loc, InsertPointTy AllocaIP,
2380 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
2381 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle,
2382 Value *Priority) {
2383
2384 if (!updateToLocation(Loc))
2385 return InsertPointTy();
2386
2387 uint32_t SrcLocStrSize;
2388 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2389 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2390 // The current basic block is split into four basic blocks. After outlining,
2391 // they will be mapped as follows:
2392 // ```
2393 // def current_fn() {
2394 // current_basic_block:
2395 // br label %task.exit
2396 // task.exit:
2397 // ; instructions after task
2398 // }
2399 // def outlined_fn() {
2400 // task.alloca:
2401 // br label %task.body
2402 // task.body:
2403 // ret void
2404 // }
2405 // ```
2406 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
2407 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
2408 BasicBlock *TaskAllocaBB =
2409 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
2410
2411 InsertPointTy TaskAllocaIP =
2412 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
2413 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
2414 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
2415 return Err;
2416
2417 OutlineInfo OI;
2418 OI.EntryBB = TaskAllocaBB;
2419 OI.OuterAllocaBB = AllocaIP.getBlock();
2420 OI.ExitBB = TaskExitBB;
2421
2422 // Add the thread ID argument.
2423 SmallVector<Instruction *, 4> ToBeDeleted;
2424 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
2425 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
2426
2427 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
2428 Mergeable, Priority, EventHandle, TaskAllocaBB,
2429 ToBeDeleted](Function &OutlinedFn) mutable {
2430 // Replace the Stale CI by appropriate RTL function call.
2431 assert(OutlinedFn.hasOneUse() &&
2432 "there must be a single user for the outlined function");
2433 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2434
2435 // HasShareds is true if any variables are captured in the outlined region,
2436 // false otherwise.
2437 bool HasShareds = StaleCI->arg_size() > 1;
2438 Builder.SetInsertPoint(StaleCI);
2439
2440 // Gather the arguments for emitting the runtime call for
2441 // @__kmpc_omp_task_alloc
2442 Function *TaskAllocFn =
2443 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2444
2445 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
2446 // call.
2447 Value *ThreadID = getOrCreateThreadID(Ident);
2448
2449 // Argument - `flags`
2450 // Task is tied iff (Flags & 1) == 1.
2451 // Task is untied iff (Flags & 1) == 0.
2452 // Task is final iff (Flags & 2) == 2.
2453 // Task is not final iff (Flags & 2) == 0.
2454 // Task is mergeable iff (Flags & 4) == 4.
2455 // Task is not mergeable iff (Flags & 4) == 0.
2456 // Task is priority iff (Flags & 32) == 32.
2457 // Task is not priority iff (Flags & 32) == 0.
2458 // TODO: Handle the other flags.
2459 Value *Flags = Builder.getInt32(Tied);
2460 if (Final) {
2461 Value *FinalFlag =
2462 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2463 Flags = Builder.CreateOr(FinalFlag, Flags);
2464 }
2465
2466 if (Mergeable)
2467 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2468 if (Priority)
2469 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
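// E.g., per the bit encoding above, a tied, final, priority task ends up
// with Flags = 1 | 2 | 32 = 35.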
2470
2471 // Argument - `sizeof_kmp_task_t` (TaskSize)
2472 // TaskSize refers to the size in bytes of the kmp_task_t data structure,
2473 // including private vars accessed in the task.
2474 // TODO: add kmp_task_t_with_privates (privates)
2475 Value *TaskSize = Builder.getInt64(
2476 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2477
2478 // Argument - `sizeof_shareds` (SharedsSize)
2479 // SharedsSize refers to the shareds array size in the kmp_task_t data
2480 // structure.
2481 Value *SharedsSize = Builder.getInt64(0);
2482 if (HasShareds) {
2483 AllocaInst *ArgStructAlloca =
2484 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
2485 assert(ArgStructAlloca &&
2486 "Unable to find the alloca instruction corresponding to arguments "
2487 "for extracted function");
2488 StructType *ArgStructType =
2489 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
2490 assert(ArgStructType && "Unable to find struct type corresponding to "
2491 "arguments for extracted function");
2492 SharedsSize =
2493 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
2494 }
2495 // Emit the @__kmpc_omp_task_alloc runtime call
2496 // The runtime call returns a pointer to an area where the task captured
2497 // variables must be copied before the task is run (TaskData)
2498 CallInst *TaskData = createRuntimeFunctionCall(
2499 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2500 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2501 /*task_func=*/&OutlinedFn});
2502
2503 // Emit detach clause initialization.
2504 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2505 // task_descriptor);
2506 if (EventHandle) {
2507 Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr(
2508 OMPRTL___kmpc_task_allow_completion_event);
2509 llvm::Value *EventVal =
2510 createRuntimeFunctionCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2511 llvm::Value *EventHandleAddr =
2512 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2513 Builder.getPtrTy(0));
2514 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2515 Builder.CreateStore(EventVal, EventHandleAddr);
2516 }
2517 // Copy the arguments for outlined function
2518 if (HasShareds) {
2519 Value *Shareds = StaleCI->getArgOperand(1);
2520 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2521 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2522 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2523 SharedsSize);
2524 }
2525
2526 if (Priority) {
2527 //
2528 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2529 // we populate the priority information into the "kmp_task_t" here
2530 //
2531 // The struct "kmp_task_t" definition is available in kmp.h
2532 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2533 // data2 is used for priority
2534 //
2535 Type *Int32Ty = Builder.getInt32Ty();
2536 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2537 // kmp_task_t* => { ptr }
2538 Type *TaskPtr = StructType::get(VoidPtr);
2539 Value *TaskGEP =
2540 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2541 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2542 Type *TaskStructType = StructType::get(
2543 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2544 Value *PriorityData = Builder.CreateInBoundsGEP(
2545 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2546 // kmp_cmplrdata_t => { ptr, ptr }
2547 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2548 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2549 PriorityData, {Zero, Zero});
2550 Builder.CreateStore(Priority, CmplrData);
2551 }
2552
2553 Value *DepArray = emitTaskDependencies(*this, Dependencies);
2554
2555 // In the presence of the `if` clause, the following IR is generated:
2556 // ...
2557 // %data = call @__kmpc_omp_task_alloc(...)
2558 // br i1 %if_condition, label %then, label %else
2559 // then:
2560 // call @__kmpc_omp_task(...)
2561 // br label %exit
2562 // else:
2563 // ;; Wait for resolution of dependencies, if any, before
2564 // ;; beginning the task
2565 // call @__kmpc_omp_wait_deps(...)
2566 // call @__kmpc_omp_task_begin_if0(...)
2567 // call @outlined_fn(...)
2568 // call @__kmpc_omp_task_complete_if0(...)
2569 // br label %exit
2570 // exit:
2571 // ...
2572 if (IfCondition) {
2573 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2574 // terminator.
2575 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2576 Instruction *IfTerminator =
2577 Builder.GetInsertPoint()->getParent()->getTerminator();
2578 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2579 Builder.SetInsertPoint(IfTerminator);
2580 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2581 &ElseTI);
2582 Builder.SetInsertPoint(ElseTI);
2583
2584 if (Dependencies.size()) {
2585 Function *TaskWaitFn =
2586 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2587 createRuntimeFunctionCall(
2588 TaskWaitFn,
2589 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2590 ConstantInt::get(Builder.getInt32Ty(), 0),
2591 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
2592 }
2593 Function *TaskBeginFn =
2594 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2595 Function *TaskCompleteFn =
2596 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2597 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2598 CallInst *CI = nullptr;
2599 if (HasShareds)
2600 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID, TaskData});
2601 else
2602 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID});
2603 CI->setDebugLoc(StaleCI->getDebugLoc());
2604 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2605 Builder.SetInsertPoint(ThenTI);
2606 }
2607
2608 if (Dependencies.size()) {
2609 Function *TaskFn =
2610 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2611 createRuntimeFunctionCall(
2612 TaskFn,
2613 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2614 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2615 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
2616
2617 } else {
2618 // Emit the @__kmpc_omp_task runtime call to spawn the task
2619 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2620 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
2621 }
2622
2623 StaleCI->eraseFromParent();
2624
2625 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2626 if (HasShareds) {
2627 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2628 OutlinedFn.getArg(1)->replaceUsesWithIf(
2629 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2630 }
2631
2632 for (Instruction *I : llvm::reverse(ToBeDeleted))
2633 I->eraseFromParent();
2634 };
2635
2636 addOutlineInfo(std::move(OI));
2637 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2638
2639 return Builder.saveIP();
2640}
2641
2642OpenMPIRBuilder::InsertPointOrErrorTy
2643OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2644 InsertPointTy AllocaIP,
2645 BodyGenCallbackTy BodyGenCB) {
2646 if (!updateToLocation(Loc))
2647 return InsertPointTy();
2648
2649 uint32_t SrcLocStrSize;
2650 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2651 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2652 Value *ThreadID = getOrCreateThreadID(Ident);
2653
2654 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2655 Function *TaskgroupFn =
2656 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2657 createRuntimeFunctionCall(TaskgroupFn, {Ident, ThreadID});
2658
2659 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2660 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2661 return Err;
2662
2663 Builder.SetInsertPoint(TaskgroupExitBB);
2664 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2665 Function *EndTaskgroupFn =
2666 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2667 createRuntimeFunctionCall(EndTaskgroupFn, {Ident, ThreadID});
2668
2669 return Builder.saveIP();
2670}
2671
2672OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections(
2673 const LocationDescription &Loc, InsertPointTy AllocaIP,
2674 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2675 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2676 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2677
2678 if (!updateToLocation(Loc))
2679 return Loc.IP;
2680
2681 FinalizationStack.push_back({FiniCB, OMPD_sections, IsCancellable});
2682
2683 // Each section is emitted as a switch case
2684 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2685 // -> OMP.createSection() which generates the IR for each section
2686 // Iterate through all sections and emit a switch construct:
2687 // switch (IV) {
2688 // case 0:
2689 // <SectionStmt[0]>;
2690 // break;
2691 // ...
2692 // case <NumSection> - 1:
2693 // <SectionStmt[<NumSection> - 1]>;
2694 // break;
2695 // }
2696 // ...
2697 // section_loop.after:
2698 // <FiniCB>;
2699 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2700 Builder.restoreIP(CodeGenIP);
2701 BasicBlock *Continue =
2702 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2703 Function *CurFn = Continue->getParent();
2704 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2705
2706 unsigned CaseNumber = 0;
2707 for (auto SectionCB : SectionCBs) {
2708 BasicBlock *CaseBB = BasicBlock::Create(
2709 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2710 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2711 Builder.SetInsertPoint(CaseBB);
2712 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2713 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2714 CaseEndBr->getIterator()}))
2715 return Err;
2716 CaseNumber++;
2717 }
2718 // remove the existing terminator from body BB since there can be no
2719 // terminators after switch/case
2720 return Error::success();
2721 };
2722 // Loop body ends here
2723 // LowerBound, UpperBound, and Stride for createCanonicalLoop.
2724 Type *I32Ty = Type::getInt32Ty(M.getContext());
2725 Value *LB = ConstantInt::get(I32Ty, 0);
2726 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2727 Value *ST = ConstantInt::get(I32Ty, 1);
2728 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
2729 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2730 if (!LoopInfo)
2731 return LoopInfo.takeError();
2732
2733 InsertPointOrErrorTy WsloopIP =
2734 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
2735 WorksharingLoopType::ForStaticLoop, !IsNowait);
2736 if (!WsloopIP)
2737 return WsloopIP.takeError();
2738 InsertPointTy AfterIP = *WsloopIP;
2739
2740 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
2741 assert(LoopFini && "Bad structure of static workshare loop finalization");
2742
2743 // Apply the finalization callback in LoopAfterBB
2744 auto FiniInfo = FinalizationStack.pop_back_val();
2745 assert(FiniInfo.DK == OMPD_sections &&
2746 "Unexpected finalization stack state!");
2747 if (Error Err = FiniInfo.mergeFiniBB(Builder, LoopFini))
2748 return Err;
2749
2750 return AfterIP;
2751}
2752
2753OpenMPIRBuilder::InsertPointOrErrorTy
2754OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2755 BodyGenCallbackTy BodyGenCB,
2756 FinalizeCallbackTy FiniCB) {
2757 if (!updateToLocation(Loc))
2758 return Loc.IP;
2759
2760 auto FiniCBWrapper = [&](InsertPointTy IP) {
2761 if (IP.getBlock()->end() != IP.getPoint())
2762 return FiniCB(IP);
2763 // This must be done; otherwise any nested constructs using FinalizeOMPRegion
2764 // will fail because that function requires the Finalization Basic Block to
2765 // have a terminator, which is already removed by EmitOMPRegionBody.
2766 // IP is currently at the cancellation block.
2767 // We need to backtrack to the condition block to fetch
2768 // the exit block and create a branch from the cancellation
2769 // block to the exit block.
2770 IRBuilder<>::InsertPointGuard IPG(Builder);
2771 Builder.restoreIP(IP);
2772 auto *CaseBB = Loc.IP.getBlock();
2773 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2774 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2775 Instruction *I = Builder.CreateBr(ExitBB);
2776 IP = InsertPointTy(I->getParent(), I->getIterator());
2777 return FiniCB(IP);
2778 };
2779
2780 Directive OMPD = Directive::OMPD_sections;
2781 // Since we are using Finalization Callback here, HasFinalize
2782 // and IsCancellable have to be true
2783 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2784 /*Conditional*/ false, /*hasFinalize*/ true,
2785 /*IsCancellable*/ true);
2786}
2787
2788static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
2789 BasicBlock::iterator IT(I);
2790 IT++;
2791 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2792}
2793
2794Value *OpenMPIRBuilder::getGPUThreadID() {
2795 return createRuntimeFunctionCall(
2796 getOrCreateRuntimeFunction(M,
2797 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2798 {});
2799}
2800
2801Value *OpenMPIRBuilder::getGPUWarpSize() {
2802 return createRuntimeFunctionCall(
2803 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2804}
2805
2806Value *OpenMPIRBuilder::getNVPTXWarpID() {
2807 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
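// E.g., with a warp size of 32, LaneIDBits is 5, so hardware thread 70
// belongs to warp 70 >> 5 = 2.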
2808 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2809}
2810
2811Value *OpenMPIRBuilder::getNVPTXLaneID() {
2812 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2813 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2814 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
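// E.g., a warp size of 32 gives LaneIDBits = 5 and LaneIDMask = 0x1f,
// so the lane ID is the low five bits of the thread ID (70 & 0x1f = 6).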
2815 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2816 "nvptx_lane_id");
2817}
2818
2819Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2820 Type *ToType) {
2821 Type *FromType = From->getType();
2822 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2823 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2824 assert(FromSize > 0 && "From size must be greater than zero");
2825 assert(ToSize > 0 && "To size must be greater than zero");
2826 if (FromType == ToType)
2827 return From;
2828 if (FromSize == ToSize)
2829 return Builder.CreateBitCast(From, ToType);
2830 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2831 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2832 InsertPointTy SaveIP = Builder.saveIP();
2833 Builder.restoreIP(AllocaIP);
2834 Value *CastItem = Builder.CreateAlloca(ToType);
2835 Builder.restoreIP(SaveIP);
2836
2837 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2838 CastItem, Builder.getPtrTy(0));
2839 Builder.CreateStore(From, ValCastItem);
2840 return Builder.CreateLoad(ToType, CastItem);
2841}
2842
2843Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2844 Value *Element,
2845 Type *ElementType,
2846 Value *Offset) {
2847 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2848 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2849
2850 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2851 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
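// E.g., a float travels through __kmpc_shuffle_int32 and a double through
// __kmpc_shuffle_int64; smaller integer types are widened first.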
2852 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2853 Value *WarpSize =
2854 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2855 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2856 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2857 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2858 Value *WarpSizeCast =
2859 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2860 Value *ShuffleCall =
2861 createRuntimeFunctionCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2862 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2863}
2864
2865void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2866 Value *DstAddr, Type *ElemType,
2867 Value *Offset, Type *ReductionArrayTy,
2868 bool IsByRefElem) {
2869 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2870 // Create the loop over the big sized data.
2871 // ptr = (void*)Elem;
2872 // ptrEnd = (void*) Elem + 1;
2873 // Step = 8;
2874 // while (ptr + Step < ptrEnd)
2875 // shuffle((int64_t)*ptr);
2876 // Step = 4;
2877 // while (ptr + Step < ptrEnd)
2878 // shuffle((int32_t)*ptr);
2879 // ...
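// E.g., a 7-byte element is moved as one 4-byte, one 2-byte, and one
// 1-byte shuffle (Size goes 7 -> 3 -> 1 -> 0 across the iterations).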
2880 Type *IndexTy = Builder.getIndexTy(
2881 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2882 Value *ElemPtr = DstAddr;
2883 Value *Ptr = SrcAddr;
2884 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2885 if (Size < IntSize)
2886 continue;
2887 Type *IntType = Builder.getIntNTy(IntSize * 8);
2888 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2889 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2890 Value *SrcAddrGEP =
2891 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2892 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2893 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2894
2895 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2896 if ((Size / IntSize) > 1) {
2897 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2898 SrcAddrGEP, Builder.getPtrTy());
2899 BasicBlock *PreCondBB =
2900 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2901 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2902 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2903 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2904 emitBlock(PreCondBB, CurFunc);
2905 PHINode *PhiSrc =
2906 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2907 PhiSrc->addIncoming(Ptr, CurrentBB);
2908 PHINode *PhiDest =
2909 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2910 PhiDest->addIncoming(ElemPtr, CurrentBB);
2911 Ptr = PhiSrc;
2912 ElemPtr = PhiDest;
2913 Value *PtrDiff = Builder.CreatePtrDiff(
2914 Builder.getInt8Ty(), PtrEnd,
2915 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2916 Builder.CreateCondBr(
2917 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2918 ExitBB);
2919 emitBlock(ThenBB, CurFunc);
2920 Value *Res = createRuntimeShuffleFunction(
2921 AllocaIP,
2922 Builder.CreateAlignedLoad(
2923 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2924 IntType, Offset);
2925 Builder.CreateAlignedStore(Res, ElemPtr,
2926 M.getDataLayout().getPrefTypeAlign(ElemType));
2927 Value *LocalPtr =
2928 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2929 Value *LocalElemPtr =
2930 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2931 PhiSrc->addIncoming(LocalPtr, ThenBB);
2932 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2933 emitBranch(PreCondBB);
2934 emitBlock(ExitBB, CurFunc);
2935 } else {
2936 Value *Res = createRuntimeShuffleFunction(
2937 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2938 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2939 Res->getType()->getScalarSizeInBits())
2940 Res = Builder.CreateTrunc(Res, ElemType);
2941 Builder.CreateStore(Res, ElemPtr);
2942 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2943 ElemPtr =
2944 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2945 }
2946 Size = Size % IntSize;
2947 }
2948}
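// For reference, a hedged sketch of the IR shuffleAndStore produces for a
// 12-byte element, assuming the usual device runtime entry points
// __kmpc_shuffle_int64/__kmpc_shuffle_int32 that
// createRuntimeShuffleFunction dispatches to:
//   %w0 = load i64, ptr %src
//   %s0 = call i64 @__kmpc_shuffle_int64(i64 %w0, i16 %offset, i16 %width)
//   store i64 %s0, ptr %dst
//   %w1 = load i32, ptr %src.gep        ; remaining 4 bytes
//   %s1 = call i32 @__kmpc_shuffle_int32(i32 %w1, i16 %offset, i16 %width)
//   store i32 %s1, ptr %dst.gep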
2949
2950Error OpenMPIRBuilder::emitReductionListCopy(
2951 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2952 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2953 ArrayRef<bool> IsByRef, CopyOptionsTy CopyOptions) {
2954 Type *IndexTy = Builder.getIndexTy(
2955 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2956 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2957
2958 // Iterate, element by element, through the source Reduce list and
2959 // make a copy of each element.
2960 for (auto En : enumerate(ReductionInfos)) {
2961 const ReductionInfo &RI = En.value();
2962 Value *SrcElementAddr = nullptr;
2963 AllocaInst *DestAlloca = nullptr;
2964 Value *DestElementAddr = nullptr;
2965 Value *DestElementPtrAddr = nullptr;
2966 // Should we shuffle in an element from a remote lane?
2967 bool ShuffleInElement = false;
2968 // Set to true to update the pointer in the dest Reduce list to a
2969 // newly created element.
2970 bool UpdateDestListPtr = false;
2971
2972 // Step 1.1: Get the address for the src element in the Reduce list.
2973 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2974 ReductionArrayTy, SrcBase,
2975 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2976 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2977
2978 // Step 1.2: Create a temporary to store the element in the destination
2979 // Reduce list.
2980 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2981 ReductionArrayTy, DestBase,
2982 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2983 bool IsByRefElem = (!IsByRef.empty() && IsByRef[En.index()]);
2984 switch (Action) {
2985 case CopyAction::RemoteLaneToThread: {
2986 InsertPointTy CurIP = Builder.saveIP();
2987 Builder.restoreIP(AllocaIP);
2988
2989 Type *DestAllocaType =
2990 IsByRefElem ? RI.ByRefAllocatedType : RI.ElementType;
2991 DestAlloca = Builder.CreateAlloca(DestAllocaType, nullptr,
2992 ".omp.reduction.element");
2993 DestAlloca->setAlignment(
2994 M.getDataLayout().getPrefTypeAlign(DestAllocaType));
2995 DestElementAddr = DestAlloca;
2996 DestElementAddr =
2997 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2998 DestElementAddr->getName() + ".ascast");
2999 Builder.restoreIP(CurIP);
3000 ShuffleInElement = true;
3001 UpdateDestListPtr = true;
3002 break;
3003 }
3004 case CopyAction::ThreadCopy: {
3005 DestElementAddr =
3006 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
3007 break;
3008 }
3009 }
3010
3011 // Now that all active lanes have read the element in the
3012 // Reduce list, shuffle over the value from the remote lane.
3013 if (ShuffleInElement) {
3014 Type *ShuffleType = RI.ElementType;
3015 Value *ShuffleSrcAddr = SrcElementAddr;
3016 Value *ShuffleDestAddr = DestElementAddr;
3017 AllocaInst *LocalStorage = nullptr;
3018
3019 if (IsByRefElem) {
3020 assert(RI.ByRefElementType && "Expected by-ref element type to be set");
3021 assert(RI.ByRefAllocatedType &&
3022 "Expected by-ref allocated type to be set");
3023 // For by-ref reductions, we need to copy from the remote lane the
3024 // actual value of the partial reduction computed by that remote lane,
3025 // rather than, for example, a pointer to that data or, even worse, a
3026 // pointer to the descriptor of the by-ref reduction element.
3027 ShuffleType = RI.ByRefElementType;
3028
3029 InsertPointOrErrorTy GenResult =
3030 RI.DataPtrPtrGen(Builder.saveIP(), ShuffleSrcAddr, ShuffleSrcAddr);
3031
3032 if (!GenResult)
3033 return GenResult.takeError();
3034
3035 ShuffleSrcAddr = Builder.CreateLoad(Builder.getPtrTy(), ShuffleSrcAddr);
3036
3037 {
3038 InsertPointTy OldIP = Builder.saveIP();
3039 Builder.restoreIP(AllocaIP);
3040
3041 LocalStorage = Builder.CreateAlloca(ShuffleType);
3042 Builder.restoreIP(OldIP);
3043 ShuffleDestAddr = LocalStorage;
3044 }
3045 }
3046
3047 shuffleAndStore(AllocaIP, ShuffleSrcAddr, ShuffleDestAddr, ShuffleType,
3048 RemoteLaneOffset, ReductionArrayTy, IsByRefElem);
3049
3050 if (IsByRefElem) {
3051 Value *GEP;
3052 InsertPointOrErrorTy GenResult =
3053 RI.DataPtrPtrGen(Builder.saveIP(),
3054 Builder.CreatePointerBitCastOrAddrSpaceCast(
3055 DestAlloca, Builder.getPtrTy(), ".ascast"),
3056 GEP);
3057
3058 if (!GenResult)
3059 return GenResult.takeError();
3060
3061 Builder.CreateStore(Builder.CreatePointerBitCastOrAddrSpaceCast(
3062 LocalStorage, Builder.getPtrTy(), ".ascast"),
3063 GEP);
3064 }
3065 } else {
3066 switch (RI.EvaluationKind) {
3067 case EvalKind::Scalar: {
3068 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
3069 // Store the source element value to the dest element address.
3070 Builder.CreateStore(Elem, DestElementAddr);
3071 break;
3072 }
3073 case EvalKind::Complex: {
3074 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3075 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
3076 Value *SrcReal = Builder.CreateLoad(
3077 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3078 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3079 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
3080 Value *SrcImg = Builder.CreateLoad(
3081 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3082
3083 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3084 RI.ElementType, DestElementAddr, 0, 0, ".realp");
3085 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3086 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
3087 Builder.CreateStore(SrcReal, DestRealPtr);
3088 Builder.CreateStore(SrcImg, DestImgPtr);
3089 break;
3090 }
3091 case EvalKind::Aggregate: {
3092 Value *SizeVal = Builder.getInt64(
3093 M.getDataLayout().getTypeStoreSize(RI.ElementType));
3094 Builder.CreateMemCpy(
3095 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3096 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3097 SizeVal, false);
3098 break;
3099 }
3100 };
3101 }
3102
3103 // Step 3.1: Modify the reference in the dest Reduce list as needed,
3104 // i.e., make it point to the newly created element. The element is
3105 // live in the current function scope and that of the functions it
3106 // invokes (i.e., reduce_function).
3107 // RemoteReduceData[i] = (void*)&RemoteElem
3108 if (UpdateDestListPtr) {
3109 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3110 DestElementAddr, Builder.getPtrTy(),
3111 DestElementAddr->getName() + ".ascast");
3112 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
3113 }
3114 }
3115
3116 return Error::success();
3117}
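// In rough C-like pseudocode (a sketch of the two CopyAction cases above,
// scalar by-value elements shown):
//   RemoteLaneToThread: for each element i:
//     elem = alloca(type_i);                // fresh private storage
//     shuffleAndStore(src[i], &elem, ...);  // pull value from remote lane
//     dest[i] = &elem;                      // repoint dest list entry
//   ThreadCopy: for each element i:
//     *dest[i] = *src[i];                   // plain element-wise copy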
3118
3119Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
3120 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
3121 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3122 InsertPointTy SavedIP = Builder.saveIP();
3123 LLVMContext &Ctx = M.getContext();
3124 auto *FuncTy = FunctionType::get(
3125 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
3126 /* IsVarArg */ false);
3127 Function *WcFunc =
3129 "_omp_reduction_inter_warp_copy_func", &M);
3130 WcFunc->setAttributes(FuncAttrs);
3131 WcFunc->addParamAttr(0, Attribute::NoUndef);
3132 WcFunc->addParamAttr(1, Attribute::NoUndef);
3133 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
3134 Builder.SetInsertPoint(EntryBB);
3135
3136 // ReduceList: thread local Reduce list.
3137 // At the stage of the computation when this function is called, partially
3138 // aggregated values reside in the first lane of every active warp.
3139 Argument *ReduceListArg = WcFunc->getArg(0);
3140 // NumWarps: number of warps active in the parallel region. This could
3141 // be smaller than 32 (max warps in a CTA) for partial block reduction.
3142 Argument *NumWarpsArg = WcFunc->getArg(1);
3143
3144 // This array is used as a medium to transfer, one reduce element at a time,
3145 // the data from the first lane of every warp to lanes in the first warp
3146 // in order to perform the final step of a reduction in a parallel region
3147 // (reduction across warps). The array is placed in NVPTX __shared__ memory
3148 // for reduced latency, as well as to have a distinct copy for concurrently
3149 // executing target regions. The array is declared with weak linkage so
3150 // as to be shared across compilation units.
3151 StringRef TransferMediumName =
3152 "__openmp_nvptx_data_transfer_temporary_storage";
3153 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
3154 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
3155 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
3156 if (!TransferMedium) {
3157 TransferMedium = new GlobalVariable(
3158 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
3159 UndefValue::get(ArrayTy), TransferMediumName,
3160 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
3161 /*AddressSpace=*/3);
3162 }
3163
3164 // Get the CUDA thread id of the current OpenMP thread on the GPU.
3165 Value *GPUThreadID = getGPUThreadID();
3166 // nvptx_lane_id = nvptx_id % warpsize
3167 Value *LaneID = getNVPTXLaneID();
3168 // nvptx_warp_id = nvptx_id / warpsize
3169 Value *WarpID = getNVPTXWarpID();
3170
3171 InsertPointTy AllocaIP =
3172 InsertPointTy(Builder.GetInsertBlock(),
3173 Builder.GetInsertBlock()->getFirstInsertionPt());
3174 Type *Arg0Type = ReduceListArg->getType();
3175 Type *Arg1Type = NumWarpsArg->getType();
3176 Builder.restoreIP(AllocaIP);
3177 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
3178 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
3179 AllocaInst *NumWarpsAlloca =
3180 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
3181 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3182 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
3183 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3184 NumWarpsAlloca, Builder.getPtrTy(0),
3185 NumWarpsAlloca->getName() + ".ascast");
3186 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3187 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
3188 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
3189 InsertPointTy CodeGenIP =
3190 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
3191 Builder.restoreIP(CodeGenIP);
3192
3193 Value *ReduceList =
3194 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
3195
3196 for (auto En : enumerate(ReductionInfos)) {
3197 //
3198 // Warp master copies reduce element to transfer medium in __shared__
3199 // memory.
3200 //
3201 const ReductionInfo &RI = En.value();
3202 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
3203 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(
3204 IsByRefElem ? RI.ByRefElementType : RI.ElementType);
3205 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
3206 Type *CType = Builder.getIntNTy(TySize * 8);
3207
3208 unsigned NumIters = RealTySize / TySize;
3209 if (NumIters == 0)
3210 continue;
3211 Value *Cnt = nullptr;
3212 Value *CntAddr = nullptr;
3213 BasicBlock *PrecondBB = nullptr;
3214 BasicBlock *ExitBB = nullptr;
3215 if (NumIters > 1) {
3216 CodeGenIP = Builder.saveIP();
3217 Builder.restoreIP(AllocaIP);
3218 CntAddr =
3219 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
3220
3221 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
3222 CntAddr->getName() + ".ascast");
3223 Builder.restoreIP(CodeGenIP);
3224 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
3225 CntAddr,
3226 /*Volatile=*/false);
3227 PrecondBB = BasicBlock::Create(Ctx, "precond");
3228 ExitBB = BasicBlock::Create(Ctx, "exit");
3229 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
3230 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
3231 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
3232 /*Volatile=*/false);
3233 Value *Cmp = Builder.CreateICmpULT(
3234 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
3235 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
3236 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
3237 }
3238
3239 // kmpc_barrier.
3240 InsertPointOrErrorTy BarrierIP1 =
3241 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3242 omp::Directive::OMPD_unknown,
3243 /* ForceSimpleCall */ false,
3244 /* CheckCancelFlag */ true);
3245 if (!BarrierIP1)
3246 return BarrierIP1.takeError();
3247 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3248 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3249 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3250
3251 // if (lane_id == 0)
3252 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
3253 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3254 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3255
3256 // Reduce element = LocalReduceList[i]
3257 auto *RedListArrayTy =
3258 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3259 Type *IndexTy = Builder.getIndexTy(
3260 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3261 Value *ElemPtrPtr =
3262 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3263 {ConstantInt::get(IndexTy, 0),
3264 ConstantInt::get(IndexTy, En.index())});
3265 // elemptr = ((CopyType*)(elemptrptr)) + I
3266 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3267
3268 if (IsByRefElem) {
3269 InsertPointOrErrorTy GenRes =
3270 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3271
3272 if (!GenRes)
3273 return GenRes.takeError();
3274
3275 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3276 }
3277
3278 if (NumIters > 1)
3279 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
3280
3281 // Get pointer to location in transfer medium.
3282 // MediumPtr = &medium[warp_id]
3283 Value *MediumPtr = Builder.CreateInBoundsGEP(
3284 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
3285 // elem = *elemptr
3286 // *MediumPtr = elem
3287 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
3288 // Store the source element value to the dest element address.
3289 Builder.CreateStore(Elem, MediumPtr,
3290 /*IsVolatile*/ true);
3291 Builder.CreateBr(MergeBB);
3292
3293 // else
3294 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3295 Builder.CreateBr(MergeBB);
3296
3297 // endif
3298 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3299 InsertPointOrErrorTy BarrierIP2 =
3300 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3301 omp::Directive::OMPD_unknown,
3302 /* ForceSimpleCall */ false,
3303 /* CheckCancelFlag */ true);
3304 if (!BarrierIP2)
3305 return BarrierIP2.takeError();
3306
3307 // Warp 0 copies reduce element from transfer medium
3308 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
3309 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
3310 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
3311
3312 Value *NumWarpsVal =
3313 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
3314 // Up to 32 threads in warp 0 are active.
3315 Value *IsActiveThread =
3316 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
3317 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3318
3319 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
3320
3321 // SrcMediumPtr = &medium[tid]
3322 // SrcMediumVal = *SrcMediumPtr
3323 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
3324 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
3325 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
3326 Value *TargetElemPtrPtr =
3327 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3328 {ConstantInt::get(IndexTy, 0),
3329 ConstantInt::get(IndexTy, En.index())});
3330 Value *TargetElemPtrVal =
3331 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
3332 Value *TargetElemPtr = TargetElemPtrVal;
3333
3334 if (IsByRefElem) {
3335 InsertPointOrErrorTy GenRes =
3336 RI.DataPtrPtrGen(Builder.saveIP(), TargetElemPtr, TargetElemPtr);
3337
3338 if (!GenRes)
3339 return GenRes.takeError();
3340
3341 TargetElemPtr = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtr);
3342 }
3343
3344 if (NumIters > 1)
3345 TargetElemPtr =
3346 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
3347
3348 // *TargetElemPtr = SrcMediumVal;
3349 Value *SrcMediumValue =
3350 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
3351 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
3352 Builder.CreateBr(W0MergeBB);
3353
3354 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
3355 Builder.CreateBr(W0MergeBB);
3356
3357 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
3358
3359 if (NumIters > 1) {
3360 Cnt = Builder.CreateNSWAdd(
3361 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
3362 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
3363
3364 auto *CurFn = Builder.GetInsertBlock()->getParent();
3365 emitBranch(PrecondBB);
3366 emitBlock(ExitBB, CurFn);
3367 }
3368 RealTySize %= TySize;
3369 }
3370 }
3371
3372 Builder.CreateRetVoid();
3373 Builder.restoreIP(SavedIP);
3374
3375 return WcFunc;
3376}
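// Hedged C-like sketch of the helper emitted above:
//   void inter_warp_copy(void **reduce_list, int num_warps) {
//     for each reduce element, in 4-, 2-, then 1-byte chunks {
//       barrier();
//       if (lane_id == 0)            // warp master publishes its chunk
//         medium[warp_id] = chunk;
//       barrier();
//       if (tid < num_warps)         // lanes of warp 0 gather
//         chunk = medium[tid];
//     }
//   }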
3377
3378Expected<Function *> OpenMPIRBuilder::emitShuffleAndReduceFunction(
3379 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3380 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3381 LLVMContext &Ctx = M.getContext();
3382 FunctionType *FuncTy =
3383 FunctionType::get(Builder.getVoidTy(),
3384 {Builder.getPtrTy(), Builder.getInt16Ty(),
3385 Builder.getInt16Ty(), Builder.getInt16Ty()},
3386 /* IsVarArg */ false);
3387 Function *SarFunc =
3389 "_omp_reduction_shuffle_and_reduce_func", &M);
3390 SarFunc->setAttributes(FuncAttrs);
3391 SarFunc->addParamAttr(0, Attribute::NoUndef);
3392 SarFunc->addParamAttr(1, Attribute::NoUndef);
3393 SarFunc->addParamAttr(2, Attribute::NoUndef);
3394 SarFunc->addParamAttr(3, Attribute::NoUndef);
3395 SarFunc->addParamAttr(1, Attribute::SExt);
3396 SarFunc->addParamAttr(2, Attribute::SExt);
3397 SarFunc->addParamAttr(3, Attribute::SExt);
3398 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
3399 Builder.SetInsertPoint(EntryBB);
3400
3401 // Thread local Reduce list used to host the values of data to be reduced.
3402 Argument *ReduceListArg = SarFunc->getArg(0);
3403 // Current lane id; could be logical.
3404 Argument *LaneIDArg = SarFunc->getArg(1);
3405 // Offset of the remote source lane relative to the current lane.
3406 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
3407 // Algorithm version. This is expected to be known at compile time.
3408 Argument *AlgoVerArg = SarFunc->getArg(3);
3409
3410 Type *ReduceListArgType = ReduceListArg->getType();
3411 Type *LaneIDArgType = LaneIDArg->getType();
3412 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
3413 Value *ReduceListAlloca = Builder.CreateAlloca(
3414 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
3415 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3416 LaneIDArg->getName() + ".addr");
3417 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
3418 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
3419 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3420 AlgoVerArg->getName() + ".addr");
3421 ArrayType *RedListArrayTy =
3422 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3423
3424 // Create a local thread-private variable to host the Reduce list
3425 // from a remote lane.
3426 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
3427 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
3428
3429 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3430 ReduceListAlloca, ReduceListArgType,
3431 ReduceListAlloca->getName() + ".ascast");
3432 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3433 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
3434 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3435 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
3436 RemoteLaneOffsetAlloca->getName() + ".ascast");
3437 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3438 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
3439 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3440 RemoteReductionListAlloca, Builder.getPtrTy(),
3441 RemoteReductionListAlloca->getName() + ".ascast");
3442
3443 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3444 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
3445 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
3446 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
3447
3448 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
3449 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
3450 Value *RemoteLaneOffset =
3451 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
3452 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
3453
3454 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
3455
3456 // This loop iterates through the list of reduce elements and copies,
3457 // element by element, from a remote lane in the warp to RemoteReduceList,
3458 // hosted on the thread's stack.
3459 Error EmitRedLsCpRes = emitReductionListCopy(
3460 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
3461 ReduceList, RemoteListAddrCast, IsByRef,
3462 {RemoteLaneOffset, nullptr, nullptr});
3463
3464 if (EmitRedLsCpRes)
3465 return EmitRedLsCpRes;
3466
3467 // The actions to be performed on the Remote Reduce list depend on the
3468 // algorithm version.
3469 //
3470 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3471 // LaneId % 2 == 0 && Offset > 0):
3472 // do the reduction value aggregation
3473 //
3474 // The thread local variable Reduce list is mutated in place to host the
3475 // reduced data, which is the aggregated value produced from local and
3476 // remote lanes.
3477 //
3478 // Note that AlgoVer is expected to be a constant integer known at compile
3479 // time.
3480 // When AlgoVer==0, the first conjunction evaluates to true, making
3481 // the entire predicate true at compile time.
3482 // When AlgoVer==1, only the second part of the second conjunction needs
3483 // to be evaluated at runtime; the other conjunctions fold to false
3484 // at compile time.
3485 // When AlgoVer==2, only the second part of the third conjunction needs
3486 // to be evaluated at runtime; the other conjunctions fold to false
3487 // at compile time.
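 // For instance (illustrative): with AlgoVer==1 and Offset==16 in a
 // 32-lane warp, lanes 0..15 satisfy LaneId < Offset and reduce with
 // lane LaneId+16, while lanes 16..31 take the copy path guarded by
 // CondCopy below.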
3488 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
3489 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3490 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
3491 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
3492 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
3493 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
3494 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
3495 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
3496 Value *RemoteOffsetComp =
3497 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
3498 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
3499 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3500 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3501
3502 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3503 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3504 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3505
3506 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3507 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3508 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3509 ReduceList, Builder.getPtrTy());
3510 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3511 RemoteListAddrCast, Builder.getPtrTy());
3512 createRuntimeFunctionCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3513 ->addFnAttr(Attribute::NoUnwind);
3514 Builder.CreateBr(MergeBB);
3515
3516 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3517 Builder.CreateBr(MergeBB);
3518
3519 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3520
3521 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3522 // Reduce list.
3523 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3524 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3525 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3526
3527 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3528 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3529 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3530 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3531
3532 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3533
3534 EmitRedLsCpRes = emitReductionListCopy(
3535 AllocaIP, CopyAction::ThreadCopy, RedListArrayTy, ReductionInfos,
3536 RemoteListAddrCast, ReduceList, IsByRef);
3537
3538 if (EmitRedLsCpRes)
3539 return EmitRedLsCpRes;
3540
3541 Builder.CreateBr(CpyMergeBB);
3542
3543 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3544 Builder.CreateBr(CpyMergeBB);
3545
3546 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3547
3548 Builder.CreateRetVoid();
3549
3550 return SarFunc;
3551}
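// A hedged sketch of how the device runtime typically drives the function
// emitted above, halving the shuffle offset each round so every call is
// one butterfly step of the warp-level reduction:
//   for (uint16_t offset = warp_size / 2; offset > 0; offset /= 2)
//     shuffle_and_reduce(reduce_list, lane_id, offset, algo_version);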
3552
3553Expected<Function *> OpenMPIRBuilder::emitListToGlobalCopyFunction(
3554 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3555 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3556 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3557 LLVMContext &Ctx = M.getContext();
3558 auto *FuncTy = FunctionType::get(
3559 Builder.getVoidTy(),
3560 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3561 /* IsVarArg */ false);
3562 Function *LtGCFunc =
3564 "_omp_reduction_list_to_global_copy_func", &M);
3565 LtGCFunc->setAttributes(FuncAttrs);
3566 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3567 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3568 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3569
3570 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3571 Builder.SetInsertPoint(EntryBlock);
3572
3573 // Buffer: global reduction buffer.
3574 Argument *BufferArg = LtGCFunc->getArg(0);
3575 // Idx: index of the buffer.
3576 Argument *IdxArg = LtGCFunc->getArg(1);
3577 // ReduceList: thread local Reduce list.
3578 Argument *ReduceListArg = LtGCFunc->getArg(2);
3579
3580 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3581 BufferArg->getName() + ".addr");
3582 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3583 IdxArg->getName() + ".addr");
3584 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3585 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3586 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3587 BufferArgAlloca, Builder.getPtrTy(),
3588 BufferArgAlloca->getName() + ".ascast");
3589 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3590 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3591 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3592 ReduceListArgAlloca, Builder.getPtrTy(),
3593 ReduceListArgAlloca->getName() + ".ascast");
3594
3595 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3596 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3597 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3598
3599 Value *LocalReduceList =
3600 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3601 Value *BufferArgVal =
3602 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3603 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3604 Type *IndexTy = Builder.getIndexTy(
3605 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3606 for (auto En : enumerate(ReductionInfos)) {
3607 const ReductionInfo &RI = En.value();
3608 auto *RedListArrayTy =
3609 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3610 // Reduce element = LocalReduceList[i]
3611 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3612 RedListArrayTy, LocalReduceList,
3613 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3614 // elemptr = ((CopyType*)(elemptrptr)) + I
3615 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3616
3617 // Global = Buffer.VD[Idx];
3618 Value *BufferVD =
3619 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3620 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3621 ReductionsBufferTy, BufferVD, 0, En.index());
3622
3623 switch (RI.EvaluationKind) {
3624 case EvalKind::Scalar: {
3625 Value *TargetElement;
3626
3627 if (IsByRef.empty() || !IsByRef[En.index()]) {
3628 TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3629 } else {
3630 InsertPointOrErrorTy GenResult =
3631 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3632
3633 if (!GenResult)
3634 return GenResult.takeError();
3635
3636 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3637 TargetElement = Builder.CreateLoad(RI.ByRefElementType, ElemPtr);
3638 }
3639
3640 Builder.CreateStore(TargetElement, GlobVal);
3641 break;
3642 }
3643 case EvalKind::Complex: {
3644 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3645 RI.ElementType, ElemPtr, 0, 0, ".realp");
3646 Value *SrcReal = Builder.CreateLoad(
3647 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3648 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3649 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3650 Value *SrcImg = Builder.CreateLoad(
3651 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3652
3653 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3654 RI.ElementType, GlobVal, 0, 0, ".realp");
3655 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3656 RI.ElementType, GlobVal, 0, 1, ".imagp");
3657 Builder.CreateStore(SrcReal, DestRealPtr);
3658 Builder.CreateStore(SrcImg, DestImgPtr);
3659 break;
3660 }
3661 case EvalKind::Aggregate: {
3662 Value *SizeVal =
3663 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3664 Builder.CreateMemCpy(
3665 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3666 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3667 break;
3668 }
3669 }
3670 }
3671
3672 Builder.CreateRetVoid();
3673 Builder.restoreIP(OldIP);
3674 return LtGCFunc;
3675}
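// Hedged sketch of the emitted copy for a scalar element i (the complex
// and aggregate cases follow the same addressing scheme):
//   void list_to_global_copy(void *buffer, int idx, void **reduce_list) {
//     ((struct _globalized_locals_ty *)buffer)[idx].elem_i =
//         *(type_i *)reduce_list[i];
//   }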
3676
3677Expected<Function *> OpenMPIRBuilder::emitListToGlobalReduceFunction(
3678 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3679 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3680 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3681 LLVMContext &Ctx = M.getContext();
3682 auto *FuncTy = FunctionType::get(
3683 Builder.getVoidTy(),
3684 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3685 /* IsVarArg */ false);
3686 Function *LtGRFunc =
3688 "_omp_reduction_list_to_global_reduce_func", &M);
3689 LtGRFunc->setAttributes(FuncAttrs);
3690 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3691 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3692 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3693
3694 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3695 Builder.SetInsertPoint(EntryBlock);
3696
3697 // Buffer: global reduction buffer.
3698 Argument *BufferArg = LtGRFunc->getArg(0);
3699 // Idx: index of the buffer.
3700 Argument *IdxArg = LtGRFunc->getArg(1);
3701 // ReduceList: thread local Reduce list.
3702 Argument *ReduceListArg = LtGRFunc->getArg(2);
3703
3704 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3705 BufferArg->getName() + ".addr");
3706 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3707 IdxArg->getName() + ".addr");
3708 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3709 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3710 auto *RedListArrayTy =
3711 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3712
3713 // 1. Build a list of reduction variables.
3714 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3715 Value *LocalReduceList =
3716 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3717
3718 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
3719
3720 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3721 BufferArgAlloca, Builder.getPtrTy(),
3722 BufferArgAlloca->getName() + ".ascast");
3723 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3724 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3725 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3726 ReduceListArgAlloca, Builder.getPtrTy(),
3727 ReduceListArgAlloca->getName() + ".ascast");
3728 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3729 LocalReduceList, Builder.getPtrTy(),
3730 LocalReduceList->getName() + ".ascast");
3731
3732 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3733 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3734 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3735
3736 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3737 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3738 Type *IndexTy = Builder.getIndexTy(
3739 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3740 for (auto En : enumerate(ReductionInfos)) {
3741 const ReductionInfo &RI = En.value();
3742 Value *ByRefAlloc;
3743
3744 if (!IsByRef.empty() && IsByRef[En.index()]) {
3745 InsertPointTy OldIP = Builder.saveIP();
3746 Builder.restoreIP(AllocaIP);
3747
3748 ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
3749 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
3750 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
3751
3752 Builder.restoreIP(OldIP);
3753 }
3754
3755 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3756 RedListArrayTy, LocalReduceListAddrCast,
3757 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3758 Value *BufferVD =
3759 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3760 // Global = Buffer.VD[Idx];
3761 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3762 ReductionsBufferTy, BufferVD, 0, En.index());
3763
3764 if (!IsByRef.empty() && IsByRef[En.index()]) {
3765 Value *ByRefDataPtr;
3766
3767 InsertPointOrErrorTy GenResult =
3768 RI.DataPtrPtrGen(Builder.saveIP(), ByRefAlloc, ByRefDataPtr);
3769
3770 if (!GenResult)
3771 return GenResult.takeError();
3772
3773 Builder.CreateStore(GlobValPtr, ByRefDataPtr);
3774 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
3775 } else {
3776 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3777 }
3778 }
3779
3780 // Call reduce_function(GlobalReduceList, ReduceList)
3781 Value *ReduceList =
3782 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3783 createRuntimeFunctionCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3784 ->addFnAttr(Attribute::NoUnwind);
3785 Builder.CreateRetVoid();
3786 Builder.restoreIP(OldIP);
3787 return LtGRFunc;
3788}
3789
3790Expected<Function *> OpenMPIRBuilder::emitGlobalToListCopyFunction(
3791 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3792 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3793 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3794 LLVMContext &Ctx = M.getContext();
3795 auto *FuncTy = FunctionType::get(
3796 Builder.getVoidTy(),
3797 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3798 /* IsVarArg */ false);
3799 Function *GtLCFunc =
3801 "_omp_reduction_global_to_list_copy_func", &M);
3802 GtLCFunc->setAttributes(FuncAttrs);
3803 GtLCFunc->addParamAttr(0, Attribute::NoUndef);
3804 GtLCFunc->addParamAttr(1, Attribute::NoUndef);
3805 GtLCFunc->addParamAttr(2, Attribute::NoUndef);
3806
3807 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLCFunc);
3808 Builder.SetInsertPoint(EntryBlock);
3809
3810 // Buffer: global reduction buffer.
3811 Argument *BufferArg = GtLCFunc->getArg(0);
3812 // Idx: index of the buffer.
3813 Argument *IdxArg = GtLCFunc->getArg(1);
3814 // ReduceList: thread local Reduce list.
3815 Argument *ReduceListArg = GtLCFunc->getArg(2);
3816
3817 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3818 BufferArg->getName() + ".addr");
3819 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3820 IdxArg->getName() + ".addr");
3821 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3822 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3823 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3824 BufferArgAlloca, Builder.getPtrTy(),
3825 BufferArgAlloca->getName() + ".ascast");
3826 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3827 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3828 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3829 ReduceListArgAlloca, Builder.getPtrTy(),
3830 ReduceListArgAlloca->getName() + ".ascast");
3831 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3832 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3833 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3834
3835 Value *LocalReduceList =
3836 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3837 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3838 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3839 Type *IndexTy = Builder.getIndexTy(
3840 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3841 for (auto En : enumerate(ReductionInfos)) {
3842 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3843 auto *RedListArrayTy =
3844 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3845 // Reduce element = LocalReduceList[i]
3846 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3847 RedListArrayTy, LocalReduceList,
3848 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3849 // elemptr = ((CopyType*)(elemptrptr)) + I
3850 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3851 // Global = Buffer.VD[Idx];
3852 Value *BufferVD =
3853 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3854 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3855 ReductionsBufferTy, BufferVD, 0, En.index());
3856
3857 switch (RI.EvaluationKind) {
3858 case EvalKind::Scalar: {
3859 Type *ElemType = RI.ElementType;
3860
3861 if (!IsByRef.empty() && IsByRef[En.index()]) {
3862 ElemType = RI.ByRefElementType;
3863 InsertPointOrErrorTy GenResult =
3864 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3865
3866 if (!GenResult)
3867 return GenResult.takeError();
3868
3869 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3870 }
3871
3872 Value *TargetElement = Builder.CreateLoad(ElemType, GlobValPtr);
3873 Builder.CreateStore(TargetElement, ElemPtr);
3874 break;
3875 }
3876 case EvalKind::Complex: {
3877 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3878 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3879 Value *SrcReal = Builder.CreateLoad(
3880 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3881 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3882 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3883 Value *SrcImg = Builder.CreateLoad(
3884 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3885
3886 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3887 RI.ElementType, ElemPtr, 0, 0, ".realp");
3888 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3889 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3890 Builder.CreateStore(SrcReal, DestRealPtr);
3891 Builder.CreateStore(SrcImg, DestImgPtr);
3892 break;
3893 }
3894 case EvalKind::Aggregate: {
3895 Value *SizeVal =
3896 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3897 Builder.CreateMemCpy(
3898 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3899 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3900 SizeVal, false);
3901 break;
3902 }
3903 }
3904 }
3905
3906 Builder.CreateRetVoid();
3907 Builder.restoreIP(OldIP);
3908 return GtLCFunc;
3909}
3910
3911Expected<Function *> OpenMPIRBuilder::emitGlobalToListReduceFunction(
3912 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3913 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3914 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3915 LLVMContext &Ctx = M.getContext();
3916 auto *FuncTy = FunctionType::get(
3917 Builder.getVoidTy(),
3918 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3919 /* IsVarArg */ false);
3920 Function *GtLRFunc =
3922 "_omp_reduction_global_to_list_reduce_func", &M);
3923 GtLRFunc->setAttributes(FuncAttrs);
3924 GtLRFunc->addParamAttr(0, Attribute::NoUndef);
3925 GtLRFunc->addParamAttr(1, Attribute::NoUndef);
3926 GtLRFunc->addParamAttr(2, Attribute::NoUndef);
3927
3928 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLRFunc);
3929 Builder.SetInsertPoint(EntryBlock);
3930
3931 // Buffer: global reduction buffer.
3932 Argument *BufferArg = GtLRFunc->getArg(0);
3933 // Idx: index of the buffer.
3934 Argument *IdxArg = GtLRFunc->getArg(1);
3935 // ReduceList: thread local Reduce list.
3936 Argument *ReduceListArg = GtLRFunc->getArg(2);
3937
3938 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3939 BufferArg->getName() + ".addr");
3940 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3941 IdxArg->getName() + ".addr");
3942 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3943 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3944 ArrayType *RedListArrayTy =
3945 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3946
3947 // 1. Build a list of reduction variables.
3948 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3949 Value *LocalReduceList =
3950 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3951
3952 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
3953
3954 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3955 BufferArgAlloca, Builder.getPtrTy(),
3956 BufferArgAlloca->getName() + ".ascast");
3957 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3958 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3959 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3960 ReduceListArgAlloca, Builder.getPtrTy(),
3961 ReduceListArgAlloca->getName() + ".ascast");
3962 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3963 LocalReduceList, Builder.getPtrTy(),
3964 LocalReduceList->getName() + ".ascast");
3965
3966 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3967 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3968 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3969
3970 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3971 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3972 Type *IndexTy = Builder.getIndexTy(
3973 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3974 for (auto En : enumerate(ReductionInfos)) {
3975 const ReductionInfo &RI = En.value();
3976 Value *ByRefAlloc;
3977
3978 if (!IsByRef.empty() && IsByRef[En.index()]) {
3979 InsertPointTy OldIP = Builder.saveIP();
3980 Builder.restoreIP(AllocaIP);
3981
3982 ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
3983 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
3984 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
3985
3986 Builder.restoreIP(OldIP);
3987 }
3988
3989 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3990 RedListArrayTy, ReductionList,
3991 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3992 // Global = Buffer.VD[Idx];
3993 Value *BufferVD =
3994 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3995 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3996 ReductionsBufferTy, BufferVD, 0, En.index());
3997
3998 if (!IsByRef.empty() && IsByRef[En.index()]) {
3999 Value *ByRefDataPtr;
4000 InsertPointOrErrorTy GenResult =
4001 RI.DataPtrPtrGen(Builder.saveIP(), ByRefAlloc, ByRefDataPtr);
4002 if (!GenResult)
4003 return GenResult.takeError();
4004
4005 Builder.CreateStore(GlobValPtr, ByRefDataPtr);
4006 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
4007 } else {
4008 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
4009 }
4010 }
4011
4012 // Call reduce_function(ReduceList, GlobalReduceList)
4013 Value *ReduceList =
4014 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4015 createRuntimeFunctionCall(ReduceFn, {ReduceList, ReductionList})
4016 ->addFnAttr(Attribute::NoUnwind);
4017 Builder.CreateRetVoid();
4018 Builder.restoreIP(OldIP);
4019 return GtLRFunc;
4020}
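// Taken together, a hedged sketch contrasting the four buffer helpers
// emitted above (per element i, slot idx of the global buffer):
//   list_to_global_copy:    buf[idx].elem_i = *list[i];
//   list_to_global_reduce:  tmp[i] = &buf[idx].elem_i;
//                           reduce_function(tmp, list);  // accumulate in buf
//   global_to_list_copy:    *list[i] = buf[idx].elem_i;
//   global_to_list_reduce:  tmp[i] = &buf[idx].elem_i;
//                           reduce_function(list, tmp);  // accumulate in list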
4021
4022std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
4023 std::string Suffix =
4024 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
4025 return (Name + Suffix).str();
4026}
4027
4028Expected<Function *> OpenMPIRBuilder::createReductionFunction(
4029 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
4030 ArrayRef<bool> IsByRef, ReductionGenCBKind ReductionGenCBKind,
4031 AttributeList FuncAttrs) {
4032 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
4033 {Builder.getPtrTy(), Builder.getPtrTy()},
4034 /* IsVarArg */ false);
4035 std::string Name = getReductionFuncName(ReducerName);
4036 Function *ReductionFunc =
4037 Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
4038 ReductionFunc->setAttributes(FuncAttrs);
4039 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
4040 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
4041 BasicBlock *EntryBB =
4042 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
4043 Builder.SetInsertPoint(EntryBB);
4044
4045 // Need to alloca memory here and deal with the pointers before getting
4046 // LHS/RHS pointers out
4047 Value *LHSArrayPtr = nullptr;
4048 Value *RHSArrayPtr = nullptr;
4049 Argument *Arg0 = ReductionFunc->getArg(0);
4050 Argument *Arg1 = ReductionFunc->getArg(1);
4051 Type *Arg0Type = Arg0->getType();
4052 Type *Arg1Type = Arg1->getType();
4053
4054 Value *LHSAlloca =
4055 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
4056 Value *RHSAlloca =
4057 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
4058 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4059 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
4060 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4061 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
4062 Builder.CreateStore(Arg0, LHSAddrCast);
4063 Builder.CreateStore(Arg1, RHSAddrCast);
4064 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
4065 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
4066
4067 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4068 Type *IndexTy = Builder.getIndexTy(
4069 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4070 SmallVector<Value *> LHSPtrs, RHSPtrs;
4071 for (auto En : enumerate(ReductionInfos)) {
4072 const ReductionInfo &RI = En.value();
4073 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
4074 RedArrayTy, RHSArrayPtr,
4075 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4076 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
4077 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4078 RHSI8Ptr, RI.PrivateVariable->getType(),
4079 RHSI8Ptr->getName() + ".ascast");
4080
4081 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
4082 RedArrayTy, LHSArrayPtr,
4083 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4084 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
4085 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4086 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
4087
4088 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
4089 LHSPtrs.emplace_back(LHSPtr);
4090 RHSPtrs.emplace_back(RHSPtr);
4091 } else {
4092 Value *LHS = LHSPtr;
4093 Value *RHS = RHSPtr;
4094
4095 if (!IsByRef.empty() && !IsByRef[En.index()]) {
4096 LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
4097 RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
4098 }
4099
4100 Value *Reduced;
4101 InsertPointOrErrorTy AfterIP =
4102 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
4103 if (!AfterIP)
4104 return AfterIP.takeError();
4105 if (!Builder.GetInsertBlock())
4106 return ReductionFunc;
4107
4108 Builder.restoreIP(*AfterIP);
4109
4110 if (!IsByRef.empty() && !IsByRef[En.index()])
4111 Builder.CreateStore(Reduced, LHSPtr);
4112 }
4113 }
4114
4115 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
4116 for (auto En : enumerate(ReductionInfos)) {
4117 unsigned Index = En.index();
4118 const ReductionInfo &RI = En.value();
4119 Value *LHSFixupPtr, *RHSFixupPtr;
4120 Builder.restoreIP(RI.ReductionGenClang(
4121 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
4122
4123 // Fix the callback code generated to use the correct Values for the
4124 // LHS and RHS.
4125 LHSFixupPtr->replaceUsesWithIf(
4126 LHSPtrs[Index], [ReductionFunc](const Use &U) {
4127 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4128 ReductionFunc;
4129 });
4130 RHSFixupPtr->replaceUsesWithIf(
4131 RHSPtrs[Index], [ReductionFunc](const Use &U) {
4132 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4133 ReductionFunc;
4134 });
4135 }
4136
4137 Builder.CreateRetVoid();
4138 return ReductionFunc;
4139}
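// Hedged sketch of the function this emits for a single by-value float
// sum on the non-Clang (ReductionGen) path:
//   void reduction_func(void **lhs, void **rhs) {
//     float l = *(float *)lhs[0], r = *(float *)rhs[0];
//     *(float *)lhs[0] = l + r;   // Reduced is stored back through LHS
//   }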
4140
4141static void
4142checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
4143 bool IsGPU) {
4144 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
4145 (void)RI;
4146 assert(RI.Variable && "expected non-null variable");
4147 assert(RI.PrivateVariable && "expected non-null private variable");
4148 assert((RI.ReductionGen || RI.ReductionGenClang) &&
4149 "expected non-null reduction generator callback");
4150 if (!IsGPU) {
4151 assert(
4152 RI.Variable->getType() == RI.PrivateVariable->getType() &&
4153 "expected variables and their private equivalents to have the same "
4154 "type");
4155 }
4156 assert(RI.Variable->getType()->isPointerTy() &&
4157 "expected variables to be pointers");
4158 }
4159}
4160
4161OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
4162 const LocationDescription &Loc, InsertPointTy AllocaIP,
4163 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
4164 ArrayRef<bool> IsByRef, bool IsNoWait, bool IsTeamsReduction,
4165 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
4166 unsigned ReductionBufNum, Value *SrcLocInfo) {
4167 if (!updateToLocation(Loc))
4168 return InsertPointTy();
4169 Builder.restoreIP(CodeGenIP);
4170 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
4171 LLVMContext &Ctx = M.getContext();
4172
4173 // Source location for the ident struct
4174 if (!SrcLocInfo) {
4175 uint32_t SrcLocStrSize;
4176 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4177 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4178 }
4179
4180 if (ReductionInfos.size() == 0)
4181 return Builder.saveIP();
4182
4183 BasicBlock *ContinuationBlock = nullptr;
4184 if (ReductionGenCBKind != ReductionGenCBKind::Clang) {
4185 // Copied code from createReductions
4186 BasicBlock *InsertBlock = Loc.IP.getBlock();
4187 ContinuationBlock =
4188 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
4189 InsertBlock->getTerminator()->eraseFromParent();
4190 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
4191 }
4192
4193 Function *CurFunc = Builder.GetInsertBlock()->getParent();
4194 AttributeList FuncAttrs;
4195 AttrBuilder AttrBldr(Ctx);
4196 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
4197 AttrBldr.addAttribute(Attr);
4198 AttrBldr.removeAttribute(Attribute::OptimizeNone);
4199 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
4200
4201 CodeGenIP = Builder.saveIP();
4202 Expected<Function *> ReductionResult = createReductionFunction(
4203 Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, IsByRef,
4204 ReductionGenCBKind, FuncAttrs);
4205 if (!ReductionResult)
4206 return ReductionResult.takeError();
4207 Function *ReductionFunc = *ReductionResult;
4208 Builder.restoreIP(CodeGenIP);
4209
4210 // Set the grid value in the config needed for lowering later on
4211 if (GridValue.has_value())
4212 Config.setGridValue(GridValue.value());
4213 else
4214 Config.setGridValue(getGridValue(T, ReductionFunc));
4215
4216 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
4217 // RedList, shuffle_reduce_func, interwarp_copy_func);
4218 // or
4219 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
4220 Value *Res;
4221
4222 // 1. Build a list of reduction variables.
4223 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4224 auto Size = ReductionInfos.size();
4225 Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS());
4226 Type *FuncPtrTy =
4227 Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace());
4228 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
4229 CodeGenIP = Builder.saveIP();
4230 Builder.restoreIP(AllocaIP);
4231 Value *ReductionListAlloca =
4232 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
4233 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
4234 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
4235 Builder.restoreIP(CodeGenIP);
4236 Type *IndexTy = Builder.getIndexTy(
4237 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4238 for (auto En : enumerate(ReductionInfos)) {
4239 const ReductionInfo &RI = En.value();
4240 Value *ElemPtr = Builder.CreateInBoundsGEP(
4241 RedArrayTy, ReductionList,
4242 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4243
4244 Value *PrivateVar = RI.PrivateVariable;
4245 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
4246 if (IsByRefElem)
4247 PrivateVar = Builder.CreateLoad(RI.ElementType, PrivateVar);
4248
4249 Value *CastElem =
4250 Builder.CreatePointerBitCastOrAddrSpaceCast(PrivateVar, PtrTy);
4251 Builder.CreateStore(CastElem, ElemPtr);
4252 }
4253 CodeGenIP = Builder.saveIP();
4254 Expected<Function *> SarFunc = emitShuffleAndReduceFunction(
4255 ReductionInfos, ReductionFunc, FuncAttrs, IsByRef);
4256
4257 if (!SarFunc)
4258 return SarFunc.takeError();
4259
4260 Expected<Function *> CopyResult =
4261 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs, IsByRef);
4262 if (!CopyResult)
4263 return CopyResult.takeError();
4264 Function *WcFunc = *CopyResult;
4265 Builder.restoreIP(CodeGenIP);
4266
4267 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
4268
4269 unsigned MaxDataSize = 0;
4270 SmallVector<Type *> ReductionTypeArgs;
4271 for (auto En : enumerate(ReductionInfos)) {
4272 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
4273 if (Size > MaxDataSize)
4274 MaxDataSize = Size;
4275 Type *RedTypeArg = (!IsByRef.empty() && IsByRef[En.index()])
4276 ? En.value().ByRefElementType
4277 : En.value().ElementType;
4278 ReductionTypeArgs.emplace_back(RedTypeArg);
4279 }
4280 Value *ReductionDataSize =
4281 Builder.getInt64(MaxDataSize * ReductionInfos.size());
4282 if (!IsTeamsReduction) {
4283 Value *SarFuncCast =
4284 Builder.CreatePointerBitCastOrAddrSpaceCast(*SarFunc, FuncPtrTy);
4285 Value *WcFuncCast =
4286 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy);
4287 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
4288 WcFuncCast};
4289 Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
4290 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
4291 Res = createRuntimeFunctionCall(Pv2Ptr, Args);
4292 } else {
4293 CodeGenIP = Builder.saveIP();
4294 StructType *ReductionsBufferTy = StructType::create(
4295 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
4296 Function *RedFixedBufferFn = getOrCreateRuntimeFunctionPtr(
4297 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
4298
4299 Expected<Function *> LtGCFunc = emitListToGlobalCopyFunction(
4300 ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
4301 if (!LtGCFunc)
4302 return LtGCFunc.takeError();
4303
4304 Expected<Function *> LtGRFunc = emitListToGlobalReduceFunction(
4305 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
4306 if (!LtGRFunc)
4307 return LtGRFunc.takeError();
4308
4309 Expected<Function *> GtLCFunc = emitGlobalToListCopyFunction(
4310 ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
4311 if (!GtLCFunc)
4312 return GtLCFunc.takeError();
4313
4314 Expected<Function *> GtLRFunc = emitGlobalToListReduceFunction(
4315 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
4316 if (!GtLRFunc)
4317 return GtLRFunc.takeError();
4318
4319 Builder.restoreIP(CodeGenIP);
4320
4321 Value *KernelTeamsReductionPtr = createRuntimeFunctionCall(
4322 RedFixedBufferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
4323
4324 Value *Args3[] = {SrcLocInfo,
4325 KernelTeamsReductionPtr,
4326 Builder.getInt32(ReductionBufNum),
4327 ReductionDataSize,
4328 RL,
4329 *SarFunc,
4330 WcFunc,
4331 *LtGCFunc,
4332 *LtGRFunc,
4333 *GtLCFunc,
4334 *GtLRFunc};
4335
4336 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
4337 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
4338 Res = createRuntimeFunctionCall(TeamsReduceFn, Args3);
4339 }
4340
4341 // 5. Build if (res == 1)
4342 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
4343 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
4344 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
4345 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
4346
4347 // 6. Build the then branch, in which the master thread of each team
4348 // holds the reduced values.
4349 // __kmpc_end_reduce{_nowait}(<gtid>);
4350 // break;
4351 emitBlock(ThenBB, CurFunc);
4352
4353 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
4354 for (auto En : enumerate(ReductionInfos)) {
4355 const ReductionInfo &RI = En.value();
4356 Type *ValueType = RI.ElementType;
4357 Value *RedValue = RI.Variable;
4358 Value *RHS =
4359 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
4360
4361 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
4362 Value *LHSPtr, *RHSPtr;
4363 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
4364 &LHSPtr, &RHSPtr, CurFunc));
4365
4366 // Fix the callback code generated to use the correct Values for the LHS
4367 // and RHS.
4368 LHSPtr->replaceUsesWithIf(RedValue, [ReductionFunc](const Use &U) {
4369 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4370 ReductionFunc;
4371 });
4372 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
4373 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4374 ReductionFunc;
4375 });
4376 } else {
4377 if (IsByRef.empty() || !IsByRef[En.index()]) {
4378 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
4379 "red.value." + Twine(En.index()));
4380 }
4381 Value *PrivateRedValue = Builder.CreateLoad(
4382 ValueType, RHS, "red.private.value" + Twine(En.index()));
4383 Value *Reduced;
4384 InsertPointOrErrorTy AfterIP =
4385 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4386 if (!AfterIP)
4387 return AfterIP.takeError();
4388 Builder.restoreIP(*AfterIP);
4389
4390 if (!IsByRef.empty() && !IsByRef[En.index()])
4391 Builder.CreateStore(Reduced, RI.Variable);
4392 }
4393 }
4394 emitBlock(ExitBB, CurFunc);
4395 if (ContinuationBlock) {
4396 Builder.CreateBr(ContinuationBlock);
4397 Builder.SetInsertPoint(ContinuationBlock);
4398 }
4399 Config.setEmitLLVMUsed();
4400
4401 return Builder.saveIP();
4402}
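// Illustrative sketch (assumed shape, not verbatim output): for the non-teams
// case above, the GPU lowering boils down to
//
//   %res = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(
//              ptr %ident, i64 <n * max_elem_size>, ptr %red_list,
//              ptr @shuffle_and_reduce_func, ptr @inter_warp_copy_func)
//   if (%res == 1) { <combine each private value into the original variable> }
//
// The helper function names here are hypothetical placeholders; the teams
// case additionally passes the four list<->global helpers and the pointer
// returned by __kmpc_reduction_get_fixed_buffer.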
4403
4404static Function *getFreshReductionFunc(Module &M) {
4405 Type *VoidTy = Type::getVoidTy(M.getContext());
4406 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
4407 auto *FuncTy =
4408 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
4409 return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
4410 ".omp.reduction.func", &M);
4411}
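// Illustrative sketch (assumed shape): once populateReductionFunction below
// has filled in its body, the fresh function created above looks roughly like
//
//   define internal void @.omp.reduction.func(ptr %lhs_list, ptr %rhs_list) {
//     ; for every reduction i:
//     ;   load element i from %lhs_list and %rhs_list, combine the two
//     ;   values via ReductionGen, and store the result back through the
//     ;   LHS pointer (unless the reduction is by-ref)
//     ret void
//   }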
4412
4413static Error populateReductionFunction(
4414 Function *ReductionFunc,
4415 ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
4416 IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
4417 Module *Module = ReductionFunc->getParent();
4418 BasicBlock *ReductionFuncBlock =
4419 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
4420 Builder.SetInsertPoint(ReductionFuncBlock);
4421 Value *LHSArrayPtr = nullptr;
4422 Value *RHSArrayPtr = nullptr;
4423 if (IsGPU) {
4424 // Need to alloca memory here and deal with the pointers before getting
4425 // LHS/RHS pointers out
4426 //
4427 Argument *Arg0 = ReductionFunc->getArg(0);
4428 Argument *Arg1 = ReductionFunc->getArg(1);
4429 Type *Arg0Type = Arg0->getType();
4430 Type *Arg1Type = Arg1->getType();
4431
4432 Value *LHSAlloca =
4433 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
4434 Value *RHSAlloca =
4435 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
4436 Value *LHSAddrCast =
4437 Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
4438 Value *RHSAddrCast =
4439 Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
4440 Builder.CreateStore(Arg0, LHSAddrCast);
4441 Builder.CreateStore(Arg1, RHSAddrCast);
4442 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
4443 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
4444 } else {
4445 LHSArrayPtr = ReductionFunc->getArg(0);
4446 RHSArrayPtr = ReductionFunc->getArg(1);
4447 }
4448
4449 unsigned NumReductions = ReductionInfos.size();
4450 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4451
4452 for (auto En : enumerate(ReductionInfos)) {
4453 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
4454 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4455 RedArrayTy, LHSArrayPtr, 0, En.index());
4456 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
4457 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4458 LHSI8Ptr, RI.Variable->getType());
4459 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
4460 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4461 RedArrayTy, RHSArrayPtr, 0, En.index());
4462 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
4463 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4464 RHSI8Ptr, RI.PrivateVariable->getType());
4465 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
4466 Value *Reduced;
4467 OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4468 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
4469 if (!AfterIP)
4470 return AfterIP.takeError();
4471
4472 Builder.restoreIP(*AfterIP);
4473 // TODO: Consider flagging an error.
4474 if (!Builder.GetInsertBlock())
4475 return Error::success();
4476
4477 // The store is inside the reduction region when using by-ref.
4478 if (!IsByRef[En.index()])
4479 Builder.CreateStore(Reduced, LHSPtr);
4480 }
4481 Builder.CreateRetVoid();
4482 return Error::success();
4483}
4484
4485OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions(
4486 const LocationDescription &Loc, InsertPointTy AllocaIP,
4487 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
4488 bool IsNoWait, bool IsTeamsReduction) {
4489 assert(ReductionInfos.size() == IsByRef.size());
4490 if (Config.isGPU())
4491 return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
4492 IsByRef, IsNoWait, IsTeamsReduction);
4493
4494 checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
4495
4496 if (!updateToLocation(Loc))
4497 return InsertPointTy();
4498
4499 if (ReductionInfos.size() == 0)
4500 return Builder.saveIP();
4501
4502 BasicBlock *InsertBlock = Loc.IP.getBlock();
4503 BasicBlock *ContinuationBlock =
4504 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
4505 InsertBlock->getTerminator()->eraseFromParent();
4506
4507 // Create and populate array of type-erased pointers to private reduction
4508 // values.
4509 unsigned NumReductions = ReductionInfos.size();
4510 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4511 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
4512 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
4513
4514 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
4515
4516 for (auto En : enumerate(ReductionInfos)) {
4517 unsigned Index = En.index();
4518 const ReductionInfo &RI = En.value();
4519 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
4520 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
4521 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
4522 }
4523
4524 // Emit a call to the runtime function that orchestrates the reduction.
4525 // Declare the reduction function in the process.
4526 Type *IndexTy = Builder.getIndexTy(
4527 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4528 Function *Func = Builder.GetInsertBlock()->getParent();
4529 Module *Module = Func->getParent();
4530 uint32_t SrcLocStrSize;
4531 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4532 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
4533 return RI.AtomicReductionGen;
4534 });
4535 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
4536 CanGenerateAtomic
4537 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
4538 : IdentFlag(0));
4539 Value *ThreadId = getOrCreateThreadID(Ident);
4540 Constant *NumVariables = Builder.getInt32(NumReductions);
4541 const DataLayout &DL = Module->getDataLayout();
4542 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
4543 Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
4544 Function *ReductionFunc = getFreshReductionFunc(*Module);
4545 Value *Lock = getOMPCriticalRegionLock(".reduction");
4546 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
4547 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
4548 : RuntimeFunction::OMPRTL___kmpc_reduce);
4549 CallInst *ReduceCall =
4550 createRuntimeFunctionCall(ReduceFunc,
4551 {Ident, ThreadId, NumVariables, RedArraySize,
4552 RedArray, ReductionFunc, Lock},
4553 "reduce");
4554
4555 // Create final reduction entry blocks for the atomic and non-atomic case.
4556 // Emit IR that dispatches control flow to one of the blocks based on the
4557 // reduction supporting the atomic mode.
4558 BasicBlock *NonAtomicRedBlock =
4559 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
4560 BasicBlock *AtomicRedBlock =
4561 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
4562 SwitchInst *Switch =
4563 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
4564 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
4565 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
4566
4567 // Populate the non-atomic reduction using the elementwise reduction function.
4568 // This loads the elements from the global and private variables and
4569 // reduces them before storing the result back to the global variable.
4570 Builder.SetInsertPoint(NonAtomicRedBlock);
4571 for (auto En : enumerate(ReductionInfos)) {
4572 const ReductionInfo &RI = En.value();
4573 Type *ValueType = RI.ElementType;
4574 // We have one less load for the by-ref case because that load is now
4575 // inside the reduction region.
4576 Value *RedValue = RI.Variable;
4577 if (!IsByRef[En.index()]) {
4578 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
4579 "red.value." + Twine(En.index()));
4580 }
4581 Value *PrivateRedValue =
4582 Builder.CreateLoad(ValueType, RI.PrivateVariable,
4583 "red.private.value." + Twine(En.index()));
4584 Value *Reduced;
4585 InsertPointOrErrorTy AfterIP =
4586 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4587 if (!AfterIP)
4588 return AfterIP.takeError();
4589 Builder.restoreIP(*AfterIP);
4590
4591 if (!Builder.GetInsertBlock())
4592 return InsertPointTy();
4593 // For the by-ref case, the store happens inside the reduction region.
4594 if (!IsByRef[En.index()])
4595 Builder.CreateStore(Reduced, RI.Variable);
4596 }
4597 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
4598 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
4599 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
4600 createRuntimeFunctionCall(EndReduceFunc, {Ident, ThreadId, Lock});
4601 Builder.CreateBr(ContinuationBlock);
4602
4603 // Populate the atomic reduction using the atomic elementwise reduction
4604 // function. There are no loads/stores here because they happen inside
4605 // the atomic elementwise reduction.
4606 Builder.SetInsertPoint(AtomicRedBlock);
4607 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
4608 for (const ReductionInfo &RI : ReductionInfos) {
4609 InsertPointOrErrorTy AfterIP = RI.AtomicReductionGen(
4610 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
4611 if (!AfterIP)
4612 return AfterIP.takeError();
4613 Builder.restoreIP(*AfterIP);
4614 if (!Builder.GetInsertBlock())
4615 return InsertPointTy();
4616 }
4617 Builder.CreateBr(ContinuationBlock);
4618 } else {
4619 Builder.CreateUnreachable();
4620 }
4621
4622 // Populate the outlined reduction function using the elementwise reduction
4623 // function. Partial values are extracted from the type-erased array of
4624 // pointers to private variables.
4625 Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
4626 IsByRef, /*isGPU=*/false);
4627 if (Err)
4628 return Err;
4629
4630 if (!Builder.GetInsertBlock())
4631 return InsertPointTy();
4632
4633 Builder.SetInsertPoint(ContinuationBlock);
4634 return Builder.saveIP();
4635}
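// Illustrative sketch (assumed shape) of the host control flow built above:
//
//   %ret = call i32 @__kmpc_reduce(ptr %ident, i32 %tid, i32 <n>,
//                                  i64 <sizeof red.array>, ptr %red.array,
//                                  ptr @.omp.reduction.func, ptr %lock)
//   switch i32 %ret, label %reduce.finalize [
//     i32 1, label %reduce.switch.nonatomic ; combine inline, then end_reduce
//     i32 2, label %reduce.switch.atomic    ; AtomicReductionGen per variable
//   ]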
4636
4637OpenMPIRBuilder::InsertPointOrErrorTy
4638OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
4639 BodyGenCallbackTy BodyGenCB,
4640 FinalizeCallbackTy FiniCB) {
4641 if (!updateToLocation(Loc))
4642 return Loc.IP;
4643
4644 Directive OMPD = Directive::OMPD_master;
4645 uint32_t SrcLocStrSize;
4646 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4647 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4648 Value *ThreadId = getOrCreateThreadID(Ident);
4649 Value *Args[] = {Ident, ThreadId};
4650
4651 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
4652 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
4653
4654 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
4655 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
4656
4657 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4658 /*Conditional*/ true, /*hasFinalize*/ true);
4659}
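// Illustrative sketch (assumed shape, block names hypothetical): because
// Conditional is true, the inlined region produced for 'master' guards the
// body with the entry call's return value:
//
//   %is_master = call i32 @__kmpc_master(ptr %ident, i32 %tid)
//   br i1 <%is_master != 0>, label %master.body, label %master.end
// master.body:
//   ...                                    ; BodyGenCB output
//   call void @__kmpc_end_master(ptr %ident, i32 %tid)
//   br label %master.end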
4660
4661OpenMPIRBuilder::InsertPointOrErrorTy
4662OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
4663 BodyGenCallbackTy BodyGenCB,
4664 FinalizeCallbackTy FiniCB, Value *Filter) {
4665 if (!updateToLocation(Loc))
4666 return Loc.IP;
4667
4668 Directive OMPD = Directive::OMPD_masked;
4669 uint32_t SrcLocStrSize;
4670 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4671 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4672 Value *ThreadId = getOrCreateThreadID(Ident);
4673 Value *Args[] = {Ident, ThreadId, Filter};
4674 Value *ArgsEnd[] = {Ident, ThreadId};
4675
4676 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
4677 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
4678
4679 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
4680 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, ArgsEnd);
4681
4682 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4683 /*Conditional*/ true, /*hasFinalize*/ true);
4684}
4685
4687 llvm::FunctionCallee Callee,
4688 llvm::ArrayRef<llvm::Value *> Args,
4689 const llvm::Twine &Name) {
4690 llvm::CallInst *Call = Builder.CreateCall(
4691 Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
4692 Call->setDoesNotThrow();
4693 return Call;
4694}
4695
4696// Expects the input basic block to be dominated by BeforeScanBB.
4697// Once the scan directive is encountered, the code after it should be
4698// dominated by AfterScanBB. The scan directive splits the code sequence
4699// into an input phase and a scan phase. Based on whether the inclusive or
4700// exclusive clause is used on the scan directive, and on whether the input
4701// loop or the scan loop is being lowered, it adds jumps to the input and
4702// scan phases. The first scan loop is the input loop and the second is the
4703// scan loop. The generated code currently handles only inclusive scans.
4704OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createScan(
4705 const LocationDescription &Loc, InsertPointTy AllocaIP,
4706 ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
4707 bool IsInclusive, ScanInfo *ScanRedInfo) {
4708 if (ScanRedInfo->OMPFirstScanLoop) {
4709 llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
4710 ScanVarsType, ScanRedInfo);
4711 if (Err)
4712 return Err;
4713 }
4714 if (!updateToLocation(Loc))
4715 return Loc.IP;
4716
4717 llvm::Value *IV = ScanRedInfo->IV;
4718
4719 if (ScanRedInfo->OMPFirstScanLoop) {
4720 // Emit buffer[i] = red; at the end of the input phase.
4721 for (size_t i = 0; i < ScanVars.size(); i++) {
4722 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4723 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4724 Type *DestTy = ScanVarsType[i];
4725 Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4726 Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);
4727
4728 Builder.CreateStore(Src, Val);
4729 }
4730 }
4731 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4732 emitBlock(ScanRedInfo->OMPScanDispatch,
4733 Builder.GetInsertBlock()->getParent());
4734
4735 if (!ScanRedInfo->OMPFirstScanLoop) {
4736 IV = ScanRedInfo->IV;
4737 // Emit red = buffer[i]; at the entrance to the scan phase.
4738 // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
4739 for (size_t i = 0; i < ScanVars.size(); i++) {
4740 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4741 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4742 Type *DestTy = ScanVarsType[i];
4743 Value *SrcPtr =
4744 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4745 Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
4746 Builder.CreateStore(Src, ScanVars[i]);
4747 }
4748 }
4749
4750 // TODO: Update it to CreateBr and remove dead blocks
4751 llvm::Value *CmpI = Builder.getInt1(true);
4752 if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
4753 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
4754 ScanRedInfo->OMPAfterScanBlock);
4755 } else {
4756 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
4757 ScanRedInfo->OMPBeforeScanBlock);
4758 }
4759 emitBlock(ScanRedInfo->OMPAfterScanBlock,
4760 Builder.GetInsertBlock()->getParent());
4761 Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
4762 return Builder.saveIP();
4763}
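// Worked example (hypothetical input, for orientation): given
//
//   for (i = 0; i < n; ++i) { red += a[i];      // input phase
//                             /* scan here */
//                             b[i] = red; }     // scan phase
//
// the first generated loop runs only the input phase and finishes each
// iteration with buffer[i] = red; the second loop starts each iteration with
// red = buffer[i] and runs only the scan phase. The prefix values in buffer
// are computed between the two loops by emitScanReduction.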
4764
4765Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
4766 InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
4767 ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
4768
4769 Builder.restoreIP(AllocaIP);
4770 // Create the shared pointers at the alloca IP.
4771 for (size_t i = 0; i < ScanVars.size(); i++) {
4772 llvm::Value *BuffPtr =
4773 Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
4774 (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
4775 }
4776
4777 // Allocate the temporary buffer on the master thread.
4778 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4779 InsertPointTy CodeGenIP) -> Error {
4780 Builder.restoreIP(CodeGenIP);
4781 Value *AllocSpan =
4782 Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
4783 for (size_t i = 0; i < ScanVars.size(); i++) {
4784 Type *IntPtrTy = Builder.getInt32Ty();
4785 Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
4786 Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
4787 Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
4788 AllocSpan, nullptr, "arr");
4789 Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
4790 }
4791 return Error::success();
4792 };
4793 // TODO: Perform finalization actions for variables. This has to be
4794 // called for variables which have destructors/finalizers.
4795 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4796
4797 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
4798 llvm::Value *FilterVal = Builder.getInt32(0);
4799 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4800 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4801
4802 if (!AfterIP)
4803 return AfterIP.takeError();
4804 Builder.restoreIP(*AfterIP);
4805 BasicBlock *InputBB = Builder.GetInsertBlock();
4806 if (InputBB->getTerminator())
4807 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4808 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4809 if (!AfterIP)
4810 return AfterIP.takeError();
4811 Builder.restoreIP(*AfterIP);
4812
4813 return Error::success();
4814}
4815
4816Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
4817 ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
4818 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4819 InsertPointTy CodeGenIP) -> Error {
4820 Builder.restoreIP(CodeGenIP);
4821 for (ReductionInfo RedInfo : ReductionInfos) {
4822 Value *PrivateVar = RedInfo.PrivateVariable;
4823 Value *OrigVar = RedInfo.Variable;
4824 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
4825 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4826
4827 Type *SrcTy = RedInfo.ElementType;
4828 Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
4829 "arrayOffset");
4830 Value *Src = Builder.CreateLoad(SrcTy, Val);
4831
4832 Builder.CreateStore(Src, OrigVar);
4833 Builder.CreateFree(Buff);
4834 }
4835 return Error::success();
4836 };
4837 // TODO: Perform finalization actions for variables. This has to be
4838 // called for variables which have destructors/finalizers.
4839 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4840
4841 if (ScanRedInfo->OMPScanFinish->getTerminator())
4842 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish->getTerminator());
4843 else
4844 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
4845
4846 llvm::Value *FilterVal = Builder.getInt32(0);
4847 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4848 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4849
4850 if (!AfterIP)
4851 return AfterIP.takeError();
4852 Builder.restoreIP(*AfterIP);
4853 BasicBlock *InputBB = Builder.GetInsertBlock();
4854 if (InputBB->getTerminator())
4855 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4856 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4857 if (!AfterIP)
4858 return AfterIP.takeError();
4859 Builder.restoreIP(*AfterIP);
4860 return Error::success();
4861}
4862
4863OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitScanReduction(
4864 const LocationDescription &Loc,
4865 SmallVector<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos,
4866 ScanInfo *ScanRedInfo) {
4867
4868 if (!updateToLocation(Loc))
4869 return Loc.IP;
4870 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4871 InsertPointTy CodeGenIP) -> Error {
4872 Builder.restoreIP(CodeGenIP);
4873 Function *CurFn = Builder.GetInsertBlock()->getParent();
4874 // for (int k = 0; k <= ceil(log2(n)); ++k)
4875 llvm::BasicBlock *LoopBB =
4876 BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
4877 llvm::BasicBlock *ExitBB =
4878 splitBB(Builder, false, "omp.outer.log.scan.exit");
4879 llvm::Function *F = llvm::Intrinsic::getOrInsertDeclaration(
4880 Builder.GetInsertBlock()->getModule(),
4881 (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
4882 llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
4883 llvm::Value *Arg =
4884 Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
4885 llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
4886 F = llvm::Intrinsic::getOrInsertDeclaration(
4887 Builder.GetInsertBlock()->getModule(),
4888 (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
4889 LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
4890 LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
4891 llvm::Value *NMin1 = Builder.CreateNUWSub(
4892 ScanRedInfo->Span,
4893 llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
4894 Builder.SetInsertPoint(InputBB);
4895 Builder.CreateBr(LoopBB);
4896 emitBlock(LoopBB, CurFn);
4897 Builder.SetInsertPoint(LoopBB);
4898
4899 PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4900 // size pow2k = 1;
4901 PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4902 Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
4903 InputBB);
4904 Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
4905 InputBB);
4906 // for (size i = n - 1; i >= 2 ^ k; --i)
4907 // tmp[i] op= tmp[i-pow2k];
4908 llvm::BasicBlock *InnerLoopBB =
4909 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
4910 llvm::BasicBlock *InnerExitBB =
4911 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
4912 llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
4913 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4914 emitBlock(InnerLoopBB, CurFn);
4915 Builder.SetInsertPoint(InnerLoopBB);
4916 PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4917 IVal->addIncoming(NMin1, LoopBB);
4918 for (ReductionInfo RedInfo : ReductionInfos) {
4919 Value *ReductionVal = RedInfo.PrivateVariable;
4920 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
4921 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4922 Type *DestTy = RedInfo.ElementType;
4923 Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
4924 Value *LHSPtr =
4925 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4926 Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
4927 Value *RHSPtr =
4928 Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
4929 Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
4930 Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
4931 llvm::Value *Result;
4932 InsertPointOrErrorTy AfterIP =
4933 RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
4934 if (!AfterIP)
4935 return AfterIP.takeError();
4936 Builder.CreateStore(Result, LHSPtr);
4937 }
4938 llvm::Value *NextIVal = Builder.CreateNUWSub(
4939 IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
4940 IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
4941 CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
4942 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4943 emitBlock(InnerExitBB, CurFn);
4944 llvm::Value *Next = Builder.CreateNUWAdd(
4945 Counter, llvm::ConstantInt::get(Counter->getType(), 1));
4946 Counter->addIncoming(Next, Builder.GetInsertBlock());
4947 // pow2k <<= 1;
4948 llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
4949 Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
4950 llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
4951 Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
4952 Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
4953 return Error::success();
4954 };
4955
4956 // TODO: Perform finalization actions for variables. This has to be
4957 // called for variables which have destructors/finalizers.
4958 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4959
4960 llvm::Value *FilterVal = Builder.getInt32(0);
4961 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4962 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4963
4964 if (!AfterIP)
4965 return AfterIP.takeError();
4966 Builder.restoreIP(*AfterIP);
4967 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4968
4969 if (!AfterIP)
4970 return AfterIP.takeError();
4971 Builder.restoreIP(*AfterIP);
4972 Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
4973 if (Err)
4974 return Err;
4975
4976 return AfterIP;
4977}
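// Worked example of the log-step pass above (assuming n = 4 elements and '+'
// as the combiner; indices shown over the logical n elements, the actual
// buffer is offset by one): ceil(log2(4)) = 2 outer rounds over
// tmp = [a, b, c, d].
//   k = 0, pow2k = 1: tmp[i] += tmp[i-1] for i = 3..1 -> [a, a+b, b+c, c+d]
//   k = 1, pow2k = 2: tmp[i] += tmp[i-2] for i = 3..2 -> [a, a+b, a+b+c,
//                                                         a+b+c+d]
// Iterating i downwards keeps each round reading the previous round's values,
// so tmp[i] ends up holding the inclusive prefix of elements 0..i.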
4978
4979Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
4980 llvm::function_ref<Error()> InputLoopGen,
4981 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
4982 ScanInfo *ScanRedInfo) {
4983
4984 {
4985 // Emit loop with input phase:
4986 // for (i: 0..<num_iters>) {
4987 // <input phase>;
4988 // buffer[i] = red;
4989 // }
4990 ScanRedInfo->OMPFirstScanLoop = true;
4991 Error Err = InputLoopGen();
4992 if (Err)
4993 return Err;
4994 }
4995 {
4996 // Emit loop with scan phase:
4997 // for (i: 0..<num_iters>) {
4998 // red = buffer[i];
4999 // <scan phase>;
5000 // }
5001 ScanRedInfo->OMPFirstScanLoop = false;
5002 Error Err = ScanLoopGen(Builder.saveIP());
5003 if (Err)
5004 return Err;
5005 }
5006 return Error::success();
5007}
5008
5009void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
5010 Function *Fun = Builder.GetInsertBlock()->getParent();
5011 ScanRedInfo->OMPScanDispatch =
5012 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
5013 ScanRedInfo->OMPAfterScanBlock =
5014 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
5015 ScanRedInfo->OMPBeforeScanBlock =
5016 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
5017 ScanRedInfo->OMPScanLoopExit =
5018 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
5019}
5020CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
5021 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
5022 BasicBlock *PostInsertBefore, const Twine &Name) {
5023 Module *M = F->getParent();
5024 LLVMContext &Ctx = M->getContext();
5025 Type *IndVarTy = TripCount->getType();
5026
5027 // Create the basic block structure.
5028 BasicBlock *Preheader =
5029 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
5030 BasicBlock *Header =
5031 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
5032 BasicBlock *Cond =
5033 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
5034 BasicBlock *Body =
5035 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
5036 BasicBlock *Latch =
5037 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
5038 BasicBlock *Exit =
5039 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
5040 BasicBlock *After =
5041 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
5042
5043 // Use specified DebugLoc for new instructions.
5044 Builder.SetCurrentDebugLocation(DL);
5045
5046 Builder.SetInsertPoint(Preheader);
5047 Builder.CreateBr(Header);
5048
5049 Builder.SetInsertPoint(Header);
5050 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
5051 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
5052 Builder.CreateBr(Cond);
5053
5054 Builder.SetInsertPoint(Cond);
5055 Value *Cmp =
5056 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
5057 Builder.CreateCondBr(Cmp, Body, Exit);
5058
5059 Builder.SetInsertPoint(Body);
5060 Builder.CreateBr(Latch);
5061
5062 Builder.SetInsertPoint(Latch);
5063 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
5064 "omp_" + Name + ".next", /*HasNUW=*/true);
5065 Builder.CreateBr(Header);
5066 IndVarPHI->addIncoming(Next, Latch);
5067
5068 Builder.SetInsertPoint(Exit);
5069 Builder.CreateBr(After);
5070
5071 // Remember and return the canonical control flow.
5072 LoopInfos.emplace_front();
5073 CanonicalLoopInfo *CL = &LoopInfos.front();
5074
5075 CL->Header = Header;
5076 CL->Cond = Cond;
5077 CL->Latch = Latch;
5078 CL->Exit = Exit;
5079
5080#ifndef NDEBUG
5081 CL->assertOK();
5082#endif
5083 return CL;
5084}
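// Sketch of the CFG built above (arrows are unconditional branches unless
// annotated):
//
//   preheader -> header -> cond --(iv < tripcount)--> body -> inc -> header
//                               `--(otherwise)------> exit -> after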
5085
5086Expected<CanonicalLoopInfo *>
5087OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
5088 LoopBodyGenCallbackTy BodyGenCB,
5089 Value *TripCount, const Twine &Name) {
5090 BasicBlock *BB = Loc.IP.getBlock();
5091 BasicBlock *NextBB = BB->getNextNode();
5092
5093 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
5094 NextBB, NextBB, Name);
5095 BasicBlock *After = CL->getAfter();
5096
5097 // If location is not set, don't connect the loop.
5098 if (updateToLocation(Loc)) {
5099 // Split the loop at the insertion point: Branch to the preheader and move
5100 // every following instruction to after the loop (the After BB). Also, the
5101 // new successor is the loop's after block.
5102 spliceBB(Builder, After, /*CreateBranch=*/false);
5103 Builder.CreateBr(CL->getPreheader());
5104 }
5105
5106 // Emit the body content. We do it after connecting the loop to the CFG
5107 // so that the callback does not encounter degenerate BBs.
5108 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
5109 return Err;
5110
5111#ifndef NDEBUG
5112 CL->assertOK();
5113#endif
5114 return CL;
5115}
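// Hypothetical usage sketch of the API above (names are illustrative, not
// from the source):
//
//   auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy IP,
//                        Value *IV) -> Error {
//     Builder.restoreIP(IP); // emit the body; IV counts 0 .. TripCount-1
//     return Error::success();
//   };
//   Expected<CanonicalLoopInfo *> CLI =
//       OMPBuilder.createCanonicalLoop(Loc, BodyGenCB, TripCount);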
5116
5117Expected<ScanInfo *> OpenMPIRBuilder::scanInfoInitialize() {
5118 ScanInfos.emplace_front();
5119 ScanInfo *Result = &ScanInfos.front();
5120 return Result;
5121}
5122
5123Expected<SmallVector<llvm::CanonicalLoopInfo *>>
5124OpenMPIRBuilder::createCanonicalScanLoops(
5125 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
5126 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
5127 InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
5128 LocationDescription ComputeLoc =
5129 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
5130 updateToLocation(ComputeLoc);
5131
5132 SmallVector<llvm::CanonicalLoopInfo *> Result;
5131
5133
5134 Value *TripCount = calculateCanonicalLoopTripCount(
5135 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
5136 ScanRedInfo->Span = TripCount;
5137 ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
5138 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
5139
5140 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
5141 Builder.restoreIP(CodeGenIP);
5142 ScanRedInfo->IV = IV;
5143 createScanBBs(ScanRedInfo);
5144 BasicBlock *InputBlock = Builder.GetInsertBlock();
5145 Instruction *Terminator = InputBlock->getTerminator();
5146 assert(Terminator->getNumSuccessors() == 1);
5147 BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
5148 Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
5149 emitBlock(ScanRedInfo->OMPBeforeScanBlock,
5150 Builder.GetInsertBlock()->getParent());
5151 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
5152 emitBlock(ScanRedInfo->OMPScanLoopExit,
5153 Builder.GetInsertBlock()->getParent());
5154 Builder.CreateBr(ContinueBlock);
5155 Builder.SetInsertPoint(
5156 ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
5157 return BodyGenCB(Builder.saveIP(), IV);
5158 };
5159
5160 const auto &&InputLoopGen = [&]() -> Error {
5161 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
5162 Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
5163 ComputeIP, Name, true, ScanRedInfo);
5164 if (!LoopInfo)
5165 return LoopInfo.takeError();
5166 Result.push_back(*LoopInfo);
5167 Builder.restoreIP((*LoopInfo)->getAfterIP());
5168 return Error::success();
5169 };
5170 const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
5171 Expected<CanonicalLoopInfo *> LoopInfo =
5172 createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
5173 InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
5174 if (!LoopInfo)
5175 return LoopInfo.takeError();
5176 Result.push_back(*LoopInfo);
5177 Builder.restoreIP((*LoopInfo)->getAfterIP());
5178 ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
5179 return Error::success();
5180 };
5181 Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
5182 if (Err)
5183 return Err;
5184 return Result;
5185}
5186
5187Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(
5188 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
5189 bool IsSigned, bool InclusiveStop, const Twine &Name) {
5190
5191 // Consider the following difficulties (assuming 8-bit signed integers):
5192 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
5193 // DO I = 1, 100, 50
5194 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
5195 // DO I = 100, 0, -128
5196
5197 // Start, Stop and Step must be of the same integer type.
5198 auto *IndVarTy = cast<IntegerType>(Start->getType());
5199 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
5200 assert(IndVarTy == Step->getType() && "Step type mismatch");
5201
5202 updateToLocation(Loc);
5203
5204 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
5205 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
5206
5207 // Like Step, but always positive.
5208 Value *Incr = Step;
5209
5210 // Distance between Start and Stop; always positive.
5211 Value *Span;
5212
5213 // Condition checking whether no iterations are executed at all, e.g.
5214 // because UB < LB.
5215 Value *ZeroCmp;
5216
5217 if (IsSigned) {
5218 // Ensure that increment is positive. If not, negate and invert LB and UB.
5219 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
5220 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
5221 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
5222 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
5223 Span = Builder.CreateSub(UB, LB, "", false, true);
5224 ZeroCmp = Builder.CreateICmp(
5225 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
5226 } else {
5227 Span = Builder.CreateSub(Stop, Start, "", true);
5228 ZeroCmp = Builder.CreateICmp(
5229 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
5230 }
5231
5232 Value *CountIfLooping;
5233 if (InclusiveStop) {
5234 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
5235 } else {
5236 // Avoid incrementing past stop since it could overflow.
5237 Value *CountIfTwo = Builder.CreateAdd(
5238 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
5239 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
5240 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
5241 }
5242
5243 return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
5244 "omp_" + Name + ".tripcount");
5245}
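// Worked example (8-bit signed, matching the difficulties noted in the
// function): for DO I = 1, 100, 50 with InclusiveStop, Incr = 50 and
// Span = 99, so the count is 99 / 50 + 1 = 2 (I = 1 and I = 51), computed
// without ever stepping past Stop. For DO I = 100, 0, -128, negating INT_MIN
// wraps back to 0x80, which read as unsigned is 128: LB = 0, UB = 100,
// Span = 100, and the count is 100 / 128 + 1 = 1 (only I = 100), as intended.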
5246
5247Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
5248 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
5249 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
5250 InsertPointTy ComputeIP, const Twine &Name, bool InScan,
5251 ScanInfo *ScanRedInfo) {
5252 LocationDescription ComputeLoc =
5253 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
5254
5255 Value *TripCount = calculateCanonicalLoopTripCount(
5256 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
5257
5258 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
5259 Builder.restoreIP(CodeGenIP);
5260 Value *Span = Builder.CreateMul(IV, Step);
5261 Value *IndVar = Builder.CreateAdd(Span, Start);
5262 if (InScan)
5263 ScanRedInfo->IV = IndVar;
5264 return BodyGenCB(Builder.saveIP(), IndVar);
5265 };
5266 LocationDescription LoopLoc =
5267 ComputeIP.isSet()
5268 ? Loc
5269 : LocationDescription(Builder.saveIP(),
5270 Builder.getCurrentDebugLocation());
5271 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
5272}
5273
5274// Returns an LLVM function to call for initializing loop bounds using OpenMP
5275// static scheduling for composite `distribute parallel for` depending on
5276// `type`. Only i32 and i64 are supported by the runtime. Always interpret
5277// integers as unsigned similarly to CanonicalLoopInfo.
5278static FunctionCallee
5279getKmpcDistForStaticInitForType(Type *Ty, Module &M,
5280 OpenMPIRBuilder &OMPBuilder) {
5281 unsigned Bitwidth = Ty->getIntegerBitWidth();
5282 if (Bitwidth == 32)
5283 return OMPBuilder.getOrCreateRuntimeFunction(
5284 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
5285 if (Bitwidth == 64)
5286 return OMPBuilder.getOrCreateRuntimeFunction(
5287 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
5288 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5289}
5290
5291// Returns an LLVM function to call for initializing loop bounds using OpenMP
5292// static scheduling depending on `type`. Only i32 and i64 are supported by the
5293// runtime. Always interpret integers as unsigned similarly to
5294// CanonicalLoopInfo.
5295static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
5296 OpenMPIRBuilder &OMPBuilder) {
5297 unsigned Bitwidth = Ty->getIntegerBitWidth();
5298 if (Bitwidth == 32)
5299 return OMPBuilder.getOrCreateRuntimeFunction(
5300 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
5301 if (Bitwidth == 64)
5302 return OMPBuilder.getOrCreateRuntimeFunction(
5303 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
5304 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5305}
5306
5307OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
5308 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5309 WorksharingLoopType LoopType, bool NeedsBarrier, bool HasDistSchedule,
5310 OMPScheduleType DistScheduleSchedType) {
5311 assert(CLI->isValid() && "Requires a valid canonical loop");
5312 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
5313 "Require dedicated allocate IP");
5314
5315 // Set up the source location value for OpenMP runtime.
5316 Builder.restoreIP(CLI->getPreheaderIP());
5317 Builder.SetCurrentDebugLocation(DL);
5318
5319 uint32_t SrcLocStrSize;
5320 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5321 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5322
5323 // Declare useful OpenMP runtime functions.
5324 Value *IV = CLI->getIndVar();
5325 Type *IVTy = IV->getType();
5326 FunctionCallee StaticInit =
5327 LoopType == WorksharingLoopType::DistributeForStaticLoop
5328 ? getKmpcDistForStaticInitForType(IVTy, M, *this)
5329 : getKmpcForStaticInitForType(IVTy, M, *this);
5330 FunctionCallee StaticFini =
5331 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5332
5333 // Allocate space for computed loop bounds as expected by the "init" function.
5334 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
5335
5336 Type *I32Type = Type::getInt32Ty(M.getContext());
5337 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5338 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
5339 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
5340 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
5341 CLI->setLastIter(PLastIter);
5342
5343 // At the end of the preheader, prepare for calling the "init" function by
5344 // storing the current loop bounds into the allocated space. A canonical loop
5345 // always iterates from 0 to trip-count with step 1. Note that "init" expects
5346 // and produces an inclusive upper bound.
5347 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
5348 Constant *Zero = ConstantInt::get(IVTy, 0);
5349 Constant *One = ConstantInt::get(IVTy, 1);
5350 Builder.CreateStore(Zero, PLowerBound);
5351 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
5352 Builder.CreateStore(UpperBound, PUpperBound);
5353 Builder.CreateStore(One, PStride);
5354
5355 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5356
5357 OMPScheduleType SchedType =
5358 (LoopType == WorksharingLoopType::DistributeStaticLoop)
5359 ? OMPScheduleType::OrderedDistribute
5360 : OMPScheduleType::UnorderedStatic;
5361 Constant *SchedulingType =
5362 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5363
5364 // Call the "init" function and update the trip count of the loop with the
5365 // value it produced.
5366 auto BuildInitCall = [LoopType, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5367 PUpperBound, IVTy, PStride, One, Zero, StaticInit,
5368 this](Value *SchedulingType, auto &Builder) {
5369 SmallVector<Value *, 10> Args({SrcLoc, ThreadNum, SchedulingType, PLastIter,
5370 PLowerBound, PUpperBound});
5371 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5372 Value *PDistUpperBound =
5373 Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
5374 Args.push_back(PDistUpperBound);
5375 }
5376 Args.append({PStride, One, Zero});
5377 createRuntimeFunctionCall(StaticInit, Args);
5378 };
5379 BuildInitCall(SchedulingType, Builder);
5380 if (HasDistSchedule &&
5381 LoopType != WorksharingLoopType::DistributeStaticLoop) {
5382 Constant *DistScheduleSchedType = ConstantInt::get(
5383 I32Type, static_cast<int>(omp::OMPScheduleType::OrderedDistribute));
5384 // We want to emit a second init function call for the dist_schedule
5385 // clause applied to the Distribute construct. However, this should only
5386 // be done if a worksharing loop is nested within a Distribute construct.
5387 BuildInitCall(DistScheduleSchedType, Builder);
5388 }
5389 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
5390 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
5391 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
5392 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
5393 CLI->setTripCount(TripCount);
5394
5395 // Update all uses of the induction variable except the one in the condition
5396 // block that compares it with the actual upper bound, and the increment in
5397 // the latch block.
5398
5399 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
5400 Builder.SetInsertPoint(CLI->getBody(),
5401 CLI->getBody()->getFirstInsertionPt());
5402 Builder.SetCurrentDebugLocation(DL);
5403 return Builder.CreateAdd(OldIV, LowerBound);
5404 });
5405
5406 // In the "exit" block, call the "fini" function.
5407 Builder.SetInsertPoint(CLI->getExit(),
5408 CLI->getExit()->getTerminator()->getIterator());
5409 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
5410
5411 // Add the barrier if requested.
5412 if (NeedsBarrier) {
5413 InsertPointOrErrorTy BarrierIP =
5414 createBarrier(LocationDescription(Builder.saveIP(), DL),
5415 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5416 /* CheckCancelFlag */ false);
5417 if (!BarrierIP)
5418 return BarrierIP.takeError();
5419 }
5420
5421 InsertPointTy AfterIP = CLI->getAfterIP();
5422 CLI->invalidate();
5423
5424 return AfterIP;
5425}
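// Note on the runtime contract assumed above: __kmpc_for_static_init_{4,8}u
// rewrites *plower and *pupper in place to this thread's sub-range of the
// inclusive range [0, tripcount - 1]. That is why the code reloads both
// bounds afterwards, shrinks the trip count to upper - lower + 1, and
// rebases every use of the induction variable by the new lower bound.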
5426
5427static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup,
5428 LoopInfo &LI);
5429static void addLoopMetadata(CanonicalLoopInfo *Loop,
5430 ArrayRef<Metadata *> Properties);
5431
5432static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI,
5433 LLVMContext &Ctx, Loop *Loop,
5434 LoopInfo &LoopInfo,
5435 SmallVector<Metadata *> &LoopMDList) {
5436 SmallSet<BasicBlock *, 8> Reachable;
5437
5438 // Get the basic blocks from the loop in which memref instructions
5439 // can be found.
5440 // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
5441 // preferably without running any passes.
5442 for (BasicBlock *Block : Loop->getBlocks()) {
5443 if (Block == CLI->getCond() || Block == CLI->getHeader())
5444 continue;
5445 Reachable.insert(Block);
5446 }
5447
5448 // Add access group metadata to memory-access instructions.
5449 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5450 for (BasicBlock *BB : Reachable)
5451 addAccessGroupMetadata(BB, AccessGroup, LoopInfo);
5452 // TODO: If the loop has existing parallel access metadata, we have to
5453 // combine the two lists.
5454 LoopMDList.push_back(MDNode::get(
5455 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5456}
5457
5458OpenMPIRBuilder::InsertPointOrErrorTy
5459OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
5460 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5461 bool NeedsBarrier, Value *ChunkSize, OMPScheduleType SchedType,
5462 Value *DistScheduleChunkSize, OMPScheduleType DistScheduleSchedType) {
5463 assert(CLI->isValid() && "Requires a valid canonical loop");
5464 assert((ChunkSize || DistScheduleChunkSize) && "Chunk size is required");
5465
5466 LLVMContext &Ctx = CLI->getFunction()->getContext();
5467 Value *IV = CLI->getIndVar();
5468 Value *OrigTripCount = CLI->getTripCount();
5469 Type *IVTy = IV->getType();
5470 assert(IVTy->getIntegerBitWidth() <= 64 &&
5471 "Max supported tripcount bitwidth is 64 bits");
5472 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
5473 : Type::getInt64Ty(Ctx);
5474 Type *I32Type = Type::getInt32Ty(M.getContext());
5475 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
5476 Constant *One = ConstantInt::get(InternalIVTy, 1);
5477
5478 Function *F = CLI->getFunction();
5479 FunctionAnalysisManager FAM;
5480 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5481 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5482 LoopAnalysis LIA;
5483 LoopInfo &&LI = LIA.run(*F, FAM);
5484 Loop *L = LI.getLoopFor(CLI->getHeader());
5485 SmallVector<Metadata *> LoopMDList;
5486 if (ChunkSize || DistScheduleChunkSize)
5487 applyParallelAccessesMetadata(CLI, Ctx, L, LI, LoopMDList);
5488 addLoopMetadata(CLI, LoopMDList);
5489
5490 // Declare useful OpenMP runtime functions.
5491 FunctionCallee StaticInit =
5492 getKmpcForStaticInitForType(InternalIVTy, M, *this);
5493 FunctionCallee StaticFini =
5494 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5495
5496 // Allocate space for computed loop bounds as expected by the "init" function.
5497 Builder.restoreIP(AllocaIP);
5498 Builder.SetCurrentDebugLocation(DL);
5499 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5500 Value *PLowerBound =
5501 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
5502 Value *PUpperBound =
5503 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
5504 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
5505 CLI->setLastIter(PLastIter);
5506
5507 // Set up the source location value for the OpenMP runtime.
5508 Builder.restoreIP(CLI->getPreheaderIP());
5509 Builder.SetCurrentDebugLocation(DL);
5510
5511 // TODO: Detect overflow in ubsan or max-out with current tripcount.
5512 Value *CastedChunkSize = Builder.CreateZExtOrTrunc(
5513 ChunkSize ? ChunkSize : Zero, InternalIVTy, "chunksize");
5514 Value *CastedDistScheduleChunkSize = Builder.CreateZExtOrTrunc(
5515 DistScheduleChunkSize ? DistScheduleChunkSize : Zero, InternalIVTy,
5516 "distschedulechunksize");
5517 Value *CastedTripCount =
5518 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
5519
5520 Constant *SchedulingType =
5521 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5522 Constant *DistSchedulingType =
5523 ConstantInt::get(I32Type, static_cast<int>(DistScheduleSchedType));
5524 Builder.CreateStore(Zero, PLowerBound);
5525 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
5526 Value *IsTripCountZero = Builder.CreateICmpEQ(CastedTripCount, Zero);
5527 Value *UpperBound =
5528 Builder.CreateSelect(IsTripCountZero, Zero, OrigUpperBound);
5529 Builder.CreateStore(UpperBound, PUpperBound);
5530 Builder.CreateStore(One, PStride);
5531
5532 // Call the "init" function and update the trip count of the loop with the
5533 // value it produced.
5534 uint32_t SrcLocStrSize;
5535 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5536 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5537 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5538 auto BuildInitCall = [StaticInit, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5539 PUpperBound, PStride, One,
5540 this](Value *SchedulingType, Value *ChunkSize,
5541 auto &Builder) {
5542 createRuntimeFunctionCall(
5543 StaticInit, {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
5544 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
5545 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
5546 /*pstride=*/PStride, /*incr=*/One,
5547 /*chunk=*/ChunkSize});
5548 };
5549 BuildInitCall(SchedulingType, CastedChunkSize, Builder);
5550 if (DistScheduleSchedType != OMPScheduleType::None &&
5551 SchedType != OMPScheduleType::OrderedDistributeChunked &&
5552 SchedType != OMPScheduleType::OrderedDistribute) {
5553 // We want to emit a second init function call for the dist_schedule
5554 // clause applied to the Distribute construct. However, this should only
5555 // be done if a worksharing loop is nested within a Distribute construct.
5556 BuildInitCall(DistSchedulingType, CastedDistScheduleChunkSize, Builder);
5557 }
5558
5559 // Load values written by the "init" function.
5560 Value *FirstChunkStart =
5561 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
5562 Value *FirstChunkStop =
5563 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
5564 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
5565 Value *ChunkRange =
5566 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
5567 Value *NextChunkStride =
5568 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
5569
5570 // Create outer "dispatch" loop for enumerating the chunks.
5571 BasicBlock *DispatchEnter = splitBB(Builder, true);
5572 Value *DispatchCounter;
5573
5574 // It is safe to assume this didn't return an error because the callback
5575 // passed into createCanonicalLoop is the only possible error source, and it
5576 // always returns success.
5577 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
5578 {Builder.saveIP(), DL},
5579 [&](InsertPointTy BodyIP, Value *Counter) {
5580 DispatchCounter = Counter;
5581 return Error::success();
5582 },
5583 FirstChunkStart, CastedTripCount, NextChunkStride,
5584 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
5585 "dispatch"));
5586
5587 // Remember the BasicBlocks of the dispatch loop we need, then invalidate
5588 // it so we do not have to preserve the canonical invariant.
5589 BasicBlock *DispatchBody = DispatchCLI->getBody();
5590 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
5591 BasicBlock *DispatchExit = DispatchCLI->getExit();
5592 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
5593 DispatchCLI->invalidate();
5594
5595 // Rewire the original loop to become the chunk loop inside the dispatch loop.
5596 redirectTo(DispatchAfter, CLI->getAfter(), DL);
5597 redirectTo(CLI->getExit(), DispatchLatch, DL);
5598 redirectTo(DispatchBody, DispatchEnter, DL);
5599
5600 // Prepare the prolog of the chunk loop.
5601 Builder.restoreIP(CLI->getPreheaderIP());
5602 Builder.SetCurrentDebugLocation(DL);
5603
5604 // Compute the number of iterations of the chunk loop.
5605 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
5606 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
5607 Value *IsLastChunk =
5608 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
5609 Value *CountUntilOrigTripCount =
5610 Builder.CreateSub(CastedTripCount, DispatchCounter);
5611 Value *ChunkTripCount = Builder.CreateSelect(
5612 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
5613 Value *BackcastedChunkTC =
5614 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
5615 CLI->setTripCount(BackcastedChunkTC);
5616
5617 // Update all uses of the induction variable except the one in the condition
5618 // block that compares it with the actual upper bound, and the increment in
5619 // the latch block.
5620 Value *BackcastedDispatchCounter =
5621 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
5622 CLI->mapIndVar([&](Instruction *) -> Value * {
5623 Builder.restoreIP(CLI->getBodyIP());
5624 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
5625 });
5626
5627 // In the "exit" block, call the "fini" function.
5628 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
5629 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
5630
5631 // Add the barrier if requested.
5632 if (NeedsBarrier) {
5633 InsertPointOrErrorTy AfterIP =
5634 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
5635 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
5636 if (!AfterIP)
5637 return AfterIP.takeError();
5638 }
5639
5640#ifndef NDEBUG
5641 // Even though we currently do not support applying additional methods to it,
5642 // the chunk loop should remain a canonical loop.
5643 CLI->assertOK();
5644#endif
5645
5646 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
5647}
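// Sketch of the loop nest produced above for schedule(static, <cs>), in
// rough C form (variable names illustrative):
//
//   for (d = firstchunk_lb; d < tripcount; d += stride)     // dispatch loop
//     for (iv = 0; iv < min(chunk_range, tripcount - d); ++iv)
//       body(d + iv);                                       // chunk loop
//
// where chunk_range and stride come from the "init" call; for static chunked
// schedules the stride is expected to be num_threads * chunk_range, so the
// threads round-robin over the chunks.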
5648
5649// Returns an LLVM function to call for executing an OpenMP static worksharing
5650// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
5651// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
5652static FunctionCallee
5653getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
5654 WorksharingLoopType LoopType) {
5655 unsigned Bitwidth = Ty->getIntegerBitWidth();
5656 Module &M = OMPBuilder->M;
5657 switch (LoopType) {
5658 case WorksharingLoopType::ForStaticLoop:
5659 if (Bitwidth == 32)
5660 return OMPBuilder->getOrCreateRuntimeFunction(
5661 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
5662 if (Bitwidth == 64)
5663 return OMPBuilder->getOrCreateRuntimeFunction(
5664 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
5665 break;
5666 case WorksharingLoopType::DistributeStaticLoop:
5667 if (Bitwidth == 32)
5668 return OMPBuilder->getOrCreateRuntimeFunction(
5669 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
5670 if (Bitwidth == 64)
5671 return OMPBuilder->getOrCreateRuntimeFunction(
5672 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
5673 break;
5674 case WorksharingLoopType::DistributeForStaticLoop:
5675 if (Bitwidth == 32)
5676 return OMPBuilder->getOrCreateRuntimeFunction(
5677 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
5678 if (Bitwidth == 64)
5679 return OMPBuilder->getOrCreateRuntimeFunction(
5680 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
5681 break;
5682 }
5683 if (Bitwidth != 32 && Bitwidth != 64) {
5684 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
5685 }
5686 llvm_unreachable("Unknown type of OpenMP worksharing loop");
5687}
5688
5689// Inserts a call to the proper OpenMP Device RTL function which handles
5690// loop worksharing.
5691static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
5692 WorksharingLoopType LoopType,
5693 BasicBlock *InsertBlock, Value *Ident,
5694 Value *LoopBodyArg, Value *TripCount,
5695 Function &LoopBodyFn, bool NoLoop) {
5696 Type *TripCountTy = TripCount->getType();
5697 Module &M = OMPBuilder->M;
5698 IRBuilder<> &Builder = OMPBuilder->Builder;
5699 FunctionCallee RTLFn =
5700 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
5701 SmallVector<Value *, 8> RealArgs;
5702 RealArgs.push_back(Ident);
5703 RealArgs.push_back(&LoopBodyFn);
5704 RealArgs.push_back(LoopBodyArg);
5705 RealArgs.push_back(TripCount);
5706 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
5707 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5708 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5709 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5710 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
5711 return;
5712 }
5713 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
5714 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
5715 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5716 Value *NumThreads = OMPBuilder->createRuntimeFunctionCall(RTLNumThreads, {});
5717
5718 RealArgs.push_back(
5719 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
5720 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5721 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5722 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5723 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop));
5724 } else {
5725 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5726 }
5727
5728 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
5729}
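// As a sketch (value names are illustrative only), for a 64-bit `for`
// worksharing loop this emits at the end of InsertBlock:
//   %nt  = call i32 @omp_get_num_threads()
//   %ntc = zext i32 %nt to i64
//   call void @__kmpc_for_static_loop_8u(ptr %ident, ptr @outlined_body,
//                                        ptr %body_arg, i64 %tripcount,
//                                        i64 %ntc, i64 0, i8 0)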
5730
5731static void workshareLoopTargetCallback(
5732 OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
5733 Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
5734 WorksharingLoopType LoopType, bool NoLoop) {
5735 IRBuilder<> &Builder = OMPIRBuilder->Builder;
5736 BasicBlock *Preheader = CLI->getPreheader();
5737 Value *TripCount = CLI->getTripCount();
5738
5739 // After loop body outlining, the loop body contains only the setup of the
5740 // loop body argument structure and the call to the outlined loop body
5741 // function. First, we need to move the setup of the loop body args
5742 // into the loop preheader.
5743 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
5744 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
5745
5746 // The next step is to remove the whole loop. We do not need it anymore.
5747 // That's why we make an unconditional branch from the loop preheader to the
5748 // loop exit block.
5749 Builder.restoreIP({Preheader, Preheader->end()});
5750 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
5751 Preheader->getTerminator()->eraseFromParent();
5752 Builder.CreateBr(CLI->getExit());
5753
5754 // Delete dead loop blocks
5755 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
5756 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
5757 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
5758 CleanUpInfo.EntryBB = CLI->getHeader();
5759 CleanUpInfo.ExitBB = CLI->getExit();
5760 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
5761 DeleteDeadBlocks(BlocksToBeRemoved);
5762
5763 // Find the instruction which corresponds to loop body argument structure
5764 // and remove the call to loop body function instruction.
5765 Value *LoopBodyArg;
5766 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
5767 assert(OutlinedFnUser &&
5768 "Expected unique undroppable user of outlined function");
5769 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
5770 assert(OutlinedFnCallInstruction && "Expected outlined function call");
5771 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
5772 "Expected outlined function call to be located in loop preheader");
5773 // Check in case no argument structure has been passed.
5774 if (OutlinedFnCallInstruction->arg_size() > 1)
5775 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
5776 else
5777 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
5778 OutlinedFnCallInstruction->eraseFromParent();
5779
5780 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
5781 LoopBodyArg, TripCount, OutlinedFn, NoLoop);
5782
5783 for (auto &ToBeDeletedItem : ToBeDeleted)
5784 ToBeDeletedItem->eraseFromParent();
5785 CLI->invalidate();
5786}
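// Net effect of this callback, sketched (names illustrative): the outlined
// canonical loop
//   preheader -> header -> cond -> body(call @outlined_body(iv, arg)) -> latch
// collapses into a single runtime-driven call in the preheader, roughly
//   preheader: call @__kmpc_*_static_loop(ident, @outlined_body, arg,
//                                         tripcount, ...); br exit
// after which the dead loop blocks are deleted and the CLI is invalidated.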
5787
5788OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
5789 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5790 WorksharingLoopType LoopType, bool NoLoop) {
5791 uint32_t SrcLocStrSize;
5792 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5793 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5794
5795 OutlineInfo OI;
5796 OI.OuterAllocaBB = CLI->getPreheader();
5797 Function *OuterFn = CLI->getPreheader()->getParent();
5798
5799 // Instructions which need to be deleted at the end of code generation.
5800 SmallVector<Instruction *, 4> ToBeDeleted;
5801
5802 OI.OuterAllocaBB = AllocaIP.getBlock();
5803
5804 // Mark the body loop as region which needs to be extracted
5805 OI.EntryBB = CLI->getBody();
5806 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
5807 "omp.prelatch", true);
5808
5809 // Prepare loop body for extraction
5810 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
5811
5812 // Insert new loop counter variable which will be used only in loop
5813 // body.
5814 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
5815 Instruction *NewLoopCntLoad =
5816 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
5817 // New loop counter instructions are redundant in the loop preheader when
5818 // code generation for the workshare loop is finished. That's why we mark
5819 // them as ready for deletion.
5820 ToBeDeleted.push_back(NewLoopCntLoad);
5821 ToBeDeleted.push_back(NewLoopCnt);
5822
5823 // Analyze the loop body region. Find all input variables which are used
5824 // inside the loop body region.
5825 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
5826 SmallVector<BasicBlock *, 32> Blocks;
5827 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
5828
5829 CodeExtractorAnalysisCache CEAC(*OuterFn);
5830 CodeExtractor Extractor(Blocks,
5831 /* DominatorTree */ nullptr,
5832 /* AggregateArgs */ true,
5833 /* BlockFrequencyInfo */ nullptr,
5834 /* BranchProbabilityInfo */ nullptr,
5835 /* AssumptionCache */ nullptr,
5836 /* AllowVarArgs */ true,
5837 /* AllowAlloca */ true,
5838 /* AllocationBlock */ CLI->getPreheader(),
5839 /* Suffix */ ".omp_wsloop",
5840 /* AggrArgsIn0AddrSpace */ true);
5841
5842 BasicBlock *CommonExit = nullptr;
5843 SetVector<Value *> SinkingCands, HoistingCands;
5844
5845 // Find allocas outside the loop body region which are used inside the loop
5846 // body.
5847 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
5848
5849 // We need to model the loop body region as the function f(cnt, loop_arg).
5850 // That's why we replace the loop induction variable with the new counter,
5851 // which will be one of the loop body function's arguments.
5852 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
5853 CLI->getIndVar()->user_end());
5854 for (auto Use : Users) {
5855 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
5856 if (ParallelRegionBlockSet.count(Inst->getParent())) {
5857 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
5858 }
5859 }
5860 }
5861 // Make sure that the loop counter variable is not merged into the loop body
5862 // function's argument structure and that it is passed as a separate variable.
5863 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
5864
5865 // The PostOutline CB is invoked when the loop body function has been
5866 // outlined and the loop body replaced by a call to the outlined function.
5867 // We need to add a call to the OpenMP device RTL inside the loop preheader;
5868 // the OpenMP device RTL function will handle the loop control logic.
5869 //
5870 OI.PostOutlineCB = [=, ToBeDeletedVec =
5871 std::move(ToBeDeleted)](Function &OutlinedFn) {
5872 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
5873 LoopType, NoLoop);
5874 };
5875 addOutlineInfo(std::move(OI));
5876 return CLI->getAfterIP();
5877}
5878
5879OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
5880 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5881 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
5882 bool HasSimdModifier, bool HasMonotonicModifier,
5883 bool HasNonmonotonicModifier, bool HasOrderedClause,
5884 WorksharingLoopType LoopType, bool NoLoop, bool HasDistSchedule,
5885 Value *DistScheduleChunkSize) {
5886 if (Config.isTargetDevice())
5887 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
5888 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
5889 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
5890 HasNonmonotonicModifier, HasOrderedClause, DistScheduleChunkSize);
5891
5892 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
5893 OMPScheduleType::ModifierOrdered;
5894 OMPScheduleType DistScheduleSchedType = OMPScheduleType::None;
5895 if (HasDistSchedule) {
5896 DistScheduleSchedType = DistScheduleChunkSize
5897 ? OMPScheduleType::OrderedDistributeChunked
5898 : OMPScheduleType::OrderedDistribute;
5899 }
5900 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
5901 case OMPScheduleType::BaseStatic:
5902 case OMPScheduleType::BaseDistribute:
5903 assert((!ChunkSize || !DistScheduleChunkSize) &&
5904 "No chunk size with static-chunked schedule");
5905 if (IsOrdered && !HasDistSchedule)
5906 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5907 NeedsBarrier, ChunkSize);
5908 // FIXME: Monotonicity ignored?
5909 if (DistScheduleChunkSize)
5910 return applyStaticChunkedWorkshareLoop(
5911 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
5912 DistScheduleChunkSize, DistScheduleSchedType);
5913 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier,
5914 HasDistSchedule);
5915
5916 case OMPScheduleType::BaseStaticChunked:
5917 case OMPScheduleType::BaseDistributeChunked:
5918 if (IsOrdered && !HasDistSchedule)
5919 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5920 NeedsBarrier, ChunkSize);
5921 // FIXME: Monotonicity ignored?
5922 return applyStaticChunkedWorkshareLoop(
5923 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
5924 DistScheduleChunkSize, DistScheduleSchedType);
5925
5926 case OMPScheduleType::BaseRuntime:
5927 case OMPScheduleType::BaseAuto:
5928 case OMPScheduleType::BaseGreedy:
5929 case OMPScheduleType::BaseBalanced:
5930 case OMPScheduleType::BaseSteal:
5931 case OMPScheduleType::BaseGuidedSimd:
5932 case OMPScheduleType::BaseRuntimeSimd:
5933 assert(!ChunkSize &&
5934 "schedule type does not support user-defined chunk sizes");
5935 [[fallthrough]];
5936 case OMPScheduleType::BaseDynamicChunked:
5937 case OMPScheduleType::BaseGuidedChunked:
5938 case OMPScheduleType::BaseGuidedIterativeChunked:
5939 case OMPScheduleType::BaseGuidedAnalyticalChunked:
5940 case OMPScheduleType::BaseStaticBalancedChunked:
5941 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5942 NeedsBarrier, ChunkSize);
5943
5944 default:
5945 llvm_unreachable("Unknown/unimplemented schedule kind");
5946 }
5947}
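// For example (host path, i.e. !Config.isTargetDevice()): `schedule(static, 4)`
// computes to OMPScheduleType::BaseStaticChunked and is lowered via
// applyStaticChunkedWorkshareLoop, plain `schedule(static)` computes to
// BaseStatic and uses applyStaticWorkshareLoop, and `schedule(dynamic)` falls
// into the applyDynamicWorkshareLoop case.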
5948
5949/// Returns an LLVM function to call for initializing loop bounds using OpenMP
5950/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5951/// the runtime. Always interpret integers as unsigned similarly to
5952/// CanonicalLoopInfo.
5953static FunctionCallee
5954getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5955 unsigned Bitwidth = Ty->getIntegerBitWidth();
5956 if (Bitwidth == 32)
5957 return OMPBuilder.getOrCreateRuntimeFunction(
5958 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
5959 if (Bitwidth == 64)
5960 return OMPBuilder.getOrCreateRuntimeFunction(
5961 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
5962 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5963}
5964
5965/// Returns an LLVM function to call for updating the next loop using OpenMP
5966/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5967/// the runtime. Always interpret integers as unsigned similarly to
5968/// CanonicalLoopInfo.
5969static FunctionCallee
5970getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5971 unsigned Bitwidth = Ty->getIntegerBitWidth();
5972 if (Bitwidth == 32)
5973 return OMPBuilder.getOrCreateRuntimeFunction(
5974 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
5975 if (Bitwidth == 64)
5976 return OMPBuilder.getOrCreateRuntimeFunction(
5977 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
5978 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5979}
5980
5981/// Returns an LLVM function to call for finalizing the dynamic loop,
5982/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
5983/// interpret integers as unsigned similarly to CanonicalLoopInfo.
5984static FunctionCallee
5985getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5986 unsigned Bitwidth = Ty->getIntegerBitWidth();
5987 if (Bitwidth == 32)
5988 return OMPBuilder.getOrCreateRuntimeFunction(
5989 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
5990 if (Bitwidth == 64)
5991 return OMPBuilder.getOrCreateRuntimeFunction(
5992 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
5993 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5994}
5995
5996OpenMPIRBuilder::InsertPointOrErrorTy
5997OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
5998 InsertPointTy AllocaIP,
5999 OMPScheduleType SchedType,
6000 bool NeedsBarrier, Value *Chunk) {
6001 assert(CLI->isValid() && "Requires a valid canonical loop");
6002 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
6003 "Require dedicated allocate IP");
6004 assert(isValidWorkshareLoopScheduleType(SchedType) &&
6005 "Require valid schedule type");
6006
6007 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
6008 OMPScheduleType::ModifierOrdered;
6009
6010 // Set up the source location value for OpenMP runtime.
6011 Builder.SetCurrentDebugLocation(DL);
6012
6013 uint32_t SrcLocStrSize;
6014 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
6015 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6016
6017 // Declare useful OpenMP runtime functions.
6018 Value *IV = CLI->getIndVar();
6019 Type *IVTy = IV->getType();
6020 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
6021 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
6022
6023 // Allocate space for computed loop bounds as expected by the "init" function.
6024 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
6025 Type *I32Type = Type::getInt32Ty(M.getContext());
6026 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
6027 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
6028 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
6029 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
6030 CLI->setLastIter(PLastIter);
6031
6032 // At the end of the preheader, prepare for calling the "init" function by
6033 // storing the current loop bounds into the allocated space. A canonical loop
6034 // always iterates from 0 to trip-count with step 1. Note that "init" expects
6035 // and produces an inclusive upper bound.
6036 BasicBlock *PreHeader = CLI->getPreheader();
6037 Builder.SetInsertPoint(PreHeader->getTerminator());
6038 Constant *One = ConstantInt::get(IVTy, 1);
6039 Builder.CreateStore(One, PLowerBound);
6040 Value *UpperBound = CLI->getTripCount();
6041 Builder.CreateStore(UpperBound, PUpperBound);
6042 Builder.CreateStore(One, PStride);
6043
6044 BasicBlock *Header = CLI->getHeader();
6045 BasicBlock *Exit = CLI->getExit();
6046 BasicBlock *Cond = CLI->getCond();
6047 BasicBlock *Latch = CLI->getLatch();
6048 InsertPointTy AfterIP = CLI->getAfterIP();
6049
6050 // The CLI will be "broken" in the code below, as the loop is no longer
6051 // a valid canonical loop.
6052
6053 if (!Chunk)
6054 Chunk = One;
6055
6056 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
6057
6058 Constant *SchedulingType =
6059 ConstantInt::get(I32Type, static_cast<int>(SchedType));
6060
6061 // Call the "init" function.
6062 createRuntimeFunctionCall(DynamicInit, {SrcLoc, ThreadNum, SchedulingType,
6063 /* LowerBound */ One, UpperBound,
6064 /* step */ One, Chunk});
6065
6066 // An outer loop around the existing one.
6067 BasicBlock *OuterCond = BasicBlock::Create(
6068 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
6069 PreHeader->getParent());
6070 // The "next" result is always 32-bit, so an IVTy-typed zero can't be used here.
6071 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
6072 Value *Res = createRuntimeFunctionCall(
6073 DynamicNext,
6074 {SrcLoc, ThreadNum, PLastIter, PLowerBound, PUpperBound, PStride});
6075 Constant *Zero32 = ConstantInt::get(I32Type, 0);
6076 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
6077 Value *LowerBound =
6078 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
6079 Builder.CreateCondBr(MoreWork, Header, Exit);
6080
6081 // Change PHI-node in loop header to use outer cond rather than preheader,
6082 // and set IV to the LowerBound.
6083 Instruction *Phi = &Header->front();
6084 auto *PI = cast<PHINode>(Phi);
6085 PI->setIncomingBlock(0, OuterCond);
6086 PI->setIncomingValue(0, LowerBound);
6087
6088 // Then set the pre-header to jump to the OuterCond
6089 Instruction *Term = PreHeader->getTerminator();
6090 auto *Br = cast<BranchInst>(Term);
6091 Br->setSuccessor(0, OuterCond);
6092
6093 // Modify the inner condition:
6094 // * Use the UpperBound returned from the DynamicNext call.
6095 // * jump to the loop outer loop when done with one of the inner loops.
6096 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
6097 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
6098 Instruction *Comp = &*Builder.GetInsertPoint();
6099 auto *CI = cast<CmpInst>(Comp);
6100 CI->setOperand(1, UpperBound);
6101 // Redirect the inner exit to branch to outer condition.
6102 Instruction *Branch = &Cond->back();
6103 auto *BI = cast<BranchInst>(Branch);
6104 assert(BI->getSuccessor(1) == Exit);
6105 BI->setSuccessor(1, OuterCond);
6106
6107 // Call the "fini" function if "ordered" is present in wsloop directive.
6108 if (Ordered) {
6109 Builder.SetInsertPoint(&Latch->back());
6110 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
6111 createRuntimeFunctionCall(DynamicFini, {SrcLoc, ThreadNum});
6112 }
6113
6114 // Add the barrier if requested.
6115 if (NeedsBarrier) {
6116 Builder.SetInsertPoint(&Exit->back());
6117 InsertPointOrErrorTy BarrierIP =
6118 createBarrier(LocationDescription(Builder.saveIP(), DL),
6119 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
6120 /* CheckCancelFlag */ false);
6121 if (!BarrierIP)
6122 return BarrierIP.takeError();
6123 }
6124
6125 CLI->invalidate();
6126 return AfterIP;
6127}
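// The generated control flow is roughly (names are illustrative, not the exact
// emitted block names):
//   preheader:  store lb=1, ub=tripcount, stride=1
//               __kmpc_dispatch_init(loc, tid, sched, 1, tripcount, 1, chunk)
//               br outer.cond
//   outer.cond: %more = __kmpc_dispatch_next(loc, tid, &last, &lb, &ub, &st)
//               %iv.start = load(lb) - 1
//               br %more ? header : exit
//   header..latch: the original canonical loop, starting at %iv.start and
//               comparing against the dispatched ub, branching back to
//               outer.cond once a dispatched chunk is finished.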
6128
6129/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
6130/// after this \p OldTarget will be orphaned.
6131static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
6132 BasicBlock *NewTarget, DebugLoc DL) {
6133 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
6134 redirectTo(Pred, NewTarget, DL);
6135}
6136
6137/// Determine which blocks in \p BBs are reachable from outside and remove from
6138/// the function the ones that are not.
6139static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
6140 SmallSetVector<BasicBlock *, 8> BBsToErase(BBs.begin(), BBs.end());
6141 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
6142 for (Use &U : BB->uses()) {
6143 auto *UseInst = dyn_cast<Instruction>(U.getUser());
6144 if (!UseInst)
6145 continue;
6146 if (BBsToErase.count(UseInst->getParent()))
6147 continue;
6148 return true;
6149 }
6150 return false;
6151 };
6152
6153 while (BBsToErase.remove_if(HasRemainingUses)) {
6154 // Try again if anything was removed.
6155 }
6156
6157 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
6158 DeleteDeadBlocks(BBVec);
6159}
6160
6161CanonicalLoopInfo *
6162OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
6163 InsertPointTy ComputeIP) {
6164 assert(Loops.size() >= 1 && "At least one loop required");
6165 size_t NumLoops = Loops.size();
6166
6167 // Nothing to do if there is already just one loop.
6168 if (NumLoops == 1)
6169 return Loops.front();
6170
6171 CanonicalLoopInfo *Outermost = Loops.front();
6172 CanonicalLoopInfo *Innermost = Loops.back();
6173 BasicBlock *OrigPreheader = Outermost->getPreheader();
6174 BasicBlock *OrigAfter = Outermost->getAfter();
6175 Function *F = OrigPreheader->getParent();
6176
6177 // Loop control blocks that may become orphaned later.
6178 SmallVector<BasicBlock *, 12> OldControlBBs;
6179 OldControlBBs.reserve(6 * Loops.size());
6180 for (CanonicalLoopInfo *Loop : Loops)
6181 Loop->collectControlBlocks(OldControlBBs);
6182
6183 // Setup the IRBuilder for inserting the trip count computation.
6184 Builder.SetCurrentDebugLocation(DL);
6185 if (ComputeIP.isSet())
6186 Builder.restoreIP(ComputeIP);
6187 else
6188 Builder.restoreIP(Outermost->getPreheaderIP());
6189
6190 // Derive the collapsed loop's trip count.
6191 // TODO: Find common/largest indvar type.
6192 Value *CollapsedTripCount = nullptr;
6193 for (CanonicalLoopInfo *L : Loops) {
6194 assert(L->isValid() &&
6195 "All loops to collapse must be valid canonical loops");
6196 Value *OrigTripCount = L->getTripCount();
6197 if (!CollapsedTripCount) {
6198 CollapsedTripCount = OrigTripCount;
6199 continue;
6200 }
6201
6202 // TODO: Enable UndefinedBehaviorSanitizer to diagnose an overflow here.
6203 CollapsedTripCount =
6204 Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
6205 }
6206
6207 // Create the collapsed loop control flow.
6208 CanonicalLoopInfo *Result =
6209 createLoopSkeleton(DL, CollapsedTripCount, F,
6210 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
6211
6212 // Build the collapsed loop body code.
6213 // Start with deriving the input loop induction variables from the collapsed
6214 // one, using a divmod scheme. To preserve the original loops' order, the
6215 // innermost loop uses the least significant bits.
6216 Builder.restoreIP(Result->getBodyIP());
6217
6218 Value *Leftover = Result->getIndVar();
6219 SmallVector<Value *> NewIndVars;
6220 NewIndVars.resize(NumLoops);
6221 for (int i = NumLoops - 1; i >= 1; --i) {
6222 Value *OrigTripCount = Loops[i]->getTripCount();
6223
6224 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
6225 NewIndVars[i] = NewIndVar;
6226
6227 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
6228 }
6229 // Outermost loop gets all the remaining bits.
6230 NewIndVars[0] = Leftover;
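// E.g. for two collapsed loops with trip counts M and N, the collapsed
// induction variable iv ranges over [0, M*N) and decomposes into
// j = iv % N (innermost) and i = iv / N (outermost).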
6231
6232 // Construct the loop body control flow.
6233 // We progressively construct the branch structure following the direction of
6234 // the control flow, from the leading in-between code, the loop nest body, the
6235 // trailing in-between code, and rejoining the collapsed loop's latch.
6236 // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
6237 // the ContinueBlock is set, continue with that block. If ContinuePred, use
6238 // its predecessors as sources.
6239 BasicBlock *ContinueBlock = Result->getBody();
6240 BasicBlock *ContinuePred = nullptr;
6241 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
6242 BasicBlock *NextSrc) {
6243 if (ContinueBlock)
6244 redirectTo(ContinueBlock, Dest, DL);
6245 else
6246 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
6247
6248 ContinueBlock = nullptr;
6249 ContinuePred = NextSrc;
6250 };
6251
6252 // The code before the nested loop of each level.
6253 // Because we are sinking it into the nest, it will be executed more often
6254 // than the original loop. More sophisticated schemes could keep track of what
6255 // the in-between code is and instantiate it only once per thread.
6256 for (size_t i = 0; i < NumLoops - 1; ++i)
6257 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
6258
6259 // Connect the loop nest body.
6260 ContinueWith(Innermost->getBody(), Innermost->getLatch());
6261
6262 // The code after the nested loop at each level.
6263 for (size_t i = NumLoops - 1; i > 0; --i)
6264 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
6265
6266 // Connect the finished loop to the collapsed loop latch.
6267 ContinueWith(Result->getLatch(), nullptr);
6268
6269 // Replace the input loops with the new collapsed loop.
6270 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
6271 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
6272
6273 // Replace the input loop indvars with the derived ones.
6274 for (size_t i = 0; i < NumLoops; ++i)
6275 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
6276
6277 // Remove unused parts of the input loops.
6278 removeUnusedBlocksFromParent(OldControlBBs);
6279
6280 for (CanonicalLoopInfo *L : Loops)
6281 L->invalidate();
6282
6283#ifndef NDEBUG
6284 Result->assertOK();
6285#endif
6286 return Result;
6287}
6288
6289std::vector<CanonicalLoopInfo *>
6290OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
6291 ArrayRef<Value *> TileSizes) {
6292 assert(TileSizes.size() == Loops.size() &&
6293 "Must pass as many tile sizes as there are loops");
6294 int NumLoops = Loops.size();
6295 assert(NumLoops >= 1 && "At least one loop to tile required");
6296
6297 CanonicalLoopInfo *OutermostLoop = Loops.front();
6298 CanonicalLoopInfo *InnermostLoop = Loops.back();
6299 Function *F = OutermostLoop->getBody()->getParent();
6300 BasicBlock *InnerEnter = InnermostLoop->getBody();
6301 BasicBlock *InnerLatch = InnermostLoop->getLatch();
6302
6303 // Loop control blocks that may become orphaned later.
6304 SmallVector<BasicBlock *, 12> OldControlBBs;
6305 OldControlBBs.reserve(6 * Loops.size());
6306 for (CanonicalLoopInfo *Loop : Loops)
6307 Loop->collectControlBlocks(OldControlBBs);
6308
6309 // Collect original trip counts and induction variables to be accessible by
6310 // index. Also, the structure of the original loops is not preserved during
6311 // the construction of the tiled loops, so do it before we scavenge the BBs of
6312 // any original CanonicalLoopInfo.
6313 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
6314 for (CanonicalLoopInfo *L : Loops) {
6315 assert(L->isValid() && "All input loops must be valid canonical loops");
6316 OrigTripCounts.push_back(L->getTripCount());
6317 OrigIndVars.push_back(L->getIndVar());
6318 }
6319
6320 // Collect the code between loop headers. These may contain SSA definitions
6321 // that are used in the loop nest body. To be usable within the innermost
6322 // body, these BasicBlocks will be sunk into the loop nest body. That is,
6323 // these instructions may be executed more often than before the tiling.
6324 // TODO: It would be sufficient to only sink them into body of the
6325 // corresponding tile loop.
6326 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
6327 for (int i = 0; i < NumLoops - 1; ++i) {
6328 CanonicalLoopInfo *Surrounding = Loops[i];
6329 CanonicalLoopInfo *Nested = Loops[i + 1];
6330
6331 BasicBlock *EnterBB = Surrounding->getBody();
6332 BasicBlock *ExitBB = Nested->getHeader();
6333 InbetweenCode.emplace_back(EnterBB, ExitBB);
6334 }
6335
6336 // Compute the trip counts of the floor loops.
6337 Builder.SetCurrentDebugLocation(DL);
6338 Builder.restoreIP(OutermostLoop->getPreheaderIP());
6339 SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
6340 for (int i = 0; i < NumLoops; ++i) {
6341 Value *TileSize = TileSizes[i];
6342 Value *OrigTripCount = OrigTripCounts[i];
6343 Type *IVType = OrigTripCount->getType();
6344
6345 Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
6346 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
6347
6348 // 0 if the tilesize divides the tripcount, 1 otherwise.
6349 // 1 means we need an additional iteration for a partial tile.
6350 //
6351 // Unfortunately we cannot just use the roundup-formula
6352 // (tripcount + tilesize - 1)/tilesize
6353 // because the summation might overflow. We do not want to introduce undefined
6354 // behavior when the untiled loop nest did not.
6355 Value *FloorTripOverflow =
6356 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
6357
6358 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
6359 Value *FloorTripCount =
6360 Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
6361 "omp_floor" + Twine(i) + ".tripcount", true);
6362
6363 // Remember some values for later use.
6364 FloorCompleteCount.push_back(FloorCompleteTripCount);
6365 FloorCount.push_back(FloorTripCount);
6366 FloorRems.push_back(FloorTripRem);
6367 }
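// Numeric illustration: for a trip count of 10 and a tile size of 4 this
// yields FloorCompleteTripCount = 2, FloorTripRem = 2 and a floor trip count
// of 3, where the third floor iteration runs the partial 2-iteration tile.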
6368
6369 // Generate the new loop nest, from the outermost to the innermost.
6370 std::vector<CanonicalLoopInfo *> Result;
6371 Result.reserve(NumLoops * 2);
6372
6373 // The basic block of the surrounding loop that enters the generated loop
6374 // nest.
6375 BasicBlock *Enter = OutermostLoop->getPreheader();
6376
6377 // The basic block of the surrounding loop where the inner code should
6378 // continue.
6379 BasicBlock *Continue = OutermostLoop->getAfter();
6380
6381 // Where the next loop basic block should be inserted.
6382 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
6383
6384 auto EmbeddNewLoop =
6385 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
6386 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
6387 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
6388 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
6389 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
6390 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
6391
6392 // Setup the position where the next embedded loop connects to this loop.
6393 Enter = EmbeddedLoop->getBody();
6394 Continue = EmbeddedLoop->getLatch();
6395 OutroInsertBefore = EmbeddedLoop->getLatch();
6396 return EmbeddedLoop;
6397 };
6398
6399 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
6400 const Twine &NameBase) {
6401 for (auto P : enumerate(TripCounts)) {
6402 CanonicalLoopInfo *EmbeddedLoop =
6403 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
6404 Result.push_back(EmbeddedLoop);
6405 }
6406 };
6407
6408 EmbeddNewLoops(FloorCount, "floor");
6409
6410 // Within the innermost floor loop, emit the code that computes the tile
6411 // sizes.
6412 Builder.SetInsertPoint(Enter->getTerminator());
6413 SmallVector<Value *, 4> TileCounts;
6414 for (int i = 0; i < NumLoops; ++i) {
6415 CanonicalLoopInfo *FloorLoop = Result[i];
6416 Value *TileSize = TileSizes[i];
6417
6418 Value *FloorIsEpilogue =
6419 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
6420 Value *TileTripCount =
6421 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
6422
6423 TileCounts.push_back(TileTripCount);
6424 }
6425
6426 // Create the tile loops.
6427 EmbeddNewLoops(TileCounts, "tile");
6428
6429 // Insert the inbetween code into the body.
6430 BasicBlock *BodyEnter = Enter;
6431 BasicBlock *BodyEntered = nullptr;
6432 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
6433 BasicBlock *EnterBB = P.first;
6434 BasicBlock *ExitBB = P.second;
6435
6436 if (BodyEnter)
6437 redirectTo(BodyEnter, EnterBB, DL);
6438 else
6439 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
6440
6441 BodyEnter = nullptr;
6442 BodyEntered = ExitBB;
6443 }
6444
6445 // Append the original loop nest body into the generated loop nest body.
6446 if (BodyEnter)
6447 redirectTo(BodyEnter, InnerEnter, DL);
6448 else
6449 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
6450 redirectTo(InnerLatch, Continue, DL);
6451
6452 // Replace the original induction variable with an induction variable computed
6453 // from the tile and floor induction variables.
6454 Builder.restoreIP(Result.back()->getBodyIP());
6455 for (int i = 0; i < NumLoops; ++i) {
6456 CanonicalLoopInfo *FloorLoop = Result[i];
6457 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
6458 Value *OrigIndVar = OrigIndVars[i];
6459 Value *Size = TileSizes[i];
6460
6461 Value *Scale =
6462 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
6463 Value *Shift =
6464 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
6465 OrigIndVar->replaceAllUsesWith(Shift);
6466 }
6467
6468 // Remove unused parts of the original loops.
6469 removeUnusedBlocksFromParent(OldControlBBs);
6470
6471 for (CanonicalLoopInfo *L : Loops)
6472 L->invalidate();
6473
6474#ifndef NDEBUG
6475 for (CanonicalLoopInfo *GenL : Result)
6476 GenL->assertOK();
6477#endif
6478 return Result;
6479}
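// The returned vector contains the floor loops followed by the tile loops.
// For a two-loop input nest the generated structure is roughly:
//   for (floor0) for (floor1)        // selects a tile
//     for (tile0) for (tile1)        // iterates within the tile
//       body(floor0*tilesize0 + tile0, floor1*tilesize1 + tile1)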
6480
6481/// Attach metadata \p Properties to the basic block described by \p BB. If the
6482/// basic block already has metadata, the basic block properties are appended.
6483static void addBasicBlockMetadata(BasicBlock *BB,
6484 ArrayRef<Metadata *> Properties) {
6485 // Nothing to do if no property to attach.
6486 if (Properties.empty())
6487 return;
6488
6489 LLVMContext &Ctx = BB->getContext();
6490 SmallVector<Metadata *> NewProperties;
6491 NewProperties.push_back(nullptr);
6492
6493 // If the basic block already has metadata, prepend it to the new metadata.
6494 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
6495 if (Existing)
6496 append_range(NewProperties, drop_begin(Existing->operands(), 1));
6497
6498 append_range(NewProperties, Properties);
6499 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
6500 BasicBlockID->replaceOperandWith(0, BasicBlockID);
6501
6502 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
6503}
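// Illustrative result (metadata ids arbitrary): the latch terminator carries
//   br label %header, !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.enable"}
// where the self-referencing first operand marks !0 as loop metadata.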
6504
6505/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
6506/// loop already has metadata, the loop properties are appended.
6507static void addLoopMetadata(CanonicalLoopInfo *Loop,
6508 ArrayRef<Metadata *> Properties) {
6509 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
6510
6511 // Attach metadata to the loop's latch
6512 BasicBlock *Latch = Loop->getLatch();
6513 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
6514 addBasicBlockMetadata(Latch, Properties);
6515}
6516
6517/// Attach llvm.access.group metadata to the memref instructions of \p Block
6518static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
6519 LoopInfo &LI) {
6520 for (Instruction &I : *Block) {
6521 if (I.mayReadOrWriteMemory()) {
6522 // TODO: This instruction may already have an access group from
6523 // other pragmas, e.g. #pragma clang loop vectorize. Append
6524 // so that the existing metadata is not overwritten.
6525 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
6526 }
6527 }
6528}
6529
6530void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
6531 LLVMContext &Ctx = Builder.getContext();
6532 addLoopMetadata(
6533 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6534 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
6535}
6536
6537void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
6538 LLVMContext &Ctx = Builder.getContext();
6539 addLoopMetadata(
6540 Loop, {
6541 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6542 });
6543}
6544
6545void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
6546 Value *IfCond, ValueToValueMapTy &VMap,
6547 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
6548 const Twine &NamePrefix) {
6549 Function *F = CanonicalLoop->getFunction();
6550
6551 // We can't do
6552 // if (cond) {
6553 // simd_loop;
6554 // } else {
6555 // non_simd_loop;
6556 // }
6557 // because then the CanonicalLoopInfo would only point to one of the loops:
6558 // leading to other constructs operating on the same loop to malfunction.
6559 // Instead generate
6560 // while (...) {
6561 // if (cond) {
6562 // simd_body;
6563 // } else {
6564 // not_simd_body;
6565 // }
6566 // }
6567 // At least for simple loops, LLVM seems able to hoist the if out of the loop
6568 // body at -O3.
6569
6570 // Define where if branch should be inserted
6571 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
6572
6573 // Create additional blocks for the if statement
6574 BasicBlock *Cond = SplitBeforeIt->getParent();
6575 llvm::LLVMContext &C = Cond->getContext();
6576 BasicBlock *ThenBlock = BasicBlock::Create(
6577 C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
6578 BasicBlock *ElseBlock = BasicBlock::Create(
6579 C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
6580
6581 // Create if condition branch.
6582 Builder.SetInsertPoint(SplitBeforeIt);
6583 Instruction *BrInstr =
6584 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
6585 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
6586 // Then block contains branch to omp loop body which needs to be vectorized
6587 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
6588 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
6589
6590 Builder.SetInsertPoint(ElseBlock);
6591
6592 // Clone loop for the else branch
6593 SmallVector<BasicBlock *, 8> NewBlocks;
6594
6595 SmallVector<BasicBlock *, 8> ExistingBlocks;
6596 ExistingBlocks.reserve(L->getNumBlocks() + 1);
6597 ExistingBlocks.push_back(ThenBlock);
6598 ExistingBlocks.append(L->block_begin(), L->block_end());
6599 // Cond is the block that has the if clause condition
6600 // LoopCond is omp_loop.cond
6601 // LoopHeader is omp_loop.header
6602 BasicBlock *LoopCond = Cond->getUniquePredecessor();
6603 BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
6604 assert(LoopCond && LoopHeader && "Invalid loop structure");
6605 for (BasicBlock *Block : ExistingBlocks) {
6606 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
6607 Block == LoopHeader || Block == LoopCond || Block == Cond) {
6608 continue;
6609 }
6610 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
6611
6612 // Fix the name so it is not omp.if.then.
6613 if (Block == ThenBlock)
6614 NewBB->setName(NamePrefix + ".if.else");
6615
6616 NewBB->moveBefore(CanonicalLoop->getExit());
6617 VMap[Block] = NewBB;
6618 NewBlocks.push_back(NewBB);
6619 }
6620 remapInstructionsInBlocks(NewBlocks, VMap);
6621 Builder.CreateBr(NewBlocks.front());
6622
6623 // The loop latch must have only one predecessor. Currently it is branched to
6624 // from both the 'then' and 'else' branches.
6625 L->getLoopLatch()->splitBasicBlock(
6626 L->getLoopLatch()->begin(), NamePrefix + ".pre_latch", /*Before=*/true);
6627
6628 // Ensure that the then block is added to the loop so we add the attributes
6629 // in the next step.
6630 L->addBasicBlockToLoop(ThenBlock, LI);
6631}
6632
6633unsigned
6634OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
6635 const StringMap<bool> &Features) {
6636 if (TargetTriple.isX86()) {
6637 if (Features.lookup("avx512f"))
6638 return 512;
6639 else if (Features.lookup("avx"))
6640 return 256;
6641 return 128;
6642 }
6643 if (TargetTriple.isPPC())
6644 return 128;
6645 if (TargetTriple.isWasm())
6646 return 128;
6647 return 0;
6648}
6649
6650void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
6651 MapVector<Value *, Value *> AlignedVars,
6652 Value *IfCond, OrderKind Order,
6653 ConstantInt *Simdlen, ConstantInt *Safelen) {
6654 LLVMContext &Ctx = Builder.getContext();
6655
6656 Function *F = CanonicalLoop->getFunction();
6657
6658 // TODO: We should not rely on pass manager. Currently we use pass manager
6659 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
6660 // object. We should have a method which returns all blocks between
6661 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
6662 FunctionAnalysisManager FAM;
6663 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6664 FAM.registerPass([]() { return LoopAnalysis(); });
6665 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6666
6667 LoopAnalysis LIA;
6668 LoopInfo &&LI = LIA.run(*F, FAM);
6669
6670 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
6671 if (AlignedVars.size()) {
6672 InsertPointTy IP = Builder.saveIP();
6673 for (auto &AlignedItem : AlignedVars) {
6674 Value *AlignedPtr = AlignedItem.first;
6675 Value *Alignment = AlignedItem.second;
6676 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
6677 Builder.SetInsertPoint(loadInst->getNextNode());
6678 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
6679 Alignment);
6680 }
6681 Builder.restoreIP(IP);
6682 }
6683
6684 if (IfCond) {
6685 ValueToValueMapTy VMap;
6686 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
6687 }
6688
6689 SmallPtrSet<BasicBlock *, 8> Reachable;
6690
6691 // Get the basic blocks from the loop in which memref instructions
6692 // can be found.
6693 // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
6694 // preferably without running any passes.
6695 for (BasicBlock *Block : L->getBlocks()) {
6696 if (Block == CanonicalLoop->getCond() ||
6697 Block == CanonicalLoop->getHeader())
6698 continue;
6699 Reachable.insert(Block);
6700 }
6701
6702 SmallVector<Metadata *> LoopMDList;
6703
6704 // In the presence of a finite 'safelen', it may be unsafe to mark all
6705 // the memory instructions parallel, because loop-carried
6706 // dependences of 'safelen' iterations are possible.
6707 // If clause order(concurrent) is specified then the memory instructions
6708 // are marked parallel even if 'safelen' is finite.
6709 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent))
6710 applyParallelAccessesMetadata(CanonicalLoop, Ctx, L, LI, LoopMDList);
6711
6712 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
6713 // versions so we can't add the loop attributes in that case.
6714 if (IfCond) {
6715 // we can still add llvm.loop.parallel_access
6716 addLoopMetadata(CanonicalLoop, LoopMDList);
6717 return;
6718 }
6719
6720 // Use the above access group metadata to create loop level
6721 // metadata, which should be distinct for each loop.
6722 ConstantAsMetadata *BoolConst =
6723 ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
6724 LoopMDList.push_back(MDNode::get(
6725 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
6726
6727 if (Simdlen || Safelen) {
6728 // If both simdlen and safelen clauses are specified, the value of the
6729 // simdlen parameter must be less than or equal to the value of the safelen
6730 // parameter. Therefore, use safelen only in the absence of simdlen.
6731 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
6732 LoopMDList.push_back(
6733 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
6734 ConstantAsMetadata::get(VectorizeWidth)}));
6735 }
6736
6737 addLoopMetadata(CanonicalLoop, LoopMDList);
6738}
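// Illustrative outcome: `#pragma omp simd simdlen(8)` without an if clause
// ends up with loop metadata along the lines of
//   !{!"llvm.loop.parallel_accesses", !<access group>}
//   !{!"llvm.loop.vectorize.enable", i1 true}
//   !{!"llvm.loop.vectorize.width", i32 8}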
6739
6740/// Create the TargetMachine object to query the backend for optimization
6741/// preferences.
6742///
6743/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
6744/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
6745/// needed for the LLVM pass pipeline. We use some default options to avoid
6746/// having to pass too many settings from the frontend that probably do not
6747/// matter.
6748///
6749/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
6750/// method. If we are going to use TargetMachine for more purposes, especially
6751/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
6752/// might be worth requiring front-ends to pass on their TargetMachine,
6753/// or at least cache it between methods. Note that while frontends such as Clang
6754/// have just a single main TargetMachine per translation unit, "target-cpu" and
6755/// "target-features" that determine the TargetMachine are per-function and can
6756/// be overridden using __attribute__((target("OPTIONS"))).
6757static std::unique_ptr<TargetMachine>
6758createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
6759 Module *M = F->getParent();
6760
6761 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
6762 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
6763 const llvm::Triple &Triple = M->getTargetTriple();
6764
6765 std::string Error;
6766 const llvm::Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
6767 if (!TheTarget)
6768 return {};
6769
6770 TargetOptions Options;
6771 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
6772 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
6773 /*CodeModel=*/std::nullopt, OptLevel));
6774}
6775
6776/// Heuristically determine the best-performant unroll factor for \p CLI. This
6777/// depends on the target processor. We are re-using the same heuristics as the
6778/// LoopUnrollPass.
6779static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
6780 Function *F = CLI->getFunction();
6781
6782 // Assume the user requests the most aggressive unrolling, even if the rest of
6783 // the code is optimized using a lower setting.
6784 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
6785 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
6786
6787 FunctionAnalysisManager FAM;
6788 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
6789 FAM.registerPass([]() { return AssumptionAnalysis(); });
6790 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6791 FAM.registerPass([]() { return LoopAnalysis(); });
6792 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
6793 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6794 TargetIRAnalysis TIRA;
6795 if (TM)
6796 TIRA = TargetIRAnalysis(
6797 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
6798 FAM.registerPass([&]() { return TIRA; });
6799
6800 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
6801 ScalarEvolutionAnalysis SEA;
6802 ScalarEvolution &&SE = SEA.run(*F, FAM);
6803 DominatorTreeAnalysis DTA;
6804 DominatorTree &&DT = DTA.run(*F, FAM);
6805 LoopAnalysis LIA;
6806 LoopInfo &&LI = LIA.run(*F, FAM);
6807 AssumptionAnalysis ACT;
6808 AssumptionCache &&AC = ACT.run(*F, FAM);
6809 OptimizationRemarkEmitter ORE{F};
6810
6811 Loop *L = LI.getLoopFor(CLI->getHeader());
6812 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
6813
6814 TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
6815 L, SE, TTI,
6816 /*BlockFrequencyInfo=*/nullptr,
6817 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
6818 /*UserThreshold=*/std::nullopt,
6819 /*UserCount=*/std::nullopt,
6820 /*UserAllowPartial=*/true,
6821 /*UserAllowRuntime=*/true,
6822 /*UserUpperBound=*/std::nullopt,
6823 /*UserFullUnrollMaxCount=*/std::nullopt);
6824
6825 UP.Force = true;
6826
6827 // Account for additional optimizations taking place before the LoopUnrollPass
6828 // would unroll the loop.
6829 UP.Threshold *= UnrollThresholdFactor;
6830 UP.PartialThreshold *= UnrollThresholdFactor;
6831
6832 // Use normal unroll factors even if the rest of the code is optimized for
6833 // size.
6834 UP.OptSizeThreshold = UP.Threshold;
6835 UP.PartialOptSizeThreshold = UP.PartialThreshold;
6836
6837 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
6838 << " Threshold=" << UP.Threshold << "\n"
6839 << " PartialThreshold=" << UP.PartialThreshold << "\n"
6840 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
6841 << " PartialOptSizeThreshold="
6842 << UP.PartialOptSizeThreshold << "\n");
6843
6844 // Disable peeling.
6845 TargetTransformInfo::PeelingPreferences PP =
6846 gatherPeelingPreferences(L, SE, TTI,
6847 /*UserAllowPeeling=*/false,
6848 /*UserAllowProfileBasedPeeling=*/false,
6849 /*UnrollingSpecficValues=*/false);
6850
6851 SmallPtrSet<const Value *, 32> EphValues;
6852 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
6853
6854 // Assume that reads and writes to stack variables can be eliminated by
6855 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
6856 // size.
6857 for (BasicBlock *BB : L->blocks()) {
6858 for (Instruction &I : *BB) {
6859 Value *Ptr;
6860 if (auto *Load = dyn_cast<LoadInst>(&I)) {
6861 Ptr = Load->getPointerOperand();
6862 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
6863 Ptr = Store->getPointerOperand();
6864 } else
6865 continue;
6866
6867 Ptr = Ptr->stripPointerCasts();
6868
6869 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
6870 if (Alloca->getParent() == &F->getEntryBlock())
6871 EphValues.insert(&I);
6872 }
6873 }
6874 }
6875
6876 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
6877
6878 // Loop is not unrollable if the loop contains certain instructions.
6879 if (!UCE.canUnroll()) {
6880 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
6881 return 1;
6882 }
6883
6884 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
6885 << "\n");
6886
6887 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
6888 // be able to use it.
6889 int TripCount = 0;
6890 int MaxTripCount = 0;
6891 bool MaxOrZero = false;
6892 unsigned TripMultiple = 0;
6893
6894 bool UseUpperBound = false;
6895 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
6896 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
6897 UseUpperBound);
6898 unsigned Factor = UP.Count;
6899 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
6900
6901 // This function returns 1 to signal that the loop should not be unrolled.
6902 if (Factor == 0)
6903 return 1;
6904 return Factor;
6905}
6906
6907void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
6908 int32_t Factor,
6909 CanonicalLoopInfo **UnrolledCLI) {
6910 assert(Factor >= 0 && "Unroll factor must not be negative");
6911
6912 Function *F = Loop->getFunction();
6913 LLVMContext &Ctx = F->getContext();
6914
6915 // If the unrolled loop is not used for another loop-associated directive, it
6916 // is sufficient to add metadata for the LoopUnrollPass.
6917 if (!UnrolledCLI) {
6918 SmallVector<Metadata *, 2> LoopMetadata;
6919 LoopMetadata.push_back(
6920 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
6921
6922 if (Factor >= 1) {
6923 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6924 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6925 LoopMetadata.push_back(MDNode::get(
6926 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
6927 }
6928
6929 addLoopMetadata(Loop, LoopMetadata);
6930 return;
6931 }
6932
6933 // Heuristically determine the unroll factor.
6934 if (Factor == 0)
6935 Factor = computeHeuristicUnrollFactor(Loop);
6936
6937 // No change required with unroll factor 1.
6938 if (Factor == 1) {
6939 *UnrolledCLI = Loop;
6940 return;
6941 }
6942
6943 assert(Factor >= 2 &&
6944 "unrolling only makes sense with a factor of 2 or larger");
6945
6946 Type *IndVarTy = Loop->getIndVarType();
6947
6948 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
6949 // unroll the inner loop.
6950 Value *FactorVal =
6951 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
6952 /*isSigned=*/false));
6953 std::vector<CanonicalLoopInfo *> LoopNest =
6954 tileLoops(DL, {Loop}, {FactorVal});
6955 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
6956 *UnrolledCLI = LoopNest[0];
6957 CanonicalLoopInfo *InnerLoop = LoopNest[1];
6958
6959 // LoopUnrollPass can only fully unroll loops with constant trip count.
6960 // Unroll by the unroll factor with a fallback epilog for the remainder
6961 // iterations if necessary.
6962 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6963 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6964 addLoopMetadata(
6965 InnerLoop,
6966 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6967 MDNode::get(
6968 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
6969
6970#ifndef NDEBUG
6971 (*UnrolledCLI)->assertOK();
6972#endif
6973}
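// In other words, a partial unroll by factor 4 with UnrolledCLI requested
// becomes: tile the loop with tile size 4, mark the inner tile loop with
// llvm.loop.unroll.enable and llvm.loop.unroll.count = 4 for LoopUnrollPass,
// and hand back the floor loop as the new loop-associated construct.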
6974
6975OpenMPIRBuilder::InsertPointTy
6976OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
6977 llvm::Value *BufSize, llvm::Value *CpyBuf,
6978 llvm::Value *CpyFn, llvm::Value *DidIt) {
6979 if (!updateToLocation(Loc))
6980 return Loc.IP;
6981
6982 uint32_t SrcLocStrSize;
6983 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6984 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6985 Value *ThreadId = getOrCreateThreadID(Ident);
6986
6987 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
6988
6989 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
6990
6991 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
6992 createRuntimeFunctionCall(Fn, Args);
6993
6994 return Builder.saveIP();
6995}
6996
6997OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSingle(
6998 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6999 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
7000 ArrayRef<llvm::Function *> CPFuncs) {
7001
7002 if (!updateToLocation(Loc))
7003 return Loc.IP;
7004
7005 // If needed, allocate and initialize `DidIt` with 0.
7006 // DidIt: flag variable: 1=single thread; 0=not single thread.
7007 llvm::Value *DidIt = nullptr;
7008 if (!CPVars.empty()) {
7009 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
7010 Builder.CreateStore(Builder.getInt32(0), DidIt);
7011 }
7012
7013 Directive OMPD = Directive::OMPD_single;
7014 uint32_t SrcLocStrSize;
7015 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7016 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7017 Value *ThreadId = getOrCreateThreadID(Ident);
7018 Value *Args[] = {Ident, ThreadId};
7019
7020 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
7021 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7022
7023 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
7024 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7025
7026 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
7027 if (Error Err = FiniCB(IP))
7028 return Err;
7029
7030 // The thread that executes the single region must set `DidIt` to 1.
7031 // This is used by __kmpc_copyprivate, to know if the caller is the
7032 // single thread or not.
7033 if (DidIt)
7034 Builder.CreateStore(Builder.getInt32(1), DidIt);
7035
7036 return Error::success();
7037 };
7038
7039 // generates the following:
7040 // if (__kmpc_single()) {
7041 // .... single region ...
7042 // __kmpc_end_single
7043 // }
7044 // __kmpc_copyprivate
7045 // __kmpc_barrier
7046
7047 InsertPointOrErrorTy AfterIP =
7048 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
7049 /*Conditional*/ true,
7050 /*hasFinalize*/ true);
7051 if (!AfterIP)
7052 return AfterIP.takeError();
7053
7054 if (DidIt) {
7055 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
7056 // NOTE BufSize is currently unused, so just pass 0.
7057 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
7058 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
7059 CPFuncs[I], DidIt);
7060 // NOTE __kmpc_copyprivate already inserts a barrier
7061 } else if (!IsNowait) {
7062 InsertPointOrErrorTy AfterIP =
7063 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
7064 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
7065 /* CheckCancelFlag */ false);
7066 if (!AfterIP)
7067 return AfterIP.takeError();
7068 }
7069 return Builder.saveIP();
7070}
7071
7072OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCritical(
7073 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7074 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
7075
7076 if (!updateToLocation(Loc))
7077 return Loc.IP;
7078
7079 Directive OMPD = Directive::OMPD_critical;
7080 uint32_t SrcLocStrSize;
7081 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7082 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7083 Value *ThreadId = getOrCreateThreadID(Ident);
7084 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
7085 Value *Args[] = {Ident, ThreadId, LockVar};
7086
7087 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
7088 Function *RTFn = nullptr;
7089 if (HintInst) {
7090 // Add Hint to entry Args and create call
7091 EnterArgs.push_back(HintInst);
7092 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
7093 } else {
7094 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
7095 }
7096 Instruction *EntryCall = createRuntimeFunctionCall(RTFn, EnterArgs);
7097
7098 Function *ExitRTLFn =
7099 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
7100 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7101
7102 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7103 /*Conditional*/ false, /*hasFinalize*/ true);
7104}
7105
7106OpenMPIRBuilder::InsertPointTy
7107OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
7108 InsertPointTy AllocaIP, unsigned NumLoops,
7109 ArrayRef<llvm::Value *> StoreValues,
7110 const Twine &Name, bool IsDependSource) {
7111 assert(
7112 llvm::all_of(StoreValues,
7113 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
7114 "OpenMP runtime requires depend vec with i64 type");
7115
7116 if (!updateToLocation(Loc))
7117 return Loc.IP;
7118
7119 // Allocate space for vector and generate alloc instruction.
7120 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
7121 Builder.restoreIP(AllocaIP);
7122 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
7123 ArgsBase->setAlignment(Align(8));
7124 updateToLocation(Loc);
7125
7126 // Store the index value with offset in depend vector.
7127 for (unsigned I = 0; I < NumLoops; ++I) {
7128 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
7129 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
7130 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
7131 STInst->setAlignment(Align(8));
7132 }
7133
7134 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
7135 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
7136
7137 uint32_t SrcLocStrSize;
7138 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7139 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7140 Value *ThreadId = getOrCreateThreadID(Ident);
7141 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
7142
7143 Function *RTLFn = nullptr;
7144 if (IsDependSource)
7145 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
7146 else
7147 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
7148 createRuntimeFunctionCall(RTLFn, Args);
7149
7150 return Builder.saveIP();
7151}
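// A sketch of the IR this emits for 'ordered depend(source)' with
// NumLoops == 2 (value names are illustrative):
//
//   %vec = alloca [2 x i64], align 8                        ; at AllocaIP
//   %e0 = getelementptr inbounds [2 x i64], ptr %vec, i64 0, i64 0
//   store i64 %iv0, ptr %e0, align 8
//   %e1 = getelementptr inbounds [2 x i64], ptr %vec, i64 0, i64 1
//   store i64 %iv1, ptr %e1, align 8
//   call void @__kmpc_doacross_post(ptr @ident, i32 %tid, ptr %e0)
//
// For 'ordered depend(sink : ...)' the same sequence ends in a call to
// __kmpc_doacross_wait instead.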
7152
7153OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createOrderedThreadsSimd(
7154 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7155 FinalizeCallbackTy FiniCB, bool IsThreads) {
7156 if (!updateToLocation(Loc))
7157 return Loc.IP;
7158
7159 Directive OMPD = Directive::OMPD_ordered;
7160 Instruction *EntryCall = nullptr;
7161 Instruction *ExitCall = nullptr;
7162
7163 if (IsThreads) {
7164 uint32_t SrcLocStrSize;
7165 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7166 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7167 Value *ThreadId = getOrCreateThreadID(Ident);
7168 Value *Args[] = {Ident, ThreadId};
7169
7170 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
7171 EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7172
7173 Function *ExitRTLFn =
7174 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
7175 ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7176 }
7177
7178 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7179 /*Conditional*/ false, /*hasFinalize*/ true);
7180}
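// A sketch for 'ordered threads' (IsThreads == true; names illustrative):
//
//   call void @__kmpc_ordered(ptr @ident, i32 %tid)
//   ; ... ordered region body ...
//   call void @__kmpc_end_ordered(ptr @ident, i32 %tid)
//
// For 'ordered simd', EntryCall and ExitCall stay null, so
// EmitOMPInlinedRegion emits the body with no runtime calls around it.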
7181
7182OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
7183 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
7184 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
7185 bool HasFinalize, bool IsCancellable) {
7186
7187 if (HasFinalize)
7188 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
7189
7190 // Create inlined region's entry and body blocks, in preparation
7191 // for conditional creation
7192 BasicBlock *EntryBB = Builder.GetInsertBlock();
7193 Instruction *SplitPos = EntryBB->getTerminator();
7194 if (!isa_and_nonnull<BranchInst>(SplitPos))
7195 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
7196 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
7197 BasicBlock *FiniBB =
7198 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
7199
7200 Builder.SetInsertPoint(EntryBB->getTerminator());
7201 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
7202
7203 // generate body
7204 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
7205 /* CodeGenIP */ Builder.saveIP()))
7206 return Err;
7207
7208 // emit exit call and do any needed finalization.
7209 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
7210 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
7211 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
7212 "Unexpected control flow graph state!!");
7213 InsertPointOrErrorTy AfterIP =
7214 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
7215 if (!AfterIP)
7216 return AfterIP.takeError();
7217
7218 // If we are skipping the region of a non-conditional, remove the exit
7219 // block, and clear the builder's insertion point.
7220 assert(SplitPos->getParent() == ExitBB &&
7221 "Unexpected Insertion point location!");
7222 auto merged = MergeBlockIntoPredecessor(ExitBB);
7223 BasicBlock *ExitPredBB = SplitPos->getParent();
7224 auto InsertBB = merged ? ExitPredBB : ExitBB;
7225 if (!isa_and_nonnull<BranchInst>(SplitPos))
7226 SplitPos->eraseFromParent();
7227 Builder.SetInsertPoint(InsertBB);
7228
7229 return Builder.saveIP();
7230}
7231
7232OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
7233 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
7234 // If there is nothing to do, return the current insertion point.
7235 if (!Conditional || !EntryCall)
7236 return Builder.saveIP();
7237
7238 BasicBlock *EntryBB = Builder.GetInsertBlock();
7239 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
7240 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
7241 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
7242
7243 // Emit thenBB and set the Builder's insertion point there for
7244 // body generation next. Place the block after the current block.
7245 Function *CurFn = EntryBB->getParent();
7246 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
7247
7248 // Move Entry branch to end of ThenBB, and replace with conditional
7249 // branch (If-stmt)
7250 Instruction *EntryBBTI = EntryBB->getTerminator();
7251 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
7252 EntryBBTI->removeFromParent();
7253 Builder.SetInsertPoint(UI);
7254 Builder.Insert(EntryBBTI);
7255 UI->eraseFromParent();
7256 Builder.SetInsertPoint(ThenBB->getTerminator());
7257
7258 // return an insertion point to ExitBB.
7259 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
7260}
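// For the conditional case above, the CFG is rewritten roughly as follows
// (block names as created in this function; the original entry terminator
// is relocated to the end of the body block):
//
//   entry:
//     %c = icmp ne i32 %EntryCall, 0
//     br i1 %c, label %omp_region.body, label %omp_region.end
//   omp_region.body:
//     ; region body is generated here
//     br label %omp_region.finalize          ; relocated entry terminator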
7261
7262OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
7263 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
7264 bool HasFinalize) {
7265
7266 Builder.restoreIP(FinIP);
7267
7268 // If there is finalization to do, emit it before the exit call
7269 if (HasFinalize) {
7270 assert(!FinalizationStack.empty() &&
7271 "Unexpected finalization stack state!");
7272
7273 FinalizationInfo Fi = FinalizationStack.pop_back_val();
7274 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
7275
7276 if (Error Err = Fi.mergeFiniBB(Builder, FinIP.getBlock()))
7277 return std::move(Err);
7278
7279 // Exit condition: insertion point is before the terminator of the new Fini
7280 // block
7281 Builder.SetInsertPoint(FinIP.getBlock()->getTerminator());
7282 }
7283
7284 if (!ExitCall)
7285 return Builder.saveIP();
7286
7287 // Place the ExitCall as the last instruction before the finalization block's terminator.
7288 ExitCall->removeFromParent();
7289 Builder.Insert(ExitCall);
7290
7291 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
7292 ExitCall->getIterator());
7293}
7294
7295OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
7296 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
7297 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
7298 if (!IP.isSet())
7299 return IP;
7300
7301 IRBuilder<>::InsertPointGuard IPG(Builder);
7302
7303 // creates the following CFG structure
7304 // OMP_Entry : (MasterAddr != PrivateAddr)?
7305 // F T
7306 // | \
7307 // | copyin.not.master
7308 // | /
7309 // v /
7310 // copyin.not.master.end
7311 // |
7312 // v
7313 // OMP.Entry.Next
7314
7315 BasicBlock *OMP_Entry = IP.getBlock();
7316 Function *CurFn = OMP_Entry->getParent();
7317 BasicBlock *CopyBegin =
7318 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
7319 BasicBlock *CopyEnd = nullptr;
7320
7321 // If the entry block is terminated, split to preserve the branch to the
7322 // following basic block (i.e. OMP.Entry.Next); otherwise, leave everything as is.
7323 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
7324 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
7325 "copyin.not.master.end");
7326 OMP_Entry->getTerminator()->eraseFromParent();
7327 } else {
7328 CopyEnd =
7329 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
7330 }
7331
7332 Builder.SetInsertPoint(OMP_Entry);
7333 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
7334 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
7335 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
7336 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
7337
7338 Builder.SetInsertPoint(CopyBegin);
7339 if (BranchtoEnd)
7340 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
7341
7342 return Builder.saveIP();
7343}
7344
7345CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
7346 Value *Size, Value *Allocator,
7347 std::string Name) {
7348 IRBuilder<>::InsertPointGuard IPG(Builder);
7349 updateToLocation(Loc);
7350
7351 uint32_t SrcLocStrSize;
7352 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7353 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7354 Value *ThreadId = getOrCreateThreadID(Ident);
7355 Value *Args[] = {ThreadId, Size, Allocator};
7356
7357 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
7358
7359 return createRuntimeFunctionCall(Fn, Args, Name);
7360}
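// For example, lowering 'omp allocate' through this helper reduces to a
// single runtime call (value names are illustrative):
//
//   %ptr = call ptr @__kmpc_alloc(i32 %tid, i64 %size, ptr %allocator)
//
// createOMPFree below mirrors this with a call to __kmpc_free.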
7361
7362CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
7363 Value *Addr, Value *Allocator,
7364 std::string Name) {
7365 IRBuilder<>::InsertPointGuard IPG(Builder);
7366 updateToLocation(Loc);
7367
7368 uint32_t SrcLocStrSize;
7369 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7370 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7371 Value *ThreadId = getOrCreateThreadID(Ident);
7372 Value *Args[] = {ThreadId, Addr, Allocator};
7373 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
7374 return createRuntimeFunctionCall(Fn, Args, Name);
7375}
7376
7377CallInst *OpenMPIRBuilder::createOMPInteropInit(
7378 const LocationDescription &Loc, Value *InteropVar,
7379 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
7380 Value *DependenceAddress, bool HaveNowaitClause) {
7381 IRBuilder<>::InsertPointGuard IPG(Builder);
7382 updateToLocation(Loc);
7383
7384 uint32_t SrcLocStrSize;
7385 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7386 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7387 Value *ThreadId = getOrCreateThreadID(Ident);
7388 if (Device == nullptr)
7389 Device = ConstantInt::get(Int32, -1);
7390 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
7391 if (NumDependences == nullptr) {
7392 NumDependences = ConstantInt::get(Int32, 0);
7393 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7394 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7395 }
7396 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7397 Value *Args[] = {
7398 Ident, ThreadId, InteropVar, InteropTypeVal,
7399 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
7400
7401 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
7402
7403 return createRuntimeFunctionCall(Fn, Args);
7404}
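// A sketch of the emitted call, assuming the device and depend clauses were
// absent so the defaults filled in above apply (names illustrative):
//
//   call void @__tgt_interop_init(ptr @ident, i32 %tid, ptr %interop_var,
//                                 i32 %interop_type, i32 -1, i32 0,
//                                 ptr null, i32 0)
//
// Here i32 -1 selects the default device and the (i32 0, ptr null) pair
// encodes an empty dependence list.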
7405
7406CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
7407 const LocationDescription &Loc, Value *InteropVar, Value *Device,
7408 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
7409 IRBuilder<>::InsertPointGuard IPG(Builder);
7410 updateToLocation(Loc);
7411
7412 uint32_t SrcLocStrSize;
7413 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7414 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7415 Value *ThreadId = getOrCreateThreadID(Ident);
7416 if (Device == nullptr)
7417 Device = ConstantInt::get(Int32, -1);
7418 if (NumDependences == nullptr) {
7419 NumDependences = ConstantInt::get(Int32, 0);
7420 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7421 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7422 }
7423 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7424 Value *Args[] = {
7425 Ident, ThreadId, InteropVar, Device,
7426 NumDependences, DependenceAddress, HaveNowaitClauseVal};
7427
7428 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
7429
7430 return createRuntimeFunctionCall(Fn, Args);
7431}
7432
7433CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
7434 Value *InteropVar, Value *Device,
7435 Value *NumDependences,
7436 Value *DependenceAddress,
7437 bool HaveNowaitClause) {
7438 IRBuilder<>::InsertPointGuard IPG(Builder);
7439 updateToLocation(Loc);
7440 uint32_t SrcLocStrSize;
7441 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7442 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7443 Value *ThreadId = getOrCreateThreadID(Ident);
7444 if (Device == nullptr)
7445 Device = ConstantInt::get(Int32, -1);
7446 if (NumDependences == nullptr) {
7447 NumDependences = ConstantInt::get(Int32, 0);
7448 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7449 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7450 }
7451 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7452 Value *Args[] = {
7453 Ident, ThreadId, InteropVar, Device,
7454 NumDependences, DependenceAddress, HaveNowaitClauseVal};
7455
7456 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
7457
7458 return createRuntimeFunctionCall(Fn, Args);
7459}
7460
7461CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
7462 const LocationDescription &Loc, llvm::Value *Pointer,
7463 llvm::ConstantInt *Size, const llvm::Twine &Name) {
7464 IRBuilder<>::InsertPointGuard IPG(Builder);
7465 updateToLocation(Loc);
7466
7467 uint32_t SrcLocStrSize;
7468 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7469 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7470 Value *ThreadId = getOrCreateThreadID(Ident);
7471 Constant *ThreadPrivateCache =
7472 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
7473 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
7474
7475 Function *Fn =
7476 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
7477
7478 return createRuntimeFunctionCall(Fn, Args);
7479}
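// A sketch of the emitted call (the cache variable is an internal global
// created by getOrCreateInternalVariable; names illustrative):
//
//   %tp = call ptr @__kmpc_threadprivate_cached(ptr @ident, i32 %tid,
//                                               ptr %var, i64 %size,
//                                               ptr @var.cache)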
7480
7481OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
7482 const LocationDescription &Loc,
7483 const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs) {
7484 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
7485 "expected num_threads and num_teams to be specified");
7486
7487 if (!updateToLocation(Loc))
7488 return Loc.IP;
7489
7490 uint32_t SrcLocStrSize;
7491 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7492 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7493 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
7494 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
7495 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
7496 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
7497 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
7498
7499 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
7500 Function *Kernel = DebugKernelWrapper;
7501
7502 // We need to strip the debug prefix to get the correct kernel name.
7503 StringRef KernelName = Kernel->getName();
7504 const std::string DebugPrefix = "_debug__";
7505 if (KernelName.ends_with(DebugPrefix)) {
7506 KernelName = KernelName.drop_back(DebugPrefix.length());
7507 Kernel = M.getFunction(KernelName);
7508 assert(Kernel && "Expected the real kernel to exist");
7509 }
7510
7511 // Manifest the launch configuration in the metadata matching the kernel
7512 // environment.
7513 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
7514 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
7515
7516 // If MaxThreads is not set, select the maximum between the default workgroup
7517 // size and the MinThreads value.
7518 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
7519 if (MaxThreadsVal < 0)
7520 MaxThreadsVal = std::max(
7521 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads);
7522
7523 if (MaxThreadsVal > 0)
7524 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
7525
7526 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
7527 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
7528 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
7529 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
7530 Constant *ReductionDataSize =
7531 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
7532 Constant *ReductionBufferLength =
7533 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
7534
7535 Function *Fn = getOrCreateRuntimeFunctionPtr(
7536 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
7537 const DataLayout &DL = Fn->getDataLayout();
7538
7539 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
7540 Constant *DynamicEnvironmentInitializer =
7541 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
7542 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
7543 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
7544 DynamicEnvironmentInitializer, DynamicEnvironmentName,
7545 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
7546 DL.getDefaultGlobalsAddressSpace());
7547 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
7548
7549 Constant *DynamicEnvironment =
7550 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
7551 ? DynamicEnvironmentGV
7552 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
7553 DynamicEnvironmentPtr);
7554
7555 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
7556 ConfigurationEnvironment, {
7557 UseGenericStateMachineVal,
7558 MayUseNestedParallelismVal,
7559 IsSPMDVal,
7560 MinThreads,
7561 MaxThreads,
7562 MinTeams,
7563 MaxTeams,
7564 ReductionDataSize,
7565 ReductionBufferLength,
7566 });
7567 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
7568 KernelEnvironment, {
7569 ConfigurationEnvironmentInitializer,
7570 Ident,
7571 DynamicEnvironment,
7572 });
7573 std::string KernelEnvironmentName =
7574 (KernelName + "_kernel_environment").str();
7575 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
7576 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
7577 KernelEnvironmentInitializer, KernelEnvironmentName,
7578 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
7579 DL.getDefaultGlobalsAddressSpace());
7580 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
7581
7582 Constant *KernelEnvironment =
7583 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
7584 ? KernelEnvironmentGV
7585 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
7586 KernelEnvironmentPtr);
7587 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
7588 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
7589 KernelLaunchEnvironment =
7590 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
7591 ? KernelLaunchEnvironment
7592 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
7593 KernelLaunchEnvParamTy);
7594 CallInst *ThreadKind = createRuntimeFunctionCall(
7595 Fn, {KernelEnvironment, KernelLaunchEnvironment});
7596
7597 Value *ExecUserCode = Builder.CreateICmpEQ(
7598 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
7599 "exec_user_code");
7600
7601 // ThreadKind = __kmpc_target_init(...)
7602 // if (ThreadKind == -1)
7603 // user_code
7604 // else
7605 // return;
7606
7607 auto *UI = Builder.CreateUnreachable();
7608 BasicBlock *CheckBB = UI->getParent();
7609 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
7610
7611 BasicBlock *WorkerExitBB = BasicBlock::Create(
7612 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
7613 Builder.SetInsertPoint(WorkerExitBB);
7614 Builder.CreateRetVoid();
7615
7616 auto *CheckBBTI = CheckBB->getTerminator();
7617 Builder.SetInsertPoint(CheckBBTI);
7618 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
7619
7620 CheckBBTI->eraseFromParent();
7621 UI->eraseFromParent();
7622
7623 // Continue in the "user_code" block, see diagram above and in
7624 // openmp/libomptarget/deviceRTLs/common/include/target.h.
7625 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
7626}
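// Taken together, for a kernel 'foo' the code above materializes two
// environment globals plus the entry check; a sketch (field values elided,
// struct layouts follow the Configuration/Kernel environment types used
// above):
//
//   @foo_dynamic_environment = weak_odr protected global ... zeroinitializer
//   @foo_kernel_environment = weak_odr protected constant
//       { { i8, i8, i8, i32, i32, i32, i32, i32, i32 }, ptr, ptr }
//       { { ... configuration ... }, ptr @ident, ptr @foo_dynamic_environment }
//
//   %threadkind = call i32 @__kmpc_target_init(ptr @foo_kernel_environment, ptr %dyn_ptr)
//   %exec_user_code = icmp eq i32 %threadkind, -1
//   br i1 %exec_user_code, label %user_code.entry, label %worker.exit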
7627
7628void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
7629 int32_t TeamsReductionDataSize,
7630 int32_t TeamsReductionBufferLength) {
7631 if (!updateToLocation(Loc))
7632 return;
7633
7634 Function *Fn = getOrCreateRuntimeFunctionPtr(
7635 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
7636
7637 createRuntimeFunctionCall(Fn, {});
7638
7639 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
7640 return;
7641
7642 Function *Kernel = Builder.GetInsertBlock()->getParent();
7643 // We need to strip the debug prefix to get the correct kernel name.
7644 StringRef KernelName = Kernel->getName();
7645 const std::string DebugPrefix = "_debug__";
7646 if (KernelName.ends_with(DebugPrefix))
7647 KernelName = KernelName.drop_back(DebugPrefix.length());
7648 auto *KernelEnvironmentGV =
7649 M.getNamedGlobal((KernelName + "_kernel_environment").str());
7650 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
7651 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
7652 auto *NewInitializer = ConstantFoldInsertValueInstruction(
7653 KernelEnvironmentInitializer,
7654 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
7655 NewInitializer = ConstantFoldInsertValueInstruction(
7656 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
7657 {0, 8});
7658 KernelEnvironmentGV->setInitializer(NewInitializer);
7659}
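// For example, with TeamsReductionDataSize == 8 and
// TeamsReductionBufferLength == 1024, the nested ConfigurationEnvironment
// (member 0 of the kernel environment initializer) gets its fields 7 and 8
// rewritten to i32 8 and i32 1024. The update folds into the constant
// initializer; no stores are emitted.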
7660
7661static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
7662 bool Min) {
7663 if (Kernel.hasFnAttribute(Name)) {
7664 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
7665 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
7666 }
7667 Kernel.addFnAttr(Name, llvm::utostr(Value));
7668}
7669
7670std::pair<int32_t, int32_t>
7671OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
7672 int32_t ThreadLimit =
7673 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
7674
7675 if (T.isAMDGPU()) {
7676 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
7677 if (!Attr.isValid() || !Attr.isStringAttribute())
7678 return {0, ThreadLimit};
7679 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
7680 int32_t LB, UB;
7681 if (!llvm::to_integer(UBStr, UB, 10))
7682 return {0, ThreadLimit};
7683 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
7684 if (!llvm::to_integer(LBStr, LB, 10))
7685 return {0, UB};
7686 return {LB, UB};
7687 }
7688
7689 if (Kernel.hasFnAttribute("nvvm.maxntid")) {
7690 int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
7691 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
7692 }
7693 return {0, ThreadLimit};
7694}
7695
7696void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
7697 Function &Kernel, int32_t LB,
7698 int32_t UB) {
7699 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
7700
7701 if (T.isAMDGPU()) {
7702 Kernel.addFnAttr("amdgpu-flat-work-group-size",
7703 llvm::utostr(LB) + "," + llvm::utostr(UB));
7704 return;
7705 }
7706
7707 updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
7708}
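// For example, writeThreadBoundsForKernel(T, K, 1, 256) attaches
// "omp_target_thread_limit"="256" on every target, and additionally
// "amdgpu-flat-work-group-size"="1,256" on AMDGPU or "nvvm.maxntid"="256"
// on NVPTX (clamped against any pre-existing maxntid by updateNVPTXAttr).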
7709
7710std::pair<int32_t, int32_t>
7711OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
7712 // TODO: Read from backend annotations if available.
7713 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
7714}
7715
7716void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
7717 int32_t LB, int32_t UB) {
7718 if (T.isNVPTX())
7719 if (UB > 0)
7720 Kernel.addFnAttr("nvvm.maxclusterrank", llvm::utostr(UB));
7721 if (T.isAMDGPU())
7722 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
7723
7724 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
7725}
7726
7727void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
7728 Function *OutlinedFn) {
7729 if (Config.isTargetDevice()) {
7730 OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
7731 // TODO: Determine if DSO local can be set to true.
7732 OutlinedFn->setDSOLocal(false);
7733 OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
7734 if (T.isAMDGCN())
7735 OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
7736 else if (T.isNVPTX())
7737 OutlinedFn->setCallingConv(CallingConv::PTX_Kernel);
7738 else if (T.isSPIRV())
7739 OutlinedFn->setCallingConv(CallingConv::SPIR_KERNEL);
7740 }
7741}
7742
7743Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
7744 StringRef EntryFnIDName) {
7745 if (Config.isTargetDevice()) {
7746 assert(OutlinedFn && "The outlined function must exist if embedded");
7747 return OutlinedFn;
7748 }
7749
7750 return new GlobalVariable(
7751 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
7752 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
7753}
7754
7755Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
7756 StringRef EntryFnName) {
7757 if (OutlinedFn)
7758 return OutlinedFn;
7759
7760 assert(!M.getGlobalVariable(EntryFnName, true) &&
7761 "Named kernel already exists?");
7762 return new GlobalVariable(
7763 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
7764 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
7765}
7766
7767Error OpenMPIRBuilder::emitTargetRegionFunction(
7768 TargetRegionEntryInfo &EntryInfo,
7769 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
7770 Function *&OutlinedFn, Constant *&OutlinedFnID) {
7771
7772 SmallString<64> EntryFnName;
7773 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
7774
7775 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
7776 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
7777 if (!CBResult)
7778 return CBResult.takeError();
7779 OutlinedFn = *CBResult;
7780 } else {
7781 OutlinedFn = nullptr;
7782 }
7783
7784 // If this target outline function is not an offload entry, we don't need to
7785 // register it. This may be the case for a false if clause, or if there are
7786 // no OpenMP targets.
7787 if (!IsOffloadEntry)
7788 return Error::success();
7789
7790 std::string EntryFnIDName =
7791 Config.isTargetDevice()
7792 ? std::string(EntryFnName)
7793 : createPlatformSpecificName({EntryFnName, "region_id"});
7794
7795 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
7796 EntryFnName, EntryFnIDName);
7797 return Error::success();
7798}
7799
7800Constant *OpenMPIRBuilder::registerTargetRegionFunction(
7801 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
7802 StringRef EntryFnName, StringRef EntryFnIDName) {
7803 if (OutlinedFn)
7804 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
7805 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
7806 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
7807 OffloadInfoManager.registerTargetRegionEntryInfo(
7808 EntryInfo, EntryAddr, OutlinedFnID,
7809 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
7810 return OutlinedFnID;
7811}
7812
7813OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
7814 const LocationDescription &Loc, InsertPointTy AllocaIP,
7815 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
7816 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
7817 CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc,
7818 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
7819 BodyGenTy BodyGenType)>
7820 BodyGenCB,
7821 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
7822 if (!updateToLocation(Loc))
7823 return InsertPointTy();
7824
7825 Builder.restoreIP(CodeGenIP);
7826 // Disable TargetData CodeGen on Device pass.
7827 if (Config.IsTargetDevice.value_or(false)) {
7828 if (BodyGenCB) {
7829 InsertPointOrErrorTy AfterIP =
7830 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7831 if (!AfterIP)
7832 return AfterIP.takeError();
7833 Builder.restoreIP(*AfterIP);
7834 }
7835 return Builder.saveIP();
7836 }
7837
7838 bool IsStandAlone = !BodyGenCB;
7839 MapInfosTy *MapInfo;
7840 // Generate the code for the opening of the data environment. Capture all the
7841 // arguments of the runtime call by reference because they are used in the
7842 // closing of the region.
7843 auto BeginThenGen = [&](InsertPointTy AllocaIP,
7844 InsertPointTy CodeGenIP) -> Error {
7845 MapInfo = &GenMapInfoCB(Builder.saveIP());
7846 if (Error Err = emitOffloadingArrays(
7847 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
7848 /*IsNonContiguous=*/true, DeviceAddrCB))
7849 return Err;
7850
7851 TargetDataRTArgs RTArgs;
7852 emitOffloadingArraysArgument(Builder, RTArgs, Info);
7853
7854 // Emit the number of elements in the offloading arrays.
7855 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7856
7857 // Source location for the ident struct
7858 if (!SrcLocInfo) {
7859 uint32_t SrcLocStrSize;
7860 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7861 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7862 }
7863
7864 SmallVector<llvm::Value *, 13> OffloadingArgs = {
7865 SrcLocInfo, DeviceID,
7866 PointerNum, RTArgs.BasePointersArray,
7867 RTArgs.PointersArray, RTArgs.SizesArray,
7868 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7869 RTArgs.MappersArray};
7870
7871 if (IsStandAlone) {
7872 assert(MapperFunc && "MapperFunc missing for standalone target data");
7873
7874 auto TaskBodyCB = [&](Value *, Value *,
7875 IRBuilderBase::InsertPoint) -> Error {
7876 if (Info.HasNoWait) {
7877 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
7878 llvm::Constant::getNullValue(VoidPtr),
7879 llvm::Constant::getNullValue(Int32),
7880 llvm::Constant::getNullValue(VoidPtr)});
7881 }
7882
7883 createRuntimeFunctionCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
7884 OffloadingArgs);
7885
7886 if (Info.HasNoWait) {
7887 BasicBlock *OffloadContBlock =
7888 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
7889 Function *CurFn = Builder.GetInsertBlock()->getParent();
7890 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
7891 Builder.restoreIP(Builder.saveIP());
7892 }
7893 return Error::success();
7894 };
7895
7896 bool RequiresOuterTargetTask = Info.HasNoWait;
7897 if (!RequiresOuterTargetTask)
7898 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
7899 /*TargetTaskAllocaIP=*/{}));
7900 else
7901 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
7902 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
7903 } else {
7904 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
7905 omp::OMPRTL___tgt_target_data_begin_mapper);
7906
7907 createRuntimeFunctionCall(BeginMapperFunc, OffloadingArgs);
7908
7909 for (auto DeviceMap : Info.DevicePtrInfoMap) {
7910 if (isa<AllocaInst>(DeviceMap.second.second)) {
7911 auto *LI =
7912 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
7913 Builder.CreateStore(LI, DeviceMap.second.second);
7914 }
7915 }
7916
7917 // If device pointer privatization is required, emit the body of the
7918 // region here. It will have to be duplicated: with and without
7919 // privatization.
7920 InsertPointOrErrorTy AfterIP =
7921 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
7922 if (!AfterIP)
7923 return AfterIP.takeError();
7924 Builder.restoreIP(*AfterIP);
7925 }
7926 return Error::success();
7927 };
7928
7929 // If we need device pointer privatization, we need to emit the body of the
7930 // region with no privatization in the 'else' branch of the conditional.
7931 // Otherwise, we don't have to do anything.
7932 auto BeginElseGen = [&](InsertPointTy AllocaIP,
7933 InsertPointTy CodeGenIP) -> Error {
7934 InsertPointOrErrorTy AfterIP =
7935 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
7936 if (!AfterIP)
7937 return AfterIP.takeError();
7938 Builder.restoreIP(*AfterIP);
7939 return Error::success();
7940 };
7941
7942 // Generate code for the closing of the data region.
7943 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7944 TargetDataRTArgs RTArgs;
7945 Info.EmitDebug = !MapInfo->Names.empty();
7946 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
7947
7948 // Emit the number of elements in the offloading arrays.
7949 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7950
7951 // Source location for the ident struct
7952 if (!SrcLocInfo) {
7953 uint32_t SrcLocStrSize;
7954 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7955 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7956 }
7957
7958 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
7959 PointerNum, RTArgs.BasePointersArray,
7960 RTArgs.PointersArray, RTArgs.SizesArray,
7961 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7962 RTArgs.MappersArray};
7963 Function *EndMapperFunc =
7964 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
7965
7966 createRuntimeFunctionCall(EndMapperFunc, OffloadingArgs);
7967 return Error::success();
7968 };
7969
7970 // We don't have to do anything to close the region if the if clause evaluates
7971 // to false.
7972 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7973 return Error::success();
7974 };
7975
7976 Error Err = [&]() -> Error {
7977 if (BodyGenCB) {
7978 Error Err = [&]() {
7979 if (IfCond)
7980 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
7981 return BeginThenGen(AllocaIP, Builder.saveIP());
7982 }();
7983
7984 if (Err)
7985 return Err;
7986
7987 // If we don't require privatization of device pointers, we emit the body
7988 // in between the runtime calls. This avoids duplicating the body code.
7989 InsertPointOrErrorTy AfterIP =
7990 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7991 if (!AfterIP)
7992 return AfterIP.takeError();
7993 restoreIPandDebugLoc(Builder, *AfterIP);
7994
7995 if (IfCond)
7996 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
7997 return EndThenGen(AllocaIP, Builder.saveIP());
7998 }
7999 if (IfCond)
8000 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
8001 return BeginThenGen(AllocaIP, Builder.saveIP());
8002 }();
8003
8004 if (Err)
8005 return Err;
8006
8007 return Builder.saveIP();
8008}
8009
8010FunctionCallee
8011OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool IVSigned,
8012 bool IsGPUDistribute) {
8013 assert((IVSize == 32 || IVSize == 64) &&
8014 "IV size is not compatible with the omp runtime");
8015 RuntimeFunction Name;
8016 if (IsGPUDistribute)
8017 Name = IVSize == 32
8018 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
8019 : omp::OMPRTL___kmpc_distribute_static_init_4u)
8020 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
8021 : omp::OMPRTL___kmpc_distribute_static_init_8u);
8022 else
8023 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
8024 : omp::OMPRTL___kmpc_for_static_init_4u)
8025 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
8026 : omp::OMPRTL___kmpc_for_static_init_8u);
8027
8028 return getOrCreateRuntimeFunction(M, Name);
8029}
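// The selection above is a four-way table; e.g. IVSize == 32 with
// IVSigned == true yields __kmpc_for_static_init_4 (or
// __kmpc_distribute_static_init_4 when IsGPUDistribute), while IVSize == 64
// with IVSigned == false yields __kmpc_for_static_init_8u.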
8030
8031FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
8032 bool IVSigned) {
8033 assert((IVSize == 32 || IVSize == 64) &&
8034 "IV size is not compatible with the omp runtime");
8035 RuntimeFunction Name = IVSize == 32
8036 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
8037 : omp::OMPRTL___kmpc_dispatch_init_4u)
8038 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
8039 : omp::OMPRTL___kmpc_dispatch_init_8u);
8040
8041 return getOrCreateRuntimeFunction(M, Name);
8042}
8043
8044FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
8045 bool IVSigned) {
8046 assert((IVSize == 32 || IVSize == 64) &&
8047 "IV size is not compatible with the omp runtime");
8048 RuntimeFunction Name = IVSize == 32
8049 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
8050 : omp::OMPRTL___kmpc_dispatch_next_4u)
8051 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
8052 : omp::OMPRTL___kmpc_dispatch_next_8u);
8053
8054 return getOrCreateRuntimeFunction(M, Name);
8055}
8056
8057FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
8058 bool IVSigned) {
8059 assert((IVSize == 32 || IVSize == 64) &&
8060 "IV size is not compatible with the omp runtime");
8061 RuntimeFunction Name = IVSize == 32
8062 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
8063 : omp::OMPRTL___kmpc_dispatch_fini_4u)
8064 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
8065 : omp::OMPRTL___kmpc_dispatch_fini_8u);
8066
8067 return getOrCreateRuntimeFunction(M, Name);
8068}
8069
8070FunctionCallee OpenMPIRBuilder::createDispatchDeinitFunction() {
8071 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
8072}
8073
8074static void FixupDebugInfoForOutlinedFunction(
8075 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
8076 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
8077
8078 DISubprogram *NewSP = Func->getSubprogram();
8079 if (!NewSP)
8080 return;
8081
8082 SmallDenseMap<DILocalVariable *, DILocalVariable *> RemappedVariables;
8083
8084 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
8085 DILocalVariable *&NewVar = RemappedVariables[OldVar];
8086 // Only use cached variable if the arg number matches. This is important
8087 // so that DIVariables created for privatized variables are not discarded.
8088 if (NewVar && (arg == NewVar->getArg()))
8089 return NewVar;
8090
8091 NewVar = llvm::DILocalVariable::get(
8092 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
8093 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
8094 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
8095 return NewVar;
8096 };
8097
8098 auto UpdateDebugRecord = [&](auto *DR) {
8099 DILocalVariable *OldVar = DR->getVariable();
8100 unsigned ArgNo = 0;
8101 for (auto Loc : DR->location_ops()) {
8102 auto Iter = ValueReplacementMap.find(Loc);
8103 if (Iter != ValueReplacementMap.end()) {
8104 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
8105 ArgNo = std::get<1>(Iter->second) + 1;
8106 }
8107 }
8108 if (ArgNo != 0)
8109 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
8110 };
8111
8112 // The location and scope of variable intrinsics and records still point to
8113 // the parent function of the target region. Update them.
8114 for (Instruction &I : instructions(Func)) {
8116 "Unexpected debug intrinsic");
8117 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
8118 UpdateDebugRecord(&DVR);
8119 }
8120 // An extra argument is passed to the device. Create the debug data for it.
8121 if (OMPBuilder.Config.isTargetDevice()) {
8122 DICompileUnit *CU = NewSP->getUnit();
8123 Module *M = Func->getParent();
8124 DIBuilder DB(*M, true, CU);
8125 DIType *VoidPtrTy =
8126 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
8127 DILocalVariable *Var = DB.createParameterVariable(
8128 NewSP, "dyn_ptr", /*ArgNo*/ 1, NewSP->getFile(), /*LineNo=*/0,
8129 VoidPtrTy, /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
8130 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
8131 DB.insertDeclare(&(*Func->arg_begin()), Var, DB.createExpression(), Loc,
8132 &(*Func->begin()));
8133 }
8134}
8135
8136static Value *removeASCastIfPresent(Value *V) {
8137 if (Operator::getOpcode(V) == Instruction::AddrSpaceCast)
8138 return cast<Operator>(V)->getOperand(0);
8139 return V;
8140}
8141
8142static Expected<Function *> createOutlinedFunction(
8143 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8144 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
8145 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
8146 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
8147 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
8148 SmallVector<Type *> ParameterTypes;
8149 if (OMPBuilder.Config.isTargetDevice()) {
8150 // Add the "implicit" runtime argument we use to provide launch specific
8151 // information for target devices.
8152 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
8153 ParameterTypes.push_back(Int8PtrTy);
8154
8155 // All parameters to target devices are passed as pointers
8156 // or i64. This assumes 64-bit address spaces/pointers.
8157 for (auto &Arg : Inputs)
8158 ParameterTypes.push_back(Arg->getType()->isPointerTy()
8159 ? Arg->getType()
8160 : Type::getInt64Ty(Builder.getContext()));
8161 } else {
8162 for (auto &Arg : Inputs)
8163 ParameterTypes.push_back(Arg->getType());
8164 }
8165
8166 auto BB = Builder.GetInsertBlock();
8167 auto M = BB->getModule();
8168 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
8169 /*isVarArg*/ false);
8170 auto Func =
8171 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
8172
8173 // Forward target-cpu and target-features function attributes from the
8174 // original function to the new outlined function.
8175 Function *ParentFn = Builder.GetInsertBlock()->getParent();
8176
8177 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
8178 if (TargetCpuAttr.isStringAttribute())
8179 Func->addFnAttr(TargetCpuAttr);
8180
8181 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
8182 if (TargetFeaturesAttr.isStringAttribute())
8183 Func->addFnAttr(TargetFeaturesAttr);
8184
8185 if (OMPBuilder.Config.isTargetDevice()) {
8186 Value *ExecMode =
8187 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
8188 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
8189 }
8190
8191 // Save insert point.
8192 IRBuilder<>::InsertPointGuard IPG(Builder);
8193 // We will generate the entries in the outlined function but the debug
8194 // location may still be pointing to the parent function. Reset it now.
8195 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
8196
8197 // Generate the region into the function.
8198 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
8199 Builder.SetInsertPoint(EntryBB);
8200
8201 // Insert target init call in the device compilation pass.
8202 if (OMPBuilder.Config.isTargetDevice())
8203 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
8204
8205 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
8206
8207 // As we embed the user code in the middle of our target region after we
8208 // generate entry code, we must move what allocas we can into the entry
8209 // block to avoid possibly breaking optimisations for the device.
8210 if (OMPBuilder.Config.isTargetDevice())
8211 OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
8212
8213 // Insert target deinit call in the device compilation pass.
8214 BasicBlock *OutlinedBodyBB =
8215 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
8216 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFunc(
8217 Builder.saveIP(),
8218 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
8219 if (!AfterIP)
8220 return AfterIP.takeError();
8221 Builder.restoreIP(*AfterIP);
8222 if (OMPBuilder.Config.isTargetDevice())
8223 OMPBuilder.createTargetDeinit(Builder);
8224
8225 // Insert return instruction.
8226 Builder.CreateRetVoid();
8227
8228 // New Alloca IP at entry point of created device function.
8229 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
8230 auto AllocaIP = Builder.saveIP();
8231
8232 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
8233
8234 // Skip the artificial dyn_ptr on the device.
8235 const auto &ArgRange =
8236 OMPBuilder.Config.isTargetDevice()
8237 ? make_range(Func->arg_begin() + 1, Func->arg_end())
8238 : Func->args();
8239
8240 DenseMap<Value *, std::tuple<Value *, unsigned>> ValueReplacementMap;
8241
8242 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
8243 // Things like GEP's can come in the form of Constants. Constants and
8244 // ConstantExpr's do not have access to the knowledge of what they're
8245 // contained in, so we must dig a little to find an instruction so we
8246 // can tell if they're used inside of the function we're outlining. We
8247 // also replace the original constant expression with a new instruction
8248 // equivalent; an instruction as it allows easy modification in the
8249 // following loop, as we can now know the constant (instruction) is
8250 // owned by our target function and replaceUsesOfWith can now be invoked
8251 // on it (cannot do this with constants it seems). A brand new one also
8252 // allows us to be cautious as it is perhaps possible the old expression
8253 // was used inside of the function but exists and is used externally
8254 // (unlikely by the nature of a Constant, but still).
8255 // NOTE: We cannot remove dead constants that have been rewritten to
8256 // instructions at this stage, we run the risk of breaking later lowering
8257 // by doing so as we could still be in the process of lowering the module
8258 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
8259 // constants we have created rewritten versions of.
8260 if (auto *Const = dyn_cast<Constant>(Input))
8261 convertUsersOfConstantsToInstructions(Const, Func, false);
8262
8263 // Collect users before iterating over them to avoid invalidating the
8264 // iteration in case a user uses Input more than once (e.g. a call
8265 // instruction).
8266 SetVector<User *> Users(Input->users().begin(), Input->users().end());
8267 // Collect all the instructions
8268 for (User *User : make_early_inc_range(Users))
8269 if (auto *Instr = dyn_cast<Instruction>(User))
8270 if (Instr->getFunction() == Func)
8271 Instr->replaceUsesOfWith(Input, InputCopy);
8272 };
8273
8274 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
8275
8276 // Rewrite uses of input values to parameters.
8277 for (auto InArg : zip(Inputs, ArgRange)) {
8278 Value *Input = std::get<0>(InArg);
8279 Argument &Arg = std::get<1>(InArg);
8280 Value *InputCopy = nullptr;
8281
8282 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
8283 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
8284 if (!AfterIP)
8285 return AfterIP.takeError();
8286 Builder.restoreIP(*AfterIP);
8287 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
8288
8289 // In certain cases a Global may be set up for replacement, however, this
8290 // Global may be used in multiple arguments to the kernel, just segmented
8291 // apart, for example, if we have a global array, that is sectioned into
8292 // multiple mappings (technically not legal in OpenMP, but there is a case
8293 // in Fortran for Common Blocks where this is necessary), we will end up
8294 // with GEP's into this array inside the kernel, that refer to the Global
8295 // but are technically separate arguments to the kernel for all intents and
8296 // purposes. If we have mapped a segment that requires a GEP into the 0-th
8297 // index, it will fold into a referral to the Global; if we then encounter
8298 // this folded GEP during replacement, all of the references to the
8299 // Global in the kernel will be replaced with the argument we have generated
8300 // that corresponds to it, including any other GEP's that refer to the
8301 // Global that may be other arguments. This will invalidate all of the other
8302 // preceding mapped arguments that refer to the same global that may be
8303 // separate segments. To prevent this, we defer global processing until all
8304 // other processing has been performed.
8305 if (llvm::isa<llvm::GlobalValue>(removeASCastIfPresent(Input)) ||
8306 llvm::isa<llvm::GlobalObject>(removeASCastIfPresent(Input))) {
8307 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
8308 continue;
8309 }
8310
8311 if (isa<ConstantData>(Input))
8312 continue;
8313
8314 ReplaceValue(Input, InputCopy, Func);
8315 }
8316
8317 // Replace all of our deferred Input values, currently just Globals.
8318 for (auto Deferred : DeferredReplacement)
8319 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
8320
8321 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
8322 ValueReplacementMap);
8323 return Func;
8324}
8325/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
8326/// of pointers containing shared data between the parent task and the created
8327/// task.
8328static LoadInst *loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder,
8329 IRBuilderBase &Builder,
8330 Value *TaskWithPrivates,
8331 Type *TaskWithPrivatesTy) {
8332
8333 Type *TaskTy = OMPIRBuilder.Task;
8334 LLVMContext &Ctx = Builder.getContext();
8335 Value *TaskT =
8336 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
8337 Value *Shareds = TaskT;
8338 // TaskWithPrivatesTy can be one of the following
8339 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
8340 // %struct.privates }
8341 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
8342 //
8343 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
8344 // its first member has to be the task descriptor. TaskTy is the type of the
8345 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
8346 // first member of TaskT gives us the pointer to shared data.
8347 if (TaskWithPrivatesTy != TaskTy)
8348 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
8349 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
8350}
8351/// Create an entry point for a target task. It'll have the following
8352/// signature:
8353/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
8354/// This function is called from emitTargetTask once the
8355/// code to launch the target kernel has been outlined already.
8356/// NumOffloadingArrays is the number of offloading arrays that we need to copy
8357/// into the task structure so that the deferred target task can access this
8358/// data even after the stack frame of the generating task has been rolled
8359/// back. Offloading arrays contain base pointers, pointers, sizes etc
8360/// of the data that the target kernel will access. These in effect are the
8361/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
8363 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
8364 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
8365 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
8366
8367 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
8368 // This is because PrivatesTy is the type of the structure in which
8369 // we pass the offloading arrays to the deferred target task.
8370 assert((!NumOffloadingArrays || PrivatesTy) &&
8371 "PrivatesTy cannot be nullptr when there are offloadingArrays"
8372 "to privatize");
8373
8374 Module &M = OMPBuilder.M;
8375 // KernelLaunchFunction is the target launch function, i.e.
8376 // the function that sets up kernel arguments and calls
8377 // __tgt_target_kernel to launch the kernel on the device.
8378 //
8379 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
8380
8381 // StaleCI is the CallInst which is the call to the outlined
8382 // target kernel launch function. If there are local live-in values
8383 // that the outlined function uses then these are aggregated into a structure
8384 // which is passed as the second argument. If there are no local live-in
8385 // values or if all values used by the outlined kernel are global variables,
8386 // then there's only one argument, the threadID. So, StaleCI can be
8387 //
8388 // %structArg = alloca { ptr, ptr }, align 8
8389 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
8390 // store ptr %20, ptr %gep_, align 8
8391 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
8392 // store ptr %21, ptr %gep_8, align 8
8393 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
8394 //
8395 // OR
8396 //
8397 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
8398 OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
8399 StaleCI->getIterator());
8400
8401 LLVMContext &Ctx = StaleCI->getParent()->getContext();
8402
8403 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
8404 Type *TaskPtrTy = OMPBuilder.TaskPtr;
8405 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
8406
8407 auto ProxyFnTy =
8408 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
8409 /* isVarArg */ false);
8410 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
8411 ".omp_target_task_proxy_func",
8412 Builder.GetInsertBlock()->getModule());
8413 Value *ThreadId = ProxyFn->getArg(0);
8414 Value *TaskWithPrivates = ProxyFn->getArg(1);
8415 ThreadId->setName("thread.id");
8416 TaskWithPrivates->setName("task");
8417
8418 bool HasShareds = SharedArgsOperandNo > 0;
8419 bool HasOffloadingArrays = NumOffloadingArrays > 0;
8420 BasicBlock *EntryBB =
8421 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
8422 Builder.SetInsertPoint(EntryBB);
8423
8424 SmallVector<Value *> KernelLaunchArgs;
8425 KernelLaunchArgs.reserve(StaleCI->arg_size());
8426 KernelLaunchArgs.push_back(ThreadId);
8427
8428 if (HasOffloadingArrays) {
8429 assert(TaskTy != TaskWithPrivatesTy &&
8430 "If there are offloading arrays to pass to the target"
8431 "TaskTy cannot be the same as TaskWithPrivatesTy");
8432 (void)TaskTy;
8433 Value *Privates =
8434 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
8435 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
8436 KernelLaunchArgs.push_back(
8437 Builder.CreateStructGEP(PrivatesTy, Privates, i));
8438 }
8439
8440 if (HasShareds) {
8441 auto *ArgStructAlloca =
8442 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
8443 assert(ArgStructAlloca &&
8444 "Unable to find the alloca instruction corresponding to arguments "
8445 "for extracted function");
8446 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
8447
8448 AllocaInst *NewArgStructAlloca =
8449 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
8450
8451 Value *SharedsSize =
8452 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
8453
8454 LoadInst *LoadShared = loadSharedDataFromTaskDescriptor(
8455 OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
8456
8457 Builder.CreateMemCpy(
8458 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
8459 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
8460 KernelLaunchArgs.push_back(NewArgStructAlloca);
8461 }
8462 OMPBuilder.createRuntimeFunctionCall(KernelLaunchFunction, KernelLaunchArgs);
8463 Builder.CreateRetVoid();
8464 return ProxyFn;
8465}
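// A sketch of the proxy emitted when both shareds and offloading arrays are
// present (types and value names are illustrative):
//
//   define internal void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task) {
//   entry:
//     %privates = getelementptr %struct.task_with_privates, ptr %task, i32 0, i32 1
//     ; a GEP to each privatized offloading array is appended to the launch args
//     %structArg = alloca { ptr, ptr }, align 8
//     %shareds = load ptr, ptr %task                ; via the task descriptor
//     call void @llvm.memcpy.p0.p0.i64(ptr %structArg, ptr %shareds, i64 16, i1 false)
//     call void @kernel_launch_function(i32 %thread.id, ptr %baseptrs, ..., ptr %structArg)
//     ret void
//   }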
8466static Type *getOffloadingArrayType(Value *V) {
8467
8468 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
8469 return GEP->getSourceElementType();
8470 if (auto *Alloca = dyn_cast<AllocaInst>(V))
8471 return Alloca->getAllocatedType();
8472
8473 llvm_unreachable("Unhandled Instruction type");
8474 return nullptr;
8475}
8476// This function returns a struct that has at most two members.
8477// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
8478// descriptor. The second member, if needed, is a struct containing arrays
8479// that need to be passed to the offloaded target kernel. For example,
8480// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
8481// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
8482// respectively, then the types created by this function are
8483//
8484// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
8485// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
8486// %struct.privates }
8487// %struct.task_with_privates is returned by this function.
8488// If there aren't any offloading arrays to pass to the target kernel,
8489// %struct.kmp_task_ompbuilder_t is returned.
8490static StructType *
8491createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder,
8492 ArrayRef<Value *> OffloadingArraysToPrivatize) {
8493
8494 if (OffloadingArraysToPrivatize.empty())
8495 return OMPIRBuilder.Task;
8496
8497 SmallVector<Type *, 4> StructFieldTypes;
8498 for (Value *V : OffloadingArraysToPrivatize) {
8499 assert(V->getType()->isPointerTy() &&
8500 "Expected pointer to array to privatize. Got a non-pointer value "
8501 "instead");
8502 Type *ArrayTy = getOffloadingArrayType(V);
8503 assert(ArrayTy && "ArrayType cannot be nullptr");
8504 StructFieldTypes.push_back(ArrayTy);
8505 }
8506 StructType *PrivatesStructTy =
8507 StructType::create(StructFieldTypes, "struct.privates");
8508 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
8509 "struct.task_with_privates");
8510}
8511static Error emitTargetOutlinedFunction(
8512 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
8513 TargetRegionEntryInfo &EntryInfo,
8514 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
8515 Function *&OutlinedFn, Constant *&OutlinedFnID,
8516 SmallVectorImpl<Value *> &Inputs,
8517 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
8518 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
8519
8520 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
8521 [&](StringRef EntryFnName) {
8522 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
8523 EntryFnName, Inputs, CBFunc,
8524 ArgAccessorFuncCB);
8525 };
8526
8527 return OMPBuilder.emitTargetRegionFunction(
8528 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
8529 OutlinedFnID);
8530}
8531
8532OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
8533 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
8534 OpenMPIRBuilder::InsertPointTy AllocaIP,
8535 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
8536 const TargetDataRTArgs &RTArgs, bool HasNoWait) {
8537
8538 // The following explains the code-gen scenario for the `target` directive. A
8539 // similar scenario is followed for other device-related directives (e.g.
8540 // `target enter data`), but in a simpler fashion, since we only need to emit
8541 // a task that encapsulates the proper runtime call.
8542 //
8543 // When we arrive at this function, the target region itself has been
8544 // outlined into the function OutlinedFn.
8545 // So at this point, for
8546 // --------------------------------------------------------------
8547 // void user_code_that_offloads(...) {
8548 // omp target depend(..) map(from:a) map(to:b) private(i)
8549 // do i = 1, 10
8550 // a(i) = b(i) + n
8551 // }
8552 //
8553 // --------------------------------------------------------------
8554 //
8555 // we have
8556 //
8557 // --------------------------------------------------------------
8558 //
8559 // void user_code_that_offloads(...) {
8560 // %.offload_baseptrs = alloca [2 x ptr], align 8
8561 // %.offload_ptrs = alloca [2 x ptr], align 8
8562 // %.offload_mappers = alloca [2 x ptr], align 8
8563 // ;; target region has been outlined and now we need to
8564 // ;; offload to it via a target task.
8565 // }
8566 // void outlined_device_function(ptr a, ptr b, ptr n_ptr) {
8567 // n = *n_ptr;
8568 // do i = 1, 10
8569 // a(i) = b(i) + n
8570 // }
8571 //
8572 // We now have to do the following:
8573 // (i) Make an offloading call to outlined_device_function using the OpenMP
8574 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
8575 // emitted by emitKernelLaunch.
8576 // (ii) Create a task entry point function that calls kernel_launch_function
8577 // and is the entry point for the target task. See
8578 // '@.omp_target_task_proxy_func' in the pseudocode below.
8579 // (iii) Create a task with the task entry point created in (ii).
8580 //
8581 // That is, we create the following:
8582 // struct task_with_privates {
8583 // struct kmp_task_ompbuilder_t task_struct;
8584 // struct privates {
8585 // [2 x ptr] ; baseptrs
8586 // [2 x ptr] ; ptrs
8587 // [2 x i64] ; sizes
8588 // }
8589 // }
8590 // void user_code_that_offloads(...) {
8591 // %.offload_baseptrs = alloca [2 x ptr], align 8
8592 // %.offload_ptrs = alloca [2 x ptr], align 8
8593 // %.offload_sizes = alloca [2 x i64], align 8
8594 //
8595 // %structArg = alloca { ptr, ptr, ptr }, align 8
8596 // %structArg[0] = a
8597 // %structArg[1] = b
8598 // %structArg[2] = &n
8599 //
8600 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
8601 // sizeof(kmp_task_ompbuilder_t),
8602 // sizeof(structArg),
8603 // @.omp_target_task_proxy_func,
8604 // ...)
8605 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
8606 // sizeof(structArg))
8607 // memcpy(target_task_with_privates->privates->baseptrs,
8608 // offload_baseptrs, sizeof(offload_baseptrs))
8609 // memcpy(target_task_with_privates->privates->ptrs,
8610 // offload_ptrs, sizeof(offload_ptrs))
8611 // memcpy(target_task_with_privates->privates->sizes,
8612 // offload_sizes, sizeof(offload_sizes))
8613 // dependencies_array = ...
8614 // ;; if nowait not present
8615 // call @__kmpc_omp_wait_deps(..., dependencies_array)
8616 // call @__kmpc_omp_task_begin_if0(...)
8617 // call @.omp_target_task_proxy_func(i32 thread_id, ptr
8618 // %target_task_with_privates)
8619 // call @__kmpc_omp_task_complete_if0(...)
8620 // }
8621 //
8622 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
8623 // ptr %task) {
8624 // %structArg = alloca {ptr, ptr, ptr}
8625 // %task_ptr = getelementptr(%task, 0, 0)
8626 // %shared_data = load (getelementptr %task_ptr, 0, 0)
8627 // memcpy(%structArg, %shared_data, sizeof(%structArg))
8628 //
8629 // %offloading_arrays = getelementptr(%task, 0, 1)
8630 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
8631 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
8632 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
8633 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
8634 // %offload_sizes, %structArg)
8635 // }
8636 //
8637 // We need the proxy function because the signature of the task entry point
8638 // expected by kmpc_omp_task is always the same and will be different from
8639 // that of the kernel_launch function.
8640 //
8641 // kernel_launch_function is generated by emitKernelLaunch and has the
8642 // always_inline attribute. For this example, it'll look like so:
8643 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
8644 // %offload_sizes, %structArg) alwaysinline {
8645 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
8646 // ; load aggregated data from %structArg
8647 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
8648 // ; offload_sizes
8649 // call i32 @__tgt_target_kernel(...,
8650 // outlined_device_function,
8651 // ptr %kernel_args)
8652 // }
8653 // void outlined_device_function(ptr a, ptr b, ptr n_ptr) {
8654 // n = *n_ptr;
8655 // do i = 1, 10
8656 // a(i) = b(i) + n
8657 // }
8658 //
8659 BasicBlock *TargetTaskBodyBB =
8660 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
8661 BasicBlock *TargetTaskAllocaBB =
8662 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
8663
8664 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
8665 TargetTaskAllocaBB->begin());
8666 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
8667
8668 OutlineInfo OI;
8669 OI.EntryBB = TargetTaskAllocaBB;
8670 OI.OuterAllocaBB = AllocaIP.getBlock();
8671
8672 // Add the thread ID argument.
8673 SmallVector<Instruction *, 4> ToBeDeleted;
8674 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
8675 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
8676
8677 // Generate the task body which will subsequently be outlined.
8678 Builder.restoreIP(TargetTaskBodyIP);
8679 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
8680 return Err;
8681
8682 // The outliner (CodeExtractor) extracts a sequence or vector of blocks that
8683 // it is given. These blocks are enumerated by
8684 // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
8685 // to be outside the region. In other words, OI.ExitBlock is expected to be
8686 // the start of the region after the outlining. We used to set OI.ExitBlock
8687 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
8688 // except when the task body is a single basic block. In that case,
8689 // OI.ExitBlock is set to the single task body block and will get left out of
8690 // the outlining process. So, simply create a new empty block to which we
8691 // unconditionally branch from where TaskBodyCB left off.
8692 OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
8693 emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
8694 /*IsFinished=*/true);
8695
8696 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
8697 bool NeedsTargetTask = HasNoWait && DeviceID;
8698 if (NeedsTargetTask) {
8699 for (auto *V :
8700 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
8701 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
8702 RTArgs.SizesArray}) {
8703 if (V) {
8704 OffloadingArraysToPrivatize.push_back(V);
8705 OI.ExcludeArgsFromAggregate.push_back(V);
8706 }
8707 }
8708 }
8709 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
8710 DeviceID, OffloadingArraysToPrivatize](
8711 Function &OutlinedFn) mutable {
8712 assert(OutlinedFn.hasOneUse() &&
8713 "there must be a single user for the outlined function");
8714
8715 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
8716
8717 // The first argument of StaleCI is always the thread id.
8718 // The next few arguments are the pointers to offloading arrays
8719 // if any. (see OffloadingArraysToPrivatize)
8720 // Finally, all other local values that are live-in into the outlined region
8721 // end up in a structure whose pointer is passed as the last argument. This
8722 // piece of data is passed in the "shared" field of the task structure. So,
8723 // we know we have to pass shareds to the task if the number of arguments is
8724 // greater than OffloadingArraysToPrivatize.size() + 1. The 1 is for the
8725 // thread id. Further, for safety, we assert that the number of arguments of
8726 // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2.
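    // For instance, with three privatized offloading arrays and shareds
    // present, StaleCI would carry five arguments: the thread id, the three
    // array pointers, and a pointer to the shareds struct (operand index 4).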
8727 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
8728 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
8729 assert((!HasShareds ||
8730 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
8731 "Wrong number of arguments for StaleCI when shareds are present");
8732 int SharedArgOperandNo =
8733 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
8734
8735 StructType *TaskWithPrivatesTy =
8736 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
8737 StructType *PrivatesTy = nullptr;
8738
8739 if (!OffloadingArraysToPrivatize.empty())
8740 PrivatesTy =
8741 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
8742
8743 Function *ProxyFn = emitTargetTaskProxyFunction(
8744 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
8745 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
8746
8747 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
8748 << "\n");
8749
8750 Builder.SetInsertPoint(StaleCI);
8751
8752 // Gather the arguments for emitting the runtime call.
8753 uint32_t SrcLocStrSize;
8754 Constant *SrcLocStr =
8755 getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
8756 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8757
8758 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
8759 //
8760 // If `NeedsTargetTask == true`, we call @__kmpc_omp_target_task_alloc, both
8761 // to provide the DeviceID to the deferred task and because
8762 // @__kmpc_omp_target_task_alloc creates an untied/async task.
8763 Function *TaskAllocFn =
8764 !NeedsTargetTask
8765 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
8766 : getOrCreateRuntimeFunctionPtr(
8767 OMPRTL___kmpc_omp_target_task_alloc);
8768
8769 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the task allocation
8770 // call.
8771 Value *ThreadID = getOrCreateThreadID(Ident);
8772
8773 // Argument - `sizeof_kmp_task_t` (TaskSize)
8774 // TaskSize refers to the size in bytes of the kmp_task_t data structure
8775 // plus any other data to be passed to the target task, if any, which
8776 // is packed into a struct. kmp_task_t and the struct so created are
8777 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
8778 Value *TaskSize = Builder.getInt64(
8779 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
8780
8781 // Argument - `sizeof_shareds` (SharedsSize)
8782 // SharedsSize refers to the shareds array size in the kmp_task_t data
8783 // structure.
8784 Value *SharedsSize = Builder.getInt64(0);
8785 if (HasShareds) {
8786 auto *ArgStructAlloca =
8787 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
8788 assert(ArgStructAlloca &&
8789 "Unable to find the alloca instruction corresponding to arguments "
8790 "for extracted function");
8791 auto *ArgStructType =
8792 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
8793 assert(ArgStructType && "Unable to find struct type corresponding to "
8794 "arguments for extracted function");
8795 SharedsSize =
8796 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
8797 }
8798
8799 // Argument - `flags`
8800 // Task is tied iff (Flags & 1) == 1.
8801 // Task is untied iff (Flags & 1) == 0.
8802 // Task is final iff (Flags & 2) == 2.
8803 // Task is not final iff (Flags & 2) == 0.
8804 // A target task is not final and is untied.
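    // For contrast, Flags == 3 would describe a tied, final task; here we
    // always pass 0.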
8805 Value *Flags = Builder.getInt32(0);
8806
8807 // Emit the @__kmpc_omp_task_alloc runtime call
8808 // The runtime call returns a pointer to an area where the task captured
8809 // variables must be copied before the task is run (TaskData)
8810 CallInst *TaskData = nullptr;
8811
8812 SmallVector<llvm::Value *> TaskAllocArgs = {
8813 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
8814 /*flags=*/Flags,
8815 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
8816 /*task_func=*/ProxyFn};
8817
8818 if (NeedsTargetTask) {
8819 assert(DeviceID && "Expected non-empty device ID.");
8820 TaskAllocArgs.push_back(DeviceID);
8821 }
8822
8823 TaskData = createRuntimeFunctionCall(TaskAllocFn, TaskAllocArgs);
8824
8825 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
8826 if (HasShareds) {
8827 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
8828 Value *TaskShareds = loadSharedDataFromTaskDescriptor(
8829 *this, Builder, TaskData, TaskWithPrivatesTy);
8830 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
8831 SharedsSize);
8832 }
8833 if (!OffloadingArraysToPrivatize.empty()) {
8834 Value *Privates =
8835 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
8836 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
8837 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
8838 [[maybe_unused]] Type *ArrayType =
8839 getOffloadingArrayType(PtrToPrivatize);
8840 assert(ArrayType && "ArrayType cannot be nullptr");
8841
8842 Type *ElementType = PrivatesTy->getElementType(i);
8843 assert(ElementType == ArrayType &&
8844 "ElementType should match ArrayType");
8845 (void)ArrayType;
8846
8847 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
8848 Builder.CreateMemCpy(
8849 Dst, Alignment, PtrToPrivatize, Alignment,
8850 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
8851 }
8852 }
8853
8854 Value *DepArray = emitTaskDependencies(*this, Dependencies);
8855
8856 // ---------------------------------------------------------------
8857 // V5.2 13.8 target construct
8858 // If the nowait clause is present, execution of the target task
8859 // may be deferred. If the nowait clause is not present, the target task is
8860 // an included task.
8861 // ---------------------------------------------------------------
8862 // The above means that the lack of a nowait on the target construct
8863 // translates to '#pragma omp task if(0)'
8864 if (!NeedsTargetTask) {
8865 if (DepArray) {
8866 Function *TaskWaitFn =
8867 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
8868 createRuntimeFunctionCall(
8869 TaskWaitFn,
8870 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
8871 /*ndeps=*/Builder.getInt32(Dependencies.size()),
8872 /*dep_list=*/DepArray,
8873 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
8874 /*noalias_dep_list=*/
8875 ConstantPointerNull::get(Builder.getPtrTy())});
8876 }
8877 // Included task.
8878 Function *TaskBeginFn =
8879 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
8880 Function *TaskCompleteFn =
8881 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
8882 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
8883 CallInst *CI = createRuntimeFunctionCall(ProxyFn, {ThreadID, TaskData});
8884 CI->setDebugLoc(StaleCI->getDebugLoc());
8885 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
8886 } else if (DepArray) {
8887 // HasNoWait - meaning the task may be deferred. Call
8888 // __kmpc_omp_task_with_deps if there are dependencies,
8889 // else call __kmpc_omp_task
8890 Function *TaskFn =
8891 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
8892 createRuntimeFunctionCall(
8893 TaskFn,
8894 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
8895 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
8896 ConstantPointerNull::get(Builder.getPtrTy())});
8897 } else {
8898 // Emit the @__kmpc_omp_task runtime call to spawn the task
8899 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
8900 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
8901 }
8902
8903 StaleCI->eraseFromParent();
8904 for (Instruction *I : llvm::reverse(ToBeDeleted))
8905 I->eraseFromParent();
8906 };
8907 addOutlineInfo(std::move(OI));
8908
8909 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
8910 << *(Builder.GetInsertBlock()) << "\n");
8911 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
8912 << *(Builder.GetInsertBlock()->getParent()->getParent())
8913 << "\n");
8914 return Builder.saveIP();
8915}
8916
8917Error OpenMPIRBuilder::emitOffloadingArraysAndArgs(
8918 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
8919 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
8920 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
8921 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8922 if (Error Err =
8923 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
8924 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
8925 return Err;
8926 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
8927 return Error::success();
8928}
8929
8930static void emitTargetCall(
8931 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8932 OpenMPIRBuilder::InsertPointTy AllocaIP,
8933 OpenMPIRBuilder::TargetDataInfo &Info,
8934 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
8935 const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs,
8936 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
8937 SmallVectorImpl<Value *> &Args,
8938 OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
8939 OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB,
8940 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
8941 bool HasNoWait, Value *DynCGroupMem,
8942 OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
8943 // Generate a function call to the host fallback implementation of the target
8944 // region. This is called by the host when no offload entry was generated for
8945 // the target region and when the offloading call fails at runtime.
8946 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
8947 -> OpenMPIRBuilder::InsertPointOrErrorTy {
8948 Builder.restoreIP(IP);
8949 OMPBuilder.createRuntimeFunctionCall(OutlinedFn, Args);
8950 return Builder.saveIP();
8951 };
8952
8953 bool HasDependencies = Dependencies.size() > 0;
8954 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
8955
8956 OpenMPIRBuilder::TargetKernelArgs KArgs;
8957
8958 auto TaskBodyCB =
8959 [&](Value *DeviceID, Value *RTLoc,
8960 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
8961 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8962 // produce any.
8963 llvm::OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8964 // emitKernelLaunch makes the necessary runtime call to offload the
8965 // kernel. We then outline all that code into a separate function
8966 // ('kernel_launch_function' in the pseudo code above). This function is
8967 // then called by the target task proxy function (see
8968 // '@.omp_target_task_proxy_func' in the pseudo code above)
8969 // "@.omp_target_task_proxy_func' is generated by
8970 // emitTargetTaskProxyFunction.
8971 if (OutlinedFnID && DeviceID)
8972 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8973 EmitTargetCallFallbackCB, KArgs,
8974 DeviceID, RTLoc, TargetTaskAllocaIP);
8975
8976 // We only need to do the outlining if `DeviceID` is set to avoid calling
8977 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
8978 // generating the `else` branch of an `if` clause.
8979 //
8980 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
8981 // In this case, we execute the host implementation directly.
8982 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
8983 }());
8984
8985 OMPBuilder.Builder.restoreIP(AfterIP);
8986 return Error::success();
8987 };
8988
8989 auto &&EmitTargetCallElse =
8990 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8991 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
8992 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8993 // produce any.
8994 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8995 if (RequiresOuterTargetTask) {
8996 // Arguments that are intended to be directly forwarded to an
8997 // emitKernelLaunch call are passed as nullptr, since
8998 // OutlinedFnID=nullptr results in that call not being done.
8999 OpenMPIRBuilder::TargetDataRTArgs EmptyRTArgs;
9000 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
9001 /*RTLoc=*/nullptr, AllocaIP,
9002 Dependencies, EmptyRTArgs, HasNoWait);
9003 }
9004 return EmitTargetCallFallbackCB(Builder.saveIP());
9005 }());
9006
9007 Builder.restoreIP(AfterIP);
9008 return Error::success();
9009 };
9010
9011 auto &&EmitTargetCallThen =
9012 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
9013 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
9014 Info.HasNoWait = HasNoWait;
9015 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
9016 OpenMPIRBuilder::TargetDataRTArgs RTArgs;
9017 if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
9018 AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
9019 /*IsNonContiguous=*/true,
9020 /*ForEndCall=*/false))
9021 return Err;
9022
9023 SmallVector<Value *, 3> NumTeamsC;
9024 for (auto [DefaultVal, RuntimeVal] :
9025 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
9026 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
9027 : Builder.getInt32(DefaultVal));
9028
9029 // Calculate number of threads: 0 if no clauses specified, otherwise it is
9030 // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
9031 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
9032 if (Clause)
9033 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
9034 /*isSigned=*/false);
9035 return Clause;
9036 };
9037 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
9038 if (Clause)
9039 Result =
9040 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
9041 Result, Clause)
9042 : Clause;
9043 };
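    // For example, with thread_limit(8) and num_threads(4) both present, the
    // select emitted below yields 4; if only one clause is present, its value
    // is used unchanged.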
9044
9045 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
9046 // the NUM_THREADS clause is overridden by THREAD_LIMIT.
9047 SmallVector<Value *, 3> NumThreadsC;
9048 Value *MaxThreadsClause =
9049 RuntimeAttrs.TeamsThreadLimit.size() == 1
9050 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
9051 : nullptr;
9052
9053 for (auto [TeamsVal, TargetVal] : zip_equal(
9054 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
9055 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
9056 Value *NumThreads = InitMaxThreadsClause(TargetVal);
9057
9058 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
9059 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
9060
9061 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
9062 }
9063
9064 unsigned NumTargetItems = Info.NumberOfPtrs;
9065 uint32_t SrcLocStrSize;
9066 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
9067 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
9068 llvm::omp::IdentFlag(0), 0);
9069
9070 Value *TripCount = RuntimeAttrs.LoopTripCount
9071 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
9072 Builder.getInt64Ty(),
9073 /*isSigned=*/false)
9074 : Builder.getInt64(0);
9075
9076 // Request zero groupprivate bytes by default.
9077 if (!DynCGroupMem)
9078 DynCGroupMem = Builder.getInt32(0);
9079
9080 KArgs = OpenMPIRBuilder::TargetKernelArgs(
9081 NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem,
9082 HasNoWait, DynCGroupMemFallback);
9083
9084 // Assume no error was returned because TaskBodyCB and
9085 // EmitTargetCallFallbackCB don't produce any.
9086 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
9087 // The presence of certain clauses on the target directive require the
9088 // explicit generation of the target task.
9089 if (RequiresOuterTargetTask)
9090 return OMPBuilder.emitTargetTask(TaskBodyCB, RuntimeAttrs.DeviceID,
9091 RTLoc, AllocaIP, Dependencies,
9092 KArgs.RTArgs, Info.HasNoWait);
9093
9094 return OMPBuilder.emitKernelLaunch(
9095 Builder, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
9096 RuntimeAttrs.DeviceID, RTLoc, AllocaIP);
9097 }());
9098
9099 Builder.restoreIP(AfterIP);
9100 return Error::success();
9101 };
9102
9103 // If we don't have an ID for the target region, it means an offload entry
9104 // wasn't created. In this case we just run the host fallback directly and
9105 // ignore any potential 'if' clauses.
9106 if (!OutlinedFnID) {
9107 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
9108 return;
9109 }
9110
9111 // If there's no 'if' clause, only generate the kernel launch code path.
9112 if (!IfCond) {
9113 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
9114 return;
9115 }
9116
9117 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
9118 EmitTargetCallElse, AllocaIP));
9119}
9120
9121OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
9122 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
9123 InsertPointTy CodeGenIP, TargetDataInfo &Info,
9124 TargetRegionEntryInfo &EntryInfo,
9125 const TargetKernelDefaultAttrs &DefaultAttrs,
9126 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
9127 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
9128 OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
9129 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
9130 CustomMapperCallbackTy CustomMapperCB,
9131 const SmallVector<DependData> &Dependencies, bool HasNowait,
9132 Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
9133
9134 if (!updateToLocation(Loc))
9135 return InsertPointTy();
9136
9137 Builder.restoreIP(CodeGenIP);
9138
9139 Function *OutlinedFn;
9140 Constant *OutlinedFnID = nullptr;
9141 // The target region is outlined into its own function. The LLVM IR for
9142 // the target region itself is generated using the callbacks CBFunc
9143 // and ArgAccessorFuncCB.
9144 if (Error Err = emitTargetOutlinedFunction(
9145 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
9146 OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
9147 return Err;
9148
9149 // If we are not on the target device, then we need to generate code
9150 // to make a remote call (offload) to the previously outlined function
9151 // that represents the target region. Do that now.
9152 if (!Config.isTargetDevice())
9153 emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
9154 IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
9155 CustomMapperCB, Dependencies, HasNowait, DynCGroupMem,
9156 DynCGroupMemFallback);
9157 return Builder.saveIP();
9158}
9159
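// A small worked example: getNameWithSeparators({"x", "y"}, ".", "$") produces
// ".x$y" -- the first separator precedes the first part, and the regular
// separator joins the remaining parts.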
9160std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
9161 StringRef FirstSeparator,
9162 StringRef Separator) {
9163 SmallString<128> Buffer;
9164 llvm::raw_svector_ostream OS(Buffer);
9165 StringRef Sep = FirstSeparator;
9166 for (StringRef Part : Parts) {
9167 OS << Sep << Part;
9168 Sep = Separator;
9169 }
9170 return OS.str().str();
9171}
9172
9173std::string
9174OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
9175 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
9176 Config.separator());
9177}
9178
9179GlobalVariable *OpenMPIRBuilder::getOrCreateInternalVariable(
9180 Type *Ty, const StringRef &Name, std::optional<unsigned> AddressSpace) {
9181 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
9182 if (Elem.second) {
9183 assert(Elem.second->getValueType() == Ty &&
9184 "OMP internal variable has different type than requested");
9185 } else {
9186 // TODO: investigate the appropriate linkage type used for the global
9187 // variable for possibly changing that to internal or private, or maybe
9188 // create different versions of the function for different OMP internal
9189 // variables.
9190 const DataLayout &DL = M.getDataLayout();
9191 // TODO: Investigate why AMDGPU expects AS 0 for globals even though the
9192 // default global AS is 1.
9193 // See double-target-call-with-declare-target.f90 and
9194 // declare-target-vars-in-target-region.f90 libomptarget
9195 // tests.
9196 unsigned AddressSpaceVal = AddressSpace ? *AddressSpace
9197 : M.getTargetTriple().isAMDGPU()
9198 ? 0
9199 : DL.getDefaultGlobalsAddressSpace();
9200 auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
9201 ? GlobalValue::InternalLinkage
9202 : GlobalValue::CommonLinkage;
9203 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
9204 Constant::getNullValue(Ty), Elem.first(),
9205 /*InsertBefore=*/nullptr,
9206 GlobalValue::NotThreadLocal, AddressSpaceVal);
9207 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
9208 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpaceVal);
9209 GV->setAlignment(std::max(TypeAlign, PtrAlign));
9210 Elem.second = GV;
9211 }
9212
9213 return Elem.second;
9214}
9215
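// For CriticalName "foo" this produces the lock variable name
// ".gomp_critical_user_foo.var" (given "." as both separators).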
9216Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
9217 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
9218 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
9219 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
9220}
9221
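// This uses the classic null-pointer GEP idiom to compute a size without
// consulting the DataLayout; illustratively, for an element type T:
//   %gep  = getelementptr T, ptr null, i32 1
//   %size = ptrtoint ptr %gep to i64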
9222Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
9223 LLVMContext &Ctx = Builder.getContext();
9224 Value *Null =
9225 Constant::getNullValue(PointerType::getUnqual(Ctx));
9226 Value *SizeGep =
9227 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
9228 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
9229 return SizePtrToInt;
9230}
9231
9232GlobalVariable *
9233OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
9234 std::string VarName) {
9235 llvm::Constant *MaptypesArrayInit =
9236 llvm::ConstantDataArray::get(M.getContext(), Mappings);
9237 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
9238 M, MaptypesArrayInit->getType(),
9239 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
9240 VarName);
9241 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
9242 return MaptypesArrayGlobal;
9243}
9244
9245void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
9246 InsertPointTy AllocaIP,
9247 unsigned NumOperands,
9248 struct MapperAllocas &MapperAllocas) {
9249 if (!updateToLocation(Loc))
9250 return;
9251
9252 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9253 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
9254 Builder.restoreIP(AllocaIP);
9255 AllocaInst *ArgsBase = Builder.CreateAlloca(
9256 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
9257 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
9258 ".offload_ptrs");
9259 AllocaInst *ArgSizes = Builder.CreateAlloca(
9260 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
9261 updateToLocation(Loc);
9262 MapperAllocas.ArgsBase = ArgsBase;
9263 MapperAllocas.Args = Args;
9264 MapperAllocas.ArgSizes = ArgSizes;
9265}
9266
9267void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
9268 Function *MapperFunc, Value *SrcLocInfo,
9269 Value *MaptypesArg, Value *MapnamesArg,
9270 struct MapperAllocas &MapperAllocas,
9271 int64_t DeviceID, unsigned NumOperands) {
9272 if (!updateToLocation(Loc))
9273 return;
9274
9275 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9276 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
9277 Value *ArgsBaseGEP =
9278 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
9279 {Builder.getInt32(0), Builder.getInt32(0)});
9280 Value *ArgsGEP =
9281 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
9282 {Builder.getInt32(0), Builder.getInt32(0)});
9283 Value *ArgSizesGEP =
9284 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
9285 {Builder.getInt32(0), Builder.getInt32(0)});
9286 Value *NullPtr =
9287 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
9288 createRuntimeFunctionCall(MapperFunc, {SrcLocInfo, Builder.getInt64(DeviceID),
9289 Builder.getInt32(NumOperands),
9290 ArgsBaseGEP, ArgsGEP, ArgSizesGEP,
9291 MaptypesArg, MapnamesArg, NullPtr});
9292}
9293
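// When Info.NumberOfPtrs is nonzero, the net effect is a GEP to the first
// element of each offload array, roughly (illustrative, for two pointers):
//   %bps = getelementptr inbounds [2 x ptr], ptr %.offload_baseptrs, i32 0, i32 0
//   %ps  = getelementptr inbounds [2 x ptr], ptr %.offload_ptrs, i32 0, i32 0
//   %szs = getelementptr inbounds [2 x i64], ptr %.offload_sizes, i32 0, i32 0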
9294void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
9295 TargetDataRTArgs &RTArgs,
9296 TargetDataInfo &Info,
9297 bool ForEndCall) {
9298 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
9299 "expected region end call to runtime only when end call is separate");
9300 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
9301 auto VoidPtrTy = UnqualPtrTy;
9302 auto VoidPtrPtrTy = UnqualPtrTy;
9303 auto Int64Ty = Type::getInt64Ty(M.getContext());
9304 auto Int64PtrTy = UnqualPtrTy;
9305
9306 if (!Info.NumberOfPtrs) {
9307 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9308 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9309 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
9310 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
9311 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
9312 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9313 return;
9314 }
9315
9316 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
9317 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
9318 Info.RTArgs.BasePointersArray,
9319 /*Idx0=*/0, /*Idx1=*/0);
9320 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
9321 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
9322 /*Idx0=*/0,
9323 /*Idx1=*/0);
9324 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
9325 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
9326 /*Idx0=*/0, /*Idx1=*/0);
9327 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
9328 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
9329 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
9330 : Info.RTArgs.MapTypesArray,
9331 /*Idx0=*/0,
9332 /*Idx1=*/0);
9333
9334 // Only emit the mapper information arrays if debug information is
9335 // requested.
9336 if (!Info.EmitDebug)
9337 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
9338 else
9339 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
9340 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
9341 /*Idx0=*/0,
9342 /*Idx1=*/0);
9343 // If there is no user-defined mapper, set the mapper array to nullptr to
9344 // avoid an unnecessary data privatization
9345 if (!Info.HasMapper)
9346 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9347 else
9348 RTArgs.MappersArray =
9349 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
9350}
9351
9352void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
9353 InsertPointTy CodeGenIP,
9354 MapInfosTy &CombinedInfo,
9355 TargetDataInfo &Info) {
9356 MapInfosTy::StructNonContiguousInfo &NonContigInfo =
9357 CombinedInfo.NonContigInfo;
9358
9359 // Build an array of struct descriptor_dim and then assign it to
9360 // offload_args.
9361 //
9362 // struct descriptor_dim {
9363 // uint64_t offset;
9364 // uint64_t count;
9365 // uint64_t stride
9366 // };
9367 Type *Int64Ty = Builder.getInt64Ty();
9368 StructType *DimTy = StructType::create(
9369 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
9370 "struct.descriptor_dim");
9371
9372 enum { OffsetFD = 0, CountFD, StrideFD };
9373 // We need two index variables here since the size of "Dims" is the same as
9374 // the size of Components; however, the size of offset, count, and stride is
9375 // equal to the size of the base declaration that is non-contiguous.
9376 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
9377 // Skip emitting IR if the dimension size is 1, since it cannot be
9378 // non-contiguous.
9379 if (NonContigInfo.Dims[I] == 1)
9380 continue;
9381 Builder.restoreIP(AllocaIP);
9382 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
9383 AllocaInst *DimsAddr =
9384 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
9385 Builder.restoreIP(CodeGenIP);
9386 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
9387 unsigned RevIdx = EE - II - 1;
9388 Value *DimsLVal = Builder.CreateInBoundsGEP(
9389 DimsAddr->getAllocatedType(), DimsAddr,
9390 {Builder.getInt64(0), Builder.getInt64(II)});
9391 // Offset
9392 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
9393 Builder.CreateAlignedStore(
9394 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
9395 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
9396 // Count
9397 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
9398 Builder.CreateAlignedStore(
9399 NonContigInfo.Counts[L][RevIdx], CountLVal,
9400 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
9401 // Stride
9402 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
9403 Builder.CreateAlignedStore(
9404 NonContigInfo.Strides[L][RevIdx], StrideLVal,
9405 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
9406 }
9407 // args[I] = &dims
9408 Builder.restoreIP(CodeGenIP);
9409 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
9410 DimsAddr, Builder.getPtrTy());
9411 Value *P = Builder.CreateConstInBoundsGEP2_32(
9412 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
9413 Info.RTArgs.PointersArray, 0, I);
9414 Builder.CreateAlignedStore(
9415 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
9416 ++L;
9417 }
9418}
9419
9420void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
9421 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
9422 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
9423 BasicBlock *ExitBB, bool IsInit) {
9424 StringRef Prefix = IsInit ? ".init" : ".del";
9425
9426 // Evaluate if this is an array section.
9428 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
9429 Value *IsArray =
9430 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
9431 Value *DeleteBit = Builder.CreateAnd(
9432 MapType,
9433 Builder.getInt64(
9434 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9435 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
9436 Value *DeleteCond;
9437 Value *Cond;
9438 if (IsInit) {
9439 // base != begin?
9440 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
9441 // IsPtrAndObj?
9442 Value *PtrAndObjBit = Builder.CreateAnd(
9443 MapType,
9444 Builder.getInt64(
9445 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9446 OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
9447 PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
9448 BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
9449 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
9450 DeleteCond = Builder.CreateIsNull(
9451 DeleteBit,
9452 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
9453 } else {
9454 Cond = IsArray;
9455 DeleteCond = Builder.CreateIsNotNull(
9456 DeleteBit,
9457 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
9458 }
9459 Cond = Builder.CreateAnd(Cond, DeleteCond);
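  // Net effect, informally: for init we take the branch when
  // (IsArray || (Base != Begin && PtrAndObj)) && !DeleteBit holds; for del,
  // when IsArray && DeleteBit holds.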
9460 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
9461
9462 emitBlock(BodyBB, MapperFn);
9463 // Get the array size by multiplying element size and element number (i.e., \p
9464 // Size).
9465 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
9466 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
9467 // memory allocation/deletion purpose only.
9468 Value *MapTypeArg = Builder.CreateAnd(
9469 MapType,
9470 Builder.getInt64(
9471 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9472 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9473 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9474 MapTypeArg = Builder.CreateOr(
9475 MapTypeArg,
9476 Builder.getInt64(
9477 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9478 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
9479
9480 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
9481 // data structure.
9482 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
9483 ArraySize, MapTypeArg, MapName};
9484 createRuntimeFunctionCall(
9485 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
9486 OffloadingArgs);
9487}
9488
9489Expected<Function *> OpenMPIRBuilder::emitUserDefinedMapper(
9490 function_ref<MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
9491 llvm::Value *BeginArg)>
9492 GenMapInfoCB,
9493 Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
9494 SmallVector<Type *> Params;
9495 Params.emplace_back(Builder.getPtrTy());
9496 Params.emplace_back(Builder.getPtrTy());
9497 Params.emplace_back(Builder.getPtrTy());
9498 Params.emplace_back(Builder.getInt64Ty());
9499 Params.emplace_back(Builder.getInt64Ty());
9500 Params.emplace_back(Builder.getPtrTy());
9501
9502 auto *FnTy =
9503 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
9504
9505 SmallString<64> TyStr;
9506 raw_svector_ostream Out(TyStr);
9507 Function *MapperFn =
9508 Function::Create(FnTy, GlobalValue::InternalLinkage, FuncName, &M);
9509 MapperFn->addFnAttr(Attribute::NoInline);
9510 MapperFn->addFnAttr(Attribute::NoUnwind);
9511 MapperFn->addParamAttr(0, Attribute::NoUndef);
9512 MapperFn->addParamAttr(1, Attribute::NoUndef);
9513 MapperFn->addParamAttr(2, Attribute::NoUndef);
9514 MapperFn->addParamAttr(3, Attribute::NoUndef);
9515 MapperFn->addParamAttr(4, Attribute::NoUndef);
9516 MapperFn->addParamAttr(5, Attribute::NoUndef);
9517
9518 // Start the mapper function code generation.
9519 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
9520 auto SavedIP = Builder.saveIP();
9521 Builder.SetInsertPoint(EntryBB);
9522
9523 Value *MapperHandle = MapperFn->getArg(0);
9524 Value *BaseIn = MapperFn->getArg(1);
9525 Value *BeginIn = MapperFn->getArg(2);
9526 Value *Size = MapperFn->getArg(3);
9527 Value *MapType = MapperFn->getArg(4);
9528 Value *MapName = MapperFn->getArg(5);
9529
9530 // Compute the starting and end addresses of array elements.
9531 // Prepare common arguments for array initiation and deletion.
9532 // Convert the size in bytes into the number of array elements.
9533 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
9534 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
9535 Value *PtrBegin = BeginIn;
9536 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
9537
9538 // Emit array initiation if this is an array section and \p MapType indicates
9539 // that memory allocation is required.
9540 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
9541 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
9542 MapType, MapName, ElementSize, HeadBB,
9543 /*IsInit=*/true);
9544
9545 // Emit a for loop to iterate through SizeArg elements and map all of them.
9546
9547 // Emit the loop header block.
9548 emitBlock(HeadBB, MapperFn);
9549 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
9550 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
9551 // Evaluate whether the initial condition is satisfied.
9552 Value *IsEmpty =
9553 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
9554 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
9555
9556 // Emit the loop body block.
9557 emitBlock(BodyBB, MapperFn);
9558 BasicBlock *LastBB = BodyBB;
9559 PHINode *PtrPHI =
9560 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
9561 PtrPHI->addIncoming(PtrBegin, HeadBB);
9562
9563 // Get map clause information. Fill up the arrays with all mapped variables.
9564 MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
9565 if (!Info)
9566 return Info.takeError();
9567
9568 // Call the runtime API __tgt_mapper_num_components to get the number of
9569 // pre-existing components.
9570 Value *OffloadingArgs[] = {MapperHandle};
9571 Value *PreviousSize = createRuntimeFunctionCall(
9572 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
9573 OffloadingArgs);
9574 Value *ShiftedPreviousSize =
9575 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
9576
9577 // Fill up the runtime mapper handle for all components.
9578 for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
9579 Value *CurBaseArg = Info->BasePointers[I];
9580 Value *CurBeginArg = Info->Pointers[I];
9581 Value *CurSizeArg = Info->Sizes[I];
9582 Value *CurNameArg = Info->Names.size()
9583 ? Info->Names[I]
9584 : Constant::getNullValue(Builder.getPtrTy());
9585
9586 // Extract the MEMBER_OF field from the map type.
9587 Value *OriMapType = Builder.getInt64(
9588 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9589 Info->Types[I]));
9590 Value *MemberMapType =
9591 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
9592
9593 // Combine the map type inherited from user-defined mapper with that
9594 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
9595 // bits of the \a MapType, which is the input argument of the mapper
9596 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
9597 // bits of MemberMapType.
9598 // [OpenMP 5.0], 1.2.6. map-type decay.
9599 // | alloc | to | from | tofrom | release | delete
9600 // ----------------------------------------------------------
9601 // alloc | alloc | alloc | alloc | alloc | release | delete
9602 // to | alloc | to | alloc | to | release | delete
9603 // from | alloc | alloc | from | from | release | delete
9604 // tofrom | alloc | to | from | tofrom | release | delete
9605 Value *LeftToFrom = Builder.CreateAnd(
9606 MapType,
9607 Builder.getInt64(
9608 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9609 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9610 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9611 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
9612 BasicBlock *AllocElseBB =
9613 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
9614 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
9615 BasicBlock *ToElseBB =
9616 BasicBlock::Create(M.getContext(), "omp.type.to.else");
9617 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
9618 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
9619 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
9620 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
9621 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
9622 emitBlock(AllocBB, MapperFn);
9623 Value *AllocMapType = Builder.CreateAnd(
9624 MemberMapType,
9625 Builder.getInt64(
9626 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9627 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9628 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9629 Builder.CreateBr(EndBB);
9630 emitBlock(AllocElseBB, MapperFn);
9631 Value *IsTo = Builder.CreateICmpEQ(
9632 LeftToFrom,
9633 Builder.getInt64(
9634 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9635 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
9636 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
9637 // In case of to, clear OMP_MAP_FROM.
9638 emitBlock(ToBB, MapperFn);
9639 Value *ToMapType = Builder.CreateAnd(
9640 MemberMapType,
9641 Builder.getInt64(
9642 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9643 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9644 Builder.CreateBr(EndBB);
9645 emitBlock(ToElseBB, MapperFn);
9646 Value *IsFrom = Builder.CreateICmpEQ(
9647 LeftToFrom,
9648 Builder.getInt64(
9649 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9650 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9651 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
9652 // In case of from, clear OMP_MAP_TO.
9653 emitBlock(FromBB, MapperFn);
9654 Value *FromMapType = Builder.CreateAnd(
9655 MemberMapType,
9656 Builder.getInt64(
9657 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9658 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
9659 // In case of tofrom, do nothing.
9660 emitBlock(EndBB, MapperFn);
9661 LastBB = EndBB;
9662 PHINode *CurMapType =
9663 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
9664 CurMapType->addIncoming(AllocMapType, AllocBB);
9665 CurMapType->addIncoming(ToMapType, ToBB);
9666 CurMapType->addIncoming(FromMapType, FromBB);
9667 CurMapType->addIncoming(MemberMapType, ToElseBB);
9668
9669 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
9670 CurSizeArg, CurMapType, CurNameArg};
9671
9672 auto ChildMapperFn = CustomMapperCB(I);
9673 if (!ChildMapperFn)
9674 return ChildMapperFn.takeError();
9675 if (*ChildMapperFn) {
9676 // Call the corresponding mapper function.
9677 createRuntimeFunctionCall(*ChildMapperFn, OffloadingArgs)
9678 ->setDoesNotThrow();
9679 } else {
9680 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
9681 // data structure.
9682 createRuntimeFunctionCall(
9683 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
9684 OffloadingArgs);
9685 }
9686 }
9687
9688 // Update the pointer to point to the next element that needs to be mapped,
9689 // and check whether we have mapped all elements.
9690 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
9691 "omp.arraymap.next");
9692 PtrPHI->addIncoming(PtrNext, LastBB);
9693 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
9694 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
9695 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
9696
9697 emitBlock(ExitBB, MapperFn);
9698 // Emit array deletion if this is an array section and \p MapType indicates
9699 // that deletion is required.
9700 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
9701 MapType, MapName, ElementSize, DoneBB,
9702 /*IsInit=*/false);
9703
9704 // Emit the function exit block.
9705 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
9706
9707 Builder.CreateRetVoid();
9708 Builder.restoreIP(SavedIP);
9709 return MapperFn;
9710}
9711
9712Error OpenMPIRBuilder::emitOffloadingArrays(
9713 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
9714 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
9715 bool IsNonContiguous,
9716 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
9717
9718 // Reset the array information.
9719 Info.clearArrayInfo();
9720 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
9721
9722 if (Info.NumberOfPtrs == 0)
9723 return Error::success();
9724
9725 Builder.restoreIP(AllocaIP);
9726 // Detect whether any capture sizes require runtime evaluation, so that a
9727 // constant array can eventually be used for the map sizes.
9728 ArrayType *PointerArrayType =
9729 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
9730
9731 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
9732 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
9733
9734 Info.RTArgs.PointersArray = Builder.CreateAlloca(
9735 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
9736 AllocaInst *MappersArray = Builder.CreateAlloca(
9737 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
9738 Info.RTArgs.MappersArray = MappersArray;
9739
9740 // If we don't have any VLA types or other types that require runtime
9741 // evaluation, we can use a constant array for the map sizes, otherwise we
9742 // need to fill up the arrays as we do for the pointers.
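 // For example, mapping a statically-sized struct contributes a compile-time
 // constant size, whereas map(tofrom: a[0:n]) contributes a size only known
 // at runtime and forces the alloca-and-store path below.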
9743 Type *Int64Ty = Builder.getInt64Ty();
9744 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
9745 ConstantInt::get(Int64Ty, 0));
9746 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
9747 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
9748 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
9749 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
9750 if (IsNonContiguous &&
9751 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9752 CombinedInfo.Types[I] &
9753 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
9754 ConstSizes[I] =
9755 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
9756 else
9757 ConstSizes[I] = CI;
9758 continue;
9759 }
9760 }
9761 RuntimeSizes.set(I);
9762 }
9763
9764 if (RuntimeSizes.all()) {
9765 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9766 Info.RTArgs.SizesArray = Builder.CreateAlloca(
9767 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9768 restoreIPandDebugLoc(Builder, CodeGenIP);
9769 } else {
9770 auto *SizesArrayInit = ConstantArray::get(
9771 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
9772 std::string Name = createPlatformSpecificName({"offload_sizes"});
9773 auto *SizesArrayGbl =
9774 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
9775 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
9776 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
9777
9778 if (!RuntimeSizes.any()) {
9779 Info.RTArgs.SizesArray = SizesArrayGbl;
9780 } else {
9781 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9782 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
9783 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9784 AllocaInst *Buffer = Builder.CreateAlloca(
9785 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9786 Buffer->setAlignment(OffloadSizeAlign);
9787 restoreIPandDebugLoc(Builder, CodeGenIP);
9788 Builder.CreateMemCpy(
9789 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
9790 SizesArrayGbl, OffloadSizeAlign,
9791 Builder.getIntN(
9792 IndexSize,
9793 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
9794
9795 Info.RTArgs.SizesArray = Buffer;
9796 }
9797 restoreIPandDebugLoc(Builder, CodeGenIP);
9798 }
9799
9800 // The map types are always constant so we don't need to generate code to
9801 // fill arrays. Instead, we create an array constant.
9802 SmallVector<uint64_t, 4> Mapping;
9803 for (auto mapFlag : CombinedInfo.Types)
9804 Mapping.push_back(
9805 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9806 mapFlag));
9807 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
9808 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9809 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
9810
9811 // The information types are only built if provided.
9812 if (!CombinedInfo.Names.empty()) {
9813 auto *MapNamesArrayGbl = createOffloadMapnames(
9814 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
9815 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
9816 Info.EmitDebug = true;
9817 } else {
9818 Info.RTArgs.MapNamesArray =
9819 Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
9820 Info.EmitDebug = false;
9821 }
9822
9823 // If there's a present map type modifier, it must not be applied to the end
9824 // of a region, so generate a separate map type array in that case.
9825 if (Info.separateBeginEndCalls()) {
9826 bool EndMapTypesDiffer = false;
9827 for (uint64_t &Type : Mapping) {
9828 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9829 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
9830 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9831 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
9832 EndMapTypesDiffer = true;
9833 }
9834 }
9835 if (EndMapTypesDiffer) {
9836 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9837 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
9838 }
9839 }
9840
9841 PointerType *PtrTy = Builder.getPtrTy();
9842 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
9843 Value *BPVal = CombinedInfo.BasePointers[I];
9844 Value *BP = Builder.CreateConstInBoundsGEP2_32(
9845 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
9846 0, I);
9847 Builder.CreateAlignedStore(BPVal, BP,
9848 M.getDataLayout().getPrefTypeAlign(PtrTy));
9849
9850 if (Info.requiresDevicePointerInfo()) {
9851 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
9852 CodeGenIP = Builder.saveIP();
9853 Builder.restoreIP(AllocaIP);
9854 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
9855 Builder.restoreIP(CodeGenIP);
9856 if (DeviceAddrCB)
9857 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
9858 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
9859 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
9860 if (DeviceAddrCB)
9861 DeviceAddrCB(I, BP);
9862 }
9863 }
9864
9865 Value *PVal = CombinedInfo.Pointers[I];
9866 Value *P = Builder.CreateConstInBoundsGEP2_32(
9867 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
9868 I);
9869 // TODO: Check that the alignment is correct.
9870 Builder.CreateAlignedStore(PVal, P,
9871 M.getDataLayout().getPrefTypeAlign(PtrTy));
9872
9873 if (RuntimeSizes.test(I)) {
9874 Value *S = Builder.CreateConstInBoundsGEP2_32(
9875 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
9876 /*Idx0=*/0,
9877 /*Idx1=*/I);
9878 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
9879 Int64Ty,
9880 /*isSigned=*/true),
9881 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
9882 }
9883 // Fill up the mapper array.
9884 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9885 Value *MFunc = ConstantPointerNull::get(PtrTy);
9886
9887 auto CustomMFunc = CustomMapperCB(I);
9888 if (!CustomMFunc)
9889 return CustomMFunc.takeError();
9890 if (*CustomMFunc)
9891 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
9892
9893 Value *MAddr = Builder.CreateInBoundsGEP(
9894 MappersArray->getAllocatedType(), MappersArray,
9895 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
9896 Builder.CreateAlignedStore(
9897 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
9898 }
9899
9900 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
9901 Info.NumberOfPtrs == 0)
9902 return Error::success();
9903 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
9904 return Error::success();
9905}
9906
9907void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
9908 BasicBlock *CurBB = Builder.GetInsertBlock();
9909
9910 if (!CurBB || CurBB->getTerminator()) {
9911 // If there is no insert point or the previous block is already
9912 // terminated, don't touch it.
9913 } else {
9914 // Otherwise, create a fall-through branch.
9915 Builder.CreateBr(Target);
9916 }
9917
9918 Builder.ClearInsertionPoint();
9919}
9920
9921void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
9922 bool IsFinished) {
9923 BasicBlock *CurBB = Builder.GetInsertBlock();
9924
9925 // Fall out of the current block (if necessary).
9926 emitBranch(BB);
9927
9928 if (IsFinished && BB->use_empty()) {
9929 BB->eraseFromParent();
9930 return;
9931 }
9932
9933 // Place the block after the current block, if possible, or else at
9934 // the end of the function.
9935 if (CurBB && CurBB->getParent())
9936 CurFn->insert(std::next(CurBB->getIterator()), BB);
9937 else
9938 CurFn->insert(CurFn->end(), BB);
9939 Builder.SetInsertPoint(BB);
9940}
9941
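// For reference, emitIfClause below produces a diamond of this shape when the
// condition does not constant-fold (block names as created below):
//
//   entry:        br i1 %cond, label %omp_if.then, label %omp_if.else
//   omp_if.then:  ; ThenGen code ...   br label %omp_if.end
//   omp_if.else:  ; ElseGen code ...   br label %omp_if.end
//   omp_if.end:   ; code after the if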
9942Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
9943 BodyGenCallbackTy ElseGen,
9944 InsertPointTy AllocaIP) {
9945 // If the condition constant folds and can be elided, try to avoid emitting
9946 // the condition and the dead arm of the if/else.
9947 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
9948 auto CondConstant = CI->getSExtValue();
9949 if (CondConstant)
9950 return ThenGen(AllocaIP, Builder.saveIP());
9951
9952 return ElseGen(AllocaIP, Builder.saveIP());
9953 }
9954
9955 Function *CurFn = Builder.GetInsertBlock()->getParent();
9956
9957 // Otherwise, the condition did not fold, or we couldn't elide it. Just
9958 // emit the conditional branch.
9959 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
9960 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
9961 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
9962 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
9963 // Emit the 'then' code.
9964 emitBlock(ThenBlock, CurFn);
9965 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
9966 return Err;
9967 emitBranch(ContBlock);
9968 // Emit the 'else' code if present.
9969 // There is no need to emit line number for unconditional branch.
9970 emitBlock(ElseBlock, CurFn);
9971 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
9972 return Err;
9973 // There is no need to emit line number for unconditional branch.
9974 emitBranch(ContBlock);
9975 // Emit the continuation block for code after the if.
9976 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
9977 return Error::success();
9978}
9979
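// Summary of the decision implemented below: Read with acquire/acq_rel/seq_cst,
// Write/Compare/Update with release/acq_rel/seq_cst, and Capture with any
// ordering stronger than monotonic all request a flush; monotonic (and the
// remaining combinations) do not.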
9980bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
9981 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
9982 assert(!(AO == AtomicOrdering::NotAtomic ||
9983 AO == AtomicOrdering::Unordered) &&
9984 "Unexpected Atomic Ordering.");
9985
9986 bool Flush = false;
9987 llvm::AtomicOrdering FlushAO = AtomicOrdering::Monotonic;
9988
9989 switch (AK) {
9990 case Read:
9991 if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
9992 AO == AtomicOrdering::SequentiallyConsistent) {
9993 FlushAO = AtomicOrdering::Acquire;
9994 Flush = true;
9995 }
9996 break;
9997 case Write:
9998 case Compare:
9999 case Update:
10000 if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
10001 AO == AtomicOrdering::SequentiallyConsistent) {
10002 FlushAO = AtomicOrdering::Release;
10003 Flush = true;
10004 }
10005 break;
10006 case Capture:
10007 switch (AO) {
10008 case AtomicOrdering::Acquire:
10009 FlushAO = AtomicOrdering::Acquire;
10010 Flush = true;
10011 break;
10012 case AtomicOrdering::Release:
10013 FlushAO = AtomicOrdering::Release;
10014 Flush = true;
10015 break;
10016 case AtomicOrdering::AcquireRelease:
10017 case AtomicOrdering::SequentiallyConsistent:
10018 FlushAO = AtomicOrdering::AcquireRelease;
10019 Flush = true;
10020 break;
10021 default:
10022 // do nothing - leave silently.
10023 break;
10024 }
10025 }
10026
10027 if (Flush) {
10028 // The Flush RT call does not take a memory ordering yet. Resolve which
10029 // ordering the flush should use so it is ready once that support lands,
10030 // but issue the plain flush call for now.
10031 // TODO: pass `FlushAO` after memory ordering support is added
10032 (void)FlushAO;
10033 emitFlush(Loc);
10034 }
10035
10036 // For AO == AtomicOrdering::Monotonic and all other case combinations,
10037 // do nothing.
10038 return Flush;
10039}
10040
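// Illustrative IR for the integer case handled below (not from a test):
//   %omp.atomic.read = load atomic i32, ptr %x monotonic, align 4
//   store i32 %omp.atomic.read, ptr %v, align 4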
10041OpenMPIRBuilder::InsertPointTy
10042OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
10043 AtomicOpValue &X, AtomicOpValue &V,
10044 AtomicOrdering AO, InsertPointTy AllocaIP) {
10045 if (!updateToLocation(Loc))
10046 return Loc.IP;
10047
10048 assert(X.Var->getType()->isPointerTy() &&
10049 "OMP Atomic expects a pointer to target memory");
10050 Type *XElemTy = X.ElemTy;
10051 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10052 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10053 "OMP atomic read expected a scalar type");
10054
10055 Value *XRead = nullptr;
10056
10057 if (XElemTy->isIntegerTy()) {
10058 LoadInst *XLD =
10059 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
10060 XLD->setAtomic(AO);
10061 XRead = cast<Value>(XLD);
10062 } else if (XElemTy->isStructTy()) {
10063 // FIXME: Add checks to ensure __atomic_load is emitted iff the
10064 // target does not support `atomicrmw` of the size of the struct
10065 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10066 OldVal->setAtomic(AO);
10067 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10068 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10069 OpenMPIRBuilder::AtomicInfo atomicInfo(
10070 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10071 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10072 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
10073 XRead = AtomicLoadRes.first;
10074 OldVal->eraseFromParent();
10075 } else {
10076 // We need to perform the atomic op as an integer.
10077 IntegerType *IntCastTy =
10078 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10079 LoadInst *XLoad =
10080 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
10081 XLoad->setAtomic(AO);
10082 if (XElemTy->isFloatingPointTy()) {
10083 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
10084 } else {
10085 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
10086 }
10087 }
10088 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
10089 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
10090 return Builder.saveIP();
10091}
10092
10093OpenMPIRBuilder::InsertPointTy
10094OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
10095 AtomicOpValue &X, Value *Expr,
10096 AtomicOrdering AO, InsertPointTy AllocaIP) {
10097 if (!updateToLocation(Loc))
10098 return Loc.IP;
10099
10100 assert(X.Var->getType()->isPointerTy() &&
10101 "OMP Atomic expects a pointer to target memory");
10102 Type *XElemTy = X.ElemTy;
10103 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10104 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10105 "OMP atomic write expected a scalar type");
10106
10107 if (XElemTy->isIntegerTy()) {
10108 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
10109 XSt->setAtomic(AO);
10110 } else if (XElemTy->isStructTy()) {
10111 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10112 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10113 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10114 OpenMPIRBuilder::AtomicInfo atomicInfo(
10115 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10116 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10117 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
10118 OldVal->eraseFromParent();
10119 } else {
10120 // We need to bitcast and perform the atomic op as an integer.
10121 IntegerType *IntCastTy =
10122 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10123 Value *ExprCast =
10124 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
10125 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
10126 XSt->setAtomic(AO);
10127 }
10128
10129 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
10130 return Builder.saveIP();
10131}
10132
10133OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate(
10134 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
10135 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
10136 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
10137 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10138 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
10139 if (!updateToLocation(Loc))
10140 return Loc.IP;
10141
10142 LLVM_DEBUG({
10143 Type *XTy = X.Var->getType();
10144 assert(XTy->isPointerTy() &&
10145 "OMP Atomic expects a pointer to target memory");
10146 Type *XElemTy = X.ElemTy;
10147 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10148 XElemTy->isPointerTy()) &&
10149 "OMP atomic update expected a scalar type");
10150 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
10151 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
10152 "OpenMP atomic does not support LT or GT operations");
10153 });
10154
10155 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
10156 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
10157 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
10158 if (!AtomicResult)
10159 return AtomicResult.takeError();
10160 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
10161 return Builder.saveIP();
10162}
10163
10164// FIXME: Duplicating AtomicExpand
10165Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
10166 AtomicRMWInst::BinOp RMWOp) {
10167 switch (RMWOp) {
10168 case AtomicRMWInst::Add:
10169 return Builder.CreateAdd(Src1, Src2);
10170 case AtomicRMWInst::Sub:
10171 return Builder.CreateSub(Src1, Src2);
10172 case AtomicRMWInst::And:
10173 return Builder.CreateAnd(Src1, Src2);
10174 case AtomicRMWInst::Nand:
10175 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
10176 case AtomicRMWInst::Or:
10177 return Builder.CreateOr(Src1, Src2);
10178 case AtomicRMWInst::Xor:
10179 return Builder.CreateXor(Src1, Src2);
10180 case AtomicRMWInst::Xchg:
10181 case AtomicRMWInst::FAdd:
10182 case AtomicRMWInst::FSub:
10183 case AtomicRMWInst::BAD_BINOP:
10184 case AtomicRMWInst::Max:
10185 case AtomicRMWInst::Min:
10186 case AtomicRMWInst::UMax:
10187 case AtomicRMWInst::UMin:
10188 case AtomicRMWInst::FMax:
10189 case AtomicRMWInst::FMin:
10190 case AtomicRMWInst::FMaximum:
10191 case AtomicRMWInst::FMinimum:
10192 case AtomicRMWInst::UIncWrap:
10193 case AtomicRMWInst::UDecWrap:
10194 case AtomicRMWInst::USubCond:
10195 case AtomicRMWInst::USubSat:
10196 llvm_unreachable("Unsupported atomic update operation");
10197 }
10198 llvm_unreachable("Unsupported atomic update operation");
10199}
10200
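// emitAtomicUpdate below chooses between two strategies: a single atomicrmw
// when RMWOp maps onto an instruction for an integer type, and otherwise a
// compare-exchange retry loop of roughly this shape (sketch only):
//   %old = load atomic i32, ptr %x ...
// cont:
//   %phi  = phi i32 [ %old, %entry ], [ %prev, %cont ]
//   ; %upd = UpdateOp(%phi)
//   %pair = cmpxchg ptr %x, i32 %phi, i32 %upd ...
//   %prev = extractvalue { i32, i1 } %pair, 0
//   %ok   = extractvalue { i32, i1 } %pair, 1
//   br i1 %ok, label %exit, label %cont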
10201Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
10202 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
10203 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
10204 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
10205 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10206 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
10207 // or a complex datatype.
10208 bool emitRMWOp = false;
10209 switch (RMWOp) {
10210 case AtomicRMWInst::Add:
10211 case AtomicRMWInst::And:
10212 case AtomicRMWInst::Nand:
10213 case AtomicRMWInst::Or:
10214 case AtomicRMWInst::Xor:
10215 case AtomicRMWInst::Xchg:
10216 emitRMWOp = XElemTy;
10217 break;
10218 case AtomicRMWInst::Sub:
10219 emitRMWOp = (IsXBinopExpr && XElemTy);
10220 break;
10221 default:
10222 emitRMWOp = false;
10223 }
10224 emitRMWOp &= XElemTy->isIntegerTy();
10225
10226 std::pair<Value *, Value *> Res;
10227 if (emitRMWOp) {
10228 AtomicRMWInst *RMWInst =
10229 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
10230 if (T.isAMDGPU()) {
10231 if (IsIgnoreDenormalMode)
10232 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
10233 llvm::MDNode::get(Builder.getContext(), {}));
10234 if (!IsFineGrainedMemory)
10235 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
10236 llvm::MDNode::get(Builder.getContext(), {}));
10237 if (!IsRemoteMemory)
10238 RMWInst->setMetadata("amdgpu.no.remote.memory",
10239 llvm::MDNode::get(Builder.getContext(), {}));
10240 }
10241 Res.first = RMWInst;
10242 // Not needed except for postfix captures. Generated anyway for
10243 // consistency with the else branch; any DCE pass will remove it.
10244 // AtomicRMWInst::Xchg does not have a corresponding instruction.
10245 if (RMWOp == AtomicRMWInst::Xchg)
10246 Res.second = Res.first;
10247 else
10248 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
10249 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
10250 XElemTy->isStructTy()) {
10251 LoadInst *OldVal =
10252 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
10253 OldVal->setAtomic(AO);
10254 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
10255 unsigned LoadSize =
10256 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
10257
10258 OpenMPIRBuilder::AtomicInfo atomicInfo(
10259 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10260 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
10261 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
10262 BasicBlock *CurBB = Builder.GetInsertBlock();
10263 Instruction *CurBBTI = CurBB->getTerminator();
10264 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10265 BasicBlock *ExitBB =
10266 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
10267 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
10268 X->getName() + ".atomic.cont");
10269 ContBB->getTerminator()->eraseFromParent();
10270 Builder.restoreIP(AllocaIP);
10271 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
10272 NewAtomicAddr->setName(X->getName() + "x.new.val");
10273 Builder.SetInsertPoint(ContBB);
10274 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
10275 PHI->addIncoming(AtomicLoadRes.first, CurBB);
10276 Value *OldExprVal = PHI;
10277 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
10278 if (!CBResult)
10279 return CBResult.takeError();
10280 Value *Upd = *CBResult;
10281 Builder.CreateStore(Upd, NewAtomicAddr);
10282 AtomicOrdering Failure =
10283 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
10284 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
10285 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
10286 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
10287 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
10288 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
10289 OldVal->eraseFromParent();
10290 Res.first = OldExprVal;
10291 Res.second = Upd;
10292
10293 if (UnreachableInst *ExitTI =
10294 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
10295 CurBBTI->eraseFromParent();
10296 Builder.SetInsertPoint(ExitBB);
10297 } else {
10298 Builder.SetInsertPoint(ExitTI);
10299 }
10300 } else {
10301 IntegerType *IntCastTy =
10302 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10303 LoadInst *OldVal =
10304 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
10305 OldVal->setAtomic(AO);
10306 // CurBB
10307 // | /---\
10308 // ContBB |
10309 // | \---/
10310 // ExitBB
10311 BasicBlock *CurBB = Builder.GetInsertBlock();
10312 Instruction *CurBBTI = CurBB->getTerminator();
10313 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10314 BasicBlock *ExitBB =
10315 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
10316 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
10317 X->getName() + ".atomic.cont");
10318 ContBB->getTerminator()->eraseFromParent();
10319 Builder.restoreIP(AllocaIP);
10320 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
10321 NewAtomicAddr->setName(X->getName() + "x.new.val");
10322 Builder.SetInsertPoint(ContBB);
10323 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
10324 PHI->addIncoming(OldVal, CurBB);
10325 bool IsIntTy = XElemTy->isIntegerTy();
10326 Value *OldExprVal = PHI;
10327 if (!IsIntTy) {
10328 if (XElemTy->isFloatingPointTy()) {
10329 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
10330 X->getName() + ".atomic.fltCast");
10331 } else {
10332 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
10333 X->getName() + ".atomic.ptrCast");
10334 }
10335 }
10336
10337 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
10338 if (!CBResult)
10339 return CBResult.takeError();
10340 Value *Upd = *CBResult;
10341 Builder.CreateStore(Upd, NewAtomicAddr);
10342 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
10343 AtomicOrdering Failure =
10344 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
10345 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
10346 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
10347 Result->setVolatile(VolatileX);
10348 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
10349 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10350 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
10351 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
10352
10353 Res.first = OldExprVal;
10354 Res.second = Upd;
10355
10356 // Set the insertion point in the exit block.
10357 if (UnreachableInst *ExitTI =
10358 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
10359 CurBBTI->eraseFromParent();
10360 Builder.SetInsertPoint(ExitBB);
10361 } else {
10362 Builder.SetInsertPoint(ExitTI);
10363 }
10364 }
10365
10366 return Res;
10367}
10368
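// For captures, the pair returned by emitAtomicUpdate supplies both candidate
// results: a postfix form 'v = x; x = x op expr;' stores the old value, while
// a prefix form 'x = x op expr; v = x;' stores the updated one.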
10369OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture(
10370 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
10371 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
10372 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
10373 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
10374 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10375 if (!updateToLocation(Loc))
10376 return Loc.IP;
10377
10378 LLVM_DEBUG({
10379 Type *XTy = X.Var->getType();
10380 assert(XTy->isPointerTy() &&
10381 "OMP Atomic expects a pointer to target memory");
10382 Type *XElemTy = X.ElemTy;
10383 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10384 XElemTy->isPointerTy()) &&
10385 "OMP atomic capture expected a scalar type");
10386 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
10387 "OpenMP atomic does not support LT or GT operations");
10388 });
10389
10390 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
10391 // 'x' is simply atomically rewritten with 'expr'.
10392 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
10393 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
10394 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
10395 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
10396 if (!AtomicResult)
10397 return AtomicResult.takeError();
10398 Value *CapturedVal =
10399 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
10400 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
10401
10402 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
10403 return Builder.saveIP();
10404}
10405
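// The convenience overload below derives the failure ordering from AO via
// AtomicCmpXchgInst::getStrongestFailureOrdering (e.g. acq_rel is downgraded
// to acquire for the failure case) and forwards to the full form.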
10406OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
10407 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
10408 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
10409 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
10410 bool IsFailOnly) {
10411
10412 AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
10413 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
10414 IsPostfixUpdate, IsFailOnly, Failure);
10415}
10416
10417OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
10418 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
10419 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
10420 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
10421 bool IsFailOnly, AtomicOrdering Failure) {
10422
10423 if (!updateToLocation(Loc))
10424 return Loc.IP;
10425
10426 assert(X.Var->getType()->isPointerTy() &&
10427 "OMP atomic expects a pointer to target memory");
10428 // compare capture
10429 if (V.Var) {
10430 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
10431 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
10432 }
10433
10434 bool IsInteger = E->getType()->isIntegerTy();
10435
10436 if (Op == OMPAtomicCompareOp::EQ) {
10437 AtomicCmpXchgInst *Result = nullptr;
10438 if (!IsInteger) {
10439 IntegerType *IntCastTy =
10440 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
10441 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
10442 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
10443 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
10444 AO, Failure);
10445 } else {
10446 Result =
10447 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
10448 }
10449
10450 if (V.Var) {
10451 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
10452 if (!IsInteger)
10453 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
10454 assert(OldValue->getType() == V.ElemTy &&
10455 "OldValue and V must be of same type");
10456 if (IsPostfixUpdate) {
10457 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
10458 } else {
10459 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10460 if (IsFailOnly) {
10461 // CurBB----
10462 // | |
10463 // v |
10464 // ContBB |
10465 // | |
10466 // v |
10467 // ExitBB <-
10468 //
10469 // where ContBB only contains the store of old value to 'v'.
10470 BasicBlock *CurBB = Builder.GetInsertBlock();
10471 Instruction *CurBBTI = CurBB->getTerminator();
10472 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10473 BasicBlock *ExitBB = CurBB->splitBasicBlock(
10474 CurBBTI, X.Var->getName() + ".atomic.exit");
10475 BasicBlock *ContBB = CurBB->splitBasicBlock(
10476 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
10477 ContBB->getTerminator()->eraseFromParent();
10478 CurBB->getTerminator()->eraseFromParent();
10479
10480 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
10481
10482 Builder.SetInsertPoint(ContBB);
10483 Builder.CreateStore(OldValue, V.Var);
10484 Builder.CreateBr(ExitBB);
10485
10486 if (UnreachableInst *ExitTI =
10487 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
10488 CurBBTI->eraseFromParent();
10489 Builder.SetInsertPoint(ExitBB);
10490 } else {
10491 Builder.SetInsertPoint(ExitTI);
10492 }
10493 } else {
10494 Value *CapturedValue =
10495 Builder.CreateSelect(SuccessOrFail, E, OldValue);
10496 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10497 }
10498 }
10499 }
10500 // The comparison result has to be stored.
10501 if (R.Var) {
10502 assert(R.Var->getType()->isPointerTy() &&
10503 "r.var must be of pointer type");
10504 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
10505
10506 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10507 Value *ResultCast = R.IsSigned
10508 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
10509 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
10510 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
10511 }
10512 } else {
10513 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
10514 "Op should be either max or min at this point");
10515 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
10516
10517 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
10518 // Let's take max as example.
10519 // OpenMP form:
10520 // x = x > expr ? expr : x;
10521 // LLVM form:
10522 // *ptr = *ptr > val ? *ptr : val;
10523 // We need to transform to LLVM form.
10524 // x = x <= expr ? x : expr;
10525 AtomicRMWInst::BinOp NewOp;
10526 if (IsXBinopExpr) {
10527 if (IsInteger) {
10528 if (X.IsSigned)
10529 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
10530 : AtomicRMWInst::Max;
10531 else
10532 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
10533 : AtomicRMWInst::UMax;
10534 } else {
10535 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
10536 : AtomicRMWInst::FMax;
10537 }
10538 } else {
10539 if (IsInteger) {
10540 if (X.IsSigned)
10541 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
10542 : AtomicRMWInst::Min;
10543 else
10544 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
10545 : AtomicRMWInst::UMin;
10546 } else {
10547 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
10548 : AtomicRMWInst::FMin;
10549 }
10550 }
10551
10552 AtomicRMWInst *OldValue =
10553 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
10554 if (V.Var) {
10555 Value *CapturedValue = nullptr;
10556 if (IsPostfixUpdate) {
10557 CapturedValue = OldValue;
10558 } else {
10559 CmpInst::Predicate Pred;
10560 switch (NewOp) {
10561 case AtomicRMWInst::Max:
10562 Pred = CmpInst::ICMP_SGT;
10563 break;
10564 case AtomicRMWInst::UMax:
10565 Pred = CmpInst::ICMP_UGT;
10566 break;
10567 case AtomicRMWInst::FMax:
10568 Pred = CmpInst::FCMP_OGT;
10569 break;
10570 case AtomicRMWInst::Min:
10571 Pred = CmpInst::ICMP_SLT;
10572 break;
10573 case AtomicRMWInst::UMin:
10574 Pred = CmpInst::ICMP_ULT;
10575 break;
10576 case AtomicRMWInst::FMin:
10577 Pred = CmpInst::FCMP_OLT;
10578 break;
10579 default:
10580 llvm_unreachable("unexpected comparison op");
10581 }
10582 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
10583 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
10584 }
10585 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10586 }
10587 }
10588
10589 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
10590
10591 return Builder.saveIP();
10592}
10593
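// On the host, the teams construct lowers to an outlined body plus runtime
// calls, roughly (sketch only; the exact signatures come from OMPKinds.def):
//   call void @__kmpc_push_num_teams_51(ptr %ident, i32 %tid, i32 %lb, i32 %ub, i32 %tl)
//   call void @__kmpc_fork_teams(ptr %ident, i32 %nargs, ptr @outlined.fn, ...)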
10594OpenMPIRBuilder::InsertPointOrErrorTy
10595OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
10596 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
10597 Value *NumTeamsUpper, Value *ThreadLimit,
10598 Value *IfExpr) {
10599 if (!updateToLocation(Loc))
10600 return InsertPointTy();
10601
10602 uint32_t SrcLocStrSize;
10603 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
10604 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
10605 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
10606
10607 // Outer allocation basicblock is the entry block of the current function.
10608 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
10609 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
10610 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
10611 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10612 }
10613
10614 // The current basic block is split into four basic blocks. After outlining,
10615 // they will be mapped as follows:
10616 // ```
10617 // def current_fn() {
10618 // current_basic_block:
10619 // br label %teams.exit
10620 // teams.exit:
10621 // ; instructions after teams
10622 // }
10623 //
10624 // def outlined_fn() {
10625 // teams.alloca:
10626 // br label %teams.body
10627 // teams.body:
10628 // ; instructions within teams body
10629 // }
10630 // ```
10631 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
10632 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
10633 BasicBlock *AllocaBB =
10634 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
10635
10636 bool SubClausesPresent =
10637 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
10638 // Push num_teams
10639 if (!Config.isTargetDevice() && SubClausesPresent) {
10640 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
10641 "if lowerbound is non-null, then upperbound must also be non-null "
10642 "for bounds on num_teams");
10643
10644 if (NumTeamsUpper == nullptr)
10645 NumTeamsUpper = Builder.getInt32(0);
10646
10647 if (NumTeamsLower == nullptr)
10648 NumTeamsLower = NumTeamsUpper;
10649
10650 if (IfExpr) {
10651 assert(IfExpr->getType()->isIntegerTy() &&
10652 "argument to if clause must be an integer value");
10653
10654 // upper = ifexpr ? upper : 1
10655 if (IfExpr->getType() != Int1)
10656 IfExpr = Builder.CreateICmpNE(IfExpr,
10657 ConstantInt::get(IfExpr->getType(), 0));
10658 NumTeamsUpper = Builder.CreateSelect(
10659 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
10660
10661 // lower = ifexpr ? lower : 1
10662 NumTeamsLower = Builder.CreateSelect(
10663 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
10664 }
10665
10666 if (ThreadLimit == nullptr)
10667 ThreadLimit = Builder.getInt32(0);
10668
10669 Value *ThreadNum = getOrCreateThreadID(Ident);
10670 createRuntimeFunctionCall(
10671 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
10672 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
10673 }
10674 // Generate the body of teams.
10675 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10676 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10677 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10678 return Err;
10679
10680 OutlineInfo OI;
10681 OI.EntryBB = AllocaBB;
10682 OI.ExitBB = ExitBB;
10683 OI.OuterAllocaBB = &OuterAllocaBB;
10684
10685 // Insert fake values for global tid and bound tid.
10686 SmallVector<Instruction *, 8> ToBeDeleted;
10687 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
10688 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
10689 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
10690 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
10691 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
10692
10693 auto HostPostOutlineCB = [this, Ident,
10694 ToBeDeleted](Function &OutlinedFn) mutable {
10695 // The stale call instruction will be replaced with a new call instruction
10696 // for the runtime call that takes the outlined function.
10697
10698 assert(OutlinedFn.hasOneUse() &&
10699 "there must be a single user for the outlined function");
10700 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
10701 ToBeDeleted.push_back(StaleCI);
10702
10703 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
10704 "Outlined function must have two or three arguments only");
10705
10706 bool HasShared = OutlinedFn.arg_size() == 3;
10707
10708 OutlinedFn.getArg(0)->setName("global.tid.ptr");
10709 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
10710 if (HasShared)
10711 OutlinedFn.getArg(2)->setName("data");
10712
10713 // Call to the runtime function for teams in the current function.
10714 assert(StaleCI && "Error while outlining - no CallInst user found for the "
10715 "outlined function.");
10716 Builder.SetInsertPoint(StaleCI);
10717 SmallVector<Value *> Args = {
10718 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
10719 if (HasShared)
10720 Args.push_back(StaleCI->getArgOperand(2));
10721 createRuntimeFunctionCall(
10722 getOrCreateRuntimeFunctionPtr(
10723 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
10724 Args);
10725
10726 for (Instruction *I : llvm::reverse(ToBeDeleted))
10727 I->eraseFromParent();
10728 };
10729
10730 if (!Config.isTargetDevice())
10731 OI.PostOutlineCB = HostPostOutlineCB;
10732
10733 addOutlineInfo(std::move(OI));
10734
10735 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10736
10737 return Builder.saveIP();
10738}
10739
10740OpenMPIRBuilder::InsertPointOrErrorTy
10741OpenMPIRBuilder::createDistribute(const LocationDescription &Loc,
10742 InsertPointTy OuterAllocaIP,
10743 BodyGenCallbackTy BodyGenCB) {
10744 if (!updateToLocation(Loc))
10745 return InsertPointTy();
10746
10747 BasicBlock *OuterAllocaBB = OuterAllocaIP.getBlock();
10748
10749 if (OuterAllocaBB == Builder.GetInsertBlock()) {
10750 BasicBlock *BodyBB =
10751 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
10752 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10753 }
10754 BasicBlock *ExitBB =
10755 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
10756 BasicBlock *BodyBB =
10757 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
10758 BasicBlock *AllocaBB =
10759 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
10760
10761 // Generate the body of the distribute clause.
10762 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10763 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10764 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10765 return Err;
10766
10767 // When using target we use different runtime functions which require a
10768 // callback.
10769 if (Config.isTargetDevice()) {
10770 OutlineInfo OI;
10771 OI.OuterAllocaBB = OuterAllocaIP.getBlock();
10772 OI.EntryBB = AllocaBB;
10773 OI.ExitBB = ExitBB;
10774
10775 addOutlineInfo(std::move(OI));
10776 }
10777 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10778
10779 return Builder.saveIP();
10780}
10781
10782 GlobalVariable *
10783 OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
10784 std::string VarName) {
10785 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
10786 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
10787 Names.size()),
10788 Names);
10789 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
10790 M, MapNamesArrayInit->getType(),
10791 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
10792 VarName);
10793 return MapNamesArrayGlobal;
10794}
10795
10796// Create all simple and struct types exposed by the runtime and remember
10797// the llvm::PointerTypes of them for easy access later.
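// For instance, an OMP_STRUCT_TYPE(Ident, "struct.ident_t", ...) entry in
// OMPKinds.def expands to: look up "struct.ident_t" in the context, create the
// struct if it is absent, and bind Ident/IdentPtr to the struct and pointer
// types (a sketch of the macro below, not an extra code path).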
10798void OpenMPIRBuilder::initializeTypes(Module &M) {
10799 LLVMContext &Ctx = M.getContext();
10800 StructType *T;
10801 unsigned DefaultTargetAS = Config.getDefaultTargetAS();
10802 unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace();
10803#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
10804#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
10805 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
10806 VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
10807#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
10808 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
10809 VarName##Ptr = PointerType::get(Ctx, ProgramAS);
10810#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
10811 T = StructType::getTypeByName(Ctx, StructName); \
10812 if (!T) \
10813 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
10814 VarName = T; \
10815 VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
10816#include "llvm/Frontend/OpenMP/OMPKinds.def"
10817}
10818
10819void OpenMPIRBuilder::OutlineInfo::collectBlocks(
10820 SmallPtrSetImpl<BasicBlock *> &BlockSet,
10821 SmallVectorImpl<BasicBlock *> &BlockVector) {
10822 SmallVector<BasicBlock *, 32> Worklist;
10823 BlockSet.insert(EntryBB);
10824 BlockSet.insert(ExitBB);
10825
10826 Worklist.push_back(EntryBB);
10827 while (!Worklist.empty()) {
10828 BasicBlock *BB = Worklist.pop_back_val();
10829 BlockVector.push_back(BB);
10830 for (BasicBlock *SuccBB : successors(BB))
10831 if (BlockSet.insert(SuccBB).second)
10832 Worklist.push_back(SuccBB);
10833 }
10834}
10835
10836void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
10837 uint64_t Size, int32_t Flags,
10838 GlobalValue::LinkageTypes,
10839 StringRef Name) {
10840 if (!Config.isGPU()) {
10841 llvm::offloading::emitOffloadingEntry(
10842 M, object::OffloadKind::OFK_OpenMP, ID,
10843 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
10844 return;
10845 }
10846 // TODO: Add support for global variables on the device after declare target
10847 // support.
10848 Function *Fn = dyn_cast<Function>(Addr);
10849 if (!Fn)
10850 return;
10851
10852 // Add a function attribute for the kernel.
10853 Fn->addFnAttr("kernel");
10854 if (T.isAMDGCN())
10855 Fn->addFnAttr("uniform-work-group-size", "true");
10856 Fn->addFnAttr(Attribute::MustProgress);
10857}
10858
10859 // We only generate metadata for functions that contain target regions.
10860void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
10861 EmitMetadataErrorReportFunctionTy &ErrorFn) {
10862
10863 // If there are no entries, we don't need to do anything.
10864 if (OffloadInfoManager.empty())
10865 return;
10866
10867 LLVMContext &C = M.getContext();
10868 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
10869 TargetRegionEntryInfo>,
10870 16>
10871 OrderedEntries(OffloadInfoManager.size());
10872
10873 // Auxiliary methods to create metadata values and strings.
10874 auto &&GetMDInt = [this](unsigned V) {
10875 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
10876 };
10877
10878 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
10879
10880 // Create the offloading info metadata node.
10881 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
10882 auto &&TargetRegionMetadataEmitter =
10883 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
10884 const TargetRegionEntryInfo &EntryInfo,
10885 const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
10886 // Generate metadata for target regions. Each entry of this metadata
10887 // contains:
10888 // - Entry 0 -> Kind of this type of metadata (0).
10889 // - Entry 1 -> Device ID of the file where the entry was identified.
10890 // - Entry 2 -> File ID of the file where the entry was identified.
10891 // - Entry 3 -> Mangled name of the function where the entry was
10892 // identified.
10893 // - Entry 4 -> Line in the file where the entry was identified.
10894 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
10895 // - Entry 6 -> Order the entry was created.
10896 // The first element of the metadata node is the kind.
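// For example (values purely illustrative), an emitted node looks like:
//   !{i32 0, i32 42, i32 4660, !"parent_fn", i32 7, i32 0, i32 0}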
10897 Metadata *Ops[] = {
10898 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
10899 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
10900 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
10901 GetMDInt(E.getOrder())};
10902
10903 // Save this entry in the right position of the ordered entries array.
10904 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
10905
10906 // Add metadata to the named metadata node.
10907 MD->addOperand(MDNode::get(C, Ops));
10908 };
10909
10910 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
10911
10912 // Create a function that emits metadata for each device global variable entry.
10913 auto &&DeviceGlobalVarMetadataEmitter =
10914 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
10915 StringRef MangledName,
10916 const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
10917 // Generate metadata for global variables. Each entry of this metadata
10918 // contains:
10919 // - Entry 0 -> Kind of this type of metadata (1).
10920 // - Entry 1 -> Mangled name of the variable.
10921 // - Entry 2 -> Declare target kind.
10922 // - Entry 3 -> Order the entry was created.
10923 // The first element of the metadata node is the kind.
10924 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
10925 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
10926
10927 // Save this entry in the right position of the ordered entries array.
10928 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
10929 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
10930
10931 // Add metadata to the named metadata node.
10932 MD->addOperand(MDNode::get(C, Ops));
10933 };
10934
10935 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
10936 DeviceGlobalVarMetadataEmitter);
10937
10938 for (const auto &E : OrderedEntries) {
10939 assert(E.first && "All ordered entries must exist!");
10940 if (const auto *CE =
10941 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
10942 E.first)) {
10943 if (!CE->getID() || !CE->getAddress()) {
10944 // Do not blame the entry if the parent function is not emitted.
10945 TargetRegionEntryInfo EntryInfo = E.second;
10946 StringRef FnName = EntryInfo.ParentName;
10947 if (!M.getNamedValue(FnName))
10948 continue;
10949 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
10950 continue;
10951 }
10952 createOffloadEntry(CE->getID(), CE->getAddress(),
10953 /*Size=*/0, CE->getFlags(),
10954 GlobalValue::WeakAnyLinkage);
10955 } else if (const auto *CE = dyn_cast<
10956 OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
10957 E.first)) {
10958 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
10959 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10960 CE->getFlags());
10961 switch (Flags) {
10962 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
10963 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
10964 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
10965 continue;
10966 if (!CE->getAddress()) {
10967 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
10968 continue;
10969 }
10970 // The variable has no definition - no need to add the entry.
10971 if (CE->getVarSize() == 0)
10972 continue;
10973 break;
10974 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
10975 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
10976 (!Config.isTargetDevice() && CE->getAddress())) &&
10977 "Declaret target link address is set.");
10978 if (Config.isTargetDevice())
10979 continue;
10980 if (!CE->getAddress()) {
10981 ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
10982 continue;
10983 }
10984 break;
10985 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect:
10986 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirectVTable:
10987 if (!CE->getAddress()) {
10988 ErrorFn(EMIT_MD_GLOBAL_VAR_INDIRECT_ERROR, E.second);
10989 continue;
10990 }
10991 break;
10992 default:
10993 break;
10994 }
10995
10996 // Hidden or internal symbols on the device are not externally visible.
10997 // We should not attempt to register them by creating an offloading
10998 // entry. Indirect variables are handled separately on the device.
10999 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
11000 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
11001 (Flags !=
11002 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect &&
11003 Flags != OffloadEntriesInfoManager::
11004 OMPTargetGlobalVarEntryIndirectVTable))
11005 continue;
11006
11007 // Indirect globals need to use a special name that doesn't match the name
11008 // of the associated host global.
11009 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect ||
11010 Flags ==
11011 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirectVTable)
11012 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11013 Flags, CE->getLinkage(), CE->getVarName());
11014 else
11015 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11016 Flags, CE->getLinkage());
11017
11018 } else {
11019 llvm_unreachable("Unsupported entry kind.");
11020 }
11021 }
11022
11023 // Emit requires directive globals to a special entry so the runtime can
11024 // register them when the device image is loaded.
11025 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
11026 // entries should be redesigned to better suit this use-case.
11027 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
11031 ".requires", /*Size=*/0,
11032 OffloadEntriesInfoManager::OMPTargetGlobalRegisterRequires,
11033 Config.getRequiresFlags());
11034}
11035
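// The resulting entry name has the shape
//   __omp_offloading_<device-id>_<file-id>_<parent>_l<line>[_<count>]
// e.g. "__omp_offloading_10302_2ab7_foo_l7" (hex IDs; example values assumed).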
11036void TargetRegionEntryInfo::getTargetRegionEntryFnName(
11037 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
11038 unsigned FileID, unsigned Line, unsigned Count) {
11039 raw_svector_ostream OS(Name);
11040 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
11041 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
11042 if (Count)
11043 OS << "_" << Count;
11044}
11045
11046void OffloadEntriesInfoManager::getTargetRegionEntryFnName(
11047 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
11048 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
11049 TargetRegionEntryInfo::getTargetRegionEntryFnName(
11050 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
11051 EntryInfo.Line, NewCount);
11052}
11053
11054TargetRegionEntryInfo
11055OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
11056 vfs::FileSystem &VFS,
11057 StringRef ParentName) {
11058 sys::fs::UniqueID ID(0xdeadf17e, 0);
11059 auto FileIDInfo = CallBack();
11060 uint64_t FileID = 0;
11061 if (ErrorOr<vfs::Status> Status = VFS.status(std::get<0>(FileIDInfo))) {
11062 ID = Status->getUniqueID();
11063 FileID = Status->getUniqueID().getFile();
11064 } else {
11065 // If the inode ID could not be determined, create a hash value of
11066 // the current file name and use that as an ID.
11067 FileID = hash_value(std::get<0>(FileIDInfo));
11068 }
11069
11070 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
11071 std::get<1>(FileIDInfo));
11072}
11073
11074unsigned OpenMPIRBuilder::getFlagMemberOffset() {
11075 unsigned Offset = 0;
11076 for (uint64_t Remain =
11077 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11078 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
11079 !(Remain & 1); Remain = Remain >> 1)
11080 Offset++;
11081 return Offset;
11082}
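// OMP_MAP_MEMBER_OF occupies the top 16 bits of the 64-bit flag word, so
// getFlagMemberOffset() returns 48 and getMemberOfFlag(Position) computes
// (Position + 1) << 48; member index 0 therefore yields 0x0001000000000000.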
11083
11084 omp::OpenMPOffloadMappingFlags
11085 OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
11086 // Rotate by getFlagMemberOffset() bits.
11087 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
11088 << getFlagMemberOffset());
11089}
11090
11091void OpenMPIRBuilder::setCorrectMemberOfFlag(
11092 omp::OpenMPOffloadMappingFlags &Flags,
11093 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
11094 // If the entry is PTR_AND_OBJ but has not been marked with the special
11095 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
11096 // marked as MEMBER_OF.
11097 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11098 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) &&
11099 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11100 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) !=
11101 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
11102 return;
11103
11104 // Entries with ATTACH are not members-of anything. They are handled
11105 // separately by the runtime after other maps have been handled.
11106 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11107 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_ATTACH))
11108 return;
11109
11110 // Reset the placeholder value to prepare the flag for the assignment of the
11111 // proper MEMBER_OF value.
11112 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
11113 Flags |= MemberOfFlag;
11114}
11115
11116Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
11117 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
11118 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
11119 bool IsDeclaration, bool IsExternallyVisible,
11120 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
11121 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
11122 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
11123 std::function<Constant *()> GlobalInitializer,
11124 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
11125 // TODO: convert this to utilise the IRBuilder Config rather than
11126 // a passed down argument.
11127 if (OpenMPSIMD)
11128 return nullptr;
11129
11130 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
11131 ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
11132 CaptureClause ==
11133 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
11134 Config.hasRequiresUnifiedSharedMemory())) {
11135 SmallString<64> PtrName;
11136 {
11137 raw_svector_ostream OS(PtrName);
11138 OS << MangledName;
11139 if (!IsExternallyVisible)
11140 OS << format("_%x", EntryInfo.FileID);
11141 OS << "_decl_tgt_ref_ptr";
11142 }
11143
11144 Value *Ptr = M.getNamedValue(PtrName);
11145
11146 if (!Ptr) {
11147 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
11148 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
11149
11150 auto *GV = cast<GlobalVariable>(Ptr);
11151 GV->setLinkage(GlobalValue::WeakAnyLinkage);
11152
11153 if (!Config.isTargetDevice()) {
11154 if (GlobalInitializer)
11155 GV->setInitializer(GlobalInitializer());
11156 else
11157 GV->setInitializer(GlobalValue);
11158 }
11159
11160 registerTargetGlobalVariable(
11161 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
11162 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
11163 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
11164 }
11165
11166 return cast<Constant>(Ptr);
11167 }
11168
11169 return nullptr;
11170}
11171
11172void OpenMPIRBuilder::registerTargetGlobalVariable(
11173 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
11174 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
11175 bool IsDeclaration, bool IsExternallyVisible,
11176 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
11177 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
11178 std::vector<Triple> TargetTriple,
11179 std::function<Constant *()> GlobalInitializer,
11180 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
11181 Constant *Addr) {
11182 if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
11183 (TargetTriple.empty() && !Config.isTargetDevice()))
11184 return;
11185
11186 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
11187 StringRef VarName;
11188 int64_t VarSize;
11189 GlobalValue::LinkageTypes Linkage;
11190
11191 if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
11192 CaptureClause ==
11193 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
11194 !Config.hasRequiresUnifiedSharedMemory()) {
11195 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
11196 VarName = MangledName;
11197 GlobalValue *LlvmVal = M.getNamedValue(VarName);
11198
11199 if (!IsDeclaration)
11200 VarSize = divideCeil(
11201 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
11202 else
11203 VarSize = 0;
11204 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
11205
11206 // This is a workaround carried over from Clang which prevents undesired
11207 // optimisation of internal variables.
11208 if (Config.isTargetDevice() &&
11209 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
11210 // Do not create a "ref-variable" if the original is not also available
11211 // on the host.
11212 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
11213 return;
11214
11215 std::string RefName = createPlatformSpecificName({VarName, "ref"});
11216
11217 if (!M.getNamedValue(RefName)) {
11218 Constant *AddrRef =
11219 getOrCreateInternalVariable(Addr->getType(), RefName);
11220 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
11221 GvAddrRef->setConstant(true);
11222 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
11223 GvAddrRef->setInitializer(Addr);
11224 GeneratedRefs.push_back(GvAddrRef);
11225 }
11226 }
11227 } else {
11228 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink)
11229 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
11230 else
11231 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
11232
11233 if (Config.isTargetDevice()) {
11234 VarName = (Addr) ? Addr->getName() : "";
11235 Addr = nullptr;
11236 } else {
11237 Addr = getAddrOfDeclareTargetVar(
11238 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
11239 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
11240 LlvmPtrTy, GlobalInitializer, VariableLinkage);
11241 VarName = (Addr) ? Addr->getName() : "";
11242 }
11243 VarSize = M.getDataLayout().getPointerSize();
11244 Linkage = GlobalValue::WeakAnyLinkage;
11245 }
11246
11247 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
11248 Flags, Linkage);
11249}
11250
11251/// Loads all the offload entries information from the host IR
11252/// metadata.
11253void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
11254 // If we are in target mode, load the metadata from the host IR. This code has
11255 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
11256
11257 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
11258 if (!MD)
11259 return;
11260
11261 for (MDNode *MN : MD->operands()) {
11262 auto &&GetMDInt = [MN](unsigned Idx) {
11263 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
11264 return cast<ConstantInt>(V->getValue())->getZExtValue();
11265 };
11266
11267 auto &&GetMDString = [MN](unsigned Idx) {
11268 auto *V = cast<MDString>(MN->getOperand(Idx));
11269 return V->getString();
11270 };
11271
11272 switch (GetMDInt(0)) {
11273 default:
11274 llvm_unreachable("Unexpected metadata!");
11275 break;
11276 case OffloadEntriesInfoManager::OffloadEntryInfo::
11277 OffloadingEntryInfoTargetRegion: {
11278 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
11279 /*DeviceID=*/GetMDInt(1),
11280 /*FileID=*/GetMDInt(2),
11281 /*Line=*/GetMDInt(4),
11282 /*Count=*/GetMDInt(5));
11283 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
11284 /*Order=*/GetMDInt(6));
11285 break;
11286 }
11287 case OffloadEntriesInfoManager::OffloadEntryInfo::
11288 OffloadingEntryInfoDeviceGlobalVar:
11289 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
11290 /*MangledName=*/GetMDString(1),
11291 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
11292 /*Flags=*/GetMDInt(2)),
11293 /*Order=*/GetMDInt(3));
11294 break;
11295 }
11296 }
11297}
11298
11299void OpenMPIRBuilder::loadOffloadInfoMetadata(vfs::FileSystem &VFS,
11300 StringRef HostFilePath) {
11301 if (HostFilePath.empty())
11302 return;
11303
11304 auto Buf = VFS.getBufferForFile(HostFilePath);
11305 if (std::error_code Err = Buf.getError()) {
11306 report_fatal_error(("error opening host file from host file path inside of "
11307 "OpenMPIRBuilder: " +
11308 Err.message())
11309 .c_str());
11310 }
11311
11312 LLVMContext Ctx;
11313 auto M = expectedToErrorOrAndEmitErrors(
11314 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
11315 if (std::error_code Err = M.getError()) {
11317 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
11318 .c_str());
11319 }
11320
11321 loadOffloadInfoMetadata(*M.get());
11322}
11323
11324//===----------------------------------------------------------------------===//
11325// OffloadEntriesInfoManager
11326//===----------------------------------------------------------------------===//
11327
11328bool OffloadEntriesInfoManager::empty() const {
11329 return OffloadEntriesTargetRegion.empty() &&
11330 OffloadEntriesDeviceGlobalVar.empty();
11331}
11332
11333unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
11334 const TargetRegionEntryInfo &EntryInfo) const {
11335 auto It = OffloadEntriesTargetRegionCount.find(
11336 getTargetRegionEntryCountKey(EntryInfo));
11337 if (It == OffloadEntriesTargetRegionCount.end())
11338 return 0;
11339 return It->second;
11340}
11341
11342void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
11343 const TargetRegionEntryInfo &EntryInfo) {
11344 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
11345 EntryInfo.Count + 1;
11346}
11347
11348/// Initialize target region entry.
11349void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
11350 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
11351 OffloadEntriesTargetRegion[EntryInfo] =
11352 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
11353 OMPTargetRegionEntryTargetRegion);
11354 ++OffloadingEntriesNum;
11355}
11356
11357void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
11358 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
11359 OMPTargetRegionEntryKind Flags) {
11360 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
11361
11362 // Update the EntryInfo with the next available count for this location.
11363 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
11364
11365 // If we are emitting code for a target, the entry is already initialized,
11366 // only has to be registered.
11367 if (OMPBuilder->Config.isTargetDevice()) {
11368 // This could happen if the device compilation is invoked standalone.
11369 if (!hasTargetRegionEntryInfo(EntryInfo)) {
11370 return;
11371 }
11372 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
11373 Entry.setAddress(Addr);
11374 Entry.setID(ID);
11375 Entry.setFlags(Flags);
11376 } else {
11377 if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion &&
11378 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
11379 return;
11380 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
11381 "Target region entry already registered!");
11382 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
11383 OffloadEntriesTargetRegion[EntryInfo] = Entry;
11384 ++OffloadingEntriesNum;
11385 }
11386 incrementTargetRegionEntryInfoCount(EntryInfo);
11387}
11388
11389bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
11390 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
11391
11392 // Update the EntryInfo with the next available count for this location.
11393 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
11394
11395 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
11396 if (It == OffloadEntriesTargetRegion.end()) {
11397 return false;
11398 }
11399 // Fail if this entry is already registered.
11400 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
11401 return false;
11402 return true;
11403}
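// The flag flips the meaning of the query: with IgnoreAddressId it asks
// whether an entry exists for this location at all; without it, the entry
// must additionally still be unfilled. A sketch (EI and Mgr illustrative):
//
//   bool Seen  = Mgr.hasTargetRegionEntryInfo(EI, /*IgnoreAddressId=*/true);
//   bool Unset = Mgr.hasTargetRegionEntryInfo(EI); // false once Addr/ID set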
11404
11405void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
11406 const OffloadTargetRegionEntryInfoActTy &Action) {
11407 // Scan all target region entries and perform the provided action.
11408 for (const auto &It : OffloadEntriesTargetRegion) {
11409 Action(It.first, It.second);
11410 }
11411}
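// A sketch of walking the table with this visitor, e.g. to order entries by
// creation index before emitting them. The container Ordered is illustrative;
// the lambda signature follows OffloadTargetRegionEntryInfoActTy:
//
//   Mgr.actOnTargetRegionEntriesInfo(
//       [&](const TargetRegionEntryInfo &EntryInfo,
//           const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
//         if (E.getAddress()) // skip placeholders that were never filled in
//           Ordered[E.getOrder()] = &E;
//       });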
11412
11413void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
11414 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
11415 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
11416 ++OffloadingEntriesNum;
11417}
11418
11419void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
11420 StringRef VarName, Constant *Addr, int64_t VarSize,
11421 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
11422 if (OMPBuilder->Config.isTargetDevice()) {
11423 // This could happen if the device compilation is invoked standalone.
11424 if (!hasDeviceGlobalVarEntryInfo(VarName))
11425 return;
11426 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
11427 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
11428 if (Entry.getVarSize() == 0) {
11429 Entry.setVarSize(VarSize);
11430 Entry.setLinkage(Linkage);
11431 }
11432 return;
11433 }
11434 Entry.setVarSize(VarSize);
11435 Entry.setLinkage(Linkage);
11436 Entry.setAddress(Addr);
11437 } else {
11438 if (hasDeviceGlobalVarEntryInfo(VarName)) {
11439 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
11440 assert(Entry.isValid() && Entry.getFlags() == Flags &&
11441 "Entry not initialized!");
11442 if (Entry.getVarSize() == 0) {
11443 Entry.setVarSize(VarSize);
11444 Entry.setLinkage(Linkage);
11445 }
11446 return;
11447 }
11448 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect ||
11449 Flags ==
11450 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirectVTable)
11451 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
11452 Addr, VarSize, Flags, Linkage,
11453 VarName.str());
11454 else
11455 OffloadEntriesDeviceGlobalVar.try_emplace(
11456 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
11457 ++OffloadingEntriesNum;
11458 }
11459}
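// A sketch of registering a `declare target` global on the host. GV is an
// illustrative GlobalVariable*, and the size query mirrors what frontends
// typically compute for the VarSize argument:
//
//   int64_t Size = M.getDataLayout().getTypeStoreSize(GV->getValueType());
//   Mgr.registerDeviceGlobalVarEntryInfo(
//       GV->getName(), GV, Size,
//       OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo,
//       GV->getLinkage());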
11460
11461void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
11462 const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
11463 // Scan all device global variable entries and perform the provided action.
11464 for (const auto &E : OffloadEntriesDeviceGlobalVar)
11465 Action(E.getKey(), E.getValue());
11466}
11467
11468//===----------------------------------------------------------------------===//
11469// CanonicalLoopInfo
11470//===----------------------------------------------------------------------===//
11471
11472void CanonicalLoopInfo::collectControlBlocks(
11473 SmallVectorImpl<BasicBlock *> &BBs) {
11474 // We only count those BBs as control blocks for which we do not need to
11475 // traverse the CFG, i.e. not the loop body, which can contain arbitrary
11476 // control flow. For consistency, this also means we do not add the Body
11477 // block, which is just the entry to the body code.
11478 BBs.reserve(BBs.size() + 6);
11479 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
11480}
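// The six blocks appended above form the fixed skeleton of a canonical loop;
// only the region between Body and Latch is free-form. Roughly:
//
//          Preheader
//              |
//    /------ Header
//    |          |
//    |        Cond -------\
//    |          |         |
//    |        Body        |
//    |         ...        |
//    |          |         |
//    \------- Latch      Exit
//                         |
//                       After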
11481
11482BasicBlock *CanonicalLoopInfo::getPreheader() const {
11483 assert(isValid() && "Requires a valid canonical loop");
11484 for (BasicBlock *Pred : predecessors(Header)) {
11485 if (Pred != Latch)
11486 return Pred;
11487 }
11488 llvm_unreachable("Missing preheader");
11489}
11490
11491void CanonicalLoopInfo::setTripCount(Value *TripCount) {
11492 assert(isValid() && "Requires a valid canonical loop");
11493
11494 Instruction *CmpI = &getCond()->front();
11495 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
11496 CmpI->setOperand(1, TripCount);
11497
11498#ifndef NDEBUG
11499 assertOK();
11500#endif
11501}
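// This relies on the canonical-form invariant (checked by assertOK()) that
// the compare against the trip count is the first instruction of Cond, with
// the trip count as its second operand. In IR terms, for an illustrative
// i32 loop:
//
//   cond:
//     %cmp = icmp ult i32 %iv, %tripcount   ; setTripCount replaces operand 1
//     br i1 %cmp, label %body, label %exit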
11502
11503void CanonicalLoopInfo::mapIndVar(
11504 llvm::function_ref<Value *(Instruction *)> Updater) {
11505 assert(isValid() && "Requires a valid canonical loop");
11506
11507 Instruction *OldIV = getIndVar();
11508
11509 // Record all uses excluding those introduced by the updater. Uses by the
11510 // CanonicalLoopInfo itself to keep track of the number of iterations are
11511 // excluded.
11512 SmallVector<Use *> ReplacableUses;
11513 for (Use &U : OldIV->uses()) {
11514 auto *User = dyn_cast<Instruction>(U.getUser());
11515 if (!User)
11516 continue;
11517 if (User->getParent() == getCond())
11518 continue;
11519 if (User->getParent() == getLatch())
11520 continue;
11521 ReplacableUses.push_back(&U);
11522 }
11523
11524 // Run the updater that may introduce new uses
11525 Value *NewIV = Updater(OldIV);
11526
11527 // Replace the old uses with the value returned by the updater.
11528 for (Use *U : ReplacableUses)
11529 U->set(NewIV);
11530
11531#ifndef NDEBUG
11532 assertOK();
11533#endif
11534}
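// A sketch of an Updater that remaps the 0..TripCount-1 logical iteration
// onto a strided value; uses inside Cond and Latch are filtered out above and
// keep the original IV. Builder, Start and Step are illustrative names:
//
//   CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
//     Builder.SetInsertPoint(CLI->getBody(),
//                            CLI->getBody()->getFirstInsertionPt());
//     Value *Scaled = Builder.CreateMul(OldIV, Step, "scaled");
//     return Builder.CreateAdd(Scaled, Start, "mapped");
//   });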
11535
11536void CanonicalLoopInfo::assertOK() const {
11537#ifndef NDEBUG
11538 // No constraints if this object currently does not describe a loop.
11539 if (!isValid())
11540 return;
11541
11542 BasicBlock *Preheader = getPreheader();
11543 BasicBlock *Body = getBody();
11544 BasicBlock *After = getAfter();
11545
11546 // Verify standard control-flow we use for OpenMP loops.
11547 assert(Preheader);
11548 assert(isa<BranchInst>(Preheader->getTerminator()) &&
11549 "Preheader must terminate with unconditional branch");
11550 assert(Preheader->getSingleSuccessor() == Header &&
11551 "Preheader must jump to header");
11552
11553 assert(Header);
11554 assert(isa<BranchInst>(Header->getTerminator()) &&
11555 "Header must terminate with unconditional branch");
11556 assert(Header->getSingleSuccessor() == Cond &&
11557 "Header must jump to exiting block");
11558
11559 assert(Cond);
11560 assert(Cond->getSinglePredecessor() == Header &&
11561 "Exiting block only reachable from header");
11562
11563 assert(isa<BranchInst>(Cond->getTerminator()) &&
11564 "Exiting block must terminate with conditional branch");
11565 assert(size(successors(Cond)) == 2 &&
11566 "Exiting block must have two successors");
11567 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
11568 "Exiting block's first successor must jump to the body");
11569 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
11570 "Exiting block's second successor must exit the loop");
11571
11572 assert(Body);
11573 assert(Body->getSinglePredecessor() == Cond &&
11574 "Body only reachable from exiting block");
11575 assert(!isa<PHINode>(Body->front()));
11576
11577 assert(Latch);
11578 assert(isa<BranchInst>(Latch->getTerminator()) &&
11579 "Latch must terminate with unconditional branch");
11580 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
11581 // TODO: To support simple redirecting of the end of the body code that has
11582 // multiple predecessors, introduce another auxiliary basic block like
11582 // preheader and after.
11583 assert(Latch->getSinglePredecessor() != nullptr);
11584 assert(!isa<PHINode>(Latch->front()));
11585
11586 assert(Exit);
11587 assert(isa<BranchInst>(Exit->getTerminator()) &&
11588 "Exit block must terminate with unconditional branch");
11589 assert(Exit->getSingleSuccessor() == After &&
11590 "Exit block must jump to after block");
11591
11592 assert(After);
11593 assert(After->getSinglePredecessor() == Exit &&
11594 "After block only reachable from exit block");
11595 assert(After->empty() || !isa<PHINode>(After->front()));
11596
11597 Instruction *IndVar = getIndVar();
11598 assert(IndVar && "Canonical induction variable not found?");
11599 assert(isa<IntegerType>(IndVar->getType()) &&
11600 "Induction variable must be an integer");
11601 assert(cast<PHINode>(IndVar)->getParent() == Header &&
11602 "Induction variable must be a PHI in the loop header");
11603 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
11604 assert(
11605 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
11606 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
11607
11608 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
11609 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
11610 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
11611 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
11612 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
11613 ->isOne());
11614
11615 Value *TripCount = getTripCount();
11616 assert(TripCount && "Loop trip count not found?");
11617 assert(IndVar->getType() == TripCount->getType() &&
11618 "Trip count and induction variable must have the same type");
11619
11620 auto *CmpI = cast<CmpInst>(&Cond->front());
11621 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
11622 "Exit condition must be an unsigned less-than comparison");
11623 assert(CmpI->getOperand(0) == IndVar &&
11624 "Exit condition must compare the induction variable");
11625 assert(CmpI->getOperand(1) == TripCount &&
11626 "Exit condition must compare with the trip count");
11627#endif
11628}
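// Taken together, these checks pin down the following shape. Block names and
// the i32 type are illustrative; Body may expand into an arbitrary region
// that eventually reaches Latch:
//
//   preheader:
//     br label %header
//   header:
//     %iv = phi i32 [ 0, %preheader ], [ %iv.next, %latch ]
//     br label %cond
//   cond:
//     %cmp = icmp ult i32 %iv, %tripcount
//     br i1 %cmp, label %body, label %exit
//   body:
//     ...
//     br label %latch
//   latch:
//     %iv.next = add i32 %iv, 1
//     br label %header
//   exit:
//     br label %after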
11629
11630void CanonicalLoopInfo::invalidate() {
11631 Header = nullptr;
11632 Cond = nullptr;
11633 Latch = nullptr;
11634 Exit = nullptr;
11635}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
DXIL Finalize Linkage
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Provides definitions for Target specific Grid Values.
static Value * removeASCastIfPresent(Value *V)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn, bool NoLoop)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true, bool Is64Bit=false)
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI, LLVMContext &Ctx, Loop *Loop, LoopInfo &LoopInfo, SmallVector< Metadata * > &LoopMDList)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType, bool NoLoop)
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasDistScheduleChunks)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause, bool HasDistScheduleChunks)
Determine the schedule type using schedule and ordering clause arguments.
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static void hoistNonEntryAllocasToEntryBlock(llvm::BasicBlock &Block)
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait, Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
unsigned unsigned DefaultVal
std::unordered_set< BasicBlock * > BlockSet
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
BinaryOperator * Mul
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition blake3_impl.h:83
The Input class is used to parse a yaml document into in-memory structs and vectors.
Class for arbitrary precision integers.
Definition APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
LLVM_ABI bool isArrayAllocation() const
Return true if there is an allocation size parameter to the allocation instruction that is not 1.
void setAlignment(Align Align)
const Value * getArraySize() const
Get the number of elements allocated.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
an instruction that atomically reads a memory location, combines it with another value,...
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
iterator end()
Definition BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
reverse_iterator rbegin()
Definition BasicBlock.h:475
bool empty() const
Definition BasicBlock.h:481
const Instruction & back() const
Definition BasicBlock.h:484
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:482
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
reverse_iterator rend()
Definition BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:386
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:662
LLVM_ABI const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Conditional or Unconditional Branch instruction.
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Value * getArgOperand(unsigned i) const
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
A cache for the CodeExtractor analysis.
Utility class for extracting code into a new function.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:536
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:720
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a targe...
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:568
Record of a variable value-assignment, aka a non instruction representation of the dbg....
A debug info location.
Definition DebugLoc.h:123
Analysis pass which computes a DominatorTree.
Definition Dominators.h:283
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
Represents either an error or a value T.
Definition ErrorOr.h:56
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
reference get()
Returns a reference to the stored T value.
Definition Error.h:582
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Type * getParamType(unsigned i) const
Parameter type accessors.
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:640
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:166
const BasicBlock & getEntryBlock() const
Definition Function.h:807
Argument * arg_iterator
Definition Function.h:72
bool empty() const
Definition Function.h:857
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition Function.cpp:447
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:765
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
const Function & getFunction() const
Definition Function.h:164
iterator begin()
Definition Function.h:851
arg_iterator arg_begin()
Definition Function.h:866
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:355
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition Function.cpp:668
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:753
size_t arg_size() const
Definition Function.h:899
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:214
iterator end()
Definition Function.h:853
void setCallingConv(CallingConv::ID CC)
Definition Function.h:274
Argument * getArg(unsigned i) const
Definition Function.h:884
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition Value.h:602
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
LLVM_ABI void setInitializer(Constant *InitVal)
setInitializer - Sets the initializer for this global variable, removing any existing initializer if ...
Definition Globals.cpp:524
InsertPoint - A saved insertion point.
Definition IRBuilder.h:291
BasicBlock * getBlock() const
Definition IRBuilder.h:306
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:307
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2794
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:969
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1078
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1577
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1440
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1569
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:608
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type size() const
Definition MapVector.h:56
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
const Triple & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition Module.h:281
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
A tuple of MDNodes.
Definition Metadata.h:1757
iterator_range< op_iterator > operands()
Definition Metadata.h:1853
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition SetVector.h:57
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:230
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition StringMap.h:260
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition StringRef.h:225
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:453
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:618
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:413
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:619
Type * getElementType(unsigned N) const
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1071
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1133
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition Triple.h:414
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1149
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:280
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:135
LLVM_ABI bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:151
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:238
Value * getOperand(unsigned i) const
Definition User.h:233
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
iterator_range< user_iterator > users()
Definition Value.h:426
User * user_back()
Definition Value.h:412
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:963
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
LLVM_ABI void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition Value.cpp:561
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:188
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition Value.cpp:708
bool use_empty() const
Definition Value.h:346
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A raw_ostream that writes to an SmallVector or SmallString.
The virtual file system interface.
llvm::ErrorOr< std::unique_ptr< llvm::MemoryBuffer > > getBufferForFile(const Twine &Name, int64_t FileSize=-1, bool RequiresNullTerminator=true, bool IsVolatile=false, bool IsText=true)
This is a convenience method that opens a file, gets its content and then closes the file.
virtual llvm::ErrorOr< Status > status(const Twine &Path)=0
Get the status of the entry at Path, if one exists.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
@ CE
Windows NT (Windows on ARM)
Definition MCAsmInfo.h:48
initializer< Ty > init(const Ty &Val)
@ Switch
The "resume-switch" lowering, where there are separate resume and destroy functions that are shared b...
Definition CoroShape.h:31
LLVM_ABI GlobalVariable * emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr, StringRef SectionName="llvm_offload_entries")
Create an offloading section struct used to register this global at runtime.
Definition Utility.cpp:86
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their defintion in openmp/runtime/src/kmp...
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
OMPDynGroupprivateFallbackType
The fallback types for the dyn_groupprivate clause.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:829
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
hash_code hash_value(const FixedPointSemantics &Val)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1667
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:839
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2530
unsigned getPointerAddressSpace(const Type *T)
Definition SPIRVUtils.h:367
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:296
auto successors(const MachineBasicBlock *BB)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2184
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
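A sketch of the classic use case, erasing elements during iteration (eraseTriviallyDead is a hypothetical helper). The early-increment adaptor advances the iterator before the current element is touched, so erasure does not invalidate the traversal:

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Transforms/Utils/Local.h"

// Remove trivially dead instructions while walking the block.
void eraseTriviallyDead(llvm::BasicBlock &BB) {
  for (llvm::Instruction &I : llvm::make_early_inc_range(BB))
    if (llvm::isInstructionTriviallyDead(&I))
      I.eraseFromParent();
}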
std::string utostr(uint64_t X, bool isNeg=false)
void * PointerTy
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant-expression users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
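For illustration (floorLog2 is hypothetical):

#include "llvm/Support/MathExtras.h"
#include <cstdint>

// Floor log2: Log2_32(1) == 0, Log2_32(8) == 3, Log2_32(9) == 3.
unsigned floorLog2(uint32_t V) {
  return llvm::Log2_32(V); // yields (unsigned)-1 when V == 0
}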
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
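A rough sketch of the two-armed split, under the signature shown above (emitIfThenElse is hypothetical; the IR actually emitted into each arm is left open):

#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

// Split the block before InsertPt and branch on Cond; code can then be
// emitted in front of the returned then/else terminators.
void emitIfThenElse(llvm::Instruction *InsertPt, llvm::Value *Cond) {
  llvm::Instruction *ThenTerm = nullptr;
  llvm::Instruction *ElseTerm = nullptr;
  llvm::SplitBlockAndInsertIfThenElse(Cond, InsertPt->getIterator(),
                                      &ThenTerm, &ElseTerm);
  llvm::IRBuilder<> ThenB(ThenTerm);
  // ... emit "then"-side IR with ThenB; build a second IRBuilder at
  // ElseTerm for the "else" side.
}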
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
LLVM_ABI bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
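For illustration (numChunks is hypothetical):

#include "llvm/Support/MathExtras.h"
#include <cstdint>

// Chunks needed to cover Size elements: divideCeil(10, 4) == 3.
uint64_t numChunks(uint64_t Size, uint64_t ChunkSize) {
  return llvm::divideCeil(Size, ChunkSize);
}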
TargetTransformInfo TTI
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
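A minimal sketch of cantFail unwrapping an Expected that the caller knows must succeed (unwrapOrAbort is hypothetical):

#include "llvm/Support/Error.h"

// A failure value triggers a fatal error with the given message;
// otherwise the wrapped value is returned.
int unwrapOrAbort(llvm::Expected<int> ValOrErr) {
  return llvm::cantFail(std::move(ValOrErr), "unexpected failure");
}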
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
@ Add
Sum of integers.
DWARFExpression::Operation Op
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
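A sketch combining CloneBasicBlock (above) with remapInstructionsInBlocks (cloneAndRemap is a hypothetical helper): the clone initially refers to the original values, and the remap pass rewrites its operands through VMap:

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

// Clone BB into its parent function, then fix up operand references.
llvm::BasicBlock *cloneAndRemap(llvm::BasicBlock *BB) {
  llvm::ValueToValueMapTy VMap;
  llvm::BasicBlock *Copy =
      llvm::CloneBasicBlock(BB, VMap, ".clone", BB->getParent());
  llvm::remapInstructionsInBlocks({Copy}, VMap);
  return Copy;
}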
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
PointerUnion< const Value *, const PseudoSourceValue * > ValueType
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ Continue
Definition DWP.h:22
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
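For illustration of the half-open range (sumBelow is hypothetical):

#include "llvm/ADT/Sequence.h"

// seq(0, N) visits 0, 1, ..., N-1.
int sumBelow(int N) {
  int Sum = 0;
  for (int I : llvm::seq(0, N))
    Sum += I;
  return Sum;
}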
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks BBs from their parent function.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0,...
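For illustration (parseCount is hypothetical):

#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"

// Returns false on malformed input or overflow. Base 10 is forced
// here; Base 0 would instead auto-detect 0x/0b/0 prefixes.
bool parseCount(llvm::StringRef S, unsigned &Out) {
  return llvm::to_integer(S, Out, /*Base=*/10);
}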
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
static const Target * lookupTarget(StringRef TripleStr, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unrolling to any kind of loop (mainly loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...