//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
#include "llvm/Transforms/Utils/LoopPeel.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"

#include <cstdint>
#include <optional>
#include <stack>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
/// at position IP1 may change the meaning of IP2 or vice-versa. This is
/// because an InsertPoint stores the instruction before which something is
/// inserted. For instance, if both point to the same instruction, two
/// IRBuilders alternately creating instructions will cause the instructions
/// to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}
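
// Illustrative sketch (not part of the original file; names below are
// hypothetical): two builders whose saved insert points coincide conflict,
// because each insertion shifts what the other's InsertPoint refers to.
//
//   IRBuilder<> B1(SomeBB), B2(SomeBB);
//   B1.SetInsertPoint(SomeInst);
//   B2.SetInsertPoint(SomeInst);
//   assert(isConflictIP(B1.saveIP(), B2.saveIP())); // same block, same point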

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderdTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use, determined from schedule
/// clause arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // Monotonic is the default in the OpenMP runtime library, so there is
      // no need to set it.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result) &&
         "Invalid resulting schedule type");
  return Result;
}
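
// Worked example (illustrative, not part of the original file): for a clause
// like `schedule(dynamic, 4)` on a loop with no ordered clause and no
// monotonicity modifier, the composition above is:
//   getOpenMPBaseScheduleType     -> BaseDynamicChunked
//   getOpenMPOrderingScheduleType -> BaseDynamicChunked | ModifierUnordered
//                                    (== UnorderedDynamicChunked)
//   getOpenMPMonotonicityScheduleType
//     -> UnorderedDynamicChunked | ModifierNonmonotonic, since neither a
//        static kind nor an ordered clause forces the monotonic default.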

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch)
    BranchInst::Create(New, Old);
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}
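
// Usage sketch (illustrative, not part of the original file): splitting the
// current block at the builder's insert point, keeping emission in the old
// block while the remainder moves to a new ".split"-suffixed block:
//
//   BasicBlock *Cont =
//       splitBBWithSuffix(Builder, /*CreateBranch=*/true, ".split");
//   // Builder now points before the new unconditional branch in the old
//   // block; `Cont` holds everything that followed the insert point.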

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
static Value *createFakeIntVal(IRBuilder<> &Builder,
                               OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                               std::stack<Instruction *> &ToBeDeleted,
                               OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                               const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push(FakeVal);
  }

  // Generate a fake use of this value.
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push(UseFakeVal);
  return FakeVal;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace
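
// Illustrative note (not part of the original file): the flags compose as a
// bitmask, so a translation unit with both `requires unified_shared_memory`
// and `requires dynamic_allocators` carries
//   OMP_REQ_UNIFIED_SHARED_MEMORY | OMP_REQ_DYNAMIC_ALLOCATORS == 0x018,
// and individual clauses are tested by masking, e.g.
//   (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) != 0.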

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, 3));
  Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams, {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads, {0});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGGroupMem};
}

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                               \
  case Enum:                                                                  \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);        \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function.
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        // - The callback callee is argument number 2 (microtask).
        // - The first two arguments of the callback callee are unknown (-1).
        // - All variadic arguments to the runtime function are passed to the
        //   callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}
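
// Usage sketch (illustrative, not part of the original file): obtaining the
// declaration for __kmpc_barrier and emitting a call through it; `Ident` and
// `ThreadID` are assumed to have been created as shown elsewhere in this file.
//
//   FunctionCallee Barrier =
//       OMPBuilder.getOrCreateRuntimeFunction(M, OMPRTL___kmpc_barrier);
//   Builder.CreateCall(Barrier, {Ident, ThreadID});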

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  Instruction *MoveLocInst = EntryBlock.getFirstNonPHI();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast_if_present<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions
    // which are used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away; we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator())
          continue;

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->getNumUses() == 1);

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target region,
  // which may inject allocas that need to be moved to the entry block of our
  // target, or we risk malformed optimisations by later passes. This is only
  // relevant for the device pass, which appears to be a little more delicate
  // when it comes to optimisations (however, we do not block on that here;
  // it's up to the inserter to the list to do so).
  // This notably has to occur after the OutlinedInfo candidates have been
  // extracted, so we have an end product that will not be implicitly adversely
  // affected by any raises unless intentionally appended to the list.
  // NOTE: This only does so for ConstantData; it could be extended to
  // ConstantExprs with further effort, however, they should largely be folded
  // when they get here. Extending it to runtime defined/read+writeable
  // allocation sizes would be non-trivial (we would need to factor in movement
  // of any stores to variables the allocation size depends on, as well as the
  // usual loads, otherwise it'll yield the wrong result after movement) and
  // would likely be more suitable as an LLVM optimisation pass.
  for (Function *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);
}

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  Type *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalStringPtr(LocStr, /* Name */ "",
                                              /* AddressSpace */ 0, &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
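
// Illustrative example (not part of the original file): for a directive at
// line 10, column 5 of "foo.c" inside function "bar", the buffer built above
// encodes the location as:
//
//   ";foo.c;bar;10;5;;"
//
// which matches the default ";unknown;unknown;0;0;;" form used below when no
// debug information is available.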

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive DK,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;
  return emitBarrierImpl(Loc, DK, ForceSimpleCall, CheckCancelFlag);
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::emitBarrierImpl(const LocationDescription &Loc,
                                 Directive Kind, bool ForceSimpleCall,
                                 bool CheckCancelFlag) {
  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    emitCancelationCheckImpl(Result, OMPD_parallel);

  return Builder.saveIP();
}
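
// For a plain `#pragma omp barrier` this lowers to roughly (illustrative
// sketch of the emitted IR, not part of the original file):
//
//   %gtid = call i32 @__kmpc_global_thread_num(ptr @ident)
//   call i32 @__kmpc_barrier(ptr @ident.barrier, i32 %gtid)
//
// with @__kmpc_cancel_barrier substituted inside cancellable parallel
// regions; its i32 result then feeds the cancellation check emitted by
// emitCancelationCheckImpl below.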

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                      \
  case DirectiveEnum:                                                         \
    CancelKind = Builder.getInt32(Value);                                     \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                    omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
                    /* CheckCancelFlag */ false);
    }
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  emitCancelationCheckImpl(Result, CanceledDirective, ExitCB);

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  Builder.restoreIP(Loc.IP);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
    EmitFallbackCallbackTy emitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(Loc.IP);
  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(Builder, AllocaIP, Return, RTLoc, DeviceID,
                                     Args.NumTeams, Args.NumThreads,
                                     OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  Builder.restoreIP(emitTargetCallFallbackCB(Builder.saveIP()));
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}
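
// Generated control flow (illustrative sketch, in the block-diagram style
// used elsewhere in this file):
//
//   %ret = call i32 @__tgt_target_kernel(...)
//     |  %ret != 0 (offload failed)     |  %ret == 0 (offload succeeded)
//     V                                 |
//   omp_offload.failed                  |
//   (host fallback via                  |
//    emitTargetCallFallbackCB)          |
//     |                                 |
//     V                                 V
//             omp_offload.cont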

void OpenMPIRBuilder::emitCancelationCheckImpl(Value *CancelFlag,
                                               omp::Directive CanceledDirective,
                                               FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    ExitCB(Builder.saveIP());
  auto &FI = FinalizationStack.back();
  FI.FiniCB(Builder.saveIP());

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
}
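
// Resulting shape (illustrative sketch):
//
//   %flag = <cancellation flag, e.g. result of __kmpc_cancel[_barrier]>
//     |  %flag == 0              |  %flag != 0
//     V                          V
//   <BB>.cont                  <BB>.cncl   <- ExitCB and FiniCB run here,
//   (codegen continues)                       then branch to the region exit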

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the device.
// We need to use this callback to replace call to the OutlinedFn in OuterFn
// by the call to the OpenMP DeviceRTL runtime function (kmpc_parallel_51)
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add alloca for kernel args.
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add address space cast if array for storing arguments is not allocated
  // in address space 0.
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_51.
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build kmpc_parallel_51 call.
  Value *Parallel51CallArgs[] = {
      /* identifier */ Ident,
      /* global thread num */ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* Proc bind */ Builder.getInt32(-1),
      /* outlined function */
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr),
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
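
// The DeviceRTL entry invoked above has, judging from the argument list
// built here (illustrative sketch, not a verbatim declaration), the shape:
//
//   void __kmpc_parallel_51(ident_t *ident, i32 gtid, i32 if_expr,
//                           i32 num_threads, i32 proc_bind, ptr outlined_fn,
//                           ptr wrapper_fn, ptr args, i64 nargs);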

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the host.
// We need to use this callback to replace call to the OutlinedFn in OuterFn
// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if])
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      // - The callback callee is argument number 2 (microtask).
      // - The first two arguments of the callback callee are unknown (-1).
      // - All variadic arguments to the __kmpc_fork_call are passed to the
      //   callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {
      Ident, Builder.getInt32(NumCapturedVars),
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }
  if (IfCondition && RealArgs.back()->getType() != PtrTy)
    RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
1340
1342 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1343 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1344 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1345 omp::ProcBindKind ProcBind, bool IsCancellable) {
1346 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1347
1348 if (!updateToLocation(Loc))
1349 return Loc.IP;
1350
1351 uint32_t SrcLocStrSize;
1352 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1353 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1354 Value *ThreadID = getOrCreateThreadID(Ident);
1355 // If we generate code for the target device, we need to allocate
1356 // struct for aggregate params in the device default alloca address space.
1357 // OpenMP runtime requires that the params of the extracted functions are
1358 // passed as zero address space pointers. This flag ensures that extracted
1359 // function arguments are declared in zero address space
1360 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1361
1362 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1363 // only if we compile for host side.
1364 if (NumThreads && !Config.isTargetDevice()) {
1365 Value *Args[] = {
1366 Ident, ThreadID,
1367 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1369 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1370 }
1371
1372 if (ProcBind != OMP_PROC_BIND_default) {
1373 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1374 Value *Args[] = {
1375 Ident, ThreadID,
1376 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1378 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1379 }
1380
1381 BasicBlock *InsertBB = Builder.GetInsertBlock();
1382 Function *OuterFn = InsertBB->getParent();
1383
1384 // Save the outer alloca block because the insertion iterator may get
1385 // invalidated and we still need this later.
1386 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1387
1388 // Vector to remember instructions we used only during the modeling but which
1389 // we want to delete at the end.
1391
1392 // Change the location to the outer alloca insertion point to create and
1393 // initialize the allocas we pass into the parallel region.
1394 Builder.restoreIP(OuterAllocaIP);
1395 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1396 AllocaInst *ZeroAddrAlloca =
1397 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1398 Instruction *TIDAddr = TIDAddrAlloca;
1399 Instruction *ZeroAddr = ZeroAddrAlloca;
1400 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1401 // Add additional casts to enforce pointers in zero address space
1402 TIDAddr = new AddrSpaceCastInst(
1403 TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast");
1404 TIDAddr->insertAfter(TIDAddrAlloca);
1405 ToBeDeleted.push_back(TIDAddr);
1406 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1407 PointerType ::get(M.getContext(), 0),
1408 "zero.addr.ascast");
1409 ZeroAddr->insertAfter(ZeroAddrAlloca);
1410 ToBeDeleted.push_back(ZeroAddr);
1411 }
1412
1413 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1414 // associated arguments in the outlined function, so we delete them later.
1415 ToBeDeleted.push_back(TIDAddrAlloca);
1416 ToBeDeleted.push_back(ZeroAddrAlloca);
1417
1418 // Create an artificial insertion point that will also ensure the blocks we
1419 // are about to split are not degenerated.
1420 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1421
1422 BasicBlock *EntryBB = UI->getParent();
1423 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1424 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1425 BasicBlock *PRegPreFiniBB =
1426 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1427 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1428
1429 auto FiniCBWrapper = [&](InsertPointTy IP) {
1430 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1431 // target to the region exit block.
1432 if (IP.getBlock()->end() == IP.getPoint()) {
1434 Builder.restoreIP(IP);
1435 Instruction *I = Builder.CreateBr(PRegExitBB);
1436 IP = InsertPointTy(I->getParent(), I->getIterator());
1437 }
1438 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1439 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1440 "Unexpected insertion point for finalization call!");
1441 return FiniCB(IP);
1442 };
1443
1444 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1445
1446 // Generate the privatization allocas in the block that will become the entry
1447 // of the outlined function.
1448 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1449 InsertPointTy InnerAllocaIP = Builder.saveIP();
1450
1451 AllocaInst *PrivTIDAddr =
1452 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1453 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1454
1455 // Add some fake uses for OpenMP provided arguments.
1456 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1457 Instruction *ZeroAddrUse =
1458 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1459 ToBeDeleted.push_back(ZeroAddrUse);
1460
1461 // EntryBB
1462 // |
1463 // V
1464 // PRegionEntryBB <- Privatization allocas are placed here.
1465 // |
1466 // V
1467 // PRegionBodyBB <- BodeGen is invoked here.
1468 // |
1469 // V
1470 // PRegPreFiniBB <- The block we will start finalization from.
1471 // |
1472 // V
1473 // PRegionExitBB <- A common exit to simplify block collection.
1474 //
1475
1476 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1477
1478 // Let the caller create the body.
1479 assert(BodyGenCB && "Expected body generation callback!");
1480 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1481 BodyGenCB(InnerAllocaIP, CodeGenIP);
1482
1483 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1484
1485 OutlineInfo OI;
1486 if (Config.isTargetDevice()) {
1487 // Generate OpenMP target specific runtime call
1488 OI.PostOutlineCB = [=, ToBeDeletedVec =
1489 std::move(ToBeDeleted)](Function &OutlinedFn) {
1490 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1491 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1492 ThreadID, ToBeDeletedVec);
1493 };
1494 } else {
1495 // Generate OpenMP host runtime call
1496 OI.PostOutlineCB = [=, ToBeDeletedVec =
1497 std::move(ToBeDeleted)](Function &OutlinedFn) {
1498 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1499 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1500 };
1501 }
1502
1503 OI.OuterAllocaBB = OuterAllocaBlock;
1504 OI.EntryBB = PRegEntryBB;
1505 OI.ExitBB = PRegExitBB;
1506
1507 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1509 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1510
1511 // Ensure a single exit node for the outlined region by creating one.
1512 // We might have multiple incoming edges to the exit now due to finalizations,
1513 // e.g., cancel calls that cause the control flow to leave the region.
1514 BasicBlock *PRegOutlinedExitBB = PRegExitBB;
1515 PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt());
1516 PRegOutlinedExitBB->setName("omp.par.outlined.exit");
1517 Blocks.push_back(PRegOutlinedExitBB);
1518
1519 CodeExtractorAnalysisCache CEAC(*OuterFn);
1520 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1521 /* AggregateArgs */ false,
1522 /* BlockFrequencyInfo */ nullptr,
1523 /* BranchProbabilityInfo */ nullptr,
1524 /* AssumptionCache */ nullptr,
1525 /* AllowVarArgs */ true,
1526 /* AllowAlloca */ true,
1527 /* AllocationBlock */ OuterAllocaBlock,
1528 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1529
1530 // Find inputs to, outputs from the code region.
1531 BasicBlock *CommonExit = nullptr;
1532 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1533 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1534 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);
1535
1536 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1537
1538 FunctionCallee TIDRTLFn =
1539 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1540
1541 auto PrivHelper = [&](Value &V) {
1542 if (&V == TIDAddr || &V == ZeroAddr) {
1543 OI.ExcludeArgsFromAggregate.push_back(&V);
1544 return;
1545 }
1546
1548 for (Use &U : V.uses())
1549 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1550 if (ParallelRegionBlockSet.count(UserI->getParent()))
1551 Uses.insert(&U);
1552
1553 // __kmpc_fork_call expects extra arguments as pointers. If the input
1554 // already has a pointer type, everything is fine. Otherwise, store the
1555 // value onto stack and load it back inside the to-be-outlined region. This
1556 // will ensure only the pointer will be passed to the function.
1557 // FIXME: if there are more than 15 trailing arguments, they must be
1558 // additionally packed in a struct.
1559 Value *Inner = &V;
1560 if (!V.getType()->isPointerTy()) {
1562 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1563
1564 Builder.restoreIP(OuterAllocaIP);
1565 Value *Ptr =
1566 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1567
1568 // Store to stack at end of the block that currently branches to the entry
1569 // block of the to-be-outlined region.
1570 Builder.SetInsertPoint(InsertBB,
1571 InsertBB->getTerminator()->getIterator());
1572 Builder.CreateStore(&V, Ptr);
1573
1574 // Load back next to allocations in the to-be-outlined region.
1575 Builder.restoreIP(InnerAllocaIP);
1576 Inner = Builder.CreateLoad(V.getType(), Ptr);
1577 }
1578
1579 Value *ReplacementValue = nullptr;
1580 CallInst *CI = dyn_cast<CallInst>(&V);
1581 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1582 ReplacementValue = PrivTID;
1583 } else {
1585 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue));
1586 assert(ReplacementValue &&
1587 "Expected copy/create callback to set replacement value!");
1588 if (ReplacementValue == &V)
1589 return;
1590 }
1591
1592 for (Use *UPtr : Uses)
1593 UPtr->set(ReplacementValue);
1594 };
1595
1596 // Reset the inner alloca insertion as it will be used for loading the values
1597 // wrapped into pointers before passing them into the to-be-outlined region.
1598 // Configure it to insert immediately after the fake use of zero address so
1599 // that they are available in the generated body and so that the
1600 // OpenMP-related values (thread ID and zero address pointers) remain leading
1601 // in the argument list.
1602 InnerAllocaIP = IRBuilder<>::InsertPoint(
1603 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1604
1605 // Reset the outer alloca insertion point to the entry of the relevant block
1606 // in case it was invalidated.
1607 OuterAllocaIP = IRBuilder<>::InsertPoint(
1608 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1609
1610 for (Value *Input : Inputs) {
1611 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1612 PrivHelper(*Input);
1613 }
1614 LLVM_DEBUG({
1615 for (Value *Output : Outputs)
1616 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1617 });
1618 assert(Outputs.empty() &&
1619 "OpenMP outlining should not produce live-out values!");
1620
1621 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1622 LLVM_DEBUG({
1623 for (auto *BB : Blocks)
1624 dbgs() << " PBR: " << BB->getName() << "\n";
1625 });
1626
1627 // Adjust the finalization stack, verify the adjustment, and call the
1628 // finalize function one last time to finalize values between the pre-fini
1629 // block and the exit block if we left the parallel region "the normal way".
1630 auto FiniInfo = FinalizationStack.pop_back_val();
1631 (void)FiniInfo;
1632 assert(FiniInfo.DK == OMPD_parallel &&
1633 "Unexpected finalization stack state!");
1634
1635 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1636
1637 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1638 FiniCB(PreFiniIP);
1639
1640 // Register the outlined info.
1641 addOutlineInfo(std::move(OI));
1642
1643 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1644 UI->eraseFromParent();
1645
1646 return AfterIP;
1647}
1648
1649void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1650 // Build call void __kmpc_flush(ident_t *loc)
1651 uint32_t SrcLocStrSize;
1652 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1653 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1654
1655 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1656}
1657
1658void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1659 if (!updateToLocation(Loc))
1660 return;
1661 emitFlush(Loc);
1662}
1663
1664void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1665 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1666 // global_tid);
1667 uint32_t SrcLocStrSize;
1668 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1669 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1670 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1671
1672 // Ignore return result until untied tasks are supported.
1673 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1674 Args);
1675}
1676
1677void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1678 if (!updateToLocation(Loc))
1679 return;
1680 emitTaskwaitImpl(Loc);
1681}
1682
1683void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1684 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1685 uint32_t SrcLocStrSize;
1686 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1687 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1688 Constant *I32Null = ConstantInt::getNullValue(Int32);
1689 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1690
1691 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1692 Args);
1693}
1694
1695void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1696 if (!updateToLocation(Loc))
1697 return;
1698 emitTaskyieldImpl(Loc);
1699}
1700
1701OpenMPIRBuilder::InsertPointTy
1702OpenMPIRBuilder::createTask(const LocationDescription &Loc,
1703 InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB,
1704 bool Tied, Value *Final, Value *IfCondition,
1705 SmallVector<DependData> Dependencies) {
1706
1707 if (!updateToLocation(Loc))
1708 return InsertPointTy();
1709
1710 uint32_t SrcLocStrSize;
1711 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1712 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1713 // The current basic block is split into four basic blocks. After outlining,
1714 // they will be mapped as follows:
1715 // ```
1716 // def current_fn() {
1717 // current_basic_block:
1718 // br label %task.exit
1719 // task.exit:
1720 // ; instructions after task
1721 // }
1722 // def outlined_fn() {
1723 // task.alloca:
1724 // br label %task.body
1725 // task.body:
1726 // ret void
1727 // }
1728 // ```
1729 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1730 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1731 BasicBlock *TaskAllocaBB =
1732 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1733
1734 InsertPointTy TaskAllocaIP =
1735 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1736 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1737 BodyGenCB(TaskAllocaIP, TaskBodyIP);
1738
1739 OutlineInfo OI;
1740 OI.EntryBB = TaskAllocaBB;
1741 OI.OuterAllocaBB = AllocaIP.getBlock();
1742 OI.ExitBB = TaskExitBB;
1743
1744 // Add the thread ID argument.
1745 std::stack<Instruction *> ToBeDeleted;
1746 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1747 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1748
1749 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1750 TaskAllocaBB, ToBeDeleted](Function &OutlinedFn) mutable {
1751 // Replace the stale call instruction with an appropriate RTL function call.
1752 assert(OutlinedFn.getNumUses() == 1 &&
1753 "there must be a single user for the outlined function");
1754 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1755
1756 // HasShareds is true if any variables are captured in the outlined region,
1757 // false otherwise.
1758 bool HasShareds = StaleCI->arg_size() > 1;
1759 Builder.SetInsertPoint(StaleCI);
1760
1761 // Gather the arguments for emitting the runtime call for
1762 // @__kmpc_omp_task_alloc
1763 Function *TaskAllocFn =
1764 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
1765
1766 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the task allocation
1767 // call.
1768 Value *ThreadID = getOrCreateThreadID(Ident);
1769
1770 // Argument - `flags`
1771 // Task is tied iff (Flags & 1) == 1.
1772 // Task is untied iff (Flags & 1) == 0.
1773 // Task is final iff (Flags & 2) == 2.
1774 // Task is not final iff (Flags & 2) == 0.
1775 // TODO: Handle the other flags.
1776 Value *Flags = Builder.getInt32(Tied);
1777 if (Final) {
1778 Value *FinalFlag =
1779 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
1780 Flags = Builder.CreateOr(FinalFlag, Flags);
1781 }
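 // For example, a tied task with a `final` clause that evaluates to true ends
 // up with Flags == 3 (tied bit | final bit), while an untied task without
 // `final` keeps Flags == 0.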
1782
1783 // Argument - `sizeof_kmp_task_t` (TaskSize)
1784 // TaskSize refers to the size in bytes of the kmp_task_t data structure,
1785 // including private vars accessed in the task.
1786 // TODO: add kmp_task_t_with_privates (privates)
1787 Value *TaskSize = Builder.getInt64(
1788 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
1789
1790 // Argument - `sizeof_shareds` (SharedsSize)
1791 // SharedsSize refers to the shareds array size in the kmp_task_t data
1792 // structure.
1793 Value *SharedsSize = Builder.getInt64(0);
1794 if (HasShareds) {
1795 AllocaInst *ArgStructAlloca =
1796 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
1797 assert(ArgStructAlloca &&
1798 "Unable to find the alloca instruction corresponding to arguments "
1799 "for extracted function");
1800 StructType *ArgStructType =
1801 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
1802 assert(ArgStructType && "Unable to find struct type corresponding to "
1803 "arguments for extracted function");
1804 SharedsSize =
1805 M.getDataLayout().getTypeStoreSize(ArgStructType);
1806 }
1807 // Emit the @__kmpc_omp_task_alloc runtime call
1808 // The runtime call returns a pointer to an area where the task captured
1809 // variables must be copied before the task is run (TaskData)
1810 CallInst *TaskData = Builder.CreateCall(
1811 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
1812 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
1813 /*task_func=*/&OutlinedFn});
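 // The emitted allocation looks roughly like this (a sketch; the exact
 // operand types depend on the target data layout):
 // ```
 //   %taskdata = call ptr @__kmpc_omp_task_alloc(ptr @loc, i32 %gtid,
 //                 i32 %flags, i64 %task_size, i64 %shareds_size,
 //                 ptr @outlined_fn)
 // ```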
1814
1815 // Copy the arguments for outlined function
1816 if (HasShareds) {
1817 Value *Shareds = StaleCI->getArgOperand(1);
1818 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
1819 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
1820 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
1821 SharedsSize);
1822 }
1823
1824 Value *DepArray = nullptr;
1825 if (Dependencies.size()) {
1826 InsertPointTy OldIP = Builder.saveIP();
1827 Builder.SetInsertPoint(
1828 &OldIP.getBlock()->getParent()->getEntryBlock().back());
1829
1830 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1831 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1832
1833 unsigned P = 0;
1834 for (const DependData &Dep : Dependencies) {
1835 Value *Base =
1836 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
1837 // Store the pointer to the variable
1838 Value *Addr = Builder.CreateStructGEP(
1839 DependInfo, Base,
1840 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1841 Value *DepValPtr =
1842 Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1843 Builder.CreateStore(DepValPtr, Addr);
1844 // Store the size of the variable
1845 Value *Size = Builder.CreateStructGEP(
1846 DependInfo, Base,
1847 static_cast<unsigned int>(RTLDependInfoFields::Len));
1848 Builder.CreateStore(Builder.getInt64(M.getDataLayout().getTypeStoreSize(
1849 Dep.DepValueType)),
1850 Size);
1851 // Store the dependency kind
1852 Value *Flags = Builder.CreateStructGEP(
1853 DependInfo, Base,
1854 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1855 Builder.CreateStore(
1856 ConstantInt::get(Builder.getInt8Ty(),
1857 static_cast<unsigned int>(Dep.DepKind)),
1858 Flags);
1859 ++P;
1860 }
1861
1862 Builder.restoreIP(OldIP);
1863 }
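 // Each entry written above follows the kmp_depend_info layout (a sketch):
 //   { i64 base_addr   ; ptrtoint of the dependence variable
 //     i64 len         ; store size of its value type in bytes
 //     i8  flags }     ; dependence kind (in, out, inout, ...)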
1864
1865 // In the presence of the `if` clause, the following IR is generated:
1866 // ...
1867 // %data = call @__kmpc_omp_task_alloc(...)
1868 // br i1 %if_condition, label %then, label %else
1869 // then:
1870 // call @__kmpc_omp_task(...)
1871 // br label %exit
1872 // else:
1873 // ;; Wait for resolution of dependencies, if any, before
1874 // ;; beginning the task
1875 // call @__kmpc_omp_wait_deps(...)
1876 // call @__kmpc_omp_task_begin_if0(...)
1877 // call @outlined_fn(...)
1878 // call @__kmpc_omp_task_complete_if0(...)
1879 // br label %exit
1880 // exit:
1881 // ...
1882 if (IfCondition) {
1883 // `SplitBlockAndInsertIfThenElse` requires the block to have a
1884 // terminator.
1885 splitBB(Builder, /*CreateBranch=*/true, "if.end");
1886 Instruction *IfTerminator =
1887 Builder.GetInsertPoint()->getParent()->getTerminator();
1888 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
1889 Builder.SetInsertPoint(IfTerminator);
1890 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
1891 &ElseTI);
1892 Builder.SetInsertPoint(ElseTI);
1893
1894 if (Dependencies.size()) {
1895 Function *TaskWaitFn =
1896 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
1897 Builder.CreateCall(
1898 TaskWaitFn,
1899 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
1900 ConstantInt::get(Builder.getInt32Ty(), 0),
1901 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
1902 }
1903 Function *TaskBeginFn =
1904 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
1905 Function *TaskCompleteFn =
1906 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
1907 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
1908 CallInst *CI = nullptr;
1909 if (HasShareds)
1910 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
1911 else
1912 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
1913 CI->setDebugLoc(StaleCI->getDebugLoc());
1914 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
1915 Builder.SetInsertPoint(ThenTI);
1916 }
1917
1918 if (Dependencies.size()) {
1919 Function *TaskFn =
1920 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
1921 Builder.CreateCall(
1922 TaskFn,
1923 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
1924 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
1925 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
1926
1927 } else {
1928 // Emit the @__kmpc_omp_task runtime call to spawn the task
1929 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
1930 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
1931 }
1932
1933 StaleCI->eraseFromParent();
1934
1935 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
1936 if (HasShareds) {
1937 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
1938 OutlinedFn.getArg(1)->replaceUsesWithIf(
1939 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
1940 }
1941
1942 while (!ToBeDeleted.empty()) {
1943 ToBeDeleted.top()->eraseFromParent();
1944 ToBeDeleted.pop();
1945 }
1946 };
1947
1948 addOutlineInfo(std::move(OI));
1949 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
1950
1951 return Builder.saveIP();
1952}
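// A minimal usage sketch (assuming an initialized OpenMPIRBuilder `OMPBuilder`
// with valid location and alloca insertion points; not an excerpt from a test):
// ```
//   auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
//     // emit the task body at CodeGenIP
//   };
//   Builder.restoreIP(OMPBuilder.createTask(Loc, AllocaIP, BodyGenCB,
//                                           /*Tied=*/true));
// ```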
1953
1954OpenMPIRBuilder::InsertPointTy
1955OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
1956 InsertPointTy AllocaIP,
1957 BodyGenCallbackTy BodyGenCB) {
1958 if (!updateToLocation(Loc))
1959 return InsertPointTy();
1960
1961 uint32_t SrcLocStrSize;
1962 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1963 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1964 Value *ThreadID = getOrCreateThreadID(Ident);
1965
1966 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
1967 Function *TaskgroupFn =
1968 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
1969 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
1970
1971 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
1972 BodyGenCB(AllocaIP, Builder.saveIP());
1973
1974 Builder.SetInsertPoint(TaskgroupExitBB);
1975 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
1976 Function *EndTaskgroupFn =
1977 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
1978 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
1979
1980 return Builder.saveIP();
1981}
1982
1983OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSections(
1984 const LocationDescription &Loc, InsertPointTy AllocaIP,
1985 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
1986 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
1987 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
1988
1989 if (!updateToLocation(Loc))
1990 return Loc.IP;
1991
1992 auto FiniCBWrapper = [&](InsertPointTy IP) {
1993 if (IP.getBlock()->end() != IP.getPoint())
1994 return FiniCB(IP);
1995 // This must be done; otherwise, any nested constructs using FinalizeOMPRegion
1996 // will fail because that function requires the finalization basic block to
1997 // have a terminator, which was already removed by EmitOMPRegionBody.
1998 // IP is currently at the cancellation block.
1999 // We need to backtrack to the condition block to fetch
2000 // the exit block and create a branch from the cancellation
2001 // block to the exit block.
2002 IRBuilder<>::InsertPointGuard IPG(Builder);
2003 Builder.restoreIP(IP);
2004 auto *CaseBB = IP.getBlock()->getSinglePredecessor();
2005 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2006 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2007 Instruction *I = Builder.CreateBr(ExitBB);
2008 IP = InsertPointTy(I->getParent(), I->getIterator());
2009 return FiniCB(IP);
2010 };
2011
2012 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2013
2014 // Each section is emitted as a switch case.
2015 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2016 // -> OMP.createSection(), which generates the IR for each section.
2017 // Iterate through all sections and emit a switch construct:
2018 // switch (IV) {
2019 // case 0:
2020 // <SectionStmt[0]>;
2021 // break;
2022 // ...
2023 // case <NumSection> - 1:
2024 // <SectionStmt[<NumSection> - 1]>;
2025 // break;
2026 // }
2027 // ...
2028 // section_loop.after:
2029 // <FiniCB>;
2030 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) {
2031 Builder.restoreIP(CodeGenIP);
2032 BasicBlock *Continue =
2033 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2034 Function *CurFn = Continue->getParent();
2035 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2036
2037 unsigned CaseNumber = 0;
2038 for (auto SectionCB : SectionCBs) {
2039 BasicBlock *CaseBB = BasicBlock::Create(
2040 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2041 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2042 Builder.SetInsertPoint(CaseBB);
2043 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2044 SectionCB(InsertPointTy(),
2045 {CaseEndBr->getParent(), CaseEndBr->getIterator()});
2046 CaseNumber++;
2047 }
2048 // Remove the existing terminator from the body BB since there can be no
2049 // terminators after a switch/case.
2050 };
2051 // Loop body ends here.
2052 // LowerBound, UpperBound, and Stride for createCanonicalLoop.
2053 Type *I32Ty = Type::getInt32Ty(M.getContext());
2054 Value *LB = ConstantInt::get(I32Ty, 0);
2055 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2056 Value *ST = ConstantInt::get(I32Ty, 1);
2057 llvm::CanonicalLoopInfo *LoopInfo = createCanonicalLoop(
2058 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2059 InsertPointTy AfterIP =
2060 applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, !IsNowait);
2061
2062 // Apply the finalization callback in LoopAfterBB
2063 auto FiniInfo = FinalizationStack.pop_back_val();
2064 assert(FiniInfo.DK == OMPD_sections &&
2065 "Unexpected finalization stack state!");
2066 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2067 Builder.restoreIP(AfterIP);
2068 BasicBlock *FiniBB =
2069 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2070 CB(Builder.saveIP());
2071 AfterIP = {FiniBB, FiniBB->begin()};
2072 }
2073
2074 return AfterIP;
2075}
2076
2077OpenMPIRBuilder::InsertPointTy
2078OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2079 BodyGenCallbackTy BodyGenCB,
2080 FinalizeCallbackTy FiniCB) {
2081 if (!updateToLocation(Loc))
2082 return Loc.IP;
2083
2084 auto FiniCBWrapper = [&](InsertPointTy IP) {
2085 if (IP.getBlock()->end() != IP.getPoint())
2086 return FiniCB(IP);
2087 // This must be done; otherwise, any nested constructs using FinalizeOMPRegion
2088 // will fail because that function requires the finalization basic block to
2089 // have a terminator, which was already removed by EmitOMPRegionBody.
2090 // IP is currently at the cancellation block.
2091 // We need to backtrack to the condition block to fetch
2092 // the exit block and create a branch from the cancellation
2093 // block to the exit block.
2094 IRBuilder<>::InsertPointGuard IPG(Builder);
2095 Builder.restoreIP(IP);
2096 auto *CaseBB = Loc.IP.getBlock();
2097 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2098 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2099 Instruction *I = Builder.CreateBr(ExitBB);
2100 IP = InsertPointTy(I->getParent(), I->getIterator());
2101 return FiniCB(IP);
2102 };
2103
2104 Directive OMPD = Directive::OMPD_sections;
2105 // Since we are using Finalization Callback here, HasFinalize
2106 // and IsCancellable have to be true
2107 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2108 /*Conditional*/ false, /*hasFinalize*/ true,
2109 /*IsCancellable*/ true);
2110}
2111
2112/// Create a function with a unique name and a "void (i8*, i8*)" signature in
2113/// the given module and return it.
2114static Function *getFreshReductionFunc(Module &M) {
2115 Type *VoidTy = Type::getVoidTy(M.getContext());
2116 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
2117 auto *FuncTy =
2118 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
2119 return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2120 M.getDataLayout().getDefaultGlobalsAddressSpace(),
2121 ".omp.reduction.func", &M);
2122}
2123
2124OpenMPIRBuilder::InsertPointTy
2125OpenMPIRBuilder::createReductions(const LocationDescription &Loc,
2126 InsertPointTy AllocaIP,
2127 ArrayRef<ReductionInfo> ReductionInfos,
2128 ArrayRef<bool> IsByRef, bool IsNoWait) {
2129 assert(ReductionInfos.size() == IsByRef.size());
2130 for (const ReductionInfo &RI : ReductionInfos) {
2131 (void)RI;
2132 assert(RI.Variable && "expected non-null variable");
2133 assert(RI.PrivateVariable && "expected non-null private variable");
2134 assert(RI.ReductionGen && "expected non-null reduction generator callback");
2135 assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
2136 "expected variables and their private equivalents to have the same "
2137 "type");
2138 assert(RI.Variable->getType()->isPointerTy() &&
2139 "expected variables to be pointers");
2140 }
2141
2142 if (!updateToLocation(Loc))
2143 return InsertPointTy();
2144
2145 BasicBlock *InsertBlock = Loc.IP.getBlock();
2146 BasicBlock *ContinuationBlock =
2147 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
2148 InsertBlock->getTerminator()->eraseFromParent();
2149
2150 // Create and populate array of type-erased pointers to private reduction
2151 // values.
2152 unsigned NumReductions = ReductionInfos.size();
2153 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
2154 Builder.restoreIP(AllocaIP);
2155 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
2156
2157 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
2158
2159 for (auto En : enumerate(ReductionInfos)) {
2160 unsigned Index = En.index();
2161 const ReductionInfo &RI = En.value();
2162 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
2163 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
2164 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
2165 }
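 // After this loop, red.array holds one type-erased pointer per reduction;
 // for two reductions the IR is roughly (a sketch):
 // ```
 //   %red.array = alloca [2 x ptr]
 //   store ptr %priv.a, ptr %red.array.elem.0
 //   store ptr %priv.b, ptr %red.array.elem.1
 // ```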
2166
2167 // Emit a call to the runtime function that orchestrates the reduction.
2168 // Declare the reduction function in the process.
2169 Function *Func = Builder.GetInsertBlock()->getParent();
2170 Module *Module = Func->getParent();
2171 uint32_t SrcLocStrSize;
2172 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2173 bool CanGenerateAtomic =
2174 llvm::all_of(ReductionInfos, [](const ReductionInfo &RI) {
2175 return RI.AtomicReductionGen;
2176 });
2177 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
2178 CanGenerateAtomic
2179 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
2180 : IdentFlag(0));
2181 Value *ThreadId = getOrCreateThreadID(Ident);
2182 Constant *NumVariables = Builder.getInt32(NumReductions);
2183 const DataLayout &DL = Module->getDataLayout();
2184 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
2185 Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
2186 Function *ReductionFunc = getFreshReductionFunc(*Module);
2187 Value *Lock = getOMPCriticalRegionLock(".reduction");
2188 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
2189 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
2190 : RuntimeFunction::OMPRTL___kmpc_reduce);
2191 CallInst *ReduceCall =
2192 Builder.CreateCall(ReduceFunc,
2193 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
2194 ReductionFunc, Lock},
2195 "reduce");
2196
2197 // Create final reduction entry blocks for the atomic and non-atomic case.
2198 // Emit IR that dispatches control flow to one of the blocks based on the
2199 // reduction supporting the atomic mode.
2200 BasicBlock *NonAtomicRedBlock =
2201 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
2202 BasicBlock *AtomicRedBlock =
2203 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
2204 SwitchInst *Switch =
2205 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
2206 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
2207 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
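 // The dispatch emitted here looks roughly like this (a sketch):
 // ```
 //   %reduce = call i32 @__kmpc_reduce(...)
 //   switch i32 %reduce, label %reduce.finalize [
 //     i32 1, label %reduce.switch.nonatomic
 //     i32 2, label %reduce.switch.atomic
 //   ]
 // ```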
2208
2209 // Populate the non-atomic reduction using the elementwise reduction function.
2210 // This loads the elements from the global and private variables and reduces
2211 // them before storing back the result to the global variable.
2212 Builder.SetInsertPoint(NonAtomicRedBlock);
2213 for (auto En : enumerate(ReductionInfos)) {
2214 const ReductionInfo &RI = En.value();
2215 Type *ValueType = RI.ElementType;
2216 // We have one less load for the by-ref case because that load is now
2217 // inside the reduction region.
2218 Value *RedValue = nullptr;
2219 if (!IsByRef[En.index()]) {
2220 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
2221 "red.value." + Twine(En.index()));
2222 }
2223 Value *PrivateRedValue =
2224 Builder.CreateLoad(ValueType, RI.PrivateVariable,
2225 "red.private.value." + Twine(En.index()));
2226 Value *Reduced;
2227 if (IsByRef[En.index()]) {
2228 Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), RI.Variable,
2229 PrivateRedValue, Reduced));
2230 } else {
2231 Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), RedValue,
2232 PrivateRedValue, Reduced));
2233 }
2234 if (!Builder.GetInsertBlock())
2235 return InsertPointTy();
2236 // For the by-ref case, the load is inside the reduction region.
2237 if (!IsByRef[En.index()])
2238 Builder.CreateStore(Reduced, RI.Variable);
2239 }
2240 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
2241 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
2242 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
2243 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
2244 Builder.CreateBr(ContinuationBlock);
2245
2246 // Populate the atomic reduction using the atomic elementwise reduction
2247 // function. There are no loads/stores here because they will be happening
2248 // inside the atomic elementwise reduction.
2249 Builder.SetInsertPoint(AtomicRedBlock);
2250 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
2251 for (const ReductionInfo &RI : ReductionInfos) {
2252 Builder.restoreIP(RI.AtomicReductionGen(Builder.saveIP(), RI.ElementType,
2253 RI.Variable, RI.PrivateVariable));
2254 if (!Builder.GetInsertBlock())
2255 return InsertPointTy();
2256 }
2257 Builder.CreateBr(ContinuationBlock);
2258 } else {
2259 Builder.CreateUnreachable();
2260 }
2261
2262 // Populate the outlined reduction function using the elementwise reduction
2263 // function. Partial values are extracted from the type-erased array of
2264 // pointers to private variables.
2265 BasicBlock *ReductionFuncBlock =
2266 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
2267 Builder.SetInsertPoint(ReductionFuncBlock);
2268 Value *LHSArrayPtr = ReductionFunc->getArg(0);
2269 Value *RHSArrayPtr = ReductionFunc->getArg(1);
2270
2271 for (auto En : enumerate(ReductionInfos)) {
2272 const ReductionInfo &RI = En.value();
2273 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
2274 RedArrayTy, LHSArrayPtr, 0, En.index());
2275 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
2276 Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
2277 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
2278 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
2279 RedArrayTy, RHSArrayPtr, 0, En.index());
2280 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
2281 Value *RHSPtr =
2282 Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
2283 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
2284 Value *Reduced;
2285 Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
2286 if (!Builder.GetInsertBlock())
2287 return InsertPointTy();
2288 // The store is inside the reduction region when using by-ref.
2289 if (!IsByRef[En.index()])
2290 Builder.CreateStore(Reduced, LHSPtr);
2291 }
2292 Builder.CreateRetVoid();
2293
2294 Builder.SetInsertPoint(ContinuationBlock);
2295 return Builder.saveIP();
2296}
2297
2298OpenMPIRBuilder::InsertPointTy
2299OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
2300 BodyGenCallbackTy BodyGenCB,
2301 FinalizeCallbackTy FiniCB) {
2302
2303 if (!updateToLocation(Loc))
2304 return Loc.IP;
2305
2306 Directive OMPD = Directive::OMPD_master;
2307 uint32_t SrcLocStrSize;
2308 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2309 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2310 Value *ThreadId = getOrCreateThreadID(Ident);
2311 Value *Args[] = {Ident, ThreadId};
2312
2313 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
2314 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
2315
2316 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
2317 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
2318
2319 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
2320 /*Conditional*/ true, /*hasFinalize*/ true);
2321}
2322
2323OpenMPIRBuilder::InsertPointTy
2324OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
2325 BodyGenCallbackTy BodyGenCB,
2326 FinalizeCallbackTy FiniCB, Value *Filter) {
2327 if (!updateToLocation(Loc))
2328 return Loc.IP;
2329
2330 Directive OMPD = Directive::OMPD_masked;
2331 uint32_t SrcLocStrSize;
2332 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2333 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2334 Value *ThreadId = getOrCreateThreadID(Ident);
2335 Value *Args[] = {Ident, ThreadId, Filter};
2336 Value *ArgsEnd[] = {Ident, ThreadId};
2337
2338 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
2339 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
2340
2341 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
2342 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
2343
2344 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
2345 /*Conditional*/ true, /*hasFinalize*/ true);
2346}
2347
2348CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
2349 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
2350 BasicBlock *PostInsertBefore, const Twine &Name) {
2351 Module *M = F->getParent();
2352 LLVMContext &Ctx = M->getContext();
2353 Type *IndVarTy = TripCount->getType();
2354
2355 // Create the basic block structure.
2356 BasicBlock *Preheader =
2357 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
2358 BasicBlock *Header =
2359 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
2360 BasicBlock *Cond =
2361 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
2362 BasicBlock *Body =
2363 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
2364 BasicBlock *Latch =
2365 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
2366 BasicBlock *Exit =
2367 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
2368 BasicBlock *After =
2369 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
2370
2371 // Use specified DebugLoc for new instructions.
2372 Builder.SetCurrentDebugLocation(DL);
2373
2374 Builder.SetInsertPoint(Preheader);
2375 Builder.CreateBr(Header);
2376
2377 Builder.SetInsertPoint(Header);
2378 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
2379 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
2380 Builder.CreateBr(Cond);
2381
2382 Builder.SetInsertPoint(Cond);
2383 Value *Cmp =
2384 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
2385 Builder.CreateCondBr(Cmp, Body, Exit);
2386
2387 Builder.SetInsertPoint(Body);
2388 Builder.CreateBr(Latch);
2389
2390 Builder.SetInsertPoint(Latch);
2391 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
2392 "omp_" + Name + ".next", /*HasNUW=*/true);
2393 Builder.CreateBr(Header);
2394 IndVarPHI->addIncoming(Next, Latch);
2395
2396 Builder.SetInsertPoint(Exit);
2397 Builder.CreateBr(After);
2398
2399 // Remember and return the canonical control flow.
2400 LoopInfos.emplace_front();
2401 CanonicalLoopInfo *CL = &LoopInfos.front();
2402
2403 CL->Header = Header;
2404 CL->Cond = Cond;
2405 CL->Latch = Latch;
2406 CL->Exit = Exit;
2407
2408#ifndef NDEBUG
2409 CL->assertOK();
2410#endif
2411 return CL;
2412}
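// The skeleton built above has the canonical shape (a sketch of the CFG):
//
//   preheader -> header -> cond --(iv < tripcount)--> body -> inc -> header
//                            \
//                             `--(otherwise)--> exit -> after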
2413
2414CanonicalLoopInfo *
2415OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
2416 LoopBodyGenCallbackTy BodyGenCB,
2417 Value *TripCount, const Twine &Name) {
2418 BasicBlock *BB = Loc.IP.getBlock();
2419 BasicBlock *NextBB = BB->getNextNode();
2420
2421 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
2422 NextBB, NextBB, Name);
2423 BasicBlock *After = CL->getAfter();
2424
2425 // If location is not set, don't connect the loop.
2426 if (updateToLocation(Loc)) {
2427 // Split the loop at the insertion point: Branch to the preheader and move
2428 // every following instruction to after the loop (the After BB). Also, the
2429 // new successor is the loop's after block.
2430 spliceBB(Builder, After, /*CreateBranch=*/false);
2431 Builder.CreateBr(CL->getPreheader());
2432 }
2433
2434 // Emit the body content. We do it after connecting the loop to the CFG to
2435 // avoid that the callback encounters degenerate BBs.
2436 BodyGenCB(CL->getBodyIP(), CL->getIndVar());
2437
2438#ifndef NDEBUG
2439 CL->assertOK();
2440#endif
2441 return CL;
2442}
2443
2444CanonicalLoopInfo *OpenMPIRBuilder::createCanonicalLoop(
2445 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
2446 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
2447 InsertPointTy ComputeIP, const Twine &Name) {
2448
2449 // Consider the following difficulties (assuming 8-bit signed integers):
2450 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
2451 // DO I = 1, 100, 50
2452 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
2453 // DO I = 100, 0, -128
2454
2455 // Start, Stop and Step must be of the same integer type.
2456 auto *IndVarTy = cast<IntegerType>(Start->getType());
2457 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
2458 assert(IndVarTy == Step->getType() && "Step type mismatch");
2459
2460 LocationDescription ComputeLoc =
2461 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
2462 updateToLocation(ComputeLoc);
2463
2464 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
2465 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
2466
2467 // Like Step, but always positive.
2468 Value *Incr = Step;
2469
2470 // Distance between Start and Stop; always positive.
2471 Value *Span;
2472
2473 // Condition whether no iterations are executed at all, e.g. because
2474 // UB < LB.
2475 Value *ZeroCmp;
2476
2477 if (IsSigned) {
2478 // Ensure that increment is positive. If not, negate and invert LB and UB.
2479 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
2480 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
2481 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
2482 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
2483 Span = Builder.CreateSub(UB, LB, "", false, true);
2484 ZeroCmp = Builder.CreateICmp(
2485 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
2486 } else {
2487 Span = Builder.CreateSub(Stop, Start, "", true);
2488 ZeroCmp = Builder.CreateICmp(
2489 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
2490 }
2491
2492 Value *CountIfLooping;
2493 if (InclusiveStop) {
2494 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
2495 } else {
2496 // Avoid incrementing past stop since it could overflow.
2497 Value *CountIfTwo = Builder.CreateAdd(
2498 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
2499 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
2500 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
2501 }
2502 Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
2503 "omp_" + Name + ".tripcount");
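 // Worked example (unsigned, exclusive stop): Start=1, Stop=100, Step=30
 // gives Span=99 and Incr=30, so CountIfTwo = (99-1)/30 + 1 = 4 and the
 // trip count is 4, covering I = 1, 31, 61, 91.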
2504
2505 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
2506 Builder.restoreIP(CodeGenIP);
2507 Value *Span = Builder.CreateMul(IV, Step);
2508 Value *IndVar = Builder.CreateAdd(Span, Start);
2509 BodyGenCB(Builder.saveIP(), IndVar);
2510 };
2511 LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
2512 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
2513}
2514
2515// Returns an LLVM function to call for initializing loop bounds using OpenMP
2516// static scheduling depending on `type`. Only i32 and i64 are supported by the
2517// runtime. Always interpret integers as unsigned similarly to
2518// CanonicalLoopInfo.
2519static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
2520 OpenMPIRBuilder &OMPBuilder) {
2521 unsigned Bitwidth = Ty->getIntegerBitWidth();
2522 if (Bitwidth == 32)
2523 return OMPBuilder.getOrCreateRuntimeFunction(
2524 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
2525 if (Bitwidth == 64)
2526 return OMPBuilder.getOrCreateRuntimeFunction(
2527 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
2528 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
2529}
2530
2531OpenMPIRBuilder::InsertPointTy
2532OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
2533 InsertPointTy AllocaIP,
2534 bool NeedsBarrier) {
2535 assert(CLI->isValid() && "Requires a valid canonical loop");
2536 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
2537 "Require dedicated allocate IP");
2538
2539 // Set up the source location value for OpenMP runtime.
2540 Builder.restoreIP(CLI->getPreheaderIP());
2541 Builder.SetCurrentDebugLocation(DL);
2542
2543 uint32_t SrcLocStrSize;
2544 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
2545 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2546
2547 // Declare useful OpenMP runtime functions.
2548 Value *IV = CLI->getIndVar();
2549 Type *IVTy = IV->getType();
2550 FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
2551 FunctionCallee StaticFini =
2552 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
2553
2554 // Allocate space for computed loop bounds as expected by the "init" function.
2555 Builder.restoreIP(AllocaIP);
2556 Type *I32Type = Type::getInt32Ty(M.getContext());
2557 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
2558 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
2559 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
2560 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
2561
2562 // At the end of the preheader, prepare for calling the "init" function by
2563 // storing the current loop bounds into the allocated space. A canonical loop
2564 // always iterates from 0 to trip-count with step 1. Note that "init" expects
2565 // and produces an inclusive upper bound.
2566 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
2567 Constant *Zero = ConstantInt::get(IVTy, 0);
2568 Constant *One = ConstantInt::get(IVTy, 1);
2569 Builder.CreateStore(Zero, PLowerBound);
2570 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
2571 Builder.CreateStore(UpperBound, PUpperBound);
2572 Builder.CreateStore(One, PStride);
2573
2574 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
2575
2576 Constant *SchedulingType = ConstantInt::get(
2577 I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));
2578
2579 // Call the "init" function and update the trip count of the loop with the
2580 // value it produced.
2581 Builder.CreateCall(StaticInit,
2582 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
2583 PUpperBound, PStride, One, Zero});
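 // For a 32-bit IV this lowers to a call like the following (a sketch; 34 is
 // the value of OMPScheduleType::UnorderedStatic, i.e. kmp_sch_static):
 // ```
 //   call void @__kmpc_for_static_init_4u(ptr %loc, i32 %tid, i32 34,
 //       ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound,
 //       ptr %p.stride, i32 1, i32 0)
 // ```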
2584 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
2585 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
2586 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
2587 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
2588 CLI->setTripCount(TripCount);
2589
2590 // Update all uses of the induction variable except the one in the condition
2591 // block that compares it with the actual upper bound, and the increment in
2592 // the latch block.
2593
2594 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
2595 Builder.SetInsertPoint(CLI->getBody(),
2596 CLI->getBody()->getFirstInsertionPt());
2597 Builder.SetCurrentDebugLocation(DL);
2598 return Builder.CreateAdd(OldIV, LowerBound);
2599 });
2600
2601 // In the "exit" block, call the "fini" function.
2602 Builder.SetInsertPoint(CLI->getExit(),
2603 CLI->getExit()->getTerminator()->getIterator());
2604 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
2605
2606 // Add the barrier if requested.
2607 if (NeedsBarrier)
2608 createBarrier(LocationDescription(Builder.saveIP(), DL),
2609 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
2610 /* CheckCancelFlag */ false);
2611
2612 InsertPointTy AfterIP = CLI->getAfterIP();
2613 CLI->invalidate();
2614
2615 return AfterIP;
2616}
2617
2618OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
2619 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
2620 bool NeedsBarrier, Value *ChunkSize) {
2621 assert(CLI->isValid() && "Requires a valid canonical loop");
2622 assert(ChunkSize && "Chunk size is required");
2623
2624 LLVMContext &Ctx = CLI->getFunction()->getContext();
2625 Value *IV = CLI->getIndVar();
2626 Value *OrigTripCount = CLI->getTripCount();
2627 Type *IVTy = IV->getType();
2628 assert(IVTy->getIntegerBitWidth() <= 64 &&
2629 "Max supported tripcount bitwidth is 64 bits");
2630 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
2631 : Type::getInt64Ty(Ctx);
2632 Type *I32Type = Type::getInt32Ty(M.getContext());
2633 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
2634 Constant *One = ConstantInt::get(InternalIVTy, 1);
2635
2636 // Declare useful OpenMP runtime functions.
2637 FunctionCallee StaticInit =
2638 getKmpcForStaticInitForType(InternalIVTy, M, *this);
2639 FunctionCallee StaticFini =
2640 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
2641
2642 // Allocate space for computed loop bounds as expected by the "init" function.
2643 Builder.restoreIP(AllocaIP);
2644 Builder.SetCurrentDebugLocation(DL);
2645 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
2646 Value *PLowerBound =
2647 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
2648 Value *PUpperBound =
2649 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
2650 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
2651
2652 // Set up the source location value for the OpenMP runtime.
2653 Builder.restoreIP(CLI->getPreheaderIP());
2654 Builder.SetCurrentDebugLocation(DL);
2655
2656 // TODO: Detect overflow in ubsan or max-out with current tripcount.
2657 Value *CastedChunkSize =
2658 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
2659 Value *CastedTripCount =
2660 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
2661
2662 Constant *SchedulingType = ConstantInt::get(
2663 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
2664 Builder.CreateStore(Zero, PLowerBound);
2665 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
2666 Builder.CreateStore(OrigUpperBound, PUpperBound);
2667 Builder.CreateStore(One, PStride);
2668
2669 // Call the "init" function and update the trip count of the loop with the
2670 // value it produced.
2671 uint32_t SrcLocStrSize;
2672 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
2673 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2674 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
2675 Builder.CreateCall(StaticInit,
2676 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
2677 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
2678 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
2679 /*pstride=*/PStride, /*incr=*/One,
2680 /*chunk=*/CastedChunkSize});
2681
2682 // Load values written by the "init" function.
2683 Value *FirstChunkStart =
2684 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
2685 Value *FirstChunkStop =
2686 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
2687 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
2688 Value *ChunkRange =
2689 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
2690 Value *NextChunkStride =
2691 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
2692
2693 // Create outer "dispatch" loop for enumerating the chunks.
2694 BasicBlock *DispatchEnter = splitBB(Builder, true);
2695 Value *DispatchCounter;
2696 CanonicalLoopInfo *DispatchCLI = createCanonicalLoop(
2697 {Builder.saveIP(), DL},
2698 [&](InsertPointTy BodyIP, Value *Counter) { DispatchCounter = Counter; },
2699 FirstChunkStart, CastedTripCount, NextChunkStride,
2700 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
2701 "dispatch");
2702
2703 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
2704 // not have to preserve the canonical invariant.
2705 BasicBlock *DispatchBody = DispatchCLI->getBody();
2706 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
2707 BasicBlock *DispatchExit = DispatchCLI->getExit();
2708 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
2709 DispatchCLI->invalidate();
2710
2711 // Rewire the original loop to become the chunk loop inside the dispatch loop.
2712 redirectTo(DispatchAfter, CLI->getAfter(), DL);
2713 redirectTo(CLI->getExit(), DispatchLatch, DL);
2714 redirectTo(DispatchBody, DispatchEnter, DL);
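 // After rewiring, the overall structure corresponds to this pseudocode
 // (a sketch):
 //
 //   for (iv.dispatch = lb; iv.dispatch < tripcount; iv.dispatch += stride)
 //     for (iv = 0; iv < chunk.tripcount; ++iv)   // the original loop
 //       body(iv.dispatch + iv);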
2715
2716 // Prepare the prolog of the chunk loop.
2717 Builder.restoreIP(CLI->getPreheaderIP());
2718 Builder.SetCurrentDebugLocation(DL);
2719
2720 // Compute the number of iterations of the chunk loop.
2721 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
2722 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
2723 Value *IsLastChunk =
2724 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
2725 Value *CountUntilOrigTripCount =
2726 Builder.CreateSub(CastedTripCount, DispatchCounter);
2727 Value *ChunkTripCount = Builder.CreateSelect(
2728 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
2729 Value *BackcastedChunkTC =
2730 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
2731 CLI->setTripCount(BackcastedChunkTC);
2732
2733 // Update all uses of the induction variable except the one in the condition
2734 // block that compares it with the actual upper bound, and the increment in
2735 // the latch block.
2736 Value *BackcastedDispatchCounter =
2737 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
2738 CLI->mapIndVar([&](Instruction *) -> Value * {
2739 Builder.restoreIP(CLI->getBodyIP());
2740 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
2741 });
2742
2743 // In the "exit" block, call the "fini" function.
2744 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
2745 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
2746
2747 // Add the barrier if requested.
2748 if (NeedsBarrier)
2749 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
2750 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
2751
2752#ifndef NDEBUG
2753 // Even though we currently do not support applying additional methods to it,
2754 // the chunk loop should remain a canonical loop.
2755 CLI->assertOK();
2756#endif
2757
2758 return {DispatchAfter, DispatchAfter->getFirstInsertionPt()};
2759}
2760
2761// Returns an LLVM function to call for executing an OpenMP static worksharing
2762// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
2763// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
2764static FunctionCallee
2765getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
2766 WorksharingLoopType LoopType) {
2767 unsigned Bitwidth = Ty->getIntegerBitWidth();
2768 Module &M = OMPBuilder->M;
2769 switch (LoopType) {
2770 case WorksharingLoopType::ForStaticLoop:
2771 if (Bitwidth == 32)
2772 return OMPBuilder->getOrCreateRuntimeFunction(
2773 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
2774 if (Bitwidth == 64)
2775 return OMPBuilder->getOrCreateRuntimeFunction(
2776 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
2777 break;
2778 case WorksharingLoopType::DistributeStaticLoop:
2779 if (Bitwidth == 32)
2780 return OMPBuilder->getOrCreateRuntimeFunction(
2781 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
2782 if (Bitwidth == 64)
2783 return OMPBuilder->getOrCreateRuntimeFunction(
2784 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
2785 break;
2786 case WorksharingLoopType::DistributeForStaticLoop:
2787 if (Bitwidth == 32)
2788 return OMPBuilder->getOrCreateRuntimeFunction(
2789 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
2790 if (Bitwidth == 64)
2791 return OMPBuilder->getOrCreateRuntimeFunction(
2792 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
2793 break;
2794 }
2795 if (Bitwidth != 32 && Bitwidth != 64) {
2796 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
2797 }
2798 llvm_unreachable("Unknown type of OpenMP worksharing loop");
2799}
2800
2801// Inserts a call to the proper OpenMP Device RTL function which handles
2802// loop worksharing.
2803static void createTargetLoopWorkshareCall(
2804 OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
2805 BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
2806 Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
2807 Type *TripCountTy = TripCount->getType();
2808 Module &M = OMPBuilder->M;
2809 IRBuilder<> &Builder = OMPBuilder->Builder;
2810 FunctionCallee RTLFn =
2811 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
2812 SmallVector<Value *, 8> RealArgs;
2813 RealArgs.push_back(Ident);
2814 RealArgs.push_back(Builder.CreateBitCast(&LoopBodyFn, ParallelTaskPtr));
2815 RealArgs.push_back(LoopBodyArg);
2816 RealArgs.push_back(TripCount);
2817 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
2818 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
2819 Builder.CreateCall(RTLFn, RealArgs);
2820 return;
2821 }
2822 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
2823 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
2824 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
2825 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
2826
2827 RealArgs.push_back(
2828 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
2829 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
2830 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
2831 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
2832 }
2833
2834 Builder.CreateCall(RTLFn, RealArgs);
2835}
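// For a 32-bit trip count and the plain 'for' case, the emitted call looks
// roughly like this (a sketch):
// ```
//   call void @__kmpc_for_static_loop_4u(ptr %ident, ptr %body.fn,
//       ptr %body.arg, i32 %tripcount, i32 %num.threads, i32 0)
// ```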
2836
2837static void
2838workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
2839 CanonicalLoopInfo *CLI, Value *Ident,
2840 Function &OutlinedFn, Type *ParallelTaskPtr,
2841 const SmallVector<Instruction *, 4> &ToBeDeleted,
2842 WorksharingLoopType LoopType) {
2843 IRBuilder<> &Builder = OMPIRBuilder->Builder;
2844 BasicBlock *Preheader = CLI->getPreheader();
2845 Value *TripCount = CLI->getTripCount();
2846
2847 // After loop-body outlining, the loop body contains only the setup of the
2848 // loop-body argument structure and the call to the outlined loop-body
2849 // function. First, we need to move the setup of the loop-body arguments
2850 // into the loop preheader.
2851 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
2852 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
2853
2854 // The next step is to remove the whole loop; we do not need it anymore.
2855 // That's why we make an unconditional branch from the loop preheader to
2856 // the loop exit block.
2857 Builder.restoreIP({Preheader, Preheader->end()});
2858 Preheader->getTerminator()->eraseFromParent();
2859 Builder.CreateBr(CLI->getExit());
2860
2861 // Delete dead loop blocks
2862 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
2863 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
2864 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
2865 CleanUpInfo.EntryBB = CLI->getHeader();
2866 CleanUpInfo.ExitBB = CLI->getExit();
2867 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
2868 DeleteDeadBlocks(BlocksToBeRemoved);
2869
2870 // Find the instruction which corresponds to the loop-body argument
2871 // structure and remove the call to the loop-body function.
2872 Value *LoopBodyArg;
2873 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
2874 assert(OutlinedFnUser &&
2875 "Expected unique undroppable user of outlined function");
2876 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
2877 assert(OutlinedFnCallInstruction && "Expected outlined function call");
2878 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
2879 "Expected outlined function call to be located in loop preheader");
2880 // Check in case no argument structure has been passed.
2881 if (OutlinedFnCallInstruction->arg_size() > 1)
2882 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
2883 else
2884 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
2885 OutlinedFnCallInstruction->eraseFromParent();
2886
2887 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
2888 LoopBodyArg, ParallelTaskPtr, TripCount,
2889 OutlinedFn);
2890
2891 for (auto &ToBeDeletedItem : ToBeDeleted)
2892 ToBeDeletedItem->eraseFromParent();
2893 CLI->invalidate();
2894}
2895
2896OpenMPIRBuilder::InsertPointTy
2897OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
2898 InsertPointTy AllocaIP,
2899 WorksharingLoopType LoopType) {
2900 uint32_t SrcLocStrSize;
2901 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
2902 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2903
2904 OutlineInfo OI;
2905 OI.OuterAllocaBB = CLI->getPreheader();
2906 Function *OuterFn = CLI->getPreheader()->getParent();
2907
2908 // Instructions which need to be deleted at the end of code generation
2909 SmallVector<Instruction *, 4> ToBeDeleted;
2910
2911 OI.OuterAllocaBB = AllocaIP.getBlock();
2912
2913 // Mark the loop body as a region which needs to be extracted.
2914 OI.EntryBB = CLI->getBody();
2915 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
2916 "omp.prelatch", true);
2917
2918 // Prepare loop body for extraction
2919 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
2920
2921 // Insert new loop counter variable which will be used only in loop
2922 // body.
2923 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
2924 Instruction *NewLoopCntLoad =
2925 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
2926 // The new loop-counter instructions are redundant in the loop preheader
2927 // once code generation for the workshare loop is finished. That's why we
2928 // mark them as ready for deletion.
2929 ToBeDeleted.push_back(NewLoopCntLoad);
2930 ToBeDeleted.push_back(NewLoopCnt);
2931
2932 // Analyse loop body region. Find all input variables which are used inside
2933 // loop body region.
2934 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
2935 SmallVector<BasicBlock *, 32> Blocks;
2936 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
2937 SmallVector<BasicBlock *, 32> BlocksT(ParallelRegionBlockSet.begin(),
2938 ParallelRegionBlockSet.end());
2939
2940 CodeExtractorAnalysisCache CEAC(*OuterFn);
2941 CodeExtractor Extractor(Blocks,
2942 /* DominatorTree */ nullptr,
2943 /* AggregateArgs */ true,
2944 /* BlockFrequencyInfo */ nullptr,
2945 /* BranchProbabilityInfo */ nullptr,
2946 /* AssumptionCache */ nullptr,
2947 /* AllowVarArgs */ true,
2948 /* AllowAlloca */ true,
2949 /* AllocationBlock */ CLI->getPreheader(),
2950 /* Suffix */ ".omp_wsloop",
2951 /* AggrArgsIn0AddrSpace */ true);
2952
2953 BasicBlock *CommonExit = nullptr;
2954 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
2955
2956 // Find allocas outside the loop body region which are used inside loop
2957 // body
2958 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
2959
2960 // We need to model the loop-body region as the function f(cnt, loop_arg).
2961 // That's why we replace the loop induction variable with the new counter,
2962 // which will be one of the loop-body function's arguments.
2963 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
2964 CLI->getIndVar()->user_end());
2965 for (auto Use : Users) {
2966 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
2967 if (ParallelRegionBlockSet.count(Inst->getParent())) {
2968 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
2969 }
2970 }
2971 }
2972 // Make sure that the loop-counter variable is not merged into the loop-body
2973 // function argument structure and that it is passed as a separate variable.
2974 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
2975
2976 // The PostOutline callback is invoked when the loop-body function has been
2977 // outlined and the loop body has been replaced by a call to the outlined
2978 // function. We need to add a call to the OpenMP device RTL in the loop
2979 // preheader; the OpenMP device RTL function will handle the loop control logic.
2980 //
2981 OI.PostOutlineCB = [=, ToBeDeletedVec =
2982 std::move(ToBeDeleted)](Function &OutlinedFn) {
2983 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
2984 ToBeDeletedVec, LoopType);
2985 };
2986 addOutlineInfo(std::move(OI));
2987 return CLI->getAfterIP();
2988}
2989
2990OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoop(
2991 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
2992 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
2993 bool HasSimdModifier, bool HasMonotonicModifier,
2994 bool HasNonmonotonicModifier, bool HasOrderedClause,
2995 WorksharingLoopType LoopType) {
2996 if (Config.isTargetDevice())
2997 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
2998 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
2999 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
3000 HasNonmonotonicModifier, HasOrderedClause);
3001
3002 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
3003 OMPScheduleType::ModifierOrdered;
3004 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
3005 case OMPScheduleType::BaseStatic:
3006 assert(!ChunkSize && "No chunk size with static-chunked schedule");
3007 if (IsOrdered)
3008 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
3009 NeedsBarrier, ChunkSize);
3010 // FIXME: Monotonicity ignored?
3011 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);
3012
3013 case OMPScheduleType::BaseStaticChunked:
3014 if (IsOrdered)
3015 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
3016 NeedsBarrier, ChunkSize);
3017 // FIXME: Monotonicity ignored?
3018 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
3019 ChunkSize);
3020
3021 case OMPScheduleType::BaseRuntime:
3022 case OMPScheduleType::BaseAuto:
3023 case OMPScheduleType::BaseGreedy:
3024 case OMPScheduleType::BaseBalanced:
3025 case OMPScheduleType::BaseSteal:
3026 case OMPScheduleType::BaseGuidedSimd:
3027 case OMPScheduleType::BaseRuntimeSimd:
3028 assert(!ChunkSize &&
3029 "schedule type does not support user-defined chunk sizes");
3030 [[fallthrough]];
3031 case OMPScheduleType::BaseDynamicChunked:
3032 case OMPScheduleType::BaseGuidedChunked:
3033 case OMPScheduleType::BaseGuidedIterativeChunked:
3034 case OMPScheduleType::BaseGuidedAnalyticalChunked:
3035 case OMPScheduleType::BaseStaticBalancedChunked:
3036 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
3037 NeedsBarrier, ChunkSize);
3038
3039 default:
3040 llvm_unreachable("Unknown/unimplemented schedule kind");
3041 }
3042}
3043
3044/// Returns an LLVM function to call for initializing loop bounds using OpenMP
3045/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
3046/// the runtime. Always interpret integers as unsigned similarly to
3047/// CanonicalLoopInfo.
3048static FunctionCallee
3049getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
3050 unsigned Bitwidth = Ty->getIntegerBitWidth();
3051 if (Bitwidth == 32)
3052 return OMPBuilder.getOrCreateRuntimeFunction(
3053 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
3054 if (Bitwidth == 64)
3055 return OMPBuilder.getOrCreateRuntimeFunction(
3056 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
3057 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
3058}
3059
3060/// Returns an LLVM function to call for updating the next loop using OpenMP
3061/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
3062/// the runtime. Always interpret integers as unsigned similarly to
3063/// CanonicalLoopInfo.
3064static FunctionCallee
3065getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
3066 unsigned Bitwidth = Ty->getIntegerBitWidth();
3067 if (Bitwidth == 32)
3068 return OMPBuilder.getOrCreateRuntimeFunction(
3069 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
3070 if (Bitwidth == 64)
3071 return OMPBuilder.getOrCreateRuntimeFunction(
3072 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
3073 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
3074}
3075
3076/// Returns an LLVM function to call for finalizing the dynamic loop,
3077/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
3078/// interpret integers as unsigned similarly to CanonicalLoopInfo.
3079static FunctionCallee
3080getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
3081 unsigned Bitwidth = Ty->getIntegerBitWidth();
3082 if (Bitwidth == 32)
3083 return OMPBuilder.getOrCreateRuntimeFunction(
3084 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
3085 if (Bitwidth == 64)
3086 return OMPBuilder.getOrCreateRuntimeFunction(
3087 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
3088 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
3089}
3090
3091OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyDynamicWorkshareLoop(
3092 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
3093 OMPScheduleType SchedType, bool NeedsBarrier, Value *Chunk) {
3094 assert(CLI->isValid() && "Requires a valid canonical loop");
3095 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
3096 "Require dedicated allocate IP");
3098 "Require valid schedule type");
3099
3100 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
3101 OMPScheduleType::ModifierOrdered;
3102
3103 // Set up the source location value for OpenMP runtime.
3104 Builder.SetCurrentDebugLocation(DL);
3105
3106 uint32_t SrcLocStrSize;
3107 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
3108 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3109
3110 // Declare useful OpenMP runtime functions.
3111 Value *IV = CLI->getIndVar();
3112 Type *IVTy = IV->getType();
3113 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
3114 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
3115
3116 // Allocate space for computed loop bounds as expected by the "init" function.
3117 Builder.restoreIP(AllocaIP);
3118 Type *I32Type = Type::getInt32Ty(M.getContext());
3119 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
3120 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
3121 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
3122 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
3123
3124 // At the end of the preheader, prepare for calling the "init" function by
3125 // storing the current loop bounds into the allocated space. A canonical loop
3126 // always iterates from 0 to trip-count with step 1. Note that "init" expects
3127 // and produces an inclusive upper bound.
3128 BasicBlock *PreHeader = CLI->getPreheader();
3129 Builder.SetInsertPoint(PreHeader->getTerminator());
3130 Constant *One = ConstantInt::get(IVTy, 1);
3131 Builder.CreateStore(One, PLowerBound);
3132 Value *UpperBound = CLI->getTripCount();
3133 Builder.CreateStore(UpperBound, PUpperBound);
3134 Builder.CreateStore(One, PStride);
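 // The preheader now ends with roughly the following (illustrative IR,
 // assuming an i64 induction variable and trip count %tc):
 //   store i64 1, ptr %p.lowerbound
 //   store i64 %tc, ptr %p.upperbound
 //   store i64 1, ptr %p.stride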
3135
3136 BasicBlock *Header = CLI->getHeader();
3137 BasicBlock *Exit = CLI->getExit();
3138 BasicBlock *Cond = CLI->getCond();
3139 BasicBlock *Latch = CLI->getLatch();
3140 InsertPointTy AfterIP = CLI->getAfterIP();
3141
3142 // The CLI will be "broken" in the code below, as the loop is no longer
3143 // a valid canonical loop.
3144
3145 if (!Chunk)
3146 Chunk = One;
3147
3148 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
3149
3150 Constant *SchedulingType =
3151 ConstantInt::get(I32Type, static_cast<int>(SchedType));
3152
3153 // Call the "init" function.
3154 Builder.CreateCall(DynamicInit,
3155 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
3156 UpperBound, /* step */ One, Chunk});
3157
3158 // An outer loop around the existing one.
3159 BasicBlock *OuterCond = BasicBlock::Create(
3160 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
3161 PreHeader->getParent());
3162 // The "next" call returns an i32 status, so the comparison constant below must be 32-bit (Zero32), not the IVTy-typed One above.
3163 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
3164 Value *Res =
3165 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
3166 PLowerBound, PUpperBound, PStride});
3167 Constant *Zero32 = ConstantInt::get(I32Type, 0);
3168 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
3169 Value *LowerBound =
3170 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
3171 Builder.CreateCondBr(MoreWork, Header, Exit);
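 // The outer condition block thus roughly reads (illustrative IR for an i64
 // induction variable):
 //   %res  = call i32 @__kmpc_dispatch_next_8u(...)
 //   %more = icmp ne i32 %res, 0
 //   %lb   = sub i64 <loaded lower bound>, 1
 //   br i1 %more, label %header, label %exit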
3172
3173 // Change PHI-node in loop header to use outer cond rather than preheader,
3174 // and set IV to the LowerBound.
3175 Instruction *Phi = &Header->front();
3176 auto *PI = cast<PHINode>(Phi);
3177 PI->setIncomingBlock(0, OuterCond);
3178 PI->setIncomingValue(0, LowerBound);
3179
3180 // Then set the pre-header to jump to the OuterCond
3181 Instruction *Term = PreHeader->getTerminator();
3182 auto *Br = cast<BranchInst>(Term);
3183 Br->setSuccessor(0, OuterCond);
3184
3185 // Modify the inner condition:
3186 // * Use the UpperBound returned from the DynamicNext call.
3187 // * Jump to the outer loop when done with one of the inner loops.
3188 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
3189 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
3190 Instruction *Comp = &*Builder.GetInsertPoint();
3191 auto *CI = cast<CmpInst>(Comp);
3192 CI->setOperand(1, UpperBound);
3193 // Redirect the inner exit to branch to outer condition.
3194 Instruction *Branch = &Cond->back();
3195 auto *BI = cast<BranchInst>(Branch);
3196 assert(BI->getSuccessor(1) == Exit);
3197 BI->setSuccessor(1, OuterCond);
3198
3199 // Call the "fini" function if "ordered" is present in wsloop directive.
3200 if (Ordered) {
3201 Builder.SetInsertPoint(&Latch->back());
3202 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
3203 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
3204 }
3205
3206 // Add the barrier if requested.
3207 if (NeedsBarrier) {
3208 Builder.SetInsertPoint(&Exit->back());
3209 createBarrier(LocationDescription(Builder.saveIP(), DL),
3210 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
3211 /* CheckCancelFlag */ false);
3212 }
3213
3214 CLI->invalidate();
3215 return AfterIP;
3216}
3217
3218/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
3219/// after this \p OldTarget will be orphaned.
3220static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
3221 BasicBlock *NewTarget, DebugLoc DL) {
3222 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
3223 redirectTo(Pred, NewTarget, DL);
3224}
3225
3226/// Determine which blocks in \p BBs are reachable from outside and remove the
3227/// ones that are not reachable from the function.
3228static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
3229 SmallPtrSet<BasicBlock *, 6> BBsToErase{BBs.begin(), BBs.end()};
3230 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
3231 for (Use &U : BB->uses()) {
3232 auto *UseInst = dyn_cast<Instruction>(U.getUser());
3233 if (!UseInst)
3234 continue;
3235 if (BBsToErase.count(UseInst->getParent()))
3236 continue;
3237 return true;
3238 }
3239 return false;
3240 };
3241
3242 while (true) {
3243 bool Changed = false;
3244 for (BasicBlock *BB : make_early_inc_range(BBsToErase)) {
3245 if (HasRemainingUses(BB)) {
3246 BBsToErase.erase(BB);
3247 Changed = true;
3248 }
3249 }
3250 if (!Changed)
3251 break;
3252 }
3253
3254 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
3255 DeleteDeadBlocks(BBVec);
3256}
3257
3258CanonicalLoopInfo *
3259OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
3260 InsertPointTy ComputeIP) {
3261 assert(Loops.size() >= 1 && "At least one loop required");
3262 size_t NumLoops = Loops.size();
3263
3264 // Nothing to do if there is already just one loop.
3265 if (NumLoops == 1)
3266 return Loops.front();
3267
3268 CanonicalLoopInfo *Outermost = Loops.front();
3269 CanonicalLoopInfo *Innermost = Loops.back();
3270 BasicBlock *OrigPreheader = Outermost->getPreheader();
3271 BasicBlock *OrigAfter = Outermost->getAfter();
3272 Function *F = OrigPreheader->getParent();
3273
3274 // Loop control blocks that may become orphaned later.
3275 SmallVector<BasicBlock *, 12> OldControlBBs;
3276 OldControlBBs.reserve(6 * Loops.size());
3277 for (CanonicalLoopInfo *Loop : Loops)
3278 Loop->collectControlBlocks(OldControlBBs);
3279
3280 // Setup the IRBuilder for inserting the trip count computation.
3281 Builder.SetCurrentDebugLocation(DL);
3282 if (ComputeIP.isSet())
3283 Builder.restoreIP(ComputeIP);
3284 else
3285 Builder.restoreIP(Outermost->getPreheaderIP());
3286
3287 // Derive the collapsed loop's trip count.
3288 // TODO: Find common/largest indvar type.
3289 Value *CollapsedTripCount = nullptr;
3290 for (CanonicalLoopInfo *L : Loops) {
3291 assert(L->isValid() &&
3292 "All loops to collapse must be valid canonical loops");
3293 Value *OrigTripCount = L->getTripCount();
3294 if (!CollapsedTripCount) {
3295 CollapsedTripCount = OrigTripCount;
3296 continue;
3297 }
3298
3299 // TODO: Enable UndefinedBehaviorSanitizer to diagnose an overflow here.
3300 CollapsedTripCount = Builder.CreateMul(CollapsedTripCount, OrigTripCount,
3301 {}, /*HasNUW=*/true);
3302 }
3303
3304 // Create the collapsed loop control flow.
3305 CanonicalLoopInfo *Result =
3306 createLoopSkeleton(DL, CollapsedTripCount, F,
3307 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
3308
3309 // Build the collapsed loop body code.
3310 // Start with deriving the input loop induction variables from the collapsed
3311 // one, using a divmod scheme. To preserve the original loops' order, the
3312 // innermost loop uses the least significant bits.
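 // For example, collapsing two loops with trip counts %tc0 and %tc1 yields a
 // single loop over %tc0 * %tc1 iterations whose induction variable %iv is
 // decomposed roughly as:
 //   %iv1 = urem %iv, %tc1   ; innermost, least significant bits
 //   %iv0 = udiv %iv, %tc1   ; outermost gets the remaining bits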
3313 Builder.restoreIP(Result->getBodyIP());
3314
3315 Value *Leftover = Result->getIndVar();
3316 SmallVector<Value *> NewIndVars;
3317 NewIndVars.resize(NumLoops);
3318 for (int i = NumLoops - 1; i >= 1; --i) {
3319 Value *OrigTripCount = Loops[i]->getTripCount();
3320
3321 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
3322 NewIndVars[i] = NewIndVar;
3323
3324 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
3325 }
3326 // Outermost loop gets all the remaining bits.
3327 NewIndVars[0] = Leftover;
3328
3329 // Construct the loop body control flow.
3330 // We progressively construct the branch structure following the direction of
3331 // the control flow, from the leading in-between code, the loop nest body, the
3332 // trailing in-between code, and rejoining the collapsed loop's latch.
3333 // ContinueBlock and ContinuePred keep track of the source(s) of the next edge. If
3334 // the ContinueBlock is set, continue with that block. If ContinuePred, use
3335 // its predecessors as sources.
3336 BasicBlock *ContinueBlock = Result->getBody();
3337 BasicBlock *ContinuePred = nullptr;
3338 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
3339 BasicBlock *NextSrc) {
3340 if (ContinueBlock)
3341 redirectTo(ContinueBlock, Dest, DL);
3342 else
3343 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
3344
3345 ContinueBlock = nullptr;
3346 ContinuePred = NextSrc;
3347 };
3348
3349 // The code before the nested loop of each level.
3350 // Because we are sinking it into the nest, it will be executed more often
3351 // than the original loop. More sophisticated schemes could keep track of what
3352 // the in-between code is and instantiate it only once per thread.
3353 for (size_t i = 0; i < NumLoops - 1; ++i)
3354 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
3355
3356 // Connect the loop nest body.
3357 ContinueWith(Innermost->getBody(), Innermost->getLatch());
3358
3359 // The code after the nested loop at each level.
3360 for (size_t i = NumLoops - 1; i > 0; --i)
3361 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
3362
3363 // Connect the finished loop to the collapsed loop latch.
3364 ContinueWith(Result->getLatch(), nullptr);
3365
3366 // Replace the input loops with the new collapsed loop.
3367 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
3368 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
3369
3370 // Replace the input loop indvars with the derived ones.
3371 for (size_t i = 0; i < NumLoops; ++i)
3372 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
3373
3374 // Remove unused parts of the input loops.
3375 removeUnusedBlocksFromParent(OldControlBBs);
3376
3377 for (CanonicalLoopInfo *L : Loops)
3378 L->invalidate();
3379
3380#ifndef NDEBUG
3381 Result->assertOK();
3382#endif
3383 return Result;
3384}
3385
3386std::vector<CanonicalLoopInfo *>
3387OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
3388 ArrayRef<Value *> TileSizes) {
3389 assert(TileSizes.size() == Loops.size() &&
3390 "Must pass as many tile sizes as there are loops");
3391 int NumLoops = Loops.size();
3392 assert(NumLoops >= 1 && "At least one loop to tile required");
3393
3394 CanonicalLoopInfo *OutermostLoop = Loops.front();
3395 CanonicalLoopInfo *InnermostLoop = Loops.back();
3396 Function *F = OutermostLoop->getBody()->getParent();
3397 BasicBlock *InnerEnter = InnermostLoop->getBody();
3398 BasicBlock *InnerLatch = InnermostLoop->getLatch();
3399
3400 // Loop control blocks that may become orphaned later.
3401 SmallVector<BasicBlock *, 12> OldControlBBs;
3402 OldControlBBs.reserve(6 * Loops.size());
3403 for (CanonicalLoopInfo *Loop : Loops)
3404 Loop->collectControlBlocks(OldControlBBs);
3405
3406 // Collect original trip counts and induction variables to be accessible by
3407 // index. Also, the structure of the original loops is not preserved during
3408 // the construction of the tiled loops, so do it before we scavenge the BBs of
3409 // any original CanonicalLoopInfo.
3410 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
3411 for (CanonicalLoopInfo *L : Loops) {
3412 assert(L->isValid() && "All input loops must be valid canonical loops");
3413 OrigTripCounts.push_back(L->getTripCount());
3414 OrigIndVars.push_back(L->getIndVar());
3415 }
3416
3417 // Collect the code between loop headers. These may contain SSA definitions
3418 // that are used in the loop nest body. To be usable within the innermost
3419 // body, these BasicBlocks will be sunk into the loop nest body. That is,
3420 // these instructions may be executed more often than before the tiling.
3421 // TODO: It would be sufficient to only sink them into body of the
3422 // corresponding tile loop.
3423 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
3424 for (int i = 0; i < NumLoops - 1; ++i) {
3425 CanonicalLoopInfo *Surrounding = Loops[i];
3426 CanonicalLoopInfo *Nested = Loops[i + 1];
3427
3428 BasicBlock *EnterBB = Surrounding->getBody();
3429 BasicBlock *ExitBB = Nested->getHeader();
3430 InbetweenCode.emplace_back(EnterBB, ExitBB);
3431 }
3432
3433 // Compute the trip counts of the floor loops.
3434 Builder.SetCurrentDebugLocation(DL);
3435 Builder.restoreIP(OutermostLoop->getPreheaderIP());
3436 SmallVector<Value *, 4> FloorCount, FloorRems;
3437 for (int i = 0; i < NumLoops; ++i) {
3438 Value *TileSize = TileSizes[i];
3439 Value *OrigTripCount = OrigTripCounts[i];
3440 Type *IVType = OrigTripCount->getType();
3441
3442 Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
3443 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
3444
3445 // 0 if the tilesize divides the tripcount, 1 otherwise.
3446 // 1 means we need an additional iteration for a partial tile.
3447 //
3448 // Unfortunately we cannot just use the roundup-formula
3449 // (tripcount + tilesize - 1)/tilesize
3450 // because the summation might overflow. We do not want to introduce undefined
3451 // behavior when the untiled loop nest did not.
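 // E.g. for a tripcount of 10 and a tilesize of 4 this computes
 // 10/4 + (10%4 != 0) = 2 + 1 = 3 floor iterations; the roundup formula would
 // give the same value, but its addition can wrap for trip counts close to
 // the maximum of the induction variable type.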
3452 Value *FloorTripOverflow =
3453 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
3454
3455 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
3456 FloorTripCount =
3457 Builder.CreateAdd(FloorTripCount, FloorTripOverflow,
3458 "omp_floor" + Twine(i) + ".tripcount", true);
3459
3460 // Remember some values for later use.
3461 FloorCount.push_back(FloorTripCount);
3462 FloorRems.push_back(FloorTripRem);
3463 }
3464
3465 // Generate the new loop nest, from the outermost to the innermost.
3466 std::vector<CanonicalLoopInfo *> Result;
3467 Result.reserve(NumLoops * 2);
3468
3469 // The basic block of the surrounding loop that enters the generated loop
3470 // nest.
3471 BasicBlock *Enter = OutermostLoop->getPreheader();
3472
3473 // The basic block of the surrounding loop where the inner code should
3474 // continue.
3475 BasicBlock *Continue = OutermostLoop->getAfter();
3476
3477 // Where the next loop basic block should be inserted.
3478 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
3479
3480 auto EmbeddNewLoop =
3481 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
3482 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
3483 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
3484 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
3485 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
3486 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
3487
3488 // Setup the position where the next embedded loop connects to this loop.
3489 Enter = EmbeddedLoop->getBody();
3490 Continue = EmbeddedLoop->getLatch();
3491 OutroInsertBefore = EmbeddedLoop->getLatch();
3492 return EmbeddedLoop;
3493 };
3494
3495 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
3496 const Twine &NameBase) {
3497 for (auto P : enumerate(TripCounts)) {
3498 CanonicalLoopInfo *EmbeddedLoop =
3499 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
3500 Result.push_back(EmbeddedLoop);
3501 }
3502 };
3503
3504 EmbeddNewLoops(FloorCount, "floor");
3505
3506 // Within the innermost floor loop, emit the code that computes the tile
3507 // sizes.
3508 Builder.SetInsertPoint(Enter->getTerminator());
3509 SmallVector<Value *, 4> TileCounts;
3510 for (int i = 0; i < NumLoops; ++i) {
3511 CanonicalLoopInfo *FloorLoop = Result[i];
3512 Value *TileSize = TileSizes[i];
3513
3514 Value *FloorIsEpilogue =
3515 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCount[i]);
3516 Value *TileTripCount =
3517 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
3518
3519 TileCounts.push_back(TileTripCount);
3520 }
3521
3522 // Create the tile loops.
3523 EmbeddNewLoops(TileCounts, "tile");
3524
3525 // Insert the inbetween code into the body.
3526 BasicBlock *BodyEnter = Enter;
3527 BasicBlock *BodyEntered = nullptr;
3528 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
3529 BasicBlock *EnterBB = P.first;
3530 BasicBlock *ExitBB = P.second;
3531
3532 if (BodyEnter)
3533 redirectTo(BodyEnter, EnterBB, DL);
3534 else
3535 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
3536
3537 BodyEnter = nullptr;
3538 BodyEntered = ExitBB;
3539 }
3540
3541 // Append the original loop nest body into the generated loop nest body.
3542 if (BodyEnter)
3543 redirectTo(BodyEnter, InnerEnter, DL);
3544 else
3545 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
3546 redirectTo(InnerLatch, Continue, DL);
3547
3548 // Replace the original induction variable with an induction variable computed
3549 // from the tile and floor induction variables.
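 // That is, for a tile size ts the original induction variable is rebuilt as
 //   %orig_iv = %floor_iv * ts + %tile_iv
 // where neither operation can wrap (NUW), since the result never exceeds the
 // original trip count.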
3550 Builder.restoreIP(Result.back()->getBodyIP());
3551 for (int i = 0; i < NumLoops; ++i) {
3552 CanonicalLoopInfo *FloorLoop = Result[i];
3553 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
3554 Value *OrigIndVar = OrigIndVars[i];
3555 Value *Size = TileSizes[i];
3556
3557 Value *Scale =
3558 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
3559 Value *Shift =
3560 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
3561 OrigIndVar->replaceAllUsesWith(Shift);
3562 }
3563
3564 // Remove unused parts of the original loops.
3565 removeUnusedBlocksFromParent(OldControlBBs);
3566
3567 for (CanonicalLoopInfo *L : Loops)
3568 L->invalidate();
3569
3570#ifndef NDEBUG
3571 for (CanonicalLoopInfo *GenL : Result)
3572 GenL->assertOK();
3573#endif
3574 return Result;
3575}
3576
3577/// Attach metadata \p Properties to the basic block described by \p BB. If the
3578/// basic block already has metadata, the basic block properties are appended.
3579static void addBasicBlockMetadata(BasicBlock *BB,
3580 ArrayRef<Metadata *> Properties) {
3581 // Nothing to do if no property to attach.
3582 if (Properties.empty())
3583 return;
3584
3585 LLVMContext &Ctx = BB->getContext();
3586 SmallVector<Metadata *> NewProperties;
3587 NewProperties.push_back(nullptr);
3588
3589 // If the basic block already has metadata, prepend it to the new metadata.
3590 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
3591 if (Existing)
3592 append_range(NewProperties, drop_begin(Existing->operands(), 1));
3593
3594 append_range(NewProperties, Properties);
3595 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
3596 BasicBlockID->replaceOperandWith(0, BasicBlockID);
3597
3598 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
3599}
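// The resulting terminator metadata has the usual self-referential llvm.loop
// shape, e.g. (illustrative):
//   br label %header, !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.enable"}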
3600
3601/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
3602/// loop already has metadata, the loop properties are appended.
3603static void addLoopMetadata(CanonicalLoopInfo *Loop,
3604 ArrayRef<Metadata *> Properties) {
3605 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
3606
3607 // Attach metadata to the loop's latch
3608 BasicBlock *Latch = Loop->getLatch();
3609 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
3610 addBasicBlockMetadata(Latch, Properties);
3611}
3612
3613/// Attach llvm.access.group metadata to the memref instructions of \p Block
3614static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
3615 LoopInfo &LI) {
3616 for (Instruction &I : *Block) {
3617 if (I.mayReadOrWriteMemory()) {
3618 // TODO: This instruction may already have access group from
3619 // other pragmas e.g. #pragma clang loop vectorize. Append
3620 // so that the existing metadata is not overwritten.
3621 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
3622 }
3623 }
3624}
3625
3626void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
3627 LLVMContext &Ctx = Builder.getContext();
3628 addLoopMetadata(
3629 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
3630 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
3631}
3632
3633void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
3634 LLVMContext &Ctx = Builder.getContext();
3635 addLoopMetadata(
3636 Loop, {
3637 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
3638 });
3639}
3640
3641void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
3642 Value *IfCond, ValueToValueMapTy &VMap,
3643 const Twine &NamePrefix) {
3644 Function *F = CanonicalLoop->getFunction();
3645
3646 // Define where the if branch should be inserted.
3647 Instruction *SplitBefore;
3648 if (Instruction::classof(IfCond)) {
3649 SplitBefore = dyn_cast<Instruction>(IfCond);
3650 } else {
3651 SplitBefore = CanonicalLoop->getPreheader()->getTerminator();
3652 }
3653
3654 // TODO: We should not rely on pass manager. Currently we use pass manager
3655 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
3656 // object. We should have a method which returns all blocks between
3657 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
3658 FunctionAnalysisManager FAM;
3659 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
3660 FAM.registerPass([]() { return LoopAnalysis(); });
3661 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
3662
3663 // Get the loop which needs to be cloned
3664 LoopAnalysis LIA;
3665 LoopInfo &&LI = LIA.run(*F, FAM);
3666 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
3667
3668 // Create additional blocks for the if statement
3669 BasicBlock *Head = SplitBefore->getParent();
3670 Instruction *HeadOldTerm = Head->getTerminator();
3671 llvm::LLVMContext &C = Head->getContext();
3672 BasicBlock *ThenBlock = BasicBlock::Create(
3673 C, NamePrefix + ".if.then", Head->getParent(), Head->getNextNode());
3674 BasicBlock *ElseBlock = BasicBlock::Create(
3675 C, NamePrefix + ".if.else", Head->getParent(), CanonicalLoop->getExit());
3676
3677 // Create if condition branch.
3678 Builder.SetInsertPoint(HeadOldTerm);
3679 Instruction *BrInstr =
3680 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
3681 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
3682 // Then block contains branch to omp loop which needs to be vectorized
3683 spliceBB(IP, ThenBlock, false);
3684 ThenBlock->replaceSuccessorsPhiUsesWith(Head, ThenBlock);
3685
3686 Builder.SetInsertPoint(ElseBlock);
3687
3688 // Clone loop for the else branch
3689 SmallVector<BasicBlock *, 8> NewBlocks;
3690
3691 VMap[CanonicalLoop->getPreheader()] = ElseBlock;
3692 for (BasicBlock *Block : L->getBlocks()) {
3693 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
3694 NewBB->moveBefore(CanonicalLoop->getExit());
3695 VMap[Block] = NewBB;
3696 NewBlocks.push_back(NewBB);
3697 }
3698 remapInstructionsInBlocks(NewBlocks, VMap);
3699 Builder.CreateBr(NewBlocks.front());
3700}
3701
3702unsigned
3703OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
3704 const StringMap<bool> &Features) {
3705 if (TargetTriple.isX86()) {
3706 if (Features.lookup("avx512f"))
3707 return 512;
3708 else if (Features.lookup("avx"))
3709 return 256;
3710 return 128;
3711 }
3712 if (TargetTriple.isPPC())
3713 return 128;
3714 if (TargetTriple.isWasm())
3715 return 128;
3716 return 0;
3717}
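// E.g. an x86 target whose "target-features" include avx but not avx512f gets
// 256, while targets other than x86, PPC, and WebAssembly get 0 (no known
// default).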
3718
3719void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
3720 MapVector<Value *, Value *> AlignedVars,
3721 Value *IfCond, OrderKind Order,
3722 ConstantInt *Simdlen, ConstantInt *Safelen) {
3723 LLVMContext &Ctx = Builder.getContext();
3724
3725 Function *F = CanonicalLoop->getFunction();
3726
3727 // TODO: We should not rely on pass manager. Currently we use pass manager
3728 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
3729 // object. We should have a method which returns all blocks between
3730 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
3731 FunctionAnalysisManager FAM;
3732 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
3733 FAM.registerPass([]() { return LoopAnalysis(); });
3734 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
3735
3736 LoopAnalysis LIA;
3737 LoopInfo &&LI = LIA.run(*F, FAM);
3738
3739 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
3740 if (AlignedVars.size()) {
3741 InsertPointTy IP = Builder.saveIP();
3742 Builder.SetInsertPoint(CanonicalLoop->getPreheader()->getTerminator());
3743 for (auto &AlignedItem : AlignedVars) {
3744 Value *AlignedPtr = AlignedItem.first;
3745 Value *Alignment = AlignedItem.second;
3746 Builder.CreateAlignmentAssumption(F->getParent()->getDataLayout(),
3747 AlignedPtr, Alignment);
3748 }
3749 Builder.restoreIP(IP);
3750 }
3751
3752 if (IfCond) {
3753 ValueToValueMapTy VMap;
3754 createIfVersion(CanonicalLoop, IfCond, VMap, "simd");
3755 // Add metadata to the cloned loop which disables vectorization
3756 Value *MappedLatch = VMap.lookup(CanonicalLoop->getLatch());
3757 assert(MappedLatch &&
3758 "Cannot find value which corresponds to original loop latch");
3759 assert(isa<BasicBlock>(MappedLatch) &&
3760 "Cannot cast mapped latch block value to BasicBlock");
3761 BasicBlock *NewLatchBlock = dyn_cast<BasicBlock>(MappedLatch);
3762 ConstantAsMetadata *BoolConst =
3763 ConstantAsMetadata::get(ConstantInt::getFalse(Type::getInt1Ty(Ctx)));
3764 addBasicBlockMetadata(
3765 NewLatchBlock,
3766 {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"),
3767 BoolConst})});
3768 }
3769
3770 SmallSet<BasicBlock *, 8> Reachable;
3771
3772 // Get the basic blocks from the loop in which memref instructions
3773 // can be found.
3774 // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
3775 // preferably without running any passes.
3776 for (BasicBlock *Block : L->getBlocks()) {
3777 if (Block == CanonicalLoop->getCond() ||
3778 Block == CanonicalLoop->getHeader())
3779 continue;
3780 Reachable.insert(Block);
3781 }
3782
3783 SmallVector<Metadata *> LoopMDList;
3784
3785 // In the presence of a finite 'safelen', it may be unsafe to mark all
3786 // the memory instructions parallel, because loop-carried
3787 // dependences of 'safelen' iterations are possible.
3788 // If clause order(concurrent) is specified then the memory instructions
3789 // are marked parallel even if 'safelen' is finite.
3790 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
3791 // Add access group metadata to memory-access instructions.
3792 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
3793 for (BasicBlock *BB : Reachable)
3794 addSimdMetadata(BB, AccessGroup, LI);
3795 // TODO: If the loop has existing parallel access metadata, have
3796 // to combine two lists.
3797 LoopMDList.push_back(MDNode::get(
3798 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
3799 }
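 // The combined effect is roughly the following (illustrative IR):
 //   %v = load i32, ptr %p, !llvm.access.group !g
 //   ...
 //   br i1 %c, ..., !llvm.loop !{..., !{!"llvm.loop.parallel_accesses", !g}}
 // which lets the vectorizer ignore loop-carried memory dependences between
 // the tagged accesses.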
3800
3801 // Use the above access group metadata to create loop level
3802 // metadata, which should be distinct for each loop.
3803 ConstantAsMetadata *BoolConst =
3804 ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
3805 LoopMDList.push_back(MDNode::get(
3806 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
3807
3808 if (Simdlen || Safelen) {
3809 // If both simdlen and safelen clauses are specified, the value of the
3810 // simdlen parameter must be less than or equal to the value of the safelen
3811 // parameter. Therefore, use safelen only in the absence of simdlen.
3812 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
3813 LoopMDList.push_back(
3814 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
3815 ConstantAsMetadata::get(VectorizeWidth)}));
3816 }
3817
3818 addLoopMetadata(CanonicalLoop, LoopMDList);
3819}
3820
3821/// Create the TargetMachine object to query the backend for optimization
3822/// preferences.
3823///
3824/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
3825/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
3826/// needed for the LLVM pass pipeline. We use some default options to avoid
3827/// having to pass too many settings from the frontend that probably do not
3828/// matter.
3829///
3830/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
3831/// method. If we are going to use TargetMachine for more purposes, especially
3832/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
3833/// might be worth requiring front-ends to pass on their TargetMachine,
3834/// or at least cache it between methods. Note that while frontends such as Clang
3835/// have just a single main TargetMachine per translation unit, "target-cpu" and
3836/// "target-features" that determine the TargetMachine are per-function and can
3837/// be overridden using __attribute__((target("OPTIONS"))).
3838static std::unique_ptr<TargetMachine>
3840 Module *M = F->getParent();
3841
3842 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
3843 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
3844 const std::string &Triple = M->getTargetTriple();
3845
3846 std::string Error;
3847 const Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
3848 if (!TheTarget)
3849 return {};
3850
3851 TargetOptions Options;
3852 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
3853 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
3854 /*CodeModel=*/std::nullopt, OptLevel));
3855}
3856
3857/// Heuristically determine the best-performing unroll factor for \p CLI. This
3858/// depends on the target processor. We are re-using the same heuristics as the
3859/// LoopUnrollPass.
3860static unsigned computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
3861 Function *F = CLI->getFunction();
3862
3863 // Assume the user requests the most aggressive unrolling, even if the rest of
3864 // the code is optimized using a lower setting.
3865 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
3866 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
3867
3868 FunctionAnalysisManager FAM;
3869 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
3870 FAM.registerPass([]() { return AssumptionAnalysis(); });
3871 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
3872 FAM.registerPass([]() { return LoopAnalysis(); });
3873 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
3874 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
3875 TargetIRAnalysis TIRA;
3876 if (TM)
3877 TIRA = TargetIRAnalysis(
3878 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
3879 FAM.registerPass([&]() { return TIRA; });
3880
3881 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
3882 ScalarEvolutionAnalysis SEA;
3883 ScalarEvolution &&SE = SEA.run(*F, FAM);
3884 DominatorTreeAnalysis DTA;
3885 DominatorTree &&DT = DTA.run(*F, FAM);
3886 LoopAnalysis LIA;
3887 LoopInfo &&LI = LIA.run(*F, FAM);
3888 AssumptionAnalysis ACT;
3889 AssumptionCache &&AC = ACT.run(*F, FAM);
3890 OptimizationRemarkEmitter ORE{F};
3891
3892 Loop *L = LI.getLoopFor(CLI->getHeader());
3893 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
3894
3895 TargetTransformInfo::UnrollingPreferences UP =
3896 gatherUnrollingPreferences(L, SE, TTI,
3897 /*BlockFrequencyInfo=*/nullptr,
3898 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
3899 /*UserThreshold=*/std::nullopt,
3900 /*UserCount=*/std::nullopt,
3901 /*UserAllowPartial=*/true,
3902 /*UserAllowRuntime=*/true,
3903 /*UserUpperBound=*/std::nullopt,
3904 /*UserFullUnrollMaxCount=*/std::nullopt);
3905
3906 UP.Force = true;
3907
3908 // Account for additional optimizations taking place before the LoopUnrollPass
3909 // would unroll the loop.
3910 UP.Threshold *= UnrollThresholdFactor;
3911 UP.PartialThreshold *= UnrollThresholdFactor;
3912
3913 // Use normal unroll factors even if the rest of the code is optimized for
3914 // size.
3915 UP.OptSizeThreshold = UP.Threshold;
3916 UP.PartialOptSizeThreshold = UP.PartialThreshold;
3917
3918 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
3919 << " Threshold=" << UP.Threshold << "\n"
3920 << " PartialThreshold=" << UP.PartialThreshold << "\n"
3921 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
3922 << " PartialOptSizeThreshold="
3923 << UP.PartialOptSizeThreshold << "\n");
3924
3925 // Disable peeling.
3926 TargetTransformInfo::PeelingPreferences PP =
3927 gatherPeelingPreferences(L, SE, TTI,
3928 /*UserAllowPeeling=*/false,
3929 /*UserAllowProfileBasedPeeling=*/false,
3930 /*UnrollingSpecficValues=*/false);
3931
3932 SmallPtrSet<const Value *, 32> EphValues;
3933 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
3934
3935 // Assume that reads and writes to stack variables can be eliminated by
3936 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
3937 // size.
3938 for (BasicBlock *BB : L->blocks()) {
3939 for (Instruction &I : *BB) {
3940 Value *Ptr;
3941 if (auto *Load = dyn_cast<LoadInst>(&I)) {
3942 Ptr = Load->getPointerOperand();
3943 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
3944 Ptr = Store->getPointerOperand();
3945 } else
3946 continue;
3947
3948 Ptr = Ptr->stripPointerCasts();
3949
3950 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
3951 if (Alloca->getParent() == &F->getEntryBlock())
3952 EphValues.insert(&I);
3953 }
3954 }
3955 }
3956
3957 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
3958
3959 // Loop is not unrollable if the loop contains certain instructions.
3960 if (!UCE.canUnroll() || UCE.Convergent) {
3961 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
3962 return 1;
3963 }
3964
3965 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
3966 << "\n");
3967
3968 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
3969 // be able to use it.
3970 int TripCount = 0;
3971 int MaxTripCount = 0;
3972 bool MaxOrZero = false;
3973 unsigned TripMultiple = 0;
3974
3975 bool UseUpperBound = false;
3976 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
3977 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
3978 UseUpperBound);
3979 unsigned Factor = UP.Count;
3980 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
3981
3982 // This function returns 1 to signal that the loop should not be unrolled.
3983 if (Factor == 0)
3984 return 1;
3985 return Factor;
3986}
3987
3988void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
3989 int32_t Factor,
3990 CanonicalLoopInfo **UnrolledCLI) {
3991 assert(Factor >= 0 && "Unroll factor must not be negative");
3992
3993 Function *F = Loop->getFunction();
3994 LLVMContext &Ctx = F->getContext();
3995
3996 // If the unrolled loop is not used for another loop-associated directive, it
3997 // is sufficient to add metadata for the LoopUnrollPass.
3998 if (!UnrolledCLI) {
3999 SmallVector<Metadata *, 2> LoopMetadata;
4000 LoopMetadata.push_back(
4001 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
4002
4003 if (Factor >= 1) {
4004 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
4005 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
4006 LoopMetadata.push_back(MDNode::get(
4007 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
4008 }
4009
4010 addLoopMetadata(Loop, LoopMetadata);
4011 return;
4012 }
4013
4014 // Heuristically determine the unroll factor.
4015 if (Factor == 0)
4016 Factor = computeHeuristicUnrollFactor(Loop);
4017
4018 // No change required with unroll factor 1.
4019 if (Factor == 1) {
4020 *UnrolledCLI = Loop;
4021 return;
4022 }
4023
4024 assert(Factor >= 2 &&
4025 "unrolling only makes sense with a factor of 2 or larger");
4026
4027 Type *IndVarTy = Loop->getIndVarType();
4028
4029 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
4030 // unroll the inner loop.
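 // E.g. a factor of 4 turns a loop over %tc iterations into a floor loop
 // over ceil(%tc / 4) tiles whose inner tile loop runs 4 iterations (fewer
 // in the epilogue tile) and is marked below for full unrolling by the
 // LoopUnrollPass.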
4031 Value *FactorVal =
4032 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
4033 /*isSigned=*/false));
4034 std::vector<CanonicalLoopInfo *> LoopNest =
4035 tileLoops(DL, {Loop}, {FactorVal});
4036 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
4037 *UnrolledCLI = LoopNest[0];
4038 CanonicalLoopInfo *InnerLoop = LoopNest[1];
4039
4040 // LoopUnrollPass can only fully unroll loops with constant trip count.
4041 // Unroll by the unroll factor with a fallback epilog for the remainder
4042 // iterations if necessary.
4043 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
4044 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
4045 addLoopMetadata(
4046 InnerLoop,
4047 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
4048 MDNode::get(
4049 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
4050
4051#ifndef NDEBUG
4052 (*UnrolledCLI)->assertOK();
4053#endif
4054}
4055
4056OpenMPIRBuilder::InsertPointTy
4057OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
4058 llvm::Value *BufSize, llvm::Value *CpyBuf,
4059 llvm::Value *CpyFn, llvm::Value *DidIt) {
4060 if (!updateToLocation(Loc))
4061 return Loc.IP;
4062
4063 uint32_t SrcLocStrSize;
4064 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4065 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4066 Value *ThreadId = getOrCreateThreadID(Ident);
4067
4068 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
4069
4070 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
4071
4072 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
4073 Builder.CreateCall(Fn, Args);
4074
4075 return Builder.saveIP();
4076}
4077
4078OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSingle(
4079 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
4080 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
4081 ArrayRef<llvm::Function *> CPFuncs) {
4082
4083 if (!updateToLocation(Loc))
4084 return Loc.IP;
4085
4086 // If needed, allocate and initialize `DidIt` with 0.
4087 // DidIt: flag variable: 1=single thread; 0=not single thread.
4088 llvm::Value *DidIt = nullptr;
4089 if (!CPVars.empty()) {
4090 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
4091 Builder.CreateStore(Builder.getInt32(0), DidIt);
4092 }
4093
4094 Directive OMPD = Directive::OMPD_single;
4095 uint32_t SrcLocStrSize;
4096 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4097 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4098 Value *ThreadId = getOrCreateThreadID(Ident);
4099 Value *Args[] = {Ident, ThreadId};
4100
4101 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
4102 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4103
4104 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
4105 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
4106
4107 auto FiniCBWrapper = [&](InsertPointTy IP) {
4108 FiniCB(IP);
4109
4110 // The thread that executes the single region must set `DidIt` to 1.
4111 // This is used by __kmpc_copyprivate, to know if the caller is the
4112 // single thread or not.
4113 if (DidIt)
4114 Builder.CreateStore(Builder.getInt32(1), DidIt);
4115 };
4116
4117 // generates the following:
4118 // if (__kmpc_single()) {
4119 // .... single region ...
4120 // __kmpc_end_single
4121 // }
4122 // __kmpc_copyprivate
4123 // __kmpc_barrier
4124
4125 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
4126 /*Conditional*/ true,
4127 /*hasFinalize*/ true);
4128
4129 if (DidIt) {
4130 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
4131 // NOTE BufSize is currently unused, so just pass 0.
4132 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
4133 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
4134 CPFuncs[I], DidIt);
4135 // NOTE __kmpc_copyprivate already inserts a barrier
4136 } else if (!IsNowait)
4137 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
4138 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
4139 /* CheckCancelFlag */ false);
4140 return Builder.saveIP();
4141}
4142
4143OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCritical(
4144 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
4145 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
4146
4147 if (!updateToLocation(Loc))
4148 return Loc.IP;
4149
4150 Directive OMPD = Directive::OMPD_critical;
4151 uint32_t SrcLocStrSize;
4152 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4153 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4154 Value *ThreadId = getOrCreateThreadID(Ident);
4155 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
4156 Value *Args[] = {Ident, ThreadId, LockVar};
4157
4158 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
4159 Function *RTFn = nullptr;
4160 if (HintInst) {
4161 // Add Hint to entry Args and create call
4162 EnterArgs.push_back(HintInst);
4163 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
4164 } else {
4165 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
4166 }
4167 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
4168
4169 Function *ExitRTLFn =
4170 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
4171 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
4172
4173 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4174 /*Conditional*/ false, /*hasFinalize*/ true);
4175}
4176
4177OpenMPIRBuilder::InsertPointTy
4178OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
4179 InsertPointTy AllocaIP, unsigned NumLoops,
4180 ArrayRef<llvm::Value *> StoreValues,
4181 const Twine &Name, bool IsDependSource) {
4182 assert(
4183 llvm::all_of(StoreValues,
4184 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
4185 "OpenMP runtime requires depend vec with i64 type");
4186
4187 if (!updateToLocation(Loc))
4188 return Loc.IP;
4189
4190 // Allocate space for vector and generate alloc instruction.
4191 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
4192 Builder.restoreIP(AllocaIP);
4193 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
4194 ArgsBase->setAlignment(Align(8));
4195 Builder.restoreIP(Loc.IP);
4196
4197 // Store the index value with offset in depend vector.
4198 for (unsigned I = 0; I < NumLoops; ++I) {
4199 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
4200 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
4201 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
4202 STInst->setAlignment(Align(8));
4203 }
4204
4205 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
4206 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
4207
4208 uint32_t SrcLocStrSize;
4209 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4210 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4211 Value *ThreadId = getOrCreateThreadID(Ident);
4212 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
4213
4214 Function *RTLFn = nullptr;
4215 if (IsDependSource)
4216 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
4217 else
4218 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
4219 Builder.CreateCall(RTLFn, Args);
4220
4221 return Builder.saveIP();
4222}
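// E.g. `#pragma omp ordered depend(source)` in a doacross loop nest is
// expected to reach this with IsDependSource=true (lowering to
// __kmpc_doacross_post), while `depend(sink: ...)` uses IsDependSource=false
// (lowering to __kmpc_doacross_wait), passing one i64 value per loop in the
// nest.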
4223
4224OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createOrderedThreadsSimd(
4225 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
4226 FinalizeCallbackTy FiniCB, bool IsThreads) {
4227 if (!updateToLocation(Loc))
4228 return Loc.IP;
4229
4230 Directive OMPD = Directive::OMPD_ordered;
4231 Instruction *EntryCall = nullptr;
4232 Instruction *ExitCall = nullptr;
4233
4234 if (IsThreads) {
4235 uint32_t SrcLocStrSize;
4236 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4237 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4238 Value *ThreadId = getOrCreateThreadID(Ident);
4239 Value *Args[] = {Ident, ThreadId};
4240
4241 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
4242 EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4243
4244 Function *ExitRTLFn =
4245 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
4246 ExitCall = Builder.CreateCall(ExitRTLFn, Args);
4247 }
4248
4249 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4250 /*Conditional*/ false, /*hasFinalize*/ true);
4251}
4252
4253OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::EmitOMPInlinedRegion(
4254 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
4255 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
4256 bool HasFinalize, bool IsCancellable) {
4257
4258 if (HasFinalize)
4259 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
4260
4261 // Create inlined region's entry and body blocks, in preparation
4262 // for conditional creation
4263 BasicBlock *EntryBB = Builder.GetInsertBlock();
4264 Instruction *SplitPos = EntryBB->getTerminator();
4265 if (!isa_and_nonnull<BranchInst>(SplitPos))
4266 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
4267 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
4268 BasicBlock *FiniBB =
4269 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
4270
4271 Builder.SetInsertPoint(EntryBB->getTerminator());
4272 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
4273
4274 // generate body
4275 BodyGenCB(/* AllocaIP */ InsertPointTy(),
4276 /* CodeGenIP */ Builder.saveIP());
4277
4278 // emit exit call and do any needed finalization.
4279 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
4280 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
4281 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
4282 "Unexpected control flow graph state!!");
4283 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
4284 assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
4285 "Unexpected Control Flow State!");
4286 MergeBlockIntoPredecessor(FiniBB);
4287
4288 // If we are skipping the region of a non-conditional, remove the exit
4289 // block, and clear the builder's insertion point.
4290 assert(SplitPos->getParent() == ExitBB &&
4291 "Unexpected Insertion point location!");
4292 auto merged = MergeBlockIntoPredecessor(ExitBB);
4293 BasicBlock *ExitPredBB = SplitPos->getParent();
4294 auto InsertBB = merged ? ExitPredBB : ExitBB;
4295 if (!isa_and_nonnull<BranchInst>(SplitPos))
4296 SplitPos->eraseFromParent();
4297 Builder.SetInsertPoint(InsertBB);
4298
4299 return Builder.saveIP();
4300}
4301
4302OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
4303 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
4304 // If there is nothing to do, return the current insertion point.
4305 if (!Conditional || !EntryCall)
4306 return Builder.saveIP();
4307
4308 BasicBlock *EntryBB = Builder.GetInsertBlock();
4309 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
4310 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
4311 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
4312
4313 // Emit thenBB and set the Builder's insertion point there for
4314 // body generation next. Place the block after the current block.
4315 Function *CurFn = EntryBB->getParent();
4316 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
4317
4318 // Move Entry branch to end of ThenBB, and replace with conditional
4319 // branch (If-stmt)
4320 Instruction *EntryBBTI = EntryBB->getTerminator();
4321 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
4322 EntryBBTI->removeFromParent();
4323 Builder.SetInsertPoint(UI);
4324 Builder.Insert(EntryBBTI);
4325 UI->eraseFromParent();
4326 Builder.SetInsertPoint(ThenBB->getTerminator());
4327
4328 // return an insertion point to ExitBB.
4329 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
4330}
4331
4332OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveExit(
4333 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
4334 bool HasFinalize) {
4335
4336 Builder.restoreIP(FinIP);
4337
4338 // If there is finalization to do, emit it before the exit call
4339 if (HasFinalize) {
4340 assert(!FinalizationStack.empty() &&
4341 "Unexpected finalization stack state!");
4342
4343 FinalizationInfo Fi = FinalizationStack.pop_back_val();
4344 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
4345
4346 Fi.FiniCB(FinIP);
4347
4348 BasicBlock *FiniBB = FinIP.getBlock();
4349 Instruction *FiniBBTI = FiniBB->getTerminator();
4350
4351 // set Builder IP for call creation
4352 Builder.SetInsertPoint(FiniBBTI);
4353 }
4354
4355 if (!ExitCall)
4356 return Builder.saveIP();
4357
4358 // Place the ExitCall as the last instruction before the finalization block's terminator.
4359 ExitCall->removeFromParent();
4360 Builder.Insert(ExitCall);
4361
4362 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
4363 ExitCall->getIterator());
4364}
4365
4366OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
4367 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
4368 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
4369 if (!IP.isSet())
4370 return IP;
4371
4372 IRBuilder<>::InsertPointGuard IPG(Builder);
4373
4374 // creates the following CFG structure
4375 // OMP_Entry : (MasterAddr != PrivateAddr)?
4376 // F T
4377 // | \
4378 // | copyin.not.master
4379 // | /
4380 // v /
4381 // copyin.not.master.end
4382 // |
4383 // v
4384 // OMP.Entry.Next
4385
4386 BasicBlock *OMP_Entry = IP.getBlock();
4387 Function *CurFn = OMP_Entry->getParent();
4388 BasicBlock *CopyBegin =
4389 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
4390 BasicBlock *CopyEnd = nullptr;
4391
4392 // If entry block is terminated, split to preserve the branch to following
4393 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
4394 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
4395 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
4396 "copyin.not.master.end");
4397 OMP_Entry->getTerminator()->eraseFromParent();
4398 } else {
4399 CopyEnd =
4400 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
4401 }
4402
4403 Builder.SetInsertPoint(OMP_Entry);
4404 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
4405 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
4406 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
4407 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
4408
4409 Builder.SetInsertPoint(CopyBegin);
4410 if (BranchtoEnd)
4411 Builder.CreateBr(CopyEnd);
4412
4413 return Builder.saveIP();
4414}
4415
4416CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
4417 Value *Size, Value *Allocator,
4418 std::string Name) {
4419 IRBuilder<>::InsertPointGuard IPG(Builder);
4420 updateToLocation(Loc);
4421
4422 uint32_t SrcLocStrSize;
4423 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4424 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4425 Value *ThreadId = getOrCreateThreadID(Ident);
4426 Value *Args[] = {ThreadId, Size, Allocator};
4427
4428 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
4429
4430 return Builder.CreateCall(Fn, Args, Name);
4431}
4432
4433CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
4434 Value *Addr, Value *Allocator,
4435 std::string Name) {
4436 IRBuilder<>::InsertPointGuard IPG(Builder);
4437 updateToLocation(Loc);
4438
4439 uint32_t SrcLocStrSize;
4440 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4441 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4442 Value *ThreadId = getOrCreateThreadID(Ident);
4443 Value *Args[] = {ThreadId, Addr, Allocator};
4444 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
4445 return Builder.CreateCall(Fn, Args, Name);
4446}
4447
4448CallInst *OpenMPIRBuilder::createOMPInteropInit(
4449 const LocationDescription &Loc, Value *InteropVar,
4450 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
4451 Value *DependenceAddress, bool HaveNowaitClause) {
4452 IRBuilder<>::InsertPointGuard IPG(Builder);
4453 updateToLocation(Loc);
4454
4455 uint32_t SrcLocStrSize;
4456 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4457 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4458 Value *ThreadId = getOrCreateThreadID(Ident);
4459 if (Device == nullptr)
4460 Device = ConstantInt::get(Int32, -1);
4461 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
4462 if (NumDependences == nullptr) {
4463 NumDependences = ConstantInt::get(Int32, 0);
4464 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
4465 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
4466 }
4467 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
4468 Value *Args[] = {
4469 Ident, ThreadId, InteropVar, InteropTypeVal,
4470 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
4471
4472 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
4473
4474 return Builder.CreateCall(Fn, Args);
4475}
4476
4477CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
4478 const LocationDescription &Loc, Value *InteropVar, Value *Device,
4479 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
4480 IRBuilder<>::InsertPointGuard IPG(Builder);
4481 updateToLocation(Loc);
4482
4483 uint32_t SrcLocStrSize;
4484 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4485 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4486 Value *ThreadId = getOrCreateThreadID(Ident);
4487 if (Device == nullptr)
4488 Device = ConstantInt::get(Int32, -1);
4489 if (NumDependences == nullptr) {
4490 NumDependences = ConstantInt::get(Int32, 0);
4491 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
4492 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
4493 }
4494 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
4495 Value *Args[] = {
4496 Ident, ThreadId, InteropVar, Device,
4497 NumDependences, DependenceAddress, HaveNowaitClauseVal};
4498
4499 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
4500
4501 return Builder.CreateCall(Fn, Args);
4502}
4503
4504CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
4505 Value *InteropVar, Value *Device,
4506 Value *NumDependences,
4507 Value *DependenceAddress,
4508 bool HaveNowaitClause) {
4509 IRBuilder<>::InsertPointGuard IPG(Builder);
4510 updateToLocation(Loc);
4511 uint32_t SrcLocStrSize;
4512 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4513 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4514 Value *ThreadId = getOrCreateThreadID(Ident);
4515 if (Device == nullptr)
4516 Device = ConstantInt::get(Int32, -1);
4517 if (NumDependences == nullptr) {
4518 NumDependences = ConstantInt::get(Int32, 0);
4519 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
4520 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
4521 }
4522 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
4523 Value *Args[] = {
4524 Ident, ThreadId, InteropVar, Device,
4525 NumDependences, DependenceAddress, HaveNowaitClauseVal};
4526
4527 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
4528
4529 return Builder.CreateCall(Fn, Args);
4530}
4531
4532CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
4533 const LocationDescription &Loc, llvm::Value *Pointer,
4534 llvm::ConstantInt *Size, const llvm::Twine &Name) {
4535 IRBuilder<>::InsertPointGuard IPG(Builder);
4536 updateToLocation(Loc);
4537
4538 uint32_t SrcLocStrSize;
4539 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4540 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4541 Value *ThreadId = getOrCreateThreadID(Ident);
4542 Constant *ThreadPrivateCache =
4543 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
4544 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
4545
4546 Function *Fn =
4547 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
4548
4549 return Builder.CreateCall(Fn, Args);
4550}
4551
4552OpenMPIRBuilder::InsertPointTy
4553OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,
4554 int32_t MinThreadsVal, int32_t MaxThreadsVal,
4555 int32_t MinTeamsVal, int32_t MaxTeamsVal) {
4556 if (!updateToLocation(Loc))
4557 return Loc.IP;
4558
4559 uint32_t SrcLocStrSize;
4560 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4561 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4562 Constant *IsSPMDVal = ConstantInt::getSigned(
4563 Int8, IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC);
4564 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(Int8, !IsSPMD);
4565 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
4566 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
4567
4568 Function *Kernel = Builder.GetInsertBlock()->getParent();
4569
4570 // Manifest the launch configuration in the metadata matching the kernel
4571 // environment.
4572 if (MinTeamsVal > 1 || MaxTeamsVal > 0)
4573 writeTeamsForKernel(T, *Kernel, MinTeamsVal, MaxTeamsVal);
4574
4575 // For max values, < 0 means unset, == 0 means set but unknown.
4576 if (MaxThreadsVal < 0)
4577 MaxThreadsVal = std::max(
4578 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), MinThreadsVal);
4579
4580 if (MaxThreadsVal > 0)
4581 writeThreadBoundsForKernel(T, *Kernel, MinThreadsVal, MaxThreadsVal);
4582
4583 Constant *MinThreads = ConstantInt::getSigned(Int32, MinThreadsVal);
4584 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
4585 Constant *MinTeams = ConstantInt::getSigned(Int32, MinTeamsVal);
4586 Constant *MaxTeams = ConstantInt::getSigned(Int32, MaxTeamsVal);
4587 Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0);
4588 Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0);
4589
4590 // We need to strip the debug prefix to get the correct kernel name.
4591 StringRef KernelName = Kernel->getName();
4592 const std::string DebugPrefix = "_debug__";
4593 if (KernelName.ends_with(DebugPrefix))
4594 KernelName = KernelName.drop_back(DebugPrefix.length());
4595
4596 Function *Fn = getOrCreateRuntimeFunctionPtr(
4597 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
4598 const DataLayout &DL = Fn->getParent()->getDataLayout();
4599
4600 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
4601 Constant *DynamicEnvironmentInitializer =
4602 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
4603 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
4604 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
4605 DynamicEnvironmentInitializer, DynamicEnvironmentName,
4606 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
4607 DL.getDefaultGlobalsAddressSpace());
4608 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
4609
4610 Constant *DynamicEnvironment =
4611 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
4612 ? DynamicEnvironmentGV
4613 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
4614 DynamicEnvironmentPtr);
4615
4616 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
4617 ConfigurationEnvironment, {
4618 UseGenericStateMachineVal,
4619 MayUseNestedParallelismVal,
4620 IsSPMDVal,
4621 MinThreads,
4622 MaxThreads,
4623 MinTeams,
4624 MaxTeams,
4625 ReductionDataSize,
4626 ReductionBufferLength,
4627 });
4628 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
4629 KernelEnvironment, {
4630 ConfigurationEnvironmentInitializer,
4631 Ident,
4632 DynamicEnvironment,
4633 });
4634 Twine KernelEnvironmentName = KernelName + "_kernel_environment";
4635 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
4636 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
4637 KernelEnvironmentInitializer, KernelEnvironmentName,
4638 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
4639 DL.getDefaultGlobalsAddressSpace());
4640 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
4641
4642 Constant *KernelEnvironment =
4643 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
4644 ? KernelEnvironmentGV
4645 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
4646 KernelEnvironmentPtr);
4647 Value *KernelLaunchEnvironment = Kernel->getArg(0);
4648 CallInst *ThreadKind =
4649 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
4650
4651 Value *ExecUserCode = Builder.CreateICmpEQ(
4652 ThreadKind, ConstantInt::get(ThreadKind->getType(), -1),
4653 "exec_user_code");
4654
4655 // ThreadKind = __kmpc_target_init(...)
4656 // if (ThreadKind == -1)
4657 // user_code
4658 // else
4659 // return;
4660
4661 auto *UI = Builder.CreateUnreachable();
4662 BasicBlock *CheckBB = UI->getParent();
4663 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
4664
4665 BasicBlock *WorkerExitBB = BasicBlock::Create(
4666 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
4667 Builder.SetInsertPoint(WorkerExitBB);
4668 Builder.CreateRetVoid();
4669
4670 auto *CheckBBTI = CheckBB->getTerminator();
4671 Builder.SetInsertPoint(CheckBBTI);
4672 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
4673
4674 CheckBBTI->eraseFromParent();
4675 UI->eraseFromParent();
4676
4677 // Continue in the "user_code" block, see diagram above and in
4678 // openmp/libomptarget/deviceRTLs/common/include/target.h .
4679 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
4680}
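// For a generic-mode kernel the emitted entry sequence is roughly (sketch,
// names abbreviated; the environment globals are the ones created above):
//
//   %tk = call i32 @__kmpc_target_init(ptr @<kernel>_kernel_environment,
//                                      ptr %dyn)
//   %exec_user_code = icmp eq i32 %tk, -1
//   br i1 %exec_user_code, label %user_code.entry, label %worker.exit
//
// worker.exit:
//   ret void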
4681
4682void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
4683 int32_t TeamsReductionDataSize,
4684 int32_t TeamsReductionBufferLength) {
4685 if (!updateToLocation(Loc))
4686 return;
4687
4688 Function *Fn = getOrCreateRuntimeFunctionPtr(
4689 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
4690
4691 Builder.CreateCall(Fn, {});
4692
4693 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
4694 return;
4695
4696 Function *Kernel = Builder.GetInsertBlock()->getParent();
4697 // We need to strip the debug prefix to get the correct kernel name.
4698 StringRef KernelName = Kernel->getName();
4699 const std::string DebugPrefix = "_debug__";
4700 if (KernelName.ends_with(DebugPrefix))
4701 KernelName = KernelName.drop_back(DebugPrefix.length());
4702 auto *KernelEnvironmentGV =
4703 M.getNamedGlobal((KernelName + "_kernel_environment").str());
4704 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
4705 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
4706 auto *NewInitializer = ConstantFoldInsertValueInstruction(
4707 KernelEnvironmentInitializer,
4708 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
4709 NewInitializer = ConstantFoldInsertValueInstruction(
4710 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
4711 {0, 8});
4712 KernelEnvironmentGV->setInitializer(NewInitializer);
4713}
4714
4715static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) {
4716 Module &M = *Kernel.getParent();
4717 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
4718 for (auto *Op : MD->operands()) {
4719 if (Op->getNumOperands() != 3)
4720 continue;
4721 auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
4722 if (!KernelOp || KernelOp->getValue() != &Kernel)
4723 continue;
4724 auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
4725 if (!Prop || Prop->getString() != Name)
4726 continue;
4727 return Op;
4728 }
4729 return nullptr;
4730}
4731
4732static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value,
4733 bool Min) {
4734 // Update the "maxntidx" metadata for NVIDIA, or add it.
4735 MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
4736 if (ExistingOp) {
4737 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
4738 int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
4739 ExistingOp->replaceOperandWith(
4740 2, ConstantAsMetadata::get(ConstantInt::get(
4741 OldVal->getValue()->getType(),
4742 Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
4743 } else {
4744 LLVMContext &Ctx = Kernel.getContext();
4745 Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel),
4746 MDString::get(Ctx, Name),
4747 ConstantAsMetadata::get(
4748 ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
4749 // Append metadata to nvvm.annotations
4750 Module &M = *Kernel.getParent();
4751 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
4752 MD->addOperand(MDNode::get(Ctx, MDVals));
4753 }
4754}
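// The resulting named metadata has the usual nvvm.annotations shape, e.g. for
// a hypothetical kernel @foo bounded to 128 threads:
//
//   !nvvm.annotations = !{!0}
//   !0 = !{ptr @foo, !"maxntidx", i32 128}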
4755
4756std::pair<int32_t, int32_t>
4757OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
4758 int32_t ThreadLimit =
4759 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
4760
4761 if (T.isAMDGPU()) {
4762 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
4763 if (!Attr.isValid() || !Attr.isStringAttribute())
4764 return {0, ThreadLimit};
4765 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
4766 int32_t LB, UB;
4767 if (!llvm::to_integer(UBStr, UB, 10))
4768 return {0, ThreadLimit};
4769 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
4770 if (!llvm::to_integer(LBStr, LB, 10))
4771 return {0, UB};
4772 return {LB, UB};
4773 }
4774
4775 if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
4776 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
4777 int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
4778 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
4779 }
4780 return {0, ThreadLimit};
4781}
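// Example: an AMDGPU kernel carrying "omp_target_thread_limit"="256" and
// "amdgpu-flat-work-group-size"="1,256" yields {1, 256}; on NVPTX an existing
// "maxntidx" annotation caps the upper bound instead.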
4782
4783void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
4784 Function &Kernel, int32_t LB,
4785 int32_t UB) {
4786 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
4787
4788 if (T.isAMDGPU()) {
4789 Kernel.addFnAttr("amdgpu-flat-work-group-size",
4790 llvm::utostr(LB) + "," + llvm::utostr(UB));
4791 return;
4792 }
4793
4794 updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
4795}
4796
4797std::pair<int32_t, int32_t>
4798OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
4799 // TODO: Read from backend annotations if available.
4800 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
4801}
4802
4803void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
4804 int32_t LB, int32_t UB) {
4805 if (T.isNVPTX())
4806 if (UB > 0)
4807 updateNVPTXMetadata(Kernel, "maxclusterrank", UB, true);
4808 if (T.isAMDGPU())
4809 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
4810
4811 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
4812}
4813
4814void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
4815 Function *OutlinedFn) {
4816 if (Config.isTargetDevice()) {
4817 OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
4818 // TODO: Determine if DSO local can be set to true.
4819 OutlinedFn->setDSOLocal(false);
4820 OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
4821 if (T.isAMDGCN())
4822 OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
4823 }
4824}
4825
4826Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
4827 StringRef EntryFnIDName) {
4828 if (Config.isTargetDevice()) {
4829 assert(OutlinedFn && "The outlined function must exist if embedded");
4830 return OutlinedFn;
4831 }
4832
4833 return new GlobalVariable(
4834 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
4835 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
4836}
4837
4838Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
4839 StringRef EntryFnName) {
4840 if (OutlinedFn)
4841 return OutlinedFn;
4842
4843 assert(!M.getGlobalVariable(EntryFnName, true) &&
4844 "Named kernel already exists?");
4845 return new GlobalVariable(
4846 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
4847 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
4848}
4849
4850void OpenMPIRBuilder::emitTargetRegionFunction(
4851 TargetRegionEntryInfo &EntryInfo,
4852 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
4853 Function *&OutlinedFn, Constant *&OutlinedFnID) {
4854
4855 SmallString<64> EntryFnName;
4856 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
4857
4858 OutlinedFn = Config.isTargetDevice() || !Config.openMPOffloadMandatory()
4859 ? GenerateFunctionCallback(EntryFnName)
4860 : nullptr;
4861
4862 // If this target outline function is not an offload entry, we don't need to
4863 // register it. This may be the case with a false if clause, or when there
4864 // are no OpenMP targets.
4865 if (!IsOffloadEntry)
4866 return;
4867
4868 std::string EntryFnIDName =
4869 Config.isTargetDevice()
4870 ? std::string(EntryFnName)
4871 : createPlatformSpecificName({EntryFnName, "region_id"});
4872
4873 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
4874 EntryFnName, EntryFnIDName);
4875}
4876
4877Constant *OpenMPIRBuilder::registerTargetRegionFunction(
4878 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
4879 StringRef EntryFnName, StringRef EntryFnIDName) {
4880 if (OutlinedFn)
4881 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
4882 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
4883 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
4884 OffloadInfoManager.registerTargetRegionEntryInfo(
4885 EntryInfo, EntryAddr, OutlinedFnID,
4886 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
4887 return OutlinedFnID;
4888}
4889
4890OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetData(
4891 const LocationDescription &Loc, InsertPointTy AllocaIP,
4892 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
4893 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
4894 omp::RuntimeFunction *MapperFunc,
4895 function_ref<InsertPointTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)>
4896 BodyGenCB,
4897 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
4898 function_ref<Value *(unsigned int)> CustomMapperCB, Value *SrcLocInfo) {
4899 if (!updateToLocation(Loc))
4900 return InsertPointTy();
4901
4902 // Disable TargetData CodeGen on Device pass.
4903 if (Config.IsTargetDevice.value_or(false)) {
4904 if (BodyGenCB)
4905 Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv));
4906 return Builder.saveIP();
4907 }
4908
4909 Builder.restoreIP(CodeGenIP);
4910 bool IsStandAlone = !BodyGenCB;
4911 MapInfosTy *MapInfo;
4912 // Generate the code for the opening of the data environment. Capture all the
4913 // arguments of the runtime call by reference because they are used in the
4914 // closing of the region.
4915 auto BeginThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
4916 MapInfo = &GenMapInfoCB(Builder.saveIP());
4917 emitOffloadingArrays(AllocaIP, Builder.saveIP(), *MapInfo, Info,
4918 /*IsNonContiguous=*/true, DeviceAddrCB,
4919 CustomMapperCB);
4920
4921 TargetDataRTArgs RTArgs;
4922 emitOffloadingArraysArgument(Builder, RTArgs, Info,
4923 !MapInfo->Names.empty());
4924
4925 // Emit the number of elements in the offloading arrays.
4926 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
4927
4928 // Source location for the ident struct
4929 if (!SrcLocInfo) {
4930 uint32_t SrcLocStrSize;
4931 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4932 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4933 }
4934
4935 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
4936 PointerNum, RTArgs.BasePointersArray,
4937 RTArgs.PointersArray, RTArgs.SizesArray,
4938 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
4939 RTArgs.MappersArray};
4940
4941 if (IsStandAlone) {
4942 assert(MapperFunc && "MapperFunc missing for standalone target data");
4943 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
4944 OffloadingArgs);
4945 } else {
4946 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
4947 omp::OMPRTL___tgt_target_data_begin_mapper);
4948
4949 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
4950
4951 for (auto DeviceMap : Info.DevicePtrInfoMap) {
4952 if (isa<AllocaInst>(DeviceMap.second.second)) {
4953 auto *LI =
4954 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
4955 Builder.CreateStore(LI, DeviceMap.second.second);
4956 }
4957 }
4958
4959 // If device pointer privatization is required, emit the body of the
4960 // region here. It will have to be duplicated: with and without
4961 // privatization.
4962 Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::Priv));
4963 }
4964 };
4965
4966 // If we need device pointer privatization, we need to emit the body of the
4967 // region with no privatization in the 'else' branch of the conditional.
4968 // Otherwise, we don't have to do anything.
4969 auto BeginElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
4970 Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv));
4971 };
4972
4973 // Generate code for the closing of the data region.
4974 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
4975 TargetDataRTArgs RTArgs;
4976 emitOffloadingArraysArgument(Builder, RTArgs, Info, !MapInfo->Names.empty(),
4977 /*ForEndCall=*/true);
4978
4979 // Emit the number of elements in the offloading arrays.
4980 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
4981
4982 // Source location for the ident struct
4983 if (!SrcLocInfo) {
4984 uint32_t SrcLocStrSize;
4985 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4986 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4987 }
4988
4989 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
4990 PointerNum, RTArgs.BasePointersArray,
4991 RTArgs.PointersArray, RTArgs.SizesArray,
4992 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
4993 RTArgs.MappersArray};
4994 Function *EndMapperFunc =
4995 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
4996
4997 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
4998 };
4999
5000 // We don't have to do anything to close the region if the if clause evaluates
5001 // to false.
5002 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {};
5003
5004 if (BodyGenCB) {
5005 if (IfCond) {
5006 emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
5007 } else {
5008 BeginThenGen(AllocaIP, Builder.saveIP());
5009 }
5010
5011 // If we don't require privatization of device pointers, we emit the body in
5012 // between the runtime calls. This avoids duplicating the body code.
5013 Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv));
5014
5015 if (IfCond) {
5016 emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
5017 } else {
5018 EndThenGen(AllocaIP, Builder.saveIP());
5019 }
5020 } else {
5021 if (IfCond) {
5022 emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
5023 } else {
5024 BeginThenGen(AllocaIP, Builder.saveIP());
5025 }
5026 }
5027
5028 return Builder.saveIP();
5029}
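// Example (sketch): wrapping a body in a data environment. Assumes a caller
// that provides `Info`, `GenMapInfoCB`, and `BodyGenCB`; the device ID and
// omitted trailing callbacks are illustrative only:
//
//   Builder.restoreIP(OMPBuilder.createTargetData(
//       Loc, AllocaIP, Builder.saveIP(), Builder.getInt64(OMP_DEVICEID_UNDEF),
//       /*IfCond=*/nullptr, Info, GenMapInfoCB, /*MapperFunc=*/nullptr,
//       BodyGenCB));
//
// With a BodyGenCB this brackets the body with __tgt_target_data_begin_mapper
// and __tgt_target_data_end_mapper; without one it emits the single
// standalone MapperFunc call.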
5030
5031FunctionCallee
5032OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool IVSigned,
5033 bool IsGPUDistribute) {
5034 assert((IVSize == 32 || IVSize == 64) &&
5035 "IV size is not compatible with the omp runtime");
5036 RuntimeFunction Name;
5037 if (IsGPUDistribute)
5038 Name = IVSize == 32
5039 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
5040 : omp::OMPRTL___kmpc_distribute_static_init_4u)
5041 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
5042 : omp::OMPRTL___kmpc_distribute_static_init_8u);
5043 else
5044 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
5045 : omp::OMPRTL___kmpc_for_static_init_4u)
5046 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
5047 : omp::OMPRTL___kmpc_for_static_init_8u);
5048
5049 return getOrCreateRuntimeFunction(M, Name);
5050}
5051
5052FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
5053 bool IVSigned) {
5054 assert((IVSize == 32 || IVSize == 64) &&
5055 "IV size is not compatible with the omp runtime");
5056 RuntimeFunction Name = IVSize == 32
5057 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
5058 : omp::OMPRTL___kmpc_dispatch_init_4u)
5059 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
5060 : omp::OMPRTL___kmpc_dispatch_init_8u);
5061
5062 return getOrCreateRuntimeFunction(M, Name);
5063}
5064
5065FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
5066 bool IVSigned) {
5067 assert((IVSize == 32 || IVSize == 64) &&
5068 "IV size is not compatible with the omp runtime");
5069 RuntimeFunction Name = IVSize == 32
5070 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
5071 : omp::OMPRTL___kmpc_dispatch_next_4u)
5072 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
5073 : omp::OMPRTL___kmpc_dispatch_next_8u);
5074
5075 return getOrCreateRuntimeFunction(M, Name);
5076}
5077
5078FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
5079 bool IVSigned) {
5080 assert((IVSize == 32 || IVSize == 64) &&
5081 "IV size is not compatible with the omp runtime");
5082 RuntimeFunction Name = IVSize == 32
5083 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
5084 : omp::OMPRTL___kmpc_dispatch_fini_4u)
5085 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
5086 : omp::OMPRTL___kmpc_dispatch_fini_8u);
5087
5088 return getOrCreateRuntimeFunction(M, Name);
5089}
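// Example: IVSize == 32 with IVSigned == true selects __kmpc_dispatch_fini_4,
// while IVSize == 64 with IVSigned == false selects __kmpc_dispatch_fini_8u;
// the init/next variants follow the same 4/4u/8/8u suffix scheme.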
5090
5091static void replaceConstatExprUsesInFuncWithInstr(ConstantExpr *ConstExpr,
5092 Function *Func) {
5093 for (User *User : make_early_inc_range(ConstExpr->users())) {
5094 if (auto *Instr = dyn_cast<Instruction>(User)) {
5095 if (Instr->getFunction() == Func) {
5096 Instruction *ConstInst = ConstExpr->getAsInstruction();
5097 ConstInst->insertBefore(*Instr->getParent(), Instr->getIterator());
5098 Instr->replaceUsesOfWith(ConstExpr, ConstInst);
5099 }
5100 }
5101 }
5102}
5103
5104static void replaceConstantValueUsesInFuncWithInstr(llvm::Value *Input,
5105 Function *Func) {
5106 for (User *User : make_early_inc_range(Input->users()))
5107 if (auto *Const = dyn_cast<Constant>(User))
5108 if (auto *ConstExpr = dyn_cast<ConstantExpr>(Const))
5109 replaceConstatExprUsesInFuncWithInstr(ConstExpr, Func);
5110}
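// Example: a folded constant use of a global such as
//
//   getelementptr inbounds ([4 x i32], ptr @g, i64 0, i64 2)
//
// is re-materialized as an explicit GEP instruction inside Func, so that
// replaceUsesOfWith can later redirect @g to its argument copy.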
5111
5112static Function *createOutlinedFunction(
5113 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName,
5114 SmallVectorImpl<Value *> &Inputs,
5115 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
5116 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
5117 SmallVector<Type *> ParameterTypes;
5118 if (OMPBuilder.Config.isTargetDevice()) {
5119 // Add the "implicit" runtime argument we use to provide launch specific
5120 // information for target devices.
5121 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
5122 ParameterTypes.push_back(Int8PtrTy);
5123
5124 // All parameters to target devices are passed as pointers
5125 // or i64. This assumes 64-bit address spaces/pointers.
5126 for (auto &Arg : Inputs)
5127 ParameterTypes.push_back(Arg->getType()->isPointerTy()
5128 ? Arg->getType()
5129 : Type::getInt64Ty(Builder.getContext()));
5130 } else {
5131 for (auto &Arg : Inputs)
5132 ParameterTypes.push_back(Arg->getType());
5133 }
5134
5135 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
5136 /*isVarArg*/ false);
5137 auto Func = Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName,
5138 Builder.GetInsertBlock()->getModule());
5139
5140 // Save insert point.
5141 auto OldInsertPoint = Builder.saveIP();
5142
5143 // Generate the region into the function.
5144 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
5145 Builder.SetInsertPoint(EntryBB);
5146
5147 // Insert target init call in the device compilation pass.
5148 if (OMPBuilder.Config.isTargetDevice())
5149 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, /*IsSPMD*/ false));
5150
5151 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
5152
5153 // As we embed the user code in the middle of our target region after we
5154 // generate entry code, we must move what allocas we can into the entry
5155 // block, to avoid breaking device optimisations that expect them there.
5156 if (OMPBuilder.Config.isTargetDevice())
5157 OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
5158
5159 // Insert target deinit call in the device compilation pass.
5160 Builder.restoreIP(CBFunc(Builder.saveIP(), Builder.saveIP()));
5161 if (OMPBuilder.Config.isTargetDevice())
5162 OMPBuilder.createTargetDeinit(Builder);
5163
5164 // Insert return instruction.
5165 Builder.CreateRetVoid();
5166
5167 // New Alloca IP at entry point of created device function.
5168 Builder.SetInsertPoint(EntryBB->getFirstNonPHI());
5169 auto AllocaIP = Builder.saveIP();
5170
5171 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
5172
5173 // Skip the artificial dyn_ptr on the device.
5174 const auto &ArgRange =
5175 OMPBuilder.Config.isTargetDevice()
5176 ? make_range(Func->arg_begin() + 1, Func->arg_end())
5177 : Func->args();
5178
5179 // Rewrite uses of input values to parameters.
5180 for (auto InArg : zip(Inputs, ArgRange)) {
5181 Value *Input = std::get<0>(InArg);
5182 Argument &Arg = std::get<1>(InArg);
5183 Value *InputCopy = nullptr;
5184
5185 Builder.restoreIP(
5186 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP()));
5187
5188 // Things like GEPs can come in the form of Constants. Constants and
5189 // ConstantExprs do not know which function they are contained in, so we
5190 // must dig a little to find an instruction that tells us whether they are
5191 // used inside the function we are outlining. We also replace the original
5192 // constant expression with a new, equivalent instruction: an instruction
5193 // is owned by our target function, so replaceUsesOfWith can be invoked on
5194 // it in the following loop (this does not appear to be possible with
5195 // constants). A brand new instruction is also safer in the unlikely event
5196 // that the old expression is still used elsewhere outside the function.
5200 replaceConstantValueUsesInFuncWithInstr(Input, Func);
5201
5202 // Collect all the instructions
5203 for (User *User : make_early_inc_range(Input->users()))
5204 if (auto *Instr = dyn_cast<Instruction>(User))
5205 if (Instr->getFunction() == Func)
5206 Instr->replaceUsesOfWith(Input, InputCopy);
5207 }
5208
5209 // Restore insert point.
5210 Builder.restoreIP(OldInsertPoint);
5211
5212 return Func;
5213}
5214
5215static void emitTargetOutlinedFunction(
5216 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
5217 TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn,
5218 Constant *&OutlinedFnID, SmallVectorImpl<Value *> &Inputs,
5219 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
5220 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
5221
5222 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
5223 [&OMPBuilder, &Builder, &Inputs, &CBFunc,
5224 &ArgAccessorFuncCB](StringRef EntryFnName) {
5225 return createOutlinedFunction(OMPBuilder, Builder, EntryFnName, Inputs,
5226 CBFunc, ArgAccessorFuncCB);
5227 };
5228
5229 OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction, true,
5230 OutlinedFn, OutlinedFnID);
5231}
5232
5233static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
5234 OpenMPIRBuilder::InsertPointTy AllocaIP,
5235 Function *OutlinedFn, Constant *OutlinedFnID,
5236 int32_t NumTeams, int32_t NumThreads,
5237 SmallVectorImpl<Value *> &Args,
5238 OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB) {
5239
5240 OpenMPIRBuilder::TargetDataInfo Info(
5241 /*RequiresDevicePointerInfo=*/false,
5242 /*SeparateBeginEndCalls=*/true);
5243
5244 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
5245 OMPBuilder.emitOffloadingArrays(AllocaIP, Builder.saveIP(), MapInfo, Info,
5246 /*IsNonContiguous=*/true);
5247
5248 OpenMPIRBuilder::TargetDataRTArgs RTArgs;
5249 OMPBuilder.emitOffloadingArraysArgument(Builder, RTArgs, Info,
5250 !MapInfo.Names.empty());
5251
5252 // emitKernelLaunch
5253 auto &&EmitTargetCallFallbackCB =
5254 [&](OpenMPIRBuilder::InsertPointTy IP) -> OpenMPIRBuilder::InsertPointTy {
5255 Builder.restoreIP(IP);
5256 Builder.CreateCall(OutlinedFn, Args);
5257 return Builder.saveIP();
5258 };
5259
5260 unsigned NumTargetItems = MapInfo.BasePointers.size();
5261 // TODO: Use correct device ID
5262 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
5263 Value *NumTeamsVal = Builder.getInt32(NumTeams);
5264 Value *NumThreadsVal = Builder.getInt32(NumThreads);
5265 uint32_t SrcLocStrSize;
5266 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
5267 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
5268 llvm::omp::IdentFlag(0), 0);
5269 // TODO: Use correct NumIterations
5270 Value *NumIterations = Builder.getInt64(0);
5271 // TODO: Use correct DynCGGroupMem
5272 Value *DynCGGroupMem = Builder.getInt32(0);
5273
5274 bool HasNoWait = false;
5275
5276 OpenMPIRBuilder::TargetKernelArgs KArgs(NumTargetItems, RTArgs, NumIterations,
5277 NumTeamsVal, NumThreadsVal,
5278 DynCGGroupMem, HasNoWait);
5279
5280 Builder.restoreIP(OMPBuilder.emitKernelLaunch(
5281 Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
5282 DeviceID, RTLoc, AllocaIP));
5283}
5284
5285OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
5286 const LocationDescription &Loc, InsertPointTy AllocaIP,
5287 InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams,
5288 int32_t NumThreads, SmallVectorImpl<Value *> &Args,
5289 GenMapInfoCallbackTy GenMapInfoCB,
5290 TargetBodyGenCallbackTy CBFunc,
5291 TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB) {
5292 if (!updateToLocation(Loc))
5293 return InsertPointTy();
5294
5295 Builder.restoreIP(CodeGenIP);
5296
5297 Function *OutlinedFn;
5298 Constant *OutlinedFnID;
5299 emitTargetOutlinedFunction(*this, Builder, EntryInfo, OutlinedFn,
5300 OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB);
5301 if (!Config.isTargetDevice())
5302 emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams,
5303 NumThreads, Args, GenMapInfoCB);
5304
5305 return Builder.saveIP();
5306}
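// Example (sketch): emitting a target region. Assumes caller-provided
// `EntryInfo`, `Inputs`, and the three callbacks; the team/thread counts are
// illustrative:
//
//   Builder.restoreIP(OMPBuilder.createTarget(
//       Loc, AllocaIP, Builder.saveIP(), EntryInfo, /*NumTeams=*/-1,
//       /*NumThreads=*/0, Inputs, GenMapInfoCB, BodyGenCB, ArgAccessorCB));
//
// On the device this only outlines the kernel; on the host it additionally
// emits the kernel launch, with a direct call to the outlined function as
// fallback.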
5307
5308std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
5309 StringRef FirstSeparator,
5310 StringRef Separator) {
5311 SmallString<128> Buffer;
5312 llvm::raw_svector_ostream OS(Buffer);
5313 StringRef Sep = FirstSeparator;
5314 for (StringRef Part : Parts) {
5315 OS << Sep << Part;
5316 Sep = Separator;
5317 }
5318 return OS.str().str();
5319}
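// Example: getNameWithSeparators({"omp", "foo"}, ".", "$") yields ".omp$foo";
// the first separator is emitted once, the second between remaining parts.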
5320
5321std::string
5322OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
5323 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
5324 Config.separator());
5325}
5326
5327GlobalVariable *
5328OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
5329 unsigned AddressSpace) {
5330 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
5331 if (Elem.second) {
5332 assert(Elem.second->getValueType() == Ty &&
5333 "OMP internal variable has different type than requested");
5334 } else {
5335 // TODO: investigate the appropriate linkage type used for the global
5336 // variable for possibly changing that to internal or private, or maybe
5337 // create different versions of the function for different OMP internal
5338 // variables.
5339 auto Linkage = this->M.getTargetTriple().rfind("wasm32") == 0
5340 ? GlobalValue::ExternalLinkage
5341 : GlobalValue::CommonLinkage;
5342 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
5343 Constant::getNullValue(Ty), Elem.first(),
5344 /*InsertBefore=*/nullptr,
5345 GlobalValue::NotThreadLocal, AddressSpace);
5346 const DataLayout &DL = M.getDataLayout();
5347 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
5348 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
5349 GV->setAlignment(std::max(TypeAlign, PtrAlign));
5350 Elem.second = GV;
5351 }
5352
5353 return Elem.second;
5354}
5355
5356Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
5357 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
5358 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
5359 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
5360}
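// Example: getOMPCriticalRegionLock("foo") returns the internal variable named
// ".gomp_critical_user_foo.var", shared by every critical region of that name.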
5361
5362Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
5363 LLVMContext &Ctx = Builder.getContext();
5364 Value *Null =
5365 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
5366 Value *SizeGep =
5367 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
5368 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
5369 return SizePtrToInt;
5370}
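// This is the classic "sizeof via GEP" trick; the emitted IR is roughly
//
//   %gep = getelementptr ptr, ptr null, i32 1
//   %size = ptrtoint ptr %gep to i64
//
// i.e. the byte offset of element 1 from a null base pointer.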
5371
5372GlobalVariable *
5373OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
5374 std::string VarName) {
5375 llvm::Constant *MaptypesArrayInit =
5376 llvm::ConstantDataArray::get(M.getContext(), Mappings);
5377 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
5378 M, MaptypesArrayInit->getType(),
5379 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
5380 VarName);
5381 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
5382 return MaptypesArrayGlobal;
5383}
5384
5385void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
5386 InsertPointTy AllocaIP,
5387 unsigned NumOperands,
5388 struct MapperAllocas &MapperAllocas) {
5389 if (!updateToLocation(Loc))
5390 return;
5391
5392 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
5393 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
5394 Builder.restoreIP(AllocaIP);
5395 AllocaInst *ArgsBase = Builder.CreateAlloca(
5396 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
5397 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
5398 ".offload_ptrs");
5399 AllocaInst *ArgSizes = Builder.CreateAlloca(
5400 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
5401 Builder.restoreIP(Loc.IP);
5402 MapperAllocas.ArgsBase = ArgsBase;
5403 MapperAllocas.Args = Args;
5404 MapperAllocas.ArgSizes = ArgSizes;
5405}
5406
5407void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
5408 Function *MapperFunc, Value *SrcLocInfo,
5409 Value *MaptypesArg, Value *MapnamesArg,
5410 struct MapperAllocas &MapperAllocas,
5411 int64_t DeviceID, unsigned NumOperands) {
5412 if (!updateToLocation(Loc))
5413 return;
5414
5415 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
5416 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
5417 Value *ArgsBaseGEP =
5418 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
5419 {Builder.getInt32(0), Builder.getInt32(0)});
5420 Value *ArgsGEP =
5421 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
5422 {Builder.getInt32(0), Builder.getInt32(0)});
5423 Value *ArgSizesGEP =
5424 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
5425 {Builder.getInt32(0), Builder.getInt32(0)});
5426 Value *NullPtr =
5427 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
5428 Builder.CreateCall(MapperFunc,
5429 {SrcLocInfo, Builder.getInt64(DeviceID),
5430 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
5431 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
5432}
5433
5434void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
5435 TargetDataRTArgs &RTArgs,
5436 TargetDataInfo &Info,
5437 bool EmitDebug,
5438 bool ForEndCall) {
5439 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
5440 "expected region end call to runtime only when end call is separate");
5441 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
5442 auto VoidPtrTy = UnqualPtrTy;
5443 auto VoidPtrPtrTy = UnqualPtrTy;
5444 auto Int64Ty = Type::getInt64Ty(M.getContext());
5445 auto Int64PtrTy = UnqualPtrTy;
5446
5447 if (!Info.NumberOfPtrs) {
5448 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
5449 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
5450 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
5451 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
5452 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
5453 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
5454 return;
5455 }
5456
5457 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
5458 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
5459 Info.RTArgs.BasePointersArray,
5460 /*Idx0=*/0, /*Idx1=*/0);
5461 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
5462 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
5463 /*Idx0=*/0,
5464 /*Idx1=*/0);
5465 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
5466 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
5467 /*Idx0=*/0, /*Idx1=*/0);
5468 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
5469 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
5470 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
5471 : Info.RTArgs.MapTypesArray,
5472 /*Idx0=*/0,
5473 /*Idx1=*/0);
5474
5475 // Only emit the mapper information arrays if debug information is
5476 // requested.
5477 if (!EmitDebug)
5478 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
5479 else
5480 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
5481 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
5482 /*Idx0=*/0,
5483 /*Idx1=*/0);
5484 // If there is no user-defined mapper, set the mapper array to nullptr to
5485 // avoid an unnecessary data privatization
5486 if (!Info.HasMapper)
5487 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
5488 else
5489 RTArgs.MappersArray =
5490 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
5491}
5492
5493void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
5494 InsertPointTy CodeGenIP,
5495 MapInfosTy &CombinedInfo,
5496 TargetDataInfo &Info) {
5497 MapInfosTy::StructNonContiguousInfo &NonContigInfo =
5498 CombinedInfo.NonContigInfo;
5499
5500 // Build an array of struct descriptor_dim and then assign it to
5501 // offload_args.
5502 //
5503 // struct descriptor_dim {
5504 // uint64_t offset;
5505 // uint64_t count;
5506 // uint64_t stride;
5507 // };
5508 Type *Int64Ty = Builder.getInt64Ty();
5509 StructType *DimTy = StructType::create(
5510 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
5511 "struct.descriptor_dim");
5512
5513 enum { OffsetFD = 0, CountFD, StrideFD };
5514 // We need two index variables here: the size of "Dims" matches the number
5515 // of components, whereas the sizes of offset, count, and stride match the
5516 // number of non-contiguous base declarations.
5517 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
5518 // Skip emitting IR if the dimension size is 1, since it cannot be
5519 // non-contiguous.
5520 if (NonContigInfo.Dims[I] == 1)
5521 continue;
5522 Builder.restoreIP(AllocaIP);
5523 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
5524 AllocaInst *DimsAddr =
5525 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
5526 Builder.restoreIP(CodeGenIP);
5527 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
5528 unsigned RevIdx = EE - II - 1;
5529 Value *DimsLVal = Builder.CreateInBoundsGEP(
5530 DimsAddr->getAllocatedType(), DimsAddr,
5531 {Builder.getInt64(0), Builder.getInt64(II)});
5532 // Offset
5533 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
5534 Builder.CreateAlignedStore(
5535 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
5536 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
5537 // Count
5538 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
5539 Builder.CreateAlignedStore(
5540 NonContigInfo.Counts[L][RevIdx], CountLVal,
5541 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
5542 // Stride
5543 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
5544 Builder.CreateAlignedStore(
5545 NonContigInfo.Strides[L][RevIdx], StrideLVal,
5546 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
5547 }
5548 // args[I] = &dims
5549 Builder.restoreIP(CodeGenIP);
5551 DimsAddr, Builder.getPtrTy());
5553 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
5554 Info.RTArgs.PointersArray, 0, I);
5557 ++L;
5558 }
5559}
5560
5561void OpenMPIRBuilder::emitOffloadingArrays(
5562 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
5563 TargetDataInfo &Info, bool IsNonContiguous,
5564 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
5565 function_ref<Value *(unsigned int)> CustomMapperCB) {
5566
5567 // Reset the array information.
5568 Info.clearArrayInfo();
5569 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
5570
5571 if (Info.NumberOfPtrs == 0)
5572 return;
5573
5574 Builder.restoreIP(AllocaIP);
5575 // Detect whether any capture size requires runtime evaluation of the size,
5576 // so that a constant array can be used wherever possible.
5577 ArrayType *PointerArrayType =
5578 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
5579
5580 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
5581 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
5582
5583 Info.RTArgs.PointersArray = Builder.CreateAlloca(
5584 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
5585 AllocaInst *MappersArray = Builder.CreateAlloca(
5586 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
5587 Info.RTArgs.MappersArray = MappersArray;
5588
5589 // If we don't have any VLA types or other types that require runtime
5590 // evaluation, we can use a constant array for the map sizes, otherwise we
5591 // need to fill up the arrays as we do for the pointers.
5592 Type *Int64Ty = Builder.getInt64Ty();
5593 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
5594 ConstantInt::get(Int64Ty, 0));
5595 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
5596 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
5597 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
5598 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
5599 if (IsNonContiguous &&
5600 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
5601 CombinedInfo.Types[I] &
5602 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
5603 ConstSizes[I] =
5604 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
5605 else
5606 ConstSizes[I] = CI;
5607 continue;
5608 }
5609 }
5610 RuntimeSizes.set(I);
5611 }
5612
5613 if (RuntimeSizes.all()) {
5614 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
5615 Info.RTArgs.SizesArray = Builder.CreateAlloca(
5616 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
5617 Builder.restoreIP(CodeGenIP);
5618 } else {
5619 auto *SizesArrayInit = ConstantArray::get(
5620 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
5621 std::string Name = createPlatformSpecificName({"offload_sizes"});
5622 auto *SizesArrayGbl =
5623 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
5624 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
5625 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
5626
5627 if (!RuntimeSizes.any()) {
5628 Info.RTArgs.SizesArray = SizesArrayGbl;
5629 } else {
5630 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
5631 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
5632 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
5633 AllocaInst *Buffer = Builder.CreateAlloca(
5634 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
5635 Buffer->setAlignment(OffloadSizeAlign);
5636 Builder.restoreIP(CodeGenIP);
5637 Builder.CreateMemCpy(
5638 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
5639 SizesArrayGbl, OffloadSizeAlign,
5640 Builder.getIntN(
5641 IndexSize,
5642 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
5643
5644 Info.RTArgs.SizesArray = Buffer;
5645 }
5646 Builder.restoreIP(CodeGenIP);
5647 }
5648
5649 // The map types are always constant so we don't need to generate code to
5650 // fill arrays. Instead, we create an array constant.
5651 SmallVector<uint64_t, 4> Mapping;
5652 for (auto mapFlag : CombinedInfo.Types)
5653 Mapping.push_back(
5654 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
5655 mapFlag));
5656 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
5657 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
5658 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
5659
5660 // The information types are only built if provided.
5661 if (!CombinedInfo.Names.empty()) {
5662 std::string MapnamesName = createPlatformSpecificName({"offload_mapnames"});
5663 auto *MapNamesArrayGbl =
5664 createOffloadMapnames(CombinedInfo.Names, MapnamesName);
5665 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
5666 } else {
5667 Info.RTArgs.MapNamesArray =
5668 Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
5669 }
5670
5671 // If there's a present map type modifier, it must not be applied to the end
5672 // of a region, so generate a separate map type array in that case.
5673 if (Info.separateBeginEndCalls()) {
5674 bool EndMapTypesDiffer = false;
5675 for (uint64_t &Type : Mapping) {
5676 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
5677 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
5678 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
5679 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
5680 EndMapTypesDiffer = true;
5681 }
5682 }
5683 if (EndMapTypesDiffer) {
5684 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
5685 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
5686 }
5687 }
5688
5689 PointerType *PtrTy = Builder.getPtrTy();
5690 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
5691 Value *BPVal = CombinedInfo.BasePointers[I];
5692 Value *BP = Builder.CreateConstInBoundsGEP2_32(
5693 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
5694 0, I);
5695 Builder.CreateAlignedStore(BPVal, BP,
5696 M.getDataLayout().getPrefTypeAlign(PtrTy));
5697
5698 if (Info.requiresDevicePointerInfo()) {
5699 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
5700 CodeGenIP = Builder.saveIP();
5701 Builder.restoreIP(AllocaIP);
5702 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
5703 Builder.restoreIP(CodeGenIP);
5704 if (DeviceAddrCB)
5705 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
5706 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
5707 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
5708 if (DeviceAddrCB)
5709 DeviceAddrCB(I, BP);
5710 }
5711 }
5712
5713 Value *PVal = CombinedInfo.Pointers[I];
5714 Value *P = Builder.CreateConstInBoundsGEP2_32(
5715 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
5716 I);
5717 // TODO: Check alignment correct.
5718 Builder.CreateAlignedStore(PVal, P,
5719 M.getDataLayout().getPrefTypeAlign(PtrTy));
5720
5721 if (RuntimeSizes.test(I)) {
5722 Value *S = Builder.CreateConstInBoundsGEP2_32(
5723 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
5724 /*Idx0=*/0,
5725 /*Idx1=*/I);
5726 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
5727 Int64Ty,
5728 /*isSigned=*/true),
5729 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
5730 }
5731 // Fill up the mapper array.
5732 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
5733 Value *MFunc = ConstantPointerNull::get(PtrTy);
5734 if (CustomMapperCB)
5735 if (Value *CustomMFunc = CustomMapperCB(I))
5736 MFunc = Builder.CreatePointerCast(CustomMFunc, PtrTy);
5737 Value *MAddr = Builder.CreateInBoundsGEP(
5738 MappersArray->getAllocatedType(), MappersArray,
5739 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
5740 Builder.CreateAlignedStore(
5741 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
5742 }
5743
5744 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
5745 Info.NumberOfPtrs == 0)
5746 return;
5747 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
5748}
5749
5750void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
5751 BasicBlock *CurBB = Builder.GetInsertBlock();
5752
5753 if (!CurBB || CurBB->getTerminator()) {
5754 // If there is no insert point or the previous block is already
5755 // terminated, don't touch it.
5756 } else {
5757 // Otherwise, create a fall-through branch.
5758 Builder.CreateBr(Target);
5759 }
5760
5761 Builder.ClearInsertionPoint();
5762}
5763
5764void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
5765 bool IsFinished) {
5766 BasicBlock *CurBB = Builder.GetInsertBlock();
5767
5768 // Fall out of the current block (if necessary).
5769 emitBranch(BB);
5770
5771 if (IsFinished && BB->use_empty()) {
5772 BB->eraseFromParent();
5773 return;
5774 }
5775
5776 // Place the block after the current block, if possible, or else at
5777 // the end of the function.
5778 if (CurBB && CurBB->getParent())
5779 CurFn->insert(std::next(CurBB->getIterator()), BB);
5780 else
5781 CurFn->insert(CurFn->end(), BB);
5782 Builder.SetInsertPoint(BB);
5783}
5784
5785void OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
5786 BodyGenCallbackTy ElseGen,
5787 InsertPointTy AllocaIP) {
5788 // If the condition constant folds and can be elided, try to avoid emitting
5789 // the condition and the dead arm of the if/else.
5790 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
5791 auto CondConstant = CI->getSExtValue();
5792 if (CondConstant)
5793 ThenGen(AllocaIP, Builder.saveIP());
5794 else
5795 ElseGen(AllocaIP, Builder.saveIP());
5796 return;
5797 }
5798
5799 Function *CurFn = Builder.GetInsertBlock()->getParent();
5800
5801 // Otherwise, the condition did not fold, or we couldn't elide it. Just
5802 // emit the conditional branch.
5803 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
5804 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
5805 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
5806 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
5807 // Emit the 'then' code.
5808 emitBlock(ThenBlock, CurFn);
5809 ThenGen(AllocaIP, Builder.saveIP());
5810 emitBranch(ContBlock);
5811 // Emit the 'else' code if present.
5812 // There is no need to emit line number for unconditional branch.
5813 emitBlock(ElseBlock, CurFn);
5814 ElseGen(AllocaIP, Builder.saveIP());
5815 // There is no need to emit line number for unconditional branch.
5816 emitBranch(ContBlock);
5817 // Emit the continuation block for code after the if.
5818 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
5819}
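// The emitted skeleton, when Cond does not constant-fold, is:
//
//   br i1 %cond, label %omp_if.then, label %omp_if.else
// omp_if.then:
//   ; ThenGen code
//   br label %omp_if.end
// omp_if.else:
//   ; ElseGen code
//   br label %omp_if.end
// omp_if.end: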
5820
5821bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
5822 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
5825 "Unexpected Atomic Ordering.");
5826
5827 bool Flush = false;
5829
5830 switch (AK) {
5831 case Read:
5832 if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
5833 AO == AtomicOrdering::SequentiallyConsistent) {
5834 FlushAO = AtomicOrdering::Acquire;
5835 Flush = true;
5836 }
5837 break;
5838 case Write:
5839 case Compare:
5840 case Update:
5841 if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
5842 AO == AtomicOrdering::SequentiallyConsistent) {
5843 FlushAO = AtomicOrdering::Release;
5844 Flush = true;
5845 }
5846 break;
5847 case Capture:
5848 switch (AO) {
5849 case AtomicOrdering::Acquire:
5850 FlushAO = AtomicOrdering::Acquire;
5851 Flush = true;
5852 break;
5853 case AtomicOrdering::Release:
5854 FlushAO = AtomicOrdering::Release;
5855 Flush = true;
5856 break;
5857 case AtomicOrdering::AcquireRelease:
5858 case AtomicOrdering::SequentiallyConsistent:
5859 FlushAO = AtomicOrdering::AcquireRelease;
5860 Flush = true;
5861 break;
5862 default:
5863 // do nothing - leave silently.
5864 break;
5865 }
5866 }
5867
5868 if (Flush) {
5869 // The flush runtime call does not take a memory ordering yet; until it
5870 // does, resolve which atomic ordering the flush would need but issue the
5871 // plain flush call.
5872 // TODO: pass `FlushAO` after memory ordering support is added
5873 (void)FlushAO;
5874 emitFlush(Loc);
5875 }
5876
5877 // for AO == AtomicOrdering::Monotonic and all other case combinations
5878 // do nothing
5879 return Flush;
5880}
5881
5882OpenMPIRBuilder::InsertPointTy
5883OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
5884 AtomicOpValue &X, AtomicOpValue &V,
5885 AtomicOrdering AO) {
5886 if (!updateToLocation(Loc))
5887 return Loc.IP;
5888
5889 assert(X.Var->getType()->isPointerTy() &&
5890 "OMP Atomic expects a pointer to target memory");
5891 Type *XElemTy = X.ElemTy;
5892 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
5893 XElemTy->isPointerTy()) &&
5894 "OMP atomic read expected a scalar type");
5895
5896 Value *XRead = nullptr;
5897
5898 if (XElemTy->isIntegerTy()) {
5899 LoadInst *XLD =
5900 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
5901 XLD->setAtomic(AO);
5902 XRead = cast<Value>(XLD);
5903 } else {
5904 // We need to perform atomic op as integer
5905 IntegerType *IntCastTy =
5906 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
5907 LoadInst *XLoad =
5908 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
5909 XLoad->setAtomic(AO);
5910 if (XElemTy->isFloatingPointTy()) {
5911 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
5912 } else {
5913 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
5914 }
5915 }
5916 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
5917 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
5918 return Builder.saveIP();
5919}
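// Example (sketch): for an i32 location and acquire ordering this emits
//
//   %omp.atomic.read = load atomic i32, ptr %x acquire, align 4
//   store i32 %omp.atomic.read, ptr %v
//
// Float and pointer elements are loaded as same-width integers and cast back.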
5920
5921OpenMPIRBuilder::InsertPointTy
5922OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
5923 AtomicOpValue &X, Value *Expr,
5924 AtomicOrdering AO) {
5925 if (!updateToLocation(Loc))
5926 return Loc.IP;
5927
5928 assert(X.Var->getType()->isPointerTy() &&
5929 "OMP Atomic expects a pointer to target memory");
5930 Type *XElemTy = X.ElemTy;
5931 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
5932 XElemTy->isPointerTy()) &&
5933 "OMP atomic write expected a scalar type");
5934
5935 if (XElemTy->isIntegerTy()) {
5936 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
5937 XSt->setAtomic(AO);
5938 } else {
5939 // We need to bitcast and perform atomic op as integers
5940 IntegerType *IntCastTy =
5941 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
5942 Value *ExprCast =
5943 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
5944 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
5945 XSt->setAtomic(AO);
5946 }
5947
5948 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
5949 return Builder.saveIP();
5950}
5951
5952OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicUpdate(
5953 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
5954 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
5955 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) {
5956 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
5957 if (!updateToLocation(Loc))
5958 return Loc.IP;
5959
5960 LLVM_DEBUG({
5961 Type *XTy = X.Var->getType();
5962 assert(XTy->isPointerTy() &&
5963 "OMP Atomic expects a pointer to target memory");
5964 Type *XElemTy = X.ElemTy;
5965 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
5966 XElemTy->isPointerTy()) &&
5967 "OMP atomic update expected a scalar type");
5968 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
5969 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
5970 "OpenMP atomic does not support LT or GT operations");
5971 });
5972
5973 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp,
5974 X.IsVolatile, IsXBinopExpr);
5975 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
5976 return Builder.saveIP();
5977}
5978
5979// FIXME: Duplicating AtomicExpand
5980Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
5981 AtomicRMWInst::BinOp RMWOp) {
5982 switch (RMWOp) {
5983 case AtomicRMWInst::Add:
5984 return Builder.CreateAdd(Src1, Src2);
5985 case AtomicRMWInst::Sub:
5986 return Builder.CreateSub(Src1, Src2);
5987 case AtomicRMWInst::And:
5988 return Builder.CreateAnd(Src1, Src2);
5989 case AtomicRMWInst::Nand:
5990 return Builder.CreateNot(Builder.CreateAnd(Src1, Src2));
5991 case AtomicRMWInst::Or:
5992 return Builder.CreateOr(Src1, Src2);
5993 case AtomicRMWInst::Xor:
5994 return Builder.CreateXor(Src1, Src2);
5995 case AtomicRMWInst::Xchg:
5996 case AtomicRMWInst::FAdd:
5997 case AtomicRMWInst::FSub:
5998 case AtomicRMWInst::BAD_BINOP:
5999 case AtomicRMWInst::Max:
6000 case AtomicRMWInst::Min:
6001 case AtomicRMWInst::UMax:
6002 case AtomicRMWInst::UMin:
6003 case AtomicRMWInst::FMax:
6004 case AtomicRMWInst::FMin:
6005 case AtomicRMWInst::UIncWrap:
6006 case AtomicRMWInst::UDecWrap:
6007 llvm_unreachable("Unsupported atomic update operation");
6008 }
6009 llvm_unreachable("Unsupported atomic update operation");
6010}
6011
6012std::pair<Value *, Value *> OpenMPIRBuilder::emitAtomicUpdate(
6013 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
6014 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
6015 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) {
6016 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
6017 // or a complex datatype.
6018 bool emitRMWOp = false;
6019 switch (RMWOp) {
6020 case AtomicRMWInst::Add:
6021 case AtomicRMWInst::And:
6022 case AtomicRMWInst::Nand:
6023 case AtomicRMWInst::Or:
6024 case AtomicRMWInst::Xor:
6025 case AtomicRMWInst::Xchg:
6026 emitRMWOp = XElemTy;
6027 break;
6028 case AtomicRMWInst::Sub:
6029 emitRMWOp = (IsXBinopExpr && XElemTy);
6030 break;
6031 default:
6032 emitRMWOp = false;
6033 }
6034 emitRMWOp &= XElemTy->isIntegerTy();
6035
6036 std::pair<Value *, Value *> Res;
6037 if (emitRMWOp) {
6038 Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
6039 // The result is only needed for postfix captures, but generate it anyway
6040 // for consistency with the else branch; any DCE pass will remove it.
6041 // AtomicRMWInst::Xchg does not have a corresponding instruction.
6042 if (RMWOp == AtomicRMWInst::Xchg)
6043 Res.second = Res.first;
6044 else
6045 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
6046 } else {
6047 IntegerType *IntCastTy =
6048 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
6049 LoadInst *OldVal =
6050 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
6051 OldVal->setAtomic(AO);
6052 // CurBB
6053 // | /---\
6054 // ContBB |
6055 // | \---/
6056 // ExitBB
6057 BasicBlock *CurBB = Builder.GetInsertBlock();
6058 Instruction *CurBBTI = CurBB->getTerminator();
6059 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
6060 BasicBlock *ExitBB =
6061 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
6062 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
6063 X->getName() + ".atomic.cont");
6064 ContBB->getTerminator()->eraseFromParent();
6065 Builder.restoreIP(AllocaIP);
6066 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
6067 NewAtomicAddr->setName(X->getName() + "x.new.val");
6068 Builder.SetInsertPoint(ContBB);
6069 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
6070 PHI->addIncoming(OldVal, CurBB);
6071 bool IsIntTy = XElemTy->isIntegerTy();
6072 Value *OldExprVal = PHI;
6073 if (!IsIntTy) {
6074 if (XElemTy->isFloatingPointTy()) {
6075 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
6076 X->getName() + ".atomic.fltCast");
6077 } else {
6078 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
6079 X->getName() + ".atomic.ptrCast");
6080 }
6081 }
6082
6083 Value *Upd = UpdateOp(OldExprVal, Builder);
6084 Builder.CreateStore(Upd, NewAtomicAddr);
6085 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
6086 AtomicOrdering Failure =
6087 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
6088 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
6089 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
6090 Result->setVolatile(VolatileX);
6091 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
6092 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
6093 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
6094 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
6095
6096 Res.first = OldExprVal;
6097 Res.second = Upd;
6098
6099 // Set the insertion point in the exit block.
6100 Instruction *ExitTI = ExitBB->getTerminator();
6101 if (isa<UnreachableInst>(ExitTI)) {
6102 CurBBTI->eraseFromParent();
6103 Builder.SetInsertPoint(ExitBB);
6104 } else {
6105 Builder.SetInsertPoint(ExitTI);
6106 }
6107 }
6108
6109 return Res;
6110}
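// For updates with no direct atomicrmw equivalent, the emitted pattern is the
// usual compare-exchange loop (sketch for an i32 "x"; the desired value is
// actually staged through an alloca, omitted here):
//
//   %old = load atomic i32, ptr %x monotonic, align 4
//   br label %x.atomic.cont
// x.atomic.cont:
//   %phi = phi i32 [ %old, %entry ], [ %prev, %x.atomic.cont ]
//   ; %upd = UpdateOp(%phi)
//   %pair = cmpxchg ptr %x, i32 %phi, i32 %upd monotonic monotonic
//   %prev = extractvalue { i32, i1 } %pair, 0
//   %ok = extractvalue { i32, i1 } %pair, 1
//   br i1 %ok, label %x.atomic.exit, label %x.atomic.cont
// x.atomic.exit: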
6111
6112OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCapture(
6113 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
6114 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
6115 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
6116 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) {
6117 if (!updateToLocation(Loc))
6118 return Loc.IP;
6119
6120 LLVM_DEBUG({
6121 Type *XTy = X.Var->getType();
6122 assert(XTy->isPointerTy() &&
6123 "OMP Atomic expects a pointer to target memory");
6124 Type *XElemTy = X.ElemTy;
6125 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
6126 XElemTy->isPointerTy()) &&
6127 "OMP atomic capture expected a scalar type");
6128 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
6129 "OpenMP atomic does not support LT or GT operations");
6130 });
6131
6132 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
6133 // 'x' is simply atomically rewritten with 'expr'.
6134 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
6135 std::pair<Value *, Value *> Result =
6136 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp,
6137 X.IsVolatile, IsXBinopExpr);
6138
6139 Value *CapturedVal = (IsPostfixUpdate ? Result.first : Result.second);
6140 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
6141
6142 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
6143 return Builder.saveIP();
6144}
6145
6146OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
6147 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
6148 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
6149 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
6150 bool IsFailOnly) {
6151
6152 AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
6153 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
6154 IsPostfixUpdate, IsFailOnly, Failure);
6155}
6156
6157OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
6158 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
6159 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
6160 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
6161 bool IsFailOnly, AtomicOrdering Failure) {
6162
6163 if (!updateToLocation(Loc))
6164 return Loc.IP;
6165
6166 assert(X.Var->getType()->isPointerTy() &&
6167 "OMP atomic expects a pointer to target memory");
6168 // compare capture
6169 if (V.Var) {
6170 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
6171 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
6172 }
6173
6174 bool IsInteger = E->getType()->isIntegerTy();
6175
6176 if (Op == OMPAtomicCompareOp::EQ) {
6177 AtomicCmpXchgInst *Result = nullptr;
6178 if (!IsInteger) {
6179 IntegerType *IntCastTy =
6180 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
6181 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
6182 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
6183 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
6184 AO, Failure);
6185 } else {
6186 Result =
6187 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
6188 }
6189
6190 if (V.Var) {
6191 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
6192 if (!IsInteger)
6193 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
6194 assert(OldValue->getType() == V.ElemTy &&
6195 "OldValue and V must be of same type");
6196 if (IsPostfixUpdate) {
6197 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
6198 } else {
6199 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
6200 if (IsFailOnly) {
6201 // CurBB----
6202 // | |
6203 // v |
6204 // ContBB |
6205 // | |
6206 // v |
6207 // ExitBB <-
6208 //
6209 // where ContBB only contains the store of old value to 'v'.
6211 Instruction *CurBBTI = CurBB->getTerminator();
6212 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
6213 BasicBlock *ExitBB = CurBB->splitBasicBlock(
6214 CurBBTI, X.Var->getName() + ".atomic.exit");
6215 BasicBlock *ContBB = CurBB->splitBasicBlock(
6216 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
6217 ContBB->getTerminator()->eraseFromParent();
6218 CurBB->getTerminator()->eraseFromParent();
6219
6220 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
6221
6222 Builder.SetInsertPoint(ContBB);
6223 Builder.CreateStore(OldValue, V.Var);
6224 Builder.CreateBr(ExitBB);
6225
6226 Instruction *ExitTI = ExitBB->getTerminator();
6227 if (isa<UnreachableInst>(ExitTI)) {
6228 CurBBTI->eraseFromParent();
6229 Builder.SetInsertPoint(ExitBB);
6230 } else {
6231 Builder.SetInsertPoint(ExitTI);
6232 }
6233 } else {
6234 Value *CapturedValue =
6235 Builder.CreateSelect(SuccessOrFail, E, OldValue);
6236 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
6237 }
6238 }
6239 }
6240 // The comparison result has to be stored.
6241 if (R.Var) {
6242 assert(R.Var->getType()->isPointerTy() &&
6243 "r.var must be of pointer type");
6244 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
6245
6246 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
6247 Value *ResultCast = R.IsSigned
6248 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
6249 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
6250 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
6251 }
6252 } else {
6253 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
6254 "Op should be either max or min at this point");
6255 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
6256
6257 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
6258 // Let's take max as example.
6259 // OpenMP form:
6260 // x = x > expr ? expr : x;
6261 // LLVM form:
6262 // *ptr = *ptr > val ? *ptr : val;
6263 // We need to transform to LLVM form.
6264 // x = x <= expr ? x : expr;
6265 AtomicRMWInst::BinOp NewOp;
6266 if (IsXBinopExpr) {
6267 if (IsInteger) {
6268 if (X.IsSigned)
6269 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
6270 : AtomicRMWInst::Max;
6271 else
6272 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
6273 : AtomicRMWInst::UMax;
6274 } else {
6275 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
6276 : AtomicRMWInst::FMax;
6277 }
6278 } else {
6279 if (IsInteger) {
6280 if (X.IsSigned)
6281 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
6282 : AtomicRMWInst::Min;
6283 else
6284 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
6285 : AtomicRMWInst::UMin;
6286 } else {
6287 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
6288 : AtomicRMWInst::FMin;
6289 }
6290 }
6291
6292 AtomicRMWInst *OldValue =
6293 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
6294 if (V.Var) {
6295 Value *CapturedValue = nullptr;
6296 if (IsPostfixUpdate) {
6297 CapturedValue = OldValue;
6298 } else {
6299 CmpInst::Predicate Pred;
6300 switch (NewOp) {
6301 case AtomicRMWInst::Max:
6302 Pred = CmpInst::ICMP_SGT;
6303 break;
6304 case AtomicRMWInst::UMax:
6305 Pred = CmpInst::ICMP_UGT;
6306 break;
6307 case AtomicRMWInst::FMax:
6308 Pred = CmpInst::FCMP_OGT;
6309 break;
6310 case AtomicRMWInst::Min:
6311 Pred = CmpInst::ICMP_SLT;
6312 break;
6313 case AtomicRMWInst::UMin:
6314 Pred = CmpInst::ICMP_ULT;
6315 break;
6316 case AtomicRMWInst::FMin:
6317 Pred = CmpInst::FCMP_OLT;
6318 break;
6319 default:
6320 llvm_unreachable("unexpected comparison op");
6321 }
6322 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
6323 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
6324 }
6325 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
6326 }
6327 }
6328
6329 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
6330
6331 return Builder.saveIP();
6332}
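// Taken together, the two branches above lower the OpenMP 5.1 `atomic
// compare` forms. A rough sketch of the source-level patterns handled
// (illustrative pseudo-source, not part of this file):
//
//   #pragma omp atomic compare capture
//   { v = x; if (x == e) { x = d; } }   // '==' form: lowered to cmpxchg
//
//   #pragma omp atomic compare
//   x = x > e ? e : x;                  // min/max form: lowered to atomicrmw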
6333
6334 OpenMPIRBuilder::InsertPointTy
6335 OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
6336 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
6337 Value *NumTeamsUpper, Value *ThreadLimit,
6338 Value *IfExpr) {
6339 if (!updateToLocation(Loc))
6340 return InsertPointTy();
6341
6342 uint32_t SrcLocStrSize;
6343 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6344 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6345 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
6346
6347 // The outer allocation basic block is the entry block of the current function.
6348 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
6349 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
6350 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
6351 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
6352 }
6353
6354 // The current basic block is split into four basic blocks. After outlining,
6355 // they will be mapped as follows:
6356 // ```
6357 // def current_fn() {
6358 // current_basic_block:
6359 // br label %teams.exit
6360 // teams.exit:
6361 // ; instructions after teams
6362 // }
6363 //
6364 // def outlined_fn() {
6365 // teams.alloca:
6366 // br label %teams.body
6367 // teams.body:
6368 // ; instructions within teams body
6369 // }
6370 // ```
6371 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
6372 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
6373 BasicBlock *AllocaBB =
6374 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
6375
6376 bool SubClausesPresent =
6377 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
6378 // Push num_teams
6379 if (!Config.isTargetDevice() && SubClausesPresent) {
6380 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
6381 "if lowerbound is non-null, then upperbound must also be non-null "
6382 "for bounds on num_teams");
6383
6384 if (NumTeamsUpper == nullptr)
6385 NumTeamsUpper = Builder.getInt32(0);
6386
6387 if (NumTeamsLower == nullptr)
6388 NumTeamsLower = NumTeamsUpper;
6389
6390 if (IfExpr) {
6391 assert(IfExpr->getType()->isIntegerTy() &&
6392 "argument to if clause must be an integer value");
6393
6394 // upper = ifexpr ? upper : 1
6395 if (IfExpr->getType() != Int1)
6396 IfExpr = Builder.CreateICmpNE(IfExpr,
6397 ConstantInt::get(IfExpr->getType(), 0));
6398 NumTeamsUpper = Builder.CreateSelect(
6399 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
6400
6401 // lower = ifexpr ? lower : 1
6402 NumTeamsLower = Builder.CreateSelect(
6403 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
6404 }
6405
6406 if (ThreadLimit == nullptr)
6407 ThreadLimit = Builder.getInt32(0);
6408
6409 Value *ThreadNum = getOrCreateThreadID(Ident);
6410 Builder.CreateCall(
6411 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
6412 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
6413 }
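// The runtime entry point used above is assumed to follow the usual libomp
// prototype (listed here for orientation only):
//
//   void __kmpc_push_num_teams_51(ident_t *loc, kmp_int32 gtid,
//                                 kmp_int32 num_teams_lb,
//                                 kmp_int32 num_teams_ub,
//                                 kmp_int32 thread_limit);
//
// which matches the {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper,
// ThreadLimit} argument list built here.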
6414 // Generate the body of teams.
6415 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
6416 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
6417 BodyGenCB(AllocaIP, CodeGenIP);
6418
6419 OutlineInfo OI;
6420 OI.EntryBB = AllocaBB;
6421 OI.ExitBB = ExitBB;
6422 OI.OuterAllocaBB = &OuterAllocaBB;
6423
6424 // Insert fake values for global tid and bound tid.
6425 std::stack<Instruction *> ToBeDeleted;
6426 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
6427 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
6428 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
6429 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
6430 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
6431
6432 auto HostPostOutlineCB = [this, Ident,
6433 ToBeDeleted](Function &OutlinedFn) mutable {
6434 // The stale call instruction will be replaced with a new call instruction
6435 // for the runtime call, passing the outlined function.
6436
6437 assert(OutlinedFn.getNumUses() == 1 &&
6438 "there must be a single user for the outlined function");
6439 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
6440 ToBeDeleted.push(StaleCI);
6441
6442 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
6443 "Outlined function must have two or three arguments only");
6444
6445 bool HasShared = OutlinedFn.arg_size() == 3;
6446
6447 OutlinedFn.getArg(0)->setName("global.tid.ptr");
6448 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
6449 if (HasShared)
6450 OutlinedFn.getArg(2)->setName("data");
6451
6452 // Call to the runtime function for teams in the current function.
6453 assert(StaleCI && "Error while outlining - no CallInst user found for the "
6454 "outlined function.");
6455 Builder.SetInsertPoint(StaleCI);
6456 SmallVector<Value *> Args = {
6457 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
6458 if (HasShared)
6459 Args.push_back(StaleCI->getArgOperand(2));
6460 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
6461 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
6462 Args);
6463
6464 while (!ToBeDeleted.empty()) {
6465 ToBeDeleted.top()->eraseFromParent();
6466 ToBeDeleted.pop();
6467 }
6468 };
6469
6470 if (!Config.isTargetDevice())
6471 OI.PostOutlineCB = HostPostOutlineCB;
6472
6473 addOutlineInfo(std::move(OI));
6474
6475 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
6476
6477 return Builder.saveIP();
6478}
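// A minimal usage sketch for createTeams (names such as OMPBuilder and Loc
// are illustrative, not defined in this file):
//
//   auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
//                        OpenMPIRBuilder::InsertPointTy CodeGenIP) {
//     Builder.restoreIP(CodeGenIP);
//     // ... emit the body of the teams region ...
//   };
//   Builder.restoreIP(OMPBuilder.createTeams(
//       Loc, BodyGenCB, /*NumTeamsLower=*/nullptr, /*NumTeamsUpper=*/nullptr,
//       /*ThreadLimit=*/nullptr, /*IfExpr=*/nullptr));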
6479
6480 GlobalVariable *
6481 OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
6482 std::string VarName) {
6483 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
6484 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
6485 Names.size()),
6486 Names);
6487 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
6488 M, MapNamesArrayInit->getType(),
6489 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
6490 VarName);
6491 return MapNamesArrayGlobal;
6492}
6493
6494// Create all simple and struct types exposed by the runtime and remember
6495// the llvm::PointerTypes of them for easy access later.
6496void OpenMPIRBuilder::initializeTypes(Module &M) {
6497 LLVMContext &Ctx = M.getContext();
6498 StructType *T;
6499#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
6500#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
6501 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
6502 VarName##PtrTy = PointerType::getUnqual(VarName##Ty);
6503#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
6504 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
6505 VarName##Ptr = PointerType::getUnqual(VarName);
6506#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
6507 T = StructType::getTypeByName(Ctx, StructName); \
6508 if (!T) \
6509 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
6510 VarName = T; \
6511 VarName##Ptr = PointerType::getUnqual(T);
6512#include "llvm/Frontend/OpenMP/OMPKinds.def"
6513}
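// For orientation, the macros above are driven by entries in OMPKinds.def of
// roughly the following shape (representative examples, not an exhaustive
// list):
//
//   OMP_TYPE(Int32, Type::getInt32Ty(Ctx))
//   OMP_ARRAY_TYPE(KmpCriticalName, Int32, 8)
//   OMP_STRUCT_TYPE(Ident, "struct.ident_t", false, Int32, Int32, Int32,
//                   Int32, Int8Ptr)
//
// so this function materializes Int32, KmpCriticalNameTy/KmpCriticalNamePtrTy,
// Ident/IdentPtr, and so on as members of the builder.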
6514
6515 void OpenMPIRBuilder::OutlineInfo::collectBlocks(
6516 SmallPtrSetImpl<BasicBlock *> &BlockSet,
6517 SmallVectorImpl<BasicBlock *> &BlockVector) {
6518 SmallVector<BasicBlock *, 32> Worklist;
6519 BlockSet.insert(EntryBB);
6520 BlockSet.insert(ExitBB);
6521
6522 Worklist.push_back(EntryBB);
6523 while (!Worklist.empty()) {
6524 BasicBlock *BB = Worklist.pop_back_val();
6525 BlockVector.push_back(BB);
6526 for (BasicBlock *SuccBB : successors(BB))
6527 if (BlockSet.insert(SuccBB).second)
6528 Worklist.push_back(SuccBB);
6529 }
6530}
6531
6532 void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
6533 uint64_t Size, int32_t Flags,
6534 GlobalValue::LinkageTypes,
6535 StringRef Name) {
6536 if (!Config.isGPU()) {
6537 llvm::offloading::emitOffloadingEntry(
6538 M, ID, Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0,
6539 "omp_offloading_entries");
6540 return;
6541 }
6542 // TODO: Add support for global variables on the device after declare target
6543 // support.
6544 Function *Fn = dyn_cast<Function>(Addr);
6545 if (!Fn)
6546 return;
6547
6548 Module &M = *(Fn->getParent());
6549 LLVMContext &Ctx = M.getContext();
6550
6551 // Get "nvvm.annotations" metadata node.
6552 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6553
6554 Metadata *MDVals[] = {
6555 ConstantAsMetadata::get(Fn), MDString::get(Ctx, "kernel"),
6556 ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
6557 // Append metadata to nvvm.annotations.
6558 MD->addOperand(MDNode::get(Ctx, MDVals));
6559
6560 // Add a function attribute for the kernel.
6561 Fn->addFnAttr(Attribute::get(Ctx, "kernel"));
6562 if (T.isAMDGCN())
6563 Fn->addFnAttr("uniform-work-group-size", "true");
6564 Fn->addFnAttr(Attribute::MustProgress);
6565}
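// On a GPU target the net effect of the code above is module-level metadata
// and attributes of roughly this shape for a kernel @foo (illustrative):
//
//   !nvvm.annotations = !{!0}
//   !0 = !{ptr @foo, !"kernel", i32 1}
//
// plus the "kernel" function attribute, MustProgress, and, on AMDGCN,
// "uniform-work-group-size"="true".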
6566
6567 // We only generate metadata for functions that contain target regions.
6568 void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
6569 EmitMetadataErrorReportFunctionTy &ErrorFn) {
6570
6571 // If there are no entries, we don't need to do anything.
6572 if (OffloadInfoManager.empty())
6573 return;
6574
6575 LLVMContext &C = M.getContext();
6576 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
6577 TargetRegionEntryInfo>,
6578 16>
6579 OrderedEntries(OffloadInfoManager.size());
6580
6581 // Auxiliary methods to create metadata values and strings.
6582 auto &&GetMDInt = [this](unsigned V) {
6583 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
6584 };
6585
6586 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
6587
6588 // Create the offloading info metadata node.
6589 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
6590 auto &&TargetRegionMetadataEmitter =
6591 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
6592 const TargetRegionEntryInfo &EntryInfo,
6593 const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
6594 // Generate metadata for target regions. Each entry of this metadata
6595 // contains:
6596 // - Entry 0 -> Kind of this type of metadata (0).
6597 // - Entry 1 -> Device ID of the file where the entry was identified.
6598 // - Entry 2 -> File ID of the file where the entry was identified.
6599 // - Entry 3 -> Mangled name of the function where the entry was
6600 // identified.
6601 // - Entry 4 -> Line in the file where the entry was identified.
6602 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
6603 // - Entry 6 -> Order the entry was created.
6604 // The first element of the metadata node is the kind.
6605 Metadata *Ops[] = {
6606 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
6607 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
6608 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
6609 GetMDInt(E.getOrder())};
6610
6611 // Save this entry in the right position of the ordered entries array.
6612 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
6613
6614 // Add metadata to the named metadata node.
6615 MD->addOperand(MDNode::get(C, Ops));
6616 };
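// For example, a target region in parent function "foo" at line 42 yields an
// operand of roughly this shape (field values illustrative):
//
//   !{i32 0, i32 <DeviceID>, i32 <FileID>, !"foo", i32 42, i32 <Count>,
//     i32 <Order>}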
6617
6618 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
6619
6620 // Create a function that emits metadata for each device global variable entry.
6621 auto &&DeviceGlobalVarMetadataEmitter =
6622 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
6623 StringRef MangledName,
6624 const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
6625 // Generate metadata for global variables. Each entry of this metadata
6626 // contains:
6627 // - Entry 0 -> Kind of this type of metadata (1).
6628 // - Entry 1 -> Mangled name of the variable.
6629 // - Entry 2 -> Declare target kind.
6630 // - Entry 3 -> Order the entry was created.
6631 // The first element of the metadata node is the kind.
6632 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
6633 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
6634
6635 // Save this entry in the right position of the ordered entries array.
6636 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
6637 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
6638
6639 // Add metadata to the named metadata node.
6640 MD->addOperand(MDNode::get(C, Ops));
6641 };
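// For example, a declare-target variable "bar" yields an operand of roughly
// this shape (field values illustrative):
//
//   !{i32 1, !"bar", i32 <DeclareTargetKind>, i32 <Order>}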
6642
6643 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
6644 DeviceGlobalVarMetadataEmitter);
6645
6646 for (const auto &E : OrderedEntries) {
6647 assert(E.first && "All ordered entries must exist!");
6648 if (const auto *CE =
6649 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
6650 E.first)) {
6651 if (!CE->getID() || !CE->getAddress()) {
6652 // Do not blame the entry if the parent function is not emitted.
6653 TargetRegionEntryInfo EntryInfo = E.second;
6654 StringRef FnName = EntryInfo.ParentName;
6655 if (!M.getNamedValue(FnName))
6656 continue;
6657 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
6658 continue;
6659 }
6660 createOffloadEntry(CE->getID(), CE->getAddress(),
6661 /*Size=*/0, CE->getFlags(),
6662 GlobalValue::WeakAnyLinkage);
6663 } else if (const auto *CE = dyn_cast<
6664 OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
6665 E.first)) {
6666 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
6667 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
6668 CE->getFlags());
6669 switch (Flags) {
6670 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
6671 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
6672 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
6673 continue;
6674 if (!CE->getAddress()) {
6675 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
6676 continue;
6677 }
6678 // The variable has no definition - no need to add the entry.
6679 if (CE->getVarSize() == 0)
6680 continue;
6681 break;
6682 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
6683 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
6684 (!Config.isTargetDevice() && CE->getAddress())) &&
6685 "Declaret target link address is set.");
6686 if (Config.isTargetDevice())
6687 continue;
6688 if (!CE->getAddress()) {
6689 ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
6690 continue;
6691 }
6692 break;
6693 default:
6694 break;
6695 }
6696
6697 // Hidden or internal symbols on the device are not externally visible.
6698 // We should not attempt to register them by creating an offloading
6699 // entry. Indirect variables are handled separately on the device.
6700 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
6701 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
6702 Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
6703 continue;
6704
6705 // Indirect globals need to use a special name that doesn't match the name
6706 // of the associated host global.
6707 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
6708 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
6709 Flags, CE->getLinkage(), CE->getVarName());
6710 else
6711 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
6712 Flags, CE->getLinkage());
6713
6714 } else {
6715 llvm_unreachable("Unsupported entry kind.");
6716 }
6717 }
6718
6719 // Emit requires directive globals to a special entry so the runtime can
6720 // register them when the device image is loaded.
6721 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
6722 // entries should be redesigned to better suit this use-case.
6723 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
6724 offloading::emitOffloadingEntry(
6725 M, Constant::getNullValue(PointerType::getUnqual(M.getContext())),
6726 /*Name=*/"",
6727 /*Size=*/0, OffloadEntriesInfoManager::OMPTargetGlobalRegisterRequires,
6728 Config.getRequiresFlags(), "omp_offloading_entries");
6729}
6730
6731 void TargetRegionEntryInfo::getTargetRegionEntryFnName(
6732 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
6733 unsigned FileID, unsigned Line, unsigned Count) {
6734 raw_svector_ostream OS(Name);
6735 OS << "__omp_offloading" << llvm::format("_%x", DeviceID)
6736 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
6737 if (Count)
6738 OS << "_" << Count;
6739}
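// For example, DeviceID 0x12, FileID 0x345, parent "foo", and line 7 produce
// "__omp_offloading_12_345_foo_l7"; a non-zero Count appends "_<Count>".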
6740
6741 void OffloadEntriesInfoManager::getTargetRegionEntryFnName(
6742 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
6743 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
6744 TargetRegionEntryInfo::getTargetRegionEntryFnName(
6745 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
6746 EntryInfo.Line, NewCount);
6747}
6748
6749 TargetRegionEntryInfo
6750 OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
6751 StringRef ParentName) {
6752 sys::fs::UniqueID ID;
6753 auto FileIDInfo = CallBack();
6754 if (auto EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID)) {
6755 report_fatal_error(("Unable to get unique ID for file, during "
6756 "getTargetEntryUniqueInfo, error message: " +
6757 EC.message())
6758 .c_str());
6759 }
6760
6761 return TargetRegionEntryInfo(ParentName, ID.getDevice(), ID.getFile(),
6762 std::get<1>(FileIDInfo));
6763}
6764
6765 unsigned OpenMPIRBuilder::getFlagMemberOffset() {
6766 unsigned Offset = 0;
6767 for (uint64_t Remain =
6768 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
6769 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
6770 !(Remain & 1); Remain = Remain >> 1)
6771 Offset++;
6772 return Offset;
6773}
6774
6775 omp::OpenMPOffloadMappingFlags
6776 OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
6777 // Shift by getFlagMemberOffset() bits.
6778 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
6779 << getFlagMemberOffset());
6780}
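// Worked example, assuming OMP_MAP_MEMBER_OF is the high 16-bit field
// 0xffff000000000000 (see OMPConstants.h): getFlagMemberOffset() counts its
// trailing zero bits and returns 48, so getMemberOfFlag(0) == 1ULL << 48 and
// getMemberOfFlag(1) == 2ULL << 48, i.e. the 1-based member position encoded
// in the MEMBER_OF field.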
6781
6782 void OpenMPIRBuilder::setCorrectMemberOfFlag(
6783 omp::OpenMPOffloadMappingFlags &Flags,
6784 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
6785 // If the entry is PTR_AND_OBJ but has not been marked with the special
6786 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
6787 // marked as MEMBER_OF.
6788 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
6789 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) &&
6790 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
6791 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) !=
6792 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
6793 return;
6794
6795 // Reset the placeholder value to prepare the flag for the assignment of the
6796 // proper MEMBER_OF value.
6797 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
6798 Flags |= MemberOfFlag;
6799}
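// In other words: a PTR_AND_OBJ entry whose MEMBER_OF field still holds the
// 0xFFFF placeholder has the placeholder cleared and the real
// getMemberOfFlag(Position) value OR-ed in; a PTR_AND_OBJ entry without the
// placeholder is deliberately left untouched.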
6800
6801 Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
6802 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
6803 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
6804 bool IsDeclaration, bool IsExternallyVisible,
6805 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
6806 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
6807 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
6808 std::function<Constant *()> GlobalInitializer,
6809 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
6810 // TODO: convert this to utilise the IRBuilder Config rather than
6811 // a passed down argument.
6812 if (OpenMPSIMD)
6813 return nullptr;
6814
6815 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
6816 ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
6817 CaptureClause ==
6818 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
6819 Config.hasRequiresUnifiedSharedMemory())) {
6820 SmallString<64> PtrName;
6821 {
6822 raw_svector_ostream OS(PtrName);
6823 OS << MangledName;
6824 if (!IsExternallyVisible)
6825 OS << format("_%x", EntryInfo.FileID);
6826 OS << "_decl_tgt_ref_ptr";
6827 }
6828
6829 Value *Ptr = M.getNamedValue(PtrName);
6830
6831 if (!Ptr) {
6832 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
6833 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
6834
6835 auto *GV = cast<GlobalVariable>(Ptr);
6836 GV->setLinkage(GlobalValue::WeakAnyLinkage);
6837
6838 if (!Config.isTargetDevice()) {
6839 if (GlobalInitializer)
6840 GV->setInitializer(GlobalInitializer());
6841 else
6842 GV->setInitializer(GlobalValue);
6843 }
6844
6845 registerTargetGlobalVariable(
6846 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
6847 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
6848 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
6849 }
6850
6851 return cast<Constant>(Ptr);
6852 }
6853
6854 return nullptr;
6855}
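// For example, a declare-target-link variable mangled as "gvar" gets a ref
// pointer named "gvar_decl_tgt_ref_ptr"; for symbols that are not externally
// visible, a "_%x"-formatted FileID is inserted before the suffix.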
6856
6857 void OpenMPIRBuilder::registerTargetGlobalVariable(
6858 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
6859 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
6860 bool IsDeclaration, bool IsExternallyVisible,
6861 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
6862 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
6863 std::vector<Triple> TargetTriple,
6864 std::function<Constant *()> GlobalInitializer,
6865 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
6866 Constant *Addr) {
6867 if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
6868 (TargetTriple.empty() && !Config.isTargetDevice()))
6869 return;
6870
6871 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
6872 StringRef VarName;
6873 int64_t VarSize;
6874 GlobalValue::LinkageTypes Linkage;
6875
6876 if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
6877 CaptureClause ==
6878 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
6879 !Config.hasRequiresUnifiedSharedMemory()) {
6880 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
6881 VarName = MangledName;
6882 GlobalValue *LlvmVal = M.getNamedValue(VarName);
6883
6884 if (!IsDeclaration)
6885 VarSize = divideCeil(
6886 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
6887 else
6888 VarSize = 0;
6889 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
6890
6891 // This is a workaround carried over from Clang which prevents undesired
6892 // optimisation of internal variables.
6893 if (Config.isTargetDevice() &&
6894 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
6895 // Do not create a "ref-variable" if the original is not also available
6896 // on the host.
6897 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
6898 return;
6899
6900 std::string RefName = createPlatformSpecificName({VarName, "ref"});
6901
6902 if (!M.getNamedValue(RefName)) {
6903 Constant *AddrRef =
6904 getOrCreateInternalVariable(Addr->getType(), RefName);
6905 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
6906 GvAddrRef->setConstant(true);
6907 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
6908 GvAddrRef->setInitializer(Addr);
6909 GeneratedRefs.push_back(GvAddrRef);
6910 }
6911 }
6912 } else {
6913 if (Config.hasRequiresUnifiedSharedMemory())
6914 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
6915 else
6916 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
6917
6918 if (Config.isTargetDevice()) {
6919 VarName = (Addr) ? Addr->getName() : "";
6920 Addr = nullptr;
6921 } else {
6923 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
6924 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
6925 LlvmPtrTy, GlobalInitializer, VariableLinkage);
6926 VarName = (Addr) ? Addr->getName() : "";
6927 }
6928 VarSize = M.getDataLayout().getPointerSize();
6929 Linkage = GlobalValue::WeakAnyLinkage;
6930 }
6931
6932 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
6933 Flags, Linkage);
6934}
6935
6936 /// Loads all the offload entries information from the host IR
6937 /// metadata.
6938 void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
6939 // If we are in target mode, load the metadata from the host IR. This code has
6940 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
6941
6942 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
6943 if (!MD)
6944 return;
6945
6946 for (MDNode *MN : MD->operands()) {
6947 auto &&GetMDInt = [MN](unsigned Idx) {
6948 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
6949 return cast<ConstantInt>(V->getValue())->getZExtValue();
6950 };
6951
6952 auto &&GetMDString = [MN](unsigned Idx) {
6953 auto *V = cast<MDString>(MN->getOperand(Idx));
6954 return V->getString();
6955 };
6956
6957 switch (GetMDInt(0)) {
6958 default:
6959 llvm_unreachable("Unexpected metadata!");
6960 break;
6961 case OffloadEntriesInfoManager::OffloadEntryInfo::
6962 OffloadingEntryInfoTargetRegion: {
6963 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
6964 /*DeviceID=*/GetMDInt(1),
6965 /*FileID=*/GetMDInt(2),
6966 /*Line=*/GetMDInt(4),
6967 /*Count=*/GetMDInt(5));
6968 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
6969 /*Order=*/GetMDInt(6));
6970 break;
6971 }
6972 case OffloadEntriesInfoManager::OffloadEntryInfo::
6973 OffloadingEntryInfoDeviceGlobalVar:
6974 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
6975 /*MangledName=*/GetMDString(1),
6976 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
6977 /*Flags=*/GetMDInt(2)),
6978 /*Order=*/GetMDInt(3));
6979 break;
6980 }
6981 }
6982}
6983
6984 void OpenMPIRBuilder::loadOffloadInfoMetadata(StringRef HostFilePath) {
6985 if (HostFilePath.empty())
6986 return;
6987
6988 auto Buf = MemoryBuffer::getFile(HostFilePath);
6989 if (std::error_code Err = Buf.getError()) {
6990 report_fatal_error(("error opening host file from host file path inside of "
6991 "OpenMPIRBuilder: " +
6992 Err.message())
6993 .c_str());
6994 }
6995
6996 LLVMContext Ctx;
6997 auto M = expectedToErrorOrAndEmitErrors(
6998 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
6999 if (std::error_code Err = M.getError()) {
7001 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
7002 .c_str());
7003 }
7004
7005 loadOffloadInfoMetadata(*M.get());
7006}
7007
7008//===----------------------------------------------------------------------===//
7009// OffloadEntriesInfoManager
7010//===----------------------------------------------------------------------===//
7011
7012 bool OffloadEntriesInfoManager::empty() const {
7013 return OffloadEntriesTargetRegion.empty() &&
7014 OffloadEntriesDeviceGlobalVar.empty();
7015}
7016
7017unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
7018 const TargetRegionEntryInfo &EntryInfo) const {
7019 auto It = OffloadEntriesTargetRegionCount.find(
7020 getTargetRegionEntryCountKey(EntryInfo));
7021 if (It == OffloadEntriesTargetRegionCount.end())
7022 return 0;
7023 return It->second;
7024}
7025
7026void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
7027 const TargetRegionEntryInfo &EntryInfo) {
7028 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
7029 EntryInfo.Count + 1;
7030}
7031
7032/// Initialize target region entry.
7033 void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
7034 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
7035 OffloadEntriesTargetRegion[EntryInfo] =
7036 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
7037 OMPTargetRegionEntryTargetRegion);
7038 ++OffloadingEntriesNum;
7039}
7040
7041 void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
7042 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
7043 OMPTargetRegionEntryKind Flags) {
7044 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
7045
7046 // Update the EntryInfo with the next available count for this location.
7047 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
7048
7049 // If we are emitting code for a target, the entry is already initialized,
7050 // only has to be registered.
7051 if (OMPBuilder->Config.isTargetDevice()) {
7052 // This could happen if the device compilation is invoked standalone.
7053 if (!hasTargetRegionEntryInfo(EntryInfo)) {
7054 return;
7055 }
7056 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
7057 Entry.setAddress(Addr);
7058 Entry.setID(ID);
7059 Entry.setFlags(Flags);
7060 } else {
7061 if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion &&
7062 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
7063 return;
7064 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
7065 "Target region entry already registered!");
7066 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
7067 OffloadEntriesTargetRegion[EntryInfo] = Entry;
7068 ++OffloadingEntriesNum;
7069 }
7070 incrementTargetRegionEntryInfoCount(EntryInfo);
7071}
7072
7073 bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
7074 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
7075
7076 // Update the EntryInfo with the next available count for this location.
7077 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
7078
7079 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
7080 if (It == OffloadEntriesTargetRegion.end()) {
7081 return false;
7082 }
7083 // Fail if this entry is already registered.
7084 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
7085 return false;
7086 return true;
7087}
7088
7089 void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
7090 const OffloadTargetRegionEntryInfoActTy &Action) {
7091 // Scan all target region entries and perform the provided action.
7092 for (const auto &It : OffloadEntriesTargetRegion) {
7093 Action(It.first, It.second);
7094 }
7095}
7096
7097 void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
7098 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
7099 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
7100 ++OffloadingEntriesNum;
7101}
7102
7103 void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
7104 StringRef VarName, Constant *Addr, int64_t VarSize,
7105 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
7106 if (OMPBuilder->Config.isTargetDevice()) {
7107 // This could happen if the device compilation is invoked standalone.
7108 if (!hasDeviceGlobalVarEntryInfo(VarName))
7109 return;
7110 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
7111 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
7112 if (Entry.getVarSize() == 0) {
7113 Entry.setVarSize(VarSize);
7114 Entry.setLinkage(Linkage);
7115 }
7116 return;
7117 }
7118 Entry.setVarSize(VarSize);
7119 Entry.setLinkage(Linkage);
7120 Entry.setAddress(Addr);
7121 } else {
7122 if (hasDeviceGlobalVarEntryInfo(VarName)) {
7123 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
7124 assert(Entry.isValid() && Entry.getFlags() == Flags &&
7125 "Entry not initialized!");
7126 if (Entry.getVarSize() == 0) {
7127 Entry.setVarSize(VarSize);
7128 Entry.setLinkage(Linkage);
7129 }
7130 return;
7131 }
7132 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
7133 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
7134 Addr, VarSize, Flags, Linkage,
7135 VarName.str());
7136 else
7137 OffloadEntriesDeviceGlobalVar.try_emplace(
7138 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
7139 ++OffloadingEntriesNum;
7140 }
7141}
7142
7143 void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
7144 const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
7145 // Scan all device global variable entries and perform the provided action.
7146 for (const auto &E : OffloadEntriesDeviceGlobalVar)
7147 Action(E.getKey(), E.getValue());
7148}
7149
7150//===----------------------------------------------------------------------===//
7151// CanonicalLoopInfo
7152//===----------------------------------------------------------------------===//
7153
7154void CanonicalLoopInfo::collectControlBlocks(
7155 SmallVectorImpl<BasicBlock *> &BBs) {
7156 // We only count those BBs as control blocks for which we do not need to
7157 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
7158 // flow. For consistency, this also means we do not add the Body block, which
7159 // is just the entry to the body code.
7160 BBs.reserve(BBs.size() + 6);
7161 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
7162}
7163
7164 BasicBlock *CanonicalLoopInfo::getPreheader() const {
7165 assert(isValid() && "Requires a valid canonical loop");
7166 for (BasicBlock *Pred : predecessors(Header)) {
7167 if (Pred != Latch)
7168 return Pred;
7169 }
7170 llvm_unreachable("Missing preheader");
7171}
7172
7173void CanonicalLoopInfo::setTripCount(Value *TripCount) {
7174 assert(isValid() && "Requires a valid canonical loop");
7175
7176 Instruction *CmpI = &getCond()->front();
7177 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
7178 CmpI->setOperand(1, TripCount);
7179
7180#ifndef NDEBUG
7181 assertOK();
7182#endif
7183}
7184
7185void CanonicalLoopInfo::mapIndVar(
7186 llvm::function_ref<Value *(Instruction *)> Updater) {
7187 assert(isValid() && "Requires a valid canonical loop");
7188
7189 Instruction *OldIV = getIndVar();
7190
7191 // Record all uses excluding those introduced by the updater. Uses by the
7192 // CanonicalLoopInfo itself to keep track of the number of iterations are
7193 // excluded.
7194 SmallVector<Use *> ReplacableUses;
7195 for (Use &U : OldIV->uses()) {
7196 auto *User = dyn_cast<Instruction>(U.getUser());
7197 if (!User)
7198 continue;
7199 if (User->getParent() == getCond())
7200 continue;
7201 if (User->getParent() == getLatch())
7202 continue;
7203 ReplacableUses.push_back(&U);
7204 }
7205
7206 // Run the updater that may introduce new uses
7207 Value *NewIV = Updater(OldIV);
7208
7209 // Replace the old uses with the value returned by the updater.
7210 for (Use *U : ReplacableUses)
7211 U->set(NewIV);
7212
7213#ifndef NDEBUG
7214 assertOK();
7215#endif
7216}
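// A minimal usage sketch for mapIndVar (Start and Step are illustrative
// values defined by the caller): rewrite all body uses of the canonical
// 0..TripCount counter into a strided value Start + IV * Step:
//
//   CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
//     Builder.SetInsertPoint(CLI->getBody(),
//                            CLI->getBody()->getFirstInsertionPt());
//     return Builder.CreateAdd(Start, Builder.CreateMul(OldIV, Step));
//   });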
7217
7218 void CanonicalLoopInfo::assertOK() const {
7219 #ifndef NDEBUG
7220 // No constraints if this object currently does not describe a loop.
7221 if (!isValid())
7222 return;
7223
7224 BasicBlock *Preheader = getPreheader();
7225 BasicBlock *Body = getBody();
7226 BasicBlock *After = getAfter();
7227
7228 // Verify standard control-flow we use for OpenMP loops.
7229 assert(Preheader);
7230 assert(isa<BranchInst>(Preheader->getTerminator()) &&
7231 "Preheader must terminate with unconditional branch");
7232 assert(Preheader->getSingleSuccessor() == Header &&
7233 "Preheader must jump to header");
7234
7235 assert(Header);
7236 assert(isa<BranchInst>(Header->getTerminator()) &&
7237 "Header must terminate with unconditional branch");
7238 assert(Header->getSingleSuccessor() == Cond &&
7239 "Header must jump to exiting block");
7240
7241 assert(Cond);
7242 assert(Cond->getSinglePredecessor() == Header &&
7243 "Exiting block only reachable from header");
7244
7245 assert(isa<BranchInst>(Cond->getTerminator()) &&
7246 "Exiting block must terminate with conditional branch");
7247 assert(size(successors(Cond)) == 2 &&
7248 "Exiting block must have two successors");
7249 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
7250 "Exiting block's first successor jump to the body");
7251 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
7252 "Exiting block's second successor must exit the loop");
7253
7254 assert(Body);
7255 assert(Body->getSinglePredecessor() == Cond &&
7256 "Body only reachable from exiting block");
7257 assert(!isa<PHINode>(Body->front()));
7258
7259 assert(Latch);
7260 assert(isa<BranchInst>(Latch->getTerminator()) &&
7261 "Latch must terminate with unconditional branch");
7262 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
7263 // TODO: To support simple redirecting of the end of the body code that has
7264 // multiple predecessors, introduce another auxiliary basic block like preheader and after.
7265 assert(Latch->getSinglePredecessor() != nullptr);
7266 assert(!isa<PHINode>(Latch->front()));
7267
7268 assert(Exit);
7269 assert(isa<BranchInst>(Exit->getTerminator()) &&
7270 "Exit block must terminate with unconditional branch");
7271 assert(Exit->getSingleSuccessor() == After &&
7272 "Exit block must jump to after block");
7273
7274 assert(After);
7275 assert(After->getSinglePredecessor() == Exit &&
7276 "After block only reachable from exit block");
7277 assert(After->empty() || !isa<PHINode>(After->front()));
7278
7279 Instruction *IndVar = getIndVar();
7280 assert(IndVar && "Canonical induction variable not found?");
7281 assert(isa<IntegerType>(IndVar->getType()) &&
7282 "Induction variable must be an integer");
7283 assert(cast<PHINode>(IndVar)->getParent() == Header &&
7284 "Induction variable must be a PHI in the loop header");
7285 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
7286 assert(
7287 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
7288 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
7289
7290 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
7291 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
7292 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
7293 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
7294 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
7295 ->isOne());
7296
7297 Value *TripCount = getTripCount();
7298 assert(TripCount && "Loop trip count not found?");
7299 assert(IndVar->getType() == TripCount->getType() &&
7300 "Trip count and induction variable must have the same type");
7301
7302 auto *CmpI = cast<CmpInst>(&Cond->front());
7303 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
7304 "Exit condition must be an unsigned less-than comparison");
7305 assert(CmpI->getOperand(0) == IndVar &&
7306 "Exit condition must compare the induction variable");
7307 assert(CmpI->getOperand(1) == TripCount &&
7308 "Exit condition must compare with the trip count");
7309#endif
7310}
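// The control-flow shape verified above, schematically:
//
//   Preheader -> Header -> Cond -> Body -> ... -> Latch -> (back to Header)
//                           |
//                           +---> Exit -> After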
7311
7312 void CanonicalLoopInfo::invalidate() {
7313 Header = nullptr;
7314 Cond = nullptr;
7315 Latch = nullptr;
7316 Exit = nullptr;
7317}
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Rewrite undef for PHI
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE() pulls the operator overloads used by LLVM_MARK_AS_BITMASK_EN...
Definition: BitmaskEnum.h:83
#define LLVM_MARK_AS_BITMASK_ENUM(LargestValue)
LLVM_MARK_AS_BITMASK_ENUM lets you opt in an individual enum type so you can perform bitwise operatio...
Definition: BitmaskEnum.h:42
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Rewrite Partial Register Uses
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
iv Induction Variable Users
Definition: IVUsers.cpp:48
static LVOptions Options
Definition: LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:528
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file contains the declarations for metadata subclasses.
IntegerType * Int32Ty
#define OMP_KERNEL_ARG_VERSION
Definition: OMPConstants.h:75
Provides definitions for Target specific Grid Values.
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static void emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static Function * createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn, Constant *OutlinedFnID, int32_t NumTeams, int32_t NumThreads, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause)
Determine the schedule type using schedule and ordering clause arguments.
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static MDNode * getNVPTXMDNode(Function &Kernel, StringRef Name)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static void replaceConstantValueUsesInFuncWithInstr(llvm::Value *Input, Function *Func)
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static void replaceConstatExprUsesInFuncWithInstr(ConstantExpr *ConstExpr, Function *Func)
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, Type *ParallelTaskPtr, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
Value * createFakeIntVal(IRBuilder<> &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, std::stack< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true)
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
Function * getFreshReductionFunc(Module &M)
Create a function with a unique name and a "void (i8*, i8*)" signature in the given module and return...
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
const char LLVMTargetMachineRef TM
This header defines various interfaces for pass management in LLVM.
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
raw_pwrite_stream & OS
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition: VPlanSLP.cpp:191
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition: blake3_impl.h:78
Class for arbitrary precision integers.
Definition: APInt.h:76
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Definition: Instructions.h:59
PointerType * getType() const
Overload to return most specific pointer type.
Definition: Instructions.h:107
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:125
unsigned getAddressSpace() const
Return the address space for the allocation.
Definition: Instructions.h:112
std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
Definition: Instructions.h:136
const Value * getArraySize() const
Get the number of elements allocated.
Definition: Instructions.h:103
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:321
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
Definition: PassManager.h:535
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
iterator begin() const
Definition: ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
Class to represent array types.
Definition: DerivedTypes.h:371
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:647
A function analysis which provides an AssumptionCache.
AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:539
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:696
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:748
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:760
@ Add
*p = old + v
Definition: Instructions.h:764
@ FAdd
*p = old + v
Definition: Instructions.h:785
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:778
@ Or
*p = old | v
Definition: Instructions.h:772
@ Sub
*p = old - v
Definition: Instructions.h:766
@ And
*p = old & v
Definition: Instructions.h:768
@ Xor
*p = old ^ v
Definition: Instructions.h:774
@ FSub
*p = old - v
Definition: Instructions.h:788
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:800
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:776
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:782
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:796
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:780
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:792
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:804
@ Nand
*p = ~(old & v)
Definition: Instructions.h:770
AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
Definition: Attributes.cpp:797
AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
Definition: Attributes.cpp:782
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:93
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:349
LLVM Basic Block Representation.
Definition: BasicBlock.h:60
void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
Definition: BasicBlock.cpp:657
iterator end()
Definition: BasicBlock.h:443
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:430
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
Definition: BasicBlock.cpp:409
reverse_iterator rbegin()
Definition: BasicBlock.h:446
bool empty() const
Definition: BasicBlock.h:452
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:360
const Instruction & front() const
Definition: BasicBlock.h:453
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:199
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:570
const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
Definition: BasicBlock.cpp:490
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:452
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:167
const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
Definition: BasicBlock.cpp:460
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:482
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:206
SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
Definition: BasicBlock.cpp:276
reverse_iterator rend()
Definition: BasicBlock.h:448
const Instruction * getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
Definition: BasicBlock.cpp:379
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition: BasicBlock.h:358
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition: BasicBlock.h:221
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition: BasicBlock.h:613
const Instruction & back() const
Definition: BasicBlock.h:455
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
Definition: BasicBlock.cpp:289
void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Definition: BasicBlock.cpp:509
Conditional or Unconditional Branch instruction.
static BranchInst * Create(BasicBlock *IfTrue, BasicBlock::iterator InsertBefore)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1742
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Definition: InstrTypes.h:1662
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1687
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
Definition: InstrTypes.h:1668
unsigned arg_size() const
Definition: InstrTypes.h:1685
This class represents a function call, abstracting a target machine's calling convention.
Class to represented the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:993
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:1022
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:1023
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:999
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:997
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:1016
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:1020
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:1018
@ ICMP_NE
not equal
Definition: InstrTypes.h:1015
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:1019
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:46
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:85
void findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, const ValueSet &Allocas) const
Compute the set of input values and output values for the code.
void findAllocas(const CodeExtractorAnalysisCache &CEAC, ValueSet &SinkCands, ValueSet &HoistCands, BasicBlock *&ExitBlock) const
Find the set of allocas whose life ranges are contained within the outlined region.
Function * extractCodeRegion(const CodeExtractorAnalysisCache &CEAC)
Perform the extraction, returning the new function.
bool isEligible() const
Test whether this code extractor is eligible.
void excludeArgFromAggregate(Value *Arg)
Exclude a value from aggregate argument passing when extracting a code region, passing it instead as ...
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1291
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:528
static Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
Definition: Constants.cpp:2881
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition: Constants.h:705
A constant value that is initialized with an expression using other constant values.
Definition: Constants.h:1017
static Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
Definition: Constants.cpp:2072
static Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
Definition: Constants.cpp:2087
static Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2152
Instruction * getAsInstruction() const
Returns an Instruction which implements the same operation as this ConstantExpr.
Definition: Constants.cpp:3310
This is the shared class of boolean and integer constants.
Definition: Constants.h:80
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:849
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:123
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:856
static ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
Definition: Constants.cpp:1775
static Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1356
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
Debug location.
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
unsigned getDefaultGlobalsAddressSpace() const
Definition: DataLayout.h:294
Align getABIIntegerTypeAlignment(unsigned BitWidth) const
Returns the minimum ABI-required alignment for an integer type of the specified bitwidth.
Definition: DataLayout.h:533
unsigned getAllocaAddrSpace() const
Definition: DataLayout.h:276
unsigned getPointerSize(unsigned AS=0) const
Layout pointer size in bytes, rounded up to a whole number of bytes.
Definition: DataLayout.cpp:750
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
Definition: DataLayout.h:420
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:672
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:472
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:874
A debug info location.
Definition: DebugLoc.h:33
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Definition: Dominators.cpp:371
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Lightweight error class with error context and mandatory checking.
Definition: Error.h:160
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:168
Class to represent function types.
Definition: DerivedTypes.h:103
static FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition: Function.cpp:585
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition: Function.h:164
const BasicBlock & getEntryBlock() const
Definition: Function.h:787
bool empty() const
Definition: Function.h:809
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:202
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition: Function.cpp:399
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:701
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:713
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:340
const Function & getFunction() const
Definition: Function.h:162
iterator begin()
Definition: Function.h:803
arg_iterator arg_begin()
Definition: Function.h:818
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:356
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the list of attributes for the given arg.
Definition: Function.cpp:613
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition: Function.h:732
size_t arg_size() const
Definition: Function.h:851
Type * getReturnType() const
Returns the type of the ret val.
Definition: Function.h:207
iterator end()
Definition: Function.h:805
void setCallingConv(CallingConv::ID CC)
Definition: Function.h:268
Argument * getArg(unsigned i) const
Definition: Function.h:836
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition: Value.h:589
void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
Definition: Metadata.cpp:1521
LinkageTypes getLinkage() const
Definition: GlobalValue.h:545
void setLinkage(LinkageTypes LT)
Definition: GlobalValue.h:536
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:655
void setDSOLocal(bool Local)
Definition: GlobalValue.h:302
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:293
@ HiddenVisibility
The GV is hidden.
Definition: GlobalValue.h:67
@ ProtectedVisibility
The GV is protected.
Definition: GlobalValue.h:68
void setVisibility(VisibilityTypes V)
Definition: GlobalValue.h:253
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition: GlobalValue.h:50
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:59
@ CommonLinkage
Tentative definitions.
Definition: GlobalValue.h:61
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:58
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:56
@ ExternalLinkage
Externally visible function.
Definition: GlobalValue.h:51
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition: GlobalValue.h:55
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:54
Type * getValueType() const
Definition: GlobalValue.h:295
InsertPoint - A saved insertion point.
Definition: IRBuilder.h:257
BasicBlock * getBlock() const
Definition: IRBuilder.h:272
bool isSet() const
Returns true if this insert point is set.
Definition: IRBuilder.h:270
BasicBlock::iterator getPoint() const
Definition: IRBuilder.h:273
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:94
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2257
AtomicCmpXchgInst * CreateAtomicCmpXchg(Value *Ptr, Value *Cmp, Value *New, MaybeAlign Align, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1841
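For illustration, a minimal sketch of the cmpxchg builder above; Builder (an IRBuilder<>) and Ptr (an i32 pointer) are hypothetical names, not from this file:
  // Swap 0 -> 1 at Ptr with seq_cst success and failure orderings.
  llvm::AtomicCmpXchgInst *CAS = Builder.CreateAtomicCmpXchg(
      Ptr, Builder.getInt32(0), Builder.getInt32(1), llvm::MaybeAlign(4),
      llvm::AtomicOrdering::SequentiallyConsistent,
      llvm::AtomicOrdering::SequentiallyConsistent);
  llvm::Value *Old = Builder.CreateExtractValue(CAS, 0);       // prior value
  llvm::Value *Succeeded = Builder.CreateExtractValue(CAS, 1); // i1 flag
The result is a {value, i1} pair, hence the two CreateExtractValue calls.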
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
Definition: IRBuilder.h:1773
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2523
Constant * CreateGlobalStringPtr(StringRef Str, const Twine &Name="", unsigned AddressSpace=0, Module *M=nullptr)
Same as CreateGlobalString, but return a pointer with "i8*" type instead of a pointer to array of i8.
Definition: IRBuilder.h:1993
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2039
UnreachableInst * CreateUnreachable()
Definition: IRBuilder.h:1263
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2170
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2516
CallInst * CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue, unsigned Alignment, Value *OffsetValue=nullptr)
Create an assume intrinsic call that represents an alignment assumption on the provided pointer.
Definition: IRBuilder.cpp:1307
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1091
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:175
Value * CreateStructGEP(Type *Ty, Value *Ptr, unsigned Idx, const Twine &Name="")
Definition: IRBuilder.h:1973
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2033
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2122
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:526
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:174
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:220
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:531
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition: IRBuilder.h:1876
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2182
Value * CreateUDiv(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1378
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2245
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:491
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition: IRBuilder.h:1721
InsertPoint saveIP() const
Returns the current insert point.
Definition: IRBuilder.h:277
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:486
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2366
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2397
SwitchInst * CreateSwitch(Value *V, BasicBlock *Dest, unsigned NumCases=10, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a switch instruction with the specified value, default dest, and with a hint for the number of...
Definition: IRBuilder.h:1143
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2241
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:145
DebugLoc getCurrentDebugLocation() const
Get location information used by debugging information.
Definition: IRBuilder.cpp:63
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1344
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2127
ConstantInt * getIntN(unsigned N, uint64_t C)
Get a constant N-bit value, zero extended or truncated from a 64-bit value.
Definition: IRBuilder.h:497
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1120
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1790
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2021
LLVMContext & getContext() const
Definition: IRBuilder.h:176
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1475
ReturnInst * CreateRetVoid()
Create a 'ret void' instruction.
Definition: IRBuilder.h:1090
Value * CreateConstInBoundsGEP2_32(Type *Ty, Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name="")
Definition: IRBuilder.h:1914
Value * CreateConstInBoundsGEP2_64(Type *Ty, Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name="")
Definition: IRBuilder.h:1960
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1803
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1327
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2117
Value * CreateIsNotNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg != 0.
Definition: IRBuilder.h:2549
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1854
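A one-line sketch of the RMW builder above, with the same hypothetical Builder and Ptr:
  // Atomic fetch-add of 1 with monotonic ordering; returns the old value.
  llvm::Value *Old = Builder.CreateAtomicRMW(
      llvm::AtomicRMWInst::Add, Ptr, Builder.getInt32(1),
      llvm::MaybeAlign(4), llvm::AtomicOrdering::Monotonic);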
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2007
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1497
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:569
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1114
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:169
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2273
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2253
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2196
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition: IRBuilder.h:289
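saveIP/restoreIP is the usual way to emit into a different block temporarily; a sketch assuming a Builder and an entry block EntryBB (both hypothetical):
  llvm::IRBuilderBase::InsertPoint SavedIP = Builder.saveIP();
  Builder.SetInsertPoint(EntryBB, EntryBB->getFirstInsertionPt());
  llvm::AllocaInst *Tmp = Builder.CreateAlloca(Builder.getInt32Ty());
  Builder.restoreIP(SavedIP); // back to where we were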
Value * CreateIsNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg == 0.
Definition: IRBuilder.h:2544
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
Type * getVoidTy()
Fetch the type representing void.
Definition: IRBuilder.h:564
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1826
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2412
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1519
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1866
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2351
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:516
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1404
CallInst * CreateMemCpy(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, uint64_t Size, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *TBAAStructTag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memcpy between the specified pointers.
Definition: IRBuilder.h:659
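Sketch of the memcpy helper above, assuming hypothetical Dst and Src pointers with known 8-byte alignment:
  // Copy 64 bytes; alignment is advisory metadata on the intrinsic.
  Builder.CreateMemCpy(Dst, llvm::MaybeAlign(8), Src, llvm::MaybeAlign(8),
                       /*Size=*/64);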
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2054
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1361
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2666
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:91
void moveBeforePreserving(Instruction *MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
void insertBefore(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately before the specified instruction.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:454
const BasicBlock * getParent() const
Definition: Instruction.h:152
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
static bool classof(const Value *V)
Methods for support type inquiry through isa, cast, and dyn_cast:
Definition: Instruction.h:926
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:359
BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1635
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:451
void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
Definition: DerivedTypes.h:40
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:278
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:184
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:266
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:566
LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition: LoopInfo.cpp:957
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Definition: MDBuilder.cpp:117
Metadata node.
Definition: Metadata.h:1067
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1071
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1549
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1428
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1426
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1541
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:600
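Putting the metadata factories above together; Ctx (an LLVMContext) and an Instruction *I are assumed in scope, and the kind/tag strings are purely illustrative:
  llvm::MDNode *MD = llvm::MDTuple::get(
      Ctx, {llvm::MDString::get(Ctx, "example.tag"),
            llvm::ConstantAsMetadata::get(
                llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))});
  I->setMetadata("example.kind", MD); // string kinds are registered lazily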
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type size() const
Definition: MapVector.h:60
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:301
NamedMDNode * getNamedMetadata(const Twine &Name) const
Return the first NamedMDNode in the module with the specified name.
Definition: Module.cpp:260
StringRef getName() const
Get a short "name" for the module.
Definition: Module.h:284
const std::string & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition: Module.h:297
iterator_range< global_iterator > globals()
Definition: Module.h:699
const FunctionListType & getFunctionList() const
Get the Module's list of functions (constant).
Definition: Module.h:611
GlobalVariable * getGlobalVariable(StringRef Name) const
Look up the specified global variable in the module symbol table.
Definition: Module.h:446
GlobalValue * getNamedValue(StringRef Name) const
Return the global value in the module with the specified name, of arbitrary type.
Definition: Module.cpp:133
NamedMDNode * getOrInsertNamedMetadata(StringRef Name)
Return the named MDNode in the module with the specified name.
Definition: Module.cpp:269
const GlobalVariable * getNamedGlobal(StringRef Name) const
Return the global variable in the module with the specified name, of arbitrary type.
Definition: Module.h:461
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:293
A tuple of MDNodes.
Definition: Metadata.h:1729
iterator_range< op_iterator > operands()
Definition: Metadata.h:1825
void addOperand(MDNode *M)
Definition: Metadata.cpp:1387
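A sketch combining getOrInsertNamedMetadata and addOperand; M, Ctx, and the metadata name are hypothetical:
  llvm::NamedMDNode *NMD = M.getOrInsertNamedMetadata("example.info");
  NMD->addOperand(
      llvm::MDNode::get(Ctx, {llvm::MDString::get(Ctx, "entry")}));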
@ OffloadingEntryInfoTargetRegion
Entry is a target region.
Definition: OMPIRBuilder.h:221
@ OffloadingEntryInfoDeviceGlobalVar
Entry is a declare target variable.
Definition: OMPIRBuilder.h:223
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions. NOTE: Currently not used as a part o...
Definition: OMPIRBuilder.h:354
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
Definition: OMPIRBuilder.h:356
void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
Definition: OMPIRBuilder.h:274
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
Definition: OMPIRBuilder.h:276
void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
unsigned size() const
Return number of entries defined so far.
Definition: OMPIRBuilder.h:265
void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry.
Definition: OMPIRBuilder.h:334
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
Definition: OMPIRBuilder.h:340
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
Definition: OMPIRBuilder.h:346
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
Definition: OMPIRBuilder.h:344
@ OMPTargetGlobalVarEntryLink
Mark the entry as a declare target link.
Definition: OMPIRBuilder.h:338
@ OMPTargetGlobalVarEntryTo
Mark the entry as a declare target to.
Definition: OMPIRBuilder.h:336
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
Definition: OMPIRBuilder.h:410
bool empty() const
Return true if there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
Definition: OMPIRBuilder.h:90
StringRef separator() const
Definition: OMPIRBuilder.h:157
int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
StringRef firstSeparator() const
Definition: OMPIRBuilder.h:147
void setHasRequiresReverseOffload(bool Value)
bool hasRequiresUnifiedSharedMemory() const
void setHasRequiresUnifiedSharedMemory(bool Value)
bool hasRequiresDynamicAllocators() const
bool openMPOffloadMandatory() const
Definition: OMPIRBuilder.h:129
void setHasRequiresUnifiedAddress(bool Value)
void setHasRequiresDynamicAllocators(bool Value)
bool hasRequiresReverseOffload() const
bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
Definition: OMPIRBuilder.h:451
Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
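A sketch of the usual lookup-and-call pattern; OMPBuilder, Builder, Ident, and ThreadId are assumed to be in scope (hypothetical names):
  llvm::FunctionCallee Barrier = OMPBuilder.getOrCreateRuntimeFunction(
      M, llvm::omp::OMPRTL___kmpc_barrier);
  Builder.CreateCall(Barrier, {Ident, ThreadId});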
std::function< void(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
Definition: OMPIRBuilder.h:497
InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD, int32_t MinThreadsVal=0, int32_t MaxThreadsVal=0, int32_t MinTeamsVal=0, int32_t MaxTeamsVal=0)
The omp target interface.
void emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy Here is the logic: if (Cond) { Th...
CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
void createTaskyield(const LocationDescription &Loc)
Generator for '#omp taskyield'.
void emitBranch(BasicBlock *Target)
InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO)
Emit atomic write for: X = Expr. Only scalar data types.
static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
InsertPointTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for '#omp critical'.
static TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number from.
void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up its attributes. Returns the FunctionID.
InsertPointTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr)
Emit atomic capture for constructs (only scalar data types): V = X; X = X BinOp Expr, ...
void initialize()
Initialize the internal state; this will put structure types and potentially other helpers into the ...
void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
CanonicalLoopInfo * createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
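A hedged sketch of driving this generator; Loc, TripCount, Builder, and OMPBuilder are assumed in scope (hypothetical):
  llvm::CanonicalLoopInfo *CLI = OMPBuilder.createCanonicalLoop(
      Loc,
      [&](llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP, llvm::Value *IV) {
        llvm::IRBuilderBase::InsertPointGuard Guard(Builder);
        Builder.restoreIP(CodeGenIP);
        // ... emit one iteration's body using the induction variable IV ...
      },
      TripCount);
The returned CanonicalLoopInfo can then be handed to applyWorkshareLoop or the tile/unroll helpers listed here.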
void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
InsertPointTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr)
Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X. For complex operations: X = ...
InsertPointTy emitBarrierImpl(const LocationDescription &Loc, omp::Directive DK, bool ForceSimpleCall, bool CheckCancelFlag)
Generate a barrier runtime call.
void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
InsertPointTy emitKernelLaunch(const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
InsertPointTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for '#omp cancel'.
static std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO)
Emit atomic read for: V = X. Only scalar data types.
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
InsertPointTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for '#omp ordered [threads | simd]'.
OpenMPIRBuilder::InsertPointTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for '#omp target data'.
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be freed.
void createTaskwait(const LocationDescription &Loc)
Generator for '#omp taskwait'.
CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
InsertPointTy createBarrier(const LocationDescription &Loc, omp::Directive DK, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
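For example, a sketch of emitting an explicit barrier at the current insertion point (OMPBuilder and Builder hypothetical):
  OMPBuilder.createBarrier(
      llvm::OpenMPIRBuilder::LocationDescription(
          Builder.saveIP(), Builder.getCurrentDebugLocation()),
      llvm::omp::Directive::OMPD_barrier);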
std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Create a name using the platform-specific separators.
FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the alloca instructions used in calls to mapper functions.
Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool EmitDebug=false, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
InsertPointTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for '#omp sections'.
InsertPointTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={})
Generator for #omp task
void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs (only scalar data types): cond-expr-stmt: x = x ordop expr ?...
InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for '#omp ordered depend (source | sink)'.
InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
void emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_Alloc.
void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
InsertPointTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp section'.
InsertPointTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop)
Modifies the canonical loop to be a workshare loop.
void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
InsertPointTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp master'.
IRBuilder<>::InsertPoint createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for '#omp parallel'.
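A hedged sketch of a trivial parallel region with no privatized values and no finalization work; Loc, AllocaIP, Builder, and OMPBuilder are hypothetical, and a real caller would fill in the body and privatization logic:
  auto BodyGenCB = [&](llvm::OpenMPIRBuilder::InsertPointTy AllocaIP,
                       llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) {
    llvm::IRBuilderBase::InsertPointGuard Guard(Builder);
    Builder.restoreIP(CodeGenIP);
    // ... emit the region body ...
  };
  auto PrivCB = [](llvm::OpenMPIRBuilder::InsertPointTy AllocaIP,
                   llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP,
                   llvm::Value &Orig, llvm::Value &Inner,
                   llvm::Value *&ReplVal) {
    ReplVal = &Inner; // no privatization in this sketch
    return CodeGenIP;
  };
  auto FiniCB = [](llvm::OpenMPIRBuilder::InsertPointTy) {};
  Builder.restoreIP(OMPBuilder.createParallel(
      Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, /*IfCondition=*/nullptr,
      /*NumThreads=*/nullptr, llvm::omp::ProcBindKind::OMP_PROC_BIND_default,
      /*IsCancellable=*/false));
The returned insertion point sits after the outlined region, so restoring it lets codegen continue past the construct.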
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, unsigned AddressSpace=0)
Gets (if a variable with the given name already exists) or creates an internal global variable with the spe...
FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
InsertPointTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for '#omp single'.
CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
Definition: OMPIRBuilder.h:477
GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
std::function< Function *(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
void emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective, FinalizeCallbackTy ExitCB={})
Generate control flow and cleanup for cancellation.
Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
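This usually appears paired with getOrCreateIdent (above); a sketch assuming an OMPBuilder instance:
  uint32_t SrcLocStrSize;
  llvm::Constant *SrcLocStr =
      OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  llvm::Constant *Ident =
      OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);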
InsertPointTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for '#omp masked'.
void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
InsertPointTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
void emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
InsertPointTy createTarget(const LocationDescription &Loc, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, int32_t NumTeams, int32_t NumThreads, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB)
Generator for '#omp target'.
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
Value * getSizeInBytes(Value *BasePtr)
Computes the size of the type in bytes.
void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
InsertPointTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for #omp teams
BodyGenTy
Type of BodyGen to use for region codegen.
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions whose constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write bounds on teams for Kernel.
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
InsertPointTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false)
Generator for '#omp reduction'.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
void createFlush(const LocationDescription &Loc)
Generator for '#omp flush'.
Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Definition: PassManager.h:296
Class to represent pointers.
Definition: DerivedTypes.h:646
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
Analysis pass that exposes the ScalarEvolution for a function.
ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition: SetVector.h:57
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:321
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:360
iterator end() const
Definition: SmallPtrSet.h:385
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:342
iterator begin() const
Definition: SmallPtrSet.h:380
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:427
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition: SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
Definition: SmallString.h:254
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:586
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:950
void reserve(size_type N)
Definition: SmallVector.h:676
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:696
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
An instruction for storing to memory.
Definition: Instructions.h:317
void setAlignment(Align Align)
Definition: Instructions.h:373
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
Definition: Instructions.h:400
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition: StringMap.h:128
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: StringMap.h:253
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:693
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:134
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition: StringRef.h:444
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:270
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition: StringRef.h:609
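The split, ends_with, and drop_back helpers above, on a constant string (illustrative values):
  llvm::StringRef S("omp.region.cont");
  auto [Head, Tail] = S.split('.');      // "omp" / "region.cont"
  bool IsCont = S.ends_with("cont");     // true
  llvm::StringRef Base = S.drop_back(5); // "omp.region"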
Class to represent struct types.
Definition: DerivedTypes.h:216
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:513
Multiway switch.
void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
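A sketch pairing CreateSwitch (above) with addCase; Val and the three blocks are hypothetical:
  // Two explicit cases plus a default destination.
  llvm::SwitchInst *SI =
      Builder.CreateSwitch(Val, DefaultBB, /*NumCases=*/2);
  SI->addCase(Builder.getInt32(0), CaseZeroBB);
  SI->addCase(Builder.getInt32(1), CaseOneBB);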
Analysis pass providing the TargetTransformInfo.
Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(StringRef TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition: Triple.h:953
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition: Triple.h:1011
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition: Triple.h:1021
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
std::string str() const
Return the twine contents as a std::string.
Definition: Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:140
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition: UnrollLoop.h:125
bool canUnroll() const
Whether it is legal to unroll this loop.
Definition: UnrollLoop.h:138
uint64_t getRolledLoopSize() const
Definition: UnrollLoop.h:140
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
ValueT lookup(const KeyT &Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: ValueMap.h:164
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
User * user_back()
Definition: Value.h:407
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:926
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition: Value.cpp:542
User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition: Value.cpp:179
bool use_empty() const
Definition: Value.h:344
user_iterator user_end()
Definition: Value.h:405
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
iterator_range< use_iterator > uses()
Definition: Value.h:376
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
Definition: ilist_node.h:109
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:316
iterator insertAfter(iterator where, pointer New)
Definition: ilist.h:174
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:690
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:450
void emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name, uint64_t Size, int32_t Flags, int32_t Data, StringRef SectionName)
Create an offloading section struct used to register this global at runtime.
Definition: Utility.cpp:64
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
Definition: OMPConstants.h:195
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID if the device was not defined; the runtime should get it from environment variables in the spec...
Definition: OMPConstants.h:252
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
Definition: OMPConstants.h:65
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
Definition: OMPConstants.h:45
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
WorksharingLoopType
A type of worksharing loop construct.
Definition: OMPConstants.h:283
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
Definition: OMPConstants.h:267
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
std::error_code getUniqueID(const Twine Path, UniqueID &Result)
Definition: Path.cpp:788
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition: DWP.cpp:456
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:853
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
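A small sketch of the range-based all_of above; Blocks is a hypothetical container of BasicBlock pointers:
  bool AllTerminated = llvm::all_of(Blocks, [](const llvm::BasicBlock *BB) {
    return BB->getTerminator() != nullptr;
  });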
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:428
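For example, divideCeil(7, 2) == 4; a common use is computing chunk counts (TripCount and ChunkSize are hypothetical uint64_t values):
  uint64_t NumChunks = llvm::divideCeil(TripCount, ChunkSize);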
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2406
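A sketch of single-range enumerate, reusing the hypothetical Blocks container:
  for (auto [Idx, BB] : llvm::enumerate(Blocks))
    llvm::dbgs() << Idx << ": " << BB->getName() << "\n";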
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:649
auto successors(const MachineBasicBlock *BB)
AddressSpace
Definition: NVPTXBaseInfo.h:21
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2073
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
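A sketch of the classic use case, erasing instructions while walking a block. The adaptor advances the iterator before the loop body runs, so erasing the current element is safe. The predicate is a hypothetical placeholder and is assumed to select only instructions with no remaining uses:

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instruction.h"

  // Erase every instruction matching Pred (assumed to have no uses).
  static void eraseMatching(llvm::BasicBlock &BB,
                            bool (*Pred)(const llvm::Instruction &)) {
    for (llvm::Instruction &I : llvm::make_early_inc_range(BB))
      if (Pred(I))
        I.eraseFromParent();
  }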
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
Definition: BitcodeReader.h:66
BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, DebugInfoFinder *DIFinder=nullptr)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
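CloneBasicBlock fills VMap but leaves the clone's operands pointing at the original values, so it is typically paired with remapInstructionsInBlocks (listed further below). A minimal sketch with illustrative names:

  #include "llvm/IR/BasicBlock.h"
  #include "llvm/Transforms/Utils/Cloning.h"
  #include "llvm/Transforms/Utils/ValueMapper.h"

  // Clone BB into its parent function, then rewrite the clone's
  // operands to refer to the cloned values instead of the originals.
  static llvm::BasicBlock *cloneAndRemap(llvm::BasicBlock *BB) {
    llvm::ValueToValueMapTy VMap;
    llvm::BasicBlock *Copy =
        llvm::CloneBasicBlock(BB, VMap, ".clone", BB->getParent());
    llvm::remapInstructionsInBlocks({Copy}, VMap);
    return Copy;
  }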
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
Definition: LoopPeel.cpp:832
void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch)
Move the instructions after an InsertPoint to the beginning of another BasicBlock.
void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:156
BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
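A minimal sketch of the overload shown above, splitting at an IRBuilder's saved insertion point; the wrapper function and block name are illustrative:

  #include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
  #include "llvm/IR/IRBuilder.h"

  // Split at the builder's current position: everything after the
  // insertion point moves to the returned block, and with
  // CreateBranch=true the old block ends in a branch to the new one.
  static llvm::BasicBlock *splitHere(llvm::IRBuilderBase &Builder) {
    return llvm::splitBB(Builder.saveIP(), /*CreateBranch=*/true, "split");
  }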
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
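A minimal sketch of printf-style output through a raw_ostream; the format string and values are illustrative:

  #include "llvm/Support/Format.h"
  #include "llvm/Support/raw_ostream.h"

  int main() {
    // format() defers formatting until the object is streamed into a
    // raw_ostream; the format string uses C printf syntax.
    llvm::errs() << llvm::format("%-8s %4.1f\n", "pi", 3.14159);
  }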
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
DWARFExpression::Operation Op
void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the spe...
@ Continue
Definition: DWP.h:21
void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks BBs from their parent function.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:70
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
A struct to pack relevant information while generating atomic Ops.
A struct to pack the relevant information for an OpenMP depend clause.
Description of an LLVM-IR insertion point (IP) and a debug/source location (filename,...
Definition: OMPIRBuilder.h:593
This structure contains combined information generated for mappable clauses, including base pointers,...
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
AtomicReductionGenTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenTy ReductionGen
Callback for generating the reduction body.
Type * ElementType
Reduction element type; must match the pointee type of the reduction variable.
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * BasePointersArray
The array of base pointers passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entir...
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
Value * NumTeams
The number of teams.
Value * DynCGGroupMem
The size of the dynamic shared memory.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has the 'no wait' clause.
Value * NumThreads
The number of threads.
Data structure to contain the information needed to uniquely identify a target entry.
Definition: OMPIRBuilder.h:183
static void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static const Target * lookupTarget(StringRef Triple, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unrolling to any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...
Definition: OMPGridValues.h:57