llvm.org GIT mirror llvm / c313a17
[CodeGen] Generic Hardware Loop Support

Patch which introduces a target-independent framework for generating hardware loops at the IR level. Most of the code has been taken from PowerPC CTRLoops, and PowerPC has been ported over to use this generic pass. The target-dependent parts have been moved into TargetTransformInfo, via isHardwareLoopProfitable, with HardwareLoopInfo introduced to transfer information from the backend.

Three generic intrinsics have been introduced:
- void @llvm.set_loop_iterations
  Takes as a single operand the number of iterations to be executed.
- i1 @llvm.loop_decrement(anyint)
  Takes the maximum number of elements processed in an iteration of the loop body and subtracts this from the total count. Returns false when the loop should exit.
- anyint @llvm.loop_decrement_reg(anyint, anyint)
  Takes the number of elements remaining to be processed as well as the maximum number of elements processed in an iteration of the loop body. Returns the updated number of elements remaining.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@362774 91177308-0d34-0410-b5e6-96231b3b80d8

Sam Parker, 2 months ago
24 changed file(s) with 1085 addition(s) and 599 deletion(s).
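For orientation before the diff: with the default (non-phi) configuration, the pass places set_loop_iterations in the preheader and tests loop_decrement at the bottom of the loop. The IR below is a hand-written sketch, not part of this commit; @body is a placeholder function and the .i32 suffixes assume a 32-bit counter type.

    entry:
      call void @llvm.set.loop.iterations.i32(i32 %count)
      br label %loop

    loop:
      call void @body()
      ; returns false when the loop should exit
      %cond = call i1 @llvm.loop.decrement.i32(i32 1)
      br i1 %cond, label %loop, label %exit

    exit:
      ret void

    declare void @llvm.set.loop.iterations.i32(i32)
    declare i1 @llvm.loop.decrement.i32(i32)

The backend is then free to lower the pair to dedicated counter hardware, for example PowerPC's mtctr/bdnz.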
3434 enum ID : unsigned;
3535 }
3636
37 class AssumptionCache;
38 class BranchInst;
3739 class Function;
3840 class GlobalValue;
3941 class IntrinsicInst;
4345 class ScalarEvolution;
4446 class StoreInst;
4547 class SwitchInst;
48 class TargetLibraryInfo;
4649 class Type;
4750 class User;
4851 class Value;
444447 void getUnrollingPreferences(Loop *L, ScalarEvolution &,
445448 UnrollingPreferences &UP) const;
446449
450 /// Attributes of a target dependent hardware loop. Here, the term 'element'
451 /// describes the work performed by an IR loop that has not been vectorized
452 /// by the compiler.
453 struct HardwareLoopInfo {
454 HardwareLoopInfo() = delete;
455 HardwareLoopInfo(Loop *L) : L(L) { }
456 Loop *L = nullptr;
457 BasicBlock *ExitBlock = nullptr;
458 BranchInst *ExitBranch = nullptr;
459 const SCEV *ExitCount = nullptr;
460 IntegerType *CountType = nullptr;
461 Value *LoopDecrement = nullptr; // The maximum number of elements
462 // processed in the loop body.
463 bool IsNestingLegal = false; // Can a hardware loop be a parent to
464 // another hardware loop.
465 bool CounterInReg = false; // Should loop counter be updated in
466 // the loop via a phi?
467 };
468
469 /// Query the target whether it would be profitable to convert the given loop
470 /// into a hardware loop.
471 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
472 AssumptionCache &AC,
473 TargetLibraryInfo *LibInfo,
474 HardwareLoopInfo &HWLoopInfo) const;
475
447476 /// @}
448477
449478 /// \name Scalar Target Information
10721101 virtual bool isLoweredToCall(const Function *F) = 0;
10731102 virtual void getUnrollingPreferences(Loop *L, ScalarEvolution &,
10741103 UnrollingPreferences &UP) = 0;
1104 virtual bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1105 AssumptionCache &AC,
1106 TargetLibraryInfo *LibInfo,
1107 HardwareLoopInfo &HWLoopInfo) = 0;
10751108 virtual bool isLegalAddImmediate(int64_t Imm) = 0;
10761109 virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
10771110 virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
13031336 UnrollingPreferences &UP) override {
13041337 return Impl.getUnrollingPreferences(L, SE, UP);
13051338 }
1339 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
1340 AssumptionCache &AC,
1341 TargetLibraryInfo *LibInfo,
1342 HardwareLoopInfo &HWLoopInfo) override {
1343 return Impl.isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
1344 }
13061345 bool isLegalAddImmediate(int64_t Imm) override {
13071346 return Impl.isLegalAddImmediate(Imm);
13081347 }
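To make the 'element' terminology in HardwareLoopInfo concrete: if each iteration of a (say, manually vectorized) loop body handles up to four elements, the target can report a LoopDecrement of 4, and with CounterInReg set the remaining-element count is carried by a phi. A hand-written sketch of the resulting IR, assuming a 32-bit element count %N (names are illustrative, not from this commit):

    loop:
      %elts = phi i32 [ %N, %entry ], [ %elts.rem, %loop ]
      ; ... body processing up to 4 elements ...
      %elts.rem = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %elts, i32 4)
      %cmp = icmp ne i32 %elts.rem, 0
      br i1 %cmp, label %loop, label %exit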
187187 return false;
188188
189189 return true;
190 }
191
192 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
193 AssumptionCache &AC,
194 TargetLibraryInfo *LibInfo,
195 TTI::HardwareLoopInfo &HWLoopInfo) {
196 return false;
190197 }
191198
192199 void getUnrollingPreferences(Loop *, ScalarEvolution &,
488488 // Set number of instructions optimized when "back edge"
489489 // becomes "fall through" to default value of 2.
490490 UP.BEInsns = 2;
491 }
492
493 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
494 AssumptionCache &AC,
495 TargetLibraryInfo *LibInfo,
496 TTI::HardwareLoopInfo &HWLoopInfo) {
497 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
491498 }
492499
493500 int getInstructionLatency(const Instruction *I) {
445445 /// Creates CFI Instruction Inserter pass. \see CFIInstrInserter.cpp
446446 FunctionPass *createCFIInstrInserter();
447447
448 /// Create Hardware Loop pass. \see HardwareLoops.cpp
449 FunctionPass *createHardwareLoopsPass();
450
448451 } // End llvm namespace
449452
450453 #endif
11811181 [llvm_anyvector_ty],
11821182 [IntrNoMem]>;
11831183
1184 //===---------- Intrinsics to control hardware supported loops ----------===//
1185
1186 // Specify that the value given is the number of iterations that the next loop
1187 // will execute.
1188 def int_set_loop_iterations :
1189 Intrinsic<[], [llvm_anyint_ty], [IntrNoDuplicate]>;
1190
1191 // Decrement loop counter by the given argument. Return false if the loop
1192 // should exit.
1193 def int_loop_decrement :
1194 Intrinsic<[llvm_i1_ty], [llvm_anyint_ty], [IntrNoDuplicate]>;
1195
1196 // Decrement the first operand (the loop counter) by the second operand (the
1197 // maximum number of elements processed in an iteration). Return the remaining
1198 // number of iterations still to be executed. This is effectively a sub which
1199 // can be used with a phi, icmp and br to control the number of iterations
1200 // executed, as usual.
1201 def int_loop_decrement_reg :
1202 Intrinsic<[llvm_anyint_ty],
1203 [llvm_anyint_ty, llvm_anyint_ty], [IntrNoDuplicate]>;
1204
11841205 //===----- Intrinsics that are used to provide predicate information -----===//
11851206
11861207 def int_ssa_copy : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
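Since all three intrinsics are overloaded on llvm_anyint_ty, the concrete integer types are mangled into the intrinsic names in IR. As a rough illustration (the i32 widths here are arbitrary), the declarations look like:

    declare void @llvm.set.loop.iterations.i32(i32)
    declare i1 @llvm.loop.decrement.i32(i32)
    declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)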
162162 void initializeGlobalSplitPass(PassRegistry&);
163163 void initializeGlobalsAAWrapperPassPass(PassRegistry&);
164164 void initializeGuardWideningLegacyPassPass(PassRegistry&);
165 void initializeHardwareLoopsPass(PassRegistry&);
165166 void initializeHotColdSplittingLegacyPassPass(PassRegistry&);
166167 void initializeHWAddressSanitizerLegacyPassPass(PassRegistry &);
167168 void initializeIPCPPass(PassRegistry&);
222222 (void) llvm::createEliminateAvailableExternallyPass();
223223 (void) llvm::createScalarizeMaskedMemIntrinPass();
224224 (void) llvm::createWarnMissedTransformationsPass();
225 (void) llvm::createHardwareLoopsPass();
225226
226227 (void)new llvm::IntervalPartition();
227228 (void)new llvm::ScalarEvolutionWrapperPass();
127127
128128 bool TargetTransformInfo::isLoweredToCall(const Function *F) const {
129129 return TTIImpl->isLoweredToCall(F);
130 }
131
132 bool TargetTransformInfo::isHardwareLoopProfitable(
133 Loop *L, ScalarEvolution &SE, AssumptionCache &AC,
134 TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const {
135 return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
130136 }
131137
132138 void TargetTransformInfo::getUnrollingPreferences(
3232 GCRootLowering.cpp
3333 GCStrategy.cpp
3434 GlobalMerge.cpp
35 HardwareLoops.cpp
3536 IfConversion.cpp
3637 ImplicitNullChecks.cpp
3738 IndirectBrExpandPass.cpp
3737 initializeFuncletLayoutPass(Registry);
3838 initializeGCMachineCodeAnalysisPass(Registry);
3939 initializeGCModuleInfoPass(Registry);
40 initializeHardwareLoopsPass(Registry);
4041 initializeIfConverterPass(Registry);
4142 initializeImplicitNullChecksPass(Registry);
4243 initializeIndirectBrExpandPassPass(Registry);
0 //===-- HardwareLoops.cpp - Target Independent Hardware Loops --*- C++ -*-===//
1 //
2 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
3 // See https://llvm.org/LICENSE.txt for license information.
4 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5 //
6 //===----------------------------------------------------------------------===//
7 /// \file
8 /// Insert hardware loop intrinsics into loops which are deemed profitable by
9 /// the target, by querying TargetTransformInfo. A hardware loop consists of
10 /// two intrinsics: one, outside the loop, to set the loop iteration count and
11 /// another, in the exit block, to decrement the counter. The decremented value
12 /// can either be carried through the loop via a phi or handled in some opaque
13 /// way by the target.
14 ///
15 //===----------------------------------------------------------------------===//
16
17 #include "llvm/Pass.h"
18 #include "llvm/PassRegistry.h"
19 #include "llvm/PassSupport.h"
20 #include "llvm/ADT/Statistic.h"
21 #include "llvm/Analysis/AssumptionCache.h"
22 #include "llvm/Analysis/CFG.h"
23 #include "llvm/Analysis/LoopInfo.h"
24 #include "llvm/Analysis/LoopIterator.h"
25 #include "llvm/Analysis/ScalarEvolution.h"
26 #include "llvm/Analysis/ScalarEvolutionExpander.h"
27 #include "llvm/Analysis/TargetTransformInfo.h"
28 #include "llvm/CodeGen/Passes.h"
29 #include "llvm/CodeGen/TargetPassConfig.h"
30 #include "llvm/IR/BasicBlock.h"
31 #include "llvm/IR/DataLayout.h"
32 #include "llvm/IR/Dominators.h"
33 #include "llvm/IR/Constants.h"
34 #include "llvm/IR/IRBuilder.h"
35 #include "llvm/IR/Instructions.h"
36 #include "llvm/IR/IntrinsicInst.h"
37 #include "llvm/IR/Value.h"
38 #include "llvm/Support/Debug.h"
39 #include "llvm/Transforms/Scalar.h"
40 #include "llvm/Transforms/Utils.h"
41 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
42 #include "llvm/Transforms/Utils/Local.h"
43 #include "llvm/Transforms/Utils/LoopUtils.h"
44
45 #define DEBUG_TYPE "hardware-loops"
46
47 #define HW_LOOPS_NAME "Hardware Loop Insertion"
48
49 using namespace llvm;
50
51 static cl::opt<bool>
52 ForceHardwareLoops("force-hardware-loops", cl::Hidden, cl::init(false),
53 cl::desc("Force hardware loops intrinsics to be inserted"));
54
55 static cl::opt<bool>
56 ForceHardwareLoopPHI(
57 "force-hardware-loop-phi", cl::Hidden, cl::init(false),
58 cl::desc("Force hardware loop counter to be updated through a phi"));
59
60 static cl::opt<bool>
61 ForceNestedLoop("force-nested-hardware-loop", cl::Hidden, cl::init(false),
62 cl::desc("Force allowance of nested hardware loops"));
63
64 static cl::opt<unsigned>
65 LoopDecrement("hardware-loop-decrement", cl::Hidden, cl::init(1),
66 cl::desc("Set the loop decrement value"));
67
68 static cl::opt<unsigned>
69 CounterBitWidth("hardware-loop-counter-bitwidth", cl::Hidden, cl::init(32),
70 cl::desc("Set the loop counter bitwidth"));
71
72 STATISTIC(NumHWLoops, "Number of loops converted to hardware loops");
73
74 namespace {
75
76 using TTI = TargetTransformInfo;
77
78 class HardwareLoops : public FunctionPass {
79 public:
80 static char ID;
81
82 HardwareLoops() : FunctionPass(ID) {
83 initializeHardwareLoopsPass(*PassRegistry::getPassRegistry());
84 }
85
86 bool runOnFunction(Function &F) override;
87
88 void getAnalysisUsage(AnalysisUsage &AU) const override {
89 AU.addRequired<LoopInfoWrapperPass>();
90 AU.addPreserved<LoopInfoWrapperPass>();
91 AU.addRequired<DominatorTreeWrapperPass>();
92 AU.addPreserved<DominatorTreeWrapperPass>();
93 AU.addRequired<ScalarEvolutionWrapperPass>();
94 AU.addRequired<AssumptionCacheTracker>();
95 AU.addRequired<TargetTransformInfoWrapperPass>();
96 }
97
98 // Try to convert the given Loop into a hardware loop.
99 bool TryConvertLoop(Loop *L);
100
101 // Given that the target believes the loop to be profitable, try to
102 // convert it.
103 bool TryConvertLoop(TTI::HardwareLoopInfo &HWLoopInfo);
104
105 private:
106 ScalarEvolution *SE = nullptr;
107 LoopInfo *LI = nullptr;
108 const DataLayout *DL = nullptr;
109 const TargetTransformInfo *TTI = nullptr;
110 DominatorTree *DT = nullptr;
111 bool PreserveLCSSA = false;
112 AssumptionCache *AC = nullptr;
113 TargetLibraryInfo *LibInfo = nullptr;
114 Module *M = nullptr;
115 bool MadeChange = false;
116 };
117
118 class HardwareLoop {
119 // Expand the trip count scev into a value that we can use.
120 Value *InitLoopCount(BasicBlock *BB);
121
122 // Insert the set_loop_iterations intrinsic.
123 void InsertIterationSetup(Value *LoopCountInit, BasicBlock *BB);
124
125 // Insert the loop_decrement intrinsic.
126 void InsertLoopDec();
127
128 // Insert the loop_decrement_reg intrinsic.
129 Instruction *InsertLoopRegDec(Value *EltsRem);
130
131 // If the target requires the counter value to be updated in the loop,
132 // insert a phi to hold the value. The intended purpose is for use by
133 // loop_decrement_reg.
134 PHINode *InsertPHICounter(Value *NumElts, Value *EltsRem);
135
136 // Create a new cmp, that checks the returned value of loop_decrement*,
137 // and update the exit branch to use it.
138 void UpdateBranch(Value *EltsRem);
139
140 public:
141 HardwareLoop(TTI::HardwareLoopInfo &Info, ScalarEvolution &SE,
142 const DataLayout &DL) :
143 SE(SE), DL(DL), L(Info.L), M(L->getHeader()->getModule()),
144 ExitCount(Info.ExitCount),
145 CountType(Info.CountType),
146 ExitBranch(Info.ExitBranch),
147 LoopDecrement(Info.LoopDecrement),
148 UsePHICounter(Info.CounterInReg) { }
149
150 void Create();
151
152 private:
153 ScalarEvolution &SE;
154 const DataLayout &DL;
155 Loop *L = nullptr;
156 Module *M = nullptr;
157 const SCEV *ExitCount = nullptr;
158 Type *CountType = nullptr;
159 BranchInst *ExitBranch = nullptr;
160 Value *LoopDecrement = nullptr;
161 bool UsePHICounter = false;
162 };
163 }
164
165 char HardwareLoops::ID = 0;
166
167 bool HardwareLoops::runOnFunction(Function &F) {
168 if (skipFunction(F))
169 return false;
170
171 LLVM_DEBUG(dbgs() << "HWLoops: Running on " << F.getName() << "\n");
172
173 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
174 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
175 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
176 TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
177 DL = &F.getParent()->getDataLayout();
178 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
179 LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
180 PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
181 AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
182 M = F.getParent();
183
184 for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I) {
185 Loop *L = *I;
186 if (!L->getParentLoop())
187 TryConvertLoop(L);
188 }
189
190 return MadeChange;
191 }
192
193 // Return true if the search should stop, which will be when an inner loop is
194 // converted and the parent loop doesn't support containing a hardware loop.
195 bool HardwareLoops::TryConvertLoop(Loop *L) {
196 // Process nested loops first.
197 for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
198 if (TryConvertLoop(*I))
199 return true; // Stop search.
200
201 // Bail out if the loop has irreducible control flow.
202 LoopBlocksRPO RPOT(L);
203 RPOT.perform(LI);
204 if (containsIrreducibleCFG(RPOT, *LI))
205 return false;
206
207 TTI::HardwareLoopInfo HWLoopInfo(L);
208 if (TTI->isHardwareLoopProfitable(L, *SE, *AC, LibInfo, HWLoopInfo) ||
209 ForceHardwareLoops) {
210
211 // Allow overriding of the counter width and loop decrement value.
212 if (CounterBitWidth.getNumOccurrences())
213 HWLoopInfo.CountType =
214 IntegerType::get(M->getContext(), CounterBitWidth);
215
216 if (LoopDecrement.getNumOccurrences())
217 HWLoopInfo.LoopDecrement =
218 ConstantInt::get(HWLoopInfo.CountType, LoopDecrement);
219
220 MadeChange |= TryConvertLoop(HWLoopInfo);
221 return MadeChange && (!HWLoopInfo.IsNestingLegal && !ForceNestedLoop);
222 }
223
224 return false;
225 }
226
227 bool HardwareLoops::TryConvertLoop(TTI::HardwareLoopInfo &HWLoopInfo) {
228
229 Loop *L = HWLoopInfo.L;
230 LLVM_DEBUG(dbgs() << "HWLoops: Try to convert profitable loop: " << *L);
231
232 SmallVector<BasicBlock*, 4> ExitingBlocks;
233 L->getExitingBlocks(ExitingBlocks);
234
235 for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(),
236 IE = ExitingBlocks.end(); I != IE; ++I) {
237 const SCEV *EC = SE->getExitCount(L, *I);
238 if (isa<SCEVCouldNotCompute>(EC))
239 continue;
240 if (const SCEVConstant *ConstEC = dyn_cast<SCEVConstant>(EC)) {
241 if (ConstEC->getValue()->isZero())
242 continue;
243 } else if (!SE->isLoopInvariant(EC, L))
244 continue;
245
246 if (SE->getTypeSizeInBits(EC->getType()) >
247 HWLoopInfo.CountType->getBitWidth())
248 continue;
249
250 // If this exiting block is contained in a nested loop, it is not eligible
251 // for insertion of the branch-and-decrement since the inner loop would
252 // end up messing up the value in the CTR.
253 if (!HWLoopInfo.IsNestingLegal && LI->getLoopFor(*I) != L &&
254 !ForceNestedLoop)
255 continue;
256
257 // We now have a loop-invariant count of loop iterations (which is not the
258 // constant zero) for which we know that this loop will not exit via this
259 // exiting block.
260
261 // We need to make sure that this block will run on every loop iteration.
262 // For this to be true, we must dominate all blocks with backedges. Such
263 // blocks are in-loop predecessors to the header block.
264 bool NotAlways = false;
265 for (pred_iterator PI = pred_begin(L->getHeader()),
266 PIE = pred_end(L->getHeader()); PI != PIE; ++PI) {
267 if (!L->contains(*PI))
268 continue;
269
270 if (!DT->dominates(*I, *PI)) {
271 NotAlways = true;
272 break;
273 }
274 }
275
276 if (NotAlways)
277 continue;
278
279 // Make sure this block ends with a conditional branch.
280 Instruction *TI = (*I)->getTerminator();
281 if (!TI)
282 continue;
283
284 if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
285 if (!BI->isConditional())
286 continue;
287
288 HWLoopInfo.ExitBranch = BI;
289 } else
290 continue;
291
292 // Note that this block may not be the loop latch block, even if the loop
293 // has a latch block.
294 HWLoopInfo.ExitBlock = *I;
295 HWLoopInfo.ExitCount = EC;
296 break;
297 }
298
299 if (!HWLoopInfo.ExitBlock)
300 return false;
301
302 BasicBlock *Preheader = L->getLoopPreheader();
303
304 // If we don't have a preheader, then insert one.
305 if (!Preheader)
306 Preheader = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA);
307 if (!Preheader)
308 return false;
309
310 HardwareLoop HWLoop(HWLoopInfo, *SE, *DL);
311 HWLoop.Create();
312 ++NumHWLoops;
313 return true;
314 }
315
316 void HardwareLoop::Create() {
317 LLVM_DEBUG(dbgs() << "HWLoops: Converting loop..\n");
318 BasicBlock *BeginBB = L->getLoopPreheader();
319 Value *LoopCountInit = InitLoopCount(BeginBB);
320 if (!LoopCountInit)
321 return;
322
323 InsertIterationSetup(LoopCountInit, BeginBB);
324
325 if (UsePHICounter || ForceHardwareLoopPHI) {
326 Instruction *LoopDec = InsertLoopRegDec(LoopCountInit);
327 Value *EltsRem = InsertPHICounter(LoopCountInit, LoopDec);
328 LoopDec->setOperand(0, EltsRem);
329 UpdateBranch(LoopDec);
330 } else
331 InsertLoopDec();
332
333 // Run through the basic blocks of the loop and see if any of them have dead
334 // PHIs that can be removed.
335 for (auto I : L->blocks())
336 DeleteDeadPHIs(I);
337 }
338
339 Value *HardwareLoop::InitLoopCount(BasicBlock *BB) {
340 SCEVExpander SCEVE(SE, DL, "loopcnt");
341 if (!ExitCount->getType()->isPointerTy() &&
342 ExitCount->getType() != CountType)
343 ExitCount = SE.getZeroExtendExpr(ExitCount, CountType);
344
345 ExitCount = SE.getAddExpr(ExitCount, SE.getOne(CountType));
346
347 if (!isSafeToExpandAt(ExitCount, BB->getTerminator(), SE)) {
348 LLVM_DEBUG(dbgs() << "HWLoops: Bailing, unsafe to expand ExitCount "
349 << *ExitCount << "\n");
350 return nullptr;
351 }
352
353 Value *Count = SCEVE.expandCodeFor(ExitCount, CountType,
354 BB->getTerminator());
355 LLVM_DEBUG(dbgs() << "HWLoops: Loop Count: " << *Count << "\n");
356 return Count;
357 }
358
359 void HardwareLoop::InsertIterationSetup(Value *LoopCountInit,
360 BasicBlock *BB) {
361 IRBuilder<> Builder(BB->getTerminator());
362 Type *Ty = LoopCountInit->getType();
363 Function *LoopIter =
364 Intrinsic::getDeclaration(M, Intrinsic::set_loop_iterations, Ty);
365 Value *Call = Builder.CreateCall(LoopIter, LoopCountInit);
366 LLVM_DEBUG(dbgs() << "HWLoops: Iteration set: " << *Call << "\n");
367 }
368
369 void HardwareLoop::InsertLoopDec() {
370 IRBuilder<> CondBuilder(ExitBranch);
371
372 Function *DecFunc =
373 Intrinsic::getDeclaration(M, Intrinsic::loop_decrement,
374 LoopDecrement->getType());
375 Value *Ops[] = { LoopDecrement };
376 Value *NewCond = CondBuilder.CreateCall(DecFunc, Ops);
377 Value *OldCond = ExitBranch->getCondition();
378 ExitBranch->setCondition(NewCond);
379
380 // The false branch must exit the loop.
381 if (!L->contains(ExitBranch->getSuccessor(0)))
382 ExitBranch->swapSuccessors();
383
384 // The old condition may be dead now, and may have even created a dead PHI
385 // (the original induction variable).
386 RecursivelyDeleteTriviallyDeadInstructions(OldCond);
387
388 LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *NewCond << "\n");
389 }
390
391 Instruction* HardwareLoop::InsertLoopRegDec(Value *EltsRem) {
392 IRBuilder<> CondBuilder(ExitBranch);
393
394 Function *DecFunc =
395 Intrinsic::getDeclaration(M, Intrinsic::loop_decrement_reg,
396 { EltsRem->getType(), EltsRem->getType(),
397 LoopDecrement->getType()
398 });
399 Value *Ops[] = { EltsRem, LoopDecrement };
400 Value *Call = CondBuilder.CreateCall(DecFunc, Ops);
401
402 LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *Call << "\n");
403 return cast<Instruction>(Call);
404 }
405
406 PHINode* HardwareLoop::InsertPHICounter(Value *NumElts, Value *EltsRem) {
407 BasicBlock *Preheader = L->getLoopPreheader();
408 BasicBlock *Header = L->getHeader();
409 BasicBlock *Latch = ExitBranch->getParent();
410 IRBuilder<> Builder(Header->getFirstNonPHI());
411 PHINode *Index = Builder.CreatePHI(NumElts->getType(), 2);
412 Index->addIncoming(NumElts, Preheader);
413 Index->addIncoming(EltsRem, Latch);
414 LLVM_DEBUG(dbgs() << "HWLoops: PHI Counter: " << *Index << "\n");
415 return Index;
416 }
417
418 void HardwareLoop::UpdateBranch(Value *EltsRem) {
419 IRBuilder<> CondBuilder(ExitBranch);
420 Value *NewCond =
421 CondBuilder.CreateICmpNE(EltsRem, ConstantInt::get(EltsRem->getType(), 0));
422 Value *OldCond = ExitBranch->getCondition();
423 ExitBranch->setCondition(NewCond);
424
425 // The false branch must exit the loop.
426 if (!L->contains(ExitBranch->getSuccessor(0)))
427 ExitBranch->swapSuccessors();
428
429 // The old condition may be dead now, and may have even created a dead PHI
430 // (the original induction variable).
431 RecursivelyDeleteTriviallyDeadInstructions(OldCond);
432 }
433
434 INITIALIZE_PASS_BEGIN(HardwareLoops, DEBUG_TYPE, HW_LOOPS_NAME, false, false)
435 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
436 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
437 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
438 INITIALIZE_PASS_END(HardwareLoops, DEBUG_TYPE, HW_LOOPS_NAME, false, false)
439
440 FunctionPass *llvm::createHardwareLoopsPass() { return new HardwareLoops(); }
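For reference, a hand-written sketch (not one of the commit's tests) of the kind of input TryConvertLoop accepts: the exit count is loop invariant and non-zero, and the exiting block ends in a conditional branch. InitLoopCount zero-extends the SCEV exit count to CountType, adds one, and expands the result in the preheader; the icmp below is then replaced by the intrinsic-based condition and deleted as dead.

    define void @before(i32 %n) {           ; hypothetical example
    entry:
      %guard = icmp sgt i32 %n, 0
      br i1 %guard, label %loop, label %exit
    loop:
      %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
      call void @body()
      %iv.next = add nuw nsw i32 %iv, 1
      %cmp = icmp slt i32 %iv.next, %n      ; rewritten by InsertLoopDec/UpdateBranch
      br i1 %cmp, label %loop, label %exit
    exit:
      ret void
    }
    declare void @body()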
7070 static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1));
7171 #endif
7272
73 // The latency of mtctr is only justified if there are more than 4
74 // comparisons that will be removed as a result.
75 static cl::opt<unsigned>
76 SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
77 cl::desc("Loops with a constant trip count smaller than "
78 "this value will not use the count register."));
79
80 STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");
81
8273 namespace {
83 struct PPCCTRLoops : public FunctionPass {
84
85 #ifndef NDEBUG
86 static int Counter;
87 #endif
88
89 public:
90 static char ID;
91
92 PPCCTRLoops() : FunctionPass(ID) {
93 initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
94 }
95
96 bool runOnFunction(Function &F) override;
97
98 void getAnalysisUsage(AnalysisUsage &AU) const override {
99 AU.addRequired<LoopInfoWrapperPass>();
100 AU.addPreserved<LoopInfoWrapperPass>();
101 AU.addRequired<DominatorTreeWrapperPass>();
102 AU.addPreserved<DominatorTreeWrapperPass>();
103 AU.addRequired<ScalarEvolutionWrapperPass>();
104 AU.addRequired<AssumptionCacheTracker>();
105 AU.addRequired<TargetTransformInfoWrapperPass>();
106 }
107
108 private:
109 bool mightUseCTR(BasicBlock *BB);
110 bool convertToCTRLoop(Loop *L);
111
112 private:
113 const PPCTargetMachine *TM;
114 const PPCSubtarget *STI;
115 const PPCTargetLowering *TLI;
116 const DataLayout *DL;
117 const TargetLibraryInfo *LibInfo;
118 const TargetTransformInfo *TTI;
119 LoopInfo *LI;
120 ScalarEvolution *SE;
121 DominatorTree *DT;
122 bool PreserveLCSSA;
123 TargetSchedModel SchedModel;
124 };
125
126 char PPCCTRLoops::ID = 0;
127 #ifndef NDEBUG
128 int PPCCTRLoops::Counter = 0;
129 #endif
13074
13175 #ifndef NDEBUG
13276 struct PPCCTRLoopsVerify : public MachineFunctionPass {
15195 char PPCCTRLoopsVerify::ID = 0;
15296 #endif // NDEBUG
15397 } // end anonymous namespace
154
155 INITIALIZE_PASS_BEGIN(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
156 false, false)
157 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
158 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
159 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
160 INITIALIZE_PASS_END(PPCCTRLoops, "ppc-ctr-loops", "PowerPC CTR Loops",
161 false, false)
162
163 FunctionPass *llvm::createPPCCTRLoops() { return new PPCCTRLoops(); }
16498
16599 #ifndef NDEBUG
166100 INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
173107 return new PPCCTRLoopsVerify();
174108 }
175109 #endif // NDEBUG
176
177 bool PPCCTRLoops::runOnFunction(Function &F) {
178 if (skipFunction(F))
179 return false;
180
181 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
182 if (!TPC)
183 return false;
184
185 TM = &TPC->getTM<PPCTargetMachine>();
186 STI = TM->getSubtargetImpl(F);
187 TLI = STI->getTargetLowering();
188
189 LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
190 SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
191 DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
192 TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
193 DL = &F.getParent()->getDataLayout();
194 auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
195 LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
196 PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
197 SchedModel.init(STI);
198
199 bool MadeChange = false;
200
201 for (LoopInfo::iterator I = LI->begin(), E = LI->end();
202 I != E; ++I) {
203 Loop *L = *I;
204 if (!L->getParentLoop())
205 MadeChange |= convertToCTRLoop(L);
206 }
207
208 return MadeChange;
209 }
210
211 static bool isLargeIntegerTy(bool Is32Bit, Type *Ty) {
212 if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
213 return ITy->getBitWidth() > (Is32Bit ? 32U : 64U);
214
215 return false;
216 }
217
218 // Determining the address of a TLS variable results in a function call in
219 // certain TLS models.
220 static bool memAddrUsesCTR(const PPCTargetMachine &TM, const Value *MemAddr) {
221 const auto *GV = dyn_cast<GlobalValue>(MemAddr);
222 if (!GV) {
223 // Recurse to check for constants that refer to TLS global variables.
224 if (const auto *CV = dyn_cast<Constant>(MemAddr))
225 for (const auto &CO : CV->operands())
226 if (memAddrUsesCTR(TM, CO))
227 return true;
228
229 return false;
230 }
231
232 if (!GV->isThreadLocal())
233 return false;
234 TLSModel::Model Model = TM.getTLSModel(GV);
235 return Model == TLSModel::GeneralDynamic || Model == TLSModel::LocalDynamic;
236 }
237
238 // Loop through the inline asm constraints and look for something that clobbers
239 // ctr.
240 static bool asmClobbersCTR(InlineAsm *IA) {
241 InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
242 for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
243 InlineAsm::ConstraintInfo &C = CIV[i];
244 if (C.Type != InlineAsm::isInput)
245 for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
246 if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
247 return true;
248 }
249 return false;
250 }
251
252 bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) {
253 for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
254 J != JE; ++J) {
255 if (CallInst *CI = dyn_cast<CallInst>(J)) {
256 // Inline ASM is okay, unless it clobbers the ctr register.
257 if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
258 if (asmClobbersCTR(IA))
259 return true;
260 continue;
261 }
262
263 if (Function *F = CI->getCalledFunction()) {
264 // Most intrinsics don't become function calls, but some might.
265 // sin, cos, exp and log are always calls.
266 unsigned Opcode = 0;
267 if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
268 switch (F->getIntrinsicID()) {
269 default: continue;
270 // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr
271 // we're definitely using CTR.
272 case Intrinsic::ppc_is_decremented_ctr_nonzero:
273 case Intrinsic::ppc_mtctr:
274 return true;
275
276 // VisualStudio defines setjmp as _setjmp
277 #if defined(_MSC_VER) && defined(setjmp) && \
278 !defined(setjmp_undefined_for_msvc)
279 # pragma push_macro("setjmp")
280 # undef setjmp
281 # define setjmp_undefined_for_msvc
282 #endif
283
284 case Intrinsic::setjmp:
285
286 #if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
287 // let's return it to _setjmp state
288 # pragma pop_macro("setjmp")
289 # undef setjmp_undefined_for_msvc
290 #endif
291
292 case Intrinsic::longjmp:
293
294 // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
295 // because, although it does clobber the counter register, the
296 // control can't then return to inside the loop unless there is also
297 // an eh_sjlj_setjmp.
298 case Intrinsic::eh_sjlj_setjmp:
299
300 case Intrinsic::memcpy:
301 case Intrinsic::memmove:
302 case Intrinsic::memset:
303 case Intrinsic::powi:
304 case Intrinsic::log:
305 case Intrinsic::log2:
306 case Intrinsic::log10:
307 case Intrinsic::exp:
308 case Intrinsic::exp2:
309 case Intrinsic::pow:
310 case Intrinsic::sin:
311 case Intrinsic::cos:
312 return true;
313 case Intrinsic::copysign:
314 if (CI->getArgOperand(0)->getType()->getScalarType()->
315 isPPC_FP128Ty())
316 return true;
317 else
318 continue; // ISD::FCOPYSIGN is never a library call.
319 case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
320 case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
321 case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
322 case Intrinsic::trunc: Opcode = ISD::FTRUNC; break;
323 case Intrinsic::rint: Opcode = ISD::FRINT; break;
324 case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
325 case Intrinsic::round: Opcode = ISD::FROUND; break;
326 case Intrinsic::minnum: Opcode = ISD::FMINNUM; break;
327 case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break;
328 case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break;
329 case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO; break;
330 }
331 }
332
333 // PowerPC does not use [US]DIVREM or other library calls for
334 // operations on regular types which are not otherwise library calls
335 // (i.e. soft float or atomics). If adapting for targets that do,
336 // additional care is required here.
337
338 LibFunc Func;
339 if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
340 LibInfo->getLibFunc(F->getName(), Func) &&
341 LibInfo->hasOptimizedCodeGen(Func)) {
342 // Non-read-only functions are never treated as intrinsics.
343 if (!CI->onlyReadsMemory())
344 return true;
345
346 // Conversion happens only for FP calls.
347 if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
348 return true;
349
350 switch (Func) {
351 default: return true;
352 case LibFunc_copysign:
353 case LibFunc_copysignf:
354 continue; // ISD::FCOPYSIGN is never a library call.
355 case LibFunc_copysignl:
356 return true;
357 case LibFunc_fabs:
358 case LibFunc_fabsf:
359 case LibFunc_fabsl:
360 continue; // ISD::FABS is never a library call.
361 case LibFunc_sqrt:
362 case LibFunc_sqrtf:
363 case LibFunc_sqrtl:
364 Opcode = ISD::FSQRT; break;
365 case LibFunc_floor:
366 case LibFunc_floorf:
367 case LibFunc_floorl:
368 Opcode = ISD::FFLOOR; break;
369 case LibFunc_nearbyint:
370 case LibFunc_nearbyintf:
371 case LibFunc_nearbyintl:
372 Opcode = ISD::FNEARBYINT; break;
373 case LibFunc_ceil:
374 case LibFunc_ceilf:
375 case LibFunc_ceill:
376 Opcode = ISD::FCEIL; break;
377 case LibFunc_rint:
378 case LibFunc_rintf:
379 case LibFunc_rintl:
380 Opcode = ISD::FRINT; break;
381 case LibFunc_round:
382 case LibFunc_roundf:
383 case LibFunc_roundl:
384 Opcode = ISD::FROUND; break;
385 case LibFunc_trunc:
386 case LibFunc_truncf:
387 case LibFunc_truncl:
388 Opcode = ISD::FTRUNC; break;
389 case LibFunc_fmin:
390 case LibFunc_fminf:
391 case LibFunc_fminl:
392 Opcode = ISD::FMINNUM; break;
393 case LibFunc_fmax:
394 case LibFunc_fmaxf:
395 case LibFunc_fmaxl:
396 Opcode = ISD::FMAXNUM; break;
397 }
398 }
399
400 if (Opcode) {
401 EVT EVTy =
402 TLI->getValueType(*DL, CI->getArgOperand(0)->getType(), true);
403
404 if (EVTy == MVT::Other)
405 return true;
406
407 if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
408 continue;
409 else if (EVTy.isVector() &&
410 TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
411 continue;
412
413 return true;
414 }
415 }
416
417 return true;
418 } else if (isa<BinaryOperator>(J) &&
419 J->getType()->getScalarType()->isPPC_FP128Ty()) {
420 // Most operations on ppc_f128 values become calls.
421 return true;
422 } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
423 isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
424 CastInst *CI = cast<CastInst>(J);
425 if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
426 CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
427 isLargeIntegerTy(!TM->isPPC64(), CI->getSrcTy()->getScalarType()) ||
428 isLargeIntegerTy(!TM->isPPC64(), CI->getDestTy()->getScalarType()))
429 return true;
430 } else if (isLargeIntegerTy(!TM->isPPC64(),
431 J->getType()->getScalarType()) &&
432 (J->getOpcode() == Instruction::UDiv ||
433 J->getOpcode() == Instruction::SDiv ||
434 J->getOpcode() == Instruction::URem ||
435 J->getOpcode() == Instruction::SRem)) {
436 return true;
437 } else if (!TM->isPPC64() &&
438 isLargeIntegerTy(false, J->getType()->getScalarType()) &&
439 (J->getOpcode() == Instruction::Shl ||
440 J->getOpcode() == Instruction::AShr ||
441 J->getOpcode() == Instruction::LShr)) {
442 // Only on PPC32, for 128-bit integers (specifically not 64-bit
443 // integers), these might be runtime calls.
444 return true;
445 } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
446 // On PowerPC, indirect jumps use the counter register.
447 return true;
448 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
449 if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
450 return true;
451 }
452
453 // FREM is always a call.
454 if (J->getOpcode() == Instruction::FRem)
455 return true;
456
457 if (STI->useSoftFloat()) {
458 switch(J->getOpcode()) {
459 case Instruction::FAdd:
460 case Instruction::FSub:
461 case Instruction::FMul:
462 case Instruction::FDiv:
463 case Instruction::FPTrunc:
464 case Instruction::FPExt:
465 case Instruction::FPToUI:
466 case Instruction::FPToSI:
467 case Instruction::UIToFP:
468 case Instruction::SIToFP:
469 case Instruction::FCmp:
470 return true;
471 }
472 }
473
474 for (Value *Operand : J->operands())
475 if (memAddrUsesCTR(*TM, Operand))
476 return true;
477 }
478
479 return false;
480 }
481 bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
482 bool MadeChange = false;
483
484 // Do not convert small short loops to CTR loop.
485 unsigned ConstTripCount = SE->getSmallConstantTripCount(L);
486 if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
487 SmallPtrSet<const Value *, 4> EphValues;
488 auto AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
489 *L->getHeader()->getParent());
490 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
491 CodeMetrics Metrics;
492 for (BasicBlock *BB : L->blocks())
493 Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
494 // 6 is an approximate latency for the mtctr instruction.
495 if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
496 return false;
497 }
498
499 // Process nested loops first.
500 for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
501 MadeChange |= convertToCTRLoop(*I);
502 LLVM_DEBUG(dbgs() << "Nested loop converted\n");
503 }
504
505 // If a nested loop has been converted, then we can't convert this loop.
506 if (MadeChange)
507 return MadeChange;
508
509 // Bail out if the loop has irreducible control flow.
510 LoopBlocksRPO RPOT(L);
511 RPOT.perform(LI);
512 if (containsIrreducibleCFG(RPOT, *LI))
513 return false;
514
515 #ifndef NDEBUG
516 // Stop trying after reaching the limit (if any).
517 int Limit = CTRLoopLimit;
518 if (Limit >= 0) {
519 if (Counter >= CTRLoopLimit)
520 return false;
521 Counter++;
522 }
523 #endif
524
525 // We don't want to spill/restore the counter register, and so we don't
526 // want to use the counter register if the loop contains calls.
527 for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
528 I != IE; ++I)
529 if (mightUseCTR(*I))
530 return MadeChange;
531
532 SmallVector<BasicBlock*, 4> ExitingBlocks;
533 L->getExitingBlocks(ExitingBlocks);
534
535 // If there is an exit edge known to be frequently taken,
536 // we should not transform this loop.
537 for (auto &BB : ExitingBlocks) {
538 Instruction *TI = BB->getTerminator();
539 if (!TI) continue;
540
541 if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
542 uint64_t TrueWeight = 0, FalseWeight = 0;
543 if (!BI->isConditional() ||
544 !BI->extractProfMetadata(TrueWeight, FalseWeight))
545 continue;
546
547 // If the exit path is more frequent than the loop path,
548 // we return here without further analysis for this loop.
549 bool TrueIsExit = !L->contains(BI->getSuccessor(0));
550 if (( TrueIsExit && FalseWeight < TrueWeight) ||
551 (!TrueIsExit && FalseWeight > TrueWeight))
552 return MadeChange;
553 }
554 }
555
556 BasicBlock *CountedExitBlock = nullptr;
557 const SCEV *ExitCount = nullptr;
558 BranchInst *CountedExitBranch = nullptr;
559 for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(),
560 IE = ExitingBlocks.end(); I != IE; ++I) {
561 const SCEV *EC = SE->getExitCount(L, *I);
562 LLVM_DEBUG(dbgs() << "Exit Count for " << *L << " from block "
563 << (*I)->getName() << ": " << *EC << "\n");
564 if (isa<SCEVCouldNotCompute>(EC))
565 continue;
566 if (const SCEVConstant *ConstEC = dyn_cast<SCEVConstant>(EC)) {
567 if (ConstEC->getValue()->isZero())
568 continue;
569 } else if (!SE->isLoopInvariant(EC, L))
570 continue;
571
572 if (SE->getTypeSizeInBits(EC->getType()) > (TM->isPPC64() ? 64 : 32))
573 continue;
574
575 // If this exiting block is contained in a nested loop, it is not eligible
576 // for insertion of the branch-and-decrement since the inner loop would
577 // end up messing up the value in the CTR.
578 if (LI->getLoopFor(*I) != L)
579 continue;
580
581 // We now have a loop-invariant count of loop iterations (which is not the
582 // constant zero) for which we know that this loop will not exit via this
583 // exiting block.
584
585 // We need to make sure that this block will run on every loop iteration.
586 // For this to be true, we must dominate all blocks with backedges. Such
587 // blocks are in-loop predecessors to the header block.
588 bool NotAlways = false;
589 for (pred_iterator PI = pred_begin(L->getHeader()),
590 PIE = pred_end(L->getHeader()); PI != PIE; ++PI) {
591 if (!L->contains(*PI))
592 continue;
593
594 if (!DT->dominates(*I, *PI)) {
595 NotAlways = true;
596 break;
597 }
598 }
599
600 if (NotAlways)
601 continue;
602
603 // Make sure this block ends with a conditional branch.
604 Instruction *TI = (*I)->getTerminator();
605 if (!TI)
606 continue;
607
608 if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
609 if (!BI->isConditional())
610 continue;
611
612 CountedExitBranch = BI;
613 } else
614 continue;
615
616 // Note that this block may not be the loop latch block, even if the loop
617 // has a latch block.
618 CountedExitBlock = *I;
619 ExitCount = EC;
620 break;
621 }
622
623 if (!CountedExitBlock)
624 return MadeChange;
625
626 BasicBlock *Preheader = L->getLoopPreheader();
627
628 // If we don't have a preheader, then insert one. If we already have a
629 // preheader, then we can use it (except if the preheader contains a use of
630 // the CTR register because some such uses might be reordered by the
631 // selection DAG after the mtctr instruction).
632 if (!Preheader || mightUseCTR(Preheader))
633 Preheader = InsertPreheaderForLoop(L, DT, LI, nullptr, PreserveLCSSA);
634 if (!Preheader)
635 return MadeChange;
636
637 LLVM_DEBUG(dbgs() << "Preheader for exit count: " << Preheader->getName()
638 << "\n");
639
640 // Insert the count into the preheader and replace the condition used by the
641 // selected branch.
642 MadeChange = true;
643
644 SCEVExpander SCEVE(*SE, *DL, "loopcnt");
645 LLVMContext &C = SE->getContext();
646 Type *CountType = TM->isPPC64() ? Type::getInt64Ty(C) : Type::getInt32Ty(C);
647 if (!ExitCount->getType()->isPointerTy() &&
648 ExitCount->getType() != CountType)
649 ExitCount = SE->getZeroExtendExpr(ExitCount, CountType);
650 ExitCount = SE->getAddExpr(ExitCount, SE->getOne(CountType));
651 Value *ECValue =
652 SCEVE.expandCodeFor(ExitCount, CountType, Preheader->getTerminator());
653
654 IRBuilder<> CountBuilder(Preheader->getTerminator());
655 Module *M = Preheader->getParent()->getParent();
656 Function *MTCTRFunc =
657 Intrinsic::getDeclaration(M, Intrinsic::ppc_mtctr, CountType);
658 CountBuilder.CreateCall(MTCTRFunc, ECValue);
659
660 IRBuilder<> CondBuilder(CountedExitBranch);
661 Function *DecFunc =
662 Intrinsic::getDeclaration(M, Intrinsic::ppc_is_decremented_ctr_nonzero);
663 Value *NewCond = CondBuilder.CreateCall(DecFunc, {});
664 Value *OldCond = CountedExitBranch->getCondition();
665 CountedExitBranch->setCondition(NewCond);
666
667 // The false branch must exit the loop.
668 if (!L->contains(CountedExitBranch->getSuccessor(0)))
669 CountedExitBranch->swapSuccessors();
670
671 // The old condition may be dead now, and may have even created a dead PHI
672 // (the original induction variable).
673 RecursivelyDeleteTriviallyDeadInstructions(OldCond);
674 // Run through the basic blocks of the loop and see if any of them have dead
675 // PHIs that can be removed.
676 for (auto I : L->blocks())
677 DeleteDeadPHIs(I);
678
679 ++NumCTRLoops;
680 return MadeChange;
681 }
682110
683111 #ifndef NDEBUG
684112 static bool clobbersCTR(const MachineInstr &MI) {
99439943 }
99449944 case ISD::INTRINSIC_W_CHAIN: {
99459945 if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
9946 Intrinsic::ppc_is_decremented_ctr_nonzero)
9946 Intrinsic::loop_decrement)
99479947 break;
99489948
99499949 assert(N->getValueType(0) == MVT::i1 &&
1363513635
1363613636 if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
1363713637 cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() ==
13638 Intrinsic::ppc_is_decremented_ctr_nonzero) {
13638 Intrinsic::loop_decrement) {
1363913639
1364013640 // We now need to make the intrinsic dead (it cannot be instruction
1364113641 // selected).
1366113661 if (LHS.getOpcode() == ISD::AND &&
1366213662 LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN &&
1366313663 cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() ==
13664 Intrinsic::ppc_is_decremented_ctr_nonzero &&
13664 Intrinsic::loop_decrement &&
1366513665 isa<ConstantSDNode>(LHS.getOperand(1)) &&
1366613666 !isNullConstant(LHS.getOperand(1)))
1366713667 LHS = LHS.getOperand(0);
1366813668
1366913669 if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
1367013670 cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() ==
13671 Intrinsic::ppc_is_decremented_ctr_nonzero &&
13671 Intrinsic::loop_decrement &&
1367213672 isa<ConstantSDNode>(RHS)) {
1367313673 assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
1367413674 "Counter decrement comparison is not EQ or NE");
387387 PPC970_DGroup_First, PPC970_Unit_FXU;
388388 }
389389 let hasSideEffects = 1, Defs = [CTR8] in {
390 let Pattern = [(int_ppc_mtctr i64:$rS)] in
390 let Pattern = [(int_set_loop_iterations i64:$rS)] in
391391 def MTCTR8loop : XFXForm_7_ext<31, 467, 9, (outs), (ins g8rc:$rS),
392392 "mtctr $rS", IIC_SprMTSPR>,
393393 PPC970_DGroup_First, PPC970_Unit_FXU;
26042604 PPC970_DGroup_First, PPC970_Unit_FXU;
26052605 }
26062606 let hasSideEffects = 1, isCodeGenOnly = 1, Defs = [CTR] in {
2607 let Pattern = [(int_ppc_mtctr i32:$rS)] in
2607 let Pattern = [(int_set_loop_iterations i32:$rS)] in
26082608 def MTCTRloop : XFXForm_7_ext<31, 467, 9, (outs), (ins gprc:$rS),
26092609 "mtctr $rS", IIC_SprMTSPR>,
26102610 PPC970_DGroup_First, PPC970_Unit_FXU;
100100 RegisterTargetMachine<PPCTargetMachine> C(getThePPC64LETarget());
101101
102102 PassRegistry &PR = *PassRegistry::getPassRegistry();
103 initializePPCCTRLoopsPass(PR);
104103 #ifndef NDEBUG
105104 initializePPCCTRLoopsVerifyPass(PR);
106105 #endif
421420 addPass(createPPCLoopPreIncPrepPass(getPPCTargetMachine()));
422421
423422 if (!DisableCTRLoops && getOptLevel() != CodeGenOpt::None)
424 addPass(createPPCCTRLoops());
423 addPass(createHardwareLoopsPass());
425424
426425 return false;
427426 }
66 //===----------------------------------------------------------------------===//
77
88 #include "PPCTargetTransformInfo.h"
9 #include "llvm/Analysis/CodeMetrics.h"
910 #include "llvm/Analysis/TargetTransformInfo.h"
1011 #include "llvm/CodeGen/BasicTTIImpl.h"
1112 #include "llvm/CodeGen/CostTable.h"
1213 #include "llvm/CodeGen/TargetLowering.h"
14 #include "llvm/CodeGen/TargetSchedule.h"
1315 #include "llvm/Support/CommandLine.h"
1416 #include "llvm/Support/Debug.h"
1517 using namespace llvm;
2931 EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
3032 cl::desc("Enable using coldcc calling conv for cold "
3133 "internal functions"));
34
35 // The latency of mtctr is only justified if there are more than 4
36 // comparisons that will be removed as a result.
37 static cl::opt<unsigned>
38 SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
39 cl::desc("Loops with a constant trip count smaller than "
40 "this value will not use the count register."));
3241
3342 //===----------------------------------------------------------------------===//
3443 //
203212 return BaseT::getUserCost(U, Operands);
204213 }
205214
215 bool PPCTTIImpl::mightUseCTR(BasicBlock *BB,
216 TargetLibraryInfo *LibInfo) {
217 const PPCTargetMachine &TM = ST->getTargetMachine();
218
219 // Loop through the inline asm constraints and look for something that
220 // clobbers ctr.
221 auto asmClobbersCTR = [](InlineAsm *IA) {
222 InlineAsm::ConstraintInfoVector CIV = IA->ParseConstraints();
223 for (unsigned i = 0, ie = CIV.size(); i < ie; ++i) {
224 InlineAsm::ConstraintInfo &C = CIV[i];
225 if (C.Type != InlineAsm::isInput)
226 for (unsigned j = 0, je = C.Codes.size(); j < je; ++j)
227 if (StringRef(C.Codes[j]).equals_lower("{ctr}"))
228 return true;
229 }
230 return false;
231 };
232
233 // Determining the address of a TLS variable results in a function call in
234 // certain TLS models.
235 std::function<bool(const Value *)> memAddrUsesCTR =
236 [&memAddrUsesCTR, &TM](const Value *MemAddr) -> bool {
237 const auto *GV = dyn_cast<GlobalValue>(MemAddr);
238 if (!GV) {
239 // Recurse to check for constants that refer to TLS global variables.
240 if (const auto *CV = dyn_cast<Constant>(MemAddr))
241 for (const auto &CO : CV->operands())
242 if (memAddrUsesCTR(CO))
243 return true;
244
245 return false;
246 }
247
248 if (!GV->isThreadLocal())
249 return false;
250 TLSModel::Model Model = TM.getTLSModel(GV);
251 return Model == TLSModel::GeneralDynamic ||
252 Model == TLSModel::LocalDynamic;
253 };
254
255 auto isLargeIntegerTy = [](bool Is32Bit, Type *Ty) {
256 if (IntegerType *ITy = dyn_cast<IntegerType>(Ty))
257 return ITy->getBitWidth() > (Is32Bit ? 32U : 64U);
258
259 return false;
260 };
261
262 for (BasicBlock::iterator J = BB->begin(), JE = BB->end();
263 J != JE; ++J) {
264 if (CallInst *CI = dyn_cast<CallInst>(J)) {
265 // Inline ASM is okay, unless it clobbers the ctr register.
266 if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
267 if (asmClobbersCTR(IA))
268 return true;
269 continue;
270 }
271
272 if (Function *F = CI->getCalledFunction()) {
273 // Most intrinsics don't become function calls, but some might.
274 // sin, cos, exp and log are always calls.
275 unsigned Opcode = 0;
276 if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
277 switch (F->getIntrinsicID()) {
278 default: continue;
279 // If we have a call to set_loop_iterations or loop_decrement,
280 // we're definitely using CTR.
281 case Intrinsic::set_loop_iterations:
282 case Intrinsic::loop_decrement:
283 return true;
284
285 // VisualStudio defines setjmp as _setjmp
286 #if defined(_MSC_VER) && defined(setjmp) && \
287 !defined(setjmp_undefined_for_msvc)
288 # pragma push_macro("setjmp")
289 # undef setjmp
290 # define setjmp_undefined_for_msvc
291 #endif
292
293 case Intrinsic::setjmp:
294
295 #if defined(_MSC_VER) && defined(setjmp_undefined_for_msvc)
296 // let's return it to _setjmp state
297 # pragma pop_macro("setjmp")
298 # undef setjmp_undefined_for_msvc
299 #endif
300
301 case Intrinsic::longjmp:
302
303 // Exclude eh_sjlj_setjmp; we don't need to exclude eh_sjlj_longjmp
304 // because, although it does clobber the counter register, the
305 // control can't then return to inside the loop unless there is also
306 // an eh_sjlj_setjmp.
307 case Intrinsic::eh_sjlj_setjmp:
308
309 case Intrinsic::memcpy:
310 case Intrinsic::memmove:
311 case Intrinsic::memset:
312 case Intrinsic::powi:
313 case Intrinsic::log:
314 case Intrinsic::log2:
315 case Intrinsic::log10:
316 case Intrinsic::exp:
317 case Intrinsic::exp2:
318 case Intrinsic::pow:
319 case Intrinsic::sin:
320 case Intrinsic::cos:
321 return true;
322 case Intrinsic::copysign:
323 if (CI->getArgOperand(0)->getType()->getScalarType()->
324 isPPC_FP128Ty())
325 return true;
326 else
327 continue; // ISD::FCOPYSIGN is never a library call.
328 case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
329 case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
330 case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
331 case Intrinsic::trunc: Opcode = ISD::FTRUNC; break;
332 case Intrinsic::rint: Opcode = ISD::FRINT; break;
333 case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
334 case Intrinsic::round: Opcode = ISD::FROUND; break;
335 case Intrinsic::minnum: Opcode = ISD::FMINNUM; break;
336 case Intrinsic::maxnum: Opcode = ISD::FMAXNUM; break;
337 case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO; break;
338 case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO; break;
339 }
340 }
341
342 // PowerPC does not use [US]DIVREM or other library calls for
343 // operations on regular types which are not otherwise library calls
344 // (i.e. soft float or atomics). If adapting for targets that do,
345 // additional care is required here.
346
347 LibFunc Func;
348 if (!F->hasLocalLinkage() && F->hasName() && LibInfo &&
349 LibInfo->getLibFunc(F->getName(), Func) &&
350 LibInfo->hasOptimizedCodeGen(Func)) {
351 // Non-read-only functions are never treated as intrinsics.
352 if (!CI->onlyReadsMemory())
353 return true;
354
355 // Conversion happens only for FP calls.
356 if (!CI->getArgOperand(0)->getType()->isFloatingPointTy())
357 return true;
358
359 switch (Func) {
360 default: return true;
361 case LibFunc_copysign:
362 case LibFunc_copysignf:
363 continue; // ISD::FCOPYSIGN is never a library call.
364 case LibFunc_copysignl:
365 return true;
366 case LibFunc_fabs:
367 case LibFunc_fabsf:
368 case LibFunc_fabsl:
369 continue; // ISD::FABS is never a library call.
370 case LibFunc_sqrt:
371 case LibFunc_sqrtf:
372 case LibFunc_sqrtl:
373 Opcode = ISD::FSQRT; break;
374 case LibFunc_floor:
375 case LibFunc_floorf:
376 case LibFunc_floorl:
377 Opcode = ISD::FFLOOR; break;
378 case LibFunc_nearbyint:
379 case LibFunc_nearbyintf:
380 case LibFunc_nearbyintl:
381 Opcode = ISD::FNEARBYINT; break;
382 case LibFunc_ceil:
383 case LibFunc_ceilf:
384 case LibFunc_ceill:
385 Opcode = ISD::FCEIL; break;
386 case LibFunc_rint:
387 case LibFunc_rintf:
388 case LibFunc_rintl:
389 Opcode = ISD::FRINT; break;
390 case LibFunc_round:
391 case LibFunc_roundf:
392 case LibFunc_roundl:
393 Opcode = ISD::FROUND; break;
394 case LibFunc_trunc:
395 case LibFunc_truncf:
396 case LibFunc_truncl:
397 Opcode = ISD::FTRUNC; break;
398 case LibFunc_fmin:
399 case LibFunc_fminf:
400 case LibFunc_fminl:
401 Opcode = ISD::FMINNUM; break;
402 case LibFunc_fmax:
403 case LibFunc_fmaxf:
404 case LibFunc_fmaxl:
405 Opcode = ISD::FMAXNUM; break;
406 }
407 }
408
409 if (Opcode) {
410 EVT EVTy =
411 TLI->getValueType(DL, CI->getArgOperand(0)->getType(), true);
412
413 if (EVTy == MVT::Other)
414 return true;
415
416 if (TLI->isOperationLegalOrCustom(Opcode, EVTy))
417 continue;
418 else if (EVTy.isVector() &&
419 TLI->isOperationLegalOrCustom(Opcode, EVTy.getScalarType()))
420 continue;
421
422 return true;
423 }
424 }
425
426 return true;
427 } else if (isa<BinaryOperator>(J) &&
428 J->getType()->getScalarType()->isPPC_FP128Ty()) {
429 // Most operations on ppc_f128 values become calls.
430 return true;
431 } else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
432 isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
433 CastInst *CI = cast<CastInst>(J);
434 if (CI->getSrcTy()->getScalarType()->isPPC_FP128Ty() ||
435 CI->getDestTy()->getScalarType()->isPPC_FP128Ty() ||
436 isLargeIntegerTy(!TM.isPPC64(), CI->getSrcTy()->getScalarType()) ||
437 isLargeIntegerTy(!TM.isPPC64(), CI->getDestTy()->getScalarType()))
438 return true;
439 } else if (isLargeIntegerTy(!TM.isPPC64(),
440 J->getType()->getScalarType()) &&
441 (J->getOpcode() == Instruction::UDiv ||
442 J->getOpcode() == Instruction::SDiv ||
443 J->getOpcode() == Instruction::URem ||
444 J->getOpcode() == Instruction::SRem)) {
445 return true;
446 } else if (!TM.isPPC64() &&
447 isLargeIntegerTy(false, J->getType()->getScalarType()) &&
448 (J->getOpcode() == Instruction::Shl ||
449 J->getOpcode() == Instruction::AShr ||
450 J->getOpcode() == Instruction::LShr)) {
451 // Only on PPC32, for 128-bit integers (specifically not 64-bit
452 // integers), these might be runtime calls.
453 return true;
454 } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
455 // On PowerPC, indirect jumps use the counter register.
456 return true;
457 } else if (SwitchInst *SI = dyn_cast<SwitchInst>(J)) {
458 if (SI->getNumCases() + 1 >= (unsigned)TLI->getMinimumJumpTableEntries())
459 return true;
460 }
461
462 // FREM is always a call.
463 if (J->getOpcode() == Instruction::FRem)
464 return true;
465
466 if (ST->useSoftFloat()) {
467 switch(J->getOpcode()) {
468 case Instruction::FAdd:
469 case Instruction::FSub:
470 case Instruction::FMul:
471 case Instruction::FDiv:
472 case Instruction::FPTrunc:
473 case Instruction::FPExt:
474 case Instruction::FPToUI:
475 case Instruction::FPToSI:
476 case Instruction::UIToFP:
477 case Instruction::SIToFP:
478 case Instruction::FCmp:
479 return true;
480 }
481 }
482
483 for (Value *Operand : J->operands())
484 if (memAddrUsesCTR(Operand))
485 return true;
486 }
487
488 return false;
489 }
490
491 bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
492 AssumptionCache &AC,
493 TargetLibraryInfo *LibInfo,
494 TTI::HardwareLoopInfo &HWLoopInfo) {
495 const PPCTargetMachine &TM = ST->getTargetMachine();
496 TargetSchedModel SchedModel;
497 SchedModel.init(ST);
498
499 // Do not convert small short loops to CTR loop.
500 unsigned ConstTripCount = SE.getSmallConstantTripCount(L);
501 if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
502 SmallPtrSet<const Value *, 4> EphValues;
503 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
504 CodeMetrics Metrics;
505 for (BasicBlock *BB : L->blocks())
506 Metrics.analyzeBasicBlock(BB, *this, EphValues);
507 // 6 is an approximate latency for the mtctr instruction.
508 if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
509 return false;
510 }
511
512 // We don't want to spill/restore the counter register, and so we don't
513 // want to use the counter register if the loop contains calls.
514 for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
515 I != IE; ++I)
516 if (mightUseCTR(*I, LibInfo))
517 return false;
518
519 SmallVector<BasicBlock*, 4> ExitingBlocks;
520 L->getExitingBlocks(ExitingBlocks);
521
522 // If there is an exit edge known to be frequently taken,
523 // we should not transform this loop.
524 for (auto &BB : ExitingBlocks) {
525 Instruction *TI = BB->getTerminator();
526 if (!TI) continue;
527
528 if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
529 uint64_t TrueWeight = 0, FalseWeight = 0;
530 if (!BI->isConditional() ||
531 !BI->extractProfMetadata(TrueWeight, FalseWeight))
532 continue;
533
534 // If the exit path is more frequent than the loop path,
535 // we return here without further analysis for this loop.
536 bool TrueIsExit = !L->contains(BI->getSuccessor(0));
537 if (( TrueIsExit && FalseWeight < TrueWeight) ||
538 (!TrueIsExit && FalseWeight > TrueWeight))
539 return false;
540 }
541 }
542
543 LLVMContext &C = L->getHeader()->getContext();
544 HWLoopInfo.CountType = TM.isPPC64() ?
545 Type::getInt64Ty(C) : Type::getInt32Ty(C);
546 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
547 return true;
548 }
549
206550 void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
207551 TTI::UnrollingPreferences &UP) {
208552 if (ST->getDarwinDirective() == PPC::DIR_A2) {
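To illustrate the small-loop bail-out in isHardwareLoopProfitable above (a hand-written sketch, not a test from this commit): with the default min-ctr-loop-threshold of 4, a loop such as the following, whose constant trip count is 2, is rejected, provided its body also fits within roughly six issue-widths of instructions:

    loop:
      %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
      %i.next = add nuw nsw i64 %i, 1
      %c = icmp ult i64 %i.next, 2
      br i1 %c, label %loop, label %exit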
3232
3333 const PPCSubtarget *getST() const { return ST; }
3434 const PPCTargetLowering *getTLI() const { return TLI; }
35 bool mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo);
3536
3637 public:
3738 explicit PPCTTIImpl(const PPCTargetMachine *TM, const Function &F)
5152 unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
5253
5354 TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
55 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
56 AssumptionCache &AC,
57 TargetLibraryInfo *LibInfo,
58 TTI::HardwareLoopInfo &HWLoopInfo);
5459 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
5560 TTI::UnrollingPreferences &UP);
5661
262262 %8 = sub i64 0, %int_part_ptr.02534
263263 %scevgep5 = getelementptr i8, i8* %call109, i64 %8
264264 %scevgep56 = ptrtoint i8* %scevgep5 to i64
265 call void @llvm.ppc.mtctr.i64(i64 %scevgep56)
265 call void @llvm.set.loop.iterations.i64(i64 %scevgep56)
266266 br label %for.body.116
267267
268268 for.cond.cleanup: ; preds = %if.end.138, %if.end.105
297297 %conv134 = trunc i32 %add133 to i8
298298 %scevgep = getelementptr i8, i8* inttoptr (i64 -1 to i8*), i64 %call109.pn2
299299 store i8 %conv134, i8* %scevgep, align 1, !tbaa !10
300 %12 = call i1 @llvm.ppc.is.decremented.ctr.nonzero()
301 br i1 %12, label %for.body.116, label %for.cond.cleanup.115
300 %12 = call i64 @llvm.loop.dec(i64 %scevgep56, i64 1)
301 %dec.cmp = icmp ne i64 %12, 0
302 br i1 %dec.cmp, label %for.body.116, label %for.cond.cleanup.115
302303
303304 if.then.136: ; preds = %for.cond.cleanup.115
304305 %incdec.ptr137 = getelementptr inbounds i8, i8* %int_part_ptr.0253, i64 -1
322323 declare i8* @memcpy(i8*, i8* nocapture readonly, i64) #1
323324
324325 ; Function Attrs: nounwind
325 declare void @llvm.ppc.mtctr.i64(i64) #0
326
327 ; Function Attrs: nounwind
328 declare i1 @llvm.ppc.is.decremented.ctr.nonzero() #0
326 declare void @llvm.set.loop.iterations.i64(i64) #0
327
328 ; Function Attrs: nounwind
329 declare i64 @llvm.loop.dec(i64, i64) #0
329330
330331 attributes #0 = { nounwind }
331332 attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
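Note the shape of the replacement in this test: the new intrinsic consumes the remaining element count plus the per-iteration step and returns the updated count, with the back-edge branch driven by a separate icmp ne. This is the register-updating flavour of loop decrement; the i1-returning @llvm.loop.decrement form, used when the counter lives in dedicated hardware, appears in the CHECK-DEC patterns of the next test.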
0 ; Test pass name: ppc-ctr-loops.
1 ; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-before=ppc-ctr-loops -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-BEFORE-CTR-LOOPS
2 ; STOP-BEFORE-CTR-LOOPS-NOT: -ppc-ctr-loops
3 ; STOP-BEFORE-CTR-LOOPS-NOT: "ppc-ctr-loops" pass is not registered.
4 ; STOP-BEFORE-CTR-LOOPS-NOT: PowerPC CTR Loops
5
6 ; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-after=ppc-ctr-loops -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-AFTER-CTR-LOOPS
7 ; STOP-AFTER-CTR-LOOPS: -ppc-ctr-loops
8 ; STOP-AFTER-CTR-LOOPS-NOT: "ppc-ctr-loops" pass is not registered.
9 ; STOP-AFTER-CTR-LOOPS: PowerPC CTR Loops
10
11
12 ; Test pass name: ppc-loop-preinc-prep.
131 ; RUN: llc -mtriple=powerpc64le-unknown-unknown < %s -debug-pass=Structure -stop-before=ppc-loop-preinc-prep -o /dev/null 2>&1 | FileCheck %s -check-prefix=STOP-BEFORE-LOOP-PREINC-PREP
142 ; STOP-BEFORE-LOOP-PREINC-PREP-NOT: -ppc-loop-preinc-prep
0 ; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEC
1 ; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-phi=true -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-REGDEC
2 ; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-nested-hardware-loop=true -S %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEC --check-prefix=CHECK-NESTED
3
4 ; CHECK-LABEL: while_lt
5 define void @while_lt(i32 %i, i32 %N, i32* nocapture %A) {
6 entry:
7 %cmp4 = icmp ult i32 %i, %N
8 br i1 %cmp4, label %while.body, label %while.end
9
10 ; CHECK: while.body.preheader:
11 ; CHECK: [[COUNT:%[^ ]+]] = sub i32 %N, %i
12 ; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
13 ; CHECK: br label %while.body
14
15 ; CHECK-REGDEC: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
16 ; CHECK-REGDEC: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
17 ; CHECK-REGDEC: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
18 ; CHECK-REGDEC: br i1 [[CMP]], label %while.body, label %while.end
19
20 ; CHECK-DEC: [[LOOP_DEC:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
21 ; CHECK-DEC: br i1 [[LOOP_DEC]], label %while.body, label %while.end
22
23 while.body:
24 %i.addr.05 = phi i32 [ %inc, %while.body ], [ %i, %entry ]
25 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.addr.05
26 store i32 %i.addr.05, i32* %arrayidx, align 4
27 %inc = add nuw i32 %i.addr.05, 1
28 %exitcond = icmp eq i32 %inc, %N
29 br i1 %exitcond, label %while.end, label %while.body
30
31 while.end:
32 ret void
33 }
34
35 ; CHECK-LABEL: while_gt
36 ; CHECK: while.body.preheader:
37 ; CHECK: [[COUNT:%[^ ]+]] = sub i32 %i, %N
38 ; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
39 ; CHECK: br label %while.body
40
41 ; CHECK-REGDEC: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
42 ; CHECK-REGDEC: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
43 ; CHECK-REGDEC: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
44 ; CHECK-REGDEC: br i1 [[CMP]], label %while.body, label %while.end
45
46 ; CHECK-DEC: [[LOOP_DEC:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
47 ; CHECK-DEC: br i1 [[LOOP_DEC]], label %while.body, label %while.end
48
49 define void @while_gt(i32 %i, i32 %N, i32* nocapture %A) {
50 entry:
51 %cmp4 = icmp sgt i32 %i, %N
52 br i1 %cmp4, label %while.body, label %while.end
53
54 while.body:
55 %i.addr.05 = phi i32 [ %dec, %while.body ], [ %i, %entry ]
56 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.addr.05
57 store i32 %i.addr.05, i32* %arrayidx, align 4
58 %dec = add nsw i32 %i.addr.05, -1
59 %cmp = icmp sgt i32 %dec, %N
60 br i1 %cmp, label %while.body, label %while.end
61
62 while.end:
63 ret void
64 }
65
66 ; CHECK-LABEL: while_gte
67 ; CHECK: while.body.preheader:
68 ; CHECK: [[ADD:%[^ ]+]] = add i32 %i, 1
69 ; CHECK: [[SEL:%[^ ]+]] = icmp slt i32 %N, %i
70 ; CHECK: [[MIN:%[^ ]+]] = select i1 [[SEL]], i32 %N, i32 %i
71 ; CHECK: [[COUNT:%[^ ]+]] = sub i32 [[ADD]], [[MIN]]
72 ; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]])
73 ; CHECK: br label %while.body
74
75 ; CHECK-REGDEC: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ]
76 ; CHECK-REGDEC: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
77 ; CHECK-REGDEC: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
78 ; CHECK-REGDEC: br i1 [[CMP]], label %while.body, label %while.end
79
80 ; CHECK-DEC: [[LOOP_DEC:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
81 ; CHECK-DEC: br i1 [[LOOP_DEC]], label %while.body, label %while.end
82
83 define void @while_gte(i32 %i, i32 %N, i32* nocapture %A) {
84 entry:
85 %cmp4 = icmp slt i32 %i, %N
86 br i1 %cmp4, label %while.end, label %while.body
87
88 while.body:
89 %i.addr.05 = phi i32 [ %dec, %while.body ], [ %i, %entry ]
90 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.addr.05
91 store i32 %i.addr.05, i32* %arrayidx, align 4
92 %dec = add nsw i32 %i.addr.05, -1
93 %cmp = icmp sgt i32 %i.addr.05, %N
94 br i1 %cmp, label %while.body, label %while.end
95
96 while.end:
97 ret void
98 }
99
100 ; CHECK-LABEL: nested
101 ; CHECK-NESTED: call void @llvm.set.loop.iterations.i32(i32 %N)
102 ; CHECK-NESTED: br label %while.cond1.preheader.us
103
104 ; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N)
105 ; CHECK: br label %while.body3.us
106
107 ; CHECK-DEC: [[LOOP_DEC:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
108
109 ; CHECK-REGDEC: [[REM:%[^ ]+]] = phi i32 [ %N, %while.cond1.preheader.us ], [ [[LOOP_DEC:%[^ ]+]], %while.body3.us ]
110 ; CHECK-REGDEC: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1)
111 ; CHECK-REGDEC: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0
112 ; CHECK-REGDEC: br i1 [[CMP]], label %while.body3.us, label %while.cond1.while.end_crit_edge.us
113
114 ; CHECK-NESTED: [[LOOP_DEC1:%[^ ]+]] = call i1 @llvm.loop.decrement.i32(i32 1)
115 ; CHECK-NESTED: br i1 [[LOOP_DEC1]], label %while.cond1.preheader.us, label %while.end7
116
117 define void @nested(i32* nocapture %A, i32 %N) {
118 entry:
119 %cmp20 = icmp eq i32 %N, 0
120 br i1 %cmp20, label %while.end7, label %while.cond1.preheader.us
121
122 while.cond1.preheader.us:
123 %i.021.us = phi i32 [ %inc6.us, %while.cond1.while.end_crit_edge.us ], [ 0, %entry ]
124 %mul.us = mul i32 %i.021.us, %N
125 br label %while.body3.us
126
127 while.body3.us:
128 %j.019.us = phi i32 [ 0, %while.cond1.preheader.us ], [ %inc.us, %while.body3.us ]
129 %add.us = add i32 %j.019.us, %mul.us
130 %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %add.us
131 store i32 %add.us, i32* %arrayidx.us, align 4
132 %inc.us = add nuw i32 %j.019.us, 1
133 %exitcond = icmp eq i32 %inc.us, %N
134 br i1 %exitcond, label %while.cond1.while.end_crit_edge.us, label %while.body3.us
135
136 while.cond1.while.end_crit_edge.us:
137 %inc6.us = add nuw i32 %i.021.us, 1
138 %exitcond23 = icmp eq i32 %inc6.us, %N
139 br i1 %exitcond23, label %while.end7, label %while.cond1.preheader.us
140
141 while.end7:
142 ret void
143 }
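Taken together, the CHECK-DEC and CHECK-REGDEC patterns above pin down two equivalent shapes for the same counted loop. A small, self-contained C++ model of those semantics (illustrative only; HWLoopModel and its members are names made up for this sketch):

#include <cassert>

// Models the two decrement flavours checked above. loopDecrement mirrors
// the i1 @llvm.loop.decrement.i32(i32 step) form: the counter is implicit
// and the return value feeds the back-edge branch directly.
// loopDecrementReg mirrors @llvm.loop.decrement.reg.i32.i32.i32: the
// remaining count is an explicit value (the phi in the CHECK-REGDEC lines)
// and the branch comes from a separate 'icmp ne 0'.
struct HWLoopModel {
  unsigned Count = 0;

  void setLoopIterations(unsigned N) { Count = N; } // set.loop.iterations

  bool loopDecrement(unsigned Step) {               // loop.decrement
    Count -= Step;
    return Count != 0;                              // branch condition
  }

  static unsigned loopDecrementReg(unsigned Rem, unsigned Step) {
    return Rem - Step;                              // caller does icmp ne 0
  }
};

int main() {
  HWLoopModel M;
  M.setLoopIterations(3);
  unsigned BodyRuns = 0;
  do { ++BodyRuns; } while (M.loopDecrement(1));
  assert(BodyRuns == 3); // body executes exactly the iteration count

  unsigned Rem = 3;
  BodyRuns = 0;
  do {
    ++BodyRuns;
    Rem = HWLoopModel::loopDecrementReg(Rem, 1);
  } while (Rem != 0);
  assert(BodyRuns == 3); // both flavours agree on the trip count
  return 0;
}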
0 ; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -S %s -o - | FileCheck %s
1 ; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-hardware-loop-phi=true -S %s -o - | FileCheck %s
2 ; RUN: opt -hardware-loops -force-hardware-loops=true -hardware-loop-decrement=1 -hardware-loop-counter-bitwidth=32 -force-nested-hardware-loop=true -S %s -o - | FileCheck %s
3
4 ; CHECK-LABEL: float_counter
5 ; CHECK-NOT: set.loop.iterations
6 ; CHECK-NOT: loop.decrement
7 define void @float_counter(i32* nocapture %A, float %N) {
8 entry:
9 %cmp6 = fcmp ogt float %N, 0.000000e+00
10 br i1 %cmp6, label %while.body, label %while.end
11
12 while.body:
13 %i.07 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
14 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.07
15 store i32 %i.07, i32* %arrayidx, align 4
16 %inc = add i32 %i.07, 1
17 %conv = uitofp i32 %inc to float
18 %cmp = fcmp olt float %conv, %N
19 br i1 %cmp, label %while.body, label %while.end
20
21 while.end:
22 ret void
23 }
24
25 ; CHECK-LABEL: variant_counter
26 ; CHECK-NOT: set.loop.iterations
27 ; CHECK-NOT: loop.decrement
28 define void @variant_counter(i32* nocapture %A, i32* nocapture readonly %B) {
29 entry:
30 %0 = load i32, i32* %B, align 4
31 %cmp7 = icmp eq i32 %0, 0
32 br i1 %cmp7, label %while.end, label %while.body
33
34 while.body:
35 %i.08 = phi i32 [ %inc, %while.body ], [ 0, %entry ]
36 %arrayidx1 = getelementptr inbounds i32, i32* %A, i32 %i.08
37 store i32 %i.08, i32* %arrayidx1, align 4
38 %inc = add nuw i32 %i.08, 1
39 %arrayidx = getelementptr inbounds i32, i32* %B, i32 %inc
40 %1 = load i32, i32* %arrayidx, align 4
41 %cmp = icmp ult i32 %inc, %1
42 br i1 %cmp, label %while.body, label %while.end
43
44 while.end:
45 ret void
46 }
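Both functions here are negative tests: the pass needs ScalarEvolution to produce an integer trip count before the loop runs, and neither a floating-point induction variable nor a bound reloaded from %B on every iteration yields a computable exit count, so no set.loop.iterations or loop.decrement calls may be inserted.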
307307 initializeVectorization(*Registry);
308308 initializeScalarizeMaskedMemIntrinPass(*Registry);
309309 initializeExpandReductionsPass(*Registry);
310 initializeHardwareLoopsPass(*Registry);
310311
311312 // Initialize debugging passes.
312313 initializeScavengerTestPass(*Registry);
527527 initializeExpandReductionsPass(Registry);
528528 initializeWasmEHPreparePass(Registry);
529529 initializeWriteBitcodePassPass(Registry);
530 initializeHardwareLoopsPass(Registry);
530531
531532 #ifdef LINK_POLLY_INTO_TOOLS
532533 polly::initializePollyPasses(Registry);