llvm.org GIT mirror llvm / bebe48d
Add a loop rerolling pass This adds a loop rerolling pass: the opposite of (partial) loop unrolling. The transformation aims to take loops like this: for (int i = 0; i < 3200; i += 5) { a[i] += alpha * b[i]; a[i + 1] += alpha * b[i + 1]; a[i + 2] += alpha * b[i + 2]; a[i + 3] += alpha * b[i + 3]; a[i + 4] += alpha * b[i + 4]; } and turn them into this: for (int i = 0; i < 3200; ++i) { a[i] += alpha * b[i]; } and loops like this: for (int i = 0; i < 500; ++i) { x[3*i] = foo(0); x[3*i+1] = foo(0); x[3*i+2] = foo(0); } and turn them into this: for (int i = 0; i < 1500; ++i) { x[i] = foo(0); } There are two motivations for this transformation: 1. Code-size reduction (especially relevant, obviously, when compiling for code size). 2. Providing greater choice to the loop vectorizer (and generic unroller) to choose the unrolling factor (and a better ability to vectorize). The loop vectorizer can take vector lengths and register pressure into account when choosing an unrolling factor, for example, and a pre-unrolled loop limits that choice. This is especially problematic if the manual unrolling was optimized for a machine different from the current target. The current implementation is limited to single basic-block loops only. The rerolling recognition should work regardless of how the loop iterations are intermixed within the loop body (subject to dependency and side-effect constraints), but the significant restriction is that the order of the instructions in each iteration must be identical. This seems sufficient to capture all current use cases. This pass is not currently enabled by default at any optimization level. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@194939 91177308-0d34-0410-b5e6-96231b3b80d8 Hal Finkel 5 years ago
10 changed file(s) with 1630 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
161161 void initializeLoopSimplifyPass(PassRegistry&);
162162 void initializeLoopStrengthReducePass(PassRegistry&);
163163 void initializeGlobalMergePass(PassRegistry&);
164 void initializeLoopRerollPass(PassRegistry&);
164165 void initializeLoopUnrollPass(PassRegistry&);
165166 void initializeLoopUnswitchPass(PassRegistry&);
166167 void initializeLoopIdiomRecognizePass(PassRegistry&);
9090 (void) llvm::createLoopExtractorPass();
9191 (void) llvm::createLoopSimplifyPass();
9292 (void) llvm::createLoopStrengthReducePass();
93 (void) llvm::createLoopRerollPass();
9394 (void) llvm::createLoopUnrollPass();
9495 (void) llvm::createLoopUnswitchPass();
9596 (void) llvm::createLoopIdiomPass();
144144
145145 //===----------------------------------------------------------------------===//
146146 //
147 // LoopReroll - This pass is a simple loop rerolling pass.
148 //
149 Pass *createLoopRerollPass();
150
151 //===----------------------------------------------------------------------===//
152 //
147153 // LoopRotate - This pass is a simple loop rotating pass.
148154 //
149155 Pass *createLoopRotatePass();
6363
6464 /** See llvm::createLoopRotatePass function. */
6565 void LLVMAddLoopRotatePass(LLVMPassManagerRef PM);
66
67 /** See llvm::createLoopRerollPass function. */
68 void LLVMAddLoopRerollPass(LLVMPassManagerRef PM);
6669
6770 /** See llvm::createLoopUnrollPass function. */
6871 void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM);
5353 cl::init(true), cl::Hidden,
5454 cl::desc("Enable the new, experimental SROA pass"));
5555
56 static cl::opt
57 RunLoopRerolling("reroll-loops", cl::Hidden,
58 cl::desc("Run the loop rerolling pass"));
59
5660 PassManagerBuilder::PassManagerBuilder() {
5761 OptLevel = 2;
5862 SizeLevel = 0;
215219
216220 addExtensionsToPM(EP_ScalarOptimizerLate, MPM);
217221
222 if (RunLoopRerolling)
223 MPM.add(createLoopRerollPass());
218224 if (SLPVectorize)
219225 MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
220226
1515 LoopInstSimplify.cpp
1616 LoopRotation.cpp
1717 LoopStrengthReduce.cpp
18 LoopRerollPass.cpp
1819 LoopUnrollPass.cpp
1920 LoopUnswitch.cpp
2021 LowerAtomic.cpp
0 //===-- LoopReroll.cpp - Loop rerolling pass ------------------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass implements a simple loop reroller.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #define DEBUG_TYPE "loop-reroll"
14 #include "llvm/Transforms/Scalar.h"
15 #include "llvm/ADT/SmallSet.h"
16 #include "llvm/ADT/Statistic.h"
17 #include "llvm/ADT/STLExtras.h"
18 #include "llvm/Analysis/AliasAnalysis.h"
19 #include "llvm/Analysis/AliasSetTracker.h"
20 #include "llvm/Analysis/LoopPass.h"
21 #include "llvm/Analysis/ScalarEvolution.h"
22 #include "llvm/Analysis/ScalarEvolutionExpander.h"
23 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
24 #include "llvm/Analysis/ValueTracking.h"
25 #include "llvm/IR/DataLayout.h"
26 #include "llvm/IR/IntrinsicInst.h"
27 #include "llvm/Support/CommandLine.h"
28 #include "llvm/Support/Debug.h"
29 #include "llvm/Support/raw_ostream.h"
30 #include "llvm/Target/TargetLibraryInfo.h"
31 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
32 #include "llvm/Transforms/Utils/Local.h"
33 #include "llvm/Transforms/Utils/LoopUtils.h"
34
35 using namespace llvm;
36
37 STATISTIC(NumRerolledLoops, "Number of rerolled loops");
38
39 static cl::opt
40 MaxInc("max-reroll-increment", cl::init(2048), cl::Hidden,
41 cl::desc("The maximum increment for loop rerolling"));
42
43 // This loop re-rolling transformation aims to transform loops like this:
44 //
45 // int foo(int a);
46 // void bar(int *x) {
47 // for (int i = 0; i < 500; i += 3) {
48 // foo(i);
49 // foo(i+1);
50 // foo(i+2);
51 // }
52 // }
53 //
54 // into a loop like this:
55 //
56 // void bar(int *x) {
57 // for (int i = 0; i < 500; ++i)
58 // foo(i);
59 // }
60 //
61 // It does this by looking for loops that, besides the latch code, are composed
62 // of isomorphic DAGs of instructions, with each DAG rooted at some increment
63 // to the induction variable, and where each DAG is isomorphic to the DAG
64 // rooted at the induction variable (excepting the sub-DAGs which root the
65 // other induction-variable increments). In other words, we're looking for loop
66 // bodies of the form:
67 //
68 // %iv = phi [ (preheader, ...), (body, %iv.next) ]
69 // f(%iv)
70 // %iv.1 = add %iv, 1 <-- a root increment
71 // f(%iv.1)
72 // %iv.2 = add %iv, 2 <-- a root increment
73 // f(%iv.2)
74 // %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
75 // f(%iv.scale_m_1)
76 // ...
77 // %iv.next = add %iv, scale
78 // %cmp = icmp(%iv, ...)
79 // br %cmp, header, exit
80 //
81 // where each f(i) is a set of instructions that, collectively, are a function
82 // only of i (and other loop-invariant values).
83 //
84 // As a special case, we can also reroll loops like this:
85 //
86 // int foo(int);
87 // void bar(int *x) {
88 // for (int i = 0; i < 500; ++i) {
89 // x[3*i] = foo(0);
90 // x[3*i+1] = foo(0);
91 // x[3*i+2] = foo(0);
92 // }
93 // }
94 //
95 // into this:
96 //
97 // void bar(int *x) {
98 // for (int i = 0; i < 1500; ++i)
99 // x[i] = foo(0);
100 // }
101 //
102 // in which case, we're looking for inputs like this:
103 //
104 // %iv = phi [ (preheader, ...), (body, %iv.next) ]
105 // %scaled.iv = mul %iv, scale
106 // f(%scaled.iv)
107 // %scaled.iv.1 = add %scaled.iv, 1
108 // f(%scaled.iv.1)
109 // %scaled.iv.2 = add %scaled.iv, 2
110 // f(%scaled.iv.2)
111 // %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
112 // f(%scaled.iv.scale_m_1)
113 // ...
114 // %iv.next = add %iv, 1
115 // %cmp = icmp(%iv, ...)
116 // br %cmp, header, exit
117
118 namespace {
119 class LoopReroll : public LoopPass {
120 public:
121 static char ID; // Pass ID, replacement for typeid
122 LoopReroll() : LoopPass(ID) {
123 initializeLoopRerollPass(*PassRegistry::getPassRegistry());
124 }
125
126 bool runOnLoop(Loop *L, LPPassManager &LPM);
127
128 virtual void getAnalysisUsage(AnalysisUsage &AU) const {
129 AU.addRequired();
130 AU.addRequired();
131 AU.addPreserved();
132 AU.addRequired();
133 AU.addPreserved();
134 AU.addRequired();
135 AU.addRequired();
136 }
137
138 protected:
139 AliasAnalysis *AA;
140 LoopInfo *LI;
141 ScalarEvolution *SE;
142 DataLayout *DL;
143 TargetLibraryInfo *TLI;
144 DominatorTree *DT;
145
146 typedef SmallVector SmallInstructionVector;
147 typedef SmallSet SmallInstructionSet;
148
149 // A chain of isomorphic instructions, indentified by a single-use PHI,
150 // representing a reduction. Only the last value may be used outside the
151 // loop.
152 struct SimpleLoopReduction {
153 SimpleLoopReduction(Instruction *P, Loop *L)
154 : Valid(false), Instructions(1, P) {
155 assert(isa(P) && "First reduction instruction must be a PHI");
156 add(L);
157 }
158
159 bool valid() const {
160 return Valid;
161 }
162
163 Instruction *getPHI() const {
164 assert(Valid && "Using invalid reduction");
165 return Instructions.front();
166 }
167
168 Instruction *getReducedValue() const {
169 assert(Valid && "Using invalid reduction");
170 return Instructions.back();
171 }
172
173 Instruction *get(size_t i) const {
174 assert(Valid && "Using invalid reduction");
175 return Instructions[i+1];
176 }
177
178 Instruction *operator [] (size_t i) const { return get(i); }
179
180 // The size, ignoring the initial PHI.
181 size_t size() const {
182 assert(Valid && "Using invalid reduction");
183 return Instructions.size()-1;
184 }
185
186 typedef SmallInstructionVector::iterator iterator;
187 typedef SmallInstructionVector::const_iterator const_iterator;
188
189 iterator begin() {
190 assert(Valid && "Using invalid reduction");
191 return llvm::next(Instructions.begin());
192 }
193
194 const_iterator begin() const {
195 assert(Valid && "Using invalid reduction");
196 return llvm::next(Instructions.begin());
197 }
198
199 iterator end() { return Instructions.end(); }
200 const_iterator end() const { return Instructions.end(); }
201
202 protected:
203 bool Valid;
204 SmallInstructionVector Instructions;
205
206 void add(Loop *L);
207 };
208
209 // The set of all reductions, and state tracking of possible reductions
210 // during loop instruction processing.
211 struct ReductionTracker {
212 typedef SmallVector SmallReductionVector;
213
214 // Add a new possible reduction.
215 void addSLR(SimpleLoopReduction &SLR) {
216 PossibleReds.push_back(SLR);
217 }
218
219 // Setup to track possible reductions corresponding to the provided
220 // rerolling scale. Only reductions with a number of non-PHI instructions
221 // that is divisible by the scale are considered. Three instructions sets
222 // are filled in:
223 // - A set of all possible instructions in eligible reductions.
224 // - A set of all PHIs in eligible reductions
225 // - A set of all reduced values (last instructions) in eligible reductions.
226 void restrictToScale(uint64_t Scale,
227 SmallInstructionSet &PossibleRedSet,
228 SmallInstructionSet &PossibleRedPHISet,
229 SmallInstructionSet &PossibleRedLastSet) {
230 PossibleRedIdx.clear();
231 PossibleRedIter.clear();
232 Reds.clear();
233
234 for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i)
235 if (PossibleReds[i].size() % Scale == 0) {
236 PossibleRedLastSet.insert(PossibleReds[i].getReducedValue());
237 PossibleRedPHISet.insert(PossibleReds[i].getPHI());
238
239 PossibleRedSet.insert(PossibleReds[i].getPHI());
240 PossibleRedIdx[PossibleReds[i].getPHI()] = i;
241 for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(),
242 JE = PossibleReds[i].end(); J != JE; ++J) {
243 PossibleRedSet.insert(*J);
244 PossibleRedIdx[*J] = i;
245 }
246 }
247 }
248
249 // The functions below are used while processing the loop instructions.
250
251 // Are the two instructions both from reductions, and furthermore, from
252 // the same reduction?
253 bool isPairInSame(Instruction *J1, Instruction *J2) {
254 DenseMap::iterator J1I = PossibleRedIdx.find(J1);
255 if (J1I != PossibleRedIdx.end()) {
256 DenseMap::iterator J2I = PossibleRedIdx.find(J2);
257 if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second)
258 return true;
259 }
260
261 return false;
262 }
263
264 // The two provided instructions, the first from the base iteration, and
265 // the second from iteration i, form a matched pair. If these are part of
266 // a reduction, record that fact.
267 void recordPair(Instruction *J1, Instruction *J2, unsigned i) {
268 if (PossibleRedIdx.count(J1)) {
269 assert(PossibleRedIdx.count(J2) &&
270 "Recording reduction vs. non-reduction instruction?");
271
272 PossibleRedIter[J1] = 0;
273 PossibleRedIter[J2] = i;
274
275 int Idx = PossibleRedIdx[J1];
276 assert(Idx == PossibleRedIdx[J2] &&
277 "Recording pair from different reductions?");
278 Reds.insert(PossibleRedIdx[J1]);
279 }
280 }
281
282 // The functions below can be called after we've finished processing all
283 // instructions in the loop, and we know which reductions were selected.
284
285 // Is the provided instruction the PHI of a reduction selected for
286 // rerolling?
287 bool isSelectedPHI(Instruction *J) {
288 if (!isa(J))
289 return false;
290
291 for (DenseSet::iterator RI = Reds.begin(), RIE = Reds.end();
292 RI != RIE; ++RI) {
293 int i = *RI;
294 if (cast(J) == PossibleReds[i].getPHI())
295 return true;
296 }
297
298 return false;
299 }
300
301 bool validateSelected();
302 void replaceSelected();
303
304 protected:
305 // The vector of all possible reductions (for any scale).
306 SmallReductionVector PossibleReds;
307
308 DenseMap PossibleRedIdx;
309 DenseMap PossibleRedIter;
310 DenseSet Reds;
311 };
312
313 void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs);
314 void collectPossibleReductions(Loop *L,
315 ReductionTracker &Reductions);
316 void collectInLoopUserSet(Loop *L,
317 const SmallInstructionVector &Roots,
318 const SmallInstructionSet &Exclude,
319 const SmallInstructionSet &Final,
320 DenseSet &Users);
321 void collectInLoopUserSet(Loop *L,
322 Instruction * Root,
323 const SmallInstructionSet &Exclude,
324 const SmallInstructionSet &Final,
325 DenseSet &Users);
326 bool findScaleFromMul(Instruction *RealIV, uint64_t &Scale,
327 Instruction *&IV,
328 SmallInstructionVector &LoopIncs);
329 bool collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale, Instruction *IV,
330 SmallVector &Roots,
331 SmallInstructionSet &AllRoots,
332 SmallInstructionVector &LoopIncs);
333 bool reroll(Instruction *IV, Loop *L, BasicBlock *Header, const SCEV *IterCount,
334 ReductionTracker &Reductions);
335 };
336 }
337
338 char LoopReroll::ID = 0;
339 INITIALIZE_PASS_BEGIN(LoopReroll, "loop-reroll", "Reroll loops", false, false)
340 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
341 INITIALIZE_PASS_DEPENDENCY(LoopInfo)
342 INITIALIZE_PASS_DEPENDENCY(DominatorTree)
343 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
344 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
345 INITIALIZE_PASS_END(LoopReroll, "loop-reroll", "Reroll loops", false, false)
346
347 Pass *llvm::createLoopRerollPass() {
348 return new LoopReroll;
349 }
350
351 // Returns true if the provided instruction is used outside the given loop.
352 // This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in
353 // non-loop blocks to be outside the loop.
354 static bool hasUsesOutsideLoop(Instruction *I, Loop *L) {
355 for (Value::use_iterator UI = I->use_begin(),
356 UIE = I->use_end(); UI != UIE; ++UI) {
357 Instruction *User = cast(*UI);
358 if (!L->contains(User))
359 return true;
360 }
361
362 return false;
363 }
364
365 // Collect the list of loop induction variables with respect to which it might
366 // be possible to reroll the loop.
367 void LoopReroll::collectPossibleIVs(Loop *L,
368 SmallInstructionVector &PossibleIVs) {
369 BasicBlock *Header = L->getHeader();
370 for (BasicBlock::iterator I = Header->begin(),
371 IE = Header->getFirstInsertionPt(); I != IE; ++I) {
372 if (!isa(I))
373 continue;
374 if (!I->getType()->isIntegerTy())
375 continue;
376
377 if (const SCEVAddRecExpr *PHISCEV =
378 dyn_cast(SE->getSCEV(I))) {
379 if (PHISCEV->getLoop() != L)
380 continue;
381 if (!PHISCEV->isAffine())
382 continue;
383 if (const SCEVConstant *IncSCEV =
384 dyn_cast(PHISCEV->getStepRecurrence(*SE))) {
385 if (!IncSCEV->getValue()->getValue().isStrictlyPositive())
386 continue;
387 if (IncSCEV->getValue()->uge(MaxInc))
388 continue;
389
390 DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " <<
391 *PHISCEV << "\n");
392 PossibleIVs.push_back(I);
393 }
394 }
395 }
396 }
397
398 // Add the remainder of the reduction-variable chain to the instruction vector
399 // (the initial PHINode has already been added). If successful, the object is
400 // marked as valid.
401 void LoopReroll::SimpleLoopReduction::add(Loop *L) {
402 assert(!Valid && "Cannot add to an already-valid chain");
403
404 // The reduction variable must be a chain of single-use instructions
405 // (including the PHI), except for the last value (which is used by the PHI
406 // and also outside the loop).
407 Instruction *C = Instructions.front();
408
409 do {
410 C = cast(*C->use_begin());
411 if (C->hasOneUse()) {
412 if (!C->isBinaryOp())
413 return;
414
415 if (!(isa(Instructions.back()) ||
416 C->isSameOperationAs(Instructions.back())))
417 return;
418
419 Instructions.push_back(C);
420 }
421 } while (C->hasOneUse());
422
423 if (Instructions.size() < 2 ||
424 !C->isSameOperationAs(Instructions.back()) ||
425 C->use_begin() == C->use_end())
426 return;
427
428 // C is now the (potential) last instruction in the reduction chain.
429 for (Value::use_iterator UI = C->use_begin(), UIE = C->use_end();
430 UI != UIE; ++UI) {
431 // The only in-loop user can be the initial PHI.
432 if (L->contains(cast(*UI)))
433 if (cast(*UI ) != Instructions.front())
434 return;
435 }
436
437 Instructions.push_back(C);
438 Valid = true;
439 }
440
441 // Collect the vector of possible reduction variables.
442 void LoopReroll::collectPossibleReductions(Loop *L,
443 ReductionTracker &Reductions) {
444 BasicBlock *Header = L->getHeader();
445 for (BasicBlock::iterator I = Header->begin(),
446 IE = Header->getFirstInsertionPt(); I != IE; ++I) {
447 if (!isa(I))
448 continue;
449 if (!I->getType()->isSingleValueType())
450 continue;
451
452 SimpleLoopReduction SLR(I, L);
453 if (!SLR.valid())
454 continue;
455
456 DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with " <<
457 SLR.size() << " chained instructions)\n");
458 Reductions.addSLR(SLR);
459 }
460 }
461
462 // Collect the set of all users of the provided root instruction. This set of
463 // users contains not only the direct users of the root instruction, but also
464 // all users of those users, and so on. There are two exceptions:
465 //
466 // 1. Instructions in the set of excluded instructions are never added to the
467 // use set (even if they are users). This is used, for example, to exclude
468 // including root increments in the use set of the primary IV.
469 //
470 // 2. Instructions in the set of final instructions are added to the use set
471 // if they are users, but their users are not added. This is used, for
472 // example, to prevent a reduction update from forcing all later reduction
473 // updates into the use set.
474 void LoopReroll::collectInLoopUserSet(Loop *L,
475 Instruction *Root, const SmallInstructionSet &Exclude,
476 const SmallInstructionSet &Final,
477 DenseSet &Users) {
478 SmallInstructionVector Queue(1, Root);
479 while (!Queue.empty()) {
480 Instruction *I = Queue.pop_back_val();
481 if (!Users.insert(I).second)
482 continue;
483
484 if (!Final.count(I))
485 for (Value::use_iterator UI = I->use_begin(),
486 UIE = I->use_end(); UI != UIE; ++UI) {
487 Instruction *User = cast(*UI);
488 if (PHINode *PN = dyn_cast(User)) {
489 // Ignore "wrap-around" uses to PHIs of this loop's header.
490 if (PN->getIncomingBlock(UI) == L->getHeader())
491 continue;
492 }
493
494 if (L->contains(User) && !Exclude.count(User)) {
495 Queue.push_back(User);
496 }
497 }
498
499 // We also want to collect single-user "feeder" values.
500 for (User::op_iterator OI = I->op_begin(),
501 OIE = I->op_end(); OI != OIE; ++OI) {
502 if (Instruction *Op = dyn_cast(*OI))
503 if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) &&
504 !Final.count(Op))
505 Queue.push_back(Op);
506 }
507 }
508 }
509
510 // Collect all of the users of all of the provided root instructions (combined
511 // into a single set).
512 void LoopReroll::collectInLoopUserSet(Loop *L,
513 const SmallInstructionVector &Roots,
514 const SmallInstructionSet &Exclude,
515 const SmallInstructionSet &Final,
516 DenseSet &Users) {
517 for (SmallInstructionVector::const_iterator I = Roots.begin(),
518 IE = Roots.end(); I != IE; ++I)
519 collectInLoopUserSet(L, *I, Exclude, Final, Users);
520 }
521
522 static bool isSimpleLoadStore(Instruction *I) {
523 if (LoadInst *LI = dyn_cast(I))
524 return LI->isSimple();
525 if (StoreInst *SI = dyn_cast(I))
526 return SI->isSimple();
527 if (MemIntrinsic *MI = dyn_cast(I))
528 return !MI->isVolatile();
529 return false;
530 }
531
532 // Recognize loops that are setup like this:
533 //
534 // %iv = phi [ (preheader, ...), (body, %iv.next) ]
535 // %scaled.iv = mul %iv, scale
536 // f(%scaled.iv)
537 // %scaled.iv.1 = add %scaled.iv, 1
538 // f(%scaled.iv.1)
539 // %scaled.iv.2 = add %scaled.iv, 2
540 // f(%scaled.iv.2)
541 // %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
542 // f(%scaled.iv.scale_m_1)
543 // ...
544 // %iv.next = add %iv, 1
545 // %cmp = icmp(%iv, ...)
546 // br %cmp, header, exit
547 //
548 // and, if found, set IV = %scaled.iv, and add %iv.next to LoopIncs.
549 bool LoopReroll::findScaleFromMul(Instruction *RealIV, uint64_t &Scale,
550 Instruction *&IV,
551 SmallInstructionVector &LoopIncs) {
552 // This is a special case: here we're looking for all uses (except for
553 // the increment) to be multiplied by a common factor. The increment must
554 // be by one. This is to capture loops like:
555 // for (int i = 0; i < 500; ++i) {
556 // foo(3*i); foo(3*i+1); foo(3*i+2);
557 // }
558 if (RealIV->getNumUses() != 2)
559 return false;
560 const SCEVAddRecExpr *RealIVSCEV = cast(SE->getSCEV(RealIV));
561 Instruction *User1 = cast(*RealIV->use_begin()),
562 *User2 = cast(*llvm::next(RealIV->use_begin()));
563 if (!SE->isSCEVable(User1->getType()) || !SE->isSCEVable(User2->getType()))
564 return false;
565 const SCEVAddRecExpr *User1SCEV =
566 dyn_cast(SE->getSCEV(User1)),
567 *User2SCEV =
568 dyn_cast(SE->getSCEV(User2));
569 if (!User1SCEV || !User1SCEV->isAffine() ||
570 !User2SCEV || !User2SCEV->isAffine())
571 return false;
572
573 // We assume below that User1 is the scale multiply and User2 is the
574 // increment. If this can't be true, then swap them.
575 if (User1SCEV == RealIVSCEV->getPostIncExpr(*SE)) {
576 std::swap(User1, User2);
577 std::swap(User1SCEV, User2SCEV);
578 }
579
580 if (User2SCEV != RealIVSCEV->getPostIncExpr(*SE))
581 return false;
582 assert(User2SCEV->getStepRecurrence(*SE)->isOne() &&
583 "Invalid non-unit step for multiplicative scaling");
584 LoopIncs.push_back(User2);
585
586 if (const SCEVConstant *MulScale =
587 dyn_cast(User1SCEV->getStepRecurrence(*SE))) {
588 // Make sure that both the start and step have the same multiplier.
589 if (RealIVSCEV->getStart()->getType() != MulScale->getType())
590 return false;
591 if (SE->getMulExpr(RealIVSCEV->getStart(), MulScale) !=
592 User1SCEV->getStart())
593 return false;
594
595 ConstantInt *MulScaleCI = MulScale->getValue();
596 if (!MulScaleCI->uge(2) || MulScaleCI->uge(MaxInc))
597 return false;
598 Scale = MulScaleCI->getZExtValue();
599 IV = User1;
600 } else
601 return false;
602
603 DEBUG(dbgs() << "LRR: Found possible scaling " << *User1 << "\n");
604 return true;
605 }
606
607 // Collect all root increments with respect to the provided induction variable
608 // (normally the PHI, but sometimes a multiply). A root increment is an
609 // instruction, normally an add, with a positive constant less than Scale. In a
610 // rerollable loop, each of these increments is the root of an instruction
611 // graph isomorphic to the others. Also, we collect the final induction
612 // increment (the increment equal to the Scale), and its users in LoopIncs.
613 bool LoopReroll::collectAllRoots(Loop *L, uint64_t Inc, uint64_t Scale,
614 Instruction *IV,
615 SmallVector &Roots,
616 SmallInstructionSet &AllRoots,
617 SmallInstructionVector &LoopIncs) {
618 for (Value::use_iterator UI = IV->use_begin(),
619 UIE = IV->use_end(); UI != UIE; ++UI) {
620 Instruction *User = cast(*UI);
621 if (!SE->isSCEVable(User->getType()))
622 continue;
623 if (User->getType() != IV->getType())
624 continue;
625 if (!L->contains(User))
626 continue;
627 if (hasUsesOutsideLoop(User, L))
628 continue;
629
630 if (const SCEVConstant *Diff = dyn_cast(SE->getMinusSCEV(
631 SE->getSCEV(User), SE->getSCEV(IV)))) {
632 uint64_t Idx = Diff->getValue()->getValue().getZExtValue();
633 if (Idx > 0 && Idx < Scale) {
634 Roots[Idx-1].push_back(User);
635 AllRoots.insert(User);
636 } else if (Idx == Scale && Inc > 1) {
637 LoopIncs.push_back(User);
638 }
639 }
640 }
641
642 if (Roots[0].empty())
643 return false;
644 bool AllSame = true;
645 for (unsigned i = 1; i < Scale-1; ++i)
646 if (Roots[i].size() != Roots[0].size()) {
647 AllSame = false;
648 break;
649 }
650
651 if (!AllSame)
652 return false;
653
654 return true;
655 }
656
657 // Validate the selected reductions. All iterations must have an isomorphic
658 // part of the reduction chain and, for non-associative reductions, the chain
659 // entries must appear in order.
660 bool LoopReroll::ReductionTracker::validateSelected() {
661 // For a non-associative reduction, the chain entries must appear in order.
662 for (DenseSet::iterator RI = Reds.begin(), RIE = Reds.end();
663 RI != RIE; ++RI) {
664 int i = *RI;
665 int PrevIter = 0, BaseCount = 0, Count = 0;
666 for (SimpleLoopReduction::iterator J = PossibleReds[i].begin(),
667 JE = PossibleReds[i].end(); J != JE; ++J) {
668 // Note that all instructions in the chain must have been found because
669 // all instructions in the function must have been assigned to some
670 // iteration.
671 int Iter = PossibleRedIter[*J];
672 if (Iter != PrevIter && Iter != PrevIter + 1 &&
673 !PossibleReds[i].getReducedValue()->isAssociative()) {
674 DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: " <<
675 *J << "\n");
676 return false;
677 }
678
679 if (Iter != PrevIter) {
680 if (Count != BaseCount) {
681 DEBUG(dbgs() << "LRR: Iteration " << PrevIter <<
682 " reduction use count " << Count <<
683 " is not equal to the base use count " <<
684 BaseCount << "\n");
685 return false;
686 }
687
688 Count = 0;
689 }
690
691 ++Count;
692 if (Iter == 0)
693 ++BaseCount;
694
695 PrevIter = Iter;
696 }
697 }
698
699 return true;
700 }
701
702 // For all selected reductions, remove all parts except those in the first
703 // iteration (and the PHI). Replace outside uses of the reduced value with uses
704 // of the first-iteration reduced value (in other words, reroll the selected
705 // reductions).
706 void LoopReroll::ReductionTracker::replaceSelected() {
707 // Fixup reductions to refer to the last instruction associated with the
708 // first iteration (not the last).
709 for (DenseSet::iterator RI = Reds.begin(), RIE = Reds.end();
710 RI != RIE; ++RI) {
711 int i = *RI;
712 int j = 0;
713 for (int e = PossibleReds[i].size(); j != e; ++j)
714 if (PossibleRedIter[PossibleReds[i][j]] != 0) {
715 --j;
716 break;
717 }
718
719 // Replace users with the new end-of-chain value.
720 SmallInstructionVector Users;
721 for (Value::use_iterator UI =
722 PossibleReds[i].getReducedValue()->use_begin(),
723 UIE = PossibleReds[i].getReducedValue()->use_end(); UI != UIE; ++UI)
724 Users.push_back(cast(*UI));
725
726 for (SmallInstructionVector::iterator J = Users.begin(),
727 JE = Users.end(); J != JE; ++J)
728 (*J)->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
729 PossibleReds[i][j]);
730 }
731 }
732
733 // Reroll the provided loop with respect to the provided induction variable.
734 // Generally, we're looking for a loop like this:
735 //
736 // %iv = phi [ (preheader, ...), (body, %iv.next) ]
737 // f(%iv)
738 // %iv.1 = add %iv, 1 <-- a root increment
739 // f(%iv.1)
740 // %iv.2 = add %iv, 2 <-- a root increment
741 // f(%iv.2)
742 // %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
743 // f(%iv.scale_m_1)
744 // ...
745 // %iv.next = add %iv, scale
746 // %cmp = icmp(%iv, ...)
747 // br %cmp, header, exit
748 //
749 // Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
750 // instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
751 // be intermixed with each other. The restriction imposed by this algorithm is
752 // that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
753 // etc. be the same.
754 //
755 // First, we collect the use set of %iv, excluding the other increment roots.
756 // This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
757 // times, having collected the use set of f(%iv.(i+1)), during which we:
758 // - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
759 // the next unmatched instruction in f(%iv.(i+1)).
760 // - Ensure that both matched instructions don't have any external users
761 // (with the exception of last-in-chain reduction instructions).
762 // - Track the (aliasing) write set, and other side effects, of all
763 // instructions that belong to future iterations that come before the matched
764 // instructions. If the matched instructions read from that write set, then
765 // f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
766 // f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
767 // if any of these future instructions had side effects (could not be
768 //   speculatively executed), and so do the matched instructions, then we
769 //   cannot reorder those side-effect-producing instructions, and rerolling
770 //   fails.
771 //
772 // Finally, we make sure that all loop instructions are either loop increment
773 // roots, belong to simple latch code, parts of validated reductions, part of
774 // f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions
775 // have been validated), then we reroll the loop.
776 bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
777 const SCEV *IterCount,
778 ReductionTracker &Reductions) {
779 const SCEVAddRecExpr *RealIVSCEV = cast(SE->getSCEV(IV));
780 uint64_t Inc = cast(RealIVSCEV->getOperand(1))->
781 getValue()->getZExtValue();
782 // The collection of loop increment instructions.
783 SmallInstructionVector LoopIncs;
784 uint64_t Scale = Inc;
785
786 // The effective induction variable, IV, is normally also the real induction
787 // variable. When we're dealing with a loop like:
788 // for (int i = 0; i < 500; ++i)
789 // x[3*i] = ...;
790 // x[3*i+1] = ...;
791 // x[3*i+2] = ...;
792 // then the real IV is still i, but the effective IV is (3*i).
793 Instruction *RealIV = IV;
794 if (Inc == 1 && !findScaleFromMul(RealIV, Scale, IV, LoopIncs))
795 return false;
796
797 assert(Scale <= MaxInc && "Scale is too large");
798 assert(Scale > 1 && "Scale must be at least 2");
799
800 // The set of increment instructions for each increment value.
801 SmallVector Roots(Scale-1);
802 SmallInstructionSet AllRoots;
803 if (!collectAllRoots(L, Inc, Scale, IV, Roots, AllRoots, LoopIncs))
804 return false;
805
806 DEBUG(dbgs() << "LRR: Found all root induction increments for: " <<
807 *RealIV << "\n");
808
809 // An array of just the possible reductions for this scale factor. When we
810 // collect the set of all users of some root instructions, these reduction
811 // instructions are treated as 'final' (their uses are not considered).
812 // This is important because we don't want the root use set to search down
813 // the reduction chain.
814 SmallInstructionSet PossibleRedSet;
815 SmallInstructionSet PossibleRedLastSet, PossibleRedPHISet;
816 Reductions.restrictToScale(Scale, PossibleRedSet, PossibleRedPHISet,
817 PossibleRedLastSet);
818
819 // We now need to check for equivalence of the use graph of each root with
820 // that of the primary induction variable (excluding the roots). Our goal
821 // here is not to solve the full graph isomorphism problem, but rather to
822 // catch common cases without a lot of work. As a result, we will assume
823 // that the relative order of the instructions in each unrolled iteration
824 // is the same (although we will not make an assumption about how the
825 // different iterations are intermixed). Note that while the order must be
826 // the same, the instructions may not be in the same basic block.
827 SmallInstructionSet Exclude(AllRoots);
828 Exclude.insert(LoopIncs.begin(), LoopIncs.end());
829
830 DenseSet BaseUseSet;
831 collectInLoopUserSet(L, IV, Exclude, PossibleRedSet, BaseUseSet);
832
833 DenseSet AllRootUses;
834 std::vector > RootUseSets(Scale-1);
835
836 bool MatchFailed = false;
837 for (unsigned i = 0; i < Scale-1 && !MatchFailed; ++i) {
838 DenseSet &RootUseSet = RootUseSets[i];
839 collectInLoopUserSet(L, Roots[i], SmallInstructionSet(),
840 PossibleRedSet, RootUseSet);
841
842 DEBUG(dbgs() << "LRR: base use set size: " << BaseUseSet.size() <<
843 " vs. iteration increment " << (i+1) <<
844 " use set size: " << RootUseSet.size() << "\n");
845
846 if (BaseUseSet.size() != RootUseSet.size()) {
847 MatchFailed = true;
848 break;
849 }
850
851 // In addition to regular aliasing information, we need to look for
852 // instructions from later (future) iterations that have side effects
853 // preventing us from reordering them past other instructions with side
854 // effects.
855 bool FutureSideEffects = false;
856 AliasSetTracker AST(*AA);
857
858 // The map between instructions in f(%iv.(i+1)) and f(%iv).
859 DenseMap BaseMap;
860
861 assert(L->getNumBlocks() == 1 && "Cannot handle multi-block loops");
862 for (BasicBlock::iterator J1 = Header->begin(), J2 = Header->begin(),
863 JE = Header->end(); J1 != JE && !MatchFailed; ++J1) {
864 if (cast(J1) == RealIV)
865 continue;
866 if (cast(J1) == IV)
867 continue;
868 if (!BaseUseSet.count(J1))
869 continue;
870 if (PossibleRedPHISet.count(J1)) // Skip reduction PHIs.
871 continue;
872
873 while (J2 != JE && (!RootUseSet.count(J2) ||
874 std::find(Roots[i].begin(), Roots[i].end(), J2) !=
875 Roots[i].end())) {
876 // As we iterate through the instructions, instructions that don't
877 // belong to previous iterations (or the base case), must belong to
878 // future iterations. We want to track the alias set of writes from
879 // previous iterations.
880 if (!isa(J2) && !BaseUseSet.count(J2) &&
881 !AllRootUses.count(J2)) {
882 if (J2->mayWriteToMemory())
883 AST.add(J2);
884
885 // Note: This is specifically guarded by a check on isa,
886 // which while a valid (somewhat arbitrary) micro-optimization, is
887 // needed because otherwise isSafeToSpeculativelyExecute returns
888 // false on PHI nodes.
889 if (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2, DL))
890 FutureSideEffects = true;
891 }
892
893 ++J2;
894 }
895
896 if (!J1->isSameOperationAs(J2)) {
897 DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
898 " vs. " << *J2 << "\n");
899 MatchFailed = true;
900 break;
901 }
902
903 // Make sure that this instruction, which is in the use set of this
904 // root instruction, does not also belong to the base set or the set of
905 // some previous root instruction.
906 if (BaseUseSet.count(J2) || AllRootUses.count(J2)) {
907 DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
908 " vs. " << *J2 << " (prev. case overlap)\n");
909 MatchFailed = true;
910 break;
911 }
912
913 // Make sure that we don't alias with any instruction in the alias set
914 // tracker. If we do, then we depend on a future iteration, and we
915 // can't reroll.
916 if (J2->mayReadFromMemory()) {
917 for (AliasSetTracker::iterator K = AST.begin(), KE = AST.end();
918 K != KE && !MatchFailed; ++K) {
919 if (K->aliasesUnknownInst(J2, *AA)) {
920 DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
921 " vs. " << *J2 << " (depends on future store)\n");
922 MatchFailed = true;
923 break;
924 }
925 }
926 }
927
928 // If we've past an instruction from a future iteration that may have
929 // side effects, and this instruction might also, then we can't reorder
930 // them, and this matching fails. As an exception, we allow the alias
931 // set tracker to handle regular (simple) load/store dependencies.
932 if (FutureSideEffects &&
933 ((!isSimpleLoadStore(J1) && !isSafeToSpeculativelyExecute(J1)) ||
934 (!isSimpleLoadStore(J2) && !isSafeToSpeculativelyExecute(J2)))) {
935 DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
936 " vs. " << *J2 <<
937 " (side effects prevent reordering)\n");
938 MatchFailed = true;
939 break;
940 }
941
942 // For instructions that are part of a reduction, if the operation is
943 // associative, then don't bother matching the operands (because we
944 // already know that the instructions are isomorphic, and the order
945 // within the iteration does not matter). For non-associative reductions,
946 // we do need to match the operands, because we need to reject
947 // out-of-order instructions within an iteration!
948 // For example (assume floating-point addition), we need to reject this:
949 // x += a[i]; x += b[i];
950 // x += a[i+1]; x += b[i+1];
951 // x += b[i+2]; x += a[i+2];
952 bool InReduction = Reductions.isPairInSame(J1, J2);
953
954 if (!(InReduction && J1->isAssociative())) {
955 bool Swapped = false, SomeOpMatched = false;;
956 for (unsigned j = 0; j < J1->getNumOperands() && !MatchFailed; ++j) {
957 Value *Op2 = J2->getOperand(j);
958
959 // If this is part of a reduction (and the operation is not
960 // associatve), then we match all operands, but not those that are
961 // part of the reduction.
962 if (InReduction)
963 if (Instruction *Op2I = dyn_cast(Op2))
964 if (Reductions.isPairInSame(J2, Op2I))
965 continue;
966
967 DenseMap::iterator BMI = BaseMap.find(Op2);
968 if (BMI != BaseMap.end())
969 Op2 = BMI->second;
970 else if (std::find(Roots[i].begin(), Roots[i].end(),
971 (Instruction*) Op2) != Roots[i].end())
972 Op2 = IV;
973
974 if (J1->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
975 // If we've not already decided to swap the matched operands, and
976 // we've not already matched our first operand (note that we could
977 // have skipped matching the first operand because it is part of a
978 // reduction above), and the instruction is commutative, then try
979 // the swapped match.
980 if (!Swapped && J1->isCommutative() && !SomeOpMatched &&
981 J1->getOperand(!j) == Op2) {
982 Swapped = true;
983 } else {
984 DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
985 " vs. " << *J2 << " (operand " << j << ")\n");
986 MatchFailed = true;
987 break;
988 }
989 }
990
991 SomeOpMatched = true;
992 }
993 }
994
995 if ((!PossibleRedLastSet.count(J1) && hasUsesOutsideLoop(J1, L)) ||
996 (!PossibleRedLastSet.count(J2) && hasUsesOutsideLoop(J2, L))) {
997 DEBUG(dbgs() << "LRR: iteration root match failed at " << *J1 <<
998 " vs. " << *J2 << " (uses outside loop)\n");
999 MatchFailed = true;
1000 break;
1001 }
1002
1003 if (!MatchFailed)
1004 BaseMap.insert(std::pair(J2, J1));
1005
1006 AllRootUses.insert(J2);
1007 Reductions.recordPair(J1, J2, i+1);
1008
1009 ++J2;
1010 }
1011 }
1012
1013 if (MatchFailed)
1014 return false;
1015
1016 DEBUG(dbgs() << "LRR: Matched all iteration increments for " <<
1017 *RealIV << "\n");
1018
1019 DenseSet LoopIncUseSet;
1020 collectInLoopUserSet(L, LoopIncs, SmallInstructionSet(),
1021 SmallInstructionSet(), LoopIncUseSet);
1022 DEBUG(dbgs() << "LRR: Loop increment set size: " <<
1023 LoopIncUseSet.size() << "\n");
1024
1025 // Make sure that all instructions in the loop have been included in some
1026 // use set.
1027 for (BasicBlock::iterator J = Header->begin(), JE = Header->end();
1028 J != JE; ++J) {
1029 if (isa(J))
1030 continue;
1031 if (cast(J) == RealIV)
1032 continue;
1033 if (cast(J) == IV)
1034 continue;
1035 if (BaseUseSet.count(J) || AllRootUses.count(J) ||
1036 (LoopIncUseSet.count(J) && (J->isTerminator() ||
1037 isSafeToSpeculativelyExecute(J, DL))))
1038 continue;
1039
1040 if (AllRoots.count(J))
1041 continue;
1042
1043 if (Reductions.isSelectedPHI(J))
1044 continue;
1045
1046 DEBUG(dbgs() << "LRR: aborting reroll based on " << *RealIV <<
1047 " unprocessed instruction found: " << *J << "\n");
1048 MatchFailed = true;
1049 break;
1050 }
1051
1052 if (MatchFailed)
1053 return false;
1054
1055 DEBUG(dbgs() << "LRR: all instructions processed from " <<
1056 *RealIV << "\n");
1057
1058 if (!Reductions.validateSelected())
1059 return false;
1060
1061 // At this point, we've validated the rerolling, and we're committed to
1062 // making changes!
1063
1064 Reductions.replaceSelected();
1065
1066 // Remove instructions associated with non-base iterations.
1067 for (BasicBlock::reverse_iterator J = Header->rbegin();
1068 J != Header->rend();) {
1069 if (AllRootUses.count(&*J)) {
1070 Instruction *D = &*J;
1071 DEBUG(dbgs() << "LRR: removing: " << *D << "\n");
1072 D->eraseFromParent();
1073 continue;
1074 }
1075
1076 ++J;
1077 }
1078
1079 // Insert the new induction variable.
1080 const SCEV *Start = RealIVSCEV->getStart();
1081 if (Inc == 1)
1082 Start = SE->getMulExpr(Start,
1083 SE->getConstant(Start->getType(), Scale));
1084 const SCEVAddRecExpr *H =
1085 cast(SE->getAddRecExpr(Start,
1086 SE->getConstant(RealIVSCEV->getType(), 1),
1087 L, SCEV::FlagAnyWrap));
1088 { // Limit the lifetime of SCEVExpander.
1089 SCEVExpander Expander(*SE, "reroll");
1090 PHINode *NewIV =
1091 cast(Expander.expandCodeFor(H, IV->getType(),
1092 Header->begin()));
1093 for (DenseSet::iterator J = BaseUseSet.begin(),
1094 JE = BaseUseSet.end(); J != JE; ++J)
1095 (*J)->replaceUsesOfWith(IV, NewIV);
1096
1097 if (BranchInst *BI = dyn_cast(Header->getTerminator())) {
1098 if (LoopIncUseSet.count(BI)) {
1099 const SCEV *ICSCEV = RealIVSCEV->evaluateAtIteration(IterCount, *SE);
1100 if (Inc == 1)
1101 ICSCEV =
1102 SE->getMulExpr(ICSCEV, SE->getConstant(ICSCEV->getType(), Scale));
1103 Value *IC;
1104 if (isa(ICSCEV)) {
1105 IC = Expander.expandCodeFor(ICSCEV, NewIV->getType(), BI);
1106 } else {
1107 BasicBlock *Preheader = L->getLoopPreheader();
1108 if (!Preheader)
1109 Preheader = InsertPreheaderForLoop(L, this);
1110
1111 IC = Expander.expandCodeFor(ICSCEV, NewIV->getType(),
1112 Preheader->getTerminator());
1113 }
1114
1115 Value *NewIVNext = NewIV->getIncomingValueForBlock(Header);
1116 Value *Cond = new ICmpInst(BI, CmpInst::ICMP_EQ, NewIVNext, IC,
1117 "exitcond");
1118 BI->setCondition(Cond);
1119
1120 if (BI->getSuccessor(1) != Header)
1121 BI->swapSuccessors();
1122 }
1123 }
1124 }
1125
1126 SimplifyInstructionsInBlock(Header, DL, TLI);
1127 DeleteDeadPHIs(Header, TLI);
1128 ++NumRerolledLoops;
1129 return true;
1130 }
1131
1132 bool LoopReroll::runOnLoop(Loop *L, LPPassManager &LPM) {
1133 AA = &getAnalysis();
1134 LI = &getAnalysis();
1135 SE = &getAnalysis();
1136 TLI = &getAnalysis();
1137 DL = getAnalysisIfAvailable();
1138 DT = &getAnalysis();
1139
1140 BasicBlock *Header = L->getHeader();
1141 DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() <<
1142 "] Loop %" << Header->getName() << " (" <<
1143 L->getNumBlocks() << " block(s))\n");
1144
1145 bool Changed = false;
1146
1147 // For now, we'll handle only single BB loops.
1148 if (L->getNumBlocks() > 1)
1149 return Changed;
1150
1151 if (!SE->hasLoopInvariantBackedgeTakenCount(L))
1152 return Changed;
1153
1154 const SCEV *LIBETC = SE->getBackedgeTakenCount(L);
1155 const SCEV *IterCount =
1156 SE->getAddExpr(LIBETC, SE->getConstant(LIBETC->getType(), 1));
1157 DEBUG(dbgs() << "LRR: iteration count = " << *IterCount << "\n");
1158
1159 // First, we need to find the induction variable with respect to which we can
1160 // reroll (there may be several possible options).
1161 SmallInstructionVector PossibleIVs;
1162 collectPossibleIVs(L, PossibleIVs);
1163
1164 if (PossibleIVs.empty()) {
1165 DEBUG(dbgs() << "LRR: No possible IVs found\n");
1166 return Changed;
1167 }
1168
1169 ReductionTracker Reductions;
1170 collectPossibleReductions(L, Reductions);
1171
1172 // For each possible IV, collect the associated possible set of 'root' nodes
1173 // (i+1, i+2, etc.).
1174 for (SmallInstructionVector::iterator I = PossibleIVs.begin(),
1175 IE = PossibleIVs.end(); I != IE; ++I)
1176 if (reroll(*I, L, Header, IterCount, Reductions)) {
1177 Changed = true;
1178 break;
1179 }
1180
1181 return Changed;
1182 }
1183
4343 initializeLoopInstSimplifyPass(Registry);
4444 initializeLoopRotatePass(Registry);
4545 initializeLoopStrengthReducePass(Registry);
46 initializeLoopRerollPass(Registry);
4647 initializeLoopUnrollPass(Registry);
4748 initializeLoopUnswitchPass(Registry);
4849 initializeLoopIdiomRecognizePass(Registry);
109110
110111 void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) {
111112 unwrap(PM)->add(createLoopRotatePass());
113 }
114
115 void LLVMAddLoopRerollPass(LLVMPassManagerRef PM) {
116 unwrap(PM)->add(createLoopRerollPass());
112117 }
113118
114119 void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) {
0 ; RUN: opt < %s -loop-reroll -S | FileCheck %s
1 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
2 target triple = "x86_64-unknown-linux-gnu"
3
; int foo(int a);
; void bar(int *x) {
;   for (int i = 0; i < 500; i += 3) {
;     foo(i);
;     foo(i+1);
;     foo(i+2);
;   }
; }

; Function Attrs: nounwind uwtable
define void @bar(i32* nocapture readnone %x) #0 {
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %i.08 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  %call = tail call i32 @foo(i32 %i.08) #1
  %add = add nsw i32 %i.08, 1
  %call1 = tail call i32 @foo(i32 %add) #1
  %add2 = add nsw i32 %i.08, 2
  %call3 = tail call i32 @foo(i32 %add2) #1
  %add3 = add nsw i32 %i.08, 3
  %exitcond = icmp eq i32 %add3, 500
  br i1 %exitcond, label %for.end, label %for.body

; CHECK-LABEL: @bar

; CHECK: for.body:
; CHECK: %indvar = phi i32 [ %indvar.next, %for.body ], [ 0, %entry ]
; CHECK: %call = tail call i32 @foo(i32 %indvar) #1
; CHECK: %indvar.next = add i32 %indvar, 1
; CHECK: %exitcond1 = icmp eq i32 %indvar.next, 498
; CHECK: br i1 %exitcond1, label %for.end, label %for.body

; CHECK: ret

for.end:                                          ; preds = %for.body
  ret void
}
43
44 declare i32 @foo(i32)
45
; void hi1(int *x) {
;   for (int i = 0; i < 1500; i += 3) {
;     x[i] = foo(0);
;     x[i+1] = foo(0);
;     x[i+2] = foo(0);
;   }
; }

; Function Attrs: nounwind uwtable
define void @hi1(i32* nocapture %x) #0 {
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %call = tail call i32 @foo(i32 0) #1
  %arrayidx = getelementptr inbounds i32* %x, i64 %indvars.iv
  store i32 %call, i32* %arrayidx, align 4
  %call1 = tail call i32 @foo(i32 0) #1
  %0 = add nsw i64 %indvars.iv, 1
  %arrayidx3 = getelementptr inbounds i32* %x, i64 %0
  store i32 %call1, i32* %arrayidx3, align 4
  %call4 = tail call i32 @foo(i32 0) #1
  %1 = add nsw i64 %indvars.iv, 2
  %arrayidx7 = getelementptr inbounds i32* %x, i64 %1
  store i32 %call4, i32* %arrayidx7, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 3
  %2 = trunc i64 %indvars.iv.next to i32
  %cmp = icmp slt i32 %2, 1500
  br i1 %cmp, label %for.body, label %for.end

; CHECK-LABEL: @hi1

; CHECK: for.body:
; CHECK: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ]
; CHECK: %call = tail call i32 @foo(i32 0) #1
; CHECK: %arrayidx = getelementptr inbounds i32* %x, i64 %indvar
; CHECK: store i32 %call, i32* %arrayidx, align 4
; CHECK: %indvar.next = add i64 %indvar, 1
; CHECK: %exitcond = icmp eq i64 %indvar.next, 1500
; CHECK: br i1 %exitcond, label %for.end, label %for.body

; CHECK: ret

for.end:                                          ; preds = %for.body
  ret void
}
93
; void hi2(int *x) {
;   for (int i = 0; i < 500; ++i) {
;     x[3*i] = foo(0);
;     x[3*i+1] = foo(0);
;     x[3*i+2] = foo(0);
;   }
; }

; Function Attrs: nounwind uwtable
define void @hi2(i32* nocapture %x) #0 {
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %call = tail call i32 @foo(i32 0) #1
  %0 = mul nsw i64 %indvars.iv, 3
  %arrayidx = getelementptr inbounds i32* %x, i64 %0
  store i32 %call, i32* %arrayidx, align 4
  %call1 = tail call i32 @foo(i32 0) #1
  %1 = add nsw i64 %0, 1
  %arrayidx4 = getelementptr inbounds i32* %x, i64 %1
  store i32 %call1, i32* %arrayidx4, align 4
  %call5 = tail call i32 @foo(i32 0) #1
  %2 = add nsw i64 %0, 2
  %arrayidx9 = getelementptr inbounds i32* %x, i64 %2
  store i32 %call5, i32* %arrayidx9, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 500
  br i1 %exitcond, label %for.end, label %for.body

; CHECK-LABEL: @hi2

; CHECK: for.body:
; CHECK: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
; CHECK: %call = tail call i32 @foo(i32 0) #1
; CHECK: %arrayidx = getelementptr inbounds i32* %x, i64 %indvars.iv
; CHECK: store i32 %call, i32* %arrayidx, align 4
; CHECK: %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
; CHECK: %exitcond1 = icmp eq i64 %indvars.iv.next, 1500
; CHECK: br i1 %exitcond1, label %for.end, label %for.body

; CHECK: ret

for.end:                                          ; preds = %for.body
  ret void
}
141
; void goo(float alpha, float *a, float *b) {
;   for (int i = 0; i < 3200; i += 5) {
;     a[i] += alpha * b[i];
;     a[i + 1] += alpha * b[i + 1];
;     a[i + 2] += alpha * b[i + 2];
;     a[i + 3] += alpha * b[i + 3];
;     a[i + 4] += alpha * b[i + 4];
;   }
; }

; Function Attrs: nounwind uwtable
define void @goo(float %alpha, float* nocapture %a, float* nocapture readonly %b) #0 {
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds float* %b, i64 %indvars.iv
  %0 = load float* %arrayidx, align 4
  %mul = fmul float %0, %alpha
  %arrayidx2 = getelementptr inbounds float* %a, i64 %indvars.iv
  %1 = load float* %arrayidx2, align 4
  %add = fadd float %1, %mul
  store float %add, float* %arrayidx2, align 4
  %2 = add nsw i64 %indvars.iv, 1
  %arrayidx5 = getelementptr inbounds float* %b, i64 %2
  %3 = load float* %arrayidx5, align 4
  %mul6 = fmul float %3, %alpha
  %arrayidx9 = getelementptr inbounds float* %a, i64 %2
  %4 = load float* %arrayidx9, align 4
  %add10 = fadd float %4, %mul6
  store float %add10, float* %arrayidx9, align 4
  %5 = add nsw i64 %indvars.iv, 2
  %arrayidx13 = getelementptr inbounds float* %b, i64 %5
  %6 = load float* %arrayidx13, align 4
  %mul14 = fmul float %6, %alpha
  %arrayidx17 = getelementptr inbounds float* %a, i64 %5
  %7 = load float* %arrayidx17, align 4
  %add18 = fadd float %7, %mul14
  store float %add18, float* %arrayidx17, align 4
  %8 = add nsw i64 %indvars.iv, 3
  %arrayidx21 = getelementptr inbounds float* %b, i64 %8
  %9 = load float* %arrayidx21, align 4
  %mul22 = fmul float %9, %alpha
  %arrayidx25 = getelementptr inbounds float* %a, i64 %8
  %10 = load float* %arrayidx25, align 4
  %add26 = fadd float %10, %mul22
  store float %add26, float* %arrayidx25, align 4
  %11 = add nsw i64 %indvars.iv, 4
  %arrayidx29 = getelementptr inbounds float* %b, i64 %11
  %12 = load float* %arrayidx29, align 4
  %mul30 = fmul float %12, %alpha
  %arrayidx33 = getelementptr inbounds float* %a, i64 %11
  %13 = load float* %arrayidx33, align 4
  %add34 = fadd float %13, %mul30
  store float %add34, float* %arrayidx33, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5
  %14 = trunc i64 %indvars.iv.next to i32
  %cmp = icmp slt i32 %14, 3200
  br i1 %cmp, label %for.body, label %for.end

; CHECK-LABEL: @goo

; CHECK: for.body:
; CHECK: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ]
; CHECK: %arrayidx = getelementptr inbounds float* %b, i64 %indvar
; CHECK: %0 = load float* %arrayidx, align 4
; CHECK: %mul = fmul float %0, %alpha
; CHECK: %arrayidx2 = getelementptr inbounds float* %a, i64 %indvar
; CHECK: %1 = load float* %arrayidx2, align 4
; CHECK: %add = fadd float %1, %mul
; CHECK: store float %add, float* %arrayidx2, align 4
; CHECK: %indvar.next = add i64 %indvar, 1
; CHECK: %exitcond = icmp eq i64 %indvar.next, 3200
; CHECK: br i1 %exitcond, label %for.end, label %for.body

; CHECK: ret

for.end:                                          ; preds = %for.body
  ret void
}
223
; void hoo(float alpha, float *a, float *b, int *ip) {
;   for (int i = 0; i < 3200; i += 5) {
;     a[i] += alpha * b[ip[i]];
;     a[i + 1] += alpha * b[ip[i + 1]];
;     a[i + 2] += alpha * b[ip[i + 2]];
;     a[i + 3] += alpha * b[ip[i + 3]];
;     a[i + 4] += alpha * b[ip[i + 4]];
;   }
; }

; Function Attrs: nounwind uwtable
define void @hoo(float %alpha, float* nocapture %a, float* nocapture readonly %b, i32* nocapture readonly %ip) #0 {
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds i32* %ip, i64 %indvars.iv
  %0 = load i32* %arrayidx, align 4
  %idxprom1 = sext i32 %0 to i64
  %arrayidx2 = getelementptr inbounds float* %b, i64 %idxprom1
  %1 = load float* %arrayidx2, align 4
  %mul = fmul float %1, %alpha
  %arrayidx4 = getelementptr inbounds float* %a, i64 %indvars.iv
  %2 = load float* %arrayidx4, align 4
  %add = fadd float %2, %mul
  store float %add, float* %arrayidx4, align 4
  %3 = add nsw i64 %indvars.iv, 1
  %arrayidx7 = getelementptr inbounds i32* %ip, i64 %3
  %4 = load i32* %arrayidx7, align 4
  %idxprom8 = sext i32 %4 to i64
  %arrayidx9 = getelementptr inbounds float* %b, i64 %idxprom8
  %5 = load float* %arrayidx9, align 4
  %mul10 = fmul float %5, %alpha
  %arrayidx13 = getelementptr inbounds float* %a, i64 %3
  %6 = load float* %arrayidx13, align 4
  %add14 = fadd float %6, %mul10
  store float %add14, float* %arrayidx13, align 4
  %7 = add nsw i64 %indvars.iv, 2
  %arrayidx17 = getelementptr inbounds i32* %ip, i64 %7
  %8 = load i32* %arrayidx17, align 4
  %idxprom18 = sext i32 %8 to i64
  %arrayidx19 = getelementptr inbounds float* %b, i64 %idxprom18
  %9 = load float* %arrayidx19, align 4
  %mul20 = fmul float %9, %alpha
  %arrayidx23 = getelementptr inbounds float* %a, i64 %7
  %10 = load float* %arrayidx23, align 4
  %add24 = fadd float %10, %mul20
  store float %add24, float* %arrayidx23, align 4
  %11 = add nsw i64 %indvars.iv, 3
  %arrayidx27 = getelementptr inbounds i32* %ip, i64 %11
  %12 = load i32* %arrayidx27, align 4
  %idxprom28 = sext i32 %12 to i64
  %arrayidx29 = getelementptr inbounds float* %b, i64 %idxprom28
  %13 = load float* %arrayidx29, align 4
  %mul30 = fmul float %13, %alpha
  %arrayidx33 = getelementptr inbounds float* %a, i64 %11
  %14 = load float* %arrayidx33, align 4
  %add34 = fadd float %14, %mul30
  store float %add34, float* %arrayidx33, align 4
  %15 = add nsw i64 %indvars.iv, 4
  %arrayidx37 = getelementptr inbounds i32* %ip, i64 %15
  %16 = load i32* %arrayidx37, align 4
  %idxprom38 = sext i32 %16 to i64
  %arrayidx39 = getelementptr inbounds float* %b, i64 %idxprom38
  %17 = load float* %arrayidx39, align 4
  %mul40 = fmul float %17, %alpha
  %arrayidx43 = getelementptr inbounds float* %a, i64 %15
  %18 = load float* %arrayidx43, align 4
  %add44 = fadd float %18, %mul40
  store float %add44, float* %arrayidx43, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5
  %19 = trunc i64 %indvars.iv.next to i32
  %cmp = icmp slt i32 %19, 3200
  br i1 %cmp, label %for.body, label %for.end

; CHECK-LABEL: @hoo

; CHECK: for.body:
; CHECK: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ]
; CHECK: %arrayidx = getelementptr inbounds i32* %ip, i64 %indvar
; CHECK: %0 = load i32* %arrayidx, align 4
; CHECK: %idxprom1 = sext i32 %0 to i64
; CHECK: %arrayidx2 = getelementptr inbounds float* %b, i64 %idxprom1
; CHECK: %1 = load float* %arrayidx2, align 4
; CHECK: %mul = fmul float %1, %alpha
; CHECK: %arrayidx4 = getelementptr inbounds float* %a, i64 %indvar
; CHECK: %2 = load float* %arrayidx4, align 4
; CHECK: %add = fadd float %2, %mul
; CHECK: store float %add, float* %arrayidx4, align 4
; CHECK: %indvar.next = add i64 %indvar, 1
; CHECK: %exitcond = icmp eq i64 %indvar.next, 3200
; CHECK: br i1 %exitcond, label %for.end, label %for.body

; CHECK: ret

for.end:                                          ; preds = %for.body
  ret void
}
323
324 attributes #0 = { nounwind uwtable }
325 attributes #1 = { nounwind }
326
0 ; RUN: opt < %s -loop-reroll -S | FileCheck %s
1 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
2 target triple = "x86_64-unknown-linux-gnu"
3
; Reroll a 4x-unrolled integer-sum reduction down to one accumulation per
; iteration.
define i32 @foo(i32* nocapture readonly %x) #0 {
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %r.029 = phi i32 [ 0, %entry ], [ %add12, %for.body ]
  %arrayidx = getelementptr inbounds i32* %x, i64 %indvars.iv
  %0 = load i32* %arrayidx, align 4
  %add = add nsw i32 %0, %r.029
  %1 = or i64 %indvars.iv, 1
  %arrayidx3 = getelementptr inbounds i32* %x, i64 %1
  %2 = load i32* %arrayidx3, align 4
  %add4 = add nsw i32 %add, %2
  %3 = or i64 %indvars.iv, 2
  %arrayidx7 = getelementptr inbounds i32* %x, i64 %3
  %4 = load i32* %arrayidx7, align 4
  %add8 = add nsw i32 %add4, %4
  %5 = or i64 %indvars.iv, 3
  %arrayidx11 = getelementptr inbounds i32* %x, i64 %5
  %6 = load i32* %arrayidx11, align 4
  %add12 = add nsw i32 %add8, %6
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
  %7 = trunc i64 %indvars.iv.next to i32
  %cmp = icmp slt i32 %7, 400
  br i1 %cmp, label %for.body, label %for.end

; CHECK-LABEL: @foo

; CHECK: for.body:
; CHECK: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ]
; CHECK: %r.029 = phi i32 [ 0, %entry ], [ %add, %for.body ]
; CHECK: %arrayidx = getelementptr inbounds i32* %x, i64 %indvar
; CHECK: %0 = load i32* %arrayidx, align 4
; CHECK: %add = add nsw i32 %0, %r.029
; CHECK: %indvar.next = add i64 %indvar, 1
; CHECK: %exitcond = icmp eq i64 %indvar.next, 400
; CHECK: br i1 %exitcond, label %for.end, label %for.body

; CHECK: ret

for.end:                                          ; preds = %for.body
  ret i32 %add12
}
48
; Reroll a 4x-unrolled floating-point sum reduction (non-associative fadd
; chain) down to one accumulation per iteration.
define float @bar(float* nocapture readonly %x) #0 {
entry:
  br label %for.body

for.body:                                         ; preds = %entry, %for.body
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %r.029 = phi float [ 0.0, %entry ], [ %add12, %for.body ]
  %arrayidx = getelementptr inbounds float* %x, i64 %indvars.iv
  %0 = load float* %arrayidx, align 4
  %add = fadd float %0, %r.029
  %1 = or i64 %indvars.iv, 1
  %arrayidx3 = getelementptr inbounds float* %x, i64 %1
  %2 = load float* %arrayidx3, align 4
  %add4 = fadd float %add, %2
  %3 = or i64 %indvars.iv, 2
  %arrayidx7 = getelementptr inbounds float* %x, i64 %3
  %4 = load float* %arrayidx7, align 4
  %add8 = fadd float %add4, %4
  %5 = or i64 %indvars.iv, 3
  %arrayidx11 = getelementptr inbounds float* %x, i64 %5
  %6 = load float* %arrayidx11, align 4
  %add12 = fadd float %add8, %6
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4
  %7 = trunc i64 %indvars.iv.next to i32
  %cmp = icmp slt i32 %7, 400
  br i1 %cmp, label %for.body, label %for.end

; CHECK-LABEL: @bar

; CHECK: for.body:
; CHECK: %indvar = phi i64 [ %indvar.next, %for.body ], [ 0, %entry ]
; CHECK: %r.029 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
; CHECK: %arrayidx = getelementptr inbounds float* %x, i64 %indvar
; CHECK: %0 = load float* %arrayidx, align 4
; CHECK: %add = fadd float %0, %r.029
; CHECK: %indvar.next = add i64 %indvar, 1
; CHECK: %exitcond = icmp eq i64 %indvar.next, 400
; CHECK: br i1 %exitcond, label %for.end, label %for.body

; CHECK: ret

for.end:                                          ; preds = %for.body
  ret float %add12
}
93
94 attributes #0 = { nounwind readonly uwtable }
95