[UnrollAndJam] Add a new Unroll and Jam pass

This is a simple implementation of the unroll-and-jam classical loop optimisation. The basic idea is that we take an outer loop of the form:

  for i..
    ForeBlocks(i)
    for j..
      SubLoopBlocks(i, j)
    AftBlocks(i)

Instead of doing normal inner or outer unrolling, we unroll as follows:

  for i... i+=2
    ForeBlocks(i)
    ForeBlocks(i+1)
    for j..
      SubLoopBlocks(i, j)
      SubLoopBlocks(i+1, j)
    AftBlocks(i)
    AftBlocks(i+1)
  Remainder

So we have unrolled the outer loop, then jammed the two inner loops into one. This can lead to a simpler inner loop if memory accesses can be shared between the now-jammed loops. To do this we have to prove that this is all safe, both for the memory accesses (using dependence analysis) and that ForeBlocks(i+1) can move before AftBlocks(i) and SubLoopBlocks(i, j).

Differential Revision: https://reviews.llvm.org/D41953

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@333358 91177308-0d34-0410-b5e6-96231b3b80d8

David Green, 1 year, 3 months ago
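To make the payoff concrete, here is a hand-written source-level sketch of the transformation (not part of the commit; the function and array names and the factor of 2 are illustrative). The load of B[j] is invariant in the outer loop, so after unroll and jam a single load feeds both jammed bodies:

// Before: every outer iteration re-reads B[j] in its own copy of the
// inner loop.
void before(int N, int M, const float *A, const float *B, float *Sum) {
  for (int i = 0; i < N; ++i) {
    float a = A[i];                     // ForeBlocks(i)
    for (int j = 0; j < M; ++j)
      Sum[i] += a * B[j];               // SubLoopBlocks(i, j)
  }                                     // AftBlocks(i) is empty here
}

// After unroll and jam by 2 (assuming N is even, so no remainder loop):
void after(int N, int M, const float *A, const float *B, float *Sum) {
  for (int i = 0; i < N; i += 2) {
    float a0 = A[i];                    // ForeBlocks(i)
    float a1 = A[i + 1];                // ForeBlocks(i+1)
    for (int j = 0; j < M; ++j) {
      float b = B[j];                   // one shared load for both bodies
      Sum[i] += a0 * b;                 // SubLoopBlocks(i, j)
      Sum[i + 1] += a1 * b;             // SubLoopBlocks(i+1, j)
    }
  }
}

Even in this small sketch the safety obligation is visible: ForeBlocks(i+1), the load of A[i+1], has moved above every SubLoopBlocks(i, j), which is exactly what the pass must prove legal with dependence analysis.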
23 changed file(s) with 3878 addition(s) and 20 deletion(s).
421421 bool AllowPeeling;
422422 /// Allow unrolling of all the iterations of the runtime loop remainder.
423423 bool UnrollRemainder;
424 /// Allow unroll and jam. Used to enable unroll and jam for the target.
425 bool UnrollAndJam;
426 /// Threshold for unroll and jam, for inner loop size. The 'Threshold'
427 /// value above is used during unroll and jam for the outer loop size.
428 /// This value is used in the same manner to limit the size of the inner
429 /// loop.
430 unsigned UnrollAndJamInnerLoopThreshold;
424431 };
425432
426433 /// Get target-customized preferences for the generic loop unrolling
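These two fields are what a backend flips to opt in. A minimal sketch of a target hook, using a hypothetical MyTTIImpl (it mirrors the ARM change later in this diff; 60 matches the default installed by gatherUnrollingPreferences):

void MyTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                        TTI::UnrollingPreferences &UP) {
  // Let the unroll and jam pass consider loops on this target.
  UP.UnrollAndJam = true;
  // Cap the size of the jammed inner loop; 'Threshold' above still caps
  // the unrolled outer loop.
  UP.UnrollAndJamInnerLoopThreshold = 60;
}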
225225 void initializeLoopSimplifyPass(PassRegistry&);
226226 void initializeLoopStrengthReducePass(PassRegistry&);
227227 void initializeLoopUnrollPass(PassRegistry&);
228 void initializeLoopUnrollAndJamPass(PassRegistry&);
228229 void initializeLoopUnswitchPass(PassRegistry&);
229230 void initializeLoopVectorizePass(PassRegistry&);
230231 void initializeLoopVersioningLICMPass(PassRegistry&);
129129 (void) llvm::createLoopStrengthReducePass();
130130 (void) llvm::createLoopRerollPass();
131131 (void) llvm::createLoopUnrollPass();
132 (void) llvm::createLoopUnrollAndJamPass();
132133 (void) llvm::createLoopUnswitchPass();
133134 (void) llvm::createLoopVersioningLICMPass();
134135 (void) llvm::createLoopIdiomPass();
0 //===- LoopUnrollAndJamPass.h -----------------------------------*- C++ -*-===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8
9 #ifndef LLVM_TRANSFORMS_SCALAR_LOOPUNROLLANDJAMPASS_H
10 #define LLVM_TRANSFORMS_SCALAR_LOOPUNROLLANDJAMPASS_H
11
12 #include "llvm/Analysis/LoopAnalysisManager.h"
13 #include "llvm/Analysis/LoopInfo.h"
14 #include "llvm/IR/PassManager.h"
15
16 namespace llvm {
17
18 class Loop;
19 struct LoopStandardAnalysisResults;
20 class LPMUpdater;
21
22 /// A simple loop unroll and jam transformation.
23 class LoopUnrollAndJamPass : public PassInfoMixin<LoopUnrollAndJamPass> {
24 const int OptLevel;
25
26 public:
27 explicit LoopUnrollAndJamPass(int OptLevel = 2) : OptLevel(OptLevel) {}
28 PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
29 LoopStandardAnalysisResults &AR, LPMUpdater &U);
30 };
31
32 } // end namespace llvm
33
34 #endif // LLVM_TRANSFORMS_SCALAR_LOOPUNROLLANDJAMPASS_H
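A minimal sketch of scheduling this pass under the new pass manager (it mirrors the PassBuilder change further down; being a loop pass, it is wrapped in the function-to-loop adaptor):

#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"

static void addUnrollAndJam(llvm::FunctionPassManager &FPM, int OptLevel) {
  // Run unroll and jam on each loop nest before the plain unroller sees it.
  FPM.addPass(llvm::createFunctionToLoopPassAdaptor(
      llvm::LoopUnrollAndJamPass(OptLevel)));
}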
191191
192192 //===----------------------------------------------------------------------===//
193193 //
194 // LoopUnrollAndJam - This pass is a simple loop unroll and jam pass.
195 //
196 Pass *createLoopUnrollAndJamPass(int OptLevel = 2);
197
198 //===----------------------------------------------------------------------===//
199 //
194200 // LoopReroll - This pass is a simple loop rerolling pass.
195201 //
196202 Pass *createLoopRerollPass();
1818 #include "llvm/ADT/DenseMap.h"
1919 #include "llvm/ADT/StringRef.h"
2020 #include "llvm/Analysis/TargetTransformInfo.h"
21 #include "llvm/Transforms/Utils/ValueMapper.h"
2122
2223 namespace llvm {
2324
2425 class AssumptionCache;
2526 class BasicBlock;
27 class DependenceInfo;
2628 class DominatorTree;
2729 class Loop;
2830 class LoopInfo;
7779 bool peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, ScalarEvolution *SE,
7880 DominatorTree *DT, AssumptionCache *AC, bool PreserveLCSSA);
7981
82 LoopUnrollResult UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
83 unsigned TripMultiple, bool UnrollRemainder,
84 LoopInfo *LI, ScalarEvolution *SE,
85 DominatorTree *DT, AssumptionCache *AC,
86 OptimizationRemarkEmitter *ORE);
87
88 bool isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
89 DependenceInfo &DI);
90
91 bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
92 DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
93 const SmallPtrSetImpl<const Value *> &EphValues,
94 OptimizationRemarkEmitter *ORE, unsigned &TripCount,
95 unsigned MaxTripCount, unsigned &TripMultiple,
96 unsigned LoopSize,
97 TargetTransformInfo::UnrollingPreferences &UP,
98 bool &UseUpperBound);
99
100 BasicBlock *foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI,
101 ScalarEvolution *SE, DominatorTree *DT);
102
103 void remapInstruction(Instruction *I, ValueToValueMapTy &VMap);
104
105 void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
106 ScalarEvolution *SE, DominatorTree *DT,
107 AssumptionCache *AC);
108
80109 MDNode *GetUnrollMetadata(MDNode *LoopID, StringRef Name);
110
111 TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
112 Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
113 Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
114 Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
115 Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling);
116
117 unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
118 bool &NotDuplicatable, bool &Convergent,
119 const TargetTransformInfo &TTI,
120 const SmallPtrSetImpl<const Value *> &EphValues,
121 unsigned BEInsns);
81122
82123 } // end namespace llvm
83124
8888 /** See llvm::createLoopUnrollPass function. */
8989 void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM);
9090
91 /** See llvm::createLoopUnrollAndJamPass function. */
92 void LLVMAddLoopUnrollAndJamPass(LLVMPassManagerRef PM);
93
9194 /** See llvm::createLoopUnswitchPass function. */
9295 void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM);
9396
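For C API clients, a minimal usage sketch of the new binding (Mod is assumed to be an existing module; the other calls are pre-existing llvm-c APIs):

#include "llvm-c/Core.h"
#include "llvm-c/Transforms/Scalar.h"

static void runUnrollAndJam(LLVMModuleRef Mod) {
  LLVMPassManagerRef PM = LLVMCreatePassManager();
  LLVMAddLoopUnrollAndJamPass(PM); /* the binding added by this patch */
  LLVMRunPassManager(PM, Mod);
  LLVMDisposePassManager(PM);
}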
118118 #include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
119119 #include "llvm/Transforms/Scalar/LoopSink.h"
120120 #include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
121 #include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
121122 #include "llvm/Transforms/Scalar/LoopUnrollPass.h"
122123 #include "llvm/Transforms/Scalar/LowerAtomic.h"
123124 #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
177178 "enable-npm-gvn-sink", cl::init(false), cl::Hidden,
178179 cl::desc("Enable the GVN hoisting pass for the new PM (default = off)"));
179180
181 static cl::opt<bool> EnableUnrollAndJam(
182 "enable-npm-unroll-and-jam", cl::init(false), cl::Hidden,
183 cl::desc("Enable the Unroll and Jam pass for the new PM (default = off)"));
184
180185 static cl::opt<bool> EnableSyntheticCounts(
181186 "enable-npm-synthetic-counts", cl::init(false), cl::Hidden, cl::ZeroOrMore,
182187 cl::desc("Run synthetic function entry count generation "
788793 // FIXME: It would be really good to use a loop-integrated instruction
789794 // combiner for cleanup here so that the unrolling and LICM can be pipelined
790795 // across the loop nests.
796 // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
797 if (EnableUnrollAndJam) {
798 OptimizePM.addPass(
799 createFunctionToLoopPassAdaptor(LoopUnrollAndJamPass(Level)));
800 }
791801 OptimizePM.addPass(LoopUnrollPass(Level));
792802 OptimizePM.addPass(InstCombinePass());
793803 OptimizePM.addPass(RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
238238 LOOP_PASS("strength-reduce", LoopStrengthReducePass())
239239 LOOP_PASS("indvars", IndVarSimplifyPass())
240240 LOOP_PASS("irce", IRCEPass())
241 LOOP_PASS("unroll-and-jam", LoopUnrollAndJamPass())
241242 LOOP_PASS("unroll-full", LoopFullUnrollPass())
242243 LOOP_PASS("unswitch", SimpleLoopUnswitchPass())
243244 LOOP_PASS("print-access-info", LoopAccessInfoPrinterPass(dbgs()))
621621 UP.Runtime = true;
622622 UP.UnrollRemainder = true;
623623 UP.DefaultUnrollRuntimeCount = 4;
624 UP.UnrollAndJam = true;
625 UP.UnrollAndJamInnerLoopThreshold = 60;
624626
625627 // Force unrolling small loops can be very useful because of the branch
626628 // taken cost of the backedge.
9494 "enable-loopinterchange", cl::init(false), cl::Hidden,
9595 cl::desc("Enable the new, experimental LoopInterchange Pass"));
9696
97 static cl::opt<bool> EnableUnrollAndJam("enable-unroll-and-jam",
98 cl::init(false), cl::Hidden,
99 cl::desc("Enable Unroll And Jam Pass"));
100
97101 static cl::opt<bool>
98102 EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden,
99103 cl::desc("Enable preparation for ThinLTO."));
652656 addInstructionCombiningPass(MPM);
653657
654658 if (!DisableUnrollLoops) {
659 if (EnableUnrollAndJam) {
660 // Unroll and Jam. We do this before unroll but need to be in a separate
661 // loop pass manager in order for the outer loop to be processed by
662 // unroll and jam before the inner loop is unrolled.
663 MPM.add(createLoopUnrollAndJamPass(OptLevel));
664 }
665
655666 MPM.add(createLoopUnrollPass(OptLevel)); // Unroll small loops
656667
657668 // LoopUnroll may generate some redundancy to clean up.
3737 LoopSimplifyCFG.cpp
3838 LoopStrengthReduce.cpp
3939 LoopUnrollPass.cpp
40 LoopUnrollAndJamPass.cpp
4041 LoopUnswitch.cpp
4142 LoopVersioningLICM.cpp
4243 LowerAtomic.cpp
0 //===- LoopUnrollAndJam.cpp - Loop unroll and jam pass --------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass implements an unroll and jam pass. Most of the work is done by
10 // Utils/LoopUnrollAndJam.cpp.
11 //===----------------------------------------------------------------------===//
12
13 #include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
14 #include "llvm/ADT/None.h"
15 #include "llvm/ADT/STLExtras.h"
16 #include "llvm/ADT/SmallPtrSet.h"
17 #include "llvm/ADT/StringRef.h"
18 #include "llvm/Analysis/AssumptionCache.h"
19 #include "llvm/Analysis/CodeMetrics.h"
20 #include "llvm/Analysis/DependenceAnalysis.h"
21 #include "llvm/Analysis/LoopAnalysisManager.h"
22 #include "llvm/Analysis/LoopInfo.h"
23 #include "llvm/Analysis/LoopPass.h"
24 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
25 #include "llvm/Analysis/ScalarEvolution.h"
26 #include "llvm/Analysis/TargetTransformInfo.h"
27 #include "llvm/IR/BasicBlock.h"
28 #include "llvm/IR/CFG.h"
29 #include "llvm/IR/Constant.h"
30 #include "llvm/IR/Constants.h"
31 #include "llvm/IR/Dominators.h"
32 #include "llvm/IR/Function.h"
33 #include "llvm/IR/Instruction.h"
34 #include "llvm/IR/Instructions.h"
35 #include "llvm/IR/IntrinsicInst.h"
36 #include "llvm/IR/Metadata.h"
37 #include "llvm/IR/PassManager.h"
38 #include "llvm/Pass.h"
39 #include "llvm/Support/Casting.h"
40 #include "llvm/Support/CommandLine.h"
41 #include "llvm/Support/Debug.h"
42 #include "llvm/Support/ErrorHandling.h"
43 #include "llvm/Support/raw_ostream.h"
44 #include "llvm/Transforms/Scalar.h"
45 #include "llvm/Transforms/Scalar/LoopPassManager.h"
46 #include "llvm/Transforms/Utils.h"
47 #include "llvm/Transforms/Utils/LoopUtils.h"
48 #include "llvm/Transforms/Utils/UnrollLoop.h"
49 #include <algorithm>
50 #include <cassert>
51 #include <cstdint>
52 #include <string>
53
54 using namespace llvm;
55
56 #define DEBUG_TYPE "loop-unroll-and-jam"
57
58 static cl::opt<unsigned> UnrollAndJamCount(
59 "unroll-and-jam-count", cl::Hidden,
60 cl::desc("Use this unroll count for all loops including those with "
61 "unroll_and_jam_count pragma values, for testing purposes"));
62
63 static cl::opt<unsigned> UnrollAndJamThreshold(
64 "unroll-and-jam-threshold", cl::init(60), cl::Hidden,
65 cl::desc("Threshold to use for inner loop when doing unroll and jam."));
66
67 static cl::opt<unsigned> PragmaUnrollAndJamThreshold(
68 "pragma-unroll-and-jam-threshold", cl::init(1024), cl::Hidden,
69 cl::desc("Unrolled size limit for loops with an unroll_and_jam(full) or "
70 "unroll_count pragma."));
71
72 // Returns the loop hint metadata node with the given name (for example,
73 // "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
74 // returned.
75 static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) {
76 if (MDNode *LoopID = L->getLoopID())
77 return GetUnrollMetadata(LoopID, Name);
78 return nullptr;
79 }
80
81 // Returns true if the loop has any metadata starting with Prefix. For example a
82 // Prefix of "llvm.loop.unroll." returns true if we have any unroll metadata.
83 static bool HasAnyUnrollPragma(const Loop *L, StringRef Prefix) {
84 if (MDNode *LoopID = L->getLoopID()) {
85 // First operand should refer to the loop id itself.
86 assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
87 assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
88
89 for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
90 MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
91 if (!MD)
92 continue;
93
94 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
95 if (!S)
96 continue;
97
98 if (S->getString().startswith(Prefix))
99 return true;
100 }
101 }
102 return false;
103 }
104
105 // Returns true if the loop has an unroll_and_jam(enable) pragma.
106 static bool HasUnrollAndJamEnablePragma(const Loop *L) {
107 return GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.enable");
108 }
109
110 // Returns true if the loop has an unroll_and_jam(disable) pragma.
111 static bool HasUnrollAndJamDisablePragma(const Loop *L) {
112 return GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.disable");
113 }
114
115 // If loop has an unroll_and_jam_count pragma return the (necessarily
116 // positive) value from the pragma. Otherwise return 0.
117 static unsigned UnrollAndJamCountPragmaValue(const Loop *L) {
118 MDNode *MD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.count");
119 if (MD) {
120 assert(MD->getNumOperands() == 2 &&
121 "Unroll count hint metadata should have two operands.");
122 unsigned Count =
123 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
124 assert(Count >= 1 && "Unroll count must be positive.");
125 return Count;
126 }
127 return 0;
128 }
129
130 // Returns loop size estimation for unrolled loop.
131 static uint64_t
132 getUnrollAndJammedLoopSize(unsigned LoopSize,
133 TargetTransformInfo::UnrollingPreferences &UP) {
134 assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
135 return static_cast<uint64_t>(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
136 }
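As a quick worked example of this estimate: with UP.BEInsns = 2, a loop of size 10 unrolled by UP.Count = 4 is sized at (10 - 2) * 4 + 2 = 34 instructions, i.e. the loop-control overhead is counted once rather than once per copy.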
137
138 // Calculates unroll and jam count and writes it to UP.Count. Returns true if
139 // unroll count was set explicitly.
140 static bool computeUnrollAndJamCount(
141 Loop *L, Loop *SubLoop, const TargetTransformInfo &TTI, DominatorTree &DT,
142 LoopInfo *LI, ScalarEvolution &SE,
143 const SmallPtrSetImpl<const Value *> &EphValues,
144 OptimizationRemarkEmitter *ORE, unsigned OuterTripCount,
145 unsigned OuterTripMultiple, unsigned OuterLoopSize, unsigned InnerTripCount,
146 unsigned InnerLoopSize, TargetTransformInfo::UnrollingPreferences &UP) {
147 // Check for explicit Count from the "unroll-and-jam-count" option.
148 bool UserUnrollCount = UnrollAndJamCount.getNumOccurrences() > 0;
149 if (UserUnrollCount) {
150 UP.Count = UnrollAndJamCount;
151 UP.Force = true;
152 if (UP.AllowRemainder &&
153 getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold &&
154 getUnrollAndJammedLoopSize(InnerLoopSize, UP) <
155 UP.UnrollAndJamInnerLoopThreshold)
156 return true;
157 }
158
159 // Check for unroll_and_jam pragmas
160 unsigned PragmaCount = UnrollAndJamCountPragmaValue(L);
161 if (PragmaCount > 0) {
162 UP.Count = PragmaCount;
163 UP.Runtime = true;
164 UP.Force = true;
165 if ((UP.AllowRemainder || (OuterTripMultiple % PragmaCount == 0)) &&
166 getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold &&
167 getUnrollAndJammedLoopSize(InnerLoopSize, UP) <
168 UP.UnrollAndJamInnerLoopThreshold)
169 return true;
170 }
171
172 // Use computeUnrollCount from the loop unroller to get a sensible count
173 // for unrolling the outer loop. This uses UP.Threshold /
174 // UP.PartialThreshold / UP.MaxCount to come up with sensible loop values.
175 // We have already checked that the loop has no unroll.* pragmas.
176 unsigned MaxTripCount = 0;
177 bool UseUpperBound = false;
178 bool ExplicitUnroll = computeUnrollCount(
179 L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount,
180 OuterTripMultiple, OuterLoopSize, UP, UseUpperBound);
181 if (ExplicitUnroll || UseUpperBound) {
182 // If the user explicitly set the loop as unrolled, don't UnJ it. Leave it
183 // for the unroller instead.
184 UP.Count = 0;
185 return false;
186 }
187
188 bool PragmaEnableUnroll = HasUnrollAndJamEnablePragma(L);
189 ExplicitUnroll = PragmaCount > 0 || PragmaEnableUnroll || UserUnrollCount;
190
191 // If the loop has an unrolling pragma, we want to be more aggressive with
192 // unrolling limits.
193 if (ExplicitUnroll && OuterTripCount != 0)
194 UP.UnrollAndJamInnerLoopThreshold = PragmaUnrollAndJamThreshold;
195
196 if (!UP.AllowRemainder && getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
197 UP.UnrollAndJamInnerLoopThreshold) {
198 UP.Count = 0;
199 return false;
200 }
201
202 // If the inner loop count is known and small, leave the entire loop nest
203 // to the unroller.
204 if (!ExplicitUnroll && InnerTripCount &&
205 InnerLoopSize * InnerTripCount < UP.Threshold) {
206 UP.Count = 0;
207 return false;
208 }
209
210 // We have a sensible limit for the outer loop, now adjust it for the inner
211 // loop and UP.UnrollAndJamInnerLoopThreshold.
212 while (UP.Count != 0 && UP.AllowRemainder &&
213 getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
214 UP.UnrollAndJamInnerLoopThreshold)
215 UP.Count--;
216
217 if (!ExplicitUnroll) {
218 // Check for situations where UnJ is likely to be unprofitable, such as
219 // subloops with more than 1 block.
220 if (SubLoop->getBlocks().size() != 1) {
221 UP.Count = 0;
222 return false;
223 }
224
225 // Limit to loops where there is something to gain from unrolling and
226 // jamming the loop. In this case, look for loads that are invariant in the
227 // outer loop and can become shared.
228 unsigned NumInvariant = 0;
229 for (BasicBlock *BB : SubLoop->getBlocks()) {
230 for (Instruction &I : *BB) {
231 if (auto *Ld = dyn_cast(&I)) {
232 Value *V = Ld->getPointerOperand();
233 const SCEV *LSCEV = SE.getSCEVAtScope(V, L);
234 if (SE.isLoopInvariant(LSCEV, L))
235 NumInvariant++;
236 }
237 }
238 }
239 if (NumInvariant == 0) {
240 UP.Count = 0;
241 return false;
242 }
243 }
244
245 return ExplicitUnroll;
246 }
247
248 static LoopUnrollResult
249 tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
250 ScalarEvolution &SE, const TargetTransformInfo &TTI,
251 AssumptionCache &AC, DependenceInfo &DI,
252 OptimizationRemarkEmitter &ORE, int OptLevel) {
253 // Quick checks of the correct loop form
254 if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
255 return LoopUnrollResult::Unmodified;
256 Loop *SubLoop = L->getSubLoops()[0];
257 if (!SubLoop->isLoopSimplifyForm())
258 return LoopUnrollResult::Unmodified;
259
260 BasicBlock *Latch = L->getLoopLatch();
261 BasicBlock *Exit = L->getExitingBlock();
262 BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
263 BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
264
265 if (Latch != Exit || SubLoopLatch != SubLoopExit)
266 return LoopUnrollResult::Unmodified;
267
268 TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
269 L, SE, TTI, OptLevel, None, None, None, None, None, None);
270 if (UnrollAndJamThreshold.getNumOccurrences() > 0)
271 UP.UnrollAndJamInnerLoopThreshold = UnrollAndJamThreshold;
272 // Exit early if unrolling is disabled.
273 if (!UP.UnrollAndJam || UP.UnrollAndJamInnerLoopThreshold == 0)
274 return LoopUnrollResult::Unmodified;
275
276 LLVM_DEBUG(dbgs() << "Loop Unroll and Jam: F["
277 << L->getHeader()->getParent()->getName() << "] Loop %"
278 << L->getHeader()->getName() << "\n");
279
280 // A loop with any unroll pragma (enabling/disabling/count/etc) is left for
281 // the unroller, so long as it does not explicitly have unroll_and_jam
282 // metadata. This means #pragma nounroll will disable unroll and jam as well
283 // as unrolling
284 if (HasUnrollAndJamDisablePragma(L) ||
285 (HasAnyUnrollPragma(L, "llvm.loop.unroll.") &&
286 !HasAnyUnrollPragma(L, "llvm.loop.unroll_and_jam."))) {
287 LLVM_DEBUG(dbgs() << " Disabled due to pragma.\n");
288 return LoopUnrollResult::Unmodified;
289 }
290
291 if (!isSafeToUnrollAndJam(L, SE, DT, DI)) {
292 LLVM_DEBUG(dbgs() << " Disabled due to not being safe.\n");
293 return LoopUnrollResult::Unmodified;
294 }
295
296 // Approximate the loop size and collect useful info
297 unsigned NumInlineCandidates;
298 bool NotDuplicatable;
299 bool Convergent;
300 SmallPtrSet<const Value *, 32> EphValues;
301 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
302 unsigned InnerLoopSize =
303 ApproximateLoopSize(SubLoop, NumInlineCandidates, NotDuplicatable,
304 Convergent, TTI, EphValues, UP.BEInsns);
305 unsigned OuterLoopSize =
306 ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
307 TTI, EphValues, UP.BEInsns);
308 LLVM_DEBUG(dbgs() << " Outer Loop Size: " << OuterLoopSize << "\n");
309 LLVM_DEBUG(dbgs() << " Inner Loop Size: " << InnerLoopSize << "\n");
310 if (NotDuplicatable) {
311 LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable "
312 "instructions.\n");
313 return LoopUnrollResult::Unmodified;
314 }
315 if (NumInlineCandidates != 0) {
316 LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
317 return LoopUnrollResult::Unmodified;
318 }
319 if (Convergent) {
320 LLVM_DEBUG(
321 dbgs() << " Not unrolling loop with convergent instructions.\n");
322 return LoopUnrollResult::Unmodified;
323 }
324
325 // Find trip count and trip multiple
326 unsigned OuterTripCount = SE.getSmallConstantTripCount(L, Latch);
327 unsigned OuterTripMultiple = SE.getSmallConstantTripMultiple(L, Latch);
328 unsigned InnerTripCount = SE.getSmallConstantTripCount(SubLoop, SubLoopLatch);
329
330 // Decide if, and by how much, to unroll
331 bool IsCountSetExplicitly = computeUnrollAndJamCount(
332 L, SubLoop, TTI, DT, LI, SE, EphValues, &ORE, OuterTripCount,
333 OuterTripMultiple, OuterLoopSize, InnerTripCount, InnerLoopSize, UP);
334 if (UP.Count <= 1)
335 return LoopUnrollResult::Unmodified;
336 // Unroll factor (Count) must be less or equal to TripCount.
337 if (OuterTripCount && UP.Count > OuterTripCount)
338 UP.Count = OuterTripCount;
339
340 LoopUnrollResult UnrollResult =
341 UnrollAndJamLoop(L, UP.Count, OuterTripCount, OuterTripMultiple,
342 UP.UnrollRemainder, LI, &SE, &DT, &AC, &ORE);
343
344 // If loop has an unroll count pragma or unrolled by explicitly set count
345 // mark loop as unrolled to prevent unrolling beyond that requested.
346 if (UnrollResult != LoopUnrollResult::FullyUnrolled && IsCountSetExplicitly)
347 L->setLoopAlreadyUnrolled();
348
349 return UnrollResult;
350 }
351
352 namespace {
353
354 class LoopUnrollAndJam : public LoopPass {
355 public:
356 static char ID; // Pass ID, replacement for typeid
357 unsigned OptLevel;
358
359 LoopUnrollAndJam(int OptLevel = 2) : LoopPass(ID), OptLevel(OptLevel) {
360 initializeLoopUnrollAndJamPass(*PassRegistry::getPassRegistry());
361 }
362
363 bool runOnLoop(Loop *L, LPPassManager &LPM) override {
364 if (skipLoop(L))
365 return false;
366
367 Function &F = *L->getHeader()->getParent();
368
369 auto &DT = getAnalysis().getDomTree();
370 LoopInfo *LI = &getAnalysis().getLoopInfo();
371 ScalarEvolution &SE = getAnalysis().getSE();
372 const TargetTransformInfo &TTI =
373 getAnalysis().getTTI(F);
374 auto &AC = getAnalysis().getAssumptionCache(F);
375 auto &DI = getAnalysis().getDI();
376 // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
377 // pass. Function analyses need to be preserved across loop transformations
378 // but ORE cannot be preserved (see comment before the pass definition).
379 OptimizationRemarkEmitter ORE(&F);
380
381 LoopUnrollResult Result =
382 tryToUnrollAndJamLoop(L, DT, LI, SE, TTI, AC, DI, ORE, OptLevel);
383
384 if (Result == LoopUnrollResult::FullyUnrolled)
385 LPM.markLoopAsDeleted(*L);
386
387 return Result != LoopUnrollResult::Unmodified;
388 }
389
390 /// This transformation requires natural loop information & requires that
391 /// loop preheaders be inserted into the CFG...
392 void getAnalysisUsage(AnalysisUsage &AU) const override {
393 AU.addRequired<AssumptionCacheTracker>();
394 AU.addRequired<TargetTransformInfoWrapperPass>();
395 AU.addRequired<DependenceAnalysisWrapperPass>();
396 getLoopAnalysisUsage(AU);
397 }
398 };
399
400 } // end anonymous namespace
401
402 char LoopUnrollAndJam::ID = 0;
403
404 INITIALIZE_PASS_BEGIN(LoopUnrollAndJam, "loop-unroll-and-jam",
405 "Unroll and Jam loops", false, false)
406 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
407 INITIALIZE_PASS_DEPENDENCY(LoopPass)
408 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
409 INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
410 INITIALIZE_PASS_END(LoopUnrollAndJam, "loop-unroll-and-jam",
411 "Unroll and Jam loops", false, false)
412
413 Pass *llvm::createLoopUnrollAndJamPass(int OptLevel) {
414 return new LoopUnrollAndJam(OptLevel);
415 }
416
417 PreservedAnalyses LoopUnrollAndJamPass::run(Loop &L, LoopAnalysisManager &AM,
418 LoopStandardAnalysisResults &AR,
419 LPMUpdater &) {
420 const auto &FAM =
421 AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
422 Function *F = L.getHeader()->getParent();
423
424 auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
425 // FIXME: This should probably be optional rather than required.
426 if (!ORE)
427 report_fatal_error(
428 "LoopUnrollAndJamPass: OptimizationRemarkEmitterAnalysis not cached at "
429 "a higher level");
430
431 DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI);
432
433 LoopUnrollResult Result = tryToUnrollAndJamLoop(
434 &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, DI, *ORE, OptLevel);
435
436 if (Result == LoopUnrollResult::Unmodified)
437 return PreservedAnalyses::all();
438
439 return getLoopPassPreservedAnalyses();
440 }
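A practical consequence of the FIXME above: any new-PM pipeline that schedules this pass must already have OptimizationRemarkEmitterAnalysis cached at the function level, e.g. via a RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function> run earlier in the pipeline; otherwise run() aborts with report_fatal_error.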
164164
165165 /// Gather the various unrolling parameters based on the defaults, compiler
166166 /// flags, TTI overrides and user specified parameters.
167 static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
167 TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
168168 Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
169169 Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
170170 Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
191191 UP.Force = false;
192192 UP.UpperBound = false;
193193 UP.AllowPeeling = true;
194 UP.UnrollAndJam = false;
195 UP.UnrollAndJamInnerLoopThreshold = 60;
194196
195197 // Override with any target specific settings
196198 TTI.getUnrollingPreferences(L, SE, UP);
609611 }
610612
611613 /// ApproximateLoopSize - Approximate the size of the loop.
612 static unsigned
613 ApproximateLoopSize(const Loop *L, unsigned &NumCalls, bool &NotDuplicatable,
614 bool &Convergent, const TargetTransformInfo &TTI,
615 const SmallPtrSetImpl<const Value *> &EphValues,
616 unsigned BEInsns) {
614 unsigned llvm::ApproximateLoopSize(
615 const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent,
616 const TargetTransformInfo &TTI,
617 const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
617618 CodeMetrics Metrics;
618619 for (BasicBlock *BB : L->blocks())
619620 Metrics.analyzeBasicBlock(BB, TTI, EphValues);
706707
707708 // Returns true if unroll count was set explicitly.
708709 // Calculates unroll count and writes it to UP.Count.
709 static bool computeUnrollCount(
710 bool llvm::computeUnrollCount(
710711 Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
711712 ScalarEvolution &SE, const SmallPtrSetImpl &EphValues,
712713 OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
747748
748749 if (ExplicitUnroll && TripCount != 0) {
749750 // If the loop has an unrolling pragma, we want to be more aggressive with
750 // unrolling limits. Set thresholds to at least the PragmaThreshold value
751 // which is larger than the default limits.
751 // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
752 // value which is larger than the default limits.
752753 UP.Threshold = std::max(UP.Threshold, PragmaUnrollThreshold);
753754 UP.PartialThreshold =
754755 std::max(UP.PartialThreshold, PragmaUnrollThreshold);
6868 initializeLoopStrengthReducePass(Registry);
6969 initializeLoopRerollPass(Registry);
7070 initializeLoopUnrollPass(Registry);
71 initializeLoopUnrollAndJamPass(Registry);
7172 initializeLoopUnswitchPass(Registry);
7273 initializeLoopVersioningLICMPass(Registry);
7374 initializeLoopIdiomRecognizeLegacyPassPass(Registry);
183184 unwrap(PM)->add(createLoopUnrollPass());
184185 }
185186
187 void LLVMAddLoopUnrollAndJamPass(LLVMPassManagerRef PM) {
188 unwrap(PM)->add(createLoopUnrollAndJamPass());
189 }
190
186191 void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM) {
187192 unwrap(PM)->add(createLoopUnswitchPass());
188193 }
2727 LoopRotationUtils.cpp
2828 LoopSimplify.cpp
2929 LoopUnroll.cpp
30 LoopUnrollAndJam.cpp
3031 LoopUnrollPeel.cpp
3132 LoopUnrollRuntime.cpp
3233 LoopUtils.cpp
6262
6363 /// Convert the instruction operands from referencing the current values into
6464 /// those specified by VMap.
65 static inline void remapInstruction(Instruction *I,
66 ValueToValueMapTy &VMap) {
65 void llvm::remapInstruction(Instruction *I, ValueToValueMapTy &VMap) {
6766 for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) {
6867 Value *Op = I->getOperand(op);
6968
9796 /// Folds a basic block into its predecessor if it only has one predecessor, and
9897 /// that predecessor only has one successor.
9998 /// The LoopInfo Analysis that is passed will be kept consistent.
100 static BasicBlock *
101 foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, ScalarEvolution *SE,
102 DominatorTree *DT) {
99 BasicBlock *llvm::foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI,
100 ScalarEvolution *SE,
101 DominatorTree *DT) {
103102 // Merge basic blocks into their predecessor if there is only one distinct
104103 // pred, and if there is only one distinct successor of the predecessor, and
105104 // if there are no PHI nodes.
109108 if (OnlyPred->getTerminator()->getNumSuccessors() != 1)
110109 return nullptr;
111110
112 LLVM_DEBUG(dbgs() << "Merging: " << *BB << "into: " << *OnlyPred);
111 LLVM_DEBUG(dbgs() << "Merging: " << BB->getName() << " into "
112 << OnlyPred->getName() << "\n");
113113
114114 // Resolve any PHI nodes at the start of the block. They are all
115115 // guaranteed to have exactly one entry if they exist, unless there are
254254 /// Perform some cleanup and simplifications on loops after unrolling. It is
255255 /// useful to simplify the IV's in the new loop, as well as do a quick
256256 /// simplify/dce pass of the instructions.
257 static void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
258 ScalarEvolution *SE, DominatorTree *DT,
259 AssumptionCache *AC) {
257 void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
258 ScalarEvolution *SE, DominatorTree *DT,
259 AssumptionCache *AC) {
260260 // Simplify any new induction variables in the partially unrolled loop.
261261 if (SE && SimplifyIVs) {
262262 SmallVector DeadInsts;
472472 if (Force)
473473 RuntimeTripCount = false;
474474 else {
475 LLVM_DEBUG(dbgs() << "Wont unroll; remainder loop could not be generated"
476 "when assuming runtime trip count\n");
475 LLVM_DEBUG(dbgs() << "Won't unroll; remainder loop could not be "
476 "generated when assuming runtime trip count\n");
477477 return LoopUnrollResult::Unmodified;
478478 }
479479 }
0 //===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements loop unroll and jam as a routine, much like
10 // LoopUnroll.cpp implements loop unroll.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "llvm/ADT/SmallPtrSet.h"
15 #include "llvm/ADT/Statistic.h"
16 #include "llvm/Analysis/AssumptionCache.h"
17 #include "llvm/Analysis/DependenceAnalysis.h"
18 #include "llvm/Analysis/InstructionSimplify.h"
19 #include "llvm/Analysis/LoopAnalysisManager.h"
20 #include "llvm/Analysis/LoopIterator.h"
21 #include "llvm/Analysis/LoopPass.h"
22 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
23 #include "llvm/Analysis/ScalarEvolution.h"
24 #include "llvm/Analysis/ScalarEvolutionExpander.h"
25 #include "llvm/Analysis/Utils/Local.h"
26 #include "llvm/IR/BasicBlock.h"
27 #include "llvm/IR/DataLayout.h"
28 #include "llvm/IR/DebugInfoMetadata.h"
29 #include "llvm/IR/Dominators.h"
30 #include "llvm/IR/IntrinsicInst.h"
31 #include "llvm/IR/LLVMContext.h"
32 #include "llvm/Support/Debug.h"
33 #include "llvm/Support/raw_ostream.h"
34 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
35 #include "llvm/Transforms/Utils/Cloning.h"
36 #include "llvm/Transforms/Utils/LoopSimplify.h"
37 #include "llvm/Transforms/Utils/LoopUtils.h"
38 #include "llvm/Transforms/Utils/SimplifyIndVar.h"
39 #include "llvm/Transforms/Utils/UnrollLoop.h"
40 using namespace llvm;
41
42 #define DEBUG_TYPE "loop-unroll-and-jam"
43
44 STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
45 STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops fully unroll and jammed");
46
47 static bool containsBB(std::vector<BasicBlock *> &V, BasicBlock *BB) {
48 return std::find(V.begin(), V.end(), BB) != V.end();
49 }
50
51 // Partition blocks in an outer/inner loop pair into blocks before and after
52 // the loop
53 static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
54 std::vector &ForeBlocks,
55 std::vector &SubLoopBlocks,
56 std::vector &AftBlocks,
57 DominatorTree *DT) {
58 BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
59 SubLoopBlocks = SubLoop->getBlocks();
60
61 for (BasicBlock *BB : L->blocks()) {
62 if (!SubLoop->contains(BB)) {
63 if (DT->dominates(SubLoopLatch, BB))
64 AftBlocks.push_back(BB);
65 else
66 ForeBlocks.push_back(BB);
67 }
68 }
69
70 // Check that all blocks in ForeBlocks together dominate the subloop
71 // TODO: This might ideally be done better with a dominator/postdominators.
72 BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
73 for (BasicBlock *BB : ForeBlocks) {
74 if (BB == SubLoopPreHeader)
75 continue;
76 TerminatorInst *TI = BB->getTerminator();
77 for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
78 if (!containsBB(ForeBlocks, TI->getSuccessor(i)))
79 return false;
80 }
81
82 return true;
83 }
84
85 // Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
86 static void
87 moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header, BasicBlock *Latch,
88 Instruction *InsertLoc,
89 std::vector &AftBlocks) {
90 // We need to ensure we move the instructions in the correct order,
91 // starting with the earliest required instruction and moving forward.
92 std::vector<Instruction *> Worklist;
93 std::vector<Instruction *> Visited;
94 for (auto &Phi : Header->phis()) {
95 Value *V = Phi.getIncomingValueForBlock(Latch);
96 if (Instruction *I = dyn_cast<Instruction>(V))
97 Worklist.push_back(I);
98 }
99
100 while (!Worklist.empty()) {
101 Instruction *I = Worklist.back();
102 Worklist.pop_back();
103 if (!containsBB(AftBlocks, I->getParent()))
104 continue;
105
106 Visited.push_back(I);
107 for (auto &U : I->operands())
108 if (Instruction *II = dyn_cast<Instruction>(U))
109 Worklist.push_back(II);
110 }
111
112 // Move all instructions in program order to before the InsertLoc
113 BasicBlock *InsertLocBB = InsertLoc->getParent();
114 for (Instruction *I : reverse(Visited)) {
115 if (I->getParent() != InsertLocBB)
116 I->moveBefore(InsertLoc);
117 }
118 }
119
120 /*
121 This method performs Unroll and Jam. For a simple loop like:
122 for (i = ..)
123 Fore(i)
124 for (j = ..)
125 SubLoop(i, j)
126 Aft(i)
127
128 Instead of doing normal inner or outer unrolling, we do:
129 for (i = .., i+=2)
130 Fore(i)
131 Fore(i+1)
132 for (j = ..)
133 SubLoop(i, j)
134 SubLoop(i+1, j)
135 Aft(i)
136 Aft(i+1)
137
138 So the outer loop is essentially unrolled and then the inner loops are fused
139 ("jammed") together into a single loop. This can increase speed when there
140 are loads in SubLoop that are invariant to i, as they become shared between
141 the now jammed inner loops.
142
143 We do this by splitting the blocks in the loop into Fore, Subloop and Aft.
144 Fore blocks are those before the inner loop, Aft are those after. Normal
145 Unroll code is used to copy each of these sets of blocks and the results are
146 combined together into the final form above.
147
148 isSafeToUnrollAndJam should be used prior to calling this to make sure the
149 unrolling will be valid. Checking profitability is also advisable.
150 */
151 LoopUnrollResult
152 llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
153 unsigned TripMultiple, bool UnrollRemainder,
154 LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
155 AssumptionCache *AC, OptimizationRemarkEmitter *ORE) {
156
157 // When we enter here we should have already checked that it is safe
158 BasicBlock *Header = L->getHeader();
159 assert(L->getSubLoops().size() == 1);
160 Loop *SubLoop = *L->begin();
161
162 // Don't enter the unroll code if there is nothing to do.
163 if (TripCount == 0 && Count < 2) {
164 LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
165 return LoopUnrollResult::Unmodified;
166 }
167
168 assert(Count > 0);
169 assert(TripMultiple > 0);
170 assert(TripCount == 0 || TripCount % TripMultiple == 0);
171
172 // Are we eliminating the loop control altogether?
173 bool CompletelyUnroll = (Count == TripCount);
174
175 // We use the runtime remainder in cases where we don't know trip multiple
176 if (TripMultiple == 1 || TripMultiple % Count != 0) {
177 if (!UnrollRuntimeLoopRemainder(L, Count, false /*AllowExpensiveTripCount*/,
178 /*UseEpilogRemainder*/ true,
179 UnrollRemainder, LI, SE, DT, AC, true)) {
180 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
181 "generated when assuming runtime trip count\n");
182 return LoopUnrollResult::Unmodified;
183 }
184 }
185
186 // Notify ScalarEvolution that the loop will be substantially changed,
187 // if not outright eliminated.
188 if (SE) {
189 SE->forgetLoop(L);
190 SE->forgetLoop(SubLoop);
191 }
192
193 using namespace ore;
194 // Report the unrolling decision.
195 if (CompletelyUnroll) {
196 LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
197 << Header->getName() << " with trip count " << TripCount
198 << "!\n");
199 ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
200 L->getHeader())
201 << "completely unroll and jammed loop with "
202 << NV("UnrollCount", TripCount) << " iterations");
203 } else {
204 auto DiagBuilder = [&]() {
205 OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
206 L->getHeader());
207 return Diag << "unroll and jammed loop by a factor of "
208 << NV("UnrollCount", Count);
209 };
210
211 LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
212 << " by " << Count);
213 if (TripMultiple != 1) {
214 LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
215 ORE->emit([&]() {
216 return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
217 << " trips per branch";
218 });
219 } else {
220 LLVM_DEBUG(dbgs() << " with run-time trip count");
221 ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
222 }
223 LLVM_DEBUG(dbgs() << "!\n");
224 }
225
226 BasicBlock *Preheader = L->getLoopPreheader();
227 BasicBlock *LatchBlock = L->getLoopLatch();
228 BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
229 assert(Preheader && LatchBlock && Header);
230 assert(BI && !BI->isUnconditional());
231 bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
232 BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
233 bool SubLoopContinueOnTrue = SubLoop->contains(
234 SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
235
236 // Partition blocks in an outer/inner loop pair into blocks before and after
237 // the loop
238 std::vector<BasicBlock *> SubLoopBlocks;
239 std::vector<BasicBlock *> ForeBlocks;
240 std::vector<BasicBlock *> AftBlocks;
241 partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
242 DT);
243
244 // We keep track of the entering/first and exiting/last block of each
245 // of Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
246 // blocks easier.
247 std::vector<BasicBlock *> ForeBlocksFirst;
248 std::vector<BasicBlock *> ForeBlocksLast;
249 std::vector<BasicBlock *> SubLoopBlocksFirst;
250 std::vector<BasicBlock *> SubLoopBlocksLast;
251 std::vector<BasicBlock *> AftBlocksFirst;
252 std::vector<BasicBlock *> AftBlocksLast;
253 ForeBlocksFirst.push_back(Header);
254 ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
255 SubLoopBlocksFirst.push_back(SubLoop->getHeader());
256 SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
257 AftBlocksFirst.push_back(SubLoop->getExitBlock());
258 AftBlocksLast.push_back(L->getExitingBlock());
259 // Maps Blocks[0] -> Blocks[It]
260 ValueToValueMapTy LastValueMap;
261
262 // Move any instructions from fore phi operands from AftBlocks into Fore.
263 moveHeaderPhiOperandsToForeBlocks(
264 Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
265 AftBlocks);
266
267 // The current on-the-fly SSA update requires blocks to be processed in
268 // reverse postorder so that LastValueMap contains the correct value at each
269 // exit.
270 LoopBlocksDFS DFS(L);
271 DFS.perform(LI);
272 // Stash the DFS iterators before adding blocks to the loop.
273 LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
274 LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
275
276 if (Header->getParent()->isDebugInfoForProfiling())
277 for (BasicBlock *BB : L->getBlocks())
278 for (Instruction &I : *BB)
279 if (!isa(&I))
280 if (const DILocation *DIL = I.getDebugLoc())
281 I.setDebugLoc(DIL->cloneWithDuplicationFactor(Count));
282
283 // Copy all blocks
284 for (unsigned It = 1; It != Count; ++It) {
285 std::vector<BasicBlock *> NewBlocks;
286 // Maps Blocks[It] -> Blocks[It-1]
287 DenseMap<Value *, Value *> PrevItValueMap;
288
289 for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
290 ValueToValueMapTy VMap;
291 BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
292 Header->getParent()->getBasicBlockList().push_back(New);
293
294 if (containsBB(ForeBlocks, *BB)) {
295 L->addBasicBlockToLoop(New, *LI);
296
297 if (*BB == ForeBlocksFirst[0])
298 ForeBlocksFirst.push_back(New);
299 if (*BB == ForeBlocksLast[0])
300 ForeBlocksLast.push_back(New);
301 } else if (containsBB(SubLoopBlocks, *BB)) {
302 SubLoop->addBasicBlockToLoop(New, *LI);
303
304 if (*BB == SubLoopBlocksFirst[0])
305 SubLoopBlocksFirst.push_back(New);
306 if (*BB == SubLoopBlocksLast[0])
307 SubLoopBlocksLast.push_back(New);
308 } else if (containsBB(AftBlocks, *BB)) {
309 L->addBasicBlockToLoop(New, *LI);
310
311 if (*BB == AftBlocksFirst[0])
312 AftBlocksFirst.push_back(New);
313 if (*BB == AftBlocksLast[0])
314 AftBlocksLast.push_back(New);
315 } else {
316 llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
317 }
318
319 // Update our running maps of newest clones
320 PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
321 LastValueMap[*BB] = New;
322 for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
323 VI != VE; ++VI) {
324 PrevItValueMap[VI->second] =
325 const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
326 LastValueMap[VI->first] = VI->second;
327 }
328
329 NewBlocks.push_back(New);
330
331 // Update DomTree:
332 if (*BB == ForeBlocksFirst[0])
333 DT->addNewBlock(New, ForeBlocksLast[It - 1]);
334 else if (*BB == SubLoopBlocksFirst[0])
335 DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
336 else if (*BB == AftBlocksFirst[0])
337 DT->addNewBlock(New, AftBlocksLast[It - 1]);
338 else {
339 // Each set of blocks (Fore/Sub/Aft) will have the same
340 // internal domtree structure.
341 auto BBDomNode = DT->getNode(*BB);
342 auto BBIDom = BBDomNode->getIDom();
343 BasicBlock *OriginalBBIDom = BBIDom->getBlock();
344 assert(OriginalBBIDom);
345 assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
346 DT->addNewBlock(
347 New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
348 }
349 }
350
351 // Remap all instructions in the most recent iteration
352 for (BasicBlock *NewBlock : NewBlocks) {
353 for (Instruction &I : *NewBlock) {
354 ::remapInstruction(&I, LastValueMap);
355 if (auto *II = dyn_cast(&I))
356 if (II->getIntrinsicID() == Intrinsic::assume)
357 AC->registerAssumption(II);
358 }
359 }
360
361 // Alter the ForeBlocks phi's, pointing them at the latest version of the
362 // value from the previous iteration's phis
363 for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
364 Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
365 assert(OldValue && "should have incoming edge from Aft[It]");
366 Value *NewValue = OldValue;
367 if (Value *PrevValue = PrevItValueMap[OldValue])
368 NewValue = PrevValue;
369
370 assert(Phi.getNumOperands() == 2);
371 Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
372 Phi.setIncomingValue(0, NewValue);
373 Phi.removeIncomingValue(1);
374 }
375 }
376
377 // Now that all the basic blocks for the unrolled iterations are in place,
378 // finish up connecting the blocks and phi nodes. At this point LastValueMap
379 // is the last unrolled iterations values.
380
381 // Update Phis in BB from OldBB to point to NewBB
382 auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
383 BasicBlock *NewBB) {
384 for (PHINode &Phi : BB->phis()) {
385 int I = Phi.getBasicBlockIndex(OldBB);
386 Phi.setIncomingBlock(I, NewBB);
387 }
388 };
389 // Update Phis in BB from OldBB to point to NewBB and use the latest value
390 // from LastValueMap
391 auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
392 BasicBlock *NewBB,
393 ValueToValueMapTy &LastValueMap) {
394 for (PHINode &Phi : BB->phis()) {
395 for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
396 if (Phi.getIncomingBlock(b) == OldBB) {
397 Value *OldValue = Phi.getIncomingValue(b);
398 if (Value *LastValue = LastValueMap[OldValue])
399 Phi.setIncomingValue(b, LastValue);
400 Phi.setIncomingBlock(b, NewBB);
401 break;
402 }
403 }
404 }
405 };
406 // Move all the phis from Src into Dest
407 auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
408 Instruction *insertPoint = Dest->getFirstNonPHI();
409 while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
410 Phi->moveBefore(insertPoint);
411 };
412
413 // Update the PHI values outside the loop to point to the last block
414 updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
415 LastValueMap);
416
417 // Update ForeBlocks successors and phi nodes
418 BranchInst *ForeTerm =
419 cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
420 BasicBlock *Dest = SubLoopBlocksFirst[0];
421 ForeTerm->setSuccessor(0, Dest);
422
423 if (CompletelyUnroll) {
424 while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
425 Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
426 Phi->getParent()->getInstList().erase(Phi);
427 }
428 } else {
429 // Update the PHI values to point to the last aft block
430 updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
431 AftBlocksLast.back(), LastValueMap);
432 }
433
434 for (unsigned It = 1; It != Count; It++) {
435 // Remap ForeBlock successors from previous iteration to this
436 BranchInst *ForeTerm =
437 cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
438 BasicBlock *Dest = ForeBlocksFirst[It];
439 ForeTerm->setSuccessor(0, Dest);
440 }
441
442 // Subloop successors and phis
443 BranchInst *SubTerm =
444 cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
445 SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
446 SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
447 updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
448 ForeBlocksLast.back());
449 updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
450 SubLoopBlocksLast.back());
451
452 for (unsigned It = 1; It != Count; It++) {
453 // Replace the conditional branch of the previous iteration subloop
454 // with an unconditional one to this one
455 BranchInst *SubTerm =
456 cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
457 BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
458 SubTerm->eraseFromParent();
459
460 updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
461 ForeBlocksLast.back());
462 updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
463 SubLoopBlocksLast.back());
464 movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
465 }
466
467 // Aft blocks successors and phis
468 BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
469 if (CompletelyUnroll) {
470 BranchInst::Create(LoopExit, Term);
471 Term->eraseFromParent();
472 } else {
473 Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
474 }
475 updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
476 SubLoopBlocksLast.back());
477
478 for (unsigned It = 1; It != Count; It++) {
479 // Replace the conditional branch of the previous iteration subloop
480 // with an unconditional one to this one
481 BranchInst *AftTerm =
482 cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
483 BranchInst::Create(AftBlocksFirst[It], AftTerm);
484 AftTerm->eraseFromParent();
485
486 updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
487 SubLoopBlocksLast.back());
488 movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
489 }
490
491 // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
492 // new ones required.
493 if (Count != 1) {
494 SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
495 DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
496 SubLoopBlocksFirst[0]);
497 DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
498 SubLoopBlocksLast[0], AftBlocksFirst[0]);
499
500 DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
501 ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
502 DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
503 SubLoopBlocksLast.back(), AftBlocksFirst[0]);
504 DT->applyUpdates(DTUpdates);
505 }
506
507 // Merge adjacent basic blocks, if possible.
508 SmallPtrSet<BasicBlock *, 16> MergeBlocks;
509 MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
510 MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
511 MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
512 while (!MergeBlocks.empty()) {
513 BasicBlock *BB = *MergeBlocks.begin();
514 BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
515 if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
516 BasicBlock *Dest = Term->getSuccessor(0);
517 if (BasicBlock *Fold = foldBlockIntoPredecessor(Dest, LI, SE, DT)) {
518 // Don't remove BB and add Fold as they are the same BB
519 assert(Fold == BB);
520 (void)Fold;
521 MergeBlocks.erase(Dest);
522 } else
523 MergeBlocks.erase(BB);
524 } else
525 MergeBlocks.erase(BB);
526 }
527
528 // At this point, the code is well formed. We now do a quick sweep over the
529 // inserted code, doing constant propagation and dead code elimination as we
530 // go.
531 simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
532 simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);
533
534 NumCompletelyUnrolledAndJammed += CompletelyUnroll;
535 ++NumUnrolledAndJammed;
536
537 #ifndef NDEBUG
538 Loop *OuterL = L->getParentLoop();
539 #endif
540
541 // Update LoopInfo if the loop is completely removed.
542 if (CompletelyUnroll)
543 LI->erase(L);
544
545 #ifndef NDEBUG
546 // We shouldn't have done anything to break loop simplify form or LCSSA.
547 Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
548 assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
549 if (!CompletelyUnroll)
550 assert(L->isLoopSimplifyForm());
551 assert(SubLoop->isLoopSimplifyForm());
552 assert(DT->verify());
553 #endif
554
555 return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
556 : LoopUnrollResult::PartiallyUnrolled;
557 }
558
559 static bool getLoadsAndStores(std::vector<BasicBlock *> &Blocks,
560 SmallVector<Value *, 4> &MemInstr) {
561 // Scan the BBs and collect legal loads and stores.
562 // Returns false if non-simple loads/stores are found.
563 for (BasicBlock *BB : Blocks) {
564 for (Instruction &I : *BB) {
565 if (auto *Ld = dyn_cast(&I)) {
566 if (!Ld->isSimple())
567 return false;
568 MemInstr.push_back(&I);
569 } else if (auto *St = dyn_cast(&I)) {
570 if (!St->isSimple())
571 return false;
572 MemInstr.push_back(&I);
573 } else if (I.mayReadOrWriteMemory()) {
574 return false;
575 }
576 }
577 }
578 return true;
579 }
580
581 static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
582 SmallVector<Value *, 4> &Later,
583 unsigned LoopDepth, bool InnerLoop,
584 DependenceInfo &DI) {
585 // Use DA to check for dependencies between loads and
586 // stores that make unroll and jam invalid
587 for (Value *I : Earlier) {
588 for (Value *J : Later) {
589 Instruction *Src = cast<Instruction>(I);
590 Instruction *Dst = cast<Instruction>(J);
591 if (Src == Dst)
592 continue;
593 // Ignore Input dependencies.
594 if (isa(Src) && isa(Dst))
595 continue;
596
597 // Track dependencies, and if we find them take a conservative approach
598 // by allowing only = or > (not <), although some < would be safe
599 // (depending upon unroll width).
600 // FIXME: Allow < so long as distance is less than unroll width
601 if (auto D = DI.depends(Src, Dst, true)) {
602 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
603
604 if (D->isConfused())
605 return false;
606 if (!InnerLoop) {
607 if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT)
608 return false;
609 } else {
610 assert(LoopDepth + 1 <= D->getLevels());
611 if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
612 D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT)
613 return false;
614 }
615 }
616 }
617 }
618 return true;
619 }
620
621 static bool checkDependencies(Loop *L, std::vector<BasicBlock *> &ForeBlocks,
622 std::vector<BasicBlock *> &SubLoopBlocks,
623 std::vector<BasicBlock *> &AftBlocks,
624 DependenceInfo &DI) {
625 // Get all loads/store pairs for each blocks
626 SmallVector<Value *, 4> ForeMemInstr;
627 SmallVector<Value *, 4> SubLoopMemInstr;
628 SmallVector<Value *, 4> AftMemInstr;
629 if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
630 !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
631 !getLoadsAndStores(AftBlocks, AftMemInstr))
632 return false;
633
634 // Check for dependencies between any blocks that may change order
635 unsigned LoopDepth = L->getLoopDepth();
636 return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
637 DI) &&
638 checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
639 checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
640 DI) &&
641 checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
642 DI);
643 }
644
645 bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
646 DependenceInfo &DI) {
647 /* We currently handle outer loops like this:
648 |
649 ForeFirst <----\ }
650 Blocks | } ForeBlocks
651 ForeLast | }
652 | |
653 SubLoopFirst <\ | }
654 Blocks | | } SubLoopBlocks
655 SubLoopLast -/ | }
656 | |
657 AftFirst | }
658 Blocks | } AftBlocks
659 AftLast ------/ }
660 |
661
662 There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
663 and AftBlocks, providing that there is one edge from Fores to SubLoops,
664 one edge from SubLoops to Afts and a single outer loop exit (from Afts).
665 In practice we currently limit Aft blocks to a single block, and limit
666 things further in the profitability checks of the unroll and jam pass.
667
668 Because of the way we rearrange basic blocks, we also require that
669 the Fore blocks on all unrolled iterations are safe to move before the
670 SubLoop blocks of all iterations. So we require that the phi node looping
671 operands of ForeHeader can be moved to at least the end of ForeEnd, so that
672 we can arrange cloned Fore Blocks before the subloop and match up Phi's
673 correctly.
674
675 i.e. The old order of blocks used to be F1 S1 S1 S1 A1 F2 S2 S2 S2 A2.
676 It needs to be safe to transform this to F1 F2 S1 S2 S1 S2 S1 S2 A1 A2.
677
678 There are then a number of checks along the lines of no calls, no
679 exceptions, inner loop IV is consistent, etc.
680 */
681
682 if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
683 return false;
684 Loop *SubLoop = L->getSubLoops()[0];
685 if (!SubLoop->isLoopSimplifyForm())
686 return false;
687
688 BasicBlock *PreHeader = L->getLoopPreheader();
689 BasicBlock *Header = L->getHeader();
690 BasicBlock *Latch = L->getLoopLatch();
691 BasicBlock *Exit = L->getExitingBlock();
692 BasicBlock *SubLoopHeader = SubLoop->getHeader();
693 BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
694 BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
695
696 if (Latch != Exit)
697 return false;
698 if (SubLoopLatch != SubLoopExit)
699 return false;
700
701 if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken())
702 return false;
703
704 // Split blocks into Fore/SubLoop/Aft based on dominators
705 std::vector<BasicBlock *> SubLoopBlocks;
706 std::vector<BasicBlock *> ForeBlocks;
707 std::vector<BasicBlock *> AftBlocks;
708 if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
709 AftBlocks, &DT))
710 return false;
711
712 // Aft blocks may need to move instructions to fore blocks, which
713 // becomes more difficult if there are multiple (potentially conditionally
714 // executed) blocks. For now we just exclude loops with multiple aft blocks.
715 if (AftBlocks.size() != 1)
716 return false;
717
718 // Check that the outer loop induction variable is easily calculable
719 const SCEV *BECountSC = SE.getExitCount(L, Latch);
720 if (isa<SCEVCouldNotCompute>(BECountSC) ||
721 !BECountSC->getType()->isIntegerTy())
722 return false;
723 // Add 1 since the backedge count doesn't include the first loop iteration.
724 const SCEV *TripCountSC =
725 SE.getAddExpr(BECountSC, SE.getConstant(BECountSC->getType(), 1));
726 if (isa<SCEVCouldNotCompute>(TripCountSC))
727 return false;
728 BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
729 const DataLayout &DL = Header->getModule()->getDataLayout();
730 SCEVExpander Expander(SE, DL, "loop-unroll");
731 if (Expander.isHighCostExpansion(TripCountSC, L, PreHeaderBR))
732 return false;
733
734 // Check that the inner loop trip count is consistent across all outer iterations
735 const SCEV *SubLoopBECountSC = SE.getExitCount(SubLoop, SubLoopLatch);
736 if (isa<SCEVCouldNotCompute>(SubLoopBECountSC) ||
737 !SubLoopBECountSC->getType()->isIntegerTy())
738 return false;
739 ScalarEvolution::LoopDisposition LD =
740 SE.getLoopDisposition(SubLoopBECountSC, L);
741 if (LD != ScalarEvolution::LoopInvariant)
742 return false;
743
744 // Check the loop safety info for exceptions.
745 LoopSafetyInfo LSI;
746 computeLoopSafetyInfo(&LSI, L);
747 if (LSI.MayThrow)
748 return false;
749
750 // We've ruled out the easy stuff, and need to check that there
751 // are no interdependencies which may prevent us from moving
752 // the following:
753 //   ForeBlocks before the Subloop and AftBlocks.
754 //   The Subloop before AftBlocks.
755 //   ForeBlock phi operands before the subloop.
756
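// For example (illustrative): if the value feeding a header phi is
// computed in the aft block from a load, that load would have to move
// above the cloned subloop, which the checks below forbid.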
757 // Make sure we can move all instructions we need to before the subloop
758 SmallVector<Instruction *, 8> Worklist;
759 SmallPtrSet<Instruction *, 4> Visited;
760 for (auto &Phi : Header->phis()) {
761 Value *V = Phi.getIncomingValueForBlock(Latch);
762 if (Instruction *I = dyn_cast<Instruction>(V))
763 Worklist.push_back(I);
764 }
765 while (!Worklist.empty()) {
766 Instruction *I = Worklist.back();
767 Worklist.pop_back();
768 if (Visited.insert(I).second) {
769 if (SubLoop->contains(I->getParent()))
770 return false;
771 if (containsBB(AftBlocks, I->getParent())) {
772 // If we hit a phi node in the aft blocks we know we are done (probably LCSSA)
773 if (isa<PHINode>(I))
774 return false;
775 if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
776 return false;
777 for (auto &U : I->operands())
778 if (Instruction *II = dyn_cast<Instruction>(U))
779 Worklist.push_back(II);
780 }
781 }
782 }
783
784 // Check for memory dependencies which prohibit the unrolling
785 // we are doing. Because of the way we are unrolling Fore/Sub/Aft
786 // blocks, we need to check there are no dependencies between
787 // Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
788 if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI))
789 return false;
790
791 return true;
792 }
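
For contrast, a minimal C sketch (again illustrative, assuming A and B do not alias) of the shape isSafeToUnrollAndJam accepts: one fore section, one inner loop, and a single aft block that is both latch and exit, with every dependence confined to a single outer iteration.

  for (int i = 0; i < n; ++i) {
    int sum = 0;                 /* Fore    */
    for (int j = 0; j < m; ++j)  /* SubLoop */
      sum += B[j] * i;
    A[i] = sum;                  /* Aft     */
  }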
0 ; RUN: opt -basicaa -loop-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s
1
2 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
3 target triple = "thumbv8m.main-arm-none-eabi"
4
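; Each test below stores to A[i] in one section (fore/sub/aft) and to
; A[i-1] ("less"), A[i] ("eq") or A[i+1] ("more") in another. The CHECK
; lines verify which combinations are unroll and jammed (extra %j.N phis
; appear) and which are rejected.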
5 ; CHECK-LABEL: fore_aft_less
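; Fore stores A[i], Aft stores A[i-1]: a backwards dependence, safe to unroll and jam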
6 ; CHECK: %j = phi
7 ; CHECK: %j.1 = phi
8 ; CHECK: %j.2 = phi
9 ; CHECK: %j.3 = phi
10 define void @fore_aft_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
11 entry:
12 %cmp = icmp sgt i32 %N, 0
13 br i1 %cmp, label %for.outer, label %cleanup
14
15 for.outer:
16 %i = phi i32 [ %add7.us, %for.latch ], [ 0, %entry ]
17 %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %i
18 store i32 1, i32* %arrayidx.us, align 4
19 br label %for.inner
20
21 for.inner:
22 %j = phi i32 [ %add6.us, %for.inner ], [ 0, %for.outer ]
23 %sum = phi i32 [ %add.us, %for.inner ], [ 0, %for.outer ]
24 %arrayidx5.us = getelementptr inbounds i32, i32* %B, i32 %j
25 %0 = load i32, i32* %arrayidx5.us, align 4
26 %mul.us = mul nsw i32 %0, %i
27 %add.us = add nsw i32 %mul.us, %sum
28 %add6.us = add nuw nsw i32 %j, 1
29 %exitcond.us = icmp eq i32 %add6.us, %N
30 br i1 %exitcond.us, label %for.latch, label %for.inner
31
32 for.latch:
33 %add7.us = add nuw nsw i32 %i, 1
34 %add7.us2 = add nuw nsw i32 %i, -1
35 %arrayidx8.us = getelementptr inbounds i32, i32* %A, i32 %add7.us2
36 store i32 %add.us, i32* %arrayidx8.us, align 4
37 %exitcond29.us = icmp eq i32 %add7.us, %N
38 br i1 %exitcond29.us, label %cleanup, label %for.outer
39
40 cleanup:
41 ret void
42 }
43
44 ; CHECK-LABEL: fore_aft_eq
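; Fore and Aft both store to A[i]: the dependence stays within one iteration, safe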
45 ; CHECK: %j = phi
46 ; CHECK: %j.1 = phi
47 ; CHECK: %j.2 = phi
48 ; CHECK: %j.3 = phi
49 define void @fore_aft_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
50 entry:
51 %cmp = icmp sgt i32 %N, 0
52 br i1 %cmp, label %for.outer, label %cleanup
53
54 for.outer:
55 %i = phi i32 [ %add7.us, %for.latch ], [ 0, %entry ]
56 %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %i
57 store i32 1, i32* %arrayidx.us, align 4
58 br label %for.inner
59
60 for.inner:
61 %j = phi i32 [ %add6.us, %for.inner ], [ 0, %for.outer ]
62 %sum = phi i32 [ %add.us, %for.inner ], [ 0, %for.outer ]
63 %arrayidx5.us = getelementptr inbounds i32, i32* %B, i32 %j
64 %0 = load i32, i32* %arrayidx5.us, align 4
65 %mul.us = mul nsw i32 %0, %i
66 %add.us = add nsw i32 %mul.us, %sum
67 %add6.us = add nuw nsw i32 %j, 1
68 %exitcond.us = icmp eq i32 %add6.us, %N
69 br i1 %exitcond.us, label %for.latch, label %for.inner
70
71 for.latch:
72 %add7.us = add nuw nsw i32 %i, 1
73 %add7.us2 = add nuw nsw i32 %i, 0
74 %arrayidx8.us = getelementptr inbounds i32, i32* %A, i32 %i
75 store i32 %add.us, i32* %arrayidx8.us, align 4
76 %exitcond29.us = icmp eq i32 %add7.us, %N
77 br i1 %exitcond29.us, label %cleanup, label %for.outer
78
79 cleanup:
80 ret void
81 }
82
83
84 ; CHECK-LABEL: fore_aft_more
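; Fore stores A[i], Aft stores A[i+1]: jamming would move Fore(i+1) above Aft(i), unsafe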
85 ; CHECK: %j = phi
86 ; CHECK-NOT: %j.1 = phi
87 define void @fore_aft_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
88 entry:
89 %cmp = icmp sgt i32 %N, 0
90 br i1 %cmp, label %for.outer, label %cleanup
91
92 for.outer:
93 %i = phi i32 [ %add7.us, %for.latch ], [ 0, %entry ]
94 %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %i
95 store i32 1, i32* %arrayidx.us, align 4
96 br label %for.inner
97
98 for.inner:
99 %j = phi i32 [ %add6.us, %for.inner ], [ 0, %for.outer ]
100 %sum = phi i32 [ %add.us, %for.inner ], [ 0, %for.outer ]
101 %arrayidx5.us = getelementptr inbounds i32, i32* %B, i32 %j
102 %0 = load i32, i32* %arrayidx5.us, align 4
103 %mul.us = mul nsw i32 %0, %i
104 %add.us = add nsw i32 %mul.us, %sum
105 %add6.us = add nuw nsw i32 %j, 1
106 %exitcond.us = icmp eq i32 %add6.us, %N
107 br i1 %exitcond.us, label %for.latch, label %for.inner
108
109 for.latch:
110 %add7.us = add nuw nsw i32 %i, 1
111 %add7.us2 = add nuw nsw i32 %i, 1
112 %arrayidx8.us = getelementptr inbounds i32, i32* %A, i32 %add7.us2
113 store i32 %add.us, i32* %arrayidx8.us, align 4
114 %exitcond29.us = icmp eq i32 %add7.us, %N
115 br i1 %exitcond29.us, label %cleanup, label %for.outer
116
117 cleanup:
118 ret void
119 }
120
121
122 ; CHECK-LABEL: fore_sub_less
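; Fore stores A[i], the inner loop stores A[i-1]: safe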
123 ; CHECK: %j = phi
124 ; CHECK: %j.1 = phi
125 ; CHECK: %j.2 = phi
126 ; CHECK: %j.3 = phi
127 define void @fore_sub_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
128 entry:
129 %cmp = icmp sgt i32 %N, 0
130 br i1 %cmp, label %for.outer, label %cleanup
131
132 for.outer:
133 %i = phi i32 [ %add7.us, %for.latch ], [ 0, %entry ]
134 %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %i
135 store i32 1, i32* %arrayidx.us, align 4
136 br label %for.inner
137
138 for.inner:
139 %j = phi i32 [ %add6.us, %for.inner ], [ 0, %for.outer ]
140 %sum = phi i32 [ %add.us, %for.inner ], [ 0, %for.outer ]
141 %arrayidx5.us = getelementptr inbounds i32, i32* %B, i32 %j
142 %0 = load i32, i32* %arrayidx5.us, align 4
143 %mul.us = mul nsw i32 %0, %i
144 %add.us = add nsw i32 %mul.us, %sum
145 %add7.us2 = add nuw nsw i32 %i, -1
146 %arrayidx8.us = getelementptr inbounds i32, i32* %A, i32 %add7.us2
147 store i32 %add.us, i32* %arrayidx8.us, align 4
148 %add6.us = add nuw nsw i32 %j, 1
149 %exitcond.us = icmp eq i32 %add6.us, %N
150 br i1 %exitcond.us, label %for.latch, label %for.inner
151
152 for.latch:
153 %add7.us = add nuw nsw i32 %i, 1
154 %exitcond29.us = icmp eq i32 %add7.us, %N
155 br i1 %exitcond29.us, label %cleanup, label %for.outer
156
157 cleanup:
158 ret void
159 }
160
161 ; CHECK-LABEL: fore_eq_less
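; Fore stores A[i], the inner loop also stores A[i]: safe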
162 ; CHECK: %j = phi
163 ; CHECK: %j.1 = phi
164 ; CHECK: %j.2 = phi
165 ; CHECK: %j.3 = phi
166 define void @fore_eq_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
167 entry:
168 %cmp = icmp sgt i32 %N, 0
169 br i1 %cmp, label %for.outer, label %cleanup
170
171 for.outer:
172 %i = phi i32 [ %add7.us, %for.latch ], [ 0, %entry ]
173 %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %i
174 store i32 1, i32* %arrayidx.us, align 4
175 br label %for.inner
176
177 for.inner:
178 %j = phi i32 [ %add6.us, %for.inner ], [ 0, %for.outer ]
179 %sum = phi i32 [ %add.us, %for.inner ], [ 0, %for.outer ]
180 %arrayidx5.us = getelementptr inbounds i32, i32* %B, i32 %j
181 %0 = load i32, i32* %arrayidx5.us, align 4
182 %mul.us = mul nsw i32 %0, %i
183 %add.us = add nsw i32 %mul.us, %sum
184 %add7.us2 = add nuw nsw i32 %i, 0
185 %arrayidx8.us = getelementptr inbounds i32, i32* %A, i32 %add7.us2
186 store i32 %add.us, i32* %arrayidx8.us, align 4
187 %add6.us = add nuw nsw i32 %j, 1
188 %exitcond.us = icmp eq i32 %add6.us, %N
189 br i1 %exitcond.us, label %for.latch, label %for.inner
190
191 for.latch:
192 %add7.us = add nuw nsw i32 %i, 1
193 %exitcond29.us = icmp eq i32 %add7.us, %N
194 br i1 %exitcond29.us, label %cleanup, label %for.outer
195
196 cleanup:
197 ret void
198 }
199
200 ; CHECK-LABEL: fore_sub_more
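; Fore stores A[i], the inner loop stores A[i+1]: unsafe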
201 ; CHECK: %j = phi
202 ; CHECK-NOT: %j.1 = phi
203 define void @fore_sub_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
204 entry:
205 %cmp = icmp sgt i32 %N, 0
206 br i1 %cmp, label %for.outer, label %cleanup
207
208 for.outer:
209 %i = phi i32 [ %add7.us, %for.latch ], [ 0, %entry ]
210 %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %i
211 store i32 1, i32* %arrayidx.us, align 4
212 br label %for.inner
213
214 for.inner:
215 %j = phi i32 [ %add6.us, %for.inner ], [ 0, %for.outer ]
216 %sum = phi i32 [ %add.us, %for.inner ], [ 0, %for.outer ]
217 %arrayidx5.us = getelementptr inbounds i32, i32* %B, i32 %j
218 %0 = load i32, i32* %arrayidx5.us, align 4
219 %mul.us = mul nsw i32 %0, %i
220 %add.us = add nsw i32 %mul.us, %sum
221 %add7.us2 = add nuw nsw i32 %i, 1
222 %arrayidx8.us = getelementptr inbounds i32, i32* %A, i32 %add7.us2
223 store i32 %add.us, i32* %arrayidx8.us, align 4
224 %add6.us = add nuw nsw i32 %j, 1
225 %exitcond.us = icmp eq i32 %add6.us, %N
226 br i1 %exitcond.us, label %for.latch, label %for.inner
227
228 for.latch:
229 %add7.us = add nuw nsw i32 %i, 1
230 %exitcond29.us = icmp eq i32 %add7.us, %N
231 br i1 %exitcond29.us, label %cleanup, label %for.outer
232
233 cleanup:
234 ret void
235 }
236
237 ; CHECK-LABEL: sub_aft_less
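; The inner loop stores A[i], Aft stores A[i-1]: safe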
238 ; CHECK: %j = phi
239 ; CHECK: %j.1 = phi
240 ; CHECK: %j.2 = phi
241 ; CHECK: %j.3 = phi
242 define void @sub_aft_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
243 entry:
244 %cmp = icmp sgt i32 %N, 0
245 br i1 %cmp, label %for.outer, label %cleanup
246
247 for.outer:
248 %i = phi i32 [ %add7.us, %for.latch ], [ 0, %entry ]
249 br label %for.inner
250
251 for.inner:
252 %j = phi i32 [ %add6.us, %for.inner ], [ 0, %for.outer ]
253 %sum = phi i32 [ %add.us, %for.inner ], [ 0, %for.outer ]
254 %arrayidx5.us = getelementptr inbounds i32, i32* %B, i32 %j
255 %0 = load i32, i32* %arrayidx5.us, align 4
256 %mul.us = mul nsw i32 %0, %i
257 %add.us = add nsw i32 %mul.us, %sum
258 %add6.us = add nuw nsw i32 %j, 1
259 %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %i
260 store i32 1, i32* %arrayidx.us, align 4
261 %exitcond.us = icmp eq i32 %add6.us, %N
262 br i1 %exitcond.us, label %for.latch, label %for.inner
263
264 for.latch:
265 %add7.us = add nuw nsw i32 %i, 1
266 %add7.us2 = add nuw nsw i32 %i, -1
267 %arrayidx8.us = getelementptr inbounds i32, i32* %A, i32 %add7.us2
268 store i32 %add.us, i32* %arrayidx8.us, align 4
269 %exitcond29.us = icmp eq i32 %add7.us, %N
270 br i1 %exitcond29.us, label %cleanup, label %for.outer
271
272 cleanup:
273 ret void
274 }
275
276 ; CHECK-LABEL: sub_aft_eq
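; The inner loop and Aft both store to A[i]: safe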
277 ; CHECK: %j = phi
278 ; CHECK: %j.1 = phi
279 ; CHECK: %j.2 = phi
280 ; CHECK: %j.3 = phi
281 define void @sub_aft_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
282 entry:
283 %cmp = icmp sgt i32 %N, 0
284 br i1 %cmp, label %for.outer, label %cleanup
285
286 for.outer:
287 %i = phi i32 [ %add7.us, %for.latch ], [ 0, %entry ]
288 br label %for.inner
289
290 for.inner:
291 %j = phi i32 [ %add6.us, %for.inner ], [ 0, %for.outer ]
292 %sum = phi i32 [ %add.us, %for.inner ], [ 0, %for.outer ]
293 %arrayidx5.us = getelementptr inbounds i32, i32* %B, i32 %j
294 %0 = load i32, i32* %arrayidx5.us, align 4
295 %mul.us = mul nsw i32 %0, %i
296 %add.us = add nsw i32 %mul.us, %sum
297 %add6.us = add nuw nsw i32 %j, 1
298 %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %i
299 store i32 1, i32* %arrayidx.us, align 4
300 %exitcond.us = icmp eq i32 %add6.us, %N
301 br i1 %exitcond.us, label %for.latch, label %for.inner
302
303 for.latch:
304 %add7.us = add nuw nsw i32 %i, 1
305 %add7.us2 = add nuw nsw i32 %i, 0
306 %arrayidx8.us = getelementptr inbounds i32, i32* %A, i32 %i
307 store i32 %add.us, i32* %arrayidx8.us, align 4
308 %exitcond29.us = icmp eq i32 %add7.us, %N
309 br i1 %exitcond29.us, label %cleanup, label %for.outer
310
311 cleanup:
312 ret void
313 }
314
315
316 ; CHECK-LABEL: sub_aft_more
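; The inner loop stores A[i], Aft stores A[i+1]: unsafe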
317 ; CHECK: %j = phi
318 ; CHECK-NOT: %j.1 = phi
319 define void @sub_aft_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
320 entry:
321 %cmp = icmp sgt i32 %N, 0
322 br i1 %cmp, label %for.outer, label %cleanup
323
324 for.outer:
325 %i = phi i32 [ %add7.us, %for.latch ], [ 0, %entry ]
326 br label %for.inner
327
328 for.inner:
329 %j = phi i32 [ %add6.us, %for.inner ], [ 0, %for.outer ]
330 %sum = phi i32 [ %add.us, %for.inner ], [ 0, %for.outer ]
331 %arrayidx5.us = getelementptr inbounds i32, i32* %B, i32 %j
332 %0 = load i32, i32* %arrayidx5.us, align 4
333 %mul.us = mul nsw i32 %0, %i
334 %add.us = add nsw i32 %mul.us, %sum
335 %add6.us = add nuw nsw i32 %j, 1
336 %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %i
337 store i32 1, i32* %arrayidx.us, align 4
338 %exitcond.us = icmp eq i32 %add6.us, %N
339 br i1 %exitcond.us, label %for.latch, label %for.inner
340
341 for.latch:
342 %add7.us = add nuw nsw i32 %i, 1
343 %add7.us2 = add nuw nsw i32 %i, 1
344 %arrayidx8.us = getelementptr inbounds i32, i32* %A, i32 %add7.us2
345 store i32 %add.us, i32* %arrayidx8.us, align 4
346 %exitcond29.us = icmp eq i32 %add7.us, %N
347 br i1 %exitcond29.us, label %cleanup, label %for.outer
348
349 cleanup:
350 ret void
351 }
352
353
354 ; CHECK-LABEL: sub_sub_less
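; The inner loop stores both A[i] and A[i-1]: jammed inner loops would
; interleave stores to the same location from adjacent outer iterations, unsafe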
355 ; CHECK: %j = phi
356 ; CHECK-NOT: %j.1 = phi
357 define void @sub_sub_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
358 entry:
359 %cmp = icmp sgt i32 %N, 0
360 br i1 %cmp, label %for.outer, label %cleanup
361
362 for.outer:
363 %i = phi i32 [ %add7.us, %for.latch ], [ 0, %entry ]
364 br label %for.inner
365
366 for.inner:
367 %j = phi i32 [ %add6.us, %for.inner ], [ 0, %for.outer ]
368 %sum = phi i32 [ %add.us, %for.inner ], [ 0, %for.outer ]
369 %arrayidx5.us = getelementptr inbounds i32, i32* %B, i32 %j
370 %0 = load i32, i32* %arrayidx5.us, align 4
371 %mul.us = mul nsw i32 %0, %i
372 %add.us = add nsw i32 %mul.us, %sum
373 %add6.us = add nuw nsw i32 %j, 1
374 %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %i
375 store i32 1, i32* %arrayidx.us, align 4
376 %add7.us2 = add nuw nsw i32 %i, -1
377 %arrayidx8.us = getelementptr inbounds i32, i32* %A, i32 %add7.us2
378 store i32 %add.us, i32* %arrayidx8.us, align 4
379 %exitcond.us = icmp eq i32 %add6.us, %N
380 br i1 %exitcond.us, label %for.latch, label %for.inner
381
382 for.latch:
383 %add7.us = add nuw nsw i32 %i, 1
384 %exitcond29.us = icmp eq i32 %add7.us, %N
385 br i1 %exitcond29.us, label %cleanup, label %for.outer
386
387 cleanup:
388 ret void
389 }
390
391
392 ; CHECK-LABEL: sub_sub_eq
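; The inner loop stores A[i] twice: no outer-loop-carried dependence, safe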
393 ; CHECK: %j = phi
394 ; CHECK: %j.1 = phi
395 define void @sub_sub_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
396 entry:
397 %cmp = icmp sgt i32 %N, 0
398 br i1 %cmp, label %for.outer, label %cleanup
399
400 for.outer:
401 %i = phi i32 [ %add7.us, %for.latch ], [ 0, %entry ]
402 br label %for.inner
403
404 for.inner:
405 %j = phi i32 [ %add6.us, %for.inner ], [ 0, %for.outer ]
406 %sum = phi i32 [ %add.us, %for.inner ], [ 0, %for.outer ]
407 %arrayidx5.us = getelementptr inbounds i32, i32* %B, i32 %j
408 %0 = load i32, i32* %arrayidx5.us, align 4
409 %mul.us = mul nsw i32 %0, %i
410 %add.us = add nsw i32 %mul.us, %sum
411 %add6.us = add nuw nsw i32 %j, 1
412 %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %i
413 store i32 1, i32* %arrayidx.us, align 4
414 %add7.us2 = add nuw nsw i32 %i, 0
415 %arrayidx8.us = getelementptr inbounds i32, i32* %A, i32 %add7.us2
416 store i32 %add.us, i32* %arrayidx8.us, align 4
417 %exitcond.us = icmp eq i32 %add6.us, %N
418 br i1 %exitcond.us, label %for.latch, label %for.inner
419
420 for.latch:
421 %add7.us = add nuw nsw i32 %i, 1
422 %exitcond29.us = icmp eq i32 %add7.us, %N
423 br i1 %exitcond29.us, label %cleanup, label %for.outer
424
425 cleanup:
426 ret void
427 }
428
429
430 ; CHECK-LABEL: sub_sub_more
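; The inner loop stores both A[i] and A[i+1]: unsafe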
431 ; CHECK: %j = phi
432 ; CHECK-NOT: %j.1 = phi
433 define void @sub_sub_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
434 entry:
435 %cmp = icmp sgt i32 %N, 0
436 br i1 %cmp, label %for.outer, label %cleanup
437
438 for.outer:
439 %i = phi i32 [ %add7.us, %for.latch ], [ 0, %entry ]
440 br label %for.inner
441
442 for.inner:
443 %j = phi i32 [ %add6.us, %for.inner ], [ 0, %for.outer ]
444 %sum = phi i32 [ %add.us, %for.inner ], [ 0, %for.outer ]
445 %arrayidx5.us = getelementptr inbounds i32, i32* %B, i32 %j
446 %0 = load i32, i32* %arrayidx5.us, align 4
447 %mul.us = mul nsw i32 %0, %i
448 %add.us = add nsw i32 %mul.us, %sum
449 %add6.us = add nuw nsw i32 %j, 1
450 %arrayidx.us = getelementptr inbounds i32, i32* %A, i32 %i
451 store i32 1, i32* %arrayidx.us, align 4
452 %add7.us2 = add nuw nsw i32 %i, 1
453 %arrayidx8.us = getelementptr inbounds i32, i32* %A, i32 %add7.us2
454 store i32 %add.us, i32* %arrayidx8.us, align 4
455 %exitcond.us = icmp eq i32 %add6.us, %N
456 br i1 %exitcond.us, label %for.latch, label %for.inner
457
458 for.latch:
459 %add7.us = add nuw nsw i32 %i, 1
460 %exitcond29.us = icmp eq i32 %add7.us, %N
461 br i1 %exitcond29.us, label %cleanup, label %for.outer
462
463 cleanup:
464 ret void
465 }
0 ; RUN: opt -loop-unroll-and-jam -unroll-and-jam-count=4 -pass-remarks=loop-unroll-and-jam < %s -S 2>&1 | FileCheck %s
1
2 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
3 target triple = "thumbv8m.main-arm-none-eabi"
4
5 ;; Common check for all tests. None should be unroll and jammed
6 ; CHECK-NOT: remark: {{.*}} unroll and jammed
7
8
9 ; CHECK-LABEL: disabled1
10 ; Tests for(i) { sum = A[i]; for(j) sum += B[j]; A[i+1] = sum; }
11 ; The A[i] to A[i+1] dependency should block unroll and jam
12 define void @disabled1(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
13 ; CHECK: %i.029 = phi i32 [ %add10, %for.latch ], [ 0, %for.preheader ]
14 ; CHECK: %j.026 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
15 entry:
16 %cmp = icmp ne i32 %J, 0
17 %cmp127 = icmp ne i32 %I, 0
18 %or.cond = and i1 %cmp127, %cmp
19 br i1 %or.cond, label %for.preheader, label %return
20
21 for.preheader:
22 br label %for.outer
23
24 for.outer:
25 %i.029 = phi i32 [ %add10, %for.latch ], [ 0, %for.preheader ]
26 %b.028 = phi i32 [ %inc8, %for.latch ], [ 1, %for.preheader ]
27 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.029
28 %0 = load i32, i32* %arrayidx, align 4, !tbaa !5
29 br label %for.inner
30
31 for.inner:
32 %j.026 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
33 %sum1.025 = phi i32 [ %0, %for.outer ], [ %add, %for.inner ]
34 %arrayidx6 = getelementptr inbounds i32, i32* %B, i32 %j.026
35 %1 = load i32, i32* %arrayidx6, align 4, !tbaa !5
36 %add = add i32 %1, %sum1.025
37 %inc = add nuw i32 %j.026, 1
38 %exitcond = icmp eq i32 %inc, %J
39 br i1 %exitcond, label %for.latch, label %for.inner
40
41 for.latch:
42 %arrayidx7 = getelementptr inbounds i32, i32* %A, i32 %b.028
43 store i32 %add, i32* %arrayidx7, align 4, !tbaa !5
44 %inc8 = add nuw nsw i32 %b.028, 1
45 %add10 = add nuw nsw i32 %i.029, 1
46 %exitcond30 = icmp eq i32 %add10, %I
47 br i1 %exitcond30, label %return, label %for.outer
48
49 return:
50 ret void
51 }
52
53
54 ; CHECK-LABEL: disabled2
55 ; Tests an incompatible block layout (for.outer jumps past for.inner)
56 ; FIXME: Make this work
57 define void @disabled2(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
58 ; CHECK: %i.032 = phi i32 [ %add13, %for.latch ], [ 0, %for.preheader ]
59 ; CHECK: %j.030 = phi i32 [ %inc, %for.inner ], [ 0, %for.inner.preheader ]
60 entry:
61 %cmp = icmp ne i32 %J, 0
62 %cmp131 = icmp ne i32 %I, 0
63 %or.cond = and i1 %cmp131, %cmp
64 br i1 %or.cond, label %for.preheader, label %for.end14
65
66 for.preheader:
67 br label %for.outer
68
69 for.outer:
70 %i.032 = phi i32 [ %add13, %for.latch ], [ 0, %for.preheader ]
71 %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.032
72 %0 = load i32, i32* %arrayidx, align 4, !tbaa !5
73 %tobool = icmp eq i32 %0, 0
74 br i1 %tobool, label %for.latch, label %for.inner
75
76 for.inner:
77 %j.030 = phi i32 [ %inc, %for.inner ], [ 0, %for.outer ]
78 %sum1.029 = phi i32 [ %sum1.1, %for.inner ], [ 0, %for.outer ]
79 %arrayidx6 = getelementptr inbounds i32, i32* %B, i32 %j.030
80 %1 = load i32, i32* %arrayidx6, align 4, !tbaa !5
81 %tobool7 = icmp eq i32 %1, 0
82 %sub = add i32 %sum1.029, 10
83 %add = sub i32 %sub, %1
84 %sum1.1 = select i1 %tobool7, i32 %sum1.029, i32 %add
85 %inc = add nuw i32 %j.030, 1
86 %exitcond = icmp eq i32 %inc, %J
87 br i1 %exitcond, label %for.latch, label %for.inner
88
89 for.latch:
90 %sum1.1.lcssa = phi i32 [ 0, %for.outer ], [ %sum1.1, %for.inner ]
91 %arrayidx11 = getelementptr inbounds i32, i32* %A, i32 %i.032
92 store i32 %sum1.1.lcssa, i32* %arrayidx11, align 4, !tbaa !5
93 %add13 = add nuw i32 %i.032, 1
94 %exitcond33 = icmp eq i32 %add13, %I
95 br i1 %exitcond33, label %for.end14, label %for.outer
96
97 for.end14:
98 ret void
99 }
100
101
102
103 ; CHECK-LABEL: disabled3
104 ; Tests loop-carried dependencies through an array S
105 define void @disabled3(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
106 ; CHECK: %i.029 = phi i32 [ 0, %for.preheader ], [ %add12, %for.latch ]
107 ; CHECK: %j.027 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
108 entry:
109 %S = alloca [4 x i32], align 4
110 %cmp = icmp eq i32 %J, 0
111 br i1 %cmp, label %return, label %if.end
112
113 if.end:
114 %0 = bitcast [4 x i32]* %S to i8*
115 %cmp128 = icmp eq i32 %I, 0
116 br i1 %cmp128, label %for.cond.cleanup, label %for.preheader
117
118 for.preheader:
119 %arrayidx9 = getelementptr inbounds [4 x i32], [4 x i32]* %S, i32 0, i32 0
120 br label %for.outer
121
122 for.cond.cleanup:
123 br label %return
124
125 for.outer:
126 %i.029 = phi i32 [ 0, %for.preheader ], [ %add12, %for.latch ]
127 br label %for.inner
128
129 for.inner:
130 %j.027 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
131 %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j.027
132 %l2 = load i32, i32* %arrayidx, align 4, !tbaa !5
133 %add = add i32 %j.027, %i.029
134 %rem = urem i32 %add, %J
135 %arrayidx6 = getelementptr inbounds i32, i32* %B, i32 %rem
136 %l3 = load i32, i32* %arrayidx6, align 4, !tbaa !5
137 %mul = mul i32 %l3, %l2
138 %rem7 = urem i32 %j.027, 3
139 %arrayidx8 = getelementptr inbounds [4 x i32], [4 x i32]* %S, i32 0, i32 %rem7
140 store i32 %mul, i32* %arrayidx8, align 4, !tbaa !5
141 %inc = add nuw i32 %j.027, 1
142 %exitcond = icmp eq i32 %inc, %J
143 br i1 %exitcond, label %for.latch, label %for.inner
144
145 for.latch:
146 %l1 = load i32, i32* %arrayidx9, align 4, !tbaa !5
147 %arrayidx10 = getelementptr inbounds i32, i32* %A, i32 %i.029
148 store i32 %l1, i32* %arrayidx10, align 4, !tbaa !5
149 %add12 = add nuw i32 %i.029, 1
150 %exitcond31 = icmp eq i32 %add12, %I
151 br i1 %exitcond31, label %for.cond.cleanup, label %for.outer
152
153 return:
154 ret void
155 }
156
157
158 ; CHECK-LABEL: disabled4
159 ; Inner loop induction variable is not consistent,
160 ; i.e. for(i = 0..n) for(j = 0..i) sum += B[j]
161 define void @disabled4(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
162 ; CHECK: %indvars.iv = phi i32 [ %indvars.iv.next, %for.latch ], [ 1, %for.preheader ]
163 ; CHECK: %j.021 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
164 entry:
165 %cmp = icmp ne i32 %J, 0
166 %cmp122 = icmp ugt i32 %I, 1
167 %or.cond = and i1 %cmp122, %cmp
168 br i1 %or.cond, label %for.preheader, label %for.end9
169
170 for.preheader:
171 br label %for.outer
172
173 for.outer:
174 %indvars.iv = phi i32 [ %indvars.iv.next, %for.latch ], [ 1, %for.preheader ]
175 br label %for.inner
176
177 for.inner:
178 %j.021 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
179 %sum1.020 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
180 %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j.021
181 %0 = load i32, i32* %arrayidx, align 4, !tbaa !5
182 %add = add i32 %0, %sum1.020
183 %inc = add nuw i32 %j.021, 1
184 %exitcond = icmp eq i32 %inc, %indvars.iv
185 br i1 %exitcond, label %for.latch, label %for.inner
186
187 for.latch:
188 %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
189 store i32 %add, i32* %arrayidx6, align 4, !tbaa !5
190 %indvars.iv.next = add nuw i32 %indvars.iv, 1
191 %exitcond24 = icmp eq i32 %indvars.iv.next, %I
192 br i1 %exitcond24, label %for.end9, label %for.outer
193
194 for.end9:
195 ret void
196 }
197
198
199 ; CHECK-LABEL: disabled5
200 ; Test odd uses of phi nodes where the outer IV cannot be moved into Fore as it hits a PHI
201 @f = hidden global i32 0, align 4
202 define i32 @disabled5() #0 {
203 ; CHECK: %0 = phi i32 [ %f.promoted10, %entry ], [ 2, %for.latch ]
204 ; CHECK: %1 = phi i32 [ %0, %for.outer ], [ 2, %for.inner ]
205 entry:
206 %f.promoted10 = load i32, i32* @f, align 4, !tbaa !5
207 br label %for.outer
208
209 for.outer:
210 %0 = phi i32 [ %f.promoted10, %entry ], [ 2, %for.latch ]
211 %d.018 = phi i16 [ 0, %entry ], [ %odd.lcssa, %for.latch ]
212 %inc5.sink9 = phi i32 [ 2, %entry ], [ %inc5, %for.latch ]
213 br label %for.inner
214
215 for.inner:
216 %1 = phi i32 [ %0, %for.outer ], [ 2, %for.inner ]
217 %inc.sink8 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
218 %inc = add nuw nsw i32 %inc.sink8, 1
219 %exitcond = icmp ne i32 %inc, 7
220 br i1 %exitcond, label %for.inner, label %for.latch
221
222 for.latch:
223 %.lcssa = phi i32 [ %1, %for.inner ]
224 %odd.lcssa = phi i16 [ 1, %for.inner ]
225 %inc5 = add nuw nsw i32 %inc5.sink9, 1
226 %exitcond11 = icmp ne i32 %inc5, 7
227 br i1 %exitcond11, label %for.outer, label %for.end
228
229 for.end:
230 %.lcssa.lcssa = phi i32 [ %.lcssa, %for.latch ]
231 %inc.lcssa.lcssa = phi i32 [ 7, %for.latch ]
232 ret i32 0
233 }
234
235
236 ; CHECK-LABEL: disabled6
237 ; There is a dependency here between @d6 and %0 (the pointer loaded from @f6)
238 @d6 = hidden global i16 5, align 2
239 @f6 = hidden global i16* @d6, align 4
240 define i32 @disabled6() #0 {
241 ; CHECK: %inc8.sink14.i = phi i16 [ 1, %entry ], [ %inc8.i, %for.cond.cleanup.i ]
242 ; CHECK: %c.013.i = phi i32 [ 0, %for.body.i ], [ %inc.i, %for.body6.i ]
243 entry:
244 store i16 1, i16* @d6, align 2
245 %0 = load i16*, i16** @f6, align 4
246 br label %for.body.i
247
248 for.body.i:
249 %inc8.sink14.i = phi i16 [ 1, %entry ], [ %inc8.i, %for.cond.cleanup.i ]
250 %1 = load i16, i16* %0, align 2
251 br label %for.body6.i
252
253 for.cond.cleanup.i:
254 %inc8.i = add nuw nsw i16 %inc8.sink14.i, 1
255 store i16 %inc8.i, i16* @d6, align 2
256 %cmp.i = icmp ult i16 %inc8.i, 6
257 br i1 %cmp.i, label %for.body.i, label %test.exit
258
259 for.body6.i:
260 %c.013.i = phi i32 [ 0, %for.body.i ], [ %inc.i, %for.body6.i ]
261 %inc.i = add nuw nsw i32 %c.013.i, 1
262 %exitcond.i = icmp eq i32 %inc.i, 7
263 br i1 %exitcond.i, label %for.cond.cleanup.i, label %for.body6.i
264
265 test.exit:
266 %conv2.i = sext i16 %1 to i32
267 ret i32 0
268 }
269
270
271
272 ; CHECK-LABEL: disabled7
273 ; Has negative output dependency
274 define void @disabled7(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
275 ; CHECK: %i.028 = phi i32 [ %add11, %for.cond3.for.cond.cleanup5_crit_edge ], [ 0, %for.body.preheader ]
276 ; CHECK: %j.026 = phi i32 [ 0, %for.body ], [ %add9, %for.body6 ]
277 entry:
278 %cmp = icmp ne i32 %J, 0
279 %cmp127 = icmp ne i32 %I, 0
280 %or.cond = and i1 %cmp127, %cmp
281 br i1 %or.cond, label %for.body.preheader, label %for.end12
282
283 for.body.preheader:
284 br label %for.body
285
286 for.body:
287 %i.028 = phi i32 [ %add11, %for.cond3.for.cond.cleanup5_crit_edge ], [ 0, %for.body.preheader ]
288 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.028
289 store i32 0, i32* %arrayidx, align 4, !tbaa !5
290 %sub = add i32 %i.028, -1
291 %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %sub
292 store i32 2, i32* %arrayidx2, align 4, !tbaa !5
293 br label %for.body6
294
295 for.cond3.for.cond.cleanup5_crit_edge:
296 store i32 %add, i32* %arrayidx, align 4, !tbaa !5
297 %add11 = add nuw i32 %i.028, 1
298 %exitcond29 = icmp eq i32 %add11, %I
299 br i1 %exitcond29, label %for.end12, label %for.body
300
301 for.body6:
302 %0 = phi i32 [ 0, %for.body ], [ %add, %for.body6 ]
303 %j.026 = phi i32 [ 0, %for.body ], [ %add9, %for.body6 ]
304 %arrayidx7 = getelementptr inbounds i32, i32* %B, i32 %j.026
305 %1 = load i32, i32* %arrayidx7, align 4, !tbaa !5
306 %add = add i32 %1, %0
307 %add9 = add nuw i32 %j.026, 1
308 %exitcond = icmp eq i32 %add9, %J
309 br i1 %exitcond, label %for.cond3.for.cond.cleanup5_crit_edge, label %for.body6
310
311 for.end12:
312 ret void
313 }
314
315
316 ; CHECK-LABEL: disabled8
317 ; Same as above with an extra outer loop nest
318 define void @disabled8(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
319 ; CHECK: %i.036 = phi i32 [ %add15, %for.latch ], [ 0, %for.body ]
320 ; CHECK: %j.034 = phi i32 [ 0, %for.outer ], [ %add13, %for.inner ]
321 entry:
322 %cmp = icmp eq i32 %J, 0
323 %cmp335 = icmp eq i32 %I, 0
324 %or.cond = or i1 %cmp, %cmp335
325 br i1 %or.cond, label %for.end18, label %for.body.preheader
326
327 for.body.preheader:
328 br label %for.body
329
330 for.body:
331 %x.037 = phi i32 [ %inc, %for.cond.cleanup4 ], [ 0, %for.body.preheader ]
332 br label %for.outer
333
334 for.cond.cleanup4:
335 %inc = add nuw nsw i32 %x.037, 1
336 %exitcond40 = icmp eq i32 %inc, 5
337 br i1 %exitcond40, label %for.end18, label %for.body
338
339 for.outer:
340 %i.036 = phi i32 [ %add15, %for.latch ], [ 0, %for.body ]
341 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.036
342 store i32 0, i32* %arrayidx, align 4, !tbaa !5
343 %sub = add i32 %i.036, -1
344 %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %sub
345 store i32 2, i32* %arrayidx6, align 4, !tbaa !5
346 br label %for.inner
347
348 for.latch:
349 store i32 %add, i32* %arrayidx, align 4, !tbaa !5
350 %add15 = add nuw i32 %i.036, 1
351 %exitcond38 = icmp eq i32 %add15, %I
352 br i1 %exitcond38, label %for.cond.cleanup4, label %for.outer
353
354 for.inner:
355 %0 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
356 %j.034 = phi i32 [ 0, %for.outer ], [ %add13, %for.inner ]
357 %arrayidx11 = getelementptr inbounds i32, i32* %B, i32 %j.034
358 %1 = load i32, i32* %arrayidx11, align 4, !tbaa !5
359 %add = add i32 %1, %0
360 %add13 = add nuw i32 %j.034, 1
361 %exitcond = icmp eq i32 %add13, %J
362 br i1 %exitcond, label %for.latch, label %for.inner
363
364 for.end18:
365 ret void
366 }
367
368
369 ; CHECK-LABEL: disabled9
370 ; Can't prove that A and B do not alias (no noalias attributes)
371 define void @disabled9(i32 %I, i32 %J, i32* nocapture %A, i32* nocapture readonly %B) #0 {
372 ; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
373 ; CHECK: %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
374 entry:
375 %cmp = icmp ne i32 %J, 0
376 %cmp122 = icmp ne i32 %I, 0
377 %or.cond = and i1 %cmp, %cmp122
378 br i1 %or.cond, label %for.outer.preheader, label %for.end
379
380 for.outer.preheader:
381 br label %for.outer
382
383 for.outer:
384 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
385 br label %for.inner
386
387 for.inner:
388 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
389 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
390 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
391 %0 = load i32, i32* %arrayidx.us, align 4, !tbaa !5
392 %add.us = add i32 %0, %sum1.us
393 %inc.us = add nuw i32 %j.us, 1
394 %exitcond = icmp eq i32 %inc.us, %J
395 br i1 %exitcond, label %for.latch, label %for.inner
396
397 for.latch:
398 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
399 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
400 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4, !tbaa !5
401 %add8.us = add nuw i32 %i.us, 1
402 %exitcond25 = icmp eq i32 %add8.us, %I
403 br i1 %exitcond25, label %for.end.loopexit, label %for.outer
404
405 for.end.loopexit:
406 br label %for.end
407
408 for.end:
409 ret void
410 }
411
412
413 ; CHECK-LABEL: disable10
414 ; Simple call in the inner loop
415 declare void @f10(i32, i32) #0
416 define void @disable10(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
417 ; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
418 ; CHECK: %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
419 entry:
420 %cmp = icmp ne i32 %J, 0
421 %cmp122 = icmp ne i32 %I, 0
422 %or.cond = and i1 %cmp, %cmp122
423 br i1 %or.cond, label %for.outer.preheader, label %for.end
424
425 for.outer.preheader:
426 br label %for.outer
427
428 for.outer:
429 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
430 br label %for.inner
431
432 for.inner:
433 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
434 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
435 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
436 %0 = load i32, i32* %arrayidx.us, align 4, !tbaa !5
437 %add.us = add i32 %0, %sum1.us
438 %inc.us = add nuw i32 %j.us, 1
439 %exitcond = icmp eq i32 %inc.us, %J
440 tail call void @f10(i32 %i.us, i32 %j.us) nounwind
441 br i1 %exitcond, label %for.latch, label %for.inner
442
443 for.latch:
444 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
445 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
446 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4, !tbaa !5
447 %add8.us = add nuw i32 %i.us, 1
448 %exitcond25 = icmp eq i32 %add8.us, %I
449 br i1 %exitcond25, label %for.end.loopexit, label %for.outer
450
451 for.end.loopexit:
452 br label %for.end
453
454 for.end:
455 ret void
456 }
457
458
459 ; CHECK-LABEL: disable11
460 ; Volatile load in the inner loop
461 define void @disable11(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
462 ; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
463 ; CHECK: %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
464 entry:
465 %cmp = icmp ne i32 %J, 0
466 %cmp122 = icmp ne i32 %I, 0
467 %or.cond = and i1 %cmp, %cmp122
468 br i1 %or.cond, label %for.outer.preheader, label %for.end
469
470 for.outer.preheader:
471 br label %for.outer
472
473 for.outer:
474 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
475 br label %for.inner
476
477 for.inner:
478 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
479 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
480 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
481 %0 = load volatile i32, i32* %arrayidx.us, align 4, !tbaa !5
482 %add.us = add i32 %0, %sum1.us
483 %inc.us = add nuw i32 %j.us, 1
484 %exitcond = icmp eq i32 %inc.us, %J
485 br i1 %exitcond, label %for.latch, label %for.inner
486
487 for.latch:
488 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
489 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
490 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4, !tbaa !5
491 %add8.us = add nuw i32 %i.us, 1
492 %exitcond25 = icmp eq i32 %add8.us, %I
493 br i1 %exitcond25, label %for.end.loopexit, label %for.outer
494
495 for.end.loopexit:
496 br label %for.end
497
498 for.end:
499 ret void
500 }
501
502
503 ; CHECK-LABEL: disable12
504 ; Multiple aft blocks
505 define void @disable12(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
506 ; CHECK: %i.us = phi i32 [ %add8.us, %for.latch3 ], [ 0, %for.outer.preheader ]
507 ; CHECK: %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
508 entry:
509 %cmp = icmp ne i32 %J, 0
510 %cmp122 = icmp ne i32 %I, 0
511 %or.cond = and i1 %cmp, %cmp122
512 br i1 %or.cond, label %for.outer.preheader, label %for.end
513
514 for.outer.preheader:
515 br label %for.outer
516
517 for.outer:
518 %i.us = phi i32 [ %add8.us, %for.latch3 ], [ 0, %for.outer.preheader ]
519 br label %for.inner
520
521 for.inner:
522 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
523 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
524 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
525 %0 = load i32, i32* %arrayidx.us, align 4, !tbaa !5
526 %add.us = add i32 %0, %sum1.us
527 %inc.us = add nuw i32 %j.us, 1
528 %exitcond = icmp eq i32 %inc.us, %J
529 br i1 %exitcond, label %for.latch, label %for.inner
530
531 for.latch:
532 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
533 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
534 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4, !tbaa !5
535 %cmpl = icmp eq i32 %add.us.lcssa, 10
536 br i1 %cmpl, label %for.latch2, label %for.latch3
537
538 for.latch2:
539 br label %for.latch3
540
541 for.latch3:
542 %add8.us = add nuw i32 %i.us, 1
543 %exitcond25 = icmp eq i32 %add8.us, %I
544 br i1 %exitcond25, label %for.end.loopexit, label %for.outer
545
546 for.end.loopexit:
547 br label %for.end
548
549 for.end:
550 ret void
551 }
552
553
554 ; CHECK-LABEL: disable13
555 ; Two subloops
556 define void @disable13(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
557 ; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
558 ; CHECK: %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
559 ; CHECK: %j.us2 = phi i32 [ %inc.us2, %for.inner2 ], [ 0, %for.inner2.preheader ]
560 entry:
561 %cmp = icmp ne i32 %J, 0
562 %cmp122 = icmp ne i32 %I, 0
563 %or.cond = and i1 %cmp, %cmp122
564 br i1 %or.cond, label %for.outer.preheader, label %for.end
565
566 for.outer.preheader:
567 br label %for.outer
568
569 for.outer:
570 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
571 br label %for.inner
572
573 for.inner:
574 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
575 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
576 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
577 %0 = load i32, i32* %arrayidx.us, align 4, !tbaa !5
578 %add.us = add i32 %0, %sum1.us
579 %inc.us = add nuw i32 %j.us, 1
580 %exitcond = icmp eq i32 %inc.us, %J
581 br i1 %exitcond, label %for.inner2, label %for.inner
582
583 for.inner2:
584 %j.us2 = phi i32 [ 0, %for.inner ], [ %inc.us2, %for.inner2 ]
585 %sum1.us2 = phi i32 [ 0, %for.inner ], [ %add.us2, %for.inner2 ]
586 %arrayidx.us2 = getelementptr inbounds i32, i32* %B, i32 %j.us2
587 %l0 = load i32, i32* %arrayidx.us2, align 4, !tbaa !5
588 %add.us2 = add i32 %l0, %sum1.us2
589 %inc.us2 = add nuw i32 %j.us2, 1
590 %exitcond2 = icmp eq i32 %inc.us2, %J
591 br i1 %exitcond2, label %for.latch, label %for.inner2
592
593 for.latch:
594 %add.us.lcssa = phi i32 [ %add.us, %for.inner2 ]
595 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
596 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4, !tbaa !5
597 %add8.us = add nuw i32 %i.us, 1
598 %exitcond25 = icmp eq i32 %add8.us, %I
599 br i1 %exitcond25, label %for.end.loopexit, label %for.outer
600
601 for.end.loopexit:
602 br label %for.end
603
604 for.end:
605 ret void
606 }
607
608
609 ; CHECK-LABEL: disable14
610 ; Multiple exiting blocks
611 define void @disable14(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
612 ; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
613 ; CHECK: %j.us = phi i32 [ %inc.us, %for.inner ], [ 0, %for.inner.preheader ]
614 entry:
615 %cmp = icmp ne i32 %J, 0
616 %cmp122 = icmp ne i32 %I, 0
617 %or.cond = and i1 %cmp, %cmp122
618 br i1 %or.cond, label %for.outer.preheader, label %for.end
619
620 for.outer.preheader:
621 br label %for.outer
622
623 for.outer:
624 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
625 %add8.us = add nuw i32 %i.us, 1
626 %exitcond23 = icmp eq i32 %add8.us, %I
627 br i1 %exitcond23, label %for.end.loopexit, label %for.inner
628
629 for.inner:
630 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
631 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
632 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
633 %0 = load i32, i32* %arrayidx.us, align 4, !tbaa !5
634 %add.us = add i32 %0, %sum1.us
635 %inc.us = add nuw i32 %j.us, 1
636 %exitcond = icmp eq i32 %inc.us, %J
637 br i1 %exitcond, label %for.latch, label %for.inner
638
639 for.latch:
640 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
641 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
642 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4, !tbaa !5
643 %exitcond25 = icmp eq i32 %add8.us, %I
644 br i1 %exitcond25, label %for.end.loopexit, label %for.outer
645
646 for.end.loopexit:
647 br label %for.end
648
649 for.end:
650 ret void
651 }
652
653
654 ; CHECK-LABEL: disable15
655 ; Latch != exit
656 define void @disable15(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
657 ; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
658 ; CHECK: %j.us = phi i32 [ %inc.us, %for.inner ], [ 0, %for.inner.preheader ]
659 entry:
660 %cmp = icmp ne i32 %J, 0
661 %cmp122 = icmp ne i32 %I, 0
662 %or.cond = and i1 %cmp, %cmp122
663 br i1 %or.cond, label %for.outer.preheader, label %for.end
664
665 for.outer.preheader:
666 br label %for.outer
667
668 for.outer:
669 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
670 %add8.us = add nuw i32 %i.us, 1
671 %exitcond25 = icmp eq i32 %add8.us, %I
672 br i1 %exitcond25, label %for.end.loopexit, label %for.inner
673
674 for.inner:
675 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
676 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
677 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
678 %0 = load i32, i32* %arrayidx.us, align 4, !tbaa !5
679 %add.us = add i32 %0, %sum1.us
680 %inc.us = add nuw i32 %j.us, 1
681 %exitcond = icmp eq i32 %inc.us, %J
682 br i1 %exitcond, label %for.latch, label %for.inner
683
684 for.latch:
685 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
686 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
687 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4, !tbaa !5
688 br label %for.outer
689
690 for.end.loopexit:
691 br label %for.end
692
693 for.end:
694 ret void
695 }
696
697
698 ; CHECK-LABEL: disable16
699 ; Outer phi (%otherphi) fed by a load in the aft block, which cannot be moved before the subloop
700 define void @disable16(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
701 ; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
702 ; CHECK: %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
703 entry:
704 %cmp = icmp ne i32 %J, 0
705 %cmp122 = icmp ne i32 %I, 0
706 %or.cond = and i1 %cmp, %cmp122
707 br i1 %or.cond, label %for.outer.preheader, label %for.end
708
709 for.outer.preheader:
710 br label %for.outer
711
712 for.outer:
713 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
714 %otherphi = phi i32 [ %other, %for.latch ], [ 0, %for.outer.preheader ]
715 br label %for.inner
716
717 for.inner:
718 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
719 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
720 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
721 %0 = load i32, i32* %arrayidx.us, align 4, !tbaa !5
722 %add.us = add i32 %0, %sum1.us
723 %inc.us = add nuw i32 %j.us, 1
724 %exitcond = icmp eq i32 %inc.us, %J
725 br i1 %exitcond, label %for.latch, label %for.inner
726
727 for.latch:
728 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
729 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
730 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4, !tbaa !5
731 %add8.us = add nuw i32 %i.us, 1
732 %exitcond25 = icmp eq i32 %add8.us, %I
733 %loadarr = getelementptr inbounds i32, i32* %A, i32 %i.us
734 %load = load i32, i32* %arrayidx6.us, align 4, !tbaa !5
735 %other = add i32 %otherphi, %load
736 br i1 %exitcond25, label %for.end.loopexit, label %for.outer
737
738 for.end.loopexit:
739 br label %for.end
740
741 for.end:
742 ret void
743 }
744
745
746 attributes #0 = { "target-cpu"="cortex-m33" }
747
748 !5 = !{!6, !6, i64 0}
749 !6 = !{!"omnipotent char", !7, i64 0}
750 !7 = !{!"Simple C/C++ TBAA"}
0 ; RUN: opt -basicaa -tbaa -loop-unroll-and-jam < %s -S | FileCheck %s
1 ; RUN: opt -basicaa -tbaa -loop-unroll-and-jam -unroll-and-jam-threshold=15 < %s -S | FileCheck %s --check-prefix=CHECK-LOWTHRES
2
3 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
4 target triple = "thumbv8m.main-arm-none-eabi"
5
6 ; CHECK-LABEL: test1
7 ; Basic check that these loops are unroll and jammed by default (but not with a low threshold)
8 define void @test1(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
9 ; CHECK: %i.us = phi i32 [ %add8.us.3, %for.latch ], [ 0, %for.outer.preheader.new ]
10 ; CHECK-LOWTHRES: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
11 entry:
12 %cmp = icmp ne i32 %J, 0
13 %cmp122 = icmp ne i32 %I, 0
14 %or.cond = and i1 %cmp, %cmp122
15 br i1 %or.cond, label %for.outer.preheader, label %for.end
16
17 for.outer.preheader:
18 br label %for.outer
19
20 for.outer:
21 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
22 br label %for.inner
23
24 for.inner:
25 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
26 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
27 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
28 %0 = load i32, i32* %arrayidx.us, align 4
29 %add.us = add i32 %0, %sum1.us
30 %inc.us = add nuw i32 %j.us, 1
31 %exitcond = icmp eq i32 %inc.us, %J
32 br i1 %exitcond, label %for.latch, label %for.inner
33
34 for.latch:
35 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
36 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
37 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
38 %add8.us = add nuw i32 %i.us, 1
39 %exitcond25 = icmp eq i32 %add8.us, %I
40 br i1 %exitcond25, label %for.end.loopexit, label %for.outer
41
42 for.end.loopexit:
43 br label %for.end
44
45 for.end:
46 ret void
47 }
48
49 ; CHECK-LABEL: nounroll_and_jam
50 ; #pragma nounroll_and_jam
51 define void @nounroll_and_jam(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
52 ; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
53 entry:
54 %cmp = icmp ne i32 %J, 0
55 %cmp122 = icmp ne i32 %I, 0
56 %or.cond = and i1 %cmp, %cmp122
57 br i1 %or.cond, label %for.outer.preheader, label %for.end
58
59 for.outer.preheader:
60 br label %for.outer
61
62 for.outer:
63 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
64 br label %for.inner
65
66 for.inner:
67 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
68 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
69 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
70 %0 = load i32, i32* %arrayidx.us, align 4
71 %add.us = add i32 %0, %sum1.us
72 %inc.us = add nuw i32 %j.us, 1
73 %exitcond = icmp eq i32 %inc.us, %J
74 br i1 %exitcond, label %for.latch, label %for.inner
75
76 for.latch:
77 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
78 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
79 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
80 %add8.us = add nuw i32 %i.us, 1
81 %exitcond25 = icmp eq i32 %add8.us, %I
82 br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !1
83
84 for.end.loopexit:
85 br label %for.end
86
87 for.end:
88 ret void
89 }
90
91 ; CHECK-LABEL: unroll_and_jam_count
92 ; #pragma unroll_and_jam(8)
93 define void @unroll_and_jam_count(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
94 ; CHECK: %i.us = phi i32 [ %add8.us.7, %for.latch ], [ 0, %for.outer.preheader.new ]
95 entry:
96 %cmp = icmp ne i32 %J, 0
97 %cmp122 = icmp ne i32 %I, 0
98 %or.cond = and i1 %cmp, %cmp122
99 br i1 %or.cond, label %for.outer.preheader, label %for.end
100
101 for.outer.preheader:
102 br label %for.outer
103
104 for.outer:
105 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
106 br label %for.inner
107
108 for.inner:
109 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
110 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
111 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
112 %0 = load i32, i32* %arrayidx.us, align 4
113 %add.us = add i32 %0, %sum1.us
114 %inc.us = add nuw i32 %j.us, 1
115 %exitcond = icmp eq i32 %inc.us, %J
116 br i1 %exitcond, label %for.latch, label %for.inner
117
118 for.latch:
119 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
120 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
121 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
122 %add8.us = add nuw i32 %i.us, 1
123 %exitcond25 = icmp eq i32 %add8.us, %I
124 br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !3
125
126 for.end.loopexit:
127 br label %for.end
128
129 for.end:
130 ret void
131 }
132
133 ; CHECK-LABEL: unroll_and_jam
134 ; #pragma unroll_and_jam
135 define void @unroll_and_jam(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
136 ; CHECK: %i.us = phi i32 [ %add8.us.3, %for.latch ], [ 0, %for.outer.preheader.new ]
137 ; CHECK-LOWTHRES: %i.us = phi i32 [ %add8.us.3, %for.latch ], [ 0, %for.outer.preheader.new ]
138 entry:
139 %cmp = icmp ne i32 %J, 0
140 %cmp122 = icmp ne i32 %I, 0
141 %or.cond = and i1 %cmp, %cmp122
142 br i1 %or.cond, label %for.outer.preheader, label %for.end
143
144 for.outer.preheader:
145 br label %for.outer
146
147 for.outer:
148 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
149 br label %for.inner
150
151 for.inner:
152 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
153 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
154 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
155 %0 = load i32, i32* %arrayidx.us, align 4
156 %add.us = add i32 %0, %sum1.us
157 %inc.us = add nuw i32 %j.us, 1
158 %exitcond = icmp eq i32 %inc.us, %J
159 br i1 %exitcond, label %for.latch, label %for.inner
160
161 for.latch:
162 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
163 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
164 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
165 %add8.us = add nuw i32 %i.us, 1
166 %exitcond25 = icmp eq i32 %add8.us, %I
167 br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !5
168
169 for.end.loopexit:
170 br label %for.end
171
172 for.end:
173 ret void
174 }
175
176 ; CHECK-LABEL: nounroll
177 ; #pragma nounroll (which we take to mean disable unroll and jam too)
178 define void @nounroll(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
179 ; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
180 entry:
181 %cmp = icmp ne i32 %J, 0
182 %cmp122 = icmp ne i32 %I, 0
183 %or.cond = and i1 %cmp, %cmp122
184 br i1 %or.cond, label %for.outer.preheader, label %for.end
185
186 for.outer.preheader:
187 br label %for.outer
188
189 for.outer:
190 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
191 br label %for.inner
192
193 for.inner:
194 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
195 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
196 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
197 %0 = load i32, i32* %arrayidx.us, align 4
198 %add.us = add i32 %0, %sum1.us
199 %inc.us = add nuw i32 %j.us, 1
200 %exitcond = icmp eq i32 %inc.us, %J
201 br i1 %exitcond, label %for.latch, label %for.inner
202
203 for.latch:
204 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
205 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
206 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
207 %add8.us = add nuw i32 %i.us, 1
208 %exitcond25 = icmp eq i32 %add8.us, %I
209 br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !7
210
211 for.end.loopexit:
212 br label %for.end
213
214 for.end:
215 ret void
216 }
217
218 ; CHECK-LABEL: unroll
219 ; #pragma unroll (which we take to mean disable unroll and jam)
220 define void @unroll(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
221 ; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
222 entry:
223 %cmp = icmp ne i32 %J, 0
224 %cmp122 = icmp ne i32 %I, 0
225 %or.cond = and i1 %cmp, %cmp122
226 br i1 %or.cond, label %for.outer.preheader, label %for.end
227
228 for.outer.preheader:
229 br label %for.outer
230
231 for.outer:
232 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
233 br label %for.inner
234
235 for.inner:
236 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
237 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
238 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
239 %0 = load i32, i32* %arrayidx.us, align 4
240 %add.us = add i32 %0, %sum1.us
241 %inc.us = add nuw i32 %j.us, 1
242 %exitcond = icmp eq i32 %inc.us, %J
243 br i1 %exitcond, label %for.latch, label %for.inner
244
245 for.latch:
246 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
247 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
248 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
249 %add8.us = add nuw i32 %i.us, 1
250 %exitcond25 = icmp eq i32 %add8.us, %I
251 br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !9
252
253 for.end.loopexit:
254 br label %for.end
255
256 for.end:
257 ret void
258 }
259
260 ; CHECK-LABEL: nounroll_plus_unroll_and_jam
261 ; #pragma clang loop nounroll, unroll_and_jam (which we take to mean, do unroll_and_jam)
262 define void @nounroll_plus_unroll_and_jam(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
263 ; CHECK: %i.us = phi i32 [ %add8.us.3, %for.latch ], [ 0, %for.outer.preheader.new ]
264 entry:
265 %cmp = icmp ne i32 %J, 0
266 %cmp122 = icmp ne i32 %I, 0
267 %or.cond = and i1 %cmp, %cmp122
268 br i1 %or.cond, label %for.outer.preheader, label %for.end
269
270 for.outer.preheader:
271 br label %for.outer
272
273 for.outer:
274 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
275 br label %for.inner
276
277 for.inner:
278 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
279 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
280 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
281 %0 = load i32, i32* %arrayidx.us, align 4
282 %add.us = add i32 %0, %sum1.us
283 %inc.us = add nuw i32 %j.us, 1
284 %exitcond = icmp eq i32 %inc.us, %J
285 br i1 %exitcond, label %for.latch, label %for.inner
286
287 for.latch:
288 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
289 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
290 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
291 %add8.us = add nuw i32 %i.us, 1
292 %exitcond25 = icmp eq i32 %add8.us, %I
293 br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !11
294
295 for.end.loopexit:
296 br label %for.end
297
298 for.end:
299 ret void
300 }
301
302 !1 = distinct !{!1, !2}
303 !2 = distinct !{!"llvm.loop.unroll_and_jam.disable"}
304 !3 = distinct !{!3, !4}
305 !4 = distinct !{!"llvm.loop.unroll_and_jam.count", i32 8}
306 !5 = distinct !{!5, !6}
307 !6 = distinct !{!"llvm.loop.unroll_and_jam.enable"}
308 !7 = distinct !{!7, !8}
309 !8 = distinct !{!"llvm.loop.unroll.disable"}
310 !9 = distinct !{!9, !10}
311 !10 = distinct !{!"llvm.loop.unroll.enable"}
312 !11 = distinct !{!11, !8, !6}
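For reference, the metadata above is what the pragma comments in these tests stand for: !1/!2 disable unroll and jam, !3/!4 request a jam count of 8, !5/!6 enable it, !7/!8 disable ordinary unrolling, !9/!10 enable it, and !11 combines unroll.disable with unroll_and_jam.enable. A minimal C sketch of the source shape being modelled (the function name is hypothetical, and the pragma is shown only as a comment, since the exact spelling clang accepts is not part of this patch):

    /* Hypothetical source for the "enable" case; compiling with the
       corresponding pragma would attach
       !{!"llvm.loop.unroll_and_jam.enable"} to the outer loop. */
    void sum_rows(int I, int J, int *A, const int *B) {
      /* #pragma clang loop unroll_and_jam(enable) */
      for (int i = 0; i < I; ++i) {
        int sum = 0;
        for (int j = 0; j < J; ++j)
          sum += B[j]; /* invariant in the outer loop: jammed copies can share it */
        A[i] = sum;
      }
    }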
0 ; RUN: opt -loop-unroll-and-jam -pass-remarks=loop-unroll < %s -S 2>&1 | FileCheck %s
1
2 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
3 target triple = "thumbv8m.main-arm-none-eabi"
4
5 ;; Common check for all tests. None of the loops below should be unroll and jammed, as the profitability heuristics reject each of them
6 ; CHECK-NOT: remark: {{.*}} unroll and jammed
7
8
9 ; CHECK-LABEL: unprof1
10 ; Multiple inner loop blocks (see the C sketch after this function)
11 define void @unprof1(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
12 ; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
13 ; CHECK: %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner2 ]
14 entry:
15 %cmp = icmp ne i32 %J, 0
16 %cmp122 = icmp ne i32 %I, 0
17 %or.cond = and i1 %cmp, %cmp122
18 br i1 %or.cond, label %for.outer.preheader, label %for.end
19
20 for.outer.preheader:
21 br label %for.outer
22
23 for.outer:
24 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
25 br label %for.inner
26
27 for.inner:
28 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner2 ]
29 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner2 ]
30 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
31 %0 = load i32, i32* %arrayidx.us, align 4, !tbaa !5
32 %add.us = add i32 %0, %sum1.us
33 br label %for.inner2
34
35 for.inner2:
36 %inc.us = add nuw i32 %j.us, 1
37 %exitcond = icmp eq i32 %inc.us, %J
38 br i1 %exitcond, label %for.latch, label %for.inner
39
40 for.latch:
41 %add.us.lcssa = phi i32 [ %add.us, %for.inner2 ]
42 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
43 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4, !tbaa !5
44 %add8.us = add nuw i32 %i.us, 1
45 %exitcond25 = icmp eq i32 %add8.us, %I
46 br i1 %exitcond25, label %for.end.loopexit, label %for.outer
47
48 for.end.loopexit:
49 br label %for.end
50
51 for.end:
52 ret void
53 }
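At the source level, unprof1 corresponds to an inner loop whose body is split across two basic blocks. A hedged C sketch of one common way that happens (the conditional is illustrative; the IR above gets its second block from an unconditional branch):

    void unprof1_sketch(int I, int J, int *A, const int *B) {
      for (int i = 0; i < I; ++i) {
        int sum = 0;
        for (int j = 0; j < J; ++j) {
          /* control flow inside the inner body means multiple blocks,
             which the profitability check rejects */
          if (B[j] > 0)
            sum += B[j];
        }
        A[i] = sum;
      }
    }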
54
55
56 ; CHECK-LABEL: unprof2
57 ; Constant inner loop count (see the sketch after this function)
58 define void @unprof2(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
59 ; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
60 ; CHECK: %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
61 entry:
62 %cmp = icmp ne i32 %J, 0
63 %cmp122 = icmp ne i32 %I, 0
64 %or.cond = and i1 %cmp, %cmp122
65 br i1 %or.cond, label %for.outer.preheader, label %for.end
66
67 for.outer.preheader:
68 br label %for.outer
69
70 for.outer:
71 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
72 br label %for.inner
73
74 for.inner:
75 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
76 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
77 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
78 %0 = load i32, i32* %arrayidx.us, align 4, !tbaa !5
79 %add.us = add i32 %0, %sum1.us
80 %inc.us = add nuw i32 %j.us, 1
81 %exitcond = icmp eq i32 %inc.us, 10
82 br i1 %exitcond, label %for.latch, label %for.inner
83
84 for.latch:
85 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
86 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
87 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4, !tbaa !5
88 %add8.us = add nuw i32 %i.us, 1
89 %exitcond25 = icmp eq i32 %add8.us, %I
90 br i1 %exitcond25, label %for.end.loopexit, label %for.outer
91
92 for.end.loopexit:
93 br label %for.end
94
95 for.end:
96 ret void
97 }
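unprof2 is the same reduction with a constant inner trip count of 10; with a small known count, fully unrolling the inner loop is the more natural transform, so unroll and jam is judged unprofitable. Roughly (a sketch, not from the patch):

    void unprof2_sketch(int I, int *A, const int *B) {
      for (int i = 0; i < I; ++i) {
        int sum = 0;
        for (int j = 0; j < 10; ++j) /* constant trip count */
          sum += B[j];
        A[i] = sum;
      }
    }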
98
99
100 ; CHECK-LABEL: unprof3
101 ; Complex inner loop (see the sketch after this function)
102 define void @unprof3(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
103 ; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
104 ; CHECK: %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
105 entry:
106 %cmp = icmp ne i32 %J, 0
107 %cmp122 = icmp ne i32 %I, 0
108 %or.cond = and i1 %cmp, %cmp122
109 br i1 %or.cond, label %for.outer.preheader, label %for.end
110
111 for.outer.preheader:
112 br label %for.outer
113
114 for.outer:
115 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
116 br label %for.inner
117
118 for.inner:
119 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
120 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
121 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
122 %0 = load i32, i32* %arrayidx.us, align 4, !tbaa !5
123 %add.us = add i32 %0, %sum1.us
124 %add.us0 = add i32 %0, %sum1.us
125 %add.us1 = add i32 %0, %sum1.us
126 %add.us2 = add i32 %0, %sum1.us
127 %add.us3 = add i32 %0, %sum1.us
128 %add.us4 = add i32 %0, %sum1.us
129 %add.us5 = add i32 %0, %sum1.us
130 %add.us6 = add i32 %0, %sum1.us
131 %add.us7 = add i32 %0, %sum1.us
132 %add.us8 = add i32 %0, %sum1.us
133 %add.us9 = add i32 %0, %sum1.us
134 %add.us10 = add i32 %0, %sum1.us
135 %add.us11 = add i32 %0, %sum1.us
136 %add.us12 = add i32 %0, %sum1.us
137 %add.us13 = add i32 %0, %sum1.us
138 %add.us14 = add i32 %0, %sum1.us
139 %add.us15 = add i32 %0, %sum1.us
140 %add.us16 = add i32 %0, %sum1.us
141 %add.us17 = add i32 %0, %sum1.us
142 %add.us18 = add i32 %0, %sum1.us
143 %add.us19 = add i32 %0, %sum1.us
144 %add.us20 = add i32 %0, %sum1.us
145 %add.us21 = add i32 %0, %sum1.us
146 %add.us22 = add i32 %0, %sum1.us
147 %add.us23 = add i32 %0, %sum1.us
148 %add.us24 = add i32 %0, %sum1.us
149 %add.us25 = add i32 %0, %sum1.us
150 %add.us26 = add i32 %0, %sum1.us
151 %add.us27 = add i32 %0, %sum1.us
152 %add.us28 = add i32 %0, %sum1.us
153 %add.us29 = add i32 %0, %sum1.us
154 %inc.us = add nuw i32 %j.us, 1
155 %exitcond = icmp eq i32 %inc.us, %J
156 br i1 %exitcond, label %for.latch, label %for.inner
157
158 for.latch:
159 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
160 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
161 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4, !tbaa !5
162 %add8.us = add nuw i32 %i.us, 1
163 %exitcond25 = icmp eq i32 %add8.us, %I
164 br i1 %exitcond25, label %for.end.loopexit, label %for.outer
165
166 for.end.loopexit:
167 br label %for.end
168
169 for.end:
170 ret void
171 }
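unprof3 pads the inner loop with thirty dead adds purely to grow its size; jamming multiplies the inner body, so a body that large is rejected by the pass's inner-loop size limit. A sketch of the shape (the padding stands in for the %add.usN chain above):

    void unprof3_sketch(int I, int J, int *A, const int *B) {
      for (int i = 0; i < I; ++i) {
        int sum = 0, pad = 0;
        for (int j = 0; j < J; ++j) {
          sum += B[j];
          /* ...imagine a long chain of further adds here: a body this
             large exceeds the inner-loop size threshold */
          pad += B[j] * 3;
        }
        (void)pad; /* intentionally dead, mirroring the dead adds in the IR */
        A[i] = sum;
      }
    }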
172
173
174 ; CHECK-LABEL: unprof4
175 ; No loop invariant loads (see the sketch after this function)
176 define void @unprof4(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
177 ; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
178 ; CHECK: %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
179 entry:
180 %cmp = icmp ne i32 %J, 0
181 %cmp122 = icmp ne i32 %I, 0
182 %or.cond = and i1 %cmp, %cmp122
183 br i1 %or.cond, label %for.outer.preheader, label %for.end
184
185 for.outer.preheader:
186 br label %for.outer
187
188 for.outer:
189 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
190 br label %for.inner
191
192 for.inner:
193 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
194 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
195 %j2 = add i32 %j.us, %i.us
196 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j2
197 %0 = load i32, i32* %arrayidx.us, align 4, !tbaa !5
198 %add.us = add i32 %0, %sum1.us
199 %inc.us = add nuw i32 %j.us, 1
200 %exitcond = icmp eq i32 %inc.us, %J
201 br i1 %exitcond, label %for.latch, label %for.inner
202
203 for.latch:
204 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
205 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
206 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4, !tbaa !5
207 %add8.us = add nuw i32 %i.us, 1
208 %exitcond25 = icmp eq i32 %add8.us, %I
209 br i1 %exitcond25, label %for.end.loopexit, label %for.outer
210
211 for.end.loopexit:
212 br label %for.end
213
214 for.end:
215 ret void
216 }
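unprof4 indexes B with i+j, so the load address is not invariant in the outer loop: jammed copies of the inner loop could not share the load, which removes the main benefit the heuristic looks for. Roughly (a sketch):

    void unprof4_sketch(int I, int J, int *A, const int *B) {
      for (int i = 0; i < I; ++i) {
        int sum = 0;
        for (int j = 0; j < J; ++j)
          sum += B[i + j]; /* address varies with i: nothing to share */
        A[i] = sum;
      }
    }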
217
218
219 attributes #0 = { "target-cpu"="cortex-m33" }
220
221 !5 = !{!6, !6, i64 0}
222 !6 = !{!"omnipotent char", !7, i64 0}
223 !7 = !{!"Simple C/C++ TBAA"}
0 ; RUN: opt -basicaa -tbaa -loop-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s
1
2 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
3 target triple = "thumbv8m.main-arm-none-eabi"
4
5 ; CHECK-LABEL: test1
6 ; Tests for(i) { sum = 0; for(j) sum += B[j]; A[i] = sum; }
7 ; CHECK-NEXT: entry:
8 ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[J:%.*]], 0
9 ; CHECK-NEXT: [[CMP122:%.*]] = icmp ne i32 [[I:%.*]], 0
10 ; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP]], [[CMP122]]
11 ; CHECK-NEXT: br i1 [[OR_COND]], label [[FOR_OUTER_PREHEADER:%.*]], label [[FOR_END:%.*]]
12 ; CHECK: for.outer.preheader:
13 ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[I]], -1
14 ; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[I]], 3
15 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 3
16 ; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_END_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_OUTER_PREHEADER_NEW:%.*]]
17 ; CHECK: for.outer.preheader.new:
18 ; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i32 [[I]], [[XTRAITER]]
19 ; CHECK-NEXT: br label [[FOR_OUTER:%.*]]
20 ; CHECK: for.outer:
21 ; CHECK-NEXT: [[I_US:%.*]] = phi i32 [ [[ADD8_US_3:%.*]], [[FOR_LATCH:%.*]] ], [ 0, [[FOR_OUTER_PREHEADER_NEW]] ]
22 ; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ [[UNROLL_ITER]], [[FOR_OUTER_PREHEADER_NEW]] ], [ [[NITER_NSUB_3:%.*]], [[FOR_LATCH]] ]
23 ; CHECK-NEXT: [[ADD8_US:%.*]] = add nuw nsw i32 [[I_US]], 1
24 ; CHECK-NEXT: [[NITER_NSUB:%.*]] = sub i32 [[NITER]], 1
25 ; CHECK-NEXT: [[ADD8_US_1:%.*]] = add nuw nsw i32 [[ADD8_US]], 1
26 ; CHECK-NEXT: [[NITER_NSUB_1:%.*]] = sub i32 [[NITER_NSUB]], 1
27 ; CHECK-NEXT: [[ADD8_US_2:%.*]] = add nuw nsw i32 [[ADD8_US_1]], 1
28 ; CHECK-NEXT: [[NITER_NSUB_2:%.*]] = sub i32 [[NITER_NSUB_1]], 1
29 ; CHECK-NEXT: [[ADD8_US_3]] = add nuw i32 [[ADD8_US_2]], 1
30 ; CHECK-NEXT: [[NITER_NSUB_3]] = sub i32 [[NITER_NSUB_2]], 1
31 ; CHECK-NEXT: br label [[FOR_INNER:%.*]]
32 ; CHECK: for.inner:
33 ; CHECK-NEXT: [[J_US:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_US:%.*]], [[FOR_INNER]] ]
34 ; CHECK-NEXT: [[SUM1_US:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_US:%.*]], [[FOR_INNER]] ]
35 ; CHECK-NEXT: [[J_US_1:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_US_1:%.*]], [[FOR_INNER]] ]
36 ; CHECK-NEXT: [[SUM1_US_1:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_US_1:%.*]], [[FOR_INNER]] ]
37 ; CHECK-NEXT: [[J_US_2:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_US_2:%.*]], [[FOR_INNER]] ]
38 ; CHECK-NEXT: [[SUM1_US_2:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_US_2:%.*]], [[FOR_INNER]] ]
39 ; CHECK-NEXT: [[J_US_3:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_US_3:%.*]], [[FOR_INNER]] ]
40 ; CHECK-NEXT: [[SUM1_US_3:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_US_3:%.*]], [[FOR_INNER]] ]
41 ; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[J_US]]
42 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4, !tbaa !0
43 ; CHECK-NEXT: [[ADD_US]] = add i32 [[TMP2]], [[SUM1_US]]
44 ; CHECK-NEXT: [[INC_US]] = add nuw i32 [[J_US]], 1
45 ; CHECK-NEXT: [[ARRAYIDX_US_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_US_1]]
46 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_US_1]], align 4, !tbaa !0
47 ; CHECK-NEXT: [[ADD_US_1]] = add i32 [[TMP3]], [[SUM1_US_1]]
48 ; CHECK-NEXT: [[INC_US_1]] = add nuw i32 [[J_US_1]], 1
49 ; CHECK-NEXT: [[ARRAYIDX_US_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_US_2]]
50 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_US_2]], align 4, !tbaa !0
51 ; CHECK-NEXT: [[ADD_US_2]] = add i32 [[TMP4]], [[SUM1_US_2]]
52 ; CHECK-NEXT: [[INC_US_2]] = add nuw i32 [[J_US_2]], 1
53 ; CHECK-NEXT: [[ARRAYIDX_US_3:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_US_3]]
54 ; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_US_3]], align 4, !tbaa !0
55 ; CHECK-NEXT: [[ADD_US_3]] = add i32 [[TMP5]], [[SUM1_US_3]]
56 ; CHECK-NEXT: [[INC_US_3]] = add nuw i32 [[J_US_3]], 1
57 ; CHECK-NEXT: [[EXITCOND_3:%.*]] = icmp eq i32 [[INC_US_3]], [[J]]
58 ; CHECK-NEXT: br i1 [[EXITCOND_3]], label [[FOR_LATCH]], label [[FOR_INNER]]
59 ; CHECK: for.latch:
60 ; CHECK-NEXT: [[ADD_US_LCSSA:%.*]] = phi i32 [ [[ADD_US]], [[FOR_INNER]] ]
61 ; CHECK-NEXT: [[ADD_US_LCSSA_1:%.*]] = phi i32 [ [[ADD_US_1]], [[FOR_INNER]] ]
62 ; CHECK-NEXT: [[ADD_US_LCSSA_2:%.*]] = phi i32 [ [[ADD_US_2]], [[FOR_INNER]] ]
63 ; CHECK-NEXT: [[ADD_US_LCSSA_3:%.*]] = phi i32 [ [[ADD_US_3]], [[FOR_INNER]] ]
64