[UnrollAndJam] New Unroll and Jam pass

This is a simple implementation of the unroll-and-jam classical loop
optimisation. The basic idea is that we take an outer loop of the form:

    for i..
      ForeBlocks(i)
      for j..
        SubLoopBlocks(i, j)
      AftBlocks(i)

Instead of doing normal inner or outer unrolling, we unroll as follows:

    for i... i+=2
      ForeBlocks(i)
      ForeBlocks(i+1)
      for j..
        SubLoopBlocks(i, j)
        SubLoopBlocks(i+1, j)
      AftBlocks(i)
      AftBlocks(i+1)
    Remainder Loop

So we have unrolled the outer loop, then jammed the two inner loops into
one. This can lead to a simpler inner loop if memory accesses can be
shared between the now-jammed loops. To do this we have to prove that
this is all safe, both for the memory accesses (using dependence
analysis) and that ForeBlocks(i+1) can move before AftBlocks(i) and
SubLoopBlocks(i, j).

Differential Revision: https://reviews.llvm.org/D41953

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@336062 91177308-0d34-0410-b5e6-96231b3b80d8

David Green
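As a source-level illustration of the transformation (not part of the patch; loop bounds and array names are invented for the example), consider a nest where the inner loop repeatedly loads a value that is invariant in the outer loop:

    // Before: B[j] is loaded once per (i, j) pair.
    for (int i = 0; i < N; ++i)
      for (int j = 0; j < M; ++j)
        C[i] += A[i][j] * B[j];

    // After unroll-and-jam by 2 (assuming N is even, so no remainder loop):
    // a single load of B[j] now feeds two multiply-accumulates.
    for (int i = 0; i < N; i += 2)
      for (int j = 0; j < M; ++j) {
        C[i]     += A[i][j]     * B[j];
        C[i + 1] += A[i + 1][j] * B[j];
      }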
23 changed file(s) with 3849 addition(s) and 20 deletion(s).
421421 bool AllowPeeling;
422422 /// Allow unrolling of all the iterations of the runtime loop remainder.
423423 bool UnrollRemainder;
424 /// Allow unroll and jam. Used to enable unroll and jam for the target.
425 bool UnrollAndJam;
426 /// Threshold for unroll and jam, for inner loop size. The 'Threshold'
427 /// value above is used during unroll and jam for the outer loop size.
428 /// This value is used in the same manner to limit the size of the inner
429 /// loop.
430 unsigned UnrollAndJamInnerLoopThreshold;
424431 };
425432
426433 /// Get target-customized preferences for the generic loop unrolling
225225 void initializeLoopSimplifyPass(PassRegistry&);
226226 void initializeLoopStrengthReducePass(PassRegistry&);
227227 void initializeLoopUnrollPass(PassRegistry&);
228 void initializeLoopUnrollAndJamPass(PassRegistry&);
228229 void initializeLoopUnswitchPass(PassRegistry&);
229230 void initializeLoopVectorizePass(PassRegistry&);
230231 void initializeLoopVersioningLICMPass(PassRegistry&);
131131 (void) llvm::createLoopStrengthReducePass();
132132 (void) llvm::createLoopRerollPass();
133133 (void) llvm::createLoopUnrollPass();
134 (void) llvm::createLoopUnrollAndJamPass();
134135 (void) llvm::createLoopUnswitchPass();
135136 (void) llvm::createLoopVersioningLICMPass();
136137 (void) llvm::createLoopIdiomPass();
0 //===- LoopUnrollAndJamPass.h -----------------------------------*- C++ -*-===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8
9 #ifndef LLVM_TRANSFORMS_SCALAR_LOOPUNROLLANDJAMPASS_H
10 #define LLVM_TRANSFORMS_SCALAR_LOOPUNROLLANDJAMPASS_H
11
12 #include "llvm/Analysis/LoopAnalysisManager.h"
13 #include "llvm/Analysis/LoopInfo.h"
14 #include "llvm/IR/PassManager.h"
15
16 namespace llvm {
17
18 class Loop;
19 struct LoopStandardAnalysisResults;
20 class LPMUpdater;
21
22 /// A simple unroll and jam pass.
23 class LoopUnrollAndJamPass : public PassInfoMixin<LoopUnrollAndJamPass> {
24 const int OptLevel;
25
26 public:
27 explicit LoopUnrollAndJamPass(int OptLevel = 2) : OptLevel(OptLevel) {}
28 PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
29 LoopStandardAnalysisResults &AR, LPMUpdater &U);
30 };
31
32 } // end namespace llvm
33
34 #endif // LLVM_TRANSFORMS_SCALAR_LOOPUNROLLANDJAMPASS_H
191191
192192 //===----------------------------------------------------------------------===//
193193 //
194 // LoopUnrollAndJam - This pass is a simple loop unroll and jam pass.
195 //
196 Pass *createLoopUnrollAndJamPass(int OptLevel = 2);
197
198 //===----------------------------------------------------------------------===//
199 //
194200 // LoopReroll - This pass is a simple loop rerolling pass.
195201 //
196202 Pass *createLoopRerollPass();
1818 #include "llvm/ADT/DenseMap.h"
1919 #include "llvm/ADT/StringRef.h"
2020 #include "llvm/Analysis/TargetTransformInfo.h"
21 #include "llvm/Transforms/Utils/ValueMapper.h"
2122
2223 namespace llvm {
2324
2425 class AssumptionCache;
2526 class BasicBlock;
27 class DependenceInfo;
2628 class DominatorTree;
2729 class Loop;
2830 class LoopInfo;
7779 bool peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, ScalarEvolution *SE,
7880 DominatorTree *DT, AssumptionCache *AC, bool PreserveLCSSA);
7981
82 LoopUnrollResult UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
83 unsigned TripMultiple, bool UnrollRemainder,
84 LoopInfo *LI, ScalarEvolution *SE,
85 DominatorTree *DT, AssumptionCache *AC,
86 OptimizationRemarkEmitter *ORE);
87
88 bool isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
89 DependenceInfo &DI);
90
91 bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI,
92 DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
93 const SmallPtrSetImpl<const Value *> &EphValues,
94 OptimizationRemarkEmitter *ORE, unsigned &TripCount,
95 unsigned MaxTripCount, unsigned &TripMultiple,
96 unsigned LoopSize,
97 TargetTransformInfo::UnrollingPreferences &UP,
98 bool &UseUpperBound);
99
100 BasicBlock *foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI,
101 ScalarEvolution *SE, DominatorTree *DT);
102
103 void remapInstruction(Instruction *I, ValueToValueMapTy &VMap);
104
105 void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
106 ScalarEvolution *SE, DominatorTree *DT,
107 AssumptionCache *AC);
108
80109 MDNode *GetUnrollMetadata(MDNode *LoopID, StringRef Name);
110
111 TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
112 Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
113 Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
114 Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
115 Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling);
116
117 unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls,
118 bool &NotDuplicatable, bool &Convergent,
119 const TargetTransformInfo &TTI,
120 const SmallPtrSetImpl<const Value *> &EphValues,
121 unsigned BEInsns);
81122
82123 } // end namespace llvm
83124
8888 /** See llvm::createLoopUnrollPass function. */
8989 void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM);
9090
91 /** See llvm::createLoopUnrollAndJamPass function. */
92 void LLVMAddLoopUnrollAndJamPass(LLVMPassManagerRef PM);
93
9194 /** See llvm::createLoopUnswitchPass function. */
9295 void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM);
9396
120120 #include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
121121 #include "llvm/Transforms/Scalar/LoopSink.h"
122122 #include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
123 #include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
123124 #include "llvm/Transforms/Scalar/LoopUnrollPass.h"
124125 #include "llvm/Transforms/Scalar/LowerAtomic.h"
125126 #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
178179 "enable-npm-gvn-sink", cl::init(false), cl::Hidden,
179180 cl::desc("Enable the GVN hoisting pass for the new PM (default = off)"));
180181
182 static cl::opt<bool> EnableUnrollAndJam(
183 "enable-npm-unroll-and-jam", cl::init(false), cl::Hidden,
184 cl::desc("Enable the Unroll and Jam pass for the new PM (default = off)"));
185
181186 static cl::opt<bool> EnableSyntheticCounts(
182187 "enable-npm-synthetic-counts", cl::init(false), cl::Hidden, cl::ZeroOrMore,
183188 cl::desc("Run synthetic function entry count generation "
797802 // FIXME: It would be really good to use a loop-integrated instruction
798803 // combiner for cleanup here so that the unrolling and LICM can be pipelined
799804 // across the loop nests.
805 // We do UnrollAndJam in a separate LPM to ensure it happens before unroll
806 if (EnableUnrollAndJam) {
807 OptimizePM.addPass(
808 createFunctionToLoopPassAdaptor(LoopUnrollAndJamPass(Level)));
809 }
800810 OptimizePM.addPass(LoopUnrollPass(Level));
801811 OptimizePM.addPass(InstCombinePass());
802812 OptimizePM.addPass(RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
240240 LOOP_PASS("strength-reduce", LoopStrengthReducePass())
241241 LOOP_PASS("indvars", IndVarSimplifyPass())
242242 LOOP_PASS("irce", IRCEPass())
243 LOOP_PASS("unroll-and-jam", LoopUnrollAndJamPass())
243244 LOOP_PASS("unroll-full", LoopFullUnrollPass())
244245 LOOP_PASS("unswitch", SimpleLoopUnswitchPass())
245246 LOOP_PASS("print-access-info", LoopAccessInfoPrinterPass(dbgs()))
621621 UP.Runtime = true;
622622 UP.UnrollRemainder = true;
623623 UP.DefaultUnrollRuntimeCount = 4;
624 UP.UnrollAndJam = true;
625 UP.UnrollAndJamInnerLoopThreshold = 60;
624626
625627 // Force unrolling small loops can be very useful because of the branch
626628 // taken cost of the backedge.
9595 "enable-loopinterchange", cl::init(false), cl::Hidden,
9696 cl::desc("Enable the new, experimental LoopInterchange Pass"));
9797
98 static cl::opt<bool> EnableUnrollAndJam("enable-unroll-and-jam",
99 cl::init(false), cl::Hidden,
100 cl::desc("Enable Unroll And Jam Pass"));
101
98102 static cl::opt<bool>
99103 EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden,
100104 cl::desc("Enable preparation for ThinLTO."));
668672 addInstructionCombiningPass(MPM);
669673
670674 if (!DisableUnrollLoops) {
675 if (EnableUnrollAndJam) {
676 // Unroll and Jam. We do this before unroll but need to be in a separate
677 // loop pass manager in order for the outer loop to be processed by
678 // unroll and jam before the inner loop is unrolled.
679 MPM.add(createLoopUnrollAndJamPass(OptLevel));
680 }
681
671682 MPM.add(createLoopUnrollPass(OptLevel)); // Unroll small loops
672683
673684 // LoopUnroll may generate some redundancy to clean up.
3838 LoopSimplifyCFG.cpp
3939 LoopStrengthReduce.cpp
4040 LoopUnrollPass.cpp
41 LoopUnrollAndJamPass.cpp
4142 LoopUnswitch.cpp
4243 LoopVersioningLICM.cpp
4344 LowerAtomic.cpp
0 //===- LoopUnrollAndJam.cpp - Loop unroll and jam pass --------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass implements an unroll and jam pass. Most of the work is done by
10 // Utils/LoopUnrollAndJam.cpp.
11 //===----------------------------------------------------------------------===//
12
13 #include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
14 #include "llvm/ADT/None.h"
15 #include "llvm/ADT/STLExtras.h"
16 #include "llvm/ADT/SmallPtrSet.h"
17 #include "llvm/ADT/StringRef.h"
18 #include "llvm/Analysis/AssumptionCache.h"
19 #include "llvm/Analysis/CodeMetrics.h"
20 #include "llvm/Analysis/DependenceAnalysis.h"
21 #include "llvm/Analysis/LoopAnalysisManager.h"
22 #include "llvm/Analysis/LoopInfo.h"
23 #include "llvm/Analysis/LoopPass.h"
24 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
25 #include "llvm/Analysis/ScalarEvolution.h"
26 #include "llvm/Analysis/TargetTransformInfo.h"
27 #include "llvm/IR/BasicBlock.h"
28 #include "llvm/IR/CFG.h"
29 #include "llvm/IR/Constant.h"
30 #include "llvm/IR/Constants.h"
31 #include "llvm/IR/Dominators.h"
32 #include "llvm/IR/Function.h"
33 #include "llvm/IR/Instruction.h"
34 #include "llvm/IR/Instructions.h"
35 #include "llvm/IR/IntrinsicInst.h"
36 #include "llvm/IR/Metadata.h"
37 #include "llvm/IR/PassManager.h"
38 #include "llvm/Pass.h"
39 #include "llvm/Support/Casting.h"
40 #include "llvm/Support/CommandLine.h"
41 #include "llvm/Support/Debug.h"
42 #include "llvm/Support/ErrorHandling.h"
43 #include "llvm/Support/raw_ostream.h"
44 #include "llvm/Transforms/Scalar.h"
45 #include "llvm/Transforms/Scalar/LoopPassManager.h"
46 #include "llvm/Transforms/Utils.h"
47 #include "llvm/Transforms/Utils/LoopUtils.h"
48 #include "llvm/Transforms/Utils/UnrollLoop.h"
49 #include <algorithm>
50 #include <cassert>
51 #include <cstdint>
52 #include <string>
53
54 using namespace llvm;
55
56 #define DEBUG_TYPE "loop-unroll-and-jam"
57
58 static cl::opt<bool>
59 AllowUnrollAndJam("allow-unroll-and-jam", cl::Hidden,
60 cl::desc("Allows loops to be unroll-and-jammed."));
61
62 static cl::opt<unsigned> UnrollAndJamCount(
63 "unroll-and-jam-count", cl::Hidden,
64 cl::desc("Use this unroll count for all loops including those with "
65 "unroll_and_jam_count pragma values, for testing purposes"));
66
67 static cl::opt<unsigned> UnrollAndJamThreshold(
68 "unroll-and-jam-threshold", cl::init(60), cl::Hidden,
69 cl::desc("Threshold to use for inner loop when doing unroll and jam."));
70
71 static cl::opt<unsigned> PragmaUnrollAndJamThreshold(
72 "pragma-unroll-and-jam-threshold", cl::init(1024), cl::Hidden,
73 cl::desc("Unrolled size limit for loops with an unroll_and_jam(full) or "
74 "unroll_count pragma."));
75
76 // Returns the loop hint metadata node with the given name (for example,
77 // "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
78 // returned.
79 static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) {
80 if (MDNode *LoopID = L->getLoopID())
81 return GetUnrollMetadata(LoopID, Name);
82 return nullptr;
83 }
84
85 // Returns true if the loop has any metadata starting with Prefix. For example, a
86 // Prefix of "llvm.loop.unroll." returns true if we have any unroll metadata.
87 static bool HasAnyUnrollPragma(const Loop *L, StringRef Prefix) {
88 if (MDNode *LoopID = L->getLoopID()) {
89 // First operand should refer to the loop id itself.
90 assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
91 assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
92
93 for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
94 MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
95 if (!MD)
96 continue;
97
98 MDString *S = dyn_cast<MDString>(MD->getOperand(0));
99 if (!S)
100 continue;
101
102 if (S->getString().startswith(Prefix))
103 return true;
104 }
105 }
106 return false;
107 }
108
109 // Returns true if the loop has an unroll_and_jam(enable) pragma.
110 static bool HasUnrollAndJamEnablePragma(const Loop *L) {
111 return GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.enable");
112 }
113
114 // Returns true if the loop has an unroll_and_jam(disable) pragma.
115 static bool HasUnrollAndJamDisablePragma(const Loop *L) {
116 return GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.disable");
117 }
118
119 // If loop has an unroll_and_jam_count pragma return the (necessarily
120 // positive) value from the pragma. Otherwise return 0.
121 static unsigned UnrollAndJamCountPragmaValue(const Loop *L) {
122 MDNode *MD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.count");
123 if (MD) {
124 assert(MD->getNumOperands() == 2 &&
125 "Unroll count hint metadata should have two operands.");
126 unsigned Count =
127 mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
128 assert(Count >= 1 && "Unroll count must be positive.");
129 return Count;
130 }
131 return 0;
132 }
133
134 // Returns loop size estimation for unrolled loop.
135 static uint64_t
136 getUnrollAndJammedLoopSize(unsigned LoopSize,
137 TargetTransformInfo::UnrollingPreferences &UP) {
138 assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
139 return static_cast<uint64_t>(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
140 }
141
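For intuition, a worked instance of this estimate (numbers invented for illustration): with UP.BEInsns = 2, a loop body of LoopSize = 10 unrolled by UP.Count = 4 is sized as (10 - 2) * 4 + 2 = 34, reflecting that the backedge instructions are not duplicated per unrolled iteration.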
142 // Calculates unroll and jam count and writes it to UP.Count. Returns true if
143 // unroll count was set explicitly.
144 static bool computeUnrollAndJamCount(
145 Loop *L, Loop *SubLoop, const TargetTransformInfo &TTI, DominatorTree &DT,
146 LoopInfo *LI, ScalarEvolution &SE,
147 const SmallPtrSetImpl<const Value *> &EphValues,
148 OptimizationRemarkEmitter *ORE, unsigned OuterTripCount,
149 unsigned OuterTripMultiple, unsigned OuterLoopSize, unsigned InnerTripCount,
150 unsigned InnerLoopSize, TargetTransformInfo::UnrollingPreferences &UP) {
151 // Check for explicit Count from the "unroll-and-jam-count" option.
152 bool UserUnrollCount = UnrollAndJamCount.getNumOccurrences() > 0;
153 if (UserUnrollCount) {
154 UP.Count = UnrollAndJamCount;
155 UP.Force = true;
156 if (UP.AllowRemainder &&
157 getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold &&
158 getUnrollAndJammedLoopSize(InnerLoopSize, UP) <
159 UP.UnrollAndJamInnerLoopThreshold)
160 return true;
161 }
162
163 // Check for unroll_and_jam pragmas
164 unsigned PragmaCount = UnrollAndJamCountPragmaValue(L);
165 if (PragmaCount > 0) {
166 UP.Count = PragmaCount;
167 UP.Runtime = true;
168 UP.Force = true;
169 if ((UP.AllowRemainder || (OuterTripMultiple % PragmaCount == 0)) &&
170 getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold &&
171 getUnrollAndJammedLoopSize(InnerLoopSize, UP) <
172 UP.UnrollAndJamInnerLoopThreshold)
173 return true;
174 }
175
176 // Use computeUnrollCount from the loop unroller to get a sensible count
177 // for unrolling the outer loop. This uses UP.Threshold /
178 // UP.PartialThreshold / UP.MaxCount to come up with sensible loop values.
179 // We have already checked that the loop has no unroll.* pragmas.
180 unsigned MaxTripCount = 0;
181 bool UseUpperBound = false;
182 bool ExplicitUnroll = computeUnrollCount(
183 L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount,
184 OuterTripMultiple, OuterLoopSize, UP, UseUpperBound);
185 if (ExplicitUnroll || UseUpperBound) {
186 // If the user explicitly set the loop as unrolled, don't UnJ it. Leave it
187 // for the unroller instead.
188 UP.Count = 0;
189 return false;
190 }
191
192 bool PragmaEnableUnroll = HasUnrollAndJamEnablePragma(L);
193 ExplicitUnroll = PragmaCount > 0 || PragmaEnableUnroll || UserUnrollCount;
194
195 // If the loop has an unrolling pragma, we want to be more aggressive with
196 // unrolling limits.
197 if (ExplicitUnroll && OuterTripCount != 0)
198 UP.UnrollAndJamInnerLoopThreshold = PragmaUnrollAndJamThreshold;
199
200 if (!UP.AllowRemainder && getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
201 UP.UnrollAndJamInnerLoopThreshold) {
202 UP.Count = 0;
203 return false;
204 }
205
206 // If the inner loop count is known and small, leave the entire loop nest
207 // to the unroller.
208 if (!ExplicitUnroll && InnerTripCount &&
209 InnerLoopSize * InnerTripCount < UP.Threshold) {
210 UP.Count = 0;
211 return false;
212 }
213
214 // We have a sensible limit for the outer loop, now adjust it for the inner
215 // loop and UP.UnrollAndJamInnerLoopThreshold.
216 while (UP.Count != 0 && UP.AllowRemainder &&
217 getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
218 UP.UnrollAndJamInnerLoopThreshold)
219 UP.Count--;
220
221 if (!ExplicitUnroll) {
222 // Check for situations where UnJ is likely to be unprofitable, such as
223 // subloops with more than one block.
224 if (SubLoop->getBlocks().size() != 1) {
225 UP.Count = 0;
226 return false;
227 }
228
229 // Limit to loops where there is something to gain from unrolling and
230 // jamming the loop. In this case, look for loads that are invariant in the
231 // outer loop and can become shared.
232 unsigned NumInvariant = 0;
233 for (BasicBlock *BB : SubLoop->getBlocks()) {
234 for (Instruction &I : *BB) {
235 if (auto *Ld = dyn_cast(&I)) {
236 Value *V = Ld->getPointerOperand();
237 const SCEV *LSCEV = SE.getSCEVAtScope(V, L);
238 if (SE.isLoopInvariant(LSCEV, L))
239 NumInvariant++;
240 }
241 }
242 }
243 if (NumInvariant == 0) {
244 UP.Count = 0;
245 return false;
246 }
247 }
248
249 return ExplicitUnroll;
250 }
251
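To make the invariant-load heuristic above concrete, here is a hedged source-level sketch (array names and bounds invented) of the kind of load it counts:

    for (int i = 0; i < N; ++i)     // outer loop L
      for (int j = 0; j < M; ++j)   // single-block subloop
        sum[i] += B[j];             // the address &B[j] does not vary with i,
                                    // so its SCEV is invariant in L and the
                                    // load can be shared once the inner
                                    // loops are jammed.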
252 static LoopUnrollResult
253 tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
254 ScalarEvolution &SE, const TargetTransformInfo &TTI,
255 AssumptionCache &AC, DependenceInfo &DI,
256 OptimizationRemarkEmitter &ORE, int OptLevel) {
257 // Quick checks of the correct loop form
258 if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
259 return LoopUnrollResult::Unmodified;
260 Loop *SubLoop = L->getSubLoops()[0];
261 if (!SubLoop->isLoopSimplifyForm())
262 return LoopUnrollResult::Unmodified;
263
264 BasicBlock *Latch = L->getLoopLatch();
265 BasicBlock *Exit = L->getExitingBlock();
266 BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
267 BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
268
269 if (Latch != Exit || SubLoopLatch != SubLoopExit)
270 return LoopUnrollResult::Unmodified;
271
272 TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
273 L, SE, TTI, OptLevel, None, None, None, None, None, None);
274 if (AllowUnrollAndJam.getNumOccurrences() > 0)
275 UP.UnrollAndJam = AllowUnrollAndJam;
276 if (UnrollAndJamThreshold.getNumOccurrences() > 0)
277 UP.UnrollAndJamInnerLoopThreshold = UnrollAndJamThreshold;
278 // Exit early if unrolling is disabled.
279 if (!UP.UnrollAndJam || UP.UnrollAndJamInnerLoopThreshold == 0)
280 return LoopUnrollResult::Unmodified;
281
282 LLVM_DEBUG(dbgs() << "Loop Unroll and Jam: F["
283 << L->getHeader()->getParent()->getName() << "] Loop %"
284 << L->getHeader()->getName() << "\n");
285
286 // A loop with any unroll pragma (enabling/disabling/count/etc) is left for
287 // the unroller, so long as it does not explicitly have unroll_and_jam
288 // metadata. This means #pragma nounroll will disable unroll and jam as well
289 // as unrolling
290 if (HasUnrollAndJamDisablePragma(L) ||
291 (HasAnyUnrollPragma(L, "llvm.loop.unroll.") &&
292 !HasAnyUnrollPragma(L, "llvm.loop.unroll_and_jam."))) {
293 LLVM_DEBUG(dbgs() << " Disabled due to pragma.\n");
294 return LoopUnrollResult::Unmodified;
295 }
296
297 if (!isSafeToUnrollAndJam(L, SE, DT, DI)) {
298 LLVM_DEBUG(dbgs() << " Disabled due to not being safe.\n");
299 return LoopUnrollResult::Unmodified;
300 }
301
302 // Approximate the loop size and collect useful info
303 unsigned NumInlineCandidates;
304 bool NotDuplicatable;
305 bool Convergent;
306 SmallPtrSet<const Value *, 32> EphValues;
307 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
308 unsigned InnerLoopSize =
309 ApproximateLoopSize(SubLoop, NumInlineCandidates, NotDuplicatable,
310 Convergent, TTI, EphValues, UP.BEInsns);
311 unsigned OuterLoopSize =
312 ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
313 TTI, EphValues, UP.BEInsns);
314 LLVM_DEBUG(dbgs() << " Outer Loop Size: " << OuterLoopSize << "\n");
315 LLVM_DEBUG(dbgs() << " Inner Loop Size: " << InnerLoopSize << "\n");
316 if (NotDuplicatable) {
317 LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable "
318 "instructions.\n");
319 return LoopUnrollResult::Unmodified;
320 }
321 if (NumInlineCandidates != 0) {
322 LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
323 return LoopUnrollResult::Unmodified;
324 }
325 if (Convergent) {
326 LLVM_DEBUG(
327 dbgs() << " Not unrolling loop with convergent instructions.\n");
328 return LoopUnrollResult::Unmodified;
329 }
330
331 // Find trip count and trip multiple
332 unsigned OuterTripCount = SE.getSmallConstantTripCount(L, Latch);
333 unsigned OuterTripMultiple = SE.getSmallConstantTripMultiple(L, Latch);
334 unsigned InnerTripCount = SE.getSmallConstantTripCount(SubLoop, SubLoopLatch);
335
336 // Decide if, and by how much, to unroll
337 bool IsCountSetExplicitly = computeUnrollAndJamCount(
338 L, SubLoop, TTI, DT, LI, SE, EphValues, &ORE, OuterTripCount,
339 OuterTripMultiple, OuterLoopSize, InnerTripCount, InnerLoopSize, UP);
340 if (UP.Count <= 1)
341 return LoopUnrollResult::Unmodified;
342 // The unroll factor (Count) must be less than or equal to TripCount.
343 if (OuterTripCount && UP.Count > OuterTripCount)
344 UP.Count = OuterTripCount;
345
346 LoopUnrollResult UnrollResult =
347 UnrollAndJamLoop(L, UP.Count, OuterTripCount, OuterTripMultiple,
348 UP.UnrollRemainder, LI, &SE, &DT, &AC, &ORE);
349
350 // If the loop has an unroll count pragma or was unrolled by an explicitly
351 // set count, mark it as unrolled to prevent unrolling beyond that requested.
352 if (UnrollResult != LoopUnrollResult::FullyUnrolled && IsCountSetExplicitly)
353 L->setLoopAlreadyUnrolled();
354
355 return UnrollResult;
356 }
357
358 namespace {
359
360 class LoopUnrollAndJam : public LoopPass {
361 public:
362 static char ID; // Pass ID, replacement for typeid
363 unsigned OptLevel;
364
365 LoopUnrollAndJam(int OptLevel = 2) : LoopPass(ID), OptLevel(OptLevel) {
366 initializeLoopUnrollAndJamPass(*PassRegistry::getPassRegistry());
367 }
368
369 bool runOnLoop(Loop *L, LPPassManager &LPM) override {
370 if (skipLoop(L))
371 return false;
372
373 Function &F = *L->getHeader()->getParent();
374
375 auto &DT = getAnalysis().getDomTree();
376 LoopInfo *LI = &getAnalysis().getLoopInfo();
377 ScalarEvolution &SE = getAnalysis().getSE();
378 const TargetTransformInfo &TTI =
379 getAnalysis().getTTI(F);
380 auto &AC = getAnalysis().getAssumptionCache(F);
381 auto &DI = getAnalysis().getDI();
382 // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
383 // pass. Function analyses need to be preserved across loop transformations
384 // but ORE cannot be preserved (see comment before the pass definition).
385 OptimizationRemarkEmitter ORE(&F);
386
387 LoopUnrollResult Result =
388 tryToUnrollAndJamLoop(L, DT, LI, SE, TTI, AC, DI, ORE, OptLevel);
389
390 if (Result == LoopUnrollResult::FullyUnrolled)
391 LPM.markLoopAsDeleted(*L);
392
393 return Result != LoopUnrollResult::Unmodified;
394 }
395
396 /// This transformation requires natural loop information & requires that
397 /// loop preheaders be inserted into the CFG...
398 void getAnalysisUsage(AnalysisUsage &AU) const override {
399 AU.addRequired<AssumptionCacheTracker>();
400 AU.addRequired<TargetTransformInfoWrapperPass>();
401 AU.addRequired<DependenceAnalysisWrapperPass>();
402 getLoopAnalysisUsage(AU);
403 }
404 };
405
406 } // end anonymous namespace
407
408 char LoopUnrollAndJam::ID = 0;
409
410 INITIALIZE_PASS_BEGIN(LoopUnrollAndJam, "loop-unroll-and-jam",
411 "Unroll and Jam loops", false, false)
412 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
413 INITIALIZE_PASS_DEPENDENCY(LoopPass)
414 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
415 INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
416 INITIALIZE_PASS_END(LoopUnrollAndJam, "loop-unroll-and-jam",
417 "Unroll and Jam loops", false, false)
418
419 Pass *llvm::createLoopUnrollAndJamPass(int OptLevel) {
420 return new LoopUnrollAndJam(OptLevel);
421 }
422
423 PreservedAnalyses LoopUnrollAndJamPass::run(Loop &L, LoopAnalysisManager &AM,
424 LoopStandardAnalysisResults &AR,
425 LPMUpdater &) {
426 const auto &FAM =
427 AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
428 Function *F = L.getHeader()->getParent();
429
430 auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
431 // FIXME: This should probably be optional rather than required.
432 if (!ORE)
433 report_fatal_error(
434 "LoopUnrollAndJamPass: OptimizationRemarkEmitterAnalysis not cached at "
435 "a higher level");
436
437 DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI);
438
439 LoopUnrollResult Result = tryToUnrollAndJamLoop(
440 &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, DI, *ORE, OptLevel);
441
442 if (Result == LoopUnrollResult::Unmodified)
443 return PreservedAnalyses::all();
444
445 return getLoopPassPreservedAnalyses();
446 }
164164
165165 /// Gather the various unrolling parameters based on the defaults, compiler
166166 /// flags, TTI overrides and user specified parameters.
167 static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
167 TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
168168 Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel,
169169 Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
170170 Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
191191 UP.Force = false;
192192 UP.UpperBound = false;
193193 UP.AllowPeeling = true;
194 UP.UnrollAndJam = false;
195 UP.UnrollAndJamInnerLoopThreshold = 60;
194196
195197 // Override with any target specific settings
196198 TTI.getUnrollingPreferences(L, SE, UP);
614616 }
615617
616618 /// ApproximateLoopSize - Approximate the size of the loop.
617 static unsigned
618 ApproximateLoopSize(const Loop *L, unsigned &NumCalls, bool &NotDuplicatable,
619 bool &Convergent, const TargetTransformInfo &TTI,
620 const SmallPtrSetImpl<const Value *> &EphValues,
621 unsigned BEInsns) {
619 unsigned llvm::ApproximateLoopSize(
620 const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent,
621 const TargetTransformInfo &TTI,
622 const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
622623 CodeMetrics Metrics;
623624 for (BasicBlock *BB : L->blocks())
624625 Metrics.analyzeBasicBlock(BB, TTI, EphValues);
711712
712713 // Returns true if unroll count was set explicitly.
713714 // Calculates unroll count and writes it to UP.Count.
714 static bool computeUnrollCount(
715 bool llvm::computeUnrollCount(
715716 Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
716717 ScalarEvolution &SE, const SmallPtrSetImpl &EphValues,
717718 OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
752753
753754 if (ExplicitUnroll && TripCount != 0) {
754755 // If the loop has an unrolling pragma, we want to be more aggressive with
755 // unrolling limits. Set thresholds to at least the PragmaThreshold value
756 // which is larger than the default limits.
756 // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
757 // value which is larger than the default limits.
757758 UP.Threshold = std::max(UP.Threshold, PragmaUnrollThreshold);
758759 UP.PartialThreshold =
759760 std::max(UP.PartialThreshold, PragmaUnrollThreshold);
6969 initializeLoopStrengthReducePass(Registry);
7070 initializeLoopRerollPass(Registry);
7171 initializeLoopUnrollPass(Registry);
72 initializeLoopUnrollAndJamPass(Registry);
7273 initializeLoopUnswitchPass(Registry);
7374 initializeLoopVersioningLICMPass(Registry);
7475 initializeLoopIdiomRecognizeLegacyPassPass(Registry);
184185 unwrap(PM)->add(createLoopUnrollPass());
185186 }
186187
188 void LLVMAddLoopUnrollAndJamPass(LLVMPassManagerRef PM) {
189 unwrap(PM)->add(createLoopUnrollAndJamPass());
190 }
191
187192 void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM) {
188193 unwrap(PM)->add(createLoopUnswitchPass());
189194 }
2727 LoopRotationUtils.cpp
2828 LoopSimplify.cpp
2929 LoopUnroll.cpp
30 LoopUnrollAndJam.cpp
3031 LoopUnrollPeel.cpp
3132 LoopUnrollRuntime.cpp
3233 LoopUtils.cpp
6262
6363 /// Convert the instruction operands from referencing the current values into
6464 /// those specified by VMap.
65 static inline void remapInstruction(Instruction *I,
66 ValueToValueMapTy &VMap) {
65 void llvm::remapInstruction(Instruction *I, ValueToValueMapTy &VMap) {
6766 for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) {
6867 Value *Op = I->getOperand(op);
6968
9796 /// Folds a basic block into its predecessor if it only has one predecessor, and
9897 /// that predecessor only has one successor.
9998 /// The LoopInfo Analysis that is passed will be kept consistent.
100 static BasicBlock *
101 foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, ScalarEvolution *SE,
102 DominatorTree *DT) {
99 BasicBlock *llvm::foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI,
100 ScalarEvolution *SE,
101 DominatorTree *DT) {
103102 // Merge basic blocks into their predecessor if there is only one distinct
104103 // pred, and if there is only one distinct successor of the predecessor, and
105104 // if there are no PHI nodes.
109108 if (OnlyPred->getTerminator()->getNumSuccessors() != 1)
110109 return nullptr;
111110
112 LLVM_DEBUG(dbgs() << "Merging: " << *BB << "into: " << *OnlyPred);
111 LLVM_DEBUG(dbgs() << "Merging: " << BB->getName() << " into "
112 << OnlyPred->getName() << "\n");
113113
114114 // Resolve any PHI nodes at the start of the block. They are all
115115 // guaranteed to have exactly one entry if they exist, unless there are
254254 /// Perform some cleanup and simplifications on loops after unrolling. It is
255255 /// useful to simplify the IV's in the new loop, as well as do a quick
256256 /// simplify/dce pass of the instructions.
257 static void simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
258 ScalarEvolution *SE, DominatorTree *DT,
259 AssumptionCache *AC) {
257 void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
258 ScalarEvolution *SE, DominatorTree *DT,
259 AssumptionCache *AC) {
260260 // Simplify any new induction variables in the partially unrolled loop.
261261 if (SE && SimplifyIVs) {
262262 SmallVector DeadInsts;
472472 if (Force)
473473 RuntimeTripCount = false;
474474 else {
475 LLVM_DEBUG(dbgs() << "Wont unroll; remainder loop could not be generated"
476 "when assuming runtime trip count\n");
475 LLVM_DEBUG(dbgs() << "Won't unroll; remainder loop could not be "
476 "generated when assuming runtime trip count\n");
477477 return LoopUnrollResult::Unmodified;
478478 }
479479 }
0 //===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements loop unroll and jam as a routine, much like
10 // LoopUnroll.cpp implements loop unroll.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "llvm/ADT/SmallPtrSet.h"
15 #include "llvm/ADT/Statistic.h"
16 #include "llvm/Analysis/AssumptionCache.h"
17 #include "llvm/Analysis/DependenceAnalysis.h"
18 #include "llvm/Analysis/InstructionSimplify.h"
19 #include "llvm/Analysis/LoopAnalysisManager.h"
20 #include "llvm/Analysis/LoopIterator.h"
21 #include "llvm/Analysis/LoopPass.h"
22 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
23 #include "llvm/Analysis/ScalarEvolution.h"
24 #include "llvm/Analysis/ScalarEvolutionExpander.h"
25 #include "llvm/Analysis/Utils/Local.h"
26 #include "llvm/IR/BasicBlock.h"
27 #include "llvm/IR/DataLayout.h"
28 #include "llvm/IR/DebugInfoMetadata.h"
29 #include "llvm/IR/Dominators.h"
30 #include "llvm/IR/IntrinsicInst.h"
31 #include "llvm/IR/LLVMContext.h"
32 #include "llvm/Support/Debug.h"
33 #include "llvm/Support/raw_ostream.h"
34 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
35 #include "llvm/Transforms/Utils/Cloning.h"
36 #include "llvm/Transforms/Utils/LoopSimplify.h"
37 #include "llvm/Transforms/Utils/LoopUtils.h"
38 #include "llvm/Transforms/Utils/SimplifyIndVar.h"
39 #include "llvm/Transforms/Utils/UnrollLoop.h"
40 using namespace llvm;
41
42 #define DEBUG_TYPE "loop-unroll-and-jam"
43
44 STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
45 STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops fully unroll and jammed");
46
47 static bool containsBB(std::vector<BasicBlock *> &V, BasicBlock *BB) {
48 return std::find(V.begin(), V.end(), BB) != V.end();
49 }
50
51 // Partition blocks in an outer/inner loop pair into blocks before and after
52 // the loop
53 static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
54 std::vector &ForeBlocks,
55 std::vector &SubLoopBlocks,
56 std::vector &AftBlocks,
57 DominatorTree *DT) {
58 BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
59 SubLoopBlocks = SubLoop->getBlocks();
60
61 for (BasicBlock *BB : L->blocks()) {
62 if (!SubLoop->contains(BB)) {
63 if (DT->dominates(SubLoopLatch, BB))
64 AftBlocks.push_back(BB);
65 else
66 ForeBlocks.push_back(BB);
67 }
68 }
69
70 // Check that all blocks in ForeBlocks together dominate the subloop
71 // TODO: This might ideally be done better with dominators/postdominators.
72 BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
73 for (BasicBlock *BB : ForeBlocks) {
74 if (BB == SubLoopPreHeader)
75 continue;
76 TerminatorInst *TI = BB->getTerminator();
77 for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
78 if (!containsBB(ForeBlocks, TI->getSuccessor(i)))
79 return false;
80 }
81
82 return true;
83 }
84
85 // Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
86 static void
87 moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header, BasicBlock *Latch,
88 Instruction *InsertLoc,
89 std::vector &AftBlocks) {
90 // We need to ensure we move the instructions in the correct order,
91 // starting with the earliest required instruction and moving forward.
92 std::vector<Instruction *> Worklist;
93 std::vector<Instruction *> Visited;
94 for (auto &Phi : Header->phis()) {
95 Value *V = Phi.getIncomingValueForBlock(Latch);
96 if (Instruction *I = dyn_cast<Instruction>(V))
97 Worklist.push_back(I);
98 }
99
100 while (!Worklist.empty()) {
101 Instruction *I = Worklist.back();
102 Worklist.pop_back();
103 if (!containsBB(AftBlocks, I->getParent()))
104 continue;
105
106 Visited.push_back(I);
107 for (auto &U : I->operands())
108 if (Instruction *II = dyn_cast<Instruction>(U))
109 Worklist.push_back(II);
110 }
111
112 // Move all instructions in program order to before the InsertLoc
113 BasicBlock *InsertLocBB = InsertLoc->getParent();
114 for (Instruction *I : reverse(Visited)) {
115 if (I->getParent() != InsertLocBB)
116 I->moveBefore(InsertLoc);
117 }
118 }
119
120 /*
121 This method performs Unroll and Jam. For a simple loop like:
122 for (i = ..)
123 Fore(i)
124 for (j = ..)
125 SubLoop(i, j)
126 Aft(i)
127
128 Instead of doing normal inner or outer unrolling, we do:
129 for (i = .., i+=2)
130 Fore(i)
131 Fore(i+1)
132 for (j = ..)
133 SubLoop(i, j)
134 SubLoop(i+1, j)
135 Aft(i)
136 Aft(i+1)
137
138 So the outer loop is essentially unrolled and then the inner loops are fused
139 ("jammed") together into a single loop. This can increase speed when there
140 are loads in SubLoop that are invariant to i, as they become shared between
141 the now jammed inner loops.
142
143 We do this by splitting the blocks in the loop into Fore, Subloop and Aft.
144 Fore blocks are those before the inner loop, Aft are those after. Normal
145 Unroll code is used to copy each of these sets of blocks and the results are
146 combined together into the final form above.
147
148 isSafeToUnrollAndJam should be used prior to calling this to make sure the
149 unrolling will be valid. Checking profitability is also advisable.
150 */
151 LoopUnrollResult
152 llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
153 unsigned TripMultiple, bool UnrollRemainder,
154 LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
155 AssumptionCache *AC, OptimizationRemarkEmitter *ORE) {
156
157 // When we enter here we should have already checked that it is safe
158 BasicBlock *Header = L->getHeader();
159 assert(L->getSubLoops().size() == 1);
160 Loop *SubLoop = *L->begin();
161
162 // Don't enter the unroll code if there is nothing to do.
163 if (TripCount == 0 && Count < 2) {
164 LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
165 return LoopUnrollResult::Unmodified;
166 }
167
168 assert(Count > 0);
169 assert(TripMultiple > 0);
170 assert(TripCount == 0 || TripCount % TripMultiple == 0);
171
172 // Are we eliminating the loop control altogether?
173 bool CompletelyUnroll = (Count == TripCount);
174
175 // We use the runtime remainder in cases where we don't know trip multiple
176 if (TripMultiple == 1 || TripMultiple % Count != 0) {
177 if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
178 /*UseEpilogRemainder*/ true,
179 UnrollRemainder, LI, SE, DT, AC, true)) {
180 LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
181 "generated when assuming runtime trip count\n");
182 return LoopUnrollResult::Unmodified;
183 }
184 }
185
186 // Notify ScalarEvolution that the loop will be substantially changed,
187 // if not outright eliminated.
188 if (SE) {
189 SE->forgetLoop(L);
190 SE->forgetLoop(SubLoop);
191 }
192
193 using namespace ore;
194 // Report the unrolling decision.
195 if (CompletelyUnroll) {
196 LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
197 << Header->getName() << " with trip count " << TripCount
198 << "!\n");
199 ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
200 L->getHeader())
201 << "completely unroll and jammed loop with "
202 << NV("UnrollCount", TripCount) << " iterations");
203 } else {
204 auto DiagBuilder = [&]() {
205 OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
206 L->getHeader());
207 return Diag << "unroll and jammed loop by a factor of "
208 << NV("UnrollCount", Count);
209 };
210
211 LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
212 << " by " << Count);
213 if (TripMultiple != 1) {
214 LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
215 ORE->emit([&]() {
216 return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
217 << " trips per branch";
218 });
219 } else {
220 LLVM_DEBUG(dbgs() << " with run-time trip count");
221 ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
222 }
223 LLVM_DEBUG(dbgs() << "!\n");
224 }
225
226 BasicBlock *Preheader = L->getLoopPreheader();
227 BasicBlock *LatchBlock = L->getLoopLatch();
228 BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
229 assert(Preheader && LatchBlock && Header);
230 assert(BI && !BI->isUnconditional());
231 bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
232 BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
233 bool SubLoopContinueOnTrue = SubLoop->contains(
234 SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
235
236 // Partition blocks in an outer/inner loop pair into blocks before and after
237 // the loop
238 std::vector<BasicBlock *> SubLoopBlocks;
239 std::vector<BasicBlock *> ForeBlocks;
240 std::vector<BasicBlock *> AftBlocks;
241 partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
242 DT);
243
244 // We keep track of the entering/first and exiting/last block of each of
245 // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
246 // blocks easier.
247 std::vector<BasicBlock *> ForeBlocksFirst;
248 std::vector<BasicBlock *> ForeBlocksLast;
249 std::vector<BasicBlock *> SubLoopBlocksFirst;
250 std::vector<BasicBlock *> SubLoopBlocksLast;
251 std::vector<BasicBlock *> AftBlocksFirst;
252 std::vector<BasicBlock *> AftBlocksLast;
253 ForeBlocksFirst.push_back(Header);
254 ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
255 SubLoopBlocksFirst.push_back(SubLoop->getHeader());
256 SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
257 AftBlocksFirst.push_back(SubLoop->getExitBlock());
258 AftBlocksLast.push_back(L->getExitingBlock());
259 // Maps Blocks[0] -> Blocks[It]
260 ValueToValueMapTy LastValueMap;
261
262 // Move any instructions from fore phi operands from AftBlocks into Fore.
263 moveHeaderPhiOperandsToForeBlocks(
264 Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
265 AftBlocks);
266
267 // The current on-the-fly SSA update requires blocks to be processed in
268 // reverse postorder so that LastValueMap contains the correct value at each
269 // exit.
270 LoopBlocksDFS DFS(L);
271 DFS.perform(LI);
272 // Stash the DFS iterators before adding blocks to the loop.
273 LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
274 LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
275
276 if (Header->getParent()->isDebugInfoForProfiling())
277 for (BasicBlock *BB : L->getBlocks())
278 for (Instruction &I : *BB)
279 if (!isa(&I))
280 if (const DILocation *DIL = I.getDebugLoc())
281 I.setDebugLoc(DIL->cloneWithDuplicationFactor(Count));
282
283 // Copy all blocks
284 for (unsigned It = 1; It != Count; ++It) {
285 std::vector<BasicBlock *> NewBlocks;
286 // Maps Blocks[It] -> Blocks[It-1]
287 DenseMap<Value *, Value *> PrevItValueMap;
288
289 for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
290 ValueToValueMapTy VMap;
291 BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
292 Header->getParent()->getBasicBlockList().push_back(New);
293
294 if (containsBB(ForeBlocks, *BB)) {
295 L->addBasicBlockToLoop(New, *LI);
296
297 if (*BB == ForeBlocksFirst[0])
298 ForeBlocksFirst.push_back(New);
299 if (*BB == ForeBlocksLast[0])
300 ForeBlocksLast.push_back(New);
301 } else if (containsBB(SubLoopBlocks, *BB)) {
302 SubLoop->addBasicBlockToLoop(New, *LI);
303
304 if (*BB == SubLoopBlocksFirst[0])
305 SubLoopBlocksFirst.push_back(New);
306 if (*BB == SubLoopBlocksLast[0])
307 SubLoopBlocksLast.push_back(New);
308 } else if (containsBB(AftBlocks, *BB)) {
309 L->addBasicBlockToLoop(New, *LI);
310
311 if (*BB == AftBlocksFirst[0])
312 AftBlocksFirst.push_back(New);
313 if (*BB == AftBlocksLast[0])
314 AftBlocksLast.push_back(New);
315 } else {
316 llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
317 }
318
319 // Update our running maps of newest clones
320 PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
321 LastValueMap[*BB] = New;
322 for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
323 VI != VE; ++VI) {
324 PrevItValueMap[VI->second] =
325 const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
326 LastValueMap[VI->first] = VI->second;
327 }
328
329 NewBlocks.push_back(New);
330
331 // Update DomTree:
332 if (*BB == ForeBlocksFirst[0])
333 DT->addNewBlock(New, ForeBlocksLast[It - 1]);
334 else if (*BB == SubLoopBlocksFirst[0])
335 DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
336 else if (*BB == AftBlocksFirst[0])
337 DT->addNewBlock(New, AftBlocksLast[It - 1]);
338 else {
339 // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
340 // structure.
341 auto BBDomNode = DT->getNode(*BB);
342 auto BBIDom = BBDomNode->getIDom();
343 BasicBlock *OriginalBBIDom = BBIDom->getBlock();
344 assert(OriginalBBIDom);
345 assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
346 DT->addNewBlock(
347 New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
348 }
349 }
350
351 // Remap all instructions in the most recent iteration
352 for (BasicBlock *NewBlock : NewBlocks) {
353 for (Instruction &I : *NewBlock) {
354 ::remapInstruction(&I, LastValueMap);
355 if (auto *II = dyn_cast(&I))
356 if (II->getIntrinsicID() == Intrinsic::assume)
357 AC->registerAssumption(II);
358 }
359 }
360
361 // Alter the ForeBlocks phi's, pointing them at the latest version of the
362 // value from the previous iteration's phis
363 for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
364 Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
365 assert(OldValue && "should have incoming edge from Aft[It]");
366 Value *NewValue = OldValue;
367 if (Value *PrevValue = PrevItValueMap[OldValue])
368 NewValue = PrevValue;
369
370 assert(Phi.getNumOperands() == 2);
371 Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
372 Phi.setIncomingValue(0, NewValue);
373 Phi.removeIncomingValue(1);
374 }
375 }
376
377 // Now that all the basic blocks for the unrolled iterations are in place,
378 // finish up connecting the blocks and phi nodes. At this point LastValueMap
379 // is the last unrolled iterations values.
380
381 // Update Phis in BB from OldBB to point to NewBB
382 auto updatePHIBlocks = [](BasicBlock *BB, BasicBlock *OldBB,
383 BasicBlock *NewBB) {
384 for (PHINode &Phi : BB->phis()) {
385 int I = Phi.getBasicBlockIndex(OldBB);
386 Phi.setIncomingBlock(I, NewBB);
387 }
388 };
389 // Update Phis in BB from OldBB to point to NewBB and use the latest value
390 // from LastValueMap
391 auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
392 BasicBlock *NewBB,
393 ValueToValueMapTy &LastValueMap) {
394 for (PHINode &Phi : BB->phis()) {
395 for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
396 if (Phi.getIncomingBlock(b) == OldBB) {
397 Value *OldValue = Phi.getIncomingValue(b);
398 if (Value *LastValue = LastValueMap[OldValue])
399 Phi.setIncomingValue(b, LastValue);
400 Phi.setIncomingBlock(b, NewBB);
401 break;
402 }
403 }
404 }
405 };
406 // Move all the phis from Src into Dest
407 auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
408 Instruction *insertPoint = Dest->getFirstNonPHI();
409 while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
410 Phi->moveBefore(insertPoint);
411 };
412
413 // Update the PHI values outside the loop to point to the last block
414 updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
415 LastValueMap);
416
417 // Update ForeBlocks successors and phi nodes
418 BranchInst *ForeTerm =
419 cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
420 BasicBlock *Dest = SubLoopBlocksFirst[0];
421 ForeTerm->setSuccessor(0, Dest);
422
423 if (CompletelyUnroll) {
424 while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
425 Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
426 Phi->getParent()->getInstList().erase(Phi);
427 }
428 } else {
429 // Update the PHI values to point to the last aft block
430 updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
431 AftBlocksLast.back(), LastValueMap);
432 }
433
434 for (unsigned It = 1; It != Count; It++) {
435 // Remap ForeBlock successors from previous iteration to this
436 BranchInst *ForeTerm =
437 cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
438 BasicBlock *Dest = ForeBlocksFirst[It];
439 ForeTerm->setSuccessor(0, Dest);
440 }
441
442 // Subloop successors and phis
443 BranchInst *SubTerm =
444 cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
445 SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
446 SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
447 updatePHIBlocks(SubLoopBlocksFirst[0], ForeBlocksLast[0],
448 ForeBlocksLast.back());
449 updatePHIBlocks(SubLoopBlocksFirst[0], SubLoopBlocksLast[0],
450 SubLoopBlocksLast.back());
451
452 for (unsigned It = 1; It != Count; It++) {
453 // Replace the conditional branch of the previous iteration subloop with an
454 // unconditional one to this one
455 BranchInst *SubTerm =
456 cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
457 BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
458 SubTerm->eraseFromParent();
459
460 updatePHIBlocks(SubLoopBlocksFirst[It], ForeBlocksLast[It],
461 ForeBlocksLast.back());
462 updatePHIBlocks(SubLoopBlocksFirst[It], SubLoopBlocksLast[It],
463 SubLoopBlocksLast.back());
464 movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
465 }
466
467 // Aft blocks successors and phis
468 BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
469 if (CompletelyUnroll) {
470 BranchInst::Create(LoopExit, Term);
471 Term->eraseFromParent();
472 } else {
473 Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
474 }
475 updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
476 SubLoopBlocksLast.back());
477
478 for (unsigned It = 1; It != Count; It++) {
479 // Replace the conditional branch of the previous iteration subloop with an
480 // unconditional one to this one
481 BranchInst *AftTerm =
482 cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
483 BranchInst::Create(AftBlocksFirst[It], AftTerm);
484 AftTerm->eraseFromParent();
485
486 updatePHIBlocks(AftBlocksFirst[It], SubLoopBlocksLast[It],
487 SubLoopBlocksLast.back());
488 movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
489 }
490
491 // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
492 // new ones required.
493 if (Count != 1) {
494 SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
495 DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
496 SubLoopBlocksFirst[0]);
497 DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
498 SubLoopBlocksLast[0], AftBlocksFirst[0]);
499
500 DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
501 ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
502 DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
503 SubLoopBlocksLast.back(), AftBlocksFirst[0]);
504 DT->applyUpdates(DTUpdates);
505 }
506
507 // Merge adjacent basic blocks, if possible.
508 SmallPtrSet<BasicBlock *, 16> MergeBlocks;
509 MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
510 MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
511 MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
512 while (!MergeBlocks.empty()) {
513 BasicBlock *BB = *MergeBlocks.begin();
514 BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
515 if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
516 BasicBlock *Dest = Term->getSuccessor(0);
517 if (BasicBlock *Fold = foldBlockIntoPredecessor(Dest, LI, SE, DT)) {
518 // Don't remove BB and add Fold as they are the same BB
519 assert(Fold == BB);
520 (void)Fold;
521 MergeBlocks.erase(Dest);
522 } else
523 MergeBlocks.erase(BB);
524 } else
525 MergeBlocks.erase(BB);
526 }
527
528 // At this point, the code is well formed. We now do a quick sweep over the
529 // inserted code, doing constant propagation and dead code elimination as we
530 // go.
531 simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
532 simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);
533
534 NumCompletelyUnrolledAndJammed += CompletelyUnroll;
535 ++NumUnrolledAndJammed;
536
537 #ifndef NDEBUG
538 // We shouldn't have done anything to break loop simplify form or LCSSA.
539 Loop *OuterL = L->getParentLoop();
540 Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
541 assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
542 if (!CompletelyUnroll)
543 assert(L->isLoopSimplifyForm());
544 assert(SubLoop->isLoopSimplifyForm());
545 assert(DT->verify());
546 #endif
547
548 // Update LoopInfo if the loop is completely removed.
549 if (CompletelyUnroll)
550 LI->erase(L);
551
552 return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
553 : LoopUnrollResult::PartiallyUnrolled;
554 }
555
556 static bool getLoadsAndStores(std::vector &Blocks,
557 SmallVector &MemInstr) {
558 // Scan the BBs and collect legal loads and stores.
559 // Returns false if non-simple loads/stores are found.
560 for (BasicBlock *BB : Blocks) {
561 for (Instruction &I : *BB) {
562 if (auto *Ld = dyn_cast(&I)) {
563 if (!Ld->isSimple())
564 return false;
565 MemInstr.push_back(&I);
566 } else if (auto *St = dyn_cast(&I)) {
567 if (!St->isSimple())
568 return false;
569 MemInstr.push_back(&I);
570 } else if (I.mayReadOrWriteMemory()) {
571 return false;
572 }
573 }
574 }
575 return true;
576 }
577
578 static bool checkDependencies(SmallVector &Earlier,
579 SmallVector &Later,
580 unsigned LoopDepth, bool InnerLoop,
581 DependenceInfo &DI) {
582 // Use DA to check for dependencies between loads and stores that make unroll
583 // and jam invalid
584 for (Value *I : Earlier) {
585 for (Value *J : Later) {
586 Instruction *Src = cast<Instruction>(I);
587 Instruction *Dst = cast<Instruction>(J);
588 if (Src == Dst)
589 continue;
590 // Ignore Input dependencies.
591 if (isa(Src) && isa(Dst))
592 continue;
593
594 // Track dependencies, and if we find them take a conservative approach
595 // by allowing only = or < (not >), altough some > would be safe
596 // (depending upon unroll width).
597 // For the inner loop, we need to disallow any (> <) dependencies
598 // FIXME: Allow > so long as distance is less than unroll width
599 if (auto D = DI.depends(Src, Dst, true)) {
600 assert(D->isOrdered() && "Expected an output, flow or anti dep.");
601
602 if (D->isConfused())
603 return false;
604 if (!InnerLoop) {
605 if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT)
606 return false;
607 } else {
608 assert(LoopDepth + 1 <= D->getLevels());
609 if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
610 D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT)
611 return false;
612 }
613 }
614 }
615 }
616 return true;
617 }
618
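To make the direction-vector rule concrete, here is a hedged source-level example (array, bounds and body invented for illustration). In the nest below, iteration (i, j) reads A[i+1][j-1], an element that the later iteration (i+1, j-1) overwrites; querying the store against the load, DependenceAnalysis reports a (>, <) direction on the (outer, inner) levels. After jamming by two, body copy i+1 runs inside the same inner loop as copy i, so the write at inner iteration j-1 would move before the read at inner iteration j and change the result. This is exactly the (> <) combination the inner-loop check above rejects:

    for (int i = 0; i < N - 1; ++i)
      for (int j = 1; j < M; ++j)
        A[i][j] = A[i + 1][j - 1];  // (> <) dependence: unroll and jam unsafe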
619 static bool checkDependencies(Loop *L, std::vector &ForeBlocks,
620 std::vector &SubLoopBlocks,
621 std::vector &AftBlocks,
622 DependenceInfo &DI) {
623 // Get all loads/stores for each set of blocks
624 SmallVector<Value *, 4> ForeMemInstr;
625 SmallVector<Value *, 4> SubLoopMemInstr;
626 SmallVector<Value *, 4> AftMemInstr;
627 if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
628 !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
629 !getLoadsAndStores(AftBlocks, AftMemInstr))
630 return false;
631
632 // Check for dependencies between any blocks that may change order
633 unsigned LoopDepth = L->getLoopDepth();
634 return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
635 DI) &&
636 checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
637 checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
638 DI) &&
639 checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
640 DI);
641 }
642
643 bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
644 DependenceInfo &DI) {
645 /* We currently handle outer loops like this:
646 |
647 ForeFirst <----\ }
648 Blocks | } ForeBlocks
649 ForeLast | }
650 | |
651 SubLoopFirst <\ | }
652 Blocks | | } SubLoopBlocks
653 SubLoopLast -/ | }
654 | |
655 AftFirst | }
656 Blocks | } AftBlocks
657 AftLast ------/ }
658 |
659
660 There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
661 and AftBlocks, providing that there is one edge from Fores to SubLoops,
662 one edge from SubLoops to Afts and a single outer loop exit (from Afts).
663 In practice we currently limit Aft blocks to a single block, and limit
664 things further in the profitability checks of the unroll and jam pass.
665
666 Because of the way we rearrange basic blocks, we also require that
667 the Fore blocks on all unrolled iterations are safe to move before the
668 SubLoop blocks of all iterations. So we require that the phi node looping
669 operands of ForeHeader can be moved to at least the end of ForeEnd, so that
670 we can arrange cloned Fore Blocks before the subloop and match up Phis
671 correctly.
672
673 i.e. the old order of blocks is F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
674 It needs to be safe to transform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.
675
676 There are then a number of checks along the lines of no calls, no
677 exceptions, inner loop IV is consistent, etc. Note that for loops requiring
678 runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
679 UnrollAndJamLoop if the trip count cannot be easily calculated.
680 */
681
682 if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
683 return false;
684 Loop *SubLoop = L->getSubLoops()[0];
685 if (!SubLoop->isLoopSimplifyForm())
686 return false;
687
688 BasicBlock *Header = L->getHeader();
689 BasicBlock *Latch = L->getLoopLatch();
690 BasicBlock *Exit = L->getExitingBlock();
691 BasicBlock *SubLoopHeader = SubLoop->getHeader();
692 BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
693 BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
694
695 if (Latch != Exit)
696 return false;
697 if (SubLoopLatch != SubLoopExit)
698 return false;
699
700 if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken())
701 return false;
702
703 // Split blocks into Fore/SubLoop/Aft based on dominators
704 std::vector<BasicBlock *> SubLoopBlocks;
705 std::vector<BasicBlock *> ForeBlocks;
706 std::vector<BasicBlock *> AftBlocks;
707 if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
708 AftBlocks, &DT))
709 return false;
710
711 // Aft blocks may need to move instructions to fore blocks, which becomes more
712 // difficult if there are multiple (potentially conditionally executed)
713 // blocks. For now we just exclude loops with multiple aft blocks.
714 if (AftBlocks.size() != 1)
715 return false;
716
717 // Check inner loop IV is consistent between all iterations
718 const SCEV *SubLoopBECountSC = SE.getExitCount(SubLoop, SubLoopLatch);
719 if (isa<SCEVCouldNotCompute>(SubLoopBECountSC) ||
720 !SubLoopBECountSC->getType()->isIntegerTy())
721 return false;
722 ScalarEvolution::LoopDisposition LD =
723 SE.getLoopDisposition(SubLoopBECountSC, L);
724 if (LD != ScalarEvolution::LoopInvariant)
725 return false;
726
727 // Check the loop safety info for exceptions.
728 LoopSafetyInfo LSI;
729 computeLoopSafetyInfo(&LSI, L);
730 if (LSI.MayThrow)
731 return false;
732
733 // We've ruled out the easy stuff and now need to check that there are no
734 // interdependencies which may prevent us from moving the:
735 // ForeBlocks before Subloop and AftBlocks.
736 // Subloop before AftBlocks.
737 // ForeBlock phi operands before the subloop
738
739 // Make sure we can move all instructions we need to before the subloop
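739 // For example, if the latch value of an outer phi is computed in the Aft
739 // block (say %i.next = add %i, 1), that add and its whole operand chain
739 // must be free of side effects and memory accesses, so that copies of it
739 // can be placed before the jammed subloop bodies.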
740 SmallVector<Instruction *, 8> Worklist;
741 SmallPtrSet<Instruction *, 4> Visited;
742 for (auto &Phi : Header->phis()) {
743 Value *V = Phi.getIncomingValueForBlock(Latch);
744 if (Instruction *I = dyn_cast<Instruction>(V))
745 Worklist.push_back(I);
746 }
747 while (!Worklist.empty()) {
748 Instruction *I = Worklist.back();
749 Worklist.pop_back();
750 if (Visited.insert(I).second) {
751 if (SubLoop->contains(I->getParent()))
752 return false;
753 if (containsBB(AftBlocks, I->getParent())) {
754 // If we hit a phi node in afts we know we are done (probably LCSSA)
755 if (isa(I))
756 return false;
757 if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
758 return false;
759 for (auto &U : I->operands())
760 if (Instruction *II = dyn_cast<Instruction>(U))
761 Worklist.push_back(II);
762 }
763 }
764 }
765
766 // Check for memory dependencies which prohibit the unrolling we are doing.
767 // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
768 // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
769 if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI))
770 return false;
771
772 return true;
773 }
0 ; RUN: opt -basicaa -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s
1
2 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
3
4 ; CHECK-LABEL: fore_aft_less
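; Fore stores A[i], Aft stores A[i-1]: the fore-to-aft dependence is '<',
; so unroll and jam is safe (hence the %j.1-%j.3 phis below).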
5 ; CHECK: %j = phi
6 ; CHECK: %j.1 = phi
7 ; CHECK: %j.2 = phi
8 ; CHECK: %j.3 = phi
9 define void @fore_aft_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
10 entry:
11 %cmp = icmp sgt i32 %N, 0
12 br i1 %cmp, label %for.outer, label %cleanup
13
14 for.outer:
15 %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
16 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
17 store i32 1, i32* %arrayidx, align 4
18 br label %for.inner
19
20 for.inner:
21 %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
22 %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
23 %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
24 %0 = load i32, i32* %arrayidx5, align 4
25 %mul = mul nsw i32 %0, %i
26 %add = add nsw i32 %mul, %sum
27 %add6 = add nuw nsw i32 %j, 1
28 %exitcond = icmp eq i32 %add6, %N
29 br i1 %exitcond, label %for.latch, label %for.inner
30
31 for.latch:
32 %add7 = add nuw nsw i32 %i, 1
33 %add72 = add nuw nsw i32 %i, -1
34 %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
35 store i32 %add, i32* %arrayidx8, align 4
36 %exitcond29 = icmp eq i32 %add7, %N
37 br i1 %exitcond29, label %cleanup, label %for.outer
38
39 cleanup:
40 ret void
41 }
42
43
44 ; CHECK-LABEL: fore_aft_eq
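; Fore stores A[i], Aft stores A[i]: an '=' dependence, still safe to
; unroll and jam.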
45 ; CHECK: %j = phi
46 ; CHECK: %j.1 = phi
47 ; CHECK: %j.2 = phi
48 ; CHECK: %j.3 = phi
49 define void @fore_aft_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
50 entry:
51 %cmp = icmp sgt i32 %N, 0
52 br i1 %cmp, label %for.outer, label %cleanup
53
54 for.outer:
55 %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
56 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
57 store i32 1, i32* %arrayidx, align 4
58 br label %for.inner
59
60 for.inner:
61 %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
62 %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
63 %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
64 %0 = load i32, i32* %arrayidx5, align 4
65 %mul = mul nsw i32 %0, %i
66 %add = add nsw i32 %mul, %sum
67 %add6 = add nuw nsw i32 %j, 1
68 %exitcond = icmp eq i32 %add6, %N
69 br i1 %exitcond, label %for.latch, label %for.inner
70
71 for.latch:
72 %add7 = add nuw nsw i32 %i, 1
73 %add72 = add nuw nsw i32 %i, 0
74 %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %i
75 store i32 %add, i32* %arrayidx8, align 4
76 %exitcond29 = icmp eq i32 %add7, %N
77 br i1 %exitcond29, label %cleanup, label %for.outer
78
79 cleanup:
80 ret void
81 }
82
83
84 ; CHECK-LABEL: fore_aft_more
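; Fore stores A[i], Aft stores A[i+1]: a '>' dependence, which blocks
; unroll and jam.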
85 ; CHECK: %j = phi
86 ; CHECK-NOT: %j.1 = phi
87 define void @fore_aft_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
88 entry:
89 %cmp = icmp sgt i32 %N, 0
90 br i1 %cmp, label %for.outer, label %cleanup
91
92 for.outer:
93 %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
94 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
95 store i32 1, i32* %arrayidx, align 4
96 br label %for.inner
97
98 for.inner:
99 %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
100 %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
101 %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
102 %0 = load i32, i32* %arrayidx5, align 4
103 %mul = mul nsw i32 %0, %i
104 %add = add nsw i32 %mul, %sum
105 %add6 = add nuw nsw i32 %j, 1
106 %exitcond = icmp eq i32 %add6, %N
107 br i1 %exitcond, label %for.latch, label %for.inner
108
109 for.latch:
110 %add7 = add nuw nsw i32 %i, 1
111 %add72 = add nuw nsw i32 %i, 1
112 %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
113 store i32 %add, i32* %arrayidx8, align 4
114 %exitcond29 = icmp eq i32 %add7, %N
115 br i1 %exitcond29, label %cleanup, label %for.outer
116
117 cleanup:
118 ret void
119 }
120
121
122 ; CHECK-LABEL: fore_sub_less
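; Fore stores A[i], the inner loop stores A[i-1]: a '<' fore-to-sub
; dependence, safe to unroll and jam.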
123 ; CHECK: %j = phi
124 ; CHECK: %j.1 = phi
125 ; CHECK: %j.2 = phi
126 ; CHECK: %j.3 = phi
127 define void @fore_sub_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
128 entry:
129 %cmp = icmp sgt i32 %N, 0
130 br i1 %cmp, label %for.outer, label %cleanup
131
132 for.outer:
133 %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
134 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
135 store i32 1, i32* %arrayidx, align 4
136 br label %for.inner
137
138 for.inner:
139 %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
140 %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
141 %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
142 %0 = load i32, i32* %arrayidx5, align 4
143 %mul = mul nsw i32 %0, %i
144 %add = add nsw i32 %mul, %sum
145 %add72 = add nuw nsw i32 %i, -1
146 %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
147 store i32 %add, i32* %arrayidx8, align 4
148 %add6 = add nuw nsw i32 %j, 1
149 %exitcond = icmp eq i32 %add6, %N
150 br i1 %exitcond, label %for.latch, label %for.inner
151
152 for.latch:
153 %add7 = add nuw nsw i32 %i, 1
154 %exitcond29 = icmp eq i32 %add7, %N
155 br i1 %exitcond29, label %cleanup, label %for.outer
156
157 cleanup:
158 ret void
159 }
160
161
162 ; CHECK-LABEL: fore_sub_eq
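; Fore stores A[i], the inner loop stores A[i]: '=' dependence, safe.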
163 ; CHECK: %j = phi
164 ; CHECK: %j.1 = phi
165 ; CHECK: %j.2 = phi
166 ; CHECK: %j.3 = phi
167 define void @fore_sub_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
168 entry:
169 %cmp = icmp sgt i32 %N, 0
170 br i1 %cmp, label %for.outer, label %cleanup
171
172 for.outer:
173 %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
174 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
175 store i32 1, i32* %arrayidx, align 4
176 br label %for.inner
177
178 for.inner:
179 %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
180 %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
181 %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
182 %0 = load i32, i32* %arrayidx5, align 4
183 %mul = mul nsw i32 %0, %i
184 %add = add nsw i32 %mul, %sum
185 %add72 = add nuw nsw i32 %i, 0
186 %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
187 store i32 %add, i32* %arrayidx8, align 4
188 %add6 = add nuw nsw i32 %j, 1
189 %exitcond = icmp eq i32 %add6, %N
190 br i1 %exitcond, label %for.latch, label %for.inner
191
192 for.latch:
193 %add7 = add nuw nsw i32 %i, 1
194 %exitcond29 = icmp eq i32 %add7, %N
195 br i1 %exitcond29, label %cleanup, label %for.outer
196
197 cleanup:
198 ret void
199 }
200
201
202 ; CHECK-LABEL: fore_sub_more
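; Fore stores A[i], the inner loop stores A[i+1]: '>' dependence, blocks
; unroll and jam.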
203 ; CHECK: %j = phi
204 ; CHECK-NOT: %j.1 = phi
205 define void @fore_sub_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
206 entry:
207 %cmp = icmp sgt i32 %N, 0
208 br i1 %cmp, label %for.outer, label %cleanup
209
210 for.outer:
211 %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
212 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
213 store i32 1, i32* %arrayidx, align 4
214 br label %for.inner
215
216 for.inner:
217 %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
218 %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
219 %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
220 %0 = load i32, i32* %arrayidx5, align 4
221 %mul = mul nsw i32 %0, %i
222 %add = add nsw i32 %mul, %sum
223 %add72 = add nuw nsw i32 %i, 1
224 %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
225 store i32 %add, i32* %arrayidx8, align 4
226 %add6 = add nuw nsw i32 %j, 1
227 %exitcond = icmp eq i32 %add6, %N
228 br i1 %exitcond, label %for.latch, label %for.inner
229
230 for.latch:
231 %add7 = add nuw nsw i32 %i, 1
232 %exitcond29 = icmp eq i32 %add7, %N
233 br i1 %exitcond29, label %cleanup, label %for.outer
234
235 cleanup:
236 ret void
237 }
238
239
240 ; CHECK-LABEL: sub_aft_less
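; Inner loop stores A[i], Aft stores A[i-1]: '<' sub-to-aft dependence,
; safe to unroll and jam.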
241 ; CHECK: %j = phi
242 ; CHECK: %j.1 = phi
243 ; CHECK: %j.2 = phi
244 ; CHECK: %j.3 = phi
245 define void @sub_aft_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
246 entry:
247 %cmp = icmp sgt i32 %N, 0
248 br i1 %cmp, label %for.outer, label %cleanup
249
250 for.outer:
251 %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
252 br label %for.inner
253
254 for.inner:
255 %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
256 %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
257 %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
258 %0 = load i32, i32* %arrayidx5, align 4
259 %mul = mul nsw i32 %0, %i
260 %add = add nsw i32 %mul, %sum
261 %add6 = add nuw nsw i32 %j, 1
262 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
263 store i32 1, i32* %arrayidx, align 4
264 %exitcond = icmp eq i32 %add6, %N
265 br i1 %exitcond, label %for.latch, label %for.inner
266
267 for.latch:
268 %add7 = add nuw nsw i32 %i, 1
269 %add72 = add nuw nsw i32 %i, -1
270 %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
271 store i32 %add, i32* %arrayidx8, align 4
272 %exitcond29 = icmp eq i32 %add7, %N
273 br i1 %exitcond29, label %cleanup, label %for.outer
274
275 cleanup:
276 ret void
277 }
278
279
280 ; CHECK-LABEL: sub_aft_eq
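; Inner loop stores A[i], Aft stores A[i]: '=' dependence, safe.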
281 ; CHECK: %j = phi
282 ; CHECK: %j.1 = phi
283 ; CHECK: %j.2 = phi
284 ; CHECK: %j.3 = phi
285 define void @sub_aft_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
286 entry:
287 %cmp = icmp sgt i32 %N, 0
288 br i1 %cmp, label %for.outer, label %cleanup
289
290 for.outer:
291 %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
292 br label %for.inner
293
294 for.inner:
295 %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
296 %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
297 %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
298 %0 = load i32, i32* %arrayidx5, align 4
299 %mul = mul nsw i32 %0, %i
300 %add = add nsw i32 %mul, %sum
301 %add6 = add nuw nsw i32 %j, 1
302 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
303 store i32 1, i32* %arrayidx, align 4
304 %exitcond = icmp eq i32 %add6, %N
305 br i1 %exitcond, label %for.latch, label %for.inner
306
307 for.latch:
308 %add7 = add nuw nsw i32 %i, 1
309 %add72 = add nuw nsw i32 %i, 0
310 %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %i
311 store i32 %add, i32* %arrayidx8, align 4
312 %exitcond29 = icmp eq i32 %add7, %N
313 br i1 %exitcond29, label %cleanup, label %for.outer
314
315 cleanup:
316 ret void
317 }
318
319
320 ; CHECK-LABEL: sub_aft_more
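; Inner loop stores A[i], Aft stores A[i+1]: '>' dependence, blocks unroll
; and jam.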
321 ; CHECK: %j = phi
322 ; CHECK-NOT: %j.1 = phi
323 define void @sub_aft_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
324 entry:
325 %cmp = icmp sgt i32 %N, 0
326 br i1 %cmp, label %for.outer, label %cleanup
327
328 for.outer:
329 %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
330 br label %for.inner
331
332 for.inner:
333 %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
334 %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
335 %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
336 %0 = load i32, i32* %arrayidx5, align 4
337 %mul = mul nsw i32 %0, %i
338 %add = add nsw i32 %mul, %sum
339 %add6 = add nuw nsw i32 %j, 1
340 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
341 store i32 1, i32* %arrayidx, align 4
342 %exitcond = icmp eq i32 %add6, %N
343 br i1 %exitcond, label %for.latch, label %for.inner
344
345 for.latch:
346 %add7 = add nuw nsw i32 %i, 1
347 %add72 = add nuw nsw i32 %i, 1
348 %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
349 store i32 %add, i32* %arrayidx8, align 4
350 %exitcond29 = icmp eq i32 %add7, %N
351 br i1 %exitcond29, label %cleanup, label %for.outer
352
353 cleanup:
354 ret void
355 }
356
357
358 ; CHECK-LABEL: sub_sub_less
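; The inner loop stores to both A[i] and A[i-1], giving a sub-to-sub
; dependence across outer iterations, which blocks unroll and jam.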
359 ; CHECK: %j = phi
360 ; CHECK-NOT: %j.1 = phi
361 define void @sub_sub_less(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
362 entry:
363 %cmp = icmp sgt i32 %N, 0
364 br i1 %cmp, label %for.outer, label %cleanup
365
366 for.outer:
367 %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
368 br label %for.inner
369
370 for.inner:
371 %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
372 %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
373 %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
374 %0 = load i32, i32* %arrayidx5, align 4
375 %mul = mul nsw i32 %0, %i
376 %add = add nsw i32 %mul, %sum
377 %add6 = add nuw nsw i32 %j, 1
378 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
379 store i32 1, i32* %arrayidx, align 4
380 %add72 = add nuw nsw i32 %i, -1
381 %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
382 store i32 %add, i32* %arrayidx8, align 4
383 %exitcond = icmp eq i32 %add6, %N
384 br i1 %exitcond, label %for.latch, label %for.inner
385
386 for.latch:
387 %add7 = add nuw nsw i32 %i, 1
388 %exitcond29 = icmp eq i32 %add7, %N
389 br i1 %exitcond29, label %cleanup, label %for.outer
390
391 cleanup:
392 ret void
393 }
394
395
396 ; CHECK-LABEL: sub_sub_eq
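; The inner loop stores A[i] twice (an '=' dependence only), so unroll and
; jam is still allowed.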
397 ; CHECK: %j = phi
398 ; CHECK: %j.1 = phi
399 define void @sub_sub_eq(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
400 entry:
401 %cmp = icmp sgt i32 %N, 0
402 br i1 %cmp, label %for.outer, label %cleanup
403
404 for.outer:
405 %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
406 br label %for.inner
407
408 for.inner:
409 %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
410 %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
411 %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
412 %0 = load i32, i32* %arrayidx5, align 4
413 %mul = mul nsw i32 %0, %i
414 %add = add nsw i32 %mul, %sum
415 %add6 = add nuw nsw i32 %j, 1
416 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
417 store i32 1, i32* %arrayidx, align 4
418 %add72 = add nuw nsw i32 %i, 0
419 %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
420 store i32 %add, i32* %arrayidx8, align 4
421 %exitcond = icmp eq i32 %add6, %N
422 br i1 %exitcond, label %for.latch, label %for.inner
423
424 for.latch:
425 %add7 = add nuw nsw i32 %i, 1
426 %exitcond29 = icmp eq i32 %add7, %N
427 br i1 %exitcond29, label %cleanup, label %for.outer
428
429 cleanup:
430 ret void
431 }
432
433
434 ; CHECK-LABEL: sub_sub_more
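; The inner loop stores to both A[i] and A[i+1], giving a sub-to-sub
; dependence across outer iterations, which blocks unroll and jam.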
435 ; CHECK: %j = phi
436 ; CHECK-NOT: %j.1 = phi
437 define void @sub_sub_more(i32* noalias nocapture %A, i32 %N, i32* noalias nocapture readonly %B) {
438 entry:
439 %cmp = icmp sgt i32 %N, 0
440 br i1 %cmp, label %for.outer, label %cleanup
441
442 for.outer:
443 %i = phi i32 [ %add7, %for.latch ], [ 0, %entry ]
444 br label %for.inner
445
446 for.inner:
447 %j = phi i32 [ %add6, %for.inner ], [ 0, %for.outer ]
448 %sum = phi i32 [ %add, %for.inner ], [ 0, %for.outer ]
449 %arrayidx5 = getelementptr inbounds i32, i32* %B, i32 %j
450 %0 = load i32, i32* %arrayidx5, align 4
451 %mul = mul nsw i32 %0, %i
452 %add = add nsw i32 %mul, %sum
453 %add6 = add nuw nsw i32 %j, 1
454 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i
455 store i32 1, i32* %arrayidx, align 4
456 %add72 = add nuw nsw i32 %i, 1
457 %arrayidx8 = getelementptr inbounds i32, i32* %A, i32 %add72
458 store i32 %add, i32* %arrayidx8, align 4
459 %exitcond = icmp eq i32 %add6, %N
460 br i1 %exitcond, label %for.latch, label %for.inner
461
462 for.latch:
463 %add7 = add nuw nsw i32 %i, 1
464 %exitcond29 = icmp eq i32 %add7, %N
465 br i1 %exitcond29, label %cleanup, label %for.outer
466
467 cleanup:
468 ret void
469 }
0 ; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 -pass-remarks=loop-unroll-and-jam < %s -S 2>&1 | FileCheck %s
1
2 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
3
4 ;; Common check for all tests. None should be unroll and jammed
5 ; CHECK-NOT: remark: {{.*}} unroll and jammed
6
7
8 ; CHECK-LABEL: disabled1
9 ; Tests for(i) { sum = A[i]; for(j) sum += B[j]; A[i+1] = sum; }
10 ; A[i] to A[i+1] dependency should block unroll and jam
11 define void @disabled1(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
12 ; CHECK: %i.029 = phi i32 [ %add10, %for.latch ], [ 0, %for.preheader ]
13 ; CHECK: %j.026 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
14 entry:
15 %cmp = icmp ne i32 %J, 0
16 %cmp127 = icmp ne i32 %I, 0
17 %or.cond = and i1 %cmp127, %cmp
18 br i1 %or.cond, label %for.preheader, label %return
19
20 for.preheader:
21 br label %for.outer
22
23 for.outer:
24 %i.029 = phi i32 [ %add10, %for.latch ], [ 0, %for.preheader ]
25 %b.028 = phi i32 [ %inc8, %for.latch ], [ 1, %for.preheader ]
26 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.029
27 %0 = load i32, i32* %arrayidx, align 4
28 br label %for.inner
29
30 for.inner:
31 %j.026 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
32 %sum1.025 = phi i32 [ %0, %for.outer ], [ %add, %for.inner ]
33 %arrayidx6 = getelementptr inbounds i32, i32* %B, i32 %j.026
34 %1 = load i32, i32* %arrayidx6, align 4
35 %add = add i32 %1, %sum1.025
36 %inc = add nuw i32 %j.026, 1
37 %exitcond = icmp eq i32 %inc, %J
38 br i1 %exitcond, label %for.latch, label %for.inner
39
40 for.latch:
41 %arrayidx7 = getelementptr inbounds i32, i32* %A, i32 %b.028
42 store i32 %add, i32* %arrayidx7, align 4
43 %inc8 = add nuw nsw i32 %b.028, 1
44 %add10 = add nuw nsw i32 %i.029, 1
45 %exitcond30 = icmp eq i32 %add10, %I
46 br i1 %exitcond30, label %return, label %for.outer
47
48 return:
49 ret void
50 }
51
52
53 ; CHECK-LABEL: disabled2
54 ; Tests an incompatible block layout (for.outer jumps past for.inner)
55 ; FIXME: Make this work
56 define void @disabled2(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
57 ; CHECK: %i.032 = phi i32 [ %add13, %for.latch ], [ 0, %for.preheader ]
58 ; CHECK: %j.030 = phi i32 [ %inc, %for.inner ], [ 0, %for.inner.preheader ]
59 entry:
60 %cmp = icmp ne i32 %J, 0
61 %cmp131 = icmp ne i32 %I, 0
62 %or.cond = and i1 %cmp131, %cmp
63 br i1 %or.cond, label %for.preheader, label %for.end14
64
65 for.preheader:
66 br label %for.outer
67
68 for.outer:
69 %i.032 = phi i32 [ %add13, %for.latch ], [ 0, %for.preheader ]
70 %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.032
71 %0 = load i32, i32* %arrayidx, align 4
72 %tobool = icmp eq i32 %0, 0
73 br i1 %tobool, label %for.latch, label %for.inner
74
75 for.inner:
76 %j.030 = phi i32 [ %inc, %for.inner ], [ 0, %for.outer ]
77 %sum1.029 = phi i32 [ %sum1.1, %for.inner ], [ 0, %for.outer ]
78 %arrayidx6 = getelementptr inbounds i32, i32* %B, i32 %j.030
79 %1 = load i32, i32* %arrayidx6, align 4
80 %tobool7 = icmp eq i32 %1, 0
81 %sub = add i32 %sum1.029, 10
82 %add = sub i32 %sub, %1
83 %sum1.1 = select i1 %tobool7, i32 %sum1.029, i32 %add
84 %inc = add nuw i32 %j.030, 1
85 %exitcond = icmp eq i32 %inc, %J
86 br i1 %exitcond, label %for.latch, label %for.inner
87
88 for.latch:
89 %sum1.1.lcssa = phi i32 [ 0, %for.outer ], [ %sum1.1, %for.inner ]
90 %arrayidx11 = getelementptr inbounds i32, i32* %A, i32 %i.032
91 store i32 %sum1.1.lcssa, i32* %arrayidx11, align 4
92 %add13 = add nuw i32 %i.032, 1
93 %exitcond33 = icmp eq i32 %add13, %I
94 br i1 %exitcond33, label %for.end14, label %for.outer
95
96 for.end14:
97 ret void
98 }
99
100
101 ; CHECK-LABEL: disabled3
102 ; Tests loop-carried dependencies through an array S
103 define void @disabled3(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
104 ; CHECK: %i.029 = phi i32 [ 0, %for.preheader ], [ %add12, %for.latch ]
105 ; CHECK: %j.027 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
106 entry:
107 %S = alloca [4 x i32], align 4
108 %cmp = icmp eq i32 %J, 0
109 br i1 %cmp, label %return, label %if.end
110
111 if.end:
112 %0 = bitcast [4 x i32]* %S to i8*
113 %cmp128 = icmp eq i32 %I, 0
114 br i1 %cmp128, label %for.cond.cleanup, label %for.preheader
115
116 for.preheader:
117 %arrayidx9 = getelementptr inbounds [4 x i32], [4 x i32]* %S, i32 0, i32 0
118 br label %for.outer
119
120 for.cond.cleanup:
121 br label %return
122
123 for.outer:
124 %i.029 = phi i32 [ 0, %for.preheader ], [ %add12, %for.latch ]
125 br label %for.inner
126
127 for.inner:
128 %j.027 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
129 %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j.027
130 %l2 = load i32, i32* %arrayidx, align 4
131 %add = add i32 %j.027, %i.029
132 %rem = urem i32 %add, %J
133 %arrayidx6 = getelementptr inbounds i32, i32* %B, i32 %rem
134 %l3 = load i32, i32* %arrayidx6, align 4
135 %mul = mul i32 %l3, %l2
136 %rem7 = urem i32 %j.027, 3
137 %arrayidx8 = getelementptr inbounds [4 x i32], [4 x i32]* %S, i32 0, i32 %rem7
138 store i32 %mul, i32* %arrayidx8, align 4
139 %inc = add nuw i32 %j.027, 1
140 %exitcond = icmp eq i32 %inc, %J
141 br i1 %exitcond, label %for.latch, label %for.inner
142
143 for.latch:
144 %l1 = load i32, i32* %arrayidx9, align 4
145 %arrayidx10 = getelementptr inbounds i32, i32* %A, i32 %i.029
146 store i32 %l1, i32* %arrayidx10, align 4
147 %add12 = add nuw i32 %i.029, 1
148 %exitcond31 = icmp eq i32 %add12, %I
149 br i1 %exitcond31, label %for.cond.cleanup, label %for.outer
150
151 return:
152 ret void
153 }
154
155
156 ; CHECK-LABEL: disabled4
157 ; Inner loop induction variable is not consistent
158 ; i.e. for(i = 0..n) for (j = 0..i) sum += B[j]
159 define void @disabled4(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
160 ; CHECK: %indvars.iv = phi i32 [ %indvars.iv.next, %for.latch ], [ 1, %for.preheader ]
161 ; CHECK: %j.021 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
162 entry:
163 %cmp = icmp ne i32 %J, 0
164 %cmp122 = icmp ugt i32 %I, 1
165 %or.cond = and i1 %cmp122, %cmp
166 br i1 %or.cond, label %for.preheader, label %for.end9
167
168 for.preheader:
169 br label %for.outer
170
171 for.outer:
172 %indvars.iv = phi i32 [ %indvars.iv.next, %for.latch ], [ 1, %for.preheader ]
173 br label %for.inner
174
175 for.inner:
176 %j.021 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
177 %sum1.020 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
178 %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j.021
179 %0 = load i32, i32* %arrayidx, align 4
180 %add = add i32 %0, %sum1.020
181 %inc = add nuw i32 %j.021, 1
182 %exitcond = icmp eq i32 %inc, %indvars.iv
183 br i1 %exitcond, label %for.latch, label %for.inner
184
185 for.latch:
186 %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %indvars.iv
187 store i32 %add, i32* %arrayidx6, align 4
188 %indvars.iv.next = add nuw i32 %indvars.iv, 1
189 %exitcond24 = icmp eq i32 %indvars.iv.next, %I
190 br i1 %exitcond24, label %for.end9, label %for.outer
191
192 for.end9:
193 ret void
194 }
195
196
197 ; CHECK-LABEL: disabled5
198 ; Test odd uses of phi nodes where the outer IV cannot be moved into Fore as it hits a PHI
199 @f = hidden global i32 0, align 4
200 define i32 @disabled5() #0 {
201 ; CHECK: %0 = phi i32 [ %f.promoted10, %entry ], [ 2, %for.latch ]
202 ; CHECK: %1 = phi i32 [ %0, %for.outer ], [ 2, %for.inner ]
203 entry:
204 %f.promoted10 = load i32, i32* @f, align 4
205 br label %for.outer
206
207 for.outer:
208 %0 = phi i32 [ %f.promoted10, %entry ], [ 2, %for.latch ]
209 %d.018 = phi i16 [ 0, %entry ], [ %odd.lcssa, %for.latch ]
210 %inc5.sink9 = phi i32 [ 2, %entry ], [ %inc5, %for.latch ]
211 br label %for.inner
212
213 for.inner:
214 %1 = phi i32 [ %0, %for.outer ], [ 2, %for.inner ]
215 %inc.sink8 = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
216 %inc = add nuw nsw i32 %inc.sink8, 1
217 %exitcond = icmp ne i32 %inc, 7
218 br i1 %exitcond, label %for.inner, label %for.latch
219
220 for.latch:
221 %.lcssa = phi i32 [ %1, %for.inner ]
222 %odd.lcssa = phi i16 [ 1, %for.inner ]
223 %inc5 = add nuw nsw i32 %inc5.sink9, 1
224 %exitcond11 = icmp ne i32 %inc5, 7
225 br i1 %exitcond11, label %for.outer, label %for.end
226
227 for.end:
228 %.lcssa.lcssa = phi i32 [ %.lcssa, %for.latch ]
229 %inc.lcssa.lcssa = phi i32 [ 7, %for.latch ]
230 ret i32 0
231 }
232
233
234 ; CHECK-LABEL: disabled6
235 ; There is a dependency in here, between @d6 and %0 (the pointer loaded from @f6)
236 @d6 = hidden global i16 5, align 2
237 @f6 = hidden global i16* @d6, align 4
238 define i32 @disabled6() #0 {
239 ; CHECK: %inc8.sink14.i = phi i16 [ 1, %entry ], [ %inc8.i, %for.cond.cleanup.i ]
240 ; CHECK: %c.013.i = phi i32 [ 0, %for.body.i ], [ %inc.i, %for.body6.i ]
241 entry:
242 store i16 1, i16* @d6, align 2
243 %0 = load i16*, i16** @f6, align 4
244 br label %for.body.i
245
246 for.body.i:
247 %inc8.sink14.i = phi i16 [ 1, %entry ], [ %inc8.i, %for.cond.cleanup.i ]
248 %1 = load i16, i16* %0, align 2
249 br label %for.body6.i
250
251 for.cond.cleanup.i:
252 %inc8.i = add nuw nsw i16 %inc8.sink14.i, 1
253 store i16 %inc8.i, i16* @d6, align 2
254 %cmp.i = icmp ult i16 %inc8.i, 6
255 br i1 %cmp.i, label %for.body.i, label %test.exit
256
257 for.body6.i:
258 %c.013.i = phi i32 [ 0, %for.body.i ], [ %inc.i, %for.body6.i ]
259 %inc.i = add nuw nsw i32 %c.013.i, 1
260 %exitcond.i = icmp eq i32 %inc.i, 7
261 br i1 %exitcond.i, label %for.cond.cleanup.i, label %for.body6.i
262
263 test.exit:
264 %conv2.i = sext i16 %1 to i32
265 ret i32 0
266 }
267
268
269 ; CHECK-LABEL: disabled7
270 ; Has a negative-distance output dependency (stores to A[i] and A[i-1])
271 define void @disabled7(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
272 ; CHECK: %i.028 = phi i32 [ %add11, %for.cond3.for.cond.cleanup5_crit_edge ], [ 0, %for.body.preheader ]
273 ; CHECK: %j.026 = phi i32 [ 0, %for.body ], [ %add9, %for.body6 ]
274 entry:
275 %cmp = icmp ne i32 %J, 0
276 %cmp127 = icmp ne i32 %I, 0
277 %or.cond = and i1 %cmp127, %cmp
278 br i1 %or.cond, label %for.body.preheader, label %for.end12
279
280 for.body.preheader:
281 br label %for.body
282
283 for.body:
284 %i.028 = phi i32 [ %add11, %for.cond3.for.cond.cleanup5_crit_edge ], [ 0, %for.body.preheader ]
285 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.028
286 store i32 0, i32* %arrayidx, align 4
287 %sub = add i32 %i.028, -1
288 %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %sub
289 store i32 2, i32* %arrayidx2, align 4
290 br label %for.body6
291
292 for.cond3.for.cond.cleanup5_crit_edge:
293 store i32 %add, i32* %arrayidx, align 4
294 %add11 = add nuw i32 %i.028, 1
295 %exitcond29 = icmp eq i32 %add11, %I
296 br i1 %exitcond29, label %for.end12, label %for.body
297
298 for.body6:
299 %0 = phi i32 [ 0, %for.body ], [ %add, %for.body6 ]
300 %j.026 = phi i32 [ 0, %for.body ], [ %add9, %for.body6 ]
301 %arrayidx7 = getelementptr inbounds i32, i32* %B, i32 %j.026
302 %1 = load i32, i32* %arrayidx7, align 4
303 %add = add i32 %1, %0
304 %add9 = add nuw i32 %j.026, 1
305 %exitcond = icmp eq i32 %add9, %J
306 br i1 %exitcond, label %for.cond3.for.cond.cleanup5_crit_edge, label %for.body6
307
308 for.end12:
309 ret void
310 }
311
312
313 ; CHECK-LABEL: disabled8
314 ; Same as above with an extra outer loop nest
315 define void @disabled8(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
316 ; CHECK: %i.036 = phi i32 [ %add15, %for.latch ], [ 0, %for.body ]
317 ; CHECK: %j.034 = phi i32 [ 0, %for.outer ], [ %add13, %for.inner ]
318 entry:
319 %cmp = icmp eq i32 %J, 0
320 %cmp335 = icmp eq i32 %I, 0
321 %or.cond = or i1 %cmp, %cmp335
322 br i1 %or.cond, label %for.end18, label %for.body.preheader
323
324 for.body.preheader:
325 br label %for.body
326
327 for.body:
328 %x.037 = phi i32 [ %inc, %for.cond.cleanup4 ], [ 0, %for.body.preheader ]
329 br label %for.outer
330
331 for.cond.cleanup4:
332 %inc = add nuw nsw i32 %x.037, 1
333 %exitcond40 = icmp eq i32 %inc, 5
334 br i1 %exitcond40, label %for.end18, label %for.body
335
336 for.outer:
337 %i.036 = phi i32 [ %add15, %for.latch ], [ 0, %for.body ]
338 %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.036
339 store i32 0, i32* %arrayidx, align 4
340 %sub = add i32 %i.036, -1
341 %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %sub
342 store i32 2, i32* %arrayidx6, align 4
343 br label %for.inner
344
345 for.latch:
346 store i32 %add, i32* %arrayidx, align 4
347 %add15 = add nuw i32 %i.036, 1
348 %exitcond38 = icmp eq i32 %add15, %I
349 br i1 %exitcond38, label %for.cond.cleanup4, label %for.outer
350
351 for.inner:
352 %0 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
353 %j.034 = phi i32 [ 0, %for.outer ], [ %add13, %for.inner ]
354 %arrayidx11 = getelementptr inbounds i32, i32* %B, i32 %j.034
355 %1 = load i32, i32* %arrayidx11, align 4
356 %add = add i32 %1, %0
357 %add13 = add nuw i32 %j.034, 1
358 %exitcond = icmp eq i32 %add13, %J
359 br i1 %exitcond, label %for.latch, label %for.inner
360
361 for.end18:
362 ret void
363 }
364
365
366 ; CHECK-LABEL: disabled9
367 ; Can't prove that A and B do not alias (no noalias on the pointer arguments)
368 define void @disabled9(i32 %I, i32 %J, i32* nocapture %A, i32* nocapture readonly %B) #0 {
369 ; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
370 ; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
371 entry:
372 %cmp = icmp ne i32 %J, 0
373 %cmp122 = icmp ne i32 %I, 0
374 %or.cond = and i1 %cmp, %cmp122
375 br i1 %or.cond, label %for.outer.preheader, label %for.end
376
377 for.outer.preheader:
378 br label %for.outer
379
380 for.outer:
381 %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
382 br label %for.inner
383
384 for.inner:
385 %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
386 %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
387 %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
388 %0 = load i32, i32* %arrayidx, align 4
389 %add = add i32 %0, %sum1
390 %inc = add nuw i32 %j, 1
391 %exitcond = icmp eq i32 %inc, %J
392 br i1 %exitcond, label %for.latch, label %for.inner
393
394 for.latch:
395 %add.lcssa = phi i32 [ %add, %for.inner ]
396 %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
397 store i32 %add.lcssa, i32* %arrayidx6, align 4
398 %add8 = add nuw i32 %i, 1
399 %exitcond25 = icmp eq i32 %add8, %I
400 br i1 %exitcond25, label %for.end.loopexit, label %for.outer
401
402 for.end.loopexit:
403 br label %for.end
404
405 for.end:
406 ret void
407 }
408
409
410 ; CHECK-LABEL: disable10
411 ; Simple call in the inner loop blocks unroll and jam
412 declare void @f10(i32, i32) #0
413 define void @disable10(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
414 ; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
415 ; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
416 entry:
417 %cmp = icmp ne i32 %J, 0
418 %cmp122 = icmp ne i32 %I, 0
419 %or.cond = and i1 %cmp, %cmp122
420 br i1 %or.cond, label %for.outer.preheader, label %for.end
421
422 for.outer.preheader:
423 br label %for.outer
424
425 for.outer:
426 %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
427 br label %for.inner
428
429 for.inner:
430 %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
431 %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
432 %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
433 %0 = load i32, i32* %arrayidx, align 4
434 %add = add i32 %0, %sum1
435 %inc = add nuw i32 %j, 1
436 %exitcond = icmp eq i32 %inc, %J
437 tail call void @f10(i32 %i, i32 %j) nounwind
438 br i1 %exitcond, label %for.latch, label %for.inner
439
440 for.latch:
441 %add.lcssa = phi i32 [ %add, %for.inner ]
442 %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
443 store i32 %add.lcssa, i32* %arrayidx6, align 4
444 %add8 = add nuw i32 %i, 1
445 %exitcond25 = icmp eq i32 %add8, %I
446 br i1 %exitcond25, label %for.end.loopexit, label %for.outer
447
448 for.end.loopexit:
449 br label %for.end
450
451 for.end:
452 ret void
453 }
454
455
456 ; CHECK-LABEL: disable11
457 ; Volatile load in the inner loop; non-simple memory accesses block unroll and jam
458 define void @disable11(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
459 ; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
460 ; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
461 entry:
462 %cmp = icmp ne i32 %J, 0
463 %cmp122 = icmp ne i32 %I, 0
464 %or.cond = and i1 %cmp, %cmp122
465 br i1 %or.cond, label %for.outer.preheader, label %for.end
466
467 for.outer.preheader:
468 br label %for.outer
469
470 for.outer:
471 %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
472 br label %for.inner
473
474 for.inner:
475 %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
476 %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
477 %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
478 %0 = load volatile i32, i32* %arrayidx, align 4
479 %add = add i32 %0, %sum1
480 %inc = add nuw i32 %j, 1
481 %exitcond = icmp eq i32 %inc, %J
482 br i1 %exitcond, label %for.latch, label %for.inner
483
484 for.latch:
485 %add.lcssa = phi i32 [ %add, %for.inner ]
486 %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
487 store i32 %add.lcssa, i32* %arrayidx6, align 4
488 %add8 = add nuw i32 %i, 1
489 %exitcond25 = icmp eq i32 %add8, %I
490 br i1 %exitcond25, label %for.end.loopexit, label %for.outer
491
492 for.end.loopexit:
493 br label %for.end
494
495 for.end:
496 ret void
497 }
498
499
500 ; CHECK-LABEL: disable12
501 ; Multiple aft blocks
502 define void @disable12(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
503 ; CHECK: %i = phi i32 [ %add8, %for.latch3 ], [ 0, %for.outer.preheader ]
504 ; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
505 entry:
506 %cmp = icmp ne i32 %J, 0
507 %cmp122 = icmp ne i32 %I, 0
508 %or.cond = and i1 %cmp, %cmp122
509 br i1 %or.cond, label %for.outer.preheader, label %for.end
510
511 for.outer.preheader:
512 br label %for.outer
513
514 for.outer:
515 %i = phi i32 [ %add8, %for.latch3 ], [ 0, %for.outer.preheader ]
516 br label %for.inner
517
518 for.inner:
519 %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
520 %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
521 %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
522 %0 = load i32, i32* %arrayidx, align 4
523 %add = add i32 %0, %sum1
524 %inc = add nuw i32 %j, 1
525 %exitcond = icmp eq i32 %inc, %J
526 br i1 %exitcond, label %for.latch, label %for.inner
527
528 for.latch:
529 %add.lcssa = phi i32 [ %add, %for.inner ]
530 %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
531 store i32 %add.lcssa, i32* %arrayidx6, align 4
532 %cmpl = icmp eq i32 %add.lcssa, 10
533 br i1 %cmpl, label %for.latch2, label %for.latch3
534
535 for.latch2:
536 br label %for.latch3
537
538 for.latch3:
539 %add8 = add nuw i32 %i, 1
540 %exitcond25 = icmp eq i32 %add8, %I
541 br i1 %exitcond25, label %for.end.loopexit, label %for.outer
542
543 for.end.loopexit:
544 br label %for.end
545
546 for.end:
547 ret void
548 }
549
550
551 ; CHECK-LABEL: disable13
552 ; Two subloops
553 define void @disable13(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
554 ; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
555 ; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
556 ; CHECK: %j2 = phi i32 [ %inc2, %for.inner2 ], [ 0, %for.inner2.preheader ]
557 entry:
558 %cmp = icmp ne i32 %J, 0
559 %cmp122 = icmp ne i32 %I, 0
560 %or.cond = and i1 %cmp, %cmp122
561 br i1 %or.cond, label %for.outer.preheader, label %for.end
562
563 for.outer.preheader:
564 br label %for.outer
565
566 for.outer:
567 %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
568 br label %for.inner
569
570 for.inner:
571 %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
572 %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
573 %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
574 %0 = load i32, i32* %arrayidx, align 4
575 %add = add i32 %0, %sum1
576 %inc = add nuw i32 %j, 1
577 %exitcond = icmp eq i32 %inc, %J
578 br i1 %exitcond, label %for.inner2, label %for.inner
579
580 for.inner2:
581 %j2 = phi i32 [ 0, %for.inner ], [ %inc2, %for.inner2 ]
582 %sum12 = phi i32 [ 0, %for.inner ], [ %add2, %for.inner2 ]
583 %arrayidx2 = getelementptr inbounds i32, i32* %B, i32 %j2
584 %l0 = load i32, i32* %arrayidx2, align 4
585 %add2 = add i32 %l0, %sum12
586 %inc2 = add nuw i32 %j2, 1
587 %exitcond2 = icmp eq i32 %inc2, %J
588 br i1 %exitcond2, label %for.latch, label %for.inner2
589
590 for.latch:
591 %add.lcssa = phi i32 [ %add, %for.inner2 ]
592 %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
593 store i32 %add.lcssa, i32* %arrayidx6, align 4
594 %add8 = add nuw i32 %i, 1
595 %exitcond25 = icmp eq i32 %add8, %I
596 br i1 %exitcond25, label %for.end.loopexit, label %for.outer
597
598 for.end.loopexit:
599 br label %for.end
600
601 for.end:
602 ret void
603 }
604
605
606 ; CHECK-LABEL: disable14
607 ; Multiple exiting blocks
608 define void @disable14(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
609 ; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
610 ; CHECK: %j = phi i32 [ %inc, %for.inner ], [ 0, %for.inner.preheader ]
611 entry:
612 %cmp = icmp ne i32 %J, 0
613 %cmp122 = icmp ne i32 %I, 0
614 %or.cond = and i1 %cmp, %cmp122
615 br i1 %or.cond, label %for.outer.preheader, label %for.end
616
617 for.outer.preheader:
618 br label %for.outer
619
620 for.outer:
621 %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
622 %add8 = add nuw i32 %i, 1
623 %exitcond23 = icmp eq i32 %add8, %I
624 br i1 %exitcond23, label %for.end.loopexit, label %for.inner
625
626 for.inner:
627 %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
628 %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
629 %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
630 %0 = load i32, i32* %arrayidx, align 4
631 %add = add i32 %0, %sum1
632 %inc = add nuw i32 %j, 1
633 %exitcond = icmp eq i32 %inc, %J
634 br i1 %exitcond, label %for.latch, label %for.inner
635
636 for.latch:
637 %add.lcssa = phi i32 [ %add, %for.inner ]
638 %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
639 store i32 %add.lcssa, i32* %arrayidx6, align 4
640 %exitcond25 = icmp eq i32 %add8, %I
641 br i1 %exitcond25, label %for.end.loopexit, label %for.outer
642
643 for.end.loopexit:
644 br label %for.end
645
646 for.end:
647 ret void
648 }
649
650
651 ; CHECK-LABEL: disable15
652 ; Latch != exit
653 define void @disable15(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
654 ; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
655 ; CHECK: %j = phi i32 [ %inc, %for.inner ], [ 0, %for.inner.preheader ]
656 entry:
657 %cmp = icmp ne i32 %J, 0
658 %cmp122 = icmp ne i32 %I, 0
659 %or.cond = and i1 %cmp, %cmp122
660 br i1 %or.cond, label %for.outer.preheader, label %for.end
661
662 for.outer.preheader:
663 br label %for.outer
664
665 for.outer:
666 %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
667 %add8 = add nuw i32 %i, 1
668 %exitcond25 = icmp eq i32 %add8, %I
669 br i1 %exitcond25, label %for.end.loopexit, label %for.inner
670
671 for.inner:
672 %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
673 %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
674 %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
675 %0 = load i32, i32* %arrayidx, align 4
676 %add = add i32 %0, %sum1
677 %inc = add nuw i32 %j, 1
678 %exitcond = icmp eq i32 %inc, %J
679 br i1 %exitcond, label %for.latch, label %for.inner
680
681 for.latch:
682 %add.lcssa = phi i32 [ %add, %for.inner ]
683 %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
684 store i32 %add.lcssa, i32* %arrayidx6, align 4
685 br label %for.outer
686
687 for.end.loopexit:
688 br label %for.end
689
690 for.end:
691 ret void
692 }
693
694
695 ; CHECK-LABEL: disable16
696 ; Cannot move %other before the inner loop, as its operand %load reads memory in the Aft block
697 define void @disable16(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
698 ; CHECK: %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
699 ; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
700 entry:
701 %cmp = icmp ne i32 %J, 0
702 %cmp122 = icmp ne i32 %I, 0
703 %or.cond = and i1 %cmp, %cmp122
704 br i1 %or.cond, label %for.outer.preheader, label %for.end
705
706 for.outer.preheader:
707 br label %for.outer
708
709 for.outer:
710 %i = phi i32 [ %add8, %for.latch ], [ 0, %for.outer.preheader ]
711 %otherphi = phi i32 [ %other, %for.latch ], [ 0, %for.outer.preheader ]
712 br label %for.inner
713
714 for.inner:
715 %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
716 %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
717 %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
718 %0 = load i32, i32* %arrayidx, align 4
719 %add = add i32 %0, %sum1
720 %inc = add nuw i32 %j, 1
721 %exitcond = icmp eq i32 %inc, %J
722 br i1 %exitcond, label %for.latch, label %for.inner
723
724 for.latch:
725 %add.lcssa = phi i32 [ %add, %for.inner ]
726 %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
727 store i32 %add.lcssa, i32* %arrayidx6, align 4
728 %add8 = add nuw i32 %i, 1
729 %exitcond25 = icmp eq i32 %add8, %I
730 %loadarr = getelementptr inbounds i32, i32* %A, i32 %i
731 %load = load i32, i32* %arrayidx6, align 4
732 %other = add i32 %otherphi, %load
733 br i1 %exitcond25, label %for.end.loopexit, label %for.outer
734
735 for.end.loopexit:
736 br label %for.end
737
738 for.end:
739 ret void
740 }
0 ; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -unroll-runtime < %s -S | FileCheck %s
1 ; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -unroll-runtime -unroll-and-jam-threshold=15 < %s -S | FileCheck %s --check-prefix=CHECK-LOWTHRES
2
3 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
4
5 ; CHECK-LABEL: test1
6 ; Basic check that these loops are unroll and jammed by default
7 define void @test1(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
8 ; CHECK: %i.us = phi i32 [ %add8.us.{{[1-9]*}}, %for.latch ], [ 0, %for.outer.preheader.new ]
9 ; CHECK-LOWTHRES: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
10 entry:
11 %cmp = icmp ne i32 %J, 0
12 %cmp122 = icmp ne i32 %I, 0
13 %or.cond = and i1 %cmp, %cmp122
14 br i1 %or.cond, label %for.outer.preheader, label %for.end
15
16 for.outer.preheader:
17 br label %for.outer
18
19 for.outer:
20 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
21 br label %for.inner
22
23 for.inner:
24 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
25 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
26 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
27 %0 = load i32, i32* %arrayidx.us, align 4
28 %add.us = add i32 %0, %sum1.us
29 %inc.us = add nuw i32 %j.us, 1
30 %exitcond = icmp eq i32 %inc.us, %J
31 br i1 %exitcond, label %for.latch, label %for.inner
32
33 for.latch:
34 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
35 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
36 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
37 %add8.us = add nuw i32 %i.us, 1
38 %exitcond25 = icmp eq i32 %add8.us, %I
39 br i1 %exitcond25, label %for.end.loopexit, label %for.outer
40
41 for.end.loopexit:
42 br label %for.end
43
44 for.end:
45 ret void
46 }
47
48
49 ; CHECK-LABEL: nounroll_and_jam
50 ; #pragma nounroll_and_jam
51 define void @nounroll_and_jam(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
52 ; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
53 entry:
54 %cmp = icmp ne i32 %J, 0
55 %cmp122 = icmp ne i32 %I, 0
56 %or.cond = and i1 %cmp, %cmp122
57 br i1 %or.cond, label %for.outer.preheader, label %for.end
58
59 for.outer.preheader:
60 br label %for.outer
61
62 for.outer:
63 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
64 br label %for.inner
65
66 for.inner:
67 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
68 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
69 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
70 %0 = load i32, i32* %arrayidx.us, align 4
71 %add.us = add i32 %0, %sum1.us
72 %inc.us = add nuw i32 %j.us, 1
73 %exitcond = icmp eq i32 %inc.us, %J
74 br i1 %exitcond, label %for.latch, label %for.inner
75
76 for.latch:
77 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
78 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
79 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
80 %add8.us = add nuw i32 %i.us, 1
81 %exitcond25 = icmp eq i32 %add8.us, %I
82 br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !1
83
84 for.end.loopexit:
85 br label %for.end
86
87 for.end:
88 ret void
89 }
90
91
92 ; CHECK-LABEL: unroll_and_jam_count
93 ; #pragma unroll_and_jam(8)
94 define void @unroll_and_jam_count(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
95 ; CHECK: %i.us = phi i32 [ %add8.us.7, %for.latch ], [ 0, %for.outer.preheader.new ]
96 entry:
97 %cmp = icmp ne i32 %J, 0
98 %cmp122 = icmp ne i32 %I, 0
99 %or.cond = and i1 %cmp, %cmp122
100 br i1 %or.cond, label %for.outer.preheader, label %for.end
101
102 for.outer.preheader:
103 br label %for.outer
104
105 for.outer:
106 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
107 br label %for.inner
108
109 for.inner:
110 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
111 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
112 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
113 %0 = load i32, i32* %arrayidx.us, align 4
114 %add.us = add i32 %0, %sum1.us
115 %inc.us = add nuw i32 %j.us, 1
116 %exitcond = icmp eq i32 %inc.us, %J
117 br i1 %exitcond, label %for.latch, label %for.inner
118
119 for.latch:
120 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
121 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
122 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
123 %add8.us = add nuw i32 %i.us, 1
124 %exitcond25 = icmp eq i32 %add8.us, %I
125 br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !3
126
127 for.end.loopexit:
128 br label %for.end
129
130 for.end:
131 ret void
132 }
133
134
135 ; CHECK-LABEL: unroll_and_jam
136 ; #pragma unroll_and_jam
137 define void @unroll_and_jam(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
138 ; CHECK: %i.us = phi i32 [ %add8.us.{{[1-9]*}}, %for.latch ], [ 0, %for.outer.preheader.new ]
139 ; CHECK-LOWTHRES: %i.us = phi i32 [ %add8.us.{{[1-9]*}}, %for.latch ], [ 0, %for.outer.preheader.new ]
140 entry:
141 %cmp = icmp ne i32 %J, 0
142 %cmp122 = icmp ne i32 %I, 0
143 %or.cond = and i1 %cmp, %cmp122
144 br i1 %or.cond, label %for.outer.preheader, label %for.end
145
146 for.outer.preheader:
147 br label %for.outer
148
149 for.outer:
150 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
151 br label %for.inner
152
153 for.inner:
154 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
155 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
156 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
157 %0 = load i32, i32* %arrayidx.us, align 4
158 %add.us = add i32 %0, %sum1.us
159 %inc.us = add nuw i32 %j.us, 1
160 %exitcond = icmp eq i32 %inc.us, %J
161 br i1 %exitcond, label %for.latch, label %for.inner
162
163 for.latch:
164 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
165 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
166 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
167 %add8.us = add nuw i32 %i.us, 1
168 %exitcond25 = icmp eq i32 %add8.us, %I
169 br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !5
170
171 for.end.loopexit:
172 br label %for.end
173
174 for.end:
175 ret void
176 }
177
178
179 ; CHECK-LABEL: nounroll
180 ; #pragma nounroll (which we take to mean disable unroll and jam too)
181 define void @nounroll(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
182 ; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
183 entry:
184 %cmp = icmp ne i32 %J, 0
185 %cmp122 = icmp ne i32 %I, 0
186 %or.cond = and i1 %cmp, %cmp122
187 br i1 %or.cond, label %for.outer.preheader, label %for.end
188
189 for.outer.preheader:
190 br label %for.outer
191
192 for.outer:
193 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
194 br label %for.inner
195
196 for.inner:
197 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
198 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
199 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
200 %0 = load i32, i32* %arrayidx.us, align 4
201 %add.us = add i32 %0, %sum1.us
202 %inc.us = add nuw i32 %j.us, 1
203 %exitcond = icmp eq i32 %inc.us, %J
204 br i1 %exitcond, label %for.latch, label %for.inner
205
206 for.latch:
207 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
208 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
209 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
210 %add8.us = add nuw i32 %i.us, 1
211 %exitcond25 = icmp eq i32 %add8.us, %I
212 br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !7
213
214 for.end.loopexit:
215 br label %for.end
216
217 for.end:
218 ret void
219 }
220
221
222 ; CHECK-LABEL: unroll
223 ; #pragma unroll (which we take to mean disable unroll and jam)
224 define void @unroll(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
225 ; CHECK: %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
226 entry:
227 %cmp = icmp ne i32 %J, 0
228 %cmp122 = icmp ne i32 %I, 0
229 %or.cond = and i1 %cmp, %cmp122
230 br i1 %or.cond, label %for.outer.preheader, label %for.end
231
232 for.outer.preheader:
233 br label %for.outer
234
235 for.outer:
236 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
237 br label %for.inner
238
239 for.inner:
240 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
241 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
242 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
243 %0 = load i32, i32* %arrayidx.us, align 4
244 %add.us = add i32 %0, %sum1.us
245 %inc.us = add nuw i32 %j.us, 1
246 %exitcond = icmp eq i32 %inc.us, %J
247 br i1 %exitcond, label %for.latch, label %for.inner
248
249 for.latch:
250 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
251 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
252 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
253 %add8.us = add nuw i32 %i.us, 1
254 %exitcond25 = icmp eq i32 %add8.us, %I
255 br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !9
256
257 for.end.loopexit:
258 br label %for.end
259
260 for.end:
261 ret void
262 }
263
264
265 ; CHECK-LABEL: nounroll_plus_unroll_and_jam
266 ; #pragma clang loop nounroll, unroll_and_jam (which we take to mean do unroll_and_jam)
267 define void @nounroll_plus_unroll_and_jam(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) {
268 ; CHECK: %i.us = phi i32 [ %add8.us.{{[1-9]*}}, %for.latch ], [ 0, %for.outer.preheader.new ]
269 entry:
270 %cmp = icmp ne i32 %J, 0
271 %cmp122 = icmp ne i32 %I, 0
272 %or.cond = and i1 %cmp, %cmp122
273 br i1 %or.cond, label %for.outer.preheader, label %for.end
274
275 for.outer.preheader:
276 br label %for.outer
277
278 for.outer:
279 %i.us = phi i32 [ %add8.us, %for.latch ], [ 0, %for.outer.preheader ]
280 br label %for.inner
281
282 for.inner:
283 %j.us = phi i32 [ 0, %for.outer ], [ %inc.us, %for.inner ]
284 %sum1.us = phi i32 [ 0, %for.outer ], [ %add.us, %for.inner ]
285 %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %j.us
286 %0 = load i32, i32* %arrayidx.us, align 4
287 %add.us = add i32 %0, %sum1.us
288 %inc.us = add nuw i32 %j.us, 1
289 %exitcond = icmp eq i32 %inc.us, %J
290 br i1 %exitcond, label %for.latch, label %for.inner
291
292 for.latch:
293 %add.us.lcssa = phi i32 [ %add.us, %for.inner ]
294 %arrayidx6.us = getelementptr inbounds i32, i32* %A, i32 %i.us
295 store i32 %add.us.lcssa, i32* %arrayidx6.us, align 4
296 %add8.us = add nuw i32 %i.us, 1
297 %exitcond25 = icmp eq i32 %add8.us, %I
298 br i1 %exitcond25, label %for.end.loopexit, label %for.outer, !llvm.loop !11
299
300 for.end.loopexit:
301 br label %for.end
302
303 for.end:
304 ret void
305 }
306
307
308 !1 = distinct !{!1, !2}
309 !2 = distinct !{!"llvm.loop.unroll_and_jam.disable"}
310 !3 = distinct !{!3, !4}
311 !4 = distinct !{!"llvm.loop.unroll_and_jam.count", i32 8}
312 !5 = distinct !{!5, !6}
313 !6 = distinct !{!"llvm.loop.unroll_and_jam.enable"}
314 !7 = distinct !{!7, !8}
315 !8 = distinct !{!"llvm.loop.unroll.disable"}
316 !9 = distinct !{!9, !10}
317 !10 = distinct !{!"llvm.loop.unroll.enable"}
318 !11 = distinct !{!11, !8, !6}
0 ; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -pass-remarks=loop-unroll < %s -S 2>&1 | FileCheck %s
1
2 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
3 target triple = "thumbv8m.main-arm-none-eabi"
4
5 ;; Common check for all tests. None should be unroll and jammed due to profitability
6 ; CHECK-NOT: remark: {{.*}} unroll and jammed
7
8
9 ; CHECK-LABEL: unprof1
10 ; Multiple inner loop blocks
11 define void @unprof1(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
12 ; CHECK: %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
13 ; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner2 ]
14 entry:
15 %cmp = icmp ne i32 %J, 0
16 %cmp122 = icmp ne i32 %I, 0
17 %or.cond = and i1 %cmp, %cmp122
18 br i1 %or.cond, label %for.outer.preheader, label %for.end
19
20 for.outer.preheader:
21 br label %for.outer
22
23 for.outer:
24 %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
25 br label %for.inner
26
27 for.inner:
28 %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner2 ]
29 %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner2 ]
30 %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
31 %0 = load i32, i32* %arrayidx, align 4
32 %add = add i32 %0, %sum1
33 br label %for.inner2
34
35 for.inner2:
36 %inc = add nuw i32 %j, 1
37 %exitcond = icmp eq i32 %inc, %J
38 br i1 %exitcond, label %for.latch, label %for.inner
39
40 for.latch:
41 %add.lcssa = phi i32 [ %add, %for.inner2 ]
42 %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
43 store i32 %add.lcssa, i32* %arrayidx6, align 4
44 %addinc = add nuw i32 %i, 1
45 %exitcond25 = icmp eq i32 %addinc, %I
46 br i1 %exitcond25, label %for.loopexit, label %for.outer
47
48 for.loopexit:
49 br label %for.end
50
51 for.end:
52 ret void
53 }
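The second block in @unprof1 is created by an unconditional branch inserted purely for the test; in source code a multi-block inner body usually comes from control flow. An illustrative C++ shape (assumed, not derived from the test):

    // An if inside the inner loop gives the body several basic blocks;
    // duplicating and interleaving multi-block bodies is harder to profit
    // from, so the heuristic declines to jam.
    void multi_block_inner(int I, int J, int *A, const int *B) {
      for (int i = 0; i < I; ++i) {
        int sum = 0;
        for (int j = 0; j < J; ++j) {
          int b = B[j];
          if (b != 0)  // creates additional inner-loop blocks
            sum += b;
        }
        A[i] = sum;
      }
    }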
54
55
56 ; CHECK-LABEL: unprof2
57 ; Constant inner loop count
58 define void @unprof2(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
59 ; CHECK: %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
60 ; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
61 entry:
62 %cmp = icmp ne i32 %J, 0
63 %cmp122 = icmp ne i32 %I, 0
64 %or.cond = and i1 %cmp, %cmp122
65 br i1 %or.cond, label %for.outer.preheader, label %for.end
66
67 for.outer.preheader:
68 br label %for.outer
69
70 for.outer:
71 %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
72 br label %for.inner
73
74 for.inner:
75 %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
76 %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
77 %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
78 %0 = load i32, i32* %arrayidx, align 4
79 %add = add i32 %0, %sum1
80 %inc = add nuw i32 %j, 1
81 %exitcond = icmp eq i32 %inc, 10
82 br i1 %exitcond, label %for.latch, label %for.inner
83
84 for.latch:
85 %add.lcssa = phi i32 [ %add, %for.inner ]
86 %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
87 store i32 %add.lcssa, i32* %arrayidx6, align 4
88 %addinc = add nuw i32 %i, 1
89 %exitcond25 = icmp eq i32 %addinc, %I
90 br i1 %exitcond25, label %for.loopexit, label %for.outer
91
92 for.loopexit:
93 br label %for.end
94
95 for.end:
96 ret void
97 }
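In @unprof2 the inner trip count is the constant 10 rather than %J; nests with a small, known inner trip count are typically served better by plain (full) unrolling of the inner loop, so jamming is rated unprofitable. In C terms:

    void constant_trip_count(int I, int *A, const int *B) {
      for (int i = 0; i < I; ++i) {
        int sum = 0;
        for (int j = 0; j < 10; ++j)  // constant bound: full unrolling of
          sum += B[j];                // the inner loop is the likelier win
        A[i] = sum;
      }
    }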
98
99
100 ; CHECK-LABEL: unprof3
101 ; Complex inner loop
102 define void @unprof3(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
103 ; CHECK: %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
104 ; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
105 entry:
106 %cmp = icmp ne i32 %J, 0
107 %cmp122 = icmp ne i32 %I, 0
108 %or.cond = and i1 %cmp, %cmp122
109 br i1 %or.cond, label %for.outer.preheader, label %for.end
110
111 for.outer.preheader:
112 br label %for.outer
113
114 for.outer:
115 %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
116 br label %for.inner
117
118 for.inner:
119 %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
120 %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
121 %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j
122 %0 = load i32, i32* %arrayidx, align 4
123 %add = add i32 %0, %sum1
124 %add0 = add i32 %0, %sum1
125 %add1 = add i32 %0, %sum1
126 %add2 = add i32 %0, %sum1
127 %add3 = add i32 %0, %sum1
128 %add4 = add i32 %0, %sum1
129 %add5 = add i32 %0, %sum1
130 %add6 = add i32 %0, %sum1
131 %add7 = add i32 %0, %sum1
132 %add8 = add i32 %0, %sum1
133 %add9 = add i32 %0, %sum1
134 %add10 = add i32 %0, %sum1
135 %add11 = add i32 %0, %sum1
136 %add12 = add i32 %0, %sum1
137 %add13 = add i32 %0, %sum1
138 %add14 = add i32 %0, %sum1
139 %add15 = add i32 %0, %sum1
140 %add16 = add i32 %0, %sum1
141 %add17 = add i32 %0, %sum1
142 %add18 = add i32 %0, %sum1
143 %add19 = add i32 %0, %sum1
144 %add20 = add i32 %0, %sum1
145 %add21 = add i32 %0, %sum1
146 %add22 = add i32 %0, %sum1
147 %add23 = add i32 %0, %sum1
148 %add24 = add i32 %0, %sum1
149 %add25 = add i32 %0, %sum1
150 %add26 = add i32 %0, %sum1
151 %add27 = add i32 %0, %sum1
152 %add28 = add i32 %0, %sum1
153 %add29 = add i32 %0, %sum1
154 %inc = add nuw i32 %j, 1
155 %exitcond = icmp eq i32 %inc, %J
156 br i1 %exitcond, label %for.latch, label %for.inner
157
158 for.latch:
159 %add.lcssa = phi i32 [ %add, %for.inner ]
160 %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
161 store i32 %add.lcssa, i32* %arrayidx6, align 4
162 %addinc = add nuw i32 %i, 1
163 %exitcond25 = icmp eq i32 %addinc, %I
164 br i1 %exitcond25, label %for.loopexit, label %for.outer
165
166 for.loopexit:
167 br label %for.end
168
169 for.end:
170 ret void
171 }
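@unprof3 pads the inner body with thirty independent adds: jamming would duplicate the whole body once per unrolled outer iteration while sharing only the single load of B[j], so the cost outweighs the benefit. Schematically (C++, illustrative):

    void large_inner_body(int I, int J, int *A, const int *B) {
      for (int i = 0; i < I; ++i) {
        int sum = 0;
        for (int j = 0; j < J; ++j) {
          int b = B[j];
          // ... a large body (thirty adds in the IR above) stands here;
          // jamming duplicates all of it but shares only the load of b
          sum += b;
        }
        A[i] = sum;
      }
    }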
172
173
174 ; CHECK-LABEL: unprof4
175 ; No loop invariant loads
176 define void @unprof4(i32 %I, i32 %J, i32* noalias nocapture %A, i32* noalias nocapture readonly %B) #0 {
177 ; CHECK: %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
178 ; CHECK: %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
179 entry:
180 %cmp = icmp ne i32 %J, 0
181 %cmp122 = icmp ne i32 %I, 0
182 %or.cond = and i1 %cmp, %cmp122
183 br i1 %or.cond, label %for.outer.preheader, label %for.end
184
185 for.outer.preheader:
186 br label %for.outer
187
188 for.outer:
189 %i = phi i32 [ %addinc, %for.latch ], [ 0, %for.outer.preheader ]
190 br label %for.inner
191
192 for.inner:
193 %j = phi i32 [ 0, %for.outer ], [ %inc, %for.inner ]
194 %sum1 = phi i32 [ 0, %for.outer ], [ %add, %for.inner ]
195 %j2 = add i32 %j, %i
196 %arrayidx = getelementptr inbounds i32, i32* %B, i32 %j2
197 %0 = load i32, i32* %arrayidx, align 4
198 %add = add i32 %0, %sum1
199 %inc = add nuw i32 %j, 1
200 %exitcond = icmp eq i32 %inc, %J
201 br i1 %exitcond, label %for.latch, label %for.inner
202
203 for.latch:
204 %add.lcssa = phi i32 [ %add, %for.inner ]
205 %arrayidx6 = getelementptr inbounds i32, i32* %A, i32 %i
206 store i32 %add.lcssa, i32* %arrayidx6, align 4
207 %addinc = add nuw i32 %i, 1
208 %exitcond25 = icmp eq i32 %addinc, %I
209 br i1 %exitcond25, label %for.loopexit, label %for.outer
210
211 for.loopexit:
212 br label %for.end
213
214 for.end:
215 ret void
216 }
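@unprof4 indexes B with %j2 = %j + %i, so the load moves with the outer induction variable: nothing is invariant across the jammed copies and there is no reuse to win. In C terms:

    void no_invariant_loads(int I, int J, int *A, const int *B) {
      for (int i = 0; i < I; ++i) {
        int sum = 0;
        for (int j = 0; j < J; ++j)
          sum += B[i + j];  // depends on i: each jammed copy would need
                            // its own load, so nothing can be shared
        A[i] = sum;
      }
    }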
0 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
1 ; RUN: opt -basicaa -tbaa -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 -unroll-remainder < %s -S | FileCheck %s
2
3 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
4
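The checks below were autogenerated by update_test_checks.py. As an aid to reading them: with -unroll-and-jam-count=4 the prologue computes xtraiter = I & 3 and unroll_iter = I - xtraiter, e.g. I = 10 gives xtraiter = 2 and unroll_iter = 8, so the jammed loop runs its 8 outer iterations four at a time and the (unrolled, via -unroll-remainder) epilogue handles the last 2. A C++ sketch of the transformed shape (illustrative, not the pass's exact output):

    void test1_jammed(unsigned I, unsigned J, int *A, const int *B) {
      unsigned xtraiter = I & 3;            // remainder iteration count
      unsigned unroll_iter = I - xtraiter;  // multiple-of-4 portion
      unsigned i = 0;
      for (; i < unroll_iter; i += 4) {
        int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
        for (unsigned j = 0; j < J; ++j) {
          int b = B[j];                     // one load feeds four sums
          s0 += b; s1 += b; s2 += b; s3 += b;
        }
        A[i] = s0; A[i + 1] = s1; A[i + 2] = s2; A[i + 3] = s3;
      }
      for (; i < I; ++i) {                  // epilogue: up to 3 rows
        int sum = 0;
        for (unsigned j = 0; j < J; ++j)
          sum += B[j];
        A[i] = sum;
      }
    }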
5 ; CHECK-LABEL: test1
6 ; Tests for(i) { sum = 0; for(j) sum += B[j]; A[i] = sum; }
7 ; CHECK-NEXT: entry:
8 ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[J:%.*]], 0
9 ; CHECK-NEXT: [[CMPJ:%.*]] = icmp ne i32 [[I:%.*]], 0
10 ; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP]], [[CMPJ]]
11 ; CHECK-NEXT: br i1 [[OR_COND]], label [[FOR_OUTER_PREHEADER:%.*]], label [[FOR_END:%.*]]
12 ; CHECK: for.outer.preheader:
13 ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[I]], -1
14 ; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[I]], 3
15 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 3
16 ; CHECK-NEXT: br i1 [[TMP1]], label [[FOR_END_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_OUTER_PREHEADER_NEW:%.*]]
17 ; CHECK: for.outer.preheader.new:
18 ; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i32 [[I]], [[XTRAITER]]
19 ; CHECK-NEXT: br label [[FOR_OUTER:%.*]]
20 ; CHECK: for.outer:
21 ; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[ADD8_3:%.*]], [[FOR_LATCH:%.*]] ], [ 0, [[FOR_OUTER_PREHEADER_NEW]] ]
22 ; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ [[UNROLL_ITER]], [[FOR_OUTER_PREHEADER_NEW]] ], [ [[NITER_NSUB_3:%.*]], [[FOR_LATCH]] ]
23 ; CHECK-NEXT: [[ADD8:%.*]] = add nuw nsw i32 [[I]], 1
24 ; CHECK-NEXT: [[NITER_NSUB:%.*]] = sub i32 [[NITER]], 1
25 ; CHECK-NEXT: [[ADD8_1:%.*]] = add nuw nsw i32 [[ADD8]], 1
26 ; CHECK-NEXT: [[NITER_NSUB_1:%.*]] = sub i32 [[NITER_NSUB]], 1
27 ; CHECK-NEXT: [[ADD8_2:%.*]] = add nuw nsw i32 [[ADD8_1]], 1
28 ; CHECK-NEXT: [[NITER_NSUB_2:%.*]] = sub i32 [[NITER_NSUB_1]], 1
29 ; CHECK-NEXT: [[ADD8_3]] = add nuw i32 [[ADD8_2]], 1
30 ; CHECK-NEXT: [[NITER_NSUB_3]] = sub i32 [[NITER_NSUB_2]], 1
31 ; CHECK-NEXT: br label [[FOR_INNER:%.*]]
32 ; CHECK: for.inner:
33 ; CHECK-NEXT: [[J_0:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC:%.*]], [[FOR_INNER]] ]
34 ; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD:%.*]], [[FOR_INNER]] ]
35 ; CHECK-NEXT: [[J_1:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_1:%.*]], [[FOR_INNER]] ]
36 ; CHECK-NEXT: [[SUM_1:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_1:%.*]], [[FOR_INNER]] ]
37 ; CHECK-NEXT: [[J_2:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_2:%.*]], [[FOR_INNER]] ]
38 ; CHECK-NEXT: [[SUM_2:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_2:%.*]], [[FOR_INNER]] ]
39 ; CHECK-NEXT: [[J_3:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[INC_3:%.*]], [[FOR_INNER]] ]
40 ; CHECK-NEXT: [[SUM_3:%.*]] = phi i32 [ 0, [[FOR_OUTER]] ], [ [[ADD_3:%.*]], [[FOR_INNER]] ]
41 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[J_0]]
42 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !tbaa !0
43 ; CHECK-NEXT: [[ADD]] = add i32 [[TMP2]], [[SUM]]
44 ; CHECK-NEXT: [[INC]] = add nuw i32 [[J_0]], 1
45 ; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_1]]
46 ; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4, !tbaa !0
47 ; CHECK-NEXT: [[ADD_1]] = add i32 [[TMP3]], [[SUM_1]]
48 ; CHECK-NEXT: [[INC_1]] = add nuw i32 [[J_1]], 1
49 ; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_2]]
50 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4, !tbaa !0
51 ; CHECK-NEXT: [[ADD_2]] = add i32 [[TMP4]], [[SUM_2]]
52 ; CHECK-NEXT: [[INC_2]] = add nuw i32 [[J_2]], 1
53 ; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_3]]
54 ; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4, !tbaa !0
55 ; CHECK-NEXT: [[ADD_3]] = add i32 [[TMP5]], [[SUM_3]]
56 ; CHECK-NEXT: [[INC_3]] = add nuw i32 [[J_3]], 1
57 ; CHECK-NEXT: [[EXITCOND_3:%.*]] = icmp eq i32 [[INC_3]], [[J]]
58 ; CHECK-NEXT: br i1 [[EXITCOND_3]], label [[FOR_LATCH]], label [[FOR_INNER]]
59 ; CHECK: for.latch:
60 ; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_INNER]] ]
61 ; CHECK-NEXT: [[ADD_LCSSA_1:%.*]] = phi i32 [ [[ADD_1]], [[FOR_INNER]] ]
62 ; CHECK-NEXT: [[ADD_LCSSA_2:%.*]] = phi i32 [ [[ADD_2]], [[FOR_INNER]] ]
63 ; CHECK-NEXT: [[ADD_LCSSA_3:%.*]] = phi i32 [ [[ADD_3]], [[FOR_INNER]] ]
64 ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I]]
65 ; CHECK-NEXT: store i32 [[ADD_LCSSA]], i32* [[ARRAYIDX6]], align 4, !tbaa !0
66 ; CHECK-NEXT: [[ARRAYIDX6_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[ADD8]]
67 ; CHECK-NEXT: store i32 [[ADD_LCSSA_1]], i32* [[ARRAYIDX6_1]], align 4, !tbaa !0
68 ; CHECK-NEXT: [[ARRAYIDX6_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[ADD8_1]]
69 ; CHECK-NEXT: store i32 [[ADD_LCSSA_2]], i32* [[ARRAYIDX6_2]], align 4, !tbaa !0
70 ; CHECK-NEXT: [[ARRAYIDX6_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[ADD8_2]]
71 ; CHECK-NEXT: store i32 [[ADD_LCSSA_3]], i32* [[ARRAYIDX6_3]], align 4, !tbaa !0
72 ; CHECK-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i32 [[NITER_NSUB_3]], 0
73 ; CHECK-NEXT: br i1 [[NITER_NCMP_3]], label [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT:%.*]], label [[FOR_OUTER]], !llvm.loop !4
74 ; CHECK: for.end.loopexit.unr-lcssa.loopexit:
75 ; CHECK-NEXT: [[I_UNR_PH:%.*]] = phi i32 [ [[ADD8_3]], [[FOR_LATCH]] ]
76 ; CHECK-NEXT: br label [[FOR_END_LOOPEXIT_UNR_LCSSA]]
77 ; CHECK: for.end.loopexit.unr-lcssa:
78 ; CHECK-NEXT: [[I_UNR:%.*]] = phi i32 [ 0, [[FOR_OUTER_PREHEADER]] ], [ [[I_UNR_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ]
79 ; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0
80 ; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[FOR_OUTER_EPIL_PREHEADER:%.*]], label [[FOR_END_LOOPEXIT:%.*]]
81 ; CHECK: for.outer.epil.preheader:
82 ; CHECK-NEXT: br label [[FOR_OUTER_EPIL:%.*]]
83 ; CHECK: for.outer.epil:
84 ; CHECK-NEXT: br label [[FOR_INNER_EPIL:%.*]]
85 ; CHECK: for.inner.epil:
86 ; CHECK-NEXT: [[J_EPIL:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL]] ], [ [[INC_EPIL:%.*]], [[FOR_INNER_EPIL]] ]
87 ; CHECK-NEXT: [[SUM_EPIL:%.*]] = phi i32 [ 0, [[FOR_OUTER_EPIL]] ], [ [[ADD_EPIL:%.*]], [[FOR_INNER_EPIL]] ]
88 ; CHECK-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[J_EPIL]]