llvm.org GIT mirror llvm / cebf346
Remove the BBVectorize pass. It served us well, helped kick-start much of the vectorization efforts in LLVM, etc. Its time has come and passed. Back in 2014: http://lists.llvm.org/pipermail/llvm-dev/2014-November/079091.html Time to actually let go and move forward. =] I've updated the release notes both about the removal and the deprecation of the corresponding C API. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@306797 91177308-0d34-0410-b5e6-96231b3b80d8 Chandler Carruth 2 years ago
43 changed file(s) with 18 addition(s) and 6180 deletion(s). Raw diff Collapse all Expand all
6969
7070 N: Hal Finkel
7171 E: hfinkel@anl.gov
72 D: BBVectorize, the loop reroller, alias analysis and the PowerPC target
72 D: The loop reroller, alias analysis and the PowerPC target
7373
7474 N: Dan Gohman
7575 E: sunfish@mozilla.com
5353 its nature as a general purpose PDB manipulation / diagnostics tool that does
5454 more than just dumping contents.
5555
56
57 * ... next change ...
56 * The ``BBVectorize`` pass has been removed. It was fully replaced and no
57 longer used back in 2014 but we didn't get around to removing it. Now it is
58 gone. The SLP vectorizer is the suggested non-loop vectorization pass.
5859
5960 .. NOTE
6061 If you would like to document a larger change, then you can add a
110111 Changes to the C API
111112 --------------------
112113
113 During this release ...
114 * Deprecated the ``LLVMAddBBVectorizePass`` interface since the ``BBVectorize``
115 pass has been removed. It is now a no-op and will be removed in the next
116 release. Use ``LLVMAddSLPVectorizePass`` instead to get the supported SLP
117 vectorizer.
118
114119
115120 External Open Source Projects Using LLVM 5
116121 ==========================================
6969 void initializeArgPromotionPass(PassRegistry&);
7070 void initializeAssumptionCacheTrackerPass(PassRegistry&);
7171 void initializeAtomicExpandPass(PassRegistry&);
72 void initializeBBVectorizePass(PassRegistry&);
7372 void initializeBDCELegacyPassPass(PassRegistry&);
7473 void initializeBarrierNoopPass(PassRegistry&);
7574 void initializeBasicAAWrapperPassPass(PassRegistry&);
194194 (void) llvm::createLoopVectorizePass();
195195 (void) llvm::createSLPVectorizerPass();
196196 (void) llvm::createLoadStoreVectorizerPass();
197 (void) llvm::createBBVectorizePass();
198197 (void) llvm::createPartiallyInlineLibCallsPass();
199198 (void) llvm::createScalarizerPass();
200199 (void) llvm::createSeparateConstOffsetFromGEPPass();
144144 bool DisableTailCalls;
145145 bool DisableUnitAtATime;
146146 bool DisableUnrollLoops;
147 bool BBVectorize;
148147 bool SLPVectorize;
149148 bool LoopVectorize;
150149 bool RerollLoops;
107107
108108 //===----------------------------------------------------------------------===//
109109 //
110 // BBVectorize - A basic-block vectorization pass.
111 //
112 BasicBlockPass *
113 createBBVectorizePass(const VectorizeConfig &C = VectorizeConfig());
114
115 //===----------------------------------------------------------------------===//
116 //
117110 // LoopVectorize - Create a loop vectorization pass.
118111 //
119112 Pass *createLoopVectorizePass(bool NoUnrolling = false,
3232 * @{
3333 */
3434
35 /** See llvm::createBBVectorizePass function. */
35 /** DEPRECATED - Use LLVMAddSLPVectorizePass */
3636 void LLVMAddBBVectorizePass(LLVMPassManagerRef PM);
3737
3838 /** See llvm::createLoopVectorizePass function. */
5555 cl::desc("Run the SLP vectorization passes"));
5656
5757 static cl::opt
58 RunBBVectorization("vectorize-slp-aggressive", cl::Hidden,
59 cl::desc("Run the BB vectorization passes"));
60
61 static cl::opt
6258 UseGVNAfterVectorization("use-gvn-after-vectorization",
6359 cl::init(false), cl::Hidden,
6460 cl::desc("Run GVN instead of Early CSE after vectorization passes"));
165161 Inliner = nullptr;
166162 DisableUnitAtATime = false;
167163 DisableUnrollLoops = false;
168 BBVectorize = RunBBVectorization;
169164 SLPVectorize = RunSLPVectorization;
170165 LoopVectorize = RunLoopVectorization;
171166 RerollLoops = RunLoopRerolling;
383378
384379 if (RerollLoops)
385380 MPM.add(createLoopRerollPass());
386 if (!RunSLPAfterLoopVectorization) {
387 if (SLPVectorize)
388 MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
389
390 if (BBVectorize) {
391 MPM.add(createBBVectorizePass());
392 addInstructionCombiningPass(MPM);
393 addExtensionsToPM(EP_Peephole, MPM);
394 if (OptLevel > 1 && UseGVNAfterVectorization)
395 MPM.add(NewGVN
396 ? createNewGVNPass()
397 : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
398 else
399 MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
400
401 // BBVectorize may have significantly shortened a loop body; unroll again.
402 if (!DisableUnrollLoops)
403 MPM.add(createLoopUnrollPass(OptLevel));
404 }
405 }
381 if (!RunSLPAfterLoopVectorization && SLPVectorize)
382 MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
406383
407384 MPM.add(createAggressiveDCEPass()); // Delete dead instructions
408385 MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
634611 addInstructionCombiningPass(MPM);
635612 }
636613
637 if (RunSLPAfterLoopVectorization) {
638 if (SLPVectorize) {
639 MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
640 if (OptLevel > 1 && ExtraVectorizerPasses) {
641 MPM.add(createEarlyCSEPass());
642 }
643 }
644
645 if (BBVectorize) {
646 MPM.add(createBBVectorizePass());
647 addInstructionCombiningPass(MPM);
648 addExtensionsToPM(EP_Peephole, MPM);
649 if (OptLevel > 1 && UseGVNAfterVectorization)
650 MPM.add(NewGVN
651 ? createNewGVNPass()
652 : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
653 else
654 MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
655
656 // BBVectorize may have significantly shortened a loop body; unroll again.
657 if (!DisableUnrollLoops)
658 MPM.add(createLoopUnrollPass(OptLevel));
614 if (RunSLPAfterLoopVectorization && SLPVectorize) {
615 MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
616 if (OptLevel > 1 && ExtraVectorizerPasses) {
617 MPM.add(createEarlyCSEPass());
659618 }
660619 }
661620
+0
-3282
lib/Transforms/Vectorize/BBVectorize.cpp less more
None //===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a basic-block vectorization pass. The algorithm was
10 // inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral,
11 // et al. It works by looking for chains of pairable operations and then
12 // pairing them.
13 //
14 //===----------------------------------------------------------------------===//
15
16 #define BBV_NAME "bb-vectorize"
17 #include "llvm/ADT/DenseMap.h"
18 #include "llvm/ADT/DenseSet.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallSet.h"
21 #include "llvm/ADT/SmallVector.h"
22 #include "llvm/ADT/Statistic.h"
23 #include "llvm/ADT/StringExtras.h"
24 #include "llvm/Analysis/AliasAnalysis.h"
25 #include "llvm/Analysis/AliasSetTracker.h"
26 #include "llvm/Analysis/GlobalsModRef.h"
27 #include "llvm/Analysis/ScalarEvolution.h"
28 #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
29 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
30 #include "llvm/Analysis/TargetLibraryInfo.h"
31 #include "llvm/Analysis/TargetTransformInfo.h"
32 #include "llvm/Analysis/ValueTracking.h"
33 #include "llvm/IR/Constants.h"
34 #include "llvm/IR/DataLayout.h"
35 #include "llvm/IR/DerivedTypes.h"
36 #include "llvm/IR/Dominators.h"
37 #include "llvm/IR/Function.h"
38 #include "llvm/IR/Instructions.h"
39 #include "llvm/IR/IntrinsicInst.h"
40 #include "llvm/IR/Intrinsics.h"
41 #include "llvm/IR/LLVMContext.h"
42 #include "llvm/IR/Metadata.h"
43 #include "llvm/IR/Module.h"
44 #include "llvm/IR/Type.h"
45 #include "llvm/IR/ValueHandle.h"
46 #include "llvm/Pass.h"
47 #include "llvm/Support/CommandLine.h"
48 #include "llvm/Support/Debug.h"
49 #include "llvm/Support/raw_ostream.h"
50 #include "llvm/Transforms/Utils/Local.h"
51 #include "llvm/Transforms/Vectorize.h"
52 #include
53 using namespace llvm;
54
55 #define DEBUG_TYPE BBV_NAME
56
57 static cl::opt
58 IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false),
59 cl::Hidden, cl::desc("Ignore target information"));
60
61 static cl::opt
62 ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
63 cl::desc("The required chain depth for vectorization"));
64
65 static cl::opt
66 UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false),
67 cl::Hidden, cl::desc("Use the chain depth requirement with"
68 " target information"));
69
70 static cl::opt
71 SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden,
72 cl::desc("The maximum search distance for instruction pairs"));
73
74 static cl::opt
75 SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden,
76 cl::desc("Replicating one element to a pair breaks the chain"));
77
78 static cl::opt
79 VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden,
80 cl::desc("The size of the native vector registers"));
81
82 static cl::opt
83 MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden,
84 cl::desc("The maximum number of pairing iterations"));
85
86 static cl::opt
87 Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden,
88 cl::desc("Don't try to form non-2^n-length vectors"));
89
90 static cl::opt
91 MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden,
92 cl::desc("The maximum number of pairable instructions per group"));
93
94 static cl::opt
95 MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden,
96 cl::desc("The maximum number of candidate instruction pairs per group"));
97
98 static cl::opt
99 MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200),
100 cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use"
101 " a full cycle check"));
102
103 static cl::opt
104 NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden,
105 cl::desc("Don't try to vectorize boolean (i1) values"));
106
107 static cl::opt
108 NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden,
109 cl::desc("Don't try to vectorize integer values"));
110
111 static cl::opt
112 NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden,
113 cl::desc("Don't try to vectorize floating-point values"));
114
115 // FIXME: This should default to false once pointer vector support works.
116 static cl::opt
117 NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden,
118 cl::desc("Don't try to vectorize pointer values"));
119
120 static cl::opt
121 NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden,
122 cl::desc("Don't try to vectorize casting (conversion) operations"));
123
124 static cl::opt
125 NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden,
126 cl::desc("Don't try to vectorize floating-point math intrinsics"));
127
128 static cl::opt
129 NoBitManipulation("bb-vectorize-no-bitmanip", cl::init(false), cl::Hidden,
130 cl::desc("Don't try to vectorize BitManipulation intrinsics"));
131
132 static cl::opt
133 NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden,
134 cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"));
135
136 static cl::opt
137 NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden,
138 cl::desc("Don't try to vectorize select instructions"));
139
140 static cl::opt
141 NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden,
142 cl::desc("Don't try to vectorize comparison instructions"));
143
144 static cl::opt
145 NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden,
146 cl::desc("Don't try to vectorize getelementptr instructions"));
147
148 static cl::opt
149 NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden,
150 cl::desc("Don't try to vectorize loads and stores"));
151
152 static cl::opt
153 AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden,
154 cl::desc("Only generate aligned loads and stores"));
155
156 static cl::opt
157 NoMemOpBoost("bb-vectorize-no-mem-op-boost",
158 cl::init(false), cl::Hidden,
159 cl::desc("Don't boost the chain-depth contribution of loads and stores"));
160
161 static cl::opt
162 FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden,
163 cl::desc("Use a fast instruction dependency analysis"));
164
165 #ifndef NDEBUG
166 static cl::opt
167 DebugInstructionExamination("bb-vectorize-debug-instruction-examination",
168 cl::init(false), cl::Hidden,
169 cl::desc("When debugging is enabled, output information on the"
170 " instruction-examination process"));
171 static cl::opt
172 DebugCandidateSelection("bb-vectorize-debug-candidate-selection",
173 cl::init(false), cl::Hidden,
174 cl::desc("When debugging is enabled, output information on the"
175 " candidate-selection process"));
176 static cl::opt
177 DebugPairSelection("bb-vectorize-debug-pair-selection",
178 cl::init(false), cl::Hidden,
179 cl::desc("When debugging is enabled, output information on the"
180 " pair-selection process"));
181 static cl::opt
182 DebugCycleCheck("bb-vectorize-debug-cycle-check",
183 cl::init(false), cl::Hidden,
184 cl::desc("When debugging is enabled, output information on the"
185 " cycle-checking process"));
186
187 static cl::opt
188 PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair",
189 cl::init(false), cl::Hidden,
190 cl::desc("When debugging is enabled, dump the basic block after"
191 " every pair is fused"));
192 #endif
193
194 STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize");
195
196 namespace {
197 struct BBVectorize : public BasicBlockPass {
198 static char ID; // Pass identification, replacement for typeid
199
200 const VectorizeConfig Config;
201
// Constructor with a defaulted configuration: stores C in Config and calls
// initializeBBVectorizePass on the registry returned by
// PassRegistry::getPassRegistry(). The analysis pointers (AA, DT, SE, TLI,
// TTI) are not assigned here; runOnBasicBlock() sets them before use.
202 BBVectorize(const VectorizeConfig &C = VectorizeConfig())
203 : BasicBlockPass(ID), Config(C) {
204 initializeBBVectorizePass(*PassRegistry::getPassRegistry());
205 }
206
// Constructor that borrows its analyses from another pass P instead of from
// this pass's own analysis usage. Unlike the single-argument constructor it
// does not call initializeBBVectorizePass.
//   P - pass whose getAnalysis results supply AA, DT, SE, TLI and TTI.
//   F - function passed to getTTI when target information is requested.
//   C - configuration copied into Config.
// NOTE(review): the analysis types requested from P are not visible in this
// rendering (the getAnalysis template arguments are missing).
207 BBVectorize(Pass *P, Function &F, const VectorizeConfig &C)
208 : BasicBlockPass(ID), Config(C) {
209 AA = &P->getAnalysis().getAAResults();
210 DT = &P->getAnalysis().getDomTree();
211 SE = &P->getAnalysis().getSE();
212 TLI = &P->getAnalysis().getTLI();
// TTI stays null when the bb-vectorize-ignore-target-info option is set;
// code that dereferences TTI must account for that.
213 TTI = IgnoreTargetInfo
214 ? nullptr
215 : &P->getAnalysis().getTTI(F);
216 }
217
218 typedef std::pair ValuePair;
219 typedef std::pair ValuePairWithCost;
220 typedef std::pair ValuePairWithDepth;
221 typedef std::pair VPPair; // A ValuePair pair
222 typedef std::pair VPPairWithType;
223
224 AliasAnalysis *AA;
225 DominatorTree *DT;
226 ScalarEvolution *SE;
227 const TargetLibraryInfo *TLI;
228 const TargetTransformInfo *TTI;
229
230 // FIXME: const correct?
231
232 bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false);
233
234 bool getCandidatePairs(BasicBlock &BB,
235 BasicBlock::iterator &Start,
236 DenseMap > &CandidatePairs,
237 DenseSet &FixedOrderPairs,
238 DenseMap &CandidatePairCostSavings,
239 std::vector &PairableInsts, bool NonPow2Len);
240
241 // FIXME: The current implementation does not account for pairs that
242 // are connected in multiple ways. For example:
243 // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap)
// Tag describing how one candidate pair is connected to another; values of
// this type are carried by the PairConnectionTypes maps passed between the
// pairing routines declared in this struct.
// NOTE(review): the precise meaning of each enumerator is decided by code
// outside this excerpt. Presumably Direct = operands consumed in the same
// order, Swap = consumed in the opposite order, Splat = one element feeding
// both members of the using pair; confirm against computePairsConnectedTo.
244 enum PairConnectionType {
245 PairConnectionDirect,
246 PairConnectionSwap,
247 PairConnectionSplat
248 };
249
250 void computeConnectedPairs(
251 DenseMap > &CandidatePairs,
252 DenseSet &CandidatePairsSet,
253 std::vector &PairableInsts,
254 DenseMap > &ConnectedPairs,
255 DenseMap &PairConnectionTypes);
256
257 void buildDepMap(BasicBlock &BB,
258 DenseMap > &CandidatePairs,
259 std::vector &PairableInsts,
260 DenseSet &PairableInstUsers);
261
262 void choosePairs(DenseMap > &CandidatePairs,
263 DenseSet &CandidatePairsSet,
264 DenseMap &CandidatePairCostSavings,
265 std::vector &PairableInsts,
266 DenseSet &FixedOrderPairs,
267 DenseMap &PairConnectionTypes,
268 DenseMap > &ConnectedPairs,
269 DenseMap > &ConnectedPairDeps,
270 DenseSet &PairableInstUsers,
271 DenseMap& ChosenPairs);
272
273 void fuseChosenPairs(BasicBlock &BB,
274 std::vector &PairableInsts,
275 DenseMap& ChosenPairs,
276 DenseSet &FixedOrderPairs,
277 DenseMap &PairConnectionTypes,
278 DenseMap > &ConnectedPairs,
279 DenseMap > &ConnectedPairDeps);
280
281
282 bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
283
284 bool areInstsCompatible(Instruction *I, Instruction *J,
285 bool IsSimpleLoadStore, bool NonPow2Len,
286 int &CostSavings, int &FixedOrder);
287
288 bool trackUsesOfI(DenseSet &Users,
289 AliasSetTracker &WriteSet, Instruction *I,
290 Instruction *J, bool UpdateUsers = true,
291 DenseSet *LoadMoveSetPairs = nullptr);
292
293 void computePairsConnectedTo(
294 DenseMap > &CandidatePairs,
295 DenseSet &CandidatePairsSet,
296 std::vector &PairableInsts,
297 DenseMap > &ConnectedPairs,
298 DenseMap &PairConnectionTypes,
299 ValuePair P);
300
301 bool pairsConflict(ValuePair P, ValuePair Q,
302 DenseSet &PairableInstUsers,
303 DenseMap >
304 *PairableInstUserMap = nullptr,
305 DenseSet *PairableInstUserPairSet = nullptr);
306
307 bool pairWillFormCycle(ValuePair P,
308 DenseMap > &PairableInstUsers,
309 DenseSet &CurrentPairs);
310
311 void pruneDAGFor(
312 DenseMap > &CandidatePairs,
313 std::vector &PairableInsts,
314 DenseMap > &ConnectedPairs,
315 DenseSet &PairableInstUsers,
316 DenseMap > &PairableInstUserMap,
317 DenseSet &PairableInstUserPairSet,
318 DenseMap &ChosenPairs,
319 DenseMap &DAG,
320 DenseSet &PrunedDAG, ValuePair J,
321 bool UseCycleCheck);
322
323 void buildInitialDAGFor(
324 DenseMap > &CandidatePairs,
325 DenseSet &CandidatePairsSet,
326 std::vector &PairableInsts,
327 DenseMap > &ConnectedPairs,
328 DenseSet &PairableInstUsers,
329 DenseMap &ChosenPairs,
330 DenseMap &DAG, ValuePair J);
331
332 void findBestDAGFor(
333 DenseMap > &CandidatePairs,
334 DenseSet &CandidatePairsSet,
335 DenseMap &CandidatePairCostSavings,
336 std::vector &PairableInsts,
337 DenseSet &FixedOrderPairs,
338 DenseMap &PairConnectionTypes,
339 DenseMap > &ConnectedPairs,
340 DenseMap > &ConnectedPairDeps,
341 DenseSet &PairableInstUsers,
342 DenseMap > &PairableInstUserMap,
343 DenseSet &PairableInstUserPairSet,
344 DenseMap &ChosenPairs,
345 DenseSet &BestDAG, size_t &BestMaxDepth,
346 int &BestEffSize, Value *II, std::vector&JJ,
347 bool UseCycleCheck);
348
349 Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
350 Instruction *J, unsigned o);
351
352 void fillNewShuffleMask(LLVMContext& Context, Instruction *J,
353 unsigned MaskOffset, unsigned NumInElem,
354 unsigned NumInElem1, unsigned IdxOffset,
355 std::vector &Mask);
356
357 Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I,
358 Instruction *J);
359
360 bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J,
361 unsigned o, Value *&LOp, unsigned numElemL,
362 Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ,
363 unsigned IdxOff = 0);
364
365 Value *getReplacementInput(LLVMContext& Context, Instruction *I,
366 Instruction *J, unsigned o, bool IBeforeJ);
367
368 void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
369 Instruction *J, SmallVectorImpl &ReplacedOperands,
370 bool IBeforeJ);
371
372 void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
373 Instruction *J, Instruction *K,
374 Instruction *&InsertionPt, Instruction *&K1,
375 Instruction *&K2);
376
377 void collectPairLoadMoveSet(BasicBlock &BB,
378 DenseMap &ChosenPairs,
379 DenseMap > &LoadMoveSet,
380 DenseSet &LoadMoveSetPairs,
381 Instruction *I);
382
383 void collectLoadMoveSet(BasicBlock &BB,
384 std::vector &PairableInsts,
385 DenseMap &ChosenPairs,
386 DenseMap > &LoadMoveSet,
387 DenseSet &LoadMoveSetPairs);
388
389 bool canMoveUsesOfIAfterJ(BasicBlock &BB,
390 DenseSet &LoadMoveSetPairs,
391 Instruction *I, Instruction *J);
392
393 void moveUsesOfIAfterJ(BasicBlock &BB,
394 DenseSet &LoadMoveSetPairs,
395 Instruction *&InsertionPt,
396 Instruction *I, Instruction *J);
397
// Drives the vectorization of a single basic block by calling
// vectorizePairs() repeatedly.
//   BB - the block to transform; it is skipped (returning false) when
//        skipBasicBlock(BB) is true or when DT reports it unreachable from
//        the function entry.
// Returns true if at least one power-of-two pairing iteration changed BB.
// The result of the later non-power-of-two iterations is not folded into the
// return value, but those iterations only run when `changed` is already true.
398 bool vectorizeBB(BasicBlock &BB) {
399 if (skipBasicBlock(BB))
400 return false;
401 if (!DT->isReachableFromEntry(&BB)) {
402 DEBUG(dbgs() << "BBV: skipping unreachable " << BB.getName() <<
403 " in " << BB.getParent()->getName() << "\n");
404 return false;
405 }
406
407 DEBUG(if (TTI) dbgs() << "BBV: using target information\n");
408
409 bool changed = false;
410 // Iterate a sufficient number of times to merge types of size 1 bit,
411 // then 2 bits, then 4, etc. up to half of the target vector width of the
412 // target vector register.
// n counts iterations (1-based) across both loops below; v doubles each
// iteration. The v <= Config.VectorBits bound only applies when TTI is null;
// with target information the loop ends when vectorizePairs() reports no
// change or when Config.MaxIter is exceeded. A Config.MaxIter of 0 means
// "no iteration limit".
413 unsigned n = 1;
414 for (unsigned v = 2;
415 (TTI || v <= Config.VectorBits) &&
416 (!Config.MaxIter || n <= Config.MaxIter);
417 v *= 2, ++n) {
418 DEBUG(dbgs() << "BBV: fusing loop #" << n <<
419 " for " << BB.getName() << " in " <<
420 BB.getParent()->getName() << "...\n");
421 if (vectorizePairs(BB))
422 changed = true;
423 else
424 break;
425 }
426
// Second phase: keep fusing with NonPow2Len = true until nothing changes or
// the iteration limit is hit.
// NOTE(review): this tests the file-level Pow2LenOnly option rather than a
// Config field, unlike the VectorBits/MaxIter checks above; confirm that
// bypassing Config here is intended.
427 if (changed && !Pow2LenOnly) {
428 ++n;
429 for (; !Config.MaxIter || n <= Config.MaxIter; ++n) {
430 DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " <<
431 n << " for " << BB.getName() << " in " <<
432 BB.getParent()->getName() << "...\n");
433 if (!vectorizePairs(BB, true)) break;
434 }
435 }
436
437 DEBUG(dbgs() << "BBV: done!\n");
438 return changed;
439 }
440
// Pass entry point for one basic block: refreshes the cached analysis
// pointers from this pass's own analyses, then delegates to vectorizeBB().
// Returns whatever vectorizeBB(BB) returns (true if BB was changed).
// NOTE(review): the analysis types requested are not visible in this
// rendering (the getAnalysis template arguments are missing).
441 bool runOnBasicBlock(BasicBlock &BB) override {
442 // OptimizeNone check deferred to vectorizeBB().
443
444 AA = &getAnalysis().getAAResults();
445 DT = &getAnalysis().getDomTree();
446 SE = &getAnalysis().getSE();
447 TLI = &getAnalysis().getTLI();
// TTI is deliberately left null when target information is to be ignored;
// otherwise it is looked up for the function that contains BB.
448 TTI = IgnoreTargetInfo
449 ? nullptr
450 : &getAnalysis().getTTI(
451 *BB.getParent());
452
453 return vectorizeBB(BB);
454 }
455
// Declares this pass's analysis dependencies on top of the BasicBlockPass
// defaults: five required analyses, four preserved analyses, and CFG
// preservation.
// NOTE(review): which analyses are required/preserved cannot be read from
// this rendering because the addRequired/addPreserved template arguments are
// missing; presumably the required ones match the five results fetched in
// runOnBasicBlock() - confirm against the real source.
456 void getAnalysisUsage(AnalysisUsage &AU) const override {
457 BasicBlockPass::getAnalysisUsage(AU);
458 AU.addRequired();
459 AU.addRequired();
460 AU.addRequired();
461 AU.addRequired();
462 AU.addRequired();
463 AU.addPreserved();
464 AU.addPreserved();
465 AU.addPreserved();
466 AU.addPreserved();
467 AU.setPreservesCFG();
468 }
469
// Builds the vector type that results from concatenating two values.
//   ElemTy, Elem2Ty - types of the two values; each may be a scalar or a
//                     vector, but both must share the same scalar type
//                     (enforced only by the assert below).
// Returns a VectorType over that scalar type whose element count is the sum
// of the two inputs' element counts, where a non-vector input counts as 1.
470 static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) {
471 assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() &&
472 "Cannot form vector from incompatible scalar types");
473 Type *STy = ElemTy->getScalarType();
474
// Element count contributed by the first type.
475 unsigned numElem;
476 if (VectorType *VTy = dyn_cast(ElemTy)) {
477 numElem = VTy->getNumElements();
478 } else {
479 numElem = 1;
480 }
481
// Add the element count contributed by the second type.
482 if (VectorType *VTy = dyn_cast(Elem2Ty)) {
483 numElem += VTy->getNumElements();
484 } else {
485 numElem += 1;
486 }
487
488 return VectorType::get(STy, numElem);
489 }
490
// Extracts the two types that characterize instruction I.
//   T1 (out) - the stored value's type when I is a store, otherwise I's own
//              result type.
//   T2 (out) - the source type when I is a cast, otherwise T1; then
//              overridden with the condition type for a select, or the type
//              of operand 0 for a shufflevector or a compare.
491 static inline void getInstructionTypes(Instruction *I,
492 Type *&T1, Type *&T2) {
493 if (StoreInst *SI = dyn_cast(I)) {
494 // For stores, it is the value type, not the pointer type that matters
495 // because the value is what will come from a vector register.
496
497 Value *IVal = SI->getValueOperand();
498 T1 = IVal->getType();
499 } else {
500 T1 = I->getType();
501 }
502
503 if (CastInst *CI = dyn_cast(I))
504 T2 = CI->getSrcTy();
505 else
506 T2 = T1;
507
// These cases replace whatever T2 was assigned above.
508 if (SelectInst *SI = dyn_cast(I)) {
509 T2 = SI->getCondition()->getType();
510 } else if (ShuffleVectorInst *SI = dyn_cast(I)) {
511 T2 = SI->getOperand(0)->getType();
512 } else if (CmpInst *CI = dyn_cast(I)) {
513 T2 = CI->getOperand(0)->getType();
514 }
515 }
516
517 // Returns the weight associated with the provided value. A chain of
518 // candidate pairs has a length given by the sum of the weights of its
519 // members (one weight per pair; the weight of each member of the pair
520 // is assumed to be the same). This length is then compared to the
521 // chain-length threshold to determine if a given chain is significant
522 // enough to be vectorized. The length is also used in comparing
523 // candidate chains where longer chains are considered to be better.
524 // Note: when this function returns 0, the resulting instructions are
525 // not actually fused.
// Possible results: 0 for the insert/extract-element case below,
// Config.ReqChainDepth/2 (integer division) for the boosted memory-operation
// case, and 1 for every other value.
// NOTE(review): the instruction classes tested by the isa checks are not
// visible in this rendering (template arguments missing); the surrounding
// comments indicate InsertElement/ExtractElement and load/store respectively.
526 inline size_t getDepthFactor(Value *V) {
527 // InsertElement and ExtractElement have a depth factor of zero. This is
528 // for two reasons: First, they cannot be usefully fused. Second, because
529 // the pass generates a lot of these, they can confuse the simple metric
530 // used to compare the dags in the next iteration. Thus, giving them a
531 // weight of zero allows the pass to essentially ignore them in
532 // subsequent iterations when looking for vectorization opportunities
533 // while still tracking dependency chains that flow through those
534 // instructions.
535 if (isa(V) || isa(V))
536 return 0;
537
538 // Give a load or store half of the required depth so that load/store
539 // pairs will vectorize.
540 if (!Config.NoMemOpBoost && (isa(V) || isa(V)))
541 return Config.ReqChainDepth/2;
542
543 return 1;
544 }
545
546 // Returns the cost of the provided instruction using TTI.
547 // This does not handle loads and stores.
//   Opcode       - Instruction opcode selecting which TTI query is made.
//   T1, T2       - types forwarded to the TTI query; T2 is only used by the
//                  compare/select and cast queries.
//   Op1VK, Op2VK - operand value kinds, used only by the arithmetic query.
//   I            - optional instruction forwarded to the compare/select and
//                  cast queries.
// Returns 0 for GetElementPtr and PHI, the TTI-reported cost for branches,
// arithmetic, compare/select and cast opcodes, and 1 for any opcode not
// listed in the switch.
// NOTE(review): TTI is dereferenced without a null check, yet TTI is set to
// nullptr when IgnoreTargetInfo is enabled; callers presumably only reach
// this function when TTI is non-null - confirm.
548 unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2,
549 TargetTransformInfo::OperandValueKind Op1VK =
550 TargetTransformInfo::OK_AnyValue,
551 TargetTransformInfo::OperandValueKind Op2VK =
552 TargetTransformInfo::OK_AnyValue,
553 const Instruction *I = nullptr) {
554 switch (Opcode) {
555 default: break;
556 case Instruction::GetElementPtr:
557 // We mark this instruction as zero-cost because scalar GEPs are usually
558 // lowered to the instruction addressing mode. At the moment we don't
559 // generate vector GEPs.
560 return 0;
561 case Instruction::Br:
562 return TTI->getCFInstrCost(Opcode);
563 case Instruction::PHI:
564 return 0;
565 case Instruction::Add:
566 case Instruction::FAdd:
567 case Instruction::Sub:
568 case Instruction::FSub:
569 case Instruction::Mul:
570 case Instruction::FMul:
571 case Instruction::UDiv:
572 case Instruction::SDiv:
573 case Instruction::FDiv:
574 case Instruction::URem:
575 case Instruction::SRem:
576 case Instruction::FRem:
577 case Instruction::Shl:
578 case Instruction::LShr:
579 case Instruction::AShr:
580 case Instruction::And:
581 case Instruction::Or:
582 case Instruction::Xor:
583 return TTI->getArithmeticInstrCost(Opcode, T1, Op1VK, Op2VK);
584 case Instruction::Select:
585 case Instruction::ICmp:
586 case Instruction::FCmp:
587 return TTI->getCmpSelInstrCost(Opcode, T1, T2, I);
588 case Instruction::ZExt:
589 case Instruction::SExt:
590 case Instruction::FPToUI:
591 case Instruction::FPToSI:
592 case Instruction::FPExt:
593 case Instruction::PtrToInt:
594 case Instruction::IntToPtr:
595 case Instruction::SIToFP:
596 case Instruction::UIToFP:
597 case Instruction::Trunc:
598 case Instruction::FPTrunc:
599 case Instruction::BitCast:
// NOTE(review): ShuffleVector is costed through the cast query together
// with the conversion opcodes above; confirm this grouping is intentional.
600 case Instruction::ShuffleVector:
601 return TTI->getCastInstrCost(Opcode, T1, T2, I);
602 }
603
// Fallback for opcodes that hit `default: break` above.
604 return 1;
605 }
606
607 // This determines the relative offset of two loads or stores, returning
608 // true if the offset could be determined to be some constant value.
609 // For example, if OffsetInElmts == 1, then J accesses the memory directly
610 // after I; if OffsetInElmts == -1 then I accesses the memory
611 // directly after J.
//   I, J                         - the two memory instructions. I selects
//                                  the load or store path; J is cast
//                                  unconditionally, so it must be the same
//                                  kind of instruction as I.
//   IPtr, JPtr (out)             - pointer operands of I and J.
//   IAlignment, JAlignment (out) - alignments reported by I and J.
//   IAddressSpace, JAddressSpace (out) - pointer address spaces of I and J.
//   OffsetInElmts (out)          - reset to 0 on entry; on the constant
//                                  path set to the pointer difference
//                                  (J - I) divided by an element store size.
//   ComputeOffset                - when false, only the pointer, alignment
//                                  and address-space outputs are filled in
//                                  and the function returns true with
//                                  OffsetInElmts == 0.
// Returns true when ComputeOffset is false, or when the difference is a
// SCEV constant that is an exact multiple of the element store size used;
// returns false otherwise. OffsetInElmts is still written (truncated
// quotient) when the difference is constant but not an exact multiple.
612 bool getPairPtrInfo(Instruction *I, Instruction *J,
613 Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
614 unsigned &IAddressSpace, unsigned &JAddressSpace,
615 int64_t &OffsetInElmts, bool ComputeOffset = true) {
616 OffsetInElmts = 0;
617 if (LoadInst *LI = dyn_cast(I)) {
618 LoadInst *LJ = cast(J);
619 IPtr = LI->getPointerOperand();
620 JPtr = LJ->getPointerOperand();
621 IAlignment = LI->getAlignment();
622 JAlignment = LJ->getAlignment();
623 IAddressSpace = LI->getPointerAddressSpace();
624 JAddressSpace = LJ->getPointerAddressSpace();
625 } else {
626 StoreInst *SI = cast(I), *SJ = cast(J);
627 IPtr = SI->getPointerOperand();
628 JPtr = SJ->getPointerOperand();
629 IAlignment = SI->getAlignment();
630 JAlignment = SJ->getAlignment();
631 IAddressSpace = SI->getPointerAddressSpace();
632 JAddressSpace = SJ->getPointerAddressSpace();
633 }
634
635 if (!ComputeOffset)
636 return true;
637
638 const SCEV *IPtrSCEV = SE->getSCEV(IPtr);
639 const SCEV *JPtrSCEV = SE->getSCEV(JPtr);
640
641 // If this is a trivial offset, then we'll get something like
642 // 1*sizeof(type). With target data, which we need anyway, this will get
643 // constant folded into a number.
644 const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV);
645 if (const SCEVConstant *ConstOffSCEV =
646 dyn_cast(OffsetSCEV)) {
647 ConstantInt *IntOff = ConstOffSCEV->getValue();
648 int64_t Offset = IntOff->getSExtValue();
649 const DataLayout &DL = I->getModule()->getDataLayout();
650 Type *VTy = IPtr->getType()->getPointerElementType();
651 int64_t VTyTSS = (int64_t)DL.getTypeStoreSize(VTy);
652
// When the two pointee types differ and J lies before I (negative offset),
// measure the distance in units of J's pointee type instead of I's.
653 Type *VTy2 = JPtr->getType()->getPointerElementType();
654 if (VTy != VTy2 && Offset < 0) {
655 int64_t VTy2TSS = (int64_t)DL.getTypeStoreSize(VTy2);
656 OffsetInElmts = Offset/VTy2TSS;
657 return (std::abs(Offset) % VTy2TSS) == 0;
658 }
659
660 OffsetInElmts = Offset/VTyTSS;
661 return (std::abs(Offset) % VTyTSS) == 0;
662 }
663
// The pointer difference did not fold to a constant.
664 return false;
665 }
666
667 // Returns true if the provided CallInst represents an intrinsic that can
668 // be vectorized.
// Returns false when the call has no called Function, when the callee's
// intrinsic ID is zero (not an intrinsic), or when the intrinsic is not in
// one of the three groups below. For listed intrinsics the result is the
// corresponding Config switch: VectorizeMath, VectorizeBitManipulations or
// VectorizeFMA.
669 bool isVectorizableIntrinsic(CallInst* I) {
670 Function *F = I->getCalledFunction();
671 if (!F) return false;
672
673 Intrinsic::ID IID = F->getIntrinsicID();
674 if (!IID) return false;
675
676 switch(IID) {
677 default:
678 return false;
// Floating-point math intrinsics.
679 case Intrinsic::sqrt:
680 case Intrinsic::powi:
681 case Intrinsic::sin:
682 case Intrinsic::cos:
683 case Intrinsic::log:
684 case Intrinsic::log2:
685 case Intrinsic::log10:
686 case Intrinsic::exp:
687 case Intrinsic::exp2:
688 case Intrinsic::pow:
689 case Intrinsic::round:
690 case Intrinsic::copysign:
691 case Intrinsic::ceil:
692 case Intrinsic::nearbyint:
693 case Intrinsic::rint:
694 case Intrinsic::trunc:
695 case Intrinsic::floor:
696 case Intrinsic::fabs:
697 case Intrinsic::minnum:
698 case Intrinsic::maxnum:
699 return Config.VectorizeMath;
// Bit-manipulation intrinsics.
700 case Intrinsic::bswap:
701 case Intrinsic::ctpop:
702 case Intrinsic::ctlz:
703 case Intrinsic::cttz:
704 return Config.VectorizeBitManipulations;
// Fused multiply-add intrinsics.
705 case Intrinsic::fma:
706 case Intrinsic::fmuladd:
707 return Config.VectorizeFMA;
708 }
709 }
710
// Walks the chain formed by repeatedly following operand 0, starting at IE,
// and checks every link. Returns false as soon as a link's operand 0 fails
// both isa tests; returns true once operand 0 is no longer an instruction of
// the kind the dyn_cast accepts, i.e. the end of the chain was reached with
// every link passing.
// NOTE(review): the classes named in the isa/dyn_cast calls are not visible
// in this rendering (template arguments missing). Presumably the walk
// follows InsertElementInst operands and a link is accepted when its vector
// operand is an undef value or another insertelement - confirm against the
// real source.
711 bool isPureIEChain(InsertElementInst *IE) {
712 InsertElementInst *IENext = IE;
713 do {
714 if (!isa(IENext->getOperand(0)) &&
715 !isa(IENext->getOperand(0))) {
716 return false;
717 }
718 } while ((IENext =
719 dyn_cast(IENext->getOperand(0))));
720
721 return true;
722 }
723 };
724
725 // This function implements one vectorization iteration on the provided
726 // basic block. It returns true if the block is changed.
//
// Candidate pairs are gathered in batches: each pass through the do/while
// below asks getCandidatePairs for the next batch (which also tells us, via
// ShouldContinue, whether another batch remains), selects pairs within that
// batch, and accumulates the selection into the All* containers. Fusion is
// performed once, after the loop, over the accumulated selection.
// NonPow2Len is forwarded unchanged to getCandidatePairs.
// Returns false when no pairs were chosen in any batch.
727 bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) {
728 bool ShouldContinue;
729 BasicBlock::iterator Start = BB.getFirstInsertionPt();
730
731 std::vector AllPairableInsts;
732 DenseMap AllChosenPairs;
733 DenseSet AllFixedOrderPairs;
734 DenseMap AllPairConnectionTypes;
735 DenseMap > AllConnectedPairs,
736 AllConnectedPairDeps;
737
738 do {
739 std::vector PairableInsts;
740 DenseMap > CandidatePairs;
741 DenseSet FixedOrderPairs;
742 DenseMap CandidatePairCostSavings;
743 ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs,
744 FixedOrderPairs,
745 CandidatePairCostSavings,
746 PairableInsts, NonPow2Len);
// Note: `continue` inside a do/while jumps to the `while (ShouldContinue)`
// test, so an empty batch skips ahead to the next batch (if any).
747 if (PairableInsts.empty()) continue;
748
749 // Build the candidate pair set for faster lookups.
750 DenseSet CandidatePairsSet;
751 for (DenseMap >::iterator I =
752 CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I)
753 for (std::vector::iterator J = I->second.begin(),
754 JE = I->second.end(); J != JE; ++J)
755 CandidatePairsSet.insert(ValuePair(I->first, *J));
756
757 // Now we have a map of all of the pairable instructions and we need to
758 // select the best possible pairing. A good pairing is one such that the
759 // users of the pair are also paired. This defines a (directed) forest
760 // over the pairs such that two pairs are connected iff the second pair
761 // uses the first.
762
763 // Note that it only matters that both members of the second pair use some
764 // element of the first pair (to allow for splatting).
765
766 DenseMap > ConnectedPairs,
767 ConnectedPairDeps;
768 DenseMap PairConnectionTypes;
769 computeConnectedPairs(CandidatePairs, CandidatePairsSet,
770 PairableInsts, ConnectedPairs, PairConnectionTypes);
771 if (ConnectedPairs.empty()) continue;
772
// Build the reverse mapping: for each connected (child) pair, record the
// pairs it is connected from.
773 for (DenseMap >::iterator
774 I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
775 I != IE; ++I)
776 for (std::vector::iterator J = I->second.begin(),
777 JE = I->second.end(); J != JE; ++J)
778 ConnectedPairDeps[*J].push_back(I->first);
779
780 // Build the pairable-instruction dependency map
781 DenseSet PairableInstUsers;
782 buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
783
784 // There is now a graph of the connected pairs. For each variable, pick
785 // the pairing with the largest dag meeting the depth requirement on at
786 // least one branch. Then select all pairings that are part of that dag
787 // and remove them from the list of available pairings and pairable
788 // variables.
789
790 DenseMap ChosenPairs;
791 choosePairs(CandidatePairs, CandidatePairsSet,
792 CandidatePairCostSavings,
793 PairableInsts, FixedOrderPairs, PairConnectionTypes,
794 ConnectedPairs, ConnectedPairDeps,
795 PairableInstUsers, ChosenPairs);
796
797 if (ChosenPairs.empty()) continue;
798 AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(),
799 PairableInsts.end());
800 AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end());
801
802 // Only for the chosen pairs, propagate information on fixed-order pairs,
803 // pair connections, and their types to the data structures used by the
804 // pair fusion procedures.
805 for (DenseMap::iterator I = ChosenPairs.begin(),
806 IE = ChosenPairs.end(); I != IE; ++I) {
// A fixed-order entry may be stored in either orientation; keep whichever
// orientation was recorded.
807 if (FixedOrderPairs.count(*I))
808 AllFixedOrderPairs.insert(*I);
809 else if (FixedOrderPairs.count(ValuePair(I->second, I->first)))
810 AllFixedOrderPairs.insert(ValuePair(I->second, I->first));
811
// Check every ordered combination of chosen pairs for a recorded connection
// type, trying (I, J) first and (J, I) only if that is absent.
812 for (DenseMap::iterator J = ChosenPairs.begin();
813 J != IE; ++J) {
814 DenseMap::iterator K =
815 PairConnectionTypes.find(VPPair(*I, *J));
816 if (K != PairConnectionTypes.end()) {
817 AllPairConnectionTypes.insert(*K);
818 } else {
819 K = PairConnectionTypes.find(VPPair(*J, *I));
820 if (K != PairConnectionTypes.end())
821 AllPairConnectionTypes.insert(*K);
822 }
823 }
824 }
825
// Keep only those connections whose type was propagated above, recording
// both the forward and the reverse (dependency) direction.
826 for (DenseMap >::iterator
827 I = ConnectedPairs.begin(), IE = ConnectedPairs.end();
828 I != IE; ++I)
829 for (std::vector::iterator J = I->second.begin(),
830 JE = I->second.end(); J != JE; ++J)
831 if (AllPairConnectionTypes.count(VPPair(I->first, *J))) {
832 AllConnectedPairs[I->first].push_back(*J);
833 AllConnectedPairDeps[*J].push_back(I->first);
834 }
835 } while (ShouldContinue);
836
837 if (AllChosenPairs.empty()) return false;
838 NumFusedOps += AllChosenPairs.size();
839
840 // A set of pairs has now been selected. It is now necessary to replace the
841 // paired instructions with vector instructions. For this procedure each
842 // operand must be replaced with a vector operand. This vector is formed
843 // by using build_vector on the old operands. The replaced values are then
844 // replaced with a vector_extract on the result. Subsequent optimization
845 // passes should coalesce the build/extract combinations.
846
847 fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs,
848 AllPairConnectionTypes,
849 AllConnectedPairs, AllConnectedPairDeps);
850
851 // It is important to cleanup here so that future iterations of this
852 // function have less work to do.
853 (void)SimplifyInstructionsInBlock(&BB, TLI);
854 return true;
855 }
856
857 // This function returns true if the provided instruction is capable of being
858 // fused into a vector instruction. This determination is based only on the
859 // type and other attributes of the instruction.
//
// IsSimpleLoadStore is an output: it is reset to false on entry and, for
// loads and stores, set to the result of isSimple().
// The checks run in two stages: first a per-instruction-kind filter driven by
// the Config.Vectorize* switches, then type-based filters on the two types
// reported by getInstructionTypes.
// NOTE(review): the dyn_cast/isa template arguments are missing from this
// listing; the instruction kinds below are inferred from the declared
// pointer types only where one is declared.
860 bool BBVectorize::isInstVectorizable(Instruction *I,
861 bool &IsSimpleLoadStore) {
862 IsSimpleLoadStore = false;
863
864 if (CallInst *C = dyn_cast(I)) {
865 if (!isVectorizableIntrinsic(C))
866 return false;
867 } else if (LoadInst *L = dyn_cast(I)) {
868 // Vectorize simple loads if possible:
869 IsSimpleLoadStore = L->isSimple();
870 if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
871 return false;
872 } else if (StoreInst *S = dyn_cast(I)) {
873 // Vectorize simple stores if possible:
874 IsSimpleLoadStore = S->isSimple();
875 if (!IsSimpleLoadStore || !Config.VectorizeMemOps)
876 return false;
877 } else if (CastInst *C = dyn_cast(I)) {
878 // We can vectorize casts, but not casts of pointer types, etc.
879 if (!Config.VectorizeCasts)
880 return false;
881
882 Type *SrcTy = C->getSrcTy();
883 if (!SrcTy->isSingleValueType())
884 return false;
885
886 Type *DestTy = C->getDestTy();
887 if (!DestTy->isSingleValueType())
888 return false;
889 } else if (SelectInst *SI = dyn_cast(I)) {
890 if (!Config.VectorizeSelect)
891 return false;
892 // We can vectorize a select if either all operands are scalars,
893 // or all operands are vectors. Trying to "widen" a select between
894 // vectors that has a scalar condition results in a malformed select.
895 // FIXME: We could probably be smarter about this by rewriting the select
896 // with different types instead.
// Note: this branch returns directly, so selects bypass the type-based
// checks at the bottom of this function.
897 return (SI->getCondition()->getType()->isVectorTy() ==
898 SI->getTrueValue()->getType()->isVectorTy());
899 } else if (isa(I)) {
900 if (!Config.VectorizeCmp)
901 return false;
902 } else if (GetElementPtrInst *G = dyn_cast(I)) {
903 if (!Config.VectorizeGEP)
904 return false;
905
906 // Currently, vector GEPs exist only with one index.
907 if (G->getNumIndices() != 1)
908 return false;
909 } else if (!(I->isBinaryOp() || isa(I) ||
910 isa(I) || isa(I))) {
911 return false;
912 }
913
914 Type *T1, *T2;
915 getInstructionTypes(I, T1, T2);
916
917 // Not every type can be vectorized...
918 if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) ||
919 !(VectorType::isValidElementType(T2) || T2->isVectorTy()))
920 return false;
921
// Types whose scalar size is one bit are gated by VectorizeBools; all other
// integer (or integer-vector) types are gated by VectorizeInts.
922 if (T1->getScalarSizeInBits() == 1) {
923 if (!Config.VectorizeBools)
924 return false;
925 } else {
926 if (!Config.VectorizeInts && T1->isIntOrIntVectorTy())
927 return false;
928 }
929
930 if (T2->getScalarSizeInBits() == 1) {
931 if (!Config.VectorizeBools)
932 return false;
933 } else {
934 if (!Config.VectorizeInts && T2->isIntOrIntVectorTy())
935 return false;
936 }
937
938 if (!Config.VectorizeFloats
939 && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy()))
940 return false;
941
942 // Don't vectorize target-specific types.
943 if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy())
944 return false;
945 if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy())
946 return false;
947
948 if (!Config.VectorizePointers && (T1->getScalarType()->isPointerTy() ||
949 T2->getScalarType()->isPointerTy()))
950 return false;
951
// Without TTI, reject any instruction whose type is already at least
// Config.VectorBits wide.
952 if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits ||
953 T2->getPrimitiveSizeInBits() >= Config.VectorBits))
954 return false;
955
956 return true;
957 }
958
959 // This function returns true if the two provided instructions are compatible
960 // (meaning that they can be fused into a vector instruction). This assumes
961 // that I has already been determined to be vectorizable and that J is not
962 // in the use dag of I.
//
// Outputs (both reset to 0 on entry):
//   CostSavings - set to ICost + JCost - VCost on the TTI-based paths.
//   FixedOrder  - for simple loads/stores, set to the element offset between
//                 the two accesses (which must be +1 or -1 to be accepted).
// IsSimpleLoadStore selects the memory-op path; NonPow2Len adds
// CompareUsingScalarTypes to the isSameOperationAs flags.
// NOTE(review): dyn_cast/isa/cast template arguments are missing from this
// listing.
963 bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
964 bool IsSimpleLoadStore, bool NonPow2Len,
965 int &CostSavings, int &FixedOrder) {
966 DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I <<
967 " <-> " << *J << "\n");
968
969 CostSavings = 0;
970 FixedOrder = 0;
971
972 // Loads and stores can be merged if they have different alignments,
973 // but are otherwise the same.
974 if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment |
975 (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0)))
976 return false;
977
978 Type *IT1, *IT2, *JT1, *JT2;
979 getInstructionTypes(I, IT1, IT2);
980 getInstructionTypes(J, JT1, JT2);
// Without TTI, the combined width of either type pair must fit within
// Config.VectorBits.
981 unsigned MaxTypeBits = std::max(
982 IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(),
983 IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits());
984 if (!TTI && MaxTypeBits > Config.VectorBits)
985 return false;
986
987 // FIXME: handle addsub-type operations!
988
989 if (IsSimpleLoadStore) {
990 Value *IPtr, *JPtr;
991 unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
992 int64_t OffsetInElmts = 0;
// Only accesses exactly one element apart (in either direction) can pair.
993 if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
994 IAddressSpace, JAddressSpace, OffsetInElmts) &&
995 std::abs(OffsetInElmts) == 1) {
996 FixedOrder = (int) OffsetInElmts;
// BottomAlignment is I's alignment, or J's when OffsetInElmts is negative.
997 unsigned BottomAlignment = IAlignment;
998 if (OffsetInElmts < 0) BottomAlignment = JAlignment;
999
1000 Type *aTypeI = isa(I) ?
1001 cast(I)->getValueOperand()->getType() : I->getType();
1002 Type *aTypeJ = isa(J) ?
1003 cast(J)->getValueOperand()->getType() : J->getType();
1004 Type *VType = getVecTypeForPair(aTypeI, aTypeJ);
1005
1006 if (Config.AlignedOnly) {
1007 // An aligned load or store is possible only if the instruction
1008 // with the lower offset has an alignment suitable for the
1009 // vector type.
1010 const DataLayout &DL = I->getModule()->getDataLayout();
1011 unsigned VecAlignment = DL.getPrefTypeAlignment(VType);
1012 if (BottomAlignment < VecAlignment)
1013 return false;
1014 }
1015
1016 if (TTI) {
1017 unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI,
1018 IAlignment, IAddressSpace);
1019 unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ,
1020 JAlignment, JAddressSpace);
1021 unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType,
1022 BottomAlignment,
1023 IAddressSpace);
1024
1025 ICost += TTI->getAddressComputationCost(aTypeI);
1026 JCost += TTI->getAddressComputationCost(aTypeJ);
1027 VCost += TTI->getAddressComputationCost(VType);
1028
1029 if (VCost > ICost + JCost)
1030 return false;
1031
1032 // We don't want to fuse to a type that will be split, even
1033 // if the two input types will also be split and there is no other
1034 // associated cost.
1035 unsigned VParts = TTI->getNumberOfParts(VType);
1036 if (VParts > 1)
1037 return false;
1038 else if (!VParts && VCost == ICost + JCost)
1039 return false;
1040
1041 CostSavings = ICost + JCost - VCost;
1042 }
1043 } else {
1044 return false;
1045 }
1046 } else if (TTI) {
1047 TargetTransformInfo::OperandValueKind Op1VK =
1048 TargetTransformInfo::OK_AnyValue;
1049 TargetTransformInfo::OperandValueKind Op2VK =
1050 TargetTransformInfo::OK_AnyValue;
1051 unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2, Op1VK, Op2VK, I);
1052 unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2, Op1VK, Op2VK, J);
1053 Type *VT1 = getVecTypeForPair(IT1, JT1),
1054 *VT2 = getVecTypeForPair(IT2, JT2);
1055
1056 // On some targets (example X86) the cost of a vector shift may vary
1057 // depending on whether the second operand is a Uniform or
1058 // NonUniform Constant.
// Note: the scalar costs above were computed with OK_AnyValue; only the
// vector cost below sees the refined Op2VK.
1059 switch (I->getOpcode()) {
1060 default : break;
1061 case Instruction::Shl:
1062 case Instruction::LShr:
1063 case Instruction::AShr:
1064
1065 // If both I and J are scalar shifts by constant, then the
1066 // merged vector shift count would be either a constant splat value
1067 // or a non-uniform vector of constants.
1068 if (ConstantInt *CII = dyn_cast(I->getOperand(1))) {
1069 if (ConstantInt *CIJ = dyn_cast(J->getOperand(1)))
1070 Op2VK = CII == CIJ ? TargetTransformInfo::OK_UniformConstantValue :
1071 TargetTransformInfo::OK_NonUniformConstantValue;
1072 } else {
1073 // Check for a splat of a constant or for a non uniform vector
1074 // of constants.
1075 Value *IOp = I->getOperand(1);
1076 Value *JOp = J->getOperand(1);
1077 if ((isa(IOp) || isa(IOp)) &&
1078 (isa(JOp) || isa(JOp))) {
1079 Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
1080 Constant *SplatValue = cast(IOp)->getSplatValue();
1081 if (SplatValue != nullptr &&
1082 SplatValue == cast(JOp)->getSplatValue())
1083 Op2VK = TargetTransformInfo::OK_UniformConstantValue;
1084 }
1085 }
1086 }
1087
1088 // Note that this procedure is incorrect for insert and extract element
1089 // instructions (because combining these often results in a shuffle),
1090 // but this cost is ignored (because insert and extract element
1091 // instructions are assigned a zero depth factor and are not really
1092 // fused in general).
1093 unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK, I);
1094
1095 if (VCost > ICost + JCost)
1096 return false;
1097
1098 // We don't want to fuse to a type that will be split, even
1099 // if the two input types will also be split and there is no other
1100 // associated cost.
1101 unsigned VParts1 = TTI->getNumberOfParts(VT1),
1102 VParts2 = TTI->getNumberOfParts(VT2);
1103 if (VParts1 > 1 || VParts2 > 1)
1104 return false;
1105 else if ((!VParts1 || !VParts2) && VCost == ICost + JCost)
1106 return false;
1107
1108 CostSavings = ICost + JCost - VCost;
1109 }
1110
1111 // The powi,ctlz,cttz intrinsics are special because only the first
1112 // argument is vectorized, the second arguments must be equal.
1113 CallInst *CI = dyn_cast(I);
1114 Function *FI;
1115 if (CI && (FI = CI->getCalledFunction())) {
1116 Intrinsic::ID IID = FI->getIntrinsicID();
1117 if (IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
1118 IID == Intrinsic::cttz) {
1119 Value *A1I = CI->getArgOperand(1),
1120 *A1J = cast(J)->getArgOperand(1);
// Equality of the second arguments is decided by comparing their SCEV
// expressions. This branch always returns, so these intrinsics never reach
// the intrinsic cost computation below.
1121 const SCEV *A1ISCEV = SE->getSCEV(A1I),
1122 *A1JSCEV = SE->getSCEV(A1J);
1123 return (A1ISCEV == A1JSCEV);
1124 }
1125
// Intrinsic-specific cost model: overwrites CostSavings computed by the
// generic TTI path above.
1126 if (IID && TTI) {
1127 FastMathFlags FMFCI;
1128 if (auto *FPMOCI = dyn_cast(CI))
1129 FMFCI = FPMOCI->getFastMathFlags();
1130 SmallVector IArgs(CI->arg_operands());
1131 unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, IArgs, FMFCI);
1132
1133 CallInst *CJ = cast(J);
1134
1135 FastMathFlags FMFCJ;
1136 if (auto *FPMOCJ = dyn_cast(CJ))
1137 FMFCJ = FPMOCJ->getFastMathFlags();
1138
1139 SmallVector JArgs(CJ->arg_operands());
1140 unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, JArgs, FMFCJ);
1141
1142 assert(CI->getNumArgOperands() == CJ->getNumArgOperands() &&
1143 "Intrinsic argument counts differ");
1144 SmallVector Tys;
1145 SmallVector VecArgs;
1146 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
// NOTE(review): powi/ctlz/cttz already returned above, so this special case
// appears unreachable as written - confirm before relying on it.
1147 if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
1148 IID == Intrinsic::cttz) && i == 1) {
1149 Tys.push_back(CI->getArgOperand(i)->getType());
1150 VecArgs.push_back(CI->getArgOperand(i));
1151 }
1152 else {
1153 Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(),
1154 CJ->getArgOperand(i)->getType()));
1155 // Add both operands, and then count their scalarization overhead
1156 // with VF 1.
1157 VecArgs.push_back(CI->getArgOperand(i));
1158 VecArgs.push_back(CJ->getArgOperand(i));
1159 }
1160 }
1161
1162 // Compute the scalarization cost here with the original operands (to
1163 // check for uniqueness etc), and then call getIntrinsicInstrCost()
1164 // with the constructed vector types.
1165 Type *RetTy = getVecTypeForPair(IT1, JT1);
1166 unsigned ScalarizationCost = 0;
1167 if (!RetTy->isVoidTy())
1168 ScalarizationCost += TTI->getScalarizationOverhead(RetTy, true, false);
1169 ScalarizationCost += TTI->getOperandsScalarizationOverhead(VecArgs, 1);
1170
// The fused call uses the intersection of both calls' fast-math flags.
1171 FastMathFlags FMFV = FMFCI;
1172 FMFV &= FMFCJ;
1173 unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV,
1174 ScalarizationCost);
1175
1176 if (VCost > ICost + JCost)
1177 return false;
1178
1179 // We don't want to fuse to a type that will be split, even
1180 // if the two input types will also be split and there is no other
1181 // associated cost.
1182 unsigned RetParts = TTI->getNumberOfParts(RetTy);
1183 if (RetParts > 1)
1184 return false;
1185 else if (!RetParts && VCost == ICost + JCost)
1186 return false;
1187
// Apply the same "no split" rule to every vector-typed argument.
1188 for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
1189 if (!Tys[i]->isVectorTy())
1190 continue;
1191
1192 unsigned NumParts = TTI->getNumberOfParts(Tys[i]);
1193 if (NumParts > 1)
1194 return false;
1195 else if (!NumParts && VCost == ICost + JCost)
1196 return false;
1197 }
1198
1199 CostSavings = ICost + JCost - VCost;
1200 }
1201 }
1202
1203 return true;
1204 }
1205
1206 // Figure out whether or not J uses I and update the users and write-set
1207 // structures associated with I. Specifically, Users represents the set of
1208 // instructions that depend on I. WriteSet represents the set
1209 // of memory locations that are dependent on I. If UpdateUsers is true,
1210 // and J uses I, then Users is updated to contain J and WriteSet is updated
1211 // to contain any memory locations to which J writes. The function returns
1212 // true if J uses I. By default, alias analysis is used to determine
1213 // whether J reads from memory that overlaps with a location in WriteSet.
1214 // If LoadMoveSetPairs is not null, then it is a previously-computed set
1215 // of pairs; a memory-reading J is treated as using I exactly when the pair
1216 // (J, I) is present in that set. So, if LoadMoveSetPairs is provided,
1217 // then the alias analysis is not used. This is necessary because this
1218 // function is called during the process of moving instructions during
1219 // vectorization and the results of the alias analysis are not stable during
1220 // that process.
1221 bool BBVectorize::trackUsesOfI(DenseSet &Users,
1222 AliasSetTracker &WriteSet, Instruction *I,
1223 Instruction *J, bool UpdateUsers,
1224 DenseSet *LoadMoveSetPairs) {
1225 bool UsesI = false;
1226
1227 // This instruction may already be marked as a user due, for example, to
1228 // being a member of a selected pair.
1229 if (Users.count(J))
1230 UsesI = true;
1231
// Direct or transitive data dependence: some operand of J is I itself or is
// already known to be a user of I.
1232 if (!UsesI)
1233 for (User::op_iterator JU = J->op_begin(), JE = J->op_end();
1234 JU != JE; ++JU) {
1235 Value *V = *JU;
1236 if (I == V || Users.count(V)) {
1237 UsesI = true;
1238 break;
1239 }
1240 }
// Memory dependence: only consulted when no operand dependence was found and
// J may read memory.
1241 if (!UsesI && J->mayReadFromMemory()) {
1242 if (LoadMoveSetPairs) {
1243 UsesI = LoadMoveSetPairs->count(ValuePair(J, I));
1244 } else {
1245 for (AliasSetTracker::iterator W = WriteSet.begin(),
1246 WE = WriteSet.end(); W != WE; ++W) {
1247 if (W->aliasesUnknownInst(J, *AA)) {
1248 UsesI = true;
1249 break;
1250 }
1251 }
1252 }
1253 }
1254
1255 if (UsesI && UpdateUsers) {
1256 if (J->mayWriteToMemory()) WriteSet.add(J);
1257 Users.insert(J);
1258 }
1259
1260 return UsesI;
1261 }
1262
1263 // This function iterates over all instruction pairs in the provided
1264 // basic block and collects all candidate pairs for vectorization.
//
// Scanning begins at Start. For each vectorizable instruction I, up to
// Config.SearchLimit + 1 following instructions J are examined (the inner
// loop runs while ss <= SearchLimit). Accepted pairs are appended to
// CandidatePairs / PairableInsts; cost savings are recorded only when TTI is
// available; FixedOrderPairs records pairs whose order is fixed (FixedOrder
// of +1 or -1).
// Returns true if the scan stopped early because Config.MaxInsts or
// Config.MaxPairs was reached; Start is updated so that a subsequent call
// resumes after the last instruction selected here. Returns false if Start
// is already at the end of the block.
1265 bool BBVectorize::getCandidatePairs(BasicBlock &BB,
1266 BasicBlock::iterator &Start,
1267 DenseMap > &CandidatePairs,
1268 DenseSet &FixedOrderPairs,
1269 DenseMap &CandidatePairCostSavings,
1270 std::vector &PairableInsts, bool NonPow2Len) {
1271 size_t TotalPairs = 0;
1272 BasicBlock::iterator E = BB.end();
1273 if (Start == E) return false;
1274
1275 bool ShouldContinue = false, IAfterStart = false;
// I begins at the old value of Start; Start itself is post-incremented, so
// it initially refers to the instruction after the first I.
1276 for (BasicBlock::iterator I = Start++; I != E; ++I) {
1277 if (I == Start) IAfterStart = true;
1278
1279 bool IsSimpleLoadStore;
1280 if (!isInstVectorizable(&*I, IsSimpleLoadStore))
1281 continue;
1282
1283 // Look for an instruction with which to pair instruction *I...
1284 DenseSet Users;
1285 AliasSetTracker WriteSet(*AA);
1286 if (I->mayWriteToMemory())
1287 WriteSet.add(&*I);
1288
1289 bool JAfterStart = IAfterStart;
1290 BasicBlock::iterator J = std::next(I);
1291 for (unsigned ss = 0; J != E && ss <= Config.SearchLimit; ++J, ++ss) {
1292 if (J == Start)
1293 JAfterStart = true;
1294
1295 // Determine if J uses I, if so, exit the loop.
// Users/WriteSet are only updated by trackUsesOfI when FastDep is off.
1296 bool UsesI = trackUsesOfI(Users, WriteSet, &*I, &*J, !Config.FastDep);
1297 if (Config.FastDep) {
1298 // Note: For this heuristic to be effective, independent operations
1299 // must tend to be intermixed. This is likely to be true from some
1300 // kinds of grouped loop unrolling (but not the generic LLVM pass),
1301 // but otherwise may require some kind of reordering pass.
1302
1303 // When using fast dependency analysis,
1304 // stop searching after first use:
1305 if (UsesI) break;
1306 } else {
1307 if (UsesI) continue;
1308 }
1309
1310 // J does not use I, and comes before the first use of I, so it can be
1311 // merged with I if the instructions are compatible.
1312 int CostSavings, FixedOrder;
1313 if (!areInstsCompatible(&*I, &*J, IsSimpleLoadStore, NonPow2Len,
1314 CostSavings, FixedOrder))
1315 continue;
1316
1317 // J is a candidate for merging with I.
// I is appended at most once: only if it is not already the last entry.
1318 if (PairableInsts.empty() ||
1319 PairableInsts[PairableInsts.size() - 1] != &*I) {
1320 PairableInsts.push_back(&*I);
1321 }
1322
1323 CandidatePairs[&*I].push_back(&*J);
1324 ++TotalPairs;
1325 if (TTI)
1326 CandidatePairCostSavings.insert(
1327 ValuePairWithCost(ValuePair(&*I, &*J), CostSavings));
1328
1329 if (FixedOrder == 1)
1330 FixedOrderPairs.insert(ValuePair(&*I, &*J));
1331 else if (FixedOrder == -1)
1332 FixedOrderPairs.insert(ValuePair(&*J, &*I));
1333
1334 // The next call to this function must start after the last instruction
1335 // selected during this invocation.
1336 if (JAfterStart) {
1337 Start = std::next(J);
1338 IAfterStart = JAfterStart = false;
1339 }
1340
1341 DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair "
1342 << *I << " <-> " << *J << " (cost savings: " <<
1343 CostSavings << ")\n");
1344
1345 // If we have already found too many pairs, break here and this function
1346 // will be called again starting after the last instruction selected
1347 // during this invocation.
1348 if (PairableInsts.size() >= Config.MaxInsts ||
1349 TotalPairs >= Config.MaxPairs) {
1350 ShouldContinue = true;
1351 break;
1352 }
1353 }
1354
// Propagate the early stop out of the outer loop as well.
1355 if (ShouldContinue)
1356 break;
1357 }
1358
1359 DEBUG(dbgs() << "BBV: found " << PairableInsts.size()
1360 << " instructions with candidate pairs\n");
1361
1362 return ShouldContinue;
1363 }
1364
1365 // Finds candidate pairs connected to the pair P = (PI, PJ). This means that
1366 // it looks for pairs such that both members have an input which is an
1367 // output of PI or PJ.
//
// Every connection found is appended to ConnectedPairs[P] and its kind
// (PairConnectionDirect, PairConnectionSwap or PairConnectionSplat) is
// recorded in PairConnectionTypes. When Config.SplatBreaksChain is set, the
// two splat searches are skipped.
// CandidatePairs and PairableInsts are not referenced in this body.
// NOTE(review): isa/dyn_cast template arguments are missing from this
// listing; the load/store kinds below follow the existing comments and the
// declared StoreInst pointers.
1368 void BBVectorize::computePairsConnectedTo(
1369 DenseMap > &CandidatePairs,
1370 DenseSet &CandidatePairsSet,
1371 std::vector &PairableInsts,
1372 DenseMap > &ConnectedPairs,
1373 DenseMap &PairConnectionTypes,
1374 ValuePair P) {
1375 StoreInst *SI, *SJ;
1376
1377 // For each possible pairing for this variable, look at the uses of
1378 // the first value...
1379 for (Value::user_iterator I = P.first->user_begin(),
1380 E = P.first->user_end();
1381 I != E; ++I) {
1382 User *UI = *I;
1383 if (isa(UI)) {
1384 // A pair cannot be connected to a load because the load only takes one
1385 // operand (the address) and it is a scalar even after vectorization.
1386 continue;
1387 } else if ((SI = dyn_cast(UI)) &&
1388 P.first == SI->getPointerOperand()) {
1389 // Similarly, a pair cannot be connected to a store through its
1390 // pointer operand.
1391 continue;
1392 }
1393
1394 // For each use of the first variable, look for uses of the second
1395 // variable...
1396 for (User *UJ : P.second->users()) {
1397 if ((SJ = dyn_cast(UJ)) &&
1398 P.second == SJ->getPointerOperand())
1399 continue;
1400
1401 // Look for (UI, UJ):
1402 if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
1403 VPPair VP(P, ValuePair(UI, UJ));
1404 ConnectedPairs[VP.first].push_back(VP.second);
1405 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect));
1406 }
1407
1408 // Look for (UJ, UI):
1409 if (CandidatePairsSet.count(ValuePair(UJ, UI))) {
1410 VPPair VP(P, ValuePair(UJ, UI));
1411 ConnectedPairs[VP.first].push_back(VP.second);
1412 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap));
1413 }
1414 }
1415
1416 if (Config.SplatBreaksChain) continue;
1417 // Look for cases where just the first value in the pair is used by
1418 // both members of another pair (splatting).
1419 for (Value::user_iterator J = P.first->user_begin(); J != E; ++J) {
1420 User *UJ = *J;
1421 if ((SJ = dyn_cast(UJ)) &&
1422 P.first == SJ->getPointerOperand())
1423 continue;
1424
1425 if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
1426 VPPair VP(P, ValuePair(UI, UJ));
1427 ConnectedPairs[VP.first].push_back(VP.second);
1428 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
1429 }
1430 }
1431 }
1432
1433 if (Config.SplatBreaksChain) return;
1434 // Look for cases where just the second value in the pair is used by
1435 // both members of another pair (splatting).
1436 for (Value::user_iterator I = P.second->user_begin(),
1437 E = P.second->user_end();
1438 I != E; ++I) {
1439 User *UI = *I;
1440 if (isa(UI))
1441 continue;
1442 else if ((SI = dyn_cast(UI)) &&
1443 P.second == SI->getPointerOperand())
1444 continue;
1445
1446 for (Value::user_iterator J = P.second->user_begin(); J != E; ++J) {
1447 User *UJ = *J;
1448 if ((SJ = dyn_cast(UJ)) &&
1449 P.second == SJ->getPointerOperand())
1450 continue;
1451
1452 if (CandidatePairsSet.count(ValuePair(UI, UJ))) {
1453 VPPair VP(P, ValuePair(UI, UJ));
1454 ConnectedPairs[VP.first].push_back(VP.second);
1455 PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat));
1456 }
1457 }
1458 }
1459 }
1460
1461 // This function figures out which pairs are connected. Two pairs are
1462 // connected if some output of the first pair forms an input to both members
1463 // of the second pair.
//
// For every instruction in PairableInsts that has an entry in CandidatePairs,
// computePairsConnectedTo is invoked once per candidate partner; the results
// accumulate in ConnectedPairs and PairConnectionTypes.
1464 void BBVectorize::computeConnectedPairs(
1465 DenseMap > &CandidatePairs,
1466 DenseSet &CandidatePairsSet,
1467 std::vector &PairableInsts,
1468 DenseMap > &ConnectedPairs,
1469 DenseMap &PairConnectionTypes) {
1470 for (std::vector::iterator PI = PairableInsts.begin(),
1471 PE = PairableInsts.end(); PI != PE; ++PI) {
1472 DenseMap >::iterator PP =
1473 CandidatePairs.find(*PI);
1474 if (PP == CandidatePairs.end())
1475 continue;
1476
1477 for (std::vector::iterator P = PP->second.begin(),
1478 E = PP->second.end(); P != E; ++P)
1479 computePairsConnectedTo(CandidatePairs, CandidatePairsSet,
1480 PairableInsts, ConnectedPairs,
1481 PairConnectionTypes, ValuePair(*PI, *P));
1482 }
1483
// Debug-only: report the total number of connections found.
1484 DEBUG(size_t TotalPairs = 0;
1485 for (DenseMap >::iterator I =
1486 ConnectedPairs.begin(), IE = ConnectedPairs.end(); I != IE; ++I)
1487 TotalPairs += I->second.size();
1488 dbgs() << "BBV: found " << TotalPairs
1489 << " pair connections.\n");
1490 }
1491
1492 // This function builds a set of use tuples such that (A, B) is in the set
1493 // if B is in the use dag of A. If B is in the use dag of A, then B
1494 // depends on the output of A.
//
// Only instructions that appear in some candidate pair (as key or as
// partner) are considered, both as A and as B. The result is written to
// PairableInstUsers.
// Calls PairableInsts.back(), so PairableInsts is assumed to be non-empty.
1495 void BBVectorize::buildDepMap(
1496 BasicBlock &BB,
1497 DenseMap > &CandidatePairs,
1498 std::vector &PairableInsts,
1499 DenseSet &PairableInstUsers) {
1500 DenseSet IsInPair;
1501 for (DenseMap >::iterator C =
1502 CandidatePairs.begin(), E = CandidatePairs.end(); C != E; ++C) {
1503 IsInPair.insert(C->first);
1504 IsInPair.insert(C->second.begin(), C->second.end());
1505 }
1506
1507 // Iterate through the basic block, recording all users of each
1508 // pairable instruction.
1509
// EL marks the last entry of PairableInsts; both loops below stop once they
// reach it.
1510 BasicBlock::iterator E = BB.end(), EL =
1511 BasicBlock::iterator(cast(PairableInsts.back()));
1512 for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
1513 if (IsInPair.find(&*I) == IsInPair.end())
1514 continue;
1515
1516 DenseSet Users;
1517 AliasSetTracker WriteSet(*AA);
1518 if (I->mayWriteToMemory())
1519 WriteSet.add(&*I);
1520
1521 for (BasicBlock::iterator J = std::next(I); J != E; ++J) {
1522 (void)trackUsesOfI(Users, WriteSet, &*I, &*J);
1523
1524 if (J == EL)
1525 break;
1526 }
1527
// Note: this E is a new variable scoped to the loop and shadows the
// basic-block end iterator E declared above.
1528 for (DenseSet::iterator U = Users.begin(), E = Users.end();
1529 U != E; ++U) {
1530 if (IsInPair.find(*U) == IsInPair.end()) continue;
1531 PairableInstUsers.insert(ValuePair(&*I, *U));
1532 }
1533
1534 if (I == EL)
1535 break;
1536 }
1537 }
1538
1539 // Returns true if an input to pair P is an output of pair Q and also an
1540 // input of pair Q is an output of pair P. If this is the case, then these
1541 // two pairs cannot be simultaneously fused.
//
// If PairableInstUserMap is non-null, the one-directional "uses" edges
// discovered here are also recorded in it, with PairableInstUserPairSet used
// to avoid inserting the same edge twice. PairableInstUserPairSet is
// dereferenced whenever PairableInstUserMap is non-null and an edge is found,
// so the two are expected to be supplied together.
1542 bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q,
1543 DenseSet &PairableInstUsers,
1544 DenseMap > *PairableInstUserMap,
1545 DenseSet *PairableInstUserPairSet) {
1546 // Two pairs are in conflict if they are mutual Users of each other.
1547 bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) ||
1548 PairableInstUsers.count(ValuePair(P.first, Q.second)) ||
1549 PairableInstUsers.count(ValuePair(P.second, Q.first)) ||
1550 PairableInstUsers.count(ValuePair(P.second, Q.second));
1551 bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) ||
1552 PairableInstUsers.count(ValuePair(Q.first, P.second)) ||
1553 PairableInstUsers.count(ValuePair(Q.second, P.first)) ||
1554 PairableInstUsers.count(ValuePair(Q.second, P.second));
1555 if (PairableInstUserMap) {
1556 // FIXME: The expensive part of the cycle check is not so much the cycle
1557 // check itself but this edge insertion procedure. This needs some
1558 // profiling and probably a different data structure.
1559 if (PUsesQ) {
1560 if (PairableInstUserPairSet->insert(VPPair(Q, P)).second)
1561 (*PairableInstUserMap)[Q].push_back(P);
1562 }
1563 if (QUsesP) {
1564 if (PairableInstUserPairSet->insert(VPPair(P, Q)).second)
1565 (*PairableInstUserMap)[P].push_back(Q);
1566 }
1567 }
1568
1569 return (QUsesP && PUsesQ);
1570 }
1571
1572 // This function walks the use graph of current pairs to see if, starting
1573 // from P, the walk returns to P.
//
// The walk follows edges in PairableInstUserMap, but only continues through
// pairs that are members of CurrentPairs and have not been visited yet.
// Returns true as soon as an edge leading back to P is found, false if the
// worklist empties first.
1574 bool BBVectorize::pairWillFormCycle(ValuePair P,
1575 DenseMap > &PairableInstUserMap,
1576 DenseSet &CurrentPairs) {
1577 DEBUG(if (DebugCycleCheck)
1578 dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> "
1579 << *P.second << "\n");
1580 // A lookup table of visited pairs is kept because the PairableInstUserMap
1581 // contains non-direct associations.
1582 DenseSet Visited;
1583 SmallVector Q;
1584 // General depth-first post-order traversal:
1585 Q.push_back(P);
1586 do {
1587 ValuePair QTop = Q.pop_back_val();
1588 Visited.insert(QTop);
1589
1590 DEBUG(if (DebugCycleCheck)
1591 dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> "
1592 << *QTop.second << "\n");
1593 DenseMap >::iterator QQ =
1594 PairableInstUserMap.find(QTop);
1595 if (QQ == PairableInstUserMap.end())
1596 continue;
1597
1598 for (std::vector::iterator C = QQ->second.begin(),
1599 CE = QQ->second.end(); C != CE; ++C) {
1600 if (*C == P) {
// NOTE(review): unlike the other debug messages in this function, this one
// streams QTop.first and C->second without dereferencing them, so it
// presumably prints pointer values rather than the instructions - confirm
// whether that is intended.
1601 DEBUG(dbgs()
1602 << "BBV: rejected to prevent non-trivial cycle formation: "
1603 << QTop.first << " <-> " << C->second << "\n");
1604 return true;
1605 }
1606
1607 if (CurrentPairs.count(*C) && !Visited.count(*C))
1608 Q.push_back(*C);
1609 }
1610 } while (!Q.empty());
1611
1612 return false;
1613 }
1614
1615 // This function builds the initial dag of connected pairs with the
1616 // pair J at the root.
//
// The result is written to DAG, which maps each reached pair to a depth
// value. Children are taken from ConnectedPairs and are followed only while
// they are still present in CandidatePairsSet.
// CandidatePairs, PairableInsts, PairableInstUsers and ChosenPairs are not
// referenced in this body.
1617 void BBVectorize::buildInitialDAGFor(
1618 DenseMap > &CandidatePairs,
1619 DenseSet &CandidatePairsSet,
1620 std::vector &PairableInsts,
1621 DenseMap > &ConnectedPairs,
1622 DenseSet &PairableInstUsers,
1623 DenseMap &ChosenPairs,
1624 DenseMap &DAG, ValuePair J) {
1625 // Each of these pairs is viewed as the root node of a DAG. The DAG
1626 // is then walked (depth-first). As this happens, we keep track of
1627 // the pairs that compose the DAG and the maximum depth of the DAG.
1628 SmallVector Q;
1629 // General depth-first post-order traversal:
1630 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
1631 do {
// The top entry is inspected without popping; it is popped only once all of
// its still-candidate children are already in DAG.
1632 ValuePairWithDepth QTop = Q.back();
1633
1634 // Push each child onto the queue:
1635 bool MoreChildren = false;
1636 size_t MaxChildDepth = QTop.second;
1637 DenseMap >::iterator QQ =
1638 ConnectedPairs.find(QTop.first);
1639 if (QQ != ConnectedPairs.end())
1640 for (std::vector::iterator k = QQ->second.begin(),
1641 ke = QQ->second.end(); k != ke; ++k) {
1642 // Make sure that this child pair is still a candidate:
1643 if (CandidatePairsSet.count(*k)) {
1644 DenseMap::iterator C = DAG.find(*k);
1645 if (C == DAG.end()) {
// Unvisited child: schedule it with the parent's accumulated depth plus its
// own depth factor.
1646 size_t d = getDepthFactor(k->first);
1647 Q.push_back(ValuePairWithDepth(*k, QTop.second+d));
1648 MoreChildren = true;
1649 } else {
1650 MaxChildDepth = std::max(MaxChildDepth, C->second);
1651 }
1652 }
1653 }
1654
1655 if (!MoreChildren) {
1656 // Record the current pair as part of the DAG:
1657 DAG.insert(ValuePairWithDepth(QTop.first, MaxChildDepth));
1658 Q.pop_back();
1659 }
1660 } while (!Q.empty());
1661 }
1662
1663 // Given some initial DAG, prune it by removing conflicting pairs (pairs
1664 // that cannot be simultaneously chosen for vectorization).
//
// Starting from J, each iteration pops a pair, records it in PrunedDAG and
// then selects a subset of its connected children (BestChildren) to push for
// later visits. Only children present in DAG are considered. A child is
// dropped when it conflicts with a BestChildren entry of equal or greater
// depth, with any pair in PrunedDAG, with any pending entry in Q, or with any
// pair in ChosenPairs; when UseCycleCheck is set it is also dropped if
// pairWillFormCycle reports a cycle.
//
// CandidatePairs and PairableInsts are not referenced anywhere in this body.
1665 void BBVectorize::pruneDAGFor(
1666 DenseMap > &CandidatePairs,
1667 std::vector &PairableInsts,
1668 DenseMap > &ConnectedPairs,
1669 DenseSet &PairableInstUsers,
1670 DenseMap > &PairableInstUserMap,
1671 DenseSet &PairableInstUserPairSet,
1672 DenseMap &ChosenPairs,
1673 DenseMap &DAG,
1674 DenseSet &PrunedDAG, ValuePair J,
1675 bool UseCycleCheck) {
1676 SmallVector Q;
1677 // General depth-first post-order traversal:
1678 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
1679 do {
1680 ValuePairWithDepth QTop = Q.pop_back_val();
1681 PrunedDAG.insert(QTop.first);
1682
1683 // Visit each child, pruning as necessary...
1684 SmallVector BestChildren;
1685 DenseMap >::iterator QQ =
1686 ConnectedPairs.find(QTop.first);
1687 if (QQ == ConnectedPairs.end())
1688 continue;
1689
1690 for (std::vector::iterator K = QQ->second.begin(),
1691 KE = QQ->second.end(); K != KE; ++K) {
1692 DenseMap::iterator C = DAG.find(*K);
1693 if (C == DAG.end()) continue;
1694
1695 // This child is in the DAG, now we need to make sure it is the
1696 // best of any conflicting children. There could be multiple
1697 // conflicting children, so first, determine if we're keeping
1698 // this child, then delete conflicting children as necessary.
1699
1700 // It is also necessary to guard against pairing-induced
1701 // dependencies. Consider instructions a .. x .. y .. b
1702 // such that (a,b) are to be fused and (x,y) are to be fused
1703 // but a is an input to x and b is an output from y. This
1704 // means that y cannot be moved after b but x must be moved
1705 // after b for (a,b) to be fused. In other words, after
1706 // fusing (a,b) we have y .. a/b .. x where y is an input
1707 // to a/b and x is an output to a/b: x and y can no longer
1708 // be legally fused. To prevent this condition, we must
1709 // make sure that a child pair added to the DAG is not
1710 // both an input and output of an already-selected pair.
1711
1712 // Pairing-induced dependencies can also form from more complicated
1713 // cycles. The pair vs. pair conflicts are easy to check, and so
1714 // that is done explicitly for "fast rejection", and because for
1715 // child vs. child conflicts, we may prefer to keep the current
1716 // pair in preference to the already-selected child.
// Pairs collected by the checks below; handed to pairWillFormCycle at the
// end when UseCycleCheck is set.
1717 DenseSet CurrentPairs;
1718
1719 bool CanAdd = true;
1720 for (SmallVectorImpl::iterator C2
1721 = BestChildren.begin(), E2 = BestChildren.end();
1722 C2 != E2; ++C2) {
1723 if (C2->first.first == C->first.first ||
1724 C2->first.first == C->first.second ||
1725 C2->first.second == C->first.first ||
1726 C2->first.second == C->first.second ||
1727 pairsConflict(C2->first, C->first, PairableInstUsers,
1728 UseCycleCheck ? &PairableInstUserMap : nullptr,
1729 UseCycleCheck ? &PairableInstUserPairSet
1730 : nullptr)) {
// An existing child that is at least as deep wins (>= keeps the earlier
// child on a tie).
1731 if (C2->second >= C->second) {
1732 CanAdd = false;
1733 break;
1734 }
1735
1736 CurrentPairs.insert(C2->first);
1737 }
1738 }
1739 if (!CanAdd) continue;
1740
1741 // Even worse, this child could conflict with another node already
1742 // selected for the DAG. If that is the case, ignore this child.
1743 for (DenseSet::iterator T = PrunedDAG.begin(),
1744 E2 = PrunedDAG.end(); T != E2; ++T) {
1745 if (T->first == C->first.first ||
1746 T->first == C->first.second ||
1747 T->second == C->first.first ||
1748 T->second == C->first.second ||
1749 pairsConflict(*T, C->first, PairableInstUsers,
1750 UseCycleCheck ? &PairableInstUserMap : nullptr,
1751 UseCycleCheck ? &PairableInstUserPairSet
1752 : nullptr)) {
1753 CanAdd = false;
1754 break;
1755 }
1756
1757 CurrentPairs.insert(*T);
1758 }
1759 if (!CanAdd) continue;
1760
1761 // And check the queue too...
1762 for (SmallVectorImpl::iterator C2 = Q.begin(),
1763 E2 = Q.end(); C2 != E2; ++C2) {
1764 if (C2->first.first == C->first.first ||
1765 C2->first.first == C->first.second ||
1766 C2->first.second == C->first.first ||
1767 C2->first.second == C->first.second ||
1768 pairsConflict(C2->first, C->first, PairableInstUsers,
1769 UseCycleCheck ? &PairableInstUserMap : nullptr,
1770 UseCycleCheck ? &PairableInstUserPairSet
1771 : nullptr)) {
1772 CanAdd = false;
1773 break;
1774 }
1775
1776 CurrentPairs.insert(C2->first);
1777 }
1778 if (!CanAdd) continue;
1779
1780 // Last but not least, check for a conflict with any of the
1781 // already-chosen pairs.
1782 for (DenseMap::iterator C2 =
1783 ChosenPairs.begin(), E2 = ChosenPairs.end();
1784 C2 != E2; ++C2) {
1785 if (pairsConflict(*C2, C->first, PairableInstUsers,
1786 UseCycleCheck ? &PairableInstUserMap : nullptr,
1787 UseCycleCheck ? &PairableInstUserPairSet
1788 : nullptr)) {
1789 CanAdd = false;
1790 break;
1791 }
1792
1793 CurrentPairs.insert(*C2);
1794 }
1795 if (!CanAdd) continue;
1796
1797 // To check for non-trivial cycles formed by the addition of the
1798 // current pair we've formed a list of all relevant pairs, now use a
1799 // graph walk to check for a cycle. We start from the current pair and
1800 // walk the use dag to see if we again reach the current pair. If we
1801 // do, then the current pair is rejected.
1802
1803 // FIXME: It may be more efficient to use a topological-ordering
1804 // algorithm to improve the cycle check. This should be investigated.
1805 if (UseCycleCheck &&
1806 pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs))
1807 continue;
1808
1809 // This child can be added, but we may have chosen it in preference
1810 // to an already-selected child. Check for this here, and if a
1811 // conflict is found, then remove the previously-selected child
1812 // before adding this one in its place.
// Note that the pairsConflict call in this loop passes only three arguments,
// unlike the five-argument calls above.
1813 for (SmallVectorImpl::iterator C2
1814 = BestChildren.begin(); C2 != BestChildren.end();) {
1815 if (C2->first.first == C->first.first ||
1816 C2->first.first == C->first.second ||
1817 C2->first.second == C->first.first ||
1818 C2->first.second == C->first.second ||
1819 pairsConflict(C2->first, C->first, PairableInstUsers))
1820 C2 = BestChildren.erase(C2);
1821 else
1822 ++C2;
1823 }
1824
1825 BestChildren.push_back(ValuePairWithDepth(C->first, C->second));
1826 }
1827
// Schedule the surviving children; their depth is the parent's depth plus
// their own depth factor.
1828 for (SmallVectorImpl::iterator C
1829 = BestChildren.begin(), E2 = BestChildren.end();
1830 C != E2; ++C) {
1831 size_t DepthF = getDepthFactor(C->first.first);
1832 Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF));
1833 }
1834 } while (!Q.empty());
1835 }
1836
1837 // This function finds the best DAG of mutually-compatible connected
1838 // pairs, given the choice of root pairs as an iterator range.
//
// Every element of JJ is combined with II into a root pair IJ. A root is
// skipped when it is not in CandidatePairsSet, when it conflicts with a pair
// in ChosenPairs, or (with UseCycleCheck) when pairWillFormCycle reports a
// cycle. For each remaining root a DAG is built and pruned and an effective
// size EffSize is computed. BestDAG, BestMaxDepth and BestEffSize are
// overwritten whenever the acceptance test at the bottom of the loop passes,
// so on return they describe the best root seen (or are left untouched).
1839 void BBVectorize::findBestDAGFor(
1840 DenseMap > &CandidatePairs,
1841 DenseSet &CandidatePairsSet,
1842 DenseMap &CandidatePairCostSavings,
1843 std::vector &PairableInsts,
1844 DenseSet &FixedOrderPairs,
1845 DenseMap &PairConnectionTypes,
1846 DenseMap > &ConnectedPairs,
1847 DenseMap > &ConnectedPairDeps,
1848 DenseSet &PairableInstUsers,
1849 DenseMap > &PairableInstUserMap,
1850 DenseSet &PairableInstUserPairSet,
1851 DenseMap &ChosenPairs,
1852 DenseSet &BestDAG, size_t &BestMaxDepth,
1853 int &BestEffSize, Value *II, std::vector&JJ,
1854 bool UseCycleCheck) {
1855 for (std::vector::iterator J = JJ.begin(), JE = JJ.end();
1856 J != JE; ++J) {
1857 ValuePair IJ(II, *J);
1858 if (!CandidatePairsSet.count(IJ))
1859 continue;
1860
1861 // Before going any further, make sure that this pair does not
1862 // conflict with any already-selected pairs (see comment below
1863 // near the DAG pruning for more details).
1864 DenseSet ChosenPairSet;
1865 bool DoesConflict = false;
1866 for (DenseMap::iterator C = ChosenPairs.begin(),
1867 E = ChosenPairs.end(); C != E; ++C) {
1868 if (pairsConflict(*C, IJ, PairableInstUsers,
1869 UseCycleCheck ? &PairableInstUserMap : nullptr,
1870 UseCycleCheck ? &PairableInstUserPairSet : nullptr)) {
1871 DoesConflict = true;
1872 break;
1873 }
1874
1875 ChosenPairSet.insert(*C);
1876 }
1877 if (DoesConflict) continue;
1878
1879 if (UseCycleCheck &&
1880 pairWillFormCycle(IJ, PairableInstUserMap, ChosenPairSet))
1881 continue;
1882
1883 DenseMap DAG;
1884 buildInitialDAGFor(CandidatePairs, CandidatePairsSet,
1885 PairableInsts, ConnectedPairs,
1886 PairableInstUsers, ChosenPairs, DAG, IJ);
1887
1888 // Because we'll keep the child with the largest depth, the largest
1889 // depth is still the same in the unpruned DAG.
1890 size_t MaxDepth = DAG.lookup(IJ);
1891
1892 DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {"
1893 << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
1894 MaxDepth << " and size " << DAG.size() << "\n");
1895
1896 // At this point the DAG has been constructed, but, may contain
1897 // contradictory children (meaning that different children of
1898 // some dag node may be attempting to fuse the same instruction).
1899 // So now we walk the dag again, in the case of a conflict,
1900 // keep only the child with the largest depth. To break a tie,
1901 // favor the first child.
1902
1903 DenseSet PrunedDAG;
1904 pruneDAGFor(CandidatePairs, PairableInsts, ConnectedPairs,
1905 PairableInstUsers, PairableInstUserMap,
1906 PairableInstUserPairSet,
1907 ChosenPairs, DAG, PrunedDAG, IJ, UseCycleCheck);
1908
// EffSize is computed in one of two ways. With TTI it is a cost balance:
// per-pair savings are added and the costs of edges between pairs, of
// outgoing edges (extractions) and of incoming edges (operand formation) are
// subtracted. Without TTI (the else branch far below) it is the sum of the
// depth factors of the pairs in PrunedDAG.
1909 int EffSize = 0;
1910 if (TTI) {
// Flat set of every value that appears in some pruned pair; used below to
// recognise users that are themselves part of the DAG.
1911 DenseSet PrunedDAGInstrs;
1912 for (DenseSet::iterator S = PrunedDAG.begin(),
1913 E = PrunedDAG.end(); S != E; ++S) {
1914 PrunedDAGInstrs.insert(S->first);
1915 PrunedDAGInstrs.insert(S->second);
1916 }
1917
1918 // The set of pairs that have already contributed to the total cost.
1919 DenseSet IncomingPairs;
1920
1921 // If the cost model were perfect, this might not be necessary; but we
1922 // need to make sure that we don't get stuck vectorizing our own
1923 // shuffle chains.
1924 bool HasNontrivialInsts = false;
1925
1926 // The node weights represent the cost savings associated with
1927 // fusing the pair of instructions.
1928 for (DenseSet::iterator S = PrunedDAG.begin(),
1929 E = PrunedDAG.end(); S != E; ++S) {
1930 if (!isa(S->first) &&
1931 !isa(S->first) &&
1932 !isa(S->first))
1933 HasNontrivialInsts = true;
1934
1935 bool FlipOrder = false;
1936
1937 if (getDepthFactor(S->first)) {
1938 int ESContrib = CandidatePairCostSavings.find(*S)->second;
1939 DEBUG(if (DebugPairSelection) dbgs() << "\tweight {"
1940 << *S->first << " <-> " << *S->second << "} = " <<
1941 ESContrib << "\n");
1942 EffSize += ESContrib;
1943 }
1944
1945 // The edge weights contribute in a negative sense: they represent
1946 // the cost of shuffles.
1947 DenseMap >::iterator SS =
1948 ConnectedPairDeps.find(*S);
1949 if (SS != ConnectedPairDeps.end()) {
// First pass over the dependencies: count direct vs. swapped connections
// so that FlipOrder can be decided before any cost is charged.
1950 unsigned NumDepsDirect = 0, NumDepsSwap = 0;
1951 for (std::vector::iterator T = SS->second.begin(),
1952 TE = SS->second.end(); T != TE; ++T) {
1953 VPPair Q(*S, *T);
1954 if (!PrunedDAG.count(Q.second))
1955 continue;
1956 DenseMap::iterator R =
1957 PairConnectionTypes.find(VPPair(Q.second, Q.first));
1958 assert(R != PairConnectionTypes.end() &&
1959 "Cannot find pair connection type");
1960 if (R->second == PairConnectionDirect)
1961 ++NumDepsDirect;
1962 else if (R->second == PairConnectionSwap)
1963 ++NumDepsSwap;
1964 }
1965
1966 // If there are more swaps than direct connections, then
1967 // the pair order will be flipped during fusion. So the real
1968 // number of swaps is the minimum number.
1969 FlipOrder = !FixedOrderPairs.count(*S) &&
1970 ((NumDepsSwap > NumDepsDirect) ||
1971 FixedOrderPairs.count(ValuePair(S->second, S->first)));
1972
// Second pass: charge a shuffle for every dependency whose connection type
// does not match the chosen order, and for every splat.
1973 for (std::vector::iterator T = SS->second.begin(),
1974 TE = SS->second.end(); T != TE; ++T) {
1975 VPPair Q(*S, *T);
1976 if (!PrunedDAG.count(Q.second))
1977 continue;
1978 DenseMap::iterator R =
1979 PairConnectionTypes.find(VPPair(Q.second, Q.first));
1980 assert(R != PairConnectionTypes.end() &&
1981 "Cannot find pair connection type");
1982 Type *Ty1 = Q.second.first->getType(),
1983 *Ty2 = Q.second.second->getType();
1984 Type *VTy = getVecTypeForPair(Ty1, Ty2);
1985 if ((R->second == PairConnectionDirect && FlipOrder) ||
1986 (R->second == PairConnectionSwap && !FlipOrder) ||
1987 R->second == PairConnectionSplat) {
1988 int ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
1989 VTy, VTy);
1990
1991 if (VTy->getVectorNumElements() == 2) {
1992 if (R->second == PairConnectionSplat)
1993 ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
1994 TargetTransformInfo::SK_Broadcast, VTy));
1995 else
1996 ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
1997 TargetTransformInfo::SK_Reverse, VTy));
1998 }
1999
2000 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
2001 *Q.second.first << " <-> " << *Q.second.second <<
2002 "} -> {" <<
2003 *S->first << " <-> " << *S->second << "} = " <<
2004 ESContrib << "\n");
2005 EffSize -= ESContrib;
2006 }
2007 }
2008 }
2009
2010 // Compute the cost of outgoing edges. We assume that edges outgoing
2011 // to shuffles, inserts or extracts can be merged, and so contribute
2012 // no additional cost.
2013 if (!S->first->getType()->isVoidTy()) {
2014 Type *Ty1 = S->first->getType(),
2015 *Ty2 = S->second->getType();
2016 Type *VTy = getVecTypeForPair(Ty1, Ty2);
2017
// The first member needs an extraction if any user is neither skipped by
// the checks below nor itself part of the pruned DAG.
2018 bool NeedsExtraction = false;
2019 for (User *U : S->first->users()) {
2020 if (ShuffleVectorInst *SI = dyn_cast(U)) {
2021 // Shuffle can be folded if it has no other input
2022 if (isa(SI->getOperand(1)))
2023 continue;
2024 }
2025 if (isa(U))
2026 continue;
2027 if (PrunedDAGInstrs.count(U))
2028 continue;
2029 NeedsExtraction = true;
2030 break;
2031 }
2032
2033 if (NeedsExtraction) {
2034 int ESContrib;
2035 if (Ty1->isVectorTy()) {
2036 ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
2037 Ty1, VTy);
2038 ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
2039 TargetTransformInfo::SK_ExtractSubvector, VTy, 0, Ty1));
2040 } else
2041 ESContrib = (int) TTI->getVectorInstrCost(
2042 Instruction::ExtractElement, VTy, 0);
2043
2044 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
2045 *S->first << "} = " << ESContrib << "\n");
2046 EffSize -= ESContrib;
2047 }
2048
// Same check for the second member; its elements start after those of the
// first member in the fused vector (index 1, or Ty1's element count).
2049 NeedsExtraction = false;
2050 for (User *U : S->second->users()) {
2051 if (ShuffleVectorInst *SI = dyn_cast(U)) {
2052 // Shuffle can be folded if it has no other input
2053 if (isa(SI->getOperand(1)))
2054 continue;
2055 }
2056 if (isa(U))
2057 continue;
2058 if (PrunedDAGInstrs.count(U))
2059 continue;
2060 NeedsExtraction = true;
2061 break;
2062 }
2063
2064 if (NeedsExtraction) {
2065 int ESContrib;
2066 if (Ty2->isVectorTy()) {
2067 ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
2068 Ty2, VTy);
2069 ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
2070 TargetTransformInfo::SK_ExtractSubvector, VTy,
2071 Ty1->isVectorTy() ? Ty1->getVectorNumElements() : 1, Ty2));
2072 } else
2073 ESContrib = (int) TTI->getVectorInstrCost(
2074 Instruction::ExtractElement, VTy, 1);
2075 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" <<
2076 *S->second << "} = " << ESContrib << "\n");
2077 EffSize -= ESContrib;
2078 }
2079 }
2080
2081 // Compute the cost of incoming edges.
2082 if (!isa(S->first) && !isa(S->first)) {
2083 Instruction *S1 = cast(S->first),
2084 *S2 = cast(S->second);
2085 for (unsigned o = 0; o < S1->getNumOperands(); ++o) {
2086 Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o);
2087
2088 // Combining constants into vector constants (or small vector
2089 // constants into larger ones) is assumed free.
2090 if (isa(O1) && isa(O2))
2091 continue;
2092
2093 if (FlipOrder)
2094 std::swap(O1, O2);
2095
2096 ValuePair VP = ValuePair(O1, O2);
2097 ValuePair VPR = ValuePair(O2, O1);
2098
2099 // Internal edges are not handled here.
2100 if (PrunedDAG.count(VP) || PrunedDAG.count(VPR))
2101 continue;
2102
2103 Type *Ty1 = O1->getType(),
2104 *Ty2 = O2->getType();
2105 Type *VTy = getVecTypeForPair(Ty1, Ty2);
2106
2107 // Combining vector operations of the same type is also assumed
2108 // folded with other operations.
2109 if (Ty1 == Ty2) {
2110 // If both are insert elements, then both can be widened.
2111 InsertElementInst *IEO1 = dyn_cast(O1),
2112 *IEO2 = dyn_cast(O2);
2113 if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2))
2114 continue;
2115 // If both are extract elements, and both have the same input
2116 // type, then they can be replaced with a shuffle
2117 ExtractElementInst *EIO1 = dyn_cast(O1),
2118 *EIO2 = dyn_cast(O2);
2119 if (EIO1 && EIO2 &&
2120 EIO1->getOperand(0)->getType() ==
2121 EIO2->getOperand(0)->getType())
2122 continue;
2123 // If both are a shuffle with equal operand types and only two
2124 // unique operands, then they can be replaced with a single
2125 // shuffle
2126 ShuffleVectorInst *SIO1 = dyn_cast(O1),
2127 *SIO2 = dyn_cast(O2);
2128 if (SIO1 && SIO2 &&
2129 SIO1->getOperand(0)->getType() ==
2130 SIO2->getOperand(0)->getType()) {
2131 SmallSet SIOps;
2132 SIOps.insert(SIO1->getOperand(0));
2133 SIOps.insert(SIO1->getOperand(1));
2134 SIOps.insert(SIO2->getOperand(0));
2135 SIOps.insert(SIO2->getOperand(1));
2136 if (SIOps.size() <= 2)
2137 continue;
2138 }
2139 }
2140
// Cost of forming the operand pair (O1, O2); the case is chosen by whether
// the pair (or its reverse) was already charged and by which of the two
// operand types are vectors.
2141 int ESContrib;
2142 // This pair has already been formed.
2143 if (IncomingPairs.count(VP)) {
2144 continue;
2145 } else if (IncomingPairs.count(VPR)) {
2146 ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
2147 VTy, VTy);
2148
2149 if (VTy->getVectorNumElements() == 2)
2150 ESContrib = std::min(ESContrib, (int) TTI->getShuffleCost(
2151 TargetTransformInfo::SK_Reverse, VTy));
2152 } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) {
2153 ESContrib = (int) TTI->getVectorInstrCost(
2154 Instruction::InsertElement, VTy, 0);
2155 ESContrib += (int) TTI->getVectorInstrCost(
2156 Instruction::InsertElement, VTy, 1);
2157 } else if (!Ty1->isVectorTy()) {
2158 // O1 needs to be inserted into a vector of size O2, and then
2159 // both need to be shuffled together.
2160 ESContrib = (int) TTI->getVectorInstrCost(
2161 Instruction::InsertElement, Ty2, 0);
2162 ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
2163 VTy, Ty2);
2164 } else if (!Ty2->isVectorTy()) {
2165 // O2 needs to be inserted into a vector of size O1, and then
2166 // both need to be shuffled together.
2167 ESContrib = (int) TTI->getVectorInstrCost(
2168 Instruction::InsertElement, Ty1, 0);
2169 ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
2170 VTy, Ty1);
2171 } else {
2172 Type *TyBig = Ty1, *TySmall = Ty2;
2173 if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements())
2174 std::swap(TyBig, TySmall);
2175
2176 ESContrib = (int) getInstrCost(Instruction::ShuffleVector,
2177 VTy, TyBig);
2178 if (TyBig != TySmall)
2179 ESContrib += (int) getInstrCost(Instruction::ShuffleVector,
2180 TyBig, TySmall);
2181 }
2182
2183 DEBUG(if (DebugPairSelection) dbgs() << "\tcost {"
2184 << *O1 << " <-> " << *O2 << "} = " <<
2185 ESContrib << "\n");
2186 EffSize -= ESContrib;
2187 IncomingPairs.insert(VP);
2188 }
2189 }
2190 }
2191
2192 if (!HasNontrivialInsts) {
2193 DEBUG(if (DebugPairSelection) dbgs() <<
2194 "\tNo non-trivial instructions in DAG;"
2195 " override to zero effective size\n");
2196 EffSize = 0;
2197 }
2198 } else {
2199 for (DenseSet::iterator S = PrunedDAG.begin(),
2200 E = PrunedDAG.end(); S != E; ++S)
2201 EffSize += (int) getDepthFactor(S->first);
2202 }
2203
2204 DEBUG(if (DebugPairSelection)
2205 dbgs() << "BBV: found pruned DAG for pair {"
2206 << *IJ.first << " <-> " << *IJ.second << "} of depth " <<
2207 MaxDepth << " and size " << PrunedDAG.size() <<
2208 " (effective size: " << EffSize << ")\n");
// Accept this root only if EffSize is positive and strictly better than the
// best so far. The required-chain-depth test is bypassed when TTI is present
// and UseChainDepthWithTI is off.
2209 if (((TTI && !UseChainDepthWithTI) ||
2210 MaxDepth >= Config.ReqChainDepth) &&
2211 EffSize > 0 && EffSize > BestEffSize) {
2212 BestMaxDepth = MaxDepth;
2213 BestEffSize = EffSize;
2214 BestDAG = PrunedDAG;
2215 }
2216 }
2217 }
2218
2219 // Given the list of candidate pairs, this function selects those
2220 // that will be fused into vector instructions.
//
// For every element of PairableInsts that still has candidate partners, the
// best DAG rooted at one of its pairs is requested from findBestDAGFor. If
// one is returned, all of its pairs are added to ChosenPairs and every other
// candidate pair that shares a value with them is erased from
// CandidatePairsSet. The cycle check is enabled only while the number of
// candidate pairs does not exceed Config.MaxCandPairsForCycleCheck.
2221 void BBVectorize::choosePairs(
2222 DenseMap > &CandidatePairs,
2223 DenseSet &CandidatePairsSet,
2224 DenseMap &CandidatePairCostSavings,
2225 std::vector &PairableInsts,
2226 DenseSet &FixedOrderPairs,
2227 DenseMap &PairConnectionTypes,
2228 DenseMap > &ConnectedPairs,
2229 DenseMap > &ConnectedPairDeps,
2230 DenseSet &PairableInstUsers,
2231 DenseMap& ChosenPairs) {
2232 bool UseCycleCheck =
2233 CandidatePairsSet.size() <= Config.MaxCandPairsForCycleCheck;
2234
// Reverse index: maps the second member of each candidate pair to the list
// of first members it is paired with.
2235 DenseMap > CandidatePairs2;
2236 for (DenseSet::iterator I = CandidatePairsSet.begin(),
2237 E = CandidatePairsSet.end(); I != E; ++I) {
2238 std::vector &JJ = CandidatePairs2[I->second];
2239 if (JJ.empty()) JJ.reserve(32);
2240 JJ.push_back(I->first);
2241 }
2242
// Both start empty here and are handed to findBestDAGFor on every iteration.
2243 DenseMap > PairableInstUserMap;
2244 DenseSet PairableInstUserPairSet;
2245 for (std::vector::iterator I = PairableInsts.begin(),
2246 E = PairableInsts.end(); I != E; ++I) {
2247 // The number of possible pairings for this variable:
// NOTE(review): lookup() is used only for its size here; presumably it
// returns the mapped vector by value - confirm against DenseMap's API.
2248 size_t NumChoices = CandidatePairs.lookup(*I).size();
2249 if (!NumChoices) continue;
2250
2251 std::vector &JJ = CandidatePairs[*I];
2252
2253 // The best pair to choose and its dag:
2254 size_t BestMaxDepth = 0;
2255 int BestEffSize = 0;
2256 DenseSet BestDAG;
2257 findBestDAGFor(CandidatePairs, CandidatePairsSet,
2258 CandidatePairCostSavings,
2259 PairableInsts, FixedOrderPairs, PairConnectionTypes,
2260 ConnectedPairs, ConnectedPairDeps,
2261 PairableInstUsers, PairableInstUserMap,
2262 PairableInstUserPairSet, ChosenPairs,
2263 BestDAG, BestMaxDepth, BestEffSize, *I, JJ,
2264 UseCycleCheck);
2265
// An empty BestDAG means no DAG was chosen for this instruction.
2266 if (BestDAG.empty())
2267 continue;
2268
2269 // A dag has been chosen (or not) at this point. If no dag was
2270 // chosen, then this instruction, I, cannot be paired (and is no longer
2271 // considered).
2272
2273 DEBUG(dbgs() << "BBV: selected pairs in the best DAG for: "
2274 << *cast(*I) << "\n");
2275
2276 for (DenseSet::iterator S = BestDAG.begin(),
2277 SE2 = BestDAG.end(); S != SE2; ++S) {
2278 // Insert the members of this dag into the list of chosen pairs.
2279 ChosenPairs.insert(ValuePair(S->first, S->second));
2280 DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " <<
2281 *S->second << "\n");
2282
2283 // Remove all candidate pairs that have values in the chosen dag.
// The four loops below cover the four ways another candidate pair can share
// a value with *S. The chosen pair itself is skipped, so it stays in
// CandidatePairsSet.
// (S->first, *) other than the chosen pair:
2284 std::vector &KK = CandidatePairs[S->first];
2285 for (std::vector::iterator K = KK.begin(), KE = KK.end();
2286 K != KE; ++K) {
2287 if (*K == S->second)
2288 continue;
2289
2290 CandidatePairsSet.erase(ValuePair(S->first, *K));
2291 }
2292
// (*, S->second) other than the chosen pair:
2293 std::vector &LL = CandidatePairs2[S->second];
2294 for (std::vector::iterator L = LL.begin(), LE = LL.end();
2295 L != LE; ++L) {
2296 if (*L == S->first)
2297 continue;
2298
2299 CandidatePairsSet.erase(ValuePair(*L, S->second));
2300 }
2301
// (S->second, *):
2302 std::vector &MM = CandidatePairs[S->second];
2303 for (std::vector::iterator M = MM.begin(), ME = MM.end();
2304 M != ME; ++M) {
2305 assert(*M != S->first && "Flipped pair in candidate list?");
2306 CandidatePairsSet.erase(ValuePair(S->second, *M));
2307 }
2308
// (*, S->first):
2309 std::vector &NN = CandidatePairs2[S->first];
2310 for (std::vector::iterator N = NN.begin(), NE = NN.end();
2311 N != NE; ++N) {
2312 assert(*N != S->second && "Flipped pair in candidate list?");
2313 CandidatePairsSet.erase(ValuePair(*N, S->first));
2314 }
2315 }
2316 }
2317
2318 DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n");
2319 }
2320
2321 std::string getReplacementName(Instruction *I, bool IsInput, unsigned o,
2322 unsigned n = 0) {
2323 if (!I->hasName())
2324 return "";
2325
2326 return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) +
2327 (n > 0 ? "." + utostr(n) : "")).str();
2328 }
2329
2330 // Returns the value that is to be used as the pointer input to the vector
2331 // instruction that fuses I with J.
2332 Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context,
2333 Instruction *I, Instruction *J, unsigned o) {
2334 Value *IPtr, *JPtr;
2335 unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace;
2336 int64_t OffsetInElmts;
2337
2338 // Note: the analysis might fail here, that is why the pair order has
2339 // been precomputed (OffsetInElmts must be unused here).
2340 (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
2341 IAddressSpace, JAddressSpace,
2342 OffsetInElmts, false);
2343
2344 // The pointer value is taken to be the one with the lowest offset.
2345 Value *VPtr = IPtr;
2346
2347 Type *ArgTypeI = IPtr->getType()->getPointerElementType();
2348 Type *ArgTypeJ = JPtr->getType()->getPointerElementType();
2349 Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2350 Type *VArgPtrType
2351 = PointerType::get(VArgType,
2352 IPtr->getType()->getPointerAddressSpace());
2353 return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
2354 /* insert before */ I);
2355 }
2356
2357 void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J,
2358 unsigned MaskOffset, unsigned NumInElem,
2359 unsigned NumInElem1, unsigned IdxOffset,
2360 std::vector &Mask) {
2361 unsigned NumElem1 = J->getType()->getVectorNumElements();
2362 for (unsigned v = 0; v < NumElem1; ++v) {
2363 int m = cast(J)->getMaskValue(v);
2364 if (m < 0) {
2365 Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context));
2366 } else {
2367 unsigned mm = m + (int) IdxOffset;
2368 if (m >= (int) NumInElem1)
2369 mm += (int) NumInElem;
2370
2371 Mask[v+MaskOffset] =
2372 ConstantInt::get(Type::getInt32Ty(Context), mm);
2373 }
2374 }
2375 }
2376
2377 // Returns the value that is to be used as the vector-shuffle mask to the
2378 // vector instruction that fuses I with J.
2379 Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context,
2380 Instruction *I, Instruction *J) {
2381 // This is the shuffle mask. We need to append the second
2382 // mask to the first, and the numbers need to be adjusted.
2383
2384 Type *ArgTypeI = I->getType();
2385 Type *ArgTypeJ = J->getType();
2386 Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2387
2388 unsigned NumElemI = ArgTypeI->getVectorNumElements();
2389
2390 // Get the total number of elements in the fused vector type.
2391 // By definition, this must equal the number of elements in
2392 // the final mask.
2393 unsigned NumElem = VArgType->getVectorNumElements();
2394 std::vector Mask(NumElem);
2395
2396 Type *OpTypeI = I->getOperand(0)->getType();
2397 unsigned NumInElemI = OpTypeI->getVectorNumElements();
2398 Type *OpTypeJ = J->getOperand(0)->getType();
2399 unsigned NumInElemJ = OpTypeJ->getVectorNumElements();
2400
2401 // The fused vector will be:
2402 // -----------------------------------------------------
2403 // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ |
2404 // -----------------------------------------------------
2405 // from which we'll extract NumElem total elements (where the first NumElemI
2406 // of them come from the mask in I and the remainder come from the mask
2407 // in J.
2408
2409 // For the mask from the first pair...
2410 fillNewShuffleMask(Context, I, 0, NumInElemJ, NumInElemI,
2411 0, Mask);
2412
2413 // For the mask from the second pair...
2414 fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ,
2415 NumInElemI, Mask);
2416
2417 return ConstantVector::get(Mask);
2418 }
2419
2420 bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I,
2421 Instruction *J, unsigned o, Value *&LOp,
2422 unsigned numElemL,
2423 Type *ArgTypeL, Type *ArgTypeH,
2424 bool IBeforeJ, unsigned IdxOff) {
2425 bool ExpandedIEChain = false;
2426 if (InsertElementInst *LIE = dyn_cast(LOp)) {
2427 // If we have a pure insertelement chain, then this can be rewritten
2428 // into a chain that directly builds the larger type.
2429 if (isPureIEChain(LIE)) {
2430 SmallVector VectElemts(numElemL,
2431 UndefValue::get(ArgTypeL->getScalarType()));
2432 InsertElementInst *LIENext = LIE;
2433 do {
2434 unsigned Idx =
2435 cast(LIENext->getOperand(2))->getSExtValue();
2436 VectElemts[Idx] = LIENext->getOperand(1);
2437 } while ((LIENext =
2438 dyn_cast(LIENext->getOperand(0))));
2439
2440 LIENext = nullptr;
2441 Value *LIEPrev = UndefValue::get(ArgTypeH);
2442 for (unsigned i = 0; i < numElemL; ++i) {
2443 if (isa(VectElemts[i])) continue;
2444 LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i],
2445 ConstantInt::get(Type::getInt32Ty(Context),
2446 i + IdxOff),
2447 getReplacementName(IBeforeJ ? I : J,
2448 true, o, i+1));
2449 LIENext->insertBefore(IBeforeJ ? J : I);
2450 LIEPrev = LIENext;
2451 }
2452
2453 LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH);
2454 ExpandedIEChain = true;
2455 }
2456 }
2457
2458 return ExpandedIEChain;
2459 }
2460
2461 static unsigned getNumScalarElements(Type *Ty) {
2462 if (VectorType *VecTy = dyn_cast(Ty))
2463 return VecTy->getNumElements();
2464 return 1;
2465 }
2466
2467 // Returns the value to be used as the specified operand of the vector
2468 // instruction that fuses I with J.
2469 Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
2470 Instruction *J, unsigned o, bool IBeforeJ) {
2471 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
2472 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
2473
2474 // Compute the fused vector type for this operand
2475 Type *ArgTypeI = I->getOperand(o)->getType();
2476 Type *ArgTypeJ = J->getOperand(o)->getType();
2477 VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2478
2479 Instruction *L = I, *H = J;
2480 Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ;
2481
2482 unsigned numElemL = getNumScalarElements(ArgTypeL);
2483 unsigned numElemH = getNumScalarElements(ArgTypeH);
2484
2485 Value *LOp = L->getOperand(o);
2486 Value *HOp = H->getOperand(o);
2487 unsigned numElem = VArgType->getNumElements();
2488
2489 // First, we check if we can reuse the "original" vector outputs (if these
2490 // exist). We might need a shuffle.
2491 ExtractElementInst *LEE = dyn_cast(LOp);
2492 ExtractElementInst *HEE = dyn_cast(HOp);
2493 ShuffleVectorInst *LSV = dyn_cast(LOp);
2494 ShuffleVectorInst *HSV = dyn_cast(HOp);
2495
2496 // FIXME: If we're fusing shuffle instructions, then we can't apply this
2497 // optimization. The input vectors to the shuffle might be a different
2498 // length from the shuffle outputs. Unfortunately, the replacement
2499 // shuffle mask has already been formed, and the mask entries are sensitive
2500 // to the sizes of the inputs.
2501 bool IsSizeChangeShuffle =
2502 isa(L) &&
2503 (LOp->getType() != L->getType() || HOp->getType() != H->getType());
2504
2505 if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) {
2506 // We can have at most two unique vector inputs.
2507 bool CanUseInputs = true;
2508 Value *I1, *I2 = nullptr;
2509 if (LEE) {
2510 I1 = LEE->getOperand(0);
2511 } else {
2512 I1 = LSV->getOperand(0);
2513 I2 = LSV->getOperand(1);
2514 if (I2 == I1 || isa(I2))
2515 I2 = nullptr;
2516 }
2517
2518 if (HEE) {
2519 Value *I3 = HEE->getOperand(0);
2520 if (!I2 && I3 != I1)
2521 I2 = I3;
2522 else if (I3 != I1 && I3 != I2)
2523 CanUseInputs = false;
2524 } else {
2525 Value *I3 = HSV->getOperand(0);
2526 if (!I2 && I3 != I1)
2527 I2 = I3;
2528 else if (I3 != I1 && I3 != I2)
2529 CanUseInputs = false;
2530
2531 if (CanUseInputs) {
2532 Value *I4 = HSV->getOperand(1);
2533 if (!isa(I4)) {
2534 if (!I2 && I4 != I1)
2535 I2 = I4;
2536 else if (I4 != I1 && I4 != I2)
2537 CanUseInputs = false;
2538 }
2539 }
2540 }
2541
2542 if (CanUseInputs) {
2543 unsigned LOpElem =
2544 cast(LOp)->getOperand(0)->getType()
2545 ->getVectorNumElements();
2546
2547 unsigned HOpElem =
2548 cast(HOp)->getOperand(0)->getType()
2549 ->getVectorNumElements();
2550
2551 // We have one or two input vectors. We need to map each index of the
2552 // operands to the index of the original vector.
2553 SmallVector, 8> II(numElem);
2554 for (unsigned i = 0; i < numElemL; ++i) {
2555 int Idx, INum;
2556 if (LEE) {
2557 Idx =
2558 cast(LEE->getOperand(1))->getSExtValue();
2559 INum = LEE->getOperand(0) == I1 ? 0 : 1;
2560 } else {
2561 Idx = LSV->getMaskValue(i);
2562 if (Idx < (int) LOpElem) {
2563 INum = LSV->getOperand(0) == I1 ? 0 : 1;
2564 } else {
2565 Idx -= LOpElem;
2566 INum = LSV->getOperand(1) == I1 ? 0 : 1;
2567 }
2568 }
2569
2570 II[i] = std::pair(Idx, INum);
2571 }
2572 for (unsigned i = 0; i < numElemH; ++i) {
2573 int Idx, INum;
2574 if (HEE) {
2575 Idx =
2576 cast(HEE->getOperand(1))->getSExtValue();
2577 INum = HEE->getOperand(0) == I1 ? 0 : 1;
2578 } else {
2579 Idx = HSV->getMaskValue(i);
2580 if (Idx < (int) HOpElem) {
2581 INum = HSV->getOperand(0) == I1 ? 0 : 1;
2582 } else {
2583 Idx -= HOpElem;
2584 INum = HSV->getOperand(1) == I1 ? 0 : 1;
2585 }
2586 }
2587
2588 II[i + numElemL] = std::pair(Idx, INum);
2589 }
2590
2591 // We now have an array which tells us from which index of which
2592 // input vector each element of the operand comes.
2593 VectorType *I1T = cast(I1->getType());
2594 unsigned I1Elem = I1T->getNumElements();
2595
2596 if (!I2) {
2597 // In this case there is only one underlying vector input. Check for
2598 // the trivial case where we can use the input directly.
2599 if (I1Elem == numElem) {
2600 bool ElemInOrder = true;
2601 for (unsigned i = 0; i < numElem; ++i) {
2602 if (II[i].first != (int) i && II[i].first != -1) {
2603 ElemInOrder = false;
2604 break;
2605 }
2606 }
2607
2608 if (ElemInOrder)
2609 return I1;
2610 }
2611
2612 // A shuffle is needed.
2613 std::vector Mask(numElem);
2614 for (unsigned i = 0; i < numElem; ++i) {
2615 int Idx = II[i].first;
2616 if (Idx == -1)
2617 Mask[i] = UndefValue::get(Type::getInt32Ty(Context));
2618 else
2619 Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
2620 }
2621
2622 Instruction *S =
2623 new ShuffleVectorInst(I1, UndefValue::get(I1T),
2624 ConstantVector::get(Mask),
2625 getReplacementName(IBeforeJ ? I : J,
2626 true, o));
2627 S->insertBefore(IBeforeJ ? J : I);
2628 return S;
2629 }
2630
2631 VectorType *I2T = cast(I2->getType());
2632 unsigned I2Elem = I2T->getNumElements();
2633
2634 // This input comes from two distinct vectors. The first step is to
2635 // make sure that both vectors are the same length. If not, the
2636 // smaller one will need to grow before they can be shuffled together.
2637 if (I1Elem < I2Elem) {
2638 std::vector Mask(I2Elem);
2639 unsigned v = 0;
2640 for (; v < I1Elem; ++v)
2641 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2642 for (; v < I2Elem; ++v)
2643 Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2644
2645 Instruction *NewI1 =
2646 new ShuffleVectorInst(I1, UndefValue::get(I1T),
2647 ConstantVector::get(Mask),
2648 getReplacementName(IBeforeJ ? I : J,
2649 true, o, 1));
2650 NewI1->insertBefore(IBeforeJ ? J : I);
2651 I1 = NewI1;
2652 I1Elem = I2Elem;
2653 } else if (I1Elem > I2Elem) {
2654 std::vector Mask(I1Elem);
2655 unsigned v = 0;
2656 for (; v < I2Elem; ++v)
2657 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2658 for (; v < I1Elem; ++v)
2659 Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2660
2661 Instruction *NewI2 =
2662 new ShuffleVectorInst(I2, UndefValue::get(I2T),
2663 ConstantVector::get(Mask),
2664 getReplacementName(IBeforeJ ? I : J,
2665 true, o, 1));
2666 NewI2->insertBefore(IBeforeJ ? J : I);
2667 I2 = NewI2;
2668 }
2669
2670 // Now that both I1 and I2 are the same length we can shuffle them
2671 // together (and use the result).
2672 std::vector Mask(numElem);
2673 for (unsigned v = 0; v < numElem; ++v) {
2674 if (II[v].first == -1) {
2675 Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2676 } else {
2677 int Idx = II[v].first + II[v].second * I1Elem;
2678 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
2679 }
2680 }
2681
2682 Instruction *NewOp =
2683 new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask),
2684 getReplacementName(IBeforeJ ? I : J, true, o));
2685 NewOp->insertBefore(IBeforeJ ? J : I);
2686 return NewOp;
2687 }
2688 }
2689
2690 Type *ArgType = ArgTypeL;
2691 if (numElemL < numElemH) {
2692 if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH,
2693 ArgTypeL, VArgType, IBeforeJ, 1)) {
2694 // This is another short-circuit case: we're combining a scalar into
2695 // a vector that is formed by an IE chain. We've just expanded the IE
2696 // chain, now insert the scalar and we're done.
2697
2698 Instruction *S = InsertElementInst::Create(HOp, LOp, CV0,
2699 getReplacementName(IBeforeJ ? I : J, true, o));
2700 S->insertBefore(IBeforeJ ? J : I);
2701 return S;
2702 } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL,
2703 ArgTypeH, IBeforeJ)) {
2704 // The two vector inputs to the shuffle must be the same length,
2705 // so extend the smaller vector to be the same length as the larger one.
2706 Instruction *NLOp;
2707 if (numElemL > 1) {
2708
2709 std::vector Mask(numElemH);
2710 unsigned v = 0;
2711 for (; v < numElemL; ++v)
2712 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2713 for (; v < numElemH; ++v)
2714 Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2715
2716 NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL),
2717 ConstantVector::get(Mask),
2718 getReplacementName(IBeforeJ ? I : J,
2719 true, o, 1));
2720 } else {
2721 NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0,
2722 getReplacementName(IBeforeJ ? I : J,
2723 true, o, 1));
2724 }
2725
2726 NLOp->insertBefore(IBeforeJ ? J : I);
2727 LOp = NLOp;
2728 }
2729
2730 ArgType = ArgTypeH;
2731 } else if (numElemL > numElemH) {
2732 if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL,
2733 ArgTypeH, VArgType, IBeforeJ)) {
2734 Instruction *S =
2735 InsertElementInst::Create(LOp, HOp,
2736 ConstantInt::get(Type::getInt32Ty(Context),
2737 numElemL),
2738 getReplacementName(IBeforeJ ? I : J,
2739 true, o));
2740 S->insertBefore(IBeforeJ ? J : I);
2741 return S;
2742 } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH,
2743 ArgTypeL, IBeforeJ)) {
2744 Instruction *NHOp;
2745 if (numElemH > 1) {
2746 std::vector Mask(numElemL);
2747 unsigned v = 0;
2748 for (; v < numElemH; ++v)
2749 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2750 for (; v < numElemL; ++v)
2751 Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
2752
2753 NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH),
2754 ConstantVector::get(Mask),
2755 getReplacementName(IBeforeJ ? I : J,
2756 true, o, 1));
2757 } else {
2758 NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0,
2759 getReplacementName(IBeforeJ ? I : J,
2760 true, o, 1));
2761 }
2762
2763 NHOp->insertBefore(IBeforeJ ? J : I);
2764 HOp = NHOp;
2765 }
2766 }
2767
2768 if (ArgType->isVectorTy()) {
2769 unsigned numElem = VArgType->getVectorNumElements();
2770 std::vector Mask(numElem);
2771 for (unsigned v = 0; v < numElem; ++v) {
2772 unsigned Idx = v;
2773 // If the low vector was expanded, we need to skip the extra
2774 // undefined entries.
2775 if (v >= numElemL && numElemH > numElemL)
2776 Idx += (numElemH - numElemL);
2777 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx);
2778 }
2779
2780 Instruction *BV = new ShuffleVectorInst(LOp, HOp,
2781 ConstantVector::get(Mask),
2782 getReplacementName(IBeforeJ ? I : J, true, o));
2783 BV->insertBefore(IBeforeJ ? J : I);
2784 return BV;
2785 }
2786
2787 Instruction *BV1 = InsertElementInst::Create(
2788 UndefValue::get(VArgType), LOp, CV0,
2789 getReplacementName(IBeforeJ ? I : J,
2790 true, o, 1));
2791 BV1->insertBefore(IBeforeJ ? J : I);
2792 Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1,
2793 getReplacementName(IBeforeJ ? I : J,
2794 true, o, 2));
2795 BV2->insertBefore(IBeforeJ ? J : I);
2796 return BV2;
2797 }
2798
2799 // This function creates an array of values that will be used as the inputs
2800 // to the vector instruction that fuses I with J.
//
// Parameters:
//   Context          - forwarded to the per-operand helper routines.
//   I, J             - the two instructions being fused; operand counts are
//                      taken from I only.
//   ReplacedOperands - output; slot o receives the replacement for operand o.
//                      It is written with operator[], so the caller must have
//                      sized it to at least I->getNumOperands() entries.
//   IBeforeJ         - forwarded unchanged to getReplacementInput.
//
// NOTE(review): this listing appears to have lost its template arguments
// (e.g. `isa(I)`, `cast(I)`, `SmallVectorImpl &`); the instruction kinds tested
// below must be confirmed against the original BBVectorize.cpp.
2801 void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
2802 Instruction *I, Instruction *J,
2803 SmallVectorImpl &ReplacedOperands,
2804 bool IBeforeJ) {
2805 unsigned NumOperands = I->getNumOperands();
2806
// p only counts iterations; o is the operand index and runs from the last
// operand down to 0.
2807 for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) {
2808 // Iterate backward so that we look at the store pointer
2809 // first and know whether or not we need to flip the inputs.
2810
2811 if (isa(I) || (o == 1 && isa(I))) {
2812 // This is the pointer for a load/store instruction.
2813 ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o);
2814 continue;
2815 } else if (isa(I)) {
// NOTE(review): F is dereferenced without a null check; presumably earlier
// pairing checks only admit direct intrinsic calls -- confirm.
2816 Function *F = cast(I)->getCalledFunction();
2817 Intrinsic::ID IID = F->getIntrinsicID();
2818 if (o == NumOperands-1) {
// The last operand is replaced by a declaration of the same intrinsic
// obtained for the pair's combined vector type.
2819 BasicBlock &BB = *I->getParent();
2820
2821 Module *M = BB.getParent()->getParent();
2822 Type *ArgTypeI = I->getType();
2823 Type *ArgTypeJ = J->getType();
2824 Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ);
2825
2826 ReplacedOperands[o] = Intrinsic::getDeclaration(M, IID, VArgType);
2827 continue;
2828 } else if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
2829 IID == Intrinsic::cttz) && o == 1) {
2830 // The second argument of powi/ctlz/cttz is a single integer/constant
2831 // and we've already checked that both arguments are equal.
2832 // As a result, we just keep I's second argument.
2833 ReplacedOperands[o] = I->getOperand(o);
2834 continue;
2835 }
2836 } else if (isa(I) && o == NumOperands-1) {
// The last operand of this instruction kind gets a combined shuffle mask.
2837 ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J);
2838 continue;
2839 }
2840
// Every operand not handled by a special case above is combined generically.
2841 ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ);
2842 }
2843 }
2844
2845 // This function creates two values that represent the outputs of the
2846 // original I and J instructions. These are generally vector shuffles
2847 // or extracts. In many cases, these will end up being unused and, thus,
2848 // eliminated by later passes.
//
// Parameters:
//   I, J        - the original pair; only their result types are inspected.
//   K           - the fused instruction whose result is split back apart.
//   InsertionPt - output; set to K2, the last instruction inserted here.
//   K1, K2      - outputs; the replacement values for I's and J's results.
//
// K1 is inserted directly after K and K2 directly after K1. When the early
// return below is taken, K1, K2 and InsertionPt are left untouched.
2849 void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
2850 Instruction *J, Instruction *K,
2851 Instruction *&InsertionPt,
2852 Instruction *&K1, Instruction *&K2) {
// NOTE(review): the template argument of isa was lost in this listing;
// presumably this skips instructions that produce no value -- confirm.
2853 if (isa(I))
2854 return;
2855
2856 Type *IType = I->getType();
2857 Type *JType = J->getType();
2858
2859 VectorType *VType = getVecTypeForPair(IType, JType);
2860 unsigned numElem = VType->getNumElements();
2861
2862 unsigned numElemI = getNumScalarElements(IType);
2863 unsigned numElemJ = getNumScalarElements(JType);
2864
2865 if (IType->isVectorTy()) {
// Mask1 selects lanes [0, numElemI) of K. Mask2 is filled in here but never
// read in this branch.
2866 std::vector Mask1(numElemI), Mask2(numElemI);
2867 for (unsigned v = 0; v < numElemI; ++v) {
2868 Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2869 Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ + v);
2870 }
2871
2872 K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
2873 ConstantVector::get(Mask1),
2874 getReplacementName(K, false, 1));
2875 } else {
// Scalar I: its value is lane 0 of K.
2876 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
2877 K1 = ExtractElementInst::Create(K, CV0, getReplacementName(K, false, 1));
2878 }
2879
2880 if (JType->isVectorTy()) {
// Mask2 selects lanes [numElemI, numElemI + numElemJ) of K. Mask1 is filled
// in here but never read in this branch.
2881 std::vector Mask1(numElemJ), Mask2(numElemJ);
2882 for (unsigned v = 0; v < numElemJ; ++v) {
2883 Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
2884 Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI + v);
2885 }
2886
2887 K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
2888 ConstantVector::get(Mask2),
2889 getReplacementName(K, false, 2));
2890 } else {
// Scalar J: its value is the last lane (numElem - 1) of K.
2891 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem - 1);
2892 K2 = ExtractElementInst::Create(K, CV1, getReplacementName(K, false, 2));
2893 }
2894
2895 K1->insertAfter(K);
2896 K2->insertAfter(K1);
2897 InsertionPt = K2;
2898 }
2899
2900 // Check whether all uses of the instruction I (including pairing-induced
// uses) can be moved after J. Nothing is moved here; see moveUsesOfIAfterJ.
//
// Returns true when J itself is not found to be a (transitive) user of I,
// i.e. when fusing I with J would not create a dependency cycle.
//
// The scan walks forward from the instruction after I until it reaches J, so
// J must follow I in the same block. BB is not referenced in the body.
2901 bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB,
2902 DenseSet &LoadMoveSetPairs,
2903 Instruction *I, Instruction *J) {
2904 // Skip to the first instruction past I.
2905 BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
2906
2907 DenseSet Users;
2908 AliasSetTracker WriteSet(*AA);
2909 if (I->mayWriteToMemory()) WriteSet.add(I);
2910
// The return value is deliberately discarded: this loop only accumulates
// state in Users and WriteSet for the final query on J below.
2911 for (; cast(L) != J; ++L)
2912 (void)trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs);
2913
2914 assert(cast(L) == J &&
2915 "Tracking has not proceeded far enough to check for dependencies");
2916 // If J is now in the use set of I, then trackUsesOfI will return true
2917 // and we have a dependency cycle (and the fusing operation must abort).
2918 return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSetPairs);
2919 }
2920
2921 // Move all uses of the instruction I (including pairing-induced uses) after J.
//
// Every instruction strictly between I and J for which trackUsesOfI returns
// true is unlinked and re-inserted after InsertionPt. InsertionPt is then
// advanced to the moved instruction, so moved instructions keep their
// relative order. InsertionPt is updated in place for the caller.
// BB is not referenced in the body.
2922 void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB,
2923 DenseSet &LoadMoveSetPairs,
2924 Instruction *&InsertionPt,
2925 Instruction *I, Instruction *J) {
2926 // Skip to the first instruction past I.
2927 BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
2928
2929 DenseSet Users;
2930 AliasSetTracker WriteSet(*AA);
2931 if (I->mayWriteToMemory()) WriteSet.add(I);
2932
2933 for (; cast(L) != J;) {
2934 if (trackUsesOfI(Users, WriteSet, I, &*L, true, &LoadMoveSetPairs)) {
2935 // Move this instruction
// L is advanced before the instruction is unlinked so the iterator stays
// valid across the move.
2936 Instruction *InstToMove = &*L++;
2937
2938 DEBUG(dbgs() << "BBV: moving: " << *InstToMove <<
2939 " to after " << *InsertionPt << "\n");
2940 InstToMove->removeFromParent();
2941 InstToMove->insertAfter(InsertionPt);
2942 InsertionPt = InstToMove;
2943 } else {
2944 ++L;
2945 }
2946 }
2947 }
2948
2949 // Collect all load instructions that are in the move set of a given first
2950 // pair member. These loads depend on the first instruction, I, and so need
2951 // to be moved after J (the second instruction) when the pair is fused.
//
// For every instruction L after I in BB that trackUsesOfI reports as a user
// of I and that may read from memory, I is appended to LoadMoveSet[L] and the
// pair (L, I) is added to LoadMoveSetPairs. ChosenPairs is not referenced in
// the body.
2952 void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB,
2953 DenseMap &ChosenPairs,
2954 DenseMap > &LoadMoveSet,
2955 DenseSet &LoadMoveSetPairs,
2956 Instruction *I) {
2957 // Skip to the first instruction past I.
2958 BasicBlock::iterator L = std::next(BasicBlock::iterator(I));
2959
2960 DenseSet Users;
2961 AliasSetTracker WriteSet(*AA);
2962 if (I->mayWriteToMemory()) WriteSet.add(I);
2963
2964 // Note: We cannot end the loop when we reach J because J could be moved
2965 // farther down the use chain by another instruction pairing. Also, J
2966 // could be before I if this is an inverted input.
2967 for (BasicBlock::iterator E = BB.end(); L != E; ++L) {
2968 if (trackUsesOfI(Users, WriteSet, I, &*L)) {
2969 if (L->mayReadFromMemory()) {
2970 LoadMoveSet[&*L].push_back(I);
2971 LoadMoveSetPairs.insert(ValuePair(&*L, I));
2972 }
2973 }
2974 }
2975 }
2976
2977 // In cases where both load/stores and the computation of their pointers
2978 // are chosen for vectorization, we can end up in a situation where the
2979 // aliasing analysis starts returning different query results as the
2980 // process of fusing instruction pairs continues. Because the algorithm
2981 // relies on finding the same use dags here as were found earlier, we'll
2982 // need to precompute the necessary aliasing information here and then
2983 // manually update it during the fusion process.
//
// Walks PairableInsts in order and, for every entry that is a key in
// ChosenPairs, calls collectPairLoadMoveSet on that key to populate
// LoadMoveSet and LoadMoveSetPairs. Entries without a chosen pair are skipped.
2984 void BBVectorize::collectLoadMoveSet(BasicBlock &BB,
2985 std::vector &PairableInsts,
2986 DenseMap &ChosenPairs,
2987 DenseMap > &LoadMoveSet,
2988 DenseSet &LoadMoveSetPairs) {
2989 for (std::vector::iterator PI = PairableInsts.begin(),
2990 PIE = PairableInsts.end(); PI != PIE; ++PI) {
2991 DenseMap::iterator P = ChosenPairs.find(*PI);
2992 if (P == ChosenPairs.end()) continue;
2993
2994 Instruction *I = cast(P->first);
2995 collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet,
2996 LoadMoveSetPairs, I);
2997 }
2998 }
2999
3000 // This function fuses the chosen instruction pairs into vector instructions,
3001 // taking care to preserve any needed scalar outputs and, then, it reorders the
3002 // remaining instructions as needed (users of the first member of the pair
3003 // need to be moved to after the location of the second member of the pair
3004 // because the vector instruction is inserted in the location of the pair's
3005 // second member).
//
// Parameters:
//   BB                  - the block being rewritten in place.
//   PairableInsts       - passed through to collectLoadMoveSet.
//   ChosenPairs         - pairs to fuse; modified here (flipped copies are
//                         added up front and entries are erased as pairs are
//                         processed).
//   FixedOrderPairs     - pairs whose operand order must not be changed.
//   PairConnectionTypes - connection kind per pair-of-pairs; entries may be
//                         toggled between Direct and Swap here.
//   ConnectedPairs      - looked up with the (H, L) ordering of each pair.
//   ConnectedPairDeps   - used to count direct vs. swapped dependencies when
//                         deciding whether to flip an unfixed pair.
//
// NOTE(review): this listing appears to have lost its template arguments
// (e.g. `DenseMap >::iterator`, `isa(K)`, `cast(P->first)`); confirm the
// container element types and instruction kinds against the original file.
3006 void BBVectorize::fuseChosenPairs(BasicBlock &BB,
3007 std::vector &PairableInsts,
3008 DenseMap &ChosenPairs,
3009 DenseSet &FixedOrderPairs,
3010 DenseMap &PairConnectionTypes,
3011 DenseMap > &ConnectedPairs,
3012 DenseMap > &ConnectedPairDeps) {
3013 LLVMContext& Context = BB.getContext();
3014
3015 // During the vectorization process, the order of the pairs to be fused
3016 // could be flipped. So we'll add each pair, flipped, into the ChosenPairs
3017 // list. After a pair is fused, the flipped pair is removed from the list.
// The flipped pairs are staged in a separate set first so that ChosenPairs
// is not modified while it is being iterated.
3018 DenseSet FlippedPairs;
3019 for (DenseMap::iterator P = ChosenPairs.begin(),
3020 E = ChosenPairs.end(); P != E; ++P)
3021 FlippedPairs.insert(ValuePair(P->second, P->first));
3022 for (DenseSet::iterator P = FlippedPairs.begin(),
3023 E = FlippedPairs.end(); P != E; ++P)
3024 ChosenPairs.insert(*P);
3025
3026 DenseMap > LoadMoveSet;
3027 DenseSet LoadMoveSetPairs;
3028 collectLoadMoveSet(BB, PairableInsts, ChosenPairs,
3029 LoadMoveSet, LoadMoveSetPairs);
3030
3031 DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
3032
// Scan the block in program order; the first member of a pair that is
// reached plays the role of I and its partner the role of J.
3033 for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
3034 DenseMap::iterator P = ChosenPairs.find(&*PI);
3035 if (P == ChosenPairs.end()) {
3036 ++PI;
3037 continue;
3038 }
3039
3040 if (getDepthFactor(P->first) == 0) {
3041 // These instructions are not really fused, but are tracked as though
3042 // they are. Any case in which it would be interesting to fuse them
3043 // will be taken care of by InstCombine.
3044 --NumFusedOps;
3045 ++PI;
3046 continue;
3047 }
3048
3049 Instruction *I = cast(P->first),
3050 *J = cast(P->second);
3051
3052 DEBUG(dbgs() << "BBV: fusing: " << *I <<
3053 " <-> " << *J << "\n");
3054
3055 // Remove the pair and flipped pair from the list.
// I and J were captured above, so P and FP are not needed after the erase.
3056 DenseMap::iterator FP = ChosenPairs.find(P->second);
3057 assert(FP != ChosenPairs.end() && "Flipped pair not found in list");
3058 ChosenPairs.erase(FP);
3059 ChosenPairs.erase(P);
3060
// The pair has already been removed from ChosenPairs at this point, so an
// aborted pair is simply left in scalar form.
3061 if (!canMoveUsesOfIAfterJ(BB, LoadMoveSetPairs, I, J)) {
3062 DEBUG(dbgs() << "BBV: fusion of: " << *I <<
3063 " <-> " << *J <<
3064 " aborted because of non-trivial dependency cycle\n");
3065 --NumFusedOps;
3066 ++PI;
3067 continue;
3068 }
3069
3070 // If the pair must have the other order, then flip it.
3071 bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I));
3072 if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) {
3073 // This pair does not have a fixed order, and so we might want to
3074 // flip it if that will yield fewer shuffles. We count the number
3075 // of dependencies connected via swaps, and those directly connected,
3076 // and flip the order if the number of swaps is greater.
3077 bool OrigOrder = true;
3078 DenseMap >::iterator IJ =
3079 ConnectedPairDeps.find(ValuePair(I, J));
3080 if (IJ == ConnectedPairDeps.end()) {
3081 IJ = ConnectedPairDeps.find(ValuePair(J, I));
3082 OrigOrder = false;
3083 }
3084
3085 if (IJ != ConnectedPairDeps.end()) {
3086 unsigned NumDepsDirect = 0, NumDepsSwap = 0;
3087 for (std::vector::iterator T = IJ->second.begin(),
3088 TE = IJ->second.end(); T != TE; ++T) {
// The connection type is stored under the reversed ordering (*T, IJ->first).
3089 VPPair Q(IJ->first, *T);
3090 DenseMap::iterator R =
3091 PairConnectionTypes.find(VPPair(Q.second, Q.first));
3092 assert(R != PairConnectionTypes.end() &&
3093 "Cannot find pair connection type");
3094 if (R->second == PairConnectionDirect)
3095 ++NumDepsDirect;
3096 else if (R->second == PairConnectionSwap)
3097 ++NumDepsSwap;
3098 }
3099
// The counts were gathered for the (J, I) ordering, so their meaning is
// exchanged relative to (I, J).
3100 if (!OrigOrder)
3101 std::swap(NumDepsDirect, NumDepsSwap);
3102
3103 if (NumDepsSwap > NumDepsDirect) {
3104 FlipPairOrder = true;
3105 DEBUG(dbgs() << "BBV: reordering pair: " << *I <<
3106 " <-> " << *J << "\n");
3107 }
3108 }
3109 }
3110
// L supplies the low lanes and H the high lanes of the fused value; I and J
// keep their meaning as first and second instruction in the block.
3111 Instruction *L = I, *H = J;
3112 if (FlipPairOrder)
3113 std::swap(H, L);
3114
3115 // If the pair being fused uses the opposite order from that in the pair
3116 // connection map, then we need to flip the types.
3117 DenseMap >::iterator HL =
3118 ConnectedPairs.find(ValuePair(H, L));
3119 if (HL != ConnectedPairs.end())
3120 for (std::vector::iterator T = HL->second.begin(),
3121 TE = HL->second.end(); T != TE; ++T) {
3122 VPPair Q(HL->first, *T);
3123 DenseMap::iterator R = PairConnectionTypes.find(Q);
3124 assert(R != PairConnectionTypes.end() &&
3125 "Cannot find pair connection type");
3126 if (R->second == PairConnectionDirect)
3127 R->second = PairConnectionSwap;
3128 else if (R->second == PairConnectionSwap)
3129 R->second = PairConnectionDirect;
3130 }
3131
3132 bool LBeforeH = !FlipPairOrder;
3133 unsigned NumOperands = I->getNumOperands();
// Pre-sized because getReplacementInputsForPair fills it by index.
3134 SmallVector ReplacedOperands(NumOperands);
3135 getReplacementInputsForPair(Context, L, H, ReplacedOperands,
3136 LBeforeH);
3137
3138 // Make a copy of the original operation, change its type to the vector
3139 // type and replace its operands with the vector operands.
3140 Instruction *K = L->clone();
3141 if (L->hasName())
3142 K->takeName(L);
3143 else if (H->hasName())
3144 K->takeName(H);
3145
3146 if (auto CS = CallSite(K)) {
// For calls, rebuild the function type from the types of the first NumOld
// replacement operands, with the pair's vector type as the return type.
3147 SmallVector Tys;
3148 FunctionType *Old = CS.getFunctionType();
3149 unsigned NumOld = Old->getNumParams();
3150 assert(NumOld <= ReplacedOperands.size());
3151 for (unsigned i = 0; i != NumOld; ++i)
3152 Tys.push_back(ReplacedOperands[i]->getType());
3153 CS.mutateFunctionType(
3154 FunctionType::get(getVecTypeForPair(L->getType(), H->getType()),
3155 Tys, Old->isVarArg()));
3156 } else if (!isa(K))
3157 K->mutateType(getVecTypeForPair(L->getType(), H->getType()));
3158
// K was cloned from L, so L's metadata and flags are already present; they
// are combined with H's here.
3159 unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
3160 LLVMContext::MD_noalias, LLVMContext::MD_fpmath,
3161 LLVMContext::MD_invariant_group};
3162 combineMetadata(K, H, KnownIDs);
3163 K->andIRFlags(H);
3164
3165 for (unsigned o = 0; o < NumOperands; ++o)
3166 K->setOperand(o, ReplacedOperands[o]);
3167
// Placement is relative to J (the second instruction in the block), not H.
3168 K->insertAfter(J);
3169
3170 // Instruction insertion point:
3171 Instruction *InsertionPt = K;
3172 Instruction *K1 = nullptr, *K2 = nullptr;
3173 replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2);
3174
3175 // The use dag of the first original instruction must be moved to after
3176 // the location of the second instruction. The entire use dag of the
3177 // first instruction is disjoint from the input dag of the second
3178 // (by definition), and so commutes with it.
3179
3180 moveUsesOfIAfterJ(BB, LoadMoveSetPairs, InsertionPt, I, J);
3181
// This guard uses the same (stripped) isa test as the early return in
// replaceOutputsOfPair, which leaves K1/K2 null for that instruction kind.
3182 if (!isa(I)) {
3183 L->replaceAllUsesWith(K1);
3184 H->replaceAllUsesWith(K2);
3185 }
3186
3187 // Instructions that may read from memory may be in the load move set.
3188 // Once an instruction is fused, we no longer need its move set, and so
3189 // the values of the map never need to be updated. However, when a load
3190 // is fused, we need to merge the entries from both instructions in the
3191 // pair in case those instructions were in the move set of some other
3192 // yet-to-be-fused pair. The loads in question are the keys of the map.
3193 if (I->mayReadFromMemory()) {
// New entries are collected first and inserted afterwards, so LoadMoveSet is
// not modified while the II/JJ iterators are in use.
3194 std::vector NewSetMembers;
3195 DenseMap >::iterator II =
3196 LoadMoveSet.find(I);
3197 if (II != LoadMoveSet.end())
3198 for (std::vector::iterator N = II->second.begin(),
3199 NE = II->second.end(); N != NE; ++N)
3200 NewSetMembers.push_back(ValuePair(K, *N));
3201 DenseMap >::iterator JJ =
3202 LoadMoveSet.find(J);
3203 if (JJ != LoadMoveSet.end())
3204 for (std::vector::iterator N = JJ->second.begin(),
3205 NE = JJ->second.end(); N != NE; ++N)
3206 NewSetMembers.push_back(ValuePair(K, *N));
3207 for (std::vector::iterator A = NewSetMembers.begin(),
3208 AE = NewSetMembers.end(); A != AE; ++A) {
3209 LoadMoveSet[A->first].push_back(A->second);
3210 LoadMoveSetPairs.insert(*A);
3211 }
3212 }
3213
3214 // Before removing I, set the iterator to the next instruction.
// J is erased below as well, so step over it when it directly follows I.
3215 PI = std::next(BasicBlock::iterator(I));
3216 if (cast(PI) == J)
3217 ++PI;
3218
3219 SE->forgetValue(I);
3220 SE->forgetValue(J);
3221 I->eraseFromParent();
3222 J->eraseFromParent();
3223
3224 DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" <<
3225 BB << "\n");
3226 }
3227
3228 DEBUG(dbgs() << "BBV: final: \n" << BB << "\n");
3229 }
3230 }
3231
// Pass identification and registration for BBVectorize. The pass is
// registered under the argument BBV_NAME with the description string below.
3232 char BBVectorize::ID = 0;
3233 static const char bb_vectorize_name[] = "Basic-Block Vectorization";
3234 INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
3235 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
3236 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
3237 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
3238 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
// NOTE(review): DominatorTreeWrapperPass is listed a second time here; the
// repeat looks redundant with the first dependency line above.
3239 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
3240 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
3241 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
3242 INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
3243 INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
3244
// Factory for the pass: returns a newly allocated BBVectorize constructed
// from the configuration C. The returned pointer is a raw `new` result;
// presumably ownership passes to the pass manager it is added to -- confirm.
3245 BasicBlockPass *llvm::createBBVectorizePass(const VectorizeConfig &C) {
3246 return new BBVectorize(C);
3247 }
3248
// Runs the vectorizer on a single basic block without registering a pass: a
// temporary BBVectorize is built from the calling pass P, BB's parent
// function and the configuration C, and the result of its vectorizeBB(BB)
// call is returned.
3249 bool
3250 llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) {
3251 BBVectorize BBVectorizer(P, *BB.getParent(), C);
3252 return BBVectorizer.vectorizeBB(BB);
3253 }
3254
3255 //===----------------------------------------------------------------------===//
// Default constructor: initialises every configuration field from an entity
// of the same (or corresponding) name in the global namespace, written with a
// leading `::` so that it is not shadowed by the member being assigned.
// NOTE(review): presumably those globals are this file's command-line
// options -- confirm against their declarations.
3256 VectorizeConfig::VectorizeConfig() {
3257 VectorBits = ::VectorBits;
// The Vectorize* switches are the negation of the corresponding ::No* values.
3258 VectorizeBools = !::NoBools;
3259 VectorizeInts = !::NoInts;
3260 VectorizeFloats = !::NoFloats;
3261 VectorizePointers = !::NoPointers;
3262 VectorizeCasts = !::NoCasts;
3263 VectorizeMath = !::NoMath;
3264 VectorizeBitManipulations = !::NoBitManipulation;
3265 VectorizeFMA = !::NoFMA;
3266 VectorizeSelect = !::NoSelect;
3267 VectorizeCmp = !::NoCmp;
3268 VectorizeGEP = !::NoGEP;
3269 VectorizeMemOps = !::NoMemOps;
// The remaining fields are copied through unchanged.
3270 AlignedOnly = ::AlignedOnly;
3271 ReqChainDepth= ::ReqChainDepth;
3272 SearchLimit = ::SearchLimit;
3273 MaxCandPairsForCycleCheck = ::MaxCandPairsForCycleCheck;
3274 SplatBreaksChain = ::SplatBreaksChain;
3275 MaxInsts = ::MaxInsts;
3276 MaxPairs = ::MaxPairs;
3277 MaxIter = ::MaxIter;
3278 Pow2LenOnly = ::Pow2LenOnly;
3279 NoMemOpBoost = ::NoMemOpBoost;
3280 FastDep = ::FastDep;
3281 }
0 add_llvm_library(LLVMVectorize
1 BBVectorize.cpp
21 LoadStoreVectorizer.cpp
32 LoopVectorize.cpp
43 SLPVectorizer.cpp
2525 /// initializeVectorizationPasses - Initialize all passes linked into the
2626 /// Vectorization library.
2727 void llvm::initializeVectorization(PassRegistry &Registry) {
28 initializeBBVectorizePass(Registry);
2928 initializeLoopVectorizePass(Registry);
3029 initializeSLPVectorizerPass(Registry);
3130 initializeLoadStoreVectorizerPass(Registry);
3534 initializeVectorization(*unwrap(R));
3635 }
3736
37 // DEPRECATED: Remove after the LLVM 5 release.
3838 void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) {
39 unwrap(PM)->add(createBBVectorizePass());
4039 }
4140
4241 void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) {
11 ; RUN: opt -O1 -S -debug %s 2>&1 | FileCheck %s --check-prefix=OPT-O1
22 ; RUN: opt -O2 -S -debug %s 2>&1 | FileCheck %s --check-prefix=OPT-O1 --check-prefix=OPT-O2O3
33 ; RUN: opt -O3 -S -debug %s 2>&1 | FileCheck %s --check-prefix=OPT-O1 --check-prefix=OPT-O2O3
4 ; RUN: opt -bb-vectorize -dce -die -gvn-hoist -loweratomic -S -debug %s 2>&1 | FileCheck %s --check-prefix=OPT-MORE
4 ; RUN: opt -dce -die -gvn-hoist -loweratomic -S -debug %s 2>&1 | FileCheck %s --check-prefix=OPT-MORE
55 ; RUN: opt -indvars -licm -loop-deletion -loop-extract -loop-idiom -loop-instsimplify -loop-reduce -loop-reroll -loop-rotate -loop-unroll -loop-unswitch -S -debug %s 2>&1 | FileCheck %s --check-prefix=OPT-LOOP
66
77 ; REQUIRES: asserts
5454 ; OPT-O2O3-DAG: Skipping pass 'SLP Vectorizer'
5555
5656 ; Additional IR passes that opt doesn't turn on by default.
57 ; OPT-MORE-DAG: Skipping pass 'Basic-Block Vectorization'
5857 ; OPT-MORE-DAG: Skipping pass 'Dead Code Elimination'
5958 ; OPT-MORE-DAG: Skipping pass 'Dead Instruction Elimination'
6059 ; OPT-MORE-DAG: Skipping pass 'Lower atomic intrinsics
+0
-16
test/Transforms/BBVectorize/X86/cmp-types.ll less more
None target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
1 target triple = "x86_64-unknown-linux-gnu"
2 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
3
4 %"struct.btSoftBody" = type { float, float, float*, i8 }
5
; Two pointer-vs-null comparisons, each zero-extended to i32; none of the
; results are used. The only FileCheck directive is the function label, so
; presumably this guards against a crash in the pass rather than checking
; its output -- confirm.
6 define void @test1(%"struct.btSoftBody"* %n1, %"struct.btSoftBody"* %n2) uwtable align 2 {
7 entry:
8 %tobool15 = icmp ne %"struct.btSoftBody"* %n1, null
9 %cond16 = zext i1 %tobool15 to i32
10 %tobool21 = icmp ne %"struct.btSoftBody"* %n2, null
11 %cond22 = zext i1 %tobool21 to i32
12 ret void
13 ; CHECK-LABEL: @test1(
14 }
15
+0
-61
test/Transforms/BBVectorize/X86/loop1.ll less more
None target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
1 target triple = "x86_64-unknown-linux-gnu"
2 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
3 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -basicaa -loop-unroll -unroll-partial-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL
4 ; The second check covers the use of alias analysis (with loop unrolling).
5
; A ten-iteration loop that loads one double from each of %in1 and %in2,
; combines them through a chain of scalar fmul/fadd operations and stores the
; result to %out. The directives after the loop body expect <2 x double>
; arithmetic in the output; the UNRL prefix is used by the second RUN line.
6 define void @test1(double* noalias %out, double* noalias %in1, double* noalias %in2) nounwind uwtable {
7 entry:
8 br label %for.body
9 ; CHECK-LABEL: @test1(
10 ; CHECK-UNRL-LABEL: @test1(
11
12 for.body: ; preds = %for.body, %entry
13 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
14 %arrayidx = getelementptr inbounds double, double* %in1, i64 %indvars.iv
15 %0 = load double, double* %arrayidx, align 8
16 %arrayidx2 = getelementptr inbounds double, double* %in2, i64 %indvars.iv
17 %1 = load double, double* %arrayidx2, align 8
18 %mul = fmul double %0, %0
19 %mul3 = fmul double %0, %1
20 %add = fadd double %mul, %mul3
21 %add4 = fadd double %1, %1
22 %add5 = fadd double %add4, %0
23 %mul6 = fmul double %0, %add5
24 %add7 = fadd double %add, %mul6
25 %mul8 = fmul double %1, %1
26 %add9 = fadd double %0, %0
27 %add10 = fadd double %add9, %0
28 %mul11 = fmul double %mul8, %add10
29 %add12 = fadd double %add7, %mul11
30 %arrayidx14 = getelementptr inbounds double, double* %out, i64 %indvars.iv
31 store double %add12, double* %arrayidx14, align 8
32 %indvars.iv.next = add i64 %indvars.iv, 1
33 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
34 %exitcond = icmp eq i32 %lftr.wideiv, 10
35 br i1 %exitcond, label %for.end, label %for.body
36 ; CHECK: insertelement
37 ; CHECK-NEXT: insertelement
38 ; CHECK-NEXT: fadd <2 x double>
39 ; CHECK-NEXT: insertelement
40 ; CHECK-NEXT: shufflevector
41 ; CHECK-NEXT: fadd <2 x double>
42 ; CHECK-NEXT: insertelement
43 ; CHECK-NEXT: fmul <2 x double>
44
45 ; CHECK-UNRL: %mul = fmul <2 x double> %2, %2
46 ; CHECK-UNRL: %mul3 = fmul <2 x double> %2, %3
47 ; CHECK-UNRL: %add = fadd <2 x double> %mul, %mul3
48 ; CHECK-UNRL: %add4 = fadd <2 x double> %3, %3
49 ; CHECK-UNRL: %add5 = fadd <2 x double> %add4, %2
50 ; CHECK-UNRL: %mul6 = fmul <2 x double> %2, %add5
51 ; CHECK-UNRL: %add7 = fadd <2 x double> %add, %mul6
52 ; CHECK-UNRL: %mul8 = fmul <2 x double> %3, %3
53 ; CHECK-UNRL: %add9 = fadd <2 x double> %2, %2
54 ; CHECK-UNRL: %add10 = fadd <2 x double> %add9, %2
55 ; CHECK-UNRL: %mul11 = fmul <2 x double> %mul8, %add10
56 ; CHECK-UNRL: %add12 = fadd <2 x double> %add7, %mul11
57
58 for.end: ; preds = %for.body
59 ret void
60 }
+0
-95
test/Transforms/BBVectorize/X86/pr15289.ll less more
None ; RUN: opt < %s -basicaa -bb-vectorize -disable-output
1 ; This is a bugpoint-reduced test case. It did not always assert, but does reproduce the bug
2 ; and running under valgrind (or some similar tool) will catch the error.
3
4 target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
5 target triple = "x86_64-apple-darwin12.2.0"
6
7 %0 = type { [10 x { float, float }], [10 x { float, float }], [10 x { float, float }], [10 x { float, float }], [10 x { float, float }] }
8 %1 = type { [10 x [8 x i8]] }
9 %2 = type { i64, i64 }
10 %3 = type { [10 x i64], i64, i64, i64, i64, i64 }
11 %4 = type { i64, i64, i64, i64, i64, i64 }
12 %5 = type { [10 x i64] }
13 %6 = type { [10 x float], [10 x float], [10 x float], [10 x float] }
14 %struct.__st_par