llvm.org GIT mirror llvm / de5e5ec
Add a basic-block autovectorization pass. This is the initial checkin of the basic-block autovectorization pass along with some supporting vectorization infrastructure. Special thanks to everyone who helped review this code over the last several months (especially Tobias Grosser). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@149468 91177308-0d34-0410-b5e6-96231b3b80d8 Hal Finkel 7 years ago
35 changed file(s) with 2635 addition(s) and 12 deletion(s). Raw diff Collapse all Expand all
125125
-adceAggressive Dead Code Elimination
126126
-always-inlineInliner for always_inline functions
127127
-argpromotionPromote 'by reference' arguments to scalars
128
-bb-vectorizeCombine instructions to form vector instructions within basic blocks
128129
-block-placementProfile Guided Basic Block Placement
129130
-break-crit-edgesBreak critical edges in CFG
130131
-codegenprepareOptimize for code generation
816817
817818
818819

820 -bb-vectorize: Basic-Block Vectorization
821
822
823

This pass combines instructions inside basic blocks to form vector

824 instructions. It iterates over each basic block, attempting to pair
825 compatible instructions, repeating this process until no additional
826 pairs are selected for vectorization. When the outputs of some pair
827 of compatible instructions are used as inputs by some other pair of
828 compatible instructions, those pairs are part of a potential
829 vectorization chain. Instruction pairs are only fused into vector
830 instructions when they are part of a chain longer than some
831 threshold length. Moreover, the pass attempts to find the best
832 possible chain for each pair of compatible instructions. These
833 heuristics are intended to prevent vectorization in cases where
834 it would not yield a performance increase of the resulting code.
835

836
837
838
839

819840 -block-placement: Profile Guided Basic Block Placement
820841
821842
2929 /// initializeScalarOpts - Initialize all passes linked into the
3030 /// ScalarOpts library.
3131 void initializeScalarOpts(PassRegistry&);
32
33 /// initializeVectorization - Initialize all passes linked into the
34 /// Vectorize library.
35 void initializeVectorization(PassRegistry&);
3236
3337 /// initializeInstCombine - Initialize all passes linked into the
3438 /// ScalarOpts library.
235239 void initializeInstSimplifierPass(PassRegistry&);
236240 void initializeUnpackMachineBundlesPass(PassRegistry&);
237241 void initializeFinalizeMachineBundlesPass(PassRegistry&);
238
242 void initializeBBVectorizePass(PassRegistry&);
239243 }
240244
241245 #endif
3030 #include "llvm/Transforms/Instrumentation.h"
3131 #include "llvm/Transforms/IPO.h"
3232 #include "llvm/Transforms/Scalar.h"
33 #include "llvm/Transforms/Vectorize.h"
3334 #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
3435 #include
3536
150151 (void) llvm::createCorrelatedValuePropagationPass();
151152 (void) llvm::createMemDepPrinter();
152153 (void) llvm::createInstructionSimplifierPass();
154 (void) llvm::createBBVectorizePass();
153155
154156 (void)new llvm::IntervalPartition();
155157 (void)new llvm::FindUsedTypes();
9898 bool DisableSimplifyLibCalls;
9999 bool DisableUnitAtATime;
100100 bool DisableUnrollLoops;
101 bool Vectorize;
101102
102103 private:
103104 /// ExtensionList - This is list of all of the extensions that are registered.
0 //===-- Vectorize.h - Vectorization Transformations -------------*- C++ -*-===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This header file defines prototypes for accessor functions that expose passes
10 // in the Vectorize transformations library.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #ifndef LLVM_TRANSFORMS_VECTORIZE_H
15 #define LLVM_TRANSFORMS_VECTORIZE_H
16
17 namespace llvm {
18
19 class BasicBlockPass;
20
21 //===----------------------------------------------------------------------===//
22 //
23 // BBVectorize - A basic-block vectorization pass.
24 //
25 BasicBlockPass *createBBVectorizePass();
26
27 } // End llvm namespace
28
29 #endif
2424 void LLVMInitializeCore(LLVMPassRegistryRef R);
2525 void LLVMInitializeTransformUtils(LLVMPassRegistryRef R);
2626 void LLVMInitializeScalarOpts(LLVMPassRegistryRef R);
27 void LLVMInitializeVectorization(LLVMPassRegistryRef R);
2728 void LLVMInitializeInstCombine(LLVMPassRegistryRef R);
2829 void LLVMInitializeIPO(LLVMPassRegistryRef R);
2930 void LLVMInitializeInstrumentation(LLVMPassRegistryRef R);
0 /*===---------------------------Vectorize.h ------------------- -*- C++ -*-===*\
1 |*===----------- Vectorization Transformation Library C Interface ---------===*|
2 |* *|
3 |* The LLVM Compiler Infrastructure *|
4 |* *|
5 |* This file is distributed under the University of Illinois Open Source *|
6 |* License. See LICENSE.TXT for details. *|
7 |* *|
8 |*===----------------------------------------------------------------------===*|
9 |* *|
10 |* This header declares the C interface to libLLVMVectorize.a, which *|
11 |* implements various vectorization transformations of the LLVM IR. *|
12 |* *|
13 |* Many exotic languages can interoperate with C code but have a harder time *|
14 |* with C++ due to name mangling. So in addition to C, this interface enables *|
15 |* tools written in such languages. *|
16 |* *|
17 \*===----------------------------------------------------------------------===*/
18
19 #ifndef LLVM_C_TRANSFORMS_VECTORIZE_H
20 #define LLVM_C_TRANSFORMS_VECTORIZE_H
21
22 #include "llvm-c/Core.h"
23
24 #ifdef __cplusplus
25 extern "C" {
26 #endif
27
28 /** See llvm::createBBVectorizePass function. */
29 void LLVMAddBBVectorizePass(LLVMPassManagerRef PM);
30
31 #ifdef __cplusplus
32 }
33 #endif /* defined(__cplusplus) */
34
35 #endif
36
22 add_subdirectory(InstCombine)
33 add_subdirectory(Scalar)
44 add_subdirectory(IPO)
5 add_subdirectory(Vectorize)
56 add_subdirectory(Hello)
1919 name = IPO
2020 parent = Transforms
2121 library_name = ipo
22 required_libraries = Analysis Core IPA InstCombine Scalar Support Target TransformUtils
22 required_libraries = Analysis Core IPA InstCombine Scalar Vectorize Support Target TransformUtils
2020 #include "llvm/DefaultPasses.h"
2121 #include "llvm/PassManager.h"
2222 #include "llvm/Analysis/Passes.h"
23 #include "llvm/Analysis/Verifier.h"
24 #include "llvm/Support/CommandLine.h"
2325 #include "llvm/Target/TargetLibraryInfo.h"
2426 #include "llvm/Transforms/Scalar.h"
27 #include "llvm/Transforms/Vectorize.h"
2528 #include "llvm/Transforms/IPO.h"
2629 #include "llvm/ADT/SmallVector.h"
2730 #include "llvm/Support/ManagedStatic.h"
2831
2932 using namespace llvm;
33
34 static cl::opt
35 RunVectorization("vectorize", cl::desc("Run vectorization passes"));
3036
3137 PassManagerBuilder::PassManagerBuilder() {
3238 OptLevel = 2;
3642 DisableSimplifyLibCalls = false;
3743 DisableUnitAtATime = false;
3844 DisableUnrollLoops = false;
45 Vectorize = RunVectorization;
3946 }
4047
4148 PassManagerBuilder::~PassManagerBuilder() {
171178
172179 addExtensionsToPM(EP_ScalarOptimizerLate, MPM);
173180
181 if (Vectorize) {
182 MPM.add(createBBVectorizePass());
183 MPM.add(createInstructionCombiningPass());
184 if (OptLevel > 1)
185 MPM.add(createGVNPass()); // Remove redundancies
186 }
187
174188 MPM.add(createAggressiveDCEPass()); // Delete dead instructions
175189 MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
176190 MPM.add(createInstructionCombiningPass()); // Clean up after everything.
1515 ;===------------------------------------------------------------------------===;
1616
1717 [common]
18 subdirectories = IPO InstCombine Instrumentation Scalar Utils
18 subdirectories = IPO InstCombine Instrumentation Scalar Utils Vectorize
1919
2020 [component_0]
2121 type = Group
77 ##===----------------------------------------------------------------------===##
88
99 LEVEL = ../..
10 PARALLEL_DIRS = Utils Instrumentation Scalar InstCombine IPO Hello
10 PARALLEL_DIRS = Utils Instrumentation Scalar InstCombine IPO Vectorize Hello
1111
1212 include $(LEVEL)/Makefile.config
1313
0 //===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a basic-block vectorization pass. The algorithm was
10 // inspired by that used by the Vienna MAP Vectorizer by Franchetti and Kral,
11 // et al. It works by looking for chains of pairable operations and then
12 // pairing them.
13 //
14 //===----------------------------------------------------------------------===//
15
16 #define BBV_NAME "bb-vectorize"
17 #define DEBUG_TYPE BBV_NAME
18 #include "llvm/Constants.h"
19 #include "llvm/DerivedTypes.h"
20 #include "llvm/Function.h"
21 #include "llvm/Instructions.h"
22 #include "llvm/IntrinsicInst.h"
23 #include "llvm/Intrinsics.h"
24 #include "llvm/LLVMContext.h"
25 #include "llvm/Pass.h"
26 #include "llvm/Type.h"
27 #include "llvm/ADT/DenseMap.h"
28 #include "llvm/ADT/DenseSet.h"
29 #include "llvm/ADT/SmallVector.h"
30 #include "llvm/ADT/Statistic.h"
31 #include "llvm/ADT/STLExtras.h"
32 #include "llvm/ADT/StringExtras.h"
33 #include "llvm/Analysis/AliasAnalysis.h"
34 #include "llvm/Analysis/AliasSetTracker.h"
35 #include "llvm/Analysis/ScalarEvolution.h"
36 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
37 #include "llvm/Analysis/ValueTracking.h"
38 #include "llvm/Support/CommandLine.h"
39 #include "llvm/Support/Debug.h"
40 #include "llvm/Support/raw_ostream.h"
41 #include "llvm/Support/ValueHandle.h"
42 #include "llvm/Target/TargetData.h"
43 #include "llvm/Transforms/Vectorize.h"
44 #include
45 #include
46 using namespace llvm;
47
48 static cl::opt
49 ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
50 cl::desc("The required chain depth for vectorization"));
51
52 static cl::opt
53 SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden,
54 cl::desc("The maximum search distance for instruction pairs"));
55
56 static cl::opt
57 SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden,
58 cl::desc("Replicating one element to a pair breaks the chain"));
59
60 static cl::opt
61 VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden,
62 cl::desc("The size of the native vector registers"));
63
64 static cl::opt
65 MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden,
66 cl::desc("The maximum number of pairing iterations"));
67
68 static cl::opt
69 MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200),
70 cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use"
71 " a full cycle check"));
72
73 static cl::opt
74 NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden,
75 cl::desc("Don't try to vectorize integer values"));
76
77 static cl::opt
78 NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden,
79 cl::desc("Don't try to vectorize floating-point values"));
80
81 static cl::opt
82 NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden,
83 cl::desc("Don't try to vectorize casting (conversion) operations"));
84
85 static cl::opt
86 NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden,
87 cl::desc("Don't try to vectorize floating-point math intrinsics"));
88
89 static cl::opt
90 NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden,
91 cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"));
92
93 static cl::opt
94 NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden,
95 cl::desc("Don't try to vectorize loads and stores"));
96
97 static cl::opt
98 AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden,
99 cl::desc("Only generate aligned loads and stores"));
100
101 static cl::opt
102 FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden,
103 cl::desc("Use a fast instruction dependency analysis"));
104
105 #ifndef NDEBUG
106 static cl::opt
107 DebugInstructionExamination("bb-vectorize-debug-instruction-examination",
108 cl::init(false), cl::Hidden,
109 cl::desc("When debugging is enabled, output information on the"
110 " instruction-examination process"));
111 static cl::opt
112 DebugCandidateSelection("bb-vectorize-debug-candidate-selection",
113 cl::init(false), cl::Hidden,
114 cl::desc("When debugging is enabled, output information on the"
115 " candidate-selection process"));
116 static cl::opt
117 DebugPairSelection("bb-vectorize-debug-pair-selection",
118 cl::init(false), cl::Hidden,
119 cl::desc("When debugging is enabled, output information on the"
120 " pair-selection process"));
121 static cl::opt
122 DebugCycleCheck("bb-vectorize-debug-cycle-check",
123 cl::init(false), cl::Hidden,
124 cl::desc("When debugging is enabled, output information on the"
125 " cycle-checking process"));
126 #endif
127
128 STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize");
129
130 namespace {
131 struct BBVectorize : public BasicBlockPass {
132 static char ID; // Pass identification, replacement for typeid
133 BBVectorize() : BasicBlockPass(ID) {
134 initializeBBVectorizePass(*PassRegistry::getPassRegistry());
135 }
136
137 typedef std::pair ValuePair;
138 typedef std::pair ValuePairWithDepth;
139 typedef std::pair VPPair; // A ValuePair pair
140 typedef std::pair::iterator,
141 std::multimap::iterator> VPIteratorPair;
142 typedef std::pair::iterator,
143 std::multimap::iterator>
144 VPPIteratorPair;
145
146 AliasAnalysis *AA;
147 ScalarEvolution *SE;
148 TargetData *TD;
149
150 // FIXME: const correct?
151
152 bool vectorizePairs(BasicBlock &BB);
153
154 void getCandidatePairs(BasicBlock &BB,
155 std::multimap &CandidatePairs,
156 std::vector &PairableInsts);
157
158 void computeConnectedPairs(std::multimap &CandidatePairs,
159 std::vector &PairableInsts,
160 std::multimap &ConnectedPairs);
161
162 void buildDepMap(BasicBlock &BB,
163 std::multimap &CandidatePairs,
164 std::vector &PairableInsts,
165 DenseSet &PairableInstUsers);
166
167 void choosePairs(std::multimap &CandidatePairs,
168 std::vector &PairableInsts,
169 std::multimap &ConnectedPairs,
170 DenseSet &PairableInstUsers,
171 DenseMap& ChosenPairs);
172
173 void fuseChosenPairs(BasicBlock &BB,
174 std::vector &PairableInsts,
175 DenseMap& ChosenPairs);
176
177 bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
178
179 bool areInstsCompatible(Instruction *I, Instruction *J,
180 bool IsSimpleLoadStore);
181
182 bool trackUsesOfI(DenseSet &Users,
183 AliasSetTracker &WriteSet, Instruction *I,
184 Instruction *J, bool UpdateUsers = true,
185 std::multimap *LoadMoveSet = 0);
186
187 void computePairsConnectedTo(
188 std::multimap &CandidatePairs,
189 std::vector &PairableInsts,
190 std::multimap &ConnectedPairs,
191 ValuePair P);
192
193 bool pairsConflict(ValuePair P, ValuePair Q,
194 DenseSet &PairableInstUsers,
195 std::multimap *PairableInstUserMap = 0);
196
197 bool pairWillFormCycle(ValuePair P,
198 std::multimap &PairableInstUsers,
199 DenseSet &CurrentPairs);
200
201 void pruneTreeFor(
202 std::multimap &CandidatePairs,
203 std::vector &PairableInsts,
204 std::multimap &ConnectedPairs,
205 DenseSet &PairableInstUsers,
206 std::multimap &PairableInstUserMap,
207 DenseMap &ChosenPairs,
208 DenseMap &Tree,
209 DenseSet &PrunedTree, ValuePair J,
210 bool UseCycleCheck);
211
212 void buildInitialTreeFor(
213 std::multimap &CandidatePairs,
214 std::vector &PairableInsts,
215 std::multimap &ConnectedPairs,
216 DenseSet &PairableInstUsers,
217 DenseMap &ChosenPairs,
218 DenseMap &Tree, ValuePair J);
219
220 void findBestTreeFor(
221 std::multimap &CandidatePairs,
222 std::vector &PairableInsts,
223 std::multimap &ConnectedPairs,
224 DenseSet &PairableInstUsers,
225 std::multimap &PairableInstUserMap,
226 DenseMap &ChosenPairs,
227 DenseSet &BestTree, size_t &BestMaxDepth,
228 size_t &BestEffSize, VPIteratorPair ChoiceRange,
229 bool UseCycleCheck);
230
231 Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
232 Instruction *J, unsigned o, bool &FlipMemInputs);
233
234 void fillNewShuffleMask(LLVMContext& Context, Instruction *J,
235 unsigned NumElem, unsigned MaskOffset, unsigned NumInElem,
236 unsigned IdxOffset, std::vector &Mask);
237
238 Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I,
239 Instruction *J);
240
241 Value *getReplacementInput(LLVMContext& Context, Instruction *I,
242 Instruction *J, unsigned o, bool FlipMemInputs);
243
244 void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
245 Instruction *J, SmallVector &ReplacedOperands,
246 bool &FlipMemInputs);
247
248 void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
249 Instruction *J, Instruction *K,
250 Instruction *&InsertionPt, Instruction *&K1,
251 Instruction *&K2, bool &FlipMemInputs);
252
253 void collectPairLoadMoveSet(BasicBlock &BB,
254 DenseMap &ChosenPairs,
255 std::multimap &LoadMoveSet,
256 Instruction *I);
257
258 void collectLoadMoveSet(BasicBlock &BB,
259 std::vector &PairableInsts,
260 DenseMap &ChosenPairs,
261 std::multimap &LoadMoveSet);
262
263 bool canMoveUsesOfIAfterJ(BasicBlock &BB,
264 std::multimap &LoadMoveSet,
265 Instruction *I, Instruction *J);
266
267 void moveUsesOfIAfterJ(BasicBlock &BB,
268 std::multimap &LoadMoveSet,
269 Instruction *&InsertionPt,
270 Instruction *I, Instruction *J);
271
272 virtual bool runOnBasicBlock(BasicBlock &BB) {
273 AA = &getAnalysis();
274 SE = &getAnalysis();
275 TD = getAnalysisIfAvailable();
276
277 bool changed = false;
278 // Iterate a sufficient number of times to merge types of size 1 bit,
279 // then 2 bits, then 4, etc. up to half of the width of the target
280 // vector register.
281 for (unsigned v = 2, n = 1; v <= VectorBits && (!MaxIter || n <= MaxIter);
282 v *= 2, ++n) {
283 DEBUG(dbgs() << "BBV: fusing loop #" << n <<
284 " for " << BB.getName() << " in " <<
285 BB.getParent()->getName() << "...\n");
286 if (vectorizePairs(BB))
287 changed = true;
288 else
289 break;
290 }
291
292 DEBUG(dbgs() << "BBV: done!\n");
293 return changed;
294 }
295
296 virtual void getAnalysisUsage(AnalysisUsage &AU) const {
297 BasicBlockPass::getAnalysisUsage(AU);
298 AU.addRequired();
299 AU.addRequired();
300 AU.addPreserved();
301 AU.addPreserved();
302 }
303
304 // This returns the vector type that holds a pair of the provided type.
305 // If the provided type is already a vector, then its length is doubled.
306 static inline VectorType *getVecTypeForPair(Type *ElemTy) {
307 if (VectorType *VTy = dyn_cast(ElemTy)) {
308 unsigned numElem = VTy->getNumElements();
309 return VectorType::get(ElemTy->getScalarType(), numElem*2);
310 } else {
311 return VectorType::get(ElemTy, 2);
312 }
313 }
314
315 // Returns the weight associated with the provided value. A chain of
316 // candidate pairs has a length given by the sum of the weights of its
317 // members (one weight per pair; the weight of each member of the pair
318 // is assumed to be the same). This length is then compared to the
319 // chain-length threshold to determine if a given chain is significant
320 // enough to be vectorized. The length is also used in comparing
321 // candidate chains where longer chains are considered to be better.
322 // Note: when this function returns 0, the resulting instructions are
323 // not actually fused.
324 static inline size_t getDepthFactor(Value *V) {
325 // InsertElement and ExtractElement have a depth factor of zero. This is
326 // for two reasons: First, they cannot be usefully fused. Second, because
327 // the pass generates a lot of these, they can confuse the simple metric
328 // used to compare the trees in the next iteration. Thus, giving them a
329 // weight of zero allows the pass to essentially ignore them in
330 // subsequent iterations when looking for vectorization opportunities
331 // while still tracking dependency chains that flow through those
332 // instructions.
333 if (isa(V) || isa(V))
334 return 0;
335
336 return 1;
337 }
338
339 // This determines the relative offset of two loads or stores, returning
340 // true if the offset could be determined to be some constant value.
341 // For example, if OffsetInElmts == 1, then J accesses the memory directly
342 // after I; if OffsetInElmts == -1 then I accesses the memory
343 // directly after J. This function assumes that both instructions
344 // have the same type.
345 bool getPairPtrInfo(Instruction *I, Instruction *J,
346 Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
347 int64_t &OffsetInElmts) {
348 OffsetInElmts = 0;
349 if (isa(I)) {
350 IPtr = cast(I)->getPointerOperand();
351 JPtr = cast(J)->getPointerOperand();
352 IAlignment = cast(I)->getAlignment();
353 JAlignment = cast(J)->getAlignment();
354 } else {
355 IPtr = cast(I)->getPointerOperand();
356 JPtr = cast(J)->getPointerOperand();
357 IAlignment = cast(I)->getAlignment();
358 JAlignment = cast(J)->getAlignment();
359 }
360
361 const SCEV *IPtrSCEV = SE->getSCEV(IPtr);
362 const SCEV *JPtrSCEV = SE->getSCEV(JPtr);
363
364 // If this is a trivial offset, then we'll get something like
365 // 1*sizeof(type). With target data, which we need anyway, this will get
366 // constant folded into a number.
367 const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV);
368 if (const SCEVConstant *ConstOffSCEV =
369 dyn_cast(OffsetSCEV)) {
370 ConstantInt *IntOff = ConstOffSCEV->getValue();
371 int64_t Offset = IntOff->getSExtValue();
372
373 Type *VTy = cast(IPtr->getType())->getElementType();
374 int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy);
375
376 assert(VTy == cast(JPtr->getType())->getElementType());
377
378 OffsetInElmts = Offset/VTyTSS;
379 return (abs64(Offset) % VTyTSS) == 0;
380 }
381
382 return false;
383 }
384
385 // Returns true if the provided CallInst represents an intrinsic that can
386 // be vectorized.
387 bool isVectorizableIntrinsic(CallInst* I) {
388 Function *F = I->getCalledFunction();
389 if (!F) return false;
390
391 unsigned IID = F->getIntrinsicID();
392 if (!IID) return false;
393
394 switch(IID) {
395 default:
396 return false;
397 case Intrinsic::sqrt:
398 case Intrinsic::powi:
399 case Intrinsic::sin:
400 case Intrinsic::cos:
401 case Intrinsic::log:
402 case Intrinsic::log2:
403 case Intrinsic::log10:
404 case Intrinsic::exp:
405 case Intrinsic::exp2:
406 case Intrinsic::pow:
407 return !NoMath;
408 case Intrinsic::fma:
409 return !NoFMA;
410 }
411 }
412
413 // Returns true if J is the second element in some pair referenced by
414 // some multimap pair iterator pair.
415 template
416 bool isSecondInIteratorPair(V J, std::pair<
417 typename std::multimap::iterator,
418 typename std::multimap::iterator> PairRange) {
419 for (typename std::multimap::iterator K = PairRange.first;
420 K != PairRange.second; ++K)
421 if (K->second == J) return true;
422
423 return false;
424 }
425 };
426
427 // This function implements one vectorization iteration on the provided
428 // basic block. It returns true if the block is changed.
429 bool BBVectorize::vectorizePairs(BasicBlock &BB) {
430 std::vector PairableInsts;
431 std::multimap CandidatePairs;
432 getCandidatePairs(BB, CandidatePairs, PairableInsts);
433 if (PairableInsts.size() == 0) return false;
434
435 // Now we have a map of all of the pairable instructions and we need to
436 // select the best possible pairing. A good pairing is one such that the
437 // users of the pair are also paired. This defines a (directed) forest
438 // over the pairs such that two pairs are connected iff the second pair
439 // uses the first.
440
441 // Note that it only matters that both members of the second pair use some
442 // element of the first pair (to allow for splatting).
443
444 std::multimap ConnectedPairs;
445 computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs);
446 if (ConnectedPairs.size() == 0) return false;
447
448 // Build the pairable-instruction dependency map
449 DenseSet PairableInstUsers;
450 buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
451
452 // There is now a graph of the connected pairs. For each variable, pick the
453 // pairing with the largest tree meeting the depth requirement on at least
454 // one branch. Then select all pairings that are part of that tree and
455 // remove them from the list of available pairings and pairable variables.
456
457 DenseMap ChosenPairs;
458 choosePairs(CandidatePairs, PairableInsts, ConnectedPairs,
459 PairableInstUsers, ChosenPairs);
460
461 if (ChosenPairs.size() == 0) return false;
462 NumFusedOps += ChosenPairs.size();
463
464 // A set of pairs has now been selected. It is now necessary to replace the
465 // paired instructions with vector instructions. For this procedure each
466 // operand must be replaced with a vector operand. This vector is formed
467 // by using build_vector on the old operands. The replaced values are then
468 // replaced with a vector_extract on the result. Subsequent optimization
469 // passes should coalesce the build/extract combinations.
470
471 fuseChosenPairs(BB, PairableInsts, ChosenPairs);
472
473 return true;
474 }
475
476 // This function returns true if the provided instruction is capable of being
477 // fused into a vector instruction. This determination is based only on the
478 // type and other attributes of the instruction.
479 bool BBVectorize::isInstVectorizable(Instruction *I,
480 bool &IsSimpleLoadStore) {
481 IsSimpleLoadStore = false;
482
483 if (CallInst *C = dyn_cast(I)) {
484 if (!isVectorizableIntrinsic(C))
485 return false;
486 } else if (LoadInst *L = dyn_cast(I)) {
487 // Vectorize simple loads if possible:
488 IsSimpleLoadStore = L->isSimple();
489 if (!IsSimpleLoadStore || NoMemOps)
490 return false;
491 } else if (StoreInst *S = dyn_cast(I)) {
492 // Vectorize simple stores if possible:
493 IsSimpleLoadStore = S->isSimple();
494 if (!IsSimpleLoadStore || NoMemOps)
495 return false;
496 } else if (CastInst *C = dyn_cast(I)) {
497 // We can vectorize casts, but not casts of pointer types, etc.
498 if (NoCasts)
499 return false;
500
501 Type *SrcTy = C->getSrcTy();
502 if (!SrcTy->isSingleValueType() || SrcTy->isPointerTy())
503 return false;
504
505 Type *DestTy = C->getDestTy();
506 if (!DestTy->isSingleValueType() || DestTy->isPointerTy())
507 return false;
508 } else if (!(I->isBinaryOp() || isa(I) ||
509 isa(I) || isa(I))) {
510 return false;
511 }
512
513 // We can't vectorize memory operations without target data
514 if (TD == 0 && IsSimpleLoadStore)
515 return false;
516
517 Type *T1, *T2;
518 if (isa(I)) {
519 // For stores, it is the value type, not the pointer type that matters
520 // because the value is what will come from a vector register.
521
522 Value *IVal = cast(I)->getValueOperand();
523 T1 = IVal->getType();
524 } else {
525 T1 = I->getType();
526 }
527
528 if (I->isCast())
529 T2 = cast(I)->getSrcTy();
530 else
531 T2 = T1;
532
533 // Not every type can be vectorized...
534 if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) ||
535 !(VectorType::isValidElementType(T2) || T2->isVectorTy()))
536 return false;
537
538 if (NoInts && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy()))
539 return false;
540
541 if (NoFloats && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy()))
542 return false;
543
544 if (T1->getPrimitiveSizeInBits() > VectorBits/2 ||
545 T2->getPrimitiveSizeInBits() > VectorBits/2)
546 return false;
547
548 return true;
549 }
550
551 // This function returns true if the two provided instructions are compatible
552 // (meaning that they can be fused into a vector instruction). This assumes
553 // that I has already been determined to be vectorizable and that J is not
554 // in the use tree of I.
555 bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
556 bool IsSimpleLoadStore) {
557 DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I <<
558 " <-> " << *J << "\n");
559
560 // Loads and stores can be merged if they have different alignments,
561 // but are otherwise the same.
562 LoadInst *LI, *LJ;
563 StoreInst *SI, *SJ;
564 if ((LI = dyn_cast(I)) && (LJ = dyn_cast(J))) {
565 if (I->getType() != J->getType())
566 return false;
567
568 if (LI->getPointerOperand()->getType() !=
569 LJ->getPointerOperand()->getType() ||
570 LI->isVolatile() != LJ->isVolatile() ||
571 LI->getOrdering() != LJ->getOrdering() ||
572 LI->getSynchScope() != LJ->getSynchScope())
573 return false;
574 } else if ((SI = dyn_cast(I)) && (SJ = dyn_cast(J))) {
575 if (SI->getValueOperand()->getType() !=
576 SJ->getValueOperand()->getType() ||
577 SI->getPointerOperand()->getType() !=
578 SJ->getPointerOperand()->getType() ||
579 SI->isVolatile() != SJ->isVolatile() ||
580 SI->getOrdering() != SJ->getOrdering() ||
581 SI->getSynchScope() != SJ->getSynchScope())
582 return false;
583 } else if (!J->isSameOperationAs(I)) {
584 return false;
585 }
586 // FIXME: handle addsub-type operations!
587
588 if (IsSimpleLoadStore) {
589 Value *IPtr, *JPtr;
590 unsigned IAlignment, JAlignment;
591 int64_t OffsetInElmts = 0;
592 if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
593 OffsetInElmts) && abs64(OffsetInElmts) == 1) {
594 if (AlignedOnly) {
595 Type *aType = isa(I) ?
596 cast(I)->getValueOperand()->getType() : I->getType();
597 // An aligned load or store is possible only if the instruction
598 // with the lower offset has an alignment suitable for the
599 // vector type.
600
601 unsigned BottomAlignment = IAlignment;
602 if (OffsetInElmts < 0) BottomAlignment = JAlignment;
603
604 Type *VType = getVecTypeForPair(aType);
605 unsigned VecAlignment = TD->getPrefTypeAlignment(VType);
606 if (BottomAlignment < VecAlignment)
607 return false;
608 }
609 } else {
610 return false;
611 }
612 } else if (isa(I)) {
613 // Only merge two shuffles if they're both constant
614 return isa(I->getOperand(2)) &&
615 isa(J->getOperand(2));
616 // FIXME: We may want to vectorize non-constant shuffles also.
617 }
618
619 return true;
620 }
621
622 // Figure out whether or not J uses I and update the users and write-set
623 // structures associated with I. Specifically, Users represents the set of
624 // instructions that depend on I. WriteSet represents the set
625 // of memory locations that are dependent on I. If UpdateUsers is true,
626 // and J uses I, then Users is updated to contain J and WriteSet is updated
627 // to contain any memory locations to which J writes. The function returns
628 // true if J uses I. By default, alias analysis is used to determine
629 // whether J reads from memory that overlaps with a location in WriteSet.
630 // If LoadMoveSet is not null, then it is a previously-computed multimap
631 // where the key is the memory-based user instruction and the value is
632 // the instruction to be compared with I. So, if LoadMoveSet is provided,
633 // then the alias analysis is not used. This is necessary because this
634 // function is called during the process of moving instructions during
635 // vectorization and the results of the alias analysis are not stable during
636 // that process.
637 bool BBVectorize::trackUsesOfI(DenseSet &Users,
638 AliasSetTracker &WriteSet, Instruction *I,
639 Instruction *J, bool UpdateUsers,
640 std::multimap *LoadMoveSet) {
641 bool UsesI = false;
642
643 // This instruction may already be marked as a user due, for example, to
644 // being a member of a selected pair.
645 if (Users.count(J))
646 UsesI = true;
647
648 if (!UsesI)
649 for (User::op_iterator JU = J->op_begin(), e = J->op_end();
650 JU != e; ++JU) {
651 Value *V = *JU;
652 if (I == V || Users.count(V)) {
653 UsesI = true;
654 break;
655 }
656 }
657 if (!UsesI && J->mayReadFromMemory()) {
658 if (LoadMoveSet) {
659 VPIteratorPair JPairRange = LoadMoveSet->equal_range(J);
660 UsesI = isSecondInIteratorPair(I, JPairRange);
661 } else {
662 for (AliasSetTracker::iterator W = WriteSet.begin(),
663 WE = WriteSet.end(); W != WE; ++W) {
664 for (AliasSet::iterator A = W->begin(), AE = W->end();
665 A != AE; ++A) {
666 AliasAnalysis::Location ptrLoc(A->getValue(), A->getSize(),
667 A->getTBAAInfo());
668 if (AA->getModRefInfo(J, ptrLoc) != AliasAnalysis::NoModRef) {
669 UsesI = true;
670 break;
671 }
672 }
673 if (UsesI) break;
674 }
675 }
676 }
677
678 if (UsesI && UpdateUsers) {
679 if (J->mayWriteToMemory()) WriteSet.add(J);
680 Users.insert(J);
681 }
682
683 return UsesI;
684 }
685
686 // This function iterates over all instruction pairs in the provided
687 // basic block and collects all candidate pairs for vectorization.
688 void BBVectorize::getCandidatePairs(BasicBlock &BB,
689 std::multimap &CandidatePairs,
690 std::vector &PairableInsts) {
691 BasicBlock::iterator E = BB.end();
692 for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
693 bool IsSimpleLoadStore;
694 if (!isInstVectorizable(I, IsSimpleLoadStore)) continue;
695
696 // Look for an instruction with which to pair instruction *I...
697 DenseSet Users;
698 AliasSetTracker WriteSet(*AA);
699 BasicBlock::iterator J = I; ++J;
700 for (unsigned ss = 0; J != E && ss <= SearchLimit; ++J, ++ss) {
701 // Determine if J uses I, if so, exit the loop.
702 bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !FastDep);
703 if (FastDep) {
704 // Note: For this heuristic to be effective, independent operations
705 // must tend to be intermixed. This is likely to be true from some
706 // kinds of grouped loop unrolling (but not the generic LLVM pass),
707 // but otherwise may require some kind of reordering pass.
708
709 // When using fast dependency analysis,
710 // stop searching after first use:
711 if (UsesI) break;
712 } else {
713 if (UsesI) continue;
714 }
715
716 // J does not use I, and comes before the first use of I, so it can be
717 // merged with I if the instructions are compatible.
718 if (!areInstsCompatible(I, J, IsSimpleLoadStore)) continue;
719
720 // J is a candidate for merging with I.
721 if (!PairableInsts.size() ||
722 PairableInsts[PairableInsts.size()-1] != I) {
723 PairableInsts.push_back(I);
724 }
725 CandidatePairs.insert(ValuePair(I, J));
726 DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair "
727 << *I << " <-> " << *J << "\n");
728 }
729 }
730
731 DEBUG(dbgs() << "BBV: found " << PairableInsts.size()
732 << " instructions with candidate pairs\n");
733 }
734
735 // Finds candidate pairs connected to the pair P = . This means that
736 // it looks for pairs such that both members have an input which is an
737 // output of PI or PJ.
738 void BBVectorize::computePairsConnectedTo(
739 std::multimap &CandidatePairs,
740 std::vector &PairableInsts,
741 std::multimap &ConnectedPairs,
742 ValuePair P) {
743 // For each possible pairing for this variable, look at the uses of
744 // the first value...
745 for (Value::use_iterator I = P.first->use_begin(),
746 E = P.first->use_end(); I != E; ++I) {
747 VPIteratorPair IPairRange = CandidatePairs.equal_range(*I);
748
749 // For each use of the first variable, look for uses of the second
750 // variable...
751 for (Value::use_iterator J = P.second->use_begin(),
752 E2 = P.second->use_end(); J != E2; ++J) {
753 VPIteratorPair JPairRange = CandidatePairs.equal_range(*J);
754
755 // Look for :
756 if (isSecondInIteratorPair(*J, IPairRange))
757 ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
758
759 // Look for :
760 if (isSecondInIteratorPair(*I, JPairRange))
761 ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I)));
762 }
763
764 if (SplatBreaksChain) continue;
765 // Look for cases where just the first value in the pair is used by
766 // both members of another pair (splatting).
767 for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) {
768 if (isSecondInIteratorPair(*J, IPairRange))
769 ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
770 }
771 }
772
773 if (SplatBreaksChain) return;
774 // Look for cases where just the second value in the pair is used by
775 // both members of another pair (splatting).
776 for (Value::use_iterator I = P.second->use_begin(),
777 E = P.second->use_end(); I != E; ++I) {
778 VPIteratorPair IPairRange = CandidatePairs.equal_range(*I);
779
780 for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) {
781 if (isSecondInIteratorPair(*J, IPairRange))
782 ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
783 }
784 }
785 }
786
787 // This function figures out which pairs are connected. Two pairs are
788 // connected if some output of the first pair forms an input to both members
789 // of the second pair.
790 void BBVectorize::computeConnectedPairs(
791 std::multimap &CandidatePairs,
792 std::vector &PairableInsts,
793 std::multimap &ConnectedPairs) {
794
795 for (std::vector::iterator PI = PairableInsts.begin(),
796 PE = PairableInsts.end(); PI != PE; ++PI) {
797 VPIteratorPair choiceRange = CandidatePairs.equal_range(*PI);
798
799 for (std::multimap::iterator P = choiceRange.first;
800 P != choiceRange.second; ++P)
801 computePairsConnectedTo(CandidatePairs, PairableInsts,
802 ConnectedPairs, *P);
803 }
804
805 DEBUG(dbgs() << "BBV: found " << ConnectedPairs.size()
806 << " pair connections.\n");
807 }
808
809 // This function builds a set of use tuples such that is in the set
810 // if B is in the use tree of A. If B is in the use tree of A, then B
811 // depends on the output of A.
812 void BBVectorize::buildDepMap(
813 BasicBlock &BB,
814 std::multimap &CandidatePairs,
815 std::vector &PairableInsts,
816 DenseSet &PairableInstUsers) {
817 DenseSet IsInPair;
818 for (std::multimap::iterator C = CandidatePairs.begin(),
819 E = CandidatePairs.end(); C != E; ++C) {
820 IsInPair.insert(C->first);
821 IsInPair.insert(C->second);
822 }
823
824 // Iterate through the basic block, recording all Users of each
825 // pairable instruction.
826
827 BasicBlock::iterator E = BB.end();
828 for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
829 if (IsInPair.find(I) == IsInPair.end()) continue;
830
831 DenseSet Users;
832 AliasSetTracker WriteSet(*AA);
833 for (BasicBlock::iterator J = llvm::next(I); J != E; ++J)
834 (void) trackUsesOfI(Users, WriteSet, I, J);
835
836 for (DenseSet::iterator U = Users.begin(), E = Users.end();
837 U != E; ++U)
838 PairableInstUsers.insert(ValuePair(I, *U));
839 }
840 }
841
842 // Returns true if an input to pair P is an output of pair Q and also an
843 // input of pair Q is an output of pair P. If this is the case, then these
844 // two pairs cannot be simultaneously fused.
845 bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q,
846 DenseSet &PairableInstUsers,
847 std::multimap *PairableInstUserMap) {
848 // Two pairs are in conflict if they are mutual Users of eachother.
849 bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) ||
850 PairableInstUsers.count(ValuePair(P.first, Q.second)) ||
851 PairableInstUsers.count(ValuePair(P.second, Q.first)) ||
852 PairableInstUsers.count(ValuePair(P.second, Q.second));
853 bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) ||
854 PairableInstUsers.count(ValuePair(Q.first, P.second)) ||
855 PairableInstUsers.count(ValuePair(Q.second, P.first)) ||
856 PairableInstUsers.count(ValuePair(Q.second, P.second));
857 if (PairableInstUserMap) {
858 // FIXME: The expensive part of the cycle check is not so much the cycle
859 // check itself but this edge insertion procedure. This needs some
860 // profiling and probably a different data structure (same is true of
861 // most uses of std::multimap).
862 if (PUsesQ) {
863 VPPIteratorPair QPairRange = PairableInstUserMap->equal_range(Q);
864 if (!isSecondInIteratorPair(P, QPairRange))
865 PairableInstUserMap->insert(VPPair(Q, P));
866 }
867 if (QUsesP) {
868 VPPIteratorPair PPairRange = PairableInstUserMap->equal_range(P);
869 if (!isSecondInIteratorPair(Q, PPairRange))
870 PairableInstUserMap->insert(VPPair(P, Q));
871 }
872 }
873
874 return (QUsesP && PUsesQ);
875 }
876
877 // This function walks the use graph of current pairs to see if, starting
878 // from P, the walk returns to P.
879 bool BBVectorize::pairWillFormCycle(ValuePair P,
880 std::multimap &PairableInstUserMap,
881 DenseSet &CurrentPairs) {
882 DEBUG(if (DebugCycleCheck)
883 dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> "
884 << *P.second << "\n");
885 // A lookup table of visisted pairs is kept because the PairableInstUserMap
886 // contains non-direct associations.
887 DenseSet Visited;
888 std::vector Q;
889 // General depth-first post-order traversal:
890 Q.push_back(P);
891 while (!Q.empty()) {
892 ValuePair QTop = Q.back();
893
894 Visited.insert(QTop);
895 Q.pop_back();
896
897 DEBUG(if (DebugCycleCheck)
898 dbgs() << "BBV: cycle check visiting: " << *QTop.first << " <-> "
899 << *QTop.second << "\n");
900 VPPIteratorPair QPairRange = PairableInstUserMap.equal_range(QTop);
901 for (std::multimap::iterator C = QPairRange.first;
902 C != QPairRange.second; ++C) {
903 if (C->second == P) {
904 DEBUG(dbgs()
905 << "BBV: rejected to prevent non-trivial cycle formation: "
906 << *C->first.first << " <-> " << *C->first.second << "\n");
907 return true;
908 }
909
910 if (CurrentPairs.count(C->second) > 0 &&
911 Visited.count(C->second) == 0)
912 Q.push_back(C->second);
913 }
914 }
915
916 return false;
917 }
918
919 // This function builds the initial tree of connected pairs with the
920 // pair J at the root.
921 void BBVectorize::buildInitialTreeFor(
922 std::multimap &CandidatePairs,
923 std::vector &PairableInsts,
924 std::multimap &ConnectedPairs,
925 DenseSet &PairableInstUsers,
926 DenseMap &ChosenPairs,
927 DenseMap &Tree, ValuePair J) {
928 // Each of these pairs is viewed as the root node of a Tree. The Tree
929 // is then walked (depth-first). As this happens, we keep track of
930 // the pairs that compose the Tree and the maximum depth of the Tree.
931 std::vector Q;
932 // General depth-first post-order traversal:
933 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
934 while (!Q.empty()) {
935 ValuePairWithDepth QTop = Q.back();
936
937 // Push each child onto the queue:
938 bool MoreChildren = false;
939 size_t MaxChildDepth = QTop.second;
940 VPPIteratorPair qtRange = ConnectedPairs.equal_range(QTop.first);
941 for (std::map::iterator k = qtRange.first;
942 k != qtRange.second; ++k) {
943 // Make sure that this child pair is still a candidate:
944 bool IsStillCand = false;
945 VPIteratorPair checkRange =
946 CandidatePairs.equal_range(k->second.first);
947 for (std::multimap::iterator m = checkRange.first;
948 m != checkRange.second; ++m) {
949 if (m->second == k->second.second) {
950 IsStillCand = true;
951 break;
952 }
953 }
954
955 if (IsStillCand) {
956 DenseMap::iterator C = Tree.find(k->second);
957 if (C == Tree.end()) {
958 size_t d = getDepthFactor(k->second.first);
959 Q.push_back(ValuePairWithDepth(k->second, QTop.second+d));
960 MoreChildren = true;
961 } else {
962 MaxChildDepth = std::max(MaxChildDepth, C->second);
963 }
964 }
965 }
966
967 if (!MoreChildren) {
968 // Record the current pair as part of the Tree:
969 Tree.insert(ValuePairWithDepth(QTop.first, MaxChildDepth));
970 Q.pop_back();
971 }
972 }
973 }
974
975 // Given some initial tree, prune it by removing conflicting pairs (pairs
976 // that cannot be simultaneously chosen for vectorization).
977 void BBVectorize::pruneTreeFor(
978 std::multimap &CandidatePairs,
979 std::vector &PairableInsts,
980 std::multimap &ConnectedPairs,
981 DenseSet &PairableInstUsers,
982 std::multimap &PairableInstUserMap,
983 DenseMap &ChosenPairs,
984 DenseMap &Tree,
985 DenseSet &PrunedTree, ValuePair J,
986 bool UseCycleCheck) {
987 std::vector Q;
988 // General depth-first post-order traversal:
989 Q.push_back(ValuePairWithDepth(J, getDepthFactor(J.first)));
990 while (!Q.empty()) {
991 ValuePairWithDepth QTop = Q.back();
992 PrunedTree.insert(QTop.first);
993 Q.pop_back();
994
995 // Visit each child, pruning as necessary...
996 DenseMap BestChilden;
997 VPPIteratorPair QTopRange = ConnectedPairs.equal_range(QTop.first);
998 for (std::map::iterator K = QTopRange.first;
999 K != QTopRange.second; ++K) {
1000 DenseMap::iterator C = Tree.find(K->second);
1001 if (C == Tree.end()) continue;
1002
1003 // This child is in the Tree, now we need to make sure it is the
1004 // best of any conflicting children. There could be multiple
1005 // conflicting children, so first, determine if we're keeping
1006 // this child, then delete conflicting children as necessary.
1007
1008 // It is also necessary to guard against pairing-induced
1009 // dependencies. Consider instructions a .. x .. y .. b
1010 // such that (a,b) are to be fused and (x,y) are to be fused
1011 // but a is an input to x and b is an output from y. This
1012 // means that y cannot be moved after b but x must be moved
1013 // after b for (a,b) to be fused. In other words, after
1014 // fusing (a,b) we have y .. a/b .. x where y is an input
1015 // to a/b and x is an output to a/b: x and y can no longer
1016 // be legally fused. To prevent this condition, we must
1017 // make sure that a child pair added to the Tree is not
1018 // both an input and output of an already-selected pair.
1019
1020 // Pairing-induced dependencies can also form from more complicated
1021 // cycles. The pair vs. pair conflicts are easy to check, and so
1022 // that is done explicitly for "fast rejection", and because for
1023 // child vs. child conflicts, we may prefer to keep the current
1024 // pair in preference to the already-selected child.
1025 DenseSet CurrentPairs;
1026
1027 bool CanAdd = true;
1028 for (DenseMap::iterator C2
1029 = BestChilden.begin(), E2 = BestChilden.end();
1030 C2 != E2; ++C2) {
1031 if (C2->first.first == C->first.first ||
1032 C2->first.first == C->first.second ||
1033 C2->first.second == C->first.first ||
1034 C2->first.second == C->first.second ||
1035 pairsConflict(C2->first, C->first, PairableInstUsers,
1036 UseCycleCheck ? &PairableInstUserMap : 0)) {
1037 if (C2->second >= C->second) {
1038 CanAdd = false;
1039 break;
1040 }
1041
1042 CurrentPairs.insert(C2->first);
1043 }
1044 }
1045 if (!CanAdd) continue;
1046
1047 // Even worse, this child could conflict with another node already
1048 // selected for the Tree. If that is the case, ignore this child.
1049 for (DenseSet::iterator T = PrunedTree.begin(),
1050 E2 = PrunedTree.end(); T != E2; ++T) {
1051 if (T->first == C->first.first ||
1052 T->first == C->first.second ||
1053 T->second == C->first.first ||
1054 T->second == C->first.second ||
1055 pairsConflict(*T, C->first, PairableInstUsers,
1056 UseCycleCheck ? &PairableInstUserMap : 0)) {
1057 CanAdd = false;
1058 break;
1059 }
1060
1061 CurrentPairs.insert(*T);
1062 }
1063 if (!CanAdd) continue;
1064
1065 // And check the queue too...
1066 for (std::vector::iterator C2 = Q.begin(),
1067 E2 = Q.end(); C2 != E2; ++C2) {
1068 if (C2->first.first == C->first.first ||
1069 C2->first.first == C->first.second ||
1070 C2->first.second == C->first.first ||
1071 C2->first.second == C->first.second ||
1072 pairsConflict(C2->first, C->first, PairableInstUsers,
1073 UseCycleCheck ? &PairableInstUserMap : 0)) {
1074 CanAdd = false;
1075 break;
1076 }
1077
1078 CurrentPairs.insert(C2->first);
1079 }
1080 if (!CanAdd) continue;
1081
1082 // Last but not least, check for a conflict with any of the
1083 // already-chosen pairs.
1084 for (DenseMap::iterator C2 =
1085 ChosenPairs.begin(), E2 = ChosenPairs.end();
1086 C2 != E2; ++C2) {
1087 if (pairsConflict(*C2, C->first, PairableInstUsers,
1088 UseCycleCheck ? &PairableInstUserMap : 0)) {
1089 CanAdd = false;
1090 break;
1091 }
1092
1093 CurrentPairs.insert(*C2);
1094 }
1095 if (!CanAdd) continue;
1096
1097 // To check for non-trivial cycles formed by the addition of the
1098 // current pair we've formed a list of all relevant pairs, now use a
1099 // graph walk to check for a cycle. We start from the current pair and
1100 // walk the use tree to see if we again reach the current pair. If we
1101 // do, then the current pair is rejected.
1102
1103 // FIXME: It may be more efficient to use a topological-ordering
1104 // algorithm to improve the cycle check. This should be investigated.
1105 if (UseCycleCheck &&
1106 pairWillFormCycle(C->first, PairableInstUserMap, CurrentPairs))
1107 continue;
1108
1109 // This child can be added, but we may have chosen it in preference
1110 // to an already-selected child. Check for this here, and if a
1111 // conflict is found, then remove the previously-selected child
1112 // before adding this one in its place.
1113 for (DenseMap::iterator C2
1114 = BestChilden.begin(); C2 != BestChilden.end();) {
1115 if (C2->first.first == C->first.first ||
1116 C2->first.first == C->first.second ||
1117 C2->first.second == C->first.first ||
1118 C2->first.second == C->first.second ||
1119 pairsConflict(C2->first, C->first, PairableInstUsers))
1120 BestChilden.erase(C2++);
1121 else
1122 ++C2;
1123 }
1124
1125 BestChilden.insert(ValuePairWithDepth(C->first, C->second));
1126 }
1127
1128 for (DenseMap::iterator C
1129 = BestChilden.begin(), E2 = BestChilden.end();
1130 C != E2; ++C) {
1131 size_t DepthF = getDepthFactor(C->first.first);
1132 Q.push_back(ValuePairWithDepth(C->first, QTop.second+DepthF));
1133 }
1134 }
1135 }
1136
1137 // This function finds the best tree of mututally-compatible connected
1138 // pairs, given the choice of root pairs as an iterator range.
1139 void BBVectorize::findBestTreeFor(
1140 std::multimap &CandidatePairs,
1141 std::vector &PairableInsts,
1142 std::multimap &ConnectedPairs,
1143 DenseSet &PairableInstUsers,
1144 std::multimap &PairableInstUserMap,
1145 DenseMap &ChosenPairs,
1146 DenseSet &BestTree, size_t &BestMaxDepth,
1147 size_t &BestEffSize, VPIteratorPair ChoiceRange,
1148 bool UseCycleCheck) {
1149 for (std::multimap::iterator J = ChoiceRange.first;
1150 J != ChoiceRange.second; ++J) {
1151
1152 // Before going any further, make sure that this pair does not
1153 // conflict with any already-selected pairs (see comment below
1154 // near the Tree pruning for more details).
1155 DenseSet ChosenPairSet;
1156 bool DoesConflict = false;
1157 for (DenseMap::iterator C = ChosenPairs.begin(),
1158 E = ChosenPairs.end(); C != E; ++C) {
1159 if (pairsConflict(*C, *J, PairableInstUsers,
1160 UseCycleCheck ? &PairableInstUserMap : 0)) {
1161 DoesConflict = true;
1162 break;
1163 }
1164
1165 ChosenPairSet.insert(*C);
1166 }
1167 if (DoesConflict) continue;
1168
1169 if (UseCycleCheck &&
1170 pairWillFormCycle(*J, PairableInstUserMap, ChosenPairSet))
1171 continue;
1172
1173 DenseMap Tree;
1174 buildInitialTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
1175 PairableInstUsers, ChosenPairs, Tree, *J);
1176
1177 // Because we'll keep the child with the largest depth, the largest
1178 // depth is still the same in the unpruned Tree.
1179 size_t MaxDepth = Tree.lookup(*J);
1180
1181 DEBUG(if (DebugPairSelection) dbgs() << "BBV: found Tree for pair {"
1182 << *J->first << " <-> " << *J->second << "} of depth " <<
1183 MaxDepth << " and size " << Tree.size() << "\n");
1184
1185 // At this point the Tree has been constructed, but, may contain
1186 // contradictory children (meaning that different children of
1187 // some tree node may be attempting to fuse the same instruction).
1188 // So now we walk the tree again, in the case of a conflict,
1189 // keep only the child with the largest depth. To break a tie,
1190 // favor the first child.
1191
1192 DenseSet PrunedTree;
1193 pruneTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
1194 PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree,
1195 PrunedTree, *J, UseCycleCheck);
1196
1197 size_t EffSize = 0;
1198 for (DenseSet::iterator S = PrunedTree.begin(),
1199 E = PrunedTree.end(); S != E; ++S)
1200 EffSize += getDepthFactor(S->first);
1201
1202 DEBUG(if (DebugPairSelection)
1203 dbgs() << "BBV: found pruned Tree for pair {"
1204 << *J->first << " <-> " << *J->second << "} of depth " <<
1205 MaxDepth << " and size " << PrunedTree.size() <<
1206 " (effective size: " << EffSize << ")\n");
1207 if (MaxDepth >= ReqChainDepth && EffSize > BestEffSize) {
1208 BestMaxDepth = MaxDepth;
1209 BestEffSize = EffSize;
1210 BestTree = PrunedTree;
1211 }
1212 }
1213 }
1214
1215 // Given the list of candidate pairs, this function selects those
1216 // that will be fused into vector instructions.
1217 void BBVectorize::choosePairs(
1218 std::multimap &CandidatePairs,
1219 std::vector &PairableInsts,
1220 std::multimap &ConnectedPairs,
1221 DenseSet &PairableInstUsers,
1222 DenseMap& ChosenPairs) {
1223 bool UseCycleCheck = CandidatePairs.size() <= MaxCandPairsForCycleCheck;
1224 std::multimap PairableInstUserMap;
1225 for (std::vector::iterator I = PairableInsts.begin(),
1226 E = PairableInsts.end(); I != E; ++I) {
1227 // The number of possible pairings for this variable:
1228 size_t NumChoices = CandidatePairs.count(*I);
1229 if (!NumChoices) continue;
1230
1231 VPIteratorPair ChoiceRange = CandidatePairs.equal_range(*I);
1232
1233 // The best pair to choose and its tree:
1234 size_t BestMaxDepth = 0, BestEffSize = 0;
1235 DenseSet BestTree;
1236 findBestTreeFor(CandidatePairs, PairableInsts, ConnectedPairs,
1237 PairableInstUsers, PairableInstUserMap, ChosenPairs,
1238 BestTree, BestMaxDepth, BestEffSize, ChoiceRange,
1239 UseCycleCheck);
1240
1241 // A tree has been chosen (or not) at this point. If no tree was
1242 // chosen, then this instruction, I, cannot be paired (and is no longer
1243 // considered).
1244
1245 DEBUG(if (BestTree.size() > 0)
1246 dbgs() << "BBV: selected pairs in the best tree for: "
1247 << *cast(*I) << "\n");
1248
1249 for (DenseSet::iterator S = BestTree.begin(),
1250 SE2 = BestTree.end(); S != SE2; ++S) {
1251 // Insert the members of this tree into the list of chosen pairs.
1252 ChosenPairs.insert(ValuePair(S->first, S->second));
1253 DEBUG(dbgs() << "BBV: selected pair: " << *S->first << " <-> " <<
1254 *S->second << "\n");
1255
1256 // Remove all candidate pairs that have values in the chosen tree.
1257 for (std::multimap::iterator K =
1258 CandidatePairs.begin(); K != CandidatePairs.end();) {
1259 if (K->first == S->first || K->second == S->first ||
1260 K->second == S->second || K->first == S->second) {
1261 // Don't remove the actual pair chosen so that it can be used
1262 // in subsequent tree selections.
1263 if (!(K->first == S->first && K->second == S->second))
1264 CandidatePairs.erase(K++);
1265 else
1266 ++K;
1267 } else {
1268 ++K;
1269 }
1270 }
1271 }
1272 }
1273
1274 DEBUG(dbgs() << "BBV: selected " << ChosenPairs.size() << " pairs.\n");
1275 }
1276
1277 std::string getReplacementName(Instruction *I, bool IsInput, unsigned o,
1278 unsigned n = 0) {
1279 if (!I->hasName())
1280 return "";
1281
1282 return (I->getName() + (IsInput ? ".v.i" : ".v.r") + utostr(o) +
1283 (n > 0 ? "." + utostr(n) : "")).str();
1284 }
1285
1286 // Returns the value that is to be used as the pointer input to the vector
1287 // instruction that fuses I with J.
1288 Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context,
1289 Instruction *I, Instruction *J, unsigned o,
1290 bool &FlipMemInputs) {
1291 Value *IPtr, *JPtr;
1292 unsigned IAlignment, JAlignment;
1293 int64_t OffsetInElmts;
1294 (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
1295 OffsetInElmts);
1296
1297 // The pointer value is taken to be the one with the lowest offset.
1298 Value *VPtr;
1299 if (OffsetInElmts > 0) {
1300 VPtr = IPtr;
1301 } else {
1302 FlipMemInputs = true;
1303 VPtr = JPtr;
1304 }
1305
1306 Type *ArgType = cast(IPtr->getType())->getElementType();
1307 Type *VArgType = getVecTypeForPair(ArgType);
1308 Type *VArgPtrType = PointerType::get(VArgType,
1309 cast(IPtr->getType())->getAddressSpace());
1310 return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o),
1311 /* insert before */ FlipMemInputs ? J : I);
1312 }
1313
1314 void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J,
1315 unsigned NumElem, unsigned MaskOffset, unsigned NumInElem,
1316 unsigned IdxOffset, std::vector &Mask) {
1317 for (unsigned v = 0; v < NumElem/2; ++v) {
1318 int m = cast(J)->getMaskValue(v);
1319 if (m < 0) {
1320 Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context));
1321 } else {
1322 unsigned mm = m + (int) IdxOffset;
1323 if (m >= (int) NumInElem)
1324 mm += (int) NumInElem;
1325
1326 Mask[v+MaskOffset] =
1327 ConstantInt::get(Type::getInt32Ty(Context), mm);
1328 }
1329 }
1330 }
1331
1332 // Returns the value that is to be used as the vector-shuffle mask to the
1333 // vector instruction that fuses I with J.
1334 Value *BBVectorize::getReplacementShuffleMask(LLVMContext& Context,
1335 Instruction *I, Instruction *J) {
1336 // This is the shuffle mask. We need to append the second
1337 // mask to the first, and the numbers need to be adjusted.
1338
1339 Type *ArgType = I->getType();
1340 Type *VArgType = getVecTypeForPair(ArgType);
1341
1342 // Get the total number of elements in the fused vector type.
1343 // By definition, this must equal the number of elements in
1344 // the final mask.
1345 unsigned NumElem = cast(VArgType)->getNumElements();
1346 std::vector Mask(NumElem);
1347
1348 Type *OpType = I->getOperand(0)->getType();
1349 unsigned NumInElem = cast(OpType)->getNumElements();
1350
1351 // For the mask from the first pair...
1352 fillNewShuffleMask(Context, I, NumElem, 0, NumInElem, 0, Mask);
1353
1354 // For the mask from the second pair...
1355 fillNewShuffleMask(Context, J, NumElem, NumElem/2, NumInElem, NumInElem,
1356 Mask);
1357
1358 return ConstantVector::get(Mask);
1359 }
1360
1361 // Returns the value to be used as the specified operand of the vector
1362 // instruction that fuses I with J.
1363 Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I,
1364 Instruction *J, unsigned o, bool FlipMemInputs) {
1365 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
1366 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
1367
1368 // Compute the fused vector type for this operand
1369 Type *ArgType = I->getOperand(o)->getType();
1370 VectorType *VArgType = getVecTypeForPair(ArgType);
1371
1372 Instruction *L = I, *H = J;
1373 if (FlipMemInputs) {
1374 L = J;
1375 H = I;
1376 }
1377
1378 if (ArgType->isVectorTy()) {
1379 unsigned numElem = cast(VArgType)->getNumElements();
1380 std::vector Mask(numElem);
1381 for (unsigned v = 0; v < numElem; ++v)
1382 Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
1383
1384 Instruction *BV = new ShuffleVectorInst(L->getOperand(o),
1385 H->getOperand(o),
1386 ConstantVector::get(Mask),
1387 getReplacementName(I, true, o));
1388 BV->insertBefore(J);
1389 return BV;
1390 }
1391
1392 // If these two inputs are the output of another vector instruction,
1393 // then we should use that output directly. It might be necessary to
1394 // permute it first. [When pairings are fused recursively, you can
1395 // end up with cases where a large vector is decomposed into scalars
1396 // using extractelement instructions, then built into size-2
1397 // vectors using insertelement and the into larger vectors using
1398 // shuffles. InstCombine does not simplify all of these cases well,
1399 // and so we make sure that shuffles are generated here when possible.
1400 ExtractElementInst *LEE
1401 = dyn_cast(L->getOperand(o));
1402 ExtractElementInst *HEE
1403 = dyn_cast(H->getOperand(o));
1404
1405 if (LEE && HEE &&
1406 LEE->getOperand(0)->getType() == HEE->getOperand(0)->getType()) {
1407 VectorType *EEType = cast(LEE->getOperand(0)->getType());
1408 unsigned LowIndx = cast(LEE->getOperand(1))->getZExtValue();
1409 unsigned HighIndx = cast(HEE->getOperand(1))->getZExtValue();
1410 if (LEE->getOperand(0) == HEE->getOperand(0)) {
1411 if (LowIndx == 0 && HighIndx == 1)
1412 return LEE->getOperand(0);
1413
1414 std::vector Mask(2);
1415 Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx);
1416 Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx);
1417
1418 Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0),
1419 UndefValue::get(EEType),
1420 ConstantVector::get(Mask),
1421 getReplacementName(I, true, o));
1422 BV->insertBefore(J);
1423 return BV;
1424 }
1425
1426 std::vector Mask(2);
1427 HighIndx += EEType->getNumElements();
1428 Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx);
1429 Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx);
1430
1431 Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0),
1432 HEE->getOperand(0),
1433 ConstantVector::get(Mask),
1434 getReplacementName(I, true, o));
1435 BV->insertBefore(J);
1436 return BV;
1437 }
1438
1439 Instruction *BV1 = InsertElementInst::Create(
1440 UndefValue::get(VArgType),
1441 L->getOperand(o), CV0,
1442 getReplacementName(I, true, o, 1));
1443 BV1->insertBefore(I);
1444 Instruction *BV2 = InsertElementInst::Create(BV1, H->getOperand(o),
1445 CV1,
1446 getReplacementName(I, true, o, 2));
1447 BV2->insertBefore(J);
1448 return BV2;
1449 }
1450
1451 // This function creates an array of values that will be used as the inputs
1452 // to the vector instruction that fuses I with J.
1453 void BBVectorize::getReplacementInputsForPair(LLVMContext& Context,
1454 Instruction *I, Instruction *J,
1455 SmallVector &ReplacedOperands,
1456 bool &FlipMemInputs) {
1457 FlipMemInputs = false;
1458 unsigned NumOperands = I->getNumOperands();
1459
1460 for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) {
1461 // Iterate backward so that we look at the store pointer
1462 // first and know whether or not we need to flip the inputs.
1463
1464 if (isa(I) || (o == 1 && isa(I))) {
1465 // This is the pointer for a load/store instruction.
1466 ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o,
1467 FlipMemInputs);
1468 continue;
1469 } else if (isa(I) && o == NumOperands-1) {
1470 Function *F = cast(I)->getCalledFunction();
1471 unsigned IID = F->getIntrinsicID();
1472 BasicBlock &BB = *I->getParent();
1473
1474 Module *M = BB.getParent()->getParent();
1475 Type *ArgType = I->getType();
1476 Type *VArgType = getVecTypeForPair(ArgType);
1477
1478 // FIXME: is it safe to do this here?
1479 ReplacedOperands[o] = Intrinsic::getDeclaration(M,
1480 (Intrinsic::ID) IID, VArgType);
1481 continue;
1482 } else if (isa(I) && o == NumOperands-1) {
1483 ReplacedOperands[o] = getReplacementShuffleMask(Context, I, J);
1484 continue;
1485 }
1486
1487 ReplacedOperands[o] =
1488 getReplacementInput(Context, I, J, o, FlipMemInputs);
1489 }
1490 }
1491
1492 // This function creates two values that represent the outputs of the
1493 // original I and J instructions. These are generally vector shuffles
1494 // or extracts. In many cases, these will end up being unused and, thus,
1495 // eliminated by later passes.
//
// K is the instruction whose result is split: K1 and K2 receive the values
// created for I and J respectively, and InsertionPt is advanced to K2.
// When the isa<> check on I succeeds, no values are created: K1, K2 and
// InsertionPt are left untouched and only the alias analysis is updated.
// FlipMemInputs is only read here; when set, the two halves (or lanes) of K
// are handed out in swapped order.
1496 void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
1497 Instruction *J, Instruction *K,
1498 Instruction *&InsertionPt,
1499 Instruction *&K1, Instruction *&K2,
1500 bool &FlipMemInputs) {
// i32 constants 0 and 1, used as lane indices in the scalar (extract) case.
1501 Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0);
1502 Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1);
1503
// NOTE(review): the type argument of this isa<> was lost in this rendering;
// presumably it selects instructions that produce no usable value - confirm.
1504 if (isa(I)) {
1505 AA->replaceWithNewValue(I, K);
1506 AA->replaceWithNewValue(J, K);
1507 } else {
1508 Type *IType = I->getType();
1509 Type *VType = getVecTypeForPair(IType);
1510
1511 if (IType->isVectorTy()) {
// The inputs were already vectors: split K with two shuffles. Mask1 holds
// indices [0, numElem) and Mask2 holds indices [numElem, 2*numElem).
1512 unsigned numElem = cast(IType)->getNumElements();
1513 std::vector Mask1(numElem), Mask2(numElem);
1514 for (unsigned v = 0; v < numElem; ++v) {
1515 Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
1516 Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElem+v);
1517 }
1518
// The second shuffle operand is undef; both masks are built from the index
// ranges described above.
1519 K1 = new ShuffleVectorInst(K, UndefValue::get(VType),
1520 ConstantVector::get(
1521 FlipMemInputs ? Mask2 : Mask1),
1522 getReplacementName(K, false, 1));
1523 K2 = new ShuffleVectorInst(K, UndefValue::get(VType),
1524 ConstantVector::get(
1525 FlipMemInputs ? Mask1 : Mask2),
1526 getReplacementName(K, false, 2));
1527 } else {
// The inputs were scalars: extract lane 0 and lane 1 of K (swapped when
// FlipMemInputs is set).
1528 K1 = ExtractElementInst::Create(K, FlipMemInputs ? CV1 : CV0,
1529 getReplacementName(K, false, 1));
1530 K2 = ExtractElementInst::Create(K, FlipMemInputs ? CV0 : CV1,
1531 getReplacementName(K, false, 2));
1532 }
1533
// Resulting order in the block: K, K1, K2.
1534 K1->insertAfter(K);
1535 K2->insertAfter(K1);
1536 InsertionPt = K2;
1537 }
1538 }
1539
1540 // Check whether all uses of the instruction I (including pairing-induced uses) can be moved after J.
// Nothing is moved here. The instructions strictly between I and J are fed
// to trackUsesOfI to build up the user set and write set, and J is then
// tested against them. Returns false when J itself turns out to be a
// (possibly indirect) user of I, true otherwise.
1541 bool BBVectorize::canMoveUsesOfIAfterJ(BasicBlock &BB,
1542 std::multimap &LoadMoveSet,
1543 Instruction *I, Instruction *J) {
1544 // Skip to the first instruction past I.
1545 BasicBlock::iterator L = BB.begin();
1546 for (; cast(L) != I; ++L);
1547 ++L;
1548
1549 DenseSet Users;
1550 AliasSetTracker WriteSet(*AA);
// The return value is deliberately discarded: only the accumulated state in
// Users and WriteSet matters for the final query on J below.
1551 for (; cast(L) != J; ++L)
1552 (void) trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet);
1553
1554 assert(cast(L) == J &&
1555 "Tracking has not proceeded far enough to check for dependencies");
1556 // If J is now in the use set of I, then trackUsesOfI will return true
1557 // and we have a dependency cycle (and the fusing operation must abort).
1558 return !trackUsesOfI(Users, WriteSet, I, J, true, &LoadMoveSet);
1559 }
1560
1561 // Move all uses of the instruction I (including pairing-induced uses) after J.
// Every instruction between I and J for which trackUsesOfI returns true is
// unlinked and re-inserted after InsertionPt. InsertionPt is updated to the
// instruction just moved, so successive moves are chained one after another.
1562 void BBVectorize::moveUsesOfIAfterJ(BasicBlock &BB,
1563 std::multimap &LoadMoveSet,
1564 Instruction *&InsertionPt,
1565 Instruction *I, Instruction *J) {
1566 // Skip to the first instruction past I.
1567 BasicBlock::iterator L = BB.begin();
1568 for (; cast(L) != I; ++L);
1569 ++L;
1570
1571 DenseSet Users;
1572 AliasSetTracker WriteSet(*AA);
// The loop has no increment clause: L is advanced inside the body, before
// the current instruction is unlinked from the block.
1573 for (; cast(L) != J;) {
1574 if (trackUsesOfI(Users, WriteSet, I, L, true, &LoadMoveSet)) {
1575 // Move this instruction
1576 Instruction *InstToMove = L; ++L;
1577
1578 DEBUG(dbgs() << "BBV: moving: " << *InstToMove <<
1579 " to after " << *InsertionPt << "\n");
1580 InstToMove->removeFromParent();
1581 InstToMove->insertAfter(InsertionPt);
1582 InsertionPt = InstToMove;
1583 } else {
1584 ++L;
1585 }
1586 }
1587 }
1588
1589 // Collect all load instructions that are in the move set of a given first
1590 // pair member. These loads depend on the first instruction, I, and so need
1591 // to be moved after J (the second instruction) when the pair is fused.
// Each qualifying instruction L is recorded in LoadMoveSet as the entry
// (L, I), i.e. keyed by the memory-reading instruction. The ChosenPairs
// parameter is not referenced in this function body.
1592 void BBVectorize::collectPairLoadMoveSet(BasicBlock &BB,
1593 DenseMap &ChosenPairs,
1594 std::multimap &LoadMoveSet,
1595 Instruction *I) {
1596 // Skip to the first instruction past I.
1597 BasicBlock::iterator L = BB.begin();
1598 for (; cast(L) != I; ++L);
1599 ++L;
1600
1601 DenseSet Users;
1602 AliasSetTracker WriteSet(*AA);
1603
1604 // Note: We cannot end the loop when we reach J because J could be moved
1605 // farther down the use chain by another instruction pairing. Also, J
1606 // could be before I if this is an inverted input.
1607 for (BasicBlock::iterator E = BB.end(); cast(L) != E; ++L) {
// trackUsesOfI is called here without the two trailing arguments that the
// move/check helpers pass (true, &LoadMoveSet).
1608 if (trackUsesOfI(Users, WriteSet, I, L)) {
// Only instructions that may read memory are recorded.
1609 if (L->mayReadFromMemory())
1610 LoadMoveSet.insert(ValuePair(L, I));
1611 }
1612 }
1613 }
1614
1615 // In cases where both load/stores and the computation of their pointers
1616 // are chosen for vectorization, we can end up in a situation where the
1617 // aliasing analysis starts returning different query results as the
1618 // process of fusing instruction pairs continues. Because the algorithm
1619 // relies on finding the same use trees here as were found earlier, we'll
1620 // need to precompute the necessary aliasing information here and then
1621 // manually update it during the fusion process.
//
// Walks PairableInsts in order and, for every instruction that is a key in
// ChosenPairs, adds that instruction's load move set to LoadMoveSet via
// collectPairLoadMoveSet.
1622 void BBVectorize::collectLoadMoveSet(BasicBlock &BB,
1623 std::vector &PairableInsts,
1624 DenseMap &ChosenPairs,
1625 std::multimap &LoadMoveSet) {
1626 for (std::vector::iterator PI = PairableInsts.begin(),
1627 PIE = PairableInsts.end(); PI != PIE; ++PI) {
1628 DenseMap::iterator P = ChosenPairs.find(*PI);
// Instructions that are not a key of a chosen pair contribute nothing.
1629 if (P == ChosenPairs.end()) continue;
1630
1631 Instruction *I = cast(P->first);
1632 collectPairLoadMoveSet(BB, ChosenPairs, LoadMoveSet, I);
1633 }
1634 }
1635
1636 // This function fuses the chosen instruction pairs into vector instructions,
1637 // taking care to preserve any needed scalar outputs and, then, it reorders the
1638 // remaining instructions as needed (users of the first member of the pair
1639 // need to be moved to after the location of the second member of the pair
1640 // because the vector instruction is inserted in the location of the pair's
1641 // second member).
//
// ChosenPairs is modified in place: flipped copies of every pair are added
// up front, and both orientations are erased once a pair has been handled.
// For each fused pair (I, J) a clone K of I is retyped, given the replacement
// operands, inserted after J, and I and J are erased from the block.
1642 void BBVectorize::fuseChosenPairs(BasicBlock &BB,
1643 std::vector &PairableInsts,
1644 DenseMap &ChosenPairs) {
1645 LLVMContext& Context = BB.getContext();
1646
1647 // During the vectorization process, the order of the pairs to be fused
1648 // could be flipped. So we'll add each pair, flipped, into the ChosenPairs
1649 // list. After a pair is fused, the flipped pair is removed from the list.
// The flipped pairs are staged in a temporary vector and inserted in a
// second pass, so ChosenPairs is never modified while it is being iterated.
1650 std::vector FlippedPairs;
1651 FlippedPairs.reserve(ChosenPairs.size());
1652 for (DenseMap::iterator P = ChosenPairs.begin(),
1653 E = ChosenPairs.end(); P != E; ++P)
1654 FlippedPairs.push_back(ValuePair(P->second, P->first));
1655 for (std::vector::iterator P = FlippedPairs.begin(),
1656 E = FlippedPairs.end(); P != E; ++P)
1657 ChosenPairs.insert(*P);
1658
1659 std::multimap LoadMoveSet;
1660 collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet);
1661
1662 DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n");
1663
// PI is advanced explicitly on every path because the loop body may erase
// the instruction it currently points at.
1664 for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) {
1665 DenseMap::iterator P = ChosenPairs.find(PI);
1666 if (P == ChosenPairs.end()) {
1667 ++PI;
1668 continue;
1669 }
1670
1671 if (getDepthFactor(P->first) == 0) {
1672 // These instructions are not really fused, but are tracked as though
1673 // they are. Any case in which it would be interesting to fuse them
1674 // will be taken care of by InstCombine.
// NOTE(review): on this path neither the pair nor its flipped twin is
// erased from ChosenPairs.
1675 --NumFusedOps;
1676 ++PI;
1677 continue;
1678 }
1679
1680 Instruction *I = cast(P->first),
1681 *J = cast(P->second);
1682
1683 DEBUG(dbgs() << "BBV: fusing: " << *I <<
1684 " <-> " << *J << "\n");
1685
1686 // Remove the pair and flipped pair from the list.
1687 DenseMap::iterator FP = ChosenPairs.find(P->second);
1688 assert(FP != ChosenPairs.end() && "Flipped pair not found in list");
1689 ChosenPairs.erase(FP);
1690 ChosenPairs.erase(P);
1691
1692 if (!canMoveUsesOfIAfterJ(BB, LoadMoveSet, I, J)) {
1693 DEBUG(dbgs() << "BBV: fusion of: " << *I <<
1694 " <-> " << *J <<
1695 " aborted because of non-trivial dependency cycle\n");
1696 --NumFusedOps;
1697 ++PI;
1698 continue;
1699 }
1700
// NOTE(review): FlipMemInputs is declared without an initializer and is read
// below; this relies on getReplacementInputsForPair assigning it on every
// path - confirm against that function.
1701 bool FlipMemInputs;
1702 unsigned NumOperands = I->getNumOperands();
1703 SmallVector ReplacedOperands(NumOperands);
1704 getReplacementInputsForPair(Context, I, J, ReplacedOperands,
1705 FlipMemInputs);
1706
1707 // Make a copy of the original operation, change its type to the vector
1708 // type and replace its operands with the vector operands.
1709 Instruction *K = I->clone();
1710 if (I->hasName()) K->takeName(I);
1711
1712 if (!isa(K))
1713 K->mutateType(getVecTypeForPair(I->getType()));
1714
1715 for (unsigned o = 0; o < NumOperands; ++o)
1716 K->setOperand(o, ReplacedOperands[o]);
1717
1718 // If we've flipped the memory inputs, make sure that we take the correct
1719 // alignment.
// K was cloned from I, so without this it would keep I's alignment; when the
// inputs are flipped, J's alignment is used instead.
1720 if (FlipMemInputs) {
1721 if (isa(K))
1722 cast(K)->setAlignment(cast(J)->getAlignment());
1723 else
1724 cast(K)->setAlignment(cast(J)->getAlignment());
1725 }
1726
1727 K->insertAfter(J);
1728
1729 // Instruction insertion point:
1730 Instruction *InsertionPt = K;
1731 Instruction *K1 = 0, *K2 = 0;
1732 replaceOutputsOfPair(Context, I, J, K, InsertionPt, K1, K2,
1733 FlipMemInputs);
1734
1735 // The use tree of the first original instruction must be moved to after
1736 // the location of the second instruction. The entire use tree of the
1737 // first instruction is disjoint from the input tree of the second
1738 // (by definition), and so commutes with it.
1739
1740 moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J);
1741
// Redirect the scalar users of I and J to the values K1 and K2.
// NOTE(review): K1/K2 are still null if replaceOutputsOfPair took its isa<>
// branch; presumably this isa<> tests the same type - confirm.
1742 if (!isa(I)) {
1743 I->replaceAllUsesWith(K1);
1744 J->replaceAllUsesWith(K2);
1745 AA->replaceWithNewValue(I, K1);
1746 AA->replaceWithNewValue(J, K2);
1747 }
1748
1749 // Instructions that may read from memory may be in the load move set.
1750 // Once an instruction is fused, we no longer need its move set, and so
1751 // the values of the map never need to be updated. However, when a load
1752 // is fused, we need to merge the entries from both instructions in the
1753 // pair in case those instructions were in the move set of some other
1754 // yet-to-be-fused pair. The loads in question are the keys of the map.
1755 if (I->mayReadFromMemory()) {
// New entries keyed by K are collected first and inserted afterwards, so the
// multimap is not modified while its equal_range iterators are in use.
1756 std::vector NewSetMembers;
1757 VPIteratorPair IPairRange = LoadMoveSet.equal_range(I);
1758 VPIteratorPair JPairRange = LoadMoveSet.equal_range(J);
1759 for (std::multimap::iterator N = IPairRange.first;
1760 N != IPairRange.second; ++N)
1761 NewSetMembers.push_back(ValuePair(K, N->second));
1762 for (std::multimap::iterator N = JPairRange.first;
1763 N != JPairRange.second; ++N)
1764 NewSetMembers.push_back(ValuePair(K, N->second));
1765 for (std::vector::iterator A = NewSetMembers.begin(),
1766 AE = NewSetMembers.end(); A != AE; ++A)
1767 LoadMoveSet.insert(*A);
1768 }
1769
1770 // Before removing I, set the iterator to the next instruction.
// J is erased as well, so step over it if it immediately follows I.
1771 PI = llvm::next(BasicBlock::iterator(I));
1772 if (cast(PI) == J)
1773 ++PI;
1774
1775 SE->forgetValue(I);
1776 SE->forgetValue(J);
1777 I->eraseFromParent();
1778 J->eraseFromParent();
1779 }
1780
1781 DEBUG(dbgs() << "BBV: final: \n" << BB << "\n");
1782 }
1783 }
1784
// Pass identification and registration. The pass is registered under
// BBV_NAME with the description string below, and declares AliasAnalysis
// and ScalarEvolution as dependencies.
1785 char BBVectorize::ID = 0;
1786 static const char bb_vectorize_name[] = "Basic-Block Vectorization";
1787 INITIALIZE_PASS_BEGIN(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
1788 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
1789 INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
1790 INITIALIZE_PASS_END(BBVectorize, BBV_NAME, bb_vectorize_name, false, false)
1791
// Public factory: returns a newly allocated BBVectorize pass instance.
1792 BasicBlockPass *llvm::createBBVectorizePass() {
1793 return new BBVectorize();
1794 }
1795
0 add_llvm_library(LLVMVectorize
1 BBVectorize.cpp
2 Vectorize.cpp
3 )
0 ;===- ./lib/Transforms/Vectorize/LLVMBuild.txt -----------------*- Conf -*--===;
1 ;
2 ; The LLVM Compiler Infrastructure
3 ;
4 ; This file is distributed under the University of Illinois Open Source
5 ; License. See LICENSE.TXT for details.
6 ;
7 ;===------------------------------------------------------------------------===;
8 ;
9 ; This is an LLVMBuild description file for the components in this subdirectory.
10 ;
11 ; For more information on the LLVMBuild system, please see:
12 ;
13 ; http://llvm.org/docs/LLVMBuild.html
14 ;
15 ;===------------------------------------------------------------------------===;
16
17 [component_0]
18 type = Library
19 name = Vectorize
20 parent = Transforms
21 library_name = Vectorize
22 required_libraries = Analysis Core InstCombine Support Target TransformUtils
23
0 ##===- lib/Transforms/Vectorize/Makefile -----------------*- Makefile -*-===##
1 #
2 # The LLVM Compiler Infrastructure
3 #
4 # This file is distributed under the University of Illinois Open Source
5 # License. See LICENSE.TXT for details.
6 #
7 ##===----------------------------------------------------------------------===##
8
9 LEVEL = ../../..
10 LIBRARYNAME = LLVMVectorize
11 BUILD_ARCHIVE = 1
12
13 include $(LEVEL)/Makefile.common
14
0 //===-- Vectorize.cpp -----------------------------------------------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements common infrastructure for libLLVMVectorize.a, which
10 // implements several vectorization transformations over the LLVM intermediate
11 // representation, including the C bindings for that library.
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include "llvm-c/Transforms/Vectorize.h"
16 #include "llvm-c/Initialization.h"
17 #include "llvm/InitializePasses.h"
18 #include "llvm/PassManager.h"
19 #include "llvm/Analysis/Passes.h"
20 #include "llvm/Analysis/Verifier.h"
21 #include "llvm/Transforms/Vectorize.h"
22
23 using namespace llvm;
24
25 /// initializeVectorization - Initialize all passes linked into the
26 /// Vectorization library.
/// Currently this registers only the BBVectorize pass with \p Registry.
27 void llvm::initializeVectorization(PassRegistry &Registry) {
28 initializeBBVectorizePass(Registry);
29 }
30
/// C binding: unwraps the pass-registry handle \p R and forwards it to
/// initializeVectorization.
31 void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
32 initializeVectorization(*unwrap(R));
33 }
34
/// C binding: creates a new BBVectorize pass and adds it to the pass manager
/// wrapped by \p PM.
35 void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) {
36 unwrap(PM)->add(createBBVectorizePass());
37 }
38
0 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
1 ; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
2
3 ; This test checks the non-trivial pairing-induced cycle avoidance. Without this cycle avoidance, the algorithm would otherwise
4 ; want to select the pairs:
5 ; %div77 = fdiv double %sub74, %mul76.v.r1 <-> %div125 = fdiv double %mul121, %mul76.v.r2 (div125 depends on mul117)
6 ; %add84 = fadd double %sub83, 2.000000e+00 <-> %add127 = fadd double %mul126, 1.000000e+00 (add127 depends on div77)
7 ; %mul95 = fmul double %sub45.v.r1, %sub36.v.r1 <-> %mul88 = fmul double %sub36.v.r1, %sub87 (mul88 depends on add84)
8 ; %mul117 = fmul double %sub39.v.r1, %sub116 <-> %mul97 = fmul double %mul96, %sub39.v.r1 (mul97 depends on mul95)
9 ; and so a dependency cycle would be created.
10
11 declare double @fabs(double) nounwind readnone
12 define void @test1(double %a, double %b, double %c, double %add80, double %mul1, double %mul2.v.r1, double %mul73, double %sub, double %sub65, double %F.0, i32 %n.0, double %Bnm3.0, double %Bnm2.0, double %Bnm1.0, double %Anm3.0, double %Anm2.0, double %Anm1.0) {
13 entry:
14 br label %go
15 go:
16 %conv = sitofp i32 %n.0 to double
17 %add35 = fadd double %conv, %a
18 %sub36 = fadd double %add35, -1.000000e+00
19 %add38 = fadd double %conv, %b
20 %sub39 = fadd double %add38, -1.000000e+00
21 %add41 = fadd double %conv, %c
22 %sub42 = fadd double %add41, -1.000000e+00
23 %sub45 = fadd double %add35, -2.000000e+00
24 %sub48 = fadd double %add38, -2.000000e+00
25 %sub51 = fadd double %add41, -2.000000e+00
26 %mul52 = shl nsw i32 %n.0, 1
27 %sub53 = add nsw i32 %mul52, -1
28 %conv54 = sitofp i32 %sub53 to double
29 %sub56 = add nsw i32 %mul52, -3
30 %conv57 = sitofp i32 %sub56 to double
31 %sub59 = add nsw i32 %mul52, -5
32 %conv60 = sitofp i32 %sub59 to double
33 %mul61 = mul nsw i32 %n.0, %n.0
34 %conv62 = sitofp i32 %mul61 to double
35 %mul63 = fmul double %conv62, 3.000000e+00
36 %mul67 = fmul double %sub65, %conv
37 %add68 = fadd double %mul63, %mul67
38 %add69 = fadd double %add68, 2.000000e+00
39 %sub71 = fsub double %add69, %mul2.v.r1
40 %sub74 = fsub double %sub71, %mul73
41 %mul75 = fmul double %conv57, 2.000000e+00
42 %mul76 = fmul double %mul75, %sub42
43 %div77 = fdiv double %sub74, %mul76
44 %mul82 = fmul double %add80, %conv
45 %sub83 = fsub double %mul63, %mul82
46 %add84 = fadd double %sub83, 2.000000e+00
47 %sub86 = fsub double %add84, %mul2.v.r1
48 %sub87 = fsub double -0.000000e+00, %sub86
49 %mul88 = fmul double %sub36, %sub87
50 %mul89 = fmul double %mul88, %sub39
51 %mul90 = fmul double %conv54, 4.000000e+00
52 %mul91 = fmul double %mul90, %conv57
53 %mul92 = fmul double %mul91, %sub51
54 %mul93 = fmul double %mul92, %sub42
55 %div94 = fdiv double %mul89, %mul93
56 %mul95 = fmul double %sub45, %sub36
57 %mul96 = fmul double %mul95, %sub48
58 %mul97 = fmul double %mul96, %sub39
59 %sub99 = fsub double %conv, %a
60 %sub100 = fadd double %sub99, -2.000000e+00
61 %mul101 = fmul double %mul97, %sub100
62 %sub103 = fsub double %conv, %b
63 %sub104 = fadd double %sub103, -2.000000e+00
64 %mul105 = fmul double %mul101, %sub104
65 %mul106 = fmul double %conv57, 8.000000e+00
66 %mul107 = fmul double %mul106, %conv57
67 %mul108 = fmul double %mul107, %conv60
68 %sub111 = fadd double %add41, -3.000000e+00
69 %mul112 = fmul double %mul108, %sub111
70 %mul113 = fmul double %mul112, %sub51
71 %mul114 = fmul double %mul113, %sub42
72 %div115 = fdiv double %mul105, %mul114
73 %sub116 = fsub double -0.000000e+00, %sub36
74 %mul117 = fmul double %sub39, %sub116
75 %sub119 = fsub double %conv, %c
76 %sub120 = fadd double %sub119, -1.000000e+00
77 %mul121 = fmul double %mul117, %sub120
78 %mul123 = fmul double %mul75, %sub51
79 %mul124 = fmul double %mul123, %sub42
80 %div125 = fdiv double %mul121, %mul124
81 %mul126 = fmul double %div77, %sub
82 %add127 = fadd double %mul126, 1.000000e+00
83 %mul128 = fmul double %add127, %Anm1.0
84 %mul129 = fmul double %div94, %sub
85 %add130 = fadd double %div125, %mul129
86 %mul131 = fmul double %add130, %sub
87 %mul132 = fmul double %mul131, %Anm2.0
88 %add133 = fadd double %mul128, %mul132
89 %mul134 = fmul double %div115, %mul1
90 %mul135 = fmul double %mul134, %Anm3.0
91 %add136 = fadd double %add133, %mul135
92 %mul139 = fmul double %add127, %Bnm1.0
93 %mul143 = fmul double %mul131, %Bnm2.0
94 %add144 = fadd double %mul139, %mul143
95 %mul146 = fmul double %mul134, %Bnm3.0
96 %add147 = fadd double %add144, %mul146
97 %div148 = fdiv double %add136, %add147
98 %sub149 = fsub double %F.0, %div148
99 %div150 = fdiv double %sub149, %F.0
100 %call = tail call double @fabs(double %div150) nounwind readnone
101 %cmp = fcmp olt double %call, 0x3CB0000000000000
102 %cmp152 = icmp sgt i32 %n.0, 20000
103 %or.cond = or i1 %cmp, %cmp152
104 br i1 %or.cond, label %done, label %go
105 done:
106 ret void
107 ; CHECK: @test1
108 ; CHECK: go:
109 ; CHECK-NEXT: %conv.v.i0.1 = insertelement <2 x i32> undef, i32 %n.0, i32 0
110 ; FIXME: When tree pruning is deterministic, include the entire output.
111 }
0 load_lib llvm.exp
1
2 RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll,c,cpp}]]
0 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
1 ; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
2
3 define double @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly {
4 entry:
5 %i0 = load double* %a, align 8
6 %i1 = load double* %b, align 8
7 %mul = fmul double %i0, %i1
8 %i2 = load double* %c, align 8
9 %add = fadd double %mul, %i2
10 %arrayidx3 = getelementptr inbounds double* %a, i64 1
11 %i3 = load double* %arrayidx3, align 8
12 %arrayidx4 = getelementptr inbounds double* %b, i64 1
13 %i4 = load double* %arrayidx4, align 8
14 %mul5 = fmul double %i3, %i4
15 %arrayidx6 = getelementptr inbounds double* %c, i64 1
16 %i5 = load double* %arrayidx6, align 8
17 %add7 = fadd double %mul5, %i5
18 %mul9 = fmul double %add, %i1
19 %add11 = fadd double %mul9, %i2
20 %mul13 = fmul double %add7, %i4
21 %add15 = fadd double %mul13, %i5
22 %mul16 = fmul double %add11, %add15
23 ret double %mul16
24 ; CHECK: @test1
25 ; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
26 ; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
27 ; CHECK: %i2.v.i0 = bitcast double* %c to <2 x double>*
28 ; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8
29 ; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8
30 ; CHECK: %mul = fmul <2 x double> %i0, %i1
31 ; CHECK: %i2 = load <2 x double>* %i2.v.i0, align 8
32 ; CHECK: %add = fadd <2 x double> %mul, %i2
33 ; CHECK: %mul9 = fmul <2 x double> %add, %i1
34 ; CHECK: %add11 = fadd <2 x double> %mul9, %i2
35 ; CHECK: %add11.v.r1 = extractelement <2 x double> %add11, i32 0
36 ; CHECK: %add11.v.r2 = extractelement <2 x double> %add11, i32 1
37 ; CHECK: %mul16 = fmul double %add11.v.r1, %add11.v.r2
38 ; CHECK: ret double %mul16
39 }
40
0 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
1 target triple = "x86_64-unknown-linux-gnu"
2 ; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
3 ; RUN: opt < %s -basicaa -loop-unroll -unroll-threshold=45 -unroll-allow-partial -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-UNRL
4 ; The second check covers the use of alias analysis (with loop unrolling).
5
6 define void @test1(double* noalias %out, double* noalias %in1, double* noalias %in2) nounwind uwtable {
7 entry:
8 br label %for.body
9 ; CHECK: @test1
10 ; CHECK-UNRL: @test1
11
12 for.body: ; preds = %for.body, %entry
13 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
14 %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv
15 %0 = load double* %arrayidx, align 8
16 %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv
17 %1 = load double* %arrayidx2, align 8
18 %mul = fmul double %0, %0
19 %mul3 = fmul double %0, %1
20 %add = fadd double %mul, %mul3
21 %add4 = fadd double %1, %1
22 %add5 = fadd double %add4, %0
23 %mul6 = fmul double %0, %add5
24 %add7 = fadd double %add, %mul6
25 %mul8 = fmul double %1, %1
26 %add9 = fadd double %0, %0
27 %add10 = fadd double %add9, %0
28 %mul11 = fmul double %mul8, %add10
29 %add12 = fadd double %add7, %mul11
30 %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv
31 store double %add12, double* %arrayidx14, align 8
32 %indvars.iv.next = add i64 %indvars.iv, 1
33 %lftr.wideiv = trunc i64 %indvars.iv.next to i32
34 %exitcond = icmp eq i32 %lftr.wideiv, 10
35 br i1 %exitcond, label %for.end, label %for.body
36 ; CHECK: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
37 ; CHECK: %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv
38 ; CHECK: %0 = load double* %arrayidx, align 8
39 ; CHECK: %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv
40 ; CHECK: %1 = load double* %arrayidx2, align 8
41 ; CHECK: %mul = fmul double %0, %0
42 ; CHECK: %mul3 = fmul double %0, %1
43 ; CHECK: %add = fadd double %mul, %mul3
44 ; CHECK: %add4.v.i1.1 = insertelement <2 x double> undef, double %1, i32 0
45 ; CHECK: %mul8 = fmul double %1, %1
46 ; CHECK: %add4.v.i1.2 = insertelement <2 x double> %add4.v.i1.1, double %0, i32 1
47 ; CHECK: %add4 = fadd <2 x double> %add4.v.i1.2, %add4.v.i1.2
48 ; CHECK: %add5.v.i1.1 = insertelement <2 x double> undef, double %0, i32 0
49 ; CHECK: %add5.v.i1.2 = insertelement <2 x double> %add5.v.i1.1, double %0, i32 1
50 ; CHECK: %add5 = fadd <2 x double> %add4, %add5.v.i1.2
51 ; CHECK: %mul6.v.i0.2 = insertelement <2 x double> %add5.v.i1.1, double %mul8, i32 1
52 ; CHECK: %mul6 = fmul <2 x double> %mul6.v.i0.2, %add5
53 ; CHECK: %mul6.v.r1 = extractelement <2 x double> %mul6, i32 0
54 ; CHECK: %mul6.v.r2 = extractelement <2 x double> %mul6, i32 1
55 ; CHECK: %add7 = fadd double %add, %mul6.v.r1
56 ; CHECK: %add12 = fadd double %add7, %mul6.v.r2
57 ; CHECK: %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv
58 ; CHECK: store double %add12, double* %arrayidx14, align 8
59 ; CHECK: %indvars.iv.next = add i64 %indvars.iv, 1
60 ; CHECK: %lftr.wideiv = trunc i64 %indvars.iv.next to i32
61 ; CHECK: %exitcond = icmp eq i32 %lftr.wideiv, 10
62 ; CHECK: br i1 %exitcond, label %for.end, label %for.body
63 ; CHECK-UNRL: %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next.1, %for.body ]
64 ; CHECK-UNRL: %arrayidx = getelementptr inbounds double* %in1, i64 %indvars.iv
65 ; CHECK-UNRL: %0 = bitcast double* %arrayidx to <2 x double>*
66 ; CHECK-UNRL: %arrayidx2 = getelementptr inbounds double* %in2, i64 %indvars.iv
67 ; CHECK-UNRL: %1 = bitcast double* %arrayidx2 to <2 x double>*
68 ; CHECK-UNRL: %arrayidx14 = getelementptr inbounds double* %out, i64 %indvars.iv
69 ; CHECK-UNRL: %2 = load <2 x double>* %0, align 8
70 ; CHECK-UNRL: %3 = load <2 x double>* %1, align 8
71 ; CHECK-UNRL: %mul = fmul <2 x double> %2, %2
72 ; CHECK-UNRL: %mul3 = fmul <2 x double> %2, %3
73 ; CHECK-UNRL: %add = fadd <2 x double> %mul, %mul3
74 ; CHECK-UNRL: %add4 = fadd <2 x double> %3, %3
75 ; CHECK-UNRL: %add5 = fadd <2 x double> %add4, %2
76 ; CHECK-UNRL: %mul6 = fmul <2 x double> %2, %add5
77 ; CHECK-UNRL: %add7 = fadd <2 x double> %add, %mul6
78 ; CHECK-UNRL: %mul8 = fmul <2 x double> %3, %3
79 ; CHECK-UNRL: %add9 = fadd <2 x double> %2, %2
80 ; CHECK-UNRL: %add10 = fadd <2 x double> %add9, %2
81 ; CHECK-UNRL: %mul11 = fmul <2 x double> %mul8, %add10
82 ; CHECK-UNRL: %add12 = fadd <2 x double> %add7, %mul11
83 ; CHECK-UNRL: %4 = bitcast double* %arrayidx14 to <2 x double>*
84 ; CHECK-UNRL: store <2 x double> %add12, <2 x double>* %4, align 8
85 ; CHECK-UNRL: %indvars.iv.next.1 = add i64 %indvars.iv, 2
86 ; CHECK-UNRL: %lftr.wideiv.1 = trunc i64 %indvars.iv.next.1 to i32
87 ; CHECK-UNRL: %exitcond.1 = icmp eq i32 %lftr.wideiv.1, 10
88 ; CHECK-UNRL: br i1 %exitcond.1, label %for.end, label %for.body
89
90 for.end: ; preds = %for.body
91 ret void
92 }
0 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
1 ; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth 3 -S | FileCheck %s -check-prefix=CHECK-RD3
2 ; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth 2 -S | FileCheck %s -check-prefix=CHECK-RD2
3
4 define double @test1(double %A1, double %A2, double %B1, double %B2) {
5 %X1 = fsub double %A1, %B1
6 %X2 = fsub double %A2, %B2
7 %Y1 = fmul double %X1, %A1
8 %Y2 = fmul double %X2, %A2
9 %R = fmul double %Y1, %Y2
10 ret double %R
11 ; CHECK-RD3: @test1
12 ; CHECK-RD2: @test1
13 ; CHECK-RD3-NOT: <2 x double>
14 ; CHECK-RD2: <2 x double>
15 }
16
0 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
1 ; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
2 ; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-search-limit=4 -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-SL4
3
4 define double @test1(double %A1, double %A2, double %B1, double %B2) {
5 ; CHECK: @test1
6 ; CHECK-SL4: @test1
7 ; CHECK-SL4-NOT: <2 x double>
8 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
9 ; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
10 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
11 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
12 %X1 = fsub double %A1, %B1
13 %X2 = fsub double %A2, %B2
14 ; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
15 %Y1 = fmul double %X1, %A1
16 %Y2 = fmul double %X2, %A2
17 ; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
18 %Z1 = fadd double %Y1, %B1
19 ; Here we have a dependency chain: the short search limit will not
20 ; see past this chain and so will not see the second part of the
21 ; pair to vectorize.
22 %mul41 = fmul double %Z1, %Y2
23 %sub48 = fsub double %Z1, %mul41
24 %mul62 = fmul double %Z1, %sub48
25 %sub69 = fsub double %Z1, %mul62
26 %mul83 = fmul double %Z1, %sub69
27 %sub90 = fsub double %Z1, %mul83
28 %mul104 = fmul double %Z1, %sub90
29 %sub111 = fsub double %Z1, %mul104
30 %mul125 = fmul double %Z1, %sub111
31 %sub132 = fsub double %Z1, %mul125
32 %mul146 = fmul double %Z1, %sub132
33 %sub153 = fsub double %Z1, %mul146
34 ; end of chain.
35 %Z2 = fadd double %Y2, %B2
36 ; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
37 %R1 = fdiv double %Z1, %Z2
38 %R = fmul double %R1, %sub153
39 ; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
40 ; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
41 ; CHECK: %R1 = fdiv double %Z1.v.r1, %Z1.v.r2
42 ret double %R
43 ; CHECK: ret double %R
44 }
45
0 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
1 ; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
2
3 declare double @llvm.fma.f64(double, double, double)
4 declare double @llvm.cos.f64(double)
5
6 ; Basic depth-3 chain with fma
7 define double @test1(double %A1, double %A2, double %B1, double %B2, double %C1, double %C2) {
8 %X1 = fsub double %A1, %B1
9 %X2 = fsub double %A2, %B2
10 %Y1 = call double @llvm.fma.f64(double %X1, double %A1, double %C1)
11 %Y2 = call double @llvm.fma.f64(double %X2, double %A2, double %C2)
12 %Z1 = fadd double %Y1, %B1
13 %Z2 = fadd double %Y2, %B2
14 %R = fmul double %Z1, %Z2
15 ret double %R
16 ; CHECK: @test1
17 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
18 ; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
19 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
20 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
21 ; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
22 ; CHECK: %Y1.v.i2.1 = insertelement <2 x double> undef, double %C1, i32 0
23 ; CHECK: %Y1.v.i2.2 = insertelement <2 x double> %Y1.v.i2.1, double %C2, i32 1
24 ; CHECK: %Y1 = call <2 x double> @llvm.fma.v2f64(<2 x double> %X1, <2 x double> %X1.v.i0.2, <2 x double> %Y1.v.i2.2)
25 ; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
26 ; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
27 ; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
28 ; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
29 ; CHECK: ret double %R
30 }
31
32 ; Basic depth-3 chain with cos
33 define double @test2(double %A1, double %A2, double %B1, double %B2) {
34 %X1 = fsub double %A1, %B1
35 %X2 = fsub double %A2, %B2
36 %Y1 = call double @llvm.cos.f64(double %X1)
37 %Y2 = call double @llvm.cos.f64(double %X2)
38 %Z1 = fadd double %Y1, %B1
39 %Z2 = fadd double %Y2, %B2
40 %R = fmul double %Z1, %Z2
41 ret double %R
42 ; CHECK: @test2
43 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
44 ; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
45 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
46 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
47 ; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
48 ; CHECK: %Y1 = call <2 x double> @llvm.cos.v2f64(<2 x double> %X1)
49 ; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
50 ; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
51 ; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
52 ; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
53 ; CHECK: ret double %R
54 }
55
56 ; CHECK: declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
57 ; CHECK: declare <2 x double> @llvm.cos.v2f64(<2 x double>) nounwind readonly
58
0 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
1 ; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
2 ; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -bb-vectorize-aligned-only -instcombine -gvn -S | FileCheck %s -check-prefix=CHECK-AO
3
4 ; Simple 3-pair chain with loads and stores
; test1: loads a[0], b[0], a[1], b[1], multiplies element-wise and stores the
; two products to c[0], c[1]. With the default prefix, the loads, the fmul and
; the stores are each expected as a single <2 x double> operation reached
; through a bitcast of the base pointer. With the CHECK-AO prefix (the RUN line
; that adds -bb-vectorize-aligned-only) no <2 x double> is expected after the
; function label.
5 define void @test1(double* %a, double* %b, double* %c) nounwind uwtable readonly {
6 entry:
7 %i0 = load double* %a, align 8
8 %i1 = load double* %b, align 8
9 %mul = fmul double %i0, %i1
10 %arrayidx3 = getelementptr inbounds double* %a, i64 1
11 %i3 = load double* %arrayidx3, align 8
12 %arrayidx4 = getelementptr inbounds double* %b, i64 1
13 %i4 = load double* %arrayidx4, align 8
14 %mul5 = fmul double %i3, %i4
15 store double %mul, double* %c, align 8
16 %arrayidx5 = getelementptr inbounds double* %c, i64 1
17 store double %mul5, double* %arrayidx5, align 8
18 ret void
19 ; CHECK: @test1
20 ; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
21 ; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
22 ; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8
23 ; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8
24 ; CHECK: %mul = fmul <2 x double> %i0, %i1
25 ; CHECK: %0 = bitcast double* %c to <2 x double>*
26 ; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8
27 ; CHECK: ret void
28 ; CHECK-AO: @test1
29 ; CHECK-AO-NOT: <2 x double>
30 }
31
32 ; Simple chain with extending loads and stores
; test2: same shape as a 2-wide a[i] * b[i] -> c[i], but the inputs are floats
; that are extended to double (fpext) before the multiply. With the default
; prefix, the float loads are expected as <2 x float> loads, the fpext and fmul
; as vector operations, and the result as one <2 x double> store. With the
; CHECK-AO prefix (the RUN line that adds -bb-vectorize-aligned-only) no
; <2 x double> is expected after the function label.
33 define void @test2(float* %a, float* %b, double* %c) nounwind uwtable readonly {
34 entry:
35 %i0f = load float* %a, align 4
36 %i0 = fpext float %i0f to double
37 %i1f = load float* %b, align 4
38 %i1 = fpext float %i1f to double
39 %mul = fmul double %i0, %i1
40 %arrayidx3 = getelementptr inbounds float* %a, i64 1
41 %i3f = load float* %arrayidx3, align 4
42 %i3 = fpext float %i3f to double
43 %arrayidx4 = getelementptr inbounds float* %b, i64 1
44 %i4f = load float* %arrayidx4, align 4
45 %i4 = fpext float %i4f to double
46 %mul5 = fmul double %i3, %i4
47 store double %mul, double* %c, align 8
48 %arrayidx5 = getelementptr inbounds double* %c, i64 1
49 store double %mul5, double* %arrayidx5, align 8
50 ret void
51 ; CHECK: @test2
52 ; CHECK: %i0f.v.i0 = bitcast float* %a to <2 x float>*
53 ; CHECK: %i1f.v.i0 = bitcast float* %b to <2 x float>*
54 ; CHECK: %i0f = load <2 x float>* %i0f.v.i0, align 4
55 ; CHECK: %i0 = fpext <2 x float> %i0f to <2 x double>
56 ; CHECK: %i1f = load <2 x float>* %i1f.v.i0, align 4
57 ; CHECK: %i1 = fpext <2 x float> %i1f to <2 x double>
58 ; CHECK: %mul = fmul <2 x double> %i0, %i1
59 ; CHECK: %0 = bitcast double* %c to <2 x double>*
60 ; CHECK: store <2 x double> %mul, <2 x double>* %0, align 8
61 ; CHECK: ret void
62 ; CHECK-AO: @test2
63 ; CHECK-AO-NOT: <2 x double>
64 }
65
66 ; Simple chain with loads and truncating stores
; test3: 2-wide a[i] * b[i] in double, with each product truncated to float
; (fptrunc) before being stored to c[0], c[1]. With the default prefix, the
; loads, fmul, fptrunc and store are all expected in vector form. With the
; CHECK-AO prefix (the RUN line that adds -bb-vectorize-aligned-only) the four
; loads are expected to stay scalar and be packed with insertelement, while the
; fmul, fptrunc and store are still expected in vector form.
; Note: the first scalar store is marked align 8 although it stores a float;
; the expected <2 x float> store carries the same align 8.
67 define void @test3(double* %a, double* %b, float* %c) nounwind uwtable readonly {
68 entry:
69 %i0 = load double* %a, align 8
70 %i1 = load double* %b, align 8
71 %mul = fmul double %i0, %i1
72 %mulf = fptrunc double %mul to float
73 %arrayidx3 = getelementptr inbounds double* %a, i64 1
74 %i3 = load double* %arrayidx3, align 8
75 %arrayidx4 = getelementptr inbounds double* %b, i64 1
76 %i4 = load double* %arrayidx4, align 8
77 %mul5 = fmul double %i3, %i4
78 %mul5f = fptrunc double %mul5 to float
79 store float %mulf, float* %c, align 8
80 %arrayidx5 = getelementptr inbounds float* %c, i64 1
81 store float %mul5f, float* %arrayidx5, align 4
82 ret void
83 ; CHECK: @test3
84 ; CHECK: %i0.v.i0 = bitcast double* %a to <2 x double>*
85 ; CHECK: %i1.v.i0 = bitcast double* %b to <2 x double>*
86 ; CHECK: %i0 = load <2 x double>* %i0.v.i0, align 8
87 ; CHECK: %i1 = load <2 x double>* %i1.v.i0, align 8
88 ; CHECK: %mul = fmul <2 x double> %i0, %i1
89 ; CHECK: %mulf = fptrunc <2 x double> %mul to <2 x float>
90 ; CHECK: %0 = bitcast float* %c to <2 x float>*
91 ; CHECK: store <2 x float> %mulf, <2 x float>* %0, align 8
92 ; CHECK: ret void
93 ; CHECK-AO: @test3
94 ; CHECK-AO: %i0 = load double* %a, align 8
95 ; CHECK-AO: %i1 = load double* %b, align 8
96 ; CHECK-AO: %mul.v.i1.1 = insertelement <2 x double> undef, double %i1, i32 0
97 ; CHECK-AO: %mul.v.i0.1 = insertelement <2 x double> undef, double %i0, i32 0
98 ; CHECK-AO: %arrayidx3 = getelementptr inbounds double* %a, i64 1
99 ; CHECK-AO: %i3 = load double* %arrayidx3, align 8
100 ; CHECK-AO: %arrayidx4 = getelementptr inbounds double* %b, i64 1
101 ; CHECK-AO: %i4 = load double* %arrayidx4, align 8
102 ; CHECK-AO: %mul.v.i1.2 = insertelement <2 x double> %mul.v.i1.1, double %i4, i32 1
103 ; CHECK-AO: %mul.v.i0.2 = insertelement <2 x double> %mul.v.i0.1, double %i3, i32 1
104 ; CHECK-AO: %mul = fmul <2 x double> %mul.v.i0.2, %mul.v.i1.2
105 ; CHECK-AO: %mulf = fptrunc <2 x double> %mul to <2 x float>
106 ; CHECK-AO: %0 = bitcast float* %c to <2 x float>*
107 ; CHECK-AO: store <2 x float> %mulf, <2 x float>* %0, align 8
108 ; CHECK-AO: ret void
109 }
0 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
1 ; RUN: opt < %s -bb-vectorize -bb-vectorize-req-chain-depth=3 -instcombine -gvn -S | FileCheck %s
2
3 ; Basic depth-3 chain
; test1: two parallel scalar chains of depth three (fsub -> fmul -> fadd), one
; on (%A1, %B1) and one on (%A2, %B2), whose results are multiplied together.
; The expected output packs the arguments with insertelement, performs each
; step once on <2 x double>, and extracts both lanes for the final scalar fmul.
; The packed argument vectors are expected to be reused by the later steps
; (%X1.v.i0.2 in the fmul, %X1.v.i1.2 in the fadd).
4 define double @test1(double %A1, double %A2, double %B1, double %B2) {
5 ; CHECK: @test1
6 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
7 ; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
8 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
9 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
10 %X1 = fsub double %A1, %B1
11 %X2 = fsub double %A2, %B2
12 ; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
13 %Y1 = fmul double %X1, %A1
14 %Y2 = fmul double %X2, %A2
15 ; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
16 %Z1 = fadd double %Y1, %B1
17 %Z2 = fadd double %Y2, %B2
18 ; CHECK: %Z1 = fadd <2 x double> %Y1, %X1.v.i1.2
19 %R = fmul double %Z1, %Z2
20 ; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
21 ; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
22 ; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
23 ret double %R
24 ; CHECK: ret double %R
25 }
26
27 ; Basic depth-3 chain (last pair permuted)
; test2: same depth-three chain as the basic case, except that the last pair is
; permuted: %Z1 consumes %Y2 and %Z2 consumes %Y1. The expected output
; therefore inserts a shufflevector that swaps the two lanes of the fused %Y1
; before the vector fadd.
28 define double @test2(double %A1, double %A2, double %B1, double %B2) {
29 ; CHECK: @test2
30 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
31 ; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
32 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
33 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
34 %X1 = fsub double %A1, %B1
35 %X2 = fsub double %A2, %B2
36 ; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
37 %Y1 = fmul double %X1, %A1
38 %Y2 = fmul double %X2, %A2
39 ; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
40 %Z1 = fadd double %Y2, %B1
41 %Z2 = fadd double %Y1, %B2
; Lane 0 of the fused fadd needs %Y2 (lane 1 of the fused %Y1) and lane 1 needs
; %Y1 (lane 0), hence the <1, 0> swap mask.
42 ; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> <i32 1, i32 0>
43 ; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2
44 %R = fmul double %Z1, %Z2
45 ; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
46 ; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
47 ; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
48 ret double %R
49 ; CHECK: ret double %R
50 }
51
52 ; Basic depth-3 chain (last pair first splat)
; test3: same depth-three chain as the basic case, except that both members of
; the last pair consume %Y2. The expected output therefore inserts a
; shufflevector that broadcasts lane 1 of the fused %Y1 into both lanes before
; the vector fadd.
53 define double @test3(double %A1, double %A2, double %B1, double %B2) {
54 ; CHECK: @test3
55 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
56 ; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
57 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
58 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
59 %X1 = fsub double %A1, %B1
60 %X2 = fsub double %A2, %B2
61 ; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
62 %Y1 = fmul double %X1, %A1
63 %Y2 = fmul double %X2, %A2
64 ; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
65 %Z1 = fadd double %Y2, %B1
66 %Z2 = fadd double %Y2, %B2
; Both lanes of the fused fadd need %Y2, which is lane 1 of the fused %Y1,
; hence the <1, 1> splat mask.
67 ; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> <i32 1, i32 1>
68 ; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2
69 %R = fmul double %Z1, %Z2
70 ; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
71 ; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
72 ; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
73 ret double %R
74 ; CHECK: ret double %R
75 }
76
77 ; Basic depth-3 chain (last pair second splat)
; test4: same depth-three chain as the basic case, except that both members of
; the last pair consume %Y1. The expected output inserts a shufflevector with a
; zeroinitializer mask, i.e. lane 0 of the fused %Y1 is broadcast into both
; lanes before the vector fadd.
78 define double @test4(double %A1, double %A2, double %B1, double %B2) {
79 ; CHECK: @test4
80 ; CHECK: %X1.v.i1.1 = insertelement <2 x double> undef, double %B1, i32 0
81 ; CHECK: %X1.v.i0.1 = insertelement <2 x double> undef, double %A1, i32 0
82 ; CHECK: %X1.v.i1.2 = insertelement <2 x double> %X1.v.i1.1, double %B2, i32 1
83 ; CHECK: %X1.v.i0.2 = insertelement <2 x double> %X1.v.i0.1, double %A2, i32 1
84 %X1 = fsub double %A1, %B1
85 %X2 = fsub double %A2, %B2
86 ; CHECK: %X1 = fsub <2 x double> %X1.v.i0.2, %X1.v.i1.2
87 %Y1 = fmul double %X1, %A1
88 %Y2 = fmul double %X2, %A2
89 ; CHECK: %Y1 = fmul <2 x double> %X1, %X1.v.i0.2
90 %Z1 = fadd double %Y1, %B1
91 %Z2 = fadd double %Y1, %B2
92 ; CHECK: %Z1.v.i0 = shufflevector <2 x double> %Y1, <2 x double> undef, <2 x i32> zeroinitializer
93 ; CHECK: %Z1 = fadd <2 x double> %Z1.v.i0, %X1.v.i1.2
94 %R = fmul double %Z1, %Z2
95 ; CHECK: %Z1.v.r1 = extractelement <2 x double> %Z1, i32 0
96 ; CHECK: %Z1.v.r2 = extractelement <2 x double> %Z1, i32 1
97 ; CHECK: %R = fmul double %Z1.v.r1, %Z1.v.r2
98 ret double %R
99 ; CHECK: ret double %R
100 }
101
102 ; Basic depth-3 chain
; test5: the depth-three chain (fsub -> fmul -> fadd) with <2 x float> inputs
; instead of scalars. The expected output concatenates each input pair into a
; <4 x float> with shufflevector, performs each step once on <4 x float>, and
; splits the result back into its low and high <2 x float> halves for the
; final fmul.
103 define <2 x float> @test5(<2 x float> %A1, <2 x float> %A2, <2 x float> %B1, <2 x float> %B2) {
104 ; CHECK: @test5
; Mask <0, 1, 2, 3> selects both elements of the first operand followed by both
; elements of the second, i.e. a plain concatenation.
105 ; CHECK: %X1.v.i1 = shufflevector <2 x float> %B1, <2 x float> %B2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
106 ; CHECK: %X1.v.i0 = shufflevector <2 x float> %A1, <2 x float> %A2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
107 %X1 = fsub <2 x float> %A1, %B1
108 %X2 = fsub <2 x float> %A2, %B2
109 ; CHECK: %X1 = fsub <4 x float> %X1.v.i0, %X1.v.i1
110 %Y1 = fmul <2 x float> %X1, %A1
111 %Y2 = fmul <2 x float> %X2, %A2
112 ; CHECK: %Y1 = fmul <4 x float> %X1, %X1.v.i0
113 %Z1 = fadd <2 x float> %Y1, %B1
114 %Z2 = fadd <2 x float> %Y2, %B2
115 ; CHECK: %Z1 = fadd <4 x float> %Y1, %X1.v.i1
116 %R = fmul <2 x float> %Z1, %Z2
; .v.r1 is the low half (original %Z1), .v.r2 the high half (original %Z2).
117 ; CHECK: %Z1.v.r1 = shufflevector <4 x float> %Z1, <4 x float> undef, <2 x i32> <i32 0, i32 1>
118 ; CHECK: %Z1.v.r2 = shufflevector <4 x float> %Z1, <4 x float> undef, <2 x i32> <i32 2, i32 3>
119 ; CHECK: %R = fmul <2 x float> %Z1.v.r1, %Z1.v.r2
120 ret <2 x float> %R
121 ; CHECK: ret <2 x float> %R
122 }
123
124 ; Basic chain with shuffles
; test6: sub -> mul -> add chain on two <8 x i8> pairs, followed by a pair of
; shufflevectors (%Q1 reads %Z1 and %Z2, %Q2 reads %Z2 twice) and a final mul.
; The expected output performs the arithmetic on <16 x i8>, fuses the two
; shuffles into one <16 x i8> shufflevector, and splits the result into its low
; and high <8 x i8> halves for the final mul.
; NOTE(review): the constant shuffle masks in this function were missing from
; the rendered text and have been restored. The concatenation, half-extraction
; and widening masks follow from the operand types; the %Q1/%Q2 input masks and
; the fused %Q1 mask should be confirmed against the upstream test file.
125 define <8 x i8> @test6(<8 x i8> %A1, <8 x i8> %A2, <8 x i8> %B1, <8 x i8> %B2) {
126 ; CHECK: @test6
127 ; CHECK: %X1.v.i1 = shufflevector <8 x i8> %B1, <8 x i8> %B2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
128 ; CHECK: %X1.v.i0 = shufflevector <8 x i8> %A1, <8 x i8> %A2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
129 %X1 = sub <8 x i8> %A1, %B1
130 %X2 = sub <8 x i8> %A2, %B2
131 ; CHECK: %X1 = sub <16 x i8> %X1.v.i0, %X1.v.i1
132 %Y1 = mul <8 x i8> %X1, %A1
133 %Y2 = mul <8 x i8> %X2, %A2
134 ; CHECK: %Y1 = mul <16 x i8> %X1, %X1.v.i0
135 %Z1 = add <8 x i8> %Y1, %B1
136 %Z2 = add <8 x i8> %Y2, %B2
137 ; CHECK: %Z1 = add <16 x i8> %Y1, %X1.v.i1
138 %Q1 = shufflevector <8 x i8> %Z1, <8 x i8> %Z2, <8 x i32> <i32 15, i32 8, i32 6, i32 1, i32 13, i32 10, i32 4, i32 3>
139 %Q2 = shufflevector <8 x i8> %Z2, <8 x i8> %Z2, <8 x i32> <i32 6, i32 7, i32 0, i32 1, i32 2, i32 4, i32 4, i32 1>
; In the fused shuffle, indices 0-15 address the fused %Z1 (original %Z1 in
; lanes 0-7, original %Z2 in lanes 8-15) and indices 16-23 address the widened
; copy of original %Z2 in %Q1.v.i1.
140 ; CHECK: %Z1.v.r2 = shufflevector <16 x i8> %Z1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
141 ; CHECK: %Q1.v.i1 = shufflevector <8 x i8> %Z1.v.r2, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
142 ; CHECK: %Q1 = shufflevector <16 x i8> %Z1, <16 x i8> %Q1.v.i1, <16 x i32> <i32 23, i32 16, i32 6, i32 1, i32 21, i32 18, i32 4, i32 3, i32 14, i32 15, i32 8, i32 9, i32 10, i32 12, i32 12, i32 9>
143 %R = mul <8 x i8> %Q1, %Q2
144 ; CHECK: %Q1.v.r1 = shufflevector <16 x i8> %Q1, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
145 ; CHECK: %Q1.v.r2 = shufflevector <16 x i8> %Q1, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
146 ; CHECK: %R = mul <8 x i8> %Q1.v.r1, %Q1.v.r2
147 ret <8 x i8> %R
148 ; CHECK: ret <8 x i8> %R
149 }
150
151
0 set(LLVM_LINK_COMPONENTS asmparser instrumentation scalaropts ipo
1 linker bitreader bitwriter)
1 linker bitreader bitwriter vectorize)
22
33 add_llvm_tool(bugpoint
44 BugDriver.cpp
99 LEVEL := ../..
1010 TOOLNAME := bugpoint
1111 LINK_COMPONENTS := asmparser instrumentation scalaropts ipo linker bitreader \
12 bitwriter
12 bitwriter vectorize
1313
1414 include $(LEVEL)/Makefile.common
None set(LLVM_LINK_COMPONENTS ipo scalaropts linker archive bitwriter)
0 set(LLVM_LINK_COMPONENTS ipo scalaropts linker archive bitwriter vectorize)
11
22 add_llvm_tool(llvm-ld
33 Optimize.cpp
88
99 LEVEL := ../..
1010 TOOLNAME := llvm-ld
11 LINK_COMPONENTS := ipo scalaropts linker archive bitwriter
11 LINK_COMPONENTS := ipo scalaropts linker archive bitwriter vectorize
1212
1313 include $(LEVEL)/Makefile.common
0 set(LLVM_LINK_COMPONENTS
11 ${LLVM_TARGETS_TO_BUILD}
2 ipo scalaropts linker bitreader bitwriter mcdisassembler)
2 ipo scalaropts linker bitreader bitwriter mcdisassembler vectorize)
33
44 add_definitions( -DLLVM_VERSION_INFO=\"${PACKAGE_VERSION}\" )
55
99 LEVEL := ../..
1010 LIBRARYNAME := LTO
1111 LINK_COMPONENTS := all-targets ipo scalaropts linker bitreader bitwriter \
12 mcdisassembler
12 mcdisassembler vectorize
1313 LINK_LIBS_IN_SHARED := 1
1414 SHARED_LIBRARY := 1
1515
None set(LLVM_LINK_COMPONENTS bitreader asmparser bitwriter instrumentation scalaropts ipo)
0 set(LLVM_LINK_COMPONENTS bitreader asmparser bitwriter instrumentation scalaropts ipo vectorize)
11
22 add_llvm_tool(opt
33 AnalysisWrappers.cpp
88
99 LEVEL := ../..
1010 TOOLNAME := opt
11 LINK_COMPONENTS := bitreader bitwriter asmparser instrumentation scalaropts ipo
11 LINK_COMPONENTS := bitreader bitwriter asmparser instrumentation scalaropts ipo vectorize
1212
1313 include $(LEVEL)/Makefile.common
479479 PassRegistry &Registry = *PassRegistry::getPassRegistry();
480480 initializeCore(Registry);
481481 initializeScalarOpts(Registry);
482 initializeVectorization(Registry);
482483 initializeIPO(Registry);
483484 initializeAnalysis(Registry);
484485 initializeIPA(Registry);