Subject: [PATCH] [CodeGen] Add pass to combine interleaved loads.

This patch defines an interleaved-load-combine pass. The pass searches for
ShuffleVector instructions that represent interleaved loads. Matches are
converted such that they will be captured by the InterleavedAccessPass. The
pass extends LLVM's capabilities to use target-specific instruction selection
of interleaved load patterns (e.g. ld4 on AArch64).

Differential Revision: https://reviews.llvm.org/D52653

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@347208 91177308-0d34-0410-b5e6-96231b3b80d8

Author: Martin Elshuber
9 changed files with 1789 additions and 1 deletion.
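The new pass is meant to run immediately before InterleavedAccessPass; a target opts in from its pass pipeline, which is exactly what the AArch64 hunk further down in this patch does. A minimal sketch of that wiring (the class name MyTargetPassConfig is a placeholder; the create* functions are the ones declared by this patch):

void MyTargetPassConfig::addIRPasses() {
  TargetPassConfig::addIRPasses();
  if (TM->getOptLevel() != CodeGenOpt::None) {
    // First rewrite left-over shuffle patterns into the wide-load form ...
    addPass(createInterleavedLoadCombinePass());
    // ... then let InterleavedAccessPass match them to ldN/stN intrinsics.
    addPass(createInterleavedAccessPass());
  }
}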
378378 ///
379379 FunctionPass *createInterleavedAccessPass();
380380
381 /// InterleavedLoadCombines Pass - This pass identifies interleaved loads and
382 /// combines them into wide loads detectable by InterleavedAccessPass
383 ///
384 FunctionPass *createInterleavedLoadCombinePass();
385
381386 /// LowerEmuTLS - This pass generates __emutls_[vt].xyz variables for all
382387 /// TLS variables for the emulated TLS model.
383388 ///
181181 void initializeInstructionCombiningPassPass(PassRegistry&);
182182 void initializeInstructionSelectPass(PassRegistry&);
183183 void initializeInterleavedAccessPass(PassRegistry&);
184 void initializeInterleavedLoadCombinePass(PassRegistry &);
184185 void initializeInternalizeLegacyPassPass(PassRegistry&);
185186 void initializeIntervalPartitionPass(PassRegistry&);
186187 void initializeJumpThreadingPass(PassRegistry&);
3838 InlineSpiller.cpp
3939 InterferenceCache.cpp
4040 InterleavedAccessPass.cpp
41 InterleavedLoadCombinePass.cpp
4142 IntrinsicLowering.cpp
4243 LatencyPriorityQueue.cpp
4344 LazyMachineBlockFrequencyInfo.cpp
4141 initializeIfConverterPass(Registry);
4242 initializeImplicitNullChecksPass(Registry);
4343 initializeIndirectBrExpandPassPass(Registry);
44 initializeInterleavedLoadCombinePass(Registry);
4445 initializeInterleavedAccessPass(Registry);
4546 initializeLiveDebugValuesPass(Registry);
4647 initializeLiveDebugVariablesPass(Registry);
0 //===- InterleavedLoadCombine.cpp - Combine Interleaved Loads ---*- C++ -*-===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 //
11 // This file defines the interleaved-load-combine pass. The pass searches for
12 // ShuffleVectorInst instructions that represent interleaved loads. If a matching
13 // pattern is found, it adds a combined load and further instructions in a
14 // pattern that is detectable by InterleavedAccessPass. The old instructions are
15 // left dead to be removed later. The pass is specifically designed to be
16 // executed just before InterleavedAccessPass to find any left-over instances
17 // that were not detected by earlier passes.
18 //
19 //===----------------------------------------------------------------------===//
20
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/Analysis/MemoryLocation.h"
23 #include "llvm/Analysis/MemorySSA.h"
24 #include "llvm/Analysis/MemorySSAUpdater.h"
25 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
26 #include "llvm/Analysis/TargetTransformInfo.h"
27 #include "llvm/CodeGen/Passes.h"
28 #include "llvm/CodeGen/TargetLowering.h"
29 #include "llvm/CodeGen/TargetPassConfig.h"
30 #include "llvm/CodeGen/TargetSubtargetInfo.h"
31 #include "llvm/IR/DataLayout.h"
32 #include "llvm/IR/Dominators.h"
33 #include "llvm/IR/Function.h"
34 #include "llvm/IR/Instructions.h"
35 #include "llvm/IR/LegacyPassManager.h"
36 #include "llvm/IR/Module.h"
37 #include "llvm/Pass.h"
38 #include "llvm/Support/Debug.h"
39 #include "llvm/Support/ErrorHandling.h"
40 #include "llvm/Support/raw_ostream.h"
41 #include "llvm/Target/TargetMachine.h"
42 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
43
44 #include
45 #include
46 #include
47
48 using namespace llvm;
49
50 #define DEBUG_TYPE "interleaved-load-combine"
51
52 namespace {
53
54 /// Statistic counter
55 STATISTIC(NumInterleavedLoadCombine, "Number of combined loads");
56
57 /// Option to disable the pass
58 static cl::opt<bool> DisableInterleavedLoadCombine(
59 "disable-" DEBUG_TYPE, cl::init(false), cl::Hidden,
60 cl::desc("Disable combining of interleaved loads"));
61
62 struct VectorInfo;
63
64 struct InterleavedLoadCombineImpl {
65 public:
66 InterleavedLoadCombineImpl(Function &F, DominatorTree &DT, MemorySSA &MSSA,
67 TargetMachine &TM)
68 : F(F), DT(DT), MSSA(MSSA),
69 TLI(*TM.getSubtargetImpl(F)->getTargetLowering()),
70 TTI(TM.getTargetTransformInfo(F)) {}
71
72 /// Scan the function for interleaved load candidates and execute the
73 /// replacement if applicable.
74 bool run();
75
76 private:
77 /// Function this pass is working on
78 Function &F;
79
80 /// Dominator Tree Analysis
81 DominatorTree &DT;
82
83 /// Memory Alias Analyses
84 MemorySSA &MSSA;
85
86 /// Target Lowering Information
87 const TargetLowering &TLI;
88
89 /// Target Transform Information
90 const TargetTransformInfo TTI;
91
92 /// Find the instruction in the set LIs that dominates all others; return
93 /// nullptr if there is none.
94 LoadInst *findFirstLoad(const std::set<LoadInst *> &LIs);
95
96 /// Replace interleaved load candidates. It does additional
97 /// analyses if this makes sense. Returns true on success and false
98 /// if nothing has been changed.
99 bool combine(std::list<VectorInfo> &InterleavedLoad,
100 OptimizationRemarkEmitter &ORE);
101
102 /// Given a set of VectorInfo containing candidates for a given interleave
103 /// factor, find a set that represents a 'factor' interleaved load.
104 bool findPattern(std::list<VectorInfo> &Candidates,
105 std::list<VectorInfo> &InterleavedLoad, unsigned Factor,
106 const DataLayout &DL);
107 }; // InterleavedLoadCombine
108
109 /// First Order Polynomial on an n-Bit Integer Value
110 ///
111 /// Polynomial(Value) = Value * B + A + E*2^(n-e)
112 ///
113 /// A and B are the coefficients. E*2^(n-e) is an error within 'e' most
114 /// significant bits. It is introduced if an exact computation cannot be proven
115 /// (e.g. division by 2).
116 ///
117 /// As part of this optimization multiple loads will be combined. It is necessary
118 /// to prove that loads are within some relative offset to each other. This
119 /// class is used to prove relative offsets of values loaded from memory.
120 ///
121 /// Representing an integer in this form is sound since addition in two's
122 /// complement is associative (trivial) and multiplication distributes over the
123 /// addition (see Proof(1) in Polynomial::mul). Further, both operations
124 /// commute.
125 //
126 // Example:
127 // define void @fn(i64 %IDX, <4 x float>* %PTR) {
128 // %Pa1 = add i64 %IDX, 2
129 // %Pa2 = lshr i64 %Pa1, 1
130 // %Pa3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pa2
131 // %Va = load <4 x float>, <4 x float>* %Pa3
132 //
133 // %Pb1 = add i64 %IDX, 4
134 // %Pb2 = lshr i64 %Pb1, 1
135 // %Pb3 = getelementptr inbounds <4 x float>, <4 x float>* %PTR, i64 %Pb2
136 // %Vb = load <4 x float>, <4 x float>* %Pb3
137 // ... }
138 //
139 // The goal is to prove that two loads load consecutive addresses.
140 //
141 // In this case the polynomials are constructed by the following
142 // steps.
143 //
144 // The number tag #e specifies the error bits.
145 //
146 // Pa_0 = %IDX #0
147 // Pa_1 = %IDX + 2 #0 | add 2
148 // Pa_2 = %IDX/2 + 1 #1 | lshr 1
149 // Pa_3 = %IDX/2 + 1 #1 | GEP, step signext to i64
150 // Pa_4 = (%IDX/2)*16 + 16 #0 | GEP, multiply index by sizeof(<4 x float>) = 16
151 // Pa_5 = (%IDX/2)*16 + 16 #0 | GEP, add offset of leading components
152 //
153 // Pb_0 = %IDX #0
154 // Pb_1 = %IDX + 4 #0 | add 4
155 // Pb_2 = %IDX/2 + 2 #1 | lshr 1
156 // Pb_3 = %IDX/2 + 2 #1 | GEP, step signext to i64
157 // Pb_4 = (%IDX/2)*16 + 32 #0 | GEP, multiply index by sizeof(<4 x float>) = 16
158 // Pb_5 = (%IDX/2)*16 + 16 #0 | GEP, add offset of leading components
159 //
160 // Pb_5 - Pa_5 = 16 #0 | subtract to get the offset
161 //
162 // Remark: %PTR is not maintained within this class. So in this instance the
163 // offset of 16 can only be assumed if the pointers are equal.
164 //
165 class Polynomial {
166 /// Operations on B
167 enum BOps {
168 LShr,
169 Mul,
170 SExt,
171 Trunc,
172 };
173
174 /// Number of Error Bits e
175 unsigned ErrorMSBs;
176
177 /// Value
178 Value *V;
179
180 /// Coefficient B
181 SmallVector<std::pair<BOps, APInt>, 4> B;
182
183 /// Coefficient A
184 APInt A;
185
186 public:
187 Polynomial(Value *V) : ErrorMSBs((unsigned)-1), V(V), B(), A() {
188 IntegerType *Ty = dyn_cast<IntegerType>(V->getType());
189 if (Ty) {
190 ErrorMSBs = 0;
191 this->V = V;
192 A = APInt(Ty->getBitWidth(), 0);
193 }
194 }
195
196 Polynomial(const APInt &A, unsigned ErrorMSBs = 0)
197 : ErrorMSBs(ErrorMSBs), V(NULL), B(), A(A) {}
198
199 Polynomial(unsigned BitWidth, uint64_t A, unsigned ErrorMSBs = 0)
200 : ErrorMSBs(ErrorMSBs), V(NULL), B(), A(BitWidth, A) {}
201
202 Polynomial() : ErrorMSBs((unsigned)-1), V(NULL), B(), A() {}
203
204 /// Increment and clamp the number of undefined bits.
205 void incErrorMSBs(unsigned amt) {
206 if (ErrorMSBs == (unsigned)-1)
207 return;
208
209 ErrorMSBs += amt;
210 if (ErrorMSBs > A.getBitWidth())
211 ErrorMSBs = A.getBitWidth();
212 }
213
214 /// Decrement and clamp the number of undefined bits.
215 void decErrorMSBs(unsigned amt) {
216 if (ErrorMSBs == (unsigned)-1)
217 return;
218
219 if (ErrorMSBs > amt)
220 ErrorMSBs -= amt;
221 else
222 ErrorMSBs = 0;
223 }
224
225 /// Apply an add on the polynomial
226 Polynomial &add(const APInt &C) {
227 // Note: Addition is associative in two's complement even in the case of
228 // signed overflow.
229 //
230 // Error bits can only propagate into higher significant bits. As these are
231 // already regarded as undefined, there is no change.
232 //
233 // Theorem: Adding a constant to a polynomial does not change the error
234 // term.
235 //
236 // Proof:
237 //
238 // Since the addition is associative and commutes:
239 //
240 // (B + A + E*2^(n-e)) + C = B + (A + C) + E*2^(n-e)
241 // [qed]
242
243 if (C.getBitWidth() != A.getBitWidth()) {
244 ErrorMSBs = (unsigned)-1;
245 return *this;
246 }
247
248 A += C;
249 return *this;
250 }
251
252 /// Apply a multiplication onto the polynomial.
253 Polynomial &mul(const APInt &C) {
254 // Note: Multiplication distributes over the addition
255 //
256 // Theorem: Multiplication distributes over the addition
257 //
258 // Proof(1):
259 //
260 // (B+A)*C =
261 // = (B + A) + (B + A) + .. {C Times}
262 // addition is associative and commutes, hence
263 // = B + B + .. {C Times} .. + A + A + .. {C times}
264 // = B*C + A*C
265 // (see (function add) for signed values and overflows)
266 // [qed]
267 //
268 // Theorem: If C has c trailing zeros, error bits in A or B are shifted out
269 // to the left.
270 //
271 // Proof(2):
272 //
273 // Let B' and A' be the n-Bit inputs with some unknown errors EA,
274 // EB at e leading bits. B' and A' can be written down as:
275 //
276 // B' = B + 2^(n-e)*EB
277 // A' = A + 2^(n-e)*EA
278 //
279 // Let C' be an input with c trailing zero bits. C' can be written as
280 //
281 // C' = C*2^c
282 //
283 // Therefore we can compute the result by using distributivity and
284 // commutativity.
285 //
286 // (B'*C' + A'*C') = [B + 2^(n-e)*EB] * C' + [A + 2^(n-e)*EA] * C' =
287 // = [B + 2^(n-e)*EB + A + 2^(n-e)*EA] * C' =
288 // = (B'+A') * C' =
289 // = [B + 2^(n-e)*EB + A + 2^(n-e)*EA] * C' =
290 // = [B + A + 2^(n-e)*EB + 2^(n-e)*EA] * C' =
291 // = (B + A) * C' + [2^(n-e)*EB + 2^(n-e)*EA)] * C' =
292 // = (B + A) * C' + [2^(n-e)*EB + 2^(n-e)*EA)] * C*2^c =
293 // = (B + A) * C' + C*(EB + EA)*2^(n-e)*2^c =
294 //
295 // Let EC be the final error with EC = C*(EB + EA)
296 //
297 // = (B + A)*C' + EC*2^(n-e)*2^c =
298 // = (B + A)*C' + EC*2^(n-(e-c))
299 //
300 // Since EC is multiplied by 2^(n-(e-c)) the resulting error contains c
301 // less error bits than the input. c bits are shifted out to the left.
302 // [qed]
303
304 if (C.getBitWidth() != A.getBitWidth()) {
305 ErrorMSBs = (unsigned)-1;
306 return *this;
307 }
308
309 // Multiplying by one is a no-op.
310 if (C.isOneValue()) {
311 return *this;
312 }
313
314 // Multiplying by zero removes the coefficient B and defines all bits.
315 if (C.isNullValue()) {
316 ErrorMSBs = 0;
317 deleteB();
318 }
319
320 // See Proof(2): Trailing zero bits indicate a left shift. This removes
321 // leading bits from the result even if they are undefined.
322 decErrorMSBs(C.countTrailingZeros());
323
324 A *= C;
325 pushBOperation(Mul, C);
326 return *this;
327 }
328
329 /// Apply a logical shift right on the polynomial
330 Polynomial &lshr(const APInt &C) {
331 // Theorem(1): (B + A + E*2^(n-e)) >> 1 => (B >> 1) + (A >> 1) + E'*2^(n-e')
332 // where
333 // e' = e + 1,
334 // E is a e-bit number,
335 // E' is a e'-bit number,
336 // holds under the following precondition:
337 // pre(1): A % 2 = 0
338 // pre(2): e < n, (see Theorem(2) for the trivial case with e=n)
339 // where >> expresses a logical shift to the right, with adding zeros.
340 //
341 // We need to show that for every E there is an E'
342 //
343 // B = b_h * 2^(n-1) + b_m * 2 + b_l
344 // A = a_h * 2^(n-1) + a_m * 2 (pre(1))
345 //
346 // where a_h, b_h, b_l are single bits, and a_m, b_m are (n-2) bit numbers
347 //
348 // Let X = (B + A + E*2^(n-e)) >> 1
349 // Let Y = (B >> 1) + (A >> 1) + E*2^(n-e) >> 1
350 //
351 // X = [B + A + E*2^(n-e)] >> 1 =
352 // = [ b_h * 2^(n-1) + b_m * 2 + b_l +
353 // + a_h * 2^(n-1) + a_m * 2 +
354 // + E * 2^(n-e) ] >> 1 =
355 //
356 // The sum is built by putting the overflow of [b_m + a_m] into the term
357 // 2^(n-1). As there are no more bits beyond 2^(n-1) the overflow within
358 // this bit is discarded. This is expressed by % 2.
359 //
360 // The bit in position 0 cannot overflow into the term (b_m + a_m).
361 //
362 // = [ ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-1) +
363 // + ((b_m + a_m) % 2^(n-2)) * 2 +
364 // + b_l + E * 2^(n-e) ] >> 1 =
365 //
366 // The shift is computed by dividing the terms by 2 and by cutting off
367 // b_l.
368 //
369 // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
370 // + ((b_m + a_m) % 2^(n-2)) +
371 // + E * 2^(n-(e+1)) =
372 //
373 // by the definition in the Theorem e+1 = e'
374 //
375 // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
376 // + ((b_m + a_m) % 2^(n-2)) +
377 // + E * 2^(n-e') =
378 //
379 // Compute Y by applying distributivity first
380 //
381 // Y = (B >> 1) + (A >> 1) + E*2^(n-e') =
382 // = (b_h * 2^(n-1) + b_m * 2 + b_l) >> 1 +
383 // + (a_h * 2^(n-1) + a_m * 2) >> 1 +
384 // + E * 2^(n-e) >> 1 =
385 //
386 // Again, the shift is computed by dividing the terms by 2 and by cutting
387 // off b_l.
388 //
389 // = b_h * 2^(n-2) + b_m +
390 // + a_h * 2^(n-2) + a_m +
391 // + E * 2^(n-(e+1)) =
392 //
393 // Again, the sum is built by putting the overflow of [b_m + a_m] into
394 // the term 2^(n-1). But this time there is room for a second bit in the
395 // term 2^(n-2); we add this bit to a new term and denote it o_h in a
396 // second step.
397 //
398 // = ([b_h + a_h + (b_m + a_m) >> (n-2)] >> 1) * 2^(n-1) +
399 // + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
400 // + ((b_m + a_m) % 2^(n-2)) +
401 // + E * 2^(n-(e+1)) =
402 //
403 // Let o_h = [b_h + a_h + (b_m + a_m) >> (n-2)] >> 1
404 // Further replace e+1 by e'.
405 //
406 // = o_h * 2^(n-1) +
407 // + ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
408 // + ((b_m + a_m) % 2^(n-2)) +
409 // + E * 2^(n-e') =
410 //
411 // Move o_h into the error term and construct E'. To ensure that there is
412 // no 2^x with negative x, this step requires pre(2) (e < n).
413 //
414 // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
415 // + ((b_m + a_m) % 2^(n-2)) +
416 // + o_h * 2^(e'-1) * 2^(n-e') + | pre(2), move 2^(e'-1)
417 // | out of the old exponent
418 // + E * 2^(n-e') =
419 // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
420 // + ((b_m + a_m) % 2^(n-2)) +
421 // + [o_h * 2^(e'-1) + E] * 2^(n-e') + | move 2^(e'-1) out of
422 // | the old exponent
423 //
424 // Let E' = o_h * 2^(e'-1) + E
425 //
426 // = ([b_h + a_h + (b_m + a_m) >> (n-2)] % 2) * 2^(n-2) +
427 // + ((b_m + a_m) % 2^(n-2)) +
428 // + E' * 2^(n-e')
429 //
430 // Because X and Y are distinct only in their error terms and E' can be
431 // constructed as shown the theorem holds.
432 // [qed]
433 //
434 // For completeness, in the case e = n it is also required to show that
435 // distributivity can be applied.
436 //
437 // In this case Theorem(1) transforms to (the pre-condition on A can also be
438 // dropped)
439 //
440 // Theorem(2): (B + A + E) >> 1 => (B >> 1) + (A >> 1) + E'
441 // where
442 // A, B, E, E' are two's complement numbers with the same bit
443 // width
444 //
445 // Let A + B + E = X
446 // Let (B >> 1) + (A >> 1) = Y
447 //
448 // Therefore we need to show that for every X and Y there is an E' which
449 // makes the equation
450 //
451 // X = Y + E'
452 //
453 // hold. This is trivially the case for E' = X - Y.
454 //
455 // [qed]
456 //
457 // Remark: Distributing lshr with an arbitrary shift amount n can be expressed as
458 // ((((B + A) lshr 1) lshr 1) ... ) {n times}.
459 // This construction induces n additional error bits at the left.
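// Numeric illustration of Theorem(1) (illustrative only; n = 4 bits, A = 6,
// e = 0, so pre(1) and pre(2) hold): for B = 11,
//   X = ((11 + 6) mod 16) >> 1 = 1 >> 1 = 0
//   Y = (11 >> 1) + (6 >> 1) = 5 + 3 = 8
// X and Y differ by 8 = 2^(n-1), i.e. by exactly one undefined MSB (e' = 1),
// which is why this operation increments ErrorMSBs by the shift amount.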
460
461 if (C.getBitWidth() != A.getBitWidth()) {
462 ErrorMSBs = (unsigned)-1;
463 return *this;
464 }
465
466 if (C.isNullValue())
467 return *this;
468
469 // Test if the result will be zero
470 unsigned shiftAmt = C.getZExtValue();
471 if (shiftAmt >= C.getBitWidth())
472 return mul(APInt(C.getBitWidth(), 0));
473
474 // The proof that shiftAmt LSBs are zero for at least one summand can only
475 // be given for the constant summand A.
476 //
477 // If this can be proven add shiftAmt to the error counter
478 // `ErrorMSBs`. Otherwise set all bits as undefined.
479 if (A.countTrailingZeros() < shiftAmt)
480 ErrorMSBs = A.getBitWidth();
481 else
482 incErrorMSBs(shiftAmt);
483
484 // Apply the operation.
485 pushBOperation(LShr, C);
486 A = A.lshr(shiftAmt);
487
488 return *this;
489 }
490
491 /// Apply a sign-extend or truncate operation on the polynomial.
492 Polynomial &sextOrTrunc(unsigned n) {
493 if (n < A.getBitWidth()) {
494 // Truncate: Clearly undefined Bits on the MSB side are removed
495 // if there are any.
496 decErrorMSBs(A.getBitWidth() - n);
497 A = A.trunc(n);
498 pushBOperation(Trunc, APInt(sizeof(n) * 8, n));
499 }
500 if (n > A.getBitWidth()) {
501 // Extend: Clearly extending first and adding later is different
502 // to adding first and extending later in all extended bits.
503 incErrorMSBs(n - A.getBitWidth());
504 A = A.sext(n);
505 pushBOperation(SExt, APInt(sizeof(n) * 8, n));
506 }
507
508 return *this;
509 }
510
511 /// Test if there is a coefficient B.
512 bool isFirstOrder() const { return V != nullptr; }
513
514 /// Test whether the coefficients B of two Polynomials are equal.
515 bool isCompatibleTo(const Polynomial &o) const {
516 // The polynomials use different bit widths.
517 if (A.getBitWidth() != o.A.getBitWidth())
518 return false;
519
520 // If neither Polynomial has the Coefficient B.
521 if (!isFirstOrder() && !o.isFirstOrder())
522 return true;
523
524 // The index variable is different.
525 if (V != o.V)
526 return false;
527
528 // Check the operations.
529 if (B.size() != o.B.size())
530 return false;
531
532 auto ob = o.B.begin();
533 for (auto &b : B) {
534 if (b != *ob)
535 return false;
536 ob++;
537 }
538
539 return true;
540 }
541
542 /// Subtract two polynomials, return an undefined polynomial if
543 /// subtraction is not possible.
544 Polynomial operator-(const Polynomial &o) const {
545 // Return an undefined polynomial if incompatible.
546 if (!isCompatibleTo(o))
547 return Polynomial();
548
549 // If the polynomials are compatible (meaning they have the same
550 // coefficient on B), B is eliminated. Thus a polynomial solely
551 // containing A is returned
552 return Polynomial(A - o.A, std::max(ErrorMSBs, o.ErrorMSBs));
553 }
554
555 /// Subtract a constant from a polynomial.
556 Polynomial operator-(uint64_t C) const {
557 Polynomial Result(*this);
558 Result.A -= C;
559 return Result;
560 }
561
562 /// Add a constant to a polynomial.
563 Polynomial operator+(uint64_t C) const {
564 Polynomial Result(*this);
565 Result.A += C;
566 return Result;
567 }
568
569 /// Returns true if it can be proven that two Polynomials are equal.
570 bool isProvenEqualTo(const Polynomial &o) {
571 // Subtract both polynomials and test if it is fully defined and zero.
572 Polynomial r = *this - o;
573 return (r.ErrorMSBs == 0) && (!r.isFirstOrder()) && (r.A.isNullValue());
574 }
575
576 /// Print the polynomial into a stream.
577 void print(raw_ostream &OS) const {
578 OS << "[{#ErrBits:" << ErrorMSBs << "} ";
579
580 if (V) {
581 for (auto b : B)
582 OS << "(";
583 OS << "(" << *V << ") ";
584
585 for (auto b : B) {
586 switch (b.first) {
587 case LShr:
588 OS << "LShr ";
589 break;
590 case Mul:
591 OS << "Mul ";
592 break;
593 case SExt:
594 OS << "SExt ";
595 break;
596 case Trunc:
597 OS << "Trunc ";
598 break;
599 }
600
601 OS << b.second << ") ";
602 }
603 }
604
605 OS << "+ " << A << "]";
606 }
607
608 private:
609 void deleteB() {
610 V = nullptr;
611 B.clear();
612 }
613
614 void pushBOperation(const BOps Op, const APInt &C) {
615 if (isFirstOrder()) {
616 B.push_back(std::make_pair(Op, C));
617 return;
618 }
619 }
620 };
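// Illustrative usage sketch (not part of the pass): how the offset proof from
// the example in the class comment above could be phrased against this class.
// It assumes IDX is the i64 index value %IDX (so all APInts are 64 bits wide)
// and that both GEPs scale their index by sizeof(<4 x float>) = 16.
#if 0
static bool provesConsecutiveVectors(Value *IDX) {
  Polynomial Pa(IDX), Pb(IDX);
  Pa.add(APInt(64, 2)).lshr(APInt(64, 1)).mul(APInt(64, 16)); // %Pa3 scaled by the GEP
  Pb.add(APInt(64, 4)).lshr(APInt(64, 1)).mul(APInt(64, 16)); // %Pb3 scaled by the GEP
  // Subtraction eliminates the common coefficient B; the difference is the
  // fully defined constant 16, i.e. %Vb is loaded 16 bytes after %Va.
  return Pb.isProvenEqualTo(Pa + 16);
}
#endif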
621
622 static raw_ostream &operator<<(raw_ostream &OS, const Polynomial &P) {
623 P.print(OS);
624 return OS;
625 }
626
627 /// VectorInfo stores the following abstract information for each vector
628 /// element:
629 ///
630 /// 1) the memory address loaded into the element, as a Polynomial,
631 /// 2) a set of load instructions necessary to construct the vector,
632 /// 3) a set of all other instructions that are necessary to create the vector and
633 /// 4) a pointer value that can be used as relative base for all elements.
634 struct VectorInfo {
635 private:
636 VectorInfo(const VectorInfo &c) : VTy(c.VTy) {
637 llvm_unreachable(
638 "Copying VectorInfo is neither implemented nor necessary,");
639 }
640
641 public:
642 /// Information of a Vector Element
643 struct ElementInfo {
644 /// Offset Polynomial.
645 Polynomial Ofs;
646
647 /// The load instruction used to load the entry. LI is null if the pointer
648 /// of the load instruction does not point to the entry.
649 LoadInst *LI;
650
651 ElementInfo(Polynomial Offset = Polynomial(), LoadInst *LI = nullptr)
652 : Ofs(Offset), LI(LI) {}
653 };
654
655 /// Basic-block the load instructions are within
656 BasicBlock *BB;
657
659 /// Pointer value of all participating load instructions
659 Value *PV;
660
661 /// Participating load instructions
662 std::set<LoadInst *> LIs;
663
664 /// Participating instructions
665 std::set<Instruction *> Is;
666
667 /// Final shuffle-vector instruction
668 ShuffleVectorInst *SVI;
669
670 /// Information of the offset for each vector element
671 ElementInfo *EI;
672
673 /// Vector Type
674 VectorType *const VTy;
675
676 VectorInfo(VectorType *VTy)
677 : BB(nullptr), PV(nullptr), LIs(), Is(), SVI(nullptr), VTy(VTy) {
678 EI = new ElementInfo[VTy->getNumElements()];
679 }
680
681 virtual ~VectorInfo() { delete[] EI; }
682
683 unsigned getDimension() const { return VTy->getNumElements(); }
684
685 /// Test if the VectorInfo can be part of an interleaved load with the
686 /// specified factor.
687 ///
688 /// \param Factor of the interleave
689 /// \param DL Target's DataLayout
690 ///
691 /// \returns true if this is possible and false if not
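///
/// Example (illustrative): for a <4 x float> VectorInfo (element size 4
/// bytes) and Factor == 4, element i must be loaded from EI[0].Ofs + i * 16.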
692 bool isInterleaved(unsigned Factor, const DataLayout &DL) const {
693 unsigned Size = DL.getTypeAllocSize(VTy->getElementType());
694 for (unsigned i = 1; i < getDimension(); i++) {
695 if (!EI[i].Ofs.isProvenEqualTo(EI[0].Ofs + i * Factor * Size)) {
696 return false;
697 }
698 }
699 return true;
700 }
701
702 /// Recursively computes the vector information stored in V.
703 ///
704 /// This function delegates the work to specialized implementations
705 ///
706 /// \param V Value to operate on
707 /// \param Result Result of the computation
708 ///
709 /// \returns false if no sensible information can be gathered.
710 static bool compute(Value *V, VectorInfo &Result, const DataLayout &DL) {
711 ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V);
712 if (SVI)
713 return computeFromSVI(SVI, Result, DL);
714 LoadInst *LI = dyn_cast<LoadInst>(V);
715 if (LI)
716 return computeFromLI(LI, Result, DL);
717 BitCastInst *BCI = dyn_cast<BitCastInst>(V);
718 if (BCI)
719 return computeFromBCI(BCI, Result, DL);
720 return false;
721 }
722
723 /// BitCastInst specialization to compute the vector information.
724 ///
725 /// \param BCI BitCastInst to operate on
726 /// \param Result Result of the computation
727 ///
728 /// \returns false if no sensible information can be gathered.
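///
/// Example (illustrative): recomputing a <2 x i64> operand for a <4 x i32>
/// result gives Factor == 2; each 8-byte source element i is split into two
/// 4-byte elements with offsets Old.EI[i].Ofs and Old.EI[i].Ofs + 4.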
729 static bool computeFromBCI(BitCastInst *BCI, VectorInfo &Result,
730 const DataLayout &DL) {
731 Instruction *Op = dyn_cast<Instruction>(BCI->getOperand(0));
732
733 if (!Op)
734 return false;
735
736 VectorType *VTy = dyn_cast<VectorType>(Op->getType());
737 if (!VTy)
738 return false;
739
740 // We can only cast from vectors with larger elements to vectors with smaller elements.
741 if (Result.VTy->getNumElements() % VTy->getNumElements())
742 return false;
743
744 unsigned Factor = Result.VTy->getNumElements() / VTy->getNumElements();
745 unsigned NewSize = DL.getTypeAllocSize(Result.VTy->getElementType());
746 unsigned OldSize = DL.getTypeAllocSize(VTy->getElementType());
747
748 if (NewSize * Factor != OldSize)
749 return false;
750
751 VectorInfo Old(VTy);
752 if (!compute(Op, Old, DL))
753 return false;
754
755 for (unsigned i = 0; i < Result.VTy->getNumElements(); i += Factor) {
756 for (unsigned j = 0; j < Factor; j++) {
757 Result.EI[i + j] =
758 ElementInfo(Old.EI[i / Factor].Ofs + j * NewSize,
759 j == 0 ? Old.EI[i / Factor].LI : nullptr);
760 }
761 }
762
763 Result.BB = Old.BB;
764 Result.PV = Old.PV;
765 Result.LIs.insert(Old.LIs.begin(), Old.LIs.end());
766 Result.Is.insert(Old.Is.begin(), Old.Is.end());
767 Result.Is.insert(BCI);
768 Result.SVI = nullptr;
769
770 return true;
771 }
772
773 /// ShuffleVectorInst specialization to compute vector information.
774 ///
775 /// \param SVI ShuffleVectorInst to operate on
776 /// \param Result Result of the computation
777 ///
778 /// Compute the left and the right side vector information and merge them by
779 /// applying the shuffle operation. This function also ensures that the left
780 /// and right side have compatible loads. This means that all loads are
781 /// within the same basic block and are based on the same pointer.
782 ///
783 /// \returns false if no sensible information can be gathered.
784 static bool computeFromSVI(ShuffleVectorInst *SVI, VectorInfo &Result,
785 const DataLayout &DL) {
786 VectorType *ArgTy = dyn_cast<VectorType>(SVI->getOperand(0)->getType());
787 assert(ArgTy && "ShuffleVector Operand is not a VectorType");
788
789 // Compute the left hand vector information.
790 VectorInfo LHS(ArgTy);
791 if (!compute(SVI->getOperand(0), LHS, DL))
792 LHS.BB = nullptr;
793
794 // Compute the right hand vector information.
795 VectorInfo RHS(ArgTy);
796 if (!compute(SVI->getOperand(1), RHS, DL))
797 RHS.BB = nullptr;
798
799 // Neither operand produced sensible results?
800 if (!LHS.BB && !RHS.BB)
801 return false;
802 // Only RHS produced sensible results?
803 else if (!LHS.BB) {
804 Result.BB = RHS.BB;
805 Result.PV = RHS.PV;
806 }
807 // Only LHS produced sensible results?
808 else if (!RHS.BB) {
809 Result.BB = LHS.BB;
810 Result.PV = LHS.PV;
811 }
812 // Both operands produced sensible results?
813 else if ((LHS.BB == RHS.BB) && (LHS.PV == RHS.PV)) {
814 Result.BB = LHS.BB;
815 Result.PV = LHS.PV;
816 }
817 // Both operands produced sensible results but they are incompatible.
818 else {
819 return false;
820 }
821
822 // Merge and apply the operation on the offset information.
823 if (LHS.BB) {
824 Result.LIs.insert(LHS.LIs.begin(), LHS.LIs.end());
825 Result.Is.insert(LHS.Is.begin(), LHS.Is.end());
826 }
827 if (RHS.BB) {
828 Result.LIs.insert(RHS.LIs.begin(), RHS.LIs.end());
829 Result.Is.insert(RHS.Is.begin(), RHS.Is.end());
830 }
831 Result.Is.insert(SVI);
832 Result.SVI = SVI;
833
834 int j = 0;
835 for (int i : SVI->getShuffleMask()) {
836 assert((i < 2 * (signed)ArgTy->getNumElements()) &&
837 "Invalid ShuffleVectorInst (index out of bounds)");
838
839 if (i < 0)
840 Result.EI[j] = ElementInfo();
841 else if (i < (signed)ArgTy->getNumElements()) {
842 if (LHS.BB)
843 Result.EI[j] = LHS.EI[i];
844 else
845 Result.EI[j] = ElementInfo();
846 } else {
847 if (RHS.BB)
848 Result.EI[j] = RHS.EI[i - ArgTy->getNumElements()];
849 else
850 Result.EI[j] = ElementInfo();
851 }
852 j++;
853 }
854
855 return true;
856 }
857
858 /// LoadInst specialization to compute vector information.
859 ///
860 /// This function also acts as abort condition to the recursion.
861 ///
862 /// \param LI LoadInst to operate on
863 /// \param Result Result of the computation
864 ///
865 /// \returns false if no sensible information can be gathered.
866 static bool computeFromLI(LoadInst *LI, VectorInfo &Result,
867 const DataLayout &DL) {
868 Value *BasePtr;
869 Polynomial Offset;
870
871 if (LI->isVolatile())
872 return false;
873
874 if (LI->isAtomic())
875 return false;
876
877 // Get the base polynomial
878 computePolynomialFromPointer(*LI->getPointerOperand(), Offset, BasePtr, DL);
879
880 Result.BB = LI->getParent();
881 Result.PV = BasePtr;
882 Result.LIs.insert(LI);
883 Result.Is.insert(LI);
884
885 for (unsigned i = 0; i < Result.getDimension(); i++) {
886 Value *Idx[2] = {
887 ConstantInt::get(Type::getInt32Ty(LI->getContext()), 0),
888 ConstantInt::get(Type::getInt32Ty(LI->getContext()), i),
889 };
890 int64_t Ofs = DL.getIndexedOffsetInType(Result.VTy, makeArrayRef(Idx, 2));
891 Result.EI[i] = ElementInfo(Offset + Ofs, i == 0 ? LI : nullptr);
892 }
893
894 return true;
895 }
896
897 /// Recursively compute polynomial of a value.
898 ///
899 /// \param BO Input binary operation
900 /// \param Result Result polynomial
901 static void computePolynomialBinOp(BinaryOperator &BO, Polynomial &Result) {
902 Value *LHS = BO.getOperand(0);
903 Value *RHS = BO.getOperand(1);
904
905 // Find the RHS Constant if any
906 ConstantInt *C = dyn_cast<ConstantInt>(RHS);
907 if ((!C) && BO.isCommutative()) {
908 C = dyn_cast<ConstantInt>(LHS);
909 if (C)
910 std::swap(LHS, RHS);
911 }
912
913 switch (BO.getOpcode()) {
914 case Instruction::Add:
915 if (!C)
916 break;
917
918 computePolynomial(*LHS, Result);
919 Result.add(C->getValue());
920 return;
921
922 case Instruction::LShr:
923 if (!C)
924 break;
925
926 computePolynomial(*LHS, Result);
927 Result.lshr(C->getValue());
928 return;
929
930 default:
931 break;
932 }
933
934 Result = Polynomial(&BO);
935 }
936
937 /// Recursively compute polynomial of a value
938 ///
939 /// \param V input value
940 /// \param Result result polynomial
941 static void computePolynomial(Value &V, Polynomial &Result) {
942 if (isa<BinaryOperator>(&V))
943 computePolynomialBinOp(*dyn_cast<BinaryOperator>(&V), Result);
944 else
945 Result = Polynomial(&V);
946 }
947
948 /// Compute the Polynomial representation of a Pointer type.
949 ///
950 /// \param Ptr input pointer value
951 /// \param Result result polynomial
952 /// \param BasePtr pointer the polynomial is based on
953 /// \param DL Datalayout of the target machine
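///
/// Example (illustrative): for "getelementptr <4 x float>, <4 x float>* %p,
/// i64 %i" the index %i is not constant, so Result becomes the polynomial
/// %i * 16 + 0 and BasePtr becomes %p.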
954 static void computePolynomialFromPointer(Value &Ptr, Polynomial &Result,
955 Value *&BasePtr,
956 const DataLayout &DL) {
957 // Not a pointer type? Return an undefined polynomial
958 PointerType *PtrTy = dyn_cast<PointerType>(Ptr.getType());
959 if (!PtrTy) {
960 Result = Polynomial();
961 BasePtr = nullptr;
return;
962 }
963 unsigned PointerBits =
964 DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace());
965
966 /// Skip pointer casts. Return Zero polynomial otherwise
967 if (isa<CastInst>(&Ptr)) {
968 CastInst &CI = *cast<CastInst>(&Ptr);
969 switch (CI.getOpcode()) {
970 case Instruction::BitCast:
971 computePolynomialFromPointer(*CI.getOperand(0), Result, BasePtr, DL);
972 break;
973 default:
974 BasePtr = &Ptr;
975 Result = Polynomial(PointerBits, 0);
976 break;
977 }
978 }
979 /// Resolve GetElementPtrInst.
980 else if (isa<GetElementPtrInst>(&Ptr)) {
981 GetElementPtrInst &GEP = *cast<GetElementPtrInst>(&Ptr);
982
983 APInt BaseOffset(PointerBits, 0);
984
985 // Check if we can compute the Offset with accumulateConstantOffset
986 if (GEP.accumulateConstantOffset(DL, BaseOffset)) {
987 Result = Polynomial(BaseOffset);
988 BasePtr = GEP.getPointerOperand();
989 return;
990 } else {
991 // Otherwise we allow that the last index operand of the GEP is
992 // non-constant.
993 unsigned idxOperand, e;
994 SmallVector<Value *, 8> Indices;
995 for (idxOperand = 1, e = GEP.getNumOperands(); idxOperand < e;
996 idxOperand++) {
997 ConstantInt *IDX = dyn_cast<ConstantInt>(GEP.getOperand(idxOperand));
998 if (!IDX)
999 break;
1000 Indices.push_back(IDX);
1001 }
1002
1003 // It must also be the last operand.
1004 if (idxOperand + 1 != e) {
1005 Result = Polynomial();
1006 BasePtr = nullptr;
1007 return;
1008 }
1009
1010 // Compute the polynomial of the index operand.
1011 computePolynomial(*GEP.getOperand(idxOperand), Result);
1012
1013 // Compute base offset from zero based index, excluding the last
1014 // variable operand.
1015 BaseOffset =
1016 DL.getIndexedOffsetInType(GEP.getSourceElementType(), Indices);
1017
1018 // Apply the operations of GEP to the polynomial.
1019 unsigned ResultSize = DL.getTypeAllocSize(GEP.getResultElementType());
1020 Result.sextOrTrunc(PointerBits);
1021 Result.mul(APInt(PointerBits, ResultSize));
1022 Result.add(BaseOffset);
1023 BasePtr = GEP.getPointerOperand();
1024 }
1025 }
1026 // All other instructions are handled by using the value as base pointer and
1027 // a zero polynomial.
1028 else {
1029 BasePtr = &Ptr;
1030 Result = Polynomial(DL.getIndexSizeInBits(PtrTy->getPointerAddressSpace()), 0);
1031 }
1032 }
1033
1034 #ifndef NDEBUG
1035 void print(raw_ostream &OS) const {
1036 if (PV)
1037 OS << *PV;
1038 else
1039 OS << "(none)";
1040 OS << " + ";
1041 for (unsigned i = 0; i < getDimension(); i++)
1042 OS << ((i == 0) ? "[" : ", ") << EI[i].Ofs;
1043 OS << "]";
1044 }
1045 #endif
1046 };
1047
1048 #ifndef NDEBUG
1049 static raw_ostream &operator<<(raw_ostream &OS, const VectorInfo &S) {
1050 S.print(OS);
1051 return OS;
1052 }
1053 #endif
1054 } // anonymous namespace
1055
1056 bool InterleavedLoadCombineImpl::findPattern(
1057 std::list<VectorInfo> &Candidates, std::list<VectorInfo> &InterleavedLoad,
1058 unsigned Factor, const DataLayout &DL) {
1059 for (auto C0 = Candidates.begin(), E0 = Candidates.end(); C0 != E0; ++C0) {
1060 unsigned i;
1061 // Try to find an interleaved load using the front of Worklist as first line
1062 unsigned Size = DL.getTypeAllocSize(C0->VTy->getElementType());
1063
1064 // List containing iterators pointing to the VectorInfos of the candidates
1065 std::vector<std::list<VectorInfo>::iterator> Res(Factor, Candidates.end());
1066
1067 for (auto C = Candidates.begin(), E = Candidates.end(); C != E; C++) {
1068 if (C->VTy != C0->VTy)
1069 continue;
1070 if (C->BB != C0->BB)
1071 continue;
1072 if (C->PV != C0->PV)
1073 continue;
1074
1075 // Check if the current value matches any of the Factor - 1 remaining lines.
1076 for (i = 1; i < Factor; i++) {
1077 if (C->EI[0].Ofs.isProvenEqualTo(C0->EI[0].Ofs + i * Size)) {
1078 Res[i] = C;
1079 }
1080 }
1081
1082 for (i = 1; i < Factor; i++) {
1083 if (Res[i] == Candidates.end())
1084 break;
1085 }
1086 if (i == Factor) {
1087 Res[0] = C0;
1088 break;
1089 }
1090 }
1091
1092 if (Res[0] != Candidates.end()) {
1093 // Move the result into the output
1094 for (unsigned i = 0; i < Factor; i++) {
1095 InterleavedLoad.splice(InterleavedLoad.end(), Candidates, Res[i]);
1096 }
1097
1098 return true;
1099 }
1100 }
1101 return false;
1102 }
1103
1104 LoadInst *
1105 InterleavedLoadCombineImpl::findFirstLoad(const std::set<LoadInst *> &LIs) {
1106 assert(!LIs.empty() && "No load instructions given.");
1107
1108 // All LIs are within the same BB. Select the first for a reference.
1109 BasicBlock *BB = (*LIs.begin())->getParent();
1110 BasicBlock::iterator FLI =
1111 std::find_if(BB->begin(), BB->end(), [&LIs](Instruction &I) -> bool {
1112 return is_contained(LIs, &I);
1113 });
1114 assert(FLI != BB->end());
1115
1116 return cast<LoadInst>(FLI);
1117 }
1118
1119 bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
1120 OptimizationRemarkEmitter &ORE) {
1121 LLVM_DEBUG(dbgs() << "Checking interleaved load\n");
1122 for (auto &VI : InterleavedLoad)
1123 LLVM_DEBUG(dbgs() << VI << "\n");
1124
1125 // The insertion point is the LoadInst which loads the first values. The
1126 // following tests are used to prove that the combined load can be inserted
1127 // just before InsertionPoint.
1128 LoadInst *InsertionPoint = InterleavedLoad.front().EI[0].LI;
1129
1130 // Test if the offset is computed
1131 if (!InsertionPoint)
1132 return false;
1133
1134 std::set<LoadInst *> LIs;
1135 std::set<Instruction *> Is;
1136 std::set<Instruction *> SVIs;
1137
1138 unsigned InterleavedCost;
1139 unsigned InstructionCost = 0;
1140
1141 // Get the interleave factor
1142 unsigned Factor = InterleavedLoad.size();
1143
1144 // Merge all input sets used in analysis
1145 for (auto &VI : InterleavedLoad) {
1146 // Generate a set of all load instructions to be combined
1147 LIs.insert(VI.LIs.begin(), VI.LIs.end());
1148
1149 // Generate a set of all instructions taking part in load
1150 // interleaved. This list excludes the instructions necessary for the
1151 // polynomial construction.
1152 Is.insert(VI.Is.begin(), VI.Is.end());
1153
1154 // Generate the set of the final ShuffleVectorInst.
1155 SVIs.insert(VI.SVI);
1156 }
1157
1158 // There is nothing to combine.
1159 if (LIs.size() < 2)
1160 return false;
1161
1162 // Test if all participating instructions will be dead after the
1163 // transformation. If intermediate results are used, no performance gain can
1164 // be expected. Also sum the cost of the instructions being left dead.
1165 for (auto &I : Is) {
1166 // Compute the old cost
1167 InstructionCost +=
1168 TTI.getInstructionCost(I, TargetTransformInfo::TCK_Latency);
1169
1170 // The final SVIs are allowed not to be dead, all uses will be replaced
1171 if (SVIs.find(I) != SVIs.end())
1172 continue;
1173
1174 // If there are users outside the set to be eliminated, we abort the
1175 // transformation. No gain can be expected.
1176 for (const auto &U : I->users()) {
1177 if (Is.find(dyn_cast<Instruction>(U)) == Is.end())
1178 return false;
1179 }
1180 }
1181
1182 // We know that all LoadInst are within the same BB. This guarantees that
1183 // either everything or nothing is loaded.
1184 LoadInst *First = findFirstLoad(LIs);
1185
1186 // To make sure that the loads can be combined, iterate over all loads and
1187 // test that the corresponding defining access dominates the first LI. This
1188 // that there are no aliasing stores in between the loads.
1189 auto FMA = MSSA.getMemoryAccess(First);
1190 for (auto LI : LIs) {
1191 auto MADef = MSSA.getMemoryAccess(LI)->getDefiningAccess();
1192 if (!MSSA.dominates(MADef, FMA))
1193 return false;
1194 }
1195 assert(!LIs.empty() && "There are no LoadInst to combine");
1196
1197 // It is necessary that insertion point dominates all final ShuffleVectorInst.
1198 for (auto &VI : InterleavedLoad) {
1199 if (!DT.dominates(InsertionPoint, VI.SVI))
1200 return false;
1201 }
1202
1203 // All checks are done. Add instructions detectable by InterleavedAccessPass.
1204 // The old instructions are left dead.
1205 IRBuilder<> Builder(InsertionPoint);
1206 Type *ETy = InterleavedLoad.front().SVI->getType()->getElementType();
1207 unsigned ElementsPerSVI =
1208 InterleavedLoad.front().SVI->getType()->getNumElements();
1209 VectorType *ILTy = VectorType::get(ETy, Factor * ElementsPerSVI);
1210
1211 SmallVector<unsigned, 4> Indices;
1212 for (unsigned i = 0; i < Factor; i++)
1213 Indices.push_back(i);
1214 InterleavedCost = TTI.getInterleavedMemoryOpCost(
1215 Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlignment(),
1216 InsertionPoint->getPointerAddressSpace());
1217
1218 if (InterleavedCost >= InstructionCost) {
1219 return false;
1220 }
1221
1222 // Create a pointer cast for the wide load.
1223 auto CI = Builder.CreatePointerCast(InsertionPoint->getOperand(0),
1224 ILTy->getPointerTo(),
1225 "interleaved.wide.ptrcast");
1226
1227 // Create the wide load and update the MemorySSA.
1228 auto LI = Builder.CreateAlignedLoad(CI, InsertionPoint->getAlignment(),
1229 "interleaved.wide.load");
1230 auto MSSAU = MemorySSAUpdater(&MSSA);
1231 MemoryUse *MSSALoad = cast<MemoryUse>(MSSAU.createMemoryAccessBefore(
1232 LI, nullptr, MSSA.getMemoryAccess(InsertionPoint)));
1233 MSSAU.insertUse(MSSALoad);
1234
1235 // Create the final SVIs and replace all uses.
1236 int i = 0;
1237 for (auto &VI : InterleavedLoad) {
1238 SmallVector<uint32_t, 4> Mask;
1239 for (unsigned j = 0; j < ElementsPerSVI; j++)
1240 Mask.push_back(i + j * Factor);
1241
1242 Builder.SetInsertPoint(VI.SVI);
1243 auto SVI = Builder.CreateShuffleVector(LI, UndefValue::get(LI->getType()),
1244 Mask, "interleaved.shuffle");
1245 VI.SVI->replaceAllUsesWith(SVI);
1246 i++;
1247 }
1248
1249 NumInterleavedLoadCombine++;
1250 ORE.emit([&]() {
1251 return OptimizationRemark(DEBUG_TYPE, "Combined Interleaved Load", LI)
1252 << "Load interleaved combined with factor "
1253 << ore::NV("Factor", Factor);
1254 });
1255
1256 return true;
1257 }
1258
1259 bool InterleavedLoadCombineImpl::run() {
1260 OptimizationRemarkEmitter ORE(&F);
1261 bool changed = false;
1262 unsigned MaxFactor = TLI.getMaxSupportedInterleaveFactor();
1263
1264 auto &DL = F.getParent()->getDataLayout();
1265
1266 // Start with the highest factor to avoid combining and recombining.
1267 for (unsigned Factor = MaxFactor; Factor >= 2; Factor--) {
1268 std::list<VectorInfo> Candidates;
1269
1270 for (BasicBlock &BB : F) {
1271 for (Instruction &I : BB) {
1272 if (auto SVI = dyn_cast<ShuffleVectorInst>(&I)) {
1273
1274 Candidates.emplace_back(SVI->getType());
1275
1276 if (!VectorInfo::computeFromSVI(SVI, Candidates.back(), DL)) {
1277 Candidates.pop_back();
1278 continue;
1279 }
1280
1281 if (!Candidates.back().isInterleaved(Factor, DL)) {
1282 Candidates.pop_back();
1283 }
1284 }
1285 }
1286 }
1287
1288 std::list<VectorInfo> InterleavedLoad;
1289 while (findPattern(Candidates, InterleavedLoad, Factor, DL)) {
1290 if (combine(InterleavedLoad, ORE)) {
1291 changed = true;
1292 } else {
1293 // Remove the first element of the Interleaved Load but put the others
1294 // back on the list and continue searching
1295 Candidates.splice(Candidates.begin(), InterleavedLoad,
1296 std::next(InterleavedLoad.begin()),
1297 InterleavedLoad.end());
1298 }
1299 InterleavedLoad.clear();
1300 }
1301 }
1302
1303 return changed;
1304 }
1305
1306 namespace {
1307 /// This pass combines interleaved loads into a pattern detectable by
1308 /// InterleavedAccessPass.
1309 struct InterleavedLoadCombine : public FunctionPass {
1310 static char ID;
1311
1312 InterleavedLoadCombine() : FunctionPass(ID) {
1313 initializeInterleavedLoadCombinePass(*PassRegistry::getPassRegistry());
1314 }
1315
1316 StringRef getPassName() const override {
1317 return "Interleaved Load Combine Pass";
1318 }
1319
1320 bool runOnFunction(Function &F) override {
1321 if (DisableInterleavedLoadCombine)
1322 return false;
1323
1324 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
1325 if (!TPC)
1326 return false;
1327
1328 LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName()
1329 << "\n");
1330
1331 return InterleavedLoadCombineImpl(
1332 F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
1333 getAnalysis<MemorySSAWrapperPass>().getMSSA(),
1334 TPC->getTM<TargetMachine>())
1335 .run();
1336 }
1337
1338 void getAnalysisUsage(AnalysisUsage &AU) const override {
1339 AU.addRequired<MemorySSAWrapperPass>();
1340 AU.addRequired<DominatorTreeWrapperPass>();
1341 FunctionPass::getAnalysisUsage(AU);
1342 }
1343
1344 private:
1345 };
1346 } // anonymous namespace
1347
1348 char InterleavedLoadCombine::ID = 0;
1349
1350 INITIALIZE_PASS_BEGIN(
1351 InterleavedLoadCombine, DEBUG_TYPE,
1352 "Combine interleaved loads into wide loads and shufflevector instructions",
1353 false, false)
1354 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
1355 INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
1356 INITIALIZE_PASS_END(
1357 InterleavedLoadCombine, DEBUG_TYPE,
1358 "Combine interleaved loads into wide loads and shufflevector instructions",
1359 false, false)
1360
1361 FunctionPass *
1362 llvm::createInterleavedLoadCombinePass() {
1363 auto P = new InterleavedLoadCombine();
1364 return P;
1365 }
418418 TargetPassConfig::addIRPasses();
419419
420420 // Match interleaved memory accesses to ldN/stN intrinsics.
421 if (TM->getOptLevel() != CodeGenOpt::None)
421 if (TM->getOptLevel() != CodeGenOpt::None) {
422 addPass(createInterleavedLoadCombinePass());
422423 addPass(createInterleavedAccessPass());
424 }
423425
424426 if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) {
425427 // Call SeparateConstOffsetFromGEP pass to extract constants within indices
4747 ; CHECK-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining)
4848 ; CHECK-NEXT: Scalarize Masked Memory Intrinsics
4949 ; CHECK-NEXT: Expand reduction intrinsics
50 ; CHECK-NEXT: Dominator Tree Construction
51 ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
52 ; CHECK-NEXT: Function Alias Analysis Results
53 ; CHECK-NEXT: Memory SSA
54 ; CHECK-NEXT: Interleaved Load Combine Pass
5055 ; CHECK-NEXT: Dominator Tree Construction
5156 ; CHECK-NEXT: Interleaved Access Pass
5257 ; CHECK-NEXT: Natural Loop Information
0 ; RUN: llc < %s | FileCheck --check-prefix AS %s
1 ; RUN: opt -S -interleaved-load-combine < %s | FileCheck %s
2
3 ; ModuleID = 'aarch64_interleaved-ld-combine.bc'
4 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
5 target triple = "arm64--linux-gnu"
6
7 ; This should be lowered into LD4
8 define void @aarch64_ilc_const(<4 x float>* %ptr) {
9 entry:
10
11 ;;; Check LLVM transformation
12 ; CHECK-LABEL: @aarch64_ilc_const(
13 ; CHECK-DAG: [[GEP:%.+]] = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
14 ; CHECK-DAG: [[CAST:%.+]] = bitcast <4 x float>* [[GEP]] to <16 x float>*
15 ; CHECK-DAG: [[LOAD:%.+]] = load <16 x float>, <16 x float>* [[CAST]], align 16
16 ; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32>
17 ; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32>
18 ; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32>
19 ; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32>
20 ; CHECK: ret void
21
22 ;;; Check if it gets lowered
23 ; AS-LABEL: aarch64_ilc_const
24 ; AS: ld4
25 ; AS: ret
26
27 %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 2
28 %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 3
29 %gep3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 4
30 %gep4 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 5
31 %ld1 = load <4 x float>, <4 x float>* %gep1, align 16
32 %ld2 = load <4 x float>, <4 x float>* %gep2, align 16
33 %ld3 = load <4 x float>, <4 x float>* %gep3, align 16
34 %ld4 = load <4 x float>, <4 x float>* %gep4, align 16
35 %sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32>
36 %sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32>
37 %sv3 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32>
38 %sv4 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32>
39 %m0_3 = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32>
40 %m4_7 = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32>
41 %m8_11 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32>
42 %m12_15 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32>
43
44 store <4 x float> %m0_3, <4 x float>* %gep1, align 16
45 store <4 x float> %m4_7, <4 x float>* %gep2, align 16
46 store <4 x float> %m8_11, <4 x float>* %gep3, align 16
47 store <4 x float> %m12_15, <4 x float>* %gep4, align 16
48 ret void
49 }
50
51 ; This should be lowered into LD4
52 define void @aarch64_ilc_idx(<4 x float>* %ptr, i64 %idx) {
53 entry:
54
55 ;;; Check LLVM transformation
56 ; CHECK-LABEL: @aarch64_ilc_idx(
57 ; CHECK-DAG: [[ADD:%.+]] = add i64 %idx, 16
58 ; CHECK-DAG: [[LSHR:%.+]] = lshr i64 [[ADD]], 2
59 ; CHECK-DAG: [[GEP:%.+]] = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 [[LSHR]]
60 ; CHECK-DAG: [[CAST:%.+]] = bitcast <4 x float>* [[GEP]] to <16 x float>*
61 ; CHECK-DAG: [[LOAD:%.+]] = load <16 x float>, <16 x float>* [[CAST]], align 16
62 ; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32>
63 ; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32>
64 ; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32>
65 ; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32>
66 ; CHECK: ret void
67
68 ; AS-LABEL: aarch64_ilc_idx
69 ; AS-DAG: lsl [[LSL:x[0-9]+]], x1, #2
70 ; AS-DAG: add [[ADD:x[0-9]+]], [[LSL]], #64
71 ; AS-DAG: and [[AND:x[0-9]+]], [[ADD]], #0xfffffffffffffff0
72 ; AS-DAG: add [[ADR:x[0-9]+]], x0, [[AND]]
73 ; AS-DAG: ld4 { v[[V0:[0-9]+]].4s, v[[V1:[0-9]+]].4s, v[[V2:[0-9]+]].4s, v[[V3:[0-9]+]].4s }, {{\[}}[[ADR]]{{\]}}
74 ; AS-DAG: str q[[V0]]
75 ; AS-DAG: str q[[V1]]
76 ; AS-DAG: str q[[V2]]
77 ; AS-DAG: str q[[V3]]
78 ; AS: ret
79
80 %a2 = add i64 %idx, 20
81 %idx2 = lshr i64 %a2, 2
82 %a3 = add i64 %idx, 24
83 %a1 = add i64 %idx, 16
84 %idx1 = lshr i64 %a1, 2
85 %idx3 = lshr i64 %a3, 2
86 %a4 = add i64 %idx, 28
87 %idx4 = lshr i64 %a4, 2
88
89 %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 %idx2
90 %gep4 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 %idx4
91 %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 %idx1
92 %gep3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 %idx3
93 %ld1 = load <4 x float>, <4 x float>* %gep1, align 16
94 %ld2 = load <4 x float>, <4 x float>* %gep2, align 16
95 %ld3 = load <4 x float>, <4 x float>* %gep3, align 16
96 %ld4 = load <4 x float>, <4 x float>* %gep4, align 16
97 %sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32>
98 %sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32>
99 %sv3 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32>
100 %sv4 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32>
101 %m0_3 = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32>
102 %m4_7 = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32>
103 %m8_11 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32>
104 %m12_15 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32>
105
106 store <4 x float> %m0_3, <4 x float>* %gep1, align 16
107 store <4 x float> %m4_7, <4 x float>* %gep2, align 16
108 store <4 x float> %m8_11, <4 x float>* %gep3, align 16
109 store <4 x float> %m12_15, <4 x float>* %gep4, align 16
110 ret void
111 }
112
113 ; This should be lowered into LD4; an offset has to be taken into account.
114 %struct.ilc = type <{ float, [0 x <4 x float>] }>
115 define void @aarch64_ilc_struct(%struct.ilc* %ptr, i64 %idx) {
116 entry:
117
118 ;;; Check LLVM transformation
119 ; CHECK-LABEL: @aarch64_ilc_struct(
120 ; CHECK-DAG: [[LSHR:%.+]] = lshr i64 %idx, 2
121 ; CHECK-DAG: [[GEP:%.+]] = getelementptr %struct.ilc, %struct.ilc* %ptr, i32 0, i32 1, i64 [[LSHR]]
122 ; CHECK-DAG: [[CAST:%.+]] = bitcast <4 x float>* [[GEP]] to <16 x float>*
123 ; CHECK-DAG: [[LOAD:%.+]] = load <16 x float>, <16 x float>* [[CAST]], align 4
124 ; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32>
125 ; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32>
126 ; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32>
127 ; CHECK-DAG: %{{.* }}= shufflevector <16 x float> [[LOAD]], <16 x float> undef, <4 x i32>
128 ; CHECK: ret void
129
130 ; AS-LABEL: aarch64_ilc_struct
131 ; AS-DAG: lsl [[LSL:x[0-9]+]], x1, #2
132 ; AS-DAG: add [[ADD:x[0-9]+]], x0, #4
133 ; AS-DAG: and [[AND:x[0-9]+]], [[LSL]], #0xfffffffffffffff0
134 ; AS-DAG: add [[ADR:x[0-9]+]], [[ADD]], [[AND]]
135 ; AS-DAG: ld4 { v[[V0:[0-9]+]].4s, v[[V1:[0-9]+]].4s, v[[V2:[0-9]+]].4s, v[[V3:[0-9]+]].4s }, {{\[}}[[ADR]]{{\]}}
136 ; AS-DAG: str q[[V0]]
137 ; AS-DAG: str q[[V1]]
138 ; AS-DAG: str q[[V2]]
139 ; AS-DAG: str q[[V3]]
140 ; AS: ret
141
142 %a1 = add i64 %idx, 4
143 %idx2 = lshr i64 %a1, 2
144 %a2 = add i64 %idx, 8
145 %idx3 = lshr i64 %a2, 2
146 %a3 = add i64 %idx, 12
147 %idx4 = lshr i64 %a3, 2
148
149 %gep2 = getelementptr %struct.ilc, %struct.ilc* %ptr, i32 0, i32 1, i64 %idx2
150 %gep3 = getelementptr %struct.ilc, %struct.ilc* %ptr, i32 0, i32 1, i64 %idx3
151 %gep4 = getelementptr %struct.ilc, %struct.ilc* %ptr, i32 0, i32 1, i64 %idx4
152 %idx1 = lshr i64 %idx, 2
153 %gep1 = getelementptr %struct.ilc, %struct.ilc* %ptr, i32 0, i32 1, i64 %idx1
154 %ld1 = load <4 x float>, <4 x float>* %gep1, align 4
155 %ld2 = load <4 x float>, <4 x float>* %gep2, align 4
156 %ld3 = load <4 x float>, <4 x float>* %gep3, align 4
157 %ld4 = load <4 x float>, <4 x float>* %gep4, align 4
158 %sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32>
159 %sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32>
160 %sv3 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32>
161 %sv4 = shufflevector <4 x float> %ld3, <4 x float> %ld4, <4 x i32>
162 %m0_3 = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32>
163 %m4_7 = shufflevector <4 x float> %sv1, <4 x float> %sv3, <4 x i32>
164 %m8_11 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32>
165 %m12_15 = shufflevector <4 x float> %sv2, <4 x float> %sv4, <4 x i32>
166
167 store <4 x float> %m0_3, <4 x float>* %gep1, align 16
168 store <4 x float> %m4_7, <4 x float>* %gep2, align 16
169 store <4 x float> %m8_11, <4 x float>* %gep3, align 16
170 store <4 x float> %m12_15, <4 x float>* %gep4, align 16
171 ret void
172 }
173
174 ; This should be lowered into LD2
175 define void @aarch64_ilc_idx_ld2(<4 x float>* %ptr, i64 %idx) {
176 entry:
177 ; CHECK-LABEL: @aarch64_ilc_idx_ld2(
178 ; CHECK-DAG: [[LSHR:%.+]] = lshr i64 %idx, 2
179 ; CHECK-DAG: [[GEP:%.+]] = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 [[LSHR]]
180 ; CHECK-DAG: [[CAST:%.+]] = bitcast <4 x float>* [[GEP]] to <8 x float>*
181 ; CHECK-DAG: [[LOAD:%.+]] = load <8 x float>, <8 x float>* [[CAST]], align 16
182 ; CHECK: %{{.* }}= shufflevector <8 x float> [[LOAD]], <8 x float> undef, <4 x i32>
183 ; CHECK: %{{.* }}= shufflevector <8 x float> [[LOAD]], <8 x float> undef, <4 x i32>
184 ; CHECK-DAG: ret void
185
186 ; AS-LABEL: aarch64_ilc_idx_ld2
187 ; AS: ld2
188 ; AS: ret
189
190 %idx1 = lshr i64 %idx, 2
191 %a1 = add i64 %idx, 4
192 %idx2 = lshr i64 %a1, 2
193
194 %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 %idx1
195 %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 %idx2
196 %ld1 = load <4 x float>, <4 x float>* %gep1, align 16
197 %ld2 = load <4 x float>, <4 x float>* %gep2, align 16
198 %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32>
199 %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32>
200
201 store <4 x float> %m0_3, <4 x float>* %gep1
202 store <4 x float> %m4_7, <4 x float>* %gep2
203 ret void
204 }
205
206 ; This should be lowered into LD3
207 define void @aarch64_ilc_idx_ld3(<4 x float>* %ptr, i64 %idx) {
208 entry:
209 ; CHECK-LABEL: @aarch64_ilc_idx_ld3(
210 ; CHECK-DAG: [[LSHR:%.+]] = lshr i64 %idx, 2
211 ; CHECK-DAG: [[GEP:%.+]] = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 [[LSHR]]
212 ; CHECK-DAG: [[CAST:%.+]] = bitcast <4 x float>* [[GEP]] to <12 x float>*
213 ; CHECK-DAG: [[LOAD:%.+]] = load <12 x float>, <12 x float>* [[CAST]], align 16
214 ; CHECK: %{{.* }}= shufflevector <12 x float> [[LOAD]], <12 x float> undef, <4 x i32>
215 ; CHECK: %{{.* }}= shufflevector <12 x float> [[LOAD]], <12 x float> undef, <4 x i32>
216 ; CHECK: %{{.* }}= shufflevector <12 x float> [[LOAD]], <12 x float> undef, <4 x i32>
217 ; CHECK-DAG: ret void
218
219 ; AS-LABEL: aarch64_ilc_idx_ld3
220 ; AS: ld3
221 ; AS: ret
222
223 %idx1 = lshr i64 %idx, 2
224 %a1 = add i64 %idx, 4
225 %idx2 = lshr i64 %a1, 2
226 %a2 = add i64 %idx, 8
227 %idx3 = lshr i64 %a2, 2
228
229 %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 %idx1
230 %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 %idx2
231 %gep3 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i64 %idx3
232 %ld1 = load <4 x float>, <4 x float>* %gep1, align 16
233 %ld2 = load <4 x float>, <4 x float>* %gep2, align 16
234 %ld3 = load <4 x float>, <4 x float>* %gep3, align 16
235
236 %sv1 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32>
237 %sv2 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32>
238 %sv3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32>
239 %m0_3 = shufflevector <4 x float> %sv1, <4 x float> %ld3, <4 x i32>
240 %m4_7 = shufflevector <4 x float> %sv2, <4 x float> %ld3, <4 x i32>
241 %m8_11 = shufflevector <4 x float> %sv3, <4 x float> %ld3, <4 x i32>
242
243 store <4 x float> %m0_3, <4 x float>* %gep1, align 16
244 store <4 x float> %m4_7, <4 x float>* %gep2, align 16
245 store <4 x float> %m8_11, <4 x float>* %gep3, align 16
246 ret void
247 }
249
250 ; This must not be lowered
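; This variant differs from @aarch64_ilc_idx_ld2 only in using i32 indices.
; Presumably the pass cannot prove that the sign-extended 32-bit offsets still
; describe two adjacent <4 x float> slots at the 64-bit pointer width, so the
; CHECK-NEXT lines require the IR to stay exactly as written.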
251 define void @aarch64_ilc_i32_idx(<4 x float>* %ptr, i32 %idx) {
252 ; CHECK-LABEL: @aarch64_ilc_i32_idx(
253 ; CHECK: %idx1 = lshr i32 %idx, 2
254 ; CHECK-NEXT: %a1 = add i32 %idx, 4
255 ; CHECK-NEXT: %idx2 = lshr i32 %a1, 2
256 ; CHECK-NEXT: %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 %idx1
257 ; CHECK-NEXT: %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 %idx2
258 ; CHECK-NEXT: %ld1 = load <4 x float>, <4 x float>* %gep1, align 16
259 ; CHECK-NEXT: %ld2 = load <4 x float>, <4 x float>* %gep2, align 16
260 ; CHECK-NEXT: %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32>
261 ; CHECK-NEXT: %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32>
262 ; CHECK-NEXT: store <4 x float> %m0_3, <4 x float>* %gep1, align 16
263 ; CHECK-NEXT: store <4 x float> %m4_7, <4 x float>* %gep2, align 16
264 ; CHECK-NEXT: ret void
265
266 ; AS-LABEL: aarch64_ilc_i32_idx
267 ; AS-DAG: @function
268 ; AS-NOT: ld2
269 ; AS-NOT: ld3
270 ; AS-NOT: ld4
271 ; AS-DAG: ret
272
273 entry:
274 %idx1 = lshr i32 %idx, 2
275 %a1 = add i32 %idx, 4
276 %idx2 = lshr i32 %a1, 2
277
278 %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 %idx1
279 %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 %idx2
280 %ld1 = load <4 x float>, <4 x float>* %gep1, align 16
281 %ld2 = load <4 x float>, <4 x float>* %gep2, align 16
282 %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32>
283 %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32>
284
285 store <4 x float> %m0_3, <4 x float>* %gep1, align 16
286 store <4 x float> %m4_7, <4 x float>* %gep2, align 16
287 ret void
288 }
289
290 ; Volatile loads must not be lowered
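; %ld1 is volatile, and a volatile access may not be widened or folded into
; another load, so the pass must keep both narrow loads; the CHECK-NEXT lines
; require the function to remain unchanged.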
291 define void @aarch64_ilc_volatile(<4 x float>* %ptr) {
292 ; CHECK-LABEL: @aarch64_ilc_volatile(
293 ; CHECK: %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 0
294 ; CHECK-NEXT: %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 1
295 ; CHECK-NEXT: %ld1 = load volatile <4 x float>, <4 x float>* %gep1, align 16
296 ; CHECK-NEXT: %ld2 = load <4 x float>, <4 x float>* %gep2, align 16
297 ; CHECK-NEXT: %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32>
298 ; CHECK-NEXT: %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32>
299 ; CHECK-NEXT: store <4 x float> %m0_3, <4 x float>* %gep1, align 16
300 ; CHECK-NEXT: store <4 x float> %m4_7, <4 x float>* %gep2, align 16
301 ; CHECK-NEXT: ret void
302
303 ; AS-LABEL: aarch64_ilc_volatile
304 ; AS-DAG: @function
305 ; AS-NOT: ld2
306 ; AS-NOT: ld3
307 ; AS-NOT: ld4
308 ; AS-DAG: ret
309
310 entry:
311 %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 0
312 %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 1
313 %ld1 = load volatile <4 x float>, <4 x float>* %gep1, align 16
314 %ld2 = load <4 x float>, <4 x float>* %gep2, align 16
315 %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32>
316 %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32>
317 store <4 x float> %m0_3, <4 x float>* %gep1, align 16
318 store <4 x float> %m4_7, <4 x float>* %gep2, align 16
319 ret void
320 }
321
322 ; This must not be lowered
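; The store to %gep2 between the two loads writes into the byte range that a
; combined <8 x float> load would cover, so the two narrow loads cannot be
; treated as a single wide access; the pass is expected to leave the IR alone.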
323 define void @aarch64_ilc_depmem(<4 x float>* %ptr, i32 %idx) {
324 entry:
325 ; CHECK-LABEL: @aarch64_ilc_depmem(
326 ; CHECK: %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 0
327 ; CHECK-NEXT: %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 1
328 ; CHECK-NEXT: %ld1 = load <4 x float>, <4 x float>* %gep1, align 16
329 ; CHECK-NEXT: store <4 x float> %ld1, <4 x float>* %gep2, align 16
330 ; CHECK-NEXT: %ld2 = load <4 x float>, <4 x float>* %gep2, align 16
331 ; CHECK-NEXT: %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32>
332 ; CHECK-NEXT: %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32>
333 ; CHECK-NEXT: store <4 x float> %m0_3, <4 x float>* %gep1, align 16
334 ; CHECK-NEXT: store <4 x float> %m4_7, <4 x float>* %gep2, align 16
335 ; CHECK-NEXT: ret void
336
337 ; AS-LABEL: aarch64_ilc_depmem
338 ; AS-DAG: @function
339 ; AS-NOT: ld2
340 ; AS-NOT: ld3
341 ; AS-NOT: ld4
342 ; AS-DAG: ret
343
344 %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 0
345 %gep2 = getelementptr inbounds <4 x float>, <4 x float>* %ptr, i32 1
346 %ld1 = load <4 x float>, <4 x float>* %gep1, align 16
347 store <4 x float> %ld1, <4 x float>* %gep2, align 16
348 %ld2 = load <4 x float>, <4 x float>* %gep2, align 16
349 %m0_3 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32>
350 %m4_7 = shufflevector <4 x float> %ld1, <4 x float> %ld2, <4 x i32>
351
352 store <4 x float> %m0_3, <4 x float>* %gep1, align 16
353 store <4 x float> %m4_7, <4 x float>* %gep2, align 16
354 ret void
355 }
356
357 ; This cannot be converted - insertion position cannot be determined
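; The two <5 x float> loads start only four floats apart and overlap in one
; element, so they do not line up as non-overlapping lanes of one wide load;
; presumably no valid position for a combined load can be chosen, and the
; CHECK-NEXT lines require the code to stay unchanged.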
358 define void @aarch64_no_insertion_pos(float* %ptr) {
359 entry:
360 ; CHECK-LABEL: @aarch64_no_insertion_pos(
361 ; CHECK: %p0 = getelementptr inbounds float, float* %ptr, i32 0
362 ; CHECK-NEXT: %p1 = getelementptr inbounds float, float* %ptr, i32 4
363 ; CHECK-NEXT: %b0 = bitcast float* %p0 to <5 x float>*
364 ; CHECK-NEXT: %b1 = bitcast float* %p1 to <5 x float>*
365 ; CHECK-NEXT: %l0 = load <5 x float>, <5 x float>* %b0
366 ; CHECK-NEXT: %l1 = load <5 x float>, <5 x float>* %b1
367 ; CHECK-NEXT: %s0 = shufflevector <5 x float> %l0, <5 x float> %l1, <4 x i32>
368 ; CHECK-NEXT: %s1 = shufflevector <5 x float> %l0, <5 x float> %l1, <4 x i32>
369 ; CHECK-NEXT: ret void
370
371 %p0 = getelementptr inbounds float, float* %ptr, i32 0
372 %p1 = getelementptr inbounds float, float* %ptr, i32 4
373 %b0 = bitcast float* %p0 to <5 x float>*
374 %b1 = bitcast float* %p1 to <5 x float>*
375 %l0 = load <5 x float>, <5 x float>* %b0
376 %l1 = load <5 x float>, <5 x float>* %b1
377 %s0 = shufflevector <5 x float> %l0, <5 x float> %l1, <4 x i32>
378 %s1 = shufflevector <5 x float> %l0, <5 x float> %l1, <4 x i32>
379 ret void
380 }
381
382 ; This cannot be converted - the insertion position does not dominate all
383 ; uses
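; %l1 is loaded and consumed by %s1 before %l0 is loaded; a combined load
; placed at a point that respects both original loads would come after %l0 and
; therefore not dominate %s1, so the pass presumably has to give up here and
; the CHECK-NEXT lines require the IR to stay as written.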
384 define void @aarch64_insertpos_does_not_dominate(float* %ptr) {
385 entry:
386 ; CHECK-LABEL: @aarch64_insertpos_does_not_dominate(
387 ; CHECK: %p0 = getelementptr inbounds float, float* %ptr, i32 0
388 ; CHECK-NEXT: %p1 = getelementptr inbounds float, float* %ptr, i32 1
389 ; CHECK-NEXT: %b0 = bitcast float* %p0 to <7 x float>*
390 ; CHECK-NEXT: %b1 = bitcast float* %p1 to <7 x float>*
391 ; CHECK-NEXT: %l1 = load <7 x float>, <7 x float>* %b1
392 ; CHECK-NEXT: %s1 = shufflevector <7 x float> %l1, <7 x float> undef, <4 x i32>
393 ; CHECK-NEXT: %l0 = load <7 x float>, <7 x float>* %b0
394 ; CHECK-NEXT: %s0 = shufflevector <7 x float> %l0, <7 x float> undef, <4 x i32>
395 ; CHECK-NEXT: ret void
396 %p0 = getelementptr inbounds float, float* %ptr, i32 0
397 %p1 = getelementptr inbounds float, float* %ptr, i32 1
398 %b0 = bitcast float* %p0 to <7 x float>*
399 %b1 = bitcast float* %p1 to <7 x float>*
400 %l1 = load <7 x float>, <7 x float>* %b1
401 %s1 = shufflevector <7 x float> %l1, <7 x float> undef, <4 x i32>
402 %l0 = load <7 x float>, <7 x float>* %b0
403 %s0 = shufflevector <7 x float> %l0, <7 x float> undef, <4 x i32>
404 ret void
405 }
462462 initializePreISelIntrinsicLoweringLegacyPassPass(Registry);
463463 initializeGlobalMergePass(Registry);
464464 initializeIndirectBrExpandPassPass(Registry);
465 initializeInterleavedLoadCombinePass(Registry);
465466 initializeInterleavedAccessPass(Registry);
466467 initializeEntryExitInstrumenterPass(Registry);
467468 initializePostInlineEntryExitInstrumenterPass(Registry);