llvm.org GIT mirror llvm / fadee63
Revert "ARM: Enable MachineScheduler and disable PostRAScheduler for swift." This reverts commit r242500. It broke some internal tests and Matthias asked me to revert it while he is investigating. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@242553 91177308-0d34-0410-b5e6-96231b3b80d8 Adam Nemet 4 years ago
10 changed file(s) with 1069 addition(s) and 48 deletion(s). Raw diff Collapse all Expand all
205205 /// scheduling class (itinerary class or SchedRW list).
206206 bool isComplete() const { return CompleteModel; }
207207
208 /// Return true if machine supports out of order execution.
209 bool isOutOfOrder() const { return MicroOpBufferSize > 1; }
210
211208 unsigned getNumProcResourceKinds() const {
212209 return NumProcResourceKinds;
213210 }
3636 // FIXME: Add preload instruction when it is documented.
3737 // FIXME: Model non-pipelined nature of FP div / sqrt unit.
3838
39 def SwiftItineraries : ProcessorItineraries<
40 [SW_DIS0, SW_DIS1, SW_DIS2, SW_ALU0, SW_ALU1, SW_LS, SW_IDIV, SW_FDIV], [], [
41 //
42 // Move instructions, unconditional
43 InstrItinData,
44 InstrStage<1, [SW_ALU0, SW_ALU1]>],
45 [1]>,
46 InstrItinData,
47 InstrStage<1, [SW_ALU0, SW_ALU1]>],
48 [1]>,
49 InstrItinData,
50 InstrStage<1, [SW_ALU0, SW_ALU1]>],
51 [1]>,
52 InstrItinData,
53 InstrStage<1, [SW_ALU0, SW_ALU1]>],
54 [1]>,
55 InstrItinData,
56 InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
57 InstrStage<1, [SW_ALU0, SW_ALU1]>,
58 InstrStage<1, [SW_ALU0, SW_ALU1]>],
59 [2]>,
60 InstrItinData,
61 InstrStage<1, [SW_ALU0, SW_ALU1]>,
62 InstrStage<1, [SW_ALU0, SW_ALU1]>,
63 InstrStage<1, [SW_ALU0, SW_ALU1]>],
64 [3]>,
65 InstrItinData,
66 InstrStage<1, [SW_ALU0, SW_ALU1]>,
67 InstrStage<1, [SW_ALU0, SW_ALU1]>,
68 InstrStage<1, [SW_LS]>],
69 [5]>,
70 //
71 // MVN instructions
72 InstrItinData,
73 InstrStage<1, [SW_ALU0, SW_ALU1]>],
74 [1]>,
75 InstrItinData,
76 InstrStage<1, [SW_ALU0, SW_ALU1]>],
77 [1]>,
78 InstrItinData,
79 InstrStage<1, [SW_ALU0, SW_ALU1]>],
80 [1]>,
81 InstrItinData,
82 InstrStage<1, [SW_ALU0, SW_ALU1]>],
83 [1]>,
84 //
85 // No operand cycles
86 InstrItinData,
87 InstrStage<1, [SW_ALU0, SW_ALU1]>]>,
88 //
89 // Binary Instructions that produce a result
90 InstrItinData,
91 InstrStage<1, [SW_ALU0, SW_ALU1]>],
92 [1, 1]>,
93 InstrItinData,
94 InstrStage<1, [SW_ALU0, SW_ALU1]>],
95 [1, 1, 1]>,
96 InstrItinData,
97 InstrStage<1, [SW_ALU0, SW_ALU1]>],
98 [2, 1, 1]>,
99 InstrItinData,
100 InstrStage<1, [SW_ALU0, SW_ALU1]>],
101 [2, 1, 1]>,
102 InstrItinData,
103 InstrStage<1, [SW_ALU0, SW_ALU1]>],
104 [2, 1, 1, 1]>,
105 //
106 // Bitwise Instructions that produce a result
107 InstrItinData,
108 InstrStage<1, [SW_ALU0, SW_ALU1]>],
109 [1, 1]>,
110 InstrItinData,
111 InstrStage<1, [SW_ALU0, SW_ALU1]>],
112 [1, 1, 1]>,
113 InstrItinData,
114 InstrStage<1, [SW_ALU0, SW_ALU1]>],
115 [2, 1, 1]>,
116 InstrItinData,
117 InstrStage<1, [SW_ALU0, SW_ALU1]>],
118 [2, 1, 1, 1]>,
119 //
120 // Unary Instructions that produce a result
121
122 // CLZ, RBIT, etc.
123 InstrItinData,
124 InstrStage<1, [SW_ALU0, SW_ALU1]>],
125 [1, 1]>,
126
127 // BFC, BFI, UBFX, SBFX
128 InstrItinData,
129 InstrStage<1, [SW_ALU0, SW_ALU1]>],
130 [2, 1]>,
131
132 //
133 // Zero and sign extension instructions
134 InstrItinData,
135 InstrStage<1, [SW_ALU0, SW_ALU1]>],
136 [1, 1]>,
137 InstrItinData,
138 InstrStage<1, [SW_ALU0, SW_ALU1]>],
139 [1, 1, 1]>,
140 InstrItinData,
141 InstrStage<1, [SW_ALU0, SW_ALU1]>],
142 [1, 1, 1, 1]>,
143 //
144 // Compare instructions
145 InstrItinData,
146 InstrStage<1, [SW_ALU0, SW_ALU1]>],
147 [1]>,
148 InstrItinData,
149 InstrStage<1, [SW_ALU0, SW_ALU1]>],
150 [1, 1]>,
151 InstrItinData,
152 InstrStage<2, [SW_ALU0, SW_ALU1]>],
153 [1, 1]>,
154 InstrItinData,
155 InstrStage<2, [SW_ALU0, SW_ALU1]>],
156 [1, 1, 1]>,
157 //
158 // Test instructions
159 InstrItinData,
160 InstrStage<1, [SW_ALU0, SW_ALU1]>],
161 [1]>,
162 InstrItinData,
163 InstrStage<1, [SW_ALU0, SW_ALU1]>],
164 [1, 1]>,
165 InstrItinData,
166 InstrStage<2, [SW_ALU0, SW_ALU1]>],
167 [1, 1]>,
168 InstrItinData,
169 InstrStage<2, [SW_ALU0, SW_ALU1]>],
170 [1, 1, 1]>,
171 //
172 // Move instructions, conditional
173 // FIXME: Correctly model the extra input dep on the destination.
174 InstrItinData,
175 InstrStage<1, [SW_ALU0, SW_ALU1]>],
176 [1]>,
177 InstrItinData,
178 InstrStage<1, [SW_ALU0, SW_ALU1]>],
179 [1, 1]>,
180 InstrItinData,
181 InstrStage<1, [SW_ALU0, SW_ALU1]>],
182 [1, 1]>,
183 InstrItinData,
184 InstrStage<1, [SW_ALU0, SW_ALU1]>],
185 [2, 1, 1]>,
186 InstrItinData,
187 InstrStage<1, [SW_DIS0, SW_DIS1, SW_DIS2], 0>,
188 InstrStage<1, [SW_ALU0, SW_ALU1]>,
189 InstrStage<1, [SW_ALU0, SW_ALU1]>],
190 [2]>,
191
192 // Integer multiply pipeline
193 //
194 InstrItinData,
195 InstrStage<1, [SW_ALU0]>],
196 [3, 1, 1]>,
197 InstrItinData,
198 InstrStage<1, [SW_ALU0]>],
199 [3, 1, 1, 1]>,
200 InstrItinData,
201 InstrStage<1, [SW_ALU0]>],
202 [4, 1, 1]>,
203 InstrItinData,
204 InstrStage<1, [SW_ALU0]>],
205 [4, 1, 1, 1]>,
206 InstrItinData,
207 InstrStage<1, [SW_DIS1], 0>,
208 InstrStage<1, [SW_DIS2], 0>,
209 InstrStage<1, [SW_ALU0], 1>,
210 InstrStage<1, [SW_ALU0], 3>,
211 InstrStage<1, [SW_ALU0]>],
212 [5, 5, 1, 1]>,
213 InstrItinData,
214 InstrStage<1, [SW_DIS1], 0>,
215 InstrStage<1, [SW_DIS2], 0>,
216 InstrStage<1, [SW_ALU0], 1>,
217 InstrStage<1, [SW_ALU0], 1>,
218 InstrStage<1, [SW_ALU0, SW_ALU1], 3>,
219 InstrStage<1, [SW_ALU0, SW_ALU1]>],
220 [5, 6, 1, 1]>,
221 //
222 // Integer divide
223 InstrItinData,
224 InstrStage<1, [SW_ALU0], 0>,
225 InstrStage<14, [SW_IDIV]>],
226 [14, 1, 1]>,
227
228 // Integer load pipeline
229 // FIXME: The timings are some rough approximations
230 //
231 // Immediate offset
232 InstrItinData,
233 InstrStage<1, [SW_LS]>],
234 [3, 1]>,
235 InstrItinData,
236 InstrStage<1, [SW_LS]>],
237 [3, 1]>,
238 InstrItinData,
239 InstrStage<1, [SW_DIS1], 0>,
240 InstrStage<1, [SW_LS], 1>,
241 InstrStage<1, [SW_LS]>],
242 [3, 4, 1]>,
243 //
244 // Register offset
245 InstrItinData,
246 InstrStage<1, [SW_LS]>],
247 [3, 1, 1]>,
248 InstrItinData,
249 InstrStage<1, [SW_LS]>],
250 [3, 1, 1]>,
251 InstrItinData,
252 InstrStage<1, [SW_DIS1], 0>,
253 InstrStage<1, [SW_DIS2], 0>,
254 InstrStage<1, [SW_LS], 1>,
255 InstrStage<1, [SW_LS], 3>,
256 InstrStage<1, [SW_ALU0, SW_ALU1]>],
257 [3, 4, 1, 1]>,
258 //
259 // Scaled register offset
260 InstrItinData,
261 InstrStage<1, [SW_DIS1], 0>,
262 InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
263 InstrStage<1, [SW_LS]>],
264 [5, 1, 1]>,
265 InstrItinData,
266 InstrStage<1, [SW_DIS1], 0>,
267 InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
268 InstrStage<1, [SW_LS]>],
269 [5, 1, 1]>,
270 //
271 // Immediate offset with update
272 InstrItinData,
273 InstrStage<1, [SW_DIS1], 0>,
274 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
275 InstrStage<1, [SW_LS]>],
276 [3, 1, 1]>,
277 InstrItinData,
278 InstrStage<1, [SW_DIS1], 0>,
279 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
280 InstrStage<1, [SW_LS]>],
281 [3, 1, 1]>,
282 //
283 // Register offset with update
284 InstrItinData,
285 InstrStage<1, [SW_DIS1], 0>,
286 InstrStage<1, [SW_ALU0], 1>,
287 InstrStage<1, [SW_LS]>],
288 [3, 1, 1, 1]>,
289 InstrItinData,
290 InstrStage<1, [SW_DIS1], 0>,
291 InstrStage<1, [SW_ALU0], 1>,
292 InstrStage<1, [SW_LS]>],
293 [3, 1, 1, 1]>,
294 InstrItinData,
295 InstrStage<1, [SW_DIS1], 0>,
296 InstrStage<1, [SW_DIS2], 0>,
297 InstrStage<1, [SW_ALU0, SW_ALU1], 0>,
298 InstrStage<1, [SW_LS], 3>,
299 InstrStage<1, [SW_LS], 0>,
300 InstrStage<1, [SW_ALU0, SW_ALU1]>],
301 [3, 4, 1, 1]>,
302 //
303 // Scaled register offset with update
304 InstrItinData,
305 InstrStage<1, [SW_DIS1], 0>,
306 InstrStage<1, [SW_DIS2], 0>,
307 InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
308 InstrStage<1, [SW_LS], 3>,
309 InstrStage<1, [SW_ALU0, SW_ALU1]>],
310 [5, 3, 1, 1]>,
311 InstrItinData,
312 InstrStage<1, [SW_DIS1], 0>,
313 InstrStage<1, [SW_DIS2], 0>,
314 InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
315 InstrStage<1, [SW_LS], 0>,
316 InstrStage<1, [SW_ALU0, SW_ALU1]>],
317 [5, 3, 1, 1]>,
318 //
319 // Load multiple, def is the 5th operand.
320 // FIXME: This assumes 3 to 4 registers.
321 InstrItinData,
322 InstrStage<1, [SW_DIS1], 0>,
323 InstrStage<1, [SW_DIS2], 0>,
324 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
325 InstrStage<1, [SW_LS]>],
326 [1, 1, 1, 1, 3], [], -1>, // dynamic uops
327
328 //
329 // Load multiple + update, defs are the 1st and 5th operands.
330 InstrItinData,
331 InstrStage<1, [SW_DIS1], 0>,
332 InstrStage<1, [SW_DIS2], 0>,
333 InstrStage<1, [SW_ALU0, SW_ALU1], 0>,
334 InstrStage<1, [SW_LS], 3>,
335 InstrStage<1, [SW_ALU0, SW_ALU1]>],
336 [2, 1, 1, 1, 3], [], -1>, // dynamic uops
337 //
338 // Load multiple plus branch
339 InstrItinData,
340 InstrStage<1, [SW_DIS1], 0>,
341 InstrStage<1, [SW_DIS2], 0>,
342 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
343 InstrStage<1, [SW_LS]>],
344 [1, 1, 1, 1, 3], [], -1>, // dynamic uops
345 //
346 // Pop, def is the 3rd operand.
347 InstrItinData,
348 InstrStage<1, [SW_DIS1], 0>,
349 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
350 InstrStage<1, [SW_LS]>],
351 [1, 1, 3], [], -1>, // dynamic uops
352 //
353 // Pop + branch, def is the 3rd operand.
354 InstrItinData,
355 InstrStage<1, [SW_DIS1], 0>,
356 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
357 InstrStage<1, [SW_LS]>],
358 [1, 1, 3], [], -1>, // dynamic uops
359
360 //
361 // iLoadi + iALUr for t2LDRpci_pic.
362 InstrItinData,
363 InstrStage<1, [SW_LS], 3>,
364 InstrStage<1, [SW_ALU0, SW_ALU1]>],
365 [4, 1]>,
366
367 // Integer store pipeline
368 ///
369 // Immediate offset
370 InstrItinData,
371 InstrStage<1, [SW_LS]>],
372 [1, 1]>,
373 InstrItinData,
374 InstrStage<1, [SW_LS]>],
375 [1, 1]>,
376 InstrItinData,
377 InstrStage<1, [SW_DIS1], 0>,
378 InstrStage<1, [SW_DIS2], 0>,
379 InstrStage<1, [SW_LS], 0>,
380 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
381 InstrStage<1, [SW_LS]>],
382 [1, 1]>,
383 //
384 // Register offset
385 InstrItinData,
386 InstrStage<1, [SW_LS]>],
387 [1, 1, 1]>,
388 InstrItinData,
389 InstrStage<1, [SW_LS]>],
390 [1, 1, 1]>,
391 InstrItinData,
392 InstrStage<1, [SW_DIS1], 0>,
393 InstrStage<1, [SW_DIS2], 0>,
394 InstrStage<1, [SW_LS], 0>,
395 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
396 InstrStage<1, [SW_LS]>],
397 [1, 1, 1]>,
398 //
399 // Scaled register offset
400 InstrItinData,
401 InstrStage<1, [SW_DIS1], 0>,
402 InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
403 InstrStage<1, [SW_LS]>],
404 [1, 1, 1]>,
405 InstrItinData,
406 InstrStage<1, [SW_DIS1], 0>,
407 InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
408 InstrStage<1, [SW_LS]>],
409 [1, 1, 1]>,
410 //
411 // Immediate offset with update
412 InstrItinData,
413 InstrStage<1, [SW_DIS1], 0>,
414 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
415 InstrStage<1, [SW_LS]>],
416 [1, 1, 1]>,
417 InstrItinData,
418 InstrStage<1, [SW_DIS1], 0>,
419 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
420 InstrStage<1, [SW_LS]>],
421 [1, 1, 1]>,
422 //
423 // Register offset with update
424 InstrItinData,
425 InstrStage<1, [SW_DIS1], 0>,
426 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
427 InstrStage<1, [SW_LS]>],
428 [1, 1, 1, 1]>,
429 InstrItinData,
430 InstrStage<1, [SW_DIS1], 0>,
431 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
432 InstrStage<1, [SW_LS]>],
433 [1, 1, 1, 1]>,
434 InstrItinData,
435 InstrStage<1, [SW_DIS1], 0>,
436 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
437 InstrStage<1, [SW_LS]>],
438 [1, 1, 1, 1]>,
439 //
440 // Scaled register offset with update
441 InstrItinData,
442 InstrStage<1, [SW_DIS1], 0>,
443 InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
444 InstrStage<1, [SW_LS], 0>,
445 InstrStage<1, [SW_ALU0, SW_ALU1], 1>],
446 [3, 1, 1, 1]>,
447 InstrItinData,
448 InstrStage<1, [SW_DIS1], 0>,
449 InstrStage<1, [SW_ALU0, SW_ALU1], 2>,
450 InstrStage<1, [SW_LS], 0>,
451 InstrStage<1, [SW_ALU0, SW_ALU1], 1>],
452 [3, 1, 1, 1]>,
453 //
454 // Store multiple
455 InstrItinData,
456 InstrStage<1, [SW_DIS1], 0>,
457 InstrStage<1, [SW_DIS2], 0>,
458 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
459 InstrStage<1, [SW_LS], 1>,
460 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
461 InstrStage<1, [SW_LS], 1>,
462 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
463 InstrStage<1, [SW_LS]>],
464 [], [], -1>, // dynamic uops
465 //
466 // Store multiple + update
467 InstrItinData,
468 InstrStage<1, [SW_DIS1], 0>,
469 InstrStage<1, [SW_DIS2], 0>,
470 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
471 InstrStage<1, [SW_LS], 1>,
472 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
473 InstrStage<1, [SW_LS], 1>,
474 InstrStage<1, [SW_ALU0, SW_ALU1], 1>,
475 InstrStage<1, [SW_LS]>],
476 [2], [], -1>, // dynamic uops
477
478 //
479 // Preload
480 InstrItinData], [1, 1]>,
481
482 // Branch
483 //
484 // no delay slots, so the latency of a branch is unimportant
485 InstrItinData]>,
486
487 // FP Special Register to Integer Register File Move
488 InstrItinData,
489 InstrStage<1, [SW_ALU0, SW_ALU1]>],
490 [1]>,
491 //
492 // Single-precision FP Unary
493 //
494 // Most floating-point moves get issued on ALU0.
495 InstrItinData,
496 InstrStage<1, [SW_ALU0]>],
497 [2, 1]>,
498 //
499 // Double-precision FP Unary
500 InstrItinData,
501 InstrStage<1, [SW_ALU0]>],
502 [2, 1]>,
503
504 //
505 // Single-precision FP Compare
506 InstrItinData,
507 InstrStage<1, [SW_ALU0]>],
508 [1, 1]>,
509 //
510 // Double-precision FP Compare
511 InstrItinData,
512 InstrStage<1, [SW_ALU0]>],
513 [1, 1]>,
514 //
515 // Single to Double FP Convert
516 InstrItinData,
517 InstrStage<1, [SW_ALU1]>],
518 [4, 1]>,
519 //
520 // Double to Single FP Convert
521 InstrItinData,
522 InstrStage<1, [SW_ALU1]>],
523 [4, 1]>,
524
525 //
526 // Single to Half FP Convert
527 InstrItinData,
528 InstrStage<1, [SW_DIS1], 0>,
529 InstrStage<1, [SW_ALU1], 4>,
530 InstrStage<1, [SW_ALU1]>],
531 [6, 1]>,
532 //
533 // Half to Single FP Convert
534 InstrItinData,
535 InstrStage<1, [SW_ALU1]>],
536 [4, 1]>,
537
538 //
539 // Single-Precision FP to Integer Convert
540 InstrItinData,
541 InstrStage<1, [SW_ALU1]>],
542 [4, 1]>,
543 //
544 // Double-Precision FP to Integer Convert
545 InstrItinData,
546 InstrStage<1, [SW_ALU1]>],
547 [4, 1]>,
548 //
549 // Integer to Single-Precision FP Convert
550 InstrItinData,
551 InstrStage<1, [SW_ALU1]>],
552 [4, 1]>,
553 //
554 // Integer to Double-Precision FP Convert
555 InstrItinData,
556 InstrStage<1, [SW_ALU1]>],
557 [4, 1]>,
558 //
559 // Single-precision FP ALU
560 InstrItinData,
561 InstrStage<1, [SW_ALU0]>],
562 [2, 1, 1]>,
563 //
564 // Double-precision FP ALU
565 InstrItinData,
566 InstrStage<1, [SW_ALU0]>],
567 [2, 1, 1]>,
568 //
569 // Single-precision FP Multiply
570 InstrItinData,
571 InstrStage<1, [SW_ALU1]>],
572 [4, 1, 1]>,
573 //
574 // Double-precision FP Multiply
575 InstrItinData,
576 InstrStage<1, [SW_ALU1]>],
577 [6, 1, 1]>,
578 //
579 // Single-precision FP MAC
580 InstrItinData,
581 InstrStage<1, [SW_ALU1]>],
582 [8, 1, 1]>,
583 //
584 // Double-precision FP MAC
585 InstrItinData,
586 InstrStage<1, [SW_ALU1]>],
587 [12, 1, 1]>,
588 //
589 // Single-precision Fused FP MAC
590 InstrItinData,
591 InstrStage<1, [SW_ALU1]>],
592 [8, 1, 1]>,
593 //
594 // Double-precision Fused FP MAC
595 InstrItinData,
596 InstrStage<1, [SW_ALU1]>],
597 [12, 1, 1]>,
598 //
599 // Single-precision FP DIV
600 InstrItinData,
601 InstrStage<1, [SW_ALU1], 0>,
602 InstrStage<15, [SW_FDIV]>],
603 [17, 1, 1]>,
604 //
605 // Double-precision FP DIV
606 InstrItinData,
607 InstrStage<1, [SW_ALU1], 0>,
608 InstrStage<30, [SW_FDIV]>],
609 [32, 1, 1]>,
610 //
611 // Single-precision FP SQRT
612 InstrItinData,
613 InstrStage<1, [SW_ALU1], 0>,
614 InstrStage<15, [SW_FDIV]>],
615 [17, 1]>,
616 //
617 // Double-precision FP SQRT
618 InstrItinData,
619 InstrStage<1, [SW_ALU1], 0>,
620 InstrStage<30, [SW_FDIV]>],
621 [32, 1, 1]>,
622
623 //
624 // Integer to Single-precision Move
625 InstrItinData,
626 InstrStage<1, [SW_DIS1], 0>,
627 InstrStage<1, [SW_LS], 4>,
628 InstrStage<1, [SW_ALU0]>],
629 [6, 1]>,
630 //
631 // Integer to Double-precision Move
632 InstrItinData,
633 InstrStage<1, [SW_LS]>],
634 [4, 1]>,
635 //
636 // Single-precision to Integer Move
637 InstrItinData,
638 InstrStage<1, [SW_LS]>],
639 [3, 1]>,
640 //
641 // Double-precision to Integer Move
642 InstrItinData,
643 InstrStage<1, [SW_DIS1], 0>,
644 InstrStage<1, [SW_LS], 3>,
645 InstrStage<1, [SW_LS]>],
646 [3, 4, 1]>,
647 //
648 // Single-precision FP Load
649 InstrItinData,
650 InstrStage<1, [SW_LS]>],
651 [4, 1]>,
652 //
653 // Double-precision FP Load
654 InstrItinData,
655 InstrStage<1, [SW_LS]>],
656 [4, 1]>,
657 //
658 // FP Load Multiple
659 // FIXME: Assumes a single Q register.
660 InstrItinData,
661 InstrStage<1, [SW_LS]>],
662 [1, 1, 1, 4], [], -1>, // dynamic uops
663 //
664 // FP Load Multiple + update
665 // FIXME: Assumes a single Q register.
666 InstrItinData,
667 InstrStage<1, [SW_DIS1], 0>,
668 InstrStage<1, [SW_LS], 4>,
669 InstrStage<1, [SW_ALU0, SW_ALU1]>],
670 [2, 1, 1, 1, 4], [], -1>, // dynamic uops
671 //
672 // Single-precision FP Store
673 InstrItinData,
674 InstrStage<1, [SW_LS]>],
675 [1, 1]>,
676 //
677 // Double-precision FP Store
678 InstrItinData,
679 InstrStage<1, [SW_LS]>],
680 [1, 1]>,
681 //
682 // FP Store Multiple
683 // FIXME: Assumes a single Q register.
684 InstrItinData,
685 InstrStage<1, [SW_LS]>],
686 [1, 1, 1], [], -1>, // dynamic uops
687 //
688 // FP Store Multiple + update
689 // FIXME: Assumes a single Q register.
690 InstrItinData,
691 InstrStage<1, [SW_DIS1], 0>,
692 InstrStage<1, [SW_LS], 4>,
693 InstrStage<1, [SW_ALU0, SW_ALU1]>],
694 [2, 1, 1, 1], [], -1>, // dynamic uops
695 // NEON
696 //
697 // Double-register Integer Unary
698 InstrItinData,
699 InstrStage<1, [SW_ALU0]>],
700 [4, 1]>,
701 //
702 // Quad-register Integer Unary
703 InstrItinData,
704 InstrStage<1, [SW_ALU0]>],
705 [4, 1]>,
706 //
707 // Double-register Integer Q-Unary
708 InstrItinData,
709 InstrStage<1, [SW_ALU0]>],
710 [4, 1]>,
711 //
712 // Quad-register Integer Q-Unary
713 InstrItinData,
714 InstrStage<1, [SW_ALU0]>],
715 [4, 1]>,
716 //
717 // Double-register Integer Binary
718 InstrItinData,
719 InstrStage<1, [SW_ALU0]>],
720 [2, 1, 1]>,
721 //
722 // Quad-register Integer Binary
723 InstrItinData,
724 InstrStage<1, [SW_ALU0]>],
725 [2, 1, 1]>,
726 //
727 // Double-register Integer Subtract
728 InstrItinData,
729 InstrStage<1, [SW_ALU0]>],
730 [2, 1, 1]>,
731 //
732 // Quad-register Integer Subtract
733 InstrItinData,
734 InstrStage<1, [SW_ALU0]>],
735 [2, 1, 1]>,
736 //
737 // Double-register Integer Shift
738 InstrItinData,
739 InstrStage<1, [SW_ALU0]>],
740 [2, 1, 1]>,
741 //
742 // Quad-register Integer Shift
743 InstrItinData,
744 InstrStage<1, [SW_ALU0]>],
745 [2, 1, 1]>,
746 //
747 // Double-register Integer Shift (4 cycle)
748 InstrItinData,
749 InstrStage<1, [SW_ALU0]>],
750 [4, 1, 1]>,
751 //
752 // Quad-register Integer Shift (4 cycle)
753 InstrItinData,
754 InstrStage<1, [SW_ALU0]>],
755 [4, 1, 1]>,
756 //
757 // Double-register Integer Binary (4 cycle)
758 InstrItinData,
759 InstrStage<1, [SW_ALU0]>],
760 [4, 1, 1]>,
761 //
762 // Quad-register Integer Binary (4 cycle)
763 InstrItinData,
764 InstrStage<1, [SW_ALU0]>],
765 [4, 1, 1]>,
766 //
767 // Double-register Integer Subtract (4 cycle)
768 InstrItinData,
769 InstrStage<1, [SW_ALU0]>],
770 [4, 1, 1]>,
771 //
772 // Quad-register Integer Subtract (4 cycle)
773 InstrItinData,
774 InstrStage<1, [SW_ALU0]>],
775 [4, 1, 1]>,
776
777 //
778 // Double-register Integer Count
779 InstrItinData,
780 InstrStage<1, [SW_ALU0]>],
781 [2, 1, 1]>,
782 //
783 // Quad-register Integer Count
784 InstrItinData,
785 InstrStage<1, [SW_ALU0]>],
786 [2, 1, 1]>,
787 //
788 // Double-register Absolute Difference and Accumulate
789 InstrItinData,
790 InstrStage<1, [SW_ALU0]>],
791 [4, 1, 1, 1]>,
792 //
793 // Quad-register Absolute Difference and Accumulate
794 InstrItinData,
795 InstrStage<1, [SW_ALU0]>],
796 [4, 1, 1, 1]>,
797 //
798 // Double-register Integer Pair Add Long
799 InstrItinData,
800 InstrStage<1, [SW_ALU0]>],
801 [4, 1, 1]>,
802 //
803 // Quad-register Integer Pair Add Long
804 InstrItinData,
805 InstrStage<1, [SW_ALU0]>],
806 [4, 1, 1]>,
807
808 //
809 // Double-register Integer Multiply (.8, .16)
810 InstrItinData,
811 InstrStage<1, [SW_ALU1]>],
812 [4, 1, 1]>,
813 //
814 // Quad-register Integer Multiply (.8, .16)
815 InstrItinData,
816 InstrStage<1, [SW_ALU1]>],
817 [4, 1, 1]>,
818
819 //
820 // Double-register Integer Multiply (.32)
821 InstrItinData,
822 InstrStage<1, [SW_ALU1]>],
823 [4, 1, 1]>,
824 //
825 // Quad-register Integer Multiply (.32)
826 InstrItinData,
827 InstrStage<1, [SW_ALU1]>],
828 [4, 1, 1]>,
829 //
830 // Double-register Integer Multiply-Accumulate (.8, .16)
831 InstrItinData,
832 InstrStage<1, [SW_ALU1]>],
833 [4, 1, 1, 1]>,
834 //
835 // Double-register Integer Multiply-Accumulate (.32)
836 InstrItinData,
837 InstrStage<1, [SW_ALU1]>],
838 [4, 1, 1, 1]>,
839 //
840 // Quad-register Integer Multiply-Accumulate (.8, .16)
841 InstrItinData,
842 InstrStage<1, [SW_ALU1]>],
843 [4, 1, 1, 1]>,
844 //
845 // Quad-register Integer Multiply-Accumulate (.32)
846 InstrItinData,
847 InstrStage<1, [SW_ALU1]>],
848 [4, 1, 1, 1]>,
849
850 //
851 // Move
852 InstrItinData,
853 InstrStage<1, [SW_ALU0]>],
854 [2, 1]>,
855 //
856 // Move Immediate
857 InstrItinData,
858 InstrStage<1, [SW_ALU0]>],
859 [2]>,
860 //
861 // Double-register Permute Move
862 InstrItinData,
863 InstrStage<1, [SW_ALU1]>],
864 [2, 1]>,
865 //
866 // Quad-register Permute Move
867 InstrItinData,
868 InstrStage<1, [SW_ALU1]>],
869 [2, 1]>,
870 //
871 // Integer to Single-precision Move
872 InstrItinData,
873 InstrStage<1, [SW_DIS1], 0>,
874 InstrStage<1, [SW_LS], 4>,
875 InstrStage<1, [SW_ALU0]>],
876 [6, 1]>,
877 //
878 // Integer to Double-precision Move
879 InstrItinData,
880 InstrStage<1, [SW_LS]>],
881 [4, 1, 1]>,
882 //
883 // Single-precision to Integer Move
884 InstrItinData,
885 InstrStage<1, [SW_LS]>],
886 [3, 1]>,
887 //
888 // Double-precision to Integer Move
889 InstrItinData,
890 InstrStage<1, [SW_DIS1], 0>,
891 InstrStage<1, [SW_LS], 3>,
892 InstrStage<1, [SW_LS]>],
893 [3, 4, 1]>,
894 //
895 // Integer to Lane Move
896 // FIXME: I think this is correct, but it is not clear from the tuning guide.
897 InstrItinData,
898 InstrStage<1, [SW_DIS1], 0>,
899 InstrStage<1, [SW_LS], 4>,
900 InstrStage<1, [SW_ALU0]>],
901 [6, 1]>,
902
903 //
904 // Vector narrow move
905 InstrItinData,
906 InstrStage<1, [SW_ALU1]>],
907 [2, 1]>,
908 //
909 // Double-register FP Unary
910 // FIXME: VRECPE / VRSQRTE has a longer latency than VABS, which is used here,
911 // and they issue on a different pipeline.
912 InstrItinData,
913 InstrStage<1, [SW_ALU0]>],
914 [2, 1]>,
915 //
916 // Quad-register FP Unary
917 // FIXME: VRECPE / VRSQRTE has a longer latency than VABS, which is used here,
918 // and they issue on a different pipeline.
919 InstrItinData,
920 InstrStage<1, [SW_ALU0]>],
921 [2, 1]>,
922 //
923 // Double-register FP Binary
924 // FIXME: We're using this itin for many instructions.
925 InstrItinData,
926 InstrStage<1, [SW_ALU0]>],
927 [4, 1, 1]>,
928
929 //
930 // VPADD, etc.
931 InstrItinData,
932 InstrStage<1, [SW_ALU0]>],
933 [4, 1, 1]>,
934 //
935 // Double-register FP VMUL
936 InstrItinData,
937 InstrStage<1, [SW_ALU1]>],
938 [4, 1, 1]>,
939 //
940 // Quad-register FP Binary
941 InstrItinData,
942 InstrStage<1, [SW_ALU0]>],
943 [4, 1, 1]>,
944 //
945 // Quad-register FP VMUL
946 InstrItinData,
947 InstrStage<1, [SW_ALU1]>],
948 [4, 1, 1]>,
949 //
950 // Double-register FP Multiply-Accumulate
951 InstrItinData,
952 InstrStage<1, [SW_ALU1]>],
953 [8, 1, 1]>,
954 //
955 // Quad-register FP Multiply-Accumulate
956 InstrItinData,
957 InstrStage<1, [SW_ALU1]>],
958 [8, 1, 1]>,
959 //
960 // Double-register Fused FP Multiply-Accumulate
961 InstrItinData,
962 InstrStage<1, [SW_ALU1]>],
963 [8, 1, 1]>,
964 //
965 // Quad-register Fused FP Multiply-Accumulate
966 InstrItinData,
967 InstrStage<1, [SW_ALU1]>],
968 [8, 1, 1]>,
969 //
970 // Double-register Reciprocal Step
971 InstrItinData,
972 InstrStage<1, [SW_ALU1]>],
973 [8, 1, 1]>,
974 //
975 // Quad-register Reciprocal Step
976 InstrItinData,
977 InstrStage<1, [SW_ALU1]>],
978 [8, 1, 1]>,
979 //
980 // Double-register Permute
981 // FIXME: The latencies are unclear from the documentation.
982 InstrItinData,
983 InstrStage<1, [SW_DIS1], 0>,
984 InstrStage<1, [SW_DIS2], 0>,
985 InstrStage<1, [SW_ALU1], 2>,
986 InstrStage<1, [SW_ALU1], 2>,
987 InstrStage<1, [SW_ALU1]>],
988 [3, 4, 3, 4]>,
989 //
990 // Quad-register Permute
991 // FIXME: The latencies are unclear from the documentation.
992 InstrItinData,
993 InstrStage<1, [SW_DIS1], 0>,
994 InstrStage<1, [SW_DIS2], 0>,
995 InstrStage<1, [SW_ALU1], 2>,
996 InstrStage<1, [SW_ALU1], 2>,
997 InstrStage<1, [SW_ALU1]>],
998 [3, 4, 3, 4]>,
999 //
1000 // Quad-register Permute (3 cycle issue on A9)
1001 InstrItinData,
1002 InstrStage<1, [SW_DIS1], 0>,
1003 InstrStage<1, [SW_DIS2], 0>,
1004 InstrStage<1, [SW_ALU1], 2>,
1005 InstrStage<1, [SW_ALU1], 2>,
1006 InstrStage<1, [SW_ALU1]>],
1007 [3, 4, 3, 4]>,
1008
1009 //
1010 // Double-register VEXT
1011 InstrItinData,
1012 InstrStage<1, [SW_ALU1]>],
1013 [2, 1, 1]>,
1014 //
1015 // Quad-register VEXT
1016 InstrItinData,
1017 InstrStage<1, [SW_ALU1]>],
1018 [2, 1, 1]>,
1019 //
1020 // VTB
1021 InstrItinData,
1022 InstrStage<1, [SW_ALU1]>],
1023 [2, 1, 1]>,
1024 InstrItinData,
1025 InstrStage<1, [SW_DIS1], 0>,
1026 InstrStage<1, [SW_ALU1], 2>,
1027 InstrStage<1, [SW_ALU1]>],
1028 [4, 1, 3, 3]>,
1029 InstrItinData,
1030 InstrStage<1, [SW_DIS1], 0>,
1031 InstrStage<1, [SW_DIS2], 0>,
1032 InstrStage<1, [SW_ALU1], 2>,
1033 InstrStage<1, [SW_ALU1], 2>,
1034 InstrStage<1, [SW_ALU1]>],
1035 [6, 1, 3, 5, 5]>,
1036 InstrItinData,
1037 InstrStage<1, [SW_DIS1], 0>,
1038 InstrStage<1, [SW_DIS2], 0>,
1039 InstrStage<1, [SW_ALU1], 2>,
1040 InstrStage<1, [SW_ALU1], 2>,
1041 InstrStage<1, [SW_ALU1], 2>,
1042 InstrStage<1, [SW_ALU1]>],
1043 [8, 1, 3, 5, 7, 7]>,
1044 //
1045 // VTBX
1046 InstrItinData,
1047 InstrStage<1, [SW_ALU1]>],
1048 [2, 1, 1]>,
1049 InstrItinData,
1050 InstrStage<1, [SW_DIS1], 0>,
1051 InstrStage<1, [SW_ALU1], 2>,
1052 InstrStage<1, [SW_ALU1]>],
1053 [4, 1, 3, 3]>,
1054 InstrItinData,
1055 InstrStage<1, [SW_DIS1], 0>,
1056 InstrStage<1, [SW_DIS2], 0>,
1057 InstrStage<1, [SW_ALU1], 2>,
1058 InstrStage<1, [SW_ALU1], 2>,
1059 InstrStage<1, [SW_ALU1]>],
1060 [6, 1, 3, 5, 5]>,
1061 InstrItinData,
1062 InstrStage<1, [SW_DIS1], 0>,
1063 InstrStage<1, [SW_DIS2], 0>,
1064 InstrStage<1, [SW_ALU1], 2>,
1065 InstrStage<1, [SW_ALU1], 2>,
1066 InstrStage<1, [SW_ALU1], 2>,
1067 InstrStage<1, [SW_ALU1]>],
1068 [8, 1, 3, 5, 7, 7]>
1069 ]>;
1070
1071 // ===---------------------------------------------------------------------===//
1072 // The following definitions describe the simple machine model which
1073 // will replace itineraries.
1074
391075 // Swift machine model for scheduling and other instruction cost heuristics.
401076 def SwiftModel : SchedMachineModel {
411077 let IssueWidth = 3; // 3 micro-ops are dispatched per cycle.
421078 let MicroOpBufferSize = 45; // Based on NEON renamed registers.
431079 let LoadLatency = 3;
441080 let MispredictPenalty = 14; // A branch direction mispredict.
1081
1082 let Itineraries = SwiftItineraries;
451083 }
461084
471085 // Swift predicates.
318318 return getTargetTriple().isiOS() && !getTargetTriple().isOSVersionLT(7, 0);
319319 }
320320
321 bool ARMSubtarget::enableMachineScheduler() const {
322 // Enable the MachineScheduler before register allocation for out-of-order
323 // architectures where we do not use the PostRA scheduler anymore (for now
324 // restricted to swift).
325 return getSchedModel().isOutOfOrder() && isSwift();
326 }
327
328321 // This overrides the PostRAScheduler bit in the SchedModel for any CPU.
329322 bool ARMSubtarget::enablePostRAScheduler() const {
330 // No need for PostRA scheduling on out of order CPUs (for now restricted to
331 // swift).
332 if (getSchedModel().isOutOfOrder() && isSwift())
333 return false;
334323 return (!isThumb() || hasThumb2());
335324 }
336325
432432 /// compiler runtime or math libraries.
433433 bool hasSinCos() const;
434434
435 /// Returns true if machine scheduler should be enabled.
436 bool enableMachineScheduler() const override;
437
438435 /// True for some subtargets at > -O0.
439436 bool enablePostRAScheduler() const override;
440437
1010 ; r0 = r0 / r2
1111 ; r1 = r1 / r3
1212 ;
13 ; NOOPT: vmov [[A:d[0-9]+]], r0, r1
14 ; NOOPT-NEXT: vmov [[B:d[0-9]+]], r2, r3
13 ; NOOPT: vmov [[B:d[0-9]+]], r2, r3
14 ; NOOPT-NEXT: vmov [[A:d[0-9]+]], r0, r1
1515 ; Move the low part of B into a register.
1616 ; Unfortunately, we cannot express that the 's' register is the low
1717 ; part of B, i.e., sIdx == BIdx x 2. E.g., B = d1, B_low = s2.
1818 ; NOOPT-NEXT: vmov [[B_LOW:r[0-9]+]], s{{[0-9]+}}
19 ; NOOPT-NEXT: vmov [[A_LOW:r[0-9]+]], s{{[0-9]+}}
20 ; NOOPT-NEXT: udiv [[RES_LOW:r[0-9]+]], [[A_LOW]], [[B_LOW]]
1921 ; NOOPT-NEXT: vmov [[B_HIGH:r[0-9]+]], s{{[0-9]+}}
20 ; NOOPT-NEXT: vmov [[A_LOW:r[0-9]+]], s{{[0-9]+}}
2122 ; NOOPT-NEXT: vmov [[A_HIGH:r[0-9]+]], s{{[0-9]+}}
22 ; NOOPT-NEXT: udiv [[RES_LOW:r[0-9]+]], [[A_LOW]], [[B_LOW]]
23 ; NOOPT-NEXT: udiv [[RES_HIGH:r[0-9]+]], [[A_HIGH]], [[B_HIGH]]
2324 ; NOOPT-NEXT: vmov.32 [[RES:d[0-9]+]][0], [[RES_LOW]]
24 ; NOOPT-NEXT: udiv [[RES_HIGH:r[0-9]+]], [[A_HIGH]], [[B_HIGH]]
2525 ; NOOPT-NEXT: vmov.32 [[RES]][1], [[RES_HIGH]]
2626 ; NOOPT-NEXT: vmov r0, r1, [[RES]]
2727 ; NOOPT-NEXT: bx lr
2828 ;
2929 ; OPT-NOT: vmov
30 ; OPT: udiv r1, r1, r3
31 ; OPT-NEXT: udiv r0, r0, r2
30 ; OPT: udiv r0, r0, r2
31 ; OPT-NEXT: udiv r1, r1, r3
3232 ; OPT-NEXT: bx lr
3333 define <2 x i32> @simpleVectorDiv(<2 x i32> %A, <2 x i32> %B) nounwind {
3434 entry:
0 ; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CORTEX
1 ; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SWIFT
0 ; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a9 | FileCheck %s
1 ; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=swift | FileCheck %s
22 ; Avoid some 's' 16-bit instruction which partially update CPSR (and add false
33 ; dependency) when it isn't dependent on last CPSR defining instruction.
44 ; rdar://8928208
66 define i32 @t1(i32 %a, i32 %b, i32 %c, i32 %d) nounwind readnone {
77 entry:
88 ; CHECK-LABEL: t1:
9 ; CHECK-CORTEX: muls [[REG:(r[0-9]+)]], r3, r2
10 ; CHECK-CORTEX-NEXT: mul [[REG2:(r[0-9]+)]], r1, r0
11 ; CHECK-SWIFT: muls [[REG2:(r[0-9]+)]], r1, r0
12 ; CHECK-SWIFT-NEXT: mul [[REG:(r[0-9]+)]], r2, r3
9 ; CHECK: muls [[REG:(r[0-9]+)]], r3, r2
10 ; CHECK-NEXT: mul [[REG2:(r[0-9]+)]], r1, r0
1311 ; CHECK-NEXT: muls r0, [[REG]], [[REG2]]
1412 %0 = mul nsw i32 %a, %b
1513 %1 = mul nsw i32 %c, %d
2220 define void @t2(i32* nocapture %ptr1, i32* %ptr2, i32 %c) nounwind {
2321 entry:
2422 ; CHECK-LABEL: t2:
25 br label %while.body
23 %tobool7 = icmp eq i32* %ptr2, null
24 br i1 %tobool7, label %while.end, label %while.body
2625
2726 while.body:
2827 ; CHECK: while.body
5554 define void @t3(i32* nocapture %ptr1, i32* %ptr2, i32 %c) nounwind minsize {
5655 entry:
5756 ; CHECK-LABEL: t3:
58 br label %while.body
57 %tobool7 = icmp eq i32* %ptr2, null
58 br i1 %tobool7, label %while.end, label %while.body
5959
6060 while.body:
6161 ; CHECK: while.body
1414 ; CHECK: bne [[LOOP]]
1515
1616 ; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
17 ; CHECK: movs r0, #1
1718 ; CHECK: dmb ish
18 ; CHECK: movs r0, #1
1919 ; CHECK: bx lr
2020
2121 ; CHECK: [[FAILED]]:
2222 ; CHECK-NOT: cmp {{r[0-9]+}}, {{r[0-9]+}}
23 ; CHECK: movs r0, #0
2324 ; CHECK: dmb ish
24 ; CHECK: movs r0, #0
2525 ; CHECK: bx lr
2626
2727 %pair = cmpxchg i32* %p, i32 %oldval, i32 %newval seq_cst seq_cst
3333 define i1 @test_return_bool(i8* %value, i8 %oldValue, i8 %newValue) {
3434 ; CHECK-LABEL: test_return_bool:
3535
36 ; CHECK: uxtb [[OLDBYTE:r[0-9]+]], r1
3637 ; CHECK: dmb ishst
37 ; CHECK: uxtb [[OLDBYTE:r[0-9]+]], r1
3838
3939 ; CHECK: [[LOOP:LBB[0-9]+_[0-9]+]]:
4040 ; CHECK: ldrexb [[LOADED:r[0-9]+]], [r0]
1919
2020 for.body: ; preds = %entry, %for.body.3
2121 ; CHECK: %for.body
22 ; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
23 ; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
22 ; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
23 ; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
2424 %i.09 = phi i32 [ %add5.3, %for.body.3 ], [ 0, %entry ]
2525 %arrayidx = getelementptr inbounds i8, i8* %a, i32 %i.09
2626 %0 = load i8, i8* %arrayidx, align 1
4141
4242 for.body.1: ; preds = %for.body
4343 ; CHECK: %for.body.1
44 ; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
45 ; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
44 ; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
45 ; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
4646 %arrayidx.1 = getelementptr inbounds i8, i8* %a, i32 %add5
4747 %2 = load i8, i8* %arrayidx.1, align 1
4848 %conv6.1 = zext i8 %2 to i32
5959
6060 for.body.2: ; preds = %for.body.1
6161 ; CHECK: %for.body.2
62 ; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
63 ; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
62 ; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
63 ; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
6464 %arrayidx.2 = getelementptr inbounds i8, i8* %a, i32 %add5.1
6565 %4 = load i8, i8* %arrayidx.2, align 1
6666 %conv6.2 = zext i8 %4 to i32
7777
7878 for.body.3: ; preds = %for.body.2
7979 ; CHECK: %for.body.3
80 ; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
81 ; CHECK: ldrb {{r[0-9]+|lr}}, [{{r[0-9]+|lr}}, {{r[0-9]+|lr}}]!
80 ; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
81 ; CHECK: ldrb {{r[0-9]|lr}}, [{{r[0-9]|lr}}, {{r[0-9]|lr}}]!
8282 %arrayidx.3 = getelementptr inbounds i8, i8* %a, i32 %add5.2
8383 %6 = load i8, i8* %arrayidx.3, align 1
8484 %conv6.3 = zext i8 %6 to i32
237237
238238 define <4 x i32> @zextload_v8i8tov8i32_fake_update(<4 x i8>** %ptr) {
239239 ;CHECK-LABEL: zextload_v8i8tov8i32_fake_update:
240 ;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
240 ;CHECK: ldr.w r[[PTRREG:[0-9]+]], [r0]
241241 ;CHECK: vld1.32 {{{d[0-9]+}}[0]}, [r[[PTRREG]]:32]
242242 ;CHECK: add.w r[[INCREG:[0-9]+]], r[[PTRREG]], #16
243 ;CHECK: str.w r[[INCREG]], [r0]
243244 ;CHECK: vmovl.u8 {{q[0-9]+}}, {{d[0-9]+}}
244245 ;CHECK: vmovl.u16 {{q[0-9]+}}, {{d[0-9]+}}
245 ;CHECK: str r[[INCREG]], [r0]
246246 %A = load <4 x i8>*, <4 x i8>** %ptr
247247 %lA = load <4 x i8>, <4 x i8>* %A, align 4
248248 %inc = getelementptr <4 x i8>, <4 x i8>* %A, i38 4
227227 ;CHECK: ldr.w r9, [sp]
228228 ;CHECK: vmov {{d[0-9]+}}, r3, r9
229229 ;CHECK: vmov {{d[0-9]+}}, r1, r2
230 ;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
231230 ;CHECK: vmovn.i32 [[VECLO:d[0-9]+]], {{q[0-9]+}}
232231 ;CHECK: vuzp.8 [[VECLO]], {{d[0-9]+}}
232 ;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
233233 ;CHECK: vst1.32 {[[VECLO]][0]}, [r[[PTRREG]]:32]
234234 %A = load <4 x i8>*, <4 x i8>** %ptr
235235 %trunc = trunc <4 x i32> %val to <4 x i8>
242242 ;CHECK: ldr.w r9, [sp]
243243 ;CHECK: vmov {{d[0-9]+}}, r3, r9
244244 ;CHECK: vmov {{d[0-9]+}}, r1, r2
245 ;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
245 ;CHECK: movs [[IMM16:r[0-9]+]], #16
246246 ;CHECK: vmovn.i32 [[VECLO:d[0-9]+]], {{q[0-9]+}}
247247 ;CHECK: vuzp.8 [[VECLO]], {{d[0-9]+}}
248 ;CHECK: movs [[IMM16:r[0-9]+]], #16
248 ;CHECK: ldr r[[PTRREG:[0-9]+]], [r0]
249249 ;CHECK: vst1.32 {[[VECLO]][0]}, [r[[PTRREG]]:32], [[IMM16]]
250250 ;CHECK: str r[[PTRREG]], [r0]
251251 %A = load <4 x i8>*, <4 x i8>** %ptr