Re-enable "[MachineCopyPropagation] Extend pass to do COPY source forwarding" Issues identified by buildbots addressed since original review: - Fixed ARMLoadStoreOptimizer bug exposed by this change in r311907. - The pass no longer forwards COPYs to physical register uses, since doing so can break code that implicitly relies on the physical register number of the use. - The pass no longer forwards COPYs to undef uses, since doing so can break the machine verifier by creating LiveRanges that don't end on a use (since the undef operand is not considered a use). [MachineCopyPropagation] Extend pass to do COPY source forwarding This change extends MachineCopyPropagation to do COPY source forwarding. This change also extends the MachineCopyPropagation pass to be able to be run during register allocation, after physical registers have been assigned, but before the virtual registers have been re-written, which allows it to remove virtual register COPY LiveIntervals that become dead through the forwarding of all of their uses. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@312154 91177308-0d34-0410-b5e6-96231b3b80d8 Geoff Berry 2 years ago
78 changed file(s) with 1500 addition(s) and 957 deletion(s).
277277 /// MachineSinking - This pass performs sinking on machine instructions.
278278 extern char &MachineSinkingID;
279279
280 /// MachineCopyPropagationPreRegRewrite - This pass performs copy propagation
281 /// on machine instructions after register allocation but before virtual
282 /// register re-writing.
283 extern char &MachineCopyPropagationPreRegRewriteID;
284
280285 /// MachineCopyPropagation - This pass performs copy propagation on
281286 /// machine instructions.
282287 extern char &MachineCopyPropagationID;
231231 void initializeMachineCSEPass(PassRegistry&);
232232 void initializeMachineCombinerPass(PassRegistry&);
233233 void initializeMachineCopyPropagationPass(PassRegistry&);
234 void initializeMachineCopyPropagationPreRegRewritePass(PassRegistry&);
234235 void initializeMachineDominanceFrontierPass(PassRegistry&);
235236 void initializeMachineDominatorTreePass(PassRegistry&);
236237 void initializeMachineFunctionPrinterPassPass(PassRegistry&);
5252 initializeMachineCSEPass(Registry);
5353 initializeMachineCombinerPass(Registry);
5454 initializeMachineCopyPropagationPass(Registry);
55 initializeMachineCopyPropagationPreRegRewritePass(Registry);
5556 initializeMachineDominatorTreePass(Registry);
5657 initializeMachineFunctionPrinterPassPass(Registry);
5758 initializeMachineLICMPass(Registry);
66 //
77 //===----------------------------------------------------------------------===//
88 //
9 // This is an extremely simple MachineInstr-level copy propagation pass.
9 // This is a simple MachineInstr-level copy forwarding pass. It may be run at
10 // two places in the codegen pipeline:
11 // - After register allocation but before virtual registers have been remapped
12 // to physical registers.
13 // - After physical register remapping.
14 //
15 // The optimizations done vary slightly based on whether virtual registers are
16 // still present. In both cases, this pass forwards the source of COPYs to the
17 // users of their destinations when doing so is legal. For example:
18 //
19 // %vreg1 = COPY %vreg0
20 // ...
21 // ... = OP %vreg1
22 //
23 // If
24 // - the physical register assigned to %vreg0 has not been clobbered by the
25 // time of the use of %vreg1
26 // - the register class constraints are satisfied
27 // - the COPY def is the only value that reaches OP
28 // then this pass replaces the above with:
29 //
30 // %vreg1 = COPY %vreg0
31 // ...
32 // ... = OP %vreg0
33 //
34 // and updates the relevant state required by VirtRegMap (e.g. LiveIntervals).
35 // COPYs whose LiveIntervals become dead as a result of this forwarding (i.e. if
36 // all uses of %vreg1 are changed to %vreg0) are removed.
37 //
38 // When being run with only physical registers, this pass will also remove some
39 // redundant COPYs. For example:
40 //
41 // %R1 = COPY %R0
42 // ... // No clobber of %R1
43 // %R0 = COPY %R1 <<< Removed
44 //
45 // or
46 //
47 // %R1 = COPY %R0
48 // ... // No clobber of %R0
49 // %R1 = COPY %R0 <<< Removed
1050 //
1151 //===----------------------------------------------------------------------===//
1252
53 #include "LiveDebugVariables.h"
1354 #include "llvm/ADT/DenseMap.h"
1455 #include "llvm/ADT/STLExtras.h"
1556 #include "llvm/ADT/SetVector.h"
1657 #include "llvm/ADT/SmallVector.h"
1758 #include "llvm/ADT/Statistic.h"
1859 #include "llvm/ADT/iterator_range.h"
60 #include "llvm/CodeGen/LiveRangeEdit.h"
61 #include "llvm/CodeGen/LiveStackAnalysis.h"
1962 #include "llvm/CodeGen/MachineBasicBlock.h"
2063 #include "llvm/CodeGen/MachineFunction.h"
2164 #include "llvm/CodeGen/MachineFunctionPass.h"
2265 #include "llvm/CodeGen/MachineInstr.h"
2366 #include "llvm/CodeGen/MachineOperand.h"
2467 #include "llvm/CodeGen/MachineRegisterInfo.h"
68 #include "llvm/CodeGen/Passes.h"
69 #include "llvm/CodeGen/VirtRegMap.h"
2570 #include "llvm/MC/MCRegisterInfo.h"
2671 #include "llvm/Pass.h"
2772 #include "llvm/Support/Debug.h"
73 #include "llvm/Support/DebugCounter.h"
2874 #include "llvm/Support/raw_ostream.h"
2975 #include "llvm/Target/TargetInstrInfo.h"
3076 #include "llvm/Target/TargetRegisterInfo.h"
3783 #define DEBUG_TYPE "machine-cp"
3884
3985 STATISTIC(NumDeletes, "Number of dead copies deleted");
86 STATISTIC(NumCopyForwards, "Number of copy uses forwarded");
87 DEBUG_COUNTER(FwdCounter, "machine-cp-fwd",
88 "Controls which register COPYs are forwarded");
4089
4190 namespace {
4291
4392 using RegList = SmallVector<unsigned, 4>;
4493 using SourceMap = DenseMap<unsigned, RegList>;
4594 using Reg2MIMap = DenseMap<unsigned, MachineInstr *>;
4695
47 class MachineCopyPropagation : public MachineFunctionPass {
96 class MachineCopyPropagation : public MachineFunctionPass,
97 private LiveRangeEdit::Delegate {
4898 const TargetRegisterInfo *TRI;
4999 const TargetInstrInfo *TII;
50 const MachineRegisterInfo *MRI;
100 MachineRegisterInfo *MRI;
101 MachineFunction *MF;
102 SlotIndexes *Indexes;
103 LiveIntervals *LIS;
104 const VirtRegMap *VRM;
105 // True if this pass is being run before virtual registers are remapped to
106 // physical ones.
107 bool PreRegRewrite;
108 bool NoSubRegLiveness;
109
110 protected:
111 MachineCopyPropagation(char &ID, bool PreRegRewrite)
112 : MachineFunctionPass(ID), PreRegRewrite(PreRegRewrite) {}
51113
52114 public:
53115 static char ID; // Pass identification, replacement for typeid
54116
55 MachineCopyPropagation() : MachineFunctionPass(ID) {
117 MachineCopyPropagation() : MachineCopyPropagation(ID, false) {
56118 initializeMachineCopyPropagationPass(*PassRegistry::getPassRegistry());
57119 }
58120
59121 void getAnalysisUsage(AnalysisUsage &AU) const override {
122 if (PreRegRewrite) {
123 AU.addRequired<SlotIndexes>();
124 AU.addPreserved<SlotIndexes>();
125 AU.addRequired<LiveIntervals>();
126 AU.addPreserved<LiveIntervals>();
127 AU.addRequired<VirtRegMap>();
128 AU.addPreserved<VirtRegMap>();
129 AU.addPreserved<LiveDebugVariables>();
130 AU.addPreserved<LiveStacks>();
131 }
60132 AU.setPreservesCFG();
61133 MachineFunctionPass::getAnalysisUsage(AU);
62134 }
64136 bool runOnMachineFunction(MachineFunction &MF) override;
65137
66138 MachineFunctionProperties getRequiredProperties() const override {
139 if (PreRegRewrite)
140 return MachineFunctionProperties()
141 .set(MachineFunctionProperties::Property::NoPHIs)
142 .set(MachineFunctionProperties::Property::TracksLiveness);
67143 return MachineFunctionProperties().set(
68144 MachineFunctionProperties::Property::NoVRegs);
69145 }
73149 void ReadRegister(unsigned Reg);
74150 void CopyPropagateBlock(MachineBasicBlock &MBB);
75151 bool eraseIfRedundant(MachineInstr &Copy, unsigned Src, unsigned Def);
152 unsigned getPhysReg(unsigned Reg, unsigned SubReg);
153 unsigned getPhysReg(const MachineOperand &Opnd) {
154 return getPhysReg(Opnd.getReg(), Opnd.getSubReg());
155 }
156 unsigned getFullPhysReg(const MachineOperand &Opnd) {
157 return getPhysReg(Opnd.getReg(), 0);
158 }
159 void forwardUses(MachineInstr &MI);
160 bool isForwardableRegClassCopy(const MachineInstr &Copy,
161 const MachineInstr &UseI);
162 std::tuple<unsigned, unsigned, bool>
163 checkUseSubReg(const MachineOperand &CopySrc, const MachineOperand &MOUse);
164 bool hasImplicitOverlap(const MachineInstr &MI, const MachineOperand &Use);
165 void narrowRegClass(const MachineInstr &MI, const MachineOperand &MOUse,
166 unsigned NewUseReg, unsigned NewUseSubReg);
167 void updateForwardedCopyLiveInterval(const MachineInstr &Copy,
168 const MachineInstr &UseMI,
169 unsigned OrigUseReg,
170 unsigned NewUseReg,
171 unsigned NewUseSubReg);
172 /// LiveRangeEdit callback for eliminateDeadDefs().
173 void LRE_WillEraseInstruction(MachineInstr *MI) override;
76174
77175 /// Candidates for deletion.
78176 SmallSetVector<MachineInstr *, 8> MaybeDeadCopies;
89187 bool Changed;
90188 };
91189
190 class MachineCopyPropagationPreRegRewrite : public MachineCopyPropagation {
191 public:
192 static char ID; // Pass identification, replacement for typeid
193 MachineCopyPropagationPreRegRewrite()
194 : MachineCopyPropagation(ID, true) {
195 initializeMachineCopyPropagationPreRegRewritePass(*PassRegistry::getPassRegistry());
196 }
197 };
92198 } // end anonymous namespace
93199
94200 char MachineCopyPropagation::ID = 0;
97203
98204 INITIALIZE_PASS(MachineCopyPropagation, DEBUG_TYPE,
99205 "Machine Copy Propagation Pass", false, false)
206
207 /// We have two separate passes that are very similar, the only difference being
208 /// where they are meant to be run in the pipeline. This is done for several
209 /// reasons:
210 /// - the two passes have different dependencies
211 /// - some targets want to disable the later run of this pass, but not the
212 /// earlier one (e.g. NVPTX and WebAssembly)
213 /// - it allows for easier debugging via llc
214
215 char MachineCopyPropagationPreRegRewrite::ID = 0;
216 char &llvm::MachineCopyPropagationPreRegRewriteID = MachineCopyPropagationPreRegRewrite::ID;
217
218 INITIALIZE_PASS_BEGIN(MachineCopyPropagationPreRegRewrite,
219 "machine-cp-prerewrite",
220 "Machine Copy Propagation Pre-Register Rewrite Pass",
221 false, false)
222 INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
223 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
224 INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
225 INITIALIZE_PASS_END(MachineCopyPropagationPreRegRewrite,
226 "machine-cp-prerewrite",
227 "Machine Copy Propagation Pre-Register Rewrite Pass", false,
228 false)
100229
101230 /// Remove any entry in \p Map where the register is a subregister or equal to
102231 /// a register contained in \p Regs.
138267 }
139268
140269 void MachineCopyPropagation::ReadRegister(unsigned Reg) {
270 // We don't track MaybeDeadCopies when running pre-VirtRegRewriter.
271 if (PreRegRewrite)
272 return;
273
141274 // If 'Reg' is defined by a copy, the copy is no longer a candidate
142275 // for elimination.
143276 for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
169302 return SubIdx == TRI->getSubRegIndex(PreviousDef, Def);
170303 }
171304
305 /// Return the physical register assigned to \p Reg if it is a virtual register,
306 /// otherwise just return the physical reg from the operand itself.
307 ///
308 /// If \p SubReg is 0 then return the full physical register assigned to the
309 /// virtual register ignoring subregs. If we aren't tracking sub-reg liveness
310 /// then we need to use this to be more conservative with clobbers by killing
311 /// all super reg and their sub reg COPYs as well. This is to prevent COPY
312 /// forwarding in cases like the following:
313 ///
314 /// %vreg2 = COPY %vreg1:sub1
315 /// %vreg3 = COPY %vreg1:sub0
316 /// ... = OP1 %vreg2
317 /// ... = OP2 %vreg3
318 ///
319 /// After forwarding %vreg2 (assuming this is the last use of %vreg1) and
320 /// VirtRegRewriter adding kill markers we have:
321 ///
322 /// %vreg3 = COPY %vreg1:sub0
323 /// ... = OP1 %vreg1:sub1
324 /// ... = OP2 %vreg3
325 ///
326 /// If %vreg3 is assigned to a sub-reg of %vreg1, then after rewriting we have:
327 ///
328 /// ... = OP1 R0:sub1, R0
329 /// ... = OP2 R0:sub0
330 ///
331 /// and the use of R0 by OP2 will not have a valid definition.
332 unsigned MachineCopyPropagation::getPhysReg(unsigned Reg, unsigned SubReg) {
333
334 // Physical registers cannot have subregs.
335 if (!TargetRegisterInfo::isVirtualRegister(Reg))
336 return Reg;
337
338 assert(PreRegRewrite && "Unexpected virtual register encountered");
339 Reg = VRM->getPhys(Reg);
340 if (SubReg && !NoSubRegLiveness)
341 Reg = TRI->getSubReg(Reg, SubReg);
342 return Reg;
343 }
344
172345 /// Remove instruction \p Copy if there exists a previous copy that copies the
173346 /// register \p Src to the register \p Def; This may happen indirectly by
174347 /// copying the super registers.
206379 return true;
207380 }
208381
382
383 /// Decide whether we should forward the destination of \param Copy to its use
384 /// in \param UseI based on the register class of the Copy operands. Same-class
385 /// COPYs are always accepted by this function, but cross-class COPYs are only
386 /// accepted if they are forwarded to another COPY with the operand register
387 /// classes reversed. For example:
388 ///
389 /// RegClassA = COPY RegClassB // Copy parameter
390 /// ...
391 /// RegClassB = COPY RegClassA // UseI parameter
392 ///
393 /// which after forwarding becomes
394 ///
395 /// RegClassA = COPY RegClassB
396 /// ...
397 /// RegClassB = COPY RegClassB
398 ///
399 /// so we have reduced the number of cross-class COPYs and potentially
400 /// introduced a NOP COPY that can be removed.
401 bool MachineCopyPropagation::isForwardableRegClassCopy(
402 const MachineInstr &Copy, const MachineInstr &UseI) {
403 auto isCross = [&](const MachineOperand &Dst, const MachineOperand &Src) {
404 unsigned DstReg = Dst.getReg();
405 unsigned SrcPhysReg = getPhysReg(Src);
406 const TargetRegisterClass *DstRC;
407 if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
408 DstRC = MRI->getRegClass(DstReg);
409 unsigned DstSubReg = Dst.getSubReg();
410 if (DstSubReg)
411 SrcPhysReg = TRI->getMatchingSuperReg(SrcPhysReg, DstSubReg, DstRC);
412 } else
413 DstRC = TRI->getMinimalPhysRegClass(DstReg);
414
415 return !DstRC->contains(SrcPhysReg);
416 };
417
418 const MachineOperand &CopyDst = Copy.getOperand(0);
419 const MachineOperand &CopySrc = Copy.getOperand(1);
420
421 if (!isCross(CopyDst, CopySrc))
422 return true;
423
424 if (!UseI.isCopy())
425 return false;
426
427 assert(getFullPhysReg(UseI.getOperand(1)) == getFullPhysReg(CopyDst));
428 return !isCross(UseI.getOperand(0), CopySrc);
429 }
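As a rough stand-alone illustration of the isCross() check above, with register classes modeled as plain sets of made-up physical register numbers (LLVM instead queries TargetRegisterClass membership on TableGen-defined classes):

// Toy cross-class check; not the LLVM API.
#include <cstdio>
#include <set>

using RegClass = std::set<unsigned>;

// A COPY is cross-class when its (physical) source register is not a
// member of the destination operand's register class.
bool isCross(const RegClass &DstRC, unsigned SrcPhysReg) {
  return !DstRC.count(SrcPhysReg);
}

int main() {
  RegClass GPR = {0, 1, 2, 3}, FPR = {10, 11, 12, 13};
  // "GPR = COPY f10" is cross-class, so it is only forwardable into a
  // second COPY going the other way ("FPR = COPY gpr"), which forwarding
  // turns into a same-class, potentially NOP, COPY.
  std::printf("GPR = COPY f10 cross-class? %d\n", isCross(GPR, 10)); // 1
  std::printf("FPR = COPY f10 cross-class? %d\n", isCross(FPR, 10)); // 0
  return 0;
}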
430
431 /// Check that the subregs on the copy source operand (\p CopySrc) and the use
432 /// operand to be forwarded to (\p MOUse) are compatible with doing the
433 /// forwarding. Also computes the new register and subregister to be used in
434 /// the forwarded-to instruction.
435 std::tuple<unsigned, unsigned, bool> MachineCopyPropagation::checkUseSubReg(
436 const MachineOperand &CopySrc, const MachineOperand &MOUse) {
437 unsigned NewUseReg = CopySrc.getReg();
438 unsigned NewUseSubReg;
439
440 if (TargetRegisterInfo::isPhysicalRegister(NewUseReg)) {
441 // If MOUse is a virtual reg, we need to apply it to the new physical reg
442 // we're going to replace it with.
443 if (MOUse.getSubReg())
444 NewUseReg = TRI->getSubReg(NewUseReg, MOUse.getSubReg());
445 // If the original use subreg isn't valid on the new src reg, we can't
446 // forward it here.
447 if (!NewUseReg)
448 return std::make_tuple(0, 0, false);
449 NewUseSubReg = 0;
450 } else {
451 // %v1 = COPY %v2:sub1
452 // USE %v1:sub2
453 // The new use is %v2:sub1:sub2
454 NewUseSubReg =
455 TRI->composeSubRegIndices(CopySrc.getSubReg(), MOUse.getSubReg());
456 // Check that NewUseSubReg is valid on NewUseReg
457 if (NewUseSubReg &&
458 !TRI->getSubClassWithSubReg(MRI->getRegClass(NewUseReg), NewUseSubReg))
459 return std::make_tuple(0, 0, false);
460 }
461
462 return std::make_tuple(NewUseReg, NewUseSubReg, true);
463 }
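The %v2:sub1:sub2 composition in the comment above can be pictured by treating a subregister index as a bit slice of its parent register; a stand-alone sketch under that simplifying assumption (LLVM's composeSubRegIndices uses target-generated tables, not offset arithmetic):

// Toy subregister-index composition; not the LLVM API.
#include <cstdio>

struct SubIdx { unsigned Offset, Width; }; // bit slice of the parent reg

// Analogue of TRI->composeSubRegIndices(CopySrc.getSubReg(),
// MOUse.getSubReg()): apply the outer slice within the inner one.
SubIdx compose(SubIdx Inner, SubIdx Outer) {
  return {Inner.Offset + Outer.Offset, Outer.Width};
}

int main() {
  SubIdx Sub1 = {32, 32}; // %v2:sub1 = bits [32,64) of %v2
  SubIdx Sub2 = {16, 16}; // :sub2    = bits [16,32) of that slice
  SubIdx C = compose(Sub1, Sub2);
  std::printf("composed: bits [%u,%u)\n", C.Offset, C.Offset + C.Width);
  return 0; // prints [48,64): the use reads %v2:sub1:sub2
}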
464
465 /// Check that \p MI does not have implicit uses that overlap with its \p Use
466 /// operand (the register being replaced), since these can sometimes be
467 /// implicitly tied to other operands. For example, on AMDGPU:
468 ///
469 /// V_MOVRELS_B32_e32 %VGPR2, %M0, %EXEC, %VGPR2_VGPR3_VGPR4_VGPR5
470 ///
471 /// the %VGPR2 is implicitly tied to the larger reg operand, but we have no
472 /// way of knowing we need to update the latter when updating the former.
473 bool MachineCopyPropagation::hasImplicitOverlap(const MachineInstr &MI,
474 const MachineOperand &Use) {
475 if (!TargetRegisterInfo::isPhysicalRegister(Use.getReg()))
476 return false;
477
478 for (const MachineOperand &MIUse : MI.uses())
479 if (&MIUse != &Use && MIUse.isReg() && MIUse.isImplicit() &&
480 TRI->regsOverlap(Use.getReg(), MIUse.getReg()))
481 return true;
482
483 return false;
484 }
485
486 /// Narrow the register class of the forwarded vreg so it matches any
487 /// instruction constraints. \p MI is the instruction being forwarded to. \p
488 /// MOUse is the operand being replaced in \p MI (which hasn't yet been updated
489 /// at the time this function is called). \p NewUseReg and \p NewUseSubReg are
490 /// what the \p MOUse will be changed to after forwarding.
491 ///
492 /// If we are forwarding
493 /// A:RCA = COPY B:RCB
494 /// into
495 /// ... = OP A:RCA
496 ///
497 /// then we need to narrow the register class of B so that it is a subclass
498 /// of RCA so that it meets the instruction register class constraints.
499 void MachineCopyPropagation::narrowRegClass(const MachineInstr &MI,
500 const MachineOperand &MOUse,
501 unsigned NewUseReg,
502 unsigned NewUseSubReg) {
503 if (!TargetRegisterInfo::isVirtualRegister(NewUseReg))
504 return;
505
506 // Make sure the virtual reg class allows the subreg.
507 if (NewUseSubReg) {
508 const TargetRegisterClass *CurUseRC = MRI->getRegClass(NewUseReg);
509 const TargetRegisterClass *NewUseRC =
510 TRI->getSubClassWithSubReg(CurUseRC, NewUseSubReg);
511 if (CurUseRC != NewUseRC) {
512 DEBUG(dbgs() << "MCP: Setting regclass of " << PrintReg(NewUseReg, TRI)
513 << " to " << TRI->getRegClassName(NewUseRC) << "\n");
514 MRI->setRegClass(NewUseReg, NewUseRC);
515 }
516 }
517
518 unsigned MOUseOpNo = &MOUse - &MI.getOperand(0);
519 const TargetRegisterClass *InstRC =
520 TII->getRegClass(MI.getDesc(), MOUseOpNo, TRI, *MF);
521 if (InstRC) {
522 const TargetRegisterClass *CurUseRC = MRI->getRegClass(NewUseReg);
523 if (NewUseSubReg)
524 InstRC = TRI->getMatchingSuperRegClass(CurUseRC, InstRC, NewUseSubReg);
525 if (!InstRC->hasSubClassEq(CurUseRC)) {
526 const TargetRegisterClass *NewUseRC =
527 TRI->getCommonSubClass(InstRC, CurUseRC);
528 DEBUG(dbgs() << "MCP: Setting regclass of " << PrintReg(NewUseReg, TRI)
529 << " to " << TRI->getRegClassName(NewUseRC) << "\n");
530 MRI->setRegClass(NewUseReg, NewUseRC);
531 }
532 }
533 }
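A rough way to picture the narrowing: if register classes are viewed as plain sets of registers, the getCommonSubClass step is essentially set intersection. A stand-alone toy under that assumption (class contents are made up; LLVM uses precomputed subclass tables):

// Toy register-class narrowing; not the LLVM API.
#include <algorithm>
#include <cstdio>
#include <iterator>
#include <set>

using RegClass = std::set<unsigned>;

// Analogue of TRI->getCommonSubClass(InstRC, CurUseRC).
RegClass commonSubClass(const RegClass &A, const RegClass &B) {
  RegClass R;
  std::set_intersection(A.begin(), A.end(), B.begin(), B.end(),
                        std::inserter(R, R.begin()));
  return R;
}

int main() {
  RegClass RCB = {0, 1, 2, 3, 4, 5}; // current class of the forwarded reg B
  RegClass InstRC = {0, 1, 2, 3};    // registers the use instruction accepts
  RegClass Narrowed = commonSubClass(RCB, InstRC);
  std::printf("narrowed class has %zu regs\n", Narrowed.size()); // 4
  return 0;
}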
534
535 /// Update the LiveInterval information to reflect the destination of \p Copy
536 /// being forwarded to a use in \p UseMI. \p OrigUseReg is the register being
537 /// forwarded through. It should be the destination register of \p Copy and has
538 /// already been replaced in \p UseMI at the point this function is called. \p
539 /// NewUseReg and \p NewUseSubReg are the register and subregister being
540 /// forwarded. They should be the source register of the \p Copy and should be
541 /// the value of the \p UseMI operand being forwarded at the point this function
542 /// is called.
543 void MachineCopyPropagation::updateForwardedCopyLiveInterval(
544 const MachineInstr &Copy, const MachineInstr &UseMI, unsigned OrigUseReg,
545 unsigned NewUseReg, unsigned NewUseSubReg) {
546
547 assert(TRI->isSubRegisterEq(getPhysReg(OrigUseReg, 0),
548 getFullPhysReg(Copy.getOperand(0))) &&
549 "OrigUseReg mismatch");
550 assert(TRI->isSubRegisterEq(getFullPhysReg(Copy.getOperand(1)),
551 getPhysReg(NewUseReg, 0)) &&
552 "NewUseReg mismatch");
553
554 // Extend live range starting from COPY early-clobber slot, since that
555 // is where the original src live range ends.
556 SlotIndex CopyUseIdx =
557 Indexes->getInstructionIndex(Copy).getRegSlot(true /*=EarlyClobber*/);
558 SlotIndex UseIdx = Indexes->getInstructionIndex(UseMI).getRegSlot();
559 if (TargetRegisterInfo::isVirtualRegister(NewUseReg)) {
560 LiveInterval &LI = LIS->getInterval(NewUseReg);
561 LI.extendInBlock(CopyUseIdx, UseIdx);
562 LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(NewUseSubReg);
563 for (auto &S : LI.subranges())
564 if ((S.LaneMask & UseMask).any() && S.find(CopyUseIdx))
565 S.extendInBlock(CopyUseIdx, UseIdx);
566 } else {
567 assert(NewUseSubReg == 0 && "Unexpected subreg on physical register!");
568 for (MCRegUnitIterator UI(NewUseReg, TRI); UI.isValid(); ++UI) {
569 LiveRange &LR = LIS->getRegUnit(*UI);
570 LR.extendInBlock(CopyUseIdx, UseIdx);
571 }
572 }
573
574 if (!TargetRegisterInfo::isVirtualRegister(OrigUseReg))
575 return;
576
577 LiveInterval &LI = LIS->getInterval(OrigUseReg);
578
579 // Can happen for undef uses.
580 if (LI.empty())
581 return;
582
583 SlotIndex UseIndex = Indexes->getInstructionIndex(UseMI);
584 const LiveRange::Segment *UseSeg = LI.getSegmentContaining(UseIndex);
585
586 // Only shrink if forwarded use is the end of a segment.
587 if (UseSeg->end != UseIndex.getRegSlot())
588 return;
589
590 SmallVector<MachineInstr *, 8> DeadInsts;
591 LIS->shrinkToUses(&LI, &DeadInsts);
592 if (!DeadInsts.empty()) {
593 SmallVector<unsigned, 8> NewRegs;
594 LiveRangeEdit(nullptr, NewRegs, *MF, *LIS, nullptr, this)
595 .eliminateDeadDefs(DeadInsts);
596 }
597 }
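The interval bookkeeping above can be summarized with a toy model in which each register is live over a single [def, kill] slot pair (LLVM's LiveInterval/SlotIndex machinery is far richer; this only sketches the two updates, extend-to-use and shrink-after-forwarding):

// Toy live-range update after forwarding; not the LLVM API.
#include <algorithm>
#include <cstdio>

struct ToyRange { unsigned DefSlot, KillSlot; };

int main() {
  // Slots: 0: %src = ...   1: %dst = COPY %src   2: ... = OP %dst
  ToyRange Src = {0, 1}; // %src was last read by the COPY
  ToyRange Dst = {1, 2}; // %dst was last read by OP
  unsigned UseSlot = 2;

  // After forwarding, OP reads %src, so extend %src up to the use
  // (extendInBlock analogue) ...
  Src.KillSlot = std::max(Src.KillSlot, UseSlot);
  // ... and shrink %dst, which now has no reads left (shrinkToUses
  // analogue); its defining COPY is dead and would be removed via
  // LiveRangeEdit::eliminateDeadDefs.
  Dst.KillSlot = Dst.DefSlot;
  std::printf("src live [%u,%u], dst live [%u,%u] -> COPY is dead\n",
              Src.DefSlot, Src.KillSlot, Dst.DefSlot, Dst.KillSlot);
  return 0;
}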
598
599 void MachineCopyPropagation::LRE_WillEraseInstruction(MachineInstr *MI) {
600 // Remove this COPY from further consideration for forwarding.
601 ClobberRegister(getFullPhysReg(MI->getOperand(0)));
602 Changed = true;
603 }
604
605 /// Look for available copies whose destination register is used by \p MI and
606 /// replace the use in \p MI with the copy's source register.
607 void MachineCopyPropagation::forwardUses(MachineInstr &MI) {
608 // We can't generally forward uses after virtual registers have been renamed
609 // because some targets generate code that has implicit dependencies on the
610 // physical register numbers. For example, in PowerPC, when spilling
611 // condition code registers, the following code pattern is generated:
612 //
613 // %CR7 = COPY %CR0
614 // %R6 = MFOCRF %CR7
615 // %R6 = RLWINM %R6, 29, 31, 31
616 //
617 // where the shift amount in the RLWINM instruction depends on the source
618 // register number of the MFOCRF instruction. If we were to forward %CR0 to
619 // the MFOCRF instruction, the shift amount would no longer be correct.
620 //
621 // FIXME: It may be possible to define a target hook that checks the register
622 // class or user opcode and allows some cases, but prevents cases like the
623 // above from being broken to enable later register copy forwarding.
624 if (!PreRegRewrite)
625 return;
626
627 if (AvailCopyMap.empty())
628 return;
629
630 // Look for non-tied explicit vreg uses that have an active COPY
631 // instruction that defines the physical register allocated to them.
632 // Replace the vreg with the source of the active COPY.
633 for (MachineOperand &MOUse : MI.explicit_uses()) {
634 // Don't forward into undef use operands, since doing so can cause problems
635 // for the machine verifier: it doesn't treat undef reads as reads, so we
636 // could end up with a live range that ends on an undef read, leading to an
637 // error that the live range doesn't end on a read of the live range
638 // register.
639 if (!MOUse.isReg() || MOUse.isTied() || MOUse.isUndef())
640 continue;
641
642 unsigned UseReg = MOUse.getReg();
643 if (!UseReg)
644 continue;
645
646 // See comment above check for !PreRegRewrite regarding forwarding changing
647 // physical registers.
648 if (!TargetRegisterInfo::isVirtualRegister(UseReg))
649 continue;
650
651 UseReg = VRM->getPhys(UseReg);
652
653 // Don't forward COPYs via non-allocatable regs since they can have
654 // non-standard semantics.
655 if (!MRI->isAllocatable(UseReg))
656 continue;
657
658 auto CI = AvailCopyMap.find(UseReg);
659 if (CI == AvailCopyMap.end())
660 continue;
661
662 MachineInstr &Copy = *CI->second;
663 MachineOperand &CopyDst = Copy.getOperand(0);
664 MachineOperand &CopySrc = Copy.getOperand(1);
665
666 // Don't forward COPYs that are already NOPs due to register assignment.
667 if (getPhysReg(CopyDst) == getPhysReg(CopySrc))
668 continue;
669
670 // FIXME: Don't handle partial uses of wider COPYs yet.
671 if (CopyDst.getSubReg() != 0 || UseReg != getPhysReg(CopyDst))
672 continue;
673
674 // Don't forward COPYs of non-allocatable regs unless they are constant.
675 unsigned CopySrcReg = CopySrc.getReg();
676 if (TargetRegisterInfo::isPhysicalRegister(CopySrcReg) &&
677 !MRI->isAllocatable(CopySrcReg) && !MRI->isConstantPhysReg(CopySrcReg))
678 continue;
679
680 if (!isForwardableRegClassCopy(Copy, MI))
681 continue;
682
683 unsigned NewUseReg, NewUseSubReg;
684 bool SubRegOK;
685 std::tie(NewUseReg, NewUseSubReg, SubRegOK) =
686 checkUseSubReg(CopySrc, MOUse);
687 if (!SubRegOK)
688 continue;
689
690 if (hasImplicitOverlap(MI, MOUse))
691 continue;
692
693 if (!DebugCounter::shouldExecute(FwdCounter))
694 continue;
695
696 DEBUG(dbgs() << "MCP: Replacing "
697 << PrintReg(MOUse.getReg(), TRI, MOUse.getSubReg())
698 << "\n with "
699 << PrintReg(NewUseReg, TRI, CopySrc.getSubReg())
700 << "\n in "
701 << MI
702 << " from "
703 << Copy);
704
705 narrowRegClass(MI, MOUse, NewUseReg, NewUseSubReg);
706
707 unsigned OrigUseReg = MOUse.getReg();
708 MOUse.setReg(NewUseReg);
709 MOUse.setSubReg(NewUseSubReg);
710
711 DEBUG(dbgs() << "MCP: After replacement: " << MI << "\n");
712
713 if (PreRegRewrite)
714 updateForwardedCopyLiveInterval(Copy, MI, OrigUseReg, NewUseReg,
715 NewUseSubReg);
716 else
717 for (MachineInstr &KMI :
718 make_range(Copy.getIterator(), std::next(MI.getIterator())))
719 KMI.clearRegisterKills(NewUseReg, TRI);
720
721 ++NumCopyForwards;
722 Changed = true;
723 }
724 }
725
209726 void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
210727 DEBUG(dbgs() << "MCP: CopyPropagateBlock " << MBB.getName() << "\n");
211728
214731 ++I;
215732
216733 if (MI->isCopy()) {
217 unsigned Def = MI->getOperand(0).getReg();
218 unsigned Src = MI->getOperand(1).getReg();
219
220 assert(!TargetRegisterInfo::isVirtualRegister(Def) &&
221 !TargetRegisterInfo::isVirtualRegister(Src) &&
222 "MachineCopyPropagation should be run after register allocation!");
734 unsigned Def = getPhysReg(MI->getOperand(0));
735 unsigned Src = getPhysReg(MI->getOperand(1));
223736
224737 // The two copies cancel out and the source of the first copy
225738 // hasn't been overridden, eliminate the second one. e.g.
236749 // %ECX = COPY %EAX
237750 // =>
238751 // %ECX = COPY %EAX
239 if (eraseIfRedundant(*MI, Def, Src) || eraseIfRedundant(*MI, Src, Def))
240 continue;
752 if (!PreRegRewrite)
753 if (eraseIfRedundant(*MI, Def, Src) || eraseIfRedundant(*MI, Src, Def))
754 continue;
755
756 forwardUses(*MI);
757
758 // Src may have been changed by forwardUses()
759 Src = getPhysReg(MI->getOperand(1));
760 unsigned DefClobber = getFullPhysReg(MI->getOperand(0));
761 unsigned SrcClobber = getFullPhysReg(MI->getOperand(1));
241762
242763 // If Src is defined by a previous copy, the previous copy cannot be
243764 // eliminated.
254775 DEBUG(dbgs() << "MCP: Copy is a deletion candidate: "; MI->dump());
255776
256777 // Copy is now a candidate for deletion.
257 if (!MRI->isReserved(Def))
778 // Only look for dead COPYs if we're not running just before
779 // VirtRegRewriter, since presumably these COPYs will have already been
780 // removed.
781 if (!PreRegRewrite && !MRI->isReserved(Def))
258782 MaybeDeadCopies.insert(MI);
259783
260784 // If 'Def' is previously source of another copy, then this earlier copy's
264788 // %xmm2 = copy %xmm0
265789 // ...
266790 // %xmm2 = copy %xmm9
267 ClobberRegister(Def);
791 ClobberRegister(DefClobber);
268792 for (const MachineOperand &MO : MI->implicit_operands()) {
269793 if (!MO.isReg() || !MO.isDef())
270794 continue;
271 unsigned Reg = MO.getReg();
795 unsigned Reg = getFullPhysReg(MO);
272796 if (!Reg)
273797 continue;
274798 ClobberRegister(Reg);
283807
284808 // Remember source that's copied to Def. Once it's clobbered, then
285809 // it's no longer available for copy propagation.
286 RegList &DestList = SrcMap[Src];
287 if (!is_contained(DestList, Def))
288 DestList.push_back(Def);
289
290 continue;
291 }
810 RegList &DestList = SrcMap[SrcClobber];
811 if (!is_contained(DestList, DefClobber))
812 DestList.push_back(DefClobber);
813
814 continue;
815 }
816
817 // Clobber any earlyclobber regs first.
818 for (const MachineOperand &MO : MI->operands())
819 if (MO.isReg() && MO.isEarlyClobber()) {
820 unsigned Reg = getFullPhysReg(MO);
821 // If we have a tied earlyclobber, that means it is also read by this
822 // instruction, so we need to make sure we don't remove it as dead
823 // later.
824 if (MO.isTied())
825 ReadRegister(Reg);
826 ClobberRegister(Reg);
827 }
828
829 forwardUses(*MI);
292830
293831 // Not a copy.
294832 SmallVector Defs;
298836 RegMask = &MO;
299837 if (!MO.isReg())
300838 continue;
301 unsigned Reg = MO.getReg();
839 unsigned Reg = getFullPhysReg(MO);
302840 if (!Reg)
303841 continue;
304842
305 assert(!TargetRegisterInfo::isVirtualRegister(Reg) &&
306 "MachineCopyPropagation should be run after register allocation!");
307
308 if (MO.isDef()) {
843 if (MO.isDef() && !MO.isEarlyClobber()) {
309844 Defs.push_back(Reg);
310845 continue;
311846 } else if (MO.readsReg())
362897 // since we don't want to trust live-in lists.
363898 if (MBB.succ_empty()) {
364899 for (MachineInstr *MaybeDead : MaybeDeadCopies) {
900 DEBUG(dbgs() << "MCP: Removing copy due to no live-out succ: ";
901 MaybeDead->dump());
365902 assert(!MRI->isReserved(MaybeDead->getOperand(0).getReg()));
366903 MaybeDead->eraseFromParent();
367904 Changed = true;
384921 TRI = MF.getSubtarget().getRegisterInfo();
385922 TII = MF.getSubtarget().getInstrInfo();
386923 MRI = &MF.getRegInfo();
924 this->MF = &MF;
925 if (PreRegRewrite) {
926 Indexes = &getAnalysis<SlotIndexes>();
927 LIS = &getAnalysis<LiveIntervals>();
928 VRM = &getAnalysis<VirtRegMap>();
929 }
930 NoSubRegLiveness = !MRI->subRegLivenessEnabled();
387931
388932 for (MachineBasicBlock &MBB : MF)
389933 CopyPropagateBlock(MBB);
8787 cl::desc("Disable Codegen Prepare"));
8888 static cl::opt<bool> DisableCopyProp("disable-copyprop", cl::Hidden,
8989 cl::desc("Disable Copy Propagation pass"));
90 static cl::opt<bool> DisableCopyPropPreRegRewrite("disable-copyprop-prerewrite", cl::Hidden,
91 cl::desc("Disable Copy Propagation Pre-Register Re-write pass"));
9092 static cl::opt<bool> DisablePartialLibcallInlining("disable-partial-libcall-inlining",
9193 cl::Hidden, cl::desc("Disable Partial Libcall Inlining"));
9294 static cl::opt<bool> EnableImplicitNullChecks(
246248
247249 if (StandardID == &MachineCopyPropagationID)
248250 return applyDisable(TargetID, DisableCopyProp);
251
252 if (StandardID == &MachineCopyPropagationPreRegRewriteID)
253 return applyDisable(TargetID, DisableCopyPropPreRegRewrite);
249254
250255 return TargetID;
251256 }
10551060 // Allow targets to change the register assignments before rewriting.
10561061 addPreRewrite();
10571062
1063 // Copy propagate to forward register uses and try to eliminate COPYs that
1064 // were not coalesced.
1065 addPass(&MachineCopyPropagationPreRegRewriteID);
1066
10581067 // Finally rewrite virtual registers.
10591068 addPass(&VirtRegRewriterID);
10601069
88 ; CHECK-LABEL: halfword:
99 ; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
1010 ; CHECK: ldrh [[REG1:w[0-9]+]], [{{.*}}[[REG2:x[0-9]+]], [[REG]], lsl #1]
11 ; CHECK: strh [[REG1]], [{{.*}}[[REG2]], [[REG]], lsl #1]
11 ; CHECK: mov [[REG3:x[0-9]+]], [[REG2]]
12 ; CHECK: strh [[REG1]], [{{.*}}[[REG3]], [[REG]], lsl #1]
1213 %shr81 = lshr i32 %xor72, 9
1314 %conv82 = zext i32 %shr81 to i64
1415 %idxprom83 = and i64 %conv82, 255
2324 ; CHECK-LABEL: word:
2425 ; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
2526 ; CHECK: ldr [[REG1:w[0-9]+]], [{{.*}}[[REG2:x[0-9]+]], [[REG]], lsl #2]
26 ; CHECK: str [[REG1]], [{{.*}}[[REG2]], [[REG]], lsl #2]
27 ; CHECK: mov [[REG3:x[0-9]+]], [[REG2]]
28 ; CHECK: str [[REG1]], [{{.*}}[[REG3]], [[REG]], lsl #2]
2729 %shr81 = lshr i32 %xor72, 9
2830 %conv82 = zext i32 %shr81 to i64
2931 %idxprom83 = and i64 %conv82, 255
3840 ; CHECK-LABEL: doubleword:
3941 ; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
4042 ; CHECK: ldr [[REG1:x[0-9]+]], [{{.*}}[[REG2:x[0-9]+]], [[REG]], lsl #3]
41 ; CHECK: str [[REG1]], [{{.*}}[[REG2]], [[REG]], lsl #3]
43 ; CHECK: mov [[REG3:x[0-9]+]], [[REG2]]
44 ; CHECK: str [[REG1]], [{{.*}}[[REG3]], [[REG]], lsl #3]
4245 %shr81 = lshr i32 %xor72, 9
4346 %conv82 = zext i32 %shr81 to i64
4447 %idxprom83 = and i64 %conv82, 255
77 ; CHECK: add.2d v[[REG:[0-9]+]], v0, v1
88 ; CHECK: add d[[REG3:[0-9]+]], d[[REG]], d1
99 ; CHECK: sub d[[REG2:[0-9]+]], d[[REG]], d1
10 ; Without advanced copy optimization, we end up with cross register
11 ; banks copies that cannot be coalesced.
12 ; CHECK-NOOPT: fmov [[COPY_REG3:x[0-9]+]], d[[REG3]]
13 ; With advanced copy optimization, we end up with just one copy
14 ; to insert the computed high part into the V register.
15 ; CHECK-OPT-NOT: fmov
10 ; CHECK-NOT: fmov
1611 ; CHECK: fmov [[COPY_REG2:x[0-9]+]], d[[REG2]]
17 ; CHECK-NOOPT: fmov d0, [[COPY_REG3]]
18 ; CHECK-OPT-NOT: fmov
12 ; CHECK-NOT: fmov
1913 ; CHECK: ins.d v0[1], [[COPY_REG2]]
2014 ; CHECK-NEXT: ret
2115 ;
2317 ; GENERIC: add v[[REG:[0-9]+]].2d, v0.2d, v1.2d
2418 ; GENERIC: add d[[REG3:[0-9]+]], d[[REG]], d1
2519 ; GENERIC: sub d[[REG2:[0-9]+]], d[[REG]], d1
26 ; GENERIC-NOOPT: fmov [[COPY_REG3:x[0-9]+]], d[[REG3]]
27 ; GENERIC-OPT-NOT: fmov
20 ; GENERIC-NOT: fmov
2821 ; GENERIC: fmov [[COPY_REG2:x[0-9]+]], d[[REG2]]
29 ; GENERIC-NOOPT: fmov d0, [[COPY_REG3]]
30 ; GENERIC-OPT-NOT: fmov
22 ; GENERIC-NOT: fmov
3123 ; GENERIC: ins v0.d[1], [[COPY_REG2]]
3224 ; GENERIC-NEXT: ret
3325 %add = add <2 x i64> %a, %b
33 define i32 @t(i32 %a, i32 %b, i32 %c, i32 %d) nounwind ssp {
44 entry:
55 ; CHECK-LABEL: t:
6 ; CHECK: mov x0, [[REG1:x[0-9]+]]
7 ; CHECK: mov x1, [[REG2:x[0-9]+]]
6 ; CHECK: mov [[REG2:x[0-9]+]], x3
7 ; CHECK: mov [[REG1:x[0-9]+]], x2
8 ; CHECK: mov x0, x2
9 ; CHECK: mov x1, x3
810 ; CHECK: bl _foo
911 ; CHECK: mov x0, [[REG1]]
1012 ; CHECK: mov x1, [[REG2]]
488488
489489 ; CHECK-COMMON-LABEL: test_phi:
490490 ; CHECK-COMMON: mov x[[PTR:[0-9]+]], x0
491 ; CHECK-COMMON: ldr h[[AB:[0-9]+]], [x[[PTR]]]
491 ; CHECK-COMMON: ldr h[[AB:[0-9]+]], [x0]
492492 ; CHECK-COMMON: [[LOOP:LBB[0-9_]+]]:
493493 ; CHECK-COMMON: mov.16b v[[R:[0-9]+]], v[[AB]]
494494 ; CHECK-COMMON: ldr h[[AB]], [x[[PTR]]]
1616 %val = zext i1 %test to i32
1717 ; CHECK: cset {{[xw][0-9]+}}, ne
1818
19 ; CHECK: mov [[RHSCOPY:w[0-9]+]], [[RHS]]
20 ; CHECK: mov [[LHSCOPY:w[0-9]+]], [[LHS]]
21
1922 store i32 %val, i32* @var
2023
2124 call void @bar()
2427 ; Currently, the comparison is emitted again. An MSR/MRS pair would also be
2528 ; acceptable, but assuming the call preserves NZCV is not.
2629 br i1 %test, label %iftrue, label %iffalse
27 ; CHECK: cmp [[LHS]], [[RHS]]
30 ; CHECK: cmp [[LHSCOPY]], [[RHSCOPY]]
2831 ; CHECK: b.eq
2932
3033 iftrue:
77 define void @test(%struct1* %fde, i32 %fd, void (i32, i32, i8*)* %func, i8* %arg) {
88 ;CHECK-LABEL: test
99 entry:
10 ; A53: mov [[DATA:w[0-9]+]], w1
1110 ; A53: str q{{[0-9]+}}, {{.*}}
1211 ; A53: str q{{[0-9]+}}, {{.*}}
13 ; A53: str [[DATA]], {{.*}}
12 ; A53: str w1, {{.*}}
1413
1514 %0 = bitcast %struct1* %fde to i8*
1615 tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 40, i32 8, i1 false)
66 define void @test(i32 %px) {
77 ; CHECK_LABEL: test:
88 ; CHECK_LABEL: %entry
9 ; CHECK: subs
10 ; CHECK-NEXT: csel
9 ; CHECK: subs [[REG0:w[0-9]+]],
10 ; CHECK: csel {{w[0-9]+}}, wzr, [[REG0]]
1111 entry:
1212 %sub = add nsw i32 %px, -1
1313 %cmp = icmp slt i32 %px, 1
546546 ; GCN: s_mov_b32 s5, s32
547547 ; GCN: s_add_u32 s32, s32, 0x300
548548
549 ; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-9]+]], s14
550 ; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-9]+]], s15
551 ; GCN-DAG: s_mov_b32 [[SAVE_Z:s[0-9]+]], s16
549 ; GCN-DAG: s_mov_b32 [[SAVE_X:s[0-57-9][0-9]*]], s14
550 ; GCN-DAG: s_mov_b32 [[SAVE_Y:s[0-68-9][0-9]*]], s15
551 ; GCN-DAG: s_mov_b32 [[SAVE_Z:s[0-79][0-9]*]], s16
552552 ; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[6:7]
553553 ; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[8:9]
554554 ; GCN-DAG: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, s[10:11]
555555
556 ; GCN-DAG: s_mov_b32 s6, [[SAVE_X]]
557 ; GCN-DAG: s_mov_b32 s7, [[SAVE_Y]]
558 ; GCN-DAG: s_mov_b32 s8, [[SAVE_Z]]
556 ; GCN-DAG: s_mov_b32 s6, s14
557 ; GCN-DAG: s_mov_b32 s7, s15
558 ; GCN-DAG: s_mov_b32 s8, s16
559559 ; GCN: s_swappc_b64
560560
561561 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s5 offset:4
11 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
22
33 ; GCN-LABEL: {{^}}vgpr:
4 ; GCN: v_mov_b32_e32 v1, v0
5 ; GCN-DAG: v_add_f32_e32 v0, 1.0, v1
6 ; GCN-DAG: exp mrt0 v1, v1, v1, v1 done vm
7 ; GCN: s_waitcnt expcnt(0)
4 ; GCN-DAG: v_mov_b32_e32 v1, v0
5 ; GCN-DAG: exp mrt0 v0, v0, v0, v0 done vm
6 ; GCN: s_waitcnt expcnt(0)
7 ; GCN: v_add_f32_e32 v0, 1.0, v0
88 ; GCN-NOT: s_endpgm
99 define amdgpu_vs { float, float } @vgpr([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
1010 bb:
203203 }
204204
205205 ; GCN-LABEL: {{^}}both:
206 ; GCN: v_mov_b32_e32 v1, v0
207 ; GCN-DAG: exp mrt0 v1, v1, v1, v1 done vm
208 ; GCN-DAG: v_add_f32_e32 v0, 1.0, v1
206 ; GCN-DAG: exp mrt0 v0, v0, v0, v0 done vm
207 ; GCN-DAG: v_mov_b32_e32 v1, v0
208 ; GCN-DAG: s_mov_b32 s1, s2
209 ; GCN: s_waitcnt expcnt(0)
210 ; GCN: v_add_f32_e32 v0, 1.0, v0
209211 ; GCN-DAG: s_add_i32 s0, s3, 2
210 ; GCN-DAG: s_mov_b32 s1, s2
211 ; GCN: s_mov_b32 s2, s3
212 ; GCN: s_waitcnt expcnt(0)
212 ; GCN-DAG: s_mov_b32 s2, s3
213213 ; GCN-NOT: s_endpgm
214214 define amdgpu_vs { float, i32, float, i32, i32 } @both([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
215215 bb:
286286
287287 %pair = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst monotonic
288288 %oldval = extractvalue { i32, i1 } %pair, 0
289 ; CHECK-ARMV7: ldrex [[OLDVAL:r[0-9]+]], [r[[ADDR:[0-9]+]]]
289 ; CHECK-ARMV7: mov r[[ADDR:[0-9]+]], r0
290 ; CHECK-ARMV7: ldrex [[OLDVAL:r[0-9]+]], [r0]
290291 ; CHECK-ARMV7: cmp [[OLDVAL]], r1
291292 ; CHECK-ARMV7: bne [[FAIL_BB:\.?LBB[0-9]+_[0-9]+]]
292293 ; CHECK-ARMV7: dmb ish
304305 ; CHECK-ARMV7: dmb ish
305306 ; CHECK-ARMV7: bx lr
306307
307 ; CHECK-T2: ldrex [[OLDVAL:r[0-9]+]], [r[[ADDR:[0-9]+]]]
308 ; CHECK-T2: mov r[[ADDR:[0-9]+]], r0
309 ; CHECK-T2: ldrex [[OLDVAL:r[0-9]+]], [r0]
308310 ; CHECK-T2: cmp [[OLDVAL]], r1
309311 ; CHECK-T2: bne [[FAIL_BB:\.?LBB.*]]
310312 ; CHECK-T2: dmb ish
180180 ; CHECK-APPLE: beq
181181 ; CHECK-APPLE: mov r0, #16
182182 ; CHECK-APPLE: malloc
183 ; CHECK-APPLE: strb r{{.*}}, [{{.*}}[[ID]], #8]
183 ; CHECK-APPLE: strb r{{.*}}, [r0, #8]
184184 ; CHECK-APPLE: ble
185185 ; CHECK-APPLE: mov r8, [[ID]]
186186
164164 ; MMR3: subu16 $5, $[[T19]], $[[T20]]
165165
166166 ; MMR6: move $[[T0:[0-9]+]], $7
167 ; MMR6: sw $[[T0]], 8($sp)
167 ; MMR6: sw $7, 8($sp)
168168 ; MMR6: move $[[T1:[0-9]+]], $5
169169 ; MMR6: sw $4, 12($sp)
170170 ; MMR6: lw $[[T2:[0-9]+]], 48($sp)
1313 ret double %r
1414
1515 ; CHECK: @foo3
16 ; CHECK: xsnmsubadp [[REG:[0-9]+]], {{[0-9]+}}, [[REG]]
16 ; CHECK: fmr [[REG:[0-9]+]], [[REG2:[0-9]+]]
17 ; CHECK: xsnmsubadp [[REG]], {{[0-9]+}}, [[REG2]]
1718 ; CHECK: xsmaddmdp
1819 ; CHECK: xsmaddadp
1920 }
7474
7575 ; CHECK-DAG: mr [[REG:[0-9]+]], 3
7676 ; CHECK-DAG: li 0, 1076
77 ; CHECK: stw [[REG]],
77 ; CHECK-DAG: stw 3,
7878
7979 ; CHECK: #APP
8080 ; CHECK: sc
2222 ;CHECK-LABEL: straight_test:
2323 ; test1 may have been merged with entry
2424 ;CHECK: mr [[TAGREG:[0-9]+]], 3
25 ;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
25 ;CHECK: andi. {{[0-9]+}}, [[TAGREG:[0-9]+]], 1
2626 ;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[_0-9A-Za-z]+]]
2727 ;CHECK-NEXT: # %test2
2828 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
234234
235235 ; CHECK-LABEL: test_load_add_i32
236236 ; CHECK: membar
237 ; CHECK: add [[V:%[gilo][0-7]]], %o1, [[U:%[gilo][0-7]]]
238 ; CHECK: cas [%o0], [[V]], [[U]]
237 ; CHECK: mov [[U:%[gilo][0-7]]], [[V:%[gilo][0-7]]]
238 ; CHECK: add [[U:%[gilo][0-7]]], %o1, [[V2:%[gilo][0-7]]]
239 ; CHECK: cas [%o0], [[V]], [[V2]]
239240 ; CHECK: membar
240241 define zeroext i32 @test_load_add_i32(i32* %p, i32 zeroext %v) {
241242 entry:
597597 define i32 @b_to_bx(i32 %value) {
598598 ; CHECK-LABEL: b_to_bx:
599599 ; DISABLE: push {r7, lr}
600 ; CHECK: cmp r1, #49
600 ; CHECK: cmp r0, #49
601601 ; CHECK-NEXT: bgt [[ELSE_LABEL:LBB[0-9_]+]]
602602 ; ENABLE: push {r7, lr}
603603
66 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
77 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx
88 ; CHECK-NEXT: movl %ecx, %edx
9 ; CHECK-NEXT: imull %edx, %edx
9 ; CHECK-NEXT: imull %ecx, %edx
1010 ; CHECK-NEXT: imull %eax, %ecx
1111 ; CHECK-NEXT: imull %eax, %eax
1212 ; CHECK-NEXT: addl %edx, %eax
105105 ; CHECK-DAG: movl %edx, %[[r1:[^ ]*]]
106106 ; CHECK-DAG: movl 8(%ebp), %[[r2:[^ ]*]]
107107 ; CHECK-DAG: movl %[[r2]], 4(%esp)
108 ; CHECK-DAG: movl %[[r1]], (%esp)
108 ; CHECK-DAG: movl %edx, (%esp)
109109 ; CHECK: movl %esp, %[[reg:[^ ]*]]
110110 ; CHECK: pushl %[[reg]]
111111 ; CHECK: calll _addrof_i64
406406 ; SSE2-NEXT: pand %xmm0, %xmm2
407407 ; SSE2-NEXT: packuswb %xmm1, %xmm2
408408 ; SSE2-NEXT: packuswb %xmm10, %xmm2
409 ; SSE2-NEXT: movdqa %xmm2, %xmm1
410409 ; SSE2-NEXT: psrld $1, %xmm4
411410 ; SSE2-NEXT: psrld $1, %xmm12
412411 ; SSE2-NEXT: pand %xmm0, %xmm12
443442 ; SSE2-NEXT: movdqu %xmm7, (%rax)
444443 ; SSE2-NEXT: movdqu %xmm11, (%rax)
445444 ; SSE2-NEXT: movdqu %xmm13, (%rax)
446 ; SSE2-NEXT: movdqu %xmm1, (%rax)
445 ; SSE2-NEXT: movdqu %xmm2, (%rax)
447446 ; SSE2-NEXT: retq
448447 ;
449448 ; AVX1-LABEL: avg_v64i8:
1111 ; CHECK-NEXT: movq %rdx, %r14
1212 ; CHECK-NEXT: movq %rsi, %r15
1313 ; CHECK-NEXT: movq %rdi, %rbx
14 ; CHECK-NEXT: vmovaps (%rbx), %ymm0
14 ; CHECK-NEXT: vmovaps (%rdi), %ymm0
1515 ; CHECK-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
16 ; CHECK-NEXT: vmovaps (%r15), %ymm1
16 ; CHECK-NEXT: vmovaps (%rsi), %ymm1
1717 ; CHECK-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill
18 ; CHECK-NEXT: vmovaps (%r14), %ymm2
18 ; CHECK-NEXT: vmovaps (%rdx), %ymm2
1919 ; CHECK-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill
2020 ; CHECK-NEXT: callq dummy
2121 ; CHECK-NEXT: vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
88 ; CHECK-NEXT: pushq %rbx
99 ; CHECK-NEXT: subq $112, %rsp
1010 ; CHECK-NEXT: movq %rdi, %rbx
11 ; CHECK-NEXT: vmovups (%rbx), %zmm0
11 ; CHECK-NEXT: vmovups (%rdi), %zmm0
1212 ; CHECK-NEXT: vmovups %zmm0, (%rsp) ## 64-byte Spill
1313 ; CHECK-NEXT: vbroadcastss {{.*}}(%rip), %zmm1
14 ; CHECK-NEXT: vmovaps %zmm1, (%rbx)
14 ; CHECK-NEXT: vmovaps %zmm1, (%rdi)
1515 ; CHECK-NEXT: callq _Print__512
1616 ; CHECK-NEXT: vmovups (%rsp), %zmm0 ## 64-byte Reload
1717 ; CHECK-NEXT: callq _Print__512
465465 ; KNL_X32-NEXT: movl %edi, (%esp)
466466 ; KNL_X32-NEXT: calll _test11
467467 ; KNL_X32-NEXT: movl %eax, %ebx
468 ; KNL_X32-NEXT: movzbl %bl, %eax
468 ; KNL_X32-NEXT: movzbl %al, %eax
469469 ; KNL_X32-NEXT: movl %eax, {{[0-9]+}}(%esp)
470470 ; KNL_X32-NEXT: movl %esi, {{[0-9]+}}(%esp)
471471 ; KNL_X32-NEXT: movl %edi, (%esp)
11701170 ; KNL-NEXT: kmovw %esi, %k0
11711171 ; KNL-NEXT: kshiftlw $7, %k0, %k2
11721172 ; KNL-NEXT: kshiftrw $15, %k2, %k2
1173 ; KNL-NEXT: kmovw %k2, %eax
11741173 ; KNL-NEXT: kshiftlw $6, %k0, %k0
11751174 ; KNL-NEXT: kshiftrw $15, %k0, %k0
11761175 ; KNL-NEXT: kmovw %k0, %ecx
11831182 ; KNL-NEXT: vptestmq %zmm0, %zmm0, %k0
11841183 ; KNL-NEXT: kshiftlw $1, %k0, %k0
11851184 ; KNL-NEXT: kshiftrw $1, %k0, %k0
1186 ; KNL-NEXT: kmovw %eax, %k1
1187 ; KNL-NEXT: kshiftlw $7, %k1, %k1
1185 ; KNL-NEXT: kshiftlw $7, %k2, %k1
11881186 ; KNL-NEXT: korw %k1, %k0, %k1
11891187 ; KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
11901188 ; KNL-NEXT: vpmovqw %zmm0, %xmm0
11961194 ; SKX-NEXT: kmovd %esi, %k1
11971195 ; SKX-NEXT: kshiftlw $7, %k1, %k2
11981196 ; SKX-NEXT: kshiftrw $15, %k2, %k2
1199 ; SKX-NEXT: kmovd %k2, %eax
12001197 ; SKX-NEXT: kshiftlw $6, %k1, %k1
12011198 ; SKX-NEXT: kshiftrw $15, %k1, %k1
1202 ; SKX-NEXT: kmovd %k1, %ecx
12031199 ; SKX-NEXT: vpmovm2q %k0, %zmm0
1204 ; SKX-NEXT: kmovd %ecx, %k0
1205 ; SKX-NEXT: vpmovm2q %k0, %zmm1
1200 ; SKX-NEXT: vpmovm2q %k1, %zmm1
12061201 ; SKX-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7]
12071202 ; SKX-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
12081203 ; SKX-NEXT: vpmovq2m %zmm2, %k0
12091204 ; SKX-NEXT: kshiftlb $1, %k0, %k0
12101205 ; SKX-NEXT: kshiftrb $1, %k0, %k0
1211 ; SKX-NEXT: kmovd %eax, %k1
1212 ; SKX-NEXT: kshiftlb $7, %k1, %k1
1206 ; SKX-NEXT: kshiftlb $7, %k2, %k1
12131207 ; SKX-NEXT: korb %k1, %k0, %k0
12141208 ; SKX-NEXT: vpmovm2w %k0, %xmm0
12151209 ; SKX-NEXT: vzeroupper
12211215 ; AVX512BW-NEXT: kmovd %esi, %k0
12221216 ; AVX512BW-NEXT: kshiftlw $7, %k0, %k2
12231217 ; AVX512BW-NEXT: kshiftrw $15, %k2, %k2
1224 ; AVX512BW-NEXT: kmovd %k2, %eax
12251218 ; AVX512BW-NEXT: kshiftlw $6, %k0, %k0
12261219 ; AVX512BW-NEXT: kshiftrw $15, %k0, %k0
12271220 ; AVX512BW-NEXT: kmovd %k0, %ecx
12341227 ; AVX512BW-NEXT: vptestmq %zmm0, %zmm0, %k0
12351228 ; AVX512BW-NEXT: kshiftlw $1, %k0, %k0
12361229 ; AVX512BW-NEXT: kshiftrw $1, %k0, %k0
1237 ; AVX512BW-NEXT: kmovd %eax, %k1
1238 ; AVX512BW-NEXT: kshiftlw $7, %k1, %k1
1230 ; AVX512BW-NEXT: kshiftlw $7, %k2, %k1
12391231 ; AVX512BW-NEXT: korw %k1, %k0, %k0
12401232 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0
12411233 ; AVX512BW-NEXT: ## kill: %XMM0 %XMM0 %ZMM0
12481240 ; AVX512DQ-NEXT: kmovw %esi, %k1
12491241 ; AVX512DQ-NEXT: kshiftlw $7, %k1, %k2
12501242 ; AVX512DQ-NEXT: kshiftrw $15, %k2, %k2
1251 ; AVX512DQ-NEXT: kmovw %k2, %eax
12521243 ; AVX512DQ-NEXT: kshiftlw $6, %k1, %k1
12531244 ; AVX512DQ-NEXT: kshiftrw $15, %k1, %k1
1254 ; AVX512DQ-NEXT: kmovw %k1, %ecx
12551245 ; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
1256 ; AVX512DQ-NEXT: kmovw %ecx, %k0
1257 ; AVX512DQ-NEXT: vpmovm2q %k0, %zmm1
1246 ; AVX512DQ-NEXT: vpmovm2q %k1, %zmm1
12581247 ; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7]
12591248 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
12601249 ; AVX512DQ-NEXT: vpmovq2m %zmm2, %k0
12611250 ; AVX512DQ-NEXT: kshiftlb $1, %k0, %k0
12621251 ; AVX512DQ-NEXT: kshiftrb $1, %k0, %k0
1263 ; AVX512DQ-NEXT: kmovw %eax, %k1
1264 ; AVX512DQ-NEXT: kshiftlb $7, %k1, %k1
1252 ; AVX512DQ-NEXT: kshiftlb $7, %k2, %k1
12651253 ; AVX512DQ-NEXT: korb %k1, %k0, %k0
12661254 ; AVX512DQ-NEXT: vpmovm2q %k0, %zmm0
12671255 ; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
20022002 ; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm2, %ymm7, %ymm7
20032003 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7]
20042004 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
2005 ; AVX512F-32-NEXT: movl %esi, %eax
2005 ; AVX512F-32-NEXT: movl %ecx, %eax
20062006 ; AVX512F-32-NEXT: shrl $30, %eax
20072007 ; AVX512F-32-NEXT: kmovd %eax, %k1
20082008 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
20132013 ; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
20142014 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
20152015 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
2016 ; AVX512F-32-NEXT: movl %esi, %eax
2016 ; AVX512F-32-NEXT: movl %ecx, %eax
20172017 ; AVX512F-32-NEXT: shrl $31, %eax
20182018 ; AVX512F-32-NEXT: kmovd %eax, %k1
20192019 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
28862886 ; AVX512F-32-NEXT: vpblendvb %ymm3, %ymm2, %ymm7, %ymm7
28872887 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm7[0,1,2,3],zmm2[4,5,6,7]
28882888 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
2889 ; AVX512F-32-NEXT: movl %esi, %eax
2889 ; AVX512F-32-NEXT: movl %ecx, %eax
28902890 ; AVX512F-32-NEXT: shrl $30, %eax
28912891 ; AVX512F-32-NEXT: kmovd %eax, %k1
28922892 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
28972897 ; AVX512F-32-NEXT: vpblendvb %ymm7, %ymm3, %ymm2, %ymm2
28982898 ; AVX512F-32-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm3[4,5,6,7]
28992899 ; AVX512F-32-NEXT: vpmovb2m %zmm2, %k0
2900 ; AVX512F-32-NEXT: movl %esi, %eax
2900 ; AVX512F-32-NEXT: movl %ecx, %eax
29012901 ; AVX512F-32-NEXT: shrl $31, %eax
29022902 ; AVX512F-32-NEXT: kmovd %eax, %k1
29032903 ; AVX512F-32-NEXT: vpmovm2b %k1, %zmm2
3737 ; SSE2-LABEL: test_negative_zero_1:
3838 ; SSE2: # BB#0: # %entry
3939 ; SSE2-NEXT: movaps %xmm0, %xmm1
40 ; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
40 ; SSE2-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
4141 ; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
4242 ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4343 ; SSE2-NEXT: xorps %xmm2, %xmm2
230230 ; SSE-NEXT: cvtss2sd %xmm2, %xmm4
231231 ; SSE-NEXT: movshdup {{.*#+}} xmm5 = xmm2[1,1,3,3]
232232 ; SSE-NEXT: movaps %xmm2, %xmm6
233 ; SSE-NEXT: movhlps {{.*#+}} xmm6 = xmm6[1,1]
234 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
233 ; SSE-NEXT: movhlps {{.*#+}} xmm6 = xmm2[1],xmm6[1]
234 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm2[2,3]
235235 ; SSE-NEXT: movaps {{.*#+}} xmm7
236236 ; SSE-NEXT: movaps %xmm0, %xmm2
237237 ; SSE-NEXT: andps %xmm7, %xmm2
246246 ; SSE-NEXT: orps %xmm0, %xmm4
247247 ; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm4[0]
248248 ; SSE-NEXT: movaps %xmm1, %xmm0
249 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
249 ; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm1[1],xmm0[1]
250250 ; SSE-NEXT: andps %xmm7, %xmm0
251251 ; SSE-NEXT: cvtss2sd %xmm3, %xmm3
252252 ; SSE-NEXT: andps %xmm8, %xmm3
293293 ; SSE-NEXT: orps %xmm6, %xmm1
294294 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
295295 ; SSE-NEXT: movaps %xmm3, %xmm1
296 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
296 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm3[1],xmm1[1]
297297 ; SSE-NEXT: andps %xmm5, %xmm1
298298 ; SSE-NEXT: xorps %xmm6, %xmm6
299299 ; SSE-NEXT: cvtsd2ss %xmm2, %xmm6
1313 ; SSE: # BB#0:
1414 ; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
1515 ; SSE-NEXT: movaps %xmm0, %xmm2
16 ; SSE-NEXT: addss %xmm2, %xmm2
16 ; SSE-NEXT: addss %xmm0, %xmm2
1717 ; SSE-NEXT: mulss %xmm1, %xmm2
1818 ; SSE-NEXT: mulss %xmm0, %xmm0
1919 ; SSE-NEXT: mulss %xmm1, %xmm1
5757 ; SSE-LABEL: complex_square_f64:
5858 ; SSE: # BB#0:
5959 ; SSE-NEXT: movaps %xmm0, %xmm1
60 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
60 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
6161 ; SSE-NEXT: movaps %xmm0, %xmm2
62 ; SSE-NEXT: addsd %xmm2, %xmm2
62 ; SSE-NEXT: addsd %xmm0, %xmm2
6363 ; SSE-NEXT: mulsd %xmm1, %xmm2
6464 ; SSE-NEXT: mulsd %xmm0, %xmm0
6565 ; SSE-NEXT: mulsd %xmm1, %xmm1
160160 ; SSE-LABEL: complex_mul_f64:
161161 ; SSE: # BB#0:
162162 ; SSE-NEXT: movaps %xmm0, %xmm2
163 ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
163 ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
164164 ; SSE-NEXT: movaps %xmm1, %xmm3
165 ; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm3[1,1]
165 ; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1]
166166 ; SSE-NEXT: movaps %xmm3, %xmm4
167167 ; SSE-NEXT: mulsd %xmm0, %xmm4
168168 ; SSE-NEXT: mulsd %xmm1, %xmm0
317317 ; X64: # BB#0: # %entry
318318 ; X64-NEXT: movq %rdi, %rcx
319319 ; X64-NEXT: movabsq $6120523590596543007, %rdx # imm = 0x54F077C718E7C21F
320 ; X64-NEXT: movq %rcx, %rax
320 ; X64-NEXT: movq %rdi, %rax
321321 ; X64-NEXT: mulq %rdx
322322 ; X64-NEXT: shrq $12, %rdx
323323 ; X64-NEXT: imulq $12345, %rdx, %rax # imm = 0x3039
1717
1818 ; CHECK-LABEL: @test_fmaxf
1919 ; SSE: movaps %xmm0, %xmm2
20 ; SSE-NEXT: cmpunordss %xmm2, %xmm2
20 ; SSE-NEXT: cmpunordss %xmm0, %xmm2
2121 ; SSE-NEXT: movaps %xmm2, %xmm3
2222 ; SSE-NEXT: andps %xmm1, %xmm3
2323 ; SSE-NEXT: maxss %xmm0, %xmm1
4646
4747 ; CHECK-LABEL: @test_fmax
4848 ; SSE: movapd %xmm0, %xmm2
49 ; SSE-NEXT: cmpunordsd %xmm2, %xmm2
49 ; SSE-NEXT: cmpunordsd %xmm0, %xmm2
5050 ; SSE-NEXT: movapd %xmm2, %xmm3
5151 ; SSE-NEXT: andpd %xmm1, %xmm3
5252 ; SSE-NEXT: maxsd %xmm0, %xmm1
7373
7474 ; CHECK-LABEL: @test_intrinsic_fmaxf
7575 ; SSE: movaps %xmm0, %xmm2
76 ; SSE-NEXT: cmpunordss %xmm2, %xmm2
76 ; SSE-NEXT: cmpunordss %xmm0, %xmm2
7777 ; SSE-NEXT: movaps %xmm2, %xmm3
7878 ; SSE-NEXT: andps %xmm1, %xmm3
7979 ; SSE-NEXT: maxss %xmm0, %xmm1
9494
9595 ; CHECK-LABEL: @test_intrinsic_fmax
9696 ; SSE: movapd %xmm0, %xmm2
97 ; SSE-NEXT: cmpunordsd %xmm2, %xmm2
97 ; SSE-NEXT: cmpunordsd %xmm0, %xmm2
9898 ; SSE-NEXT: movapd %xmm2, %xmm3
9999 ; SSE-NEXT: andpd %xmm1, %xmm3
100100 ; SSE-NEXT: maxsd %xmm0, %xmm1
1717
1818 ; CHECK-LABEL: @test_fminf
1919 ; SSE: movaps %xmm0, %xmm2
20 ; SSE-NEXT: cmpunordss %xmm2, %xmm2
20 ; SSE-NEXT: cmpunordss %xmm0, %xmm2
2121 ; SSE-NEXT: movaps %xmm2, %xmm3
2222 ; SSE-NEXT: andps %xmm1, %xmm3
2323 ; SSE-NEXT: minss %xmm0, %xmm1
3939
4040 ; CHECK-LABEL: @test_fmin
4141 ; SSE: movapd %xmm0, %xmm2
42 ; SSE-NEXT: cmpunordsd %xmm2, %xmm2
42 ; SSE-NEXT: cmpunordsd %xmm0, %xmm2
4343 ; SSE-NEXT: movapd %xmm2, %xmm3
4444 ; SSE-NEXT: andpd %xmm1, %xmm3
4545 ; SSE-NEXT: minsd %xmm0, %xmm1
6666
6767 ; CHECK-LABEL: @test_intrinsic_fminf
6868 ; SSE: movaps %xmm0, %xmm2
69 ; SSE-NEXT: cmpunordss %xmm2, %xmm2
69 ; SSE-NEXT: cmpunordss %xmm0, %xmm2
7070 ; SSE-NEXT: movaps %xmm2, %xmm3
7171 ; SSE-NEXT: andps %xmm1, %xmm3
7272 ; SSE-NEXT: minss %xmm0, %xmm1
8686
8787 ; CHECK-LABEL: @test_intrinsic_fmin
8888 ; SSE: movapd %xmm0, %xmm2
89 ; SSE-NEXT: cmpunordsd %xmm2, %xmm2
89 ; SSE-NEXT: cmpunordsd %xmm0, %xmm2
9090 ; SSE-NEXT: movapd %xmm2, %xmm3
9191 ; SSE-NEXT: andpd %xmm1, %xmm3
9292 ; SSE-NEXT: minsd %xmm0, %xmm1
226226 ; CHECK: # BB#0: # %entry
227227 ; CHECK-NEXT: subq $40, %rsp
228228 ; CHECK-NEXT: movaps %xmm0, %xmm1
229 ; CHECK-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
229 ; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
230230 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
231231 ; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
232232 ; CHECK-NEXT: movq $0, (%rsp)
274274 ; CHECK: # BB#0: # %entry
275275 ; CHECK-NEXT: subq $40, %rsp
276276 ; CHECK-NEXT: movaps %xmm0, %xmm1
277 ; CHECK-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp)
277 ; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
278278 ; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
279279 ; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
280280 ; CHECK-NEXT: movq $0, (%rsp)
907907 ; SSE-LABEL: not_a_hsub_2:
908908 ; SSE: # BB#0:
909909 ; SSE-NEXT: movaps %xmm0, %xmm2
910 ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
910 ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
911911 ; SSE-NEXT: movaps %xmm0, %xmm3
912 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
912 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,3]
913913 ; SSE-NEXT: subss %xmm3, %xmm2
914914 ; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
915915 ; SSE-NEXT: subss %xmm3, %xmm0
916916 ; SSE-NEXT: movaps %xmm1, %xmm3
917 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
917 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3]
918918 ; SSE-NEXT: movaps %xmm1, %xmm4
919 ; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1]
919 ; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
920920 ; SSE-NEXT: subss %xmm4, %xmm3
921921 ; SSE-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
922922 ; SSE-NEXT: subss %xmm4, %xmm1
964964 ; SSE-LABEL: not_a_hsub_3:
965965 ; SSE: # BB#0:
966966 ; SSE-NEXT: movaps %xmm1, %xmm2
967 ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
967 ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
968968 ; SSE-NEXT: subsd %xmm2, %xmm1
969969 ; SSE-NEXT: movaps %xmm0, %xmm2
970 ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
970 ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
971971 ; SSE-NEXT: subsd %xmm0, %xmm2
972972 ; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
973973 ; SSE-NEXT: movapd %xmm2, %xmm0
102102 ; SSE-LABEL: test5_undef:
103103 ; SSE: # BB#0:
104104 ; SSE-NEXT: movaps %xmm0, %xmm1
105 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
105 ; SSE-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1]
106106 ; SSE-NEXT: addsd %xmm0, %xmm1
107107 ; SSE-NEXT: movapd %xmm1, %xmm0
108108 ; SSE-NEXT: retq
167167 ; SSE-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
168168 ; SSE-NEXT: addss %xmm0, %xmm1
169169 ; SSE-NEXT: movaps %xmm0, %xmm2
170 ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
170 ; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
171171 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
172172 ; SSE-NEXT: addss %xmm2, %xmm0
173173 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
385385 ; CHECK-LIBCALL-NEXT: pushq %rbx
386386 ; CHECK-LIBCALL-NEXT: subq $48, %rsp
387387 ; CHECK-LIBCALL-NEXT: movq %rdi, %rbx
388 ; CHECK-LIBCALL-NEXT: movzwl (%rbx), %edi
388 ; CHECK-LIBCALL-NEXT: movzwl (%rdi), %edi
389389 ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
390390 ; CHECK-LIBCALL-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
391391 ; CHECK-LIBCALL-NEXT: movzwl 2(%rbx), %edi
471471 ; CHECK-LIBCALL-NEXT: pushq %rbx
472472 ; CHECK-LIBCALL-NEXT: subq $16, %rsp
473473 ; CHECK-LIBCALL-NEXT: movq %rdi, %rbx
474 ; CHECK-LIBCALL-NEXT: movzwl 4(%rbx), %edi
474 ; CHECK-LIBCALL-NEXT: movzwl 4(%rdi), %edi
475475 ; CHECK-LIBCALL-NEXT: callq __gnu_h2f_ieee
476476 ; CHECK-LIBCALL-NEXT: movss %xmm0, {{[0-9]+}}(%rsp) # 4-byte Spill
477477 ; CHECK-LIBCALL-NEXT: movzwl 6(%rbx), %edi
656656 ; CHECK-I686-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) # 16-byte Spill
657657 ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %ebp
658658 ; CHECK-I686-NEXT: movaps %xmm0, %xmm1
659 ; CHECK-I686-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
659 ; CHECK-I686-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[2,3]
660660 ; CHECK-I686-NEXT: movss %xmm1, (%esp)
661661 ; CHECK-I686-NEXT: calll __gnu_f2h_ieee
662662 ; CHECK-I686-NEXT: movw %ax, %si
161161 ; CHECK-NEXT: fstpt (%esp)
162162 ; CHECK-NEXT: calll _ceil
163163 ; CHECK-NEXT: fld %st(0)
164 ; CHECK-NEXT: fxch %st(1)
164165 ; CHECK-NEXT: ## InlineAsm Start
165166 ; CHECK-NEXT: fistpl %st(0)
166167 ; CHECK-NEXT: ## InlineAsm End
2323 call void @foo()
2424 ; CHECK-LABEL: bar:
2525 ; CHECK: callq foo
26 ; CHECK-NEXT: movl %eax, %r15d
26 ; CHECK-NEXT: movl %edi, %r15d
2727 call void asm sideeffect "movl $0, %r12d", "{r15}~{r12}"(i32 %X)
2828 ret void
2929 }
2626
2727 ; X64-LABEL: print_framealloc_from_fp:
2828 ; X64: movq %rcx, %[[parent_fp:[a-z]+]]
29 ; X64: movl .Lalloc_func$frame_escape_0(%[[parent_fp]]), %edx
29 ; X64: movl .Lalloc_func$frame_escape_0(%rcx), %edx
3030 ; X64: leaq {{.*}}(%rip), %[[str:[a-z]+]]
3131 ; X64: movq %[[str]], %rcx
3232 ; X64: callq printf
158158 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
159159 ; X32-NEXT: pushl %esi
160160 ; X32-NEXT: movl %esi, %ebx
161 ; X32-NEXT: movl %ebx, {{[0-9]+}}(%esp) # 4-byte Spill
161 ; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
162162 ; X32-NEXT: pushl %edi
163163 ; X32-NEXT: movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
164164 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
751751 ; X32-NEXT: pushl $0
752752 ; X32-NEXT: pushl %edi
753753 ; X32-NEXT: movl %ebx, %esi
754 ; X32-NEXT: pushl %esi
755 ; X32-NEXT: pushl $0
756 ; X32-NEXT: pushl $0
757 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
758 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
759 ; X32-NEXT: pushl %eax
760 ; X32-NEXT: calll __multi3
761 ; X32-NEXT: addl $32, %esp
762 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
763 ; X32-NEXT: pushl $0
764 ; X32-NEXT: pushl $0
765 ; X32-NEXT: pushl $0
766 ; X32-NEXT: pushl $0
767 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
768 ; X32-NEXT: pushl %ebx
769 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
770 ; X32-NEXT: pushl %edi
771 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
772 ; X32-NEXT: pushl %esi
773 ; X32-NEXT: pushl %eax
774 ; X32-NEXT: calll __multi3
775 ; X32-NEXT: addl $32, %esp
776 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
777 ; X32-NEXT: pushl $0
778 ; X32-NEXT: pushl $0
779 ; X32-NEXT: pushl %ebx
780 ; X32-NEXT: pushl %edi
781 ; X32-NEXT: pushl $0
782 ; X32-NEXT: pushl $0
783 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
784 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
785 ; X32-NEXT: pushl %eax
786 ; X32-NEXT: calll __multi3
787 ; X32-NEXT: addl $32, %esp
788 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
789 ; X32-NEXT: pushl $0
790 ; X32-NEXT: pushl $0
791 ; X32-NEXT: pushl %ebx
792 ; X32-NEXT: pushl %edi
793 ; X32-NEXT: pushl $0
794 ; X32-NEXT: pushl $0
795 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
796 ; X32-NEXT: pushl %edi
797 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
798 ; X32-NEXT: pushl %esi
799 ; X32-NEXT: pushl %eax
800 ; X32-NEXT: calll __multi3
801 ; X32-NEXT: addl $32, %esp
802 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
803 ; X32-NEXT: pushl $0
804 ; X32-NEXT: pushl $0
805 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
806 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
807 ; X32-NEXT: pushl %ebx
808 ; X32-NEXT: pushl $0
809 ; X32-NEXT: pushl $0
810 ; X32-NEXT: pushl %edi
811 ; X32-NEXT: pushl %esi
812 ; X32-NEXT: pushl %eax
813 ; X32-NEXT: calll __multi3
814 ; X32-NEXT: addl $32, %esp
815 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
816 ; X32-NEXT: pushl $0
817 ; X32-NEXT: pushl $0
818 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
819 ; X32-NEXT: pushl %ebx
820 ; X32-NEXT: pushl $0
821 ; X32-NEXT: pushl $0
822 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
823 ; X32-NEXT: pushl %edi
824 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
825 ; X32-NEXT: pushl %esi
826 ; X32-NEXT: pushl %eax
827 ; X32-NEXT: calll __multi3
828 ; X32-NEXT: addl $32, %esp
829 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
830 ; X32-NEXT: pushl $0
831 ; X32-NEXT: pushl $0
832 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
833 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
834 ; X32-NEXT: pushl %ebx
835 ; X32-NEXT: pushl $0
836 ; X32-NEXT: pushl $0
837 ; X32-NEXT: pushl %edi
838 ; X32-NEXT: pushl %esi
839 ; X32-NEXT: pushl %eax
840 ; X32-NEXT: calll __multi3
841 ; X32-NEXT: addl $32, %esp
842 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
843 ; X32-NEXT: pushl $0
844 ; X32-NEXT: pushl $0
845 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
846 ; X32-NEXT: pushl %ebx
847 ; X32-NEXT: pushl $0
848 ; X32-NEXT: pushl $0
849 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
850 ; X32-NEXT: pushl %esi
851 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
852 ; X32-NEXT: pushl %ebx
853 ; X32-NEXT: pushl %eax
854 ; X32-NEXT: calll __multi3
855 ; X32-NEXT: addl $32, %esp
856 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
857 ; X32-NEXT: pushl $0
858 ; X32-NEXT: pushl $0
859 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
860 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
861 ; X32-NEXT: pushl %edi
862 ; X32-NEXT: pushl $0
863 ; X32-NEXT: pushl $0
864 ; X32-NEXT: pushl %esi
865 ; X32-NEXT: pushl %ebx
866 ; X32-NEXT: pushl %eax
867 ; X32-NEXT: calll __multi3
868 ; X32-NEXT: addl $32, %esp
869 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
870 ; X32-NEXT: pushl $0
871 ; X32-NEXT: pushl $0
872 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
873 ; X32-NEXT: pushl %ebx
874 ; X32-NEXT: pushl %edi
875 ; X32-NEXT: pushl $0
876 ; X32-NEXT: pushl $0
877 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
878 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
879 ; X32-NEXT: pushl %eax
880 ; X32-NEXT: calll __multi3
881 ; X32-NEXT: addl $32, %esp
882 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
883 ; X32-NEXT: pushl $0
884 ; X32-NEXT: pushl $0
885 ; X32-NEXT: pushl $0
886 ; X32-NEXT: pushl $0
887 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
888 ; X32-NEXT: pushl %edi
889 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
890 ; X32-NEXT: pushl %esi
891 ; X32-NEXT: pushl %ebx
892 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
893 ; X32-NEXT: pushl %eax
894 ; X32-NEXT: calll __multi3
895 ; X32-NEXT: addl $32, %esp
896 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
897 ; X32-NEXT: pushl $0
898 ; X32-NEXT: pushl $0
899 ; X32-NEXT: pushl %edi
754 ; X32-NEXT: pushl %ebx
755 ; X32-NEXT: pushl $0
756 ; X32-NEXT: pushl $0
757 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
758 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
759 ; X32-NEXT: pushl %eax
760 ; X32-NEXT: calll __multi3
761 ; X32-NEXT: addl $32, %esp
762 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
763 ; X32-NEXT: pushl $0
764 ; X32-NEXT: pushl $0
765 ; X32-NEXT: pushl $0
766 ; X32-NEXT: pushl $0
767 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
768 ; X32-NEXT: pushl %ebx
769 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
770 ; X32-NEXT: pushl %edi
771 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
772 ; X32-NEXT: pushl %esi
773 ; X32-NEXT: pushl %eax
774 ; X32-NEXT: calll __multi3
775 ; X32-NEXT: addl $32, %esp
776 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
777 ; X32-NEXT: pushl $0
778 ; X32-NEXT: pushl $0
779 ; X32-NEXT: pushl %ebx
780 ; X32-NEXT: pushl %edi
781 ; X32-NEXT: pushl $0
782 ; X32-NEXT: pushl $0
783 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
784 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
785 ; X32-NEXT: pushl %eax
786 ; X32-NEXT: calll __multi3
787 ; X32-NEXT: addl $32, %esp
788 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
789 ; X32-NEXT: pushl $0
790 ; X32-NEXT: pushl $0
791 ; X32-NEXT: pushl %ebx
792 ; X32-NEXT: pushl %edi
793 ; X32-NEXT: pushl $0
794 ; X32-NEXT: pushl $0
795 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
796 ; X32-NEXT: pushl %edi
797 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
798 ; X32-NEXT: pushl %esi
799 ; X32-NEXT: pushl %eax
800 ; X32-NEXT: calll __multi3
801 ; X32-NEXT: addl $32, %esp
802 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
803 ; X32-NEXT: pushl $0
804 ; X32-NEXT: pushl $0
805 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
806 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
807 ; X32-NEXT: pushl %ebx
808 ; X32-NEXT: pushl $0
809 ; X32-NEXT: pushl $0
810 ; X32-NEXT: pushl %edi
811 ; X32-NEXT: pushl %esi
812 ; X32-NEXT: pushl %eax
813 ; X32-NEXT: calll __multi3
814 ; X32-NEXT: addl $32, %esp
815 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
816 ; X32-NEXT: pushl $0
817 ; X32-NEXT: pushl $0
818 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
819 ; X32-NEXT: pushl %ebx
820 ; X32-NEXT: pushl $0
821 ; X32-NEXT: pushl $0
822 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
823 ; X32-NEXT: pushl %edi
824 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
825 ; X32-NEXT: pushl %esi
826 ; X32-NEXT: pushl %eax
827 ; X32-NEXT: calll __multi3
828 ; X32-NEXT: addl $32, %esp
829 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
830 ; X32-NEXT: pushl $0
831 ; X32-NEXT: pushl $0
832 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
833 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
834 ; X32-NEXT: pushl %ebx
835 ; X32-NEXT: pushl $0
836 ; X32-NEXT: pushl $0
837 ; X32-NEXT: pushl %edi
838 ; X32-NEXT: pushl %esi
839 ; X32-NEXT: pushl %eax
840 ; X32-NEXT: calll __multi3
841 ; X32-NEXT: addl $32, %esp
842 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
843 ; X32-NEXT: pushl $0
844 ; X32-NEXT: pushl $0
845 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
846 ; X32-NEXT: pushl %ebx
847 ; X32-NEXT: pushl $0
848 ; X32-NEXT: pushl $0
849 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
850 ; X32-NEXT: pushl %esi
851 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
852 ; X32-NEXT: pushl %ebx
853 ; X32-NEXT: pushl %eax
854 ; X32-NEXT: calll __multi3
855 ; X32-NEXT: addl $32, %esp
856 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
857 ; X32-NEXT: pushl $0
858 ; X32-NEXT: pushl $0
859 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
860 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
861 ; X32-NEXT: pushl %edi
862 ; X32-NEXT: pushl $0
863 ; X32-NEXT: pushl $0
864 ; X32-NEXT: pushl %esi
865 ; X32-NEXT: pushl %ebx
866 ; X32-NEXT: pushl %eax
867 ; X32-NEXT: calll __multi3
868 ; X32-NEXT: addl $32, %esp
869 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
870 ; X32-NEXT: pushl $0
871 ; X32-NEXT: pushl $0
872 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
873 ; X32-NEXT: pushl %ebx
874 ; X32-NEXT: pushl %edi
875 ; X32-NEXT: pushl $0
876 ; X32-NEXT: pushl $0
877 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
878 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
879 ; X32-NEXT: pushl %eax
880 ; X32-NEXT: calll __multi3
881 ; X32-NEXT: addl $32, %esp
882 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
883 ; X32-NEXT: pushl $0
884 ; X32-NEXT: pushl $0
885 ; X32-NEXT: pushl $0
886 ; X32-NEXT: pushl $0
887 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
888 ; X32-NEXT: pushl %edi
889 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
890 ; X32-NEXT: pushl %esi
891 ; X32-NEXT: pushl %ebx
892 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
893 ; X32-NEXT: pushl %eax
894 ; X32-NEXT: calll __multi3
895 ; X32-NEXT: addl $32, %esp
896 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
897 ; X32-NEXT: pushl $0
898 ; X32-NEXT: pushl $0
899 ; X32-NEXT: pushl %edi
900 ; X32-NEXT: pushl %esi
901 ; X32-NEXT: pushl $0
902 ; X32-NEXT: pushl $0
903 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
904 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
905 ; X32-NEXT: pushl %eax
906 ; X32-NEXT: calll __multi3
907 ; X32-NEXT: addl $32, %esp
908 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
909 ; X32-NEXT: pushl $0
910 ; X32-NEXT: pushl $0
911 ; X32-NEXT: pushl %edi
912 ; X32-NEXT: pushl %esi
913 ; X32-NEXT: pushl $0
914 ; X32-NEXT: pushl $0
915 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
916 ; X32-NEXT: pushl %esi
917 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
918 ; X32-NEXT: pushl %ebx
919 ; X32-NEXT: pushl %eax
920 ; X32-NEXT: calll __multi3
921 ; X32-NEXT: addl $32, %esp
922 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
923 ; X32-NEXT: pushl $0
924 ; X32-NEXT: pushl $0
925 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
926 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
927 ; X32-NEXT: pushl %edi
928 ; X32-NEXT: pushl $0
929 ; X32-NEXT: pushl $0
930 ; X32-NEXT: pushl %esi
931 ; X32-NEXT: pushl %ebx
932 ; X32-NEXT: pushl %eax
933 ; X32-NEXT: calll __multi3
934 ; X32-NEXT: addl $32, %esp
935 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
936 ; X32-NEXT: pushl $0
937 ; X32-NEXT: pushl $0
938 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
939 ; X32-NEXT: pushl %edi
940 ; X32-NEXT: pushl $0
941 ; X32-NEXT: pushl $0
942 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
943 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
944 ; X32-NEXT: pushl %eax
945 ; X32-NEXT: calll __multi3
946 ; X32-NEXT: addl $32, %esp
947 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
948 ; X32-NEXT: pushl $0
949 ; X32-NEXT: pushl $0
950 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
951 ; X32-NEXT: pushl %edi
952 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
953 ; X32-NEXT: pushl %esi
954 ; X32-NEXT: pushl $0
955 ; X32-NEXT: pushl $0
956 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
957 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
958 ; X32-NEXT: pushl %eax
959 ; X32-NEXT: calll __multi3
960 ; X32-NEXT: addl $32, %esp
961 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
962 ; X32-NEXT: pushl $0
963 ; X32-NEXT: pushl $0
964 ; X32-NEXT: pushl %edi
965 ; X32-NEXT: pushl %esi
966 ; X32-NEXT: pushl $0
967 ; X32-NEXT: pushl $0
968 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
969 ; X32-NEXT: pushl %edi
970 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
971 ; X32-NEXT: pushl %esi
972 ; X32-NEXT: pushl %eax
973 ; X32-NEXT: calll __multi3
974 ; X32-NEXT: addl $32, %esp
975 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
976 ; X32-NEXT: pushl $0
977 ; X32-NEXT: pushl $0
978 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
979 ; X32-NEXT: pushl %ebx
980 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
981 ; X32-NEXT: pushl $0
982 ; X32-NEXT: pushl $0
983 ; X32-NEXT: pushl %edi
984 ; X32-NEXT: pushl %esi
985 ; X32-NEXT: pushl %eax
986 ; X32-NEXT: calll __multi3
987 ; X32-NEXT: addl $32, %esp
988 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
989 ; X32-NEXT: pushl $0
990 ; X32-NEXT: pushl $0
991 ; X32-NEXT: pushl %ebx
992 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
993 ; X32-NEXT: pushl $0
994 ; X32-NEXT: pushl $0
995 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
996 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
997 ; X32-NEXT: pushl %eax
998 ; X32-NEXT: calll __multi3
999 ; X32-NEXT: addl $32, %esp
1000 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1001 ; X32-NEXT: pushl $0
1002 ; X32-NEXT: pushl $0
1003 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1004 ; X32-NEXT: pushl %edi
1005 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1006 ; X32-NEXT: pushl %esi
1007 ; X32-NEXT: pushl $0
1008 ; X32-NEXT: pushl $0
1009 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1010 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1011 ; X32-NEXT: pushl %eax
1012 ; X32-NEXT: calll __multi3
1013 ; X32-NEXT: addl $32, %esp
1014 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1015 ; X32-NEXT: pushl $0
1016 ; X32-NEXT: pushl $0
1017 ; X32-NEXT: pushl %edi
1018 ; X32-NEXT: pushl %esi
1019 ; X32-NEXT: pushl $0
1020 ; X32-NEXT: pushl $0
1021 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1022 ; X32-NEXT: pushl %edi
1023 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
1024 ; X32-NEXT: pushl %ebx
1025 ; X32-NEXT: pushl %eax
1026 ; X32-NEXT: calll __multi3
1027 ; X32-NEXT: addl $32, %esp
1028 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1029 ; X32-NEXT: pushl $0
1030 ; X32-NEXT: pushl $0
1031 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1032 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1033 ; X32-NEXT: pushl %esi
1034 ; X32-NEXT: pushl $0
1035 ; X32-NEXT: pushl $0
1036 ; X32-NEXT: pushl %edi
1037 ; X32-NEXT: pushl %ebx
1038 ; X32-NEXT: pushl %eax
1039 ; X32-NEXT: calll __multi3
1040 ; X32-NEXT: addl $32, %esp
1041 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1042 ; X32-NEXT: pushl $0
1043 ; X32-NEXT: pushl $0
1044 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1045 ; X32-NEXT: pushl %esi
1046 ; X32-NEXT: pushl $0
1047 ; X32-NEXT: pushl $0
1048 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1049 ; X32-NEXT: pushl %edi
1050 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1051 ; X32-NEXT: pushl %eax
1052 ; X32-NEXT: calll __multi3
1053 ; X32-NEXT: addl $32, %esp
1054 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1055 ; X32-NEXT: pushl $0
1056 ; X32-NEXT: pushl $0
1057 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1058 ; X32-NEXT: pushl %esi
1059 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
1060 ; X32-NEXT: pushl %ebx
1061 ; X32-NEXT: pushl $0
1062 ; X32-NEXT: pushl $0
1063 ; X32-NEXT: pushl %edi
1064 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1065 ; X32-NEXT: pushl %eax
1066 ; X32-NEXT: calll __multi3
1067 ; X32-NEXT: addl $32, %esp
1068 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1069 ; X32-NEXT: pushl $0
1070 ; X32-NEXT: pushl $0
1071 ; X32-NEXT: pushl %esi
1072 ; X32-NEXT: pushl %ebx
1073 ; X32-NEXT: pushl $0
1074 ; X32-NEXT: pushl $0
1075 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1076 ; X32-NEXT: pushl %esi
1077 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1078 ; X32-NEXT: pushl %eax
1079 ; X32-NEXT: calll __multi3
1080 ; X32-NEXT: addl $32, %esp
1081 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1082 ; X32-NEXT: pushl $0
1083 ; X32-NEXT: pushl $0
1084 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
1085 ; X32-NEXT: pushl %ebx
1086 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1087 ; X32-NEXT: pushl %edi
1088 ; X32-NEXT: pushl $0
1089 ; X32-NEXT: pushl $0
1090 ; X32-NEXT: pushl %esi
1091 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1092 ; X32-NEXT: pushl %eax
1093 ; X32-NEXT: calll __multi3
1094 ; X32-NEXT: addl $32, %esp
1095 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1096 ; X32-NEXT: pushl $0
1097 ; X32-NEXT: pushl $0
1098 ; X32-NEXT: pushl %ebx
1099 ; X32-NEXT: movl %ebx, %esi
1100 ; X32-NEXT: pushl %edi
1101 ; X32-NEXT: pushl $0
1102 ; X32-NEXT: pushl $0
1103 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1104 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1105 ; X32-NEXT: pushl %eax
1106 ; X32-NEXT: calll __multi3
1107 ; X32-NEXT: addl $32, %esp
1108 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1109 ; X32-NEXT: pushl $0
1110 ; X32-NEXT: pushl $0
1111 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1112 ; X32-NEXT: pushl %edi
1113 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
1114 ; X32-NEXT: pushl %ebx
1115 ; X32-NEXT: pushl $0
1116 ; X32-NEXT: pushl $0
1117 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1118 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1119 ; X32-NEXT: pushl %eax
1120 ; X32-NEXT: calll __multi3
1121 ; X32-NEXT: addl $32, %esp
1122 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1123 ; X32-NEXT: pushl $0
1124 ; X32-NEXT: pushl $0
1125 ; X32-NEXT: pushl %edi
1126 ; X32-NEXT: pushl %ebx
1127 ; X32-NEXT: pushl $0
1128 ; X32-NEXT: pushl $0
1129 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
1130 ; X32-NEXT: pushl %ebx
1131 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1132 ; X32-NEXT: pushl %edi
1133 ; X32-NEXT: pushl %eax
1134 ; X32-NEXT: calll __multi3
1135 ; X32-NEXT: addl $32, %esp
1136 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1137 ; X32-NEXT: pushl $0
1138 ; X32-NEXT: pushl $0
1139 ; X32-NEXT: pushl %esi
1140 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1141 ; X32-NEXT: pushl %esi
1142 ; X32-NEXT: pushl $0
1143 ; X32-NEXT: pushl $0
1144 ; X32-NEXT: pushl %ebx
1145 ; X32-NEXT: pushl %edi
1146 ; X32-NEXT: pushl %eax
1147 ; X32-NEXT: calll __multi3
1148 ; X32-NEXT: addl $32, %esp
1149 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1150 ; X32-NEXT: pushl $0
1151 ; X32-NEXT: pushl $0
1152 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1153 ; X32-NEXT: pushl %esi
1154 ; X32-NEXT: pushl $0
1155 ; X32-NEXT: pushl $0
1156 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1157 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1158 ; X32-NEXT: pushl %eax
1159 ; X32-NEXT: calll __multi3
1160 ; X32-NEXT: addl $32, %esp
1161 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1162 ; X32-NEXT: pushl $0
1163 ; X32-NEXT: pushl $0
1164 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1165 ; X32-NEXT: pushl %edi
1166 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1167 ; X32-NEXT: pushl %esi
1168 ; X32-NEXT: pushl $0
1169 ; X32-NEXT: pushl $0
1170 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1171 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1172 ; X32-NEXT: pushl %eax
1173 ; X32-NEXT: calll __multi3
1174 ; X32-NEXT: addl $32, %esp
1175 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1176 ; X32-NEXT: pushl $0
1177 ; X32-NEXT: pushl $0
1178 ; X32-NEXT: pushl %edi
1179 ; X32-NEXT: pushl %esi
1180 ; X32-NEXT: pushl $0
1181 ; X32-NEXT: pushl $0
1182 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1183 ; X32-NEXT: pushl %edi
1184 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1185 ; X32-NEXT: pushl %esi
1186 ; X32-NEXT: pushl %eax
1187 ; X32-NEXT: calll __multi3
1188 ; X32-NEXT: addl $32, %esp
1189 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1190 ; X32-NEXT: pushl $0
1191 ; X32-NEXT: pushl $0
1192 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1193 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
1194 ; X32-NEXT: pushl %ebx
1195 ; X32-NEXT: pushl $0
1196 ; X32-NEXT: pushl $0
1197 ; X32-NEXT: pushl %edi
1198 ; X32-NEXT: pushl %esi
1199 ; X32-NEXT: pushl %eax
1200 ; X32-NEXT: calll __multi3
1201 ; X32-NEXT: addl $32, %esp
1202 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1203 ; X32-NEXT: pushl $0
1204 ; X32-NEXT: pushl $0
1205 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1206 ; X32-NEXT: pushl %esi
1207 ; X32-NEXT: pushl %ebx
1208 ; X32-NEXT: pushl $0
1209 ; X32-NEXT: pushl $0
1210 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1211 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1212 ; X32-NEXT: pushl %eax
1213 ; X32-NEXT: calll __multi3
1214 ; X32-NEXT: addl $32, %esp
1215 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1216 ; X32-NEXT: pushl $0
1217 ; X32-NEXT: pushl $0
1218 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
1219 ; X32-NEXT: pushl %ebx
1220 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1221 ; X32-NEXT: pushl %edi
1222 ; X32-NEXT: pushl $0
1223 ; X32-NEXT: pushl $0
1224 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1225 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1226 ; X32-NEXT: pushl %eax
1227 ; X32-NEXT: calll __multi3
1228 ; X32-NEXT: addl $32, %esp
1229 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1230 ; X32-NEXT: pushl $0
1231 ; X32-NEXT: pushl $0
1232 ; X32-NEXT: pushl %ebx
1233 ; X32-NEXT: pushl %edi
1234 ; X32-NEXT: pushl $0
1235 ; X32-NEXT: pushl $0
1236 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
1237 ; X32-NEXT: pushl %ebx
1238 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1239 ; X32-NEXT: pushl %edi
1240 ; X32-NEXT: pushl %eax
1241 ; X32-NEXT: calll __multi3
1242 ; X32-NEXT: addl $32, %esp
1243 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1244 ; X32-NEXT: pushl $0
1245 ; X32-NEXT: pushl $0
1246 ; X32-NEXT: pushl %esi
1247 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1248 ; X32-NEXT: pushl $0
1249 ; X32-NEXT: pushl $0
1250 ; X32-NEXT: pushl %ebx
1251 ; X32-NEXT: pushl %edi
1252 ; X32-NEXT: pushl %eax
1253 ; X32-NEXT: calll __multi3
1254 ; X32-NEXT: addl $32, %esp
1255 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1256 ; X32-NEXT: pushl $0
1257 ; X32-NEXT: pushl $0
1258 ; X32-NEXT: pushl %esi
1259 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1260 ; X32-NEXT: pushl $0
1261 ; X32-NEXT: pushl $0
1262 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1263 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1264 ; X32-NEXT: pushl %edi
1265 ; X32-NEXT: pushl %eax
1266 ; X32-NEXT: calll __multi3
1267 ; X32-NEXT: addl $32, %esp
1268 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1269 ; X32-NEXT: pushl $0
1270 ; X32-NEXT: pushl $0
1271 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
1272 ; X32-NEXT: pushl %ebx
1273 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1274 ; X32-NEXT: pushl %esi
1275 ; X32-NEXT: pushl $0
1276 ; X32-NEXT: pushl $0
1277 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1278 ; X32-NEXT: pushl %edi
1279 ; X32-NEXT: pushl %eax
1280 ; X32-NEXT: calll __multi3
1281 ; X32-NEXT: addl $32, %esp
1282 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1283 ; X32-NEXT: pushl $0
1284 ; X32-NEXT: pushl $0
1285 ; X32-NEXT: pushl %ebx
1286 ; X32-NEXT: pushl %esi
1287 ; X32-NEXT: pushl $0
1288 ; X32-NEXT: pushl $0
1289 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
1290 ; X32-NEXT: pushl %ebx
1291 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1292 ; X32-NEXT: pushl %eax
1293 ; X32-NEXT: calll __multi3
1294 ; X32-NEXT: addl $32, %esp
1295 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1296 ; X32-NEXT: pushl $0
1297 ; X32-NEXT: pushl $0
1298 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1299 ; X32-NEXT: pushl %edi
1300 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1301 ; X32-NEXT: pushl %esi
1302 ; X32-NEXT: pushl $0
1303 ; X32-NEXT: pushl $0
1304 ; X32-NEXT: pushl %ebx
1305 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1306 ; X32-NEXT: pushl %eax
1307 ; X32-NEXT: calll __multi3
1308 ; X32-NEXT: addl $32, %esp
1309 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1310 ; X32-NEXT: pushl $0
1311 ; X32-NEXT: pushl $0
1312 ; X32-NEXT: pushl %edi
1313 ; X32-NEXT: pushl %esi
1314 ; X32-NEXT: pushl $0
1315 ; X32-NEXT: pushl $0
1316 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1317 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1318 ; X32-NEXT: pushl %eax
1319 ; X32-NEXT: calll __multi3
1320 ; X32-NEXT: addl $32, %esp
1321 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1322 ; X32-NEXT: pushl $0
1323 ; X32-NEXT: pushl $0
1324 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1325 ; X32-NEXT: pushl %edi
1326 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1327 ; X32-NEXT: pushl %esi
1328 ; X32-NEXT: pushl $0
1329 ; X32-NEXT: pushl $0
1330 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1331 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1332 ; X32-NEXT: pushl %eax
1333 ; X32-NEXT: calll __multi3
1334 ; X32-NEXT: addl $32, %esp
1335 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1336 ; X32-NEXT: pushl $0
1337 ; X32-NEXT: pushl $0
1338 ; X32-NEXT: pushl %edi
1339 ; X32-NEXT: pushl %esi
1340 ; X32-NEXT: pushl $0
1341 ; X32-NEXT: pushl $0
1342 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1343 ; X32-NEXT: pushl %esi
1344 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
1345 ; X32-NEXT: pushl %ebx
1346 ; X32-NEXT: pushl %eax
1347 ; X32-NEXT: calll __multi3
1348 ; X32-NEXT: addl $32, %esp
1349 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1350 ; X32-NEXT: pushl $0
1351 ; X32-NEXT: pushl $0
1352 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1353 ; X32-NEXT: pushl %edi
1354 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1355 ; X32-NEXT: pushl $0
1356 ; X32-NEXT: pushl $0
1357 ; X32-NEXT: pushl %esi
1358 ; X32-NEXT: pushl %ebx
1359 ; X32-NEXT: pushl %eax
1360 ; X32-NEXT: calll __multi3
1361 ; X32-NEXT: addl $32, %esp
1362 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1363 ; X32-NEXT: pushl $0
1364 ; X32-NEXT: pushl $0
9001365 ; X32-NEXT: movl %edi, %ebx
901 ; X32-NEXT: pushl %esi
902 ; X32-NEXT: pushl $0
903 ; X32-NEXT: pushl $0
904 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
905 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
906 ; X32-NEXT: pushl %eax
907 ; X32-NEXT: calll __multi3
908 ; X32-NEXT: addl $32, %esp
909 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
910 ; X32-NEXT: pushl $0
911 ; X32-NEXT: pushl $0
912 ; X32-NEXT: pushl %ebx
913 ; X32-NEXT: pushl %esi
914 ; X32-NEXT: pushl $0
915 ; X32-NEXT: pushl $0
916 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
917 ; X32-NEXT: pushl %esi
918 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
919 ; X32-NEXT: pushl %ebx
920 ; X32-NEXT: pushl %eax
921 ; X32-NEXT: calll __multi3
922 ; X32-NEXT: addl $32, %esp
923 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
924 ; X32-NEXT: pushl $0
925 ; X32-NEXT: pushl $0
926 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
927 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
928 ; X32-NEXT: pushl %edi
929 ; X32-NEXT: pushl $0
930 ; X32-NEXT: pushl $0
931 ; X32-NEXT: pushl %esi
932 ; X32-NEXT: pushl %ebx
933 ; X32-NEXT: pushl %eax
934 ; X32-NEXT: calll __multi3
935 ; X32-NEXT: addl $32, %esp
936 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
937 ; X32-NEXT: pushl $0
938 ; X32-NEXT: pushl $0
939 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
940 ; X32-NEXT: pushl %edi
941 ; X32-NEXT: pushl $0
942 ; X32-NEXT: pushl $0
943 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
944 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
945 ; X32-NEXT: pushl %eax
946 ; X32-NEXT: calll __multi3
947 ; X32-NEXT: addl $32, %esp
948 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
949 ; X32-NEXT: pushl $0
950 ; X32-NEXT: pushl $0
951 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
952 ; X32-NEXT: pushl %edi
953 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
954 ; X32-NEXT: pushl %esi
955 ; X32-NEXT: pushl $0
956 ; X32-NEXT: pushl $0
957 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
958 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
959 ; X32-NEXT: pushl %eax
960 ; X32-NEXT: calll __multi3
961 ; X32-NEXT: addl $32, %esp
962 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
963 ; X32-NEXT: pushl $0
964 ; X32-NEXT: pushl $0
965 ; X32-NEXT: pushl %edi
966 ; X32-NEXT: pushl %esi
967 ; X32-NEXT: pushl $0
968 ; X32-NEXT: pushl $0
969 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
970 ; X32-NEXT: pushl %edi
971 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
972 ; X32-NEXT: pushl %esi
973 ; X32-NEXT: pushl %eax
974 ; X32-NEXT: calll __multi3
975 ; X32-NEXT: addl $32, %esp
976 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
977 ; X32-NEXT: pushl $0
978 ; X32-NEXT: pushl $0
979 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
980 ; X32-NEXT: pushl %ebx
981 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
982 ; X32-NEXT: pushl $0
983 ; X32-NEXT: pushl $0
984 ; X32-NEXT: pushl %edi
985 ; X32-NEXT: pushl %esi
986 ; X32-NEXT: pushl %eax
987 ; X32-NEXT: calll __multi3
988 ; X32-NEXT: addl $32, %esp
989 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
990 ; X32-NEXT: pushl $0
991 ; X32-NEXT: pushl $0
992 ; X32-NEXT: pushl %ebx
993 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
994 ; X32-NEXT: pushl $0
995 ; X32-NEXT: pushl $0
996 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
997 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
998 ; X32-NEXT: pushl %eax
999 ; X32-NEXT: calll __multi3
1000 ; X32-NEXT: addl $32, %esp
1001 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1002 ; X32-NEXT: pushl $0
1003 ; X32-NEXT: pushl $0
1004 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1005 ; X32-NEXT: pushl %edi
1006 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1007 ; X32-NEXT: pushl %esi
1008 ; X32-NEXT: pushl $0
1009 ; X32-NEXT: pushl $0
1010 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1011 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1012 ; X32-NEXT: pushl %eax
1013 ; X32-NEXT: calll __multi3
1014 ; X32-NEXT: addl $32, %esp
1015 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1016 ; X32-NEXT: pushl $0
1017 ; X32-NEXT: pushl $0
1018 ; X32-NEXT: pushl %edi
1019 ; X32-NEXT: pushl %esi
1020 ; X32-NEXT: pushl $0
1021 ; X32-NEXT: pushl $0
1022 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1023 ; X32-NEXT: pushl %edi
1024 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
1025 ; X32-NEXT: pushl %ebx
1026 ; X32-NEXT: pushl %eax
1027 ; X32-NEXT: calll __multi3
1028 ; X32-NEXT: addl $32, %esp
1029 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1030 ; X32-NEXT: pushl $0
1031 ; X32-NEXT: pushl $0
1032 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1033 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1034 ; X32-NEXT: pushl %esi
1035 ; X32-NEXT: pushl $0
1036 ; X32-NEXT: pushl $0
1037 ; X32-NEXT: pushl %edi
1038 ; X32-NEXT: pushl %ebx
1039 ; X32-NEXT: pushl %eax
1040 ; X32-NEXT: calll __multi3
1041 ; X32-NEXT: addl $32, %esp
1042 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1043 ; X32-NEXT: pushl $0
1044 ; X32-NEXT: pushl $0
1045 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1046 ; X32-NEXT: pushl %esi
1047 ; X32-NEXT: pushl $0
1048 ; X32-NEXT: pushl $0
1049 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1050 ; X32-NEXT: pushl %edi
1051 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1052 ; X32-NEXT: pushl %eax
1053 ; X32-NEXT: calll __multi3
1054 ; X32-NEXT: addl $32, %esp
1055 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1056 ; X32-NEXT: pushl $0
1057 ; X32-NEXT: pushl $0
1058 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1059 ; X32-NEXT: pushl %esi
1060 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
1061 ; X32-NEXT: pushl %ebx
1062 ; X32-NEXT: pushl $0
1063 ; X32-NEXT: pushl $0
1064 ; X32-NEXT: pushl %edi
1065 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1066 ; X32-NEXT: pushl %eax
1067 ; X32-NEXT: calll __multi3
1068 ; X32-NEXT: addl $32, %esp
1069 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1070 ; X32-NEXT: pushl $0
1071 ; X32-NEXT: pushl $0
1072 ; X32-NEXT: pushl %esi
1073 ; X32-NEXT: pushl %ebx
1074 ; X32-NEXT: pushl $0
1075 ; X32-NEXT: pushl $0
1076 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1077 ; X32-NEXT: pushl %esi
1078 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1079 ; X32-NEXT: pushl %eax
1080 ; X32-NEXT: calll __multi3
1081 ; X32-NEXT: addl $32, %esp
1082 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1083 ; X32-NEXT: pushl $0
1084 ; X32-NEXT: pushl $0
1085 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
1086 ; X32-NEXT: pushl %ebx
1087 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1088 ; X32-NEXT: pushl %edi
1089 ; X32-NEXT: pushl $0
1090 ; X32-NEXT: pushl $0
1091 ; X32-NEXT: pushl %esi
1092 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1093 ; X32-NEXT: pushl %eax
1094 ; X32-NEXT: calll __multi3
1095 ; X32-NEXT: addl $32, %esp
1096 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1097 ; X32-NEXT: pushl $0
1098 ; X32-NEXT: pushl $0
1099 ; X32-NEXT: pushl %ebx
1100 ; X32-NEXT: movl %ebx, %esi
1101 ; X32-NEXT: pushl %edi
1102 ; X32-NEXT: pushl $0
1103 ; X32-NEXT: pushl $0
1104 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1105 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1106 ; X32-NEXT: pushl %eax
1107 ; X32-NEXT: calll __multi3
1108 ; X32-NEXT: addl $32, %esp
1109 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1110 ; X32-NEXT: pushl $0
1111 ; X32-NEXT: pushl $0
1112 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1113 ; X32-NEXT: pushl %edi
1114 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
1115 ; X32-NEXT: pushl %ebx
1116 ; X32-NEXT: pushl $0
1117 ; X32-NEXT: pushl $0
1118 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1119 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1120 ; X32-NEXT: pushl %eax
1121 ; X32-NEXT: calll __multi3
1122 ; X32-NEXT: addl $32, %esp
1123 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1124 ; X32-NEXT: pushl $0
1125 ; X32-NEXT: pushl $0
1126 ; X32-NEXT: pushl %edi
1127 ; X32-NEXT: pushl %ebx
1128 ; X32-NEXT: pushl $0
1129 ; X32-NEXT: pushl $0
1130 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
1131 ; X32-NEXT: pushl %ebx
1132 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1133 ; X32-NEXT: pushl %edi
1134 ; X32-NEXT: pushl %eax
1135 ; X32-NEXT: calll __multi3
1136 ; X32-NEXT: addl $32, %esp
1137 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1138 ; X32-NEXT: pushl $0
1139 ; X32-NEXT: pushl $0
1140 ; X32-NEXT: pushl %esi
1141 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1142 ; X32-NEXT: pushl %esi
1143 ; X32-NEXT: pushl $0
1144 ; X32-NEXT: pushl $0
1145 ; X32-NEXT: pushl %ebx
1146 ; X32-NEXT: pushl %edi
1147 ; X32-NEXT: pushl %eax
1148 ; X32-NEXT: calll __multi3
1149 ; X32-NEXT: addl $32, %esp
1150 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1151 ; X32-NEXT: pushl $0
1152 ; X32-NEXT: pushl $0
1153 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1154 ; X32-NEXT: pushl %esi
1155 ; X32-NEXT: pushl $0
1156 ; X32-NEXT: pushl $0
1157 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1158 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1159 ; X32-NEXT: pushl %eax
1160 ; X32-NEXT: calll __multi3
1161 ; X32-NEXT: addl $32, %esp
1162 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1163 ; X32-NEXT: pushl $0
1164 ; X32-NEXT: pushl $0
1165 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1166 ; X32-NEXT: pushl %edi
1167 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1168 ; X32-NEXT: pushl %esi
1169 ; X32-NEXT: pushl $0
1170 ; X32-NEXT: pushl $0
1171 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1172 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1173 ; X32-NEXT: pushl %eax
1174 ; X32-NEXT: calll __multi3
1175 ; X32-NEXT: addl $32, %esp
1176 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1177 ; X32-NEXT: pushl $0
1178 ; X32-NEXT: pushl $0
1179 ; X32-NEXT: pushl %edi
1180 ; X32-NEXT: pushl %esi
1181 ; X32-NEXT: pushl $0
1182 ; X32-NEXT: pushl $0
1183 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1184 ; X32-NEXT: pushl %edi
1185 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1186 ; X32-NEXT: pushl %esi
1187 ; X32-NEXT: pushl %eax
1188 ; X32-NEXT: calll __multi3
1189 ; X32-NEXT: addl $32, %esp
1190 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1191 ; X32-NEXT: pushl $0
1192 ; X32-NEXT: pushl $0
1193 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1194 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
1195 ; X32-NEXT: pushl %ebx
1196 ; X32-NEXT: pushl $0
1197 ; X32-NEXT: pushl $0
1198 ; X32-NEXT: pushl %edi
1199 ; X32-NEXT: pushl %esi
1200 ; X32-NEXT: pushl %eax
1201 ; X32-NEXT: calll __multi3
1202 ; X32-NEXT: addl $32, %esp
1203 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1204 ; X32-NEXT: pushl $0
1205 ; X32-NEXT: pushl $0
1206 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1207 ; X32-NEXT: pushl %esi
1208 ; X32-NEXT: pushl %ebx
1209 ; X32-NEXT: pushl $0
1210 ; X32-NEXT: pushl $0
1211 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1212 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1213 ; X32-NEXT: pushl %eax
1214 ; X32-NEXT: calll __multi3
1215 ; X32-NEXT: addl $32, %esp
1216 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1217 ; X32-NEXT: pushl $0
1218 ; X32-NEXT: pushl $0
1219 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
1220 ; X32-NEXT: pushl %ebx
1221 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1222 ; X32-NEXT: pushl %edi
1223 ; X32-NEXT: pushl $0
1224 ; X32-NEXT: pushl $0
1225 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1226 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1227 ; X32-NEXT: pushl %eax
1228 ; X32-NEXT: calll __multi3
1229 ; X32-NEXT: addl $32, %esp
1230 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1231 ; X32-NEXT: pushl $0
1232 ; X32-NEXT: pushl $0
1233 ; X32-NEXT: pushl %ebx
1234 ; X32-NEXT: pushl %edi
1235 ; X32-NEXT: pushl $0
1236 ; X32-NEXT: pushl $0
1237 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
1238 ; X32-NEXT: pushl %ebx
1239 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1240 ; X32-NEXT: pushl %edi
1241 ; X32-NEXT: pushl %eax
1242 ; X32-NEXT: calll __multi3
1243 ; X32-NEXT: addl $32, %esp
1244 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1245 ; X32-NEXT: pushl $0
1246 ; X32-NEXT: pushl $0
1247 ; X32-NEXT: pushl %esi
1248 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1249 ; X32-NEXT: pushl $0
1250 ; X32-NEXT: pushl $0
1251 ; X32-NEXT: pushl %ebx
1252 ; X32-NEXT: pushl %edi
1253 ; X32-NEXT: pushl %eax
1254 ; X32-NEXT: calll __multi3
1255 ; X32-NEXT: addl $32, %esp
1256 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1257 ; X32-NEXT: pushl $0
1258 ; X32-NEXT: pushl $0
1259 ; X32-NEXT: pushl %esi
1260 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1261 ; X32-NEXT: pushl $0
1262 ; X32-NEXT: pushl $0
1263 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1264 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1265 ; X32-NEXT: pushl %edi
1266 ; X32-NEXT: pushl %eax
1267 ; X32-NEXT: calll __multi3
1268 ; X32-NEXT: addl $32, %esp
1269 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1270 ; X32-NEXT: pushl $0
1271 ; X32-NEXT: pushl $0
1272 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
1273 ; X32-NEXT: pushl %ebx
1274 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1275 ; X32-NEXT: pushl %esi
1276 ; X32-NEXT: pushl $0
1277 ; X32-NEXT: pushl $0
1278 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1279 ; X32-NEXT: pushl %edi
1280 ; X32-NEXT: pushl %eax
1281 ; X32-NEXT: calll __multi3
1282 ; X32-NEXT: addl $32, %esp
1283 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1284 ; X32-NEXT: pushl $0
1285 ; X32-NEXT: pushl $0
1286 ; X32-NEXT: pushl %ebx
1287 ; X32-NEXT: pushl %esi
1288 ; X32-NEXT: pushl $0
1289 ; X32-NEXT: pushl $0
1290 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
1291 ; X32-NEXT: pushl %ebx
1292 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1293 ; X32-NEXT: pushl %eax
1294 ; X32-NEXT: calll __multi3
1295 ; X32-NEXT: addl $32, %esp
1296 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1297 ; X32-NEXT: pushl $0
1298 ; X32-NEXT: pushl $0
1299 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1300 ; X32-NEXT: pushl %edi
1301 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1302 ; X32-NEXT: pushl %esi
1303 ; X32-NEXT: pushl $0
1304 ; X32-NEXT: pushl $0
1305 ; X32-NEXT: pushl %ebx
1306 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1307 ; X32-NEXT: pushl %eax
1308 ; X32-NEXT: calll __multi3
1309 ; X32-NEXT: addl $32, %esp
1310 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1311 ; X32-NEXT: pushl $0
1312 ; X32-NEXT: pushl $0
1313 ; X32-NEXT: pushl %edi
1314 ; X32-NEXT: pushl %esi
1315 ; X32-NEXT: pushl $0
1316 ; X32-NEXT: pushl $0
1317 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1318 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1319 ; X32-NEXT: pushl %eax
1320 ; X32-NEXT: calll __multi3
1321 ; X32-NEXT: addl $32, %esp
1322 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1323 ; X32-NEXT: pushl $0
1324 ; X32-NEXT: pushl $0
1325 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1326 ; X32-NEXT: pushl %edi
1327 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1328 ; X32-NEXT: pushl %esi
1329 ; X32-NEXT: pushl $0
1330 ; X32-NEXT: pushl $0
1331 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1332 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1333 ; X32-NEXT: pushl %eax
1334 ; X32-NEXT: calll __multi3
1335 ; X32-NEXT: addl $32, %esp
1336 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1337 ; X32-NEXT: pushl $0
1338 ; X32-NEXT: pushl $0
1339 ; X32-NEXT: pushl %edi
1340 ; X32-NEXT: pushl %esi
1341 ; X32-NEXT: pushl $0
1342 ; X32-NEXT: pushl $0
1343 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
1344 ; X32-NEXT: pushl %esi
1345 ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx # 4-byte Reload
1346 ; X32-NEXT: pushl %ebx
1347 ; X32-NEXT: pushl %eax
1348 ; X32-NEXT: calll __multi3
1349 ; X32-NEXT: addl $32, %esp
1350 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1351 ; X32-NEXT: pushl $0
1352 ; X32-NEXT: pushl $0
1353 ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
1354 ; X32-NEXT: pushl %edi
1355 ; X32-NEXT: pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
1356 ; X32-NEXT: pushl $0
1357 ; X32-NEXT: pushl $0
1358 ; X32-NEXT: pushl %esi
1359 ; X32-NEXT: pushl %ebx
1360 ; X32-NEXT: pushl %eax
1361 ; X32-NEXT: calll __multi3
1362 ; X32-NEXT: addl $32, %esp
1363 ; X32-NEXT: leal {{[0-9]+}}(%esp), %eax
1364 ; X32-NEXT: pushl $0
1365 ; X32-NEXT: pushl $0
1366 ; X32-NEXT: movl %edi, %ebx
1367 ; X32-NEXT: pushl %ebx
1366 ; X32-NEXT: pushl %edi
13681367 ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
13691368 ; X32-NEXT: pushl %esi
13701369 ; X32-NEXT: pushl $0
24412440 ; X32-NEXT: movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
24422441 ; X32-NEXT: adcl %edi, %eax
24432442 ; X32-NEXT: movl %eax, %esi
2444 ; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
2443 ; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
24452444 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
24462445 ; X32-NEXT: addl %eax, {{[0-9]+}}(%esp) # 4-byte Folded Spill
24472446 ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
42644263 ; X64-NEXT: adcq $0, %rbp
42654264 ; X64-NEXT: addq %rcx, %rbx
42664265 ; X64-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) # 8-byte Spill
4267 ; X64-NEXT: movq %rcx, %r11
42684266 ; X64-NEXT: adcq %rdi, %rbp
42694267 ; X64-NEXT: setb %bl
42704268 ; X64-NEXT: movzbl %bl, %ebx
42744272 ; X64-NEXT: mulq %r8
42754273 ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
42764274 ; X64-NEXT: movq %rdx, {{[0-9]+}}(%rsp) # 8-byte Spill
4277 ; X64-NEXT: movq %r11, %r12
4278 ; X64-NEXT: movq %r11, %r8
4275 ; X64-NEXT: movq %rcx, %r12
4276 ; X64-NEXT: movq %rcx, %r8
42794277 ; X64-NEXT: addq %rax, %r12
42804278 ; X64-NEXT: movq %rdi, %rax
42814279 ; X64-NEXT: movq %rdi, %r9
4282 ; X64-NEXT: movq %r9, (%rsp) # 8-byte Spill
4280 ; X64-NEXT: movq %rdi, (%rsp) # 8-byte Spill
42834281 ; X64-NEXT: adcq %rdx, %rax
42844282 ; X64-NEXT: addq %rbp, %r12
42854283 ; X64-NEXT: movq %r12, {{[0-9]+}}(%rsp) # 8-byte Spill
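Alongside single-operand rewrites, these X64 hunks drop whole copies: `movq %rcx, %r11` disappears while the visible readers `movq %r11, %r12` and `movq %r11, %r8` are rewritten to read %rcx. One way such cleanup can work, sketched over a toy program (illustrative register names, not the pass's actual logic; the sketch assumes the copy's destination is not live past the snippet):

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

struct Inst {
  std::string Text;
  std::vector<std::string> Reads;
  std::string Def;
  bool IsCopy = false;
};

int main() {
  // After forwarding, nothing reads %r11 before it would be redefined, so the
  // first copy is dead (assuming %r11 is not live past the snippet).
  std::vector<Inst> Prog = {
      {"movq %rcx, %r11", {"rcx"}, "r11", /*IsCopy=*/true},
      {"movq %rcx, %r12", {"rcx"}, "r12", /*IsCopy=*/true},
      {"movq %rcx, %r8", {"rcx"}, "r8", /*IsCopy=*/true},
      {"addq %rax, %r12", {"rax", "r12"}, "r12"},
      {"addq %rdx, %r8", {"rdx", "r8"}, "r8"},
  };
  for (size_t I = 0; I < Prog.size();) {
    bool Dead = Prog[I].IsCopy;
    for (size_t J = I + 1; J < Prog.size() && Dead; ++J) {
      if (std::find(Prog[J].Reads.begin(), Prog[J].Reads.end(), Prog[I].Def) !=
          Prog[J].Reads.end())
        Dead = false;            // still has a reader: keep the copy
      else if (Prog[J].Def == Prog[I].Def)
        break;                   // redefined before any read: dead from here on
    }
    if (Dead)
      Prog.erase(Prog.begin() + I);
    else
      ++I;
  }
  for (const Inst &I : Prog)    // prints the program without the dead copy
    std::cout << I.Text << "\n";
  return 0;
}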
43084306 ; X64-NEXT: adcq %rdx, %rbx
43094307 ; X64-NEXT: movq 16(%rsi), %rax
43104308 ; X64-NEXT: movq %rsi, %r13
4311 ; X64-NEXT: movq %r13, {{[0-9]+}}(%rsp) # 8-byte Spill
4309 ; X64-NEXT: movq %rsi, {{[0-9]+}}(%rsp) # 8-byte Spill
43124310 ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
43134311 ; X64-NEXT: mulq %r11
43144312 ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
43214319 ; X64-NEXT: adcq %rbx, %r11
43224320 ; X64-NEXT: movq %r8, %rax
43234321 ; X64-NEXT: movq %r8, %rbp
4324 ; X64-NEXT: movq %rbp, -{{[0-9]+}}(%rsp) # 8-byte Spill
4322 ; X64-NEXT: movq %r8, -{{[0-9]+}}(%rsp) # 8-byte Spill
43254323 ; X64-NEXT: addq %rdi, %rax
43264324 ; X64-NEXT: movq %r9, %rax
43274325 ; X64-NEXT: adcq %rcx, %rax
43334331 ; X64-NEXT: movq %rdx, %rsi
43344332 ; X64-NEXT: movq %rax, %rbx
43354333 ; X64-NEXT: addq %rdi, %rax
4336 ; X64-NEXT: movq %rdi, %r9
4337 ; X64-NEXT: movq %rsi, %rax
4334 ; X64-NEXT: movq %rdx, %rax
43384335 ; X64-NEXT: adcq %rcx, %rax
43394336 ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
43404337 ; X64-NEXT: movq 32(%r13), %rax
43504347 ; X64-NEXT: adcq %rdx, %rax
43514348 ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
43524349 ; X64-NEXT: movq %rbp, %rax
4353 ; X64-NEXT: addq %r9, %rax
4350 ; X64-NEXT: addq %rdi, %rax
43544351 ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
4355 ; X64-NEXT: movq %r9, {{[0-9]+}}(%rsp) # 8-byte Spill
4352 ; X64-NEXT: movq %rdi, %r9
4353 ; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp) # 8-byte Spill
43564354 ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rax # 8-byte Reload
43574355 ; X64-NEXT: adcq %r15, %rax
43584356 ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
43704368 ; X64-NEXT: addq %rsi, %r11
43714369 ; X64-NEXT: movq %rdx, %rbp
43724370 ; X64-NEXT: adcq $0, %rbp
4373 ; X64-NEXT: addq %rcx, %r11
4371 ; X64-NEXT: addq %rbx, %r11
43744372 ; X64-NEXT: adcq %rsi, %rbp
43754373 ; X64-NEXT: movq %rsi, {{[0-9]+}}(%rsp) # 8-byte Spill
43764374 ; X64-NEXT: setb %bl
43914389 ; X64-NEXT: adcq %rbx, %r10
43924390 ; X64-NEXT: movq %rcx, %rdx
43934391 ; X64-NEXT: movq %rcx, %r12
4394 ; X64-NEXT: movq %r12, {{[0-9]+}}(%rsp) # 8-byte Spill
4392 ; X64-NEXT: movq %rcx, {{[0-9]+}}(%rsp) # 8-byte Spill
43954393 ; X64-NEXT: addq %r9, %rdx
43964394 ; X64-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill
43974395 ; X64-NEXT: movq %r11, %r8
4398 ; X64-NEXT: adcq %r8, %r15
4396 ; X64-NEXT: adcq %r11, %r15
43994397 ; X64-NEXT: movq %r15, {{[0-9]+}}(%rsp) # 8-byte Spill
44004398 ; X64-NEXT: adcq %rax, %r14
44014399 ; X64-NEXT: movq %r14, {{[0-9]+}}(%rsp) # 8-byte Spill
44914489 ; X64-NEXT: adcq %rdx, %r12
44924490 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
44934491 ; X64-NEXT: movq %rcx, %rax
4494 ; X64-NEXT: movq %r10, %rbp
4495 ; X64-NEXT: mulq %rbp
4492 ; X64-NEXT: mulq %r10
44964493 ; X64-NEXT: movq %rdx, %rsi
44974494 ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
44984495 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rdi # 8-byte Reload
44994496 ; X64-NEXT: movq %rdi, %rax
4500 ; X64-NEXT: mulq %rbp
4497 ; X64-NEXT: mulq %r10
45014498 ; X64-NEXT: movq %rdx, %rbp
45024499 ; X64-NEXT: movq %rax, %rbx
45034500 ; X64-NEXT: addq %rsi, %rbx
45244521 ; X64-NEXT: adcq $0, %r15
45254522 ; X64-NEXT: adcq $0, %r12
45264523 ; X64-NEXT: movq %r10, %rbx
4527 ; X64-NEXT: movq %rbx, %rax
4524 ; X64-NEXT: movq %r10, %rax
45284525 ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r11 # 8-byte Reload
45294526 ; X64-NEXT: mulq %r11
45304527 ; X64-NEXT: movq %rdx, %rcx
45414538 ; X64-NEXT: movq %rbx, %rax
45424539 ; X64-NEXT: mulq %rcx
45434540 ; X64-NEXT: movq %rcx, %rbx
4544 ; X64-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) # 8-byte Spill
4541 ; X64-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) # 8-byte Spill
45454542 ; X64-NEXT: movq %rdx, %rcx
45464543 ; X64-NEXT: movq %rax, %r8
45474544 ; X64-NEXT: addq %rbp, %r8
45724569 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
45734570 ; X64-NEXT: movq %rcx, %rax
45744571 ; X64-NEXT: movq %r11, %rsi
4575 ; X64-NEXT: mulq %rsi
4572 ; X64-NEXT: mulq %r11
45764573 ; X64-NEXT: movq %rdx, %r11
45774574 ; X64-NEXT: movq %rax, %r13
45784575 ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r12 # 8-byte Reload
46524649 ; X64-NEXT: adcq %rdx, %r10
46534650 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
46544651 ; X64-NEXT: movq %rcx, %rax
4655 ; X64-NEXT: movq %r11, %rbp
4656 ; X64-NEXT: mulq %rbp
4652 ; X64-NEXT: mulq %r11
46574653 ; X64-NEXT: movq %rdx, %rdi
46584654 ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
46594655 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
46604656 ; X64-NEXT: movq %rsi, %rax
4661 ; X64-NEXT: mulq %rbp
4657 ; X64-NEXT: mulq %r11
46624658 ; X64-NEXT: movq %rdx, %rbp
46634659 ; X64-NEXT: movq %rax, %rbx
46644660 ; X64-NEXT: addq %rdi, %rbx
47884784 ; X64-NEXT: movq %rdx, %rsi
47894785 ; X64-NEXT: movq %rax, %r14
47904786 ; X64-NEXT: movq %r8, %rbp
4791 ; X64-NEXT: movq %rbp, %rax
4787 ; X64-NEXT: movq %r8, %rax
47924788 ; X64-NEXT: mulq %rcx
47934789 ; X64-NEXT: movq %rcx, %r11
47944790 ; X64-NEXT: movq %rdx, %rbx
48484844 ; X64-NEXT: adcq $0, %r9
48494845 ; X64-NEXT: adcq $0, %r10
48504846 ; X64-NEXT: movq %rbp, %rsi
4851 ; X64-NEXT: movq %rsi, %rax
4847 ; X64-NEXT: movq %rbp, %rax
48524848 ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
48534849 ; X64-NEXT: mulq %rcx
48544850 ; X64-NEXT: movq %rdx, %r14
49054901 ; X64-NEXT: adcq $0, %r15
49064902 ; X64-NEXT: movq %rbp, %rax
49074903 ; X64-NEXT: movq %r8, %rdi
4908 ; X64-NEXT: movq %rdi, {{[0-9]+}}(%rsp) # 8-byte Spill
4909 ; X64-NEXT: mulq %rdi
4904 ; X64-NEXT: movq %r8, {{[0-9]+}}(%rsp) # 8-byte Spill
4905 ; X64-NEXT: mulq %r8
49104906 ; X64-NEXT: movq %rdx, %r9
49114907 ; X64-NEXT: movq %rax, %r8
49124908 ; X64-NEXT: addq %rbx, %r8
49894985 ; X64-NEXT: movq %rcx, %r14
49904986 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
49914987 ; X64-NEXT: movq %rcx, %rax
4992 ; X64-NEXT: movq %r10, %rdi
4993 ; X64-NEXT: mulq %rdi
4988 ; X64-NEXT: mulq %r10
49944989 ; X64-NEXT: movq %rdx, %r11
49954990 ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
49964991 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rsi # 8-byte Reload
49974992 ; X64-NEXT: movq %rsi, %rax
4998 ; X64-NEXT: mulq %rdi
4993 ; X64-NEXT: mulq %r10
49994994 ; X64-NEXT: movq %rdx, %rdi
50004995 ; X64-NEXT: movq %rax, %rbx
50014996 ; X64-NEXT: addq %r11, %rbx
50235018 ; X64-NEXT: movq %r8, -{{[0-9]+}}(%rsp) # 8-byte Spill
50245019 ; X64-NEXT: adcq $0, %r14
50255020 ; X64-NEXT: movq %r14, -{{[0-9]+}}(%rsp) # 8-byte Spill
5026 ; X64-NEXT: movq %r13, %rbx
5027 ; X64-NEXT: movq %rbx, %rax
5021 ; X64-NEXT: movq %r13, %rax
50285022 ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rcx # 8-byte Reload
50295023 ; X64-NEXT: mulq %rcx
50305024 ; X64-NEXT: movq %rdx, %r8
50375031 ; X64-NEXT: movq %rax, %rcx
50385032 ; X64-NEXT: addq %r8, %rcx
50395033 ; X64-NEXT: adcq $0, %rsi
5040 ; X64-NEXT: movq %rbx, %rax
5034 ; X64-NEXT: movq %r13, %rax
50415035 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r13 # 8-byte Reload
50425036 ; X64-NEXT: mulq %r13
50435037 ; X64-NEXT: movq %rdx, %rbx
50715065 ; X64-NEXT: setb -{{[0-9]+}}(%rsp) # 1-byte Folded Spill
50725066 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx # 8-byte Reload
50735067 ; X64-NEXT: movq %rbx, %rax
5074 ; X64-NEXT: movq %r10, %rsi
5075 ; X64-NEXT: mulq %rsi
5068 ; X64-NEXT: mulq %r10
50765069 ; X64-NEXT: movq %rdx, %rcx
50775070 ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
50785071 ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r8 # 8-byte Reload
50795072 ; X64-NEXT: movq %r8, %rax
5080 ; X64-NEXT: mulq %rsi
5073 ; X64-NEXT: mulq %r10
50815074 ; X64-NEXT: movq %rdx, %rsi
50825075 ; X64-NEXT: movq %rax, %rdi
50835076 ; X64-NEXT: addq %rcx, %rdi
51535146 ; X64-NEXT: movq %r9, %rax
51545147 ; X64-NEXT: mulq %rcx
51555148 ; X64-NEXT: movq %rcx, %r10
5156 ; X64-NEXT: movq %r10, -{{[0-9]+}}(%rsp) # 8-byte Spill
5149 ; X64-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) # 8-byte Spill
51575150 ; X64-NEXT: movq %rdx, %rcx
51585151 ; X64-NEXT: movq %rax, %rdi
51595152 ; X64-NEXT: addq %rsi, %rdi
51655158 ; X64-NEXT: movq %rax, %rbx
51665159 ; X64-NEXT: movq %rdx, %r14
51675160 ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r12 # 8-byte Reload
5168 ; X64-NEXT: addq %rbx, %r12
5161 ; X64-NEXT: addq %rax, %r12
51695162 ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %r15 # 8-byte Reload
5170 ; X64-NEXT: adcq %r14, %r15
5163 ; X64-NEXT: adcq %rdx, %r15
51715164 ; X64-NEXT: addq %rdi, %r12
51725165 ; X64-NEXT: adcq %rcx, %r15
51735166 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rcx # 8-byte Reload
51745167 ; X64-NEXT: movq %rcx, %rax
51755168 ; X64-NEXT: movq %r11, %rsi
5176 ; X64-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill
5177 ; X64-NEXT: mulq %rsi
5169 ; X64-NEXT: movq %r11, -{{[0-9]+}}(%rsp) # 8-byte Spill
5170 ; X64-NEXT: mulq %r11
51785171 ; X64-NEXT: movq %rdx, %r11
51795172 ; X64-NEXT: movq %rax, {{[0-9]+}}(%rsp) # 8-byte Spill
51805173 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9 # 8-byte Reload
52385231 ; X64-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill
52395232 ; X64-NEXT: movq %rax, %r9
52405233 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbp # 8-byte Reload
5241 ; X64-NEXT: addq %r9, %rbp
5234 ; X64-NEXT: addq %rax, %rbp
52425235 ; X64-NEXT: movq {{[0-9]+}}(%rsp), %rax # 8-byte Reload
52435236 ; X64-NEXT: adcq %rdx, %rax
52445237 ; X64-NEXT: addq %rsi, %rbp
54165409 ; X64-NEXT: movq 88(%rsi), %rax
54175410 ; X64-NEXT: movq %rsi, %r9
54185411 ; X64-NEXT: movq %rax, %rsi
5419 ; X64-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) # 8-byte Spill
5412 ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
54205413 ; X64-NEXT: mulq %rcx
54215414 ; X64-NEXT: movq %rcx, %r11
54225415 ; X64-NEXT: movq %rdx, %rbp
54525445 ; X64-NEXT: adcq %r8, %r10
54535446 ; X64-NEXT: addq %rbx, %rsi
54545447 ; X64-NEXT: adcq %rbp, %r10
5455 ; X64-NEXT: movq %r9, %rdi
5456 ; X64-NEXT: movq 64(%rdi), %r13
5448 ; X64-NEXT: movq 64(%r9), %r13
54575449 ; X64-NEXT: movq %r13, %rax
54585450 ; X64-NEXT: mulq %r11
54595451 ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
54605452 ; X64-NEXT: movq %rdx, %rcx
5461 ; X64-NEXT: movq 72(%rdi), %r9
5453 ; X64-NEXT: movq 72(%r9), %r9
54625454 ; X64-NEXT: movq %r9, %rax
54635455 ; X64-NEXT: mulq %r11
54645456 ; X64-NEXT: movq %rdx, %rbp
54865478 ; X64-NEXT: movq %rdx, %r11
54875479 ; X64-NEXT: movq %rax, %r15
54885480 ; X64-NEXT: movq %r12, %rcx
5489 ; X64-NEXT: addq %r15, %rcx
5490 ; X64-NEXT: adcq %r11, %r8
5481 ; X64-NEXT: addq %rax, %rcx
5482 ; X64-NEXT: adcq %rdx, %r8
54915483 ; X64-NEXT: addq %rbp, %rcx
54925484 ; X64-NEXT: adcq %rbx, %r8
54935485 ; X64-NEXT: addq -{{[0-9]+}}(%rsp), %rcx # 8-byte Folded Reload
55395531 ; X64-NEXT: setb %r10b
55405532 ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rsi # 8-byte Reload
55415533 ; X64-NEXT: movq %rsi, %rax
5542 ; X64-NEXT: movq %r8, %rdi
5543 ; X64-NEXT: mulq %rdi
5534 ; X64-NEXT: mulq %r8
55445535 ; X64-NEXT: movq %rdx, %rcx
55455536 ; X64-NEXT: movq %rax, %r9
55465537 ; X64-NEXT: movq -{{[0-9]+}}(%rsp), %rbp # 8-byte Reload
55475538 ; X64-NEXT: movq %rbp, %rax
5548 ; X64-NEXT: mulq %rdi
5549 ; X64-NEXT: movq %rdi, %r12
5539 ; X64-NEXT: mulq %r8
5540 ; X64-NEXT: movq %r8, %r12
55505541 ; X64-NEXT: movq %rdx, %rdi
55515542 ; X64-NEXT: movq %rax, %rbx
55525543 ; X64-NEXT: addq %rcx, %rbx
55855576 ; X64-NEXT: imulq %rcx, %rdi
55865577 ; X64-NEXT: movq %rcx, %rax
55875578 ; X64-NEXT: movq %r12, %rsi
5588 ; X64-NEXT: mulq %rsi
5579 ; X64-NEXT: mulq %r12
55895580 ; X64-NEXT: movq %rax, %r9
55905581 ; X64-NEXT: addq %rdi, %rdx
55915582 ; X64-NEXT: movq 104(%rbp), %r8
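The GPR hunks above share one shape: a register-to-register movq feeds a second movq, the second movq is rewritten to read the original source, and the now-unused copy disappears. A minimal before/after sketch, distilled from the first hunk of this file (register names vary from hunk to hunk):

Before forwarding:
    movq %r13, %rbx        # COPY: rbx = r13
    movq %rbx, %rax        # use reads the copy
    mulq %rcx              # rdx:rax = rax * rcx

After forwarding (the use reads %r13 directly, leaving the copy unused):
    movq %r13, %rax
    mulq %rcx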
908908 ; X64-NEXT: movq 8(%rsi), %rbp
909909 ; X64-NEXT: movq %r15, %rax
910910 ; X64-NEXT: movq %rdx, %rsi
911 ; X64-NEXT: mulq %rsi
911 ; X64-NEXT: mulq %rdx
912912 ; X64-NEXT: movq %rdx, %r9
913913 ; X64-NEXT: movq %rax, %r8
914914 ; X64-NEXT: movq %r11, %rax
931931 ; X64-NEXT: movq %r11, %rax
932932 ; X64-NEXT: mulq %rbp
933933 ; X64-NEXT: movq %rbp, %r14
934 ; X64-NEXT: movq %r14, -{{[0-9]+}}(%rsp) # 8-byte Spill
934 ; X64-NEXT: movq %rbp, -{{[0-9]+}}(%rsp) # 8-byte Spill
935935 ; X64-NEXT: movq %rdx, %rsi
936936 ; X64-NEXT: movq %rax, %rbp
937937 ; X64-NEXT: addq %rcx, %rbp
938938 ; X64-NEXT: adcq %rbx, %rsi
939939 ; X64-NEXT: xorl %ecx, %ecx
940940 ; X64-NEXT: movq %r10, %rbx
941 ; X64-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) # 8-byte Spill
942 ; X64-NEXT: movq %rbx, %rax
941 ; X64-NEXT: movq %r10, -{{[0-9]+}}(%rsp) # 8-byte Spill
942 ; X64-NEXT: movq %r10, %rax
943943 ; X64-NEXT: mulq %rcx
944944 ; X64-NEXT: movq %rdx, %r13
945945 ; X64-NEXT: movq %rax, %r10
946946 ; X64-NEXT: movq %r15, %rax
947947 ; X64-NEXT: mulq %rcx
948948 ; X64-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill
949 ; X64-NEXT: # kill: %RAX
950 ; X64-NEXT: movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
949951 ; X64-NEXT: movq %rax, %r15
950 ; X64-NEXT: movq %r15, -{{[0-9]+}}(%rsp) # 8-byte Spill
951952 ; X64-NEXT: addq %r10, %r15
952953 ; X64-NEXT: adcq %r13, %rdx
953954 ; X64-NEXT: addq %rbp, %r15
986987 ; X64-NEXT: mulq %rdx
987988 ; X64-NEXT: movq %rdx, %r14
988989 ; X64-NEXT: movq %rax, %r11
989 ; X64-NEXT: addq %r11, %r10
990 ; X64-NEXT: adcq %r14, %r13
990 ; X64-NEXT: addq %rax, %r10
991 ; X64-NEXT: adcq %rdx, %r13
991992 ; X64-NEXT: addq %rbp, %r10
992993 ; X64-NEXT: adcq %rsi, %r13
993994 ; X64-NEXT: addq %r8, %r10
9991000 ; X64-NEXT: movq 16(%rsi), %r8
10001001 ; X64-NEXT: movq %rcx, %rax
10011002 ; X64-NEXT: movq %rcx, %r9
1002 ; X64-NEXT: movq %r9, -{{[0-9]+}}(%rsp) # 8-byte Spill
1003 ; X64-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) # 8-byte Spill
10031004 ; X64-NEXT: mulq %r8
10041005 ; X64-NEXT: movq %rdx, %rdi
10051006 ; X64-NEXT: movq %rax, %r12
10301031 ; X64-NEXT: mulq %rcx
10311032 ; X64-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) # 8-byte Spill
10321033 ; X64-NEXT: movq %rax, %rbp
1033 ; X64-NEXT: addq %rbp, %r11
1034 ; X64-NEXT: addq %rax, %r11
10341035 ; X64-NEXT: adcq %rdx, %r14
10351036 ; X64-NEXT: addq %r9, %r11
10361037 ; X64-NEXT: adcq %rbx, %r14
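A forwarded use does not always take its copy with it. In the hunk above, movq %rdx, %rsi stays (that check line is unchanged, so %rsi clearly has later uses), yet the multiply right after it is still rewritten to read the copy source:

    movq %rdx, %rsi        # COPY kept: rsi is used again further down
    mulq %rdx              # was 'mulq %rsi'; the operand now names the copy source

The addq/adcq rewrites in the same hunk ('addq %r11, %r10' to 'addq %rax, %r10', 'adcq %r14, %r13' to 'adcq %rdx, %r13') are the same transformation: %r11 and %r14 were copies of the mulq results still sitting in %rax and %rdx.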
66 ; X64-NEXT: movq %rdx, %r8
77 ; X64-NEXT: imulq %rdi, %rcx
88 ; X64-NEXT: movq %rdi, %rax
9 ; X64-NEXT: mulq %r8
9 ; X64-NEXT: mulq %rdx
1010 ; X64-NEXT: addq %rcx, %rdx
1111 ; X64-NEXT: imulq %r8, %rsi
1212 ; X64-NEXT: addq %rsi, %rdx
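The short hunk above is worth pausing on: 'mulq %r8' becomes 'mulq %rdx' even though mulq writes %rdx. Reading the check lines in sequence, %r8 was copied from %rdx and neither register is touched in between, and the rewrite is safe because mulq reads its source operand before it stores the high half of the product into %rdx:

    movq %rdx, %r8         # COPY of rdx
    imulq %rdi, %rcx
    movq %rdi, %rax
    mulq %rdx              # was 'mulq %r8'; the source is read before rdx:rax is written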
88 ; SSE2-LABEL: mul_v16i8c:
99 ; SSE2: # BB#0: # %entry
1010 ; SSE2-NEXT: movdqa %xmm0, %xmm1
11 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
11 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1212 ; SSE2-NEXT: psraw $8, %xmm1
1313 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [117,117,117,117,117,117,117,117]
1414 ; SSE2-NEXT: pmullw %xmm2, %xmm1
142142 ; SSE2-LABEL: mul_v16i8:
143143 ; SSE2: # BB#0: # %entry
144144 ; SSE2-NEXT: movdqa %xmm1, %xmm2
145 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
145 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
146146 ; SSE2-NEXT: psraw $8, %xmm2
147147 ; SSE2-NEXT: movdqa %xmm0, %xmm3
148 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
148 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
149149 ; SSE2-NEXT: psraw $8, %xmm3
150150 ; SSE2-NEXT: pmullw %xmm2, %xmm3
151151 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
385385 ; SSE2-LABEL: mul_v32i8c:
386386 ; SSE2: # BB#0: # %entry
387387 ; SSE2-NEXT: movdqa %xmm0, %xmm2
388 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
388 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
389389 ; SSE2-NEXT: psraw $8, %xmm2
390390 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [117,117,117,117,117,117,117,117]
391391 ; SSE2-NEXT: pmullw %xmm3, %xmm2
397397 ; SSE2-NEXT: pand %xmm4, %xmm0
398398 ; SSE2-NEXT: packuswb %xmm2, %xmm0
399399 ; SSE2-NEXT: movdqa %xmm1, %xmm2
400 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
400 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
401401 ; SSE2-NEXT: psraw $8, %xmm2
402402 ; SSE2-NEXT: pmullw %xmm3, %xmm2
403403 ; SSE2-NEXT: pand %xmm4, %xmm2
566566 ; SSE2-LABEL: mul_v32i8:
567567 ; SSE2: # BB#0: # %entry
568568 ; SSE2-NEXT: movdqa %xmm2, %xmm4
569 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
569 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15]
570570 ; SSE2-NEXT: psraw $8, %xmm4
571571 ; SSE2-NEXT: movdqa %xmm0, %xmm5
572 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
572 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
573573 ; SSE2-NEXT: psraw $8, %xmm5
574574 ; SSE2-NEXT: pmullw %xmm4, %xmm5
575575 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
582582 ; SSE2-NEXT: pand %xmm4, %xmm0
583583 ; SSE2-NEXT: packuswb %xmm5, %xmm0
584584 ; SSE2-NEXT: movdqa %xmm3, %xmm2
585 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
585 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
586586 ; SSE2-NEXT: psraw $8, %xmm2
587587 ; SSE2-NEXT: movdqa %xmm1, %xmm5
588 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
588 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm1[8],xmm5[9],xmm1[9],xmm5[10],xmm1[10],xmm5[11],xmm1[11],xmm5[12],xmm1[12],xmm5[13],xmm1[13],xmm5[14],xmm1[14],xmm5[15],xmm1[15]
589589 ; SSE2-NEXT: psraw $8, %xmm5
590590 ; SSE2-NEXT: pmullw %xmm2, %xmm5
591591 ; SSE2-NEXT: pand %xmm4, %xmm5
773773 ; SSE2-LABEL: mul_v64i8c:
774774 ; SSE2: # BB#0: # %entry
775775 ; SSE2-NEXT: movdqa %xmm0, %xmm6
776 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
776 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
777777 ; SSE2-NEXT: psraw $8, %xmm6
778778 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [117,117,117,117,117,117,117,117]
779779 ; SSE2-NEXT: pmullw %xmm4, %xmm6
785785 ; SSE2-NEXT: pand %xmm5, %xmm0
786786 ; SSE2-NEXT: packuswb %xmm6, %xmm0
787787 ; SSE2-NEXT: movdqa %xmm1, %xmm6
788 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
788 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm1[8],xmm6[9],xmm1[9],xmm6[10],xmm1[10],xmm6[11],xmm1[11],xmm6[12],xmm1[12],xmm6[13],xmm1[13],xmm6[14],xmm1[14],xmm6[15],xmm1[15]
789789 ; SSE2-NEXT: psraw $8, %xmm6
790790 ; SSE2-NEXT: pmullw %xmm4, %xmm6
791791 ; SSE2-NEXT: pand %xmm5, %xmm6
795795 ; SSE2-NEXT: pand %xmm5, %xmm1
796796 ; SSE2-NEXT: packuswb %xmm6, %xmm1
797797 ; SSE2-NEXT: movdqa %xmm2, %xmm6
798 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
798 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
799799 ; SSE2-NEXT: psraw $8, %xmm6
800800 ; SSE2-NEXT: pmullw %xmm4, %xmm6
801801 ; SSE2-NEXT: pand %xmm5, %xmm6
805805 ; SSE2-NEXT: pand %xmm5, %xmm2
806806 ; SSE2-NEXT: packuswb %xmm6, %xmm2
807807 ; SSE2-NEXT: movdqa %xmm3, %xmm6
808 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
808 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15]
809809 ; SSE2-NEXT: psraw $8, %xmm6
810810 ; SSE2-NEXT: pmullw %xmm4, %xmm6
811811 ; SSE2-NEXT: pand %xmm5, %xmm6
820820 ; SSE41: # BB#0: # %entry
821821 ; SSE41-NEXT: movdqa %xmm1, %xmm4
822822 ; SSE41-NEXT: movdqa %xmm0, %xmm1
823 ; SSE41-NEXT: pmovsxbw %xmm1, %xmm0
823 ; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
824824 ; SSE41-NEXT: movdqa {{.*#+}} xmm6 = [117,117,117,117,117,117,117,117]
825825 ; SSE41-NEXT: pmullw %xmm6, %xmm0
826826 ; SSE41-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255]
938938 ; SSE2-LABEL: mul_v64i8:
939939 ; SSE2: # BB#0: # %entry
940940 ; SSE2-NEXT: movdqa %xmm4, %xmm8
941 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
941 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
942942 ; SSE2-NEXT: psraw $8, %xmm8
943943 ; SSE2-NEXT: movdqa %xmm0, %xmm9
944 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
944 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm0[8],xmm9[9],xmm0[9],xmm9[10],xmm0[10],xmm9[11],xmm0[11],xmm9[12],xmm0[12],xmm9[13],xmm0[13],xmm9[14],xmm0[14],xmm9[15],xmm0[15]
945945 ; SSE2-NEXT: psraw $8, %xmm9
946946 ; SSE2-NEXT: pmullw %xmm8, %xmm9
947947 ; SSE2-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255]
954954 ; SSE2-NEXT: pand %xmm8, %xmm0
955955 ; SSE2-NEXT: packuswb %xmm9, %xmm0
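The vector hunks are a quieter variant of the same rewrite: the value never changes, only which register the second unpack source names. Each SSE2 test begins with a full-register copy, and the unpack that used to read the copy for both operands now reads the original for its second one. The check lines hide the operand text behind a regex, so the instruction forms below are inferred from the shuffle comments; this pair is distilled from mul_v16i8c:

Before:
    movdqa %xmm0, %xmm1    # COPY
    punpckhbw %xmm1, %xmm1 # xmm1 = xmm1[8],xmm1[8],...,xmm1[15],xmm1[15]

After forwarding the second source (same bytes, since xmm1 still equals xmm0 here):
    movdqa %xmm0, %xmm1
    punpckhbw %xmm0, %xmm1 # xmm1 = xmm1[8],xmm0[8],...,xmm1[15],xmm0[15]

The SSE41 hunk follows suit: after 'movdqa %xmm0, %xmm1', 'pmovsxbw %xmm1, %xmm0' becomes 'pmovsxbw %xmm0, %xmm0'.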