llvm.org GIT mirror llvm / 6a0f060
[PowerPC] Select between VSX A-type and M-type FMA instructions just before RA The VSX instruction set has two types of FMA instructions: A-type (where the addend is taken from the output register) and M-type (where one of the product operands is taken from the output register). This adds a small pass that runs just after MI scheduling (and, thus, just before register allocation) that mutates A-type instructions (that are created during isel) into M-type instructions when: 1. This will eliminate an otherwise-necessary copy of the addend 2. One of the product operands is killed by the instruction The "right" moment to make this decision is in between scheduling and register allocation, because only there do we know whether or not one of the product operands is killed by any particular instruction. Unfortunately, this also makes the implementation somewhat complicated, because the MIs are not in SSA form and we need to preserve the LiveIntervals analysis. As a simple example, if we have: %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9 %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16, %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16 ... %vreg9<def,tied1> = XSMADDADP %vreg9<tied0>, %vreg17, %vreg19, %RM<imp-use>; VSLRC:%vreg9,%vreg17,%vreg19 ... We can eliminate the copy by changing from the A-type to the M-type instruction. This means: %vreg5<def,tied1> = XSMADDADP %vreg5<tied0>, %vreg17, %vreg16, %RM<imp-use>; VSLRC:%vreg5,%vreg17,%vreg16 is replaced by: %vreg16<def,tied1> = XSMADDMDP %vreg16<tied0>, %vreg17, %vreg9, %RM<imp-use>; VSLRC:%vreg16,%vreg17,%vreg9 and we remove: %vreg5<def> = COPY %vreg9; VSLRC:%vreg5,%vreg9 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@204768 91177308-0d34-0410-b5e6-96231b3b80d8 Hal Finkel 6 years ago
4 changed file(s) with 407 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
2222
2323 namespace llvm {
2424 class PPCTargetMachine;
25 class PassRegistry;
2526 class FunctionPass;
2627 class ImmutablePass;
2728 class JITCodeEmitter;
3536 #endif
3637 FunctionPass *createPPCEarlyReturnPass();
3738 FunctionPass *createPPCVSXCopyPass();
39 FunctionPass *createPPCVSXFMAMutatePass();
3840 FunctionPass *createPPCBranchSelectionPass();
3941 FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
4042 FunctionPass *createPPCJITCodeEmitterPass(PPCTargetMachine &TM,
4446
4547 /// \brief Creates a PPC-specific Target Transformation Info pass.
4648 ImmutablePass *createPPCTargetTransformInfoPass(const PPCTargetMachine *TM);
49
50 void initializePPCVSXFMAMutatePass(PassRegistry&);
51 extern char &PPCVSXFMAMutateID;
4752
4853 namespace PPCII {
4954
1919 #include "PPCTargetMachine.h"
2020 #include "llvm/ADT/STLExtras.h"
2121 #include "llvm/ADT/Statistic.h"
22 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
2223 #include "llvm/CodeGen/MachineFrameInfo.h"
2324 #include "llvm/CodeGen/MachineFunctionPass.h"
2425 #include "llvm/CodeGen/MachineInstrBuilder.h"
2526 #include "llvm/CodeGen/MachineMemOperand.h"
2627 #include "llvm/CodeGen/MachineRegisterInfo.h"
2728 #include "llvm/CodeGen/PseudoSourceValue.h"
29 #include "llvm/CodeGen/SlotIndexes.h"
2830 #include "llvm/MC/MCAsmInfo.h"
2931 #include "llvm/Support/CommandLine.h"
32 #include "llvm/Support/Debug.h"
3033 #include "llvm/Support/ErrorHandling.h"
3134 #include "llvm/Support/TargetRegistry.h"
3235 #include "llvm/Support/raw_ostream.h"
4346
4447 static cl::opt DisableCmpOpt("disable-ppc-cmp-opt",
4548 cl::desc("Disable compare instruction optimization"), cl::Hidden);
49
50 static cl::opt DisableVSXFMAMutate("disable-ppc-vsx-fma-mutation",
51 cl::desc("Disable VSX FMA instruction mutation"), cl::Hidden);
4652
4753 // Pin the vtable to this file.
4854 void PPCInstrInfo::anchor() {}
15631569 }
15641570 }
15651571
1572 #undef DEBUG_TYPE
1573 #define DEBUG_TYPE "ppc-vsx-fma-mutate"
1574
1575 namespace {
1576 // PPCVSXFMAMutate pass - For copies between VSX registers and non-VSX registers
1577 // (Altivec and scalar floating-point registers), we need to transform the
1578 // copies into subregister copies with other restrictions.
1579 struct PPCVSXFMAMutate : public MachineFunctionPass {
1580 static char ID;
1581 PPCVSXFMAMutate() : MachineFunctionPass(ID) {
1582 initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
1583 }
1584
1585 LiveIntervals *LIS;
1586
1587 const PPCTargetMachine *TM;
1588 const PPCInstrInfo *TII;
1589
1590 protected:
1591 bool processBlock(MachineBasicBlock &MBB) {
1592 bool Changed = false;
1593
1594 MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
1595 for (MachineBasicBlock::iterator I = MBB.begin(), IE = MBB.end();
1596 I != IE; ++I) {
1597 MachineInstr *MI = I;
1598
1599 // The default (A-type) VSX FMA form kills the addend (it is taken from
1600 // the target register, which is then updated to reflect the result of
1601 // the FMA). If the instruction, however, kills one of the registers
1602 // used for the product, then we can use the M-form instruction (which
1603 // will take that value from the to-be-defined register).
1604
1605 int AltOpc = PPC::getAltVSXFMAOpcode(MI->getOpcode());
1606 if (AltOpc == -1)
1607 continue;
1608
1609 // This pass is run after register coalescing, and so we're looking for
1610 // a situation like this:
1611 // ...
1612 // %vreg5 = COPY %vreg9; VSLRC:%vreg5,%vreg9
1613 // %vreg5 = XSMADDADP %vreg5, %vreg17, %vreg16,
1614 // %RM; VSLRC:%vreg5,%vreg17,%vreg16
1615 // ...
1616 // %vreg9 = XSMADDADP %vreg9, %vreg17, %vreg19,
1617 // %RM; VSLRC:%vreg9,%vreg17,%vreg19
1618 // ...
1619 // Where we can eliminate the copy by changing from the A-type to the
1620 // M-type instruction. Specifically, for this example, this means:
1621 // %vreg5 = XSMADDADP %vreg5, %vreg17, %vreg16,
1622 // %RM; VSLRC:%vreg5,%vreg17,%vreg16
1623 // is replaced by:
1624 // %vreg16 = XSMADDMDP %vreg16, %vreg18, %vreg9,
1625 // %RM; VSLRC:%vreg16,%vreg18,%vreg9
1626 // and we remove: %vreg5 = COPY %vreg9; VSLRC:%vreg5,%vreg9
1627
1628 SlotIndex FMAIdx = LIS->getInstructionIndex(MI);
1629
1630 VNInfo *AddendValNo =
1631 LIS->getInterval(MI->getOperand(1).getReg()).Query(FMAIdx).valueIn();
1632 MachineInstr *AddendMI = LIS->getInstructionFromIndex(AddendValNo->def);
1633
1634 // The addend and this instruction must be in the same block.
1635
1636 if (AddendMI->getParent() != MI->getParent())
1637 continue;
1638
1639 // The addend must be a full copy within the same register class.
1640
1641 if (!AddendMI->isFullCopy())
1642 continue;
1643
1644 if (MRI.getRegClass(AddendMI->getOperand(0).getReg()) !=
1645 MRI.getRegClass(AddendMI->getOperand(1).getReg()))
1646 continue;
1647
1648 // In theory, there could be other uses of the addend copy before this
1649 // fma. We could deal with this, but that would require additional
1650 // logic below and I suspect it will not occur in any relevant
1651 // situations.
1652 bool OtherUsers = false;
1653 for (auto J = std::prev(I), JE = MachineBasicBlock::iterator(AddendMI);
1654 J != JE; --J)
1655 if (J->readsVirtualRegister(AddendMI->getOperand(0).getReg())) {
1656 OtherUsers = true;
1657 break;
1658 }
1659
1660 if (OtherUsers)
1661 continue;
1662
1663 // Find one of the product operands that is killed by this instruction.
1664
1665 unsigned KilledProdOp = 0, OtherProdOp = 0;
1666 if (LIS->getInterval(MI->getOperand(2).getReg())
1667 .Query(FMAIdx).isKill()) {
1668 KilledProdOp = 2;
1669 OtherProdOp = 3;
1670 } else if (LIS->getInterval(MI->getOperand(3).getReg())
1671 .Query(FMAIdx).isKill()) {
1672 KilledProdOp = 3;
1673 OtherProdOp = 2;
1674 }
1675
1676 // If there are no killed product operands, then this transformation is
1677 // likely not profitable.
1678 if (!KilledProdOp)
1679 continue;
1680
1681 // In order to replace the addend here with the source of the copy,
1682 // it must still be live here.
1683 if (!LIS->getInterval(AddendMI->getOperand(1).getReg()).liveAt(FMAIdx))
1684 continue;
1685
1686 // Transform: (O2 * O3) + O1 -> (O2 * O1) + O3.
1687
1688 unsigned AddReg = AddendMI->getOperand(1).getReg();
1689 unsigned KilledProdReg = MI->getOperand(KilledProdOp).getReg();
1690 unsigned OtherProdReg = MI->getOperand(OtherProdOp).getReg();
1691
1692 unsigned AddSubReg = AddendMI->getOperand(1).getSubReg();
1693 unsigned KilledProdSubReg = MI->getOperand(KilledProdOp).getSubReg();
1694 unsigned OtherProdSubReg = MI->getOperand(OtherProdOp).getSubReg();
1695
1696 bool AddRegKill = AddendMI->getOperand(1).isKill();
1697 bool KilledProdRegKill = MI->getOperand(KilledProdOp).isKill();
1698 bool OtherProdRegKill = MI->getOperand(OtherProdOp).isKill();
1699
1700 bool AddRegUndef = AddendMI->getOperand(1).isUndef();
1701 bool KilledProdRegUndef = MI->getOperand(KilledProdOp).isUndef();
1702 bool OtherProdRegUndef = MI->getOperand(OtherProdOp).isUndef();
1703
1704 unsigned OldFMAReg = MI->getOperand(0).getReg();
1705
1706 assert(OldFMAReg == AddendMI->getOperand(0).getReg() &&
1707 "Addend copy not tied to old FMA output!");
1708
1709 DEBUG(dbgs() << "VSX FMA Mutation:\n " << *MI;);
1710
1711 MI->getOperand(0).setReg(KilledProdReg);
1712 MI->getOperand(1).setReg(KilledProdReg);
1713 MI->getOperand(3).setReg(AddReg);
1714 MI->getOperand(2).setReg(OtherProdReg);
1715
1716 MI->getOperand(0).setSubReg(KilledProdSubReg);
1717 MI->getOperand(1).setSubReg(KilledProdSubReg);
1718 MI->getOperand(3).setSubReg(AddSubReg);
1719 MI->getOperand(2).setSubReg(OtherProdSubReg);
1720
1721 MI->getOperand(1).setIsKill(KilledProdRegKill);
1722 MI->getOperand(3).setIsKill(AddRegKill);
1723 MI->getOperand(2).setIsKill(OtherProdRegKill);
1724
1725 MI->getOperand(1).setIsUndef(KilledProdRegUndef);
1726 MI->getOperand(3).setIsUndef(AddRegUndef);
1727 MI->getOperand(2).setIsUndef(OtherProdRegUndef);
1728
1729 MI->setDesc(TII->get(AltOpc));
1730
1731 DEBUG(dbgs() << " -> " << *MI);
1732
1733 // The killed product operand was killed here, so we can reuse it now
1734 // for the result of the fma.
1735
1736 LiveInterval &FMAInt = LIS->getInterval(OldFMAReg);
1737 VNInfo *FMAValNo = FMAInt.getVNInfoAt(FMAIdx.getRegSlot());
1738 for (auto UI = MRI.reg_nodbg_begin(OldFMAReg), UE = MRI.reg_nodbg_end();
1739 UI != UE;) {
1740 MachineOperand &UseMO = *UI;
1741 MachineInstr *UseMI = UseMO.getParent();
1742 ++UI;
1743
1744 // Don't replace the result register of the copy we're about to erase.
1745 if (UseMI == AddendMI)
1746 continue;
1747
1748 UseMO.setReg(KilledProdReg);
1749 UseMO.setSubReg(KilledProdSubReg);
1750 }
1751
1752 // Extend the live intervals of the killed product operand to hold the
1753 // fma result.
1754
1755 LiveInterval &NewFMAInt = LIS->getInterval(KilledProdReg);
1756 for (LiveInterval::iterator AI = FMAInt.begin(), AE = FMAInt.end();
1757 AI != AE; ++AI) {
1758 // Don't add the segment that corresponds to the original copy.
1759 if (AI->valno == AddendValNo)
1760 continue;
1761
1762 VNInfo *NewFMAValNo =
1763 NewFMAInt.getNextValue(AI->start,
1764 LIS->getVNInfoAllocator());
1765
1766 NewFMAInt.addSegment(LiveInterval::Segment(AI->start, AI->end,
1767 NewFMAValNo));
1768 }
1769 DEBUG(dbgs() << " extended: " << NewFMAInt << '\n');
1770
1771 FMAInt.removeValNo(FMAValNo);
1772 DEBUG(dbgs() << " trimmed: " << FMAInt << '\n');
1773
1774 // Remove the (now unused) copy.
1775
1776 DEBUG(dbgs() << " removing: " << *AddendMI << '\n');
1777 LIS->RemoveMachineInstrFromMaps(AddendMI);
1778 AddendMI->eraseFromParent();
1779
1780 Changed = true;
1781 }
1782
1783 return Changed;
1784 }
1785
1786 public:
1787 virtual bool runOnMachineFunction(MachineFunction &MF) {
1788 LIS = &getAnalysis();
1789
1790 TM = static_cast(&MF.getTarget());
1791 TII = TM->getInstrInfo();
1792
1793 bool Changed = false;
1794
1795 if (DisableVSXFMAMutate)
1796 return Changed;
1797
1798 for (MachineFunction::iterator I = MF.begin(); I != MF.end();) {
1799 MachineBasicBlock &B = *I++;
1800 if (processBlock(B))
1801 Changed = true;
1802 }
1803
1804 return Changed;
1805 }
1806
1807 virtual void getAnalysisUsage(AnalysisUsage &AU) const {
1808 AU.addRequired();
1809 AU.addPreserved();
1810 AU.addRequired();
1811 AU.addPreserved();
1812 MachineFunctionPass::getAnalysisUsage(AU);
1813 }
1814 };
1815 }
1816
// Pass registration for PPCVSXFMAMutate under the name DEBUG_TYPE
// ("ppc-vsx-fma-mutate"), declaring LiveIntervals and SlotIndexes as
// dependencies.
1817 INITIALIZE_PASS_BEGIN(PPCVSXFMAMutate, DEBUG_TYPE,
1818 "PowerPC VSX FMA Mutation", false, false)
1819 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
1820 INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
1821 INITIALIZE_PASS_END(PPCVSXFMAMutate, DEBUG_TYPE,
1822 "PowerPC VSX FMA Mutation", false, false)
1823
// Public alias of the pass ID, so that code outside this file can refer to
// the pass (PPCPassConfig::addPreRegAlloc passes it to insertPass).
1824 char &llvm::PPCVSXFMAMutateID = PPCVSXFMAMutate::ID;
1825
// Storage for the pass ID.
1826 char PPCVSXFMAMutate::ID = 0;
1827 FunctionPass*
1828 llvm::createPPCVSXFMAMutatePass() { return new PPCVSXFMAMutate(); }
15661829
15671830 #undef DEBUG_TYPE
15681831 #define DEBUG_TYPE "ppc-vsx-copy"
2424 static cl::
2525 opt DisableCTRLoops("disable-ppc-ctrloops", cl::Hidden,
2626 cl::desc("Disable CTR loops for PPC"));
27
28 static cl::opt
29 VSXFMAMutateEarly("schedule-ppc-vsx-fma-mutation-early",
30 cl::Hidden, cl::desc("Schedule VSX FMA instruction mutation early"));
2731
2832 extern "C" void LLVMInitializePowerPCTarget() {
2933 // Register the targets
125129 virtual bool addPreISel();
126130 virtual bool addILPOpts();
127131 virtual bool addInstSelector();
132 virtual bool addPreRegAlloc();
128133 virtual bool addPreSched2();
129134 virtual bool addPreEmitPass();
130135 };
161166
162167 if (getPPCSubtarget().hasVSX())
163168 addPass(createPPCVSXCopyPass());
169
170 return false;
171 }
172
173 bool PPCPassConfig::addPreRegAlloc() {
174 if (getPPCSubtarget().hasVSX()) {
175 initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
176 insertPass(VSXFMAMutateEarly ? &RegisterCoalescerID : &MachineSchedulerID,
177 &PPCVSXFMAMutateID);
178 }
164179
165180 return false;
166181 }
0 ; RUN: llc < %s -mcpu=pwr7 -mattr=+vsx | FileCheck %s
1
2 ; Also run with -schedule-ppc-vsx-fma-mutation-early as a stress test for the
3 ; live-interval-updating logic.
4 ; RUN: llc < %s -mcpu=pwr7 -mattr=+vsx -schedule-ppc-vsx-fma-mutation-early
5 target datalayout = "E-m:e-i64:64-n32:64"
6 target triple = "powerpc64-unknown-linux-gnu"
7
; test1: two FMAs that share the multiplicand %b and the addend %a, each
; stored to a different slot of %d. The CHECK lines expect one M-type
; (xsmaddmdp) and one A-type (xsmaddadp) instruction.
8 define void @test1(double %a, double %b, double %c, double %e, double* nocapture %d) #0 {
9 entry:
10 %0 = tail call double @llvm.fma.f64(double %b, double %c, double %a)
11 store double %0, double* %d, align 8
12 %1 = tail call double @llvm.fma.f64(double %b, double %e, double %a)
13 %arrayidx1 = getelementptr inbounds double* %d, i64 1
14 store double %1, double* %arrayidx1, align 8
15 ret void
16
17 ; CHECK-LABEL: @test1
18 ; CHECK-DAG: li [[C1:[0-9]+]], 8
19 ; CHECK-DAG: xsmaddmdp 3, 2, 1
20 ; CHECK-DAG: xsmaddadp 1, 2, 4
21 ; CHECK-DAG: stxsdx 3, 0, 7
22 ; CHECK-DAG: stxsdx 1, 7, [[C1]]
23 ; CHECK: blr
24 }
25
; test2: three FMAs that share the multiplicand %b and the addend %a. The
; CHECK lines expect two M-type (xsmaddmdp) instructions and one A-type
; (xsmaddadp) instruction.
26 define void @test2(double %a, double %b, double %c, double %e, double %f, double* nocapture %d) #0 {
27 entry:
28 %0 = tail call double @llvm.fma.f64(double %b, double %c, double %a)
29 store double %0, double* %d, align 8
30 %1 = tail call double @llvm.fma.f64(double %b, double %e, double %a)
31 %arrayidx1 = getelementptr inbounds double* %d, i64 1
32 store double %1, double* %arrayidx1, align 8
33 %2 = tail call double @llvm.fma.f64(double %b, double %f, double %a)
34 %arrayidx2 = getelementptr inbounds double* %d, i64 2
35 store double %2, double* %arrayidx2, align 8
36 ret void
37
38 ; CHECK-LABEL: @test2
39 ; CHECK-DAG: li [[C1:[0-9]+]], 8
40 ; CHECK-DAG: li [[C2:[0-9]+]], 16
41 ; CHECK-DAG: xsmaddmdp 3, 2, 1
42 ; CHECK-DAG: xsmaddmdp 4, 2, 1
43 ; CHECK-DAG: xsmaddadp 1, 2, 5
44 ; CHECK-DAG: stxsdx 3, 0, 8
45 ; CHECK-DAG: stxsdx 4, 8, [[C1]]
46 ; CHECK-DAG: stxsdx 1, 8, [[C2]]
47 ; CHECK: blr
48 }
49
; test3: four FMAs; the result %1 of the second FMA is used as the addend of
; the third FMA and is also stored at the end of the function. The CHECK lines
; expect a register copy (xxlor) plus a mix of M-type and A-type instructions.
50 define void @test3(double %a, double %b, double %c, double %e, double %f, double* nocapture %d) #0 {
51 entry:
52 %0 = tail call double @llvm.fma.f64(double %b, double %c, double %a)
53 store double %0, double* %d, align 8
54 %1 = tail call double @llvm.fma.f64(double %b, double %e, double %a)
55 %2 = tail call double @llvm.fma.f64(double %b, double %c, double %1)
56 %arrayidx1 = getelementptr inbounds double* %d, i64 3
57 store double %2, double* %arrayidx1, align 8
58 %3 = tail call double @llvm.fma.f64(double %b, double %f, double %a)
59 %arrayidx2 = getelementptr inbounds double* %d, i64 2
60 store double %3, double* %arrayidx2, align 8
61 %arrayidx3 = getelementptr inbounds double* %d, i64 1
62 store double %1, double* %arrayidx3, align 8
63 ret void
64
65 ; CHECK-LABEL: @test3
66 ; CHECK-DAG: xxlor [[F1:[0-9]+]], 1, 1
67 ; CHECK-DAG: li [[C1:[0-9]+]], 24
68 ; CHECK-DAG: li [[C2:[0-9]+]], 16
69 ; CHECK-DAG: li [[C3:[0-9]+]], 8
70 ; CHECK-DAG: xsmaddmdp 4, 2, 1
71 ; CHECK-DAG: xsmaddadp 1, 2, 5
72
73 ; Note: We could convert this next FMA to M-type as well, but it would require
74 ; re-ordering the instructions.
75 ; CHECK-DAG: xsmaddadp [[F1]], 2, 3
76
77 ; CHECK-DAG: xsmaddmdp 2, 3, 4
78 ; CHECK-DAG: stxsdx [[F1]], 0, 8
79 ; CHECK-DAG: stxsdx 2, 8, [[C1]]
80 ; CHECK-DAG: stxsdx 1, 8, [[C2]]
81 ; CHECK-DAG: stxsdx 4, 8, [[C3]]
82 ; CHECK-DAG: blr
83 }
84
; test4: same four FMAs as test3, but the result %1 of the second FMA is
; stored before it is used as the addend of the third FMA. The CHECK lines
; expect a register copy (xxlor), one M-type (xsmaddmdp) instruction and three
; A-type (xsmaddadp) instructions.
85 define void @test4(double %a, double %b, double %c, double %e, double %f, double* nocapture %d) #0 {
86 entry:
87 %0 = tail call double @llvm.fma.f64(double %b, double %c, double %a)
88 store double %0, double* %d, align 8
89 %1 = tail call double @llvm.fma.f64(double %b, double %e, double %a)
90 %arrayidx1 = getelementptr inbounds double* %d, i64 1
91 store double %1, double* %arrayidx1, align 8
92 %2 = tail call double @llvm.fma.f64(double %b, double %c, double %1)
93 %arrayidx3 = getelementptr inbounds double* %d, i64 3
94 store double %2, double* %arrayidx3, align 8
95 %3 = tail call double @llvm.fma.f64(double %b, double %f, double %a)
96 %arrayidx4 = getelementptr inbounds double* %d, i64 2
97 store double %3, double* %arrayidx4, align 8
98 ret void
99
100 ; CHECK-LABEL: @test4
101 ; CHECK-DAG: xxlor [[F1:[0-9]+]], 1, 1
102 ; CHECK-DAG: li [[C1:[0-9]+]], 8
103 ; CHECK-DAG: li [[C2:[0-9]+]], 16
104 ; CHECK-DAG: xsmaddmdp 4, 2, 1
105
106 ; Note: We could convert this next FMA to M-type as well, but it would require
107 ; re-ordering the instructions.
108 ; CHECK-DAG: xsmaddadp 1, 2, 5
109
110 ; CHECK-DAG: xsmaddadp [[F1]], 2, 3
111 ; CHECK-DAG: stxsdx [[F1]], 0, 8
112 ; CHECK-DAG: stxsdx 4, 8, [[C1]]
113 ; CHECK-DAG: li [[C3:[0-9]+]], 24
114 ; CHECK-DAG: xsmaddadp 4, 2, 3
115 ; CHECK-DAG: stxsdx 4, 8, [[C3]]
116 ; CHECK-DAG: stxsdx 1, 8, [[C2]]
117 ; CHECK: blr
118 }
119
120 declare double @llvm.fma.f64(double, double, double) #0
121
122 attributes #0 = { nounwind readnone }
123