llvm.org GIT mirror llvm / 9e24ab7
[PowerPC] Add an MI SSA peephole pass. This patch adds a pass for doing PowerPC peephole optimizations at the MI level while the code is still in SSA form. This allows for easy modifications to the instructions while depending on a subsequent pass of DCE. Both passes are very fast due to the characteristics of SSA. At this time, the only peepholes added are for cleaning up various redundancies involving the XXPERMDI instruction. However, I would expect this will be a useful place to add more peepholes for inefficiencies generated during instruction selection. The pass is placed after VSX swap optimization, as it is best to let that pass remove unnecessary swaps before performing any remaining clean-ups. The utility of these clean-ups is demonstrated by changes to four existing test cases, all of which now have tighter expected code generation. I've also added Eric Schweitz's bugpoint-reduced test from PR25157, for which we now generate tight code. One other test started failing for me, and I've fixed it (test/Transforms/PlaceSafepoints/finite-loops.ll) as well; this is not related to my changes, and I'm not sure why it works before and not after. The problem is that the CHECK-NOT: of "statepoint" from test1 fails because of the "statepoint" in test2, and so forth. Adding a CHECK-LABEL in between keeps the different occurrences of that string properly scoped. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@252651 91177308-0d34-0410-b5e6-96231b3b80d8 Bill Schmidt 3 years ago
11 changed file(s) with 322 addition(s) and 28 deletion(s). Raw diff Collapse all Expand all
4040 FunctionPass *createPPCVSXCopyPass();
4141 FunctionPass *createPPCVSXFMAMutatePass();
4242 FunctionPass *createPPCVSXSwapRemovalPass();
43 FunctionPass *createPPCMIPeepholePass();
4344 FunctionPass *createPPCBranchSelectionPass();
4445 FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
4546 FunctionPass *createPPCTLSDynamicCallPass();
0 //===-------------- PPCMIPeephole.cpp - MI Peephole Cleanups -------------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===---------------------------------------------------------------------===//
8 //
9 // This pass performs peephole optimizations to clean up ugly code
10 // sequences at the MachineInstruction layer. It runs at the end of
11 // the SSA phases, following VSX swap removal. A pass of dead code
12 // elimination follows this one for quick clean-up of any dead
13 // instructions introduced here. Although we could do this as callbacks
14 // from the generic peephole pass, this would have a couple of bad
15 // effects: it might remove optimization opportunities for VSX swap
16 // removal, and it would miss cleanups made possible following VSX
17 // swap removal.
18 //
19 //===---------------------------------------------------------------------===//
20
21 #include "PPCInstrInfo.h"
22 #include "PPC.h"
23 #include "PPCInstrBuilder.h"
24 #include "PPCTargetMachine.h"
25 #include "llvm/CodeGen/MachineFunctionPass.h"
26 #include "llvm/CodeGen/MachineInstrBuilder.h"
27 #include "llvm/CodeGen/MachineRegisterInfo.h"
28 #include "llvm/Support/Debug.h"
29
30 using namespace llvm;
31
32 #define DEBUG_TYPE "ppc-mi-peepholes"
33
34 namespace llvm {
35 void initializePPCMIPeepholePass(PassRegistry&);
36 }
37
38 namespace {
39
40 struct PPCMIPeephole : public MachineFunctionPass {
41
42 static char ID;
43 const PPCInstrInfo *TII;
44 MachineFunction *MF;
45 MachineRegisterInfo *MRI;
46
47 PPCMIPeephole() : MachineFunctionPass(ID) {
48 initializePPCMIPeepholePass(*PassRegistry::getPassRegistry());
49 }
50
51 private:
52 // Initialize class variables.
53 void initialize(MachineFunction &MFParm);
54
55 // Perform peepholes.
56 bool simplifyCode(void);
57
58 // Find the "true" register represented by SrcReg (following chains
59 // of copies and subreg_to_reg operations).
60 unsigned lookThruCopyLike(unsigned SrcReg);
61
62 public:
63 // Main entry point for this pass.
64 bool runOnMachineFunction(MachineFunction &MF) override {
65 initialize(MF);
66 return simplifyCode();
67 }
68 };
69
70 // Initialize class variables.
71 void PPCMIPeephole::initialize(MachineFunction &MFParm) {
72 MF = &MFParm;
73 MRI = &MF->getRegInfo();
74 TII = MF->getSubtarget().getInstrInfo();
75 DEBUG(dbgs() << "*** PowerPC MI peephole pass ***\n\n");
76 DEBUG(MF->dump());
77 }
78
79 // Perform peephole optimizations.
80 bool PPCMIPeephole::simplifyCode(void) {
81 bool Simplified = false;
82 MachineInstr* ToErase = nullptr;
83
84 for (MachineBasicBlock &MBB : *MF) {
85 for (MachineInstr &MI : MBB) {
86
87 // If the previous instruction was marked for elimination,
88 // remove it now.
89 if (ToErase) {
90 ToErase->eraseFromParent();
91 ToErase = nullptr;
92 }
93
94 // Ignore debug instructions.
95 if (MI.isDebugValue())
96 continue;
97
98 // Per-opcode peepholes.
99 switch (MI.getOpcode()) {
100
101 default:
102 break;
103
104 case PPC::XXPERMDI: {
105 // Perform simplifications of 2x64 vector swaps and splats.
106 // A swap is identified by an immediate value of 2, and a splat
107 // is identified by an immediate value of 0 or 3.
108 int Immed = MI.getOperand(3).getImm();
109
110 if (Immed != 1) {
111
112 // For each of these simplifications, we need the two source
113 // regs to match. Unfortunately, MachineCSE ignores COPY and
114 // SUBREG_TO_REG, so for example we can see
115 // XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), immed.
116 // We have to look through chains of COPY and SUBREG_TO_REG
117 // to find the real source values for comparison.
118 unsigned TrueReg1 = lookThruCopyLike(MI.getOperand(1).getReg());
119 unsigned TrueReg2 = lookThruCopyLike(MI.getOperand(2).getReg());
120
121 if (TrueReg1 == TrueReg2
122 && TargetRegisterInfo::isVirtualRegister(TrueReg1)) {
123 MachineInstr *DefMI = MRI->getVRegDef(TrueReg1);
124
125 // If this is a splat or a swap fed by another splat, we
126 // can replace it with a copy.
127 if (DefMI && DefMI->getOpcode() == PPC::XXPERMDI) {
128 unsigned FeedImmed = DefMI->getOperand(3).getImm();
129 unsigned FeedReg1
130 = lookThruCopyLike(DefMI->getOperand(1).getReg());
131 unsigned FeedReg2
132 = lookThruCopyLike(DefMI->getOperand(2).getReg());
133
134 if ((FeedImmed == 0 || FeedImmed == 3) && FeedReg1 == FeedReg2) {
135 DEBUG(dbgs()
136 << "Optimizing splat/swap or splat/splat "
137 "to splat/copy: ");
138 DEBUG(MI.dump());
139 BuildMI(MBB, &MI, MI.getDebugLoc(),
140 TII->get(PPC::COPY), MI.getOperand(0).getReg())
141 .addOperand(MI.getOperand(1));
142 ToErase = &MI;
143 Simplified = true;
144 }
145
146 // If this is a splat fed by a swap, we can simplify modify
147 // the splat to splat the other value from the swap's input
148 // parameter.
149 else if ((Immed == 0 || Immed == 3)
150 && FeedImmed == 2 && FeedReg1 == FeedReg2) {
151 DEBUG(dbgs() << "Optimizing swap/splat => splat: ");
152 DEBUG(MI.dump());
153 MI.getOperand(1).setReg(DefMI->getOperand(1).getReg());
154 MI.getOperand(2).setReg(DefMI->getOperand(2).getReg());
155 MI.getOperand(3).setImm(3 - Immed);
156 Simplified = true;
157 }
158
159 // If this is a swap fed by a swap, we can replace it
160 // with a copy from the first swap's input.
161 else if (Immed == 2 && FeedImmed == 2 && FeedReg1 == FeedReg2) {
162 DEBUG(dbgs() << "Optimizing swap/swap => copy: ");
163 DEBUG(MI.dump());
164 BuildMI(MBB, &MI, MI.getDebugLoc(),
165 TII->get(PPC::COPY), MI.getOperand(0).getReg())
166 .addOperand(DefMI->getOperand(1));
167 ToErase = &MI;
168 Simplified = true;
169 }
170 }
171 }
172 }
173 break;
174 }
175 }
176 }
177
178 // If the last instruction was marked for elimination,
179 // remove it now.
180 if (ToErase) {
181 ToErase->eraseFromParent();
182 ToErase = nullptr;
183 }
184 }
185
186 return Simplified;
187 }
188
189 // This is used to find the "true" source register for an
190 // XXPERMDI instruction, since MachineCSE does not handle the
191 // "copy-like" operations (Copy and SubregToReg). Returns
192 // the original SrcReg unless it is the target of a copy-like
193 // operation, in which case we chain backwards through all
194 // such operations to the ultimate source register. If a
195 // physical register is encountered, we stop the search.
196 unsigned PPCMIPeephole::lookThruCopyLike(unsigned SrcReg) {
197
198 while (true) {
199
200 MachineInstr *MI = MRI->getVRegDef(SrcReg);
201 if (!MI->isCopyLike())
202 return SrcReg;
203
204 unsigned CopySrcReg;
205 if (MI->isCopy())
206 CopySrcReg = MI->getOperand(1).getReg();
207 else {
208 assert(MI->isSubregToReg() && "bad opcode for lookThruCopyLike");
209 CopySrcReg = MI->getOperand(2).getReg();
210 }
211
212 if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg))
213 return CopySrcReg;
214
215 SrcReg = CopySrcReg;
216 }
217 }
218
219 } // end default namespace
220
221 INITIALIZE_PASS_BEGIN(PPCMIPeephole, DEBUG_TYPE,
222 "PowerPC MI Peephole Optimization", false, false)
223 INITIALIZE_PASS_END(PPCMIPeephole, DEBUG_TYPE,
224 "PowerPC MI Peephole Optimization", false, false)
225
226 char PPCMIPeephole::ID = 0;
227 FunctionPass*
228 llvm::createPPCMIPeepholePass() { return new PPCMIPeephole(); }
229
4040 static cl::
4141 opt DisableVSXSwapRemoval("disable-ppc-vsx-swap-removal", cl::Hidden,
4242 cl::desc("Disable VSX Swap Removal for PPC"));
43
44 static cl::
45 opt DisableMIPeephole("disable-ppc-peephole", cl::Hidden,
46 cl::desc("Disable machine peepholes for PPC"));
4347
4448 static cl::opt
4549 EnableGEPOpt("ppc-gep-opt", cl::Hidden,
347351 if (TM->getTargetTriple().getArch() == Triple::ppc64le &&
348352 !DisableVSXSwapRemoval)
349353 addPass(createPPCVSXSwapRemovalPass());
354 // Target-specific peephole cleanups performed after instruction
355 // selection.
356 if (!DisableMIPeephole) {
357 addPass(createPPCMIPeepholePass());
358 addPass(&DeadMachineInstructionElimID);
359 }
350360 }
351361
352362 void PPCPassConfig::addPreRegAlloc() {
6262 ret <2 x i64> %splat.splat
6363 ; CHECK: mtvsrd {{[0-9]+}}, 3
6464 ; CHECK-LE: mtvsrd [[REG1:[0-9]+]], 3
65 ; CHECK-LE: xxswapd {{[0-9]+}}, [[REG1]]
65 ; CHECK-LE: xxspltd [[REG1]], [[REG1]], 0
6666 }
6767
6868 ; Function Attrs: nounwind
0 ; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
1
2 ; Verify peephole simplification of splats and swaps. Bugpoint-reduced
3 ; test from Eric Schweitz.
4
5 %struct.BSS38.51.4488.9911.14348.16813.20264.24701.28152.31603.35054.39491.44914.45407.46393.46886.47872.49351.49844.50830.51323.52309.53295.53788.54281.55267.55760.59211.61625 = type <{ [28 x i8] }>
6 %struct_main1_2_.491.4928.10351.14788.17253.20704.25141.28592.32043.35494.39931.45354.45847.46833.47326.48312.49791.50284.51270.51763.52749.53735.54228.54721.55707.56200.59651.61626 = type <{ [64 x i8] }>
7
8 @.BSS38 = external global %struct.BSS38.51.4488.9911.14348.16813.20264.24701.28152.31603.35054.39491.44914.45407.46393.46886.47872.49351.49844.50830.51323.52309.53295.53788.54281.55267.55760.59211.61625, align 32
9 @_main1_2_ = external global %struct_main1_2_.491.4928.10351.14788.17253.20704.25141.28592.32043.35494.39931.45354.45847.46833.47326.48312.49791.50284.51270.51763.52749.53735.54228.54721.55707.56200.59651.61626, section ".comm", align 16
10
11 define void @aercalc_() {
12 L.entry:
13 br i1 undef, label %L.LB38_2426, label %L.LB38_2911
14
15 L.LB38_2911:
16 br i1 undef, label %L.LB38_2140, label %L.LB38_2640
17
18 L.LB38_2640:
19 unreachable
20
21 L.LB38_2426:
22 br i1 undef, label %L.LB38_2438, label %L.LB38_2920
23
24 L.LB38_2920:
25 br i1 undef, label %L.LB38_2438, label %L.LB38_2921
26
27 L.LB38_2921:
28 br label %L.LB38_2140
29
30 L.LB38_2140:
31 ret void
32
33 L.LB38_2438:
34 br i1 undef, label %L.LB38_2451, label %L.LB38_2935
35
36 L.LB38_2935:
37 br i1 undef, label %L.LB38_2451, label %L.LB38_2936
38
39 L.LB38_2936:
40 unreachable
41
42 L.LB38_2451:
43 br i1 undef, label %L.LB38_2452, label %L.LB38_2937
44
45 L.LB38_2937:
46 unreachable
47
48 L.LB38_2452:
49 %0 = load float, float* bitcast (i8* getelementptr inbounds (%struct.BSS38.51.4488.9911.14348.16813.20264.24701.28152.31603.35054.39491.44914.45407.46393.46886.47872.49351.49844.50830.51323.52309.53295.53788.54281.55267.55760.59211.61625, %struct.BSS38.51.4488.9911.14348.16813.20264.24701.28152.31603.35054.39491.44914.45407.46393.46886.47872.49351.49844.50830.51323.52309.53295.53788.54281.55267.55760.59211.61625* @.BSS38, i64 0, i32 0, i64 16) to float*), align 16
50 %1 = fpext float %0 to double
51 %2 = insertelement <2 x double> undef, double %1, i32 1
52 store <2 x double> %2, <2 x double>* bitcast (i8* getelementptr inbounds (%struct_main1_2_.491.4928.10351.14788.17253.20704.25141.28592.32043.35494.39931.45354.45847.46833.47326.48312.49791.50284.51270.51763.52749.53735.54228.54721.55707.56200.59651.61626, %struct_main1_2_.491.4928.10351.14788.17253.20704.25141.28592.32043.35494.39931.45354.45847.46833.47326.48312.49791.50284.51270.51763.52749.53735.54228.54721.55707.56200.59651.61626* @_main1_2_, i64 0, i32 0, i64 32) to <2 x double>*), align 16
53 unreachable
54 }
55
56 ; CHECK-LABEL: @aercalc_
57 ; CHECK: lxsspx
58 ; CHECK: xxspltd
59 ; CHECK: stxvd2x
60 ; CHECK-NOT: xxswapd
1414 }
1515
1616 ; CHECK-LABEL: @bar0
17 ; CHECK-DAG: xxswapd {{[0-9]+}}, 1
1817 ; CHECK-DAG: lxvd2x [[REG1:[0-9]+]]
1918 ; CHECK-DAG: xxspltd [[REG2:[0-9]+]]
2019 ; CHECK: xxpermdi [[REG3:[0-9]+]], [[REG2]], [[REG1]], 1
2120 ; CHECK: stxvd2x [[REG3]]
21 ; CHECK-NOT: xxswapd
2222
2323 define void @bar1(double %y) {
2424 entry:
2929 }
3030
3131 ; CHECK-LABEL: @bar1
32 ; CHECK-DAG: xxswapd {{[0-9]+}}, 1
3332 ; CHECK-DAG: lxvd2x [[REG1:[0-9]+]]
3433 ; CHECK-DAG: xxspltd [[REG2:[0-9]+]]
3534 ; CHECK: xxmrghd [[REG3:[0-9]+]], [[REG1]], [[REG2]]
3635 ; CHECK: stxvd2x [[REG3]]
36 ; CHECK-NOT: xxswapd
3737
3838 define void @baz0() {
3939 entry:
1919 ; CHECK-LABEL: @bar0
2020 ; CHECK-DAG: lxvd2x [[REG1:[0-9]+]]
2121 ; CHECK-DAG: lxsdx [[REG2:[0-9]+]]
22 ; CHECK: xxswapd [[REG3:[0-9]+]], [[REG2]]
23 ; CHECK: xxspltd [[REG4:[0-9]+]], [[REG3]], 1
22 ; CHECK: xxspltd [[REG4:[0-9]+]], [[REG2]], 0
2423 ; CHECK: xxpermdi [[REG5:[0-9]+]], [[REG4]], [[REG1]], 1
2524 ; CHECK: stxvd2x [[REG5]]
2625
3635 ; CHECK-LABEL: @bar1
3736 ; CHECK-DAG: lxvd2x [[REG1:[0-9]+]]
3837 ; CHECK-DAG: lxsdx [[REG2:[0-9]+]]
39 ; CHECK: xxswapd [[REG3:[0-9]+]], [[REG2]]
40 ; CHECK: xxspltd [[REG4:[0-9]+]], [[REG3]], 1
38 ; CHECK: xxspltd [[REG4:[0-9]+]], [[REG2]], 0
4139 ; CHECK: xxmrghd [[REG5:[0-9]+]], [[REG1]], [[REG4]]
4240 ; CHECK: stxvd2x [[REG5]]
4341
12271227 ; CHECK-LE-LABEL: @test80
12281228 ; CHECK-LE-DAG: mtvsrd [[R1:[0-9]+]], 3
12291229 ; CHECK-LE-DAG: addi [[R2:[0-9]+]], {{[0-9]+}}, .LCPI
1230 ; CHECK-LE-DAG: xxswapd [[V1:[0-9]+]], [[R1]]
12311230 ; CHECK-LE-DAG: lxvd2x [[V2:[0-9]+]], 0, [[R2]]
1232 ; CHECK-LE-DAG: xxspltd 34, [[V1]]
1231 ; CHECK-LE-DAG: xxspltd 34, [[R1]]
12331232 ; CHECK-LE-DAG: xxswapd 35, [[V2]]
12341233 ; CHECK-LE: vaddudm 2, 2, 3
12351234 ; CHECK-LE: blr
3232 %r = extractelement <2 x double> %v, i32 0
3333 ret double %r
3434
35 ; FIXME: Swap optimization will collapse this into lxvd2x 1, 0, 3.
36
3735 ; CHECK-LABEL: teste0
38 ; CHECK: lxvd2x 0, 0, 3
39 ; CHECK: xxswapd 0, 0
40 ; CHECK: xxswapd 1, 0
36 ; CHECK: lxvd2x 1, 0, 3
4137 }
4238
4339 define double @teste1(<2 x double>* %p1) {
77
88 ; CHECK-LABEL: test00
99 ; CHECK: lxvd2x 0, 0, 3
10 ; CHECK: xxswapd 0, 0
11 ; CHECK: xxspltd 34, 0, 1
10 ; CHECK: xxspltd 34, 0, 0
1211 }
1312
1413 define <2 x double> @test01(<2 x double>* %p1, <2 x double>* %p2) {
5756 ret <2 x double> %v3
5857
5958 ; CHECK-LABEL: @test10
60 ; CHECK: lxvd2x 0, 0, 3
61 ; CHECK: xxswapd 0, 0
62 ; CHECK: xxswapd 34, 0
59 ; CHECK: lxvd2x 34, 0, 3
6360 }
6461
6562 define <2 x double> @test11(<2 x double>* %p1, <2 x double>* %p2) {
7067
7168 ; CHECK-LABEL: @test11
7269 ; CHECK: lxvd2x 0, 0, 3
73 ; CHECK: xxswapd 0, 0
74 ; CHECK: xxspltd 34, 0, 0
70 ; CHECK: xxspltd 34, 0, 1
7571 }
7672
7773 define <2 x double> @test12(<2 x double>* %p1, <2 x double>* %p2) {
138134
139135 ; CHECK-LABEL: @test22
140136 ; CHECK: lxvd2x 0, 0, 4
141 ; CHECK: xxswapd 0, 0
142 ; CHECK: xxspltd 34, 0, 1
137 ; CHECK: xxspltd 34, 0, 0
143138 }
144139
145140 define <2 x double> @test23(<2 x double>* %p1, <2 x double>* %p2) {
188183 ret <2 x double> %v3
189184
190185 ; CHECK-LABEL: @test32
191 ; CHECK: lxvd2x 0, 0, 4
192 ; CHECK: xxswapd 0, 0
193 ; CHECK: xxswapd 34, 0
186 ; CHECK: lxvd2x 34, 0, 4
194187 }
195188
196189 define <2 x double> @test33(<2 x double>* %p1, <2 x double>* %p2) {
201194
202195 ; CHECK-LABEL: @test33
203196 ; CHECK: lxvd2x 0, 0, 4
204 ; CHECK: xxswapd 0, 0
205 ; CHECK: xxspltd 34, 0, 0
197 ; CHECK: xxspltd 34, 0, 1
206198 }
1010 ; CHECK: statepoint
1111 ; CHECK-LABEL: loop
1212 ; CHECK-NOT: statepoint
13 ; CHECK-LABEL: exit
1314
1415 entry:
1516 br label %loop
3132 ; CHECK: statepoint
3233 ; CHECK-LABEL: loop
3334 ; CHECK-NOT: statepoint
35 ; CHECK-LABEL: exit
3436
3537 entry:
3638 br label %loop
5557 ; CHECK: statepoint
5658 ; CHECK-LABEL: loop
5759 ; CHECK-NOT: statepoint
60 ; CHECK-LABEL: exit
5861
5962 entry:
6063 br label %loop
7679 ; CHECK: statepoint
7780 ; CHECK-LABEL: loop
7881 ; CHECK: statepoint
82 ; CHECK-LABEL: exit
7983
8084 ; COUNTED-64-LABEL: test4
8185 ; COUNTED-64-LABEL: entry
8286 ; COUNTED-64: statepoint
8387 ; COUNTED-64-LABEL: loop
8488 ; COUNTED-64-NOT: statepoint
89 ; COUNTED-64-LABEL: exit
8590
8691 entry:
8792 br label %loop
104109 ; CHECK: statepoint
105110 ; CHECK-LABEL: loop
106111 ; CHECK: statepoint
112 ; CHECK-LABEL: exit
107113
108114 ; COUNTED-64-LABEL: test5
109115 ; COUNTED-64-LABEL: entry
110116 ; COUNTED-64: statepoint
111117 ; COUNTED-64-LABEL: loop
112118 ; COUNTED-64: statepoint
119 ; COUNTED-64-LABEL: exit
113120
114121 entry:
115122 br label %loop