commit a18156c
LEA code size optimization pass (Part 1): Remove redundant address recalculations, by Andrey Turetsky. Committed by Alexey Bataev.

Add a new x86 pass which replaces the address calculation in a load or store instruction with the def register of an existing LEA (which must be in the same basic block), if the LEA calculates an address that differs only by a displacement. Runs only with -Os or -Oz.

Differential Revision: http://reviews.llvm.org/D13294
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@254712 91177308-0d34-0410-b5e6-96231b3b80d8
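For illustration, a minimal hand-written sketch of the transformation the pass performs (registers and displacements are invented for this example, not taken from the commit):

    # Before: the load redoes the address arithmetic the LEA already performed.
    leaq    16(%rdi,%rsi,4), %rax    # %rax = %rdi + 4*%rsi + 16
    movl    20(%rdi,%rsi,4), %ecx    # same base/index/scale; displacement differs by 4

    # After: the load reuses the LEA's def register, leaving only a small
    # displacement (4 = 20 - 16), which encodes in a single byte.
    leaq    16(%rdi,%rsi,4), %rax
    movl    4(%rax), %ecx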
5 changed files with 463 additions and 0 deletions.
lib/Target/X86/CMakeLists.txt:

   X86VZeroUpper.cpp
   X86FixupLEAs.cpp
   X86WinEHState.cpp
+  X86OptimizeLEAs.cpp
   )

 add_llvm_target(X86CodeGen ${sources})
lib/Target/X86/X86.h:

 /// to eliminate execution delays in some Atom processors.
 FunctionPass *createX86FixupLEAs();

+/// createX86OptimizeLEAs() - Return a pass that removes redundant
+/// address recalculations.
+FunctionPass *createX86OptimizeLEAs();
+
 /// createX86CallFrameOptimization - Return a pass that optimizes
 /// the code-size of x86 call sequences. This is done by replacing
 /// esp-relative movs with pushes.
lib/Target/X86/X86OptimizeLEAs.cpp (new file):

//===-- X86OptimizeLEAs.cpp - optimize usage of LEA instructions ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the pass that performs some optimizations with LEA
// instructions in order to improve code size.
// Currently, it does one thing:
// 1) Address calculations in load and store instructions are replaced by
//    existing LEA def registers where possible.
//
//===----------------------------------------------------------------------===//

#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"

using namespace llvm;

#define DEBUG_TYPE "x86-optimize-LEAs"

STATISTIC(NumSubstLEAs, "Number of LEA instruction substitutions");

namespace {
class OptimizeLEAPass : public MachineFunctionPass {
public:
  OptimizeLEAPass() : MachineFunctionPass(ID) {}

  const char *getPassName() const override { return "X86 LEA Optimize"; }

  /// \brief Loop over all of the basic blocks, replacing address
  /// calculations in load and store instructions if the address has
  /// already been calculated by an LEA. Also, remove redundant LEAs.
  bool runOnMachineFunction(MachineFunction &MF) override;

private:
  /// \brief Returns the distance between two instructions inside one basic
  /// block. A negative result means the instructions occur in reverse order.
  int calcInstrDist(const MachineInstr &First, const MachineInstr &Last);

  /// \brief Choose the best \p LEA instruction from the \p List to replace
  /// the address calculation in the \p MI instruction. Return the address
  /// displacement and the distance between \p MI and the chosen \p LEA in
  /// \p AddrDispShift and \p Dist.
  bool chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List,
                     const MachineInstr &MI, MachineInstr *&LEA,
                     int64_t &AddrDispShift, int &Dist);

  /// \brief Returns true if two machine operands are identical and they are
  /// not physical registers.
  bool isIdenticalOp(const MachineOperand &MO1, const MachineOperand &MO2);

  /// \brief Returns true if the instruction is an LEA.
  bool isLEA(const MachineInstr &MI);

  /// \brief Returns true if two instructions have memory operands that only
  /// differ by displacement. The numbers of the first memory operands for
  /// both instructions are specified through \p N1 and \p N2. The address
  /// displacement is returned through \p AddrDispShift.
  bool isSimilarMemOp(const MachineInstr &MI1, unsigned N1,
                      const MachineInstr &MI2, unsigned N2,
                      int64_t &AddrDispShift);

  /// \brief Find all LEA instructions in the basic block.
  void findLEAs(const MachineBasicBlock &MBB,
                SmallVectorImpl<MachineInstr *> &List);

  /// \brief Removes redundant address calculations.
  bool removeRedundantAddrCalc(const SmallVectorImpl<MachineInstr *> &List);

  MachineRegisterInfo *MRI;
  const X86InstrInfo *TII;
  const X86RegisterInfo *TRI;

  static char ID;
};
char OptimizeLEAPass::ID = 0;
}

FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); }

int OptimizeLEAPass::calcInstrDist(const MachineInstr &First,
                                   const MachineInstr &Last) {
  const MachineBasicBlock *MBB = First.getParent();

  // Both instructions must be in the same basic block.
  assert(Last.getParent() == MBB &&
         "Instructions are in different basic blocks");

  return std::distance(MBB->begin(), MachineBasicBlock::const_iterator(&Last)) -
         std::distance(MBB->begin(), MachineBasicBlock::const_iterator(&First));
}

// Find the best LEA instruction in the List to replace address recalculation
// in MI. Such an LEA must meet these requirements:
// 1) The address calculated by the LEA differs only by the displacement from
//    the address used in MI.
// 2) The register class of the definition of the LEA is compatible with the
//    register class of the address base register of MI.
// 3) Displacement of the new memory operand should fit in 1 byte if possible.
// 4) The LEA should be as close to MI as possible, and prior to it if
//    possible.
bool OptimizeLEAPass::chooseBestLEA(const SmallVectorImpl<MachineInstr *> &List,
                                    const MachineInstr &MI, MachineInstr *&LEA,
                                    int64_t &AddrDispShift, int &Dist) {
  const MachineFunction *MF = MI.getParent()->getParent();
  const MCInstrDesc &Desc = MI.getDesc();
  int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, MI.getOpcode()) +
                X86II::getOperandBias(Desc);

  LEA = nullptr;

  // Loop over all LEA instructions.
  for (auto DefMI : List) {
    int64_t AddrDispShiftTemp = 0;

    // Compare the instructions' memory operands.
    if (!isSimilarMemOp(MI, MemOpNo, *DefMI, 1, AddrDispShiftTemp))
      continue;

    // Make sure the address displacement fits in 4 bytes.
    if (!isInt<32>(AddrDispShiftTemp))
      continue;

    // Check that the LEA def register can be used as the MI address base. Some
    // instructions can use a limited set of registers as address base, for
    // example MOV8mr_NOREX. We could constrain the register class of the LEA
    // def to suit MI, however since this case is very rare and hard to
    // reproduce in a test it's just more reliable to skip the LEA.
    if (TII->getRegClass(Desc, MemOpNo + X86::AddrBaseReg, TRI, *MF) !=
        MRI->getRegClass(DefMI->getOperand(0).getReg()))
      continue;

    // Choose the closest LEA instruction from the list, prior to MI if
    // possible. Note that we take the resulting address displacement into
    // account as well. Also note that the list is sorted by the order in
    // which the LEAs occur, so the break condition is pretty simple.
    int DistTemp = calcInstrDist(*DefMI, MI);
    assert(DistTemp != 0 &&
           "The distance between two different instructions cannot be zero");
    if (DistTemp > 0 || LEA == nullptr) {
      // Do not update the returned LEA if the current one provides a
      // displacement which fits in 1 byte, while the new candidate does not.
      if (LEA != nullptr && !isInt<8>(AddrDispShiftTemp) &&
          isInt<8>(AddrDispShift))
        continue;

      LEA = DefMI;
      AddrDispShift = AddrDispShiftTemp;
      Dist = DistTemp;
    }

    // FIXME: Maybe we should not always stop at the first LEA after MI.
    if (DistTemp < 0)
      break;
  }

  return LEA != nullptr;
}
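A hand-written sketch of how requirements 3) and 4) interact (operands are invented for illustration; test3 in the new test file below exercises the same situation):

    leaq    arr+132(%rax), %rbx    # candidate A, defined first
    leaq    arr(%rax), %rcx        # candidate B, defined closer to the load
    movl    arr+132(%rax), %edx    # MI, the load being rewritten
    # Rewriting via B would need a displacement of 132, which does not fit in
    # 1 byte; via A the displacement is 0. The pass therefore keeps A despite
    # B being closer, producing: movl (%rbx), %edx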
bool OptimizeLEAPass::isIdenticalOp(const MachineOperand &MO1,
                                    const MachineOperand &MO2) {
  return MO1.isIdenticalTo(MO2) &&
         (!MO1.isReg() ||
          !TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
}

bool OptimizeLEAPass::isLEA(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == X86::LEA16r || Opcode == X86::LEA32r ||
         Opcode == X86::LEA64r || Opcode == X86::LEA64_32r;
}

// Check if MI1 and MI2 have memory operands which represent addresses that
// differ only by displacement.
bool OptimizeLEAPass::isSimilarMemOp(const MachineInstr &MI1, unsigned N1,
                                     const MachineInstr &MI2, unsigned N2,
                                     int64_t &AddrDispShift) {
  // Address base, scale, index and segment operands must be identical.
  static const int IdenticalOpNums[] = {X86::AddrBaseReg, X86::AddrScaleAmt,
                                        X86::AddrIndexReg, X86::AddrSegmentReg};
  for (auto &N : IdenticalOpNums)
    if (!isIdenticalOp(MI1.getOperand(N1 + N), MI2.getOperand(N2 + N)))
      return false;

  // Address displacement operands may differ by a constant.
  const MachineOperand *Op1 = &MI1.getOperand(N1 + X86::AddrDisp);
  const MachineOperand *Op2 = &MI2.getOperand(N2 + X86::AddrDisp);
  if (!isIdenticalOp(*Op1, *Op2)) {
    if (Op1->isImm() && Op2->isImm())
      AddrDispShift = Op1->getImm() - Op2->getImm();
    else if (Op1->isGlobal() && Op2->isGlobal() &&
             Op1->getGlobal() == Op2->getGlobal())
      AddrDispShift = Op1->getOffset() - Op2->getOffset();
    else
      return false;
  }

  return true;
}
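A hand-written sketch of what isSimilarMemOp accepts (operands are invented for illustration); the global/global branch applies when both displacements reference the same symbol with different offsets:

    # Similar: base, scale, index and segment match; both displacements use
    # the symbol arr, so AddrDispShift = 8 - 4 = 4.
    leaq    arr+4(,%rax,4), %rbx
    movl    arr+8(,%rax,4), %ecx    # rewritable as: movl 4(%rbx), %ecx

    # Not similar: the index registers differ, so no constant displacement
    # can make the addresses match.
    movl    arr+8(,%rdx,4), %ecx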
void OptimizeLEAPass::findLEAs(const MachineBasicBlock &MBB,
                               SmallVectorImpl<MachineInstr *> &List) {
  for (auto &MI : MBB) {
    if (isLEA(MI))
      List.push_back(const_cast<MachineInstr *>(&MI));
  }
}

// Try to find load and store instructions which recalculate addresses already
// calculated by some LEA and replace their memory operands with its def
// register.
bool OptimizeLEAPass::removeRedundantAddrCalc(
    const SmallVectorImpl<MachineInstr *> &List) {
  bool Changed = false;

  assert(List.size() > 0);
  MachineBasicBlock *MBB = List[0]->getParent();

  // Process all instructions in basic block.
  for (auto I = MBB->begin(), E = MBB->end(); I != E;) {
    MachineInstr &MI = *I++;
    unsigned Opcode = MI.getOpcode();

    // Instruction must be load or store.
    if (!MI.mayLoadOrStore())
      continue;

    // Get the number of the first memory operand.
    const MCInstrDesc &Desc = MI.getDesc();
    int MemOpNo = X86II::getMemoryOperandNo(Desc.TSFlags, Opcode);

    // If the instruction has no memory operand, skip it.
    if (MemOpNo < 0)
      continue;

    MemOpNo += X86II::getOperandBias(Desc);

    // Get the best LEA instruction to replace address calculation.
    MachineInstr *DefMI;
    int64_t AddrDispShift;
    int Dist;
    if (!chooseBestLEA(List, MI, DefMI, AddrDispShift, Dist))
      continue;

    // If the LEA occurs before the current instruction, we can freely replace
    // the instruction. If the LEA occurs after, we can lift the LEA above the
    // instruction and thereby be able to use it as a replacement. Since the
    // LEA and the instruction have similar memory operands (and thus the same
    // def instructions for these operands), we can always do that without
    // worrying about using registers before their defs.
    if (Dist < 0) {
      DefMI->removeFromParent();
      MBB->insert(MachineBasicBlock::iterator(&MI), DefMI);
    }

    // Since we can possibly extend register lifetime, clear kill flags.
    MRI->clearKillFlags(DefMI->getOperand(0).getReg());

    ++NumSubstLEAs;
    DEBUG(dbgs() << "OptimizeLEAs: Candidate to replace: "; MI.dump(););

    // Change instruction operands.
    MI.getOperand(MemOpNo + X86::AddrBaseReg)
        .ChangeToRegister(DefMI->getOperand(0).getReg(), false);
    MI.getOperand(MemOpNo + X86::AddrScaleAmt).ChangeToImmediate(1);
    MI.getOperand(MemOpNo + X86::AddrIndexReg)
        .ChangeToRegister(X86::NoRegister, false);
    MI.getOperand(MemOpNo + X86::AddrDisp).ChangeToImmediate(AddrDispShift);
    MI.getOperand(MemOpNo + X86::AddrSegmentReg)
        .ChangeToRegister(X86::NoRegister, false);

    DEBUG(dbgs() << "OptimizeLEAs: Replaced by: "; MI.dump(););

    Changed = true;
  }

  return Changed;
}

bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
  bool Changed = false;
  bool OptSize = MF.getFunction()->optForSize();
  bool MinSize = MF.getFunction()->optForMinSize();

  // Perform this optimization only if we care about code size.
  if (!OptSize && !MinSize)
    return false;

  MRI = &MF.getRegInfo();
  TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
  TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();

  // Process all basic blocks.
  for (auto &MBB : MF) {
    SmallVector<MachineInstr *, 16> LEAs;

    // Find all LEA instructions in basic block.
    findLEAs(MBB, LEAs);

    // If the current basic block has no LEAs, move on to the next one.
    if (LEAs.empty())
      continue;

    // Remove redundant address calculations.
    Changed |= removeRedundantAddrCalc(LEAs);
  }

  return Changed;
}
lib/Target/X86/X86TargetMachine.cpp:

 }

 void X86PassConfig::addPreRegAlloc() {
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createX86OptimizeLEAs());
+
   addPass(createX86CallFrameOptimization());
 }
test/CodeGen/X86/lea-opt.ll (new file):

; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s

%struct.anon1 = type { i32, i32, i32 }
%struct.anon2 = type { i32, [32 x i32], i32 }

@arr1 = external global [65 x %struct.anon1], align 16
@arr2 = external global [65 x %struct.anon2], align 16

define void @test1(i64 %x) nounwind {
entry:
  %a = getelementptr inbounds [65 x %struct.anon1], [65 x %struct.anon1]* @arr1, i64 0, i64 %x, i32 0
  %tmp = load i32, i32* %a, align 4
  %b = getelementptr inbounds [65 x %struct.anon1], [65 x %struct.anon1]* @arr1, i64 0, i64 %x, i32 1
  %tmp1 = load i32, i32* %b, align 4
  %sub = sub i32 %tmp, %tmp1
  %c = getelementptr inbounds [65 x %struct.anon1], [65 x %struct.anon1]* @arr1, i64 0, i64 %x, i32 2
  %tmp2 = load i32, i32* %c, align 4
  %add = add nsw i32 %sub, %tmp2
  switch i32 %add, label %sw.epilog [
    i32 1, label %sw.bb.1
    i32 2, label %sw.bb.2
  ]

sw.bb.1:                                          ; preds = %entry
  store i32 111, i32* %b, align 4
  store i32 222, i32* %c, align 4
  br label %sw.epilog

sw.bb.2:                                          ; preds = %entry
  store i32 333, i32* %b, align 4
  store i32 444, i32* %c, align 4
  br label %sw.epilog

sw.epilog:                                        ; preds = %sw.bb.2, %sw.bb.1, %entry
  ret void
; CHECK-LABEL: test1:
; CHECK: leaq (%rdi,%rdi,2), [[REG1:%[a-z]+]]
; CHECK: movl arr1(,[[REG1]],4), {{.*}}
; CHECK: leaq arr1+4(,[[REG1]],4), [[REG2:%[a-z]+]]
; CHECK: subl arr1+4(,[[REG1]],4), {{.*}}
; CHECK: leaq arr1+8(,[[REG1]],4), [[REG3:%[a-z]+]]
; CHECK: addl arr1+8(,[[REG1]],4), {{.*}}
; CHECK: movl ${{[1-4]+}}, ([[REG2]])
; CHECK: movl ${{[1-4]+}}, ([[REG3]])
; CHECK: movl ${{[1-4]+}}, ([[REG2]])
; CHECK: movl ${{[1-4]+}}, ([[REG3]])
}

define void @test2(i64 %x) nounwind optsize {
entry:
  %a = getelementptr inbounds [65 x %struct.anon1], [65 x %struct.anon1]* @arr1, i64 0, i64 %x, i32 0
  %tmp = load i32, i32* %a, align 4
  %b = getelementptr inbounds [65 x %struct.anon1], [65 x %struct.anon1]* @arr1, i64 0, i64 %x, i32 1
  %tmp1 = load i32, i32* %b, align 4
  %sub = sub i32 %tmp, %tmp1
  %c = getelementptr inbounds [65 x %struct.anon1], [65 x %struct.anon1]* @arr1, i64 0, i64 %x, i32 2
  %tmp2 = load i32, i32* %c, align 4
  %add = add nsw i32 %sub, %tmp2
  switch i32 %add, label %sw.epilog [
    i32 1, label %sw.bb.1
    i32 2, label %sw.bb.2
  ]

sw.bb.1:                                          ; preds = %entry
  store i32 111, i32* %b, align 4
  store i32 222, i32* %c, align 4
  br label %sw.epilog

sw.bb.2:                                          ; preds = %entry
  store i32 333, i32* %b, align 4
  store i32 444, i32* %c, align 4
  br label %sw.epilog

sw.epilog:                                        ; preds = %sw.bb.2, %sw.bb.1, %entry
  ret void
; CHECK-LABEL: test2:
; CHECK: leaq (%rdi,%rdi,2), [[REG1:%[a-z]+]]
; CHECK: leaq arr1+4(,[[REG1]],4), [[REG2:%[a-z]+]]
; CHECK: movl -4([[REG2]]), {{.*}}
; CHECK: subl ([[REG2]]), {{.*}}
; CHECK: leaq arr1+8(,[[REG1]],4), [[REG3:%[a-z]+]]
; CHECK: addl ([[REG3]]), {{.*}}
; CHECK: movl ${{[1-4]+}}, ([[REG2]])
; CHECK: movl ${{[1-4]+}}, ([[REG3]])
; CHECK: movl ${{[1-4]+}}, ([[REG2]])
; CHECK: movl ${{[1-4]+}}, ([[REG3]])
}

; Check that the LEA optimization pass takes the resulting address
; displacement into account when choosing an LEA instruction for replacing a
; redundant address recalculation.

define void @test3(i64 %x) nounwind optsize {
entry:
  %a = getelementptr inbounds [65 x %struct.anon2], [65 x %struct.anon2]* @arr2, i64 0, i64 %x, i32 2
  %tmp = load i32, i32* %a, align 4
  %b = getelementptr inbounds [65 x %struct.anon2], [65 x %struct.anon2]* @arr2, i64 0, i64 %x, i32 0
  %tmp1 = load i32, i32* %b, align 4
  %add = add nsw i32 %tmp, %tmp1
  switch i32 %add, label %sw.epilog [
    i32 1, label %sw.bb.1
    i32 2, label %sw.bb.2
  ]

sw.bb.1:                                          ; preds = %entry
  store i32 111, i32* %a, align 4
  store i32 222, i32* %b, align 4
  br label %sw.epilog

sw.bb.2:                                          ; preds = %entry
  store i32 333, i32* %a, align 4
  store i32 444, i32* %b, align 4
  br label %sw.epilog

sw.epilog:                                        ; preds = %sw.bb.2, %sw.bb.1, %entry
  ret void
; CHECK-LABEL: test3:
; CHECK: imulq {{.*}}, [[REG1:%[a-z]+]]
; CHECK: leaq arr2+132([[REG1]]), [[REG2:%[a-z]+]]
; CHECK: leaq arr2([[REG1]]), [[REG3:%[a-z]+]]

; REG3's definition is closer to movl than REG2's, but the pass still chooses
; REG2 because it provides a resulting address displacement that fits in 1 byte.

; CHECK: movl ([[REG2]]), {{.*}}
; CHECK: addl ([[REG3]]), {{.*}}
; CHECK: movl ${{[1-4]+}}, ([[REG2]])
; CHECK: movl ${{[1-4]+}}, ([[REG3]])
; CHECK: movl ${{[1-4]+}}, ([[REG2]])
; CHECK: movl ${{[1-4]+}}, ([[REG3]])
}