llvm.org GIT mirror llvm / fa502aa
[X86] New pass to change byte and word instructions to zero-extending versions. Differential Revision: http://reviews.llvm.org/D17032 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@260572 91177308-0d34-0410-b5e6-96231b3b80d8 Kevin B. Smith 4 years ago
5 changed file(s) with 412 addition(s) and 0 deletion(s). Raw diff Collapse all Expand all
3535 X86FixupLEAs.cpp
3636 X86WinEHState.cpp
3737 X86OptimizeLEAs.cpp
38 X86FixupBWInsts.cpp
3839 )
3940
4041 add_llvm_target(X86CodeGen ${sources})
7171 /// must run after prologue/epilogue insertion and before lowering
7272 /// the MachineInstr to MC.
7373 FunctionPass *createX86ExpandPseudoPass();
74
75 /// Return a Machine IR pass that selectively replaces
76 /// certain byte and word instructions by equivalent 32 bit instructions,
77 /// in order to eliminate partial register usage, false dependences on
78 /// the upper portions of registers, and to save code size.
79 FunctionPass *createX86FixupBWInsts();
7480 } // End llvm namespace
7581
7682 #endif
0 //===-- X86FixupBWInsts.cpp - Fixup Byte or Word instructions -----------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file defines the pass that looks through the machine instructions
10 /// late in the compilation, and finds byte or word instructions that
11 /// can be profitably replaced with 32 bit instructions that give equivalent
12 /// results for the bits of the results that are used. There are two possible
13 /// reasons to do this.
14 ///
15 /// One reason is to avoid false-dependences on the upper portions
16 /// of the registers. Only instructions that have a destination register
17 /// which is not in any of the source registers can be affected by this.
18 /// Any instruction where one of the source registers is also the destination
19 /// register is unaffected, because it has a true dependence on the source
20 /// register already. So, this consideration primarily affects load
21 /// instructions and register-to-register moves. It would
22 /// seem like cmov(s) would also be affected, but because of the way cmov is
23 /// really implemented by most machines as reading both the destination and
/// source registers, and then "merging" the two based on a condition,
25 /// it really already should be considered as having a true dependence on the
26 /// destination register as well.
27 ///
28 /// The other reason to do this is for potential code size savings. Word
29 /// operations need an extra override byte compared to their 32 bit
30 /// versions. So this can convert many word operations to their larger
31 /// size, saving a byte in encoding. This could introduce partial register
32 /// dependences where none existed however. As an example take:
33 /// orw ax, $0x1000
34 /// addw ax, $3
35 /// now if this were to get transformed into
///   orw  ax, $0x1000
37 /// addl eax, $3
38 /// because the addl encodes shorter than the addw, this would introduce
39 /// a use of a register that was only partially written earlier. On older
40 /// Intel processors this can be quite a performance penalty, so this should
41 /// probably only be done when it can be proven that a new partial dependence
/// wouldn't be created, or when you know a newer processor is being
43 /// targeted, or when optimizing for minimum code size.
44 ///
45 //===----------------------------------------------------------------------===//
46
47 #include "X86.h"
48 #include "X86InstrInfo.h"
49 #include "X86Subtarget.h"
50 #include "llvm/ADT/Statistic.h"
51 #include "llvm/CodeGen/LiveVariables.h"
52 #include "llvm/CodeGen/MachineFunctionPass.h"
53 #include "llvm/CodeGen/MachineInstrBuilder.h"
54 #include "llvm/CodeGen/MachineLoopInfo.h"
55 #include "llvm/CodeGen/MachineRegisterInfo.h"
56 #include "llvm/CodeGen/Passes.h"
57 #include "llvm/Support/Debug.h"
58 #include "llvm/Support/raw_ostream.h"
59 #include "llvm/Target/TargetInstrInfo.h"
60 using namespace llvm;
61
62 #define DEBUG_TYPE "x86-fixup-bw-insts"
63
64 // Option to allow this optimization pass to have fine-grained control.
65 // This is turned off by default so as not to affect a large number of
66 // existing lit tests.
67 static cl::opt
68 FixupBWInsts("fixup-byte-word-insts",
69 cl::desc("Change byte and word instructions to larger sizes"),
70 cl::init(false), cl::Hidden);
71
72 namespace {
73 class FixupBWInstPass : public MachineFunctionPass {
74 static char ID;
75
76 const char *getPassName() const override {
77 return "X86 Byte/Word Instruction Fixup";
78 }
79
80 /// \brief Loop over all of the instructions in the basic block
81 /// replacing applicable byte or word instructions with better
82 /// alternatives.
83 void processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB) const;
84
85 /// \brief This sets the \p SuperDestReg to the 32 bit super reg
86 /// of the original destination register of the MachineInstr
87 /// passed in. It returns true if that super register is dead
88 /// just prior to \p OrigMI, and false if not.
89 /// \pre OrigDestSize must be 8 or 16.
90 bool getSuperRegDestIfDead(MachineInstr *OrigMI, unsigned OrigDestSize,
91 unsigned &SuperDestReg) const;
92
93 /// \brief Change the MachineInstr \p MI into the equivalent extending load
94 /// to 32 bit register if it is safe to do so. Return the replacement
95 /// instruction if OK, otherwise return nullptr.
96 /// \pre OrigDestSize must be 8 or 16.
97 MachineInstr *tryReplaceLoad(unsigned New32BitOpcode, unsigned OrigDestSize,
98 MachineInstr *MI) const;
99
100 public:
101 FixupBWInstPass() : MachineFunctionPass(ID) {}
102
103 void getAnalysisUsage(AnalysisUsage &AU) const override {
104 AU.addRequired(); // Machine loop info is used to
105 // guide some heuristics.
106 MachineFunctionPass::getAnalysisUsage(AU);
107 }
108
109 /// \brief Loop over all of the basic blocks,
110 /// replacing byte and word instructions by equivalent 32 bit instructions
111 /// where performance or code size can be improved.
112 bool runOnMachineFunction(MachineFunction &MF) override;
113
114 private:
115 MachineFunction *MF;
116
117 /// Machine instruction info used throughout the class.
118 const X86InstrInfo *TII;
119
120 /// Local member for function's OptForSize attribute.
121 bool OptForSize;
122
123 /// Machine loop info used for guiding some heruistics.
124 MachineLoopInfo *MLI;
125 };
126 char FixupBWInstPass::ID = 0;
127 }
128
// Factory hook the X86 target uses to add this pass to its codegen pipeline.
FunctionPass *llvm::createX86FixupBWInsts() { return new FixupBWInstPass(); }
130
131 bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) {
132 if (!FixupBWInsts)
133 return false;
134
135 this->MF = &MF;
136 TII = MF.getSubtarget().getInstrInfo();
137 OptForSize = MF.getFunction()->optForSize();
138 MLI = &getAnalysis();
139
140 DEBUG(dbgs() << "Start X86FixupBWInsts\n";);
141
142 // Process all basic blocks.
143 for (auto &MBB : MF)
144 processBasicBlock(MF, MBB);
145
146 DEBUG(dbgs() << "End X86FixupBWInsts\n";);
147
148 return true;
149 }
150
// TODO: This method of analysis can miss some legal cases, because the
// super-register could be live into the address expression for a memory
// reference for the instruction, and still be killed/last used by the
// instruction. However, the existing query interfaces don't seem to
// easily allow that to be checked.
//
// What we'd really like to know is whether after OrigMI, the
// only portion of SuperDestReg that is alive is the portion that
// was the destination register of OrigMI.
bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI,
                                            unsigned OrigDestSize,
                                            unsigned &SuperDestReg) const {

  // Operand 0 is the destination of the load being widened.
  unsigned OrigDestReg = OrigMI->getOperand(0).getReg();
  SuperDestReg = getX86SubSuperRegister(OrigDestReg, 32);

  // Make sure that the sub-register that this instruction has as its
  // destination is the lowest order sub-register of the super-register.
  // If it isn't, then the register isn't really dead even if the
  // super-register is considered dead.
  // This test works because getX86SubSuperRegister returns the low portion
  // register by default when getting a sub-register, so if that doesn't
  // match the original destination register, then the original destination
  // register must not have been the low register portion of that size.
  // (e.g. this rejects AH/BH/... destinations, whose low-8 sibling is AL.)
  if (getX86SubSuperRegister(SuperDestReg, OrigDestSize) != OrigDestReg)
    return false;

  // Query liveness of the 32 bit super-register immediately before OrigMI.
  MachineBasicBlock::LivenessQueryResult LQR =
      OrigMI->getParent()->computeRegisterLiveness(&TII->getRegisterInfo(),
                                                   SuperDestReg, OrigMI);

  // Anything other than provably-dead (live, or unknown) is unsafe.
  if (LQR != MachineBasicBlock::LQR_Dead)
    return false;

  if (OrigDestSize == 8) {
    // In the case of byte registers, we also have to check that the upper
    // byte register is also dead. That is considered to be independent of
    // whether the super-register is dead.
    unsigned UpperByteReg = getX86SubSuperRegister(SuperDestReg, 8, true);

    LQR = OrigMI->getParent()->computeRegisterLiveness(&TII->getRegisterInfo(),
                                                       UpperByteReg, OrigMI);
    if (LQR != MachineBasicBlock::LQR_Dead)
      return false;
  }

  return true;
}
199
200 MachineInstr *FixupBWInstPass::tryReplaceLoad(unsigned New32BitOpcode,
201 unsigned OrigDestSize,
202 MachineInstr *MI) const {
203 unsigned NewDestReg;
204
205 // We are going to try to rewrite this load to a larger zero-extending
206 // load. This is safe if all portions of the 32 bit super-register
207 // of the original destination register, except for the original destination
208 // register are dead. getSuperRegDestIfDead checks that.
209 if (!getSuperRegDestIfDead(MI, OrigDestSize, NewDestReg))
210 return nullptr;
211
212 // Safe to change the instruction.
213 MachineInstrBuilder MIB =
214 BuildMI(*MF, MI->getDebugLoc(), TII->get(New32BitOpcode), NewDestReg);
215
216 unsigned NumArgs = MI->getNumOperands();
217 for (unsigned i = 1; i < NumArgs; ++i)
218 MIB.addOperand(MI->getOperand(i));
219
220 MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
221
222 return MIB;
223 }
224
225 void FixupBWInstPass::processBasicBlock(MachineFunction &MF,
226 MachineBasicBlock &MBB) const {
227
228 // This algorithm doesn't delete the instructions it is replacing
229 // right away. By leaving the existing instructions in place, the
230 // register liveness information doesn't change, and this makes the
231 // analysis that goes on be better than if the replaced instructions
232 // were immediately removed.
233 //
234 // This algorithm always creates a replacement instruction
235 // and notes that and the original in a data structure, until the
236 // whole BB has been analyzed. This keeps the replacement instructions
237 // from making it seem as if the larger register might be live.
238 SmallVector, 8> MIReplacements;
239
240 for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
241 MachineInstr *NewMI = nullptr;
242 MachineInstr *MI = I;
243
244 // See if this is an instruction of the type we are currently looking for.
245 switch (MI->getOpcode()) {
246
247 case X86::MOV8rm:
248 // Only replace 8 bit loads with the zero extending versions if
249 // in an inner most loop and not optimizing for size. This takes
250 // an extra byte to encode, and provides limited performance upside.
251 if (MachineLoop *ML = MLI->getLoopFor(&MBB)) {
252 if (ML->begin() == ML->end() && !OptForSize)
253 NewMI = tryReplaceLoad(X86::MOVZX32rm8, 8, MI);
254 }
255 break;
256
257 case X86::MOV16rm:
258 // Always try to replace 16 bit load with 32 bit zero extending.
259 // Code size is the same, and there is sometimes a perf advantage
260 // from eliminating a false dependence on the upper portion of
261 // the register.
262 NewMI = tryReplaceLoad(X86::MOVZX32rm16, 16, MI);
263 break;
264
265 default:
266 // nothing to do here.
267 break;
268 }
269
270 if (NewMI)
271 MIReplacements.push_back(std::make_pair(MI, NewMI));
272 }
273
274 while (!MIReplacements.empty()) {
275 MachineInstr *MI = MIReplacements.back().first;
276 MachineInstr *NewMI = MIReplacements.back().second;
277 MIReplacements.pop_back();
278 MBB.insert(MI, NewMI);
279 MBB.erase(MI);
280 }
281 }
278278 addPass(createX86IssueVZeroUpperPass());
279279
280280 if (getOptLevel() != CodeGenOpt::None) {
281 addPass(createX86FixupBWInsts());
281282 addPass(createX86PadShortFunctions());
282283 addPass(createX86FixupLEAs());
283284 }
0 ; RUN: llc -fixup-byte-word-insts -march=x86-64 < %s | FileCheck %s
1
2 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
3 target triple = "x86_64-apple-macosx10.8.0"
4
5 %struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
6
7 ; This has byte loads interspersed with byte stores, in a single
8 ; basic-block loop. The upper portion should be dead, so the movb loads
9 ; should have been changed into movzbl instead.
10 ; TODO: The second movb load doesn't get fixed due to register liveness
11 ; not being accurate enough.
12 ; CHECK-LABEL: foo1
13 ; load:
14 ; CHECK: movzbl
15 ; store:
16 ; CHECK: movb
17 ; load:
18 ; CHECK: movb
19 ; store:
20 ; CHECK: movb
21 ; CHECK: ret
; Copies %count byte pairs from %q to successive %struct.A slots starting
; at %p, in a single-block loop — the case the pass should widen to movzbl.
define void @foo1(i32 %count,
                  %struct.A* noalias nocapture %q,
                  %struct.A* noalias nocapture %p)
                  nounwind uwtable noinline ssp {
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0
  %2 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 0
  %3 = getelementptr inbounds %struct.A, %struct.A* %q, i64 0, i32 1
  br label %a4

a4:                                               ; preds = %a4, %.lr.ph
  %i.02 = phi i32 [ 0, %.lr.ph ], [ %a9, %a4 ]
  %.01 = phi %struct.A* [ %p, %.lr.ph ], [ %a10, %a4 ]
  %a5 = load i8, i8* %2, align 1
  %a7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
  store i8 %a5, i8* %a7, align 1
  %a8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
  %a6 = load i8, i8* %3, align 1
  store i8 %a6, i8* %a8, align 1
  %a9 = add nsw i32 %i.02, 1
  %a10 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
  %exitcond = icmp eq i32 %a9, %count
  br i1 %exitcond, label %._crit_edge, label %a4

._crit_edge:                                      ; preds = %a4, %0
  ret void
}
51
52 %struct.B = type { i16, i16, i16, i16, i16, i16, i16, i16 }
53
54 ; This has word loads interspersed with word stores.
55 ; The upper portion should be dead, so the movw loads should have
56 ; been changed into movzwl instead.
57 ; TODO: The second movw load doesn't get fixed due to register liveness
58 ; not being accurate enough.
59 ; CHECK-LABEL: foo2
60 ; load:
61 ; CHECK: movzwl
62 ; store:
63 ; CHECK: movw
64 ; load:
65 ; CHECK: movw
66 ; store:
67 ; CHECK: movw
68 ; CHECK: ret
; Same shape as @foo1 but with i16 elements — word loads in a tight loop,
; which the pass should widen to movzwl.
define void @foo2(i32 %count,
                  %struct.B* noalias nocapture %q,
                  %struct.B* noalias nocapture %p)
                  nounwind uwtable noinline ssp {
  %1 = icmp sgt i32 %count, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0
  %2 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 0
  %3 = getelementptr inbounds %struct.B, %struct.B* %q, i64 0, i32 1
  br label %a4

a4:                                               ; preds = %a4, %.lr.ph
  %i.02 = phi i32 [ 0, %.lr.ph ], [ %a9, %a4 ]
  %.01 = phi %struct.B* [ %p, %.lr.ph ], [ %a10, %a4 ]
  %a5 = load i16, i16* %2, align 2
  %a7 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 0
  store i16 %a5, i16* %a7, align 2
  %a8 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 0, i32 1
  %a6 = load i16, i16* %3, align 2
  store i16 %a6, i16* %a8, align 2
  %a9 = add nsw i32 %i.02, 1
  %a10 = getelementptr inbounds %struct.B, %struct.B* %.01, i64 1
  %exitcond = icmp eq i32 %a9, %count
  br i1 %exitcond, label %._crit_edge, label %a4

._crit_edge:                                      ; preds = %a4, %0
  ret void
}
98
99 ; This test contains nothing but a simple byte load and store. Since
100 ; movb encodes smaller, we do not want to use movzbl unless in a tight loop.
101 ; So this test checks that movb is used.
102 ; CHECK-LABEL: foo3:
103 ; CHECK: movb
104 ; CHECK: movb
; Straight-line byte copy, not in a loop — movzbl would cost an extra
; encoding byte for no benefit, so the pass should leave movb alone.
define void @foo3(i8 *%dst, i8 *%src) {
  %t0 = load i8, i8 *%src, align 1
  store i8 %t0, i8 *%dst, align 1
  ret void
}
110
111 ; This test contains nothing but a simple word load and store. Since
112 ; movw and movzwl are the same size, we should always choose to use
113 ; movzwl instead.
114 ; CHECK-LABEL: foo4:
115 ; CHECK: movzwl
116 ; CHECK: movw
; Straight-line word copy — movzwl encodes no larger than movw, so the
; pass should always widen the load here.
define void @foo4(i16 *%dst, i16 *%src) {
  %t0 = load i16, i16 *%src, align 2
  store i16 %t0, i16 *%dst, align 2
  ret void
}