llvm.org GIT mirror llvm / d6ac8e9
This patch adds the X86FixupLEAs pass, which will reduce instruction latency for certain models of the Intel Atom family, by converting instructions into their equivalent LEA instructions, when it is both useful and possible to do so. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@180573 91177308-0d34-0410-b5e6-96231b3b80d8 Preston Gurd 7 years ago
11 changed file(s) with 444 addition(s) and 1 deletion(s). Raw diff Collapse all Expand all
3232 X86TargetObjectFile.cpp
3333 X86TargetTransformInfo.cpp
3434 X86VZeroUpper.cpp
35 X86FixupLEAs.cpp
3536 )
3637
3738 if( CMAKE_CL_64 )
6868 /// createX86PadShortFunctions - Return a pass that pads short functions
6969 /// with NOOPs. This will prevent a stall when returning on the Atom.
7070 FunctionPass *createX86PadShortFunctions();
71 /// createX86FixupLEAs - Return a pass that selectively replaces
72 /// certain instructions (like add, sub, inc, dec, some shifts,
73 /// and some multiplies) by equivalent LEA instructions, in order
74 /// to eliminate execution delays in some Atom processors.
75 FunctionPass *createX86FixupLEAs();
7176
7277 } // End llvm namespace
7378
138138 def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect",
139139 "CallRegIndirect", "true",
140140 "Call register indirect">;
141 def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
142 "LEA instruction needs inputs at AG stage">;
141143
142144 //===----------------------------------------------------------------------===//
143145 // X86 processors supported.
187189 FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP,
188190 FeatureSlowDivide,
189191 FeatureCallRegIndirect,
192 FeatureLEAUsesAG,
190193 FeaturePadShortFunctions]>;
191194
192195 // "Arrandale" along with corei3 and corei5
0 //===-- X86FixupLEAs.cpp - use or replace LEA instructions -----------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the pass which will find instructions which
10 // can be re-written as LEA instructions in order to reduce pipeline
11 // delays for some models of the Intel Atom family.
12 //
13 //===----------------------------------------------------------------------===//
14
15 #define DEBUG_TYPE "x86-fixup-LEAs"
16 #include "X86.h"
17 #include "X86InstrInfo.h"
18 #include "X86Subtarget.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/CodeGen/LiveVariables.h"
21 #include "llvm/CodeGen/MachineFunctionPass.h"
22 #include "llvm/CodeGen/MachineInstrBuilder.h"
23 #include "llvm/CodeGen/MachineRegisterInfo.h"
24 #include "llvm/CodeGen/Passes.h"
25 #include "llvm/Support/Debug.h"
26 #include "llvm/Support/raw_ostream.h"
27 #include "llvm/Target/TargetInstrInfo.h"
28 using namespace llvm;
29
30 STATISTIC(NumLEAs, "Number of LEA instructions created");
31
namespace {
  /// FixupLEAPass - Post-register-allocation pass that rewrites certain
  /// instructions (register moves, adds, etc.) as equivalent LEA
  /// instructions to reduce pipeline delays on some Intel Atom models.
  class FixupLEAPass : public MachineFunctionPass {
    /// RegUsageState - How an instruction touches a given register:
    /// not at all, writes (defines) it, or only reads it.
    enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
    static char ID;
    /// processBasicBlock - Run the fixup over every instruction of one
    /// basic block; returns false (reporting is done by runOnMachineFunction).
    bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);

    virtual const char *getPassName() const { return "X86 Atom LEA Fixup"; }
    /// seekLEAFixup - Find the in-block producer of register operand p and
    /// try to replace it with an LEA.
    void seekLEAFixup(MachineOperand &p, MachineBasicBlock::iterator &I,
                      MachineFunction::iterator MFI);
    /// processInstruction - Inspect one memory-form instruction and attempt
    /// fixups for the producers of its base and index registers.
    void processInstruction(MachineBasicBlock::iterator &I,
                            MachineFunction::iterator MFI);
    /// usesRegister - Classify how the instruction at I uses p's register.
    RegUsageState usesRegister(MachineOperand &p,
                               MachineBasicBlock::iterator I);
    /// searchBackwards - Walk backwards from I looking for a nearby write
    /// of p's register; returns the writer or a null iterator.
    MachineBasicBlock::iterator searchBackwards(MachineOperand &p,
                                                MachineBasicBlock::iterator &I,
                                                MachineFunction::iterator MFI);
    /// postRAConvertToLEA - Build an LEA equivalent of the instruction at
    /// MBBI and insert it; returns the new instruction or zero.
    MachineInstr *postRAConvertToLEA(MachineFunction::iterator &MFI,
                                     MachineBasicBlock::iterator &MBBI,
                                     LiveVariables *LV) const;

  public:
    FixupLEAPass() : MachineFunctionPass(ID) {}

    virtual bool runOnMachineFunction(MachineFunction &MF);

  private:
    MachineFunction *MF;        // Current function being processed.
    const TargetMachine *TM;    // Target machine (for itinerary data).
    const TargetInstrInfo *TII; // Machine instruction info.
    LiveVariables *LV;          // May be null; see runOnMachineFunction.
  };
  char FixupLEAPass::ID = 0;
}
66
67 /// postRAConvertToLEA - if an instruction can be converted to an
68 /// equivalent LEA, insert the new instruction into the basic block
69 /// and return a pointer to it. Otherwise, return zero.
70 MachineInstr *
71 FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
72 MachineBasicBlock::iterator &MBBI,
73 LiveVariables *LV) const {
74 MachineInstr* MI = MBBI;
75 MachineInstr* NewMI;
76 switch (MI->getOpcode()) {
77 case X86::MOV32rr:
78 case X86::MOV64rr: {
79 const MachineOperand& Src = MI->getOperand(1);
80 const MachineOperand& Dest = MI->getOperand(0);
81 NewMI = BuildMI(*MF, MI->getDebugLoc(),
82 TII->get( MI->getOpcode() == X86::MOV32rr ? X86::LEA32r : X86::LEA64r))
83 .addOperand(Dest)
84 .addOperand(Src).addImm(1).addReg(0).addImm(0).addReg(0);
85 MFI->insert(MBBI, NewMI); // Insert the new inst
86 return NewMI;
87 }
88 case X86::ADD64ri32:
89 case X86::ADD64ri8:
90 case X86::ADD64ri32_DB:
91 case X86::ADD64ri8_DB:
92 case X86::ADD32ri:
93 case X86::ADD32ri8:
94 case X86::ADD32ri_DB:
95 case X86::ADD32ri8_DB:
96 case X86::ADD16ri:
97 case X86::ADD16ri8:
98 case X86::ADD16ri_DB:
99 case X86::ADD16ri8_DB:
100 if (!MI->getOperand(2).isImm()) {
101 // convertToThreeAddress will call getImm()
102 // which requires isImm() to be true
103 return 0;
104 }
105 }
106 return TII->convertToThreeAddress(MFI, MBBI, LV);
107 }
108
/// createX86FixupLEAs - Factory function for the pass; declared in X86.h
/// and called from the X86 target's pass pipeline setup.
FunctionPass *llvm::createX86FixupLEAs() {
  return new FixupLEAPass();
}
112
113 /// runOnMachineFunction - Loop over all of the basic blocks,
114 /// replacing instructions by equivalent LEA instructions
115 /// if needed and when possible.
116 bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
117 MF = &Func;
118 TII = Func.getTarget().getInstrInfo();
119 TM = &MF->getTarget();
120 LV = getAnalysisIfAvailable();
121
122 DEBUG(dbgs() << "Start X86FixupLEAs\n";);
123 // Process all basic blocks.
124 for (MachineFunction::iterator I = Func.begin(), E = Func.end(); I != E; ++I)
125 processBasicBlock(Func, I);
126 DEBUG(dbgs() << "End X86FixupLEAs\n";);
127
128 return true;
129 }
130
131 /// usesRegister - Determine if an instruction references a machine register
132 /// and, if so, whether it reads or writes the register.
133 FixupLEAPass::RegUsageState FixupLEAPass::usesRegister(MachineOperand& p,
134 MachineBasicBlock::iterator I) {
135 RegUsageState RegUsage = RU_NotUsed;
136 MachineInstr* MI = I;
137
138 for (unsigned int i = 0; i < MI->getNumOperands(); ++i) {
139 MachineOperand& opnd = MI->getOperand(i);
140 if (opnd.isReg() && opnd.getReg() == p.getReg()){
141 if (opnd.isDef())
142 return RU_Write;
143 RegUsage = RU_Read;
144 }
145 }
146 return RegUsage;
147 }
148
149 /// getPreviousInstr - Given a reference to an instruction in a basic
150 /// block, return a reference to the previous instruction in the block,
151 /// wrapping around to the last instruction of the block if the block
152 /// branches to itself.
153 static inline bool getPreviousInstr(MachineBasicBlock::iterator& I,
154 MachineFunction::iterator MFI) {
155 if (I == MFI->begin()) {
156 if (MFI->isPredecessor(MFI)) {
157 I = --MFI->end();
158 return true;
159 }
160 else
161 return false;
162 }
163 --I;
164 return true;
165 }
166
167 /// searchBackwards - Step backwards through a basic block, looking
168 /// for an instruction which writes a register within
169 /// a maximum of INSTR_DISTANCE_THRESHOLD instruction latency cycles.
170 MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p,
171 MachineBasicBlock::iterator& I,
172 MachineFunction::iterator MFI) {
173 int InstrDistance = 1;
174 MachineBasicBlock::iterator CurInst;
175 static const int INSTR_DISTANCE_THRESHOLD = 5;
176
177 CurInst = I;
178 bool Found;
179 Found = getPreviousInstr(CurInst, MFI);
180 while( Found && I != CurInst) {
181 if (CurInst->isCall() || CurInst->isInlineAsm())
182 break;
183 if (InstrDistance > INSTR_DISTANCE_THRESHOLD)
184 break; // too far back to make a difference
185 if (usesRegister(p, CurInst) == RU_Write){
186 return CurInst;
187 }
188 InstrDistance += TII->getInstrLatency(TM->getInstrItineraryData(), CurInst);
189 Found = getPreviousInstr(CurInst, MFI);
190 }
191 return 0;
192 }
193
194 /// processInstruction - Given a memory access or LEA instruction
195 /// whose address mode uses a base and/or index register, look for
196 /// an opportunity to replace the instruction which sets the base or index
197 /// register with an equivalent LEA instruction.
198 void FixupLEAPass::processInstruction(MachineBasicBlock::iterator& I,
199 MachineFunction::iterator MFI) {
200 // Process a load, store, or LEA instruction.
201 MachineInstr *MI = I;
202 int opcode = MI->getOpcode();
203 const MCInstrDesc& Desc = MI->getDesc();
204 int AddrOffset = X86II::getMemoryOperandNo(Desc.TSFlags, opcode);
205 if (AddrOffset >= 0) {
206 AddrOffset += X86II::getOperandBias(Desc);
207 MachineOperand& p = MI->getOperand(AddrOffset + X86::AddrBaseReg);
208 if (p.isReg() && p.getReg() != X86::ESP) {
209 seekLEAFixup(p, I, MFI);
210 }
211 MachineOperand& q = MI->getOperand(AddrOffset + X86::AddrIndexReg);
212 if (q.isReg() && q.getReg() != X86::ESP) {
213 seekLEAFixup(q, I, MFI);
214 }
215 }
216 }
217
218 /// seekLEAFixup - Given a machine register, look for the instruction
219 /// which writes it in the current basic block. If found,
220 /// try to replace it with an equivalent LEA instruction.
221 /// If replacement succeeds, then also process the the newly created
222 /// instruction.
223 void FixupLEAPass::seekLEAFixup(MachineOperand& p,
224 MachineBasicBlock::iterator& I,
225 MachineFunction::iterator MFI) {
226 MachineBasicBlock::iterator MBI = searchBackwards(p, I, MFI);
227 if (MBI) {
228 MachineInstr* NewMI = postRAConvertToLEA(MFI, MBI, LV);
229 if (NewMI) {
230 ++NumLEAs;
231 DEBUG(dbgs() << "Candidate to replace:"; MBI->dump(););
232 // now to replace with an equivalent LEA...
233 DEBUG(dbgs() << "Replaced by: "; NewMI->dump(););
234 MFI->erase(MBI);
235 MachineBasicBlock::iterator J =
236 static_cast (NewMI);
237 processInstruction(J, MFI);
238 }
239 }
240 }
241
242 /// processBasicBlock - Loop over all of the instructions in the basic block,
243 /// replacing adds and shifts with LEA instructions, where appropriate.
244 bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
245 MachineFunction::iterator MFI) {
246
247 for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I)
248 processInstruction(I, MFI);
249 return false;
250 }
466466 PostRAScheduler = false;
467467 PadShortFunctions = false;
468468 CallRegIndirect = false;
469 LEAUsesAG = false;
469470 stackAlignment = 4;
470471 // FIXME: this is a known good value for Yonah. How about others?
471472 MaxInlineSizeThreshold = 128;
164164 /// CallRegIndirect - True if the Calls with memory reference should be converted
165165 /// to a register-based indirect call.
166166 bool CallRegIndirect;
167 /// LEAUsesAG - True if the LEA instruction inputs have to be ready at
168 /// address generation (AG) time.
169 bool LEAUsesAG;
167170
168171 /// stackAlignment - The minimum alignment known to hold of the stack frame on
169172 /// entry to the function and which must be maintained by every function.
277280 bool hasSlowDivide() const { return HasSlowDivide; }
278281 bool padShortFunctions() const { return PadShortFunctions; }
279282 bool callRegIndirect() const { return CallRegIndirect; }
283 bool LEAusesAG() const { return LEAUsesAG; }
280284
281285 bool isAtom() const { return X86ProcFamily == IntelAtom; }
282286
214214 addPass(createX86PadShortFunctions());
215215 ShouldPrint = true;
216216 }
217 if (getOptLevel() != CodeGenOpt::None &&
218 getX86Subtarget().LEAusesAG()){
219 addPass(createX86FixupLEAs());
220 ShouldPrint = true;
221 }
217222
218223 return ShouldPrint;
219224 }
0 ; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
1 ; CHECK: addl
2 ; CHECK-NEXT:leal
3 ; CHECK-NEXT:decl
4 ; CHECK-NEXT:jne
5
6 ; Test for the FixupLEAs pre-emit pass. An LEA should be substituted for the ADD
7 ; that increments the array pointer because it is within 5 instructions of the
8 ; corresponding load. The ADD precedes the load by following the loop back edge.
9
10 ; Original C code
11 ;int test(int n, int * array)
12 ;{
13 ; int sum = 0;
14 ; for(int i = 0; i < n; i++)
15 ; sum += array[i];
16 ; return sum;
17 ;}
18
; Sums array[0..n-1]; the induction-variable increment that feeds the load
; address is the instruction the FixupLEAs pass should rewrite as an LEA.
define i32 @test(i32 %n, i32* nocapture %array) {
entry:
  ; Skip the loop entirely when n <= 0.
  %cmp4 = icmp sgt i32 %n, 0
  br i1 %cmp4, label %for.body, label %for.end

for.body:
  ; %i.06 is the induction variable, %sum.05 the running total.
  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
  %sum.05 = phi i32 [ %add, %for.body ], [ 0, %entry ]
  %arrayidx = getelementptr inbounds i32* %array, i32 %i.06
  %0 = load i32* %arrayidx, align 4
  %add = add nsw i32 %0, %sum.05
  %inc = add nsw i32 %i.06, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ; Merge 0 (loop never entered) with the final sum.
  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.body ]
  ret i32 %sum.0.lcssa
}
0 ; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
1 ; CHECK:BB#5
2 ; CHECK-NEXT:leal
3 ; CHECK-NEXT:leal
4 ; CHECK-NEXT:leal
5 ; CHECK-NEXT:movl
6
7
8 ; Test for fixup lea pre-emit pass. LEA instructions should be substituted for
9 ; ADD instructions which compute the address and index of the load because they
10 ; precede the load within 5 instructions. An LEA should also be substituted for
11 ; an ADD which computes part of the index because it precedes the index LEA
12 ; within 5 instructions, this substitution is referred to as backwards chaining.
13
14 ; Original C Code
15 ;struct node_t
16 ;{
17 ; int k, m, n, p;
18 ; int * array;
19 ;};
20
21 ;extern struct node_t getnode();
22
23 ;int test()
24 ;{
25 ; int sum = 0;
26 ; struct node_t n = getnode();
27 ; if(n.array != 0 && n.p > 0 && n.k > 0 && n.n > 0 && n.m > 0) {
28 ; sum = ((int*)((int)n.array + n.p) )[ n.k + n.m + n.n ];
29 ; }
30 ; return sum;
31 ;}
32
33 %struct.node_t = type { i32, i32, i32, i32, i32* }
34
; Loads sum = ((int*)((int)n.array + n.p))[n.k + n.m + n.n] after a chain of
; null/positivity guards; the ADD chain computing the address and index is
; what FixupLEAs should rewrite as chained LEAs.
define i32 @test() {
entry:
  %n = alloca %struct.node_t, align 4
  call void bitcast (void (%struct.node_t*, ...)* @getnode to void (%struct.node_t*)*)(%struct.node_t* sret %n)
  ; Guard 1: n.array != 0.
  %array = getelementptr inbounds %struct.node_t* %n, i32 0, i32 4
  %0 = load i32** %array, align 4
  %cmp = icmp eq i32* %0, null
  br i1 %cmp, label %if.end, label %land.lhs.true

land.lhs.true:
  ; Guard 2: n.p > 0.
  %p = getelementptr inbounds %struct.node_t* %n, i32 0, i32 3
  %1 = load i32* %p, align 4
  %cmp1 = icmp sgt i32 %1, 0
  br i1 %cmp1, label %land.lhs.true2, label %if.end

land.lhs.true2:
  ; Guard 3: n.k > 0.
  %k = getelementptr inbounds %struct.node_t* %n, i32 0, i32 0
  %2 = load i32* %k, align 4
  %cmp3 = icmp sgt i32 %2, 0
  br i1 %cmp3, label %land.lhs.true4, label %if.end

land.lhs.true4:
  ; Guard 4: n.n > 0.
  %n5 = getelementptr inbounds %struct.node_t* %n, i32 0, i32 2
  %3 = load i32* %n5, align 4
  %cmp6 = icmp sgt i32 %3, 0
  br i1 %cmp6, label %land.lhs.true7, label %if.end

land.lhs.true7:
  ; Guard 5: n.m > 0.
  %m = getelementptr inbounds %struct.node_t* %n, i32 0, i32 1
  %4 = load i32* %m, align 4
  %cmp8 = icmp sgt i32 %4, 0
  br i1 %cmp8, label %if.then, label %if.end

if.then:
  ; Index = n.k + n.n + n.m; pointer = (int*)((int)n.array + n.p).
  %add = add i32 %3, %2
  %add12 = add i32 %add, %4
  %5 = ptrtoint i32* %0 to i32
  %add15 = add nsw i32 %1, %5
  %6 = inttoptr i32 %add15 to i32*
  %arrayidx = getelementptr inbounds i32* %6, i32 %add12
  %7 = load i32* %arrayidx, align 4
  br label %if.end

if.end:
  ; sum is 0 on any failed guard, otherwise the loaded element.
  %sum.0 = phi i32 [ %7, %if.then ], [ 0, %land.lhs.true7 ], [ 0, %land.lhs.true4 ], [ 0, %land.lhs.true2 ], [ 0, %land.lhs.true ], [ 0, %entry ]
  ret i32 %sum.0
}
82
83 declare void @getnode(%struct.node_t* sret, ...)
0 ; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
1 ; CHECK: addl ([[reg:%[a-z]+]])
2 ; CHECK-NEXT: addl $4, [[reg]]
3
4 ; Test for the FixupLEAs pre-emit pass.
5 ; An LEA should NOT be substituted for the ADD instruction
6 ; that increments the array pointer if it is greater than 5 instructions
7 ; away from the memory reference that uses it.
8
9 ; Original C code: clang -m32 -S -O2
10 ;int test(int n, int * array, int * m, int * array2)
11 ;{
12 ; int i, j = 0;
13 ; int sum = 0;
14 ; for (i = 0, j = 0; i < n;) {
15 ; ++i;
16 ; *m += array2[j++];
17 ; sum += array[i];
18 ; }
19 ; return sum;
20 ;}
21
22 define i32 @test(i32 %n, i32* nocapture %array, i32* nocapture %m, i32* nocapture %array2) #0 {
23 entry:
24 %cmp7 = icmp sgt i32 %n, 0
25 br i1 %cmp7, label %for.body.lr.ph, label %for.end
26
27 for.body.lr.ph: ; preds = %entry
28 %.pre = load i32* %m, align 4
29 br label %for.body
30
31 for.body: ; preds = %for.body, %for.body.lr.ph
32 %0 = phi i32 [ %.pre, %for.body.lr.ph ], [ %add, %for.body ]
33 %sum.010 = phi i32 [ 0, %for.body.lr.ph ], [ %add3, %for.body ]
34 %j.09 = phi i32 [ 0, %for.body.lr.ph ], [ %inc1, %for.body ]
35 %inc1 = add nsw i32 %j.09, 1
36 %arrayidx = getelementptr inbounds i32* %array2, i32 %j.09
37 %1 = load i32* %arrayidx, align 4
38 %add = add nsw i32 %0, %1
39 store i32 %add, i32* %m, align 4
40 %arrayidx2 = getelementptr inbounds i32* %array, i32 %inc1
41 %2 = load i32* %arrayidx2, align 4
42 %add3 = add nsw i32 %2, %sum.010
43 %exitcond = icmp eq i32 %inc1, %n
44 br i1 %exitcond, label %for.end, label %for.body
45
46 for.end: ; preds = %for.body, %entry
47 %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add3, %for.body ]
48 ret i32 %sum.0.lcssa
49 }
50
1616 ; ATOM-NEXT: movsd A(,%rax,8)
1717 ; ATOM-NEXT: mulsd
1818 ; ATOM-NEXT: movsd
19 ; ATOM-NEXT: incq %rax
19 ; ATOM-NEXT: leaq 1(%rax), %rax
2020
2121 @A = external global [0 x double]
2222