The current Intel Atom microarchitecture has a feature whereby, when a function returns early, it is slightly faster to execute a sequence of NOP instructions while waiting for the return address to become ready than to simply stall on the ret instruction. When compiling for X86 Atom only, this patch runs a pass called "X86PadShortFunction", which adds NOP instructions where fewer than four cycles elapse between function entry and return. Tests are included. Patch by Andy Zhang. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171524 91177308-0d34-0410-b5e6-96231b3b80d8 Preston Gurd 7 years ago
11 changed file(s) with 282 addition(s) and 8 deletion(s).
2424 X86JITInfo.cpp
2525 X86MCInstLower.cpp
2626 X86MachineFunctionInfo.cpp
27 X86PadShortFunction.cpp
2728 X86RegisterInfo.cpp
2829 X86SelectionDAGInfo.cpp
2930 X86Subtarget.cpp
6262 ///
6363 FunctionPass *createEmitX86CodeToMemory();
6464
65 /// createX86PadShortFunctions - Return a pass that pads short functions
66 /// with NOOPs. This will prevent a stall when returning from the function
67 /// on the Atom.
68 FunctionPass *createX86PadShortFunctions();
69
6570 } // End llvm namespace
6671
6772 #endif
122122 def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
123123 "Use LEA for adjusting the stack pointer">;
124124 def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb",
125 "HasSlowDivide", "true",
126 "Use small divide for positive values less than 256">;
125 "HasSlowDivide", "true",
126 "Use small divide for positive values less than 256">;
127 def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
128 "PadShortFunctions", "true",
129 "Pad short functions">;
127130
128131 //===----------------------------------------------------------------------===//
129132 // X86 processors supported.
166169 FeatureSlowBTMem]>;
167170 def : AtomProc<"atom", [ProcIntelAtom, FeatureSSSE3, FeatureCMPXCHG16B,
168171 FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP,
169 FeatureSlowDivide]>;
172 FeatureSlowDivide, FeaturePadShortFunctions]>;
170173 // "Arrandale" along with corei3 and corei5
171174 def : Proc<"corei7", [FeatureSSE42, FeatureCMPXCHG16B,
172175 FeatureSlowBTMem, FeatureFastUAMem,
0 //===-------- X86PadShortFunction.cpp - pad short functions -----------===//
1 //
2 // The LLVM Compiler Infrastructure
3 //
4 // This file is distributed under the University of Illinois Open Source
5 // License. See LICENSE.TXT for details.
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the pass which will pad short functions to prevent
10 // a stall if a function returns before the return address is ready. This
11 // is needed for some Intel Atom processors.
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include <algorithm>
16 #include <map>
17
18 #define DEBUG_TYPE "x86-pad-short-functions"
19 #include "X86.h"
20 #include "X86InstrInfo.h"
21 #include "llvm/ADT/Statistic.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/CodeGen/MachineInstrBuilder.h"
24 #include "llvm/CodeGen/MachineRegisterInfo.h"
25 #include "llvm/CodeGen/Passes.h"
26 #include "llvm/Support/Debug.h"
27 #include "llvm/Support/raw_ostream.h"
28 #include "llvm/Target/TargetInstrInfo.h"
29 using namespace llvm;
30
31 STATISTIC(NumBBsPadded, "Number of basic blocks padded");
32
33 namespace {
34 struct PadShortFunc : public MachineFunctionPass {
35 static char ID;
36 PadShortFunc() : MachineFunctionPass(ID)
37 , Threshold(4)
38 {}
39
40 virtual bool runOnMachineFunction(MachineFunction &MF);
41
42 virtual const char *getPassName() const
43 {
44 return "X86 Atom pad short functions";
45 }
46
47 private:
48 bool addPadding(MachineFunction &MF,
49 MachineBasicBlock &MBB,
50 MachineBasicBlock::iterator &MBBI,
51 unsigned int NOOPsToAdd);
52
53 void findReturn(MachineFunction &MF,
54 MachineBasicBlock &MBB,
55 unsigned int Cycles);
56
57 bool cyclesUntilReturn(MachineFunction &MF,
58 MachineBasicBlock &MBB,
59 unsigned int &Cycles,
60 MachineBasicBlock::iterator *Location = 0);
61
62 const unsigned int Threshold;
63 std::map<int, unsigned int> ReturnBBs;
64 };
65
66 char PadShortFunc::ID = 0;
67 }
68
69 FunctionPass *llvm::createX86PadShortFunctions() {
70 return new PadShortFunc();
71 }
72
73 /// runOnMachineFunction - Loop over all of the basic blocks, inserting
74 /// NOOP instructions before early exits.
75 bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
76 // Process all basic blocks.
77 ReturnBBs.clear();
78
79 // Search through basic blocks and mark the ones that have early returns
80 findReturn(MF, *MF.begin(), 0);
81
82 int BBNum;
83 MachineBasicBlock::iterator ReturnLoc;
84 MachineBasicBlock *MBB;
85
86 unsigned int Cycles = 0;
87 unsigned int BBCycles = 0;
88
89 // Pad the identified basic blocks with NOOPs
90 for (std::map<int, unsigned int>::iterator I = ReturnBBs.begin();
91 I != ReturnBBs.end(); ++I) {
92 BBNum = I->first;
93 Cycles = I->second;
94
95 if (Cycles < Threshold) {
96 MBB = MF.getBlockNumbered(BBNum);
97 if (!cyclesUntilReturn(MF, *MBB, BBCycles, &ReturnLoc))
98 continue;
99
100 addPadding(MF, *MBB, ReturnLoc, Threshold - Cycles);
101 NumBBsPadded++;
102 }
103 }
104
105 return false;
106 }
107
108 /// findReturn - Starting at MBB, follow control flow and add all
109 /// basic blocks that contain a return to ReturnBBs.
110 void PadShortFunc::findReturn(MachineFunction &MF,
111 MachineBasicBlock &MBB,
112 unsigned int Cycles)
113 {
114 // If this BB has a return, note how many cycles it takes to get there.
115 bool hasReturn = cyclesUntilReturn(MF, MBB, Cycles);
116 if (Cycles >= Threshold)
117 return;
118
119 if (hasReturn) {
120 int BBNum = MBB.getNumber();
121 ReturnBBs[BBNum] = std::max(ReturnBBs[BBNum], Cycles);
122
123 return;
124 }
125
126 // Follow branches in BB and look for returns
127 for (MachineBasicBlock::succ_iterator I = MBB.succ_begin();
128 I != MBB.succ_end(); ++I) {
129 findReturn(MF, **I, Cycles);
130 }
131 }
132
133 /// cyclesUntilReturn - if the MBB has a return instruction, set Location
134 /// to the instruction and return true. Return false otherwise.
135 /// Cycles will be incremented by the number of cycles taken to reach the
136 /// return or the end of the BB, whichever occurs first.
137 bool PadShortFunc::cyclesUntilReturn(MachineFunction &MF,
138 MachineBasicBlock &MBB,
139 unsigned int &Cycles,
140 MachineBasicBlock::iterator *Location)
141 {
142 const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
143 const TargetMachine &Target = MF.getTarget();
144
145 for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBB.end();
146 ++MBBI) {
147 MachineInstr *MI = MBBI;
148 // Mark basic blocks with a return instruction. Calls to other functions
149 // do not count because the called function will be padded, if necessary
150 if (MI->isReturn() && !MI->isCall()) {
151 if (Location)
152 *Location = MBBI;
153 return true;
154 }
155
156 Cycles += TII.getInstrLatency(Target.getInstrItineraryData(), MI);
157 }
158
159 return false;
160 }
161
162 /// addPadding - Add the given number of NOOP instructions to the function
163 /// right before the return at MBBI
164 bool PadShortFunc::addPadding(MachineFunction &MF,
165 MachineBasicBlock &MBB,
166 MachineBasicBlock::iterator &MBBI,
167 unsigned int NOOPsToAdd)
168 {
169 const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
170
171 DebugLoc DL = MBBI->getDebugLoc();
172
173 while (NOOPsToAdd-- > 0) {
174 // Since Atom has two instruction execution ports,
175 // the code emits two noops, which will be executed in parallel
176 // during one cycle.
177 BuildMI(MBB, MBBI, DL, TII.get(X86::NOOP));
178 BuildMI(MBB, MBBI, DL, TII.get(X86::NOOP));
179 }
180
181 return true;
182 }
183
349349 , UseLeaForSP(false)
350350 , HasSlowDivide(false)
351351 , PostRAScheduler(false)
352 , PadShortFunctions(false)
352353 , stackAlignment(4)
353354 // FIXME: this is a known good value for Yonah. How about others?
354355 , MaxInlineSizeThreshold(128)
144144
145145 /// PostRAScheduler - True if using post-register-allocation scheduler.
146146 bool PostRAScheduler;
147
148 /// PadShortFunctions - True if short functions should be padded to
149 /// prevent a stall when returning early.
150 bool PadShortFunctions;
147151
148152 /// stackAlignment - The minimum alignment known to hold of the stack frame on
149153 /// entry to the function and which must be maintained by every function.
230234 bool hasCmpxchg16b() const { return HasCmpxchg16b; }
231235 bool useLeaForSP() const { return UseLeaForSP; }
232236 bool hasSlowDivide() const { return HasSlowDivide; }
237 bool padShortFunctions() const { return PadShortFunctions; }
233238
234239 bool isAtom() const { return X86ProcFamily == IntelAtom; }
235240
189189 addPass(createX86IssueVZeroUpperPass());
190190 ShouldPrint = true;
191191 }
192 if (getX86Subtarget().padShortFunctions()) {
193 addPass(createX86PadShortFunctions());
194 ShouldPrint = true;
195 }
192196
193197 return ShouldPrint;
194198 }
0 ; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
1
2 declare void @external_function(...)
3
4 define i32 @test_return_val(i32 %a) nounwind {
5 ; CHECK: test_return_val
6 ; CHECK: movl
7 ; CHECK: nop
8 ; CHECK: nop
9 ; CHECK: nop
10 ; CHECK: nop
11 ; CHECK: nop
12 ; CHECK: nop
13 ; CHECK: ret
14 ret i32 %a
15 }
16
17 define i32 @test_add(i32 %a, i32 %b) nounwind {
18 ; CHECK: test_add
19 ; CHECK: addl
20 ; CHECK: nop
21 ; CHECK: nop
22 ; CHECK: nop
23 ; CHECK: nop
24 ; CHECK: ret
25 %result = add i32 %a, %b
26 ret i32 %result
27 }
28
29 define i32 @test_multiple_ret(i32 %a, i32 %b, i1 %c) nounwind {
30 ; CHECK: @test_multiple_ret
31 ; CHECK: je
32
33 ; CHECK: nop
34 ; CHECK: nop
35 ; CHECK: ret
36
37 ; CHECK: nop
38 ; CHECK: nop
39 ; CHECK: ret
40
41 br i1 %c, label %bb1, label %bb2
42
43 bb1:
44 ret i32 %a
45
46 bb2:
47 ret i32 %b
48 }
49
50 define void @test_call_others(i32 %x) nounwind
51 {
52 ; CHECK: test_call_others
53 ; CHECK: je
54 %tobool = icmp eq i32 %x, 0
55 br i1 %tobool, label %if.end, label %true.case
56
57 ; CHECK: jmp external_function
58 true.case:
59 tail call void bitcast (void (...)* @external_function to void ()*)() nounwind
60 br label %if.end
61
62 ; CHECK: nop
63 ; CHECK: nop
64 ; CHECK: nop
65 ; CHECK: nop
66 ; CHECK: ret
67 if.end:
68 ret void
69
70 }
0 ; RUN: llc < %s -mattr=-avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
1 ; RUN: llc < %s -mattr=+avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX
0 ; RUN: llc < %s -mattr=-avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
1 ; RUN: llc < %s -mattr=+avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX
22
33 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
44 target triple = "x86_64-apple-darwin10.0.0"
0 ; RUN: llc < %s -mtriple=x86_64-apple-darwin11 -mattr=+mmx,+sse2 | FileCheck %s
0 ; RUN: llc < %s -mtriple=x86_64-apple-darwin11 -mcpu=core2 -mattr=+mmx,+sse2 | FileCheck %s
11 ; rdar://6602459
22
33 @g_v1di = external global <1 x i64>
281281 ; ATOM: test13:
282282 ; ATOM: cmpl
283283 ; ATOM-NEXT: sbbl
284 ; ATOM-NEXT: ret
284 ; ATOM: ret
285285 }
286286
287287 define i32 @test14(i32 %a, i32 %b) nounwind {
298298 ; ATOM: cmpl
299299 ; ATOM-NEXT: sbbl
300300 ; ATOM-NEXT: notl
301 ; ATOM-NEXT: ret
301 ; ATOM: ret
302302 }
303303
304304 ; rdar://10961709