llvm.org GIT mirror: llvm / c7b902e
Pad Short Functions for Intel Atom

The current Intel Atom microarchitecture has a feature whereby when a function returns early, it is slightly faster to execute a sequence of NOP instructions while waiting for the return address to become ready than to simply stall on the ret instruction until the return address is ready.

When compiling for X86 Atom only, this patch runs a pass called "X86PadShortFunction", which adds NOP instructions wherever fewer than four cycles elapse between function entry and return. Tests are included.

This patch has been updated to address Nadav's review comments:
- Optimize only at -O1 and above, and do not optimize when -Os is set
- Store MachineBasicBlock* instead of BBNum
- Use DenseMap instead of std::map
- Fix placement of braces

Patch by Andy Zhang.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@171879 91177308-0d34-0410-b5e6-96231b3b80d8

Committed by Preston Gurd 7 years ago.
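To make the rule concrete, here is a minimal standalone C++ sketch of the padding computation (noopsNeeded and the hard-coded threshold of 4 are illustrative stand-ins, not code from the patch):

#include <cstdio>

// Toy model of the rule the pass applies: if fewer than Threshold (4) cycles
// elapse between function entry and a return, pad the difference with NOPs.
static unsigned noopsNeeded(unsigned CyclesUntilReturn, unsigned Threshold = 4) {
  return CyclesUntilReturn < Threshold ? Threshold - CyclesUntilReturn : 0;
}

int main() {
  std::printf("%u\n", noopsNeeded(1)); // prints 3: pad three cycles
  std::printf("%u\n", noopsNeeded(5)); // prints 0: already past the threshold
  return 0;
}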
11 changed file(s) with 277 addition(s) and 8 deletion(s).

lib/Target/X86/CMakeLists.txt:

   X86JITInfo.cpp
   X86MCInstLower.cpp
   X86MachineFunctionInfo.cpp
+  X86PadShortFunction.cpp
   X86RegisterInfo.cpp
   X86SelectionDAGInfo.cpp
   X86Subtarget.cpp

lib/Target/X86/X86.h:

 /// \brief Creates an X86-specific Target Transformation Info pass.
 ImmutablePass *createX86TargetTransformInfoPass(const X86TargetMachine *TM);

+/// createX86PadShortFunctions - Return a pass that pads short functions
+/// with NOOPs. This will prevent a stall when returning on the Atom.
+FunctionPass *createX86PadShortFunctions();
+
 } // End llvm namespace

 #endif

lib/Target/X86/X86.td:

 def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
                                        "Use LEA for adjusting the stack pointer">;
 def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb",
-                     "HasSlowDivide", "true",
-                     "Use small divide for positive values less than 256">;
+                                        "HasSlowDivide", "true",
+                                        "Use small divide for positive values less than 256">;
+def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
+                                        "PadShortFunctions", "true",
+                                        "Pad short functions">;

 //===----------------------------------------------------------------------===//
 // X86 processors supported.
 ...
                         FeatureSlowBTMem]>;
 def : AtomProc<"atom", [ProcIntelAtom, FeatureSSSE3, FeatureCMPXCHG16B,
                         FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP,
-                        FeatureSlowDivide]>;
+                        FeatureSlowDivide, FeaturePadShortFunctions]>;
 // "Arrandale" along with corei3 and corei5
 def : Proc<"corei7", [FeatureSSE42, FeatureCMPXCHG16B,
                       FeatureSlowBTMem, FeatureFastUAMem,
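
For readers unfamiliar with the TableGen plumbing above: a SubtargetFeature ties a feature string to a boolean member of the subtarget, which backend code then queries through an accessor (see the X86Subtarget and X86TargetMachine hunks below). A toy model under those assumptions, with hypothetical names:

#include <cstdio>
#include <cstring>

// Hypothetical stand-in for the generated feature plumbing: parsing the
// "pad-short-functions" string sets the PadShortFunctions flag, mirroring
// the SubtargetFeature definition above.
struct ToySubtarget {
  bool PadShortFunctions;
  ToySubtarget() : PadShortFunctions(false) {}
  void applyFeature(const char *Name) {
    if (std::strcmp(Name, "pad-short-functions") == 0)
      PadShortFunctions = true;
  }
  bool padShortFunctions() const { return PadShortFunctions; }
};

int main() {
  ToySubtarget ST;
  ST.applyFeature("pad-short-functions"); // enabled for the "atom" processor above
  std::printf("%d\n", ST.padShortFunctions()); // prints 1
  return 0;
}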

lib/Target/X86/X86PadShortFunction.cpp (new file):

//===-------- X86PadShortFunction.cpp - pad short functions -----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the pass which will pad short functions to prevent
// a stall if a function returns before the return address is ready. This
// is needed for some Intel Atom processors.
//
//===----------------------------------------------------------------------===//
#include <algorithm>

#define DEBUG_TYPE "x86-pad-short-functions"
#include "X86.h"
#include "X86InstrInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetInstrInfo.h"

using namespace llvm;

STATISTIC(NumBBsPadded, "Number of basic blocks padded");

namespace {
  struct PadShortFunc : public MachineFunctionPass {
    static char ID;
    PadShortFunc() : MachineFunctionPass(ID)
                   , Threshold(4), TM(0), TII(0) {}

    virtual bool runOnMachineFunction(MachineFunction &MF);

    virtual const char *getPassName() const {
      return "X86 Atom pad short functions";
    }

  private:
    void findReturns(MachineBasicBlock *MBB,
                     unsigned int Cycles = 0);

    bool cyclesUntilReturn(MachineBasicBlock *MBB,
                           unsigned int &Cycles,
                           MachineBasicBlock::iterator *Location = 0);

    void addPadding(MachineBasicBlock *MBB,
                    MachineBasicBlock::iterator &MBBI,
                    unsigned int NOOPsToAdd);

    const unsigned int Threshold;
    DenseMap<MachineBasicBlock*, unsigned int> ReturnBBs;

    const TargetMachine *TM;
    const TargetInstrInfo *TII;
  };

  char PadShortFunc::ID = 0;
}

FunctionPass *llvm::createX86PadShortFunctions() {
  return new PadShortFunc();
}

/// runOnMachineFunction - Loop over all of the basic blocks, inserting
/// NOOP instructions before early exits.
bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
  bool OptForSize = MF.getFunction()->getAttributes().
    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);

  if (OptForSize)
    return false;

  TM = &MF.getTarget();
  TII = TM->getInstrInfo();

  // Search through basic blocks and mark the ones that have early returns
  ReturnBBs.clear();
  findReturns(MF.begin());

  bool MadeChange = false;

  MachineBasicBlock::iterator ReturnLoc;
  MachineBasicBlock *MBB;
  unsigned int Cycles = 0;
  unsigned int BBCycles;

  // Pad the identified basic blocks with NOOPs
  for (DenseMap<MachineBasicBlock*, unsigned int>::iterator I = ReturnBBs.begin();
       I != ReturnBBs.end(); ++I) {
    MBB = I->first;
    Cycles = I->second;

    if (Cycles < Threshold) {
      if (!cyclesUntilReturn(MBB, BBCycles, &ReturnLoc))
        continue;

      addPadding(MBB, ReturnLoc, Threshold - Cycles);
      NumBBsPadded++;
      MadeChange = true;
    }
  }

  return MadeChange;
}

/// findReturns - Starting at MBB, follow control flow and add all
/// basic blocks that contain a return to ReturnBBs.
void PadShortFunc::findReturns(MachineBasicBlock *MBB, unsigned int Cycles) {
  // If this BB has a return, note how many cycles it takes to get there.
  bool hasReturn = cyclesUntilReturn(MBB, Cycles);
  if (Cycles >= Threshold)
    return;

  if (hasReturn) {
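    // A return block may be reachable along several paths; keep the largest
    // entry-to-return cycle count recorded for it.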
    ReturnBBs[MBB] = std::max(ReturnBBs[MBB], Cycles);
    return;
  }

  // Follow branches in BB and look for returns
  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin();
       I != MBB->succ_end(); ++I) {
    findReturns(*I, Cycles);
  }
}

/// cyclesUntilReturn - if the MBB has a return instruction, set Location
/// to the instruction and return true. Return false otherwise.
/// Cycles will be incremented by the number of cycles taken to reach the
/// return or the end of the BB, whichever occurs first.
bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB,
                                     unsigned int &Cycles,
                                     MachineBasicBlock::iterator *Location) {
  for (MachineBasicBlock::iterator MBBI = MBB->begin();
       MBBI != MBB->end(); ++MBBI) {
    MachineInstr *MI = MBBI;
    // Mark basic blocks with a return instruction. Calls to other
    // functions do not count because the called function will be padded,
    // if necessary.
    if (MI->isReturn() && !MI->isCall()) {
      if (Location)
        *Location = MBBI;
      return true;
    }

    Cycles += TII->getInstrLatency(TM->getInstrItineraryData(), MI);
  }

  return false;
}

/// addPadding - Add the given number of NOOP instructions to the function
/// just prior to the return at MBBI.
void PadShortFunc::addPadding(MachineBasicBlock *MBB,
                              MachineBasicBlock::iterator &MBBI,
                              unsigned int NOOPsToAdd) {
  DebugLoc DL = MBBI->getDebugLoc();

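  // Two NOPs are emitted per requested padding cycle: Atom is a two-wide
  // in-order core, so a pair of NOPs occupies roughly one issue cycle.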
  while (NOOPsToAdd-- > 0) {
    BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP));
    BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP));
  }
}

lib/Target/X86/X86Subtarget.cpp:

   , UseLeaForSP(false)
   , HasSlowDivide(false)
   , PostRAScheduler(false)
+  , PadShortFunctions(false)
   , stackAlignment(4)
   // FIXME: this is a known good value for Yonah. How about others?
   , MaxInlineSizeThreshold(128)

lib/Target/X86/X86Subtarget.h:

 /// PostRAScheduler - True if using post-register-allocation scheduler.
 bool PostRAScheduler;
+
+/// PadShortFunctions - True if short functions should be padded to
+/// prevent a stall when returning too early.
+bool PadShortFunctions;

 /// stackAlignment - The minimum alignment known to hold of the stack frame on
 /// entry to the function and which must be maintained by every function.
 ...
 bool hasCmpxchg16b() const { return HasCmpxchg16b; }
 bool useLeaForSP() const { return UseLeaForSP; }
 bool hasSlowDivide() const { return HasSlowDivide; }
+bool padShortFunctions() const { return PadShortFunctions; }

 bool isAtom() const { return X86ProcFamily == IntelAtom; }

lib/Target/X86/X86TargetMachine.cpp:

     ShouldPrint = true;
   }

+  if (getOptLevel() != CodeGenOpt::None &&
+      getX86Subtarget().padShortFunctions()) {
+    addPass(createX86PadShortFunctions());
+    ShouldPrint = true;
+  }
+
   return ShouldPrint;
 }

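Note the two-level gating: this hunk skips the pass entirely at -O0 or on non-Atom subtargets, while the optsize check inside runOnMachineFunction (above) additionally disables it per function under -Os. A self-contained sketch of the combined policy (shouldPad and OptLevel are hypothetical stand-ins, not part of the patch):

#include <cstdio>

enum OptLevel { OptNone, OptO1, OptO2, OptO3 }; // stand-in for CodeGenOpt::Level

// Padding happens only when: optimizing at all, the subtarget requests
// pad-short-functions (Atom), and the function is not optimized for size.
static bool shouldPad(OptLevel L, bool PadShortFunctions, bool OptForSize) {
  return L != OptNone && PadShortFunctions && !OptForSize;
}

int main() {
  std::printf("%d\n", shouldPad(OptO2, true, false)); // 1: Atom at -O2 gets padded
  std::printf("%d\n", shouldPad(OptO2, true, true));  // 0: -Os/optsize disables it
  return 0;
}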

test/CodeGen/X86/atom-pad-short-functions.ll (new file):

; RUN: llc < %s -O1 -mcpu=atom -mtriple=i686-linux | FileCheck %s

declare void @external_function(...)

define i32 @test_return_val(i32 %a) nounwind {
; CHECK: test_return_val
; CHECK: movl
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: ret
  ret i32 %a
}

define i32 @test_optsize(i32 %a) nounwind optsize {
; CHECK: test_optsize
; CHECK: movl
; CHECK-NEXT: ret
  ret i32 %a
}

define i32 @test_add(i32 %a, i32 %b) nounwind {
; CHECK: test_add
; CHECK: addl
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: ret
  %result = add i32 %a, %b
  ret i32 %result
}

define i32 @test_multiple_ret(i32 %a, i32 %b, i1 %c) nounwind {
; CHECK: @test_multiple_ret
; CHECK: je

; CHECK: nop
; CHECK: nop
; CHECK: ret

; CHECK: nop
; CHECK: nop
; CHECK: ret

  br i1 %c, label %bb1, label %bb2

bb1:
  ret i32 %a

bb2:
  ret i32 %b
}

define void @test_call_others(i32 %x) nounwind {
; CHECK: test_call_others
; CHECK: je
  %tobool = icmp eq i32 %x, 0
  br i1 %tobool, label %if.end, label %true.case

; CHECK: jmp external_function
true.case:
  tail call void bitcast (void (...)* @external_function to void ()*)() nounwind
  br label %if.end

; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: nop
; CHECK: ret
if.end:
  ret void
}

Existing fast-isel test (RUN lines pin -mcpu=core2 so the checked output does not vary with the default CPU):

-; RUN: llc < %s -mattr=-avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
-; RUN: llc < %s -mattr=+avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mattr=-avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mattr=+avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX

 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0.0"

Another existing test, similarly pinned to -mcpu=core2:

-; RUN: llc < %s -mtriple=x86_64-apple-darwin11 -mattr=+mmx,+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin11 -mcpu=core2 -mattr=+mmx,+sse2 | FileCheck %s
 ; rdar://6602459

 @g_v1di = external global <1 x i64>

test/CodeGen/X86/select.ll (ATOM-NEXT relaxed to ATOM, since padding NOPs may now appear before the ret):

 ; ATOM: test13:
 ; ATOM: cmpl
 ; ATOM-NEXT: sbbl
-; ATOM-NEXT: ret
+; ATOM: ret
 }

 define i32 @test14(i32 %a, i32 %b) nounwind {
 ...
 ; ATOM: cmpl
 ; ATOM-NEXT: sbbl
 ; ATOM-NEXT: notl
-; ATOM-NEXT: ret
+; ATOM: ret
 }

 ; rdar://10961709