llvm.org GIT mirror llvm / 323ac85
ARM: fold prologue/epilogue sp updates into push/pop for code size ARM prologues usually look like: push {r7, lr} sub sp, sp, #4 If code size is extremely important, this can be optimised to the single instruction: push {r6, r7, lr} where we don't actually care about the contents of r6, but pushing it subtracts 4 from sp as a side effect. This should implement such a conversion, predicated on the "minsize" function attribute (-Oz) since I've yet to find any code it actually makes faster. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@194264 91177308-0d34-0410-b5e6-96231b3b80d8 Tim Northover 6 years ago
5 changed file(s) with 292 addition(s) and 32 deletion(s). Raw diff Collapse all Expand all
18541854 .setMIFlags(MIFlags);
18551855 BaseReg = DestReg;
18561856 }
1857 }
1858
1859 bool llvm::tryFoldSPUpdateIntoPushPop(MachineFunction &MF,
1860 MachineInstr *MI,
1861 unsigned NumBytes) {
1862 // This optimisation potentially adds lots of load and store
1863 // micro-operations, it's only really a great benefit to code-size.
1864 if (!MF.getFunction()->hasFnAttribute(Attribute::MinSize))
1865 return false;
1866
1867 // If only one register is pushed/popped, LLVM can use an LDR/STR
1868 // instead. We can't modify those so make sure we're dealing with an
1869 // instruction we understand.
1870 bool IsPop = isPopOpcode(MI->getOpcode());
1871 bool IsPush = isPushOpcode(MI->getOpcode());
1872 if (!IsPush && !IsPop)
1873 return false;
1874
1875 bool IsVFPPushPop = MI->getOpcode() == ARM::VSTMDDB_UPD ||
1876 MI->getOpcode() == ARM::VLDMDIA_UPD;
1877 bool IsT1PushPop = MI->getOpcode() == ARM::tPUSH ||
1878 MI->getOpcode() == ARM::tPOP ||
1879 MI->getOpcode() == ARM::tPOP_RET;
1880
1881 assert((IsT1PushPop || (MI->getOperand(0).getReg() == ARM::SP &&
1882 MI->getOperand(1).getReg() == ARM::SP)) &&
1883 "trying to fold sp update into non-sp-updating push/pop");
1884
1885 // The VFP push & pop act on D-registers, so we can only fold an adjustment
1886 // by a multiple of 8 bytes in correctly. Similarly rN is 4-bytes. Don't try
1887 // if this is violated.
1888 if (NumBytes % (IsVFPPushPop ? 8 : 4) != 0)
1889 return false;
1890
1891 // ARM and Thumb2 push/pop insts have explicit "sp, sp" operands (+
1892 // pred) so the list starts at 4. Thumb1 starts after the predicate.
1893 int RegListIdx = IsT1PushPop ? 2 : 4;
1894
1895 // Calculate the space we'll need in terms of registers.
1896 unsigned FirstReg = MI->getOperand(RegListIdx).getReg();
1897 unsigned RD0Reg, RegsNeeded;
1898 if (IsVFPPushPop) {
1899 RD0Reg = ARM::D0;
1900 RegsNeeded = NumBytes / 8;
1901 } else {
1902 RD0Reg = ARM::R0;
1903 RegsNeeded = NumBytes / 4;
1904 }
1905
1906 // We're going to have to strip all list operands off before
1907 // re-adding them since the order matters, so save the existing ones
1908 // for later.
1909 SmallVector RegList;
1910 for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i)
1911 RegList.push_back(MI->getOperand(i));
1912
1913 MachineBasicBlock *MBB = MI->getParent();
1914 const TargetRegisterInfo *TRI = MF.getRegInfo().getTargetRegisterInfo();
1915
1916 // Now try to find enough space in the reglist to allocate NumBytes.
1917 for (unsigned CurReg = FirstReg - 1; CurReg >= RD0Reg && RegsNeeded;
1918 --CurReg, --RegsNeeded) {
1919 if (!IsPop) {
1920 // Pushing any register is completely harmless, mark the
1921 // register involved as undef since we don't care about it in
1922 // the slightest.
1923 RegList.push_back(MachineOperand::CreateReg(CurReg, false, false,
1924 false, false, true));
1925 continue;
1926 }
1927
1928 // However, we can only pop an extra register if it's not live. Otherwise we
1929 // might clobber a return value register. We assume that once we find a live
1930 // return register all lower ones will be too so there's no use proceeding.
1931 if (MBB->computeRegisterLiveness(TRI, CurReg, MI) !=
1932 MachineBasicBlock::LQR_Dead)
1933 return false;
1934
1935 // Mark the unimportant registers as in the POP.
1936 RegList.push_back(MachineOperand::CreateReg(CurReg, true, false, true));
1937 }
1938
1939 if (RegsNeeded > 0)
1940 return false;
1941
1942 // Finally we know we can profitably perform the optimisation so go
1943 // ahead: strip all existing registers off and add them back again
1944 // in the right order.
1945 for (int i = MI->getNumOperands() - 1; i >= RegListIdx; --i)
1946 MI->RemoveOperand(i);
1947
1948 // Add the complete list back in.
1949 MachineInstrBuilder MIB(MF, &*MI);
1950 for (int i = RegList.size() - 1; i >= 0; --i)
1951 MIB.addOperand(RegList[i]);
1952
1953 return true;
18571954 }
18581955
18591956 bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
361361 return Opc == ARM::BX || Opc == ARM::MOVPCRX || Opc == ARM::tBRIND;
362362 }
363363
364 static inline bool isPopOpcode(int Opc) {
365 return Opc == ARM::tPOP_RET || Opc == ARM::LDMIA_RET ||
366 Opc == ARM::t2LDMIA_RET || Opc == ARM::tPOP || Opc == ARM::LDMIA_UPD ||
367 Opc == ARM::t2LDMIA_UPD || Opc == ARM::VLDMDIA_UPD;
368 }
369
370 static inline bool isPushOpcode(int Opc) {
371 return Opc == ARM::tPUSH || Opc == ARM::t2STMDB_UPD ||
372 Opc == ARM::STMDB_UPD || Opc == ARM::VSTMDDB_UPD;
373 }
374
364375 /// getInstrPredicate - If instruction is predicated, returns its predicate
365376 /// condition, otherwise returns AL. It also returns the condition code
366377 /// register by reference.
400411 const ARMBaseRegisterInfo& MRI,
401412 unsigned MIFlags = 0);
402413
414 /// Tries to add registers to the reglist of a given base-updating
415 /// push/pop instruction to adjust the stack by an additional
416 /// NumBytes. This can save a few bytes per function in code-size, but
417 /// obviously generates more memory traffic. As such, it only takes
418 /// effect in functions being optimised for size.
419 bool tryFoldSPUpdateIntoPushPop(MachineFunction &MF, MachineInstr *MI,
420 unsigned NumBytes);
403421
404422 /// rewriteARMFrameIndex / rewriteT2FrameIndex -
405423 /// Rewrite MI to access 'Offset' bytes from the FP. Return false if the
9292 const ARMBaseInstrInfo &TII,
9393 const uint16_t *CSRegs) {
9494 // Integer spill area is handled with "pop".
95 if (MI->getOpcode() == ARM::LDMIA_RET ||
96 MI->getOpcode() == ARM::t2LDMIA_RET ||
97 MI->getOpcode() == ARM::LDMIA_UPD ||
98 MI->getOpcode() == ARM::t2LDMIA_UPD ||
99 MI->getOpcode() == ARM::VLDMDIA_UPD) {
95 if (isPopOpcode(MI->getOpcode())) {
10096 // The first two operands are predicates. The last two are
10197 // imp-def and imp-use of SP. Check everything in between.
10298 for (int i = 5, e = MI->getNumOperands(); i != e; ++i)
220216 }
221217
222218 // Move past area 1.
223 if (GPRCS1Size > 0) MBBI++;
219 MachineBasicBlock::iterator LastPush = MBB.end(), FramePtrPush;
220 if (GPRCS1Size > 0)
221 FramePtrPush = LastPush = MBBI++;
224222
225223 // Determine starting offsets of spill areas.
226224 bool HasFP = hasFP(MF);
227225 unsigned DPRCSOffset = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
228226 unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
229227 unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
230 if (HasFP)
228 int FramePtrOffsetInPush = 0;
229 if (HasFP) {
230 FramePtrOffsetInPush = MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size;
231231 AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) +
232232 NumBytes);
233 }
233234 AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
234235 AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
235236 AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
236237
237 // Set FP to point to the stack slot that contains the previous FP.
238 // For iOS, FP is R7, which has now been stored in spill area 1.
239 // Otherwise, if this is not iOS, all the callee-saved registers go
240 // into spill area 1, including the FP in R11. In either case, it is
241 // now safe to emit this assignment.
242 if (HasFP) {
243 int FramePtrOffset = MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size;
244 emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, MBBI, dl, TII,
245 FramePtr, ARM::SP, FramePtrOffset,
246 MachineInstr::FrameSetup);
247 }
248
249238 // Move past area 2.
250 if (GPRCS2Size > 0) MBBI++;
239 if (GPRCS2Size > 0) {
240 LastPush = MBBI++;
241 }
251242
252243 // Move past area 3.
253244 if (DPRCSSize > 0) {
254 MBBI++;
245 LastPush = MBBI++;
255246 // Since vpush register list cannot have gaps, there may be multiple vpush
256247 // instructions in the prologue.
257248 while (MBBI->getOpcode() == ARM::VSTMDDB_UPD)
258 MBBI++;
249 LastPush = MBBI++;
259250 }
260251
261252 // Move past the aligned DPRCS2 area.
271262
272263 if (NumBytes) {
273264 // Adjust SP after all the callee-save spills.
274 emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
275 MachineInstr::FrameSetup);
265 if (tryFoldSPUpdateIntoPushPop(MF, LastPush, NumBytes))
266 FramePtrOffsetInPush += NumBytes;
267 else
268 emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes,
269 MachineInstr::FrameSetup);
270
276271 if (HasFP && isARM)
277272 // Restore from fp only in ARM mode: e.g. sub sp, r7, #24
278273 // Note it's not safe to do this in Thumb2 mode because it would have
284279 // The interrupt handler can end up clobbering the registers.
285280 AFI->setShouldRestoreSPFromFP(true);
286281 }
282
283 // Set FP to point to the stack slot that contains the previous FP.
284 // For iOS, FP is R7, which has now been stored in spill area 1.
285 // Otherwise, if this is not iOS, all the callee-saved registers go
286 // into spill area 1, including the FP in R11. In either case, it
287 // is in area one and the adjustment needs to take place just after
288 // that push.
289 if (HasFP)
290 emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, ++FramePtrPush, dl, TII,
291 FramePtr, ARM::SP, FramePtrOffsetInPush,
292 MachineInstr::FrameSetup);
293
287294
288295 if (STI.isTargetELF() && hasFP(MF))
289296 MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
379386 if (NumBytes != 0)
380387 emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
381388 } else {
389 MachineBasicBlock::iterator FirstPop = MBBI;
390
382391 // Unwind MBBI to point to first LDR / VLDRD.
383392 const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
384393 if (MBBI != MBB.begin()) {
385 do
394 do {
395 if (isPopOpcode(MBBI->getOpcode()))
396 FirstPop = MBBI;
397
386398 --MBBI;
387 while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs));
399 } while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs));
388400 if (!isCSRestore(MBBI, TII, CSRegs))
389401 ++MBBI;
390402 }
428440 ARM::SP)
429441 .addReg(FramePtr));
430442 }
431 } else if (NumBytes)
432 emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
443 } else if (NumBytes && !tryFoldSPUpdateIntoPushPop(MF, FirstPop, NumBytes))
444 emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
433445
434446 // Increment past our save areas.
435447 if (AFI->getDPRCalleeSavedAreaSize()) {
163163 AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
164164 NumBytes = DPRCSOffset;
165165
166 int FramePtrOffsetInBlock = 0;
167 if (tryFoldSPUpdateIntoPushPop(MF, prior(MBBI), NumBytes)) {
168 FramePtrOffsetInBlock = NumBytes;
169 NumBytes = 0;
170 }
171
166172 // Adjust FP so it point to the stack slot that contains the previous FP.
167173 if (HasFP) {
168 int FramePtrOffset = MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size;
174 FramePtrOffsetInBlock += MFI->getObjectOffset(FramePtrSpillFI) + GPRCS1Size;
169175 AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
170 .addReg(ARM::SP).addImm(FramePtrOffset / 4)
176 .addReg(ARM::SP).addImm(FramePtrOffsetInBlock / 4)
171177 .setMIFlags(MachineInstr::FrameSetup));
172178 if (NumBytes > 508)
173179 // If offset is > 508 then sp cannot be adjusted in a single instruction,
291297 &MBB.front() != MBBI &&
292298 prior(MBBI)->getOpcode() == ARM::tPOP) {
293299 MachineBasicBlock::iterator PMBBI = prior(MBBI);
294 emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes);
295 } else
300 if (!tryFoldSPUpdateIntoPushPop(MF, PMBBI, NumBytes))
301 emitSPUpdate(MBB, PMBBI, TII, dl, *RegInfo, NumBytes);
302 } else if (!tryFoldSPUpdateIntoPushPop(MF, MBBI, NumBytes))
296303 emitSPUpdate(MBB, MBBI, TII, dl, *RegInfo, NumBytes);
297304 }
298305 }
0 ; RUN: llc -mtriple=thumbv7-apple-darwin-eabi < %s | FileCheck %s
1 ; RUN: llc -mtriple=thumbv6m-apple-darwin-eabi -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-T1
2 ; RUN: llc -mtriple=thumbv7-apple-darwin-ios -disable-fp-elim < %s | FileCheck %s --check-prefix=CHECK-IOS
3
4
5 declare void @bar(i8*)
6
7 %bigVec = type [2 x double]
8
9 @var = global %bigVec zeroinitializer
10
11 define void @check_simple() minsize {
12 ; CHECK-LABEL: check_simple:
13 ; CHECK: push.w {r7, r8, r9, r10, r11, lr}
14 ; CHECK-NOT: sub sp, sp,
15 ; ...
16 ; CHECK-NOT: add sp, sp,
17 ; CHECK: pop.w {r7, r8, r9, r10, r11, pc}
18
19 ; CHECK-T1-LABEL: check_simple:
20 ; CHECK-T1: push {r3, r4, r5, r6, r7, lr}
21 ; CHECK-T1: add r7, sp, #16
22 ; CHECK-T1-NOT: sub sp, sp,
23 ; ...
24 ; CHECK-T1-NOT: add sp, sp,
25 ; CHECK-T1: pop {r3, r4, r5, r6, r7, pc}
26
27 ; iOS always has a frame pointer and messing with the push affects
28 ; how it's set in the prologue. Make sure we get that right.
29 ; CHECK-IOS-LABEL: check_simple:
30 ; CHECK-IOS: push {r3, r4, r5, r6, r7, lr}
31 ; CHECK-NOT: sub sp,
32 ; CHECK-IOS: add r7, sp, #16
33 ; CHECK-NOT: sub sp,
34 ; ...
35 ; CHECK-NOT: add sp,
36 ; CHECK-IOS: pop {r3, r4, r5, r6, r7, pc}
37
38 %var = alloca i8, i32 16
39 call void @bar(i8* %var)
40 ret void
41 }
42
43 define void @check_simple_too_big() minsize {
44 ; CHECK-LABEL: check_simple_too_big:
45 ; CHECK: push.w {r11, lr}
46 ; CHECK: sub sp,
47 ; ...
48 ; CHECK: add sp,
49 ; CHECK: pop.w {r11, pc}
50 %var = alloca i8, i32 64
51 call void @bar(i8* %var)
52 ret void
53 }
54
55 define void @check_vfp_fold() minsize {
56 ; CHECK-LABEL: check_vfp_fold:
57 ; CHECK: push {r[[GLOBREG:[0-9]+]], lr}
58 ; CHECK: vpush {d6, d7, d8, d9}
59 ; CHECK-NOT: sub sp,
60 ; ...
61 ; CHECK: vldmia r[[GLOBREG]], {d8, d9}
62 ; ...
63 ; CHECK-NOT: add sp,
64 ; CHECK: vpop {d6, d7, d8, d9}
65 ; CHECK: pop {r[[GLOBREG]], pc}
66
67 ; iOS uses aligned NEON stores here, which is convenient since we
68 ; want to make sure that works too.
69 ; CHECK-IOS-LABEL: check_vfp_fold:
70 ; CHECK-IOS: push {r0, r1, r2, r3, r4, r7, lr}
71 ; CHECK-IOS: sub.w r4, sp, #16
72 ; CHECK-IOS: bic r4, r4, #15
73 ; CHECK-IOS: mov sp, r4
74 ; CHECK-IOS: vst1.64 {d8, d9}, [r4:128]
75 ; ...
76 ; CHECK-IOS: add r4, sp, #16
77 ; CHECK-IOS: vld1.64 {d8, d9}, [r4:128]
78 ; CHECK-IOS: mov sp, r4
79 ; CHECK-IOS: pop {r4, r7, pc}
80
81 %var = alloca i8, i32 16
82
83 %tmp = load %bigVec* @var
84 call void @bar(i8* %var)
85 store %bigVec %tmp, %bigVec* @var
86
87 ret void
88 }
89
90 ; This function should use just enough space that the "add sp, sp, ..." could be
91 ; folded in except that doing so would clobber the value being returned.
92 define i64 @check_no_return_clobber() minsize {
93 ; CHECK-LABEL: check_no_return_clobber:
94 ; CHECK: push.w {r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, lr}
95 ; CHECK-NOT: sub sp,
96 ; ...
97 ; CHECK: add sp, #40
98 ; CHECK: pop.w {r11, pc}
99
100 ; Just to keep iOS FileCheck within previous function:
101 ; CHECK-IOS-LABEL: check_no_return_clobber:
102
103 %var = alloca i8, i32 40
104 call void @bar(i8* %var)
105 ret i64 0
106 }
107
108 define arm_aapcs_vfpcc double @check_vfp_no_return_clobber() minsize {
109 ; CHECK-LABEL: check_vfp_no_return_clobber:
110 ; CHECK: push {r[[GLOBREG:[0-9]+]], lr}
111 ; CHECK: vpush {d0, d1, d2, d3, d4, d5, d6, d7, d8, d9}
112 ; CHECK-NOT: sub sp,
113 ; ...
114 ; CHECK: add sp, #64
115 ; CHECK: vpop {d8, d9}
116 ; CHECK: pop {r[[GLOBREG]], pc}
117
118 %var = alloca i8, i32 64
119
120 %tmp = load %bigVec* @var
121 call void @bar(i8* %var)
122 store %bigVec %tmp, %bigVec* @var
123
124 ret double 1.0
125 }