llvm.org GIT mirror llvm / 70af20f
CXX_FAST_TLS calling convention: performance improvement for x86-64. This is the same change on x86-64 as r255821 on AArch64. rdar://9001553 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@257428 91177308-0d34-0410-b5e6-96231b3b80d8 Manman Ren 4 years ago
8 changed file(s) with 117 addition(s) and 43 deletion(s). Raw diff Collapse all Expand all
830830 def CSR_64_TLS_Darwin : CalleeSavedRegs<(add CSR_64, RCX, RDX, RSI,
831831 R8, R9, R10, R11)>;
832832
833 // CSRs that are handled by prologue, epilogue.
834 def CSR_64_CXX_TLS_Darwin_PE : CalleeSavedRegs<(add)>;
835
836 // CSRs that are handled explicitly via copies.
837 def CSR_64_CXX_TLS_Darwin_ViaCopy : CalleeSavedRegs<(add CSR_64_TLS_Darwin)>;
838
833839 // All GPRs - except r11
834840 def CSR_64_RT_MostRegs : CalleeSavedRegs<(add CSR_64, RAX, RCX, RDX, RSI, RDI,
835841 R8, R9, R10, RSP)>;
999999 FuncInfo.MF->getInfo();
10001000
10011001 if (!FuncInfo.CanLowerReturn)
1002 return false;
1003
1004 if (TLI.supportSplitCSR(FuncInfo.MF))
10021005 return false;
10031006
10041007 CallingConv::ID CC = F.getCallingConv();
23082308 // RAX/EAX now acts like a return value.
23092309 RetOps.push_back(
23102310 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2311 }
2312
2313 const X86RegisterInfo *TRI = Subtarget->getRegisterInfo();
2314 const MCPhysReg *I =
2315 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2316 if (I) {
2317 for (; *I; ++I) {
2318 if (X86::GR64RegClass.contains(*I))
2319 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2320 else
2321 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2322 }
23112323 }
23122324
23132325 RetOps[0] = Chain; // Update chain.
2882628838 Attribute::MinSize);
2882728839 return OptSize && !VT.isVector();
2882828840 }
28841
28842 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
28843 if (!Subtarget->is64Bit())
28844 return;
28845
28846 // Update IsSplitCSR in X86MachineFunctionInfo.
28847 X86MachineFunctionInfo *AFI =
28848 Entry->getParent()->getInfo();
28849 AFI->setIsSplitCSR(true);
28850 }
28851
28852 void X86TargetLowering::insertCopiesSplitCSR(
28853 MachineBasicBlock *Entry,
28854 const SmallVectorImpl &Exits) const {
28855 const X86RegisterInfo *TRI = Subtarget->getRegisterInfo();
28856 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
28857 if (!IStart)
28858 return;
28859
28860 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
28861 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
28862 for (const MCPhysReg *I = IStart; *I; ++I) {
28863 const TargetRegisterClass *RC = nullptr;
28864 if (X86::GR64RegClass.contains(*I))
28865 RC = &X86::GR64RegClass;
28866 else
28867 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
28868
28869 unsigned NewVR = MRI->createVirtualRegister(RC);
28870 // Create copy from CSR to a virtual register.
28871 // FIXME: this currently does not emit CFI pseudo-instructions; it works
28872 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
28873 // nounwind. If we want to generalize this later, we may need to emit
28874 // CFI pseudo-instructions.
28875 assert(Entry->getParent()->getFunction()->hasFnAttribute(
28876 Attribute::NoUnwind) &&
28877 "Function should be nounwind in insertCopiesSplitCSR!");
28878 Entry->addLiveIn(*I);
28879 BuildMI(*Entry, Entry->begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
28880 NewVR)
28881 .addReg(*I);
28882
28883 for (auto *Exit : Exits)
28884 BuildMI(*Exit, Exit->begin(), DebugLoc(), TII->get(TargetOpcode::COPY),
28885 *I)
28886 .addReg(NewVR);
28887 }
28888 }
10561056 const SmallVectorImpl &OutVals,
10571057 SDLoc dl, SelectionDAG &DAG) const override;
10581058
1059 bool supportSplitCSR(MachineFunction *MF) const override {
1060 return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
1061 MF->getFunction()->hasFnAttribute(Attribute::NoUnwind);
1062 }
1063 void initializeSplitCSR(MachineBasicBlock *Entry) const override;
1064 void insertCopiesSplitCSR(
1065 MachineBasicBlock *Entry,
1066 const SmallVectorImpl &Exits) const override;
1067
10591068 bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
10601069
10611070 bool mayBeEmittedAsTailCall(CallInst *CI) const override;
9191 /// used to address arguments in a function using a base pointer.
9292 int SEHFramePtrSaveIndex = 0;
9393
94 /// True if this function has a subset of CSRs that is handled explicitly via
95 /// copies.
96 bool IsSplitCSR = false;
97
9498 private:
9599 /// ForwardedMustTailRegParms - A list of virtual and physical registers
96100 /// that must be forwarded to every musttail call.
159163 SmallVectorImpl &getForwardedMustTailRegParms() {
160164 return ForwardedMustTailRegParms;
161165 }
166
167 bool isSplitCSR() const { return IsSplitCSR; }
168 void setIsSplitCSR(bool s) { IsSplitCSR = s; }
162169 };
163170
164171 } // End llvm namespace
249249 return CSR_64_RT_AllRegs_SaveList;
250250 case CallingConv::CXX_FAST_TLS:
251251 if (Is64Bit)
252 return CSR_64_TLS_Darwin_SaveList;
252 return MF->getInfo()->isSplitCSR() ?
253 CSR_64_CXX_TLS_Darwin_PE_SaveList : CSR_64_TLS_Darwin_SaveList;
253254 break;
254255 case CallingConv::Intel_OCL_BI: {
255256 if (HasAVX512 && IsWin64)
302303 if (CallsEHReturn)
303304 return CSR_32EHRet_SaveList;
304305 return CSR_32_SaveList;
306 }
307
308 const MCPhysReg *X86RegisterInfo::getCalleeSavedRegsViaCopy(
309 const MachineFunction *MF) const {
310 assert(MF && "Invalid MachineFunction pointer.");
311 if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
312 MF->getInfo()->isSplitCSR())
313 return CSR_64_CXX_TLS_Darwin_ViaCopy_SaveList;
314 return nullptr;
305315 }
306316
307317 const uint32_t *
9898 /// callee-save registers on this target.
9999 const MCPhysReg *
100100 getCalleeSavedRegs(const MachineFunction* MF) const override;
101 const MCPhysReg *
102 getCalleeSavedRegsViaCopy(const MachineFunction *MF) const override;
101103 const uint32_t *getCallPreservedMask(const MachineFunction &MF,
102104 CallingConv::ID) const override;
103105 const uint32_t *getNoPreservedMask() const override;
11 ; TLS functions were wrongly modeled and, after fixing that, shrink-wrapping
22 ; cannot help here. To achieve the expected lowering, we need to play
33 ; tricks similar to the AArch64 fast TLS calling convention (r255821).
4 ; Re-enable the following run line when
5 ; _RUN_: llc < %s -mtriple=x86_64-apple-darwin -enable-shrink-wrap=true | FileCheck --check-prefix=SHRINK %s
4 ; Applying tricks on x86-64 similar to r255821.
5 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -enable-shrink-wrap=true | FileCheck %s
66 %struct.S = type { i8 }
77
88 @sg = internal thread_local global %struct.S zeroinitializer, align 1
1515
1616 ; Every GPR should be saved - except rdi, rax, and rsp
1717 ; CHECK-LABEL: _ZTW2sg
18 ; CHECK: pushq %r11
19 ; CHECK: pushq %r10
20 ; CHECK: pushq %r9
21 ; CHECK: pushq %r8
22 ; CHECK: pushq %rsi
23 ; CHECK: pushq %rdx
24 ; CHECK: pushq %rcx
25 ; CHECK: pushq %rbx
18 ; CHECK-NOT: pushq %r11
19 ; CHECK-NOT: pushq %r10
20 ; CHECK-NOT: pushq %r9
21 ; CHECK-NOT: pushq %r8
22 ; CHECK-NOT: pushq %rsi
23 ; CHECK-NOT: pushq %rdx
24 ; CHECK-NOT: pushq %rcx
25 ; CHECK-NOT: pushq %rbx
2626 ; CHECK: callq
2727 ; CHECK: jne
2828 ; CHECK: callq
2929 ; CHECK: tlv_atexit
3030 ; CHECK: callq
31 ; CHECK: popq %rbx
32 ; CHECK: popq %rcx
33 ; CHECK: popq %rdx
34 ; CHECK: popq %rsi
35 ; CHECK: popq %r8
36 ; CHECK: popq %r9
37 ; CHECK: popq %r10
38 ; CHECK: popq %r11
39 ; SHRINK-LABEL: _ZTW2sg
40 ; SHRINK: callq
41 ; SHRINK: jne
42 ; SHRINK: pushq %r11
43 ; SHRINK: pushq %r10
44 ; SHRINK: pushq %r9
45 ; SHRINK: pushq %r8
46 ; SHRINK: pushq %rsi
47 ; SHRINK: pushq %rdx
48 ; SHRINK: pushq %rcx
49 ; SHRINK: pushq %rbx
50 ; SHRINK: callq
51 ; SHRINK: tlv_atexit
52 ; SHRINK: popq %rbx
53 ; SHRINK: popq %rcx
54 ; SHRINK: popq %rdx
55 ; SHRINK: popq %rsi
56 ; SHRINK: popq %r8
57 ; SHRINK: popq %r9
58 ; SHRINK: popq %r10
59 ; SHRINK: popq %r11
60 ; SHRINK: LBB{{.*}}:
61 ; SHRINK: callq
62 define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() {
31 ; CHECK-NOT: popq %rbx
32 ; CHECK-NOT: popq %rcx
33 ; CHECK-NOT: popq %rdx
34 ; CHECK-NOT: popq %rsi
35 ; CHECK-NOT: popq %r8
36 ; CHECK-NOT: popq %r9
37 ; CHECK-NOT: popq %r10
38 ; CHECK-NOT: popq %r11
39 define cxx_fast_tlscc nonnull %struct.S* @_ZTW2sg() nounwind {
6340 %.b.i = load i1, i1* @__tls_guard, align 1
6441 br i1 %.b.i, label %__tls_init.exit, label %init.i
6542